Merge branch 'for-chris' of git://repo.or.cz/linux-btrfs-devel into integration
firefly-linux-kernel-4.4.55.git: fs/btrfs/volumes.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/bio.h>
20 #include <linux/slab.h>
21 #include <linux/buffer_head.h>
22 #include <linux/blkdev.h>
23 #include <linux/random.h>
24 #include <linux/iocontext.h>
25 #include <linux/capability.h>
26 #include <linux/kthread.h>
27 #include <asm/div64.h>
28 #include "compat.h"
29 #include "ctree.h"
30 #include "extent_map.h"
31 #include "disk-io.h"
32 #include "transaction.h"
33 #include "print-tree.h"
34 #include "volumes.h"
35 #include "async-thread.h"
36
37 static int init_first_rw_device(struct btrfs_trans_handle *trans,
38                                 struct btrfs_root *root,
39                                 struct btrfs_device *device);
40 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
41
42 static DEFINE_MUTEX(uuid_mutex);
43 static LIST_HEAD(fs_uuids);
44
45 static void lock_chunks(struct btrfs_root *root)
46 {
47         mutex_lock(&root->fs_info->chunk_mutex);
48 }
49
50 static void unlock_chunks(struct btrfs_root *root)
51 {
52         mutex_unlock(&root->fs_info->chunk_mutex);
53 }
54
55 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
56 {
57         struct btrfs_device *device;
58         WARN_ON(fs_devices->opened);
59         while (!list_empty(&fs_devices->devices)) {
60                 device = list_entry(fs_devices->devices.next,
61                                     struct btrfs_device, dev_list);
62                 list_del(&device->dev_list);
63                 kfree(device->name);
64                 kfree(device);
65         }
66         kfree(fs_devices);
67 }
68
69 int btrfs_cleanup_fs_uuids(void)
70 {
71         struct btrfs_fs_devices *fs_devices;
72
73         while (!list_empty(&fs_uuids)) {
74                 fs_devices = list_entry(fs_uuids.next,
75                                         struct btrfs_fs_devices, list);
76                 list_del(&fs_devices->list);
77                 free_fs_devices(fs_devices);
78         }
79         return 0;
80 }
81
82 static noinline struct btrfs_device *__find_device(struct list_head *head,
83                                                    u64 devid, u8 *uuid)
84 {
85         struct btrfs_device *dev;
86
87         list_for_each_entry(dev, head, dev_list) {
88                 if (dev->devid == devid &&
89                     (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
90                         return dev;
91                 }
92         }
93         return NULL;
94 }
95
96 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
97 {
98         struct btrfs_fs_devices *fs_devices;
99
100         list_for_each_entry(fs_devices, &fs_uuids, list) {
101                 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
102                         return fs_devices;
103         }
104         return NULL;
105 }
106
107 static void requeue_list(struct btrfs_pending_bios *pending_bios,
108                         struct bio *head, struct bio *tail)
109 {
110
111         struct bio *old_head;
112
113         old_head = pending_bios->head;
114         pending_bios->head = head;
115         if (pending_bios->tail)
116                 tail->bi_next = old_head;
117         else
118                 pending_bios->tail = tail;
119 }
120
121 /*
122  * we try to collect pending bios for a device so we don't get a large
123  * number of procs sending bios down to the same device.  This greatly
124  * improves the scheduler's ability to collect and merge the bios.
125  *
126  * But, it also turns into a long list of bios to process and that is sure
127  * to eventually make the worker thread block.  The solution here is to
128  * make some progress and then put this work struct back at the end of
129  * the list if the block device is congested.  This way, multiple devices
130  * can make progress from a single worker thread.
131  */
132 static noinline int run_scheduled_bios(struct btrfs_device *device)
133 {
134         struct bio *pending;
135         struct backing_dev_info *bdi;
136         struct btrfs_fs_info *fs_info;
137         struct btrfs_pending_bios *pending_bios;
138         struct bio *tail;
139         struct bio *cur;
140         int again = 0;
141         unsigned long num_run;
142         unsigned long batch_run = 0;
143         unsigned long limit;
144         unsigned long last_waited = 0;
145         int force_reg = 0;
146         int sync_pending = 0;
147         struct blk_plug plug;
148
149         /*
150          * this function runs all the bios we've collected for
151          * a particular device.  We don't want to wander off to
152          * another device without first sending all of these down.
153          * So, set up a plug here and finish it off before we return
154          */
155         blk_start_plug(&plug);
156
157         bdi = blk_get_backing_dev_info(device->bdev);
158         fs_info = device->dev_root->fs_info;
159         limit = btrfs_async_submit_limit(fs_info);
160         limit = limit * 2 / 3;
161
162 loop:
163         spin_lock(&device->io_lock);
164
165 loop_lock:
166         num_run = 0;
167
168         /* take all the bios off the list at once and process them
169          * later on (without the lock held).  But, remember the
170          * tail and other pointers so the bios can be properly reinserted
171          * into the list if we hit congestion
172          */
173         if (!force_reg && device->pending_sync_bios.head) {
174                 pending_bios = &device->pending_sync_bios;
175                 force_reg = 1;
176         } else {
177                 pending_bios = &device->pending_bios;
178                 force_reg = 0;
179         }
180
181         pending = pending_bios->head;
182         tail = pending_bios->tail;
183         WARN_ON(pending && !tail);
184
185         /*
186          * if pending was null this time around, no bios need processing
187          * at all and we can stop.  Otherwise it'll loop back up again
188          * and do an additional check so no bios are missed.
189          *
190          * device->running_pending is used to synchronize with the
191          * schedule_bio code.
192          */
193         if (device->pending_sync_bios.head == NULL &&
194             device->pending_bios.head == NULL) {
195                 again = 0;
196                 device->running_pending = 0;
197         } else {
198                 again = 1;
199                 device->running_pending = 1;
200         }
201
202         pending_bios->head = NULL;
203         pending_bios->tail = NULL;
204
205         spin_unlock(&device->io_lock);
206
207         while (pending) {
208
209                 rmb();
210                 /* we want to work on both lists, but do more bios on the
211                  * sync list than the regular list
212                  */
213                 if ((num_run > 32 &&
214                     pending_bios != &device->pending_sync_bios &&
215                     device->pending_sync_bios.head) ||
216                    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
217                     device->pending_bios.head)) {
218                         spin_lock(&device->io_lock);
219                         requeue_list(pending_bios, pending, tail);
220                         goto loop_lock;
221                 }
222
223                 cur = pending;
224                 pending = pending->bi_next;
225                 cur->bi_next = NULL;
226                 atomic_dec(&fs_info->nr_async_bios);
227
228                 if (atomic_read(&fs_info->nr_async_bios) < limit &&
229                     waitqueue_active(&fs_info->async_submit_wait))
230                         wake_up(&fs_info->async_submit_wait);
231
232                 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
233
234                 /*
235                  * if we're doing the sync list, record that our
236                  * plug has some sync requests on it
237                  *
238                  * If we're doing the regular list and there are
239                  * sync requests sitting around, unplug before
240                  * we add more
241                  */
242                 if (pending_bios == &device->pending_sync_bios) {
243                         sync_pending = 1;
244                 } else if (sync_pending) {
245                         blk_finish_plug(&plug);
246                         blk_start_plug(&plug);
247                         sync_pending = 0;
248                 }
249
250                 submit_bio(cur->bi_rw, cur);
251                 num_run++;
252                 batch_run++;
253                 if (need_resched())
254                         cond_resched();
255
256                 /*
257                  * we made progress, there is more work to do and the bdi
258                  * is now congested.  Back off and let other work structs
259                  * run instead
260                  */
261                 if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
262                     fs_info->fs_devices->open_devices > 1) {
263                         struct io_context *ioc;
264
265                         ioc = current->io_context;
266
267                         /*
268                          * the main goal here is that we don't want to
269                          * block if we're going to be able to submit
270                          * more requests without blocking.
271                          *
272                          * This code does two great things, it pokes into
273                          * the elevator code from a filesystem _and_
274                          * it makes assumptions about how batching works.
275                          */
276                         if (ioc && ioc->nr_batch_requests > 0 &&
277                             time_before(jiffies, ioc->last_waited + HZ/50UL) &&
278                             (last_waited == 0 ||
279                              ioc->last_waited == last_waited)) {
280                                 /*
281                                  * we want to go through our batch of
282                                  * requests and stop.  So, we copy out
283                                  * the ioc->last_waited time and test
284                                  * against it before looping
285                                  */
286                                 last_waited = ioc->last_waited;
287                                 if (need_resched())
288                                         cond_resched();
289                                 continue;
290                         }
291                         spin_lock(&device->io_lock);
292                         requeue_list(pending_bios, pending, tail);
293                         device->running_pending = 1;
294
295                         spin_unlock(&device->io_lock);
296                         btrfs_requeue_work(&device->work);
297                         goto done;
298                 }
299                 /* unplug every 64 requests just for good measure */
300                 if (batch_run % 64 == 0) {
301                         blk_finish_plug(&plug);
302                         blk_start_plug(&plug);
303                         sync_pending = 0;
304                 }
305         }
306
307         cond_resched();
308         if (again)
309                 goto loop;
310
311         spin_lock(&device->io_lock);
312         if (device->pending_bios.head || device->pending_sync_bios.head)
313                 goto loop_lock;
314         spin_unlock(&device->io_lock);
315
316 done:
317         blk_finish_plug(&plug);
318         return 0;
319 }
320
321 static void pending_bios_fn(struct btrfs_work *work)
322 {
323         struct btrfs_device *device;
324
325         device = container_of(work, struct btrfs_device, work);
326         run_scheduled_bios(device);
327 }
328
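/*
 * record the device described by @disk_super at @path in the in-memory
 * device list.  The first time a given fsid is seen a new btrfs_fs_devices
 * is allocated; the first time a given devid is seen within that filesystem
 * a new btrfs_device is allocated.  Returns -EBUSY if a brand new device
 * shows up while the filesystem is already open, and hands the matching
 * fs_devices back through *fs_devices_ret.
 */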
329 static noinline int device_list_add(const char *path,
330                            struct btrfs_super_block *disk_super,
331                            u64 devid, struct btrfs_fs_devices **fs_devices_ret)
332 {
333         struct btrfs_device *device;
334         struct btrfs_fs_devices *fs_devices;
335         u64 found_transid = btrfs_super_generation(disk_super);
336         char *name;
337
338         fs_devices = find_fsid(disk_super->fsid);
339         if (!fs_devices) {
340                 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
341                 if (!fs_devices)
342                         return -ENOMEM;
343                 INIT_LIST_HEAD(&fs_devices->devices);
344                 INIT_LIST_HEAD(&fs_devices->alloc_list);
345                 list_add(&fs_devices->list, &fs_uuids);
346                 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
347                 fs_devices->latest_devid = devid;
348                 fs_devices->latest_trans = found_transid;
349                 mutex_init(&fs_devices->device_list_mutex);
350                 device = NULL;
351         } else {
352                 device = __find_device(&fs_devices->devices, devid,
353                                        disk_super->dev_item.uuid);
354         }
355         if (!device) {
356                 if (fs_devices->opened)
357                         return -EBUSY;
358
359                 device = kzalloc(sizeof(*device), GFP_NOFS);
360                 if (!device) {
361                         /* we can safely leave the fs_devices entry around */
362                         return -ENOMEM;
363                 }
364                 device->devid = devid;
365                 device->work.func = pending_bios_fn;
366                 memcpy(device->uuid, disk_super->dev_item.uuid,
367                        BTRFS_UUID_SIZE);
368                 spin_lock_init(&device->io_lock);
369                 device->name = kstrdup(path, GFP_NOFS);
370                 if (!device->name) {
371                         kfree(device);
372                         return -ENOMEM;
373                 }
374                 INIT_LIST_HEAD(&device->dev_alloc_list);
375
376                 /* init readahead state */
377                 spin_lock_init(&device->reada_lock);
378                 device->reada_curr_zone = NULL;
379                 atomic_set(&device->reada_in_flight, 0);
380                 device->reada_next = 0;
381                 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
382                 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
383
384                 mutex_lock(&fs_devices->device_list_mutex);
385                 list_add_rcu(&device->dev_list, &fs_devices->devices);
386                 mutex_unlock(&fs_devices->device_list_mutex);
387
388                 device->fs_devices = fs_devices;
389                 fs_devices->num_devices++;
390         } else if (!device->name || strcmp(device->name, path)) {
391                 name = kstrdup(path, GFP_NOFS);
392                 if (!name)
393                         return -ENOMEM;
394                 kfree(device->name);
395                 device->name = name;
396                 if (device->missing) {
397                         fs_devices->missing_devices--;
398                         device->missing = 0;
399                 }
400         }
401
402         if (found_transid > fs_devices->latest_trans) {
403                 fs_devices->latest_devid = devid;
404                 fs_devices->latest_trans = found_transid;
405         }
406         *fs_devices_ret = fs_devices;
407         return 0;
408 }
409
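/*
 * make a private copy of @orig, duplicating every btrfs_device on its list.
 * Used on the seeding path so the original device list can stay attached to
 * the old fsid.
 */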
410 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
411 {
412         struct btrfs_fs_devices *fs_devices;
413         struct btrfs_device *device;
414         struct btrfs_device *orig_dev;
415
416         fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
417         if (!fs_devices)
418                 return ERR_PTR(-ENOMEM);
419
420         INIT_LIST_HEAD(&fs_devices->devices);
421         INIT_LIST_HEAD(&fs_devices->alloc_list);
422         INIT_LIST_HEAD(&fs_devices->list);
423         mutex_init(&fs_devices->device_list_mutex);
424         fs_devices->latest_devid = orig->latest_devid;
425         fs_devices->latest_trans = orig->latest_trans;
426         memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
427
428         /* We hold the volume lock, so it is safe to walk the devices. */
429         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
430                 device = kzalloc(sizeof(*device), GFP_NOFS);
431                 if (!device)
432                         goto error;
433
434                 device->name = kstrdup(orig_dev->name, GFP_NOFS);
435                 if (!device->name) {
436                         kfree(device);
437                         goto error;
438                 }
439
440                 device->devid = orig_dev->devid;
441                 device->work.func = pending_bios_fn;
442                 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
443                 spin_lock_init(&device->io_lock);
444                 INIT_LIST_HEAD(&device->dev_list);
445                 INIT_LIST_HEAD(&device->dev_alloc_list);
446
447                 list_add(&device->dev_list, &fs_devices->devices);
448                 device->fs_devices = fs_devices;
449                 fs_devices->num_devices++;
450         }
451         return fs_devices;
452 error:
453         free_fs_devices(fs_devices);
454         return ERR_PTR(-ENOMEM);
455 }
456
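/*
 * drop any scanned devices that are not referenced by the filesystem
 * metadata (in_fs_metadata not set): close their block devices and free the
 * btrfs_device structs.  Seed device lists are walked as well.
 */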
457 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
458 {
459         struct btrfs_device *device, *next;
460
461         mutex_lock(&uuid_mutex);
462 again:
463         /* This is the initialization path; it is safe to release the devices. */
464         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
465                 if (device->in_fs_metadata)
466                         continue;
467
468                 if (device->bdev) {
469                         blkdev_put(device->bdev, device->mode);
470                         device->bdev = NULL;
471                         fs_devices->open_devices--;
472                 }
473                 if (device->writeable) {
474                         list_del_init(&device->dev_alloc_list);
475                         device->writeable = 0;
476                         fs_devices->rw_devices--;
477                 }
478                 list_del_init(&device->dev_list);
479                 fs_devices->num_devices--;
480                 kfree(device->name);
481                 kfree(device);
482         }
483
484         if (fs_devices->seed) {
485                 fs_devices = fs_devices->seed;
486                 goto again;
487         }
488
489         mutex_unlock(&uuid_mutex);
490         return 0;
491 }
492
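/*
 * devices are freed after an RCU grace period: free_device() is the rcu
 * callback, and it punts the final blkdev_put()/kfree() to a work item
 * because those may sleep while an rcu callback may not.
 */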
493 static void __free_device(struct work_struct *work)
494 {
495         struct btrfs_device *device;
496
497         device = container_of(work, struct btrfs_device, rcu_work);
498
499         if (device->bdev)
500                 blkdev_put(device->bdev, device->mode);
501
502         kfree(device->name);
503         kfree(device);
504 }
505
506 static void free_device(struct rcu_head *head)
507 {
508         struct btrfs_device *device;
509
510         device = container_of(head, struct btrfs_device, rcu);
511
512         INIT_WORK(&device->rcu_work, __free_device);
513         schedule_work(&device->rcu_work);
514 }
515
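/*
 * drop one open reference on @fs_devices.  When the last reference goes
 * away every device is closed; each btrfs_device is replaced on the list by
 * a stripped-down copy and freed via call_rcu() so lockless readers of the
 * device list stay safe.
 */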
516 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
517 {
518         struct btrfs_device *device;
519
520         if (--fs_devices->opened > 0)
521                 return 0;
522
523         mutex_lock(&fs_devices->device_list_mutex);
524         list_for_each_entry(device, &fs_devices->devices, dev_list) {
525                 struct btrfs_device *new_device;
526
527                 if (device->bdev)
528                         fs_devices->open_devices--;
529
530                 if (device->writeable) {
531                         list_del_init(&device->dev_alloc_list);
532                         fs_devices->rw_devices--;
533                 }
534
535                 if (device->can_discard)
536                         fs_devices->num_can_discard--;
537
538                 new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
539                 BUG_ON(!new_device);
540                 memcpy(new_device, device, sizeof(*new_device));
541                 new_device->name = kstrdup(device->name, GFP_NOFS);
542                 BUG_ON(device->name && !new_device->name);
543                 new_device->bdev = NULL;
544                 new_device->writeable = 0;
545                 new_device->in_fs_metadata = 0;
546                 new_device->can_discard = 0;
547                 list_replace_rcu(&device->dev_list, &new_device->dev_list);
548
549                 call_rcu(&device->rcu, free_device);
550         }
551         mutex_unlock(&fs_devices->device_list_mutex);
552
553         WARN_ON(fs_devices->open_devices);
554         WARN_ON(fs_devices->rw_devices);
555         fs_devices->opened = 0;
556         fs_devices->seeding = 0;
557
558         return 0;
559 }
560
561 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
562 {
563         struct btrfs_fs_devices *seed_devices = NULL;
564         int ret;
565
566         mutex_lock(&uuid_mutex);
567         ret = __btrfs_close_devices(fs_devices);
568         if (!fs_devices->opened) {
569                 seed_devices = fs_devices->seed;
570                 fs_devices->seed = NULL;
571         }
572         mutex_unlock(&uuid_mutex);
573
574         while (seed_devices) {
575                 fs_devices = seed_devices;
576                 seed_devices = fs_devices->seed;
577                 __btrfs_close_devices(fs_devices);
578                 free_fs_devices(fs_devices);
579         }
580         return ret;
581 }
582
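/*
 * open every scanned device in @fs_devices exclusively, verify the super
 * block on each one and remember the device with the highest generation as
 * latest_bdev.  Devices that fail to open or don't match are skipped;
 * -EINVAL is returned only if no device could be opened at all.
 */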
583 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
584                                 fmode_t flags, void *holder)
585 {
586         struct request_queue *q;
587         struct block_device *bdev;
588         struct list_head *head = &fs_devices->devices;
589         struct btrfs_device *device;
590         struct block_device *latest_bdev = NULL;
591         struct buffer_head *bh;
592         struct btrfs_super_block *disk_super;
593         u64 latest_devid = 0;
594         u64 latest_transid = 0;
595         u64 devid;
596         int seeding = 1;
597         int ret = 0;
598
599         flags |= FMODE_EXCL;
600
601         list_for_each_entry(device, head, dev_list) {
602                 if (device->bdev)
603                         continue;
604                 if (!device->name)
605                         continue;
606
607                 bdev = blkdev_get_by_path(device->name, flags, holder);
608                 if (IS_ERR(bdev)) {
609                         printk(KERN_INFO "open %s failed\n", device->name);
610                         goto error;
611                 }
612                 set_blocksize(bdev, 4096);
613
614                 bh = btrfs_read_dev_super(bdev);
615                 if (!bh)
616                         goto error_close;
617
618                 disk_super = (struct btrfs_super_block *)bh->b_data;
619                 devid = btrfs_stack_device_id(&disk_super->dev_item);
620                 if (devid != device->devid)
621                         goto error_brelse;
622
623                 if (memcmp(device->uuid, disk_super->dev_item.uuid,
624                            BTRFS_UUID_SIZE))
625                         goto error_brelse;
626
627                 device->generation = btrfs_super_generation(disk_super);
628                 if (!latest_transid || device->generation > latest_transid) {
629                         latest_devid = devid;
630                         latest_transid = device->generation;
631                         latest_bdev = bdev;
632                 }
633
634                 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
635                         device->writeable = 0;
636                 } else {
637                         device->writeable = !bdev_read_only(bdev);
638                         seeding = 0;
639                 }
640
641                 q = bdev_get_queue(bdev);
642                 if (blk_queue_discard(q)) {
643                         device->can_discard = 1;
644                         fs_devices->num_can_discard++;
645                 }
646
647                 device->bdev = bdev;
648                 device->in_fs_metadata = 0;
649                 device->mode = flags;
650
651                 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
652                         fs_devices->rotating = 1;
653
654                 fs_devices->open_devices++;
655                 if (device->writeable) {
656                         fs_devices->rw_devices++;
657                         list_add(&device->dev_alloc_list,
658                                  &fs_devices->alloc_list);
659                 }
660                 brelse(bh);
661                 continue;
662
663 error_brelse:
664                 brelse(bh);
665 error_close:
666                 blkdev_put(bdev, flags);
667 error:
668                 continue;
669         }
670         if (fs_devices->open_devices == 0) {
671                 ret = -EINVAL;
672                 goto out;
673         }
674         fs_devices->seeding = seeding;
675         fs_devices->opened = 1;
676         fs_devices->latest_bdev = latest_bdev;
677         fs_devices->latest_devid = latest_devid;
678         fs_devices->latest_trans = latest_transid;
679         fs_devices->total_rw_bytes = 0;
680 out:
681         return ret;
682 }
683
684 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
685                        fmode_t flags, void *holder)
686 {
687         int ret;
688
689         mutex_lock(&uuid_mutex);
690         if (fs_devices->opened) {
691                 fs_devices->opened++;
692                 ret = 0;
693         } else {
694                 ret = __btrfs_open_devices(fs_devices, flags, holder);
695         }
696         mutex_unlock(&uuid_mutex);
697         return ret;
698 }
699
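/*
 * read the super block from @path and, if it looks like btrfs, register the
 * device with device_list_add() and report its label/fsid, devid and
 * generation in the kernel log.
 */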
700 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
701                           struct btrfs_fs_devices **fs_devices_ret)
702 {
703         struct btrfs_super_block *disk_super;
704         struct block_device *bdev;
705         struct buffer_head *bh;
706         int ret;
707         u64 devid;
708         u64 transid;
709
710         mutex_lock(&uuid_mutex);
711
712         flags |= FMODE_EXCL;
713         bdev = blkdev_get_by_path(path, flags, holder);
714
715         if (IS_ERR(bdev)) {
716                 ret = PTR_ERR(bdev);
717                 goto error;
718         }
719
720         ret = set_blocksize(bdev, 4096);
721         if (ret)
722                 goto error_close;
723         bh = btrfs_read_dev_super(bdev);
724         if (!bh) {
725                 ret = -EINVAL;
726                 goto error_close;
727         }
728         disk_super = (struct btrfs_super_block *)bh->b_data;
729         devid = btrfs_stack_device_id(&disk_super->dev_item);
730         transid = btrfs_super_generation(disk_super);
731         if (disk_super->label[0])
732                 printk(KERN_INFO "device label %s ", disk_super->label);
733         else
734                 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
735         printk(KERN_CONT "devid %llu transid %llu %s\n",
736                (unsigned long long)devid, (unsigned long long)transid, path);
737         ret = device_list_add(path, disk_super, devid, fs_devices_ret);
738
739         brelse(bh);
740 error_close:
741         blkdev_put(bdev, flags);
742 error:
743         mutex_unlock(&uuid_mutex);
744         return ret;
745 }
746
747 /* helper to account the used device space in the range */
748 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
749                                    u64 end, u64 *length)
750 {
751         struct btrfs_key key;
752         struct btrfs_root *root = device->dev_root;
753         struct btrfs_dev_extent *dev_extent;
754         struct btrfs_path *path;
755         u64 extent_end;
756         int ret;
757         int slot;
758         struct extent_buffer *l;
759
760         *length = 0;
761
762         if (start >= device->total_bytes)
763                 return 0;
764
765         path = btrfs_alloc_path();
766         if (!path)
767                 return -ENOMEM;
768         path->reada = 2;
769
770         key.objectid = device->devid;
771         key.offset = start;
772         key.type = BTRFS_DEV_EXTENT_KEY;
773
774         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
775         if (ret < 0)
776                 goto out;
777         if (ret > 0) {
778                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
779                 if (ret < 0)
780                         goto out;
781         }
782
783         while (1) {
784                 l = path->nodes[0];
785                 slot = path->slots[0];
786                 if (slot >= btrfs_header_nritems(l)) {
787                         ret = btrfs_next_leaf(root, path);
788                         if (ret == 0)
789                                 continue;
790                         if (ret < 0)
791                                 goto out;
792
793                         break;
794                 }
795                 btrfs_item_key_to_cpu(l, &key, slot);
796
797                 if (key.objectid < device->devid)
798                         goto next;
799
800                 if (key.objectid > device->devid)
801                         break;
802
803                 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
804                         goto next;
805
806                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
807                 extent_end = key.offset + btrfs_dev_extent_length(l,
808                                                                   dev_extent);
809                 if (key.offset <= start && extent_end > end) {
810                         *length = end - start + 1;
811                         break;
812                 } else if (key.offset <= start && extent_end > start)
813                         *length += extent_end - start;
814                 else if (key.offset > start && extent_end <= end)
815                         *length += extent_end - key.offset;
816                 else if (key.offset > start && key.offset <= end) {
817                         *length += end - key.offset + 1;
818                         break;
819                 } else if (key.offset > end)
820                         break;
821
822 next:
823                 path->slots[0]++;
824         }
825         ret = 0;
826 out:
827         btrfs_free_path(path);
828         return ret;
829 }
830
831 /*
832  * find_free_dev_extent - find free space in the specified device
833  * @device:     the device which we search the free space in
834  * @num_bytes:  the size of the free space that we need
835  * @start:      store the start of the free space.
836  * @len:        the size of the free space that we find, or the size of the max
837  *              free space if we don't find suitable free space
838  *
839  * this uses a pretty simple search, the expectation is that it is
840  * called very infrequently and that a given device has a small number
841  * of extents
842  *
843  * @start is used to store the start of the free space if we find it. But if we
844  * don't find suitable free space, it will be used to store the start position
845  * of the max free space.
846  *
847  * @len is used to store the size of the free space that we find.
848  * But if we don't find suitable free space, it is used to store the size of
849  * the max free space.
850  */
851 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
852                          u64 *start, u64 *len)
853 {
854         struct btrfs_key key;
855         struct btrfs_root *root = device->dev_root;
856         struct btrfs_dev_extent *dev_extent;
857         struct btrfs_path *path;
858         u64 hole_size;
859         u64 max_hole_start;
860         u64 max_hole_size;
861         u64 extent_end;
862         u64 search_start;
863         u64 search_end = device->total_bytes;
864         int ret;
865         int slot;
866         struct extent_buffer *l;
867
868         /* FIXME use last free of some kind */
869
870         /* we don't want to overwrite the superblock on the drive,
871          * so we make sure to start at an offset of at least 1MB
872          */
873         search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
874
875         max_hole_start = search_start;
876         max_hole_size = 0;
877         hole_size = 0;
878
879         if (search_start >= search_end) {
880                 ret = -ENOSPC;
881                 goto error;
882         }
883
884         path = btrfs_alloc_path();
885         if (!path) {
886                 ret = -ENOMEM;
887                 goto error;
888         }
889         path->reada = 2;
890
891         key.objectid = device->devid;
892         key.offset = search_start;
893         key.type = BTRFS_DEV_EXTENT_KEY;
894
895         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
896         if (ret < 0)
897                 goto out;
898         if (ret > 0) {
899                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
900                 if (ret < 0)
901                         goto out;
902         }
903
904         while (1) {
905                 l = path->nodes[0];
906                 slot = path->slots[0];
907                 if (slot >= btrfs_header_nritems(l)) {
908                         ret = btrfs_next_leaf(root, path);
909                         if (ret == 0)
910                                 continue;
911                         if (ret < 0)
912                                 goto out;
913
914                         break;
915                 }
916                 btrfs_item_key_to_cpu(l, &key, slot);
917
918                 if (key.objectid < device->devid)
919                         goto next;
920
921                 if (key.objectid > device->devid)
922                         break;
923
924                 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
925                         goto next;
926
927                 if (key.offset > search_start) {
928                         hole_size = key.offset - search_start;
929
930                         if (hole_size > max_hole_size) {
931                                 max_hole_start = search_start;
932                                 max_hole_size = hole_size;
933                         }
934
935                         /*
936                          * If this free space is greater than what we need,
937                          * it must be the max free space that we have found
938                          * until now, so max_hole_start must point to the start
939                          * of this free space and the length of this free space
940                          * is stored in max_hole_size. Thus, we return
941                          * max_hole_start and max_hole_size and go back to the
942                          * caller.
943                          */
944                         if (hole_size >= num_bytes) {
945                                 ret = 0;
946                                 goto out;
947                         }
948                 }
949
950                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
951                 extent_end = key.offset + btrfs_dev_extent_length(l,
952                                                                   dev_extent);
953                 if (extent_end > search_start)
954                         search_start = extent_end;
955 next:
956                 path->slots[0]++;
957                 cond_resched();
958         }
959
960         /*
961          * At this point, search_start should be the end of
962          * allocated dev extents, and when shrinking the device,
963          * search_end may be smaller than search_start.
964          */
965         if (search_end > search_start)
966                 hole_size = search_end - search_start;
967
968         if (hole_size > max_hole_size) {
969                 max_hole_start = search_start;
970                 max_hole_size = hole_size;
971         }
972
973         /* See above. */
974         if (hole_size < num_bytes)
975                 ret = -ENOSPC;
976         else
977                 ret = 0;
978
979 out:
980         btrfs_free_path(path);
981 error:
982         *start = max_hole_start;
983         if (len)
984                 *len = max_hole_size;
985         return ret;
986 }
987
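/*
 * remove the dev extent item that covers @start on @device and adjust
 * device->bytes_used and the global free_chunk_space accounting by the
 * freed length.
 */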
988 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
989                           struct btrfs_device *device,
990                           u64 start)
991 {
992         int ret;
993         struct btrfs_path *path;
994         struct btrfs_root *root = device->dev_root;
995         struct btrfs_key key;
996         struct btrfs_key found_key;
997         struct extent_buffer *leaf = NULL;
998         struct btrfs_dev_extent *extent = NULL;
999
1000         path = btrfs_alloc_path();
1001         if (!path)
1002                 return -ENOMEM;
1003
1004         key.objectid = device->devid;
1005         key.offset = start;
1006         key.type = BTRFS_DEV_EXTENT_KEY;
1007 again:
1008         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1009         if (ret > 0) {
1010                 ret = btrfs_previous_item(root, path, key.objectid,
1011                                           BTRFS_DEV_EXTENT_KEY);
1012                 if (ret)
1013                         goto out;
1014                 leaf = path->nodes[0];
1015                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1016                 extent = btrfs_item_ptr(leaf, path->slots[0],
1017                                         struct btrfs_dev_extent);
1018                 BUG_ON(found_key.offset > start || found_key.offset +
1019                        btrfs_dev_extent_length(leaf, extent) < start);
1020                 key = found_key;
1021                 btrfs_release_path(path);
1022                 goto again;
1023         } else if (ret == 0) {
1024                 leaf = path->nodes[0];
1025                 extent = btrfs_item_ptr(leaf, path->slots[0],
1026                                         struct btrfs_dev_extent);
1027         }
1028         BUG_ON(ret);
1029
1030         if (device->bytes_used > 0) {
1031                 u64 len = btrfs_dev_extent_length(leaf, extent);
1032                 device->bytes_used -= len;
1033                 spin_lock(&root->fs_info->free_chunk_lock);
1034                 root->fs_info->free_chunk_space += len;
1035                 spin_unlock(&root->fs_info->free_chunk_lock);
1036         }
1037         ret = btrfs_del_item(trans, root, path);
1038
1039 out:
1040         btrfs_free_path(path);
1041         return ret;
1042 }
1043
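/*
 * insert a dev extent item recording that the byte range starting at @start
 * of length @num_bytes on @device belongs to the chunk identified by
 * @chunk_tree/@chunk_objectid/@chunk_offset.
 */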
1044 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1045                            struct btrfs_device *device,
1046                            u64 chunk_tree, u64 chunk_objectid,
1047                            u64 chunk_offset, u64 start, u64 num_bytes)
1048 {
1049         int ret;
1050         struct btrfs_path *path;
1051         struct btrfs_root *root = device->dev_root;
1052         struct btrfs_dev_extent *extent;
1053         struct extent_buffer *leaf;
1054         struct btrfs_key key;
1055
1056         WARN_ON(!device->in_fs_metadata);
1057         path = btrfs_alloc_path();
1058         if (!path)
1059                 return -ENOMEM;
1060
1061         key.objectid = device->devid;
1062         key.offset = start;
1063         key.type = BTRFS_DEV_EXTENT_KEY;
1064         ret = btrfs_insert_empty_item(trans, root, path, &key,
1065                                       sizeof(*extent));
1066         BUG_ON(ret);
1067
1068         leaf = path->nodes[0];
1069         extent = btrfs_item_ptr(leaf, path->slots[0],
1070                                 struct btrfs_dev_extent);
1071         btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
1072         btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
1073         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1074
1075         write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
1076                     (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
1077                     BTRFS_UUID_SIZE);
1078
1079         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1080         btrfs_mark_buffer_dirty(leaf);
1081         btrfs_free_path(path);
1082         return ret;
1083 }
1084
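/*
 * return in *offset the logical address just past the last chunk item for
 * @objectid, i.e. where the next chunk may be placed (0 if none exist yet).
 */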
1085 static noinline int find_next_chunk(struct btrfs_root *root,
1086                                     u64 objectid, u64 *offset)
1087 {
1088         struct btrfs_path *path;
1089         int ret;
1090         struct btrfs_key key;
1091         struct btrfs_chunk *chunk;
1092         struct btrfs_key found_key;
1093
1094         path = btrfs_alloc_path();
1095         if (!path)
1096                 return -ENOMEM;
1097
1098         key.objectid = objectid;
1099         key.offset = (u64)-1;
1100         key.type = BTRFS_CHUNK_ITEM_KEY;
1101
1102         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1103         if (ret < 0)
1104                 goto error;
1105
1106         BUG_ON(ret == 0);
1107
1108         ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
1109         if (ret) {
1110                 *offset = 0;
1111         } else {
1112                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1113                                       path->slots[0]);
1114                 if (found_key.objectid != objectid)
1115                         *offset = 0;
1116                 else {
1117                         chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
1118                                                struct btrfs_chunk);
1119                         *offset = found_key.offset +
1120                                 btrfs_chunk_length(path->nodes[0], chunk);
1121                 }
1122         }
1123         ret = 0;
1124 error:
1125         btrfs_free_path(path);
1126         return ret;
1127 }
1128
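/*
 * return the next available device id: one past the highest devid recorded
 * in the chunk tree, or 1 if there are no device items yet.
 */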
1129 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
1130 {
1131         int ret;
1132         struct btrfs_key key;
1133         struct btrfs_key found_key;
1134         struct btrfs_path *path;
1135
1136         root = root->fs_info->chunk_root;
1137
1138         path = btrfs_alloc_path();
1139         if (!path)
1140                 return -ENOMEM;
1141
1142         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1143         key.type = BTRFS_DEV_ITEM_KEY;
1144         key.offset = (u64)-1;
1145
1146         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1147         if (ret < 0)
1148                 goto error;
1149
1150         BUG_ON(ret == 0);
1151
1152         ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
1153                                   BTRFS_DEV_ITEM_KEY);
1154         if (ret) {
1155                 *objectid = 1;
1156         } else {
1157                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1158                                       path->slots[0]);
1159                 *objectid = found_key.offset + 1;
1160         }
1161         ret = 0;
1162 error:
1163         btrfs_free_path(path);
1164         return ret;
1165 }
1166
1167 /*
1168  * the device information is stored in the chunk root;
1169  * the btrfs_device struct should be fully filled in
1170  */
1171 int btrfs_add_device(struct btrfs_trans_handle *trans,
1172                      struct btrfs_root *root,
1173                      struct btrfs_device *device)
1174 {
1175         int ret;
1176         struct btrfs_path *path;
1177         struct btrfs_dev_item *dev_item;
1178         struct extent_buffer *leaf;
1179         struct btrfs_key key;
1180         unsigned long ptr;
1181
1182         root = root->fs_info->chunk_root;
1183
1184         path = btrfs_alloc_path();
1185         if (!path)
1186                 return -ENOMEM;
1187
1188         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1189         key.type = BTRFS_DEV_ITEM_KEY;
1190         key.offset = device->devid;
1191
1192         ret = btrfs_insert_empty_item(trans, root, path, &key,
1193                                       sizeof(*dev_item));
1194         if (ret)
1195                 goto out;
1196
1197         leaf = path->nodes[0];
1198         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1199
1200         btrfs_set_device_id(leaf, dev_item, device->devid);
1201         btrfs_set_device_generation(leaf, dev_item, 0);
1202         btrfs_set_device_type(leaf, dev_item, device->type);
1203         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1204         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1205         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1206         btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1207         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1208         btrfs_set_device_group(leaf, dev_item, 0);
1209         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1210         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1211         btrfs_set_device_start_offset(leaf, dev_item, 0);
1212
1213         ptr = (unsigned long)btrfs_device_uuid(dev_item);
1214         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1215         ptr = (unsigned long)btrfs_device_fsid(dev_item);
1216         write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
1217         btrfs_mark_buffer_dirty(leaf);
1218
1219         ret = 0;
1220 out:
1221         btrfs_free_path(path);
1222         return ret;
1223 }
1224
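/*
 * delete the dev item for @device from the chunk tree inside its own
 * transaction; this is the on-disk half of device removal.
 */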
1225 static int btrfs_rm_dev_item(struct btrfs_root *root,
1226                              struct btrfs_device *device)
1227 {
1228         int ret;
1229         struct btrfs_path *path;
1230         struct btrfs_key key;
1231         struct btrfs_trans_handle *trans;
1232
1233         root = root->fs_info->chunk_root;
1234
1235         path = btrfs_alloc_path();
1236         if (!path)
1237                 return -ENOMEM;
1238
1239         trans = btrfs_start_transaction(root, 0);
1240         if (IS_ERR(trans)) {
1241                 btrfs_free_path(path);
1242                 return PTR_ERR(trans);
1243         }
1244         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1245         key.type = BTRFS_DEV_ITEM_KEY;
1246         key.offset = device->devid;
1247         lock_chunks(root);
1248
1249         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1250         if (ret < 0)
1251                 goto out;
1252
1253         if (ret > 0) {
1254                 ret = -ENOENT;
1255                 goto out;
1256         }
1257
1258         ret = btrfs_del_item(trans, root, path);
1259         if (ret)
1260                 goto out;
1261 out:
1262         btrfs_free_path(path);
1263         unlock_chunks(root);
1264         btrfs_commit_transaction(trans, root);
1265         return ret;
1266 }
1267
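/*
 * remove a device from a mounted filesystem: migrate its data away with
 * btrfs_shrink_device(), delete its dev item, drop it from the in-memory
 * lists and, if the device was writable, wipe the btrfs magic from its
 * super block so it is no longer detected as part of the FS.
 */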
1268 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1269 {
1270         struct btrfs_device *device;
1271         struct btrfs_device *next_device;
1272         struct block_device *bdev;
1273         struct buffer_head *bh = NULL;
1274         struct btrfs_super_block *disk_super;
1275         struct btrfs_fs_devices *cur_devices;
1276         u64 all_avail;
1277         u64 devid;
1278         u64 num_devices;
1279         u8 *dev_uuid;
1280         int ret = 0;
1281         bool clear_super = false;
1282
1283         mutex_lock(&uuid_mutex);
1284
1285         all_avail = root->fs_info->avail_data_alloc_bits |
1286                 root->fs_info->avail_system_alloc_bits |
1287                 root->fs_info->avail_metadata_alloc_bits;
1288
1289         if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
1290             root->fs_info->fs_devices->num_devices <= 4) {
1291                 printk(KERN_ERR "btrfs: unable to go below four devices "
1292                        "on raid10\n");
1293                 ret = -EINVAL;
1294                 goto out;
1295         }
1296
1297         if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
1298             root->fs_info->fs_devices->num_devices <= 2) {
1299                 printk(KERN_ERR "btrfs: unable to go below two "
1300                        "devices on raid1\n");
1301                 ret = -EINVAL;
1302                 goto out;
1303         }
1304
1305         if (strcmp(device_path, "missing") == 0) {
1306                 struct list_head *devices;
1307                 struct btrfs_device *tmp;
1308
1309                 device = NULL;
1310                 devices = &root->fs_info->fs_devices->devices;
1311                 /*
1312                  * It is safe to read the devices since the volume_mutex
1313                  * is held.
1314                  */
1315                 list_for_each_entry(tmp, devices, dev_list) {
1316                         if (tmp->in_fs_metadata && !tmp->bdev) {
1317                                 device = tmp;
1318                                 break;
1319                         }
1320                 }
1321                 bdev = NULL;
1322                 bh = NULL;
1323                 disk_super = NULL;
1324                 if (!device) {
1325                         printk(KERN_ERR "btrfs: no missing devices found to "
1326                                "remove\n");
1327                         goto out;
1328                 }
1329         } else {
1330                 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1331                                           root->fs_info->bdev_holder);
1332                 if (IS_ERR(bdev)) {
1333                         ret = PTR_ERR(bdev);
1334                         goto out;
1335                 }
1336
1337                 set_blocksize(bdev, 4096);
1338                 bh = btrfs_read_dev_super(bdev);
1339                 if (!bh) {
1340                         ret = -EINVAL;
1341                         goto error_close;
1342                 }
1343                 disk_super = (struct btrfs_super_block *)bh->b_data;
1344                 devid = btrfs_stack_device_id(&disk_super->dev_item);
1345                 dev_uuid = disk_super->dev_item.uuid;
1346                 device = btrfs_find_device(root, devid, dev_uuid,
1347                                            disk_super->fsid);
1348                 if (!device) {
1349                         ret = -ENOENT;
1350                         goto error_brelse;
1351                 }
1352         }
1353
1354         if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1355                 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1356                        "device\n");
1357                 ret = -EINVAL;
1358                 goto error_brelse;
1359         }
1360
1361         if (device->writeable) {
1362                 lock_chunks(root);
1363                 list_del_init(&device->dev_alloc_list);
1364                 unlock_chunks(root);
1365                 root->fs_info->fs_devices->rw_devices--;
1366                 clear_super = true;
1367         }
1368
1369         ret = btrfs_shrink_device(device, 0);
1370         if (ret)
1371                 goto error_undo;
1372
1373         ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1374         if (ret)
1375                 goto error_undo;
1376
1377         spin_lock(&root->fs_info->free_chunk_lock);
1378         root->fs_info->free_chunk_space = device->total_bytes -
1379                 device->bytes_used;
1380         spin_unlock(&root->fs_info->free_chunk_lock);
1381
1382         device->in_fs_metadata = 0;
1383         btrfs_scrub_cancel_dev(root, device);
1384
1385         /*
1386          * the device list mutex makes sure that we don't change
1387          * the device list while someone else is writing out all
1388          * the device supers.
1389          */
1390
1391         cur_devices = device->fs_devices;
1392         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1393         list_del_rcu(&device->dev_list);
1394
1395         device->fs_devices->num_devices--;
1396
1397         if (device->missing)
1398                 root->fs_info->fs_devices->missing_devices--;
1399
1400         next_device = list_entry(root->fs_info->fs_devices->devices.next,
1401                                  struct btrfs_device, dev_list);
1402         if (device->bdev == root->fs_info->sb->s_bdev)
1403                 root->fs_info->sb->s_bdev = next_device->bdev;
1404         if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1405                 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1406
1407         if (device->bdev)
1408                 device->fs_devices->open_devices--;
1409
1410         call_rcu(&device->rcu, free_device);
1411         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1412
1413         num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1414         btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1415
1416         if (cur_devices->open_devices == 0) {
1417                 struct btrfs_fs_devices *fs_devices;
1418                 fs_devices = root->fs_info->fs_devices;
1419                 while (fs_devices) {
1420                         if (fs_devices->seed == cur_devices)
1421                                 break;
1422                         fs_devices = fs_devices->seed;
1423                 }
1424                 fs_devices->seed = cur_devices->seed;
1425                 cur_devices->seed = NULL;
1426                 lock_chunks(root);
1427                 __btrfs_close_devices(cur_devices);
1428                 unlock_chunks(root);
1429                 free_fs_devices(cur_devices);
1430         }
1431
1432         /*
1433          * at this point, the device is zero sized.  We want to
1434          * remove it from the devices list and zero out the old super
1435          */
1436         if (clear_super) {
1437                 /* make sure this device isn't detected as part of
1438                  * the FS anymore
1439                  */
1440                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1441                 set_buffer_dirty(bh);
1442                 sync_dirty_buffer(bh);
1443         }
1444
1445         ret = 0;
1446
1447 error_brelse:
1448         brelse(bh);
1449 error_close:
1450         if (bdev)
1451                 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1452 out:
1453         mutex_unlock(&uuid_mutex);
1454         return ret;
1455 error_undo:
1456         if (device->writeable) {
1457                 lock_chunks(root);
1458                 list_add(&device->dev_alloc_list,
1459                          &root->fs_info->fs_devices->alloc_list);
1460                 unlock_chunks(root);
1461                 root->fs_info->fs_devices->rw_devices++;
1462         }
1463         goto error_brelse;
1464 }
1465
1466 /*
1467  * does all the dirty work required for changing the file system's UUID.
1468  */
1469 static int btrfs_prepare_sprout(struct btrfs_root *root)
1470 {
1471         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1472         struct btrfs_fs_devices *old_devices;
1473         struct btrfs_fs_devices *seed_devices;
1474         struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1475         struct btrfs_device *device;
1476         u64 super_flags;
1477
1478         BUG_ON(!mutex_is_locked(&uuid_mutex));
1479         if (!fs_devices->seeding)
1480                 return -EINVAL;
1481
1482         seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1483         if (!seed_devices)
1484                 return -ENOMEM;
1485
1486         old_devices = clone_fs_devices(fs_devices);
1487         if (IS_ERR(old_devices)) {
1488                 kfree(seed_devices);
1489                 return PTR_ERR(old_devices);
1490         }
1491
1492         list_add(&old_devices->list, &fs_uuids);
1493
1494         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1495         seed_devices->opened = 1;
1496         INIT_LIST_HEAD(&seed_devices->devices);
1497         INIT_LIST_HEAD(&seed_devices->alloc_list);
1498         mutex_init(&seed_devices->device_list_mutex);
1499
1500         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1501         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1502                               synchronize_rcu);
1503         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1504
1505         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1506         list_for_each_entry(device, &seed_devices->devices, dev_list) {
1507                 device->fs_devices = seed_devices;
1508         }
1509
1510         fs_devices->seeding = 0;
1511         fs_devices->num_devices = 0;
1512         fs_devices->open_devices = 0;
1513         fs_devices->seed = seed_devices;
1514
1515         generate_random_uuid(fs_devices->fsid);
1516         memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1517         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1518         super_flags = btrfs_super_flags(disk_super) &
1519                       ~BTRFS_SUPER_FLAG_SEEDING;
1520         btrfs_set_super_flags(disk_super, super_flags);
1521
1522         return 0;
1523 }
1524
1525 /*
1526  * store the expected generation for seed devices in device items.
1527  */
1528 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1529                                struct btrfs_root *root)
1530 {
1531         struct btrfs_path *path;
1532         struct extent_buffer *leaf;
1533         struct btrfs_dev_item *dev_item;
1534         struct btrfs_device *device;
1535         struct btrfs_key key;
1536         u8 fs_uuid[BTRFS_UUID_SIZE];
1537         u8 dev_uuid[BTRFS_UUID_SIZE];
1538         u64 devid;
1539         int ret;
1540
1541         path = btrfs_alloc_path();
1542         if (!path)
1543                 return -ENOMEM;
1544
1545         root = root->fs_info->chunk_root;
1546         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1547         key.offset = 0;
1548         key.type = BTRFS_DEV_ITEM_KEY;
1549
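             /*
              * walk every DEV_ITEM in the chunk tree and, for devices
              * that now belong to a seeding fs_devices, write the
              * in-memory device->generation into the on-disk item
              */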
1550         while (1) {
1551                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1552                 if (ret < 0)
1553                         goto error;
1554
1555                 leaf = path->nodes[0];
1556 next_slot:
1557                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1558                         ret = btrfs_next_leaf(root, path);
1559                         if (ret > 0)
1560                                 break;
1561                         if (ret < 0)
1562                                 goto error;
1563                         leaf = path->nodes[0];
1564                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1565                         btrfs_release_path(path);
1566                         continue;
1567                 }
1568
1569                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1570                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1571                     key.type != BTRFS_DEV_ITEM_KEY)
1572                         break;
1573
1574                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1575                                           struct btrfs_dev_item);
1576                 devid = btrfs_device_id(leaf, dev_item);
1577                 read_extent_buffer(leaf, dev_uuid,
1578                                    (unsigned long)btrfs_device_uuid(dev_item),
1579                                    BTRFS_UUID_SIZE);
1580                 read_extent_buffer(leaf, fs_uuid,
1581                                    (unsigned long)btrfs_device_fsid(dev_item),
1582                                    BTRFS_UUID_SIZE);
1583                 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1584                 BUG_ON(!device);
1585
1586                 if (device->fs_devices->seeding) {
1587                         btrfs_set_device_generation(leaf, dev_item,
1588                                                     device->generation);
1589                         btrfs_mark_buffer_dirty(leaf);
1590                 }
1591
1592                 path->slots[0]++;
1593                 goto next_slot;
1594         }
1595         ret = 0;
1596 error:
1597         btrfs_free_path(path);
1598         return ret;
1599 }
1600
1601 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1602 {
1603         struct request_queue *q;
1604         struct btrfs_trans_handle *trans;
1605         struct btrfs_device *device;
1606         struct block_device *bdev;
1607         struct list_head *devices;
1608         struct super_block *sb = root->fs_info->sb;
1609         u64 total_bytes;
1610         int seeding_dev = 0;
1611         int ret = 0;
1612
1613         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1614                 return -EINVAL;
1615
1616         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1617                                   root->fs_info->bdev_holder);
1618         if (IS_ERR(bdev))
1619                 return PTR_ERR(bdev);
1620
1621         if (root->fs_info->fs_devices->seeding) {
1622                 seeding_dev = 1;
1623                 down_write(&sb->s_umount);
1624                 mutex_lock(&uuid_mutex);
1625         }
1626
1627         filemap_write_and_wait(bdev->bd_inode->i_mapping);
1628
1629         devices = &root->fs_info->fs_devices->devices;
1630         /*
1631          * we have the volume lock, so we don't need the extra
1632          * device list mutex while reading the list here.
1633          */
1634         list_for_each_entry(device, devices, dev_list) {
1635                 if (device->bdev == bdev) {
1636                         ret = -EEXIST;
1637                         goto error;
1638                 }
1639         }
1640
1641         device = kzalloc(sizeof(*device), GFP_NOFS);
1642         if (!device) {
1643                 /* we can safely leave the fs_devices entry around */
1644                 ret = -ENOMEM;
1645                 goto error;
1646         }
1647
1648         device->name = kstrdup(device_path, GFP_NOFS);
1649         if (!device->name) {
1650                 kfree(device);
1651                 ret = -ENOMEM;
1652                 goto error;
1653         }
1654
1655         ret = find_next_devid(root, &device->devid);
1656         if (ret) {
1657                 kfree(device->name);
1658                 kfree(device);
1659                 goto error;
1660         }
1661
1662         trans = btrfs_start_transaction(root, 0);
1663         if (IS_ERR(trans)) {
1664                 kfree(device->name);
1665                 kfree(device);
1666                 ret = PTR_ERR(trans);
1667                 goto error;
1668         }
1669
1670         lock_chunks(root);
1671
1672         q = bdev_get_queue(bdev);
1673         if (blk_queue_discard(q))
1674                 device->can_discard = 1;
1675         device->writeable = 1;
1676         device->work.func = pending_bios_fn;
1677         generate_random_uuid(device->uuid);
1678         spin_lock_init(&device->io_lock);
1679         device->generation = trans->transid;
1680         device->io_width = root->sectorsize;
1681         device->io_align = root->sectorsize;
1682         device->sector_size = root->sectorsize;
1683         device->total_bytes = i_size_read(bdev->bd_inode);
1684         device->disk_total_bytes = device->total_bytes;
1685         device->dev_root = root->fs_info->dev_root;
1686         device->bdev = bdev;
1687         device->in_fs_metadata = 1;
1688         device->mode = FMODE_EXCL;
1689         set_blocksize(device->bdev, 4096);
1690
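             /*
              * adding a writeable device to a seed filesystem turns it
              * into a sprout: flip the sb read-write and give the fs a
              * fresh fsid before the new device goes onto the lists
              */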
1691         if (seeding_dev) {
1692                 sb->s_flags &= ~MS_RDONLY;
1693                 ret = btrfs_prepare_sprout(root);
1694                 BUG_ON(ret);
1695         }
1696
1697         device->fs_devices = root->fs_info->fs_devices;
1698
1699         /*
1700          * we don't want write_supers to jump in here with our device
1701          * half setup
1702          */
1703         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1704         list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1705         list_add(&device->dev_alloc_list,
1706                  &root->fs_info->fs_devices->alloc_list);
1707         root->fs_info->fs_devices->num_devices++;
1708         root->fs_info->fs_devices->open_devices++;
1709         root->fs_info->fs_devices->rw_devices++;
1710         if (device->can_discard)
1711                 root->fs_info->fs_devices->num_can_discard++;
1712         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1713
1714         spin_lock(&root->fs_info->free_chunk_lock);
1715         root->fs_info->free_chunk_space += device->total_bytes;
1716         spin_unlock(&root->fs_info->free_chunk_lock);
1717
1718         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1719                 root->fs_info->fs_devices->rotating = 1;
1720
1721         total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1722         btrfs_set_super_total_bytes(root->fs_info->super_copy,
1723                                     total_bytes + device->total_bytes);
1724
1725         total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1726         btrfs_set_super_num_devices(root->fs_info->super_copy,
1727                                     total_bytes + 1);
1728         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1729
1730         if (seeding_dev) {
1731                 ret = init_first_rw_device(trans, root, device);
1732                 BUG_ON(ret);
1733                 ret = btrfs_finish_sprout(trans, root);
1734                 BUG_ON(ret);
1735         } else {
1736                 ret = btrfs_add_device(trans, root, device);
1737         }
1738
1739         /*
1740          * we've got more storage, clear any full flags on the space
1741          * infos
1742          */
1743         btrfs_clear_space_info_full(root->fs_info);
1744
1745         unlock_chunks(root);
1746         btrfs_commit_transaction(trans, root);
1747
1748         if (seeding_dev) {
1749                 mutex_unlock(&uuid_mutex);
1750                 up_write(&sb->s_umount);
1751
1752                 ret = btrfs_relocate_sys_chunks(root);
1753                 BUG_ON(ret);
1754         }
1755
1756         return ret;
1757 error:
1758         blkdev_put(bdev, FMODE_EXCL);
1759         if (seeding_dev) {
1760                 mutex_unlock(&uuid_mutex);
1761                 up_write(&sb->s_umount);
1762         }
1763         return ret;
1764 }
1765
1766 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1767                                         struct btrfs_device *device)
1768 {
1769         int ret;
1770         struct btrfs_path *path;
1771         struct btrfs_root *root;
1772         struct btrfs_dev_item *dev_item;
1773         struct extent_buffer *leaf;
1774         struct btrfs_key key;
1775
1776         root = device->dev_root->fs_info->chunk_root;
1777
1778         path = btrfs_alloc_path();
1779         if (!path)
1780                 return -ENOMEM;
1781
1782         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1783         key.type = BTRFS_DEV_ITEM_KEY;
1784         key.offset = device->devid;
1785
1786         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1787         if (ret < 0)
1788                 goto out;
1789
1790         if (ret > 0) {
1791                 ret = -ENOENT;
1792                 goto out;
1793         }
1794
1795         leaf = path->nodes[0];
1796         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1797
1798         btrfs_set_device_id(leaf, dev_item, device->devid);
1799         btrfs_set_device_type(leaf, dev_item, device->type);
1800         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1801         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1802         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1803         btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1804         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1805         btrfs_mark_buffer_dirty(leaf);
1806
1807 out:
1808         btrfs_free_path(path);
1809         return ret;
1810 }
1811
1812 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1813                       struct btrfs_device *device, u64 new_size)
1814 {
1815         struct btrfs_super_block *super_copy =
1816                 device->dev_root->fs_info->super_copy;
1817         u64 old_total = btrfs_super_total_bytes(super_copy);
1818         u64 diff = new_size - device->total_bytes;
1819
1820         if (!device->writeable)
1821                 return -EACCES;
1822         if (new_size <= device->total_bytes)
1823                 return -EINVAL;
1824
1825         btrfs_set_super_total_bytes(super_copy, old_total + diff);
1826         device->fs_devices->total_rw_bytes += diff;
1827
1828         device->total_bytes = new_size;
1829         device->disk_total_bytes = new_size;
1830         btrfs_clear_space_info_full(device->dev_root->fs_info);
1831
1832         return btrfs_update_device(trans, device);
1833 }
1834
1835 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1836                       struct btrfs_device *device, u64 new_size)
1837 {
1838         int ret;
1839         lock_chunks(device->dev_root);
1840         ret = __btrfs_grow_device(trans, device, new_size);
1841         unlock_chunks(device->dev_root);
1842         return ret;
1843 }
1844
1845 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1846                             struct btrfs_root *root,
1847                             u64 chunk_tree, u64 chunk_objectid,
1848                             u64 chunk_offset)
1849 {
1850         int ret;
1851         struct btrfs_path *path;
1852         struct btrfs_key key;
1853
1854         root = root->fs_info->chunk_root;
1855         path = btrfs_alloc_path();
1856         if (!path)
1857                 return -ENOMEM;
1858
1859         key.objectid = chunk_objectid;
1860         key.offset = chunk_offset;
1861         key.type = BTRFS_CHUNK_ITEM_KEY;
1862
1863         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1864         BUG_ON(ret);
1865
1866         ret = btrfs_del_item(trans, root, path);
1867
1868         btrfs_free_path(path);
1869         return ret;
1870 }
1871
1872 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1873                         chunk_offset)
1874 {
1875         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1876         struct btrfs_disk_key *disk_key;
1877         struct btrfs_chunk *chunk;
1878         u8 *ptr;
1879         int ret = 0;
1880         u32 num_stripes;
1881         u32 array_size;
1882         u32 len = 0;
1883         u32 cur;
1884         struct btrfs_key key;
1885
1886         array_size = btrfs_super_sys_array_size(super_copy);
1887
1888         ptr = super_copy->sys_chunk_array;
1889         cur = 0;
1890
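             /*
              * the sys_chunk_array in the super block is a packed run of
              * (disk key, chunk item) pairs; scan it and, on a match,
              * memmove the tail down over the deleted entry and shrink
              * the recorded array size
              */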
1891         while (cur < array_size) {
1892                 disk_key = (struct btrfs_disk_key *)ptr;
1893                 btrfs_disk_key_to_cpu(&key, disk_key);
1894
1895                 len = sizeof(*disk_key);
1896
1897                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1898                         chunk = (struct btrfs_chunk *)(ptr + len);
1899                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1900                         len += btrfs_chunk_item_size(num_stripes);
1901                 } else {
1902                         ret = -EIO;
1903                         break;
1904                 }
1905                 if (key.objectid == chunk_objectid &&
1906                     key.offset == chunk_offset) {
1907                         memmove(ptr, ptr + len, array_size - (cur + len));
1908                         array_size -= len;
1909                         btrfs_set_super_sys_array_size(super_copy, array_size);
1910                 } else {
1911                         ptr += len;
1912                         cur += len;
1913                 }
1914         }
1915         return ret;
1916 }
1917
1918 static int btrfs_relocate_chunk(struct btrfs_root *root,
1919                          u64 chunk_tree, u64 chunk_objectid,
1920                          u64 chunk_offset)
1921 {
1922         struct extent_map_tree *em_tree;
1923         struct btrfs_root *extent_root;
1924         struct btrfs_trans_handle *trans;
1925         struct extent_map *em;
1926         struct map_lookup *map;
1927         int ret;
1928         int i;
1929
1930         root = root->fs_info->chunk_root;
1931         extent_root = root->fs_info->extent_root;
1932         em_tree = &root->fs_info->mapping_tree.map_tree;
1933
1934         ret = btrfs_can_relocate(extent_root, chunk_offset);
1935         if (ret)
1936                 return -ENOSPC;
1937
1938         /* step one, relocate all the extents inside this chunk */
1939         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1940         if (ret)
1941                 return ret;
1942
1943         trans = btrfs_start_transaction(root, 0);
1944         BUG_ON(IS_ERR(trans));
1945
1946         lock_chunks(root);
1947
1948         /*
1949          * step two, delete the device extents and the
1950          * chunk tree entries
1951          */
1952         read_lock(&em_tree->lock);
1953         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1954         read_unlock(&em_tree->lock);
1955
1956         BUG_ON(em->start > chunk_offset ||
1957                em->start + em->len < chunk_offset);
1958         map = (struct map_lookup *)em->bdev;
1959
1960         for (i = 0; i < map->num_stripes; i++) {
1961                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1962                                             map->stripes[i].physical);
1963                 BUG_ON(ret);
1964
1965                 if (map->stripes[i].dev) {
1966                         ret = btrfs_update_device(trans, map->stripes[i].dev);
1967                         BUG_ON(ret);
1968                 }
1969         }
1970         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1971                                chunk_offset);
1972
1973         BUG_ON(ret);
1974
1975         trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
1976
1977         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1978                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1979                 BUG_ON(ret);
1980         }
1981
1982         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1983         BUG_ON(ret);
1984
1985         write_lock(&em_tree->lock);
1986         remove_extent_mapping(em_tree, em);
1987         write_unlock(&em_tree->lock);
1988
1989         kfree(map);
1990         em->bdev = NULL;
1991
1992         /* once for the tree */
1993         free_extent_map(em);
1994         /* once for us */
1995         free_extent_map(em);
1996
1997         unlock_chunks(root);
1998         btrfs_end_transaction(trans, root);
1999         return 0;
2000 }
2001
2002 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2003 {
2004         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2005         struct btrfs_path *path;
2006         struct extent_buffer *leaf;
2007         struct btrfs_chunk *chunk;
2008         struct btrfs_key key;
2009         struct btrfs_key found_key;
2010         u64 chunk_tree = chunk_root->root_key.objectid;
2011         u64 chunk_type;
2012         bool retried = false;
2013         int failed = 0;
2014         int ret;
2015
2016         path = btrfs_alloc_path();
2017         if (!path)
2018                 return -ENOMEM;
2019
2020 again:
2021         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2022         key.offset = (u64)-1;
2023         key.type = BTRFS_CHUNK_ITEM_KEY;
2024
2025         while (1) {
2026                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2027                 if (ret < 0)
2028                         goto error;
2029                 BUG_ON(ret == 0);
2030
2031                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2032                                           key.type);
2033                 if (ret < 0)
2034                         goto error;
2035                 if (ret > 0)
2036                         break;
2037
2038                 leaf = path->nodes[0];
2039                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2040
2041                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2042                                        struct btrfs_chunk);
2043                 chunk_type = btrfs_chunk_type(leaf, chunk);
2044                 btrfs_release_path(path);
2045
2046                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2047                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2048                                                    found_key.objectid,
2049                                                    found_key.offset);
2050                         if (ret == -ENOSPC)
2051                                 failed++;
2052                         else if (ret)
2053                                 BUG();
2054                 }
2055
2056                 if (found_key.offset == 0)
2057                         break;
2058                 key.offset = found_key.offset - 1;
2059         }
2060         ret = 0;
2061         if (failed && !retried) {
2062                 failed = 0;
2063                 retried = true;
2064                 goto again;
2065         } else if (failed && retried) {
2066                 WARN_ON(1);
2067                 ret = -ENOSPC;
2068         }
2069 error:
2070         btrfs_free_path(path);
2071         return ret;
2072 }
2073
2074 static int insert_balance_item(struct btrfs_root *root,
2075                                struct btrfs_balance_control *bctl)
2076 {
2077         struct btrfs_trans_handle *trans;
2078         struct btrfs_balance_item *item;
2079         struct btrfs_disk_balance_args disk_bargs;
2080         struct btrfs_path *path;
2081         struct extent_buffer *leaf;
2082         struct btrfs_key key;
2083         int ret, err;
2084
2085         path = btrfs_alloc_path();
2086         if (!path)
2087                 return -ENOMEM;
2088
2089         trans = btrfs_start_transaction(root, 0);
2090         if (IS_ERR(trans)) {
2091                 btrfs_free_path(path);
2092                 return PTR_ERR(trans);
2093         }
2094
2095         key.objectid = BTRFS_BALANCE_OBJECTID;
2096         key.type = BTRFS_BALANCE_ITEM_KEY;
2097         key.offset = 0;
2098
2099         ret = btrfs_insert_empty_item(trans, root, path, &key,
2100                                       sizeof(*item));
2101         if (ret)
2102                 goto out;
2103
2104         leaf = path->nodes[0];
2105         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2106
2107         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2108
2109         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2110         btrfs_set_balance_data(leaf, item, &disk_bargs);
2111         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2112         btrfs_set_balance_meta(leaf, item, &disk_bargs);
2113         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2114         btrfs_set_balance_sys(leaf, item, &disk_bargs);
2115
2116         btrfs_set_balance_flags(leaf, item, bctl->flags);
2117
2118         btrfs_mark_buffer_dirty(leaf);
2119 out:
2120         btrfs_free_path(path);
2121         err = btrfs_commit_transaction(trans, root);
2122         if (err && !ret)
2123                 ret = err;
2124         return ret;
2125 }
2126
2127 static int del_balance_item(struct btrfs_root *root)
2128 {
2129         struct btrfs_trans_handle *trans;
2130         struct btrfs_path *path;
2131         struct btrfs_key key;
2132         int ret, err;
2133
2134         path = btrfs_alloc_path();
2135         if (!path)
2136                 return -ENOMEM;
2137
2138         trans = btrfs_start_transaction(root, 0);
2139         if (IS_ERR(trans)) {
2140                 btrfs_free_path(path);
2141                 return PTR_ERR(trans);
2142         }
2143
2144         key.objectid = BTRFS_BALANCE_OBJECTID;
2145         key.type = BTRFS_BALANCE_ITEM_KEY;
2146         key.offset = 0;
2147
2148         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2149         if (ret < 0)
2150                 goto out;
2151         if (ret > 0) {
2152                 ret = -ENOENT;
2153                 goto out;
2154         }
2155
2156         ret = btrfs_del_item(trans, root, path);
2157 out:
2158         btrfs_free_path(path);
2159         err = btrfs_commit_transaction(trans, root);
2160         if (err && !ret)
2161                 ret = err;
2162         return ret;
2163 }
2164
2165 /*
2166  * This is a heuristic used to reduce the number of chunks balanced on
2167  * resume after balance was interrupted.
2168  */
2169 static void update_balance_args(struct btrfs_balance_control *bctl)
2170 {
2171         /*
2172          * Turn on soft mode for chunk types that were being converted.
2173          */
2174         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2175                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2176         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2177                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2178         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2179                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2180
2181         /*
2182          * Turn on the usage filter if it is not already in use.  The idea is
2183          * that chunks that we have already balanced should be
2184          * reasonably full.  Don't do it for chunks that are being
2185          * converted - that will keep us from relocating unconverted
2186          * (albeit full) chunks.
2187          */
2188         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2189             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2190                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2191                 bctl->data.usage = 90;
2192         }
2193         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2194             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2195                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2196                 bctl->sys.usage = 90;
2197         }
2198         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2199             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2200                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2201                 bctl->meta.usage = 90;
2202         }
2203 }
2204
2205 /*
2206  * Should be called with both balance and volume mutexes held to
2207  * serialize other volume operations (add_dev/rm_dev/resize) with
2208  * restriper.  Same goes for unset_balance_control.
2209  */
2210 static void set_balance_control(struct btrfs_balance_control *bctl)
2211 {
2212         struct btrfs_fs_info *fs_info = bctl->fs_info;
2213
2214         BUG_ON(fs_info->balance_ctl);
2215
2216         spin_lock(&fs_info->balance_lock);
2217         fs_info->balance_ctl = bctl;
2218         spin_unlock(&fs_info->balance_lock);
2219 }
2220
2221 static void unset_balance_control(struct btrfs_fs_info *fs_info)
2222 {
2223         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2224
2225         BUG_ON(!fs_info->balance_ctl);
2226
2227         spin_lock(&fs_info->balance_lock);
2228         fs_info->balance_ctl = NULL;
2229         spin_unlock(&fs_info->balance_lock);
2230
2231         kfree(bctl);
2232 }
2233
2234 /*
2235  * Balance filters.  Return 1 if chunk should be filtered out
2236  * (should not be balanced).
2237  */
2238 static int chunk_profiles_filter(u64 chunk_profile,
2239                                  struct btrfs_balance_args *bargs)
2240 {
2241         chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2242
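             /* a chunk with no profile bits set is a plain "single" chunk */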
2243         if (chunk_profile == 0)
2244                 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2245
2246         if (bargs->profiles & chunk_profile)
2247                 return 0;
2248
2249         return 1;
2250 }
2251
2252 static u64 div_factor_fine(u64 num, int factor)
2253 {
2254         if (factor <= 0)
2255                 return 0;
2256         if (factor >= 100)
2257                 return num;
2258
2259         num *= factor;
2260         do_div(num, 100);
2261         return num;
2262 }
2263
2264 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2265                               struct btrfs_balance_args *bargs)
2266 {
2267         struct btrfs_block_group_cache *cache;
2268         u64 chunk_used, user_thresh;
2269         int ret = 1;
2270
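             /*
              * only chunks whose used space is below usage percent of
              * their size get balanced; fuller chunks are filtered out
              * (ret stays 1)
              */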
2271         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2272         chunk_used = btrfs_block_group_used(&cache->item);
2273
2274         user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
2275         if (chunk_used < user_thresh)
2276                 ret = 0;
2277
2278         btrfs_put_block_group(cache);
2279         return ret;
2280 }
2281
2282 static int chunk_devid_filter(struct extent_buffer *leaf,
2283                               struct btrfs_chunk *chunk,
2284                               struct btrfs_balance_args *bargs)
2285 {
2286         struct btrfs_stripe *stripe;
2287         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2288         int i;
2289
2290         for (i = 0; i < num_stripes; i++) {
2291                 stripe = btrfs_stripe_nr(chunk, i);
2292                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2293                         return 0;
2294         }
2295
2296         return 1;
2297 }
2298
2299 /* [pstart, pend) */
2300 static int chunk_drange_filter(struct extent_buffer *leaf,
2301                                struct btrfs_chunk *chunk,
2302                                u64 chunk_offset,
2303                                struct btrfs_balance_args *bargs)
2304 {
2305         struct btrfs_stripe *stripe;
2306         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2307         u64 stripe_offset;
2308         u64 stripe_length;
2309         int factor;
2310         int i;
2311
2312         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2313                 return 0;
2314
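             /*
              * compute the length of one device extent of this chunk:
              * mirrored profiles (DUP/RAID1/RAID10) store two copies, so
              * only num_stripes / 2 of the stripes hold distinct data
              */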
2315         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2316              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2317                 factor = 2;
2318         else
2319                 factor = 1;
2320         factor = num_stripes / factor;
2321
2322         for (i = 0; i < num_stripes; i++) {
2323                 stripe = btrfs_stripe_nr(chunk, i);
2324                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2325                         continue;
2326
2327                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2328                 stripe_length = btrfs_chunk_length(leaf, chunk);
2329                 do_div(stripe_length, factor);
2330
2331                 if (stripe_offset < bargs->pend &&
2332                     stripe_offset + stripe_length > bargs->pstart)
2333                         return 0;
2334         }
2335
2336         return 1;
2337 }
2338
2339 /* [vstart, vend) */
2340 static int chunk_vrange_filter(struct extent_buffer *leaf,
2341                                struct btrfs_chunk *chunk,
2342                                u64 chunk_offset,
2343                                struct btrfs_balance_args *bargs)
2344 {
2345         if (chunk_offset < bargs->vend &&
2346             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2347                 /* at least part of the chunk is inside this vrange */
2348                 return 0;
2349
2350         return 1;
2351 }
2352
2353 static int chunk_soft_convert_filter(u64 chunk_profile,
2354                                      struct btrfs_balance_args *bargs)
2355 {
2356         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2357                 return 0;
2358
2359         chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2360
2361         if (chunk_profile == 0)
2362                 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2363
2364         if (bargs->target & chunk_profile)
2365                 return 1;
2366
2367         return 0;
2368 }
2369
2370 static int should_balance_chunk(struct btrfs_root *root,
2371                                 struct extent_buffer *leaf,
2372                                 struct btrfs_chunk *chunk, u64 chunk_offset)
2373 {
2374         struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2375         struct btrfs_balance_args *bargs = NULL;
2376         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2377
2378         /* type filter */
2379         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2380               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2381                 return 0;
2382         }
2383
2384         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2385                 bargs = &bctl->data;
2386         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2387                 bargs = &bctl->sys;
2388         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2389                 bargs = &bctl->meta;
2390
2391         /* profiles filter */
2392         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2393             chunk_profiles_filter(chunk_type, bargs)) {
2394                 return 0;
2395         }
2396
2397         /* usage filter */
2398         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2399             chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2400                 return 0;
2401         }
2402
2403         /* devid filter */
2404         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2405             chunk_devid_filter(leaf, chunk, bargs)) {
2406                 return 0;
2407         }
2408
2409         /* drange filter, makes sense only with devid filter */
2410         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2411             chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2412                 return 0;
2413         }
2414
2415         /* vrange filter */
2416         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2417             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2418                 return 0;
2419         }
2420
2421         /* soft profile changing mode */
2422         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2423             chunk_soft_convert_filter(chunk_type, bargs)) {
2424                 return 0;
2425         }
2426
2427         return 1;
2428 }
2429
2430 static u64 div_factor(u64 num, int factor)
2431 {
2432         if (factor == 10)
2433                 return num;
2434         num *= factor;
2435         do_div(num, 10);
2436         return num;
2437 }
2438
2439 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2440 {
2441         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2442         struct btrfs_root *chunk_root = fs_info->chunk_root;
2443         struct btrfs_root *dev_root = fs_info->dev_root;
2444         struct list_head *devices;
2445         struct btrfs_device *device;
2446         u64 old_size;
2447         u64 size_to_free;
2448         struct btrfs_chunk *chunk;
2449         struct btrfs_path *path;
2450         struct btrfs_key key;
2451         struct btrfs_key found_key;
2452         struct btrfs_trans_handle *trans;
2453         struct extent_buffer *leaf;
2454         int slot;
2455         int ret;
2456         int enospc_errors = 0;
2457         bool counting = true;
2458
2459         /* step one make some room on all the devices */
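             /*
              * shrink each device a little and grow it right back: any
              * chunk sitting at the very end of the device is relocated
              * and that tail becomes unallocated space usable while
              * chunks are moved around below
              */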
2460         devices = &fs_info->fs_devices->devices;
2461         list_for_each_entry(device, devices, dev_list) {
2462                 old_size = device->total_bytes;
2463                 size_to_free = div_factor(old_size, 1);
2464                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2465                 if (!device->writeable ||
2466                     device->total_bytes - device->bytes_used > size_to_free)
2467                         continue;
2468
2469                 ret = btrfs_shrink_device(device, old_size - size_to_free);
2470                 if (ret == -ENOSPC)
2471                         break;
2472                 BUG_ON(ret);
2473
2474                 trans = btrfs_start_transaction(dev_root, 0);
2475                 BUG_ON(IS_ERR(trans));
2476
2477                 ret = btrfs_grow_device(trans, device, old_size);
2478                 BUG_ON(ret);
2479
2480                 btrfs_end_transaction(trans, dev_root);
2481         }
2482
2483         /* step two, relocate all the chunks */
2484         path = btrfs_alloc_path();
2485         if (!path) {
2486                 ret = -ENOMEM;
2487                 goto error;
2488         }
2489
2490         /* zero out stat counters */
2491         spin_lock(&fs_info->balance_lock);
2492         memset(&bctl->stat, 0, sizeof(bctl->stat));
2493         spin_unlock(&fs_info->balance_lock);
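             /*
              * two passes over the chunk tree: the first (counting) pass
              * only tallies the chunks matching the filters into
              * bctl->stat.expected, the second pass actually relocates
              * them
              */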
2494 again:
2495         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2496         key.offset = (u64)-1;
2497         key.type = BTRFS_CHUNK_ITEM_KEY;
2498
2499         while (1) {
2500                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2501                     atomic_read(&fs_info->balance_cancel_req)) {
2502                         ret = -ECANCELED;
2503                         goto error;
2504                 }
2505
2506                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2507                 if (ret < 0)
2508                         goto error;
2509
2510                 /*
2511                  * this shouldn't happen, it means the last relocate
2512                  * failed
2513                  */
2514                 if (ret == 0)
2515                         BUG(); /* FIXME break ? */
2516
2517                 ret = btrfs_previous_item(chunk_root, path, 0,
2518                                           BTRFS_CHUNK_ITEM_KEY);
2519                 if (ret) {
2520                         ret = 0;
2521                         break;
2522                 }
2523
2524                 leaf = path->nodes[0];
2525                 slot = path->slots[0];
2526                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2527
2528                 if (found_key.objectid != key.objectid)
2529                         break;
2530
2531                 /* chunk zero is special */
2532                 if (found_key.offset == 0)
2533                         break;
2534
2535                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2536
2537                 if (!counting) {
2538                         spin_lock(&fs_info->balance_lock);
2539                         bctl->stat.considered++;
2540                         spin_unlock(&fs_info->balance_lock);
2541                 }
2542
2543                 ret = should_balance_chunk(chunk_root, leaf, chunk,
2544                                            found_key.offset);
2545                 btrfs_release_path(path);
2546                 if (!ret)
2547                         goto loop;
2548
2549                 if (counting) {
2550                         spin_lock(&fs_info->balance_lock);
2551                         bctl->stat.expected++;
2552                         spin_unlock(&fs_info->balance_lock);
2553                         goto loop;
2554                 }
2555
2556                 ret = btrfs_relocate_chunk(chunk_root,
2557                                            chunk_root->root_key.objectid,
2558                                            found_key.objectid,
2559                                            found_key.offset);
2560                 if (ret && ret != -ENOSPC)
2561                         goto error;
2562                 if (ret == -ENOSPC) {
2563                         enospc_errors++;
2564                 } else {
2565                         spin_lock(&fs_info->balance_lock);
2566                         bctl->stat.completed++;
2567                         spin_unlock(&fs_info->balance_lock);
2568                 }
2569 loop:
2570                 key.offset = found_key.offset - 1;
2571         }
2572
2573         if (counting) {
2574                 btrfs_release_path(path);
2575                 counting = false;
2576                 goto again;
2577         }
2578 error:
2579         btrfs_free_path(path);
2580         if (enospc_errors) {
2581                 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2582                        enospc_errors);
2583                 if (!ret)
2584                         ret = -ENOSPC;
2585         }
2586
2587         return ret;
2588 }
2589
2590 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
2591 {
2592         /* cancel requested || normal exit path */
2593         return atomic_read(&fs_info->balance_cancel_req) ||
2594                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
2595                  atomic_read(&fs_info->balance_cancel_req) == 0);
2596 }
2597
2598 static void __cancel_balance(struct btrfs_fs_info *fs_info)
2599 {
2600         int ret;
2601
2602         unset_balance_control(fs_info);
2603         ret = del_balance_item(fs_info->tree_root);
2604         BUG_ON(ret);
2605 }
2606
2607 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
2608                                struct btrfs_ioctl_balance_args *bargs);
2609
2610 /*
2611  * Should be called with both balance and volume mutexes held
2612  */
2613 int btrfs_balance(struct btrfs_balance_control *bctl,
2614                   struct btrfs_ioctl_balance_args *bargs)
2615 {
2616         struct btrfs_fs_info *fs_info = bctl->fs_info;
2617         u64 allowed;
2618         int ret;
2619
2620         if (btrfs_fs_closing(fs_info) ||
2621             atomic_read(&fs_info->balance_pause_req) ||
2622             atomic_read(&fs_info->balance_cancel_req)) {
2623                 ret = -EINVAL;
2624                 goto out;
2625         }
2626
2627         /*
2628          * In case of mixed groups both data and meta should be picked,
2629          * and identical options should be given for both of them.
2630          */
2631         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
2632         if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2633             (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
2634                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
2635                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
2636                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
2637                         printk(KERN_ERR "btrfs: with mixed groups data and "
2638                                "metadata balance options must be the same\n");
2639                         ret = -EINVAL;
2640                         goto out;
2641                 }
2642         }
2643
2644         /*
2645          * Profile changing sanity checks.  Skip them if a simple
2646          * balance is requested.
2647          */
2648         if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
2649               BTRFS_BALANCE_ARGS_CONVERT))
2650                 goto do_balance;
2651
2652         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2653         if (fs_info->fs_devices->num_devices == 1)
2654                 allowed |= BTRFS_BLOCK_GROUP_DUP;
2655         else if (fs_info->fs_devices->num_devices < 4)
2656                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2657         else
2658                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2659                                 BTRFS_BLOCK_GROUP_RAID10);
2660
2661         if (!profile_is_valid(bctl->data.target, 1) ||
2662             bctl->data.target & ~allowed) {
2663                 printk(KERN_ERR "btrfs: unable to start balance with target "
2664                        "data profile %llu\n",
2665                        (unsigned long long)bctl->data.target);
2666                 ret = -EINVAL;
2667                 goto out;
2668         }
2669         if (!profile_is_valid(bctl->meta.target, 1) ||
2670             bctl->meta.target & ~allowed) {
2671                 printk(KERN_ERR "btrfs: unable to start balance with target "
2672                        "metadata profile %llu\n",
2673                        (unsigned long long)bctl->meta.target);
2674                 ret = -EINVAL;
2675                 goto out;
2676         }
2677         if (!profile_is_valid(bctl->sys.target, 1) ||
2678             bctl->sys.target & ~allowed) {
2679                 printk(KERN_ERR "btrfs: unable to start balance with target "
2680                        "system profile %llu\n",
2681                        (unsigned long long)bctl->sys.target);
2682                 ret = -EINVAL;
2683                 goto out;
2684         }
2685
2686         if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
2687                 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
2688                 ret = -EINVAL;
2689                 goto out;
2690         }
2691
2692         /* allow reducing metadata or system integrity only if force is set */
2693         allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2694                         BTRFS_BLOCK_GROUP_RAID10;
2695         if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2696              (fs_info->avail_system_alloc_bits & allowed) &&
2697              !(bctl->sys.target & allowed)) ||
2698             ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2699              (fs_info->avail_metadata_alloc_bits & allowed) &&
2700              !(bctl->meta.target & allowed))) {
2701                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
2702                         printk(KERN_INFO "btrfs: force reducing metadata "
2703                                "integrity\n");
2704                 } else {
2705                         printk(KERN_ERR "btrfs: balance will reduce metadata "
2706                                "integrity, use force if you want this\n");
2707                         ret = -EINVAL;
2708                         goto out;
2709                 }
2710         }
2711
2712 do_balance:
2713         ret = insert_balance_item(fs_info->tree_root, bctl);
2714         if (ret && ret != -EEXIST)
2715                 goto out;
2716
2717         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
2718                 BUG_ON(ret == -EEXIST);
2719                 set_balance_control(bctl);
2720         } else {
2721                 BUG_ON(ret != -EEXIST);
2722                 spin_lock(&fs_info->balance_lock);
2723                 update_balance_args(bctl);
2724                 spin_unlock(&fs_info->balance_lock);
2725         }
2726
2727         atomic_inc(&fs_info->balance_running);
2728         mutex_unlock(&fs_info->balance_mutex);
2729
2730         ret = __btrfs_balance(fs_info);
2731
2732         mutex_lock(&fs_info->balance_mutex);
2733         atomic_dec(&fs_info->balance_running);
2734
2735         if (bargs) {
2736                 memset(bargs, 0, sizeof(*bargs));
2737                 update_ioctl_balance_args(fs_info, 0, bargs);
2738         }
2739
2740         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
2741             balance_need_close(fs_info)) {
2742                 __cancel_balance(fs_info);
2743         }
2744
2745         wake_up(&fs_info->balance_wait_q);
2746
2747         return ret;
2748 out:
2749         if (bctl->flags & BTRFS_BALANCE_RESUME)
2750                 __cancel_balance(fs_info);
2751         else
2752                 kfree(bctl);
2753         return ret;
2754 }
2755
2756 static int balance_kthread(void *data)
2757 {
2758         struct btrfs_balance_control *bctl =
2759                         (struct btrfs_balance_control *)data;
2760         struct btrfs_fs_info *fs_info = bctl->fs_info;
2761         int ret = 0;
2762
2763         mutex_lock(&fs_info->volume_mutex);
2764         mutex_lock(&fs_info->balance_mutex);
2765
2766         set_balance_control(bctl);
2767
2768         if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
2769                 printk(KERN_INFO "btrfs: force skipping balance\n");
2770         } else {
2771                 printk(KERN_INFO "btrfs: continuing balance\n");
2772                 ret = btrfs_balance(bctl, NULL);
2773         }
2774
2775         mutex_unlock(&fs_info->balance_mutex);
2776         mutex_unlock(&fs_info->volume_mutex);
2777         return ret;
2778 }
2779
2780 int btrfs_recover_balance(struct btrfs_root *tree_root)
2781 {
2782         struct task_struct *tsk;
2783         struct btrfs_balance_control *bctl;
2784         struct btrfs_balance_item *item;
2785         struct btrfs_disk_balance_args disk_bargs;
2786         struct btrfs_path *path;
2787         struct extent_buffer *leaf;
2788         struct btrfs_key key;
2789         int ret;
2790
2791         path = btrfs_alloc_path();
2792         if (!path)
2793                 return -ENOMEM;
2794
2795         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
2796         if (!bctl) {
2797                 ret = -ENOMEM;
2798                 goto out;
2799         }
2800
2801         key.objectid = BTRFS_BALANCE_OBJECTID;
2802         key.type = BTRFS_BALANCE_ITEM_KEY;
2803         key.offset = 0;
2804
2805         ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2806         if (ret < 0)
2807                 goto out_bctl;
2808         if (ret > 0) { /* ret = -ENOENT; */
2809                 ret = 0;
2810                 goto out_bctl;
2811         }
2812
2813         leaf = path->nodes[0];
2814         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2815
2816         bctl->fs_info = tree_root->fs_info;
2817         bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
2818
2819         btrfs_balance_data(leaf, item, &disk_bargs);
2820         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
2821         btrfs_balance_meta(leaf, item, &disk_bargs);
2822         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
2823         btrfs_balance_sys(leaf, item, &disk_bargs);
2824         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2825
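             /*
              * hand the resumed balance off to a kernel thread; the
              * caller does not wait for the relocation to finish
              */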
2826         tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
2827         if (IS_ERR(tsk))
2828                 ret = PTR_ERR(tsk);
2829         else
2830                 goto out;
2831
2832 out_bctl:
2833         kfree(bctl);
2834 out:
2835         btrfs_free_path(path);
2836         return ret;
2837 }
2838
2839 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
2840 {
2841         int ret = 0;
2842
2843         mutex_lock(&fs_info->balance_mutex);
2844         if (!fs_info->balance_ctl) {
2845                 mutex_unlock(&fs_info->balance_mutex);
2846                 return -ENOTCONN;
2847         }
2848
2849         if (atomic_read(&fs_info->balance_running)) {
2850                 atomic_inc(&fs_info->balance_pause_req);
2851                 mutex_unlock(&fs_info->balance_mutex);
2852
2853                 wait_event(fs_info->balance_wait_q,
2854                            atomic_read(&fs_info->balance_running) == 0);
2855
2856                 mutex_lock(&fs_info->balance_mutex);
2857                 /* we are good with balance_ctl ripped off from under us */
2858                 BUG_ON(atomic_read(&fs_info->balance_running));
2859                 atomic_dec(&fs_info->balance_pause_req);
2860         } else {
2861                 ret = -ENOTCONN;
2862         }
2863
2864         mutex_unlock(&fs_info->balance_mutex);
2865         return ret;
2866 }
2867
2868 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
2869 {
2870         mutex_lock(&fs_info->balance_mutex);
2871         if (!fs_info->balance_ctl) {
2872                 mutex_unlock(&fs_info->balance_mutex);
2873                 return -ENOTCONN;
2874         }
2875
2876         atomic_inc(&fs_info->balance_cancel_req);
2877         /*
2878          * if balance is running, just wait for it to finish; the
2879          * balance item is deleted by btrfs_balance in that case
2880          */
2881         if (atomic_read(&fs_info->balance_running)) {
2882                 mutex_unlock(&fs_info->balance_mutex);
2883                 wait_event(fs_info->balance_wait_q,
2884                            atomic_read(&fs_info->balance_running) == 0);
2885                 mutex_lock(&fs_info->balance_mutex);
2886         } else {
2887                 /* __cancel_balance needs volume_mutex */
2888                 mutex_unlock(&fs_info->balance_mutex);
2889                 mutex_lock(&fs_info->volume_mutex);
2890                 mutex_lock(&fs_info->balance_mutex);
2891
2892                 if (fs_info->balance_ctl)
2893                         __cancel_balance(fs_info);
2894
2895                 mutex_unlock(&fs_info->volume_mutex);
2896         }
2897
2898         BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
2899         atomic_dec(&fs_info->balance_cancel_req);
2900         mutex_unlock(&fs_info->balance_mutex);
2901         return 0;
2902 }
2903
2904 /*
2905  * shrinking a device means finding all of the device extents past
2906  * the new size, and then following the back refs to the chunks.
2907  * The chunk relocation code actually frees the device extent.
2908  */
2909 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2910 {
2911         struct btrfs_trans_handle *trans;
2912         struct btrfs_root *root = device->dev_root;
2913         struct btrfs_dev_extent *dev_extent = NULL;
2914         struct btrfs_path *path;
2915         u64 length;
2916         u64 chunk_tree;
2917         u64 chunk_objectid;
2918         u64 chunk_offset;
2919         int ret;
2920         int slot;
2921         int failed = 0;
2922         bool retried = false;
2923         struct extent_buffer *l;
2924         struct btrfs_key key;
2925         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2926         u64 old_total = btrfs_super_total_bytes(super_copy);
2927         u64 old_size = device->total_bytes;
2928         u64 diff = device->total_bytes - new_size;
2929
2930         if (new_size >= device->total_bytes)
2931                 return -EINVAL;
2932
2933         path = btrfs_alloc_path();
2934         if (!path)
2935                 return -ENOMEM;
2936
2937         path->reada = 2;
2938
2939         lock_chunks(root);
2940
2941         device->total_bytes = new_size;
2942         if (device->writeable) {
2943                 device->fs_devices->total_rw_bytes -= diff;
2944                 spin_lock(&root->fs_info->free_chunk_lock);
2945                 root->fs_info->free_chunk_space -= diff;
2946                 spin_unlock(&root->fs_info->free_chunk_lock);
2947         }
2948         unlock_chunks(root);
2949
2950 again:
2951         key.objectid = device->devid;
2952         key.offset = (u64)-1;
2953         key.type = BTRFS_DEV_EXTENT_KEY;
2954
2955         while (1) {
2956                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2957                 if (ret < 0)
2958                         goto done;
2959
2960                 ret = btrfs_previous_item(root, path, 0, key.type);
2961                 if (ret < 0)
2962                         goto done;
2963                 if (ret) {
2964                         ret = 0;
2965                         btrfs_release_path(path);
2966                         break;
2967                 }
2968
2969                 l = path->nodes[0];
2970                 slot = path->slots[0];
2971                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2972
2973                 if (key.objectid != device->devid) {
2974                         btrfs_release_path(path);
2975                         break;
2976                 }
2977
2978                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2979                 length = btrfs_dev_extent_length(l, dev_extent);
2980
2981                 if (key.offset + length <= new_size) {
2982                         btrfs_release_path(path);
2983                         break;
2984                 }
2985
2986                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2987                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2988                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2989                 btrfs_release_path(path);
2990
2991                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
2992                                            chunk_offset);
2993                 if (ret && ret != -ENOSPC)
2994                         goto done;
2995                 if (ret == -ENOSPC)
2996                         failed++;
2997                 key.offset -= 1;
2998         }
2999
3000         if (failed && !retried) {
3001                 failed = 0;
3002                 retried = true;
3003                 goto again;
3004         } else if (failed && retried) {
3005                 ret = -ENOSPC;
3006                 lock_chunks(root);
3007
3008                 device->total_bytes = old_size;
3009                 if (device->writeable)
3010                         device->fs_devices->total_rw_bytes += diff;
3011                 spin_lock(&root->fs_info->free_chunk_lock);
3012                 root->fs_info->free_chunk_space += diff;
3013                 spin_unlock(&root->fs_info->free_chunk_lock);
3014                 unlock_chunks(root);
3015                 goto done;
3016         }
3017
3018         /* Shrinking succeeded, else we would be at "done". */
3019         trans = btrfs_start_transaction(root, 0);
3020         if (IS_ERR(trans)) {
3021                 ret = PTR_ERR(trans);
3022                 goto done;
3023         }
3024
3025         lock_chunks(root);
3026
3027         device->disk_total_bytes = new_size;
3028         /* Now btrfs_update_device() will change the on-disk size. */
3029         ret = btrfs_update_device(trans, device);
3030         if (ret) {
3031                 unlock_chunks(root);
3032                 btrfs_end_transaction(trans, root);
3033                 goto done;
3034         }
3035         WARN_ON(diff > old_total);
3036         btrfs_set_super_total_bytes(super_copy, old_total - diff);
3037         unlock_chunks(root);
3038         btrfs_end_transaction(trans, root);
3039 done:
3040         btrfs_free_path(path);
3041         return ret;
3042 }
3043
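/*
 * Append a (disk key, chunk item) pair to the superblock's sys_chunk_array
 * so that system chunks can be mapped at mount time, before the chunk tree
 * itself is readable.  Returns -EFBIG if the array is full.
 */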
3044 static int btrfs_add_system_chunk(struct btrfs_root *root,
3045                            struct btrfs_key *key,
3046                            struct btrfs_chunk *chunk, int item_size)
3047 {
3048         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3049         struct btrfs_disk_key disk_key;
3050         u32 array_size;
3051         u8 *ptr;
3052
3053         array_size = btrfs_super_sys_array_size(super_copy);
3054         if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
3055                 return -EFBIG;
3056
3057         ptr = super_copy->sys_chunk_array + array_size;
3058         btrfs_cpu_key_to_disk(&disk_key, key);
3059         memcpy(ptr, &disk_key, sizeof(disk_key));
3060         ptr += sizeof(disk_key);
3061         memcpy(ptr, chunk, item_size);
3062         item_size += sizeof(disk_key);
3063         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
3064         return 0;
3065 }
3066
3067 /*
3068  * sort the devices in descending order by max_avail, total_avail
3069  */
3070 static int btrfs_cmp_device_info(const void *a, const void *b)
3071 {
3072         const struct btrfs_device_info *di_a = a;
3073         const struct btrfs_device_info *di_b = b;
3074
3075         if (di_a->max_avail > di_b->max_avail)
3076                 return -1;
3077         if (di_a->max_avail < di_b->max_avail)
3078                 return 1;
3079         if (di_a->total_avail > di_b->total_avail)
3080                 return -1;
3081         if (di_a->total_avail < di_b->total_avail)
3082                 return 1;
3083         return 0;
3084 }
3085
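/*
 * First phase of chunk allocation: pick the devices with the most available
 * space, size the stripes for the requested RAID profile, record the new
 * chunk in the in-memory mapping tree, create its block group and allocate
 * a dev extent on every device used.  The map, chunk size and stripe size
 * are returned through the out parameters; the chunk item itself is written
 * later by __finish_chunk_alloc().
 */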
3086 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3087                                struct btrfs_root *extent_root,
3088                                struct map_lookup **map_ret,
3089                                u64 *num_bytes_out, u64 *stripe_size_out,
3090                                u64 start, u64 type)
3091 {
3092         struct btrfs_fs_info *info = extent_root->fs_info;
3093         struct btrfs_fs_devices *fs_devices = info->fs_devices;
3094         struct list_head *cur;
3095         struct map_lookup *map = NULL;
3096         struct extent_map_tree *em_tree;
3097         struct extent_map *em;
3098         struct btrfs_device_info *devices_info = NULL;
3099         u64 total_avail;
3100         int num_stripes;        /* total number of stripes to allocate */
3101         int sub_stripes;        /* sub_stripes info for map */
3102         int dev_stripes;        /* stripes per dev */
3103         int devs_max;           /* max devs to use */
3104         int devs_min;           /* min devs needed */
3105         int devs_increment;     /* ndevs has to be a multiple of this */
3106         int ncopies;            /* how many copies of the data */
3107         int ret;
3108         u64 max_stripe_size;
3109         u64 max_chunk_size;
3110         u64 stripe_size;
3111         u64 num_bytes;
3112         int ndevs;
3113         int i;
3114         int j;
3115
3116         if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
3117             (type & BTRFS_BLOCK_GROUP_DUP)) {
3118                 WARN_ON(1);
3119                 type &= ~BTRFS_BLOCK_GROUP_DUP;
3120         }
3121
3122         if (list_empty(&fs_devices->alloc_list))
3123                 return -ENOSPC;
3124
3125         sub_stripes = 1;
3126         dev_stripes = 1;
3127         devs_increment = 1;
3128         ncopies = 1;
3129         devs_max = 0;   /* 0 == as many as possible */
3130         devs_min = 1;
3131
3132         /*
3133          * define the properties of each RAID type.
3134          * FIXME: move this to a global table and use it in all RAID
3135          * calculation code
3136          */
3137         if (type & (BTRFS_BLOCK_GROUP_DUP)) {
3138                 dev_stripes = 2;
3139                 ncopies = 2;
3140                 devs_max = 1;
3141         } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3142                 devs_min = 2;
3143         } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3144                 devs_increment = 2;
3145                 ncopies = 2;
3146                 devs_max = 2;
3147                 devs_min = 2;
3148         } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3149                 sub_stripes = 2;
3150                 devs_increment = 2;
3151                 ncopies = 2;
3152                 devs_min = 4;
3153         } else {
3154                 devs_max = 1;
3155         }
3156
3157         if (type & BTRFS_BLOCK_GROUP_DATA) {
3158                 max_stripe_size = 1024 * 1024 * 1024;
3159                 max_chunk_size = 10 * max_stripe_size;
3160         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
3161                 /* for larger filesystems, use larger metadata chunks */
3162                 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3163                         max_stripe_size = 1024 * 1024 * 1024;
3164                 else
3165                         max_stripe_size = 256 * 1024 * 1024;
3166                 max_chunk_size = max_stripe_size;
3167         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
3168                 max_stripe_size = 8 * 1024 * 1024;
3169                 max_chunk_size = 2 * max_stripe_size;
3170         } else {
3171                 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
3172                        type);
3173                 BUG_ON(1);
3174         }
3175
3176         /* we don't want a chunk larger than 10% of writeable space */
3177         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
3178                              max_chunk_size);
3179
3180         devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
3181                                GFP_NOFS);
3182         if (!devices_info)
3183                 return -ENOMEM;
3184
3185         cur = fs_devices->alloc_list.next;
3186
3187         /*
3188          * in the first pass through the devices list, we gather information
3189          * about the available holes on each device.
3190          */
3191         ndevs = 0;
3192         while (cur != &fs_devices->alloc_list) {
3193                 struct btrfs_device *device;
3194                 u64 max_avail;
3195                 u64 dev_offset;
3196
3197                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
3198
3199                 cur = cur->next;
3200
3201                 if (!device->writeable) {
3202                         printk(KERN_ERR
3203                                "btrfs: read-only device in alloc_list\n");
3204                         WARN_ON(1);
3205                         continue;
3206                 }
3207
3208                 if (!device->in_fs_metadata)
3209                         continue;
3210
3211                 if (device->total_bytes > device->bytes_used)
3212                         total_avail = device->total_bytes - device->bytes_used;
3213                 else
3214                         total_avail = 0;
3215
3216                 /* If there is no space on this device, skip it. */
3217                 if (total_avail == 0)
3218                         continue;
3219
3220                 ret = find_free_dev_extent(device,
3221                                            max_stripe_size * dev_stripes,
3222                                            &dev_offset, &max_avail);
3223                 if (ret && ret != -ENOSPC)
3224                         goto error;
3225
3226                 if (ret == 0)
3227                         max_avail = max_stripe_size * dev_stripes;
3228
3229                 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
3230                         continue;
3231
3232                 devices_info[ndevs].dev_offset = dev_offset;
3233                 devices_info[ndevs].max_avail = max_avail;
3234                 devices_info[ndevs].total_avail = total_avail;
3235                 devices_info[ndevs].dev = device;
3236                 ++ndevs;
3237         }
3238
3239         /*
3240          * now sort the devices by hole size / available space
3241          */
3242         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
3243              btrfs_cmp_device_info, NULL);
3244
3245         /* round down to number of usable stripes */
3246         ndevs -= ndevs % devs_increment;
3247
3248         if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
3249                 ret = -ENOSPC;
3250                 goto error;
3251         }
3252
3253         if (devs_max && ndevs > devs_max)
3254                 ndevs = devs_max;
3255         /*
3256          * the primary goal is to maximize the number of stripes, so use as many
3257          * devices as possible, even if the stripes are not maximum sized.
3258          */
3259         stripe_size = devices_info[ndevs-1].max_avail;
3260         num_stripes = ndevs * dev_stripes;
3261
3262         if (stripe_size * num_stripes > max_chunk_size * ncopies) {
3263                 stripe_size = max_chunk_size * ncopies;
3264                 do_div(stripe_size, num_stripes);
3265         }
3266
3267         do_div(stripe_size, dev_stripes);
3268         do_div(stripe_size, BTRFS_STRIPE_LEN);
3269         stripe_size *= BTRFS_STRIPE_LEN;
3270
3271         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3272         if (!map) {
3273                 ret = -ENOMEM;
3274                 goto error;
3275         }
3276         map->num_stripes = num_stripes;
3277
3278         for (i = 0; i < ndevs; ++i) {
3279                 for (j = 0; j < dev_stripes; ++j) {
3280                         int s = i * dev_stripes + j;
3281                         map->stripes[s].dev = devices_info[i].dev;
3282                         map->stripes[s].physical = devices_info[i].dev_offset +
3283                                                    j * stripe_size;
3284                 }
3285         }
3286         map->sector_size = extent_root->sectorsize;
3287         map->stripe_len = BTRFS_STRIPE_LEN;
3288         map->io_align = BTRFS_STRIPE_LEN;
3289         map->io_width = BTRFS_STRIPE_LEN;
3290         map->type = type;
3291         map->sub_stripes = sub_stripes;
3292
3293         *map_ret = map;
3294         num_bytes = stripe_size * (num_stripes / ncopies);
3295
3296         *stripe_size_out = stripe_size;
3297         *num_bytes_out = num_bytes;
3298
3299         trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3300
3301         em = alloc_extent_map();
3302         if (!em) {
3303                 ret = -ENOMEM;
3304                 goto error;
3305         }
3306         em->bdev = (struct block_device *)map;
3307         em->start = start;
3308         em->len = num_bytes;
3309         em->block_start = 0;
3310         em->block_len = em->len;
3311
3312         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3313         write_lock(&em_tree->lock);
3314         ret = add_extent_mapping(em_tree, em);
3315         write_unlock(&em_tree->lock);
3316         BUG_ON(ret);
3317         free_extent_map(em);
3318
3319         ret = btrfs_make_block_group(trans, extent_root, 0, type,
3320                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3321                                      start, num_bytes);
3322         BUG_ON(ret);
3323
3324         for (i = 0; i < map->num_stripes; ++i) {
3325                 struct btrfs_device *device;
3326                 u64 dev_offset;
3327
3328                 device = map->stripes[i].dev;
3329                 dev_offset = map->stripes[i].physical;
3330
3331                 ret = btrfs_alloc_dev_extent(trans, device,
3332                                 info->chunk_root->root_key.objectid,
3333                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3334                                 start, dev_offset, stripe_size);
3335                 BUG_ON(ret);
3336         }
3337
3338         kfree(devices_info);
3339         return 0;
3340
3341 error:
3342         kfree(map);
3343         kfree(devices_info);
3344         return ret;
3345 }
3346
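/*
 * Second phase of chunk allocation: account the new stripes against each
 * device, build the on-disk chunk item and insert it into the chunk tree
 * (and, for SYSTEM chunks, into the superblock's sys_chunk_array).
 */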
3347 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
3348                                 struct btrfs_root *extent_root,
3349                                 struct map_lookup *map, u64 chunk_offset,
3350                                 u64 chunk_size, u64 stripe_size)
3351 {
3352         u64 dev_offset;
3353         struct btrfs_key key;
3354         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3355         struct btrfs_device *device;
3356         struct btrfs_chunk *chunk;
3357         struct btrfs_stripe *stripe;
3358         size_t item_size = btrfs_chunk_item_size(map->num_stripes);
3359         int index = 0;
3360         int ret;
3361
3362         chunk = kzalloc(item_size, GFP_NOFS);
3363         if (!chunk)
3364                 return -ENOMEM;
3365
3366         index = 0;
3367         while (index < map->num_stripes) {
3368                 device = map->stripes[index].dev;
3369                 device->bytes_used += stripe_size;
3370                 ret = btrfs_update_device(trans, device);
3371                 BUG_ON(ret);
3372                 index++;
3373         }
3374
3375         spin_lock(&extent_root->fs_info->free_chunk_lock);
3376         extent_root->fs_info->free_chunk_space -= (stripe_size *
3377                                                    map->num_stripes);
3378         spin_unlock(&extent_root->fs_info->free_chunk_lock);
3379
3380         index = 0;
3381         stripe = &chunk->stripe;
3382         while (index < map->num_stripes) {
3383                 device = map->stripes[index].dev;
3384                 dev_offset = map->stripes[index].physical;
3385
3386                 btrfs_set_stack_stripe_devid(stripe, device->devid);
3387                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
3388                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
3389                 stripe++;
3390                 index++;
3391         }
3392
3393         btrfs_set_stack_chunk_length(chunk, chunk_size);
3394         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
3395         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
3396         btrfs_set_stack_chunk_type(chunk, map->type);
3397         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
3398         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
3399         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
3400         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
3401         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
3402
3403         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3404         key.type = BTRFS_CHUNK_ITEM_KEY;
3405         key.offset = chunk_offset;
3406
3407         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
3408         BUG_ON(ret);
3409
3410         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3411                 ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
3412                                              item_size);
3413                 BUG_ON(ret);
3414         }
3415
3416         kfree(chunk);
3417         return 0;
3418 }
3419
3420 /*
3421  * Chunk allocation falls into two parts. The first part does the work
3422  * that makes the newly allocated chunk usable, but does not perform any
3423  * operation that modifies the chunk tree. The second part does the work
3424  * that requires modifying the chunk tree. This division is important for
3425  * the bootstrap process of adding storage to a seed btrfs.
3426  */
3427 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3428                       struct btrfs_root *extent_root, u64 type)
3429 {
3430         u64 chunk_offset;
3431         u64 chunk_size;
3432         u64 stripe_size;
3433         struct map_lookup *map;
3434         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3435         int ret;
3436
3437         ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3438                               &chunk_offset);
3439         if (ret)
3440                 return ret;
3441
3442         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3443                                   &stripe_size, chunk_offset, type);
3444         if (ret)
3445                 return ret;
3446
3447         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3448                                    chunk_size, stripe_size);
3449         BUG_ON(ret);
3450         return 0;
3451 }
3452
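/*
 * Bootstrap the allocation groups on the first writable device of a
 * filesystem: allocate a metadata chunk and a system chunk, add the device
 * item, and only then finish both chunk allocations, since finishing them
 * is what modifies the chunk tree.
 */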
3453 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3454                                          struct btrfs_root *root,
3455                                          struct btrfs_device *device)
3456 {
3457         u64 chunk_offset;
3458         u64 sys_chunk_offset;
3459         u64 chunk_size;
3460         u64 sys_chunk_size;
3461         u64 stripe_size;
3462         u64 sys_stripe_size;
3463         u64 alloc_profile;
3464         struct map_lookup *map;
3465         struct map_lookup *sys_map;
3466         struct btrfs_fs_info *fs_info = root->fs_info;
3467         struct btrfs_root *extent_root = fs_info->extent_root;
3468         int ret;
3469
3470         ret = find_next_chunk(fs_info->chunk_root,
3471                               BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
3472         if (ret)
3473                 return ret;
3474
3475         alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
3476                                 fs_info->avail_metadata_alloc_bits;
3477         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3478
3479         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3480                                   &stripe_size, chunk_offset, alloc_profile);
3481         BUG_ON(ret);
3482
3483         sys_chunk_offset = chunk_offset + chunk_size;
3484
3485         alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
3486                                 fs_info->avail_system_alloc_bits;
3487         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3488
3489         ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3490                                   &sys_chunk_size, &sys_stripe_size,
3491                                   sys_chunk_offset, alloc_profile);
3492         BUG_ON(ret);
3493
3494         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
3495         BUG_ON(ret);
3496
3497         /*
3498          * Modifying the chunk tree requires allocating new blocks from
3499          * both the system block group and the metadata block group, so
3500          * we can only perform operations that modify the chunk tree
3501          * after both block groups have been created.
3502          */
3503         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3504                                    chunk_size, stripe_size);
3505         BUG_ON(ret);
3506
3507         ret = __finish_chunk_alloc(trans, extent_root, sys_map,
3508                                    sys_chunk_offset, sys_chunk_size,
3509                                    sys_stripe_size);
3510         BUG_ON(ret);
3511         return 0;
3512 }
3513
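/*
 * Return 1 if the chunk at @chunk_offset has a stripe on a device that is
 * not writable, so the block group must stay read-only; return 0 otherwise.
 * When mounted with -o degraded the check is skipped entirely.
 */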
3514 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
3515 {
3516         struct extent_map *em;
3517         struct map_lookup *map;
3518         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
3519         int readonly = 0;
3520         int i;
3521
3522         read_lock(&map_tree->map_tree.lock);
3523         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3524         read_unlock(&map_tree->map_tree.lock);
3525         if (!em)
3526                 return 1;
3527
3528         if (btrfs_test_opt(root, DEGRADED)) {
3529                 free_extent_map(em);
3530                 return 0;
3531         }
3532
3533         map = (struct map_lookup *)em->bdev;
3534         for (i = 0; i < map->num_stripes; i++) {
3535                 if (!map->stripes[i].dev->writeable) {
3536                         readonly = 1;
3537                         break;
3538                 }
3539         }
3540         free_extent_map(em);
3541         return readonly;
3542 }
3543
3544 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
3545 {
3546         extent_map_tree_init(&tree->map_tree);
3547 }
3548
3549 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
3550 {
3551         struct extent_map *em;
3552
3553         while (1) {
3554                 write_lock(&tree->map_tree.lock);
3555                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
3556                 if (em)
3557                         remove_extent_mapping(&tree->map_tree, em);
3558                 write_unlock(&tree->map_tree.lock);
3559                 if (!em)
3560                         break;
3561                 kfree(em->bdev);
3562                 /* once for us */
3563                 free_extent_map(em);
3564                 /* once for the tree */
3565                 free_extent_map(em);
3566         }
3567 }
3568
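/*
 * Return how many copies of the data at [logical, logical + len) exist:
 * the full stripe count for DUP and RAID1, sub_stripes for RAID10, and 1
 * for everything else.
 */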
3569 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
3570 {
3571         struct extent_map *em;
3572         struct map_lookup *map;
3573         struct extent_map_tree *em_tree = &map_tree->map_tree;
3574         int ret;
3575
3576         read_lock(&em_tree->lock);
3577         em = lookup_extent_mapping(em_tree, logical, len);
3578         read_unlock(&em_tree->lock);
3579         BUG_ON(!em);
3580
3581         BUG_ON(em->start > logical || em->start + em->len < logical);
3582         map = (struct map_lookup *)em->bdev;
3583         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
3584                 ret = map->num_stripes;
3585         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3586                 ret = map->sub_stripes;
3587         else
3588                 ret = 1;
3589         free_extent_map(em);
3590         return ret;
3591 }
3592
3593 static int find_live_mirror(struct map_lookup *map, int first, int num,
3594                             int optimal)
3595 {
3596         int i;
3597         if (map->stripes[optimal].dev->bdev)
3598                 return optimal;
3599         for (i = first; i < first + num; i++) {
3600                 if (map->stripes[i].dev->bdev)
3601                         return i;
3602         }
3603         /* we couldn't find one that doesn't fail.  Just return something
3604          * and the io error handling code will clean up eventually
3605          */
3606         return optimal;
3607 }
3608
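/*
 * Map a logical range to the physical stripes backing it.  *length is
 * trimmed to what can be handled in one request, and when bbio_ret is
 * given, a btrfs_bio is filled with one stripe per device the request must
 * touch: every mirror for writes and discards, a single mirror (selectable
 * with mirror_num) for reads.
 */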
3609 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3610                              u64 logical, u64 *length,
3611                              struct btrfs_bio **bbio_ret,
3612                              int mirror_num)
3613 {
3614         struct extent_map *em;
3615         struct map_lookup *map;
3616         struct extent_map_tree *em_tree = &map_tree->map_tree;
3617         u64 offset;
3618         u64 stripe_offset;
3619         u64 stripe_end_offset;
3620         u64 stripe_nr;
3621         u64 stripe_nr_orig;
3622         u64 stripe_nr_end;
3623         int stripe_index;
3624         int i;
3625         int ret = 0;
3626         int num_stripes;
3627         int max_errors = 0;
3628         struct btrfs_bio *bbio = NULL;
3629
3630         read_lock(&em_tree->lock);
3631         em = lookup_extent_mapping(em_tree, logical, *length);
3632         read_unlock(&em_tree->lock);
3633
3634         if (!em) {
3635                 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
3636                        (unsigned long long)logical,
3637                        (unsigned long long)*length);
3638                 BUG();
3639         }
3640
3641         BUG_ON(em->start > logical || em->start + em->len < logical);
3642         map = (struct map_lookup *)em->bdev;
3643         offset = logical - em->start;
3644
3645         if (mirror_num > map->num_stripes)
3646                 mirror_num = 0;
3647
3648         stripe_nr = offset;
3649         /*
3650          * stripe_nr counts the total number of stripes we have to stride
3651          * to get to this block
3652          */
3653         do_div(stripe_nr, map->stripe_len);
3654
3655         stripe_offset = stripe_nr * map->stripe_len;
3656         BUG_ON(offset < stripe_offset);
3657
3658         /* stripe_offset is the offset of this block in its stripe */
3659         stripe_offset = offset - stripe_offset;
3660
3661         if (rw & REQ_DISCARD)
3662                 *length = min_t(u64, em->len - offset, *length);
3663         else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
3664                 /* we limit the length of each bio to what fits in a stripe */
3665                 *length = min_t(u64, em->len - offset,
3666                                 map->stripe_len - stripe_offset);
3667         } else {
3668                 *length = em->len - offset;
3669         }
3670
3671         if (!bbio_ret)
3672                 goto out;
3673
3674         num_stripes = 1;
3675         stripe_index = 0;
3676         stripe_nr_orig = stripe_nr;
3677         stripe_nr_end = (offset + *length + map->stripe_len - 1) &
3678                         (~(map->stripe_len - 1));
3679         do_div(stripe_nr_end, map->stripe_len);
3680         stripe_end_offset = stripe_nr_end * map->stripe_len -
3681                             (offset + *length);
3682         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3683                 if (rw & REQ_DISCARD)
3684                         num_stripes = min_t(u64, map->num_stripes,
3685                                             stripe_nr_end - stripe_nr_orig);
3686                 stripe_index = do_div(stripe_nr, map->num_stripes);
3687         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3688                 if (rw & (REQ_WRITE | REQ_DISCARD))
3689                         num_stripes = map->num_stripes;
3690                 else if (mirror_num)
3691                         stripe_index = mirror_num - 1;
3692                 else {
3693                         stripe_index = find_live_mirror(map, 0,
3694                                             map->num_stripes,
3695                                             current->pid % map->num_stripes);
3696                         mirror_num = stripe_index + 1;
3697                 }
3698
3699         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3700                 if (rw & (REQ_WRITE | REQ_DISCARD)) {
3701                         num_stripes = map->num_stripes;
3702                 } else if (mirror_num) {
3703                         stripe_index = mirror_num - 1;
3704                 } else {
3705                         mirror_num = 1;
3706                 }
3707
3708         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3709                 int factor = map->num_stripes / map->sub_stripes;
3710
3711                 stripe_index = do_div(stripe_nr, factor);
3712                 stripe_index *= map->sub_stripes;
3713
3714                 if (rw & REQ_WRITE)
3715                         num_stripes = map->sub_stripes;
3716                 else if (rw & REQ_DISCARD)
3717                         num_stripes = min_t(u64, map->sub_stripes *
3718                                             (stripe_nr_end - stripe_nr_orig),
3719                                             map->num_stripes);
3720                 else if (mirror_num)
3721                         stripe_index += mirror_num - 1;
3722                 else {
3723                         stripe_index = find_live_mirror(map, stripe_index,
3724                                               map->sub_stripes, stripe_index +
3725                                               current->pid % map->sub_stripes);
3726                         mirror_num = stripe_index + 1;
3727                 }
3728         } else {
3729                 /*
3730                  * after this do_div call, stripe_nr is the number of stripes
3731                  * on this device we have to walk to find the data, and
3732                  * stripe_index is the number of our device in the stripe array
3733                  */
3734                 stripe_index = do_div(stripe_nr, map->num_stripes);
3735                 mirror_num = stripe_index + 1;
3736         }
3737         BUG_ON(stripe_index >= map->num_stripes);
3738
3739         bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
3740         if (!bbio) {
3741                 ret = -ENOMEM;
3742                 goto out;
3743         }
3744         atomic_set(&bbio->error, 0);
3745
3746         if (rw & REQ_DISCARD) {
3747                 int factor = 0;
3748                 int sub_stripes = 0;
3749                 u64 stripes_per_dev = 0;
3750                 u32 remaining_stripes = 0;
3751
3752                 if (map->type &
3753                     (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3754                         if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3755                                 sub_stripes = 1;
3756                         else
3757                                 sub_stripes = map->sub_stripes;
3758
3759                         factor = map->num_stripes / sub_stripes;
3760                         stripes_per_dev = div_u64_rem(stripe_nr_end -
3761                                                       stripe_nr_orig,
3762                                                       factor,
3763                                                       &remaining_stripes);
3764                 }
3765
3766                 for (i = 0; i < num_stripes; i++) {
3767                         bbio->stripes[i].physical =
3768                                 map->stripes[stripe_index].physical +
3769                                 stripe_offset + stripe_nr * map->stripe_len;
3770                         bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3771
3772                         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3773                                          BTRFS_BLOCK_GROUP_RAID10)) {
3774                                 bbio->stripes[i].length = stripes_per_dev *
3775                                                           map->stripe_len;
3776                                 if (i / sub_stripes < remaining_stripes)
3777                                         bbio->stripes[i].length +=
3778                                                 map->stripe_len;
3779                                 if (i < sub_stripes)
3780                                         bbio->stripes[i].length -=
3781                                                 stripe_offset;
3782                                 if ((i / sub_stripes + 1) %
3783                                     sub_stripes == remaining_stripes)
3784                                         bbio->stripes[i].length -=
3785                                                 stripe_end_offset;
3786                                 if (i == sub_stripes - 1)
3787                                         stripe_offset = 0;
3788                         } else
3789                                 bbio->stripes[i].length = *length;
3790
3791                         stripe_index++;
3792                         if (stripe_index == map->num_stripes) {
3793                                 /* This could only happen for RAID0/10 */
3794                                 stripe_index = 0;
3795                                 stripe_nr++;
3796                         }
3797                 }
3798         } else {
3799                 for (i = 0; i < num_stripes; i++) {
3800                         bbio->stripes[i].physical =
3801                                 map->stripes[stripe_index].physical +
3802                                 stripe_offset +
3803                                 stripe_nr * map->stripe_len;
3804                         bbio->stripes[i].dev =
3805                                 map->stripes[stripe_index].dev;
3806                         stripe_index++;
3807                 }
3808         }
3809
3810         if (rw & REQ_WRITE) {
3811                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
3812                                  BTRFS_BLOCK_GROUP_RAID10 |
3813                                  BTRFS_BLOCK_GROUP_DUP)) {
3814                         max_errors = 1;
3815                 }
3816         }
3817
3818         *bbio_ret = bbio;
3819         bbio->num_stripes = num_stripes;
3820         bbio->max_errors = max_errors;
3821         bbio->mirror_num = mirror_num;
3822 out:
3823         free_extent_map(em);
3824         return ret;
3825 }
3826
3827 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3828                       u64 logical, u64 *length,
3829                       struct btrfs_bio **bbio_ret, int mirror_num)
3830 {
3831         return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3832                                  mirror_num);
3833 }
3834
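/*
 * Reverse mapping: given a physical offset (and optionally a devid) inside
 * the chunk that starts at @chunk_start, return the logical address(es)
 * that map to it, together with the chunk's stripe length.
 */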
3835 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3836                      u64 chunk_start, u64 physical, u64 devid,
3837                      u64 **logical, int *naddrs, int *stripe_len)
3838 {
3839         struct extent_map_tree *em_tree = &map_tree->map_tree;
3840         struct extent_map *em;
3841         struct map_lookup *map;
3842         u64 *buf;
3843         u64 bytenr;
3844         u64 length;
3845         u64 stripe_nr;
3846         int i, j, nr = 0;
3847
3848         read_lock(&em_tree->lock);
3849         em = lookup_extent_mapping(em_tree, chunk_start, 1);
3850         read_unlock(&em_tree->lock);
3851
3852         BUG_ON(!em || em->start != chunk_start);
3853         map = (struct map_lookup *)em->bdev;
3854
3855         length = em->len;
3856         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3857                 do_div(length, map->num_stripes / map->sub_stripes);
3858         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3859                 do_div(length, map->num_stripes);
3860
3861         buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
3862         BUG_ON(!buf);
3863
3864         for (i = 0; i < map->num_stripes; i++) {
3865                 if (devid && map->stripes[i].dev->devid != devid)
3866                         continue;
3867                 if (map->stripes[i].physical > physical ||
3868                     map->stripes[i].physical + length <= physical)
3869                         continue;
3870
3871                 stripe_nr = physical - map->stripes[i].physical;
3872                 do_div(stripe_nr, map->stripe_len);
3873
3874                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3875                         stripe_nr = stripe_nr * map->num_stripes + i;
3876                         do_div(stripe_nr, map->sub_stripes);
3877                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3878                         stripe_nr = stripe_nr * map->num_stripes + i;
3879                 }
3880                 bytenr = chunk_start + stripe_nr * map->stripe_len;
3881                 WARN_ON(nr >= map->num_stripes);
3882                 for (j = 0; j < nr; j++) {
3883                         if (buf[j] == bytenr)
3884                                 break;
3885                 }
3886                 if (j == nr) {
3887                         WARN_ON(nr >= map->num_stripes);
3888                         buf[nr++] = bytenr;
3889                 }
3890         }
3891
3892         *logical = buf;
3893         *naddrs = nr;
3894         *stripe_len = map->stripe_len;
3895
3896         free_extent_map(em);
3897         return 0;
3898 }
3899
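/*
 * Completion callback shared by all stripe bios built in btrfs_map_bio().
 * Errors are counted per btrfs_bio; once the last stripe completes, the
 * original bio is ended, reporting -EIO only if more stripes failed than
 * the profile can tolerate (bbio->max_errors).
 */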
3900 static void btrfs_end_bio(struct bio *bio, int err)
3901 {
3902         struct btrfs_bio *bbio = bio->bi_private;
3903         int is_orig_bio = 0;
3904
3905         if (err)
3906                 atomic_inc(&bbio->error);
3907
3908         if (bio == bbio->orig_bio)
3909                 is_orig_bio = 1;
3910
3911         if (atomic_dec_and_test(&bbio->stripes_pending)) {
3912                 if (!is_orig_bio) {
3913                         bio_put(bio);
3914                         bio = bbio->orig_bio;
3915                 }
3916                 bio->bi_private = bbio->private;
3917                 bio->bi_end_io = bbio->end_io;
3918                 bio->bi_bdev = (struct block_device *)
3919                                         (unsigned long)bbio->mirror_num;
3920                 /* only send an error to the higher layers if it is
3921                  * beyond the tolerance of the multi-bio
3922                  */
3923                 if (atomic_read(&bbio->error) > bbio->max_errors) {
3924                         err = -EIO;
3925                 } else {
3926                         /*
3927                          * this bio is actually up to date, we didn't
3928                          * go over the max number of errors
3929                          */
3930                         set_bit(BIO_UPTODATE, &bio->bi_flags);
3931                         err = 0;
3932                 }
3933                 kfree(bbio);
3934
3935                 bio_endio(bio, err);
3936         } else if (!is_orig_bio) {
3937                 bio_put(bio);
3938         }
3939 }
3940
3941 struct async_sched {
3942         struct bio *bio;
3943         int rw;
3944         struct btrfs_fs_info *info;
3945         struct btrfs_work work;
3946 };
3947
3948 /*
3949  * see run_scheduled_bios for a description of why bios are collected for
3950  * async submit.
3951  *
3952  * This will add one bio to the pending list for a device and make sure
3953  * the work struct is scheduled.
3954  */
3955 static noinline int schedule_bio(struct btrfs_root *root,
3956                                  struct btrfs_device *device,
3957                                  int rw, struct bio *bio)
3958 {
3959         int should_queue = 1;
3960         struct btrfs_pending_bios *pending_bios;
3961
3962         /* don't bother with additional async steps for reads, right now */
3963         if (!(rw & REQ_WRITE)) {
3964                 bio_get(bio);
3965                 submit_bio(rw, bio);
3966                 bio_put(bio);
3967                 return 0;
3968         }
3969
3970         /*
3971          * nr_async_bios allows us to reliably return congestion to the
3972          * higher layers.  Otherwise, the async bio makes it appear we have
3973          * made progress against dirty pages when we've really just put it
3974          * on a queue for later
3975          */
3976         atomic_inc(&root->fs_info->nr_async_bios);
3977         WARN_ON(bio->bi_next);
3978         bio->bi_next = NULL;
3979         bio->bi_rw |= rw;
3980
3981         spin_lock(&device->io_lock);
3982         if (bio->bi_rw & REQ_SYNC)
3983                 pending_bios = &device->pending_sync_bios;
3984         else
3985                 pending_bios = &device->pending_bios;
3986
3987         if (pending_bios->tail)
3988                 pending_bios->tail->bi_next = bio;
3989
3990         pending_bios->tail = bio;
3991         if (!pending_bios->head)
3992                 pending_bios->head = bio;
3993         if (device->running_pending)
3994                 should_queue = 0;
3995
3996         spin_unlock(&device->io_lock);
3997
3998         if (should_queue)
3999                 btrfs_queue_worker(&root->fs_info->submit_workers,
4000                                    &device->work);
4001         return 0;
4002 }
4003
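/*
 * Map @bio to the stripes it covers, clone it once per additional stripe
 * and submit every copy, going through schedule_bio() when async_submit is
 * set.  Stripes whose device is missing, or not writable for a write, are
 * completed immediately with -EIO.
 */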
4004 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4005                   int mirror_num, int async_submit)
4006 {
4007         struct btrfs_mapping_tree *map_tree;
4008         struct btrfs_device *dev;
4009         struct bio *first_bio = bio;
4010         u64 logical = (u64)bio->bi_sector << 9;
4011         u64 length = 0;
4012         u64 map_length;
4013         int ret;
4014         int dev_nr = 0;
4015         int total_devs = 1;
4016         struct btrfs_bio *bbio = NULL;
4017
4018         length = bio->bi_size;
4019         map_tree = &root->fs_info->mapping_tree;
4020         map_length = length;
4021
4022         ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
4023                               mirror_num);
4024         BUG_ON(ret);
4025
4026         total_devs = bbio->num_stripes;
4027         if (map_length < length) {
4028                 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
4029                        "len %llu\n", (unsigned long long)logical,
4030                        (unsigned long long)length,
4031                        (unsigned long long)map_length);
4032                 BUG();
4033         }
4034
4035         bbio->orig_bio = first_bio;
4036         bbio->private = first_bio->bi_private;
4037         bbio->end_io = first_bio->bi_end_io;
4038         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4039
4040         while (dev_nr < total_devs) {
4041                 if (dev_nr < total_devs - 1) {
4042                         bio = bio_clone(first_bio, GFP_NOFS);
4043                         BUG_ON(!bio);
4044                 } else {
4045                         bio = first_bio;
4046                 }
4047                 bio->bi_private = bbio;
4048                 bio->bi_end_io = btrfs_end_bio;
4049                 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4050                 dev = bbio->stripes[dev_nr].dev;
4051                 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4052                         pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4053                                  "(%s id %llu), size=%u\n", rw,
4054                                  (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4055                                  dev->name, dev->devid, bio->bi_size);
4056                         bio->bi_bdev = dev->bdev;
4057                         if (async_submit)
4058                                 schedule_bio(root, dev, rw, bio);
4059                         else
4060                                 submit_bio(rw, bio);
4061                 } else {
4062                         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4063                         bio->bi_sector = logical >> 9;
4064                         bio_endio(bio, -EIO);
4065                 }
4066                 dev_nr++;
4067         }
4068         return 0;
4069 }
4070
4071 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
4072                                        u8 *uuid, u8 *fsid)
4073 {
4074         struct btrfs_device *device;
4075         struct btrfs_fs_devices *cur_devices;
4076
4077         cur_devices = root->fs_info->fs_devices;
4078         while (cur_devices) {
4079                 if (!fsid ||
4080                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
4081                         device = __find_device(&cur_devices->devices,
4082                                                devid, uuid);
4083                         if (device)
4084                                 return device;
4085                 }
4086                 cur_devices = cur_devices->seed;
4087         }
4088         return NULL;
4089 }
4090
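/*
 * Create an in-memory stand-in for a device that the metadata references
 * but that was not found when the devices were scanned; it is flagged
 * ->missing and counted in missing_devices.
 */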
4091 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
4092                                             u64 devid, u8 *dev_uuid)
4093 {
4094         struct btrfs_device *device;
4095         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
4096
4097         device = kzalloc(sizeof(*device), GFP_NOFS);
4098         if (!device)
4099                 return NULL;
4100         list_add(&device->dev_list,
4101                  &fs_devices->devices);
4102         device->dev_root = root->fs_info->dev_root;
4103         device->devid = devid;
4104         device->work.func = pending_bios_fn;
4105         device->fs_devices = fs_devices;
4106         device->missing = 1;
4107         fs_devices->num_devices++;
4108         fs_devices->missing_devices++;
4109         spin_lock_init(&device->io_lock);
4110         INIT_LIST_HEAD(&device->dev_alloc_list);
4111         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
4112         return device;
4113 }
4114
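/*
 * Turn one on-disk chunk item into a map_lookup and add it to the mapping
 * tree, resolving each stripe to its btrfs_device (or to a missing-device
 * stub when mounted degraded).
 */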
4115 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4116                           struct extent_buffer *leaf,
4117                           struct btrfs_chunk *chunk)
4118 {
4119         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4120         struct map_lookup *map;
4121         struct extent_map *em;
4122         u64 logical;
4123         u64 length;
4124         u64 devid;
4125         u8 uuid[BTRFS_UUID_SIZE];
4126         int num_stripes;
4127         int ret;
4128         int i;
4129
4130         logical = key->offset;
4131         length = btrfs_chunk_length(leaf, chunk);
4132
4133         read_lock(&map_tree->map_tree.lock);
4134         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
4135         read_unlock(&map_tree->map_tree.lock);
4136
4137         /* already mapped? */
4138         if (em && em->start <= logical && em->start + em->len > logical) {
4139                 free_extent_map(em);
4140                 return 0;
4141         } else if (em) {
4142                 free_extent_map(em);
4143         }
4144
4145         em = alloc_extent_map();
4146         if (!em)
4147                 return -ENOMEM;
4148         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
4149         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4150         if (!map) {
4151                 free_extent_map(em);
4152                 return -ENOMEM;
4153         }
4154
4155         em->bdev = (struct block_device *)map;
4156         em->start = logical;
4157         em->len = length;
4158         em->block_start = 0;
4159         em->block_len = em->len;
4160
4161         map->num_stripes = num_stripes;
4162         map->io_width = btrfs_chunk_io_width(leaf, chunk);
4163         map->io_align = btrfs_chunk_io_align(leaf, chunk);
4164         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
4165         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
4166         map->type = btrfs_chunk_type(leaf, chunk);
4167         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
4168         for (i = 0; i < num_stripes; i++) {
4169                 map->stripes[i].physical =
4170                         btrfs_stripe_offset_nr(leaf, chunk, i);
4171                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
4172                 read_extent_buffer(leaf, uuid, (unsigned long)
4173                                    btrfs_stripe_dev_uuid_nr(chunk, i),
4174                                    BTRFS_UUID_SIZE);
4175                 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
4176                                                         NULL);
4177                 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
4178                         kfree(map);
4179                         free_extent_map(em);
4180                         return -EIO;
4181                 }
4182                 if (!map->stripes[i].dev) {
4183                         map->stripes[i].dev =
4184                                 add_missing_dev(root, devid, uuid);
4185                         if (!map->stripes[i].dev) {
4186                                 kfree(map);
4187                                 free_extent_map(em);
4188                                 return -EIO;
4189                         }
4190                 }
4191                 map->stripes[i].dev->in_fs_metadata = 1;
4192         }
4193
4194         write_lock(&map_tree->map_tree.lock);
4195         ret = add_extent_mapping(&map_tree->map_tree, em);
4196         write_unlock(&map_tree->map_tree.lock);
4197         BUG_ON(ret);
4198         free_extent_map(em);
4199
4200         return 0;
4201 }
4202
4203 static int fill_device_from_item(struct extent_buffer *leaf,
4204                                  struct btrfs_dev_item *dev_item,
4205                                  struct btrfs_device *device)
4206 {
4207         unsigned long ptr;
4208
4209         device->devid = btrfs_device_id(leaf, dev_item);
4210         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
4211         device->total_bytes = device->disk_total_bytes;
4212         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
4213         device->type = btrfs_device_type(leaf, dev_item);
4214         device->io_align = btrfs_device_io_align(leaf, dev_item);
4215         device->io_width = btrfs_device_io_width(leaf, dev_item);
4216         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
4217
4218         ptr = (unsigned long)btrfs_device_uuid(dev_item);
4219         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
4220
4221         return 0;
4222 }
4223
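/*
 * A device item carries the fsid of a seed filesystem: find that seed's
 * devices, clone and open them read-only, and link them into this
 * filesystem's ->seed chain.  Returns -ENOENT if the seed devices were
 * never scanned.
 */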
4224 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
4225 {
4226         struct btrfs_fs_devices *fs_devices;
4227         int ret;
4228
4229         BUG_ON(!mutex_is_locked(&uuid_mutex));
4230
4231         fs_devices = root->fs_info->fs_devices->seed;
4232         while (fs_devices) {
4233                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
4234                         ret = 0;
4235                         goto out;
4236                 }
4237                 fs_devices = fs_devices->seed;
4238         }
4239
4240         fs_devices = find_fsid(fsid);
4241         if (!fs_devices) {
4242                 ret = -ENOENT;
4243                 goto out;
4244         }
4245
4246         fs_devices = clone_fs_devices(fs_devices);
4247         if (IS_ERR(fs_devices)) {
4248                 ret = PTR_ERR(fs_devices);
4249                 goto out;
4250         }
4251
4252         ret = __btrfs_open_devices(fs_devices, FMODE_READ,
4253                                    root->fs_info->bdev_holder);
4254         if (ret)
4255                 goto out;
4256
4257         if (!fs_devices->seeding) {
4258                 __btrfs_close_devices(fs_devices);
4259                 free_fs_devices(fs_devices);
4260                 ret = -EINVAL;
4261                 goto out;
4262         }
4263
4264         fs_devices->seed = root->fs_info->fs_devices->seed;
4265         root->fs_info->fs_devices->seed = fs_devices;
4266 out:
4267         return ret;
4268 }
4269
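/*
 * Process one device item from the chunk tree: open its seed filesystem if
 * the fsid differs from ours, look up the matching btrfs_device (or fake
 * one when mounted degraded), copy the on-disk fields into it and update
 * the free space accounting for writable devices.
 */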
4270 static int read_one_dev(struct btrfs_root *root,
4271                         struct extent_buffer *leaf,
4272                         struct btrfs_dev_item *dev_item)
4273 {
4274         struct btrfs_device *device;
4275         u64 devid;
4276         int ret;
4277         u8 fs_uuid[BTRFS_UUID_SIZE];
4278         u8 dev_uuid[BTRFS_UUID_SIZE];
4279
4280         devid = btrfs_device_id(leaf, dev_item);
4281         read_extent_buffer(leaf, dev_uuid,
4282                            (unsigned long)btrfs_device_uuid(dev_item),
4283                            BTRFS_UUID_SIZE);
4284         read_extent_buffer(leaf, fs_uuid,
4285                            (unsigned long)btrfs_device_fsid(dev_item),
4286                            BTRFS_UUID_SIZE);
4287
4288         if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
4289                 ret = open_seed_devices(root, fs_uuid);
4290                 if (ret && !btrfs_test_opt(root, DEGRADED))
4291                         return ret;
4292         }
4293
4294         device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
4295         if (!device || !device->bdev) {
4296                 if (!btrfs_test_opt(root, DEGRADED))
4297                         return -EIO;
4298
4299                 if (!device) {
4300                         printk(KERN_WARNING "warning devid %llu missing\n",
4301                                (unsigned long long)devid);
4302                         device = add_missing_dev(root, devid, dev_uuid);
4303                         if (!device)
4304                                 return -ENOMEM;
4305                 } else if (!device->missing) {
4306                         /*
4307                          * this happens when a device that was properly set up
4308                          * in the device info lists suddenly goes bad.
4309                          * device->bdev is NULL, and so we have to set
4310                          * device->missing to one here
4311                          */
4312                         root->fs_info->fs_devices->missing_devices++;
4313                         device->missing = 1;
4314                 }
4315         }
4316
4317         if (device->fs_devices != root->fs_info->fs_devices) {
4318                 BUG_ON(device->writeable);
4319                 if (device->generation !=
4320                     btrfs_device_generation(leaf, dev_item))
4321                         return -EINVAL;
4322         }
4323
4324         fill_device_from_item(leaf, dev_item, device);
4325         device->dev_root = root->fs_info->dev_root;
4326         device->in_fs_metadata = 1;
4327         if (device->writeable) {
4328                 device->fs_devices->total_rw_bytes += device->total_bytes;
4329                 spin_lock(&root->fs_info->free_chunk_lock);
4330                 root->fs_info->free_chunk_space += device->total_bytes -
4331                         device->bytes_used;
4332                 spin_unlock(&root->fs_info->free_chunk_lock);
4333         }
4334         ret = 0;
4335         return ret;
4336 }
4337
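/*
 * Walk the sys_chunk_array embedded in the superblock and map every system
 * chunk it describes, so that the chunk tree itself can be read.  Anything
 * other than a chunk item in the array is treated as corruption.
 */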
4338 int btrfs_read_sys_array(struct btrfs_root *root)
4339 {
4340         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
4341         struct extent_buffer *sb;
4342         struct btrfs_disk_key *disk_key;
4343         struct btrfs_chunk *chunk;
4344         u8 *ptr;
4345         unsigned long sb_ptr;
4346         int ret = 0;
4347         u32 num_stripes;
4348         u32 array_size;
4349         u32 len = 0;
4350         u32 cur;
4351         struct btrfs_key key;
4352
4353         sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
4354                                           BTRFS_SUPER_INFO_SIZE);
4355         if (!sb)
4356                 return -ENOMEM;
4357         btrfs_set_buffer_uptodate(sb);
4358         btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
4359
4360         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
4361         array_size = btrfs_super_sys_array_size(super_copy);
4362
4363         ptr = super_copy->sys_chunk_array;
4364         sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
4365         cur = 0;
4366
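        /*
         * each array entry is a disk key immediately followed by a chunk
         * item whose size depends on its stripe count
         */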
4367         while (cur < array_size) {
4368                 disk_key = (struct btrfs_disk_key *)ptr;
4369                 btrfs_disk_key_to_cpu(&key, disk_key);
4370
4371                 len = sizeof(*disk_key); ptr += len;
4372                 sb_ptr += len;
4373                 cur += len;
4374
4375                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
4376                         chunk = (struct btrfs_chunk *)sb_ptr;
4377                         ret = read_one_chunk(root, &key, sb, chunk);
4378                         if (ret)
4379                                 break;
4380                         num_stripes = btrfs_chunk_num_stripes(sb, chunk);
4381                         len = btrfs_chunk_item_size(num_stripes);
4382                 } else {
4383                         ret = -EIO;
4384                         break;
4385                 }
4386                 ptr += len;
4387                 sb_ptr += len;
4388                 cur += len;
4389         }
4390         free_extent_buffer(sb);
4391         return ret;
4392 }
4393
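/*
 * btrfs_read_chunk_tree - populate device and chunk state from the chunk root
 *
 * Walks the chunk root with uuid_mutex and the per-fs chunk mutex held and
 * feeds every dev item and chunk item it finds to read_one_dev() and
 * read_one_chunk().  Returns 0 on success or a negative errno.
 */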
4394 int btrfs_read_chunk_tree(struct btrfs_root *root)
4395 {
4396         struct btrfs_path *path;
4397         struct extent_buffer *leaf;
4398         struct btrfs_key key;
4399         struct btrfs_key found_key;
4400         int ret;
4401         int slot;
4402
4403         root = root->fs_info->chunk_root;
4404
4405         path = btrfs_alloc_path();
4406         if (!path)
4407                 return -ENOMEM;
4408
4409         mutex_lock(&uuid_mutex);
4410         lock_chunks(root);
4411
4412         /* first we search for all of the device items, and then we
4413          * read in all of the chunk items.  This way we can create chunk
4414          * mappings that reference all of the devices that are found
4415          */
4416         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
4417         key.offset = 0;
4418         key.type = 0;
4419 again:
4420         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4421         if (ret < 0)
4422                 goto error;
4423         while (1) {
4424                 leaf = path->nodes[0];
4425                 slot = path->slots[0];
4426                 if (slot >= btrfs_header_nritems(leaf)) {
4427                         ret = btrfs_next_leaf(root, path);
4428                         if (ret == 0)
4429                                 continue;
4430                         if (ret < 0)
4431                                 goto error;
4432                         break;
4433                 }
4434                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4435                 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4436                         if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
4437                                 break;
4438                         if (found_key.type == BTRFS_DEV_ITEM_KEY) {
4439                                 struct btrfs_dev_item *dev_item;
4440                                 dev_item = btrfs_item_ptr(leaf, slot,
4441                                                   struct btrfs_dev_item);
4442                                 ret = read_one_dev(root, leaf, dev_item);
4443                                 if (ret)
4444                                         goto error;
4445                         }
4446                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
4447                         struct btrfs_chunk *chunk;
4448                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
4449                         ret = read_one_chunk(root, &found_key, leaf, chunk);
4450                         if (ret)
4451                                 goto error;
4452                 }
4453                 path->slots[0]++;
4454         }
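        /*
         * after the first pass over the device items, restart from the
         * beginning of the tree to pick up the chunk items
         */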
4455         if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4456                 key.objectid = 0;
4457                 btrfs_release_path(path);
4458                 goto again;
4459         }
4460         ret = 0;
4461 error:
4462         unlock_chunks(root);
4463         mutex_unlock(&uuid_mutex);
4464
4465         btrfs_free_path(path);
4466         return ret;
4467 }