/*
 * Btrfs: fix kfree on list_head in btrfs_lookup_csums_range error cleanup
 * [firefly-linux-kernel-4.4.55.git] / fs / btrfs / ctree.c
 */
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/rbtree.h>
22 #include "ctree.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "print-tree.h"
26 #include "locking.h"
27
28 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
29                       *root, struct btrfs_path *path, int level);
30 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
31                       *root, struct btrfs_key *ins_key,
32                       struct btrfs_path *path, int data_size, int extend);
33 static int push_node_left(struct btrfs_trans_handle *trans,
34                           struct btrfs_root *root, struct extent_buffer *dst,
35                           struct extent_buffer *src, int empty);
36 static int balance_node_right(struct btrfs_trans_handle *trans,
37                               struct btrfs_root *root,
38                               struct extent_buffer *dst_buf,
39                               struct extent_buffer *src_buf);
40 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
41                     int level, int slot);
42 static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
43                                  struct extent_buffer *eb);
44 static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
45
46 struct btrfs_path *btrfs_alloc_path(void)
47 {
48         struct btrfs_path *path;
49         path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
50         return path;
51 }
52
53 /*
54  * set all locked nodes in the path to blocking locks.  This should
55  * be done before scheduling
56  */
57 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
58 {
59         int i;
60         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
61                 if (!p->nodes[i] || !p->locks[i])
62                         continue;
63                 btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
64                 if (p->locks[i] == BTRFS_READ_LOCK)
65                         p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
66                 else if (p->locks[i] == BTRFS_WRITE_LOCK)
67                         p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
68         }
69 }
70
/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);
#endif

	/* walk top-down so the spinning locks are re-taken root-first */
	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* undo the blocking state we forced onto 'held' above */
	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
#endif
}
116
117 /* this also releases the path */
118 void btrfs_free_path(struct btrfs_path *p)
119 {
120         if (!p)
121                 return;
122         btrfs_release_path(p);
123         kmem_cache_free(btrfs_path_cachep, p);
124 }
125
126 /*
127  * path release drops references on the extent buffers in the path
128  * and it drops any locks held by this path
129  *
130  * It is safe to call this on paths that no locks or extent buffers held.
131  */
132 noinline void btrfs_release_path(struct btrfs_path *p)
133 {
134         int i;
135
136         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
137                 p->slots[i] = 0;
138                 if (!p->nodes[i])
139                         continue;
140                 if (p->locks[i]) {
141                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
142                         p->locks[i] = 0;
143                 }
144                 free_extent_buffer(p->nodes[i]);
145                 p->nodes[i] = NULL;
146         }
147 }
148
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was cow'ed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			/* got a reference before the buffer could die */
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		/* wait for the cow to settle, then retry the dereference */
		synchronize_rcu();
	}
	return eb;
}
182
183 /* loop around taking references on and locking the root node of the
184  * tree until you end up with a lock on the root.  A locked buffer
185  * is returned, with a reference held.
186  */
187 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
188 {
189         struct extent_buffer *eb;
190
191         while (1) {
192                 eb = btrfs_root_node(root);
193                 btrfs_tree_lock(eb);
194                 if (eb == root->node)
195                         break;
196                 btrfs_tree_unlock(eb);
197                 free_extent_buffer(eb);
198         }
199         return eb;
200 }
201
202 /* loop around taking references on and locking the root node of the
203  * tree until you end up with a lock on the root.  A locked buffer
204  * is returned, with a reference held.
205  */
206 static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
207 {
208         struct extent_buffer *eb;
209
210         while (1) {
211                 eb = btrfs_root_node(root);
212                 btrfs_tree_read_lock(eb);
213                 if (eb == root->node)
214                         break;
215                 btrfs_tree_read_unlock(eb);
216                 free_extent_buffer(eb);
217         }
218         return eb;
219 }
220
221 /* cowonly root (everything not a reference counted cow subvolume), just get
222  * put onto a simple dirty list.  transaction.c walks this to make sure they
223  * get properly updated on disk.
224  */
225 static void add_root_to_dirty_list(struct btrfs_root *root)
226 {
227         spin_lock(&root->fs_info->trans_lock);
228         if (root->track_dirty && list_empty(&root->dirty_list)) {
229                 list_add(&root->dirty_list,
230                          &root->fs_info->dirty_cowonly_roots);
231         }
232         spin_unlock(&root->fs_info->trans_lock);
233 }
234
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* reference counted trees may only be copied inside the transaction
	 * that last modified them */
	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	/* the first key of the buffer seeds the new block's allocation hint */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
				     new_root_objectid, &disk_key, level,
				     buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* clone the buffer, then rewrite the header for the new tree */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* take extent references for everything the copied root points to */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0, 1);

	/* NOTE(review): on btrfs_inc_ref failure 'cow' is not released here;
	 * presumably the aborted transaction cleans it up -- verify */
	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
294
/* kinds of modifications recorded in the tree mod log */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};
304
/* payload for MOD_LOG_MOVE_KEYS: where the keys went and how many */
struct tree_mod_move {
	int dst_slot;
	int nr_items;
};
309
/* payload for MOD_LOG_ROOT_REPLACE: the root node that was replaced */
struct tree_mod_root {
	u64 logical;
	u8 level;
};
314
/*
 * one entry in the tree mod log rb tree, keyed on (index, seq).
 * Which payload fields are valid depends on 'op'.
 */
struct tree_mod_elem {
	struct rb_node node;
	u64 index;		/* shifted logical */
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
337
/* take the tree mod log lock shared, for lookups in the log */
static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
	read_lock(&fs_info->tree_mod_log_lock);
}
342
/* drop the shared tree mod log lock */
static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
	read_unlock(&fs_info->tree_mod_log_lock);
}
347
/* take the tree mod log lock exclusive, for insertions and pruning */
static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->tree_mod_log_lock);
}
352
/* drop the exclusive tree mod log lock */
static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
	write_unlock(&fs_info->tree_mod_log_lock);
}
357
358 /*
359  * Increment the upper half of tree_mod_seq, set lower half zero.
360  *
361  * Must be called with fs_info->tree_mod_seq_lock held.
362  */
363 static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
364 {
365         u64 seq = atomic64_read(&fs_info->tree_mod_seq);
366         seq &= 0xffffffff00000000ull;
367         seq += 1ull << 32;
368         atomic64_set(&fs_info->tree_mod_seq, seq);
369         return seq;
370 }
371
/*
 * Increment the lower half of tree_mod_seq.
 *
 * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
 * are generated should not technically require a spin lock here. (Rationale:
 * incrementing the minor while incrementing the major seq number is between its
 * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
 * just returns a unique sequence number as usual.) We have decided to leave
 * that requirement in here and rethink it once we notice it really imposes a
 * problem on some workload.
 */
static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
387
/*
 * return the last minor in the previous major tree_mod_seq number
 */
u64 btrfs_tree_mod_seq_prev(u64 seq)
{
	/* clear the minor half, then step one back */
	return ((seq >> 32) << 32) - 1ull;
}
395
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	u64 seq;

	/* lock order: tree_mod_log write lock outside tree_mod_seq_lock */
	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		/* new blocker: give it a fresh major and put it on the list */
		elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	seq = btrfs_inc_tree_mod_seq_minor(fs_info);
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return seq;
}
421
/*
 * drop the blocker @elem and, if it was the oldest one, prune every tree
 * mod log entry that is no longer protected by any remaining blocker.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	/* elem never got a sequence number; nothing to drop */
	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tree_mod_log_write_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* grab the successor before the node is erased */
		next = rb_next(node);
		tm = container_of(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	tree_mod_log_write_unlock(fs_info);
}
471
/*
 * key order of the log:
 *       index -> sequence
 *
 * the index is the shifted logical of the *new* root node for root replace
 * operations, or the shifted logical of the affected block for all other
 * operations.
 *
 * The caller must hold the tree mod log write lock.  Takes ownership of
 * @tm: it is freed on the -EEXIST path.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	BUG_ON(!tm || !tm->seq);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	/*
	 * note the inverted comparisons (smaller keys descend right); the
	 * same convention is used by __tree_mod_log_search, so lookups and
	 * insertions agree on the ordering.
	 */
	while (*new) {
		cur = container_of(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->index < tm->index)
			new = &((*new)->rb_left);
		else if (cur->index > tm->index)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else {
			/* duplicate (index, seq) pair: drop the new element */
			kfree(tm);
			return -EEXIST;
		}
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
513
514 /*
515  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
516  * returns zero with the tree_mod_log_lock acquired. The caller must hold
517  * this until all tree mod log insertions are recorded in the rb tree and then
518  * call tree_mod_log_write_unlock() to release.
519  */
520 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
521                                     struct extent_buffer *eb) {
522         smp_mb();
523         if (list_empty(&(fs_info)->tree_mod_seq_list))
524                 return 1;
525         if (eb && btrfs_header_level(eb) == 0)
526                 return 1;
527
528         tree_mod_log_write_lock(fs_info);
529         if (list_empty(&fs_info->tree_mod_seq_list)) {
530                 /*
531                  * someone emptied the list while we were waiting for the lock.
532                  * we must not add to the list when no blocker exists.
533                  */
534                 tree_mod_log_write_unlock(fs_info);
535                 return 1;
536         }
537
538         return 0;
539 }
540
/*
 * This allocates memory and gets a tree modification sequence number.
 *
 * Returns <0 on error.
 * Returns >0 (the added sequence number) on success.
 *
 * NOTE(review): the u64 sequence is squeezed through an int return; once
 * the counter grows past INT_MAX a valid sequence could look negative to
 * callers that only test ret < 0 -- verify this cannot happen in practice.
 */
static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
				 struct tree_mod_elem **tm_ret)
{
	struct tree_mod_elem *tm;

	/*
	 * once we switch from spin locks to something different, we should
	 * honor the flags parameter here.
	 */
	tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
	if (!tm)
		return -ENOMEM;

	spin_lock(&fs_info->tree_mod_seq_lock);
	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
	spin_unlock(&fs_info->tree_mod_seq_lock);

	return tm->seq;
}
566
/*
 * build and insert a MOD_LOG_KEY_* element for @slot of @eb.
 * Caller must hold the tree mod log write lock.
 */
static inline int
__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot,
			  enum mod_log_op op, gfp_t flags)
{
	int ret;
	struct tree_mod_elem *tm;

	ret = tree_mod_alloc(fs_info, flags, &tm);
	if (ret < 0)
		return ret;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	/* for anything but an add, remember what used to live in the slot */
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);

	return __tree_mod_log_insert(fs_info, tm);
}
590
591 static noinline int
592 tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
593                              struct extent_buffer *eb, int slot,
594                              enum mod_log_op op, gfp_t flags)
595 {
596         int ret;
597
598         if (tree_mod_dont_log(fs_info, eb))
599                 return 0;
600
601         ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
602
603         tree_mod_log_write_unlock(fs_info);
604         return ret;
605 }
606
/* convenience wrapper: log a key operation with GFP_NOFS allocation */
static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
			int slot, enum mod_log_op op)
{
	return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
}
613
/*
 * like tree_mod_log_insert_key, but the caller already holds the tree
 * mod log write lock
 */
static noinline int
tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
			     struct extent_buffer *eb, int slot,
			     enum mod_log_op op)
{
	return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
}
621
/*
 * log a move of @nr_items keys from @src_slot to @dst_slot inside @eb.
 * Keys overwritten by the move are logged as removals first.
 * Returns 0 if logging is not needed, <0 on error.
 */
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;
	int i;

	if (tree_mod_dont_log(fs_info, eb))
		return 0;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
					      MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		BUG_ON(ret < 0);
	}

	ret = tree_mod_alloc(fs_info, flags, &tm);
	if (ret < 0)
		goto out;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	ret = __tree_mod_log_insert(fs_info, tm);
out:
	/* tree_mod_dont_log returned 0 above, so we hold the write lock */
	tree_mod_log_write_unlock(fs_info);
	return ret;
}
660
/*
 * log the removal of every key of a node that is about to be freed.
 * Leaves are not logged.  Caller must hold the tree mod log write lock.
 */
static inline void
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	int i;
	u32 nritems;
	int ret;

	if (btrfs_header_level(eb) == 0)
		return;

	/* log in reverse slot order so replay can rebuild front-to-back */
	nritems = btrfs_header_nritems(eb);
	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
					      MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		BUG_ON(ret < 0);
	}
}
678
/*
 * log the replacement of @old_root by @new_root.  When @log_removal is
 * set, the keys of the old root are logged as removed first.
 * Returns 0 if logging is not needed, <0 on error.
 */
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root, gfp_t flags,
			 int log_removal)
{
	struct tree_mod_elem *tm;
	int ret;

	if (tree_mod_dont_log(fs_info, NULL))
		return 0;

	if (log_removal)
		__tree_mod_log_free_eb(fs_info, old_root);

	ret = tree_mod_alloc(fs_info, flags, &tm);
	if (ret < 0)
		goto out;

	/* root replace elements are keyed on the *new* root's logical */
	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	ret = __tree_mod_log_insert(fs_info, tm);
out:
	/* tree_mod_dont_log returned 0 above, so we hold the write lock */
	tree_mod_log_write_unlock(fs_info);
	return ret;
}
709
/*
 * find a tree mod log element for the block at @start with seq >= @min_seq.
 * With @smallest set, return the element with the smallest such seq,
 * otherwise the one with the largest.  Returns NULL when nothing matches.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;
	u64 index = start >> PAGE_CACHE_SHIFT;

	tree_mod_log_read_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	/* same inverted (index, seq) ordering as __tree_mod_log_insert */
	while (node) {
		cur = container_of(node, struct tree_mod_elem, node);
		if (cur->index < index) {
			node = node->rb_left;
		} else if (cur->index > index) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* exact seq match: cannot do better */
			found = cur;
			break;
		}
	}
	tree_mod_log_read_unlock(fs_info);

	return found;
}
752
/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
764
/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
775
/*
 * log the copy of @nr_items keys from @src (starting at @src_offset) into
 * @dst (starting at @dst_offset): a removal from src plus an add to dst
 * for each item.  Copies between two leaves are not logged.
 */
static noinline void
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret;
	int i;

	if (tree_mod_dont_log(fs_info, NULL))
		return;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
		tree_mod_log_write_unlock(fs_info);
		return;
	}

	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert_key_locked(fs_info, src,
						i + src_offset,
						MOD_LOG_KEY_REMOVE);
		BUG_ON(ret < 0);
		ret = tree_mod_log_insert_key_locked(fs_info, dst,
						     i + dst_offset,
						     MOD_LOG_KEY_ADD);
		BUG_ON(ret < 0);
	}

	tree_mod_log_write_unlock(fs_info);
}
805
806 static inline void
807 tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
808                      int dst_offset, int src_offset, int nr_items)
809 {
810         int ret;
811         ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
812                                        nr_items, GFP_NOFS);
813         BUG_ON(ret < 0);
814 }
815
/*
 * log the replacement of the key in @slot of @eb; with @atomic set the
 * element is allocated with GFP_ATOMIC.  A logging failure is fatal.
 */
static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot, int atomic)
{
	int ret;

	ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
					   MOD_LOG_KEY_REPLACE,
					   atomic ? GFP_ATOMIC : GFP_NOFS);
	BUG_ON(ret < 0);
}
827
828 static noinline void
829 tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
830 {
831         if (tree_mod_dont_log(fs_info, eb))
832                 return;
833
834         __tree_mod_log_free_eb(fs_info, eb);
835
836         tree_mod_log_write_unlock(fs_info);
837 }
838
/*
 * log that root->node is being replaced by @new_root_node; a logging
 * failure is fatal.  @log_removal requests logging the old root's keys
 * as removed.
 */
static noinline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
			      struct extent_buffer *new_root_node,
			      int log_removal)
{
	int ret;
	ret = tree_mod_log_insert_root(root->fs_info, root->node,
				       new_root_node, GFP_NOFS, log_removal);
	BUG_ON(ret < 0);
}
849
/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (root->ref_cows &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	/* old-format backrefs cannot prove exclusivity; assume shared */
	if (root->ref_cows &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}
875
/*
 * Adjust extent backrefs when @buf is COWed into @cow.
 *
 * Depending on whether @buf is shared, references on the children of @buf
 * are added for @cow and/or dropped from @buf, and the FULL_BACKREF flag
 * may be set on @buf's extent.  *@last_ref is set to 1 when this COW drops
 * the last reference to @buf, so the caller knows it may free the block.
 *
 * Returns 0 on success or a negative errno from the extent tree lookups
 * and updates.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/*
			 * A potentially shared block with zero references
			 * means the extent tree is corrupted; force the
			 * filesystem read-only.
			 */
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		/* block stays shared: keep references on buf's children */
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1, 1);
			BUG_ON(ret); /* -ENOMEM */

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0, 1);
				BUG_ON(ret); /* -ENOMEM */
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
				BUG_ON(ret); /* -ENOMEM */
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		/* last reference: move buf's child references over to cow */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
			BUG_ON(ret); /* -ENOMEM */
			ret = btrfs_dec_ref(trans, root, buf, 1, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		clean_tree_block(trans, root, buf);
		*last_ref = 1;
	}
	return 0;
}
979
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 *
 * Returns 0 on success; on failure the transaction may already have been
 * aborted (when the backref update fails) or a PTR_ERR from block
 * allocation is returned.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start;

	/* when the caller passed buf itself as *cow_ret, unlock it on exit */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* first key of the block, used as a hint for the allocator */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
				     root->root_key.objectid, &disk_key,
				     level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	/* move/duplicate the extent backrefs from buf to cow */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}

	if (root->ref_cows)
		btrfs_reloc_cow_block(trans, root, buf, cow);

	if (buf == root->node) {
		/* COWing the root: swap the root pointer under RCU */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;

		extent_buffer_get(cow);
		tree_mod_log_set_root_pointer(root, cow, 1);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* non-root: redirect the parent's pointer to the new copy */
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
					MOD_LOG_KEY_REPLACE);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref)
			tree_mod_log_free_eb(root->fs_info, buf);
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
1104
1105 /*
1106  * returns the logical address of the oldest predecessor of the given root.
1107  * entries older than time_seq are ignored.
1108  */
1109 static struct tree_mod_elem *
1110 __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1111                            struct extent_buffer *eb_root, u64 time_seq)
1112 {
1113         struct tree_mod_elem *tm;
1114         struct tree_mod_elem *found = NULL;
1115         u64 root_logical = eb_root->start;
1116         int looped = 0;
1117
1118         if (!time_seq)
1119                 return 0;
1120
1121         /*
1122          * the very last operation that's logged for a root is the replacement
1123          * operation (if it is replaced at all). this has the index of the *new*
1124          * root, making it the very first operation that's logged for this root.
1125          */
1126         while (1) {
1127                 tm = tree_mod_log_search_oldest(fs_info, root_logical,
1128                                                 time_seq);
1129                 if (!looped && !tm)
1130                         return 0;
1131                 /*
1132                  * if there are no tree operation for the oldest root, we simply
1133                  * return it. this should only happen if that (old) root is at
1134                  * level 0.
1135                  */
1136                 if (!tm)
1137                         break;
1138
1139                 /*
1140                  * if there's an operation that's not a root replacement, we
1141                  * found the oldest version of our root. normally, we'll find a
1142                  * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1143                  */
1144                 if (tm->op != MOD_LOG_ROOT_REPLACE)
1145                         break;
1146
1147                 found = tm;
1148                 root_logical = tm->old_root.logical;
1149                 looped = 1;
1150         }
1151
1152         /* if there's no old root to return, return what we found instead */
1153         if (!found)
1154                 found = tm;
1155
1156         return found;
1157 }
1158
/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewinded (until we reach something older than
 * time_seq).
 *
 * The rewind mutates @eb in place: key pointers and the item count are
 * restored to the state they had at @time_seq.
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	tree_mod_log_read_lock(fs_info);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			/* undoing a removal: slot must lie past current end */
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			/* re-insert the removed key pointer and grow nritems */
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			/* restore the pre-replace key pointer in place */
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			/* undo the move: copy back from dst_slot to slot */
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = container_of(next, struct tree_mod_elem, node);
		/* stop once the log entry refers to a different block */
		if (tm->index != first_tm->index)
			break;
	}
	tree_mod_log_read_unlock(fs_info);
	btrfs_set_header_nritems(eb, n);
}
1234
/*
 * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		    u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	/* leaves carry no key pointers to rewind */
	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/* the block was freed; rebuild it from scratch in a dummy eb */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(eb->start,
						fs_info->tree_root->nodesize);
		BUG_ON(!eb_rewin);
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* the block still exists; rewind on a private copy */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		BUG_ON(!eb_rewin);
	}

	extent_buffer_get(eb_rewin);
	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	/*
	 * NOTE(review): this is the second extent_buffer_get() on eb_rewin
	 * (the first is a few lines above, before eb is dropped).  That looks
	 * like one reference too many, i.e. a potential extent buffer leak —
	 * TODO verify against the extent_buffer refcounting rules / upstream
	 * history before changing.
	 */
	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));

	return eb_rewin;
}
1286
1287 /*
1288  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1289  * value. If there are no changes, the current root->root_node is returned. If
1290  * anything changed in between, there's a fresh buffer allocated on which the
1291  * rewind operations are done. In any case, the returned buffer is read locked.
1292  * Returns NULL on error (with no locks held).
1293  */
1294 static inline struct extent_buffer *
1295 get_old_root(struct btrfs_root *root, u64 time_seq)
1296 {
1297         struct tree_mod_elem *tm;
1298         struct extent_buffer *eb = NULL;
1299         struct extent_buffer *eb_root;
1300         struct extent_buffer *old;
1301         struct tree_mod_root *old_root = NULL;
1302         u64 old_generation = 0;
1303         u64 logical;
1304         u32 blocksize;
1305
1306         eb_root = btrfs_read_lock_root_node(root);
1307         tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1308         if (!tm)
1309                 return eb_root;
1310
1311         if (tm->op == MOD_LOG_ROOT_REPLACE) {
1312                 old_root = &tm->old_root;
1313                 old_generation = tm->generation;
1314                 logical = old_root->logical;
1315         } else {
1316                 logical = eb_root->start;
1317         }
1318
1319         tm = tree_mod_log_search(root->fs_info, logical, time_seq);
1320         if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1321                 btrfs_tree_read_unlock(eb_root);
1322                 free_extent_buffer(eb_root);
1323                 blocksize = btrfs_level_size(root, old_root->level);
1324                 old = read_tree_block(root, logical, blocksize, 0);
1325                 if (!old || !extent_buffer_uptodate(old)) {
1326                         free_extent_buffer(old);
1327                         pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
1328                                 logical);
1329                         WARN_ON(1);
1330                 } else {
1331                         eb = btrfs_clone_extent_buffer(old);
1332                         free_extent_buffer(old);
1333                 }
1334         } else if (old_root) {
1335                 btrfs_tree_read_unlock(eb_root);
1336                 free_extent_buffer(eb_root);
1337                 eb = alloc_dummy_extent_buffer(logical, root->nodesize);
1338         } else {
1339                 eb = btrfs_clone_extent_buffer(eb_root);
1340                 btrfs_tree_read_unlock(eb_root);
1341                 free_extent_buffer(eb_root);
1342         }
1343
1344         if (!eb)
1345                 return NULL;
1346         extent_buffer_get(eb);
1347         btrfs_tree_read_lock(eb);
1348         if (old_root) {
1349                 btrfs_set_header_bytenr(eb, eb->start);
1350                 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1351                 btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1352                 btrfs_set_header_level(eb, old_root->level);
1353                 btrfs_set_header_generation(eb, old_generation);
1354         }
1355         if (tm)
1356                 __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
1357         else
1358                 WARN_ON(btrfs_header_level(eb) != 0);
1359         WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
1360
1361         return eb;
1362 }
1363
1364 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1365 {
1366         struct tree_mod_elem *tm;
1367         int level;
1368         struct extent_buffer *eb_root = btrfs_root_node(root);
1369
1370         tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1371         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1372                 level = tm->old_root.level;
1373         } else {
1374                 level = btrfs_header_level(eb_root);
1375         }
1376         free_extent_buffer(eb_root);
1377
1378         return level;
1379 }
1380
1381 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1382                                    struct btrfs_root *root,
1383                                    struct extent_buffer *buf)
1384 {
1385         /* ensure we can see the force_cow */
1386         smp_rmb();
1387
1388         /*
1389          * We do not need to cow a block if
1390          * 1) this block is not created or changed in this transaction;
1391          * 2) this block does not belong to TREE_RELOC tree;
1392          * 3) the root is not forced COW.
1393          *
1394          * What is forced COW:
1395          *    when we create snapshot during commiting the transaction,
1396          *    after we've finished coping src root, we must COW the shared
1397          *    block to ensure the metadata consistency.
1398          */
1399         if (btrfs_header_generation(buf) == trans->transid &&
1400             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1401             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1402               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1403             !root->force_cow)
1404                 return 0;
1405         return 1;
1406 }
1407
1408 /*
1409  * cows a single block, see __btrfs_cow_block for the real work.
1410  * This version of it has extra checks so that a block isn't cow'd more than
1411  * once per transaction, as long as it hasn't been written yet
1412  */
1413 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1414                     struct btrfs_root *root, struct extent_buffer *buf,
1415                     struct extent_buffer *parent, int parent_slot,
1416                     struct extent_buffer **cow_ret)
1417 {
1418         u64 search_start;
1419         int ret;
1420
1421         if (trans->transaction != root->fs_info->running_transaction)
1422                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1423                        (unsigned long long)trans->transid,
1424                        (unsigned long long)
1425                        root->fs_info->running_transaction->transid);
1426
1427         if (trans->transid != root->fs_info->generation)
1428                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1429                        (unsigned long long)trans->transid,
1430                        (unsigned long long)root->fs_info->generation);
1431
1432         if (!should_cow_block(trans, root, buf)) {
1433                 *cow_ret = buf;
1434                 return 0;
1435         }
1436
1437         search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
1438
1439         if (parent)
1440                 btrfs_set_lock_blocking(parent);
1441         btrfs_set_lock_blocking(buf);
1442
1443         ret = __btrfs_cow_block(trans, root, buf, parent,
1444                                  parent_slot, cow_ret, search_start, 0);
1445
1446         trace_btrfs_cow_block(root, buf, *cow_ret);
1447
1448         return ret;
1449 }
1450
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by (their gap, measured from the end of the
 * lower block, is under 32k).
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	u64 lo = blocknr < other ? blocknr : other;
	u64 hi = blocknr < other ? other : blocknr;

	/* equal block numbers are never considered close */
	if (lo == hi)
		return 0;
	return hi - (lo + blocksize) < 32768;
}
1463
1464 /*
1465  * compare two keys in a memcmp fashion
1466  */
1467 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1468 {
1469         struct btrfs_key k1;
1470
1471         btrfs_disk_key_to_cpu(&k1, disk);
1472
1473         return btrfs_comp_cpu_keys(&k1, k2);
1474 }
1475
/*
 * same as comp_keys only with two btrfs_key's
 *
 * Orders by objectid, then type, then offset; returns -1/0/1 in memcmp
 * fashion.
 */
int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid != k2->objectid)
		return k1->objectid > k2->objectid ? 1 : -1;
	if (k1->type != k2->type)
		return k1->type > k2->type ? 1 : -1;
	if (k1->offset != k2->offset)
		return k1->offset > k2->offset ? 1 : -1;
	return 0;
}
1495
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 *
 * @start_slot: first slot of @parent to consider
 * @last_ret:   in/out allocation hint; updated to the last COWed block
 * @progress:   key below which slots are skipped (resume point)
 *
 * Returns 0 on success, or a negative errno when reading or COWing a
 * child block fails.
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != root->fs_info->running_transaction);
	WARN_ON(trans->transid != root->fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
	end_slot = parent_nritems;

	/* a single child can't be out of order relative to anything */
	if (parent_nritems == 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;

		/* skip slots before the caller's progress key */
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		/* already close to a neighbour? then leave it in place */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		/*
		 * NOTE(review): the bound looks conservative — i + 1 is a
		 * valid slot for any i < end_slot - 1, so the lookahead is
		 * skipped for the second-to-last slot.  Harmless (only a
		 * missed defrag opportunity), but TODO confirm against
		 * upstream history before changing.
		 */
		if (!close && i < end_slot - 2) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		/* make sure the child is in memory and uptodate */
		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(root, blocknr,
							 blocksize, gen);
				if (!cur || !extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		/* COW relocates the child close to search_start */
		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
1605
1606 /*
1607  * The leaf data grows from end-to-front in the node.
1608  * this returns the address of the start of the last item,
1609  * which is the stop of the leaf data stack
1610  */
1611 static inline unsigned int leaf_data_end(struct btrfs_root *root,
1612                                          struct extent_buffer *leaf)
1613 {
1614         u32 nr = btrfs_header_nritems(leaf);
1615         if (nr == 0)
1616                 return BTRFS_LEAF_DATA_SIZE(root);
1617         return btrfs_item_offset_nr(leaf, nr - 1);
1618 }
1619
1620
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 *
 * Returns 0 if the key was found (and *slot is its index), 1 if it was
 * not found (and *slot is the insertion point).
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	/* cached mapping of one extent_buffer page range, reused across probes */
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		/*
		 * only remap when the key we want falls outside the span
		 * [map_start, map_start + map_len) we already have mapped
		 */
		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				/*
				 * the key straddles a page boundary; copy it
				 * out to an aligned stack buffer instead
				 */
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
1687
1688 /*
1689  * simple bin_search frontend that does the right thing for
1690  * leaves vs nodes
1691  */
1692 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1693                       int level, int *slot)
1694 {
1695         if (level == 0)
1696                 return generic_bin_search(eb,
1697                                           offsetof(struct btrfs_leaf, items),
1698                                           sizeof(struct btrfs_item),
1699                                           key, btrfs_header_nritems(eb),
1700                                           slot);
1701         else
1702                 return generic_bin_search(eb,
1703                                           offsetof(struct btrfs_node, ptrs),
1704                                           sizeof(struct btrfs_key_ptr),
1705                                           key, btrfs_header_nritems(eb),
1706                                           slot);
1707 }
1708
/*
 * Exported frontend for bin_search(); finds @key (or its insertion slot)
 * in @eb at the given @level.  Returns 0 when found, 1 otherwise, with
 * the slot written through @slot either way.
 */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}
1714
1715 static void root_add_used(struct btrfs_root *root, u32 size)
1716 {
1717         spin_lock(&root->accounting_lock);
1718         btrfs_set_root_used(&root->root_item,
1719                             btrfs_root_used(&root->root_item) + size);
1720         spin_unlock(&root->accounting_lock);
1721 }
1722
1723 static void root_sub_used(struct btrfs_root *root, u32 size)
1724 {
1725         spin_lock(&root->accounting_lock);
1726         btrfs_set_root_used(&root->root_item,
1727                             btrfs_root_used(&root->root_item) - size);
1728         spin_unlock(&root->accounting_lock);
1729 }
1730
1731 /* given a node and slot number, this reads the blocks it points to.  The
1732  * extent buffer is returned with a reference taken (but unlocked).
1733  * NULL is returned on error.
1734  */
1735 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1736                                    struct extent_buffer *parent, int slot)
1737 {
1738         int level = btrfs_header_level(parent);
1739         struct extent_buffer *eb;
1740
1741         if (slot < 0)
1742                 return NULL;
1743         if (slot >= btrfs_header_nritems(parent))
1744                 return NULL;
1745
1746         BUG_ON(level == 0);
1747
1748         eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1749                              btrfs_level_size(root, level - 1),
1750                              btrfs_node_ptr_generation(parent, slot));
1751         if (eb && !extent_buffer_uptodate(eb)) {
1752                 free_extent_buffer(eb);
1753                 eb = NULL;
1754         }
1755
1756         return eb;
1757 }
1758
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 *
 * Caller must hold a write lock on path->nodes[level], which must already
 * be COWed in the current transaction (checked via the WARN_ONs below).
 * Returns 0 on success or a negative errno; on error all extra locks and
 * references taken here are dropped at the enospc label.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 0;

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* remember the block the path pointed at so we can sanity check later */
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	/* pslot is only meaningful when parent ends up non-NULL below */
	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(root, mid, 0);
		if (!child) {
			/* a root node with one unreadable child: corruption */
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		/* swing root->node over to the (COWed) child */
		tree_mod_log_set_root_pointer(root, child, 1);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		/*
		 * mid is still pinned by the old root pointer's reference,
		 * so reading mid->len here is safe; that last reference is
		 * dropped by free_extent_buffer_stale() below.
		 */
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* node is at least a quarter full; nothing to balance */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
		return 0;

	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		/* track where our original slot lands if items move left */
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, root, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, root, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* right is now empty: unlink and free it */
			clean_tree_block(trans, root, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* right shrank; fix its key in the parent */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}
		wret = balance_node_right(trans, root, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			/* mid was too full to balance into; push left instead */
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* mid emptied out entirely: unlink and free it */
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		tree_mod_log_set_node_key(root->fs_info, parent,
					  pslot, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* our target item migrated into left */
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			/* target stayed in mid; rebase the slot number */
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		/* left may now be on the path, in which case it stays locked */
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
1977
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 if some room was made (and the path may now point into a
 * sibling node), 1 if nothing could be pushed, or a negative errno.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* pslot is only meaningful when parent ends up non-NULL below */
	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* at the root: nowhere to push, caller must split */
	if (!parent)
		return 1;

	left = read_node_slot(root, parent, pslot - 1);

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* left is (nearly) full itself; don't bother */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* items moved left; fix mid's key in the parent */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* our slot migrated into left; repoint the path */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				/* slot stayed in mid; rebase its index */
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(root, parent, pslot + 1);

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* right is (nearly) full itself; don't bother */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* items moved right; fix right's key in the parent */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* our slot migrated into right; repoint the path */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
2113
2114 /*
2115  * readahead one full node of leaves, finding things that are close
2116  * to the block in 'slot', and triggering ra on them.
2117  */
2118 static void reada_for_search(struct btrfs_root *root,
2119                              struct btrfs_path *path,
2120                              int level, int slot, u64 objectid)
2121 {
2122         struct extent_buffer *node;
2123         struct btrfs_disk_key disk_key;
2124         u32 nritems;
2125         u64 search;
2126         u64 target;
2127         u64 nread = 0;
2128         u64 gen;
2129         int direction = path->reada;
2130         struct extent_buffer *eb;
2131         u32 nr;
2132         u32 blocksize;
2133         u32 nscan = 0;
2134
2135         if (level != 1)
2136                 return;
2137
2138         if (!path->nodes[level])
2139                 return;
2140
2141         node = path->nodes[level];
2142
2143         search = btrfs_node_blockptr(node, slot);
2144         blocksize = btrfs_level_size(root, level - 1);
2145         eb = btrfs_find_tree_block(root, search, blocksize);
2146         if (eb) {
2147                 free_extent_buffer(eb);
2148                 return;
2149         }
2150
2151         target = search;
2152
2153         nritems = btrfs_header_nritems(node);
2154         nr = slot;
2155
2156         while (1) {
2157                 if (direction < 0) {
2158                         if (nr == 0)
2159                                 break;
2160                         nr--;
2161                 } else if (direction > 0) {
2162                         nr++;
2163                         if (nr >= nritems)
2164                                 break;
2165                 }
2166                 if (path->reada < 0 && objectid) {
2167                         btrfs_node_key(node, &disk_key, nr);
2168                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2169                                 break;
2170                 }
2171                 search = btrfs_node_blockptr(node, nr);
2172                 if ((search <= target && target - search <= 65536) ||
2173                     (search > target && search - target <= 65536)) {
2174                         gen = btrfs_node_ptr_generation(node, nr);
2175                         readahead_tree_block(root, search, blocksize, gen);
2176                         nread += blocksize;
2177                 }
2178                 nscan++;
2179                 if ((nread > 65536 || nscan > 32))
2180                         break;
2181         }
2182 }
2183
2184 /*
2185  * returns -EAGAIN if it had to drop the path, or zero if everything was in
2186  * cache
2187  */
2188 static noinline int reada_for_balance(struct btrfs_root *root,
2189                                       struct btrfs_path *path, int level)
2190 {
2191         int slot;
2192         int nritems;
2193         struct extent_buffer *parent;
2194         struct extent_buffer *eb;
2195         u64 gen;
2196         u64 block1 = 0;
2197         u64 block2 = 0;
2198         int ret = 0;
2199         int blocksize;
2200
2201         parent = path->nodes[level + 1];
2202         if (!parent)
2203                 return 0;
2204
2205         nritems = btrfs_header_nritems(parent);
2206         slot = path->slots[level + 1];
2207         blocksize = btrfs_level_size(root, level);
2208
2209         if (slot > 0) {
2210                 block1 = btrfs_node_blockptr(parent, slot - 1);
2211                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2212                 eb = btrfs_find_tree_block(root, block1, blocksize);
2213                 /*
2214                  * if we get -eagain from btrfs_buffer_uptodate, we
2215                  * don't want to return eagain here.  That will loop
2216                  * forever
2217                  */
2218                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2219                         block1 = 0;
2220                 free_extent_buffer(eb);
2221         }
2222         if (slot + 1 < nritems) {
2223                 block2 = btrfs_node_blockptr(parent, slot + 1);
2224                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2225                 eb = btrfs_find_tree_block(root, block2, blocksize);
2226                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2227                         block2 = 0;
2228                 free_extent_buffer(eb);
2229         }
2230         if (block1 || block2) {
2231                 ret = -EAGAIN;
2232
2233                 /* release the whole path */
2234                 btrfs_release_path(path);
2235
2236                 /* read the blocks */
2237                 if (block1)
2238                         readahead_tree_block(root, block1, blocksize, 0);
2239                 if (block2)
2240                         readahead_tree_block(root, block2, blocksize, 0);
2241
2242                 if (block1) {
2243                         eb = read_tree_block(root, block1, blocksize, 0);
2244                         free_extent_buffer(eb);
2245                 }
2246                 if (block2) {
2247                         eb = read_tree_block(root, block2, blocksize, 0);
2248                         free_extent_buffer(eb);
2249                 }
2250         }
2251         return ret;
2252 }

/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 *
 * When a write lock above min_write_lock_level is released,
 * *write_lock_level is lowered accordingly so callers know they only
 * hold write locks below that level.
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	/* levels at or below skip_level keep their locks */
	int skip_level = level;
	/* once set, stop raising skip_level for higher levels */
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/* slot 0: a key change here may propagate to the parent */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/* keep_locks: also hold on when we sit on the last slot */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
2310
2311 /*
2312  * This releases any locks held in the path starting at level and
2313  * going all the way up to the root.
2314  *
2315  * btrfs_search_slot will keep the lock held on higher nodes in a few
2316  * corner cases, such as COW of the block at slot zero in the node.  This
2317  * ignores those rules, and it should only be called when there are no
2318  * more updates to be done higher up in the tree.
2319  */
2320 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2321 {
2322         int i;
2323
2324         if (path->keep_locks)
2325                 return;
2326
2327         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2328                 if (!path->nodes[i])
2329                         continue;
2330                 if (!path->locks[i])
2331                         continue;
2332                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2333                 path->locks[i] = 0;
2334         }
2335 }
2336
2337 /*
2338  * helper function for btrfs_search_slot.  The goal is to find a block
2339  * in cache without setting the path to blocking.  If we find the block
2340  * we return zero and the path is unchanged.
2341  *
2342  * If we can't find the block, we set the path blocking and do some
2343  * reada.  -EAGAIN is returned and the search must be repeated.
2344  */
static int
read_block_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer **eb_ret, int level, int slot,
		       struct btrfs_key *key, u64 time_seq)
{
	u64 blocknr;
	u64 gen;
	u32 blocksize;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	int ret;

	/* locate the child block referenced by @slot of the parent *eb_ret */
	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);
	blocksize = btrfs_level_size(root, level - 1);

	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
	if (tmp) {
		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
			if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
				/*
				 * we found an up to date block without
				 * sleeping, return
				 * right away
				 */
				*eb_ret = tmp;
				return 0;
			}
			/* the pages were up to date, but we failed
			 * the generation number check.  Do a full
			 * read for the generation number that is correct.
			 * We must do this without dropping locks so
			 * we can trust our generation number
			 */
			free_extent_buffer(tmp);
			btrfs_set_path_blocking(p);

			/* now we're allowed to do a blocking uptodate check */
			tmp = read_tree_block(root, blocknr, blocksize, gen);
			if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
				*eb_ret = tmp;
				return 0;
			}
			/* generation still wrong (or read failed): give up */
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	/* tmp may be NULL here; presumably free_extent_buffer tolerates that */
	free_extent_buffer(tmp);
	if (p->reada)
		reada_for_search(root, p, level, slot, key->objectid);

	/* path is dropped, so the caller must restart with -EAGAIN */
	btrfs_release_path(p);

	ret = -EAGAIN;
	tmp = read_tree_block(root, blocknr, blocksize, 0);
	if (tmp) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!btrfs_buffer_uptodate(tmp, 0, 0))
			ret = -EIO;
		free_extent_buffer(tmp);
	}
	return ret;
}
2427
2428 /*
2429  * helper function for btrfs_search_slot.  This does all of the checks
2430  * for node-level blocks and does any balancing required based on
2431  * the ins_len.
2432  *
2433  * If no extra work was required, zero is returned.  If we had to
2434  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2435  * start over
2436  */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	int ret;
	/*
	 * node is nearly full and we may insert: split it ahead of time so
	 * the actual insert at the level below cannot overflow it
	 */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
		int sret;

		/* splitting needs a write lock on this level's parent too */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = split_node(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
		/*
		 * node is less than half full and we are deleting: try to
		 * merge/rebalance it with its neighbours
		 */
		int sret;

		/* balancing also rewrites the parent, take its write lock */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = balance_level(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		/* balance_level may have emptied this level entirely */
		if (!b) {
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}
2504
2505 /*
2506  * look for key in the tree.  path is filled in with nodes along the way
2507  * if key is found, we return zero and you can find the item in the leaf
2508  * level of the path (level 0)
2509  *
2510  * If the key isn't found, the path points to the slot where it should
2511  * be inserted, and 1 is returned.  If there are other errors during the
2512  * search a negative error number is returned.
2513  *
2514  * if ins_len > 0, nodes and leaves will be split as we walk down the
2515  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2516  * possible)
2517  */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	/* lowest level we are allowed to unlock as we walk down */
	int lowest_unlock = 1;
	int root_lock;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	/* read-only search: never take write locks */
	if (!cow)
		write_lock_level = -1;

	/* keep_locks / lowest_level callers need write locks on every level */
	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	/*
	 * we try very hard to do read locks on the root
	 */
	root_lock = BTRFS_READ_LOCK;
	level = 0;
	if (p->search_commit_root) {
		/*
		 * the commit roots are read only
		 * so we always do read locks
		 */
		b = root->commit_root;
		extent_buffer_get(b);
		level = btrfs_header_level(b);
		if (!p->skip_locking)
			btrfs_tree_read_lock(b);
	} else {
		if (p->skip_locking) {
			b = btrfs_root_node(root);
			level = btrfs_header_level(b);
		} else {
			/* we don't know the level of the root node
			 * until we actually have it read locked
			 */
			b = btrfs_read_lock_root_node(root);
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				/* whoops, must trade for write lock */
				btrfs_tree_read_unlock(b);
				free_extent_buffer(b);
				b = btrfs_lock_root_node(root);
				root_lock = BTRFS_WRITE_LOCK;

				/* the level might have changed, check again */
				level = btrfs_header_level(b);
			}
		}
	}
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;

	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		if (cow) {
			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			btrfs_set_path_blocking(p);

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			err = btrfs_cow_block(trans, root, b,
					      p->nodes[level + 1],
					      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		BUG_ON(!cow && ins_len);

		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If cow is true, then we might be changing slot zero,
		 * which may require changing the parent.  So, we can't
		 * drop the lock until after we know which slot we're
		 * operating on.
		 */
		if (!cow)
			btrfs_unlock_up_safe(p, level + 1);

		ret = bin_search(b, key, level, &slot);

		if (level != 0) {
			int dec = 0;
			/*
			 * no exact match: step back to the slot whose key
			 * range actually covers @key
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
					     ins_len, &write_lock_level);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			b = p->nodes[level];
			slot = p->slots[level];

			/*
			 * slot 0 is special, if we change the key
			 * we have to update the parent pointer
			 * which means we must have a write lock
			 * on the parent
			 */
			if (slot == 0 && cow &&
			    write_lock_level < level + 1) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			unlock_up(p, level, lowest_unlock,
				  min_write_lock_level, &write_lock_level);

			if (level == lowest_level) {
				/* undo the step-back done for bin_search */
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(trans, root, p,
						    &b, level, slot, key, 0);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				level = btrfs_header_level(b);
				if (level <= write_lock_level) {
					/* try-lock first to stay spinning */
					err = btrfs_try_tree_write_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_WRITE_LOCK);
					}
					p->locks[level] = BTRFS_WRITE_LOCK;
				} else {
					err = btrfs_try_tree_read_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_read_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_READ_LOCK);
					}
					p->locks[level] = BTRFS_READ_LOCK;
				}
				p->nodes[level] = b;
			}
		} else {
			/* leaf level: split if the insert would not fit */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);
				btrfs_clear_path_blocking(p, NULL, 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, &write_lock_level);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);
	return ret;
}
2776
2777 /*
2778  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2779  * current state of the tree together with the operations recorded in the tree
2780  * modification log to search for the key in a previous version of this tree, as
2781  * denoted by the time_seq parameter.
2782  *
2783  * Naturally, there is no support for insert, delete or cow operations.
2784  *
2785  * The resulting path and return value will be set up as if we called
2786  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2787  */
int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	/* commit roots are immutable, no need for the mod log rewind */
	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	/* start from the root as it looked at @time_seq */
	b = get_old_root(root, time_seq);
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		level = btrfs_header_level(b);
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		ret = bin_search(b, key, level, &slot);

		if (level != 0) {
			int dec = 0;
			/* step back to the slot covering @key on no match */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(NULL, root, p, &b, level,
						    slot, key, time_seq);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			level = btrfs_header_level(b);
			/* try-lock first; only block if we have to */
			err = btrfs_try_tree_read_lock(b);
			if (!err) {
				btrfs_set_path_blocking(p);
				btrfs_tree_read_lock(b);
				btrfs_clear_path_blocking(p, b,
							  BTRFS_READ_LOCK);
			}
			/* replay the mod log to get the old view of b */
			b = tree_mod_log_rewind(root->fs_info, b, time_seq);
			p->locks[level] = BTRFS_READ_LOCK;
			p->nodes[level] = b;
		} else {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
2877
2878 /*
2879  * helper to use instead of search slot if no exact match is needed but
2880  * instead the next or previous item should be returned.
2881  * When find_higher is true, the next higher item is returned, the next lower
2882  * otherwise.
2883  * When return_any and find_higher are both true, and no higher item is found,
2884  * return the next lower instead.
2885  * When return_any is true and find_higher is false, and no lower item is found,
2886  * return the next higher instead.
2887  * It returns 0 if any item is found, 1 if none is found (tree empty), and
2888  * < 0 on error
2889  */
2890 int btrfs_search_slot_for_read(struct btrfs_root *root,
2891                                struct btrfs_key *key, struct btrfs_path *p,
2892                                int find_higher, int return_any)
2893 {
2894         int ret;
2895         struct extent_buffer *leaf;
2896
2897 again:
2898         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2899         if (ret <= 0)
2900                 return ret;
2901         /*
2902          * a return value of 1 means the path is at the position where the
2903          * item should be inserted. Normally this is the next bigger item,
2904          * but in case the previous item is the last in a leaf, path points
2905          * to the first free slot in the previous leaf, i.e. at an invalid
2906          * item.
2907          */
2908         leaf = p->nodes[0];
2909
2910         if (find_higher) {
2911                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2912                         ret = btrfs_next_leaf(root, p);
2913                         if (ret <= 0)
2914                                 return ret;
2915                         if (!return_any)
2916                                 return 1;
2917                         /*
2918                          * no higher item found, return the next
2919                          * lower instead
2920                          */
2921                         return_any = 0;
2922                         find_higher = 0;
2923                         btrfs_release_path(p);
2924                         goto again;
2925                 }
2926         } else {
2927                 if (p->slots[0] == 0) {
2928                         ret = btrfs_prev_leaf(root, p);
2929                         if (ret < 0)
2930                                 return ret;
2931                         if (!ret) {
2932                                 p->slots[0] = btrfs_header_nritems(leaf) - 1;
2933                                 return 0;
2934                         }
2935                         if (!return_any)
2936                                 return 1;
2937                         /*
2938                          * no lower item found, return the next
2939                          * higher instead
2940                          */
2941                         return_any = 0;
2942                         find_higher = 1;
2943                         btrfs_release_path(p);
2944                         goto again;
2945                 } else {
2946                         --p->slots[0];
2947                 }
2948         }
2949         return 0;
2950 }
2951
2952 /*
2953  * adjust the pointers going up the tree, starting at level
2954  * making sure the right key of each node is points to 'key'.
2955  * This is used after shifting pointers to the left, so it stops
2956  * fixing up pointers when a given leaf/node is not in slot 0 of the
2957  * higher levels
2958  *
2959  */
2960 static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
2961                            struct btrfs_disk_key *key, int level)
2962 {
2963         int i;
2964         struct extent_buffer *t;
2965
2966         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2967                 int tslot = path->slots[i];
2968                 if (!path->nodes[i])
2969                         break;
2970                 t = path->nodes[i];
2971                 tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
2972                 btrfs_set_node_key(t, key, tslot);
2973                 btrfs_mark_buffer_dirty(path->nodes[i]);
2974                 if (tslot != 0)
2975                         break;
2976         }
2977 }
2978
2979 /*
2980  * update item key.
2981  *
2982  * This function isn't completely safe. It's the caller's responsibility
2983  * that the new key won't break the order
2984  */
2985 void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
2986                              struct btrfs_key *new_key)
2987 {
2988         struct btrfs_disk_key disk_key;
2989         struct extent_buffer *eb;
2990         int slot;
2991
2992         eb = path->nodes[0];
2993         slot = path->slots[0];
2994         if (slot > 0) {
2995                 btrfs_item_key(eb, &disk_key, slot - 1);
2996                 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
2997         }
2998         if (slot < btrfs_header_nritems(eb) - 1) {
2999                 btrfs_item_key(eb, &disk_key, slot + 1);
3000                 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3001         }
3002
3003         btrfs_cpu_key_to_disk(&disk_key, new_key);
3004         btrfs_set_item_key(eb, &disk_key, slot);
3005         btrfs_mark_buffer_dirty(eb);
3006         if (slot == 0)
3007                 fixup_low_keys(root, path, &disk_key, 1);
3008 }
3009
3010 /*
3011  * try to push data from one node into the next node left in the
3012  * tree.
3013  *
3014  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3015  * error, and > 0 if there was no room in the left hand block.
3016  */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free slots available in the destination */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* not worth pushing from an already nearly-empty node */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* log the copy before touching dst so the mod log stays consistent */
	tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
			     push_items);
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * don't call tree_mod_log_eb_move here, key removal was already
		 * fully logged by tree_mod_log_eb_copy above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3077
3078 /*
3079  * try to push data from one node into the next node right in the
3080  * tree.
3081  *
3082  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3083  * error, and > 0 if there was no room in the right hand block.
3084  *
3085  * this will  only push up to 1/2 the contents of the left node over
3086  */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free slots available in the destination */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* not worth pushing from a nearly-empty node */
	if (src_nritems < 4)
		return 1;

	/* push at most half of the source's contents */
	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* shift dst's existing pointers right to make room at the front */
	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* copy the tail of src into the freed slots at the head of dst */
	tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
			     src_nritems - push_items, push_items);
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3139
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level, int log_removal)
{
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;

	/* the new root goes in above the current top of the path */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* the new root's single key is the first key of the old root */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
				   root->root_key.objectid, &lower_key,
				   level, root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, root->nodesize);

	/* initialize the header of the new root block */
	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
	btrfs_set_header_bytenr(c, c->start);
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(c),
			    BTRFS_FSID_SIZE);

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
			    BTRFS_UUID_SIZE);

	/* the one pointer in the new root references the old root block */
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/*
	 * log the root pointer change before publishing the new root via
	 * RCU, so tree mod log readers see a consistent history
	 */
	old = root->node;
	tree_mod_log_set_root_pointer(root, c, log_removal);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* the path takes its own reference on the new root */
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK;
	path->slots[level] = 0;
	return 0;
}
3213
3214 /*
3215  * worker function to insert a single pointer in a node.
3216  * the node should have enough room for the pointer already
3217  *
3218  * slot and level indicate where you want the key to go, and
3219  * blocknr is the block the key points to.
3220  */
3221 static void insert_ptr(struct btrfs_trans_handle *trans,
3222                        struct btrfs_root *root, struct btrfs_path *path,
3223                        struct btrfs_disk_key *key, u64 bytenr,
3224                        int slot, int level)
3225 {
3226         struct extent_buffer *lower;
3227         int nritems;
3228         int ret;
3229
3230         BUG_ON(!path->nodes[level]);
3231         btrfs_assert_tree_locked(path->nodes[level]);
3232         lower = path->nodes[level];
3233         nritems = btrfs_header_nritems(lower);
3234         BUG_ON(slot > nritems);
3235         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3236         if (slot != nritems) {
3237                 if (level)
3238                         tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3239                                              slot, nritems - slot);
3240                 memmove_extent_buffer(lower,
3241                               btrfs_node_key_ptr_offset(slot + 1),
3242                               btrfs_node_key_ptr_offset(slot),
3243                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
3244         }
3245         if (level) {
3246                 ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3247                                               MOD_LOG_KEY_ADD);
3248                 BUG_ON(ret < 0);
3249         }
3250         btrfs_set_node_key(lower, key, slot);
3251         btrfs_set_node_blockptr(lower, slot, bytenr);
3252         WARN_ON(trans->transid == 0);
3253         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3254         btrfs_set_header_nritems(lower, nritems + 1);
3255         btrfs_mark_buffer_dirty(lower);
3256 }
3257
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	/* the node must already be cowed in this transaction */
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We pass 0 as log_removal parameter to
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1, 0);
		if (ret)
			return ret;
	} else {
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		/* pushing freed enough room, no split needed */
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* split at the midpoint; the new node takes the upper half */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
					root->root_key.objectid,
					&disk_key, level, c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, root->nodesize);

	/* initialize the header of the new (right hand) node */
	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(split, btrfs_header_level(c));
	btrfs_set_header_bytenr(split, split->start);
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(split, root->root_key.objectid);
	write_extent_buffer(split, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(split),
			    BTRFS_FSID_SIZE);
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);

	/* move the upper half of c into the new node */
	tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* link the new node into the parent, right after c */
	insert_ptr(trans, root, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* fix the path to reference whichever node now holds our slot */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
3356
3357 /*
3358  * how many bytes are required to store the items in a leaf.  start
3359  * and nr indicate which items in the leaf to check.  This totals up the
3360  * space used both by the item structs and the item data
3361  */
3362 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3363 {
3364         struct btrfs_item *start_item;
3365         struct btrfs_item *end_item;
3366         struct btrfs_map_token token;
3367         int data_len;
3368         int nritems = btrfs_header_nritems(l);
3369         int end = min(nritems, start + nr) - 1;
3370
3371         if (!nr)
3372                 return 0;
3373         btrfs_init_map_token(&token);
3374         start_item = btrfs_item_nr(l, start);
3375         end_item = btrfs_item_nr(l, end);
3376         data_len = btrfs_token_item_offset(l, start_item, &token) +
3377                 btrfs_token_item_size(l, start_item, &token);
3378         data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3379         data_len += sizeof(struct btrfs_item) * nr;
3380         WARN_ON(data_len < 0);
3381         return data_len;
3382 }
3383
3384 /*
3385  * The space between the end of the leaf items and
3386  * the start of the leaf data.  IOW, how much room
3387  * the leaf has left for both items and data
3388  */
3389 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3390                                    struct extent_buffer *leaf)
3391 {
3392         int nritems = btrfs_header_nritems(leaf);
3393         int ret;
3394         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3395         if (ret < 0) {
3396                 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
3397                        "used %d nritems %d\n",
3398                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3399                        leaf_space_used(leaf, 0, nritems), nritems);
3400         }
3401         return ret;
3402 }
3403
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	btrfs_init_map_token(&token);

	/*
	 * in empty mode everything may go; otherwise keep at least one
	 * item and never push below min_slot
	 */
	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/*
	 * if the path position will end up in the right leaf, reserve room
	 * there for the caller's pending insertion
	 */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	/* walk backwards from the end of left, counting what fits */
	i = left_nritems - 1;
	while (i >= nr) {
		item = btrfs_item_nr(left, i);

		if (!empty && push_items > 0) {
			/* never push the item the path points at or below it */
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, left);
				/* stop once left would keep a comfortable margin */
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		/* item header + data must fit in right's free space */
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* total bytes of item data being moved */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(root, left);

	/* make room in the right data area */
	data_end = leaf_data_end(root, right);
	memmove_extent_buffer(right,
			      btrfs_leaf_data(right) + data_end - push_space,
			      btrfs_leaf_data(right) + data_end,
			      BTRFS_LEAF_DATA_SIZE(root) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
		     btrfs_leaf_data(left) + leaf_data_end(root, left),
		     push_space);

	/* slide right's existing item headers over to make room at slot 0 */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers: recompute every data offset in right */
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);
		push_space -= btrfs_token_item_size(right, item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		clean_tree_block(trans, root, left);

	btrfs_mark_buffer_dirty(right);

	/* right's first key changed; update the parent's pointer key */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3546
3547 /*
3548  * push some data in the path leaf to the right, trying to free up at
3549  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3550  *
3551  * returns 1 if the push failed because the other node didn't have enough
3552  * room, 0 if everything worked out and < 0 if there were major errors.
3553  *
3554  * this will push starting from min_slot to the end of the leaf.  It won't
3555  * push any slot lower than min_slot
3556  */
3557 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3558                            *root, struct btrfs_path *path,
3559                            int min_data_size, int data_size,
3560                            int empty, u32 min_slot)
3561 {
3562         struct extent_buffer *left = path->nodes[0];
3563         struct extent_buffer *right;
3564         struct extent_buffer *upper;
3565         int slot;
3566         int free_space;
3567         u32 left_nritems;
3568         int ret;
3569
3570         if (!path->nodes[1])
3571                 return 1;
3572
3573         slot = path->slots[1];
3574         upper = path->nodes[1];
3575         if (slot >= btrfs_header_nritems(upper) - 1)
3576                 return 1;
3577
3578         btrfs_assert_tree_locked(path->nodes[1]);
3579
3580         right = read_node_slot(root, upper, slot + 1);
3581         if (right == NULL)
3582                 return 1;
3583
3584         btrfs_tree_lock(right);
3585         btrfs_set_lock_blocking(right);
3586
3587         free_space = btrfs_leaf_free_space(root, right);
3588         if (free_space < data_size)
3589                 goto out_unlock;
3590
3591         /* cow and double check */
3592         ret = btrfs_cow_block(trans, root, right, upper,
3593                               slot + 1, &right);
3594         if (ret)
3595                 goto out_unlock;
3596
3597         free_space = btrfs_leaf_free_space(root, right);
3598         if (free_space < data_size)
3599                 goto out_unlock;
3600
3601         left_nritems = btrfs_header_nritems(left);
3602         if (left_nritems == 0)
3603                 goto out_unlock;
3604
3605         return __push_leaf_right(trans, root, path, min_data_size, empty,
3606                                 right, free_space, left_nritems, min_slot);
3607 out_unlock:
3608         btrfs_tree_unlock(right);
3609         free_extent_buffer(right);
3610         return 1;
3611 }
3612
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
 * items
 */
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* in empty mode everything may go; otherwise keep at least one item */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/* walk forward from slot 0, counting what fits in the left leaf */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(right, i);

		if (!empty && push_items > 0) {
			/* never push the item the path points at or past it */
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, right);
				/* stop once right would keep a comfortable margin */
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* reserve room for the caller's pending insertion */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		/* item header + data must fit in left's free space */
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	if (!empty && push_items == btrfs_header_nritems(right))
		WARN_ON(1);

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* total bytes of item data being moved */
	push_space = BTRFS_LEAF_DATA_SIZE(root) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
		     leaf_data_end(root, left) - push_space,
		     btrfs_leaf_data(right) +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* rebase the data offsets of the items that just arrived in left */
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(left, i);

		ioff = btrfs_token_item_offset(left, item, &token);
		btrfs_set_token_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
		      &token);
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);

	if (push_items < right_nritems) {
		/* compact right: slide the remaining data and item headers down */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(root, right);
		memmove_extent_buffer(right, btrfs_leaf_data(right) +
				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
				      btrfs_leaf_data(right) +
				      leaf_data_end(root, right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}
	/* recompute every data offset in right */
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);

		push_space = push_space - btrfs_token_item_size(right,
								item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		clean_tree_block(trans, root, right);

	/* right's first key changed; propagate it up the path */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(root, path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3766
3767 /*
3768  * push some data in the path leaf to the left, trying to free up at
3769  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3770  *
3771  * max_slot can put a limit on how far into the leaf we'll push items.  The
3772  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3773  * items
3774  */
3775 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3776                           *root, struct btrfs_path *path, int min_data_size,
3777                           int data_size, int empty, u32 max_slot)
3778 {
3779         struct extent_buffer *right = path->nodes[0];
3780         struct extent_buffer *left;
3781         int slot;
3782         int free_space;
3783         u32 right_nritems;
3784         int ret = 0;
3785
3786         slot = path->slots[1];
3787         if (slot == 0)
3788                 return 1;
3789         if (!path->nodes[1])
3790                 return 1;
3791
3792         right_nritems = btrfs_header_nritems(right);
3793         if (right_nritems == 0)
3794                 return 1;
3795
3796         btrfs_assert_tree_locked(path->nodes[1]);
3797
3798         left = read_node_slot(root, path->nodes[1], slot - 1);
3799         if (left == NULL)
3800                 return 1;
3801
3802         btrfs_tree_lock(left);
3803         btrfs_set_lock_blocking(left);
3804
3805         free_space = btrfs_leaf_free_space(root, left);
3806         if (free_space < data_size) {
3807                 ret = 1;
3808                 goto out;
3809         }
3810
3811         /* cow and double check */
3812         ret = btrfs_cow_block(trans, root, left,
3813                               path->nodes[1], slot - 1, &left);
3814         if (ret) {
3815                 /* we hit -ENOSPC, but it isn't fatal here */
3816                 if (ret == -ENOSPC)
3817                         ret = 1;
3818                 goto out;
3819         }
3820
3821         free_space = btrfs_leaf_free_space(root, left);
3822         if (free_space < data_size) {
3823                 ret = 1;
3824                 goto out;
3825         }
3826
3827         return __push_leaf_left(trans, root, path, min_data_size,
3828                                empty, left, free_space, right_nritems,
3829                                max_slot);
3830 out:
3831         btrfs_tree_unlock(left);
3832         free_extent_buffer(left);
3833         return ret;
3834 }
3835
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * Moves items [mid, nritems) of leaf l into the freshly allocated leaf
 * 'right', links 'right' into the parent, and fixes the path to point
 * at whichever leaf now holds 'slot'.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* number of items moving into the right leaf */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

	/* copy the item headers for [mid, end) to the front of right */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* copy their data to the tail of right's data area */
	copy_extent_buffer(right, l,
		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
		     data_copy_size, btrfs_leaf_data(l) +
		     leaf_data_end(root, l), data_copy_size);

	/* the data moved toward the end of the leaf; rebase every offset */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
		      btrfs_item_end_nr(l, mid);

	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(right, i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	btrfs_set_header_nritems(l, mid);
	/* link the new leaf into the parent, right after l */
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, root, path, &disk_key, right->start,
		   path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* fix the path to reference whichever leaf now holds our slot */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
3902
3903 /*
3904  * double splits happen when we need to insert a big item in the middle
3905  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3906  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3907  *          A                 B                 C
3908  *
3909  * We avoid this by trying to push the items on either side of our target
3910  * into the adjacent leaves.  If all goes well we can avoid the double split
3911  * completely.
3912  */
3913 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3914                                           struct btrfs_root *root,
3915                                           struct btrfs_path *path,
3916                                           int data_size)
3917 {
3918         int ret;
3919         int progress = 0;
3920         int slot;
3921         u32 nritems;
3922
3923         slot = path->slots[0];
3924
3925         /*
3926          * try to push all the items after our slot into the
3927          * right leaf
3928          */
3929         ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
3930         if (ret < 0)
3931                 return ret;
3932
3933         if (ret == 0)
3934                 progress++;
3935
3936         nritems = btrfs_header_nritems(path->nodes[0]);
3937         /*
3938          * our goal is to get our slot at the start or end of a leaf.  If
3939          * we've done so we're done
3940          */
3941         if (path->slots[0] == 0 || path->slots[0] == nritems)
3942                 return 0;
3943
3944         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3945                 return 0;
3946
3947         /* try to push all the items before our slot into the next leaf */
3948         slot = path->slots[0];
3949         ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
3950         if (ret < 0)
3951                 return ret;
3952
3953         if (ret == 0)
3954                 progress++;
3955
3956         if (progress)
3957                 return 0;
3958         return 1;
3959 }
3960
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * returns 0 if all went well and < 0 on failure.
 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	int ret = 0;
	int wret;
	/*
	 * split == 1: normal split at 'mid'
	 * split == 0: the insert goes alone into a fresh empty leaf
	 * split == 2: a second (double) split is still required
	 */
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* an extended item plus its header must still fit in an empty leaf */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size) {
		wret = push_leaf_right(trans, root, path, data_size,
				       data_size, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			wret = push_leaf_left(trans, root, path, data_size,
					      data_size, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(root, l) >= data_size)
			return 0;
	}

	/* the leaf is the tree root; grow the tree by one level first */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	if (mid <= slot) {
		/* target slot would land in the right half */
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (slot >= nritems) {
				/* appending past the end: use an empty leaf */
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
					/* one split won't make room */
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		/* target slot would land in the left half */
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (!extend && data_size && slot == 0) {
				/* prepending: the new item gets its own leaf */
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2 ;
				}
			}
		}
	}

	/* the new leaf's first key: the insert key itself for split == 0 */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
					root->root_key.objectid,
					&disk_key, 0, l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, root->leafsize);

	/* initialize the header of the new (empty) leaf */
	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_bytenr(right, right->start);
	btrfs_set_header_generation(right, trans->transid);
	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(right, root->root_key.objectid);
	btrfs_set_header_level(right, 0);
	write_extent_buffer(right, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(right),
			    BTRFS_FSID_SIZE);

	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
			    BTRFS_UUID_SIZE);

	if (split == 0) {
		/* no items move: just hook the empty leaf into the parent */
		if (mid <= slot) {
			/* empty leaf goes after the current one */
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, root, path, &disk_key, right->start,
				   path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			/* empty leaf goes before the current one */
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, root, path, &disk_key, right->start,
					  path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(root, path, &disk_key, 1);
		}
		btrfs_mark_buffer_dirty(right);
		return ret;
	}

	copy_for_split(trans, root, path, l, right, slot, mid, nritems);

	if (split == 2) {
		/* at most one extra round of splitting is ever needed */
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	/* best effort: even on failure we fall back to the double split */
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
4128
4129 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4130                                          struct btrfs_root *root,
4131                                          struct btrfs_path *path, int ins_len)
4132 {
4133         struct btrfs_key key;
4134         struct extent_buffer *leaf;
4135         struct btrfs_file_extent_item *fi;
4136         u64 extent_len = 0;
4137         u32 item_size;
4138         int ret;
4139
4140         leaf = path->nodes[0];
4141         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4142
4143         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4144                key.type != BTRFS_EXTENT_CSUM_KEY);
4145
4146         if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4147                 return 0;
4148
4149         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4150         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4151                 fi = btrfs_item_ptr(leaf, path->slots[0],
4152                                     struct btrfs_file_extent_item);
4153                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4154         }
4155         btrfs_release_path(path);
4156
4157         path->keep_locks = 1;
4158         path->search_for_split = 1;
4159         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4160         path->search_for_split = 0;
4161         if (ret < 0)
4162                 goto err;
4163
4164         ret = -EAGAIN;
4165         leaf = path->nodes[0];
4166         /* if our item isn't there or got smaller, return now */
4167         if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4168                 goto err;
4169
4170         /* the leaf has  changed, it now has room.  return now */
4171         if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4172                 goto err;
4173
4174         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4175                 fi = btrfs_item_ptr(leaf, path->slots[0],
4176                                     struct btrfs_file_extent_item);
4177                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4178                         goto err;
4179         }
4180
4181         btrfs_set_path_blocking(path);
4182         ret = split_leaf(trans, root, &key, path, ins_len, 1);
4183         if (ret)
4184                 goto err;
4185
4186         path->keep_locks = 0;
4187         btrfs_unlock_up_safe(path, 1);
4188         return 0;
4189 err:
4190         path->keep_locks = 0;
4191         return ret;
4192 }
4193
/*
 * worker for btrfs_split_item(): splits the item at path->slots[0] into
 * two items in the same leaf.  The original item keeps the first
 * split_offset bytes of the payload; the new item (inserted in the next
 * slot under 'new_key') gets the remainder.  The leaf must already have
 * room for one extra item header (guaranteed by setup_leaf_for_split).
 * Returns 0 on success or -ENOMEM.
 */
static noinline int split_item(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path,
			       struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(leaf, path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* stash the whole payload; it is rewritten into both halves below */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* the new item goes directly after the one being split */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(leaf, slot);

	/*
	 * carve the original data region in two; item data grows toward
	 * lower offsets, so the new (second) item keeps orig_offset and
	 * the original item's offset moves up by its new, smaller size
	 */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
	kfree(buf);
	return 0;
}
4264
4265 /*
4266  * This function splits a single item into two items,
4267  * giving 'new_key' to the new item and splitting the
4268  * old one at split_offset (from the start of the item).
4269  *
4270  * The path may be released by this operation.  After
4271  * the split, the path is pointing to the old item.  The
4272  * new item is going to be in the same node as the old one.
4273  *
4274  * Note, the item being split must be smaller enough to live alone on
4275  * a tree block with room for one extra struct btrfs_item
4276  *
4277  * This allows us to split the item in place, keeping a lock on the
4278  * leaf the entire time.
4279  */
4280 int btrfs_split_item(struct btrfs_trans_handle *trans,
4281                      struct btrfs_root *root,
4282                      struct btrfs_path *path,
4283                      struct btrfs_key *new_key,
4284                      unsigned long split_offset)
4285 {
4286         int ret;
4287         ret = setup_leaf_for_split(trans, root, path,
4288                                    sizeof(struct btrfs_item));
4289         if (ret)
4290                 return ret;
4291
4292         ret = split_item(trans, root, path, new_key, split_offset);
4293         return ret;
4294 }
4295
/*
 * This function duplicates an item, giving 'new_key' to the new item.
 * It guarantees both items live in the same tree leaf and the new item
 * is contiguous with the original item.
 *
 * This allows us to split file extent in place, keeping a lock on the
 * leaf the entire time.
 *
 * Returns 0 on success or a negative errno from setup_leaf_for_split()
 * (including -EAGAIN when the caller must retry).
 */
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path,
			 struct btrfs_key *new_key)
{
	struct extent_buffer *leaf;
	int ret;
	u32 item_size;

	/*
	 * read the size before setup_leaf_for_split(), which may release
	 * and re-search the path
	 */
	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	ret = setup_leaf_for_split(trans, root, path,
				   item_size + sizeof(struct btrfs_item));
	if (ret)
		return ret;

	/* insert an empty item for new_key right after the original */
	path->slots[0]++;
	setup_items_for_insert(root, path, new_key, &item_size,
			       item_size, item_size +
			       sizeof(struct btrfs_item), 1);
	leaf = path->nodes[0];
	/* copy the original item's payload into the new item */
	memcpy_extent_buffer(leaf,
			     btrfs_item_ptr_offset(leaf, path->slots[0]),
			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
			     item_size);
	return 0;
}
4331
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
			 u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	/* nothing to do if the item is already the requested size */
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(leaf, i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff + size_diff, &token);
	}

	/* shift the data */
	if (from_end) {
		/* keep the front of the payload; close the gap at the end */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			/*
			 * inline extents keep their header at the front of
			 * the item, so preserve it before the shift below
			 */
			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      offsetof(struct btrfs_file_extent_item,
						 disk_bytenr));
			}
		}

		/* move the whole remaining payload up by size_diff */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start - data_end);

		/*
		 * chopping the front moves the item's logical start, so
		 * bump the key offset to match
		 */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(root, path, &disk_key, 1);
	}

	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
4434
/*
 * make the item pointed to by the path bigger, data_size is the new size.
 *
 * The leaf must already have data_size bytes free; the extra bytes are
 * appended at the end of the item's payload (their contents are left
 * unspecified for the caller to fill in).
 */
void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
		       u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	/* the caller is responsible for having made enough room */
	if (btrfs_leaf_free_space(root, leaf) < data_size) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(root, leaf);
		printk(KERN_CRIT "slot %d too large, nritems %d\n",
		       slot, nritems);
		BUG_ON(1);
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(leaf, i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff - data_size, &token);
	}

	/* shift the data */
	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
		      data_end - data_size, btrfs_leaf_data(leaf) +
		      data_end, old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
4502
/*
 * this is a helper for btrfs_insert_empty_items, the main goal here is
 * to save stack depth by doing the bulk of the work in a function
 * that doesn't call btrfs_search_slot
 *
 * Inserts 'nr' empty items at path->slots[0] in the leaf already found
 * by the caller.  data_size[] gives each item's payload size, total_data
 * their sum, total_size the sum plus item-header overhead.  The leaf
 * must already have total_size bytes free.
 */
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
			    struct btrfs_key *cpu_key, u32 *data_size,
			    u32 total_data, u32 total_size, int nr)
{
	struct btrfs_item *item;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	if (btrfs_leaf_free_space(root, leaf) < total_size) {
		btrfs_print_leaf(root, leaf);
		printk(KERN_CRIT "not enough freespace need %u have %d\n",
		       total_size, btrfs_leaf_free_space(root, leaf));
		BUG();
	}

	/* inserting in the middle: make room for the new headers and data */
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
			       slot, old_data, data_end);
			BUG_ON(1);
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(leaf, i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff - total_data, &token);
		}
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end - total_data, btrfs_leaf_data(leaf) +
			      data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(leaf, slot + i);
		btrfs_set_token_item_offset(leaf, item,
					    data_end - data_size[i], &token);
		data_end -= data_size[i];
		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
	}

	btrfs_set_header_nritems(leaf, nritems + nr);

	/* a new smallest key in the leaf must be propagated upward */
	if (slot == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		fixup_low_keys(root, path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
4594
4595 /*
4596  * Given a key and some data, insert items into the tree.
4597  * This does all the path init required, making room in the tree if needed.
4598  */
4599 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4600                             struct btrfs_root *root,
4601                             struct btrfs_path *path,
4602                             struct btrfs_key *cpu_key, u32 *data_size,
4603                             int nr)
4604 {
4605         int ret = 0;
4606         int slot;
4607         int i;
4608         u32 total_size = 0;
4609         u32 total_data = 0;
4610
4611         for (i = 0; i < nr; i++)
4612                 total_data += data_size[i];
4613
4614         total_size = total_data + (nr * sizeof(struct btrfs_item));
4615         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4616         if (ret == 0)
4617                 return -EEXIST;
4618         if (ret < 0)
4619                 return ret;
4620
4621         slot = path->slots[0];
4622         BUG_ON(slot < 0);
4623
4624         setup_items_for_insert(root, path, cpu_key, data_size,
4625                                total_data, total_size, nr);
4626         return 0;
4627 }
4628
4629 /*
4630  * Given a key and some data, insert an item into the tree.
4631  * This does all the path init required, making room in the tree if needed.
4632  */
4633 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4634                       *root, struct btrfs_key *cpu_key, void *data, u32
4635                       data_size)
4636 {
4637         int ret = 0;
4638         struct btrfs_path *path;
4639         struct extent_buffer *leaf;
4640         unsigned long ptr;
4641
4642         path = btrfs_alloc_path();
4643         if (!path)
4644                 return -ENOMEM;
4645         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4646         if (!ret) {
4647                 leaf = path->nodes[0];
4648                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4649                 write_extent_buffer(leaf, data, ptr, data_size);
4650                 btrfs_mark_buffer_dirty(leaf);
4651         }
4652         btrfs_free_path(path);
4653         return ret;
4654 }
4655
/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		/*
		 * Not the last slot: shift the key pointers after 'slot'
		 * left by one.  The move is recorded in the tree mod log
		 * before the pointers are physically moved.
		 */
		if (level)
			tree_mod_log_eb_move(root->fs_info, parent, slot,
					     slot + 1, nritems - slot - 1);
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		/* last slot: nothing to move, just log the key removal */
		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
					      MOD_LOG_KEY_REMOVE);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		/* root went empty; it must have been a level-1 node */
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		/* the node's smallest key changed, propagate it upwards */
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(root, path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}
4699
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.  zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	/* the leaf's generation should match the running transaction */
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	/* take an extra ref so the buffer outlives the extent free */
	extent_buffer_get(leaf);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	int last_off;
	int dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	/* total data bytes owned by the nr items being deleted */
	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		int data_end = leaf_data_end(root, leaf);

		/*
		 * close the hole in the data area: slide everything below
		 * the deleted items' data up by dsize bytes
		 */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + dsize,
			      btrfs_leaf_data(leaf) + data_end,
			      last_off - data_end);

		/* re-point the surviving items at their shifted data */
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(leaf, i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff + dsize, &token);
		}

		/* close the hole in the item header array as well */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			/* empty root leaf stays, just reset its level */
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_set_path_blocking(path);
			clean_tree_block(trans, root, leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			/* leaf's first key changed, fix parent keys */
			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(root, path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			extent_buffer_get(leaf);

			btrfs_set_path_blocking(path);
			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			/* if items remain, try pushing them right instead */
			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				/* fully drained: drop the leaf itself */
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
4844
4845 /*
4846  * search the tree again to find a leaf with lesser keys
4847  * returns 0 if it found something or 1 if there are no lesser leaves.
4848  * returns < 0 on io errors.
4849  *
4850  * This may release the path, and so you may lose any locks held at the
4851  * time you call it.
4852  */
4853 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4854 {
4855         struct btrfs_key key;
4856         struct btrfs_disk_key found_key;
4857         int ret;
4858
4859         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4860
4861         if (key.offset > 0)
4862                 key.offset--;
4863         else if (key.type > 0)
4864                 key.type--;
4865         else if (key.objectid > 0)
4866                 key.objectid--;
4867         else
4868                 return 1;
4869
4870         btrfs_release_path(path);
4871         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4872         if (ret < 0)
4873                 return ret;
4874         btrfs_item_key(path->nodes[0], &found_key, 0);
4875         ret = comp_keys(&found_key, &key);
4876         if (ret < 0)
4877                 return 0;
4878         return 1;
4879 }
4880
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This does lock as it descends, and path->keep_locks should be set
 * to 1 by the caller.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * (max_key is not referenced by this implementation.)
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_key *max_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;

	WARN_ON(!path->keep_locks);
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	/* the whole tree is older than min_trans: nothing to report */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = bin_search(cur, min_key, level, &slot);

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 blockptr;
			u64 gen;

			/* NOTE(review): blockptr is read but unused here */
			blockptr = btrfs_node_blockptr(cur, slot);
			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			btrfs_set_path_blocking(path);
			sret = btrfs_find_next_key(root, path, min_key, level,
						  min_trans);
			if (sret == 0) {
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			unlock_up(path, level, 1, 0, NULL);
			goto out;
		}
		/* descend one level: lock the child, then drop the parents */
		btrfs_set_path_blocking(path);
		cur = read_node_slot(root, cur, slot);
		BUG_ON(!cur); /* -ENOMEM */

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
		btrfs_clear_path_blocking(path, NULL, 0);
	}
out:
	if (ret == 0)
		memcpy(min_key, &found_key, sizeof(found_key));
	btrfs_set_path_blocking(path);
	return ret;
}
5002
5003 static void tree_move_down(struct btrfs_root *root,
5004                            struct btrfs_path *path,
5005                            int *level, int root_level)
5006 {
5007         BUG_ON(*level == 0);
5008         path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5009                                         path->slots[*level]);
5010         path->slots[*level - 1] = 0;
5011         (*level)--;
5012 }
5013
5014 static int tree_move_next_or_upnext(struct btrfs_root *root,
5015                                     struct btrfs_path *path,
5016                                     int *level, int root_level)
5017 {
5018         int ret = 0;
5019         int nritems;
5020         nritems = btrfs_header_nritems(path->nodes[*level]);
5021
5022         path->slots[*level]++;
5023
5024         while (path->slots[*level] >= nritems) {
5025                 if (*level == root_level)
5026                         return -1;
5027
5028                 /* move upnext */
5029                 path->slots[*level] = 0;
5030                 free_extent_buffer(path->nodes[*level]);
5031                 path->nodes[*level] = NULL;
5032                 (*level)++;
5033                 path->slots[*level]++;
5034
5035                 nritems = btrfs_header_nritems(path->nodes[*level]);
5036                 ret = 1;
5037         }
5038         return ret;
5039 }
5040
5041 /*
5042  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5043  * or down.
5044  */
5045 static int tree_advance(struct btrfs_root *root,
5046                         struct btrfs_path *path,
5047                         int *level, int root_level,
5048                         int allow_down,
5049                         struct btrfs_key *key)
5050 {
5051         int ret;
5052
5053         if (*level == 0 || !allow_down) {
5054                 ret = tree_move_next_or_upnext(root, path, level, root_level);
5055         } else {
5056                 tree_move_down(root, path, level, root_level);
5057                 ret = 0;
5058         }
5059         if (ret >= 0) {
5060                 if (*level == 0)
5061                         btrfs_item_key_to_cpu(path->nodes[*level], key,
5062                                         path->slots[*level]);
5063                 else
5064                         btrfs_node_key_to_cpu(path->nodes[*level], key,
5065                                         path->slots[*level]);
5066         }
5067         return ret;
5068 }
5069
5070 static int tree_compare_item(struct btrfs_root *left_root,
5071                              struct btrfs_path *left_path,
5072                              struct btrfs_path *right_path,
5073                              char *tmp_buf)
5074 {
5075         int cmp;
5076         int len1, len2;
5077         unsigned long off1, off2;
5078
5079         len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5080         len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5081         if (len1 != len2)
5082                 return 1;
5083
5084         off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5085         off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5086                                 right_path->slots[0]);
5087
5088         read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5089
5090         cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5091         if (cmp)
5092                 return 1;
5093         return 0;
5094 }
5095
/*
 * Advance modes for the tree walkers in btrfs_compare_trees():
 * ADVANCE permits descending into children, ADVANCE_ONLY_NEXT moves
 * only sideways/up (used while on shared blocks).
 */
#define ADVANCE 1
#define ADVANCE_ONLY_NEXT -1
5098
5099 /*
5100  * This function compares two trees and calls the provided callback for
5101  * every changed/new/deleted item it finds.
5102  * If shared tree blocks are encountered, whole subtrees are skipped, making
5103  * the compare pretty fast on snapshotted subvolumes.
5104  *
5105  * This currently works on commit roots only. As commit roots are read only,
5106  * we don't do any locking. The commit roots are protected with transactions.
5107  * Transactions are ended and rejoined when a commit is tried in between.
5108  *
5109  * This function checks for modifications done to the trees while comparing.
5110  * If it detects a change, it aborts immediately.
5111  */
5112 int btrfs_compare_trees(struct btrfs_root *left_root,
5113                         struct btrfs_root *right_root,
5114                         btrfs_changed_cb_t changed_cb, void *ctx)
5115 {
5116         int ret;
5117         int cmp;
5118         struct btrfs_trans_handle *trans = NULL;
5119         struct btrfs_path *left_path = NULL;
5120         struct btrfs_path *right_path = NULL;
5121         struct btrfs_key left_key;
5122         struct btrfs_key right_key;
5123         char *tmp_buf = NULL;
5124         int left_root_level;
5125         int right_root_level;
5126         int left_level;
5127         int right_level;
5128         int left_end_reached;
5129         int right_end_reached;
5130         int advance_left;
5131         int advance_right;
5132         u64 left_blockptr;
5133         u64 right_blockptr;
5134         u64 left_start_ctransid;
5135         u64 right_start_ctransid;
5136         u64 ctransid;
5137
5138         left_path = btrfs_alloc_path();
5139         if (!left_path) {
5140                 ret = -ENOMEM;
5141                 goto out;
5142         }
5143         right_path = btrfs_alloc_path();
5144         if (!right_path) {
5145                 ret = -ENOMEM;
5146                 goto out;
5147         }
5148
5149         tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
5150         if (!tmp_buf) {
5151                 ret = -ENOMEM;
5152                 goto out;
5153         }
5154
5155         left_path->search_commit_root = 1;
5156         left_path->skip_locking = 1;
5157         right_path->search_commit_root = 1;
5158         right_path->skip_locking = 1;
5159
5160         spin_lock(&left_root->root_item_lock);
5161         left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5162         spin_unlock(&left_root->root_item_lock);
5163
5164         spin_lock(&right_root->root_item_lock);
5165         right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5166         spin_unlock(&right_root->root_item_lock);
5167
5168         trans = btrfs_join_transaction(left_root);
5169         if (IS_ERR(trans)) {
5170                 ret = PTR_ERR(trans);
5171                 trans = NULL;
5172                 goto out;
5173         }
5174
5175         /*
5176          * Strategy: Go to the first items of both trees. Then do
5177          *
5178          * If both trees are at level 0
5179          *   Compare keys of current items
5180          *     If left < right treat left item as new, advance left tree
5181          *       and repeat
5182          *     If left > right treat right item as deleted, advance right tree
5183          *       and repeat
5184          *     If left == right do deep compare of items, treat as changed if
5185          *       needed, advance both trees and repeat
5186          * If both trees are at the same level but not at level 0
5187          *   Compare keys of current nodes/leafs
5188          *     If left < right advance left tree and repeat
5189          *     If left > right advance right tree and repeat
5190          *     If left == right compare blockptrs of the next nodes/leafs
5191          *       If they match advance both trees but stay at the same level
5192          *         and repeat
5193          *       If they don't match advance both trees while allowing to go
5194          *         deeper and repeat
5195          * If tree levels are different
5196          *   Advance the tree that needs it and repeat
5197          *
5198          * Advancing a tree means:
5199          *   If we are at level 0, try to go to the next slot. If that's not
5200          *   possible, go one level up and repeat. Stop when we found a level
5201          *   where we could go to the next slot. We may at this point be on a
5202          *   node or a leaf.
5203          *
5204          *   If we are not at level 0 and not on shared tree blocks, go one
5205          *   level deeper.
5206          *
5207          *   If we are not at level 0 and on shared tree blocks, go one slot to
5208          *   the right if possible or go up and right.
5209          */
5210
5211         left_level = btrfs_header_level(left_root->commit_root);
5212         left_root_level = left_level;
5213         left_path->nodes[left_level] = left_root->commit_root;
5214         extent_buffer_get(left_path->nodes[left_level]);
5215
5216         right_level = btrfs_header_level(right_root->commit_root);
5217         right_root_level = right_level;
5218         right_path->nodes[right_level] = right_root->commit_root;
5219         extent_buffer_get(right_path->nodes[right_level]);
5220
5221         if (left_level == 0)
5222                 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5223                                 &left_key, left_path->slots[left_level]);
5224         else
5225                 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5226                                 &left_key, left_path->slots[left_level]);
5227         if (right_level == 0)
5228                 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5229                                 &right_key, right_path->slots[right_level]);
5230         else
5231                 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5232                                 &right_key, right_path->slots[right_level]);
5233
5234         left_end_reached = right_end_reached = 0;
5235         advance_left = advance_right = 0;
5236
5237         while (1) {
5238                 /*
5239                  * We need to make sure the transaction does not get committed
5240                  * while we do anything on commit roots. This means, we need to
5241                  * join and leave transactions for every item that we process.
5242                  */
5243                 if (trans && btrfs_should_end_transaction(trans, left_root)) {
5244                         btrfs_release_path(left_path);
5245                         btrfs_release_path(right_path);
5246
5247                         ret = btrfs_end_transaction(trans, left_root);
5248                         trans = NULL;
5249                         if (ret < 0)
5250                                 goto out;
5251                 }
5252                 /* now rejoin the transaction */
5253                 if (!trans) {
5254                         trans = btrfs_join_transaction(left_root);
5255                         if (IS_ERR(trans)) {
5256                                 ret = PTR_ERR(trans);
5257                                 trans = NULL;
5258                                 goto out;
5259                         }
5260
5261                         spin_lock(&left_root->root_item_lock);
5262                         ctransid = btrfs_root_ctransid(&left_root->root_item);
5263                         spin_unlock(&left_root->root_item_lock);
5264                         if (ctransid != left_start_ctransid)
5265                                 left_start_ctransid = 0;
5266
5267                         spin_lock(&right_root->root_item_lock);
5268                         ctransid = btrfs_root_ctransid(&right_root->root_item);
5269                         spin_unlock(&right_root->root_item_lock);
5270                         if (ctransid != right_start_ctransid)
5271                                 right_start_ctransid = 0;
5272
5273                         if (!left_start_ctransid || !right_start_ctransid) {
5274                                 WARN(1, KERN_WARNING
5275                                         "btrfs: btrfs_compare_tree detected "
5276                                         "a change in one of the trees while "
5277                                         "iterating. This is probably a "
5278                                         "bug.\n");
5279                                 ret = -EIO;
5280                                 goto out;
5281                         }
5282
5283                         /*
5284                          * the commit root may have changed, so start again
5285                          * where we stopped
5286                          */
5287                         left_path->lowest_level = left_level;
5288                         right_path->lowest_level = right_level;
5289                         ret = btrfs_search_slot(NULL, left_root,
5290                                         &left_key, left_path, 0, 0);
5291                         if (ret < 0)
5292                                 goto out;
5293                         ret = btrfs_search_slot(NULL, right_root,
5294                                         &right_key, right_path, 0, 0);
5295                         if (ret < 0)
5296                                 goto out;
5297                 }
5298
5299                 if (advance_left && !left_end_reached) {
5300                         ret = tree_advance(left_root, left_path, &left_level,
5301                                         left_root_level,
5302                                         advance_left != ADVANCE_ONLY_NEXT,
5303                                         &left_key);
5304                         if (ret < 0)
5305                                 left_end_reached = ADVANCE;
5306                         advance_left = 0;
5307                 }
5308                 if (advance_right && !right_end_reached) {
5309                         ret = tree_advance(right_root, right_path, &right_level,
5310                                         right_root_level,
5311                                         advance_right != ADVANCE_ONLY_NEXT,
5312                                         &right_key);
5313                         if (ret < 0)
5314                                 right_end_reached = ADVANCE;
5315                         advance_right = 0;
5316                 }
5317
5318                 if (left_end_reached && right_end_reached) {
5319                         ret = 0;
5320                         goto out;
5321                 } else if (left_end_reached) {
5322                         if (right_level == 0) {
5323                                 ret = changed_cb(left_root, right_root,
5324                                                 left_path, right_path,
5325                                                 &right_key,
5326                                                 BTRFS_COMPARE_TREE_DELETED,
5327                                                 ctx);
5328                                 if (ret < 0)
5329                                         goto out;
5330                         }
5331                         advance_right = ADVANCE;
5332                         continue;
5333                 } else if (right_end_reached) {
5334                         if (left_level == 0) {
5335                                 ret = changed_cb(left_root, right_root,
5336                                                 left_path, right_path,
5337                                                 &left_key,
5338                                                 BTRFS_COMPARE_TREE_NEW,
5339                                                 ctx);
5340                                 if (ret < 0)
5341                                         goto out;
5342                         }
5343                         advance_left = ADVANCE;
5344                         continue;
5345                 }
5346
5347                 if (left_level == 0 && right_level == 0) {
5348                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5349                         if (cmp < 0) {
5350                                 ret = changed_cb(left_root, right_root,
5351                                                 left_path, right_path,
5352                                                 &left_key,
5353                                                 BTRFS_COMPARE_TREE_NEW,
5354                                                 ctx);
5355                                 if (ret < 0)
5356                                         goto out;
5357                                 advance_left = ADVANCE;
5358                         } else if (cmp > 0) {
5359                                 ret = changed_cb(left_root, right_root,
5360                                                 left_path, right_path,
5361                                                 &right_key,
5362                                                 BTRFS_COMPARE_TREE_DELETED,
5363                                                 ctx);
5364                                 if (ret < 0)
5365                                         goto out;
5366                                 advance_right = ADVANCE;
5367                         } else {
5368                                 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5369                                 ret = tree_compare_item(left_root, left_path,
5370                                                 right_path, tmp_buf);
5371                                 if (ret) {
5372                                         WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5373                                         ret = changed_cb(left_root, right_root,
5374                                                 left_path, right_path,
5375                                                 &left_key,
5376                                                 BTRFS_COMPARE_TREE_CHANGED,
5377                                                 ctx);
5378                                         if (ret < 0)
5379                                                 goto out;
5380                                 }
5381                                 advance_left = ADVANCE;
5382                                 advance_right = ADVANCE;
5383                         }
5384                 } else if (left_level == right_level) {
5385                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5386                         if (cmp < 0) {
5387                                 advance_left = ADVANCE;
5388                         } else if (cmp > 0) {
5389                                 advance_right = ADVANCE;
5390                         } else {
5391                                 left_blockptr = btrfs_node_blockptr(
5392                                                 left_path->nodes[left_level],
5393                                                 left_path->slots[left_level]);
5394                                 right_blockptr = btrfs_node_blockptr(
5395                                                 right_path->nodes[right_level],
5396                                                 right_path->slots[right_level]);
5397                                 if (left_blockptr == right_blockptr) {
5398                                         /*
5399                                          * As we're on a shared block, don't
5400                                          * allow to go deeper.
5401                                          */
5402                                         advance_left = ADVANCE_ONLY_NEXT;
5403                                         advance_right = ADVANCE_ONLY_NEXT;
5404                                 } else {
5405                                         advance_left = ADVANCE;
5406                                         advance_right = ADVANCE;
5407                                 }
5408                         }
5409                 } else if (left_level < right_level) {
5410                         advance_right = ADVANCE;
5411                 } else {
5412                         advance_left = ADVANCE;
5413                 }
5414         }
5415
5416 out:
5417         btrfs_free_path(left_path);
5418         btrfs_free_path(right_path);
5419         kfree(tmp_buf);
5420
5421         if (trans) {
5422                 if (!ret)
5423                         ret = btrfs_end_transaction(trans, left_root);
5424                 else
5425                         btrfs_end_transaction(trans, left_root);
5426         }
5427
5428         return ret;
5429 }
5430
/*
 * This is similar to btrfs_next_leaf, but does not try to preserve
 * and fix up the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameter.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree.
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	/* Caller must have searched with keep_locks set (see header comment). */
	WARN_ON(!path->keep_locks);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		/* Start at the slot just past the current position. */
		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;

			/* No parent to climb into: no higher key exists. */
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			/*
			 * If we hold the lock on the parent we can simply
			 * move up a level and continue scanning there.
			 */
			if (path->locks[level + 1]) {
				level++;
				continue;
			}

			/*
			 * Parent is unlocked, so the cached path can't be
			 * trusted past this node.  Remember the last key in
			 * this node, drop the whole path and re-search down
			 * to this level to get a fresh, locked view.
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			/* Restore the caller's lowest_level unconditionally. */
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			/* Exact match: we already visited cur_key, step past it. */
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/*
			 * Skip pointers last modified before min_trans; only
			 * keys from sufficiently new generations qualify.
			 */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
5506
5507 /*
5508  * search the tree again to find a leaf with greater keys
5509  * returns 0 if it found something or 1 if there are no greater leaves.
5510  * returns < 0 on io errors.
5511  */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	/*
	 * time_seq == 0 makes btrfs_next_old_leaf use a plain
	 * btrfs_search_slot instead of a tree-mod-log (historical) search.
	 */
	return btrfs_next_old_leaf(root, path, 0);
}
5516
/*
 * Like btrfs_next_leaf, but can walk an old version of the tree when
 * time_seq is non-zero (the search is replayed through the tree mod log
 * via btrfs_search_old_slot).
 *
 * Returns 0 if a next leaf was found, 1 if there is no greater leaf and
 * < 0 on error.  On success the path points at slot 0 of the next leaf.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;	/* restored at done: */
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* Remember the last key of the current leaf; we re-search from it. */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}

	/*
	 * Walk up the tree looking for the first ancestor that has a slot
	 * to the right of our position; that subtree holds the next leaf.
	 */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			/* This node is exhausted; try one level higher. */
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/* Drop the block picked on a previous loop iteration. */
		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(NULL, root, path, &next, level,
					    slot, &key, 0);
		/* -EAGAIN: block changed under us; restart the whole walk. */
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				/* Try-lock failed; take a blocking read lock. */
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/*
	 * Walk back down the left edge of the chosen subtree, swapping each
	 * path level over to the new branch until we reach the leaf.
	 */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(NULL, root, path, &next, level,
					    0, &key, 0);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	/* Release everything above the leaf; callers only need level 0. */
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5671
5672 /*
5673  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5674  * searching until it gets past min_objectid or finds an item of 'type'
5675  *
5676  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5677  */
5678 int btrfs_previous_item(struct btrfs_root *root,
5679                         struct btrfs_path *path, u64 min_objectid,
5680                         int type)
5681 {
5682         struct btrfs_key found_key;
5683         struct extent_buffer *leaf;
5684         u32 nritems;
5685         int ret;
5686
5687         while (1) {
5688                 if (path->slots[0] == 0) {
5689                         btrfs_set_path_blocking(path);
5690                         ret = btrfs_prev_leaf(root, path);
5691                         if (ret != 0)
5692                                 return ret;
5693                 } else {
5694                         path->slots[0]--;
5695                 }
5696                 leaf = path->nodes[0];
5697                 nritems = btrfs_header_nritems(leaf);
5698                 if (nritems == 0)
5699                         return 1;
5700                 if (path->slots[0] == nritems)
5701                         path->slots[0]--;
5702
5703                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5704                 if (found_key.objectid < min_objectid)
5705                         break;
5706                 if (found_key.type == type)
5707                         return 0;
5708                 if (found_key.objectid == min_objectid &&
5709                     found_key.type < type)
5710                         break;
5711         }
5712         return 1;
5713 }