fs/btrfs/tree-log.c

   1 /*
   2  * Copyright (C) 2008 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 02111-1307, USA.
  17  */
  18
  19 #include <linux/sched.h>
  20 #include <linux/slab.h>
  21 #include <linux/blkdev.h>
  22 #include <linux/list_sort.h>
  23 #include "ctree.h"
  24 #include "transaction.h"
  25 #include "disk-io.h"
  26 #include "locking.h"
  27 #include "print-tree.h"
  28 #include "backref.h"
  29 #include "compat.h"
  30 #include "tree-log.h"
  31 #include "hash.h"
  32
  33 /* magic values for the inode_only field in btrfs_log_inode:
  34  *
  35  * LOG_INODE_ALL means to log everything
  36  * LOG_INODE_EXISTS means to log just enough to recreate the inode
  37  * during log replay
  38  */
  39 #define LOG_INODE_ALL 0
  40 #define LOG_INODE_EXISTS 1
  41
  42 /*
  43  * directory trouble cases
  44  *
  45  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
  46  * log, we must force a full commit before doing an fsync of the directory
  47  * where the unlink was done.
  48  * ---> record transid of last unlink/rename per directory
  49  *
  50  * mkdir foo/some_dir
  51  * normal commit
  52  * rename foo/some_dir foo2/some_dir
  53  * mkdir foo/some_dir
  54  * fsync foo/some_dir/some_file
  55  *
  56  * The fsync above will unlink the original some_dir without recording
  57  * it in its new location (foo2).  After a crash, some_dir will be gone
  58  * unless the fsync of some_file forces a full commit
  59  *
  60  * 2) we must log any new names for any file or dir that is in the fsync
  61  * log. ---> check inode while renaming/linking.
  62  *
  63  * 2a) we must log any new names for any file or dir during rename
  64  * when the directory they are being removed from was logged.
  65  * ---> check inode and old parent dir during rename
  66  *
  67  *  2a is actually the more important variant.  Without the extra logging
  68  *  a crash might unlink the old name without recreating the new one
  69  *
  70  * 3) after a crash, we must go through any directories with a link count
  71  * of zero and redo the rm -rf
  72  *
  73  * mkdir f1/foo
  74  * normal commit
  75  * rm -rf f1/foo
  76  * fsync(f1)
  77  *
  78  * The directory f1/foo was fully removed from the FS, but fsync was never
  79  * called on f1/foo, only its parent dir (f1).  After a crash the rm -rf must
  80  * be replayed.  This must be able to recurse down the entire
  81  * directory tree.  The inode link count fixup code takes care of the
  82  * ugly details.
  83  */
  84
  85 /*
  86  * stages for the tree walking.  The first
  87  * stage (0) is to only pin down the blocks we find
  88  * the second stage (1) is to make sure that all the inodes
  89  * we find in the log are created in the subvolume.
  90  *
  91  * The last stage is to deal with directories and links and extents
  92  * and all the other fun semantics
  93  */
  94 #define LOG_WALK_PIN_ONLY 0
  95 #define LOG_WALK_REPLAY_INODES 1
  96 #define LOG_WALK_REPLAY_DIR_INDEX 2
  97 #define LOG_WALK_REPLAY_ALL 3
  98
  99 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 100                              struct btrfs_root *root, struct inode *inode,
 101                              int inode_only);
 102 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 103                              struct btrfs_root *root,
 104                              struct btrfs_path *path, u64 objectid);
 105 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 106                                        struct btrfs_root *root,
 107                                        struct btrfs_root *log,
 108                                        struct btrfs_path *path,
 109                                        u64 dirid, int del_all);
 110
 111 /*
 112  * tree logging is a special write ahead log used to make sure that
 113  * fsyncs and O_SYNCs can happen without doing full tree commits.
 114  *
 115  * Full tree commits are expensive because they require commonly
 116  * modified blocks to be recowed, creating many dirty pages in the
 117  * extent tree and a 4x-6x higher write load than ext3.
 118  *
 119  * Instead of doing a tree commit on every fsync, we use the
 120  * key ranges and transaction ids to find items for a given file or directory
 121  * that have changed in this transaction.  Those items are copied into
 122  * a special tree (one per subvolume root), that tree is written to disk
 123  * and then the fsync is considered complete.
 124  *
 125  * After a crash, items are copied out of the log-tree back into the
 126  * subvolume tree.  Any file data extents found are recorded in the extent
 127  * allocation tree, and the log-tree freed.
 128  *
 129  * The log tree is read three times: once to pin down all the extents it is
 130  * using in ram, once to create all the inodes logged in the tree
 131  * and once to do all the other items.
 132  */
 133
 134 /*
 135  * start a sub transaction and setup the log tree
 136  * this increments the log tree writer count to make the people
 137  * syncing the tree wait for us to finish
 138  */
 139 static int start_log_trans(struct btrfs_trans_handle *trans,
 140                            struct btrfs_root *root)
 141 {
 142         int ret;
 143         int err = 0;
 144
 145         mutex_lock(&root->log_mutex);
 146         if (root->log_root) {
 147                 if (!root->log_start_pid) {
 148                         root->log_start_pid = current->pid;
 149                         root->log_multiple_pids = false;
 150                 } else if (root->log_start_pid != current->pid) {
 151                         root->log_multiple_pids = true;
 152                 }
 153
 154                 atomic_inc(&root->log_batch);
 155                 atomic_inc(&root->log_writers);
 156                 mutex_unlock(&root->log_mutex);
 157                 return 0;
 158         }
 159         root->log_multiple_pids = false;
 160         root->log_start_pid = current->pid;
 161         mutex_lock(&root->fs_info->tree_log_mutex);
 162         if (!root->fs_info->log_root_tree) {
 163                 ret = btrfs_init_log_root_tree(trans, root->fs_info);
 164                 if (ret)
 165                         err = ret;
 166         }
 167         if (err == 0 && !root->log_root) {
 168                 ret = btrfs_add_log_tree(trans, root);
 169                 if (ret)
 170                         err = ret;
 171         }
 172         mutex_unlock(&root->fs_info->tree_log_mutex);
 173         atomic_inc(&root->log_batch);
 174         atomic_inc(&root->log_writers);
 175         mutex_unlock(&root->log_mutex);
 176         return err;
 177 }
 178
 179 /*
 180  * returns 0 if there was a log transaction running and we were able
 181  * to join, or returns -ENOENT if there were not transactions
 182  * in progress
 183  */
 184 static int join_running_log_trans(struct btrfs_root *root)
 185 {
 186         int ret = -ENOENT;
 187
 188         smp_mb();
 189         if (!root->log_root)
 190                 return -ENOENT;
 191
 192         mutex_lock(&root->log_mutex);
 193         if (root->log_root) {
 194                 ret = 0;
 195                 atomic_inc(&root->log_writers);
 196         }
 197         mutex_unlock(&root->log_mutex);
 198         return ret;
 199 }
 200
 201 /*
 202  * This either makes the current running log transaction wait
 203  * until you call btrfs_end_log_trans() or it makes any future
 204  * log transactions wait until you call btrfs_end_log_trans()
 205  */
 206 int btrfs_pin_log_trans(struct btrfs_root *root)
 207 {
 208         int ret = -ENOENT;
 209
 210         mutex_lock(&root->log_mutex);
 211         atomic_inc(&root->log_writers);
 212         mutex_unlock(&root->log_mutex);
 213         return ret;
 214 }
 215
 216 /*
 217  * indicate we're done making changes to the log tree
 218  * and wake up anyone waiting to do a sync
 219  */
 220 void btrfs_end_log_trans(struct btrfs_root *root)
 221 {
 222         if (atomic_dec_and_test(&root->log_writers)) {
 223                 smp_mb();
 224                 if (waitqueue_active(&root->log_writer_wait))
 225                         wake_up(&root->log_writer_wait);
 226         }
 227 }
 228
 229
/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 *
 * The free/write/wait/pin members are used as boolean flags.
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in
	 * (presumably one of the LOG_WALK_* values)
	 */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 *
	 * log is the log tree being walked, eb the block found in it, wc
	 * this control struct and gen the generation handed to the buffer
	 * read/uptodate helpers.  A non-zero return is an error.
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};
 274
 275 /*
 276  * process_func used to pin down extents, write them or wait on them
 277  */
 278 static int process_one_buffer(struct btrfs_root *log,
 279                               struct extent_buffer *eb,
 280                               struct walk_control *wc, u64 gen)
 281 {
 282         int ret = 0;
 283
 284         /*
 285          * If this fs is mixed then we need to be able to process the leaves to
 286          * pin down any logged extents, so we have to read the block.
 287          */
 288         if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
 289                 ret = btrfs_read_buffer(eb, gen);
 290                 if (ret)
 291                         return ret;
 292         }
 293
 294         if (wc->pin)
 295                 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
 296                                                       eb->start, eb->len);
 297
 298         if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
 299                 if (wc->pin && btrfs_header_level(eb) == 0)
 300                         ret = btrfs_exclude_logged_extents(log, eb);
 301                 if (wc->write)
 302                         btrfs_write_tree_block(eb);
 303                 if (wc->wait)
 304                         btrfs_wait_tree_block_writeback(eb);
 305         }
 306         return ret;
 307 }
 308
 309 /*
 310  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 311  * to the src data we are copying out.
 312  *
 313  * root is the tree we are copying into, and path is a scratch
 314  * path for use in this function (it should be released on entry and
 315  * will be released on exit).
 316  *
 317  * If the key is already in the destination tree the existing item is
 318  * overwritten.  If the existing item isn't big enough, it is extended.
 319  * If it is too large, it is truncated.
 320  *
 321  * If the key isn't in the destination yet, a new item is inserted.
 322  */
 323 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 324                                    struct btrfs_root *root,
 325                                    struct btrfs_path *path,
 326                                    struct extent_buffer *eb, int slot,
 327                                    struct btrfs_key *key)
 328 {
 329         int ret;
 330         u32 item_size;
 331         u64 saved_i_size = 0;
 332         int save_old_i_size = 0;
 333         unsigned long src_ptr;
 334         unsigned long dst_ptr;
 335         int overwrite_root = 0;
 336         bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
 337
 338         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 339                 overwrite_root = 1;
 340
 341         item_size = btrfs_item_size_nr(eb, slot);
 342         src_ptr = btrfs_item_ptr_offset(eb, slot);
 343
 344         /* look for the key in the destination tree */
 345         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 346         if (ret < 0)
 347                 return ret;
 348
 349         if (ret == 0) {
 350                 char *src_copy;
 351                 char *dst_copy;
 352                 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
 353                                                   path->slots[0]);
 354                 if (dst_size != item_size)
 355                         goto insert;
 356
 357                 if (item_size == 0) {
 358                         btrfs_release_path(path);
 359                         return 0;
 360                 }
 361                 dst_copy = kmalloc(item_size, GFP_NOFS);
 362                 src_copy = kmalloc(item_size, GFP_NOFS);
 363                 if (!dst_copy || !src_copy) {
 364                         btrfs_release_path(path);
 365                         kfree(dst_copy);
 366                         kfree(src_copy);
 367                         return -ENOMEM;
 368                 }
 369
 370                 read_extent_buffer(eb, src_copy, src_ptr, item_size);
 371
 372                 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 373                 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
 374                                    item_size);
 375                 ret = memcmp(dst_copy, src_copy, item_size);
 376
 377                 kfree(dst_copy);
 378                 kfree(src_copy);
 379                 /*
 380                  * they have the same contents, just return, this saves
 381                  * us from cowing blocks in the destination tree and doing
 382                  * extra writes that may not have been done by a previous
 383                  * sync
 384                  */
 385                 if (ret == 0) {
 386                         btrfs_release_path(path);
 387                         return 0;
 388                 }
 389
 390                 /*
 391                  * We need to load the old nbytes into the inode so when we
 392                  * replay the extents we've logged we get the right nbytes.
 393                  */
 394                 if (inode_item) {
 395                         struct btrfs_inode_item *item;
 396                         u64 nbytes;
 397
 398                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 399                                               struct btrfs_inode_item);
 400                         nbytes = btrfs_inode_nbytes(path->nodes[0], item);
 401                         item = btrfs_item_ptr(eb, slot,
 402                                               struct btrfs_inode_item);
 403                         btrfs_set_inode_nbytes(eb, item, nbytes);
 404                 }
 405         } else if (inode_item) {
 406                 struct btrfs_inode_item *item;
 407
 408                 /*
 409                  * New inode, set nbytes to 0 so that the nbytes comes out
 410                  * properly when we replay the extents.
 411                  */
 412                 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
 413                 btrfs_set_inode_nbytes(eb, item, 0);
 414         }
 415 insert:
 416         btrfs_release_path(path);
 417         /* try to insert the key into the destination tree */
 418         ret = btrfs_insert_empty_item(trans, root, path,
 419                                       key, item_size);
 420
 421         /* make sure any existing item is the correct size */
 422         if (ret == -EEXIST) {
 423                 u32 found_size;
 424                 found_size = btrfs_item_size_nr(path->nodes[0],
 425                                                 path->slots[0]);
 426                 if (found_size > item_size)
 427                         btrfs_truncate_item(root, path, item_size, 1);
 428                 else if (found_size < item_size)
 429                         btrfs_extend_item(root, path,
 430                                           item_size - found_size);
 431         } else if (ret) {
 432                 return ret;
 433         }
 434         dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
 435                                         path->slots[0]);
 436
 437         /* don't overwrite an existing inode if the generation number
 438          * was logged as zero.  This is done when the tree logging code
 439          * is just logging an inode to make sure it exists after recovery.
 440          *
 441          * Also, don't overwrite i_size on directories during replay.
 442          * log replay inserts and removes directory items based on the
 443          * state of the tree found in the subvolume, and i_size is modified
 444          * as it goes
 445          */
 446         if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
 447                 struct btrfs_inode_item *src_item;
 448                 struct btrfs_inode_item *dst_item;
 449
 450                 src_item = (struct btrfs_inode_item *)src_ptr;
 451                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 452
 453                 if (btrfs_inode_generation(eb, src_item) == 0)
 454                         goto no_copy;
 455
 456                 if (overwrite_root &&
 457                     S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
 458                     S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
 459                         save_old_i_size = 1;
 460                         saved_i_size = btrfs_inode_size(path->nodes[0],
 461                                                         dst_item);
 462                 }
 463         }
 464
 465         copy_extent_buffer(path->nodes[0], eb, dst_ptr,
 466                            src_ptr, item_size);
 467
 468         if (save_old_i_size) {
 469                 struct btrfs_inode_item *dst_item;
 470                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 471                 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
 472         }
 473
 474         /* make sure the generation is filled in */
 475         if (key->type == BTRFS_INODE_ITEM_KEY) {
 476                 struct btrfs_inode_item *dst_item;
 477                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 478                 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
 479                         btrfs_set_inode_generation(path->nodes[0], dst_item,
 480                                                    trans->transid);
 481                 }
 482         }
 483 no_copy:
 484         btrfs_mark_buffer_dirty(path->nodes[0]);
 485         btrfs_release_path(path);
 486         return 0;
 487 }
 488
 489 /*
 490  * simple helper to read an inode off the disk from a given root
 491  * This can only be called for subvolume roots and not for the log
 492  */
 493 static noinline struct inode *read_one_inode(struct btrfs_root *root,
 494                                              u64 objectid)
 495 {
 496         struct btrfs_key key;
 497         struct inode *inode;
 498
 499         key.objectid = objectid;
 500         key.type = BTRFS_INODE_ITEM_KEY;
 501         key.offset = 0;
 502         inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
 503         if (IS_ERR(inode)) {
 504                 inode = NULL;
 505         } else if (is_bad_inode(inode)) {
 506                 iput(inode);
 507                 inode = NULL;
 508         }
 509         return inode;
 510 }
 511
 512 /* replays a single extent in 'eb' at 'slot' with 'key' into the
 513  * subvolume 'root'.  path is released on entry and should be released
 514  * on exit.
 515  *
 516  * extents in the log tree have not been allocated out of the extent
 517  * tree yet.  So, this completes the allocation, taking a reference
 518  * as required if the extent already exists or creating a new extent
 519  * if it isn't in the extent allocation tree yet.
 520  *
 521  * The extent is inserted into the file, dropping any existing extents
 522  * from the file that overlap the new one.
 523  */
 524 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 525                                       struct btrfs_root *root,
 526                                       struct btrfs_path *path,
 527                                       struct extent_buffer *eb, int slot,
 528                                       struct btrfs_key *key)
 529 {
 530         int found_type;
 531         u64 extent_end;
 532         u64 start = key->offset;
 533         u64 nbytes = 0;
 534         struct btrfs_file_extent_item *item;
 535         struct inode *inode = NULL;
 536         unsigned long size;
 537         int ret = 0;
 538
 539         item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 540         found_type = btrfs_file_extent_type(eb, item);
 541
 542         if (found_type == BTRFS_FILE_EXTENT_REG ||
 543             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 544                 nbytes = btrfs_file_extent_num_bytes(eb, item);
 545                 extent_end = start + nbytes;
 546
 547                 /*
 548                  * We don't add to the inodes nbytes if we are prealloc or a
 549                  * hole.
 550                  */
 551                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
 552                         nbytes = 0;
 553         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 554                 size = btrfs_file_extent_inline_len(eb, item);
 555                 nbytes = btrfs_file_extent_ram_bytes(eb, item);
 556                 extent_end = ALIGN(start + size, root->sectorsize);
 557         } else {
 558                 ret = 0;
 559                 goto out;
 560         }
 561
 562         inode = read_one_inode(root, key->objectid);
 563         if (!inode) {
 564                 ret = -EIO;
 565                 goto out;
 566         }
 567
 568         /*
 569          * first check to see if we already have this extent in the
 570          * file.  This must be done before the btrfs_drop_extents run
 571          * so we don't try to drop this extent.
 572          */
 573         ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
 574                                        start, 0);
 575
 576         if (ret == 0 &&
 577             (found_type == BTRFS_FILE_EXTENT_REG ||
 578              found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 579                 struct btrfs_file_extent_item cmp1;
 580                 struct btrfs_file_extent_item cmp2;
 581                 struct btrfs_file_extent_item *existing;
 582                 struct extent_buffer *leaf;
 583
 584                 leaf = path->nodes[0];
 585                 existing = btrfs_item_ptr(leaf, path->slots[0],
 586                                           struct btrfs_file_extent_item);
 587
 588                 read_extent_buffer(eb, &cmp1, (unsigned long)item,
 589                                    sizeof(cmp1));
 590                 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
 591                                    sizeof(cmp2));
 592
 593                 /*
 594                  * we already have a pointer to this exact extent,
 595                  * we don't have to do anything
 596                  */
 597                 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
 598                         btrfs_release_path(path);
 599                         goto out;
 600                 }
 601         }
 602         btrfs_release_path(path);
 603
 604         /* drop any overlapping extents */
 605         ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
 606         if (ret)
 607                 goto out;
 608
 609         if (found_type == BTRFS_FILE_EXTENT_REG ||
 610             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 611                 u64 offset;
 612                 unsigned long dest_offset;
 613                 struct btrfs_key ins;
 614
 615                 ret = btrfs_insert_empty_item(trans, root, path, key,
 616                                               sizeof(*item));
 617                 if (ret)
 618                         goto out;
 619                 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 620                                                     path->slots[0]);
 621                 copy_extent_buffer(path->nodes[0], eb, dest_offset,
 622                                 (unsigned long)item,  sizeof(*item));
 623
 624                 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 625                 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 626                 ins.type = BTRFS_EXTENT_ITEM_KEY;
 627                 offset = key->offset - btrfs_file_extent_offset(eb, item);
 628
 629                 if (ins.objectid > 0) {
 630                         u64 csum_start;
 631                         u64 csum_end;
 632                         LIST_HEAD(ordered_sums);
 633                         /*
 634                          * is this extent already allocated in the extent
 635                          * allocation tree?  If so, just add a reference
 636                          */
 637                         ret = btrfs_lookup_extent(root, ins.objectid,
 638                                                 ins.offset);
 639                         if (ret == 0) {
 640                                 ret = btrfs_inc_extent_ref(trans, root,
 641                                                 ins.objectid, ins.offset,
 642                                                 0, root->root_key.objectid,
 643                                                 key->objectid, offset, 0);
 644                                 if (ret)
 645                                         goto out;
 646                         } else {
 647                                 /*
 648                                  * insert the extent pointer in the extent
 649                                  * allocation tree
 650                                  */
 651                                 ret = btrfs_alloc_logged_file_extent(trans,
 652                                                 root, root->root_key.objectid,
 653                                                 key->objectid, offset, &ins);
 654                                 if (ret)
 655                                         goto out;
 656                         }
 657                         btrfs_release_path(path);
 658
 659                         if (btrfs_file_extent_compression(eb, item)) {
 660                                 csum_start = ins.objectid;
 661                                 csum_end = csum_start + ins.offset;
 662                         } else {
 663                                 csum_start = ins.objectid +
 664                                         btrfs_file_extent_offset(eb, item);
 665                                 csum_end = csum_start +
 666                                         btrfs_file_extent_num_bytes(eb, item);
 667                         }
 668
 669                         ret = btrfs_lookup_csums_range(root->log_root,
 670                                                 csum_start, csum_end - 1,
 671                                                 &ordered_sums, 0);
 672                         if (ret)
 673                                 goto out;
 674                         while (!list_empty(&ordered_sums)) {
 675                                 struct btrfs_ordered_sum *sums;
 676                                 sums = list_entry(ordered_sums.next,
 677                                                 struct btrfs_ordered_sum,
 678                                                 list);
 679                                 if (!ret)
 680                                         ret = btrfs_csum_file_blocks(trans,
 681                                                 root->fs_info->csum_root,
 682                                                 sums);
 683                                 list_del(&sums->list);
 684                                 kfree(sums);
 685                         }
 686                         if (ret)
 687                                 goto out;
 688                 } else {
 689                         btrfs_release_path(path);
 690                 }
 691         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 692                 /* inline extents are easy, we just overwrite them */
 693                 ret = overwrite_item(trans, root, path, eb, slot, key);
 694                 if (ret)
 695                         goto out;
 696         }
 697
 698         inode_add_bytes(inode, nbytes);
 699         ret = btrfs_update_inode(trans, root, inode);
 700 out:
 701         if (inode)
 702                 iput(inode);
 703         return ret;
 704 }
 705
 706 /*
 707  * when cleaning up conflicts between the directory names in the
 708  * subvolume, directory names in the log and directory names in the
 709  * inode back references, we may have to unlink inodes from directories.
 710  *
 711  * This is a helper function to do the unlink of a specific directory
 712  * item
 713  */
 714 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 715                                       struct btrfs_root *root,
 716                                       struct btrfs_path *path,
 717                                       struct inode *dir,
 718                                       struct btrfs_dir_item *di)
 719 {
 720         struct inode *inode;
 721         char *name;
 722         int name_len;
 723         struct extent_buffer *leaf;
 724         struct btrfs_key location;
 725         int ret;
 726
 727         leaf = path->nodes[0];
 728
 729         btrfs_dir_item_key_to_cpu(leaf, di, &location);
 730         name_len = btrfs_dir_name_len(leaf, di);
 731         name = kmalloc(name_len, GFP_NOFS);
 732         if (!name)
 733                 return -ENOMEM;
 734
 735         read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
 736         btrfs_release_path(path);
 737
 738         inode = read_one_inode(root, location.objectid);
 739         if (!inode) {
 740                 ret = -EIO;
 741                 goto out;
 742         }
 743
 744         ret = link_to_fixup_dir(trans, root, path, location.objectid);
 745         if (ret)
 746                 goto out;
 747
 748         ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
 749         if (ret)
 750                 goto out;
 751         else
 752                 ret = btrfs_run_delayed_items(trans, root);
 753 out:
 754         kfree(name);
 755         iput(inode);
 756         return ret;
 757 }
 758
 759 /*
 760  * helper function to see if a given name and sequence number found
 761  * in an inode back reference are already in a directory and correctly
 762  * point to this inode
 763  */
 764 static noinline int inode_in_dir(struct btrfs_root *root,
 765                                  struct btrfs_path *path,
 766                                  u64 dirid, u64 objectid, u64 index,
 767                                  const char *name, int name_len)
 768 {
 769         struct btrfs_dir_item *di;
 770         struct btrfs_key location;
 771         int match = 0;
 772
 773         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
 774                                          index, name, name_len, 0);
 775         if (di && !IS_ERR(di)) {
 776                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 777                 if (location.objectid != objectid)
 778                         goto out;
 779         } else
 780                 goto out;
 781         btrfs_release_path(path);
 782
 783         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
 784         if (di && !IS_ERR(di)) {
 785                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 786                 if (location.objectid != objectid)
 787                         goto out;
 788         } else
 789                 goto out;
 790         match = 1;
 791 out:
 792         btrfs_release_path(path);
 793         return match;
 794 }
 795
 796 /*
 797  * helper function to check a log tree for a named back reference in
 798  * an inode.  This is used to decide if a back reference that is
 799  * found in the subvolume conflicts with what we find in the log.
 800  *
 801  * inode backreferences may have multiple refs in a single item,
 802  * during replay we process one reference at a time, and we don't
 803  * want to delete valid links to a file from the subvolume if that
 804  * link is also in the log.
 805  */
 806 static noinline int backref_in_log(struct btrfs_root *log,
 807                                    struct btrfs_key *key,
 808                                    u64 ref_objectid,
 809                                    char *name, int namelen)
 810 {
 811         struct btrfs_path *path;
 812         struct btrfs_inode_ref *ref;
 813         unsigned long ptr;
 814         unsigned long ptr_end;
 815         unsigned long name_ptr;
 816         int found_name_len;
 817         int item_size;
 818         int ret;
 819         int match = 0;
 820
 821         path = btrfs_alloc_path();
 822         if (!path)
 823                 return -ENOMEM;
 824
 825         ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
 826         if (ret != 0)
 827                 goto out;
 828
 829         ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 830
 831         if (key->type == BTRFS_INODE_EXTREF_KEY) {
 832                 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
 833                                                    name, namelen, NULL))
 834                         match = 1;
 835
 836                 goto out;
 837         }
 838
 839         item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 840         ptr_end = ptr + item_size;
 841         while (ptr < ptr_end) {
 842                 ref = (struct btrfs_inode_ref *)ptr;
 843                 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
 844                 if (found_name_len == namelen) {
 845                         name_ptr = (unsigned long)(ref + 1);
 846                         ret = memcmp_extent_buffer(path->nodes[0], name,
 847                                                    name_ptr, namelen);
 848                         if (ret == 0) {
 849                                 match = 1;
 850                                 goto out;
 851                         }
 852                 }
 853                 ptr = (unsigned long)(ref + 1) + found_name_len;
 854         }
 855 out:
 856         btrfs_free_path(path);
 857         return match;
 858 }
 859
 860 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 861                                   struct btrfs_root *root,
 862                                   struct btrfs_path *path,
 863                                   struct btrfs_root *log_root,
 864                                   struct inode *dir, struct inode *inode,
 865                                   struct extent_buffer *eb,
 866                                   u64 inode_objectid, u64 parent_objectid,
 867                                   u64 ref_index, char *name, int namelen,
 868                                   int *search_done)
 869 {
 870         int ret;
 871         char *victim_name;
 872         int victim_name_len;
 873         struct extent_buffer *leaf;
 874         struct btrfs_dir_item *di;
 875         struct btrfs_key search_key;
 876         struct btrfs_inode_extref *extref;
 877
 878 again:
 879         /* Search old style refs */
 880         search_key.objectid = inode_objectid;
 881         search_key.type = BTRFS_INODE_REF_KEY;
 882         search_key.offset = parent_objectid;
 883         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 884         if (ret == 0) {
 885                 struct btrfs_inode_ref *victim_ref;
 886                 unsigned long ptr;
 887                 unsigned long ptr_end;
 888
 889                 leaf = path->nodes[0];
 890
 891                 /* are we trying to overwrite a back ref for the root directory
 892                  * if so, just jump out, we're done
 893                  */
 894                 if (search_key.objectid == search_key.offset)
 895                         return 1;
 896
 897                 /* check all the names in this back reference to see
 898                  * if they are in the log.  if so, we allow them to stay
 899                  * otherwise they must be unlinked as a conflict
 900                  */
 901                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 902                 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
 903                 while (ptr < ptr_end) {
 904                         victim_ref = (struct btrfs_inode_ref *)ptr;
 905                         victim_name_len = btrfs_inode_ref_name_len(leaf,
 906                                                                    victim_ref);
 907                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
 908                         if (!victim_name)
 909                                 return -ENOMEM;
 910
 911                         read_extent_buffer(leaf, victim_name,
 912                                            (unsigned long)(victim_ref + 1),
 913                                            victim_name_len);
 914
 915                         if (!backref_in_log(log_root, &search_key,
 916                                             parent_objectid,
 917                                             victim_name,
 918                                             victim_name_len)) {
 919                                 btrfs_inc_nlink(inode);
 920                                 btrfs_release_path(path);
 921
 922                                 ret = btrfs_unlink_inode(trans, root, dir,
 923                                                          inode, victim_name,
 924                                                          victim_name_len);
 925                                 kfree(victim_name);
 926                                 if (ret)
 927                                         return ret;
 928                                 ret = btrfs_run_delayed_items(trans, root);
 929                                 if (ret)
 930                                         return ret;
 931                                 *search_done = 1;
 932                                 goto again;
 933                         }
 934                         kfree(victim_name);
 935
 936                         ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
 937                 }
 938
 939                 /*
 940                  * NOTE: we have searched root tree and checked the
 941                  * coresponding ref, it does not need to check again.
 942                  */
 943                 *search_done = 1;
 944         }
 945         btrfs_release_path(path);
 946
 947         /* Same search but for extended refs */
 948         extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
 949                                            inode_objectid, parent_objectid, 0,
 950                                            0);
 951         if (!IS_ERR_OR_NULL(extref)) {
 952                 u32 item_size;
 953                 u32 cur_offset = 0;
 954                 unsigned long base;
 955                 struct inode *victim_parent;
 956
 957                 leaf = path->nodes[0];
 958
 959                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 960                 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
 961
 962                 while (cur_offset < item_size) {
 963                         extref = (struct btrfs_inode_extref *)base + cur_offset;
 964
 965                         victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
 966
 967                         if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
 968                                 goto next;
 969
 970                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
 971                         if (!victim_name)
 972                                 return -ENOMEM;
 973                         read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
 974                                            victim_name_len);
 975
 976                         search_key.objectid = inode_objectid;
 977                         search_key.type = BTRFS_INODE_EXTREF_KEY;
 978                         search_key.offset = btrfs_extref_hash(parent_objectid,
 979                                                               victim_name,
 980                                                               victim_name_len);
 981                         ret = 0;
 982                         if (!backref_in_log(log_root, &search_key,
 983                                             parent_objectid, victim_name,
 984                                             victim_name_len)) {
 985                                 ret = -ENOENT;
 986                                 victim_parent = read_one_inode(root,
 987                                                                parent_objectid);
 988                                 if (victim_parent) {
 989                                         btrfs_inc_nlink(inode);
 990                                         btrfs_release_path(path);
 991
 992                                         ret = btrfs_unlink_inode(trans, root,
 993                                                                  victim_parent,
 994                                                                  inode,
 995                                                                  victim_name,
 996                                                                  victim_name_len);
 997                                         if (!ret)
 998                                                 ret = btrfs_run_delayed_items(
 999                                                                   trans, root);
1000                                 }
1001                                 iput(victim_parent);
1002                                 kfree(victim_name);
1003                                 if (ret)
1004                                         return ret;
1005                                 *search_done = 1;
1006                                 goto again;
1007                         }
1008                         kfree(victim_name);
1009                         if (ret)
1010                                 return ret;
1011 next:
1012                         cur_offset += victim_name_len + sizeof(*extref);
1013                 }
1014                 *search_done = 1;
1015         }
1016         btrfs_release_path(path);
1017
1018         /* look for a conflicting sequence number */
1019         di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1020                                          ref_index, name, namelen, 0);
1021         if (di && !IS_ERR(di)) {
1022                 ret = drop_one_dir_item(trans, root, path, dir, di);
1023                 if (ret)
1024                         return ret;
1025         }
1026         btrfs_release_path(path);
1027
1028         /* look for a conflicing name */
1029         di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1030                                    name, namelen, 0);
1031         if (di && !IS_ERR(di)) {
1032                 ret = drop_one_dir_item(trans, root, path, dir, di);
1033                 if (ret)
1034                         return ret;
1035         }
1036         btrfs_release_path(path);
1037
1038         return 0;
1039 }
1040
1041 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1042                              u32 *namelen, char **name, u64 *index,
1043                              u64 *parent_objectid)
1044 {
1045         struct btrfs_inode_extref *extref;
1046
1047         extref = (struct btrfs_inode_extref *)ref_ptr;
1048
1049         *namelen = btrfs_inode_extref_name_len(eb, extref);
1050         *name = kmalloc(*namelen, GFP_NOFS);
1051         if (*name == NULL)
1052                 return -ENOMEM;
1053
1054         read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1055                            *namelen);
1056
1057         *index = btrfs_inode_extref_index(eb, extref);
1058         if (parent_objectid)
1059                 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1060
1061         return 0;
1062 }
1063
1064 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1065                           u32 *namelen, char **name, u64 *index)
1066 {
1067         struct btrfs_inode_ref *ref;
1068
1069         ref = (struct btrfs_inode_ref *)ref_ptr;
1070
1071         *namelen = btrfs_inode_ref_name_len(eb, ref);
1072         *name = kmalloc(*namelen, GFP_NOFS);
1073         if (*name == NULL)
1074                 return -ENOMEM;
1075
1076         read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1077
1078         *index = btrfs_inode_ref_index(eb, ref);
1079
1080         return 0;
1081 }
1082
1083 /*
1084  * replay one inode back reference item found in the log tree.
1085  * eb, slot and key refer to the buffer and key found in the log tree.
1086  * root is the destination we are replaying into, and path is for temp
1087  * use by this function.  (it should be released on return).
1088  */
1089 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1090                                   struct btrfs_root *root,
1091                                   struct btrfs_root *log,
1092                                   struct btrfs_path *path,
1093                                   struct extent_buffer *eb, int slot,
1094                                   struct btrfs_key *key)
1095 {
1096         struct inode *dir;
1097         struct inode *inode;
1098         unsigned long ref_ptr;
1099         unsigned long ref_end;
1100         char *name;
1101         int namelen;
1102         int ret;
1103         int search_done = 0;
1104         int log_ref_ver = 0;
1105         u64 parent_objectid;
1106         u64 inode_objectid;
1107         u64 ref_index = 0;
1108         int ref_struct_size;
1109
1110         ref_ptr = btrfs_item_ptr_offset(eb, slot);
1111         ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1112
1113         if (key->type == BTRFS_INODE_EXTREF_KEY) {
1114                 struct btrfs_inode_extref *r;
1115
1116                 ref_struct_size = sizeof(struct btrfs_inode_extref);
1117                 log_ref_ver = 1;
1118                 r = (struct btrfs_inode_extref *)ref_ptr;
1119                 parent_objectid = btrfs_inode_extref_parent(eb, r);
1120         } else {
1121                 ref_struct_size = sizeof(struct btrfs_inode_ref);
1122                 parent_objectid = key->offset;
1123         }
1124         inode_objectid = key->objectid;
1125
1126         /*
1127          * it is possible that we didn't log all the parent directories
1128          * for a given inode.  If we don't find the dir, just don't
1129          * copy the back ref in.  The link count fixup code will take
1130          * care of the rest
1131          */
1132         dir = read_one_inode(root, parent_objectid);
1133         if (!dir)
1134                 return -ENOENT;
1135
1136         inode = read_one_inode(root, inode_objectid);
1137         if (!inode) {
1138                 iput(dir);
1139                 return -EIO;
1140         }
1141
1142         while (ref_ptr < ref_end) {
1143                 if (log_ref_ver) {
1144                         ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1145                                                 &ref_index, &parent_objectid);
1146                         /*
1147                          * parent object can change from one array
1148                          * item to another.
1149                          */
1150                         if (!dir)
1151                                 dir = read_one_inode(root, parent_objectid);
1152                         if (!dir)
1153                                 return -ENOENT;
1154                 } else {
1155                         ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1156                                              &ref_index);
1157                 }
1158                 if (ret)
1159                         return ret;
1160
1161                 /* if we already have a perfect match, we're done */
1162                 if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1163                                   ref_index, name, namelen)) {
1164                         /*
1165                          * look for a conflicting back reference in the
1166                          * metadata. if we find one we have to unlink that name
1167                          * of the file before we add our new link.  Later on, we
1168                          * overwrite any existing back reference, and we don't
1169                          * want to create dangling pointers in the directory.
1170                          */
1171
1172                         if (!search_done) {
1173                                 ret = __add_inode_ref(trans, root, path, log,
1174                                                       dir, inode, eb,
1175                                                       inode_objectid,
1176                                                       parent_objectid,
1177                                                       ref_index, name, namelen,
1178                                                       &search_done);
1179                                 if (ret == 1) {
1180                                         ret = 0;
1181                                         goto out;
1182                                 }
1183                                 if (ret)
1184                                         goto out;
1185                         }
1186
1187                         /* insert our name */
1188                         ret = btrfs_add_link(trans, dir, inode, name, namelen,
1189                                              0, ref_index);
1190                         if (ret)
1191                                 goto out;
1192
1193                         btrfs_update_inode(trans, root, inode);
1194                 }
1195
1196                 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1197                 kfree(name);
1198                 if (log_ref_ver) {
1199                         iput(dir);
1200                         dir = NULL;
1201                 }
1202         }
1203
1204         /* finally write the back reference in the inode */
1205         ret = overwrite_item(trans, root, path, eb, slot, key);
1206 out:
1207         btrfs_release_path(path);
1208         iput(dir);
1209         iput(inode);
1210         return ret;
1211 }
1212
1213 static int insert_orphan_item(struct btrfs_trans_handle *trans,
1214                               struct btrfs_root *root, u64 offset)
1215 {
1216         int ret;
1217         ret = btrfs_find_orphan_item(root, offset);
1218         if (ret > 0)
1219                 ret = btrfs_insert_orphan_item(trans, root, offset);
1220         return ret;
1221 }
1222
1223 static int count_inode_extrefs(struct btrfs_root *root,
1224                                struct inode *inode, struct btrfs_path *path)
1225 {
1226         int ret = 0;
1227         int name_len;
1228         unsigned int nlink = 0;
1229         u32 item_size;
1230         u32 cur_offset = 0;
1231         u64 inode_objectid = btrfs_ino(inode);
1232         u64 offset = 0;
1233         unsigned long ptr;
1234         struct btrfs_inode_extref *extref;
1235         struct extent_buffer *leaf;
1236
1237         while (1) {
1238                 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1239                                             &extref, &offset);
1240                 if (ret)
1241                         break;
1242
1243                 leaf = path->nodes[0];
1244                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1245                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1246
1247                 while (cur_offset < item_size) {
1248                         extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1249                         name_len = btrfs_inode_extref_name_len(leaf, extref);
1250
1251                         nlink++;
1252
1253                         cur_offset += name_len + sizeof(*extref);
1254                 }
1255
1256                 offset++;
1257                 btrfs_release_path(path);
1258         }
1259         btrfs_release_path(path);
1260
1261         if (ret < 0)
1262                 return ret;
1263         return nlink;
1264 }
1265
1266 static int count_inode_refs(struct btrfs_root *root,
1267                                struct inode *inode, struct btrfs_path *path)
1268 {
1269         int ret;
1270         struct btrfs_key key;
1271         unsigned int nlink = 0;
1272         unsigned long ptr;
1273         unsigned long ptr_end;
1274         int name_len;
1275         u64 ino = btrfs_ino(inode);
1276
1277         key.objectid = ino;
1278         key.type = BTRFS_INODE_REF_KEY;
1279         key.offset = (u64)-1;
1280
1281         while (1) {
1282                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1283                 if (ret < 0)
1284                         break;
1285                 if (ret > 0) {
1286                         if (path->slots[0] == 0)
1287                                 break;
1288                         path->slots[0]--;
1289                 }
1290                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1291                                       path->slots[0]);
1292                 if (key.objectid != ino ||
1293                     key.type != BTRFS_INODE_REF_KEY)
1294                         break;
1295                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1296                 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1297                                                    path->slots[0]);
1298                 while (ptr < ptr_end) {
1299                         struct btrfs_inode_ref *ref;
1300
1301                         ref = (struct btrfs_inode_ref *)ptr;
1302                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
1303                                                             ref);
1304                         ptr = (unsigned long)(ref + 1) + name_len;
1305                         nlink++;
1306                 }
1307
1308                 if (key.offset == 0)
1309                         break;
1310                 key.offset--;
1311                 btrfs_release_path(path);
1312         }
1313         btrfs_release_path(path);
1314
1315         return nlink;
1316 }
1317
1318 /*
1319  * There are a few corners where the link count of the file can't
1320  * be properly maintained during replay.  So, instead of adding
1321  * lots of complexity to the log code, we just scan the backrefs
1322  * for any file that has been through replay.
1323  *
1324  * The scan will update the link count on the inode to reflect the
1325  * number of back refs found.  If it goes down to zero, the iput
1326  * will free the inode.
1327  */
1328 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1329                                            struct btrfs_root *root,
1330                                            struct inode *inode)
1331 {
1332         struct btrfs_path *path;
1333         int ret;
1334         u64 nlink = 0;
1335         u64 ino = btrfs_ino(inode);
1336
1337         path = btrfs_alloc_path();
1338         if (!path)
1339                 return -ENOMEM;
1340
1341         ret = count_inode_refs(root, inode, path);
1342         if (ret < 0)
1343                 goto out;
1344
1345         nlink = ret;
1346
1347         ret = count_inode_extrefs(root, inode, path);
1348         if (ret == -ENOENT)
1349                 ret = 0;
1350
1351         if (ret < 0)
1352                 goto out;
1353
1354         nlink += ret;
1355
1356         ret = 0;
1357
1358         if (nlink != inode->i_nlink) {
1359                 set_nlink(inode, nlink);
1360                 btrfs_update_inode(trans, root, inode);
1361         }
1362         BTRFS_I(inode)->index_cnt = (u64)-1;
1363
1364         if (inode->i_nlink == 0) {
1365                 if (S_ISDIR(inode->i_mode)) {
1366                         ret = replay_dir_deletes(trans, root, NULL, path,
1367                                                  ino, 1);
1368                         if (ret)
1369                                 goto out;
1370                 }
1371                 ret = insert_orphan_item(trans, root, ino);
1372         }
1373
1374 out:
1375         btrfs_free_path(path);
1376         return ret;
1377 }
1378
1379 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1380                                             struct btrfs_root *root,
1381                                             struct btrfs_path *path)
1382 {
1383         int ret;
1384         struct btrfs_key key;
1385         struct inode *inode;
1386
1387         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1388         key.type = BTRFS_ORPHAN_ITEM_KEY;
1389         key.offset = (u64)-1;
1390         while (1) {
1391                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1392                 if (ret < 0)
1393                         break;
1394
1395                 if (ret == 1) {
1396                         if (path->slots[0] == 0)
1397                                 break;
1398                         path->slots[0]--;
1399                 }
1400
1401                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1402                 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1403                     key.type != BTRFS_ORPHAN_ITEM_KEY)
1404                         break;
1405
1406                 ret = btrfs_del_item(trans, root, path);
1407                 if (ret)
1408                         goto out;
1409
1410                 btrfs_release_path(path);
1411                 inode = read_one_inode(root, key.offset);
1412                 if (!inode)
1413                         return -EIO;
1414
1415                 ret = fixup_inode_link_count(trans, root, inode);
1416                 iput(inode);
1417                 if (ret)
1418                         goto out;
1419
1420                 /*
1421                  * fixup on a directory may create new entries,
1422                  * make sure we always look for the highset possible
1423                  * offset
1424                  */
1425                 key.offset = (u64)-1;
1426         }
1427         ret = 0;
1428 out:
1429         btrfs_release_path(path);
1430         return ret;
1431 }
1432
1433
1434 /*
1435  * record a given inode in the fixup dir so we can check its link
1436  * count when replay is done.  The link count is incremented here
1437  * so the inode won't go away until we check it
1438  */
1439 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1440                                       struct btrfs_root *root,
1441                                       struct btrfs_path *path,
1442                                       u64 objectid)
1443 {
1444         struct btrfs_key key;
1445         int ret = 0;
1446         struct inode *inode;
1447
1448         inode = read_one_inode(root, objectid);
1449         if (!inode)
1450                 return -EIO;
1451
1452         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1453         btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1454         key.offset = objectid;
1455
1456         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1457
1458         btrfs_release_path(path);
1459         if (ret == 0) {
1460                 if (!inode->i_nlink)
1461                         set_nlink(inode, 1);
1462                 else
1463                         btrfs_inc_nlink(inode);
1464                 ret = btrfs_update_inode(trans, root, inode);
1465         } else if (ret == -EEXIST) {
1466                 ret = 0;
1467         } else {
1468                 BUG(); /* Logic Error */
1469         }
1470         iput(inode);
1471
1472         return ret;
1473 }
1474
1475 /*
1476  * when replaying the log for a directory, we only insert names
1477  * for inodes that actually exist.  This means an fsync on a directory
1478  * does not implicitly fsync all the new files in it
1479  */
1480 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1481                                     struct btrfs_root *root,
1482                                     struct btrfs_path *path,
1483                                     u64 dirid, u64 index,
1484                                     char *name, int name_len, u8 type,
1485                                     struct btrfs_key *location)
1486 {
1487         struct inode *inode;
1488         struct inode *dir;
1489         int ret;
1490
1491         inode = read_one_inode(root, location->objectid);
1492         if (!inode)
1493                 return -ENOENT;
1494
1495         dir = read_one_inode(root, dirid);
1496         if (!dir) {
1497                 iput(inode);
1498                 return -EIO;
1499         }
1500         ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1501
1502         /* FIXME, put inode into FIXUP list */
1503
1504         iput(inode);
1505         iput(dir);
1506         return ret;
1507 }
1508
1509 /*
1510  * take a single entry in a log directory item and replay it into
1511  * the subvolume.
1512  *
1513  * if a conflicting item exists in the subdirectory already,
1514  * the inode it points to is unlinked and put into the link count
1515  * fix up tree.
1516  *
1517  * If a name from the log points to a file or directory that does
1518  * not exist in the FS, it is skipped.  fsyncs on directories
1519  * do not force down inodes inside that directory, just changes to the
1520  * names or unlinks in a directory.
1521  */
1522 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1523                                     struct btrfs_root *root,
1524                                     struct btrfs_path *path,
1525                                     struct extent_buffer *eb,
1526                                     struct btrfs_dir_item *di,
1527                                     struct btrfs_key *key)
1528 {
1529         char *name;
1530         int name_len;
1531         struct btrfs_dir_item *dst_di;
1532         struct btrfs_key found_key;
1533         struct btrfs_key log_key;
1534         struct inode *dir;
1535         u8 log_type;
1536         int exists;
1537         int ret = 0;
1538
1539         dir = read_one_inode(root, key->objectid);
1540         if (!dir)
1541                 return -EIO;
1542
1543         name_len = btrfs_dir_name_len(eb, di);
1544         name = kmalloc(name_len, GFP_NOFS);
1545         if (!name) {
1546                 ret = -ENOMEM;
1547                 goto out;
1548         }
1549
1550         log_type = btrfs_dir_type(eb, di);
1551         read_extent_buffer(eb, name, (unsigned long)(di + 1),
1552                    name_len);
1553
1554         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1555         exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1556         if (exists == 0)
1557                 exists = 1;
1558         else
1559                 exists = 0;
1560         btrfs_release_path(path);
1561
1562         if (key->type == BTRFS_DIR_ITEM_KEY) {
1563                 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1564                                        name, name_len, 1);
1565         } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1566                 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1567                                                      key->objectid,
1568                                                      key->offset, name,
1569                                                      name_len, 1);
1570         } else {
1571                 /* Corruption */
1572                 ret = -EINVAL;
1573                 goto out;
1574         }
1575         if (IS_ERR_OR_NULL(dst_di)) {
1576                 /* we need a sequence number to insert, so we only
1577                  * do inserts for the BTRFS_DIR_INDEX_KEY types
1578                  */
1579                 if (key->type != BTRFS_DIR_INDEX_KEY)
1580                         goto out;
1581                 goto insert;
1582         }
1583
1584         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1585         /* the existing item matches the logged item */
1586         if (found_key.objectid == log_key.objectid &&
1587             found_key.type == log_key.type &&
1588             found_key.offset == log_key.offset &&
1589             btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1590                 goto out;
1591         }
1592
1593         /*
1594          * don't drop the conflicting directory entry if the inode
1595          * for the new entry doesn't exist
1596          */
1597         if (!exists)
1598                 goto out;
1599
1600         ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1601         if (ret)
1602                 goto out;
1603
1604         if (key->type == BTRFS_DIR_INDEX_KEY)
1605                 goto insert;
1606 out:
1607         btrfs_release_path(path);
1608         kfree(name);
1609         iput(dir);
1610         return ret;
1611
1612 insert:
1613         btrfs_release_path(path);
1614         ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1615                               name, name_len, log_type, &log_key);
1616         if (ret && ret != -ENOENT)
1617                 goto out;
1618         ret = 0;
1619         goto out;
1620 }
1621
1622 /*
1623  * find all the names in a directory item and reconcile them into
1624  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1625  * one name in a directory item, but the same code gets used for
1626  * both directory index types
1627  */
1628 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1629                                         struct btrfs_root *root,
1630                                         struct btrfs_path *path,
1631                                         struct extent_buffer *eb, int slot,
1632                                         struct btrfs_key *key)
1633 {
1634         int ret;
1635         u32 item_size = btrfs_item_size_nr(eb, slot);
1636         struct btrfs_dir_item *di;
1637         int name_len;
1638         unsigned long ptr;
1639         unsigned long ptr_end;
1640
1641         ptr = btrfs_item_ptr_offset(eb, slot);
1642         ptr_end = ptr + item_size;
1643         while (ptr < ptr_end) {
1644                 di = (struct btrfs_dir_item *)ptr;
1645                 if (verify_dir_item(root, eb, di))
1646                         return -EIO;
1647                 name_len = btrfs_dir_name_len(eb, di);
1648                 ret = replay_one_name(trans, root, path, eb, di, key);
1649                 if (ret)
1650                         return ret;
1651                 ptr = (unsigned long)(di + 1);
1652                 ptr += name_len;
1653         }
1654         return 0;
1655 }
1656
1657 /*
1658  * directory replay has two parts.  There are the standard directory
1659  * items in the log copied from the subvolume, and range items
1660  * created in the log while the subvolume was logged.
1661  *
1662  * The range items tell us which parts of the key space the log
1663  * is authoritative for.  During replay, if a key in the subvolume
1664  * directory is in a logged range item, but not actually in the log
1665  * that means it was deleted from the directory before the fsync
1666  * and should be removed.
1667  */
1668 static noinline int find_dir_range(struct btrfs_root *root,
1669                                    struct btrfs_path *path,
1670                                    u64 dirid, int key_type,
1671                                    u64 *start_ret, u64 *end_ret)
1672 {
1673         struct btrfs_key key;
1674         u64 found_end;
1675         struct btrfs_dir_log_item *item;
1676         int ret;
1677         int nritems;
1678
1679         if (*start_ret == (u64)-1)
1680                 return 1;
1681
1682         key.objectid = dirid;
1683         key.type = key_type;
1684         key.offset = *start_ret;
1685
1686         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1687         if (ret < 0)
1688                 goto out;
1689         if (ret > 0) {
1690                 if (path->slots[0] == 0)
1691                         goto out;
1692                 path->slots[0]--;
1693         }
1694         if (ret != 0)
1695                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1696
1697         if (key.type != key_type || key.objectid != dirid) {
1698                 ret = 1;
1699                 goto next;
1700         }
1701         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1702                               struct btrfs_dir_log_item);
1703         found_end = btrfs_dir_log_end(path->nodes[0], item);
1704
1705         if (*start_ret >= key.offset && *start_ret <= found_end) {
1706                 ret = 0;
1707                 *start_ret = key.offset;
1708                 *end_ret = found_end;
1709                 goto out;
1710         }
1711         ret = 1;
1712 next:
1713         /* check the next slot in the tree to see if it is a valid item */
1714         nritems = btrfs_header_nritems(path->nodes[0]);
1715         if (path->slots[0] >= nritems) {
1716                 ret = btrfs_next_leaf(root, path);
1717                 if (ret)
1718                         goto out;
1719         } else {
1720                 path->slots[0]++;
1721         }
1722
1723         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1724
1725         if (key.type != key_type || key.objectid != dirid) {
1726                 ret = 1;
1727                 goto out;
1728         }
1729         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1730                               struct btrfs_dir_log_item);
1731         found_end = btrfs_dir_log_end(path->nodes[0], item);
1732         *start_ret = key.offset;
1733         *end_ret = found_end;
1734         ret = 0;
1735 out:
1736         btrfs_release_path(path);
1737         return ret;
1738 }
1739
1740 /*
1741  * this looks for a given directory item in the log.  If the directory
1742  * item is not in the log, the item is removed and the inode it points
1743  * to is unlinked
1744  */
1745 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1746                                       struct btrfs_root *root,
1747                                       struct btrfs_root *log,
1748                                       struct btrfs_path *path,
1749                                       struct btrfs_path *log_path,
1750                                       struct inode *dir,
1751                                       struct btrfs_key *dir_key)
1752 {
1753         int ret;
1754         struct extent_buffer *eb;
1755         int slot;
1756         u32 item_size;
1757         struct btrfs_dir_item *di;
1758         struct btrfs_dir_item *log_di;
1759         int name_len;
1760         unsigned long ptr;
1761         unsigned long ptr_end;
1762         char *name;
1763         struct inode *inode;
1764         struct btrfs_key location;
1765
1766 again:
1767         eb = path->nodes[0];
1768         slot = path->slots[0];
1769         item_size = btrfs_item_size_nr(eb, slot);
1770         ptr = btrfs_item_ptr_offset(eb, slot);
1771         ptr_end = ptr + item_size;
1772         while (ptr < ptr_end) {
1773                 di = (struct btrfs_dir_item *)ptr;
1774                 if (verify_dir_item(root, eb, di)) {
1775                         ret = -EIO;
1776                         goto out;
1777                 }
1778
1779                 name_len = btrfs_dir_name_len(eb, di);
1780                 name = kmalloc(name_len, GFP_NOFS);
1781                 if (!name) {
1782                         ret = -ENOMEM;
1783                         goto out;
1784                 }
1785                 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1786                                   name_len);
1787                 log_di = NULL;
1788                 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1789                         log_di = btrfs_lookup_dir_item(trans, log, log_path,
1790                                                        dir_key->objectid,
1791                                                        name, name_len, 0);
1792                 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1793                         log_di = btrfs_lookup_dir_index_item(trans, log,
1794                                                      log_path,
1795                                                      dir_key->objectid,
1796                                                      dir_key->offset,
1797                                                      name, name_len, 0);
1798                 }
1799                 if (IS_ERR_OR_NULL(log_di)) {
1800                         btrfs_dir_item_key_to_cpu(eb, di, &location);
1801                         btrfs_release_path(path);
1802                         btrfs_release_path(log_path);
1803                         inode = read_one_inode(root, location.objectid);
1804                         if (!inode) {
1805                                 kfree(name);
1806                                 return -EIO;
1807                         }
1808
1809                         ret = link_to_fixup_dir(trans, root,
1810                                                 path, location.objectid);
1811                         if (ret) {
1812                                 kfree(name);
1813                                 iput(inode);
1814                                 goto out;
1815                         }
1816
1817                         btrfs_inc_nlink(inode);
1818                         ret = btrfs_unlink_inode(trans, root, dir, inode,
1819                                                  name, name_len);
1820                         if (!ret)
1821                                 ret = btrfs_run_delayed_items(trans, root);
1822                         kfree(name);
1823                         iput(inode);
1824                         if (ret)
1825                                 goto out;
1826
1827                         /* there might still be more names under this key
1828                          * check and repeat if required
1829                          */
1830                         ret = btrfs_search_slot(NULL, root, dir_key, path,
1831                                                 0, 0);
1832                         if (ret == 0)
1833                                 goto again;
1834                         ret = 0;
1835                         goto out;
1836                 }
1837                 btrfs_release_path(log_path);
1838                 kfree(name);
1839
1840                 ptr = (unsigned long)(di + 1);
1841                 ptr += name_len;
1842         }
1843         ret = 0;
1844 out:
1845         btrfs_release_path(path);
1846         btrfs_release_path(log_path);
1847         return ret;
1848 }
1849
1850 /*
1851  * deletion replay happens before we copy any new directory items
1852  * out of the log or out of backreferences from inodes.  It
1853  * scans the log to find ranges of keys that log is authoritative for,
1854  * and then scans the directory to find items in those ranges that are
1855  * not present in the log.
1856  *
1857  * Anything we don't find in the log is unlinked and removed from the
1858  * directory.
1859  */
1860 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1861                                        struct btrfs_root *root,
1862                                        struct btrfs_root *log,
1863                                        struct btrfs_path *path,
1864                                        u64 dirid, int del_all)
1865 {
1866         u64 range_start;
1867         u64 range_end;
1868         int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1869         int ret = 0;
1870         struct btrfs_key dir_key;
1871         struct btrfs_key found_key;
1872         struct btrfs_path *log_path;
1873         struct inode *dir;
1874
1875         dir_key.objectid = dirid;
1876         dir_key.type = BTRFS_DIR_ITEM_KEY;
1877         log_path = btrfs_alloc_path();
1878         if (!log_path)
1879                 return -ENOMEM;
1880
1881         dir = read_one_inode(root, dirid);
1882         /* it isn't an error if the inode isn't there, that can happen
1883          * because we replay the deletes before we copy in the inode item
1884          * from the log
1885          */
1886         if (!dir) {
1887                 btrfs_free_path(log_path);
1888                 return 0;
1889         }
1890 again:
1891         range_start = 0;
1892         range_end = 0;
1893         while (1) {
1894                 if (del_all)
1895                         range_end = (u64)-1;
1896                 else {
1897                         ret = find_dir_range(log, path, dirid, key_type,
1898                                              &range_start, &range_end);
1899                         if (ret != 0)
1900                                 break;
1901                 }
1902
1903                 dir_key.offset = range_start;
1904                 while (1) {
1905                         int nritems;
1906                         ret = btrfs_search_slot(NULL, root, &dir_key, path,
1907                                                 0, 0);
1908                         if (ret < 0)
1909                                 goto out;
1910
1911                         nritems = btrfs_header_nritems(path->nodes[0]);
1912                         if (path->slots[0] >= nritems) {
1913                                 ret = btrfs_next_leaf(root, path);
1914                                 if (ret)
1915                                         break;
1916                         }
1917                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1918                                               path->slots[0]);
1919                         if (found_key.objectid != dirid ||
1920                             found_key.type != dir_key.type)
1921                                 goto next_type;
1922
1923                         if (found_key.offset > range_end)
1924                                 break;
1925
1926                         ret = check_item_in_log(trans, root, log, path,
1927                                                 log_path, dir,
1928                                                 &found_key);
1929                         if (ret)
1930                                 goto out;
1931                         if (found_key.offset == (u64)-1)
1932                                 break;
1933                         dir_key.offset = found_key.offset + 1;
1934                 }
1935                 btrfs_release_path(path);
1936                 if (range_end == (u64)-1)
1937                         break;
1938                 range_start = range_end + 1;
1939         }
1940
1941 next_type:
1942         ret = 0;
1943         if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1944                 key_type = BTRFS_DIR_LOG_INDEX_KEY;
1945                 dir_key.type = BTRFS_DIR_INDEX_KEY;
1946                 btrfs_release_path(path);
1947                 goto again;
1948         }
1949 out:
1950         btrfs_release_path(path);
1951         btrfs_free_path(log_path);
1952         iput(dir);
1953         return ret;
1954 }
1955
1956 /*
1957  * the process_func used to replay items from the log tree.  This
1958  * gets called in two different stages.  The first stage just looks
1959  * for inodes and makes sure they are all copied into the subvolume.
1960  *
1961  * The second stage copies all the other item types from the log into
1962  * the subvolume.  The two stage approach is slower, but gets rid of
1963  * lots of complexity around inodes referencing other inodes that exist
1964  * only in the log (references come from either directory items or inode
1965  * back refs).
1966  */
1967 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1968                              struct walk_control *wc, u64 gen)
1969 {
1970         int nritems;
1971         struct btrfs_path *path;
1972         struct btrfs_root *root = wc->replay_dest;
1973         struct btrfs_key key;
1974         int level;
1975         int i;
1976         int ret;
1977
1978         ret = btrfs_read_buffer(eb, gen);
1979         if (ret)
1980                 return ret;
1981
1982         level = btrfs_header_level(eb);
1983
1984         if (level != 0)
1985                 return 0;
1986
1987         path = btrfs_alloc_path();
1988         if (!path)
1989                 return -ENOMEM;
1990
1991         nritems = btrfs_header_nritems(eb);
1992         for (i = 0; i < nritems; i++) {
1993                 btrfs_item_key_to_cpu(eb, &key, i);
1994
1995                 /* inode keys are done during the first stage */
1996                 if (key.type == BTRFS_INODE_ITEM_KEY &&
1997                     wc->stage == LOG_WALK_REPLAY_INODES) {
1998                         struct btrfs_inode_item *inode_item;
1999                         u32 mode;
2000
2001                         inode_item = btrfs_item_ptr(eb, i,
2002                                             struct btrfs_inode_item);
2003                         mode = btrfs_inode_mode(eb, inode_item);
2004                         if (S_ISDIR(mode)) {
2005                                 ret = replay_dir_deletes(wc->trans,
2006                                          root, log, path, key.objectid, 0);
2007                                 if (ret)
2008                                         break;
2009                         }
2010                         ret = overwrite_item(wc->trans, root, path,
2011                                              eb, i, &key);
2012                         if (ret)
2013                                 break;
2014
2015                         /* for regular files, make sure corresponding
2016                          * orhpan item exist. extents past the new EOF
2017                          * will be truncated later by orphan cleanup.
2018                          */
2019                         if (S_ISREG(mode)) {
2020                                 ret = insert_orphan_item(wc->trans, root,
2021                                                          key.objectid);
2022                                 if (ret)
2023                                         break;
2024                         }
2025
2026                         ret = link_to_fixup_dir(wc->trans, root,
2027                                                 path, key.objectid);
2028                         if (ret)
2029                                 break;
2030                 }
2031
2032                 if (key.type == BTRFS_DIR_INDEX_KEY &&
2033                     wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2034                         ret = replay_one_dir_item(wc->trans, root, path,
2035                                                   eb, i, &key);
2036                         if (ret)
2037                                 break;
2038                 }
2039
2040                 if (wc->stage < LOG_WALK_REPLAY_ALL)
2041                         continue;
2042
2043                 /* these keys are simply copied */
2044                 if (key.type == BTRFS_XATTR_ITEM_KEY) {
2045                         ret = overwrite_item(wc->trans, root, path,
2046                                              eb, i, &key);
2047                         if (ret)
2048                                 break;
2049                 } else if (key.type == BTRFS_INODE_REF_KEY ||
2050                            key.type == BTRFS_INODE_EXTREF_KEY) {
2051                         ret = add_inode_ref(wc->trans, root, log, path,
2052                                             eb, i, &key);
2053                         if (ret && ret != -ENOENT)
2054                                 break;
2055                         ret = 0;
2056                 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2057                         ret = replay_one_extent(wc->trans, root, path,
2058                                                 eb, i, &key);
2059                         if (ret)
2060                                 break;
2061                 } else if (key.type == BTRFS_DIR_ITEM_KEY) {
2062                         ret = replay_one_dir_item(wc->trans, root, path,
2063                                                   eb, i, &key);
2064                         if (ret)
2065                                 break;
2066                 }
2067         }
2068         btrfs_free_path(path);
2069         return ret;
2070 }
2071
2072 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2073                                    struct btrfs_root *root,
2074                                    struct btrfs_path *path, int *level,
2075                                    struct walk_control *wc)
2076 {
2077         u64 root_owner;
2078         u64 bytenr;
2079         u64 ptr_gen;
2080         struct extent_buffer *next;
2081         struct extent_buffer *cur;
2082         struct extent_buffer *parent;
2083         u32 blocksize;
2084         int ret = 0;
2085
2086         WARN_ON(*level < 0);
2087         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2088
2089         while (*level > 0) {
2090                 WARN_ON(*level < 0);
2091                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2092                 cur = path->nodes[*level];
2093
2094                 if (btrfs_header_level(cur) != *level)
2095                         WARN_ON(1);
2096
2097                 if (path->slots[*level] >=
2098                     btrfs_header_nritems(cur))
2099                         break;
2100
2101                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2102                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2103                 blocksize = btrfs_level_size(root, *level - 1);
2104
2105                 parent = path->nodes[*level];
2106                 root_owner = btrfs_header_owner(parent);
2107
2108                 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
2109                 if (!next)
2110                         return -ENOMEM;
2111
2112                 if (*level == 1) {
2113                         ret = wc->process_func(root, next, wc, ptr_gen);
2114                         if (ret) {
2115                                 free_extent_buffer(next);
2116                                 return ret;
2117                         }
2118
2119                         path->slots[*level]++;
2120                         if (wc->free) {
2121                                 ret = btrfs_read_buffer(next, ptr_gen);
2122                                 if (ret) {
2123                                         free_extent_buffer(next);
2124                                         return ret;
2125                                 }
2126
2127                                 btrfs_tree_lock(next);
2128                                 btrfs_set_lock_blocking(next);
2129                                 clean_tree_block(trans, root, next);
2130                                 btrfs_wait_tree_block_writeback(next);
2131                                 btrfs_tree_unlock(next);
2132
2133                                 WARN_ON(root_owner !=
2134                                         BTRFS_TREE_LOG_OBJECTID);
2135                                 ret = btrfs_free_and_pin_reserved_extent(root,
2136                                                          bytenr, blocksize);
2137                                 if (ret) {
2138                                         free_extent_buffer(next);
2139                                         return ret;
2140                                 }
2141                         }
2142                         free_extent_buffer(next);
2143                         continue;
2144                 }
2145                 ret = btrfs_read_buffer(next, ptr_gen);
2146                 if (ret) {
2147                         free_extent_buffer(next);
2148                         return ret;
2149                 }
2150
2151                 WARN_ON(*level <= 0);
2152                 if (path->nodes[*level-1])
2153                         free_extent_buffer(path->nodes[*level-1]);
2154                 path->nodes[*level-1] = next;
2155                 *level = btrfs_header_level(next);
2156                 path->slots[*level] = 0;
2157                 cond_resched();
2158         }
2159         WARN_ON(*level < 0);
2160         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2161
2162         path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2163
2164         cond_resched();
2165         return 0;
2166 }
2167
2168 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2169                                  struct btrfs_root *root,
2170                                  struct btrfs_path *path, int *level,
2171                                  struct walk_control *wc)
2172 {
2173         u64 root_owner;
2174         int i;
2175         int slot;
2176         int ret;
2177
2178         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2179                 slot = path->slots[i];
2180                 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2181                         path->slots[i]++;
2182                         *level = i;
2183                         WARN_ON(*level == 0);
2184                         return 0;
2185                 } else {
2186                         struct extent_buffer *parent;
2187                         if (path->nodes[*level] == root->node)
2188                                 parent = path->nodes[*level];
2189                         else
2190                                 parent = path->nodes[*level + 1];
2191
2192                         root_owner = btrfs_header_owner(parent);
2193                         ret = wc->process_func(root, path->nodes[*level], wc,
2194                                  btrfs_header_generation(path->nodes[*level]));
2195                         if (ret)
2196                                 return ret;
2197
2198                         if (wc->free) {
2199                                 struct extent_buffer *next;
2200
2201                                 next = path->nodes[*level];
2202
2203                                 btrfs_tree_lock(next);
2204                                 btrfs_set_lock_blocking(next);
2205                                 clean_tree_block(trans, root, next);
2206                                 btrfs_wait_tree_block_writeback(next);
2207                                 btrfs_tree_unlock(next);
2208
2209                                 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2210                                 ret = btrfs_free_and_pin_reserved_extent(root,
2211                                                 path->nodes[*level]->start,
2212                                                 path->nodes[*level]->len);
2213                                 if (ret)
2214                                         return ret;
2215                         }
2216                         free_extent_buffer(path->nodes[*level]);
2217                         path->nodes[*level] = NULL;
2218                         *level = i + 1;
2219                 }
2220         }
2221         return 1;
2222 }
2223
2224 /*
2225  * drop the reference count on the tree rooted at 'snap'.  This traverses
2226  * the tree freeing any blocks that have a ref count of zero after being
2227  * decremented.
2228  */
2229 static int walk_log_tree(struct btrfs_trans_handle *trans,
2230                          struct btrfs_root *log, struct walk_control *wc)
2231 {
2232         int ret = 0;
2233         int wret;
2234         int level;
2235         struct btrfs_path *path;
2236         int orig_level;
2237
2238         path = btrfs_alloc_path();
2239         if (!path)
2240                 return -ENOMEM;
2241
2242         level = btrfs_header_level(log->node);
2243         orig_level = level;
2244         path->nodes[level] = log->node;
2245         extent_buffer_get(log->node);
2246         path->slots[level] = 0;
2247
2248         while (1) {
2249                 wret = walk_down_log_tree(trans, log, path, &level, wc);
2250                 if (wret > 0)
2251                         break;
2252                 if (wret < 0) {
2253                         ret = wret;
2254                         goto out;
2255                 }
2256
2257                 wret = walk_up_log_tree(trans, log, path, &level, wc);
2258                 if (wret > 0)
2259                         break;
2260                 if (wret < 0) {
2261                         ret = wret;
2262                         goto out;
2263                 }
2264         }
2265
2266         /* was the root node processed? if not, catch it here */
2267         if (path->nodes[orig_level]) {
2268                 ret = wc->process_func(log, path->nodes[orig_level], wc,
2269                          btrfs_header_generation(path->nodes[orig_level]));
2270                 if (ret)
2271                         goto out;
2272                 if (wc->free) {
2273                         struct extent_buffer *next;
2274
2275                         next = path->nodes[orig_level];
2276
2277                         btrfs_tree_lock(next);
2278                         btrfs_set_lock_blocking(next);
2279                         clean_tree_block(trans, log, next);
2280                         btrfs_wait_tree_block_writeback(next);
2281                         btrfs_tree_unlock(next);
2282
2283                         WARN_ON(log->root_key.objectid !=
2284                                 BTRFS_TREE_LOG_OBJECTID);
2285                         ret = btrfs_free_and_pin_reserved_extent(log, next->start,
2286                                                          next->len);
2287                         if (ret)
2288                                 goto out;
2289                 }
2290         }
2291
2292 out:
2293         btrfs_free_path(path);
2294         return ret;
2295 }
2296
2297 /*
2298  * helper function to update the item for a given subvolumes log root
2299  * in the tree of log roots
2300  */
2301 static int update_log_root(struct btrfs_trans_handle *trans,
2302                            struct btrfs_root *log)
2303 {
2304         int ret;
2305
2306         if (log->log_transid == 1) {
2307                 /* insert root item on the first sync */
2308                 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
2309                                 &log->root_key, &log->root_item);
2310         } else {
2311                 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2312                                 &log->root_key, &log->root_item);
2313         }
2314         return ret;
2315 }
2316
2317 static int wait_log_commit(struct btrfs_trans_handle *trans,
2318                            struct btrfs_root *root, unsigned long transid)
2319 {
2320         DEFINE_WAIT(wait);
2321         int index = transid % 2;
2322
2323         /*
2324          * we only allow two pending log transactions at a time,
2325          * so we know that if ours is more than 2 older than the
2326          * current transaction, we're done
2327          */
2328         do {
2329                 prepare_to_wait(&root->log_commit_wait[index],
2330                                 &wait, TASK_UNINTERRUPTIBLE);
2331                 mutex_unlock(&root->log_mutex);
2332
2333                 if (root->fs_info->last_trans_log_full_commit !=
2334                     trans->transid && root->log_transid < transid + 2 &&
2335                     atomic_read(&root->log_commit[index]))
2336                         schedule();
2337
2338                 finish_wait(&root->log_commit_wait[index], &wait);
2339                 mutex_lock(&root->log_mutex);
2340         } while (root->fs_info->last_trans_log_full_commit !=
2341                  trans->transid && root->log_transid < transid + 2 &&
2342                  atomic_read(&root->log_commit[index]));
2343         return 0;
2344 }
2345
2346 static void wait_for_writer(struct btrfs_trans_handle *trans,
2347                             struct btrfs_root *root)
2348 {
2349         DEFINE_WAIT(wait);
2350         while (root->fs_info->last_trans_log_full_commit !=
2351                trans->transid && atomic_read(&root->log_writers)) {
2352                 prepare_to_wait(&root->log_writer_wait,
2353                                 &wait, TASK_UNINTERRUPTIBLE);
2354                 mutex_unlock(&root->log_mutex);
2355                 if (root->fs_info->last_trans_log_full_commit !=
2356                     trans->transid && atomic_read(&root->log_writers))
2357                         schedule();
2358                 mutex_lock(&root->log_mutex);
2359                 finish_wait(&root->log_writer_wait, &wait);
2360         }
2361 }
2362
2363 /*
2364  * btrfs_sync_log does sends a given tree log down to the disk and
2365  * updates the super blocks to record it.  When this call is done,
2366  * you know that any inodes previously logged are safely on disk only
2367  * if it returns 0.
2368  *
2369  * Any other return value means you need to call btrfs_commit_transaction.
2370  * Some of the edge cases for fsyncing directories that have had unlinks
2371  * or renames done in the past mean that sometimes the only safe
2372  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2373  * that has happened.
2374  */
2375 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2376                    struct btrfs_root *root)
2377 {
2378         int index1;
2379         int index2;
2380         int mark;
2381         int ret;
2382         struct btrfs_root *log = root->log_root;
2383         struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2384         unsigned long log_transid = 0;
2385         struct blk_plug plug;
2386
2387         mutex_lock(&root->log_mutex);
2388         log_transid = root->log_transid;
2389         index1 = root->log_transid % 2;
2390         if (atomic_read(&root->log_commit[index1])) {
2391                 wait_log_commit(trans, root, root->log_transid);
2392                 mutex_unlock(&root->log_mutex);
2393                 return 0;
2394         }
2395         atomic_set(&root->log_commit[index1], 1);
2396
2397         /* wait for previous tree log sync to complete */
2398         if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2399                 wait_log_commit(trans, root, root->log_transid - 1);
2400         while (1) {
2401                 int batch = atomic_read(&root->log_batch);
2402                 /* when we're on an ssd, just kick the log commit out */
2403                 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2404                         mutex_unlock(&root->log_mutex);
2405                         schedule_timeout_uninterruptible(1);
2406                         mutex_lock(&root->log_mutex);
2407                 }
2408                 wait_for_writer(trans, root);
2409                 if (batch == atomic_read(&root->log_batch))
2410                         break;
2411         }
2412
2413         /* bail out if we need to do a full commit */
2414         if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2415                 ret = -EAGAIN;
2416                 btrfs_free_logged_extents(log, log_transid);
2417                 mutex_unlock(&root->log_mutex);
2418                 goto out;
2419         }
2420
2421         if (log_transid % 2 == 0)
2422                 mark = EXTENT_DIRTY;
2423         else
2424                 mark = EXTENT_NEW;
2425
2426         /* we start IO on  all the marked extents here, but we don't actually
2427          * wait for them until later.
2428          */
2429         blk_start_plug(&plug);
2430         ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2431         if (ret) {
2432                 blk_finish_plug(&plug);
2433                 btrfs_abort_transaction(trans, root, ret);
2434                 btrfs_free_logged_extents(log, log_transid);
2435                 mutex_unlock(&root->log_mutex);
2436                 goto out;
2437         }
2438
2439         btrfs_set_root_node(&log->root_item, log->node);
2440
2441         root->log_transid++;
2442         log->log_transid = root->log_transid;
2443         root->log_start_pid = 0;
2444         smp_mb();
2445         /*
2446          * IO has been started, blocks of the log tree have WRITTEN flag set
2447          * in their headers. new modifications of the log will be written to
2448          * new positions. so it's safe to allow log writers to go in.
2449          */
2450         mutex_unlock(&root->log_mutex);
2451
2452         mutex_lock(&log_root_tree->log_mutex);
2453         atomic_inc(&log_root_tree->log_batch);
2454         atomic_inc(&log_root_tree->log_writers);
2455         mutex_unlock(&log_root_tree->log_mutex);
2456
2457         ret = update_log_root(trans, log);
2458
2459         mutex_lock(&log_root_tree->log_mutex);
2460         if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2461                 smp_mb();
2462                 if (waitqueue_active(&log_root_tree->log_writer_wait))
2463                         wake_up(&log_root_tree->log_writer_wait);
2464         }
2465
2466         if (ret) {
2467                 blk_finish_plug(&plug);
2468                 if (ret != -ENOSPC) {
2469                         btrfs_abort_transaction(trans, root, ret);
2470                         mutex_unlock(&log_root_tree->log_mutex);
2471                         goto out;
2472                 }
2473                 root->fs_info->last_trans_log_full_commit = trans->transid;
2474                 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2475                 btrfs_free_logged_extents(log, log_transid);
2476                 mutex_unlock(&log_root_tree->log_mutex);
2477                 ret = -EAGAIN;
2478                 goto out;
2479         }
2480
2481         index2 = log_root_tree->log_transid % 2;
2482         if (atomic_read(&log_root_tree->log_commit[index2])) {
2483                 blk_finish_plug(&plug);
2484                 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2485                 wait_log_commit(trans, log_root_tree,
2486                                 log_root_tree->log_transid);
2487                 btrfs_free_logged_extents(log, log_transid);
2488                 mutex_unlock(&log_root_tree->log_mutex);
2489                 ret = 0;
2490                 goto out;
2491         }
2492         atomic_set(&log_root_tree->log_commit[index2], 1);
2493
2494         if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2495                 wait_log_commit(trans, log_root_tree,
2496                                 log_root_tree->log_transid - 1);
2497         }
2498
2499         wait_for_writer(trans, log_root_tree);
2500
2501         /*
2502          * now that we've moved on to the tree of log tree roots,
2503          * check the full commit flag again
2504          */
2505         if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2506                 blk_finish_plug(&plug);
2507                 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2508                 btrfs_free_logged_extents(log, log_transid);
2509                 mutex_unlock(&log_root_tree->log_mutex);
2510                 ret = -EAGAIN;
2511                 goto out_wake_log_root;
2512         }
2513
2514         ret = btrfs_write_marked_extents(log_root_tree,
2515                                          &log_root_tree->dirty_log_pages,
2516                                          EXTENT_DIRTY | EXTENT_NEW);
2517         blk_finish_plug(&plug);
2518         if (ret) {
2519                 btrfs_abort_transaction(trans, root, ret);
2520                 btrfs_free_logged_extents(log, log_transid);
2521                 mutex_unlock(&log_root_tree->log_mutex);
2522                 goto out_wake_log_root;
2523         }
2524         btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2525         btrfs_wait_marked_extents(log_root_tree,
2526                                   &log_root_tree->dirty_log_pages,
2527                                   EXTENT_NEW | EXTENT_DIRTY);
2528         btrfs_wait_logged_extents(log, log_transid);
2529
2530         btrfs_set_super_log_root(root->fs_info->super_for_commit,
2531                                 log_root_tree->node->start);
2532         btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2533                                 btrfs_header_level(log_root_tree->node));
2534
2535         log_root_tree->log_transid++;
2536         smp_mb();
2537
2538         mutex_unlock(&log_root_tree->log_mutex);
2539
2540         /*
2541          * nobody else is going to jump in and write the the ctree
2542          * super here because the log_commit atomic below is protecting
2543          * us.  We must be called with a transaction handle pinning
2544          * the running transaction open, so a full commit can't hop
2545          * in and cause problems either.
2546          */
2547         btrfs_scrub_pause_super(root);
2548         ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2549         btrfs_scrub_continue_super(root);
2550         if (ret) {
2551                 btrfs_abort_transaction(trans, root, ret);
2552                 goto out_wake_log_root;
2553         }
2554
2555         mutex_lock(&root->log_mutex);
2556         if (root->last_log_commit < log_transid)
2557                 root->last_log_commit = log_transid;
2558         mutex_unlock(&root->log_mutex);
2559
2560 out_wake_log_root:
2561         atomic_set(&log_root_tree->log_commit[index2], 0);
2562         smp_mb();
2563         if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2564                 wake_up(&log_root_tree->log_commit_wait[index2]);
2565 out:
2566         atomic_set(&root->log_commit[index1], 0);
2567         smp_mb();
2568         if (waitqueue_active(&root->log_commit_wait[index1]))
2569                 wake_up(&root->log_commit_wait[index1]);
2570         return ret;
2571 }
2572
2573 static void free_log_tree(struct btrfs_trans_handle *trans,
2574                           struct btrfs_root *log)
2575 {
2576         int ret;
2577         u64 start;
2578         u64 end;
2579         struct walk_control wc = {
2580                 .free = 1,
2581                 .process_func = process_one_buffer
2582         };
2583
2584         if (trans) {
2585                 ret = walk_log_tree(trans, log, &wc);
2586
2587                 /* I don't think this can happen but just in case */
2588                 if (ret)
2589                         btrfs_abort_transaction(trans, log, ret);
2590         }
2591
2592         while (1) {
2593                 ret = find_first_extent_bit(&log->dirty_log_pages,
2594                                 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
2595                                 NULL);
2596                 if (ret)
2597                         break;
2598
2599                 clear_extent_bits(&log->dirty_log_pages, start, end,
2600                                   EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2601         }
2602
2603         /*
2604          * We may have short-circuited the log tree with the full commit logic
2605          * and left ordered extents on our list, so clear these out to keep us
2606          * from leaking inodes and memory.
2607          */
2608         btrfs_free_logged_extents(log, 0);
2609         btrfs_free_logged_extents(log, 1);
2610
2611         free_extent_buffer(log->node);
2612         kfree(log);
2613 }
2614
2615 /*
2616  * free all the extents used by the tree log.  This should be called
2617  * at commit time of the full transaction
2618  */
2619 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2620 {
2621         if (root->log_root) {
2622                 free_log_tree(trans, root->log_root);
2623                 root->log_root = NULL;
2624         }
2625         return 0;
2626 }
2627
2628 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2629                              struct btrfs_fs_info *fs_info)
2630 {
2631         if (fs_info->log_root_tree) {
2632                 free_log_tree(trans, fs_info->log_root_tree);
2633                 fs_info->log_root_tree = NULL;
2634         }
2635         return 0;
2636 }
2637
2638 /*
2639  * If both a file and directory are logged, and unlinks or renames are
2640  * mixed in, we have a few interesting corners:
2641  *
2642  * create file X in dir Y
2643  * link file X to X.link in dir Y
2644  * fsync file X
2645  * unlink file X but leave X.link
2646  * fsync dir Y
2647  *
2648  * After a crash we would expect only X.link to exist.  But file X
2649  * didn't get fsync'd again so the log has back refs for X and X.link.
2650  *
2651  * We solve this by removing directory entries and inode backrefs from the
2652  * log when a file that was logged in the current transaction is
2653  * unlinked.  Any later fsync will include the updated log entries, and
2654  * we'll be able to reconstruct the proper directory items from backrefs.
2655  *
2656  * This optimizations allows us to avoid relogging the entire inode
2657  * or the entire directory.
2658  */
2659 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2660                                  struct btrfs_root *root,
2661                                  const char *name, int name_len,
2662                                  struct inode *dir, u64 index)
2663 {
2664         struct btrfs_root *log;
2665         struct btrfs_dir_item *di;
2666         struct btrfs_path *path;
2667         int ret;
2668         int err = 0;
2669         int bytes_del = 0;
2670         u64 dir_ino = btrfs_ino(dir);
2671
2672         if (BTRFS_I(dir)->logged_trans < trans->transid)
2673                 return 0;
2674
2675         ret = join_running_log_trans(root);
2676         if (ret)
2677                 return 0;
2678
2679         mutex_lock(&BTRFS_I(dir)->log_mutex);
2680
2681         log = root->log_root;
2682         path = btrfs_alloc_path();
2683         if (!path) {
2684                 err = -ENOMEM;
2685                 goto out_unlock;
2686         }
2687
2688         di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
2689                                    name, name_len, -1);
2690         if (IS_ERR(di)) {
2691                 err = PTR_ERR(di);
2692                 goto fail;
2693         }
2694         if (di) {
2695                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2696                 bytes_del += name_len;
2697                 if (ret) {
2698                         err = ret;
2699                         goto fail;
2700                 }
2701         }
2702         btrfs_release_path(path);
2703         di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
2704                                          index, name, name_len, -1);
2705         if (IS_ERR(di)) {
2706                 err = PTR_ERR(di);
2707                 goto fail;
2708         }
2709         if (di) {
2710                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2711                 bytes_del += name_len;
2712                 if (ret) {
2713                         err = ret;
2714                         goto fail;
2715                 }
2716         }
2717
2718         /* update the directory size in the log to reflect the names
2719          * we have removed
2720          */
2721         if (bytes_del) {
2722                 struct btrfs_key key;
2723
2724                 key.objectid = dir_ino;
2725                 key.offset = 0;
2726                 key.type = BTRFS_INODE_ITEM_KEY;
2727                 btrfs_release_path(path);
2728
2729                 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2730                 if (ret < 0) {
2731                         err = ret;
2732                         goto fail;
2733                 }
2734                 if (ret == 0) {
2735                         struct btrfs_inode_item *item;
2736                         u64 i_size;
2737
2738                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2739                                               struct btrfs_inode_item);
2740                         i_size = btrfs_inode_size(path->nodes[0], item);
2741                         if (i_size > bytes_del)
2742                                 i_size -= bytes_del;
2743                         else
2744                                 i_size = 0;
2745                         btrfs_set_inode_size(path->nodes[0], item, i_size);
2746                         btrfs_mark_buffer_dirty(path->nodes[0]);
2747                 } else
2748                         ret = 0;
2749                 btrfs_release_path(path);
2750         }
2751 fail:
2752         btrfs_free_path(path);
2753 out_unlock:
2754         mutex_unlock(&BTRFS_I(dir)->log_mutex);
2755         if (ret == -ENOSPC) {
2756                 root->fs_info->last_trans_log_full_commit = trans->transid;
2757                 ret = 0;
2758         } else if (ret < 0)
2759                 btrfs_abort_transaction(trans, root, ret);
2760
2761         btrfs_end_log_trans(root);
2762
2763         return err;
2764 }
2765
2766 /* see comments for btrfs_del_dir_entries_in_log */
2767 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2768                                struct btrfs_root *root,
2769                                const char *name, int name_len,
2770                                struct inode *inode, u64 dirid)
2771 {
2772         struct btrfs_root *log;
2773         u64 index;
2774         int ret;
2775
2776         if (BTRFS_I(inode)->logged_trans < trans->transid)
2777                 return 0;
2778
2779         ret = join_running_log_trans(root);
2780         if (ret)
2781                 return 0;
2782         log = root->log_root;
2783         mutex_lock(&BTRFS_I(inode)->log_mutex);
2784
2785         ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
2786                                   dirid, &index);
2787         mutex_unlock(&BTRFS_I(inode)->log_mutex);
2788         if (ret == -ENOSPC) {
2789                 root->fs_info->last_trans_log_full_commit = trans->transid;
2790                 ret = 0;
2791         } else if (ret < 0 && ret != -ENOENT)
2792                 btrfs_abort_transaction(trans, root, ret);
2793         btrfs_end_log_trans(root);
2794
2795         return ret;
2796 }
2797
2798 /*
2799  * creates a range item in the log for 'dirid'.  first_offset and
2800  * last_offset tell us which parts of the key space the log should
2801  * be considered authoritative for.
2802  */
2803 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2804                                        struct btrfs_root *log,
2805                                        struct btrfs_path *path,
2806                                        int key_type, u64 dirid,
2807                                        u64 first_offset, u64 last_offset)
2808 {
2809         int ret;
2810         struct btrfs_key key;
2811         struct btrfs_dir_log_item *item;
2812
2813         key.objectid = dirid;
2814         key.offset = first_offset;
2815         if (key_type == BTRFS_DIR_ITEM_KEY)
2816                 key.type = BTRFS_DIR_LOG_ITEM_KEY;
2817         else
2818                 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2819         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2820         if (ret)
2821                 return ret;
2822
2823         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2824                               struct btrfs_dir_log_item);
2825         btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2826         btrfs_mark_buffer_dirty(path->nodes[0]);
2827         btrfs_release_path(path);
2828         return 0;
2829 }
2830
2831 /*
2832  * log all the items included in the current transaction for a given
2833  * directory.  This also creates the range items in the log tree required
2834  * to replay anything deleted before the fsync
2835  */
2836 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2837                           struct btrfs_root *root, struct inode *inode,
2838                           struct btrfs_path *path,
2839                           struct btrfs_path *dst_path, int key_type,
2840                           u64 min_offset, u64 *last_offset_ret)
2841 {
2842         struct btrfs_key min_key;
2843         struct btrfs_key max_key;
2844         struct btrfs_root *log = root->log_root;
2845         struct extent_buffer *src;
2846         int err = 0;
2847         int ret;
2848         int i;
2849         int nritems;
2850         u64 first_offset = min_offset;
2851         u64 last_offset = (u64)-1;
2852         u64 ino = btrfs_ino(inode);
2853
2854         log = root->log_root;
2855         max_key.objectid = ino;
2856         max_key.offset = (u64)-1;
2857         max_key.type = key_type;
2858
2859         min_key.objectid = ino;
2860         min_key.type = key_type;
2861         min_key.offset = min_offset;
2862
2863         path->keep_locks = 1;
2864
2865         ret = btrfs_search_forward(root, &min_key, &max_key,
2866                                    path, trans->transid);
2867
2868         /*
2869          * we didn't find anything from this transaction, see if there
2870          * is anything at all
2871          */
2872         if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
2873                 min_key.objectid = ino;
2874                 min_key.type = key_type;
2875                 min_key.offset = (u64)-1;
2876                 btrfs_release_path(path);
2877                 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2878                 if (ret < 0) {
2879                         btrfs_release_path(path);
2880                         return ret;
2881                 }
2882                 ret = btrfs_previous_item(root, path, ino, key_type);
2883
2884                 /* if ret == 0 there are items for this type,
2885                  * create a range to tell us the last key of this type.
2886                  * otherwise, there are no items in this directory after
2887                  * *min_offset, and we create a range to indicate that.
2888                  */
2889                 if (ret == 0) {
2890                         struct btrfs_key tmp;
2891                         btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2892                                               path->slots[0]);
2893                         if (key_type == tmp.type)
2894                                 first_offset = max(min_offset, tmp.offset) + 1;
2895                 }
2896                 goto done;
2897         }
2898
2899         /* go backward to find any previous key */
2900         ret = btrfs_previous_item(root, path, ino, key_type);
2901         if (ret == 0) {
2902                 struct btrfs_key tmp;
2903                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2904                 if (key_type == tmp.type) {
2905                         first_offset = tmp.offset;
2906                         ret = overwrite_item(trans, log, dst_path,
2907                                              path->nodes[0], path->slots[0],
2908                                              &tmp);
2909                         if (ret) {
2910                                 err = ret;
2911                                 goto done;
2912                         }
2913                 }
2914         }
2915         btrfs_release_path(path);
2916
2917         /* find the first key from this transaction again */
2918         ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2919         if (ret != 0) {
2920                 WARN_ON(1);
2921                 goto done;
2922         }
2923
2924         /*
2925          * we have a block from this transaction, log every item in it
2926          * from our directory
2927          */
2928         while (1) {
2929                 struct btrfs_key tmp;
2930                 src = path->nodes[0];
2931                 nritems = btrfs_header_nritems(src);
2932                 for (i = path->slots[0]; i < nritems; i++) {
2933                         btrfs_item_key_to_cpu(src, &min_key, i);
2934
2935                         if (min_key.objectid != ino || min_key.type != key_type)
2936                                 goto done;
2937                         ret = overwrite_item(trans, log, dst_path, src, i,
2938                                              &min_key);
2939                         if (ret) {
2940                                 err = ret;
2941                                 goto done;
2942                         }
2943                 }
2944                 path->slots[0] = nritems;
2945
2946                 /*
2947                  * look ahead to the next item and see if it is also
2948                  * from this directory and from this transaction
2949                  */
2950                 ret = btrfs_next_leaf(root, path);
2951                 if (ret == 1) {
2952                         last_offset = (u64)-1;
2953                         goto done;
2954                 }
2955                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2956                 if (tmp.objectid != ino || tmp.type != key_type) {
2957                         last_offset = (u64)-1;
2958                         goto done;
2959                 }
2960                 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2961                         ret = overwrite_item(trans, log, dst_path,
2962                                              path->nodes[0], path->slots[0],
2963                                              &tmp);
2964                         if (ret)
2965                                 err = ret;
2966                         else
2967                                 last_offset = tmp.offset;
2968                         goto done;
2969                 }
2970         }
2971 done:
2972         btrfs_release_path(path);
2973         btrfs_release_path(dst_path);
2974
2975         if (err == 0) {
2976                 *last_offset_ret = last_offset;
2977                 /*
2978                  * insert the log range keys to indicate where the log
2979                  * is valid
2980                  */
2981                 ret = insert_dir_log_key(trans, log, path, key_type,
2982                                          ino, first_offset, last_offset);
2983                 if (ret)
2984                         err = ret;
2985         }
2986         return err;
2987 }
2988
2989 /*
2990  * logging directories is very similar to logging inodes, We find all the items
2991  * from the current transaction and write them to the log.
2992  *
2993  * The recovery code scans the directory in the subvolume, and if it finds a
2994  * key in the range logged that is not present in the log tree, then it means
2995  * that dir entry was unlinked during the transaction.
2996  *
2997  * In order for that scan to work, we must include one key smaller than
2998  * the smallest logged by this transaction and one key larger than the largest
2999  * key logged by this transaction.
3000  */
3001 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3002                           struct btrfs_root *root, struct inode *inode,
3003                           struct btrfs_path *path,
3004                           struct btrfs_path *dst_path)
3005 {
3006         u64 min_key;
3007         u64 max_key;
3008         int ret;
3009         int key_type = BTRFS_DIR_ITEM_KEY;
3010
3011 again:
3012         min_key = 0;
3013         max_key = 0;
3014         while (1) {
3015                 ret = log_dir_items(trans, root, inode, path,
3016                                     dst_path, key_type, min_key,
3017                                     &max_key);
3018                 if (ret)
3019                         return ret;
3020                 if (max_key == (u64)-1)
3021                         break;
3022                 min_key = max_key + 1;
3023         }
3024
3025         if (key_type == BTRFS_DIR_ITEM_KEY) {
3026                 key_type = BTRFS_DIR_INDEX_KEY;
3027                 goto again;
3028         }
3029         return 0;
3030 }
3031
3032 /*
3033  * a helper function to drop items from the log before we relog an
3034  * inode.  max_key_type indicates the highest item type to remove.
3035  * This cannot be run for file data extents because it does not
3036  * free the extents they point to.
3037  */
3038 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3039                                   struct btrfs_root *log,
3040                                   struct btrfs_path *path,
3041                                   u64 objectid, int max_key_type)
3042 {
3043         int ret;
3044         struct btrfs_key key;
3045         struct btrfs_key found_key;
3046         int start_slot;
3047
3048         key.objectid = objectid;
3049         key.type = max_key_type;
3050         key.offset = (u64)-1;
3051
3052         while (1) {
3053                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3054                 BUG_ON(ret == 0); /* Logic error */
3055                 if (ret < 0)
3056                         break;
3057
3058                 if (path->slots[0] == 0)
3059                         break;
3060
3061                 path->slots[0]--;
3062                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3063                                       path->slots[0]);
3064
3065                 if (found_key.objectid != objectid)
3066                         break;
3067
3068                 found_key.offset = 0;
3069                 found_key.type = 0;
3070                 ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3071                                        &start_slot);
3072
3073                 ret = btrfs_del_items(trans, log, path, start_slot,
3074                                       path->slots[0] - start_slot + 1);
3075                 /*
3076                  * If start slot isn't 0 then we don't need to re-search, we've
3077                  * found the last guy with the objectid in this tree.
3078                  */
3079                 if (ret || start_slot != 0)
3080                         break;
3081                 btrfs_release_path(path);
3082         }
3083         btrfs_release_path(path);
3084         if (ret > 0)
3085                 ret = 0;
3086         return ret;
3087 }
3088
3089 static void fill_inode_item(struct btrfs_trans_handle *trans,
3090                             struct extent_buffer *leaf,
3091                             struct btrfs_inode_item *item,
3092                             struct inode *inode, int log_inode_only)
3093 {
3094         struct btrfs_map_token token;
3095
3096         btrfs_init_map_token(&token);
3097
3098         if (log_inode_only) {
3099                 /* set the generation to zero so the recover code
3100                  * can tell the difference between an logging
3101                  * just to say 'this inode exists' and a logging
3102                  * to say 'update this inode with these values'
3103                  */
3104                 btrfs_set_token_inode_generation(leaf, item, 0, &token);
3105                 btrfs_set_token_inode_size(leaf, item, 0, &token);
3106         } else {
3107                 btrfs_set_token_inode_generation(leaf, item,
3108                                                  BTRFS_I(inode)->generation,
3109                                                  &token);
3110                 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3111         }
3112
3113         btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3114         btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3115         btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3116         btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3117
3118         btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
3119                                      inode->i_atime.tv_sec, &token);
3120         btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
3121                                       inode->i_atime.tv_nsec, &token);
3122
3123         btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
3124                                      inode->i_mtime.tv_sec, &token);
3125         btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
3126                                       inode->i_mtime.tv_nsec, &token);
3127
3128         btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
3129                                      inode->i_ctime.tv_sec, &token);
3130         btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3131                                       inode->i_ctime.tv_nsec, &token);
3132
3133         btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3134                                      &token);
3135
3136         btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3137         btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3138         btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3139         btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3140         btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3141 }
3142
3143 static int log_inode_item(struct btrfs_trans_handle *trans,
3144                           struct btrfs_root *log, struct btrfs_path *path,
3145                           struct inode *inode)
3146 {
3147         struct btrfs_inode_item *inode_item;
3148         struct btrfs_key key;
3149         int ret;
3150
3151         memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
3152         ret = btrfs_insert_empty_item(trans, log, path, &key,
3153                                       sizeof(*inode_item));
3154         if (ret && ret != -EEXIST)
3155                 return ret;
3156         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3157                                     struct btrfs_inode_item);
3158         fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
3159         btrfs_release_path(path);
3160         return 0;
3161 }
3162
3163 static noinline int copy_items(struct btrfs_trans_handle *trans,
3164                                struct inode *inode,
3165                                struct btrfs_path *dst_path,
3166                                struct extent_buffer *src,
3167                                int start_slot, int nr, int inode_only)
3168 {
3169         unsigned long src_offset;
3170         unsigned long dst_offset;
3171         struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
3172         struct btrfs_file_extent_item *extent;
3173         struct btrfs_inode_item *inode_item;
3174         int ret;
3175         struct btrfs_key *ins_keys;
3176         u32 *ins_sizes;
3177         char *ins_data;
3178         int i;
3179         struct list_head ordered_sums;
3180         int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3181
3182         INIT_LIST_HEAD(&ordered_sums);
3183
3184         ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
3185                            nr * sizeof(u32), GFP_NOFS);
3186         if (!ins_data)
3187                 return -ENOMEM;
3188
3189         ins_sizes = (u32 *)ins_data;
3190         ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3191
3192         for (i = 0; i < nr; i++) {
3193                 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
3194                 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
3195         }
3196         ret = btrfs_insert_empty_items(trans, log, dst_path,
3197                                        ins_keys, ins_sizes, nr);
3198         if (ret) {
3199                 kfree(ins_data);
3200                 return ret;
3201         }
3202
3203         for (i = 0; i < nr; i++, dst_path->slots[0]++) {
3204                 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
3205                                                    dst_path->slots[0]);
3206
3207                 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3208
3209                 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
3210                         inode_item = btrfs_item_ptr(dst_path->nodes[0],
3211                                                     dst_path->slots[0],
3212                                                     struct btrfs_inode_item);
3213                         fill_inode_item(trans, dst_path->nodes[0], inode_item,
3214                                         inode, inode_only == LOG_INODE_EXISTS);
3215                 } else {
3216                         copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3217                                            src_offset, ins_sizes[i]);
3218                 }
3219
3220                 /* take a reference on file data extents so that truncates
3221                  * or deletes of this inode don't have to relog the inode
3222                  * again
3223                  */
3224                 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
3225                     !skip_csum) {
3226                         int found_type;
3227                         extent = btrfs_item_ptr(src, start_slot + i,
3228                                                 struct btrfs_file_extent_item);
3229
3230                         if (btrfs_file_extent_generation(src, extent) < trans->transid)
3231                                 continue;
3232
3233                         found_type = btrfs_file_extent_type(src, extent);
3234                         if (found_type == BTRFS_FILE_EXTENT_REG) {
3235                                 u64 ds, dl, cs, cl;
3236                                 ds = btrfs_file_extent_disk_bytenr(src,
3237                                                                 extent);
3238                                 /* ds == 0 is a hole */
3239                                 if (ds == 0)
3240                                         continue;
3241
3242                                 dl = btrfs_file_extent_disk_num_bytes(src,
3243                                                                 extent);
3244                                 cs = btrfs_file_extent_offset(src, extent);
3245                                 cl = btrfs_file_extent_num_bytes(src,
3246                                                                 extent);
3247                                 if (btrfs_file_extent_compression(src,
3248                                                                   extent)) {
3249                                         cs = 0;
3250                                         cl = dl;
3251                                 }
3252
3253                                 ret = btrfs_lookup_csums_range(
3254                                                 log->fs_info->csum_root,
3255                                                 ds + cs, ds + cs + cl - 1,
3256                                                 &ordered_sums, 0);
3257                                 if (ret) {
3258                                         btrfs_release_path(dst_path);
3259                                         kfree(ins_data);
3260                                         return ret;
3261                                 }
3262                         }
3263                 }
3264         }
3265
3266         btrfs_mark_buffer_dirty(dst_path->nodes[0]);
3267         btrfs_release_path(dst_path);
3268         kfree(ins_data);
3269
3270         /*
3271          * we have to do this after the loop above to avoid changing the
3272          * log tree while trying to change the log tree.
3273          */
3274         ret = 0;
3275         while (!list_empty(&ordered_sums)) {
3276                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3277                                                    struct btrfs_ordered_sum,
3278                                                    list);
3279                 if (!ret)
3280                         ret = btrfs_csum_file_blocks(trans, log, sums);
3281                 list_del(&sums->list);
3282                 kfree(sums);
3283         }
3284         return ret;
3285 }
3286
3287 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3288 {
3289         struct extent_map *em1, *em2;
3290
3291         em1 = list_entry(a, struct extent_map, list);
3292         em2 = list_entry(b, struct extent_map, list);
3293
3294         if (em1->start < em2->start)
3295                 return -1;
3296         else if (em1->start > em2->start)
3297                 return 1;
3298         return 0;
3299 }
3300
3301 static int log_one_extent(struct btrfs_trans_handle *trans,
3302                           struct inode *inode, struct btrfs_root *root,
3303                           struct extent_map *em, struct btrfs_path *path)
3304 {
3305         struct btrfs_root *log = root->log_root;
3306         struct btrfs_file_extent_item *fi;
3307         struct extent_buffer *leaf;
3308         struct btrfs_ordered_extent *ordered;
3309         struct list_head ordered_sums;
3310         struct btrfs_map_token token;
3311         struct btrfs_key key;
3312         u64 mod_start = em->mod_start;
3313         u64 mod_len = em->mod_len;
3314         u64 csum_offset;
3315         u64 csum_len;
3316         u64 extent_offset = em->start - em->orig_start;
3317         u64 block_len;
3318         int ret;
3319         int index = log->log_transid % 2;
3320         bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3321
3322         ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3323                                    em->start + em->len, NULL, 0);
3324         if (ret)
3325                 return ret;
3326
3327         INIT_LIST_HEAD(&ordered_sums);
3328         btrfs_init_map_token(&token);
3329         key.objectid = btrfs_ino(inode);
3330         key.type = BTRFS_EXTENT_DATA_KEY;
3331         key.offset = em->start;
3332
3333         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
3334         if (ret)
3335                 return ret;
3336         leaf = path->nodes[0];
3337         fi = btrfs_item_ptr(leaf, path->slots[0],
3338                             struct btrfs_file_extent_item);
3339
3340         btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3341                                                &token);
3342         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3343                 skip_csum = true;
3344                 btrfs_set_token_file_extent_type(leaf, fi,
3345                                                  BTRFS_FILE_EXTENT_PREALLOC,
3346                                                  &token);
3347         } else {
3348                 btrfs_set_token_file_extent_type(leaf, fi,
3349                                                  BTRFS_FILE_EXTENT_REG,
3350                                                  &token);
3351                 if (em->block_start == 0)
3352                         skip_csum = true;
3353         }
3354
3355         block_len = max(em->block_len, em->orig_block_len);
3356         if (em->compress_type != BTRFS_COMPRESS_NONE) {
3357                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3358                                                         em->block_start,
3359                                                         &token);
3360                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3361                                                            &token);
3362         } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3363                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3364                                                         em->block_start -
3365                                                         extent_offset, &token);
3366                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3367                                                            &token);
3368         } else {
3369                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3370                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3371                                                            &token);
3372         }
3373
3374         btrfs_set_token_file_extent_offset(leaf, fi,
3375                                            em->start - em->orig_start,
3376                                            &token);
3377         btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3378         btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
3379         btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3380                                                 &token);
3381         btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3382         btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3383         btrfs_mark_buffer_dirty(leaf);
3384
3385         btrfs_release_path(path);
3386         if (ret) {
3387                 return ret;
3388         }
3389
3390         if (skip_csum)
3391                 return 0;
3392
3393         if (em->compress_type) {
3394                 csum_offset = 0;
3395                 csum_len = block_len;
3396         }
3397
3398         /*
3399          * First check and see if our csums are on our outstanding ordered
3400          * extents.
3401          */
3402 again:
3403         spin_lock_irq(&log->log_extents_lock[index]);
3404         list_for_each_entry(ordered, &log->logged_list[index], log_list) {
3405                 struct btrfs_ordered_sum *sum;
3406
3407                 if (!mod_len)
3408                         break;
3409
3410                 if (ordered->inode != inode)
3411                         continue;
3412
3413                 if (ordered->file_offset + ordered->len <= mod_start ||
3414                     mod_start + mod_len <= ordered->file_offset)
3415                         continue;
3416
3417                 /*
3418                  * We are going to copy all the csums on this ordered extent, so
3419                  * go ahead and adjust mod_start and mod_len in case this
3420                  * ordered extent has already been logged.
3421                  */
3422                 if (ordered->file_offset > mod_start) {
3423                         if (ordered->file_offset + ordered->len >=
3424                             mod_start + mod_len)
3425                                 mod_len = ordered->file_offset - mod_start;
3426                         /*
3427                          * If we have this case
3428                          *
3429                          * |--------- logged extent ---------|
3430                          *       |----- ordered extent ----|
3431                          *
3432                          * Just don't mess with mod_start and mod_len, we'll
3433                          * just end up logging more csums than we need and it
3434                          * will be ok.
3435                          */
3436                 } else {
3437                         if (ordered->file_offset + ordered->len <
3438                             mod_start + mod_len) {
3439                                 mod_len = (mod_start + mod_len) -
3440                                         (ordered->file_offset + ordered->len);
3441                                 mod_start = ordered->file_offset +
3442                                         ordered->len;
3443                         } else {
3444                                 mod_len = 0;
3445                         }
3446                 }
3447
3448                 /*
3449                  * To keep us from looping for the above case of an ordered
3450                  * extent that falls inside of the logged extent.
3451                  */
3452                 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
3453                                      &ordered->flags))
3454                         continue;
3455                 atomic_inc(&ordered->refs);
3456                 spin_unlock_irq(&log->log_extents_lock[index]);
3457                 /*
3458                  * we've dropped the lock, we must either break or
3459                  * start over after this.
3460                  */
3461
3462                 wait_event(ordered->wait, ordered->csum_bytes_left == 0);
3463
3464                 list_for_each_entry(sum, &ordered->list, list) {
3465                         ret = btrfs_csum_file_blocks(trans, log, sum);
3466                         if (ret) {
3467                                 btrfs_put_ordered_extent(ordered);
3468                                 goto unlocked;
3469                         }
3470                 }
3471                 btrfs_put_ordered_extent(ordered);
3472                 goto again;
3473
3474         }
3475         spin_unlock_irq(&log->log_extents_lock[index]);
3476 unlocked:
3477
3478         if (!mod_len || ret)
3479                 return ret;
3480
3481         csum_offset = mod_start - em->start;
3482         csum_len = mod_len;
3483
3484         /* block start is already adjusted for the file extent offset. */
3485         ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3486                                        em->block_start + csum_offset,
3487                                        em->block_start + csum_offset +
3488                                        csum_len - 1, &ordered_sums, 0);
3489         if (ret)
3490                 return ret;
3491
3492         while (!list_empty(&ordered_sums)) {
3493                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3494                                                    struct btrfs_ordered_sum,
3495                                                    list);
3496                 if (!ret)
3497                         ret = btrfs_csum_file_blocks(trans, log, sums);
3498                 list_del(&sums->list);
3499                 kfree(sums);
3500         }
3501
3502         return ret;
3503 }
3504
3505 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3506                                      struct btrfs_root *root,
3507                                      struct inode *inode,
3508                                      struct btrfs_path *path)
3509 {
3510         struct extent_map *em, *n;
3511         struct list_head extents;
3512         struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3513         u64 test_gen;
3514         int ret = 0;
3515         int num = 0;
3516
3517         INIT_LIST_HEAD(&extents);
3518
3519         write_lock(&tree->lock);
3520         test_gen = root->fs_info->last_trans_committed;
3521
3522         list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
3523                 list_del_init(&em->list);
3524
3525                 /*
3526                  * Just an arbitrary number, this can be really CPU intensive
3527                  * once we start getting a lot of extents, and really once we
3528                  * have a bunch of extents we just want to commit since it will
3529                  * be faster.
3530                  */
3531                 if (++num > 32768) {
3532                         list_del_init(&tree->modified_extents);
3533                         ret = -EFBIG;
3534                         goto process;
3535                 }
3536
3537                 if (em->generation <= test_gen)
3538                         continue;
3539                 /* Need a ref to keep it from getting evicted from cache */
3540                 atomic_inc(&em->refs);
3541                 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
3542                 list_add_tail(&em->list, &extents);
3543                 num++;
3544         }
3545
3546         list_sort(NULL, &extents, extent_cmp);
3547
3548 process:
3549         while (!list_empty(&extents)) {
3550                 em = list_entry(extents.next, struct extent_map, list);
3551
3552                 list_del_init(&em->list);
3553
3554                 /*
3555                  * If we had an error we just need to delete everybody from our
3556                  * private list.
3557                  */
3558                 if (ret) {
3559                         clear_em_logging(tree, em);
3560                         free_extent_map(em);
3561                         continue;
3562                 }
3563
3564                 write_unlock(&tree->lock);
3565
3566                 ret = log_one_extent(trans, inode, root, em, path);
3567                 write_lock(&tree->lock);
3568                 clear_em_logging(tree, em);
3569                 free_extent_map(em);
3570         }
3571         WARN_ON(!list_empty(&extents));
3572         write_unlock(&tree->lock);
3573
3574         btrfs_release_path(path);
3575         return ret;
3576 }
3577
3578 /* log a single inode in the tree log.
3579  * At least one parent directory for this inode must exist in the tree
3580  * or be logged already.
3581  *
3582  * Any items from this inode changed by the current transaction are copied
3583  * to the log tree.  An extra reference is taken on any extents in this
3584  * file, allowing us to avoid a whole pile of corner cases around logging
3585  * blocks that have been removed from the tree.
3586  *
3587  * See LOG_INODE_ALL and related defines for a description of what inode_only
3588  * does.
3589  *
3590  * This handles both files and directories.
3591  */
3592 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3593                              struct btrfs_root *root, struct inode *inode,
3594                              int inode_only)
3595 {
3596         struct btrfs_path *path;
3597         struct btrfs_path *dst_path;
3598         struct btrfs_key min_key;
3599         struct btrfs_key max_key;
3600         struct btrfs_root *log = root->log_root;
3601         struct extent_buffer *src = NULL;
3602         int err = 0;
3603         int ret;
3604         int nritems;
3605         int ins_start_slot = 0;
3606         int ins_nr;
3607         bool fast_search = false;
3608         u64 ino = btrfs_ino(inode);
3609
3610         path = btrfs_alloc_path();
3611         if (!path)
3612                 return -ENOMEM;
3613         dst_path = btrfs_alloc_path();
3614         if (!dst_path) {
3615                 btrfs_free_path(path);
3616                 return -ENOMEM;
3617         }
3618
3619         min_key.objectid = ino;
3620         min_key.type = BTRFS_INODE_ITEM_KEY;
3621         min_key.offset = 0;
3622
3623         max_key.objectid = ino;
3624
3625
3626         /* today the code can only do partial logging of directories */
3627         if (S_ISDIR(inode->i_mode) ||
3628             (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3629                        &BTRFS_I(inode)->runtime_flags) &&
3630              inode_only == LOG_INODE_EXISTS))
3631                 max_key.type = BTRFS_XATTR_ITEM_KEY;
3632         else
3633                 max_key.type = (u8)-1;
3634         max_key.offset = (u64)-1;
3635
3636         /* Only run delayed items if we are a dir or a new file */
3637         if (S_ISDIR(inode->i_mode) ||
3638             BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
3639                 ret = btrfs_commit_inode_delayed_items(trans, inode);
3640                 if (ret) {
3641                         btrfs_free_path(path);
3642                         btrfs_free_path(dst_path);
3643                         return ret;
3644                 }
3645         }
3646
3647         mutex_lock(&BTRFS_I(inode)->log_mutex);
3648
3649         btrfs_get_logged_extents(log, inode);
3650
3651         /*
3652          * a brute force approach to making sure we get the most uptodate
3653          * copies of everything.
3654          */
3655         if (S_ISDIR(inode->i_mode)) {
3656                 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
3657
3658                 if (inode_only == LOG_INODE_EXISTS)
3659                         max_key_type = BTRFS_XATTR_ITEM_KEY;
3660                 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
3661         } else {
3662                 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3663                                        &BTRFS_I(inode)->runtime_flags)) {
3664                         clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3665                                   &BTRFS_I(inode)->runtime_flags);
3666                         ret = btrfs_truncate_inode_items(trans, log,
3667                                                          inode, 0, 0);
3668                 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3669                                               &BTRFS_I(inode)->runtime_flags)) {
3670                         if (inode_only == LOG_INODE_ALL)
3671                                 fast_search = true;
3672                         max_key.type = BTRFS_XATTR_ITEM_KEY;
3673                         ret = drop_objectid_items(trans, log, path, ino,
3674                                                   max_key.type);
3675                 } else {
3676                         if (inode_only == LOG_INODE_ALL)
3677                                 fast_search = true;
3678                         ret = log_inode_item(trans, log, dst_path, inode);
3679                         if (ret) {
3680                                 err = ret;
3681                                 goto out_unlock;
3682                         }
3683                         goto log_extents;
3684                 }
3685
3686         }
3687         if (ret) {
3688                 err = ret;
3689                 goto out_unlock;
3690         }
3691         path->keep_locks = 1;
3692
3693         while (1) {
3694                 ins_nr = 0;
3695                 ret = btrfs_search_forward(root, &min_key, &max_key,
3696                                            path, trans->transid);
3697                 if (ret != 0)
3698                         break;
3699 again:
3700                 /* note, ins_nr might be > 0 here, cleanup outside the loop */
3701                 if (min_key.objectid != ino)
3702                         break;
3703                 if (min_key.type > max_key.type)
3704                         break;
3705
3706                 src = path->nodes[0];
3707                 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
3708                         ins_nr++;
3709                         goto next_slot;
3710                 } else if (!ins_nr) {
3711                         ins_start_slot = path->slots[0];
3712                         ins_nr = 1;
3713                         goto next_slot;
3714                 }
3715
3716                 ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
3717                                  ins_nr, inode_only);
3718                 if (ret) {
3719                         err = ret;
3720                         goto out_unlock;
3721                 }
3722                 ins_nr = 1;
3723                 ins_start_slot = path->slots[0];
3724 next_slot:
3725
3726                 nritems = btrfs_header_nritems(path->nodes[0]);
3727                 path->slots[0]++;
3728                 if (path->slots[0] < nritems) {
3729                         btrfs_item_key_to_cpu(path->nodes[0], &min_key,
3730                                               path->slots[0]);
3731                         goto again;
3732                 }
3733                 if (ins_nr) {
3734                         ret = copy_items(trans, inode, dst_path, src,
3735                                          ins_start_slot,
3736                                          ins_nr, inode_only);
3737                         if (ret) {
3738                                 err = ret;
3739                                 goto out_unlock;
3740                         }
3741                         ins_nr = 0;
3742                 }
3743                 btrfs_release_path(path);
3744
3745                 if (min_key.offset < (u64)-1)
3746                         min_key.offset++;
3747                 else if (min_key.type < (u8)-1)
3748                         min_key.type++;
3749                 else if (min_key.objectid < (u64)-1)
3750                         min_key.objectid++;
3751                 else
3752                         break;
3753         }
3754         if (ins_nr) {
3755                 ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
3756                                  ins_nr, inode_only);
3757                 if (ret) {
3758                         err = ret;
3759                         goto out_unlock;
3760                 }
3761                 ins_nr = 0;
3762         }
3763
3764 log_extents:
3765         btrfs_release_path(path);
3766         btrfs_release_path(dst_path);
3767         if (fast_search) {
3768                 ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
3769                 if (ret) {
3770                         err = ret;
3771                         goto out_unlock;
3772                 }
3773         } else {
3774                 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3775                 struct extent_map *em, *n;
3776
3777                 write_lock(&tree->lock);
3778                 list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3779                         list_del_init(&em->list);
3780                 write_unlock(&tree->lock);
3781         }
3782
3783         if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
3784                 ret = log_directory_changes(trans, root, inode, path, dst_path);
3785                 if (ret) {
3786                         err = ret;
3787                         goto out_unlock;
3788                 }
3789         }
3790         BTRFS_I(inode)->logged_trans = trans->transid;
3791         BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
3792 out_unlock:
3793         if (err)
3794                 btrfs_free_logged_extents(log, log->log_transid);
3795         mutex_unlock(&BTRFS_I(inode)->log_mutex);
3796
3797         btrfs_free_path(path);
3798         btrfs_free_path(dst_path);
3799         return err;
3800 }
3801
3802 /*
3803  * follow the dentry parent pointers up the chain and see if any
3804  * of the directories in it require a full commit before they can
3805  * be logged.  Returns zero if nothing special needs to be done or 1 if
3806  * a full commit is required.
3807  */
3808 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
3809                                                struct inode *inode,
3810                                                struct dentry *parent,
3811                                                struct super_block *sb,
3812                                                u64 last_committed)
3813 {
3814         int ret = 0;
3815         struct btrfs_root *root;
3816         struct dentry *old_parent = NULL;
3817         struct inode *orig_inode = inode;
3818
3819         /*
3820          * for regular files, if its inode is already on disk, we don't
3821          * have to worry about the parents at all.  This is because
3822          * we can use the last_unlink_trans field to record renames
3823          * and other fun in this file.
3824          */
3825         if (S_ISREG(inode->i_mode) &&
3826             BTRFS_I(inode)->generation <= last_committed &&
3827             BTRFS_I(inode)->last_unlink_trans <= last_committed)
3828                         goto out;
3829
3830         if (!S_ISDIR(inode->i_mode)) {
3831                 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3832                         goto out;
3833                 inode = parent->d_inode;
3834         }
3835
3836         while (1) {
3837                 /*
3838                  * If we are logging a directory then we start with our inode,
3839                  * not our parents inode, so we need to skipp setting the
3840                  * logged_trans so that further down in the log code we don't
3841                  * think this inode has already been logged.
3842                  */
3843                 if (inode != orig_inode)
3844                         BTRFS_I(inode)->logged_trans = trans->transid;
3845                 smp_mb();
3846
3847                 if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
3848                         root = BTRFS_I(inode)->root;
3849
3850                         /*
3851                          * make sure any commits to the log are forced
3852                          * to be full commits
3853                          */
3854                         root->fs_info->last_trans_log_full_commit =
3855                                 trans->transid;
3856                         ret = 1;
3857                         break;
3858                 }
3859
3860                 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3861                         break;
3862
3863                 if (IS_ROOT(parent))
3864                         break;
3865
3866                 parent = dget_parent(parent);
3867                 dput(old_parent);
3868                 old_parent = parent;
3869                 inode = parent->d_inode;
3870
3871         }
3872         dput(old_parent);
3873 out:
3874         return ret;
3875 }
3876
3877 /*
3878  * helper function around btrfs_log_inode to make sure newly created
3879  * parent directories also end up in the log.  A minimal inode and backref
3880  * only logging is done of any parent directories that are older than
3881  * the last committed transaction
3882  */
3883 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3884                                   struct btrfs_root *root, struct inode *inode,
3885                                   struct dentry *parent, int exists_only)
3886 {
3887         int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
3888         struct super_block *sb;
3889         struct dentry *old_parent = NULL;
3890         int ret = 0;
3891         u64 last_committed = root->fs_info->last_trans_committed;
3892
3893         sb = inode->i_sb;
3894
3895         if (btrfs_test_opt(root, NOTREELOG)) {
3896                 ret = 1;
3897                 goto end_no_trans;
3898         }
3899
3900         if (root->fs_info->last_trans_log_full_commit >
3901             root->fs_info->last_trans_committed) {
3902                 ret = 1;
3903                 goto end_no_trans;
3904         }
3905
3906         if (root != BTRFS_I(inode)->root ||
3907             btrfs_root_refs(&root->root_item) == 0) {
3908                 ret = 1;
3909                 goto end_no_trans;
3910         }
3911
3912         ret = check_parent_dirs_for_sync(trans, inode, parent,
3913                                          sb, last_committed);
3914         if (ret)
3915                 goto end_no_trans;
3916
3917         if (btrfs_inode_in_log(inode, trans->transid)) {
3918                 ret = BTRFS_NO_LOG_SYNC;
3919                 goto end_no_trans;
3920         }
3921
3922         ret = start_log_trans(trans, root);
3923         if (ret)
3924                 goto end_trans;
3925
3926         ret = btrfs_log_inode(trans, root, inode, inode_only);
3927         if (ret)
3928                 goto end_trans;
3929
3930         /*
3931          * for regular files, if its inode is already on disk, we don't
3932          * have to worry about the parents at all.  This is because
3933          * we can use the last_unlink_trans field to record renames
3934          * and other fun in this file.
3935          */
3936         if (S_ISREG(inode->i_mode) &&
3937             BTRFS_I(inode)->generation <= last_committed &&
3938             BTRFS_I(inode)->last_unlink_trans <= last_committed) {
3939                 ret = 0;
3940                 goto end_trans;
3941         }
3942
3943         inode_only = LOG_INODE_EXISTS;
3944         while (1) {
3945                 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3946                         break;
3947
3948                 inode = parent->d_inode;
3949                 if (root != BTRFS_I(inode)->root)
3950                         break;
3951
3952                 if (BTRFS_I(inode)->generation >
3953                     root->fs_info->last_trans_committed) {
3954                         ret = btrfs_log_inode(trans, root, inode, inode_only);
3955                         if (ret)
3956                                 goto end_trans;
3957                 }
3958                 if (IS_ROOT(parent))
3959                         break;
3960
3961                 parent = dget_parent(parent);
3962                 dput(old_parent);
3963                 old_parent = parent;
3964         }
3965         ret = 0;
3966 end_trans:
3967         dput(old_parent);
3968         if (ret < 0) {
3969                 root->fs_info->last_trans_log_full_commit = trans->transid;
3970                 ret = 1;
3971         }
3972         btrfs_end_log_trans(root);
3973 end_no_trans:
3974         return ret;
3975 }
3976
3977 /*
3978  * it is not safe to log dentry if the chunk root has added new
3979  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
3980  * If this returns 1, you must commit the transaction to safely get your
3981  * data on disk.
3982  */
3983 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
3984                           struct btrfs_root *root, struct dentry *dentry)
3985 {
3986         struct dentry *parent = dget_parent(dentry);
3987         int ret;
3988
3989         ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
3990         dput(parent);
3991
3992         return ret;
3993 }
3994
3995 /*
3996  * should be called during mount to recover any replay any log trees
3997  * from the FS
3998  */
3999 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
4000 {
4001         int ret;
4002         struct btrfs_path *path;
4003         struct btrfs_trans_handle *trans;
4004         struct btrfs_key key;
4005         struct btrfs_key found_key;
4006         struct btrfs_key tmp_key;
4007         struct btrfs_root *log;
4008         struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
4009         struct walk_control wc = {
4010                 .process_func = process_one_buffer,
4011                 .stage = 0,
4012         };
4013
4014         path = btrfs_alloc_path();
4015         if (!path)
4016                 return -ENOMEM;
4017
4018         fs_info->log_root_recovering = 1;
4019
4020         trans = btrfs_start_transaction(fs_info->tree_root, 0);
4021         if (IS_ERR(trans)) {
4022                 ret = PTR_ERR(trans);
4023                 goto error;
4024         }
4025
4026         wc.trans = trans;
4027         wc.pin = 1;
4028
4029         ret = walk_log_tree(trans, log_root_tree, &wc);
4030         if (ret) {
4031                 btrfs_error(fs_info, ret, "Failed to pin buffers while "
4032                             "recovering log root tree.");
4033                 goto error;
4034         }
4035
4036 again:
4037         key.objectid = BTRFS_TREE_LOG_OBJECTID;
4038         key.offset = (u64)-1;
4039         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
4040
4041         while (1) {
4042                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
4043
4044                 if (ret < 0) {
4045                         btrfs_error(fs_info, ret,
4046                                     "Couldn't find tree log root.");
4047                         goto error;
4048                 }
4049                 if (ret > 0) {
4050                         if (path->slots[0] == 0)
4051                                 break;
4052                         path->slots[0]--;
4053                 }
4054                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4055                                       path->slots[0]);
4056                 btrfs_release_path(path);
4057                 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4058                         break;
4059
4060                 log = btrfs_read_fs_root(log_root_tree, &found_key);
4061                 if (IS_ERR(log)) {
4062                         ret = PTR_ERR(log);
4063                         btrfs_error(fs_info, ret,
4064                                     "Couldn't read tree log root.");
4065                         goto error;
4066                 }
4067
4068                 tmp_key.objectid = found_key.offset;
4069                 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
4070                 tmp_key.offset = (u64)-1;
4071
4072                 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
4073                 if (IS_ERR(wc.replay_dest)) {
4074                         ret = PTR_ERR(wc.replay_dest);
4075                         free_extent_buffer(log->node);
4076                         free_extent_buffer(log->commit_root);
4077                         kfree(log);
4078                         btrfs_error(fs_info, ret, "Couldn't read target root "
4079                                     "for tree log recovery.");
4080                         goto error;
4081                 }
4082
4083                 wc.replay_dest->log_root = log;
4084                 btrfs_record_root_in_trans(trans, wc.replay_dest);
4085                 ret = walk_log_tree(trans, log, &wc);
4086
4087                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
4088                         ret = fixup_inode_link_counts(trans, wc.replay_dest,
4089                                                       path);
4090                 }
4091
4092                 key.offset = found_key.offset - 1;
4093                 wc.replay_dest->log_root = NULL;
4094                 free_extent_buffer(log->node);
4095                 free_extent_buffer(log->commit_root);
4096                 kfree(log);
4097
4098                 if (ret)
4099                         goto error;
4100
4101                 if (found_key.offset == 0)
4102                         break;
4103         }
4104         btrfs_release_path(path);
4105
4106         /* step one is to pin it all, step two is to replay just inodes */
4107         if (wc.pin) {
4108                 wc.pin = 0;
4109                 wc.process_func = replay_one_buffer;
4110                 wc.stage = LOG_WALK_REPLAY_INODES;
4111                 goto again;
4112         }
4113         /* step three is to replay everything */
4114         if (wc.stage < LOG_WALK_REPLAY_ALL) {
4115                 wc.stage++;
4116                 goto again;
4117         }
4118
4119         btrfs_free_path(path);
4120
4121         /* step 4: commit the transaction, which also unpins the blocks */
4122         ret = btrfs_commit_transaction(trans, fs_info->tree_root);
4123         if (ret)
4124                 return ret;
4125
4126         free_extent_buffer(log_root_tree->node);
4127         log_root_tree->log_root = NULL;
4128         fs_info->log_root_recovering = 0;
4129         kfree(log_root_tree);
4130
4131         return 0;
4132 error:
4133         if (wc.trans)
4134                 btrfs_end_transaction(wc.trans, fs_info->tree_root);
4135         btrfs_free_path(path);
4136         return ret;
4137 }
4138
4139 /*
4140  * there are some corner cases where we want to force a full
4141  * commit instead of allowing a directory to be logged.
4142  *
4143  * They revolve around files there were unlinked from the directory, and
4144  * this function updates the parent directory so that a full commit is
4145  * properly done if it is fsync'd later after the unlinks are done.
4146  */
4147 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
4148                              struct inode *dir, struct inode *inode,
4149                              int for_rename)
4150 {
4151         /*
4152          * when we're logging a file, if it hasn't been renamed
4153          * or unlinked, and its inode is fully committed on disk,
4154          * we don't have to worry about walking up the directory chain
4155          * to log its parents.
4156          *
4157          * So, we use the last_unlink_trans field to put this transid
4158          * into the file.  When the file is logged we check it and
4159          * don't log the parents if the file is fully on disk.
4160          */
4161         if (S_ISREG(inode->i_mode))
4162                 BTRFS_I(inode)->last_unlink_trans = trans->transid;
4163
4164         /*
4165          * if this directory was already logged any new
4166          * names for this file/dir will get recorded
4167          */
4168         smp_mb();
4169         if (BTRFS_I(dir)->logged_trans == trans->transid)
4170                 return;
4171
4172         /*
4173          * if the inode we're about to unlink was logged,
4174          * the log will be properly updated for any new names
4175          */
4176         if (BTRFS_I(inode)->logged_trans == trans->transid)
4177                 return;
4178
4179         /*
4180          * when renaming files across directories, if the directory
4181          * there we're unlinking from gets fsync'd later on, there's
4182          * no way to find the destination directory later and fsync it
4183          * properly.  So, we have to be conservative and force commits
4184          * so the new name gets discovered.
4185          */
4186         if (for_rename)
4187                 goto record;
4188
4189         /* we can safely do the unlink without any special recording */
4190         return;
4191
4192 record:
4193         BTRFS_I(dir)->last_unlink_trans = trans->transid;
4194 }
4195
4196 /*
4197  * Call this after adding a new name for a file and it will properly
4198  * update the log to reflect the new name.
4199  *
4200  * It will return zero if all goes well, and it will return 1 if a
4201  * full transaction commit is required.
4202  */
4203 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
4204                         struct inode *inode, struct inode *old_dir,
4205                         struct dentry *parent)
4206 {
4207         struct btrfs_root * root = BTRFS_I(inode)->root;
4208
4209         /*
4210          * this will force the logging code to walk the dentry chain
4211          * up for the file
4212          */
4213         if (S_ISREG(inode->i_mode))
4214                 BTRFS_I(inode)->last_unlink_trans = trans->transid;
4215
4216         /*
4217          * if this inode hasn't been logged and directory we're renaming it
4218          * from hasn't been logged, we don't need to log it
4219          */
4220         if (BTRFS_I(inode)->logged_trans <=
4221             root->fs_info->last_trans_committed &&
4222             (!old_dir || BTRFS_I(old_dir)->logged_trans <=
4223                     root->fs_info->last_trans_committed))
4224                 return 0;
4225
4226         return btrfs_log_inode_parent(trans, root, inode, parent, 1);
4227 }
4228