Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris...
author		Linus Torvalds <torvalds@linux-foundation.org>
		Thu, 14 Aug 2014 16:09:48 +0000 (10:09 -0600)
committer	Linus Torvalds <torvalds@linux-foundation.org>
		Thu, 14 Aug 2014 16:09:48 +0000 (10:09 -0600)
Pull seccomp fix from James Morris.

BUG(!spin_is_locked()) really doesn't work very well in UP
configurations without any actual spinlock state.  Which is very much
why we have that "assert_spin_locked()" function for this.

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security:
  seccomp: Replace BUG(!spin_is_locked()) with assert_spin_locked
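
The change boils down to this pattern (an illustrative sketch, not the
verbatim hunks from kernel/seccomp.c: spin_is_locked() always returns
false on UP builds that carry no spinlock state, so the BUG_ON() fires
spuriously, while assert_spin_locked() is the helper intended for
exactly this kind of check):

        /* before: spuriously triggers on UP non-debug builds */
        BUG_ON(!spin_is_locked(&current->sighand->siglock));

        /* after: correct in both SMP and UP configurations */
        assert_spin_locked(&current->sighand->siglock);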

674 files changed:
Documentation/ABI/testing/sysfs-bus-rbd
Documentation/ABI/testing/sysfs-fs-xfs [new file with mode: 0644]
Documentation/device-mapper/switch.txt
Documentation/devicetree/bindings/dma/fsl-imx-sdma.txt
Documentation/devicetree/bindings/dma/mpc512x-dma.txt [new file with mode: 0644]
Documentation/devicetree/bindings/dma/nbpfaxi.txt [new file with mode: 0644]
Documentation/devicetree/bindings/dma/rcar-audmapp.txt [new file with mode: 0644]
Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt [new file with mode: 0644]
Documentation/devicetree/bindings/dma/ste-dma40.txt
Documentation/devicetree/bindings/dma/sun6i-dma.txt [new file with mode: 0644]
Documentation/devicetree/bindings/mmc/exynos-dw-mshc.txt
Documentation/devicetree/bindings/mmc/k3-dw-mshc.txt
Documentation/devicetree/bindings/mmc/mmc.txt
Documentation/devicetree/bindings/mmc/renesas,mmcif.txt [new file with mode: 0644]
Documentation/devicetree/bindings/mmc/sdhci-msm.txt
Documentation/devicetree/bindings/mmc/sdhci-st.txt [new file with mode: 0644]
Documentation/devicetree/bindings/mmc/synopsys-dw-mshc.txt
Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt
Documentation/devicetree/bindings/mmc/tmio_mmc.txt
Documentation/devicetree/bindings/net/apm-xgene-enet.txt [new file with mode: 0644]
Documentation/devicetree/bindings/net/fsl-fec.txt
Documentation/devicetree/bindings/thermal/exynos-thermal.txt
Documentation/devicetree/bindings/thermal/rcar-thermal.txt
Documentation/devicetree/bindings/thermal/st-thermal.txt [new file with mode: 0644]
Documentation/devicetree/changesets.txt [new file with mode: 0644]
Documentation/devicetree/todo.txt [new file with mode: 0644]
Documentation/dmaengine.txt
Documentation/filesystems/nfs/Exporting
Documentation/filesystems/vfs.txt
Documentation/kernel-parameters.txt
MAINTAINERS
arch/arm/boot/dts/versatile-ab.dts
arch/arm/boot/dts/versatile-pb.dts
arch/arm/common/edma.c
arch/arm/xen/grant-table.c
arch/arm64/boot/dts/apm-mustang.dts
arch/arm64/boot/dts/apm-storm.dtsi
arch/metag/kernel/cachepart.c
arch/metag/mm/hugetlbpage.c
arch/powerpc/boot/dts/mpc5121.dtsi
arch/powerpc/kernel/prom.c
arch/powerpc/platforms/powermac/feature.c
arch/powerpc/platforms/powermac/pci.c
arch/powerpc/platforms/powermac/smp.c
arch/powerpc/platforms/powermac/udbg_adb.c
arch/powerpc/platforms/pseries/hotplug-memory.c
arch/powerpc/platforms/pseries/setup.c
arch/sh/drivers/dma/dma-sh.c
arch/sh/include/asm/dma-register.h
arch/sh/kernel/cpu/sh4a/setup-sh7722.c
arch/sh/kernel/cpu/sh4a/setup-sh7724.c
arch/sh/kernel/cpu/sh4a/setup-sh7757.c
arch/sparc/include/uapi/asm/unistd.h
arch/sparc/kernel/nmi.c
arch/sparc/kernel/perf_event.c
arch/sparc/kernel/process_64.c
arch/sparc/kernel/smp_32.c
arch/sparc/kernel/smp_64.c
arch/sparc/kernel/systbls_32.S
arch/sparc/kernel/systbls_64.S
arch/x86/Kconfig
arch/x86/include/asm/alternative.h
arch/x86/include/asm/apic.h
arch/x86/include/asm/fpu-internal.h
arch/x86/include/asm/hardirq.h
arch/x86/include/asm/i8259.h
arch/x86/include/asm/io_apic.h
arch/x86/include/asm/mpspec.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/prom.h
arch/x86/include/asm/smpboot_hooks.h
arch/x86/include/asm/xsave.h
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/apic_flat_64.c
arch/x86/kernel/apic/apic_noop.c
arch/x86/kernel/apic/apic_numachip.c
arch/x86/kernel/apic/bigsmp_32.c
arch/x86/kernel/apic/io_apic.c
arch/x86/kernel/apic/probe_32.c
arch/x86/kernel/apic/x2apic_cluster.c
arch/x86/kernel/apic/x2apic_phys.c
arch/x86/kernel/apic/x2apic_uv_x.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/devicetree.c
arch/x86/kernel/i387.c
arch/x86/kernel/irqinit.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/process.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/vsmp_64.c
arch/x86/kernel/xsave.c
arch/x86/pci/acpi.c
arch/x86/pci/intel_mid_pci.c
arch/x86/pci/irq.c
arch/x86/pci/xen.c
arch/x86/platform/ce4100/ce4100.c
arch/x86/platform/intel-mid/device_libs/platform_wdt.c
arch/x86/platform/intel-mid/sfi.c
arch/x86/platform/sfi/sfi.c
arch/x86/xen/grant-table.c
arch/x86/xen/time.c
block/bio-integrity.c
block/bio.c
block/blk-core.c
block/blk-mq.c
block/blk-mq.h
block/blk-sysfs.c
block/compat_ioctl.c
block/ioctl.c
block/partitions/aix.c
block/partitions/amiga.c
block/partitions/efi.c
block/partitions/msdos.c
block/scsi_ioctl.c
drivers/acpi/pci_irq.c
drivers/atm/atmtcp.c
drivers/atm/solos-pci.c
drivers/block/drbd/Makefile
drivers/block/drbd/drbd_actlog.c
drivers/block/drbd/drbd_bitmap.c
drivers/block/drbd/drbd_debugfs.c [new file with mode: 0644]
drivers/block/drbd/drbd_debugfs.h [new file with mode: 0644]
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_interval.h
drivers/block/drbd/drbd_main.c
drivers/block/drbd/drbd_nl.c
drivers/block/drbd/drbd_proc.c
drivers/block/drbd/drbd_receiver.c
drivers/block/drbd/drbd_req.c
drivers/block/drbd/drbd_req.h
drivers/block/drbd/drbd_state.c
drivers/block/drbd/drbd_worker.c
drivers/block/rbd.c
drivers/block/virtio_blk.c
drivers/cpufreq/pmac64-cpufreq.c
drivers/crypto/nx/nx-842.c
drivers/dma/Kconfig
drivers/dma/Makefile
drivers/dma/TODO
drivers/dma/amba-pl08x.c
drivers/dma/at_hdmac.c
drivers/dma/bcm2835-dma.c
drivers/dma/dma-jz4740.c
drivers/dma/dw/core.c
drivers/dma/edma.c
drivers/dma/ep93xx_dma.c
drivers/dma/fsl-edma.c
drivers/dma/fsldma.c
drivers/dma/fsldma.h
drivers/dma/imx-dma.c
drivers/dma/imx-sdma.c
drivers/dma/ipu/ipu_idmac.c
drivers/dma/mmp_pdma.c
drivers/dma/mmp_tdma.c
drivers/dma/mpc512x_dma.c
drivers/dma/mxs-dma.c
drivers/dma/nbpfaxi.c [new file with mode: 0644]
drivers/dma/of-dma.c
drivers/dma/omap-dma.c
drivers/dma/pl330.c
drivers/dma/qcom_bam_dma.c
drivers/dma/s3c24xx-dma.c
drivers/dma/sa11x0-dma.c
drivers/dma/sh/Kconfig
drivers/dma/sh/Makefile
drivers/dma/sh/rcar-audmapp.c
drivers/dma/sh/shdma-arm.h
drivers/dma/sh/shdma-base.c
drivers/dma/sh/shdma.h
drivers/dma/sh/shdmac.c
drivers/dma/sirf-dma.c
drivers/dma/ste_dma40.c
drivers/dma/sun6i-dma.c [new file with mode: 0644]
drivers/dma/tegra20-apb-dma.c
drivers/edac/cell_edac.c
drivers/hwmon/adm1025.c
drivers/hwmon/adm1026.c
drivers/hwmon/ads1015.c
drivers/hwmon/asb100.c
drivers/hwmon/dme1737.c
drivers/hwmon/emc6w201.c
drivers/hwmon/hih6130.c
drivers/hwmon/lm87.c
drivers/hwmon/lm92.c
drivers/hwmon/pc87360.c
drivers/hwmon/tmp103.c
drivers/hwmon/vt1211.c
drivers/hwmon/w83627hf.c
drivers/hwmon/w83791d.c
drivers/hwmon/w83793.c
drivers/md/bcache/alloc.c
drivers/md/bcache/bcache.h
drivers/md/bcache/bset.c
drivers/md/bcache/bset.h
drivers/md/bcache/btree.c
drivers/md/bcache/btree.h
drivers/md/bcache/extents.c
drivers/md/bcache/extents.h
drivers/md/bcache/journal.c
drivers/md/bcache/request.c
drivers/md/bcache/super.c
drivers/md/bcache/util.h
drivers/md/bcache/writeback.c
drivers/md/bcache/writeback.h
drivers/md/dm-cache-metadata.c
drivers/md/dm-cache-metadata.h
drivers/md/dm-cache-target.c
drivers/md/dm-crypt.c
drivers/md/dm-io.c
drivers/md/dm-mpath.c
drivers/md/dm-switch.c
drivers/md/dm-table.c
drivers/md/dm-thin.c
drivers/md/dm.h
drivers/md/md.c
drivers/md/raid0.c
drivers/md/raid1.c
drivers/md/raid10.c
drivers/mmc/card/block.c
drivers/mmc/core/bus.c
drivers/mmc/core/core.c
drivers/mmc/core/mmc.c
drivers/mmc/core/quirks.c
drivers/mmc/core/sd_ops.c
drivers/mmc/host/Kconfig
drivers/mmc/host/Makefile
drivers/mmc/host/dw_mmc.c
drivers/mmc/host/dw_mmc.h
drivers/mmc/host/mmci.c
drivers/mmc/host/mmci.h
drivers/mmc/host/moxart-mmc.c
drivers/mmc/host/mxs-mmc.c
drivers/mmc/host/omap_hsmmc.c
drivers/mmc/host/s3cmci.c
drivers/mmc/host/s3cmci.h
drivers/mmc/host/sdhci-acpi.c
drivers/mmc/host/sdhci-msm.c
drivers/mmc/host/sdhci-pci.c
drivers/mmc/host/sdhci-pci.h
drivers/mmc/host/sdhci-pxav3.c
drivers/mmc/host/sdhci-st.c [new file with mode: 0644]
drivers/mmc/host/sdhci-tegra.c
drivers/mmc/host/sdhci.c
drivers/mmc/host/sh_mmcif.c
drivers/mmc/host/tmio_mmc_dma.c
drivers/mmc/host/wmt-sdmmc.c
drivers/mtd/ubi/block.c
drivers/mtd/ubi/vtbl.c
drivers/mtd/ubi/wl.c
drivers/net/arcnet/com20020_cs.c
drivers/net/ethernet/8390/Kconfig
drivers/net/ethernet/8390/axnet_cs.c
drivers/net/ethernet/8390/ne.c
drivers/net/ethernet/8390/pcnet_cs.c
drivers/net/ethernet/Kconfig
drivers/net/ethernet/Makefile
drivers/net/ethernet/amd/xgbe/xgbe-drv.c
drivers/net/ethernet/apm/Kconfig [new file with mode: 0644]
drivers/net/ethernet/apm/Makefile [new file with mode: 0644]
drivers/net/ethernet/apm/xgene/Kconfig [new file with mode: 0644]
drivers/net/ethernet/apm/xgene/Makefile [new file with mode: 0644]
drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c [new file with mode: 0644]
drivers/net/ethernet/apm/xgene/xgene_enet_hw.c [new file with mode: 0644]
drivers/net/ethernet/apm/xgene/xgene_enet_hw.h [new file with mode: 0644]
drivers/net/ethernet/apm/xgene/xgene_enet_main.c [new file with mode: 0644]
drivers/net/ethernet/apm/xgene/xgene_enet_main.h [new file with mode: 0644]
drivers/net/ethernet/broadcom/genet/bcmgenet.c
drivers/net/ethernet/broadcom/genet/bcmmii.c
drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c
drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h
drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
drivers/net/ethernet/davicom/dm9000.c
drivers/net/ethernet/freescale/fec.h
drivers/net/ethernet/freescale/fec_main.c
drivers/net/ethernet/freescale/fec_mpc52xx.c
drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
drivers/net/ethernet/freescale/gianfar.c
drivers/net/ethernet/freescale/ucc_geth.c
drivers/net/ethernet/fujitsu/fmvj18x_cs.c
drivers/net/ethernet/marvell/mvneta.c
drivers/net/ethernet/myricom/myri10ge/myri10ge.c
drivers/net/ethernet/qlogic/qlcnic/Makefile
drivers/net/ethernet/smsc/smsc911x.h
drivers/net/ethernet/ti/cpmac.c
drivers/net/ethernet/xilinx/ll_temac_main.c
drivers/net/ethernet/xilinx/xilinx_axienet_main.c
drivers/net/ethernet/xircom/xirc2ps_cs.c
drivers/net/wan/hdlc_fr.c
drivers/net/wan/wanxl.c
drivers/net/wireless/airo_cs.c
drivers/net/wireless/atmel.c
drivers/net/xen-netback/interface.c
drivers/net/xen-netback/netback.c
drivers/net/xen-netfront.c
drivers/of/Kconfig
drivers/of/Makefile
drivers/of/base.c
drivers/of/device.c
drivers/of/dynamic.c [new file with mode: 0644]
drivers/of/fdt.c
drivers/of/of_private.h
drivers/of/of_reserved_mem.c
drivers/of/platform.c
drivers/of/selftest.c
drivers/of/testcase-data/testcases.dts [new file with mode: 0644]
drivers/of/testcase-data/testcases.dtsi [deleted file]
drivers/pci/hotplug/rpaphp_core.c
drivers/thermal/Kconfig
drivers/thermal/Makefile
drivers/thermal/cpu_cooling.c
drivers/thermal/int3403_thermal.c
drivers/thermal/samsung/exynos_tmu.c
drivers/thermal/samsung/exynos_tmu.h
drivers/thermal/samsung/exynos_tmu_data.c
drivers/thermal/samsung/exynos_tmu_data.h
drivers/thermal/st/Kconfig [new file with mode: 0644]
drivers/thermal/st/Makefile [new file with mode: 0644]
drivers/thermal/st/st_thermal.c [new file with mode: 0644]
drivers/thermal/st/st_thermal.h [new file with mode: 0644]
drivers/thermal/st/st_thermal_memmap.c [new file with mode: 0644]
drivers/thermal/st/st_thermal_syscfg.c [new file with mode: 0644]
drivers/tty/ehv_bytechan.c
drivers/tty/hvc/hvc_opal.c
drivers/tty/hvc/hvc_vio.c
drivers/tty/serial/pmac_zilog.c
drivers/tty/serial/serial_core.c
drivers/vfio/Kconfig
drivers/vfio/Makefile
drivers/vfio/pci/vfio_pci.c
drivers/vfio/pci/vfio_pci_private.h
drivers/vfio/vfio_spapr_eeh.c
fs/Makefile
fs/bad_inode.c
fs/btrfs/inode.c
fs/btrfs/super.c
fs/ceph/acl.c
fs/ceph/caps.c
fs/ceph/file.c
fs/ceph/mds_client.c
fs/ceph/super.c
fs/ceph/xattr.c
fs/cifs/cifsfs.c
fs/cifs/cifsfs.h
fs/cifs/inode.c
fs/dcache.c
fs/direct-io.c
fs/ext2/super.c
fs/ext4/namei.c
fs/fs_pin.c [new file with mode: 0644]
fs/fuse/dir.c
fs/fuse/file.c
fs/hostfs/hostfs.h
fs/hostfs/hostfs_kern.c
fs/hostfs/hostfs_user.c
fs/internal.h
fs/mount.h
fs/namei.c
fs/namespace.c
fs/nfs/blocklayout/blocklayout.c
fs/nfs/callback.c
fs/nfs/client.c
fs/nfs/delegation.c
fs/nfs/delegation.h
fs/nfs/dir.c
fs/nfs/direct.c
fs/nfs/filelayout/filelayout.c
fs/nfs/filelayout/filelayoutdev.c
fs/nfs/getroot.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/nfs3acl.c
fs/nfs/nfs3proc.c
fs/nfs/nfs4_fs.h
fs/nfs/nfs4client.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4state.c
fs/nfs/nfs4trace.h
fs/nfs/nfs4xdr.c
fs/nfs/objlayout/objio_osd.c
fs/nfs/objlayout/objlayout.c
fs/nfs/objlayout/objlayout.h
fs/nfs/pagelist.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/proc.c
fs/nfs/read.c
fs/nfs/super.c
fs/nfs/write.c
fs/nfs_common/nfsacl.c
fs/nilfs2/super.c
fs/quota/dquot.c
fs/quota/kqid.c
fs/quota/netlink.c
fs/quota/quota.c
fs/reiserfs/do_balan.c
fs/reiserfs/journal.c
fs/reiserfs/lbalance.c
fs/reiserfs/reiserfs.h
fs/reiserfs/super.c
fs/super.c
fs/ubifs/commit.c
fs/ubifs/io.c
fs/ubifs/log.c
fs/ubifs/lpt.c
fs/ubifs/lpt_commit.c
fs/ubifs/master.c
fs/ubifs/orphan.c
fs/ubifs/recovery.c
fs/ubifs/sb.c
fs/ubifs/scan.c
fs/ubifs/super.c
fs/ubifs/tnc.c
fs/ubifs/tnc_commit.c
fs/ubifs/ubifs.h
fs/udf/file.c
fs/udf/lowlevel.c
fs/udf/super.c
fs/udf/symlink.c
fs/udf/unicode.c
fs/xfs/Kconfig
fs/xfs/Makefile
fs/xfs/libxfs/xfs_ag.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_alloc.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_alloc.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_alloc_btree.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_alloc_btree.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_attr.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_attr_leaf.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_attr_leaf.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_attr_remote.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_attr_remote.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_attr_sf.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_bit.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_bmap.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_bmap.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_bmap_btree.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_bmap_btree.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_btree.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_btree.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_cksum.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_da_btree.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_da_btree.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_da_format.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_da_format.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_dinode.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_dir2.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_dir2.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_dir2_block.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_dir2_data.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_dir2_leaf.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_dir2_node.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_dir2_priv.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_dir2_sf.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_dquot_buf.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_format.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_ialloc.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_ialloc.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_ialloc_btree.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_ialloc_btree.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_inode_buf.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_inode_buf.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_inode_fork.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_inode_fork.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_inum.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_log_format.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_log_recover.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_log_rlimit.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_quota_defs.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_rtbitmap.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_sb.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_sb.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_shared.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_symlink_remote.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_trans_resv.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_trans_resv.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_trans_space.h [new file with mode: 0644]
fs/xfs/xfs_acl.c
fs/xfs/xfs_ag.h [deleted file]
fs/xfs/xfs_alloc.c [deleted file]
fs/xfs/xfs_alloc.h [deleted file]
fs/xfs/xfs_alloc_btree.c [deleted file]
fs/xfs/xfs_alloc_btree.h [deleted file]
fs/xfs/xfs_aops.c
fs/xfs/xfs_attr.c [deleted file]
fs/xfs/xfs_attr_inactive.c
fs/xfs/xfs_attr_leaf.c [deleted file]
fs/xfs/xfs_attr_leaf.h [deleted file]
fs/xfs/xfs_attr_list.c
fs/xfs/xfs_attr_remote.c [deleted file]
fs/xfs/xfs_attr_remote.h [deleted file]
fs/xfs/xfs_attr_sf.h [deleted file]
fs/xfs/xfs_bit.h [deleted file]
fs/xfs/xfs_bmap.c [deleted file]
fs/xfs/xfs_bmap.h [deleted file]
fs/xfs/xfs_bmap_btree.c [deleted file]
fs/xfs/xfs_bmap_btree.h [deleted file]
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_btree.c [deleted file]
fs/xfs/xfs_btree.h [deleted file]
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
fs/xfs/xfs_buf_item.c
fs/xfs/xfs_cksum.h [deleted file]
fs/xfs/xfs_da_btree.c [deleted file]
fs/xfs/xfs_da_btree.h [deleted file]
fs/xfs/xfs_da_format.c [deleted file]
fs/xfs/xfs_da_format.h [deleted file]
fs/xfs/xfs_dinode.h [deleted file]
fs/xfs/xfs_dir2.c [deleted file]
fs/xfs/xfs_dir2.h [deleted file]
fs/xfs/xfs_dir2_block.c [deleted file]
fs/xfs/xfs_dir2_data.c [deleted file]
fs/xfs/xfs_dir2_leaf.c [deleted file]
fs/xfs/xfs_dir2_node.c [deleted file]
fs/xfs/xfs_dir2_priv.h [deleted file]
fs/xfs/xfs_dir2_readdir.c
fs/xfs/xfs_dir2_sf.c [deleted file]
fs/xfs/xfs_discard.c
fs/xfs/xfs_dquot.c
fs/xfs/xfs_dquot.h
fs/xfs/xfs_dquot_buf.c [deleted file]
fs/xfs/xfs_error.c
fs/xfs/xfs_error.h
fs/xfs/xfs_export.c
fs/xfs/xfs_extfree_item.c
fs/xfs/xfs_file.c
fs/xfs/xfs_filestream.c
fs/xfs/xfs_format.h [deleted file]
fs/xfs/xfs_fs.h
fs/xfs/xfs_fsops.c
fs/xfs/xfs_ialloc.c [deleted file]
fs/xfs/xfs_ialloc.h [deleted file]
fs/xfs/xfs_ialloc_btree.c [deleted file]
fs/xfs/xfs_ialloc_btree.h [deleted file]
fs/xfs/xfs_icache.c
fs/xfs/xfs_icache.h
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_buf.c [deleted file]
fs/xfs/xfs_inode_buf.h [deleted file]
fs/xfs/xfs_inode_fork.c [deleted file]
fs/xfs/xfs_inode_fork.h [deleted file]
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_inum.h [deleted file]
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_ioctl32.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iops.c
fs/xfs/xfs_itable.c
fs/xfs/xfs_itable.h
fs/xfs/xfs_linux.h
fs/xfs/xfs_log.c
fs/xfs/xfs_log_cil.c
fs/xfs/xfs_log_format.h [deleted file]
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_log_recover.h [deleted file]
fs/xfs/xfs_log_rlimit.c [deleted file]
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_mru_cache.c
fs/xfs/xfs_qm.c
fs/xfs/xfs_qm.h
fs/xfs/xfs_qm_bhv.c
fs/xfs/xfs_qm_syscalls.c
fs/xfs/xfs_quota_defs.h [deleted file]
fs/xfs/xfs_quotaops.c
fs/xfs/xfs_rtalloc.c
fs/xfs/xfs_rtalloc.h
fs/xfs/xfs_rtbitmap.c [deleted file]
fs/xfs/xfs_sb.c [deleted file]
fs/xfs/xfs_sb.h [deleted file]
fs/xfs/xfs_shared.h [deleted file]
fs/xfs/xfs_super.c
fs/xfs/xfs_super.h
fs/xfs/xfs_symlink.c
fs/xfs/xfs_symlink_remote.c [deleted file]
fs/xfs/xfs_sysfs.c [new file with mode: 0644]
fs/xfs/xfs_sysfs.h [new file with mode: 0644]
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans_ail.c
fs/xfs/xfs_trans_buf.c
fs/xfs/xfs_trans_dquot.c
fs/xfs/xfs_trans_resv.c [deleted file]
fs/xfs/xfs_trans_resv.h [deleted file]
fs/xfs/xfs_trans_space.h [deleted file]
fs/xfs/xfs_types.h
fs/xfs/xfs_vnode.h [deleted file]
fs/xfs/xfs_xattr.c
include/dt-bindings/dma/nbpfaxi.h [new file with mode: 0644]
include/linux/acct.h
include/linux/bio.h
include/linux/blkdev.h
include/linux/ceph/messenger.h
include/linux/ceph/osd_client.h
include/linux/dcache.h
include/linux/dmaengine.h
include/linux/drbd.h
include/linux/drbd_genl.h
include/linux/drbd_limits.h
include/linux/fs.h
include/linux/fs_pin.h [new file with mode: 0644]
include/linux/if_vlan.h
include/linux/mmc/dw_mmc.h
include/linux/mmc/sdhci.h
include/linux/mount.h
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
include/linux/nfs_page.h
include/linux/nfs_xdr.h
include/linux/of.h
include/linux/of_dma.h
include/linux/of_platform.h
include/linux/of_reserved_mem.h
include/linux/platform_data/dma-imx.h
include/linux/platform_data/edma.h
include/linux/platform_data/mmc-omap.h
include/linux/quota.h
include/linux/sh_dma.h
include/linux/skbuff.h
include/linux/sunrpc/auth.h
include/linux/sunrpc/auth_gss.h
include/linux/sunrpc/gss_krb5.h
include/linux/sunrpc/xprtrdma.h
include/linux/thermal.h
include/linux/uio.h
include/linux/vfio.h
include/scsi/sg.h
include/trace/events/bcache.h
include/uapi/linux/bsg.h
include/uapi/linux/virtio_blk.h
kernel/acct.c
lib/lru_cache.c
mm/filemap.c
mm/iov_iter.c
mm/shmem.c
net/6lowpan/Kconfig
net/8021q/vlan_core.c
net/batman-adv/multicast.c
net/bridge/br_vlan.c
net/bridge/netfilter/ebtables.c
net/ceph/messenger.c
net/ceph/osd_client.c
net/core/dev.c
net/core/rtnetlink.c
net/core/skbuff.c
net/ipv4/route.c
net/netfilter/core.c
net/netfilter/ipvs/ip_vs_ctl.c
net/netfilter/nf_sockopt.c
net/netfilter/nf_tables_api.c
net/netfilter/x_tables.c
net/netlink/af_netlink.c
net/openvswitch/datapath.c
net/sunrpc/addr.c
net/sunrpc/auth.c
net/sunrpc/auth_generic.c
net/sunrpc/auth_gss/auth_gss.c
net/sunrpc/auth_gss/gss_krb5_crypto.c
net/sunrpc/auth_gss/gss_krb5_seal.c
net/sunrpc/auth_gss/gss_krb5_wrap.c
net/sunrpc/auth_null.c
net/sunrpc/clnt.c
net/sunrpc/rpc_pipe.c
net/sunrpc/xprt.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h
net/sunrpc/xprtsock.c
sound/ppc/pmac.c

index 501adc2a9ec723841c5246b19875a9bc14471ada..2ddd680929d8f83dc2147592978b37a6e33dc9a5 100644 (file)
@@ -94,5 +94,5 @@ current_snap
 
 parent
 
-       Information identifying the pool, image, and snapshot id for
-       the parent image in a layered rbd image (format 2 only).
+       Information identifying the chain of parent images in a layered rbd
+       image.  Entries are separated by empty lines.
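
As an illustration, reading the attribute for an image with a two-level
parent chain might look as follows (the device number, field names and
values here are hypothetical; the exact keys are defined by the rbd
driver):

        $ cat /sys/bus/rbd/devices/2/parent
        pool_id 1
        pool_name rbd
        image_id 10052ae8944a
        image_name parent-image
        snap_id 4
        snap_name base

        pool_id 1
        pool_name rbd
        image_id 6ee3ab46bc1d
        image_name grandparent-image
        snap_id 2
        snap_name base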
diff --git a/Documentation/ABI/testing/sysfs-fs-xfs b/Documentation/ABI/testing/sysfs-fs-xfs
new file mode 100644 (file)
index 0000000..ea0cc8c
--- /dev/null
@@ -0,0 +1,39 @@
+What:          /sys/fs/xfs/<disk>/log/log_head_lsn
+Date:          July 2014
+KernelVersion: 3.17
+Contact:       xfs@oss.sgi.com
+Description:
+               The log sequence number (LSN) of the current head of the
+               log. The LSN is exported in "cycle:basic block" format.
+Users:         xfstests
+
+What:          /sys/fs/xfs/<disk>/log/log_tail_lsn
+Date:          July 2014
+KernelVersion: 3.17
+Contact:       xfs@oss.sgi.com
+Description:
+               The log sequence number (LSN) of the current tail of the
+               log. The LSN is exported in "cycle:basic block" format.
+
+What:          /sys/fs/xfs/<disk>/log/reserve_grant_head
+Date:          July 2014
+KernelVersion: 3.17
+Contact:       xfs@oss.sgi.com
+Description:
+               The current state of the log reserve grant head. It
+               represents the total log reservation of all currently
+               outstanding transactions. The grant head is exported in
+               "cycle:bytes" format.
+Users:         xfstests
+
+What:          /sys/fs/xfs/<disk>/log/write_grant_head
+Date:          July 2014
+KernelVersion: 3.17
+Contact:       xfs@oss.sgi.com
+Description:
+               The current state of the log write grant head. It
+               represents the total log reservation of all currently
+               outstanding transactions, including regrants due to
+               rolling transactions. The grant head is exported in
+               "cycle:bytes" format.
+Users:         xfstests
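
For illustration, these attributes can be read directly from sysfs (the
device name and the values shown are hypothetical; the formats are the
"cycle:basic block" and "cycle:bytes" encodings described above):

        $ cat /sys/fs/xfs/sda1/log/log_head_lsn
        1:512
        $ cat /sys/fs/xfs/sda1/log/reserve_grant_head
        1:8832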
index 2fa749387be807ade453ba645fbcf8250afffb14..8897d04948384289b3fca54801be9676c15ce0e5 100644 (file)
@@ -106,6 +106,11 @@ which paths.
     The path number in the range 0 ... (<num_paths> - 1).
     Expressed in hexadecimal (WITHOUT any prefix like 0x).
 
+R<n>,<m>
+    This parameter allows repetitive patterns to be loaded quickly. <n> and <m>
+    are hexadecimal numbers. The last <n> mappings are repeated in the next <m>
+    slots.
+
 Status
 ======
 
@@ -124,3 +129,10 @@ Create a switch device with 64kB region size:
 Set mappings for the first 7 entries to point to devices switch0, switch1,
 switch2, switch0, switch1, switch2, switch1:
     dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1
+
+Set repetitive mapping. This command:
+    dmsetup message switch 0 set_region_mappings 1000:1 :2 R2,10
+is equivalent to:
+    dmsetup message switch 0 set_region_mappings 1000:1 :2 :1 :2 :1 :2 :1 :2 \
+       :1 :2 :1 :2 :1 :2 :1 :2 :1 :2
+
index e577196a12c0577821e68b4aa2fa6d0d9ef6bde3..4659fd952301b1b9967e74afb7cbbda5a62ee46c 100644 (file)
@@ -47,6 +47,7 @@ The full ID of peripheral types can be found below.
        20      ASRC
        21      ESAI
        22      SSI Dual FIFO   (needs firmware ver >= 2)
+       23      Shared ASRC
 
 The third cell specifies the transfer priority as below.
 
diff --git a/Documentation/devicetree/bindings/dma/mpc512x-dma.txt b/Documentation/devicetree/bindings/dma/mpc512x-dma.txt
new file mode 100644 (file)
index 0000000..a6511df
--- /dev/null
@@ -0,0 +1,29 @@
+* Freescale MPC512x and MPC8308 DMA Controller
+
+The DMA controller in Freescale MPC512x and MPC8308 SoCs can move
+blocks of memory contents between memory and peripherals or
+from memory to memory.
+
+Refer to "Generic DMA Controller and DMA request bindings" in
+the dma/dma.txt file for a more detailed description of the binding.
+
+Required properties:
+- compatible: should be "fsl,mpc5121-dma" or "fsl,mpc8308-dma";
+- reg: should contain the DMA controller registers location and length;
+- interrupts: interrupt for the DMA controller. The syntax of the interrupt
+       client node is described in the interrupt-controller/interrupts.txt file.
+- #dma-cells: the length of the DMA specifier, must be <1>.
+       Each channel of this DMA controller has a peripheral request line,
+       the assignment is fixed in hardware. This one cell
+       in dmas property of a client device represents the channel number.
+
+Example:
+
+       dma0: dma@14000 {
+               compatible = "fsl,mpc5121-dma";
+               reg = <0x14000 0x1800>;
+               interrupts = <65 0x8>;
+               #dma-cells = <1>;
+       };
+
+DMA clients must use the format described in dma/dma.txt file.
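
For instance, a client node would reference the controller with a
one-cell specifier carrying the channel number (the client node and
channel number below are hypothetical):

        psc0: psc@11000 {
                /* ... */
                dmas = <&dma0 14>;
                dma-names = "rx-tx";
        };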
diff --git a/Documentation/devicetree/bindings/dma/nbpfaxi.txt b/Documentation/devicetree/bindings/dma/nbpfaxi.txt
new file mode 100644 (file)
index 0000000..d5e2522
--- /dev/null
@@ -0,0 +1,61 @@
+* Renesas "Type-AXI" NBPFAXI* DMA controllers
+
+* DMA controller
+
+Required properties
+
+- compatible:  must be one of
+               "renesas,nbpfaxi64dmac1b4"
+               "renesas,nbpfaxi64dmac1b8"
+               "renesas,nbpfaxi64dmac1b16"
+               "renesas,nbpfaxi64dmac4b4"
+               "renesas,nbpfaxi64dmac4b8"
+               "renesas,nbpfaxi64dmac4b16"
+               "renesas,nbpfaxi64dmac8b4"
+               "renesas,nbpfaxi64dmac8b8"
+               "renesas,nbpfaxi64dmac8b16"
+- #dma-cells:  must be 2: the first integer is a terminal number, to which this
+               slave is connected, the second one is flags. Flags is a bitmask
+               with the following bits defined:
+
+#define NBPF_SLAVE_RQ_HIGH     1
+#define NBPF_SLAVE_RQ_LOW      2
+#define NBPF_SLAVE_RQ_LEVEL    4
+
+Optional properties:
+
+You can use dma-channels and dma-requests as described in dma.txt, although they
+won't be used; this information is derived from the compatible string.
+
+Example:
+
+       dma: dma-controller@48000000 {
+               compatible = "renesas,nbpfaxi64dmac8b4";
+               reg = <0x48000000 0x400>;
+               interrupts = <0 12 0x4
+                             0 13 0x4
+                             0 14 0x4
+                             0 15 0x4
+                             0 16 0x4
+                             0 17 0x4
+                             0 18 0x4
+                             0 19 0x4>;
+               #dma-cells = <2>;
+               dma-channels = <8>;
+               dma-requests = <8>;
+       };
+
+* DMA client
+
+Required properties:
+
+dmas and dma-names are required, as described in dma.txt.
+
+Example:
+
+#include <dt-bindings/dma/nbpfaxi.h>
+
+...
+               dmas = <&dma 0 (NBPF_SLAVE_RQ_HIGH | NBPF_SLAVE_RQ_LEVEL)
+                       &dma 1 (NBPF_SLAVE_RQ_HIGH | NBPF_SLAVE_RQ_LEVEL)>;
+               dma-names = "rx", "tx";
diff --git a/Documentation/devicetree/bindings/dma/rcar-audmapp.txt b/Documentation/devicetree/bindings/dma/rcar-audmapp.txt
new file mode 100644 (file)
index 0000000..9f1d750
--- /dev/null
@@ -0,0 +1,29 @@
+* R-Car Audio DMAC peri peri Device Tree bindings
+
+Required properties:
+- compatible:  should be "renesas,rcar-audmapp"
+- #dma-cells:  should be <1>, see "dmas" property below
+
+Example:
+       audmapp: audio-dma-pp@0xec740000 {
+               compatible = "renesas,rcar-audmapp";
+               #dma-cells = <1>;
+
+               reg = <0 0xec740000 0 0x200>;
+       };
+
+
+* DMA client
+
+Required properties:
+- dmas:                a list of <[DMA multiplexer phandle] [SRS/DRS value]> pairs,
+               where SRS/DRS values are fixed handles, specified in the SoC
+               manual as the value that would be written into the PDMACHCR.
+- dma-names:   a list of DMA channel names, one per "dmas" entry
+
+Example:
+
+       dmas = <&audmapp 0x2d00
+               &audmapp 0x3700>;
+       dma-names =  "src0_ssiu0",
+                    "dvc0_ssiu0";
diff --git a/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt b/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt
new file mode 100644 (file)
index 0000000..df0f48b
--- /dev/null
@@ -0,0 +1,98 @@
+* Renesas R-Car DMA Controller Device Tree bindings
+
+Renesas R-Car Generation 2 SoCs have multiple multi-channel DMA
+controller instances named DMAC capable of serving multiple clients. Channels
+can be dedicated to specific clients or shared between a large number of
+clients.
+
+DMA clients are connected to the DMAC ports referenced by an 8-bit identifier
+called MID/RID.
+
+Each DMA client is connected to one dedicated port of the DMAC, identified by
+an 8-bit port number called the MID/RID. A DMA controller can thus serve up to
+256 clients in total. When the number of hardware channels is lower than the
+number of clients to be served, channels must be shared between multiple DMA
+clients. The association of DMA clients to DMAC channels is fully dynamic and
+not described in these device tree bindings.
+
+Required Properties:
+
+- compatible: must contain "renesas,rcar-dmac"
+
+- reg: base address and length of the registers block for the DMAC
+
+- interrupts: interrupt specifiers for the DMAC, one for each entry in
+  interrupt-names.
+- interrupt-names: one entry for the error interrupt, named "error", plus one
+  entry per channel, named "ch%u", where %u is the channel number ranging
+  from zero to the number of channels minus one.
+
+- clocks: a list of phandle + clock-specifier pairs, one for each entry
+  in clock-names.
+- clock-names: must contain "fck" for the functional clock.
+
+- #dma-cells: must be <1>, the cell specifies the MID/RID of the DMAC port
+  connected to the DMA client
+- dma-channels: number of DMA channels
+
+Example: R8A7790 (R-Car H2) SYS-DMACs
+
+       dmac0: dma-controller@e6700000 {
+               compatible = "renesas,rcar-dmac";
+               reg = <0 0xe6700000 0 0x20000>;
+               interrupts = <0 197 IRQ_TYPE_LEVEL_HIGH
+                             0 200 IRQ_TYPE_LEVEL_HIGH
+                             0 201 IRQ_TYPE_LEVEL_HIGH
+                             0 202 IRQ_TYPE_LEVEL_HIGH
+                             0 203 IRQ_TYPE_LEVEL_HIGH
+                             0 204 IRQ_TYPE_LEVEL_HIGH
+                             0 205 IRQ_TYPE_LEVEL_HIGH
+                             0 206 IRQ_TYPE_LEVEL_HIGH
+                             0 207 IRQ_TYPE_LEVEL_HIGH
+                             0 208 IRQ_TYPE_LEVEL_HIGH
+                             0 209 IRQ_TYPE_LEVEL_HIGH
+                             0 210 IRQ_TYPE_LEVEL_HIGH
+                             0 211 IRQ_TYPE_LEVEL_HIGH
+                             0 212 IRQ_TYPE_LEVEL_HIGH
+                             0 213 IRQ_TYPE_LEVEL_HIGH
+                             0 214 IRQ_TYPE_LEVEL_HIGH>;
+               interrupt-names = "error",
+                               "ch0", "ch1", "ch2", "ch3",
+                               "ch4", "ch5", "ch6", "ch7",
+                               "ch8", "ch9", "ch10", "ch11",
+                               "ch12", "ch13", "ch14";
+               clocks = <&mstp2_clks R8A7790_CLK_SYS_DMAC0>;
+               clock-names = "fck";
+               #dma-cells = <1>;
+               dma-channels = <15>;
+       };
+
+       dmac1: dma-controller@e6720000 {
+               compatible = "renesas,rcar-dmac";
+               reg = <0 0xe6720000 0 0x20000>;
+               interrupts = <0 220 IRQ_TYPE_LEVEL_HIGH
+                             0 216 IRQ_TYPE_LEVEL_HIGH
+                             0 217 IRQ_TYPE_LEVEL_HIGH
+                             0 218 IRQ_TYPE_LEVEL_HIGH
+                             0 219 IRQ_TYPE_LEVEL_HIGH
+                             0 308 IRQ_TYPE_LEVEL_HIGH
+                             0 309 IRQ_TYPE_LEVEL_HIGH
+                             0 310 IRQ_TYPE_LEVEL_HIGH
+                             0 311 IRQ_TYPE_LEVEL_HIGH
+                             0 312 IRQ_TYPE_LEVEL_HIGH
+                             0 313 IRQ_TYPE_LEVEL_HIGH
+                             0 314 IRQ_TYPE_LEVEL_HIGH
+                             0 315 IRQ_TYPE_LEVEL_HIGH
+                             0 316 IRQ_TYPE_LEVEL_HIGH
+                             0 317 IRQ_TYPE_LEVEL_HIGH
+                             0 318 IRQ_TYPE_LEVEL_HIGH>;
+               interrupt-names = "error",
+                               "ch0", "ch1", "ch2", "ch3",
+                               "ch4", "ch5", "ch6", "ch7",
+                               "ch8", "ch9", "ch10", "ch11",
+                               "ch12", "ch13", "ch14";
+               clocks = <&mstp2_clks R8A7790_CLK_SYS_DMAC1>;
+               clock-names = "fck";
+               #dma-cells = <1>;
+               dma-channels = <15>;
+       };
index 1f5729f106216ecb079af2c4912fd61e20098585..95800ab37bb00cce3fdfe2278625e6ba47cdec69 100644 (file)
@@ -35,9 +35,11 @@ Required properties:
 
 Each dmas request consists of 4 cells:
   1. A phandle pointing to the DMA controller
-  2. Device Type
+  2. Device signal number, the signal line for single and burst requests
+     connected from the device to the DMA40 engine
   3. The DMA request line number (only when 'use fixed channel' is set)
-  4. A 32bit mask specifying; mode, direction and endianness [NB: This list will grow]
+  4. A 32bit mask specifying; mode, direction and endianness
+     [NB: This list will grow]
         0x00000001: Mode:
                 Logical channel when unset
                 Physical channel when set
@@ -54,6 +56,74 @@ Each dmas request consists of 4 cells:
                 Normal priority when unset
                 High priority when set
 
+Existing signal numbers for the DB8500 ASIC. Unless specified, the signals are
+bidirectional, i.e. the same for RX and TX operations:
+
+0:  SPI controller 0
+1:  SD/MMC controller 0 (unused)
+2:  SD/MMC controller 1 (unused)
+3:  SD/MMC controller 2 (unused)
+4:  I2C port 1
+5:  I2C port 3
+6:  I2C port 2
+7:  I2C port 4
+8:  Synchronous Serial Port SSP0
+9:  Synchronous Serial Port SSP1
+10: Multi-Channel Display Engine MCDE RX
+11: UART port 2
+12: UART port 1
+13: UART port 0
+14: Multirate Serial Port MSP2
+15: I2C port 0
+16: USB OTG in/out endpoints 7 & 15
+17: USB OTG in/out endpoints 6 & 14
+18: USB OTG in/out endpoints 5 & 13
+19: USB OTG in/out endpoints 4 & 12
+20: SLIMbus or HSI channel 0
+21: SLIMbus or HSI channel 1
+22: SLIMbus or HSI channel 2
+23: SLIMbus or HSI channel 3
+24: Multimedia DSP SXA0
+25: Multimedia DSP SXA1
+26: Multimedia DSP SXA2
+27: Multimedia DSP SXA3
+28: SD/MM controller 2
+29: SD/MM controller 0
+30: MSP port 1 on DB8500 v1, MSP port 3 on DB8500 v2
+31: MSP port 0 or SLIMbus channel 0
+32: SD/MM controller 1
+33: SPI controller 2
+34: i2c3 RX2 TX2
+35: SPI controller 1
+36: USB OTG in/out endpoints 3 & 11
+37: USB OTG in/out endpoints 2 & 10
+38: USB OTG in/out endpoints 1 & 9
+39: USB OTG in/out endpoints 8
+40: SPI controller 3
+41: SD/MM controller 3
+42: SD/MM controller 4
+43: SD/MM controller 5
+44: Multimedia DSP SXA4
+45: Multimedia DSP SXA5
+46: SLIMbus channel 8 or Multimedia DSP SXA6
+47: SLIMbus channel 9 or Multimedia DSP SXA7
+48: Crypto Accelerator 1
+49: Crypto Accelerator 1 TX or Hash Accelerator 1 TX
+50: Hash Accelerator 1 TX
+51: memcpy TX (to be used by the DMA driver for memcpy operations)
+52: SLIMbus or HSI channel 4
+53: SLIMbus or HSI channel 5
+54: SLIMbus or HSI channel 6
+55: SLIMbus or HSI channel 7
+56: memcpy (to be used by the DMA driver for memcpy operations)
+57: memcpy (to be used by the DMA driver for memcpy operations)
+58: memcpy (to be used by the DMA driver for memcpy operations)
+59: memcpy (to be used by the DMA driver for memcpy operations)
+60: memcpy (to be used by the DMA driver for memcpy operations)
+61: Crypto Accelerator 0
+62: Crypto Accelerator 0 TX or Hash Accelerator 0 TX
+63: Hash Accelerator 0 TX
+
 Example:
 
        uart@80120000 {
diff --git a/Documentation/devicetree/bindings/dma/sun6i-dma.txt b/Documentation/devicetree/bindings/dma/sun6i-dma.txt
new file mode 100644 (file)
index 0000000..3e145c1
--- /dev/null
@@ -0,0 +1,45 @@
+Allwinner A31 DMA Controller
+
+This driver follows the generic DMA bindings defined in dma.txt.
+
+Required properties:
+
+- compatible:  Must be "allwinner,sun6i-a31-dma"
+- reg:         Should contain the registers base address and length
+- interrupts:  Should contain a reference to the interrupt used by this device
+- clocks:      Should contain a reference to the parent AHB clock
+- resets:      Should contain a reference to the reset controller asserting
+               this device in reset
+- #dma-cells : Should be 1, a single cell holding a line request number
+
+Example:
+       dma: dma-controller@01c02000 {
+               compatible = "allwinner,sun6i-a31-dma";
+               reg = <0x01c02000 0x1000>;
+               interrupts = <0 50 4>;
+               clocks = <&ahb1_gates 6>;
+               resets = <&ahb1_rst 6>;
+               #dma-cells = <1>;
+       };
+
+Clients:
+
+DMA clients connected to the A31 DMA controller must use the format
+described in the dma.txt file, using a two-cell specifier for each
+channel: a phandle plus one integer cell.
+The two cells in order are:
+
+1. A phandle pointing to the DMA controller.
+2. The port ID as specified in the datasheet
+
+Example:
+spi2: spi@01c6a000 {
+       compatible = "allwinner,sun6i-a31-spi";
+       reg = <0x01c6a000 0x1000>;
+       interrupts = <0 67 4>;
+       clocks = <&ahb1_gates 22>, <&spi2_clk>;
+       clock-names = "ahb", "mod";
+       dmas = <&dma 25>, <&dma 25>;
+       dma-names = "rx", "tx";
+       resets = <&ahb1_rst 22>;
+};
index 532b1d440abc15d1f1d1e61791b274a6ec8dafe0..6cd3525d0e09514acedaf89c1c1abeeffad7cf95 100644 (file)
@@ -46,13 +46,14 @@ Required Properties:
       - if CIU clock divider value is 0 (that is divide by 1), both tx and rx
         phase shift clocks should be 0.
 
-Required properties for a slot:
+Required properties for a slot (Deprecated - Recommend to use one slot per host):
 
 * gpios: specifies a list of gpios used for command, clock and data bus. The
   first gpio is the command line and the second gpio is the clock line. The
   rest of the gpios (depending on the bus-width property) are the data lines in
   no particular order. The format of the gpio specifier depends on the gpio
   controller.
+(Deprecated - Refer to Documentation/devicetree/bindings/pinctrl/samsung-pinctrl.txt)
 
 Example:
 
@@ -69,21 +70,13 @@ Example:
 
        dwmmc0@12200000 {
                num-slots = <1>;
-               supports-highspeed;
+               cap-mmc-highspeed;
+               cap-sd-highspeed;
                broken-cd;
                fifo-depth = <0x80>;
                card-detect-delay = <200>;
                samsung,dw-mshc-ciu-div = <3>;
                samsung,dw-mshc-sdr-timing = <2 3>;
                samsung,dw-mshc-ddr-timing = <1 2>;
-
-               slot@0 {
-                       reg = <0>;
-                       bus-width = <8>;
-                       gpios = <&gpc0 0 2 0 3>, <&gpc0 1 2 0 3>,
-                               <&gpc1 0 2 3 3>, <&gpc1 1 2 3 3>,
-                               <&gpc1 2 2 3 3>, <&gpc1 3 2 3 3>,
-                               <&gpc0 3 2 3 3>, <&gpc0 4 2 3 3>,
-                               <&gpc0 5 2 3 3>, <&gpc0 6 2 3 3>;
-               };
+               bus-width = <8>;
        };
index e5bc49f764d10a982efe9a8a92f9327b19f8e575..3b3544931437accded12ada0ec38c786441d1a02 100644 (file)
@@ -34,13 +34,11 @@ Example:
                num-slots = <1>;
                vmmc-supply = <&ldo12>;
                fifo-depth = <0x100>;
-               supports-highspeed;
                pinctrl-names = "default";
                pinctrl-0 = <&sd_pmx_pins &sd_cfg_func1 &sd_cfg_func2>;
-               slot@0 {
-                       reg = <0>;
-                       bus-width = <4>;
-                       disable-wp;
-                       cd-gpios = <&gpio10 3 0>;
-               };
+               bus-width = <4>;
+               disable-wp;
+               cd-gpios = <&gpio10 3 0>;
+               cap-mmc-highspeed;
+               cap-sd-highspeed;
        };
index 3c18001dfd5d75fe91038926fd6da8f98637fe88..431716e37a3964638245b2905badff16128b9495 100644 (file)
@@ -34,8 +34,8 @@ Optional properties:
 - cap-power-off-card: powering off the card is safe
 - cap-sdio-irq: enable SDIO IRQ signalling on this interface
 - full-pwr-cycle: full power cycle of the card is supported
-- mmc-highspeed-ddr-1_8v: eMMC high-speed DDR mode(1.8V I/O) is supported
-- mmc-highspeed-ddr-1_2v: eMMC high-speed DDR mode(1.2V I/O) is supported
+- mmc-ddr-1_8v: eMMC high-speed DDR mode(1.8V I/O) is supported
+- mmc-ddr-1_2v: eMMC high-speed DDR mode(1.2V I/O) is supported
 - mmc-hs200-1_8v: eMMC HS200 mode(1.8V I/O) is supported
 - mmc-hs200-1_2v: eMMC HS200 mode(1.2V I/O) is supported
 - mmc-hs400-1_8v: eMMC HS400 mode(1.8V I/O) is supported
diff --git a/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt b/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt
new file mode 100644 (file)
index 0000000..299081f
--- /dev/null
@@ -0,0 +1,32 @@
+* Renesas Multi Media Card Interface (MMCIF) Controller
+
+This file documents differences between the core properties in mmc.txt
+and the properties used by the MMCIF device.
+
+
+Required properties:
+
+- compatible: must contain one of the following
+       - "renesas,mmcif-r8a7740" for the MMCIF found in r8a7740 SoCs
+       - "renesas,mmcif-r8a7790" for the MMCIF found in r8a7790 SoCs
+       - "renesas,mmcif-r8a7791" for the MMCIF found in r8a7791 SoCs
+       - "renesas,sh-mmcif" for the generic MMCIF
+
+- clocks: reference to the functional clock
+
+- dmas: reference to the DMA channels, one per channel name listed in the
+  dma-names property.
+- dma-names: must contain "tx" for the transmit DMA channel and "rx" for the
+  receive DMA channel.
+
+
+Example: R8A7790 (R-Car H2) MMCIF0
+
+       mmcif0: mmc@ee200000 {
+               compatible = "renesas,mmcif-r8a7790", "renesas,sh-mmcif";
+               reg = <0 0xee200000 0 0x80>;
+               interrupts = <0 169 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&mstp3_clks R8A7790_CLK_MMCIF0>;
+               dmas = <&dmac0 0xd1>, <&dmac0 0xd2>;
+               dma-names = "tx", "rx";
+       };
index 81b33b5b20fc07fa1ffeaf3e61e48dfb8263f555..485483a63d8ce3f44a82d30d56c283c8f1034dff 100644 (file)
@@ -27,8 +27,8 @@ Example:
                bus-width = <8>;
                non-removable;
 
-               vmmc = <&pm8941_l20>;
-               vqmmc = <&pm8941_s3>;
+               vmmc-supply = <&pm8941_l20>;
+               vqmmc-supply = <&pm8941_s3>;
 
                pinctrl-names = "default";
                pinctrl-0 = <&sdc1_clk &sdc1_cmd &sdc1_data>;
@@ -44,8 +44,8 @@ Example:
                bus-width = <4>;
                cd-gpios = <&msmgpio 62 0x1>;
 
-               vmmc = <&pm8941_l21>;
-               vqmmc = <&pm8941_l13>;
+               vmmc-supply = <&pm8941_l21>;
+               vqmmc-supply = <&pm8941_l13>;
 
                pinctrl-names = "default";
                pinctrl-0 = <&sdc2_clk &sdc2_cmd &sdc2_data>;
diff --git a/Documentation/devicetree/bindings/mmc/sdhci-st.txt b/Documentation/devicetree/bindings/mmc/sdhci-st.txt
new file mode 100644 (file)
index 0000000..7527db4
--- /dev/null
@@ -0,0 +1,33 @@
+* STMicroelectronics sdhci-st MMC/SD controller
+
+This file documents the differences between the core properties in
+Documentation/devicetree/bindings/mmc/mmc.txt and the properties
+used by the sdhci-st driver.
+
+Required properties:
+- compatible :  Must be "st,sdhci"
+- clock-names : Should be "mmc"
+                See: Documentation/devicetree/bindings/resource-names.txt
+- clocks :      Phandle of the clock used by the sdhci controller
+                See: Documentation/devicetree/bindings/clock/clock-bindings.txt
+
+Optional properties:
+- non-removable: non-removable slot
+                 See: Documentation/devicetree/bindings/mmc/mmc.txt
+- bus-width: Number of data lines
+                 See: Documentation/devicetree/bindings/mmc/mmc.txt
+
+Example:
+
+mmc0: sdhci@fe81e000 {
+       compatible      = "st,sdhci";
+       status          = "disabled";
+       reg             = <0xfe81e000 0x1000>;
+       interrupts      = <GIC_SPI 127 IRQ_TYPE_NONE>;
+       interrupt-names = "mmcirq";
+       pinctrl-names   = "default";
+       pinctrl-0       = <&pinctrl_mmc0>;
+       clock-names     = "mmc";
+       clocks          = <&clk_s_a1_ls 1>;
+       bus-width       = <8>;
+};
index 2d4a7258a10db9d2c30dc808bb7b5ad5e74fa994..346c6095a6155138ad01f2d39374a8ed97224d12 100644 (file)
@@ -67,7 +67,8 @@ Optional properties:
 * card-detect-delay: Delay in milli-seconds before detecting card after card
   insert event. The default value is 0.
 
-* supports-highspeed: Enables support for high speed cards (up to 50MHz)
+* supports-highspeed (DEPRECATED): Enables support for high speed cards (up to 50MHz)
+                          (use "cap-mmc-highspeed" or "cap-sd-highspeed" instead)
 
 * broken-cd: as documented in mmc core bindings.
 
@@ -98,14 +99,11 @@ board specific portions as listed below.
                clock-frequency = <400000000>;
                clock-freq-min-max = <400000 200000000>;
                num-slots = <1>;
-               supports-highspeed;
                broken-cd;
                fifo-depth = <0x80>;
                card-detect-delay = <200>;
                vmmc-supply = <&buck8>;
-
-               slot@0 {
-                       reg = <0>;
-                       bus-width = <8>;
-               };
+               bus-width = <8>;
+               cap-mmc-highspeed;
+               cap-sd-highspeed;
        };
index ce8056116fb0bd9c281ec4a2d60844e9eb551953..76bf087bc8898fc82f9b7d48c94cce498a85be50 100644 (file)
@@ -12,6 +12,7 @@ Required properties:
  Should be "ti,omap3-hsmmc", for OMAP3 controllers
  Should be "ti,omap3-pre-es3-hsmmc" for OMAP3 controllers pre ES3.0
  Should be "ti,omap4-hsmmc", for OMAP4 controllers
+ Should be "ti,am33xx-hsmmc", for AM335x controllers
 - ti,hwmods: Must be "mmc<n>", n is controller instance starting 1
 
 Optional properties:
@@ -56,3 +57,56 @@ Examples:
                        &edma 25>;
                dma-names = "tx", "rx";
        };
+
+[workaround for missing swakeup on am33xx]
+
+This SoC is missing the swakeup line, so it will not detect an SDIO irq
+while in suspend.
+
+                             ------
+                             | PRCM |
+                              ------
+                               ^ |
+                       swakeup | | fclk
+                               | v
+       ------                -------               -----
+      | card | -- CIRQ -->  | hsmmc | -- IRQ -->  | CPU |
+       ------                -------               -----
+
+In suspend the fclk is off and the module is non-functional. Even register
+reads will fail. Logic in the host will request an fclk restore when an
+external event is detected. Once the clock is restored, the host detects the
+event normally. Since the am33xx doesn't have this line, it never wakes from
+suspend.
+
+The workaround is to reconfigure the dat1 line as a GPIO upon suspend. To make
+this work, we need to set the named pinctrl states "default" and "idle".
+Prepare idle to remux dat1 as a gpio, and default to remux it back as sdio
+dat1. The MMC driver will then toggle between idle and default state during
+runtime.
+
+In summary:
+1. select matching 'compatible' section, see example below.
+2. specify pinctrl states "default" and "idle", "sleep" is optional.
+3. specify the gpio irq used for detecting sdio irq in suspend
+
+If the configuration is incomplete, a "falling back to polling" warning message
+is emitted. Also check the "sdio irq mode" entry in /sys/kernel/debug/mmc0/regs.
+Mind that not every application needs SDIO irq, e.g. MMC cards.
+
+       mmc1: mmc@48060100 {
+               compatible = "ti,am33xx-hsmmc";
+               ...
+               pinctrl-names = "default", "idle", "sleep";
+               pinctrl-0 = <&mmc1_pins>;
+               pinctrl-1 = <&mmc1_idle>;
+               pinctrl-2 = <&mmc1_sleep>;
+               ...
+               interrupts-extended = <&intc 64 &gpio2 28 0>;
+       };
+
+       mmc1_idle : pinmux_cirq_pin {
+               pinctrl-single,pins = <
+                       0x0f8 0x3f      /* GPIO2_28 */
+               >;
+       };
index 6a2a1160a70defdbac92be8850152f1c4448cbde..fa0f327cde01417339f8e3618f075e6ef51db12e 100644 (file)
@@ -18,6 +18,7 @@ Required properties:
                "renesas,sdhi-r8a7778" - SDHI IP on R8A7778 SoC
                "renesas,sdhi-r8a7779" - SDHI IP on R8A7779 SoC
                "renesas,sdhi-r8a7790" - SDHI IP on R8A7790 SoC
+               "renesas,sdhi-r8a7791" - SDHI IP on R8A7791 SoC
 
 Optional properties:
 - toshiba,mmc-wrprotect-disable: write-protect detection is unavailable
diff --git a/Documentation/devicetree/bindings/net/apm-xgene-enet.txt b/Documentation/devicetree/bindings/net/apm-xgene-enet.txt
new file mode 100644 (file)
index 0000000..ebcad25
--- /dev/null
@@ -0,0 +1,66 @@
+APM X-Gene SoC Ethernet nodes
+
+Ethernet nodes are defined to describe on-chip ethernet interfaces in
+the APM X-Gene SoC.
+
+Required properties:
+- compatible: Should be "apm,xgene-enet"
+- reg: Address and length of the register set for the device. It contains the
+  information of registers in the same order as described by reg-names
+- reg-names: Should contain the register set names
+  - "enet_csr": Ethernet control and status register address space
+  - "ring_csr": Descriptor ring control and status register address space
+  - "ring_cmd": Descriptor ring command register address space
+- interrupts: Ethernet main interrupt
+- clocks: Reference to the clock entry.
+- local-mac-address: MAC address assigned to this device
+- phy-connection-type: Interface type between ethernet device and PHY device
+- phy-handle: Reference to a PHY node connected to this device
+
+- mdio: Device tree subnode with the following required properties:
+  - compatible: Must be "apm,xgene-mdio".
+  - #address-cells: Must be <1>.
+  - #size-cells: Must be <0>.
+
+  For the phy on the mdio bus, there must be a node with the following fields:
+  - compatible: PHY identifier.  Please refer to ./phy.txt for the format.
+  - reg: The ID number for the phy.
+
+Optional properties:
+- status: Should be "ok" or "disabled" for enabled/disabled. Default is "ok".
+
+Example:
+       menetclk: menetclk {
+               compatible = "apm,xgene-device-clock";
+               clock-output-names = "menetclk";
+               status = "ok";
+       };
+
+       menet: ethernet@17020000 {
+               compatible = "apm,xgene-enet";
+               status = "disabled";
+               reg = <0x0 0x17020000 0x0 0xd100>,
+                     <0x0 0X17030000 0x0 0X400>,
+                     <0x0 0X10000000 0x0 0X200>;
+               reg-names = "enet_csr", "ring_csr", "ring_cmd";
+               interrupts = <0x0 0x3c 0x4>;
+               clocks = <&menetclk 0>;
+               local-mac-address = [00 01 73 00 00 01];
+               phy-connection-type = "rgmii";
+               phy-handle = <&menetphy>;
+               mdio {
+                       compatible = "apm,xgene-mdio";
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+                       menetphy: menetphy@3 {
+                               compatible = "ethernet-phy-id001c.c915";
+                               reg = <0x3>;
+                       };
+
+               };
+       };
+
+/* Board-specific peripheral configurations */
+&menet {
+        status = "ok";
+};
index 6bc84adb10c0ca6f278cc30d40e0b802c12adfd8..8a2c7b55ec165b69ce1b6084fea32466f68ff483 100644 (file)
@@ -12,7 +12,14 @@ Optional properties:
   only if property "phy-reset-gpios" is available.  Missing the property
   will have the duration be 1 millisecond.  Numbers greater than 1000 are
   invalid and 1 millisecond will be used instead.
-- phy-supply: regulator that powers the Ethernet PHY.
+- phy-supply : regulator that powers the Ethernet PHY.
+- phy-handle : phandle to the PHY device connected to this device.
+- fixed-link : Assume a fixed link. See fixed-link.txt in the same directory.
+  Use instead of phy-handle.
+
+Optional subnodes:
+- mdio : specifies the mdio bus in the FEC, used as a container for phy nodes
+  according to phy.txt in the same directory
 
 Example:
 
@@ -25,3 +32,23 @@ ethernet@83fec000 {
        local-mac-address = [00 04 9F 01 1B B9];
        phy-supply = <&reg_fec_supply>;
 };
+
+Example with phy specified:
+
+ethernet@83fec000 {
+       compatible = "fsl,imx51-fec", "fsl,imx27-fec";
+       reg = <0x83fec000 0x4000>;
+       interrupts = <87>;
+       phy-mode = "mii";
+       phy-reset-gpios = <&gpio2 14 0>; /* GPIO2_14 */
+       local-mac-address = [00 04 9F 01 1B B9];
+       phy-supply = <&reg_fec_supply>;
+       phy-handle = <&ethphy>;
+       mdio {
+               ethphy: ethernet-phy@6 {
+                       compatible = "ethernet-phy-ieee802.3-c22";
+                       reg = <6>;
+                       max-speed = <100>;
+               };
+       };
+};
index c94909215c0785691395840de27ec80b2de0c42b..ae738f562acca6bf915ef5691dfd32da9d2e23c8 100644 (file)
@@ -3,6 +3,7 @@
 ** Required properties:
 
 - compatible : One of the following:
+              "samsung,exynos3250-tmu"
               "samsung,exynos4412-tmu"
               "samsung,exynos4210-tmu"
               "samsung,exynos5250-tmu"
index 28ef498a66e598e0399a3095f625781fe2d2d575..0ef00be44b0137e7435c13c57c379654937b4592 100644 (file)
@@ -1,7 +1,13 @@
 * Renesas R-Car Thermal
 
 Required properties:
-- compatible           : "renesas,rcar-thermal"
+- compatible           : "renesas,thermal-<soctype>", "renesas,rcar-thermal"
+                         as fallback.
+                         Examples with soctypes are:
+                           - "renesas,thermal-r8a73a4" (R-Mobile AP6)
+                           - "renesas,thermal-r8a7779" (R-Car H1)
+                           - "renesas,thermal-r8a7790" (R-Car H2)
+                           - "renesas,thermal-r8a7791" (R-Car M2)
 - reg                  : Address range of the thermal registers.
                          The 1st reg will be recognized as common register
                          if it has "interrupts".
@@ -12,18 +18,18 @@ Option properties:
 
 Example (non interrupt support):
 
-thermal@e61f0100 {
-       compatible = "renesas,rcar-thermal";
-       reg = <0xe61f0100 0x38>;
+thermal@ffc48000 {
+       compatible = "renesas,thermal-r8a7779", "renesas,rcar-thermal";
+       reg = <0xffc48000 0x38>;
 };
 
 Example (interrupt support):
 
 thermal@e61f0000 {
-       compatible = "renesas,rcar-thermal";
+       compatible = "renesas,thermal-r8a73a4", "renesas,rcar-thermal";
        reg = <0xe61f0000 0x14
                0xe61f0100 0x38
                0xe61f0200 0x38
                0xe61f0300 0x38>;
-       interrupts = <0 69 4>;
+       interrupts = <0 69 IRQ_TYPE_LEVEL_HIGH>;
 };
diff --git a/Documentation/devicetree/bindings/thermal/st-thermal.txt b/Documentation/devicetree/bindings/thermal/st-thermal.txt
new file mode 100644 (file)
index 0000000..3b9251b
--- /dev/null
@@ -0,0 +1,42 @@
+Binding for Thermal Sensor driver for STMicroelectronics STi series of SoCs.
+
+Required parameters:
+-------------------
+
+compatible :   st,<SoC>-<module>-thermal; should be one of:
+                 "st,stih415-sas-thermal",
+                 "st,stih415-mpe-thermal",
+                 "st,stih416-sas-thermal"
+                 "st,stih416-mpe-thermal"
+                 "st,stid127-thermal" or
+                 "st,stih407-thermal"
+               according to the SoC type (stih415, stih416, stid127, stih407)
+               and module type (sas or mpe). On stid127 & stih407 there is only
+               one die/module, so there is no module type in the compatible
+               string.
+clock-names :  Should be "thermal".
+                 See: Documentation/devicetree/bindings/resource-names.txt
+clocks :       Phandle of the clock used by the thermal sensor.
+                 See: Documentation/devicetree/bindings/clock/clock-bindings.txt
+
+Optional parameters:
+-------------------
+
+reg :          For non-sysconf based sensors, this should be the physical base
+               address and length of the sensor's registers.
+interrupts :   Standard way to define interrupt number.
+               The interrupt is mandatory when the compatible is
+               "st,stih416-mpe-thermal".
+                 NB: For thermal sensors for which no interrupt has been
+                 defined, a polling delay of 1000ms will be used to read the
+                 temperature from the device.
+
+Example:
+
+       temp1@fdfe8000 {
+               compatible      = "st,stih416-mpe-thermal";
+               reg             = <0xfdfe8000 0x10>;
+               clock-names     = "thermal";
+               clocks          = <&clk_m_mpethsens>;
+               interrupts      = <GIC_SPI 23 IRQ_TYPE_NONE>;
+       };
diff --git a/Documentation/devicetree/changesets.txt b/Documentation/devicetree/changesets.txt
new file mode 100644 (file)
index 0000000..935ba5a
--- /dev/null
@@ -0,0 +1,40 @@
+A DT changeset is a mechanism that allows changes to be applied
+to the live tree in such a way that either the full set of changes
+will be applied, or none of them will be. If an error occurs partway
+through applying the changeset, the tree is rolled back to its
+previous state. A changeset can also be removed after it has been
+applied.
+
+When a changeset is applied, all of the changes get applied to the tree
+at once before emitting OF_RECONFIG notifiers. This is so that the
+receiver sees a complete and consistent state of the tree when it
+receives the notifier.
+
+The sequence for applying a changeset is as follows.
+
+1. of_changeset_init() - initializes a changeset
+
+2. A number of DT tree change calls (of_changeset_attach_node(),
+of_changeset_detach_node(), of_changeset_add_property(),
+of_changeset_remove_property() and of_changeset_update_property())
+prepare a set of changes. No changes to the active tree are made at
+this point. All the change operations are recorded in the of_changeset
+'entries' list.
+
+3. mutex_lock(of_mutex) - starts a changeset; the global of_mutex
+ensures there can only be one editor at a time.
+
+4. of_changeset_apply() - applies the changes to the tree. Either the
+entire changeset will get applied, or if there is an error the tree will
+be restored to the previous state.
+
+5. mutex_unlock(of_mutex) - all operations complete; release the mutex.
+
+If a successfully applied changeset needs to be removed, it can be done
+with the following sequence.
+
+1. mutex_lock(of_mutex)
+
+2. of_changeset_revert()
+
+3. mutex_unlock(of_mutex)
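
Taken together, the apply sequence above might look like the following
sketch in a caller (illustrative only: the node pointer 'np' and the
property 'prop' are assumed to exist already, and of_mutex is internal
to drivers/of, so code outside it cannot take the lock directly):

	struct of_changeset ocs;
	int ret;

	of_changeset_init(&ocs);			/* step 1 */

	/* step 2: record the desired change; nothing is applied yet */
	ret = of_changeset_update_property(&ocs, np, prop);
	if (ret)
		return ret;

	mutex_lock(&of_mutex);				/* step 3 */
	ret = of_changeset_apply(&ocs);			/* step 4 */
	mutex_unlock(&of_mutex);			/* step 5 */

	return ret;
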
diff --git a/Documentation/devicetree/todo.txt b/Documentation/devicetree/todo.txt
new file mode 100644 (file)
index 0000000..c3cf065
--- /dev/null
@@ -0,0 +1,11 @@
+Todo list for devicetree:
+
+=== General structure ===
+- Switch from custom lists to (h)list_head for nodes and properties structure
+- Remove of_allnodes list and iterate using list of child nodes alone
+
+=== CONFIG_OF_DYNAMIC ===
+- Switch to RCU for tree updates and get rid of global spinlock
+- Document node lifecycle for CONFIG_OF_DYNAMIC
+- Always set ->full_name at of_attach_node() time
+- pseries: Get rid of open-coded tree modification from arch/powerpc/platforms/pseries/dlpar.c
index 879b6e31e2da6b4992d9ec5c556c741089851fff..573e28ce97513c872ff80d870b5471c7e3181028 100644 (file)
@@ -84,31 +84,32 @@ The slave DMA usage consists of following steps:
    the given transaction.
 
    Interface:
-       struct dma_async_tx_descriptor *(*chan->device->device_prep_slave_sg)(
+       struct dma_async_tx_descriptor *dmaengine_prep_slave_sg(
                struct dma_chan *chan, struct scatterlist *sgl,
                unsigned int sg_len, enum dma_data_direction direction,
                unsigned long flags);
 
-       struct dma_async_tx_descriptor *(*chan->device->device_prep_dma_cyclic)(
+       struct dma_async_tx_descriptor *dmaengine_prep_dma_cyclic(
                struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
                size_t period_len, enum dma_data_direction direction);
 
-       struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
+       struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma(
                struct dma_chan *chan, struct dma_interleaved_template *xt,
                unsigned long flags);
 
    The peripheral driver is expected to have mapped the scatterlist for
    the DMA operation prior to calling device_prep_slave_sg, and must
    keep the scatterlist mapped until the DMA operation has completed.
-   The scatterlist must be mapped using the DMA struct device.  So,
-   normal setup should look like this:
+   The scatterlist must be mapped using the DMA struct device.
+   If a mapping needs to be synchronized later, dma_sync_*_for_*() must be
+   called using the DMA struct device, too.
+   So, normal setup should look like this:
 
        nr_sg = dma_map_sg(chan->device->dev, sgl, sg_len);
        if (nr_sg == 0)
                /* error */
 
-       desc = chan->device->device_prep_slave_sg(chan, sgl, nr_sg,
-                       direction, flags);
+       desc = dmaengine_prep_slave_sg(chan, sgl, nr_sg, direction, flags);
 
    Once a descriptor has been obtained, the callback information can be
    added and the descriptor must then be submitted.  Some DMA engine
@@ -188,7 +189,7 @@ Further APIs:
    description of this API.
 
    This can be used in conjunction with dma_async_is_complete() and
-   the cookie returned from 'descriptor->submit()' to check for
+   the cookie returned from dmaengine_submit() to check for
    completion of a specific DMA transaction.
 
    Note:
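
As a sketch of the completion check described above (the callback name
and its argument are hypothetical; the dmaengine calls themselves are
those referenced in this document), a driver might do:

	dma_cookie_t cookie, last, used;
	enum dma_status status;

	desc->callback = example_done;		/* hypothetical completion hook */
	desc->callback_param = example_data;
	cookie = dmaengine_submit(desc);
	dma_async_issue_pending(chan);

	/* later: poll for completion of this specific transaction */
	status = dma_async_is_tx_complete(chan, cookie, &last, &used);
	if (status == DMA_COMPLETE)
		/* the transaction identified by 'cookie' has finished */;
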
index e543b1a619cc95b022f97d3c516687fc98f43735..c8f036a9b13fcf5d1c87761de5a97ba1248d2fee 100644 (file)
@@ -66,23 +66,31 @@ b/ A per-superblock list "s_anon" of dentries which are the roots of
 
 c/ Helper routines to allocate anonymous dentries, and to help attach
    loose directory dentries at lookup time. They are:
-    d_alloc_anon(inode) will return a dentry for the given inode.
+    d_obtain_alias(inode) will return a dentry for the given inode.
       If the inode already has a dentry, one of those is returned.
       If it doesn't, a new anonymous (IS_ROOT and
         DCACHE_DISCONNECTED) dentry is allocated and attached.
       In the case of a directory, care is taken that only one dentry
       can ever be attached.
-    d_splice_alias(inode, dentry) will make sure that there is a
-      dentry with the same name and parent as the given dentry, and
-      which refers to the given inode.
-      If the inode is a directory and already has a dentry, then that
-      dentry is d_moved over the given dentry.
-      If the passed dentry gets attached, care is taken that this is
-      mutually exclusive to a d_alloc_anon operation.
-      If the passed dentry is used, NULL is returned, else the used
-      dentry is returned.  This corresponds to the calling pattern of
-      ->lookup.
-  
+    d_splice_alias(inode, dentry) or d_materialise_unique(dentry, inode)
+      will introduce a new dentry into the tree; either the passed-in
+      dentry or a preexisting alias for the given inode (such as an
+      anonymous one created by d_obtain_alias), if appropriate.  The two
+      functions differ in their handling of directories with preexisting
+      aliases:
+        d_splice_alias will use any existing IS_ROOT dentry, but it will
+         return -EIO rather than try to move a dentry with a different
+         parent.  This is appropriate for local filesystems, which
+         should never see such an alias unless the filesystem is
+         corrupted somehow (for example, if two on-disk directory
+         entries refer to the same directory.)
+       d_materialise_unique will attempt to move any dentry.  This is
+         appropriate for distributed filesystems, where finding a
+         directory other than where we last cached it may be a normal
+         consequence of concurrent operations on other hosts.
+      Both functions return NULL when the passed-in dentry is used,
+      following the calling convention of ->lookup.
+
  
 Filesystem Issues
 -----------------
@@ -120,12 +128,12 @@ struct which has the following members:
 
   fh_to_dentry (mandatory)
     Given a filehandle fragment, this should find the implied object and
-    create a dentry for it (possibly with d_alloc_anon).
+    create a dentry for it (possibly with d_obtain_alias).
 
   fh_to_parent (optional but strongly recommended)
     Given a filehandle fragment, this should find the parent of the
-    implied object and create a dentry for it (possibly with d_alloc_anon).
-    May fail if the filehandle fragment is too small.
+    implied object and create a dentry for it (possibly with
+    d_obtain_alias).  May fail if the filehandle fragment is too small.
 
   get_parent (optional but strongly recommended)
     When given a dentry for a directory, this should return  a dentry for
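
For a filesystem whose filehandles are just an inode number and
generation, the fh_to_dentry method described above reduces to a short
wrapper around d_obtain_alias. In the sketch below, examplefs_iget() is
a hypothetical inode-lookup helper, not part of this document:

	static struct dentry *examplefs_fh_to_dentry(struct super_block *sb,
			struct fid *fid, int fh_len, int fh_type)
	{
		struct inode *inode;

		if (fh_len < 2 || fh_type != FILEID_INO32_GEN)
			return NULL;

		inode = examplefs_iget(sb, fid->i32.ino, fid->i32.gen);
		/* d_obtain_alias() copes with NULL and ERR_PTR() inodes */
		return d_obtain_alias(inode);
	}
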
index a1d0d7a301657d674c653648534ee5be527919b9..61d65cc65c54a333bef994fc49f881ece37032c5 100644 (file)
@@ -1053,7 +1053,8 @@ struct dentry_operations {
        If the 'rcu_walk' parameter is true, then the caller is doing a
        pathwalk in RCU-walk mode.  Sleeping is not permitted in this mode,
        and the caller can be asked to leave it and call again by returning
-       -ECHILD.
+       -ECHILD.  -EISDIR may also be returned to tell pathwalk to
+       ignore d_automount or any mounts.
 
        This function is only used if DCACHE_MANAGE_TRANSIT is set on the
        dentry being transited from.
index a8eb6afce6a412a80ce576911bc17b20294891d6..5ae8608ca9f58a6331cae454ae7a0c84ab2f6c47 100644 (file)
@@ -2200,6 +2200,21 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        and restore using xsave. The kernel will fallback to
                        enabling legacy floating-point and sse state.
 
+       noxsaveopt      [X86] Disables xsaveopt used in saving x86 extended
+                       register states. The kernel will fall back to using
+                       xsave to save the states. With this parameter, saving
+                       the states is slower, because xsave lacks the modified
+                       optimization that xsaveopt provides on systems which
+                       support it.
+
+       noxsaves        [X86] Disables xsaves and xrstors used in saving and
+                       restoring x86 extended register state in the compacted
+                       form of the xsave area. The kernel will fall back to
+                       using xsaveopt and xrstor to save and restore the
+                       states in the standard form of the xsave area. With
+                       this parameter, the per-process xsave area might
+                       occupy more memory on systems which support xsaves.
+
        eagerfpu=       [X86]
                        on      enable eager fpu restore
                        off     disable eager fpu restore
index 7e2eb4c646e32530598378ed279d5c77b93aa9aa..2f85f55c8fb860779801ba88a144bd2c4a6c1477 100644 (file)
@@ -719,6 +719,14 @@ S: Maintained
 F:     drivers/net/appletalk/
 F:     net/appletalk/
 
+APPLIED MICRO (APM) X-GENE SOC ETHERNET DRIVER
+M:     Iyappan Subramanian <isubramanian@apm.com>
+M:     Keyur Chudgar <kchudgar@apm.com>
+M:     Ravi Patel <rapatel@apm.com>
+S:     Supported
+F:     drivers/net/ethernet/apm/xgene/
+F:     Documentation/devicetree/bindings/net/apm-xgene-enet.txt
+
 APTINA CAMERA SENSOR PLL
 M:     Laurent Pinchart <Laurent.pinchart@ideasonboard.com>
 L:     linux-media@vger.kernel.org
index 36c771a2d765de67cb5ccf80c1fcd629db96fd71..27d0d9c8adf3da724d9e64ba2e5229731a0b38eb 100644 (file)
                i2c0 = &i2c0;
        };
 
+       chosen {
+               stdout-path = &uart0;
+       };
+
        memory {
                reg = <0x0 0x08000000>;
        };
index d025048119d3078ee1730531071a3e3ca5189d37..e36c1e82fea74d62f7efb7cc3c3585597abd78c2 100644 (file)
@@ -56,5 +56,3 @@
                };
        };
 };
-
-#include <testcases.dtsi>
index 485be42519b96ddec1dcceafd19c84431e45cd99..88099175fc56c64be13c5a31e5cf05ed18b60e50 100644 (file)
@@ -1414,6 +1414,34 @@ void edma_clear_event(unsigned channel)
 }
 EXPORT_SYMBOL(edma_clear_event);
 
+/*
+ * edma_assign_channel_eventq - move given channel to desired eventq
+ * Arguments:
+ *     channel - channel number
+ *     eventq_no - queue to move the channel to
+ *
+ * Can be used to move a channel to a selected event queue.
+ */
+void edma_assign_channel_eventq(unsigned channel, enum dma_event_q eventq_no)
+{
+       unsigned ctlr;
+
+       ctlr = EDMA_CTLR(channel);
+       channel = EDMA_CHAN_SLOT(channel);
+
+       if (channel >= edma_cc[ctlr]->num_channels)
+               return;
+
+       /* default to low priority queue */
+       if (eventq_no == EVENTQ_DEFAULT)
+               eventq_no = edma_cc[ctlr]->default_queue;
+       if (eventq_no >= edma_cc[ctlr]->num_tc)
+               return;
+
+       map_dmach_queue(ctlr, channel, eventq_no);
+}
+EXPORT_SYMBOL(edma_assign_channel_eventq);
+
 static int edma_setup_from_hw(struct device *dev, struct edma_soc_info *pdata,
                              struct edma *edma_cc)
 {
@@ -1470,7 +1498,8 @@ static int edma_setup_from_hw(struct device *dev, struct edma_soc_info *pdata,
        queue_priority_map[i][1] = -1;
 
        pdata->queue_priority_mapping = queue_priority_map;
-       pdata->default_queue = 0;
+       /* Default queue has the lowest priority */
+       pdata->default_queue = i - 1;
 
        return 0;
 }
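
A platform driver might use the new export along these lines (a hedged
sketch; the callback and its argument are hypothetical):

	/* request any free channel, then steer it to event queue 1 */
	int ch = edma_alloc_channel(EDMA_CHANNEL_ANY, example_callback,
				    example_data, EVENTQ_DEFAULT);

	if (ch >= 0)
		edma_assign_channel_eventq(ch, EVENTQ_1);
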
index 2c4041c9bac5e18e320d25b0577d034c52514b43..e43791829aceb2a725d45b419b36c21b54ec0300 100644 (file)
@@ -49,8 +49,3 @@ int arch_gnttab_init(unsigned long nr_shared)
 {
        return 0;
 }
-
-int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status)
-{
-       return 0;
-}
index 6541962f5d7035f5d5cf136e49c3dce57e841e4a..b2f56229aa5e5f520ae5d81aea903bbee24ea02c 100644 (file)
@@ -28,3 +28,7 @@
 &serial0 {
        status = "ok";
 };
+
+&menet {
+       status = "ok";
+};
index 40aa96ce13c4c1d44ab6aaac1aa13b0c938ab6aa..c0aceef7f5b32067be4a26a72de0316f7639ed6a 100644 (file)
                                clock-output-names = "ethclk";
                        };
 
-                       eth8clk: eth8clk {
+                       menetclk: menetclk {
                                compatible = "apm,xgene-device-clock";
                                #clock-cells = <1>;
                                clocks = <&ethclk 0>;
-                               clock-names = "eth8clk";
                                reg = <0x0 0x1702C000 0x0 0x1000>;
                                reg-names = "csr-reg";
-                               clock-output-names = "eth8clk";
+                               clock-output-names = "menetclk";
                        };
 
                        sataphy1clk: sataphy1clk@1f21c000 {
                        #clock-cells = <1>;
                        clocks = <&rtcclk 0>;
                };
+
+               menet: ethernet@17020000 {
+                       compatible = "apm,xgene-enet";
+                       status = "disabled";
+                       reg = <0x0 0x17020000 0x0 0xd100>,
+                             <0x0 0X17030000 0x0 0X400>,
+                             <0x0 0X10000000 0x0 0X200>;
+                       reg-names = "enet_csr", "ring_csr", "ring_cmd";
+                       interrupts = <0x0 0x3c 0x4>;
+                       dma-coherent;
+                       clocks = <&menetclk 0>;
+                       local-mac-address = [00 01 73 00 00 01];
+                       phy-connection-type = "rgmii";
+                       phy-handle = <&menetphy>;
+                       mdio {
+                               compatible = "apm,xgene-mdio";
+                               #address-cells = <1>;
+                               #size-cells = <0>;
+                               menetphy: menetphy@3 {
+                                       compatible = "ethernet-phy-id001c.c915";
+                                       reg = <0x3>;
+                               };
+
+                       };
+               };
        };
 };
index 0a2385fa2a1d44fca40f6b5e74be956dfb9268de..04b7d4f8429ac399ba74c34a39722311f052da3b 100644 (file)
@@ -55,7 +55,7 @@ unsigned int get_global_icache_size(void)
        return (get_icache_size() * ((temp >> SYSC_xCPARTG_AND_S) + 1)) >> 4;
 }
 
-static unsigned int get_thread_cache_size(unsigned int cache, int thread_id)
+static int get_thread_cache_size(unsigned int cache, int thread_id)
 {
        unsigned int cache_size;
        unsigned int t_cache_part;
@@ -94,7 +94,7 @@ static unsigned int get_thread_cache_size(unsigned int cache, int thread_id)
 
 void check_for_cache_aliasing(int thread_id)
 {
-       unsigned int thread_cache_size;
+       int thread_cache_size;
        unsigned int cache_type;
        for (cache_type = ICACHE; cache_type <= DCACHE; cache_type++) {
                thread_cache_size =
index 3c52fa6d0f8e24030294fecacc26498f5de9ffe5..3c32075d294528df5d8c2493215f34b012ec57f4 100644 (file)
@@ -173,7 +173,7 @@ new_search:
                                mm->context.part_huge = 0;
                        return addr;
                }
-               if (vma && (vma->vm_flags & MAP_HUGETLB)) {
+               if (vma->vm_flags & MAP_HUGETLB) {
                        /* space after a huge vma in 2nd level page table? */
                        if (vma->vm_end & HUGEPT_MASK) {
                                after_huge = 1;
index 2c0e1552d20bb9a410ab6c27762d676e6e7f5b9e..7f9d14f5c4daae8e93f4686849b4357e8fe098d4 100644 (file)
                        compatible = "fsl,mpc5121-dma";
                        reg = <0x14000 0x1800>;
                        interrupts = <65 0x8>;
+                       #dma-cells = <1>;
                };
        };
 
index 1a3b1055f5ebaf423327401c71612b38c184f12d..4e139f8a69effa0a403a2e6d75b0fe7d7e268e3d 100644 (file)
@@ -818,76 +818,6 @@ int cpu_to_chip_id(int cpu)
 }
 EXPORT_SYMBOL(cpu_to_chip_id);
 
-#ifdef CONFIG_PPC_PSERIES
-/*
- * Fix up the uninitialized fields in a new device node:
- * name, type and pci-specific fields
- */
-
-static int of_finish_dynamic_node(struct device_node *node)
-{
-       struct device_node *parent = of_get_parent(node);
-       int err = 0;
-       const phandle *ibm_phandle;
-
-       node->name = of_get_property(node, "name", NULL);
-       node->type = of_get_property(node, "device_type", NULL);
-
-       if (!node->name)
-               node->name = "<NULL>";
-       if (!node->type)
-               node->type = "<NULL>";
-
-       if (!parent) {
-               err = -ENODEV;
-               goto out;
-       }
-
-       /* We don't support that function on PowerMac, at least
-        * not yet
-        */
-       if (machine_is(powermac))
-               return -ENODEV;
-
-       /* fix up new node's phandle field */
-       if ((ibm_phandle = of_get_property(node, "ibm,phandle", NULL)))
-               node->phandle = *ibm_phandle;
-
-out:
-       of_node_put(parent);
-       return err;
-}
-
-static int prom_reconfig_notifier(struct notifier_block *nb,
-                                 unsigned long action, void *node)
-{
-       int err;
-
-       switch (action) {
-       case OF_RECONFIG_ATTACH_NODE:
-               err = of_finish_dynamic_node(node);
-               if (err < 0)
-                       printk(KERN_ERR "finish_node returned %d\n", err);
-               break;
-       default:
-               err = 0;
-               break;
-       }
-       return notifier_from_errno(err);
-}
-
-static struct notifier_block prom_reconfig_nb = {
-       .notifier_call = prom_reconfig_notifier,
-       .priority = 10, /* This one needs to run first */
-};
-
-static int __init prom_reconfig_setup(void)
-{
-       return of_reconfig_notifier_register(&prom_reconfig_nb);
-}
-__initcall(prom_reconfig_setup);
-#endif
-
 bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
 {
        return (int)phys_id == get_hard_smp_processor_id(cpu);
index 1413e72bc2e1489d38f819041fe476f9ad60c2ef..4882bfd90e27c820b42db5307a08d4c271f48ded 100644 (file)
@@ -2805,25 +2805,20 @@ set_initial_features(void)
                /* Enable GMAC for now for PCI probing. It will be disabled
                 * later on after PCI probe
                 */
-               np = of_find_node_by_name(NULL, "ethernet");
-               while(np) {
+               for_each_node_by_name(np, "ethernet")
                        if (of_device_is_compatible(np, "K2-GMAC"))
                                g5_gmac_enable(np, 0, 1);
-                       np = of_find_node_by_name(np, "ethernet");
-               }
 
                /* Enable FW before PCI probe. Will be disabled later on
                 * Note: We should have a better way to check that we are
                 * dealing with uninorth internal cell and not a PCI cell
                 * on the external PCI. The code below works though.
                 */
-               np = of_find_node_by_name(NULL, "firewire");
-               while(np) {
+               for_each_node_by_name(np, "firewire") {
                        if (of_device_is_compatible(np, "pci106b,5811")) {
                                macio_chips[0].flags |= MACIO_FLAG_FW_SUPPORTED;
                                g5_fw_enable(np, 0, 1);
                        }
-                       np = of_find_node_by_name(np, "firewire");
                }
        }
 #else /* CONFIG_PPC64 */
@@ -2834,13 +2829,11 @@ set_initial_features(void)
                /* Enable GMAC for now for PCI probing. It will be disabled
                 * later on after PCI probe
                 */
-               np = of_find_node_by_name(NULL, "ethernet");
-               while(np) {
+               for_each_node_by_name(np, "ethernet") {
                        if (np->parent
                            && of_device_is_compatible(np->parent, "uni-north")
                            && of_device_is_compatible(np, "gmac"))
                                core99_gmac_enable(np, 0, 1);
-                       np = of_find_node_by_name(np, "ethernet");
                }
 
                /* Enable FW before PCI probe. Will be disabled later on
@@ -2848,8 +2841,7 @@ set_initial_features(void)
                 * dealing with uninorth internal cell and not a PCI cell
                 * on the external PCI. The code below works though.
                 */
-               np = of_find_node_by_name(NULL, "firewire");
-               while(np) {
+               for_each_node_by_name(np, "firewire") {
                        if (np->parent
                            && of_device_is_compatible(np->parent, "uni-north")
                            && (of_device_is_compatible(np, "pci106b,18") ||
@@ -2858,18 +2850,16 @@ set_initial_features(void)
                                macio_chips[0].flags |= MACIO_FLAG_FW_SUPPORTED;
                                core99_firewire_enable(np, 0, 1);
                        }
-                       np = of_find_node_by_name(np, "firewire");
                }
 
                /* Enable ATA-100 before PCI probe. */
                np = of_find_node_by_name(NULL, "ata-6");
-               while(np) {
+               for_each_node_by_name(np, "ata-6") {
                        if (np->parent
                            && of_device_is_compatible(np->parent, "uni-north")
                            && of_device_is_compatible(np, "kauai-ata")) {
                                core99_ata100_enable(np, 1);
                        }
-                       np = of_find_node_by_name(np, "ata-6");
                }
 
                /* Switch airport off */
index cf7009b8c7b6481b6c36339e3c0915809f2e0325..7e868ccf3b0d9451ad7efdf0cf0a6f01bb7b4f7a 100644 (file)
@@ -698,7 +698,7 @@ static void __init fixup_nec_usb2(void)
 {
        struct device_node *nec;
 
-       for (nec = NULL; (nec = of_find_node_by_name(nec, "usb")) != NULL;) {
+       for_each_node_by_name(nec, "usb") {
                struct pci_controller *hose;
                u32 data;
                const u32 *prop;
index 5cbd4d67d5c445af65be43099477df712193755b..af094ae03dbbc9dfddec65e69da279d3ab8a0b7f 100644 (file)
@@ -577,7 +577,7 @@ static void __init smp_core99_setup_i2c_hwsync(int ncpus)
        int ok;
 
        /* Look for the clock chip */
-       while ((cc = of_find_node_by_name(cc, "i2c-hwclock")) != NULL) {
+       for_each_node_by_name(cc, "i2c-hwclock") {
                p = of_get_parent(cc);
                ok = p && of_device_is_compatible(p, "uni-n-i2c");
                of_node_put(p);
index 44e0b55a2a028f4227ae0b22f31791f1ac913218..366bd221edecb089c9b887ba62b271c90694719f 100644 (file)
@@ -191,7 +191,7 @@ int __init udbg_adb_init(int force_btext)
         * of type "adb". If not, we return a failure, but we keep the
         * bext output set for now
         */
-       for (np = NULL; (np = of_find_node_by_name(np, "keyboard")) != NULL;) {
+       for_each_node_by_name(np, "keyboard") {
                struct device_node *parent = of_get_parent(np);
                int found = (parent && strcmp(parent->type, "adb") == 0);
                of_node_put(parent);
index 7995135170a31a35a7824703f72fe26b93b145a7..ac01e188faef377cece87b538129315e63ccf7d7 100644 (file)
@@ -194,7 +194,7 @@ static int pseries_update_drconf_memory(struct of_prop_reconfig *pr)
        if (!memblock_size)
                return -EINVAL;
 
-       p = (u32 *)of_get_property(pr->dn, "ibm,dynamic-memory", NULL);
+       p = (u32 *) pr->old_prop->value;
        if (!p)
                return -EINVAL;
 
index cfe8a6389a513a29b5b49fc720d9682430eebcac..e724d3186e739999cc6daf726d073fe035d26739 100644 (file)
@@ -232,8 +232,7 @@ static void __init pseries_discover_pic(void)
        struct device_node *np;
        const char *typep;
 
-       for (np = NULL; (np = of_find_node_by_name(np,
-                                                  "interrupt-controller"));) {
+       for_each_node_by_name(np, "interrupt-controller") {
                typep = of_get_property(np, "compatible", NULL);
                if (strstr(typep, "open-pic")) {
                        pSeries_mpic_node = of_node_get(np);
index b2256562314298e0e63918b7f60323691e08d5b3..afde2a7d3eb35859270fe4a3bbdf20d742b8b35a 100644 (file)
@@ -25,7 +25,7 @@
  * Define the default configuration for dual address memory-memory transfer.
  * The 0x400 value represents auto-request, external->external.
  */
-#define RS_DUAL        (DM_INC | SM_INC | 0x400 | TS_INDEX2VAL(XMIT_SZ_32BIT))
+#define RS_DUAL        (DM_INC | SM_INC | RS_AUTO | TS_INDEX2VAL(XMIT_SZ_32BIT))
 
 static unsigned long dma_find_base(unsigned int chan)
 {
index 51cd78feacff0bea6a442e37df7f532e62bfe3f5..c757b47e6b6481ff6ad49254b2f0b209f4e3d7af 100644 (file)
 #ifndef DMA_REGISTER_H
 #define DMA_REGISTER_H
 
-/* DMA register */
-#define SAR    0x00
-#define DAR    0x04
-#define TCR    0x08
-#define CHCR   0x0C
-#define DMAOR  0x40
+/* DMA registers */
+#define SAR    0x00    /* Source Address Register */
+#define DAR    0x04    /* Destination Address Register */
+#define TCR    0x08    /* Transfer Count Register */
+#define CHCR   0x0C    /* Channel Control Register */
+#define DMAOR  0x40    /* DMA Operation Register */
 
 /* DMAOR definitions */
-#define DMAOR_AE       0x00000004
+#define DMAOR_AE       0x00000004      /* Address Error Flag */
 #define DMAOR_NMIF     0x00000002
-#define DMAOR_DME      0x00000001
+#define DMAOR_DME      0x00000001      /* DMA Master Enable */
 
 /* Definitions for the SuperH DMAC */
 #define REQ_L  0x00000000
 #define ACK_W  0x00020000
 #define ACK_H  0x00000000
 #define ACK_L  0x00010000
-#define DM_INC 0x00004000
-#define DM_DEC 0x00008000
-#define DM_FIX 0x0000c000
-#define SM_INC 0x00001000
-#define SM_DEC 0x00002000
-#define SM_FIX 0x00003000
+#define DM_INC 0x00004000      /* Destination addresses are incremented */
+#define DM_DEC 0x00008000      /* Destination addresses are decremented */
+#define DM_FIX 0x0000c000      /* Destination address is fixed */
+#define SM_INC 0x00001000      /* Source addresses are incremented */
+#define SM_DEC 0x00002000      /* Source addresses are decremented */
+#define SM_FIX 0x00003000      /* Source address is fixed */
 #define RS_IN  0x00000200
 #define RS_OUT 0x00000300
+#define RS_AUTO        0x00000400      /* Auto Request */
+#define RS_ERS 0x00000800      /* DMA extended resource selector */
 #define TS_BLK 0x00000040
 #define TM_BUR 0x00000020
-#define CHCR_DE        0x00000001
-#define CHCR_TE        0x00000002
-#define CHCR_IE        0x00000004
+#define CHCR_DE        0x00000001      /* DMA Enable */
+#define CHCR_TE        0x00000002      /* Transfer End Flag */
+#define CHCR_IE        0x00000004      /* Interrupt Enable */
 
 #endif
index 57f83a92a505a639b02beba2b0494ea93d8aa3c5..7aa733307afcca4920a2a023934a1d7bfe0e7158 100644 (file)
@@ -30,62 +30,62 @@ static const struct sh_dmae_slave_config sh7722_dmae_slaves[] = {
        {
                .slave_id       = SHDMA_SLAVE_SCIF0_TX,
                .addr           = 0xffe0000c,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x21,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF0_RX,
                .addr           = 0xffe00014,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x22,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF1_TX,
                .addr           = 0xffe1000c,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x25,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF1_RX,
                .addr           = 0xffe10014,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x26,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF2_TX,
                .addr           = 0xffe2000c,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x29,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF2_RX,
                .addr           = 0xffe20014,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x2a,
        }, {
                .slave_id       = SHDMA_SLAVE_SIUA_TX,
                .addr           = 0xa454c098,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
                .mid_rid        = 0xb1,
        }, {
                .slave_id       = SHDMA_SLAVE_SIUA_RX,
                .addr           = 0xa454c090,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
                .mid_rid        = 0xb2,
        }, {
                .slave_id       = SHDMA_SLAVE_SIUB_TX,
                .addr           = 0xa454c09c,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
                .mid_rid        = 0xb5,
        }, {
                .slave_id       = SHDMA_SLAVE_SIUB_RX,
                .addr           = 0xa454c094,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
                .mid_rid        = 0xb6,
        }, {
                .slave_id       = SHDMA_SLAVE_SDHI0_TX,
                .addr           = 0x04ce0030,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_16BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_16BIT),
                .mid_rid        = 0xc1,
        }, {
                .slave_id       = SHDMA_SLAVE_SDHI0_RX,
                .addr           = 0x04ce0030,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_16BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_16BIT),
                .mid_rid        = 0xc2,
        },
 };
index b9e84b1d3aa72c9e38a6992c35a3d57f94093ea6..ea5780b3c7f6a0999cc0392c9c11c7dcab61cf40 100644 (file)
@@ -36,122 +36,122 @@ static const struct sh_dmae_slave_config sh7724_dmae_slaves[] = {
        {
                .slave_id       = SHDMA_SLAVE_SCIF0_TX,
                .addr           = 0xffe0000c,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x21,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF0_RX,
                .addr           = 0xffe00014,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x22,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF1_TX,
                .addr           = 0xffe1000c,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x25,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF1_RX,
                .addr           = 0xffe10014,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x26,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF2_TX,
                .addr           = 0xffe2000c,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x29,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF2_RX,
                .addr           = 0xffe20014,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x2a,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF3_TX,
                .addr           = 0xa4e30020,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x2d,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF3_RX,
                .addr           = 0xa4e30024,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x2e,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF4_TX,
                .addr           = 0xa4e40020,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x31,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF4_RX,
                .addr           = 0xa4e40024,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x32,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF5_TX,
                .addr           = 0xa4e50020,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x35,
        }, {
                .slave_id       = SHDMA_SLAVE_SCIF5_RX,
                .addr           = 0xa4e50024,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x36,
        }, {
                .slave_id       = SHDMA_SLAVE_USB0D0_TX,
                .addr           = 0xA4D80100,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
                .mid_rid        = 0x73,
        }, {
                .slave_id       = SHDMA_SLAVE_USB0D0_RX,
                .addr           = 0xA4D80100,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
                .mid_rid        = 0x73,
        }, {
                .slave_id       = SHDMA_SLAVE_USB0D1_TX,
                .addr           = 0xA4D80120,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
                .mid_rid        = 0x77,
        }, {
                .slave_id       = SHDMA_SLAVE_USB0D1_RX,
                .addr           = 0xA4D80120,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
                .mid_rid        = 0x77,
        }, {
                .slave_id       = SHDMA_SLAVE_USB1D0_TX,
                .addr           = 0xA4D90100,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
                .mid_rid        = 0xab,
        }, {
                .slave_id       = SHDMA_SLAVE_USB1D0_RX,
                .addr           = 0xA4D90100,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
                .mid_rid        = 0xab,
        }, {
                .slave_id       = SHDMA_SLAVE_USB1D1_TX,
                .addr           = 0xA4D90120,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
                .mid_rid        = 0xaf,
        }, {
                .slave_id       = SHDMA_SLAVE_USB1D1_RX,
                .addr           = 0xA4D90120,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
                .mid_rid        = 0xaf,
        }, {
                .slave_id       = SHDMA_SLAVE_SDHI0_TX,
                .addr           = 0x04ce0030,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_16BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_16BIT),
                .mid_rid        = 0xc1,
        }, {
                .slave_id       = SHDMA_SLAVE_SDHI0_RX,
                .addr           = 0x04ce0030,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_16BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_16BIT),
                .mid_rid        = 0xc2,
        }, {
                .slave_id       = SHDMA_SLAVE_SDHI1_TX,
                .addr           = 0x04cf0030,
-               .chcr           = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_16BIT),
+               .chcr           = DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_16BIT),
                .mid_rid        = 0xc9,
        }, {
                .slave_id       = SHDMA_SLAVE_SDHI1_RX,
                .addr           = 0x04cf0030,
-               .chcr           = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_16BIT),
+               .chcr           = DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_16BIT),
                .mid_rid        = 0xca,
        },
 };
index 7b24ec4b409aef03df85a72372ebec4d4d11757b..18bcd70cd813f12e5dbdb64f8cb9191f87a124cf 100644 (file)
@@ -123,28 +123,28 @@ static const struct sh_dmae_slave_config sh7757_dmae0_slaves[] = {
        {
                .slave_id       = SHDMA_SLAVE_SDHI_TX,
                .addr           = 0x1fe50030,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_16BIT),
                .mid_rid        = 0xc5,
        },
        {
                .slave_id       = SHDMA_SLAVE_SDHI_RX,
                .addr           = 0x1fe50030,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_16BIT),
                .mid_rid        = 0xc6,
        },
        {
                .slave_id       = SHDMA_SLAVE_MMCIF_TX,
                .addr           = 0x1fcb0034,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_32BIT),
                .mid_rid        = 0xd3,
        },
        {
                .slave_id       = SHDMA_SLAVE_MMCIF_RX,
                .addr           = 0x1fcb0034,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_32BIT),
                .mid_rid        = 0xd7,
        },
@@ -154,56 +154,56 @@ static const struct sh_dmae_slave_config sh7757_dmae1_slaves[] = {
        {
                .slave_id       = SHDMA_SLAVE_SCIF2_TX,
                .addr           = 0x1f4b000c,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x21,
        },
        {
                .slave_id       = SHDMA_SLAVE_SCIF2_RX,
                .addr           = 0x1f4b0014,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x22,
        },
        {
                .slave_id       = SHDMA_SLAVE_SCIF3_TX,
                .addr           = 0x1f4c000c,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x29,
        },
        {
                .slave_id       = SHDMA_SLAVE_SCIF3_RX,
                .addr           = 0x1f4c0014,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x2a,
        },
        {
                .slave_id       = SHDMA_SLAVE_SCIF4_TX,
                .addr           = 0x1f4d000c,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x41,
        },
        {
                .slave_id       = SHDMA_SLAVE_SCIF4_RX,
                .addr           = 0x1f4d0014,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x42,
        },
        {
                .slave_id       = SHDMA_SLAVE_RSPI_TX,
                .addr           = 0xfe480004,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_16BIT),
                .mid_rid        = 0xc1,
        },
        {
                .slave_id       = SHDMA_SLAVE_RSPI_RX,
                .addr           = 0xfe480004,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_16BIT),
                .mid_rid        = 0xc2,
        },
@@ -213,70 +213,70 @@ static const struct sh_dmae_slave_config sh7757_dmae2_slaves[] = {
        {
                .slave_id       = SHDMA_SLAVE_RIIC0_TX,
                .addr           = 0x1e500012,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x21,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC0_RX,
                .addr           = 0x1e500013,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x22,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC1_TX,
                .addr           = 0x1e510012,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x29,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC1_RX,
                .addr           = 0x1e510013,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x2a,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC2_TX,
                .addr           = 0x1e520012,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0xa1,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC2_RX,
                .addr           = 0x1e520013,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0xa2,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC3_TX,
                .addr           = 0x1e530012,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0xa9,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC3_RX,
                .addr           = 0x1e530013,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0xaf,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC4_TX,
                .addr           = 0x1e540012,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0xc5,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC4_RX,
                .addr           = 0x1e540013,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0xc6,
        },
@@ -286,70 +286,70 @@ static const struct sh_dmae_slave_config sh7757_dmae3_slaves[] = {
        {
                .slave_id       = SHDMA_SLAVE_RIIC5_TX,
                .addr           = 0x1e550012,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x21,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC5_RX,
                .addr           = 0x1e550013,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x22,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC6_TX,
                .addr           = 0x1e560012,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x29,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC6_RX,
                .addr           = 0x1e560013,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x2a,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC7_TX,
                .addr           = 0x1e570012,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x41,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC7_RX,
                .addr           = 0x1e570013,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x42,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC8_TX,
                .addr           = 0x1e580012,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x45,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC8_RX,
                .addr           = 0x1e580013,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x46,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC9_TX,
                .addr           = 0x1e590012,
-               .chcr           = SM_INC | 0x800 | 0x40000000 |
+               .chcr           = SM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x51,
        },
        {
                .slave_id       = SHDMA_SLAVE_RIIC9_RX,
                .addr           = 0x1e590013,
-               .chcr           = DM_INC | 0x800 | 0x40000000 |
+               .chcr           = DM_INC | RS_ERS | 0x40000000 |
                                  TS_INDEX2VAL(XMIT_SZ_8BIT),
                .mid_rid        = 0x52,
        },
index 42f2bca1d338c231c63c6f976b647336459b1435..886cab456e1b9ef3074eb2cfee386790a16bdda4 100644 (file)
 #define __NR_sched_setattr     343
 #define __NR_sched_getattr     344
 #define __NR_renameat2         345
+#define __NR_seccomp           346
+#define __NR_getrandom         347
 
-#define NR_syscalls            346
+#define NR_syscalls            348
 
 /* Bitmask values returned from kern_features system call.  */
 #define KERN_FEATURE_MIXED_MODE_STACK  0x00000001
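
Until libc grows wrappers, userspace can exercise the newly wired calls
through syscall(2); a minimal sketch using the numbers added above
(sparc only, error handling elided):

	#include <unistd.h>
	#include <sys/syscall.h>

	int main(void)
	{
		char buf[16];

		/* __NR_getrandom is 347 on sparc, per the hunk above */
		long n = syscall(347, buf, sizeof(buf), 0);

		return n == (long)sizeof(buf) ? 0 : 1;
	}
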
index 3370945569162dd43ef1252f47c83cf6ad7d87d6..5b1151dcba13b919cefde000a55949acb9a3ae6b 100644 (file)
@@ -130,7 +130,6 @@ static inline unsigned int get_nmi_count(int cpu)
 
 static __init void nmi_cpu_busy(void *data)
 {
-       local_irq_enable_in_hardirq();
        while (endflag == 0)
                mb();
 }
index 8efd33753ad33a6fcc7cee5cbe80c1f5080b38bd..d35c490a91cb2952e6fa047591a5760087c966fe 100644 (file)
@@ -1671,9 +1671,12 @@ static bool __init supported_pmu(void)
 
 static int __init init_hw_perf_events(void)
 {
+       int err;
+
        pr_info("Performance events: ");
 
-       if (!supported_pmu()) {
+       err = pcr_arch_init();
+       if (err || !supported_pmu()) {
                pr_cont("No support for PMU type '%s'\n", sparc_pmu_type);
                return 0;
        }
@@ -1685,7 +1688,7 @@ static int __init init_hw_perf_events(void)
 
        return 0;
 }
-early_initcall(init_hw_perf_events);
+pure_initcall(init_hw_perf_events);
 
 void perf_callchain_kernel(struct perf_callchain_entry *entry,
                           struct pt_regs *regs)
index 027e099861947655bc59583b0f2a715ac1fb10f2..0be7bf978cb1da03b8d657958ec6d0ec3c4e7d8b 100644 (file)
@@ -312,6 +312,9 @@ static void __global_pmu_self(int this_cpu)
        struct global_pmu_snapshot *pp;
        int i, num;
 
+       if (!pcr_ops)
+               return;
+
        pp = &global_cpu_snapshot[this_cpu].pmu;
 
        num = 1;
index 7958242d63c59203105bbfe14918b55bda1807f0..b3a5d81b20f0f7c4fcaf669d11c90f733b2db4f2 100644 (file)
@@ -68,7 +68,7 @@ void smp_store_cpu_info(int id)
        mid = cpu_get_hwmid(cpu_node);
 
        if (mid < 0) {
-               printk(KERN_NOTICE "No MID found for CPU%d at node 0x%08d", id, cpu_node);
+               printk(KERN_NOTICE "No MID found for CPU%d at node 0x%08x", id, cpu_node);
                mid = 0;
        }
        cpu_data(id).mid = mid;
index 41aa2478f3ca7951c1448393ffd9ef26929f19e1..f7ba87543e5ff092e723a14cf6df8403e343ac4c 100644 (file)
@@ -1383,7 +1383,6 @@ void __cpu_die(unsigned int cpu)
 
 void __init smp_cpus_done(unsigned int max_cpus)
 {
-       pcr_arch_init();
 }
 
 void smp_send_reschedule(int cpu)
index 85fe9b1087cdb3eae326a46337f5484ca6d9a636..217893e18d7832300f881a5fa97959a42e52d309 100644 (file)
@@ -86,4 +86,4 @@ sys_call_table:
 /*330*/        .long sys_fanotify_mark, sys_prlimit64, sys_name_to_handle_at, sys_open_by_handle_at, sys_clock_adjtime
 /*335*/        .long sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev
 /*340*/        .long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
-/*345*/        .long sys_renameat2
+/*345*/        .long sys_renameat2, sys_seccomp, sys_getrandom
index 33ecba2826ea20b65693d90649a791db91fd4fcd..d93b49d1b420ffc108258ed8ab322d1ab85ccbee 100644 (file)
@@ -87,7 +87,7 @@ sys_call_table32:
 /*330*/        .word compat_sys_fanotify_mark, sys_prlimit64, sys_name_to_handle_at, compat_sys_open_by_handle_at, compat_sys_clock_adjtime
        .word sys_syncfs, compat_sys_sendmmsg, sys_setns, compat_sys_process_vm_readv, compat_sys_process_vm_writev
 /*340*/        .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
-       .word sys32_renameat2
+       .word sys32_renameat2, sys_seccomp, sys_getrandom
 
 #endif /* CONFIG_COMPAT */
 
@@ -166,4 +166,4 @@ sys_call_table:
 /*330*/        .word sys_fanotify_mark, sys_prlimit64, sys_name_to_handle_at, sys_open_by_handle_at, sys_clock_adjtime
        .word sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev
 /*340*/        .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
-       .word sys_renameat2
+       .word sys_renameat2, sys_seccomp, sys_getrandom
index 4aafd322e21e273e902870f2cdcbcc6cd0937d36..5d0bf1aa9dcb6d68fd39f395dcbef9f89954cbc0 100644 (file)
@@ -434,6 +434,7 @@ config X86_INTEL_CE
        bool "CE4100 TV platform"
        depends on PCI
        depends on PCI_GODIRECT
+       depends on X86_IO_APIC
        depends on X86_32
        depends on X86_EXTENDED_PLATFORM
        select X86_REBOOTFIXUPS
@@ -840,6 +841,7 @@ config X86_IO_APIC
        def_bool y
        depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI
        select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+       select IRQ_DOMAIN
 
 config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
        bool "Reroute for broken boot IRQs"
@@ -1541,7 +1543,8 @@ config EFI
 
 config EFI_STUB
        bool "EFI stub support"
-       depends on EFI
+       depends on EFI && !X86_USE_3DNOW
+       select RELOCATABLE
        ---help---
           This kernel feature allows a bzImage to be loaded directly
           by EFI firmware without the use of a bootloader.
index 0a3f9c9f98d5ccf446980cfcf71b7055bb79ade8..473bdbee378a10ac2030b586dc33d3be74de5a11 100644 (file)
@@ -161,6 +161,20 @@ static inline int alternatives_text_reserved(void *start, void *end)
        asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)          \
                : : "i" (0), ## input)
 
+/*
+ * Like alternative_input, but with two features and their respective
+ * replacement instructions:
+ *
+ * If the CPU has feature2, newinstr2 is used.
+ * Otherwise, if the CPU has feature1, newinstr1 is used.
+ * Otherwise, oldinstr is used.
+ */
+#define alternative_input_2(oldinstr, newinstr1, feature1, newinstr2,       \
+                          feature2, input...)                               \
+       asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1,            \
+               newinstr2, feature2)                                         \
+               : : "i" (0), ## input)
+
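+/*
+ * Usage sketch, mirroring the xsave_state() call site introduced later
+ * in this diff:
+ *
+ *     alternative_input_2("1:"XSAVE,
+ *                         "1:"XSAVEOPT, X86_FEATURE_XSAVEOPT,
+ *                         "1:"XSAVES, X86_FEATURE_XSAVES,
+ *                         [fx] "D" (fx), "a" (lmask), "d" (hmask) :
+ *                         "memory");
+ */
+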
 /* Like alternative_input, but with a single output argument */
 #define alternative_io(oldinstr, newinstr, feature, output, input...)  \
        asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)          \
index 79752f2bdec57af3e467678bdbff5dec1b6d4346..465b309af25425dce160848ab8c32df14058ef35 100644 (file)
@@ -85,14 +85,6 @@ static inline bool apic_from_smp_config(void)
 #include <asm/paravirt.h>
 #endif
 
-#ifdef CONFIG_X86_64
-extern int is_vsmp_box(void);
-#else
-static inline int is_vsmp_box(void)
-{
-       return 0;
-}
-#endif
 extern int setup_profiling_timer(unsigned int);
 
 static inline void native_apic_mem_write(u32 reg, u32 v)
@@ -300,7 +292,6 @@ struct apic {
 
        int dest_logical;
        unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
-       unsigned long (*check_apicid_present)(int apicid);
 
        void (*vector_allocation_domain)(int cpu, struct cpumask *retmask,
                                         const struct cpumask *mask);
@@ -309,21 +300,11 @@ struct apic {
        void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
 
        void (*setup_apic_routing)(void);
-       int (*multi_timer_check)(int apic, int irq);
        int (*cpu_present_to_apicid)(int mps_cpu);
        void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
-       void (*setup_portio_remap)(void);
        int (*check_phys_apicid_present)(int phys_apicid);
-       void (*enable_apic_mode)(void);
        int (*phys_pkg_id)(int cpuid_apic, int index_msb);
 
-       /*
-        * When one of the next two hooks returns 1 the apic
-        * is switched to this. Essentially they are additional
-        * probe functions:
-        */
-       int (*mps_oem_check)(struct mpc_table *mpc, char *oem, char *productid);
-
        unsigned int (*get_apic_id)(unsigned long x);
        unsigned long (*set_apic_id)(unsigned int id);
        unsigned long apic_id_mask;
@@ -343,11 +324,7 @@ struct apic {
        /* wakeup_secondary_cpu */
        int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
 
-       int trampoline_phys_low;
-       int trampoline_phys_high;
-
        bool wait_for_init_deassert;
-       void (*smp_callin_clear_local_apic)(void);
        void (*inquire_remote_apic)(int apicid);
 
        /* apic ops */
@@ -378,14 +355,6 @@ struct apic {
         * won't be applied properly during early boot in this case.
         */
        int (*x86_32_early_logical_apicid)(int cpu);
-
-       /*
-        * Optional method called from setup_local_APIC() after logical
-        * apicid is guaranteed to be known to initialize apicid -> node
-        * mapping if NUMA initialization hasn't done so already.  Don't
-        * add new users.
-        */
-       int (*x86_32_numa_cpu_node)(int cpu);
 #endif
 };
 
@@ -496,14 +465,12 @@ static inline unsigned default_get_apic_id(unsigned long x)
 }
 
 /*
- * Warm reset vector default position:
+ * Warm reset vector position:
  */
-#define DEFAULT_TRAMPOLINE_PHYS_LOW            0x467
-#define DEFAULT_TRAMPOLINE_PHYS_HIGH           0x469
+#define TRAMPOLINE_PHYS_LOW            0x467
+#define TRAMPOLINE_PHYS_HIGH           0x469
 
 #ifdef CONFIG_X86_64
-extern int default_acpi_madt_oem_check(char *, char *);
-
 extern void apic_send_IPI_self(int vector);
 
 DECLARE_PER_CPU(int, x2apic_extra_bits);
@@ -552,6 +519,8 @@ static inline int default_apic_id_valid(int apicid)
        return (apicid < 255);
 }
 
+extern int default_acpi_madt_oem_check(char *, char *);
+
 extern void default_setup_apic_routing(void);
 
 extern struct apic apic_noop;
@@ -635,11 +604,6 @@ static inline unsigned long default_check_apicid_used(physid_mask_t *map, int ap
        return physid_isset(apicid, *map);
 }
 
-static inline unsigned long default_check_apicid_present(int bit)
-{
-       return physid_isset(bit, phys_cpu_present_map);
-}
-
 static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
 {
        *retmap = *phys_map;
index e3b85422cf127d38806a903ba6c2d0e6f664bf2f..412ececa00b957014a535e0a36d15ae108a88ffa 100644 (file)
@@ -508,9 +508,12 @@ static inline void user_fpu_begin(void)
 
 static inline void __save_fpu(struct task_struct *tsk)
 {
-       if (use_xsave())
-               xsave_state(&tsk->thread.fpu.state->xsave, -1);
-       else
+       if (use_xsave()) {
+               if (unlikely(system_state == SYSTEM_BOOTING))
+                       xsave_state_booting(&tsk->thread.fpu.state->xsave, -1);
+               else
+                       xsave_state(&tsk->thread.fpu.state->xsave, -1);
+       } else
                fpu_fxsave(&tsk->thread.fpu);
 }
 
index 230853da4ec09ea6590aed6d724e1fd13b9e356c..0f5fb6b6567e9c7e2856e86678189a82af0d853e 100644 (file)
@@ -40,9 +40,6 @@ typedef struct {
 
 DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
 
-/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
-#define MAX_HARDIRQS_PER_CPU NR_VECTORS
-
 #define __ARCH_IRQ_STAT
 
 #define inc_irq_stat(member)   this_cpu_inc(irq_stat.member)
index a20365953bf8a7727285029f3c6d84ba56f941a0..ccffa53750a89283feae5c1ea8b10f7e99cf57ea 100644 (file)
@@ -67,4 +67,9 @@ struct legacy_pic {
 extern struct legacy_pic *legacy_pic;
 extern struct legacy_pic null_legacy_pic;
 
+static inline int nr_legacy_irqs(void)
+{
+       return legacy_pic->nr_legacy_irqs;
+}
+
 #endif /* _ASM_X86_I8259_H */
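
The nr_legacy_irqs() helper added above replaces hard-coded uses of 16; a
typical caller, echoing the mp_config_acpi_legacy_irqs() hunk later in this
diff (setup_one_isa_irq() is a hypothetical stand-in for the per-IRQ work):

    int i;

    for (i = 0; i < nr_legacy_irqs(); i++)
            setup_one_isa_irq(i);   /* hypothetical per-IRQ setup */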
index 90f97b4b93476e319438ba93d2c72e2f8b1adc24..0aeed5ca356ec04bdbc4705fb6bbf5010cb65354 100644 (file)
@@ -98,6 +98,8 @@ struct IR_IO_APIC_route_entry {
 #define IOAPIC_AUTO     -1
 #define IOAPIC_EDGE     0
 #define IOAPIC_LEVEL    1
+#define        IOAPIC_MAP_ALLOC                0x1
+#define        IOAPIC_MAP_CHECK                0x2
 
 #ifdef CONFIG_X86_IO_APIC
 
@@ -118,9 +120,6 @@ extern int mp_irq_entries;
 /* MP IRQ source entries */
 extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
-/* non-0 if default (table-less) MP configuration */
-extern int mpc_default_type;
-
 /* Older SiS APIC requires we rewrite the index register */
 extern int sis_apic_bug;
 
@@ -133,9 +132,6 @@ extern int noioapicquirk;
 /* -1 if "noapic" boot option passed */
 extern int noioapicreroute;
 
-/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
-extern int timer_through_8259;
-
 /*
  * If we use the IO-APIC for IRQ routing, disable automatic
  * assignment of PCI IRQs.
@@ -145,14 +141,8 @@ extern int timer_through_8259;
 
 struct io_apic_irq_attr;
 struct irq_cfg;
-extern int io_apic_set_pci_routing(struct device *dev, int irq,
-                struct io_apic_irq_attr *irq_attr);
-void setup_IO_APIC_irq_extra(u32 gsi);
 extern void ioapic_insert_resources(void);
 
-extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *,
-                                    unsigned int, int,
-                                    struct io_apic_irq_attr *);
 extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *,
                                     unsigned int, int,
                                     struct io_apic_irq_attr *);
@@ -162,7 +152,6 @@ extern void native_compose_msi_msg(struct pci_dev *pdev,
                                   unsigned int irq, unsigned int dest,
                                   struct msi_msg *msg, u8 hpet_id);
 extern void native_eoi_ioapic_pin(int apic, int pin, int vector);
-int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr);
 
 extern int save_ioapic_entries(void);
 extern void mask_ioapic_entries(void);
@@ -171,15 +160,40 @@ extern int restore_ioapic_entries(void);
 extern void setup_ioapic_ids_from_mpc(void);
 extern void setup_ioapic_ids_from_mpc_nocheck(void);
 
+enum ioapic_domain_type {
+       IOAPIC_DOMAIN_INVALID,
+       IOAPIC_DOMAIN_LEGACY,
+       IOAPIC_DOMAIN_STRICT,
+       IOAPIC_DOMAIN_DYNAMIC,
+};
+
+struct device_node;
+struct irq_domain;
+struct irq_domain_ops;
+
+struct ioapic_domain_cfg {
+       enum ioapic_domain_type         type;
+       const struct irq_domain_ops     *ops;
+       struct device_node              *dev;
+};
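+
+/*
+ * Typical registration, as done by acpi_parse_ioapic() later in this
+ * diff:
+ *
+ *     struct ioapic_domain_cfg cfg = {
+ *             .type = IOAPIC_DOMAIN_DYNAMIC,
+ *             .ops  = &acpi_irqdomain_ops,
+ *     };
+ *
+ *     mp_register_ioapic(ioapic->id, ioapic->address,
+ *                        ioapic->global_irq_base, &cfg);
+ */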
+
 struct mp_ioapic_gsi {
        u32 gsi_base;
        u32 gsi_end;
 };
-extern struct mp_ioapic_gsi  mp_gsi_routing[];
 extern u32 gsi_top;
-int mp_find_ioapic(u32 gsi);
-int mp_find_ioapic_pin(int ioapic, u32 gsi);
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
+
+extern int mp_find_ioapic(u32 gsi);
+extern int mp_find_ioapic_pin(int ioapic, u32 gsi);
+extern u32 mp_pin_to_gsi(int ioapic, int pin);
+extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags);
+extern void mp_unmap_irq(int irq);
+extern void __init mp_register_ioapic(int id, u32 address, u32 gsi_base,
+                                     struct ioapic_domain_cfg *cfg);
+extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
+                           irq_hw_number_t hwirq);
+extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq);
+extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node);
 extern void __init pre_init_apic_IRQ0(void);
 
 extern void mp_save_irq(struct mpc_intsrc *m);
@@ -217,14 +231,12 @@ extern void io_apic_eoi(unsigned int apic, unsigned int vector);
 
 #define io_apic_assign_pci_irqs 0
 #define setup_ioapic_ids_from_mpc x86_init_noop
-static const int timer_through_8259 = 0;
 static inline void ioapic_insert_resources(void) { }
 #define gsi_top (NR_IRQS_LEGACY)
 static inline int mp_find_ioapic(u32 gsi) { return 0; }
-
-struct io_apic_irq_attr;
-static inline int io_apic_set_pci_routing(struct device *dev, int irq,
-                struct io_apic_irq_attr *irq_attr) { return 0; }
+static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; }
+static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; }
+static inline void mp_unmap_irq(int irq) { }
 
 static inline int save_ioapic_entries(void)
 {
index f5a6179567359359996d6bd878f3d85dc235b023..b07233b64578721668b6d5d06903c5b148bd169d 100644 (file)
@@ -40,8 +40,6 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES];
 extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
 
 extern unsigned int boot_cpu_physical_apicid;
-extern unsigned int max_physical_apicid;
-extern int mpc_default_type;
 extern unsigned long mp_lapic_addr;
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -88,15 +86,6 @@ static inline void early_reserve_e820_mpc_new(void) { }
 #endif
 
 int generic_processor_info(int apicid, int version);
-#ifdef CONFIG_ACPI
-extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
-extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
-                                  u32 gsi);
-extern void mp_config_acpi_legacy_irqs(void);
-struct device;
-extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
-                                int active_high_low);
-#endif /* CONFIG_ACPI */
 
 #define PHYSID_ARRAY_SIZE      BITS_TO_LONGS(MAX_LOCAL_APIC)
 
@@ -161,8 +150,4 @@ static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
 
 extern physid_mask_t phys_cpu_present_map;
 
-extern int generic_mps_oem_check(struct mpc_table *, char *, char *);
-
-extern int default_acpi_madt_oem_check(char *, char *);
-
 #endif /* _ASM_X86_MPSPEC_H */
index ee30b9f0b91c9d36f04bb4b13200a5d675e8a873..eb71ec794732b98f09531054c37ecc19c5d94f52 100644 (file)
@@ -385,8 +385,8 @@ struct bndcsr_struct {
 
 struct xsave_hdr_struct {
        u64 xstate_bv;
-       u64 reserved1[2];
-       u64 reserved2[5];
+       u64 xcomp_bv;
+       u64 reserved[6];
 } __attribute__((packed));
 
 struct xsave_struct {
index fbeb06ed0eaa7b1729e396fd6e45cd81782fb468..1d081ac1cd69d24970d4f16c87dab9171f8fde71 100644 (file)
 extern int of_ioapic;
 extern u64 initial_dtb;
 extern void add_dtb(u64 data);
-extern void x86_add_irq_domains(void);
 void x86_of_pci_init(void);
 void x86_dtb_init(void);
 #else
 static inline void add_dtb(u64 data) { }
-static inline void x86_add_irq_domains(void) { }
 static inline void x86_of_pci_init(void) { }
 static inline void x86_dtb_init(void) { }
 #define of_ioapic 0
index 49adfd7bb4a49e33c05da6e82a1378d23c29580c..0da7409f0beca5bbdf6e0d5b3faffa782d35cb4b 100644 (file)
@@ -17,11 +17,11 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
        spin_unlock_irqrestore(&rtc_lock, flags);
        local_flush_tlb();
        pr_debug("1.\n");
-       *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) =
-                                                                start_eip >> 4;
+       *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
+                                                       start_eip >> 4;
        pr_debug("2.\n");
-       *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_low)) =
-                                                        start_eip & 0xf;
+       *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
+                                                       start_eip & 0xf;
        pr_debug("3.\n");
 }
 
@@ -42,7 +42,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
        CMOS_WRITE(0, 0xf);
        spin_unlock_irqrestore(&rtc_lock, flags);
 
-       *((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0;
+       *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
 }
 
 static inline void __init smpboot_setup_io_apic(void)
index d949ef28c48bd9c423c5c668a1b0ce61ed70cac5..7e7a79ada6584fa4161e285f1f32181a29baa70e 100644 (file)
@@ -52,24 +52,170 @@ extern void xsave_init(void);
 extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
 extern int init_fpu(struct task_struct *child);
 
-static inline int fpu_xrstor_checking(struct xsave_struct *fx)
+/* These macros all use (%edi)/(%rdi) as the single memory argument. */
+#define XSAVE          ".byte " REX_PREFIX "0x0f,0xae,0x27"
+#define XSAVEOPT       ".byte " REX_PREFIX "0x0f,0xae,0x37"
+#define XSAVES         ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
+#define XRSTOR         ".byte " REX_PREFIX "0x0f,0xae,0x2f"
+#define XRSTORS                ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+
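+/*
+ * Shared fault handling for the xsave/xrstor wrappers below: on a
+ * fault at label 1:, the fixup code at 3: stores -1 in err and jumps
+ * back to the caller-supplied label 2:.
+ */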
+#define xstate_fault   ".section .fixup,\"ax\"\n"      \
+                       "3:  movl $-1,%[err]\n"         \
+                       "    jmp  2b\n"                 \
+                       ".previous\n"                   \
+                       _ASM_EXTABLE(1b, 3b)            \
+                       : [err] "=r" (err)
+
+/*
+ * This function is called only during boot time, when the x86 capability
+ * bits are not yet set up and alternatives cannot be used.
+ */
+static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask)
 {
-       int err;
+       u32 lmask = mask;
+       u32 hmask = mask >> 32;
+       int err = 0;
+
+       WARN_ON(system_state != SYSTEM_BOOTING);
+
+       if (boot_cpu_has(X86_FEATURE_XSAVES))
+               asm volatile("1:"XSAVES"\n\t"
+                       "2:\n\t"
+                       : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+                       :   "memory");
+       else
+               asm volatile("1:"XSAVE"\n\t"
+                       "2:\n\t"
+                       : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+                       :   "memory");
+
+       asm volatile(xstate_fault
+                    : "0" (0)
+                    : "memory");
+
+       return err;
+}
+
+/*
+ * This function is called only during boot time, when the x86 capability
+ * bits are not yet set up and alternatives cannot be used.
+ */
+static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask)
+{
+       u32 lmask = mask;
+       u32 hmask = mask >> 32;
+       int err = 0;
+
+       WARN_ON(system_state != SYSTEM_BOOTING);
 
-       asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
-                    "2:\n"
-                    ".section .fixup,\"ax\"\n"
-                    "3:  movl $-1,%[err]\n"
-                    "    jmp  2b\n"
-                    ".previous\n"
-                    _ASM_EXTABLE(1b, 3b)
-                    : [err] "=r" (err)
-                    : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0)
+       if (boot_cpu_has(X86_FEATURE_XSAVES))
+               asm volatile("1:"XRSTORS"\n\t"
+                       "2:\n\t"
+                       : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+                       :   "memory");
+       else
+               asm volatile("1:"XRSTOR"\n\t"
+                       "2:\n\t"
+                       : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+                       :   "memory");
+
+       asm volatile(xstate_fault
+                    : "0" (0)
+                    : "memory");
+
+       return err;
+}
+
+/*
+ * Save processor xstate to xsave area.
+ */
+static inline int xsave_state(struct xsave_struct *fx, u64 mask)
+{
+       u32 lmask = mask;
+       u32 hmask = mask >> 32;
+       int err = 0;
+
+       /*
+        * If xsaves is enabled, it replaces xsaveopt because it supports
+        * the compact format and supervisor states in addition to the
+        * modified optimization of xsaveopt.
+        *
+        * Otherwise, if xsaveopt is enabled, it replaces xsave because
+        * xsaveopt supports the modified optimization, which xsave
+        * does not.
+        *
+        * If neither xsaves nor xsaveopt is enabled, use plain xsave.
+        */
+       alternative_input_2(
+               "1:"XSAVE,
+               "1:"XSAVEOPT,
+               X86_FEATURE_XSAVEOPT,
+               "1:"XSAVES,
+               X86_FEATURE_XSAVES,
+               [fx] "D" (fx), "a" (lmask), "d" (hmask) :
+               "memory");
+       asm volatile("2:\n\t"
+                    xstate_fault
+                    : "0" (0)
                     : "memory");
 
        return err;
 }
 
+/*
+ * Restore processor xstate from xsave area.
+ */
+static inline int xrstor_state(struct xsave_struct *fx, u64 mask)
+{
+       int err = 0;
+       u32 lmask = mask;
+       u32 hmask = mask >> 32;
+
+       /*
+        * Use xrstors to restore the context if it is enabled; xrstors
+        * supports the compacted xsave area format, which xrstor does not.
+        */
+       alternative_input(
+               "1: " XRSTOR,
+               "1: " XRSTORS,
+               X86_FEATURE_XSAVES,
+               "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+               : "memory");
+
+       asm volatile("2:\n"
+                    xstate_fault
+                    : "0" (0)
+                    : "memory");
+
+       return err;
+}
+
+/*
+ * Save xstate context for old process during context switch.
+ */
+static inline void fpu_xsave(struct fpu *fpu)
+{
+       xsave_state(&fpu->state->xsave, -1);
+}
+
+/*
+ * Restore xstate context for new process during context switch.
+ */
+static inline int fpu_xrstor_checking(struct xsave_struct *fx)
+{
+       return xrstor_state(fx, -1);
+}
+
+/*
+ * Save xstate to the user space xsave area.
+ *
+ * We don't use the modified optimization because xrstor/xrstors might
+ * track a different application.
+ *
+ * We don't use the compacted format, for backward compatibility with
+ * old applications that don't understand it.
+ */
 static inline int xsave_user(struct xsave_struct __user *buf)
 {
        int err;
@@ -83,69 +229,34 @@ static inline int xsave_user(struct xsave_struct __user *buf)
                return -EFAULT;
 
        __asm__ __volatile__(ASM_STAC "\n"
-                            "1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
+                            "1:"XSAVE"\n"
                             "2: " ASM_CLAC "\n"
-                            ".section .fixup,\"ax\"\n"
-                            "3:  movl $-1,%[err]\n"
-                            "    jmp  2b\n"
-                            ".previous\n"
-                            _ASM_EXTABLE(1b,3b)
-                            : [err] "=r" (err)
+                            xstate_fault
                             : "D" (buf), "a" (-1), "d" (-1), "0" (0)
                             : "memory");
        return err;
 }
 
+/*
+ * Restore xstate from the user space xsave area.
+ */
 static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
 {
-       int err;
+       int err = 0;
        struct xsave_struct *xstate = ((__force struct xsave_struct *)buf);
        u32 lmask = mask;
        u32 hmask = mask >> 32;
 
        __asm__ __volatile__(ASM_STAC "\n"
-                            "1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n"
+                            "1:"XRSTOR"\n"
                             "2: " ASM_CLAC "\n"
-                            ".section .fixup,\"ax\"\n"
-                            "3:  movl $-1,%[err]\n"
-                            "    jmp  2b\n"
-                            ".previous\n"
-                            _ASM_EXTABLE(1b,3b)
-                            : [err] "=r" (err)
+                            xstate_fault
                             : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
                             : "memory");       /* memory required? */
        return err;
 }
 
-static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
-{
-       u32 lmask = mask;
-       u32 hmask = mask >> 32;
-
-       asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
-                    : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-                    :   "memory");
-}
-
-static inline void xsave_state(struct xsave_struct *fx, u64 mask)
-{
-       u32 lmask = mask;
-       u32 hmask = mask >> 32;
+void *get_xsave_addr(struct xsave_struct *xsave, int xstate);
+void setup_xstate_comp(void);
 
-       asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t"
-                    : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-                    :   "memory");
-}
-
-static inline void fpu_xsave(struct fpu *fpu)
-{
-       /* This, however, we can work around by forcing the compiler to select
-          an addressing mode that doesn't require extended registers. */
-       alternative_input(
-               ".byte " REX_PREFIX "0x0f,0xae,0x27",
-               ".byte " REX_PREFIX "0x0f,0xae,0x37",
-               X86_FEATURE_XSAVEOPT,
-               [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) :
-               "memory");
-}
 #endif
index a531f6564ed08e65d9e9e1ae4f86875caa846ef3..b436fc735aa455be7f007a7bcb089741ace98c1c 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/module.h>
 #include <linux/dmi.h>
 #include <linux/irq.h>
+#include <linux/irqdomain.h>
 #include <linux/slab.h>
 #include <linux/bootmem.h>
 #include <linux/ioport.h>
@@ -43,6 +44,7 @@
 #include <asm/io.h>
 #include <asm/mpspec.h>
 #include <asm/smp.h>
+#include <asm/i8259.h>
 
 #include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
 static int __initdata acpi_force = 0;
@@ -93,44 +95,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 };
 
-static unsigned int gsi_to_irq(unsigned int gsi)
-{
-       unsigned int irq = gsi + NR_IRQS_LEGACY;
-       unsigned int i;
-
-       for (i = 0; i < NR_IRQS_LEGACY; i++) {
-               if (isa_irq_to_gsi[i] == gsi) {
-                       return i;
-               }
-       }
-
-       /* Provide an identity mapping of gsi == irq
-        * except on truly weird platforms that have
-        * non isa irqs in the first 16 gsis.
-        */
-       if (gsi >= NR_IRQS_LEGACY)
-               irq = gsi;
-       else
-               irq = gsi_top + gsi;
-
-       return irq;
-}
-
-static u32 irq_to_gsi(int irq)
-{
-       unsigned int gsi;
-
-       if (irq < NR_IRQS_LEGACY)
-               gsi = isa_irq_to_gsi[irq];
-       else if (irq < gsi_top)
-               gsi = irq;
-       else if (irq < (gsi_top + NR_IRQS_LEGACY))
-               gsi = irq - gsi_top;
-       else
-               gsi = 0xffffffff;
-
-       return gsi;
-}
+#define        ACPI_INVALID_GSI                INT_MIN
 
 /*
  * This is just a simple wrapper around early_ioremap(),
@@ -341,11 +306,145 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
 #endif                         /*CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_IO_APIC
+#define MP_ISA_BUS             0
+
+static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
+                                         u32 gsi)
+{
+       int ioapic;
+       int pin;
+       struct mpc_intsrc mp_irq;
+
+       /*
+        * Convert 'gsi' to 'ioapic.pin'.
+        */
+       ioapic = mp_find_ioapic(gsi);
+       if (ioapic < 0)
+               return;
+       pin = mp_find_ioapic_pin(ioapic, gsi);
+
+       /*
+        * TBD: This check is for faulty timer entries, where the override
+        *      erroneously sets the trigger to level, resulting in a HUGE
+        *      increase of timer interrupts!
+        */
+       if ((bus_irq == 0) && (trigger == 3))
+               trigger = 1;
+
+       mp_irq.type = MP_INTSRC;
+       mp_irq.irqtype = mp_INT;
+       mp_irq.irqflag = (trigger << 2) | polarity;
+       mp_irq.srcbus = MP_ISA_BUS;
+       mp_irq.srcbusirq = bus_irq;     /* IRQ */
+       mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
+       mp_irq.dstirq = pin;    /* INTIN# */
+
+       mp_save_irq(&mp_irq);
+
+       /*
+        * Reset the default identity mapping if gsi is also a legacy IRQ;
+        * otherwise there will be more than one entry with the same GSI,
+        * and acpi_isa_irq_to_gsi() may return the wrong result.
+        */
+       if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi)
+               isa_irq_to_gsi[gsi] = ACPI_INVALID_GSI;
+       isa_irq_to_gsi[bus_irq] = gsi;
+}
+
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
+                       int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+       struct mpc_intsrc mp_irq;
+       struct pci_dev *pdev;
+       unsigned char number;
+       unsigned int devfn;
+       int ioapic;
+       u8 pin;
+
+       if (!acpi_ioapic)
+               return 0;
+       if (!dev || !dev_is_pci(dev))
+               return 0;
+
+       pdev = to_pci_dev(dev);
+       number = pdev->bus->number;
+       devfn = pdev->devfn;
+       pin = pdev->pin;
+       /* record the entry so it appears identically in the mptable */
+       mp_irq.type = MP_INTSRC;
+       mp_irq.irqtype = mp_INT;
+       mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+                               (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+       mp_irq.srcbus = number;
+       mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+       ioapic = mp_find_ioapic(gsi);
+       mp_irq.dstapic = mpc_ioapic_id(ioapic);
+       mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+       mp_save_irq(&mp_irq);
+#endif
+       return 0;
+}
+
+static int mp_register_gsi(struct device *dev, u32 gsi, int trigger,
+                          int polarity)
+{
+       int irq, node;
+
+       if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+               return gsi;
+
+       /* Don't set up the ACPI SCI because it's already set up */
+       if (acpi_gbl_FADT.sci_interrupt == gsi)
+               return gsi;
+
+       trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
+       polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
+       node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+       if (mp_set_gsi_attr(gsi, trigger, polarity, node)) {
+               pr_warn("Failed to set pin attr for GSI%d\n", gsi);
+               return -1;
+       }
+
+       irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC);
+       if (irq < 0)
+               return irq;
+
+       if (enable_update_mptable)
+               mp_config_acpi_gsi(dev, gsi, trigger, polarity);
+
+       return irq;
+}
+
+static void mp_unregister_gsi(u32 gsi)
+{
+       int irq;
+
+       if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+               return;
+
+       if (acpi_gbl_FADT.sci_interrupt == gsi)
+               return;
+
+       irq = mp_map_gsi_to_irq(gsi, 0);
+       if (irq > 0)
+               mp_unmap_irq(irq);
+}
+
+static struct irq_domain_ops acpi_irqdomain_ops = {
+       .map = mp_irqdomain_map,
+       .unmap = mp_irqdomain_unmap,
+};
 
 static int __init
 acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 {
        struct acpi_madt_io_apic *ioapic = NULL;
+       struct ioapic_domain_cfg cfg = {
+               .type = IOAPIC_DOMAIN_DYNAMIC,
+               .ops = &acpi_irqdomain_ops,
+       };
 
        ioapic = (struct acpi_madt_io_apic *)header;
 
@@ -354,8 +453,12 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 
        acpi_table_print_madt_entry(header);
 
-       mp_register_ioapic(ioapic->id,
-                          ioapic->address, ioapic->global_irq_base);
+       /* Statically assign IRQ numbers for IOAPICs hosting legacy IRQs */
+       if (ioapic->global_irq_base < nr_legacy_irqs())
+               cfg.type = IOAPIC_DOMAIN_LEGACY;
+
+       mp_register_ioapic(ioapic->id, ioapic->address, ioapic->global_irq_base,
+                          &cfg);
 
        return 0;
 }
@@ -378,11 +481,6 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger,
        if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
                polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
 
-       /*
-        * mp_config_acpi_legacy_irqs() already setup IRQs < 16
-        * If GSI is < 16, this will update its flags,
-        * else it will create a new mp_irqs[] entry.
-        */
        mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
 
        /*
@@ -504,25 +602,28 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
        outb(new >> 8, 0x4d1);
 }
 
-int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
+int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
 {
-       *irq = gsi_to_irq(gsi);
+       int irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK);
 
-#ifdef CONFIG_X86_IO_APIC
-       if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
-               setup_IO_APIC_irq_extra(gsi);
-#endif
+       if (irq >= 0) {
+               *irqp = irq;
+               return 0;
+       }
 
-       return 0;
+       return -1;
 }
 EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
 
 int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
 {
-       if (isa_irq >= 16)
-               return -1;
-       *gsi = irq_to_gsi(isa_irq);
-       return 0;
+       if (isa_irq < nr_legacy_irqs() &&
+           isa_irq_to_gsi[isa_irq] != ACPI_INVALID_GSI) {
+               *gsi = isa_irq_to_gsi[isa_irq];
+               return 0;
+       }
+
+       return -1;
 }
 
 static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
@@ -542,15 +643,25 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
 static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
                                    int trigger, int polarity)
 {
+       int irq = gsi;
+
 #ifdef CONFIG_X86_IO_APIC
-       gsi = mp_register_gsi(dev, gsi, trigger, polarity);
+       irq = mp_register_gsi(dev, gsi, trigger, polarity);
 #endif
 
-       return gsi;
+       return irq;
+}
+
+static void acpi_unregister_gsi_ioapic(u32 gsi)
+{
+#ifdef CONFIG_X86_IO_APIC
+       mp_unregister_gsi(gsi);
+#endif
 }
 
 int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
                           int trigger, int polarity) = acpi_register_gsi_pic;
+void (*__acpi_unregister_gsi)(u32 gsi) = NULL;
 
 #ifdef CONFIG_ACPI_SLEEP
 int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel;
@@ -564,32 +675,22 @@ int (*acpi_suspend_lowlevel)(void);
  */
 int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 {
-       unsigned int irq;
-       unsigned int plat_gsi = gsi;
-
-       plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity);
-       irq = gsi_to_irq(plat_gsi);
-
-       return irq;
+       return __acpi_register_gsi(dev, gsi, trigger, polarity);
 }
 EXPORT_SYMBOL_GPL(acpi_register_gsi);
 
 void acpi_unregister_gsi(u32 gsi)
 {
+       if (__acpi_unregister_gsi)
+               __acpi_unregister_gsi(gsi);
 }
 EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
 
-void __init acpi_set_irq_model_pic(void)
-{
-       acpi_irq_model = ACPI_IRQ_MODEL_PIC;
-       __acpi_register_gsi = acpi_register_gsi_pic;
-       acpi_ioapic = 0;
-}
-
-void __init acpi_set_irq_model_ioapic(void)
+static void __init acpi_set_irq_model_ioapic(void)
 {
        acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
        __acpi_register_gsi = acpi_register_gsi_ioapic;
+       __acpi_unregister_gsi = acpi_unregister_gsi_ioapic;
        acpi_ioapic = 1;
 }
 
@@ -825,9 +926,8 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
          * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
         */
 
-       count =
-           acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
-                                 acpi_parse_lapic_addr_ovr, 0);
+       count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
+                                     acpi_parse_lapic_addr_ovr, 0);
        if (count < 0) {
                printk(KERN_ERR PREFIX
                       "Error parsing LAPIC address override entry\n");
@@ -852,9 +952,8 @@ static int __init acpi_parse_madt_lapic_entries(void)
          * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
         */
 
-       count =
-           acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
-                                 acpi_parse_lapic_addr_ovr, 0);
+       count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
+                                     acpi_parse_lapic_addr_ovr, 0);
        if (count < 0) {
                printk(KERN_ERR PREFIX
                       "Error parsing LAPIC address override entry\n");
@@ -882,11 +981,10 @@ static int __init acpi_parse_madt_lapic_entries(void)
                return count;
        }
 
-       x2count =
-           acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
-                                 acpi_parse_x2apic_nmi, 0);
-       count =
-           acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0);
+       x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
+                                       acpi_parse_x2apic_nmi, 0);
+       count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI,
+                                     acpi_parse_lapic_nmi, 0);
        if (count < 0 || x2count < 0) {
                printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
                /* TBD: Cleanup to allow fallback to MPS */
@@ -897,44 +995,7 @@ static int __init acpi_parse_madt_lapic_entries(void)
 #endif                         /* CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_IO_APIC
-#define MP_ISA_BUS             0
-
-void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
-       int ioapic;
-       int pin;
-       struct mpc_intsrc mp_irq;
-
-       /*
-        * Convert 'gsi' to 'ioapic.pin'.
-        */
-       ioapic = mp_find_ioapic(gsi);
-       if (ioapic < 0)
-               return;
-       pin = mp_find_ioapic_pin(ioapic, gsi);
-
-       /*
-        * TBD: This check is for faulty timer entries, where the override
-        *      erroneously sets the trigger to level, resulting in a HUGE
-        *      increase of timer interrupts!
-        */
-       if ((bus_irq == 0) && (trigger == 3))
-               trigger = 1;
-
-       mp_irq.type = MP_INTSRC;
-       mp_irq.irqtype = mp_INT;
-       mp_irq.irqflag = (trigger << 2) | polarity;
-       mp_irq.srcbus = MP_ISA_BUS;
-       mp_irq.srcbusirq = bus_irq;     /* IRQ */
-       mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
-       mp_irq.dstirq = pin;    /* INTIN# */
-
-       mp_save_irq(&mp_irq);
-
-       isa_irq_to_gsi[bus_irq] = gsi;
-}
-
-void __init mp_config_acpi_legacy_irqs(void)
+static void __init mp_config_acpi_legacy_irqs(void)
 {
        int i;
        struct mpc_intsrc mp_irq;
@@ -952,7 +1013,7 @@ void __init mp_config_acpi_legacy_irqs(void)
         * Use the default configuration for the IRQs 0-15.  Unless
         * overridden by (MADT) interrupt source override entries.
         */
-       for (i = 0; i < 16; i++) {
+       for (i = 0; i < nr_legacy_irqs(); i++) {
                int ioapic, pin;
                unsigned int dstapic;
                int idx;
@@ -1000,84 +1061,6 @@ void __init mp_config_acpi_legacy_irqs(void)
        }
 }
 
-static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
-                       int polarity)
-{
-#ifdef CONFIG_X86_MPPARSE
-       struct mpc_intsrc mp_irq;
-       struct pci_dev *pdev;
-       unsigned char number;
-       unsigned int devfn;
-       int ioapic;
-       u8 pin;
-
-       if (!acpi_ioapic)
-               return 0;
-       if (!dev || !dev_is_pci(dev))
-               return 0;
-
-       pdev = to_pci_dev(dev);
-       number = pdev->bus->number;
-       devfn = pdev->devfn;
-       pin = pdev->pin;
-       /* print the entry should happen on mptable identically */
-       mp_irq.type = MP_INTSRC;
-       mp_irq.irqtype = mp_INT;
-       mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
-                               (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
-       mp_irq.srcbus = number;
-       mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
-       ioapic = mp_find_ioapic(gsi);
-       mp_irq.dstapic = mpc_ioapic_id(ioapic);
-       mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
-
-       mp_save_irq(&mp_irq);
-#endif
-       return 0;
-}
-
-int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
-{
-       int ioapic;
-       int ioapic_pin;
-       struct io_apic_irq_attr irq_attr;
-       int ret;
-
-       if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
-               return gsi;
-
-       /* Don't set up the ACPI SCI because it's already set up */
-       if (acpi_gbl_FADT.sci_interrupt == gsi)
-               return gsi;
-
-       ioapic = mp_find_ioapic(gsi);
-       if (ioapic < 0) {
-               printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
-               return gsi;
-       }
-
-       ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
-
-       if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
-               printk(KERN_ERR "Invalid reference to IOAPIC pin "
-                      "%d-%d\n", mpc_ioapic_id(ioapic),
-                      ioapic_pin);
-               return gsi;
-       }
-
-       if (enable_update_mptable)
-               mp_config_acpi_gsi(dev, gsi, trigger, polarity);
-
-       set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
-                            trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
-                            polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-       ret = io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
-       if (ret < 0)
-               gsi = INT_MIN;
-
-       return gsi;
-}
-
 /*
  * Parse IOAPIC related entries in MADT
  * returns 0 on success, < 0 on error
@@ -1107,9 +1090,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
                return -ENODEV;
        }
 
-       count =
-           acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
-                                 MAX_IO_APICS);
+       count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
+                                     MAX_IO_APICS);
        if (!count) {
                printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
                return -ENODEV;
@@ -1118,9 +1100,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
                return count;
        }
 
-       count =
-           acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
-                                 nr_irqs);
+       count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
+                                     acpi_parse_int_src_ovr, nr_irqs);
        if (count < 0) {
                printk(KERN_ERR PREFIX
                       "Error parsing interrupt source overrides entry\n");
@@ -1139,9 +1120,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
        /* Fill in identity legacy mappings where no override */
        mp_config_acpi_legacy_irqs();
 
-       count =
-           acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
-                                 nr_irqs);
+       count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE,
+                                     acpi_parse_nmi_src, nr_irqs);
        if (count < 0) {
                printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
                /* TBD: Cleanup to allow fallback to MPS */
index ad28db7e6bdea3594a4bfad1d70d0c975ded7021..67760275544b4f548f98c5b18c098d481637995d 100644 (file)
@@ -67,7 +67,7 @@ EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
 /*
  * The highest APIC ID seen during enumeration.
  */
-unsigned int max_physical_apicid;
+static unsigned int max_physical_apicid;
 
 /*
  * Bitmask of physically existing CPUs:
@@ -1342,17 +1342,6 @@ void setup_local_APIC(void)
        /* always use the value from LDR */
        early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
                logical_smp_processor_id();
-
-       /*
-        * Some NUMA implementations (NUMAQ) don't initialize apicid to
-        * node mapping during NUMA init.  Now that logical apicid is
-        * guaranteed to be known, give it another chance.  This is already
-        * a bit too late - percpu allocation has already happened without
-        * proper NUMA affinity.
-        */
-       if (apic->x86_32_numa_cpu_node)
-               set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu),
-                                  apic->x86_32_numa_cpu_node(cpu));
 #endif
 
        /*
@@ -2053,8 +2042,6 @@ void __init connect_bsp_APIC(void)
                imcr_pic_to_apic();
        }
 #endif
-       if (apic->enable_apic_mode)
-               apic->enable_apic_mode();
 }
 
 /**
@@ -2451,51 +2438,6 @@ static void apic_pm_activate(void) { }
 
 #ifdef CONFIG_X86_64
 
-static int apic_cluster_num(void)
-{
-       int i, clusters, zeros;
-       unsigned id;
-       u16 *bios_cpu_apicid;
-       DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
-
-       bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
-       bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
-
-       for (i = 0; i < nr_cpu_ids; i++) {
-               /* are we being called early in kernel startup? */
-               if (bios_cpu_apicid) {
-                       id = bios_cpu_apicid[i];
-               } else if (i < nr_cpu_ids) {
-                       if (cpu_present(i))
-                               id = per_cpu(x86_bios_cpu_apicid, i);
-                       else
-                               continue;
-               } else
-                       break;
-
-               if (id != BAD_APICID)
-                       __set_bit(APIC_CLUSTERID(id), clustermap);
-       }
-
-       /* Problem:  Partially populated chassis may not have CPUs in some of
-        * the APIC clusters they have been allocated.  Only present CPUs have
-        * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
-        * Since clusters are allocated sequentially, count zeros only if
-        * they are bounded by ones.
-        */
-       clusters = 0;
-       zeros = 0;
-       for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
-               if (test_bit(i, clustermap)) {
-                       clusters += 1 + zeros;
-                       zeros = 0;
-               } else
-                       ++zeros;
-       }
-
-       return clusters;
-}
-
 static int multi_checked;
 static int multi;
 
@@ -2540,20 +2482,7 @@ static void dmi_check_multi(void)
 int apic_is_clustered_box(void)
 {
        dmi_check_multi();
-       if (multi)
-               return 1;
-
-       if (!is_vsmp_box())
-               return 0;
-
-       /*
-        * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
-        * not guaranteed to be synced between boards
-        */
-       if (apic_cluster_num() > 1)
-               return 1;
-
-       return 0;
+       return multi;
 }
 #endif
 
index 7c1b29479513a1d78f79410915a4ee30e5d1c53e..de918c410eaed69863e9b95d26566c638d388c45 100644 (file)
@@ -168,21 +168,16 @@ static struct apic apic_flat =  {
        .disable_esr                    = 0,
        .dest_logical                   = APIC_DEST_LOGICAL,
        .check_apicid_used              = NULL,
-       .check_apicid_present           = NULL,
 
        .vector_allocation_domain       = flat_vector_allocation_domain,
        .init_apic_ldr                  = flat_init_apic_ldr,
 
        .ioapic_phys_id_map             = NULL,
        .setup_apic_routing             = NULL,
-       .multi_timer_check              = NULL,
        .cpu_present_to_apicid          = default_cpu_present_to_apicid,
        .apicid_to_cpu_present          = NULL,
-       .setup_portio_remap             = NULL,
        .check_phys_apicid_present      = default_check_phys_apicid_present,
-       .enable_apic_mode               = NULL,
        .phys_pkg_id                    = flat_phys_pkg_id,
-       .mps_oem_check                  = NULL,
 
        .get_apic_id                    = flat_get_apic_id,
        .set_apic_id                    = set_apic_id,
@@ -196,10 +191,7 @@ static struct apic apic_flat =  {
        .send_IPI_all                   = flat_send_IPI_all,
        .send_IPI_self                  = apic_send_IPI_self,
 
-       .trampoline_phys_low            = DEFAULT_TRAMPOLINE_PHYS_LOW,
-       .trampoline_phys_high           = DEFAULT_TRAMPOLINE_PHYS_HIGH,
        .wait_for_init_deassert         = false,
-       .smp_callin_clear_local_apic    = NULL,
        .inquire_remote_apic            = default_inquire_remote_apic,
 
        .read                           = native_apic_mem_read,
@@ -283,7 +275,6 @@ static struct apic apic_physflat =  {
        .disable_esr                    = 0,
        .dest_logical                   = 0,
        .check_apicid_used              = NULL,
-       .check_apicid_present           = NULL,
 
        .vector_allocation_domain       = default_vector_allocation_domain,
        /* not needed, but shouldn't hurt: */
@@ -291,14 +282,10 @@ static struct apic apic_physflat =  {
 
        .ioapic_phys_id_map             = NULL,
        .setup_apic_routing             = NULL,
-       .multi_timer_check              = NULL,
        .cpu_present_to_apicid          = default_cpu_present_to_apicid,
        .apicid_to_cpu_present          = NULL,
-       .setup_portio_remap             = NULL,
        .check_phys_apicid_present      = default_check_phys_apicid_present,
-       .enable_apic_mode               = NULL,
        .phys_pkg_id                    = flat_phys_pkg_id,
-       .mps_oem_check                  = NULL,
 
        .get_apic_id                    = flat_get_apic_id,
        .set_apic_id                    = set_apic_id,
@@ -312,10 +299,7 @@ static struct apic apic_physflat =  {
        .send_IPI_all                   = physflat_send_IPI_all,
        .send_IPI_self                  = apic_send_IPI_self,
 
-       .trampoline_phys_low            = DEFAULT_TRAMPOLINE_PHYS_LOW,
-       .trampoline_phys_high           = DEFAULT_TRAMPOLINE_PHYS_HIGH,
        .wait_for_init_deassert         = false,
-       .smp_callin_clear_local_apic    = NULL,
        .inquire_remote_apic            = default_inquire_remote_apic,
 
        .read                           = native_apic_mem_read,
index 8c7c98249c205f0f596e7c0866e89444dc3d4bc8..b205cdbdbe6a522e51bedc05f2bf86c39c141f66 100644 (file)
@@ -89,16 +89,6 @@ static const struct cpumask *noop_target_cpus(void)
        return cpumask_of(0);
 }
 
-static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
-{
-       return physid_isset(apicid, *map);
-}
-
-static unsigned long noop_check_apicid_present(int bit)
-{
-       return physid_isset(bit, phys_cpu_present_map);
-}
-
 static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask,
                                          const struct cpumask *mask)
 {
@@ -133,27 +123,21 @@ struct apic apic_noop = {
        .target_cpus                    = noop_target_cpus,
        .disable_esr                    = 0,
        .dest_logical                   = APIC_DEST_LOGICAL,
-       .check_apicid_used              = noop_check_apicid_used,
-       .check_apicid_present           = noop_check_apicid_present,
+       .check_apicid_used              = default_check_apicid_used,
 
        .vector_allocation_domain       = noop_vector_allocation_domain,
        .init_apic_ldr                  = noop_init_apic_ldr,
 
        .ioapic_phys_id_map             = default_ioapic_phys_id_map,
        .setup_apic_routing             = NULL,
-       .multi_timer_check              = NULL,
 
        .cpu_present_to_apicid          = default_cpu_present_to_apicid,
        .apicid_to_cpu_present          = physid_set_mask_of_physid,
 
-       .setup_portio_remap             = NULL,
        .check_phys_apicid_present      = default_check_phys_apicid_present,
-       .enable_apic_mode               = NULL,
 
        .phys_pkg_id                    = noop_phys_pkg_id,
 
-       .mps_oem_check                  = NULL,
-
        .get_apic_id                    = noop_get_apic_id,
        .set_apic_id                    = NULL,
        .apic_id_mask                   = 0x0F << 24,
@@ -168,12 +152,7 @@ struct apic apic_noop = {
 
        .wakeup_secondary_cpu           = noop_wakeup_secondary_cpu,
 
-       /* should be safe */
-       .trampoline_phys_low            = DEFAULT_TRAMPOLINE_PHYS_LOW,
-       .trampoline_phys_high           = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
        .wait_for_init_deassert         = false,
-       .smp_callin_clear_local_apic    = NULL,
        .inquire_remote_apic            = NULL,
 
        .read                           = noop_apic_read,
index a5b45df8bc881cafbc2560f3b7ea72ef02aa7c73..ae915391ebecdbe9d01abb63e5ab787f3c484381 100644 (file)
@@ -217,21 +217,16 @@ static const struct apic apic_numachip __refconst = {
        .disable_esr                    = 0,
        .dest_logical                   = 0,
        .check_apicid_used              = NULL,
-       .check_apicid_present           = NULL,
 
        .vector_allocation_domain       = default_vector_allocation_domain,
        .init_apic_ldr                  = flat_init_apic_ldr,
 
        .ioapic_phys_id_map             = NULL,
        .setup_apic_routing             = NULL,
-       .multi_timer_check              = NULL,
        .cpu_present_to_apicid          = default_cpu_present_to_apicid,
        .apicid_to_cpu_present          = NULL,
-       .setup_portio_remap             = NULL,
        .check_phys_apicid_present      = default_check_phys_apicid_present,
-       .enable_apic_mode               = NULL,
        .phys_pkg_id                    = numachip_phys_pkg_id,
-       .mps_oem_check                  = NULL,
 
        .get_apic_id                    = get_apic_id,
        .set_apic_id                    = set_apic_id,
@@ -246,10 +241,7 @@ static const struct apic apic_numachip __refconst = {
        .send_IPI_self                  = numachip_send_IPI_self,
 
        .wakeup_secondary_cpu           = numachip_wakeup_secondary,
-       .trampoline_phys_low            = DEFAULT_TRAMPOLINE_PHYS_LOW,
-       .trampoline_phys_high           = DEFAULT_TRAMPOLINE_PHYS_HIGH,
        .wait_for_init_deassert         = false,
-       .smp_callin_clear_local_apic    = NULL,
        .inquire_remote_apic            = NULL, /* REMRD not supported */
 
        .read                           = native_apic_mem_read,
index e4840aa7a255b63db235ab5cecd07ecabf5c5013..c4a8d63f8220cf09880d7b3e8a6ad3f42e7a6a25 100644 (file)
@@ -31,11 +31,6 @@ static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
        return 0;
 }
 
-static unsigned long bigsmp_check_apicid_present(int bit)
-{
-       return 1;
-}
-
 static int bigsmp_early_logical_apicid(int cpu)
 {
        /* on bigsmp, logical apicid is the same as physical */
@@ -168,21 +163,16 @@ static struct apic apic_bigsmp = {
        .disable_esr                    = 1,
        .dest_logical                   = 0,
        .check_apicid_used              = bigsmp_check_apicid_used,
-       .check_apicid_present           = bigsmp_check_apicid_present,
 
        .vector_allocation_domain       = default_vector_allocation_domain,
        .init_apic_ldr                  = bigsmp_init_apic_ldr,
 
        .ioapic_phys_id_map             = bigsmp_ioapic_phys_id_map,
        .setup_apic_routing             = bigsmp_setup_apic_routing,
-       .multi_timer_check              = NULL,
        .cpu_present_to_apicid          = bigsmp_cpu_present_to_apicid,
        .apicid_to_cpu_present          = physid_set_mask_of_physid,
-       .setup_portio_remap             = NULL,
        .check_phys_apicid_present      = bigsmp_check_phys_apicid_present,
-       .enable_apic_mode               = NULL,
        .phys_pkg_id                    = bigsmp_phys_pkg_id,
-       .mps_oem_check                  = NULL,
 
        .get_apic_id                    = bigsmp_get_apic_id,
        .set_apic_id                    = NULL,
@@ -196,11 +186,7 @@ static struct apic apic_bigsmp = {
        .send_IPI_all                   = bigsmp_send_IPI_all,
        .send_IPI_self                  = default_send_IPI_self,
 
-       .trampoline_phys_low            = DEFAULT_TRAMPOLINE_PHYS_LOW,
-       .trampoline_phys_high           = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
        .wait_for_init_deassert         = true,
-       .smp_callin_clear_local_apic    = NULL,
        .inquire_remote_apic            = default_inquire_remote_apic,
 
        .read                           = native_apic_mem_read,
index 81e08eff05eedbd2e839a14229464cbea456e84c..29290f554e7963fc104cd385921692a6bbc41470 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/syscore_ops.h>
+#include <linux/irqdomain.h>
 #include <linux/msi.h>
 #include <linux/htirq.h>
 #include <linux/freezer.h>
 
 #define __apicdebuginit(type) static type __init
 
+#define        for_each_ioapic(idx)            \
+       for ((idx) = 0; (idx) < nr_ioapics; (idx)++)
+#define        for_each_ioapic_reverse(idx)    \
+       for ((idx) = nr_ioapics - 1; (idx) >= 0; (idx)--)
+#define        for_each_pin(idx, pin)          \
+       for ((pin) = 0; (pin) < ioapics[(idx)].nr_registers; (pin)++)
+#define        for_each_ioapic_pin(idx, pin)   \
+       for_each_ioapic((idx))          \
+               for_each_pin((idx), (pin))
+
 #define for_each_irq_pin(entry, head) \
        for (entry = head; entry; entry = entry->next)
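
The for_each_ioapic()/for_each_pin() helpers above compose, so
for_each_ioapic_pin() lets a single statement visit every pin of every
registered IOAPIC. A minimal standalone sketch of the pattern (the
two-entry ioapics[] table below is made up for illustration, not the
kernel's):

    #include <stdio.h>

    #define MAX_IO_APICS 4

    /* Illustrative stand-ins for the kernel globals. */
    static int nr_ioapics = 2;
    static struct { int nr_registers; } ioapics[MAX_IO_APICS] = {
            { .nr_registers = 24 }, { .nr_registers = 16 },
    };

    #define for_each_ioapic(idx) \
            for ((idx) = 0; (idx) < nr_ioapics; (idx)++)
    #define for_each_pin(idx, pin) \
            for ((pin) = 0; (pin) < ioapics[(idx)].nr_registers; (pin)++)
    #define for_each_ioapic_pin(idx, pin) \
            for_each_ioapic((idx)) \
                    for_each_pin((idx), (pin))

    int main(void)
    {
            int apic, pin, count = 0;

            /* Visits 24 + 16 = 40 pins across both fake IOAPICs. */
            for_each_ioapic_pin(apic, pin)
                    count++;
            printf("visited %d pins\n", count);
            return 0;
    }
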
 
@@ -73,6 +84,17 @@ int sis_apic_bug = -1;
 
 static DEFINE_RAW_SPINLOCK(ioapic_lock);
 static DEFINE_RAW_SPINLOCK(vector_lock);
+static DEFINE_MUTEX(ioapic_mutex);
+static unsigned int ioapic_dynirq_base;
+static int ioapic_initialized;
+
+struct mp_pin_info {
+       int trigger;
+       int polarity;
+       int node;
+       int set;
+       u32 count;
+};
 
 static struct ioapic {
        /*
@@ -87,7 +109,9 @@ static struct ioapic {
        struct mpc_ioapic mp_config;
        /* IO APIC gsi routing info */
        struct mp_ioapic_gsi  gsi_config;
-       DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+       struct ioapic_domain_cfg irqdomain_cfg;
+       struct irq_domain *irqdomain;
+       struct mp_pin_info *pin_info;
 } ioapics[MAX_IO_APICS];
 
 #define mpc_ioapic_ver(ioapic_idx)     ioapics[ioapic_idx].mp_config.apicver
@@ -107,6 +131,41 @@ struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
        return &ioapics[ioapic_idx].gsi_config;
 }
 
+static inline int mp_ioapic_pin_count(int ioapic)
+{
+       struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+
+       return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
+}
+
+u32 mp_pin_to_gsi(int ioapic, int pin)
+{
+       return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin;
+}
+
+/*
+ * Initialize all legacy IRQs and all pins on the first IOAPIC
+ * if we have a legacy interrupt controller. The kernel boot option
+ * "pirq=" may rely on non-legacy pins on the first IOAPIC.
+ */
+static inline int mp_init_irq_at_boot(int ioapic, int irq)
+{
+       if (!nr_legacy_irqs())
+               return 0;
+
+       return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs());
+}
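
mp_init_irq_at_boot() encodes the policy from the comment above: with
no legacy PIC nothing is pre-initialized; otherwise every pin of IOAPIC
0 and every legacy IRQ number qualifies. A standalone sketch of the
predicate with assumed values (16 legacy IRQs, PC-style):

    #include <stdio.h>

    /* Assumed: a PC-style setup with 16 legacy i8259 IRQs. */
    static int nr_legacy_irqs(void) { return 16; }

    static int mp_init_irq_at_boot(int ioapic, int irq)
    {
            if (!nr_legacy_irqs())
                    return 0;
            return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs());
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   mp_init_irq_at_boot(0, 20),  /* 1: any pin on IOAPIC 0 */
                   mp_init_irq_at_boot(1, 5),   /* 1: legacy IRQ number   */
                   mp_init_irq_at_boot(1, 20)); /* 0: deferred until used */
            return 0;
    }
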
+
+static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin)
+{
+       return ioapics[ioapic_idx].pin_info + pin;
+}
+
+static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
+{
+       return ioapics[ioapic].irqdomain;
+}
+
 int nr_ioapics;
 
 /* The one past the highest gsi number used */
@@ -118,9 +177,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 /* # of MP IRQ source entries */
 int mp_irq_entries;
 
-/* GSI interrupts */
-static int nr_irqs_gsi = NR_IRQS_LEGACY;
-
 #ifdef CONFIG_EISA
 int mp_bus_id_to_type[MAX_MP_BUSSES];
 #endif
@@ -149,8 +205,7 @@ static int __init parse_noapic(char *str)
 }
 early_param("noapic", parse_noapic);
 
-static int io_apic_setup_irq_pin(unsigned int irq, int node,
-                                struct io_apic_irq_attr *attr);
+static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node);
 
 /* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
 void mp_save_irq(struct mpc_intsrc *m)
@@ -182,19 +237,15 @@ static struct irq_pin_list *alloc_irq_pin_list(int node)
        return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
 }
 
-
-/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
-
 int __init arch_early_irq_init(void)
 {
        struct irq_cfg *cfg;
-       int count, node, i;
+       int i, node = cpu_to_node(0);
 
-       if (!legacy_pic->nr_legacy_irqs)
+       if (!nr_legacy_irqs())
                io_apic_irqs = ~0UL;
 
-       for (i = 0; i < nr_ioapics; i++) {
+       for_each_ioapic(i) {
                ioapics[i].saved_registers =
                        kzalloc(sizeof(struct IO_APIC_route_entry) *
                                ioapics[i].nr_registers, GFP_KERNEL);
@@ -202,28 +253,20 @@ int __init arch_early_irq_init(void)
                        pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
        }
 
-       cfg = irq_cfgx;
-       count = ARRAY_SIZE(irq_cfgx);
-       node = cpu_to_node(0);
-
-       for (i = 0; i < count; i++) {
-               irq_set_chip_data(i, &cfg[i]);
-               zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
-               zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
-               /*
-                * For legacy IRQ's, start with assigning irq0 to irq15 to
-                * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's.
-                */
-               if (i < legacy_pic->nr_legacy_irqs) {
-                       cfg[i].vector = IRQ0_VECTOR + i;
-                       cpumask_setall(cfg[i].domain);
-               }
+       /*
+        * For legacy IRQs, start by assigning irq0 through irq15 to
+        * IRQ0_VECTOR through IRQ15_VECTOR for all CPUs.
+        */
+       for (i = 0; i < nr_legacy_irqs(); i++) {
+               cfg = alloc_irq_and_cfg_at(i, node);
+               cfg->vector = IRQ0_VECTOR + i;
+               cpumask_setall(cfg->domain);
        }
 
        return 0;
 }
 
-static struct irq_cfg *irq_cfg(unsigned int irq)
+static inline struct irq_cfg *irq_cfg(unsigned int irq)
 {
        return irq_get_chip_data(irq);
 }
@@ -265,7 +308,7 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
        if (res < 0) {
                if (res != -EEXIST)
                        return NULL;
-               cfg = irq_get_chip_data(at);
+               cfg = irq_cfg(at);
                if (cfg)
                        return cfg;
        }
@@ -425,6 +468,21 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi
        return 0;
 }
 
+static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin)
+{
+       struct irq_pin_list **last, *entry;
+
+       last = &cfg->irq_2_pin;
+       for_each_irq_pin(entry, cfg->irq_2_pin)
+               if (entry->apic == apic && entry->pin == pin) {
+                       *last = entry->next;
+                       kfree(entry);
+                       return;
+               } else {
+                       last = &entry->next;
+               }
+}
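
__remove_pin_from_irq() uses the classic pointer-to-pointer unlink
idiom: "last" tracks the previous node's next field (initially the list
head), so deleting the head needs no special case. A self-contained
sketch of the same idiom outside the kernel:

    #include <stdio.h>
    #include <stdlib.h>

    struct node {
            int apic, pin;
            struct node *next;
    };

    static void remove_pin(struct node **head, int apic, int pin)
    {
            struct node **last = head, *entry;

            for (entry = *head; entry; entry = entry->next) {
                    if (entry->apic == apic && entry->pin == pin) {
                            *last = entry->next; /* unlink, head or not */
                            free(entry);
                            return;
                    }
                    last = &entry->next;
            }
    }

    static struct node *push(struct node *next, int apic, int pin)
    {
            struct node *n = malloc(sizeof(*n));

            n->apic = apic;
            n->pin = pin;
            n->next = next;
            return n;
    }

    int main(void)
    {
            struct node *head = push(push(NULL, 0, 2), 0, 9);

            remove_pin(&head, 0, 9);        /* removes the head itself */
            printf("%d\n", head->pin);      /* prints 2 */
            return 0;
    }
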
+
 static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
 {
        if (__add_pin_to_irq_node(cfg, node, apic, pin))
@@ -627,9 +685,8 @@ static void clear_IO_APIC (void)
 {
        int apic, pin;
 
-       for (apic = 0; apic < nr_ioapics; apic++)
-               for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
-                       clear_IO_APIC_pin(apic, pin);
+       for_each_ioapic_pin(apic, pin)
+               clear_IO_APIC_pin(apic, pin);
 }
 
 #ifdef CONFIG_X86_32
@@ -678,13 +735,13 @@ int save_ioapic_entries(void)
        int apic, pin;
        int err = 0;
 
-       for (apic = 0; apic < nr_ioapics; apic++) {
+       for_each_ioapic(apic) {
                if (!ioapics[apic].saved_registers) {
                        err = -ENOMEM;
                        continue;
                }
 
-               for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+               for_each_pin(apic, pin)
                        ioapics[apic].saved_registers[pin] =
                                ioapic_read_entry(apic, pin);
        }
@@ -699,11 +756,11 @@ void mask_ioapic_entries(void)
 {
        int apic, pin;
 
-       for (apic = 0; apic < nr_ioapics; apic++) {
+       for_each_ioapic(apic) {
                if (!ioapics[apic].saved_registers)
                        continue;
 
-               for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
+               for_each_pin(apic, pin) {
                        struct IO_APIC_route_entry entry;
 
                        entry = ioapics[apic].saved_registers[pin];
@@ -722,11 +779,11 @@ int restore_ioapic_entries(void)
 {
        int apic, pin;
 
-       for (apic = 0; apic < nr_ioapics; apic++) {
+       for_each_ioapic(apic) {
                if (!ioapics[apic].saved_registers)
                        continue;
 
-               for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+               for_each_pin(apic, pin)
                        ioapic_write_entry(apic, pin,
                                           ioapics[apic].saved_registers[pin]);
        }
@@ -785,7 +842,7 @@ static int __init find_isa_irq_apic(int irq, int type)
        if (i < mp_irq_entries) {
                int ioapic_idx;
 
-               for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+               for_each_ioapic(ioapic_idx)
                        if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic)
                                return ioapic_idx;
        }
@@ -799,7 +856,7 @@ static int __init find_isa_irq_apic(int irq, int type)
  */
 static int EISA_ELCR(unsigned int irq)
 {
-       if (irq < legacy_pic->nr_legacy_irqs) {
+       if (irq < nr_legacy_irqs()) {
                unsigned int port = 0x4d0 + (irq >> 3);
                return (inb(port) >> (irq & 7)) & 1;
        }
@@ -939,29 +996,101 @@ static int irq_trigger(int idx)
        return trigger;
 }
 
-static int pin_2_irq(int idx, int apic, int pin)
+static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin)
+{
+       int irq = -1;
+       int ioapic = (int)(long)domain->host_data;
+       int type = ioapics[ioapic].irqdomain_cfg.type;
+
+       switch (type) {
+       case IOAPIC_DOMAIN_LEGACY:
+               /*
+                * Dynamically allocate an IRQ number for non-ISA IRQs
+                * in the first 16 GSIs on some weird platforms.
+                */
+               if (gsi < nr_legacy_irqs())
+                       irq = irq_create_mapping(domain, pin);
+               else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
+                       irq = gsi;
+               break;
+       case IOAPIC_DOMAIN_STRICT:
+               if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
+                       irq = gsi;
+               break;
+       case IOAPIC_DOMAIN_DYNAMIC:
+               irq = irq_create_mapping(domain, pin);
+               break;
+       default:
+               WARN(1, "ioapic: unknown irqdomain type %d\n", type);
+               break;
+       }
+
+       return irq > 0 ? irq : -1;
+}
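
alloc_irq_from_domain() dispatches on the per-IOAPIC domain type:
STRICT domains keep IRQ == GSI, DYNAMIC domains allocate on demand, and
LEGACY domains behave dynamically only inside the 16 ISA GSIs. A toy
model of just that dispatch (deliberately simplified; the real
allocation goes through irq_create_mapping() and
irq_create_strict_mappings()):

    #include <stdio.h>

    enum domain_type { LEGACY, STRICT, DYNAMIC };

    static int next_dyn_irq = 24;   /* assumed dynamic base */
    static const int nr_legacy = 16;

    static int alloc_irq(enum domain_type type, unsigned int gsi)
    {
            switch (type) {
            case LEGACY:    /* dynamic below the ISA range, 1:1 above */
                    return gsi < nr_legacy ? next_dyn_irq++ : (int)gsi;
            case STRICT:    /* IRQ number always equals the GSI */
                    return gsi;
            case DYNAMIC:   /* IRQ number allocated on demand */
                    return next_dyn_irq++;
            }
            return -1;
    }

    int main(void)
    {
            printf("%d\n", alloc_irq(STRICT, 40));  /* 40 */
            printf("%d\n", alloc_irq(LEGACY, 40));  /* 40 */
            printf("%d\n", alloc_irq(LEGACY, 3));   /* 24 */
            printf("%d\n", alloc_irq(DYNAMIC, 40)); /* 25 */
            return 0;
    }
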
+
+static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
+                            unsigned int flags)
 {
        int irq;
-       int bus = mp_irqs[idx].srcbus;
-       struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic);
+       struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+       struct mp_pin_info *info = mp_pin_info(ioapic, pin);
+
+       if (!domain)
+               return -1;
+
+       mutex_lock(&ioapic_mutex);
 
        /*
-        * Debugging check, we are in big trouble if this message pops up!
+        * Don't use irqdomain to manage ISA IRQs because there may be
+        * multiple IOAPIC pins sharing the same ISA IRQ number and
+        * irqdomain only supports 1:1 mapping between IOAPIC pin and
+        * IRQ number. A typical IOAPIC has 24 pins: pins 0-15 are used
+        * for legacy IRQs and pins 16-23 for PCI IRQs (PIRQ A-H).
+        * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are
+        * available, and some BIOSes may use MP Interrupt Source records
+        * to override IRQ numbers for PIRQs instead of reprogramming
+        * the interrupt routing logic. Thus there may be multiple pins
+        * sharing the same legacy IRQ number when ACPI is disabled.
         */
-       if (mp_irqs[idx].dstirq != pin)
-               pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
-
-       if (test_bit(bus, mp_bus_not_pci)) {
+       if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
                irq = mp_irqs[idx].srcbusirq;
+               if (flags & IOAPIC_MAP_ALLOC) {
+                       if (info->count == 0 &&
+                           mp_irqdomain_map(domain, irq, pin) != 0)
+                               irq = -1;
+
+                       /* special handling for timer IRQ0 */
+                       if (irq == 0)
+                               info->count++;
+               }
        } else {
-               u32 gsi = gsi_cfg->gsi_base + pin;
+               irq = irq_find_mapping(domain, pin);
+               if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC))
+                       irq = alloc_irq_from_domain(domain, gsi, pin);
+       }
 
-               if (gsi >= NR_IRQS_LEGACY)
-                       irq = gsi;
-               else
-                       irq = gsi_top + gsi;
+       if (flags & IOAPIC_MAP_ALLOC) {
+               if (irq > 0)
+                       info->count++;
+               else if (info->count == 0)
+                       info->set = 0;
        }
 
+       mutex_unlock(&ioapic_mutex);
+
+       return irq > 0 ? irq : -1;
+}
+
+static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
+{
+       u32 gsi = mp_pin_to_gsi(ioapic, pin);
+
+       /*
+        * Debugging check: we are in big trouble if this message pops up!
+        */
+       if (mp_irqs[idx].dstirq != pin)
+               pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
+
 #ifdef CONFIG_X86_32
        /*
         * PCI IRQ command line redirection. Yes, limits are hardcoded.
@@ -972,16 +1101,58 @@ static int pin_2_irq(int idx, int apic, int pin)
                                apic_printk(APIC_VERBOSE, KERN_DEBUG
                                                "disabling PIRQ%d\n", pin-16);
                        } else {
-                               irq = pirq_entries[pin-16];
+                               int irq = pirq_entries[pin-16];
                                apic_printk(APIC_VERBOSE, KERN_DEBUG
                                                "using PIRQ%d -> IRQ %d\n",
                                                pin-16, irq);
+                               return irq;
                        }
                }
        }
 #endif
 
-       return irq;
+       return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+}
+
+int mp_map_gsi_to_irq(u32 gsi, unsigned int flags)
+{
+       int ioapic, pin, idx;
+
+       ioapic = mp_find_ioapic(gsi);
+       if (ioapic < 0)
+               return -1;
+
+       pin = mp_find_ioapic_pin(ioapic, gsi);
+       idx = find_irq_entry(ioapic, pin, mp_INT);
+       if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
+               return -1;
+
+       return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+}
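
mp_map_gsi_to_irq() first resolves the GSI to its owning IOAPIC and pin
before delegating to mp_map_pin_to_irq(). The resolution is a range
scan over each IOAPIC's GSI window, in the spirit of mp_find_ioapic();
a standalone sketch with an assumed two-IOAPIC routing table:

    #include <stdio.h>

    /* Assumed routing: IOAPIC 0 owns GSI 0-23, IOAPIC 1 owns 24-55. */
    static struct { unsigned int gsi_base, gsi_end; } gsi_cfg[] = {
            { 0, 23 }, { 24, 55 },
    };
    static const int nr_ioapics = 2;

    static int find_ioapic(unsigned int gsi)
    {
            int i;

            for (i = 0; i < nr_ioapics; i++)
                    if (gsi >= gsi_cfg[i].gsi_base &&
                        gsi <= gsi_cfg[i].gsi_end)
                            return i;
            return -1;
    }

    int main(void)
    {
            unsigned int gsi = 40;
            int ioapic = find_ioapic(gsi);

            /* The pin is the offset into the owner's GSI window. */
            printf("gsi %u -> ioapic %d pin %u\n", gsi, ioapic,
                   gsi - gsi_cfg[ioapic].gsi_base);
            return 0;
    }
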
+
+void mp_unmap_irq(int irq)
+{
+       struct irq_data *data = irq_get_irq_data(irq);
+       struct mp_pin_info *info;
+       int ioapic, pin;
+
+       if (!data || !data->domain)
+               return;
+
+       ioapic = (int)(long)data->domain->host_data;
+       pin = (int)data->hwirq;
+       info = mp_pin_info(ioapic, pin);
+
+       mutex_lock(&ioapic_mutex);
+       if (--info->count == 0) {
+               info->set = 0;
+               if (irq < nr_legacy_irqs() &&
+                   ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY)
+                       mp_irqdomain_unmap(data->domain, irq);
+               else
+                       irq_dispose_mapping(irq);
+       }
+       mutex_unlock(&ioapic_mutex);
 }
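
mp_map_pin_to_irq() and mp_unmap_irq() together give each pin mapping a
reference count: the first user programs the pin, later users only bump
info->count, and the mapping is torn down when the last user goes away
(all serialized under ioapic_mutex in the kernel). A sketch of just the
lifecycle, with locking omitted:

    #include <stdio.h>

    /* Toy descriptor mirroring the set/count fields of mp_pin_info. */
    struct pin_info {
            int set;                /* attributes programmed */
            unsigned int count;     /* active users of the mapping */
    };

    static void map_pin(struct pin_info *info)
    {
            if (info->count++ == 0)
                    info->set = 1;  /* first user programs the pin */
    }

    static void unmap_pin(struct pin_info *info)
    {
            if (--info->count == 0)
                    info->set = 0;  /* last user tears it down */
    }

    int main(void)
    {
            struct pin_info pin = { 0, 0 };

            map_pin(&pin);
            map_pin(&pin);          /* shared IRQ: just bumps the count */
            unmap_pin(&pin);
            printf("set=%d count=%u\n", pin.set, pin.count); /* 1, 1 */
            unmap_pin(&pin);
            printf("set=%d count=%u\n", pin.set, pin.count); /* 0, 0 */
            return 0;
    }
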
 
 /*
@@ -991,7 +1162,7 @@ static int pin_2_irq(int idx, int apic, int pin)
 int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
                                struct io_apic_irq_attr *irq_attr)
 {
-       int ioapic_idx, i, best_guess = -1;
+       int irq, i, best_ioapic = -1, best_idx = -1;
 
        apic_printk(APIC_DEBUG,
                    "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
@@ -1001,44 +1172,56 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
                            "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
                return -1;
        }
+
        for (i = 0; i < mp_irq_entries; i++) {
                int lbus = mp_irqs[i].srcbus;
+               int ioapic_idx, found = 0;
 
-               for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+               if (bus != lbus || mp_irqs[i].irqtype != mp_INT ||
+                   slot != ((mp_irqs[i].srcbusirq >> 2) & 0x1f))
+                       continue;
+
+               for_each_ioapic(ioapic_idx)
                        if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic ||
-                           mp_irqs[i].dstapic == MP_APIC_ALL)
+                           mp_irqs[i].dstapic == MP_APIC_ALL) {
+                               found = 1;
                                break;
+                       }
+               if (!found)
+                       continue;
 
-               if (!test_bit(lbus, mp_bus_not_pci) &&
-                   !mp_irqs[i].irqtype &&
-                   (bus == lbus) &&
-                   (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
-                       int irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq);
+               /* Skip ISA IRQs */
+               irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq, 0);
+               if (irq > 0 && !IO_APIC_IRQ(irq))
+                       continue;
 
-                       if (!(ioapic_idx || IO_APIC_IRQ(irq)))
-                               continue;
+               if (pin == (mp_irqs[i].srcbusirq & 3)) {
+                       best_idx = i;
+                       best_ioapic = ioapic_idx;
+                       goto out;
+               }
 
-                       if (pin == (mp_irqs[i].srcbusirq & 3)) {
-                               set_io_apic_irq_attr(irq_attr, ioapic_idx,
-                                                    mp_irqs[i].dstirq,
-                                                    irq_trigger(i),
-                                                    irq_polarity(i));
-                               return irq;
-                       }
-                       /*
-                        * Use the first all-but-pin matching entry as a
-                        * best-guess fuzzy result for broken mptables.
-                        */
-                       if (best_guess < 0) {
-                               set_io_apic_irq_attr(irq_attr, ioapic_idx,
-                                                    mp_irqs[i].dstirq,
-                                                    irq_trigger(i),
-                                                    irq_polarity(i));
-                               best_guess = irq;
-                       }
+               /*
+                * Use the first all-but-pin matching entry as a
+                * best-guess fuzzy result for broken mptables.
+                */
+               if (best_idx < 0) {
+                       best_idx = i;
+                       best_ioapic = ioapic_idx;
                }
        }
-       return best_guess;
+       if (best_idx < 0)
+               return -1;
+
+out:
+       irq = pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq,
+                       IOAPIC_MAP_ALLOC);
+       if (irq > 0)
+               set_io_apic_irq_attr(irq_attr, best_ioapic,
+                                    mp_irqs[best_idx].dstirq,
+                                    irq_trigger(best_idx),
+                                    irq_polarity(best_idx));
+       return irq;
 }
 EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
 
@@ -1198,7 +1381,7 @@ void __setup_vector_irq(int cpu)
        raw_spin_lock(&vector_lock);
        /* Mark the inuse vectors */
        for_each_active_irq(irq) {
-               cfg = irq_get_chip_data(irq);
+               cfg = irq_cfg(irq);
                if (!cfg)
                        continue;
 
@@ -1227,12 +1410,10 @@ static inline int IO_APIC_irq_trigger(int irq)
 {
        int apic, idx, pin;
 
-       for (apic = 0; apic < nr_ioapics; apic++) {
-               for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
-                       idx = find_irq_entry(apic, pin, mp_INT);
-                       if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
-                               return irq_trigger(idx);
-               }
+       for_each_ioapic_pin(apic, pin) {
+               idx = find_irq_entry(apic, pin, mp_INT);
+               if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0)))
+                       return irq_trigger(idx);
        }
        /*
          * nonexistent IRQs are edge default
@@ -1330,95 +1511,29 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
        }
 
        ioapic_register_intr(irq, cfg, attr->trigger);
-       if (irq < legacy_pic->nr_legacy_irqs)
+       if (irq < nr_legacy_irqs())
                legacy_pic->mask(irq);
 
        ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry);
 }
 
-static bool __init io_apic_pin_not_connected(int idx, int ioapic_idx, int pin)
-{
-       if (idx != -1)
-               return false;
-
-       apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
-                   mpc_ioapic_id(ioapic_idx), pin);
-       return true;
-}
-
-static void __init __io_apic_setup_irqs(unsigned int ioapic_idx)
-{
-       int idx, node = cpu_to_node(0);
-       struct io_apic_irq_attr attr;
-       unsigned int pin, irq;
-
-       for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) {
-               idx = find_irq_entry(ioapic_idx, pin, mp_INT);
-               if (io_apic_pin_not_connected(idx, ioapic_idx, pin))
-                       continue;
-
-               irq = pin_2_irq(idx, ioapic_idx, pin);
-
-               if ((ioapic_idx > 0) && (irq > 16))
-                       continue;
-
-               /*
-                * Skip the timer IRQ if there's a quirk handler
-                * installed and if it returns 1:
-                */
-               if (apic->multi_timer_check &&
-                   apic->multi_timer_check(ioapic_idx, irq))
-                       continue;
-
-               set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
-                                    irq_polarity(idx));
-
-               io_apic_setup_irq_pin(irq, node, &attr);
-       }
-}
-
 static void __init setup_IO_APIC_irqs(void)
 {
-       unsigned int ioapic_idx;
+       unsigned int ioapic, pin;
+       int idx;
 
        apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
-       for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
-               __io_apic_setup_irqs(ioapic_idx);
-}
-
-/*
- * for the gsit that is not in first ioapic
- * but could not use acpi_register_gsi()
- * like some special sci in IBM x3330
- */
-void setup_IO_APIC_irq_extra(u32 gsi)
-{
-       int ioapic_idx = 0, pin, idx, irq, node = cpu_to_node(0);
-       struct io_apic_irq_attr attr;
-
-       /*
-        * Convert 'gsi' to 'ioapic.pin'.
-        */
-       ioapic_idx = mp_find_ioapic(gsi);
-       if (ioapic_idx < 0)
-               return;
-
-       pin = mp_find_ioapic_pin(ioapic_idx, gsi);
-       idx = find_irq_entry(ioapic_idx, pin, mp_INT);
-       if (idx == -1)
-               return;
-
-       irq = pin_2_irq(idx, ioapic_idx, pin);
-
-       /* Only handle the non legacy irqs on secondary ioapics */
-       if (ioapic_idx == 0 || irq < NR_IRQS_LEGACY)
-               return;
-
-       set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
-                            irq_polarity(idx));
-
-       io_apic_setup_irq_pin_once(irq, node, &attr);
+       for_each_ioapic_pin(ioapic, pin) {
+               idx = find_irq_entry(ioapic, pin, mp_INT);
+               if (idx < 0)
+                       apic_printk(APIC_VERBOSE,
+                                   KERN_DEBUG " apic %d pin %d not connected\n",
+                                   mpc_ioapic_id(ioapic), pin);
+               else
+                       pin_2_irq(idx, ioapic, pin,
+                                 ioapic ? 0 : IOAPIC_MAP_ALLOC);
+       }
 }
 
 /*
@@ -1586,7 +1701,7 @@ __apicdebuginit(void) print_IO_APICs(void)
        struct irq_chip *chip;
 
        printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
-       for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+       for_each_ioapic(ioapic_idx)
                printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
                       mpc_ioapic_id(ioapic_idx),
                       ioapics[ioapic_idx].nr_registers);
@@ -1597,7 +1712,7 @@ __apicdebuginit(void) print_IO_APICs(void)
         */
        printk(KERN_INFO "testing the IO APIC.......................\n");
 
-       for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+       for_each_ioapic(ioapic_idx)
                print_IO_APIC(ioapic_idx);
 
        printk(KERN_DEBUG "IRQ to pin mappings:\n");
@@ -1608,7 +1723,7 @@ __apicdebuginit(void) print_IO_APICs(void)
                if (chip != &ioapic_chip)
                        continue;
 
-               cfg = irq_get_chip_data(irq);
+               cfg = irq_cfg(irq);
                if (!cfg)
                        continue;
                entry = cfg->irq_2_pin;
@@ -1758,7 +1873,7 @@ __apicdebuginit(void) print_PIC(void)
        unsigned int v;
        unsigned long flags;
 
-       if (!legacy_pic->nr_legacy_irqs)
+       if (!nr_legacy_irqs())
                return;
 
        printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1828,26 +1943,22 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 void __init enable_IO_APIC(void)
 {
        int i8259_apic, i8259_pin;
-       int apic;
+       int apic, pin;
 
-       if (!legacy_pic->nr_legacy_irqs)
+       if (!nr_legacy_irqs())
                return;
 
-       for(apic = 0; apic < nr_ioapics; apic++) {
-               int pin;
+       for_each_ioapic_pin(apic, pin) {
                /* See if any of the pins is in ExtINT mode */
-               for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
-                       struct IO_APIC_route_entry entry;
-                       entry = ioapic_read_entry(apic, pin);
+               struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin);
 
-                       /* If the interrupt line is enabled and in ExtInt mode
-                        * I have found the pin where the i8259 is connected.
-                        */
-                       if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
-                               ioapic_i8259.apic = apic;
-                               ioapic_i8259.pin  = pin;
-                               goto found_i8259;
-                       }
+               /* If the interrupt line is enabled and in ExtInt mode,
+                * we have found the pin where the i8259 is connected.
+                */
+               if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+                       ioapic_i8259.apic = apic;
+                       ioapic_i8259.pin  = pin;
+                       goto found_i8259;
                }
        }
  found_i8259:
@@ -1919,7 +2030,7 @@ void disable_IO_APIC(void)
         */
        clear_IO_APIC();
 
-       if (!legacy_pic->nr_legacy_irqs)
+       if (!nr_legacy_irqs())
                return;
 
        x86_io_apic_ops.disable();
@@ -1950,7 +2061,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
        /*
         * Set the IOAPIC ID to the value stored in the MPC table.
         */
-       for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+       for_each_ioapic(ioapic_idx) {
                /* Read the register 0 value */
                raw_spin_lock_irqsave(&ioapic_lock, flags);
                reg_00.raw = io_apic_read(ioapic_idx, 0);
@@ -2123,7 +2234,7 @@ static unsigned int startup_ioapic_irq(struct irq_data *data)
        unsigned long flags;
 
        raw_spin_lock_irqsave(&ioapic_lock, flags);
-       if (irq < legacy_pic->nr_legacy_irqs) {
+       if (irq < nr_legacy_irqs()) {
                legacy_pic->mask(irq);
                if (legacy_pic->irq_pending(irq))
                        was_pending = 1;
@@ -2225,7 +2336,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
                        apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
                        goto unlock;
                }
-               __this_cpu_write(vector_irq[vector], -1);
+               __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
 unlock:
                raw_spin_unlock(&desc->lock);
        }
@@ -2253,7 +2364,7 @@ static void irq_complete_move(struct irq_cfg *cfg)
 
 void irq_force_complete_move(int irq)
 {
-       struct irq_cfg *cfg = irq_get_chip_data(irq);
+       struct irq_cfg *cfg = irq_cfg(irq);
 
        if (!cfg)
                return;
@@ -2514,26 +2625,15 @@ static inline void init_IO_APIC_traps(void)
        struct irq_cfg *cfg;
        unsigned int irq;
 
-       /*
-        * NOTE! The local APIC isn't very good at handling
-        * multiple interrupts at the same interrupt level.
-        * As the interrupt level is determined by taking the
-        * vector number and shifting that right by 4, we
-        * want to spread these out a bit so that they don't
-        * all fall in the same interrupt level.
-        *
-        * Also, we've got to be careful not to trash gate
-        * 0x80, because int 0x80 is hm, kind of importantish. ;)
-        */
        for_each_active_irq(irq) {
-               cfg = irq_get_chip_data(irq);
+               cfg = irq_cfg(irq);
                if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
                        /*
                         * Hmm.. We don't have an entry for this,
                         * so default to an old-fashioned 8259
                         * interrupt if we can..
                         */
-                       if (irq < legacy_pic->nr_legacy_irqs)
+                       if (irq < nr_legacy_irqs())
                                legacy_pic->make_irq(irq);
                        else
                                /* Strange. Oh, well.. */
@@ -2649,8 +2749,6 @@ static int __init disable_timer_pin_setup(char *arg)
 }
 early_param("disable_timer_pin_1", disable_timer_pin_setup);
 
-int timer_through_8259 __initdata;
-
 /*
  * This code may look a bit paranoid, but it's supposed to cooperate with
  * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
@@ -2661,7 +2759,7 @@ int timer_through_8259 __initdata;
  */
 static inline void __init check_timer(void)
 {
-       struct irq_cfg *cfg = irq_get_chip_data(0);
+       struct irq_cfg *cfg = irq_cfg(0);
        int node = cpu_to_node(0);
        int apic1, pin1, apic2, pin2;
        unsigned long flags;
@@ -2755,7 +2853,6 @@ static inline void __init check_timer(void)
                legacy_pic->unmask(0);
                if (timer_irq_works()) {
                        apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
-                       timer_through_8259 = 1;
                        goto out;
                }
                /*
@@ -2827,15 +2924,54 @@ out:
  */
 #define PIC_IRQS       (1UL << PIC_CASCADE_IR)
 
+static int mp_irqdomain_create(int ioapic)
+{
+       size_t size;
+       int hwirqs = mp_ioapic_pin_count(ioapic);
+       struct ioapic *ip = &ioapics[ioapic];
+       struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
+       struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+
+       size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic);
+       ip->pin_info = kzalloc(size, GFP_KERNEL);
+       if (!ip->pin_info)
+               return -ENOMEM;
+
+       if (cfg->type == IOAPIC_DOMAIN_INVALID)
+               return 0;
+
+       ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops,
+                                             (void *)(long)ioapic);
+       if (!ip->irqdomain) {
+               kfree(ip->pin_info);
+               ip->pin_info = NULL;
+               return -ENOMEM;
+       }
+
+       if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
+           cfg->type == IOAPIC_DOMAIN_STRICT)
+               ioapic_dynirq_base = max(ioapic_dynirq_base,
+                                        gsi_cfg->gsi_end + 1);
+
+       if (gsi_cfg->gsi_base == 0)
+               irq_set_default_host(ip->irqdomain);
+
+       return 0;
+}
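
mp_irqdomain_create() also accumulates ioapic_dynirq_base: LEGACY and
STRICT domains reserve IRQ numbers equal to their GSIs, so dynamically
allocated IRQs must start past the largest reserved window. The max()
accumulation in isolation:

    #include <stdio.h>

    static unsigned int dynirq_base;

    /* IRQ numbers up to gsi_end are reserved for 1:1 GSI mappings. */
    static void reserve_gsi_window(unsigned int gsi_end)
    {
            if (gsi_end + 1 > dynirq_base)
                    dynirq_base = gsi_end + 1;
    }

    int main(void)
    {
            reserve_gsi_window(23);  /* e.g. IOAPIC 0: GSI 0-23  */
            reserve_gsi_window(55);  /* e.g. IOAPIC 1: GSI 24-55 */
            printf("dynamic IRQs start at %u\n", dynirq_base);  /* 56 */
            return 0;
    }
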
+
 void __init setup_IO_APIC(void)
 {
+       int ioapic;
 
        /*
         * calling enable_IO_APIC() is moved to setup_local_APIC for BP
         */
-       io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
+       io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL;
 
        apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
+       for_each_ioapic(ioapic)
+               BUG_ON(mp_irqdomain_create(ioapic));
+
        /*
          * Set up IO-APIC IRQ routing.
          */
@@ -2844,8 +2980,10 @@ void __init setup_IO_APIC(void)
        sync_Arb_IDs();
        setup_IO_APIC_irqs();
        init_IO_APIC_traps();
-       if (legacy_pic->nr_legacy_irqs)
+       if (nr_legacy_irqs())
                check_timer();
+
+       ioapic_initialized = 1;
 }
 
 /*
@@ -2880,7 +3018,7 @@ static void ioapic_resume(void)
 {
        int ioapic_idx;
 
-       for (ioapic_idx = nr_ioapics - 1; ioapic_idx >= 0; ioapic_idx--)
+       for_each_ioapic_reverse(ioapic_idx)
                resume_ioapic_id(ioapic_idx);
 
        restore_ioapic_entries();
@@ -2926,7 +3064,7 @@ int arch_setup_hwirq(unsigned int irq, int node)
 
 void arch_teardown_hwirq(unsigned int irq)
 {
-       struct irq_cfg *cfg = irq_get_chip_data(irq);
+       struct irq_cfg *cfg = irq_cfg(irq);
        unsigned long flags;
 
        free_remapped_irq(irq);
@@ -3053,7 +3191,7 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
        if (!irq_offset)
                write_msi_msg(irq, &msg);
 
-       setup_remapped_irq(irq, irq_get_chip_data(irq), chip);
+       setup_remapped_irq(irq, irq_cfg(irq), chip);
 
        irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
 
@@ -3192,7 +3330,7 @@ int default_setup_hpet_msi(unsigned int irq, unsigned int id)
 
        hpet_msi_write(irq_get_handler_data(irq), &msg);
        irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-       setup_remapped_irq(irq, irq_get_chip_data(irq), chip);
+       setup_remapped_irq(irq, irq_cfg(irq), chip);
 
        irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
        return 0;
@@ -3303,27 +3441,6 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
        return ret;
 }
 
-int io_apic_setup_irq_pin_once(unsigned int irq, int node,
-                              struct io_apic_irq_attr *attr)
-{
-       unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin;
-       int ret;
-       struct IO_APIC_route_entry orig_entry;
-
-       /* Avoid redundant programming */
-       if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) {
-               pr_debug("Pin %d-%d already programmed\n", mpc_ioapic_id(ioapic_idx), pin);
-               orig_entry = ioapic_read_entry(attr->ioapic, pin);
-               if (attr->trigger == orig_entry.trigger && attr->polarity == orig_entry.polarity)
-                       return 0;
-               return -EBUSY;
-       }
-       ret = io_apic_setup_irq_pin(irq, node, attr);
-       if (!ret)
-               set_bit(pin, ioapics[ioapic_idx].pin_programmed);
-       return ret;
-}
-
 static int __init io_apic_get_redir_entries(int ioapic)
 {
        union IO_APIC_reg_01    reg_01;
@@ -3340,20 +3457,13 @@ static int __init io_apic_get_redir_entries(int ioapic)
        return reg_01.bits.entries + 1;
 }
 
-static void __init probe_nr_irqs_gsi(void)
-{
-       int nr;
-
-       nr = gsi_top + NR_IRQS_LEGACY;
-       if (nr > nr_irqs_gsi)
-               nr_irqs_gsi = nr;
-
-       printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
-}
-
 unsigned int arch_dynirq_lower_bound(unsigned int from)
 {
-       return from < nr_irqs_gsi ? nr_irqs_gsi : from;
+       /*
+        * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use
+        * gsi_top if ioapic_dynirq_base hasn't been initialized yet.
+        */
+       return ioapic_initialized ? ioapic_dynirq_base : gsi_top;
 }
 
 int __init arch_probe_nr_irqs(void)
@@ -3363,33 +3473,17 @@ int __init arch_probe_nr_irqs(void)
        if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
                nr_irqs = NR_VECTORS * nr_cpu_ids;
 
-       nr = nr_irqs_gsi + 8 * nr_cpu_ids;
+       nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;
 #if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
        /*
         * for MSI and HT dyn irq
         */
-       nr += nr_irqs_gsi * 16;
+       nr += gsi_top * 16;
 #endif
        if (nr < nr_irqs)
                nr_irqs = nr;
 
-       return NR_IRQS_LEGACY;
-}
-
-int io_apic_set_pci_routing(struct device *dev, int irq,
-                           struct io_apic_irq_attr *irq_attr)
-{
-       int node;
-
-       if (!IO_APIC_IRQ(irq)) {
-               apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
-                           irq_attr->ioapic);
-               return -EINVAL;
-       }
-
-       node = dev ? dev_to_node(dev) : cpu_to_node(0);
-
-       return io_apic_setup_irq_pin_once(irq, node, irq_attr);
+       return 0;
 }
 
 #ifdef CONFIG_X86_32
@@ -3483,9 +3577,8 @@ static u8 __init io_apic_unique_id(u8 id)
        DECLARE_BITMAP(used, 256);
 
        bitmap_zero(used, 256);
-       for (i = 0; i < nr_ioapics; i++) {
+       for_each_ioapic(i)
                __set_bit(mpc_ioapic_id(i), used);
-       }
        if (!test_bit(id, used))
                return id;
        return find_first_zero_bit(used, 256);
@@ -3543,14 +3636,13 @@ void __init setup_ioapic_dest(void)
        if (skip_ioapic_setup == 1)
                return;
 
-       for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
-       for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) {
+       for_each_ioapic_pin(ioapic, pin) {
                irq_entry = find_irq_entry(ioapic, pin, mp_INT);
                if (irq_entry == -1)
                        continue;
-               irq = pin_2_irq(irq_entry, ioapic, pin);
 
-               if ((ioapic > 0) && (irq > 16))
+               irq = pin_2_irq(irq_entry, ioapic, pin, 0);
+               if (irq < 0 || !mp_init_irq_at_boot(ioapic, irq))
                        continue;
 
                idata = irq_get_irq_data(irq);
@@ -3573,29 +3665,33 @@ void __init setup_ioapic_dest(void)
 
 static struct resource *ioapic_resources;
 
-static struct resource * __init ioapic_setup_resources(int nr_ioapics)
+static struct resource * __init ioapic_setup_resources(void)
 {
        unsigned long n;
        struct resource *res;
        char *mem;
-       int i;
+       int i, num = 0;
 
-       if (nr_ioapics <= 0)
+       for_each_ioapic(i)
+               num++;
+       if (num == 0)
                return NULL;
 
        n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
-       n *= nr_ioapics;
+       n *= num;
 
        mem = alloc_bootmem(n);
        res = (void *)mem;
 
-       mem += sizeof(struct resource) * nr_ioapics;
+       mem += sizeof(struct resource) * num;
 
-       for (i = 0; i < nr_ioapics; i++) {
-               res[i].name = mem;
-               res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       num = 0;
+       for_each_ioapic(i) {
+               res[num].name = mem;
+               res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
                snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
                mem += IOAPIC_RESOURCE_NAME_SIZE;
+               num++;
        }
 
        ioapic_resources = res;
@@ -3609,8 +3705,8 @@ void __init native_io_apic_init_mappings(void)
        struct resource *ioapic_res;
        int i;
 
-       ioapic_res = ioapic_setup_resources(nr_ioapics);
-       for (i = 0; i < nr_ioapics; i++) {
+       ioapic_res = ioapic_setup_resources();
+       for_each_ioapic(i) {
                if (smp_found_config) {
                        ioapic_phys = mpc_ioapic_addr(i);
 #ifdef CONFIG_X86_32
@@ -3641,8 +3737,6 @@ fake_ioapic_page:
                ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
                ioapic_res++;
        }
-
-       probe_nr_irqs_gsi();
 }
 
 void __init ioapic_insert_resources(void)
@@ -3657,7 +3751,7 @@ void __init ioapic_insert_resources(void)
                return;
        }
 
-       for (i = 0; i < nr_ioapics; i++) {
+       for_each_ioapic(i) {
                insert_resource(&iomem_resource, r);
                r++;
        }
@@ -3665,16 +3759,15 @@ void __init ioapic_insert_resources(void)
 
 int mp_find_ioapic(u32 gsi)
 {
-       int i = 0;
+       int i;
 
        if (nr_ioapics == 0)
                return -1;
 
        /* Find the IOAPIC that manages this GSI. */
-       for (i = 0; i < nr_ioapics; i++) {
+       for_each_ioapic(i) {
                struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
-               if ((gsi >= gsi_cfg->gsi_base)
-                   && (gsi <= gsi_cfg->gsi_end))
+               if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end)
                        return i;
        }
 
@@ -3686,7 +3779,7 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
 {
        struct mp_ioapic_gsi *gsi_cfg;
 
-       if (WARN_ON(ioapic == -1))
+       if (WARN_ON(ioapic < 0))
                return -1;
 
        gsi_cfg = mp_ioapic_gsi_routing(ioapic);
@@ -3729,7 +3822,8 @@ static __init int bad_ioapic_register(int idx)
        return 0;
 }
 
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base,
+                              struct ioapic_domain_cfg *cfg)
 {
        int idx = 0;
        int entries;
@@ -3743,6 +3837,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
        ioapics[idx].mp_config.type = MP_IOAPIC;
        ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
        ioapics[idx].mp_config.apicaddr = address;
+       ioapics[idx].irqdomain = NULL;
+       ioapics[idx].irqdomain_cfg = *cfg;
 
        set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
 
@@ -3779,6 +3875,77 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
        nr_ioapics++;
 }
 
+int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
+                    irq_hw_number_t hwirq)
+{
+       int ioapic = (int)(long)domain->host_data;
+       struct mp_pin_info *info = mp_pin_info(ioapic, hwirq);
+       struct io_apic_irq_attr attr;
+
+       /* Get default attribute if not set by caller yet */
+       if (!info->set) {
+               u32 gsi = mp_pin_to_gsi(ioapic, hwirq);
+
+               if (acpi_get_override_irq(gsi, &info->trigger,
+                                         &info->polarity) < 0) {
+                       /*
+                        * PCI interrupts are always active low (polarity
+                        * one) and level triggered.
+                        */
+                       info->trigger = 1;
+                       info->polarity = 1;
+               }
+               info->node = NUMA_NO_NODE;
+               info->set = 1;
+       }
+       set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger,
+                            info->polarity);
+
+       return io_apic_setup_irq_pin(virq, info->node, &attr);
+}
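
The default-attribute fallback in mp_irqdomain_map() works as follows:
if nobody called mp_set_gsi_attr() for the pin and ACPI has no override
for the GSI, the code assumes PCI behaviour, i.e. level triggered and
active low. A standalone sketch (the acpi_override() stub below is a
placeholder for the real ACPI lookup):

    #include <stdio.h>

    struct pin_attr {
            int set, trigger, polarity;
    };

    /* Placeholder: a real lookup would consult ACPI overrides. */
    static int acpi_override(unsigned int gsi, int *trig, int *pol)
    {
            (void)gsi; (void)trig; (void)pol;
            return -1;      /* pretend no override exists */
    }

    static void resolve_attr(struct pin_attr *a, unsigned int gsi)
    {
            if (a->set)
                    return; /* caller already fixed the attributes */
            if (acpi_override(gsi, &a->trigger, &a->polarity) < 0) {
                    a->trigger = 1;  /* level triggered */
                    a->polarity = 1; /* active low */
            }
            a->set = 1;
    }

    int main(void)
    {
            struct pin_attr a = { 0, 0, 0 };

            resolve_attr(&a, 40);
            printf("trigger=%d polarity=%d\n", a.trigger, a.polarity);
            return 0;
    }
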
+
+void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq)
+{
+       struct irq_data *data = irq_get_irq_data(virq);
+       struct irq_cfg *cfg = irq_cfg(virq);
+       int ioapic = (int)(long)domain->host_data;
+       int pin = (int)data->hwirq;
+
+       ioapic_mask_entry(ioapic, pin);
+       __remove_pin_from_irq(cfg, ioapic, pin);
+       WARN_ON(cfg->irq_2_pin != NULL);
+       arch_teardown_hwirq(virq);
+}
+
+int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node)
+{
+       int ret = 0;
+       int ioapic, pin;
+       struct mp_pin_info *info;
+
+       ioapic = mp_find_ioapic(gsi);
+       if (ioapic < 0)
+               return -ENODEV;
+
+       pin = mp_find_ioapic_pin(ioapic, gsi);
+       info = mp_pin_info(ioapic, pin);
+       trigger = trigger ? 1 : 0;
+       polarity = polarity ? 1 : 0;
+
+       mutex_lock(&ioapic_mutex);
+       if (!info->set) {
+               info->trigger = trigger;
+               info->polarity = polarity;
+               info->node = node;
+               info->set = 1;
+       } else if (info->trigger != trigger || info->polarity != polarity) {
+               ret = -EBUSY;
+       }
+       mutex_unlock(&ioapic_mutex);
+
+       return ret;
+}
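
mp_set_gsi_attr() is first-setter-wins: the initial caller records the
trigger/polarity for a GSI, and later callers succeed only if they ask
for the same configuration, otherwise they get -EBUSY. The conflict
check in isolation (mutex omitted):

    #include <stdio.h>

    #define EBUSY 16

    struct gsi_attr {
            int set, trigger, polarity;
    };

    static int set_gsi_attr(struct gsi_attr *a, int trigger, int polarity)
    {
            trigger = !!trigger;    /* normalize to 0/1, as the kernel does */
            polarity = !!polarity;
            if (!a->set) {
                    a->trigger = trigger;
                    a->polarity = polarity;
                    a->set = 1;
                    return 0;
            }
            return (a->trigger == trigger && a->polarity == polarity) ?
                    0 : -EBUSY;
    }

    int main(void)
    {
            struct gsi_attr a = { 0, 0, 0 };

            printf("%d\n", set_gsi_attr(&a, 1, 1)); /* 0: first setter */
            printf("%d\n", set_gsi_attr(&a, 1, 1)); /* 0: agrees       */
            printf("%d\n", set_gsi_attr(&a, 0, 1)); /* -16: conflicts  */
            return 0;
    }
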
+
 /* Enable IOAPIC early just for system timer */
 void __init pre_init_apic_IRQ0(void)
 {
index cceb352c968c62b13404d3ce78478f64e7a5d3c5..bda488680dbc1a42cdfe4fb175aa6dae73cb612d 100644 (file)
@@ -88,21 +88,16 @@ static struct apic apic_default = {
        .disable_esr                    = 0,
        .dest_logical                   = APIC_DEST_LOGICAL,
        .check_apicid_used              = default_check_apicid_used,
-       .check_apicid_present           = default_check_apicid_present,
 
        .vector_allocation_domain       = flat_vector_allocation_domain,
        .init_apic_ldr                  = default_init_apic_ldr,
 
        .ioapic_phys_id_map             = default_ioapic_phys_id_map,
        .setup_apic_routing             = setup_apic_flat_routing,
-       .multi_timer_check              = NULL,
        .cpu_present_to_apicid          = default_cpu_present_to_apicid,
        .apicid_to_cpu_present          = physid_set_mask_of_physid,
-       .setup_portio_remap             = NULL,
        .check_phys_apicid_present      = default_check_phys_apicid_present,
-       .enable_apic_mode               = NULL,
        .phys_pkg_id                    = default_phys_pkg_id,
-       .mps_oem_check                  = NULL,
 
        .get_apic_id                    = default_get_apic_id,
        .set_apic_id                    = NULL,
@@ -116,11 +111,7 @@ static struct apic apic_default = {
        .send_IPI_all                   = default_send_IPI_all,
        .send_IPI_self                  = default_send_IPI_self,
 
-       .trampoline_phys_low            = DEFAULT_TRAMPOLINE_PHYS_LOW,
-       .trampoline_phys_high           = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
        .wait_for_init_deassert         = true,
-       .smp_callin_clear_local_apic    = NULL,
        .inquire_remote_apic            = default_inquire_remote_apic,
 
        .read                           = native_apic_mem_read,
@@ -214,29 +205,7 @@ void __init generic_apic_probe(void)
        printk(KERN_INFO "Using APIC driver %s\n", apic->name);
 }
 
-/* These functions can switch the APIC even after the initial ->probe() */
-
-int __init
-generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
-{
-       struct apic **drv;
-
-       for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
-               if (!((*drv)->mps_oem_check))
-                       continue;
-               if (!(*drv)->mps_oem_check(mpc, oem, productid))
-                       continue;
-
-               if (!cmdline_apic) {
-                       apic = *drv;
-                       printk(KERN_INFO "Switched to APIC driver `%s'.\n",
-                              apic->name);
-               }
-               return 1;
-       }
-       return 0;
-}
-
+/* This function can switch the APIC even after the initial ->probe() */
 int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
        struct apic **drv;
index e66766bf164191de15d7ea9bf7b62c6506b2d6f2..6ce600f9bc789e2ccf0fe53bd6cb61091ebd0340 100644 (file)
@@ -249,21 +249,16 @@ static struct apic apic_x2apic_cluster = {
        .disable_esr                    = 0,
        .dest_logical                   = APIC_DEST_LOGICAL,
        .check_apicid_used              = NULL,
-       .check_apicid_present           = NULL,
 
        .vector_allocation_domain       = cluster_vector_allocation_domain,
        .init_apic_ldr                  = init_x2apic_ldr,
 
        .ioapic_phys_id_map             = NULL,
        .setup_apic_routing             = NULL,
-       .multi_timer_check              = NULL,
        .cpu_present_to_apicid          = default_cpu_present_to_apicid,
        .apicid_to_cpu_present          = NULL,
-       .setup_portio_remap             = NULL,
        .check_phys_apicid_present      = default_check_phys_apicid_present,
-       .enable_apic_mode               = NULL,
        .phys_pkg_id                    = x2apic_phys_pkg_id,
-       .mps_oem_check                  = NULL,
 
        .get_apic_id                    = x2apic_get_apic_id,
        .set_apic_id                    = x2apic_set_apic_id,
@@ -277,10 +272,7 @@ static struct apic apic_x2apic_cluster = {
        .send_IPI_all                   = x2apic_send_IPI_all,
        .send_IPI_self                  = x2apic_send_IPI_self,
 
-       .trampoline_phys_low            = DEFAULT_TRAMPOLINE_PHYS_LOW,
-       .trampoline_phys_high           = DEFAULT_TRAMPOLINE_PHYS_HIGH,
        .wait_for_init_deassert         = false,
-       .smp_callin_clear_local_apic    = NULL,
        .inquire_remote_apic            = NULL,
 
        .read                           = native_apic_msr_read,
index 6d600ebf6c127f94c4bb194a9e8bb25f0c607ffe..6fae733e9194893dc761ccd06839df81232c7427 100644 (file)
@@ -103,21 +103,16 @@ static struct apic apic_x2apic_phys = {
        .disable_esr                    = 0,
        .dest_logical                   = 0,
        .check_apicid_used              = NULL,
-       .check_apicid_present           = NULL,
 
        .vector_allocation_domain       = default_vector_allocation_domain,
        .init_apic_ldr                  = init_x2apic_ldr,
 
        .ioapic_phys_id_map             = NULL,
        .setup_apic_routing             = NULL,
-       .multi_timer_check              = NULL,
        .cpu_present_to_apicid          = default_cpu_present_to_apicid,
        .apicid_to_cpu_present          = NULL,
-       .setup_portio_remap             = NULL,
        .check_phys_apicid_present      = default_check_phys_apicid_present,
-       .enable_apic_mode               = NULL,
        .phys_pkg_id                    = x2apic_phys_pkg_id,
-       .mps_oem_check                  = NULL,
 
        .get_apic_id                    = x2apic_get_apic_id,
        .set_apic_id                    = x2apic_set_apic_id,
@@ -131,10 +126,7 @@ static struct apic apic_x2apic_phys = {
        .send_IPI_all                   = x2apic_send_IPI_all,
        .send_IPI_self                  = x2apic_send_IPI_self,
 
-       .trampoline_phys_low            = DEFAULT_TRAMPOLINE_PHYS_LOW,
-       .trampoline_phys_high           = DEFAULT_TRAMPOLINE_PHYS_HIGH,
        .wait_for_init_deassert         = false,
-       .smp_callin_clear_local_apic    = NULL,
        .inquire_remote_apic            = NULL,
 
        .read                           = native_apic_msr_read,
index 293b41df54ef6628880e280e0917948cd5cee629..004f017aa7b9e633923aa6d861bb964f7abbe658 100644 (file)
@@ -365,21 +365,16 @@ static struct apic __refdata apic_x2apic_uv_x = {
        .disable_esr                    = 0,
        .dest_logical                   = APIC_DEST_LOGICAL,
        .check_apicid_used              = NULL,
-       .check_apicid_present           = NULL,
 
        .vector_allocation_domain       = default_vector_allocation_domain,
        .init_apic_ldr                  = uv_init_apic_ldr,
 
        .ioapic_phys_id_map             = NULL,
        .setup_apic_routing             = NULL,
-       .multi_timer_check              = NULL,
        .cpu_present_to_apicid          = default_cpu_present_to_apicid,
        .apicid_to_cpu_present          = NULL,
-       .setup_portio_remap             = NULL,
        .check_phys_apicid_present      = default_check_phys_apicid_present,
-       .enable_apic_mode               = NULL,
        .phys_pkg_id                    = uv_phys_pkg_id,
-       .mps_oem_check                  = NULL,
 
        .get_apic_id                    = x2apic_get_apic_id,
        .set_apic_id                    = set_apic_id,
@@ -394,10 +389,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
        .send_IPI_self                  = uv_send_IPI_self,
 
        .wakeup_secondary_cpu           = uv_wakeup_secondary,
-       .trampoline_phys_low            = DEFAULT_TRAMPOLINE_PHYS_LOW,
-       .trampoline_phys_high           = DEFAULT_TRAMPOLINE_PHYS_HIGH,
        .wait_for_init_deassert         = false,
-       .smp_callin_clear_local_apic    = NULL,
        .inquire_remote_apic            = NULL,
 
        .read                           = native_apic_msr_read,
index 333fd5209336d3df51e8ed676e4aa4b937a48b6b..e4ab2b42bd6f469528c73cac4ac0fb186b5bdbd3 100644 (file)
@@ -148,6 +148,7 @@ static int __init x86_xsave_setup(char *s)
 {
        setup_clear_cpu_cap(X86_FEATURE_XSAVE);
        setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+       setup_clear_cpu_cap(X86_FEATURE_XSAVES);
        setup_clear_cpu_cap(X86_FEATURE_AVX);
        setup_clear_cpu_cap(X86_FEATURE_AVX2);
        return 1;
@@ -161,6 +162,13 @@ static int __init x86_xsaveopt_setup(char *s)
 }
 __setup("noxsaveopt", x86_xsaveopt_setup);
 
+static int __init x86_xsaves_setup(char *s)
+{
+       setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+       return 1;
+}
+__setup("noxsaves", x86_xsaves_setup);
+
 #ifdef CONFIG_X86_32
 static int cachesize_override = -1;
 static int disable_x86_serial_nr = 1;
index 7db54b5d5f868996cba08c4a6a3689368a983063..3d3503351242b7c6d8dcc0d32350e36e63da1939 100644 (file)
@@ -21,6 +21,7 @@
 #include <asm/apic.h>
 #include <asm/pci_x86.h>
 #include <asm/setup.h>
+#include <asm/i8259.h>
 
 __initdata u64 initial_dtb;
 char __initdata cmd_line[COMMAND_LINE_SIZE];
@@ -165,82 +166,6 @@ static void __init dtb_lapic_setup(void)
 #ifdef CONFIG_X86_IO_APIC
 static unsigned int ioapic_id;
 
-static void __init dtb_add_ioapic(struct device_node *dn)
-{
-       struct resource r;
-       int ret;
-
-       ret = of_address_to_resource(dn, 0, &r);
-       if (ret) {
-               printk(KERN_ERR "Can't obtain address from node %s.\n",
-                               dn->full_name);
-               return;
-       }
-       mp_register_ioapic(++ioapic_id, r.start, gsi_top);
-}
-
-static void __init dtb_ioapic_setup(void)
-{
-       struct device_node *dn;
-
-       for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
-               dtb_add_ioapic(dn);
-
-       if (nr_ioapics) {
-               of_ioapic = 1;
-               return;
-       }
-       printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
-}
-#else
-static void __init dtb_ioapic_setup(void) {}
-#endif
-
-static void __init dtb_apic_setup(void)
-{
-       dtb_lapic_setup();
-       dtb_ioapic_setup();
-}
-
-#ifdef CONFIG_OF_FLATTREE
-static void __init x86_flattree_get_config(void)
-{
-       u32 size, map_len;
-       void *dt;
-
-       if (!initial_dtb)
-               return;
-
-       map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
-
-       initial_boot_params = dt = early_memremap(initial_dtb, map_len);
-       size = of_get_flat_dt_size();
-       if (map_len < size) {
-               early_iounmap(dt, map_len);
-               initial_boot_params = dt = early_memremap(initial_dtb, size);
-               map_len = size;
-       }
-
-       unflatten_and_copy_device_tree();
-       early_iounmap(dt, map_len);
-}
-#else
-static inline void x86_flattree_get_config(void) { }
-#endif
-
-void __init x86_dtb_init(void)
-{
-       x86_flattree_get_config();
-
-       if (!of_have_populated_dt())
-               return;
-
-       dtb_setup_hpet();
-       dtb_apic_setup();
-}
-
-#ifdef CONFIG_X86_IO_APIC
-
 struct of_ioapic_type {
        u32 out_type;
        u32 trigger;
@@ -276,10 +201,8 @@ static int ioapic_xlate(struct irq_domain *domain,
                        const u32 *intspec, u32 intsize,
                        irq_hw_number_t *out_hwirq, u32 *out_type)
 {
-       struct io_apic_irq_attr attr;
        struct of_ioapic_type *it;
-       u32 line, idx;
-       int rc;
+       u32 line, idx, gsi;
 
        if (WARN_ON(intsize < 2))
                return -EINVAL;
@@ -291,13 +214,10 @@ static int ioapic_xlate(struct irq_domain *domain,
 
        it = &of_ioapic_type[intspec[1]];
 
-       idx = (u32) domain->host_data;
-       set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
-
-       rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line),
-                                       cpu_to_node(0), &attr);
-       if (rc)
-               return rc;
+       idx = (u32)(long)domain->host_data;
+       gsi = mp_pin_to_gsi(idx, line);
+       if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0)))
+               return -EBUSY;
 
        *out_hwirq = line;
        *out_type = it->out_type;
@@ -305,81 +225,86 @@ static int ioapic_xlate(struct irq_domain *domain,
 }
 
 const struct irq_domain_ops ioapic_irq_domain_ops = {
+       .map = mp_irqdomain_map,
+       .unmap = mp_irqdomain_unmap,
        .xlate = ioapic_xlate,
 };
 
-static void dt_add_ioapic_domain(unsigned int ioapic_num,
-               struct device_node *np)
+static void __init dtb_add_ioapic(struct device_node *dn)
 {
-       struct irq_domain *id;
-       struct mp_ioapic_gsi *gsi_cfg;
+       struct resource r;
        int ret;
-       int num;
-
-       gsi_cfg = mp_ioapic_gsi_routing(ioapic_num);
-       num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
-
-       id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops,
-                       (void *)ioapic_num);
-       BUG_ON(!id);
-       if (gsi_cfg->gsi_base == 0) {
-               /*
-                * The first NR_IRQS_LEGACY irq descs are allocated in
-                * early_irq_init() and need just a mapping. The
-                * remaining irqs need both. All of them are preallocated
-                * and assigned so we can keep the 1:1 mapping which the ioapic
-                * is having.
-                */
-               irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY);
-
-               if (num > NR_IRQS_LEGACY) {
-                       ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY,
-                                       NR_IRQS_LEGACY, num - NR_IRQS_LEGACY);
-                       if (ret)
-                               pr_err("Error creating mapping for the "
-                                               "remaining IRQs: %d\n", ret);
-               }
-               irq_set_default_host(id);
-       } else {
-               ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num);
-               if (ret)
-                       pr_err("Error creating IRQ mapping: %d\n", ret);
+       struct ioapic_domain_cfg cfg = {
+               .type = IOAPIC_DOMAIN_DYNAMIC,
+               .ops = &ioapic_irq_domain_ops,
+               .dev = dn,
+       };
+
+       ret = of_address_to_resource(dn, 0, &r);
+       if (ret) {
+               printk(KERN_ERR "Can't obtain address from node %s.\n",
+                               dn->full_name);
+               return;
        }
+       mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg);
 }
 
-static void __init ioapic_add_ofnode(struct device_node *np)
+static void __init dtb_ioapic_setup(void)
 {
-       struct resource r;
-       int i, ret;
+       struct device_node *dn;
 
-       ret = of_address_to_resource(np, 0, &r);
-       if (ret) {
-               printk(KERN_ERR "Failed to obtain address for %s\n",
-                               np->full_name);
+       for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
+               dtb_add_ioapic(dn);
+
+       if (nr_ioapics) {
+               of_ioapic = 1;
                return;
        }
+       printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
+}
+#else
+static void __init dtb_ioapic_setup(void) {}
+#endif
 
-       for (i = 0; i < nr_ioapics; i++) {
-               if (r.start == mpc_ioapic_addr(i)) {
-                       dt_add_ioapic_domain(i, np);
-                       return;
-               }
-       }
-       printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name);
+static void __init dtb_apic_setup(void)
+{
+       dtb_lapic_setup();
+       dtb_ioapic_setup();
 }
 
-void __init x86_add_irq_domains(void)
+#ifdef CONFIG_OF_FLATTREE
+static void __init x86_flattree_get_config(void)
 {
-       struct device_node *dp;
+       u32 size, map_len;
+       void *dt;
 
-       if (!of_have_populated_dt())
+       if (!initial_dtb)
                return;
 
-       for_each_node_with_property(dp, "interrupt-controller") {
-               if (of_device_is_compatible(dp, "intel,ce4100-ioapic"))
-                       ioapic_add_ofnode(dp);
+       map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
+
+       initial_boot_params = dt = early_memremap(initial_dtb, map_len);
+       size = of_get_flat_dt_size();
+       if (map_len < size) {
+               early_iounmap(dt, map_len);
+               initial_boot_params = dt = early_memremap(initial_dtb, size);
+               map_len = size;
        }
+
+       unflatten_and_copy_device_tree();
+       early_iounmap(dt, map_len);
 }
 #else
-void __init x86_add_irq_domains(void) { }
+static inline void x86_flattree_get_config(void) { }
 #endif
+
+void __init x86_dtb_init(void)
+{
+       x86_flattree_get_config();
+
+       if (!of_have_populated_dt())
+               return;
+
+       dtb_setup_hpet();
+       dtb_apic_setup();
+}
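
The relocated x86_flattree_get_config() above maps the device-tree blob in two passes: a small window first (enough to cover the FDT header, which holds the blob's total size), then a full-size remap only if the blob extends past the window. The max() with 128 guarantees the header is mapped even when the blob starts near the end of a page. A standalone sketch of just that length computation (the address is illustrative):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	/* Hypothetical physical address of the FDT, 0xf80 into a page. */
	uint64_t initial_dtb = 0x10000f80ULL;
	/* Bytes left between the blob start and the end of its page. */
	uint64_t map_len = PAGE_SIZE - (initial_dtb & ~PAGE_MASK);

	if (map_len < 128)	/* never map less than the header needs */
		map_len = 128;
	printf("first-pass map_len = %llu bytes\n",
	       (unsigned long long)map_len);
	return 0;
}
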
index d5dd808144190ffd1d443229b4dbbd56740fface..a9a4229f6161b25b6e8a5752be7943f478525d21 100644 (file)
@@ -375,7 +375,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
        /*
         * These bits must be zero.
         */
-       xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+       memset(xsave_hdr->reserved, 0, 48);
 
        return ret;
 }
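
The 48 in the memset above is the size of the xsave header's reserved area: the header is 64 bytes, and the two leading u64 bitmap words (xstate_bv and the new xcomp_bv) take 16 of them. A sketch that avoids the magic number, assuming the field is declared as a fixed-size array (u64 reserved[6]):

/* Clear the whole reserved area without hard-coding its size. */
memset(xsave_hdr->reserved, 0, sizeof(xsave_hdr->reserved));
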
index 7f50156542fbde0fed0eeeca9bbf0d18566a54c9..1e6cff5814fa62ea0aefdc9d87fc1dec532d16e6 100644 (file)
@@ -78,7 +78,7 @@ void __init init_ISA_irqs(void)
 #endif
        legacy_pic->init(0);
 
-       for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
+       for (i = 0; i < nr_legacy_irqs(); i++)
                irq_set_chip_and_handler_name(i, chip, handle_level_irq, name);
 }
 
@@ -86,12 +86,6 @@ void __init init_IRQ(void)
 {
        int i;
 
-       /*
-        * We probably need a better place for this, but it works for
-        * now ...
-        */
-       x86_add_irq_domains();
-
        /*
         * On CPU 0, assign IRQ0_VECTOR..IRQ15_VECTOR to IRQs 0..15.
         * If these IRQs are handled by legacy interrupt controllers like the PIC,
@@ -100,7 +94,7 @@ void __init init_IRQ(void)
         * then this vector space can be freed and re-used dynamically as the
         * IRQs migrate, etc.
         */
-       for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
+       for (i = 0; i < nr_legacy_irqs(); i++)
                per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
 
        x86_init.irqs.intr_init();
@@ -121,7 +115,7 @@ void setup_vector_irq(int cpu)
         * legacy PIC, for the new cpu that is coming online, setup the static
         * legacy vector to irq mapping:
         */
-       for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++)
+       for (irq = 0; irq < nr_legacy_irqs(); irq++)
                per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
 #endif
 
index d2b56489d70fb12781e6787c98b1ea13f7d08705..2d2a237f2c73698a4dd2819800edd6179239904b 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/smp.h>
 #include <linux/pci.h>
+#include <linux/irqdomain.h>
 
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
@@ -67,7 +68,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
                boot_cpu_physical_apicid = m->apicid;
        }
 
-       printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu);
+       pr_info("Processor #%d%s\n", m->apicid, bootup_cpu);
        generic_processor_info(apicid, m->apicver);
 }
 
@@ -87,9 +88,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
 
 #if MAX_MP_BUSSES < 256
        if (m->busid >= MAX_MP_BUSSES) {
-               printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
-                      " is too large, max. supported is %d\n",
-                      m->busid, str, MAX_MP_BUSSES - 1);
+               pr_warn("MP table busid value (%d) for bustype %s is too large, max. supported is %d\n",
+                       m->busid, str, MAX_MP_BUSSES - 1);
                return;
        }
 #endif
@@ -110,19 +110,29 @@ static void __init MP_bus_info(struct mpc_bus *m)
                mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
 #endif
        } else
-               printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
+               pr_warn("Unknown bustype %s - ignoring\n", str);
 }
 
+static struct irq_domain_ops mp_ioapic_irqdomain_ops = {
+       .map = mp_irqdomain_map,
+       .unmap = mp_irqdomain_unmap,
+};
+
 static void __init MP_ioapic_info(struct mpc_ioapic *m)
 {
+       struct ioapic_domain_cfg cfg = {
+               .type = IOAPIC_DOMAIN_LEGACY,
+               .ops = &mp_ioapic_irqdomain_ops,
+       };
+
        if (m->flags & MPC_APIC_USABLE)
-               mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
+               mp_register_ioapic(m->apicid, m->apicaddr, gsi_top, &cfg);
 }
 
 static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
 {
-       apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
-               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+       apic_printk(APIC_VERBOSE,
+               "Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n",
                mp_irq->irqtype, mp_irq->irqflag & 3,
                (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
                mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
@@ -135,8 +145,8 @@ static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
 
 static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
 {
-       apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
-               " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+       apic_printk(APIC_VERBOSE,
+               "Lint: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC LINT %02x\n",
                m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid,
                m->srcbusirq, m->destapic, m->destapiclint);
 }
@@ -148,34 +158,33 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
 {
 
        if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) {
-               printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
+               pr_err("MPTABLE: bad signature [%c%c%c%c]!\n",
                       mpc->signature[0], mpc->signature[1],
                       mpc->signature[2], mpc->signature[3]);
                return 0;
        }
        if (mpf_checksum((unsigned char *)mpc, mpc->length)) {
-               printk(KERN_ERR "MPTABLE: checksum error!\n");
+               pr_err("MPTABLE: checksum error!\n");
                return 0;
        }
        if (mpc->spec != 0x01 && mpc->spec != 0x04) {
-               printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
-                      mpc->spec);
+               pr_err("MPTABLE: bad table version (%d)!!\n", mpc->spec);
                return 0;
        }
        if (!mpc->lapic) {
-               printk(KERN_ERR "MPTABLE: null local APIC address!\n");
+               pr_err("MPTABLE: null local APIC address!\n");
                return 0;
        }
        memcpy(oem, mpc->oem, 8);
        oem[8] = 0;
-       printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
+       pr_info("MPTABLE: OEM ID: %s\n", oem);
 
        memcpy(str, mpc->productid, 12);
        str[12] = 0;
 
-       printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
+       pr_info("MPTABLE: Product ID: %s\n", str);
 
-       printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic);
+       pr_info("MPTABLE: APIC at: 0x%X\n", mpc->lapic);
 
        return 1;
 }
@@ -188,8 +197,8 @@ static void skip_entry(unsigned char **ptr, int *count, int size)
 
 static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
 {
-       printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
-               "type %x\n", *mpt);
+       pr_err("Your mptable is wrong, contact your HW vendor!\n");
+       pr_cont("type %x\n", *mpt);
        print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
                        1, mpc, mpc->length, 1);
 }
@@ -207,9 +216,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
        if (!smp_check_mpc(mpc, oem, str))
                return 0;
 
-#ifdef CONFIG_X86_32
-       generic_mps_oem_check(mpc, oem, str);
-#endif
        /* Initialize the lapic mapping */
        if (!acpi_lapic)
                register_lapic_address(mpc->lapic);
@@ -259,7 +265,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
        }
 
        if (!num_processors)
-               printk(KERN_ERR "MPTABLE: no processors registered!\n");
+               pr_err("MPTABLE: no processors registered!\n");
        return num_processors;
 }
 
@@ -295,16 +301,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
         *  If it does, we assume it's valid.
         */
        if (mpc_default_type == 5) {
-               printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
-                      "falling back to ELCR\n");
+               pr_info("ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
 
                if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
                    ELCR_trigger(13))
-                       printk(KERN_ERR "ELCR contains invalid data... "
-                              "not using ELCR\n");
+                       pr_err("ELCR contains invalid data... not using ELCR\n");
                else {
-                       printk(KERN_INFO
-                              "Using ELCR to identify PCI interrupts\n");
+                       pr_info("Using ELCR to identify PCI interrupts\n");
                        ELCR_fallback = 1;
                }
        }
@@ -353,7 +356,7 @@ static void __init construct_ioapic_table(int mpc_default_type)
        bus.busid = 0;
        switch (mpc_default_type) {
        default:
-               printk(KERN_ERR "???\nUnknown standard configuration %d\n",
+               pr_err("???\nUnknown standard configuration %d\n",
                       mpc_default_type);
                /* fall through */
        case 1:
@@ -462,8 +465,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
 #ifdef CONFIG_X86_LOCAL_APIC
                smp_found_config = 0;
 #endif
-               printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
-                       "... disabling SMP support. (tell your hw vendor)\n");
+               pr_err("BIOS bug, MP table errors detected!...\n");
+               pr_cont("... disabling SMP support. (tell your hw vendor)\n");
                early_iounmap(mpc, size);
                return -1;
        }
@@ -481,8 +484,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
        if (!mp_irq_entries) {
                struct mpc_bus bus;
 
-               printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
-                      "using default mptable. (tell your hw vendor)\n");
+               pr_err("BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
 
                bus.type = MP_BUS;
                bus.busid = 0;
@@ -516,14 +518,14 @@ void __init default_get_smp_config(unsigned int early)
        if (acpi_lapic && acpi_ioapic)
                return;
 
-       printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
-              mpf->specification);
+       pr_info("Intel MultiProcessor Specification v1.%d\n",
+               mpf->specification);
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
        if (mpf->feature2 & (1 << 7)) {
-               printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
+               pr_info("    IMCR and PIC compatibility mode.\n");
                pic_mode = 1;
        } else {
-               printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
+               pr_info("    Virtual Wire compatibility mode.\n");
                pic_mode = 0;
        }
 #endif
@@ -539,8 +541,7 @@ void __init default_get_smp_config(unsigned int early)
                        return;
                }
 
-               printk(KERN_INFO "Default MP configuration #%d\n",
-                      mpf->feature1);
+               pr_info("Default MP configuration #%d\n", mpf->feature1);
                construct_default_ISA_mptable(mpf->feature1);
 
        } else if (mpf->physptr) {
@@ -550,7 +551,7 @@ void __init default_get_smp_config(unsigned int early)
                BUG();
 
        if (!early)
-               printk(KERN_INFO "Processors: %d\n", num_processors);
+               pr_info("Processors: %d\n", num_processors);
        /*
         * Only use the first configuration found.
         */
@@ -583,10 +584,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
 #endif
                        mpf_found = mpf;
 
-                       printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
-                              (unsigned long long) virt_to_phys(mpf),
-                              (unsigned long long) virt_to_phys(mpf) +
-                              sizeof(*mpf) - 1, mpf);
+                       pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
+                               (unsigned long long) virt_to_phys(mpf),
+                               (unsigned long long) virt_to_phys(mpf) +
+                               sizeof(*mpf) - 1, mpf);
 
                        mem = virt_to_phys(mpf);
                        memblock_reserve(mem, sizeof(*mpf));
@@ -735,7 +736,7 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
        int nr_m_spare = 0;
        unsigned char *mpt = ((unsigned char *)mpc) + count;
 
-       printk(KERN_INFO "mpc_length %x\n", mpc->length);
+       pr_info("mpc_length %x\n", mpc->length);
        while (count < mpc->length) {
                switch (*mpt) {
                case MP_PROCESSOR:
@@ -862,13 +863,13 @@ static int __init update_mp_table(void)
        if (!smp_check_mpc(mpc, oem, str))
                return 0;
 
-       printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf));
-       printk(KERN_INFO "physptr: %x\n", mpf->physptr);
+       pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf));
+       pr_info("physptr: %x\n", mpf->physptr);
 
        if (mpc_new_phys && mpc->length > mpc_new_length) {
                mpc_new_phys = 0;
-               printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
-                        mpc_new_length);
+               pr_info("mpc_new_length is %ld, please use alloc_mptable=8k\n",
+                       mpc_new_length);
        }
 
        if (!mpc_new_phys) {
@@ -879,10 +880,10 @@ static int __init update_mp_table(void)
                mpc->checksum = 0xff;
                new = mpf_checksum((unsigned char *)mpc, mpc->length);
                if (old == new) {
-                       printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
+                       pr_info("mpc is readonly, please try alloc_mptable instead\n");
                        return 0;
                }
-               printk(KERN_INFO "use in-position replacing\n");
+               pr_info("use in-position replacing\n");
        } else {
                mpf->physptr = mpc_new_phys;
                mpc_new = phys_to_virt(mpc_new_phys);
@@ -892,7 +893,7 @@ static int __init update_mp_table(void)
                if (mpc_new_phys - mpf->physptr) {
                        struct mpf_intel *mpf_new;
                        /* steal 16 bytes from [0, 1k) */
-                       printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
+                       pr_info("mpf new: %x\n", 0x400 - 16);
                        mpf_new = phys_to_virt(0x400 - 16);
                        memcpy(mpf_new, mpf, 16);
                        mpf = mpf_new;
@@ -900,7 +901,7 @@ static int __init update_mp_table(void)
                }
                mpf->checksum = 0;
                mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
-               printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
+               pr_info("physptr new: %x\n", mpf->physptr);
        }
 
        /*
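
With this change the MP-table path registers its IO-APICs the same way the DT and SFI paths elsewhere in this series do: fill an ioapic_domain_cfg with a domain type and irq_domain_ops backed by mp_irqdomain_map/unmap, then hand it to mp_register_ioapic(). A condensed sketch of the shared shape (register_one_ioapic is an illustrative wrapper, not a kernel function):

static struct irq_domain_ops my_ioapic_irqdomain_ops = {
	.map	= mp_irqdomain_map,	/* program the RTE when an irq maps */
	.unmap	= mp_irqdomain_unmap,	/* tear it down on unmap */
};

static void __init register_one_ioapic(int id, u32 addr)
{
	struct ioapic_domain_cfg cfg = {
		/* LEGACY keeps the 1:1 GSI->IRQ layout for ISA irqs;
		 * the DT path uses DYNAMIC and the SFI path STRICT. */
		.type	= IOAPIC_DOMAIN_LEGACY,
		.ops	= &my_ioapic_irqdomain_ops,
	};

	mp_register_ioapic(id, addr, gsi_top, &cfg);
}
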
index 4505e2a950d81f479663df20ce787dbad5f293b0..f804dc935d2adffb37f587d34de9c4cd10ce70a3 100644 (file)
@@ -93,6 +93,7 @@ void arch_task_cache_init(void)
                kmem_cache_create("task_xstate", xstate_size,
                                  __alignof__(union thread_xstate),
                                  SLAB_PANIC | SLAB_NOTRACK, NULL);
+       setup_xstate_comp();
 }
 
 /*
index 5492798930ef4224675bae7704f6da8e6f94731e..2d872e08fab95f10e8b97c23b54ba1e886680369 100644 (file)
@@ -168,10 +168,6 @@ static void smp_callin(void)
         * CPU, first the APIC. (this is probably redundant on most
         * boards)
         */
-
-       pr_debug("CALLIN, before setup_local_APIC()\n");
-       if (apic->smp_callin_clear_local_apic)
-               apic->smp_callin_clear_local_apic();
        setup_local_APIC();
        end_local_APIC_setup();
 
@@ -1143,10 +1139,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
                enable_IO_APIC();
 
        bsp_end_local_APIC_setup();
-
-       if (apic->setup_portio_remap)
-               apic->setup_portio_remap();
-
        smpboot_setup_io_apic();
        /*
         * Set up local APIC timer on boot CPU.
index b99b9ad8540c525f79468d6fce0682681478c008..ee22c1d93ae5c4c5ffe065981d591507a0511a07 100644 (file)
@@ -152,7 +152,7 @@ static void __init detect_vsmp_box(void)
                is_vsmp = 1;
 }
 
-int is_vsmp_box(void)
+static int is_vsmp_box(void)
 {
        if (is_vsmp != -1)
                return is_vsmp;
@@ -166,7 +166,7 @@ int is_vsmp_box(void)
 static void __init detect_vsmp_box(void)
 {
 }
-int is_vsmp_box(void)
+static int is_vsmp_box(void)
 {
        return 0;
 }
index a4b451c6addfb7085a22c70b408614a58bf0ad7b..940b142cc11f8390ebdc15d1121cd67cc24c94aa 100644 (file)
@@ -8,6 +8,7 @@
 
 #include <linux/bootmem.h>
 #include <linux/compat.h>
+#include <linux/cpu.h>
 #include <asm/i387.h>
 #include <asm/fpu-internal.h>
 #include <asm/sigframe.h>
@@ -24,7 +25,9 @@ u64 pcntxt_mask;
 struct xsave_struct *init_xstate_buf;
 
 static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
-static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
+static unsigned int *xstate_offsets, *xstate_sizes;
+static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8];
+static unsigned int xstate_features;
 
 /*
  * If a processor implementation discerns that a processor state component is
@@ -283,7 +286,7 @@ sanitize_restored_xstate(struct task_struct *tsk,
 
        if (use_xsave()) {
                /* These bits must be zero. */
-               xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+               memset(xsave_hdr->reserved, 0, 48);
 
                /*
                 * Init the state that is not present in the memory
@@ -478,6 +481,52 @@ static void __init setup_xstate_features(void)
        } while (1);
 }
 
+/*
+ * This function sets up the offsets and sizes of all extended states in
+ * the xsave area. It supports both the standard format and the compacted
+ * format of the xsave area.
+ *
+ * Input: void
+ * Output: void
+ */
+void setup_xstate_comp(void)
+{
+       unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8];
+       int i;
+
+       /*
+        * The FP xstates and SSE xstates are legacy states. They are always
+        * in the fixed offsets in the xsave area in either compacted form
+        * or standard form.
+        */
+       xstate_comp_offsets[0] = 0;
+       xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space);
+
+       if (!cpu_has_xsaves) {
+               for (i = 2; i < xstate_features; i++) {
+                       if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
+                               xstate_comp_offsets[i] = xstate_offsets[i];
+                               xstate_comp_sizes[i] = xstate_sizes[i];
+                       }
+               }
+               return;
+       }
+
+       xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+
+       for (i = 2; i < xstate_features; i++) {
+               if (test_bit(i, (unsigned long *)&pcntxt_mask))
+                       xstate_comp_sizes[i] = xstate_sizes[i];
+               else
+                       xstate_comp_sizes[i] = 0;
+
+               if (i > 2)
+                       xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
+                                       + xstate_comp_sizes[i-1];
+
+       }
+}
+
 /*
  * setup the xstate image representing the init state
  */
@@ -496,15 +545,21 @@ static void __init setup_init_fpu_buf(void)
 
        setup_xstate_features();
 
+       if (cpu_has_xsaves) {
+               init_xstate_buf->xsave_hdr.xcomp_bv =
+                                               (u64)1 << 63 | pcntxt_mask;
+               init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask;
+       }
+
        /*
         * Init all the features state with header_bv being 0x0
         */
-       xrstor_state(init_xstate_buf, -1);
+       xrstor_state_booting(init_xstate_buf, -1);
        /*
         * Dump the init state again. This is to identify the init state
         * of any feature which is not represented by all zero's.
         */
-       xsave_state(init_xstate_buf, -1);
+       xsave_state_booting(init_xstate_buf, -1);
 }
 
 static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
@@ -520,6 +575,30 @@ static int __init eager_fpu_setup(char *s)
 }
 __setup("eagerfpu=", eager_fpu_setup);
 
+
+/*
+ * Calculate total size of enabled xstates in XCR0/pcntxt_mask.
+ */
+static void __init init_xstate_size(void)
+{
+       unsigned int eax, ebx, ecx, edx;
+       int i;
+
+       if (!cpu_has_xsaves) {
+               cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+               xstate_size = ebx;
+               return;
+       }
+
+       xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+       for (i = 2; i < 64; i++) {
+               if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
+                       cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+                       xstate_size += eax;
+               }
+       }
+}
+
 /*
  * Enable and initialize the xsave feature.
  */
@@ -551,8 +630,7 @@ static void __init xstate_enable_boot_cpu(void)
        /*
         * Recompute the context size for enabled features
         */
-       cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
-       xstate_size = ebx;
+       init_xstate_size();
 
        update_regset_xstate_info(xstate_size, pcntxt_mask);
        prepare_fx_sw_frame();
@@ -572,8 +650,9 @@ static void __init xstate_enable_boot_cpu(void)
                }
        }
 
-       pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
-               pcntxt_mask, xstate_size);
+       pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n",
+               pcntxt_mask, xstate_size,
+               cpu_has_xsaves ? "compacted form" : "standard form");
 }
 
 /*
@@ -635,3 +714,26 @@ void eager_fpu_init(void)
        else
                fxrstor_checking(&init_xstate_buf->i387);
 }
+
+/*
+ * Given the xsave area and a state inside, this function returns the
+ * address of the state.
+ *
+ * This is the API that is called to get the address of an xstate in
+ * either the standard or the compacted format of the xsave area.
+ *
+ * Inputs:
+ *     xsave: base address of the xsave area;
+ *     xstate: the state, as defined in xsave.h (e.g. XSTATE_FP,
+ *     XSTATE_SSE, etc.)
+ * Output:
+ *     address of the state in the xsave area.
+ */
+void *get_xsave_addr(struct xsave_struct *xsave, int xstate)
+{
+       int feature = fls64(xstate) - 1;
+       if (!test_bit(feature, (unsigned long *)&pcntxt_mask))
+               return NULL;
+
+       return (void *)xsave + xstate_comp_offsets[feature];
+}
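
To make the compacted-format arithmetic in setup_xstate_comp() concrete: in the standard format every state sits at the fixed offset CPUID reports, leaving holes for disabled states; in the compacted format state 2 starts right after the 512-byte legacy FXSAVE region plus the 64-byte header, and each later enabled state packs immediately after the previous one (disabled states contribute zero). A standalone sketch with made-up component sizes (real sizes come from CPUID leaf 0xD):

#include <stdio.h>

#define FXSAVE_SIZE	512
#define XSAVE_HDR_SIZE	64

int main(void)
{
	/* Illustrative sizes; state 3 is disabled, so its size is 0. */
	unsigned int size[8]   = { 160, 256, 256, 0, 64, 64, 512, 1024 };
	unsigned int offset[8] = { 0 };
	int i;

	offset[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE;	/* = 576 */
	for (i = 3; i < 8; i++)
		offset[i] = offset[i - 1] + size[i - 1];

	for (i = 2; i < 8; i++)
		printf("state %d: offset %4u size %4u\n",
		       i, offset[i], size[i]);
	return 0;
}
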
index 5075371ab59395b37d9819c4af90ba5a0d4724e5..cfd1b132b8e3ed4023ceecf12dbb68fc35bb68a3 100644 (file)
@@ -448,7 +448,7 @@ static void probe_pci_root_info(struct pci_root_info *info,
                return;
 
        size = sizeof(*info->res) * info->res_num;
-       info->res = kzalloc(size, GFP_KERNEL);
+       info->res = kzalloc_node(size, GFP_KERNEL, info->sd.node);
        if (!info->res) {
                info->res_num = 0;
                return;
@@ -456,7 +456,7 @@ static void probe_pci_root_info(struct pci_root_info *info,
 
        size = sizeof(*info->res_offset) * info->res_num;
        info->res_num = 0;
-       info->res_offset = kzalloc(size, GFP_KERNEL);
+       info->res_offset = kzalloc_node(size, GFP_KERNEL, info->sd.node);
        if (!info->res_offset) {
                kfree(info->res);
                info->res = NULL;
@@ -499,7 +499,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
        if (node != NUMA_NO_NODE && !node_online(node))
                node = NUMA_NO_NODE;
 
-       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       info = kzalloc_node(sizeof(*info), GFP_KERNEL, node);
        if (!info) {
                printk(KERN_WARNING "pci_bus %04x:%02x: "
                       "ignored (out of memory)\n", domain, busnum);
index 84b9d672843db2e0da88d653300991947483628f..3865116c51fbf583a923d4131882c653b5fc3c24 100644 (file)
@@ -208,27 +208,31 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
 
 static int intel_mid_pci_irq_enable(struct pci_dev *dev)
 {
-       u8 pin;
-       struct io_apic_irq_attr irq_attr;
+       int polarity;
 
-       pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+       if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
+               polarity = 0; /* active high */
+       else
+               polarity = 1; /* active low */
 
        /*
         * MRST only has an IOAPIC; the PCI IRQ lines are 1:1 mapped to
         * IOAPIC RTE entries, so we just enable the RTE for the device.
         */
-       irq_attr.ioapic = mp_find_ioapic(dev->irq);
-       irq_attr.ioapic_pin = dev->irq;
-       irq_attr.trigger = 1; /* level */
-       if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
-               irq_attr.polarity = 0; /* active high */
-       else
-               irq_attr.polarity = 1; /* active low */
-       io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr);
+       if (mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev)))
+               return -EBUSY;
+       if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0)
+               return -EBUSY;
 
        return 0;
 }
 
+static void intel_mid_pci_irq_disable(struct pci_dev *dev)
+{
+       if (!dev->dev.power.is_prepared && dev->irq > 0)
+               mp_unmap_irq(dev->irq);
+}
+
 struct pci_ops intel_mid_pci_ops = {
        .read = pci_read,
        .write = pci_write,
@@ -245,6 +249,7 @@ int __init intel_mid_pci_init(void)
        pr_info("Intel MID platform detected, using MID PCI ops\n");
        pci_mmcfg_late_init();
        pcibios_enable_irq = intel_mid_pci_irq_enable;
+       pcibios_disable_irq = intel_mid_pci_irq_disable;
        pci_root_ops = intel_mid_pci_ops;
        pci_soc_mode = 1;
        /* Continue with standard init */
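
The rework above splits interrupt bring-up into two generic steps and adds the matching teardown: mp_set_gsi_attr() records the pin's trigger and polarity, mp_map_gsi_to_irq(..., IOAPIC_MAP_ALLOC) materializes an IRQ for the GSI, and mp_unmap_irq() releases it on disable. A condensed sketch of the pairing, using the helpers exactly as the hunks do:

/* Enable: describe the pin, then allocate a mapping for the GSI. */
if (mp_set_gsi_attr(gsi, 1 /* level */, polarity, dev_to_node(&dev->dev)))
	return -EBUSY;		/* pin already configured differently */
if (mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC) < 0)
	return -EBUSY;

/* Disable: drop the mapping so the pin can be reprogrammed later
 * (skipped while the device is suspending, as in the hunk above). */
mp_unmap_irq(dev->irq);
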
index 84112f55dd7a95d83232d9416b7e9196a55dee30..bc1a2c341891034d04c86b834c5d132e7f954e33 100644 (file)
@@ -26,6 +26,7 @@ static int acer_tm360_irqrouting;
 static struct irq_routing_table *pirq_table;
 
 static int pirq_enable_irq(struct pci_dev *dev);
+static void pirq_disable_irq(struct pci_dev *dev);
 
 /*
  * Never use: 0, 1, 2 (timer, keyboard, and cascade)
@@ -53,7 +54,7 @@ struct irq_router_handler {
 };
 
 int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq;
-void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
+void (*pcibios_disable_irq)(struct pci_dev *dev) = pirq_disable_irq;
 
 /*
  *  Check passed address for the PCI IRQ Routing Table signature
@@ -1186,7 +1187,7 @@ void pcibios_penalize_isa_irq(int irq, int active)
 
 static int pirq_enable_irq(struct pci_dev *dev)
 {
-       u8 pin;
+       u8 pin = 0;
 
        pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
        if (pin && !pcibios_lookup_irq(dev, 1)) {
@@ -1227,8 +1228,6 @@ static int pirq_enable_irq(struct pci_dev *dev)
                        }
                        dev = temp_dev;
                        if (irq >= 0) {
-                               io_apic_set_pci_routing(&dev->dev, irq,
-                                                        &irq_attr);
                                dev->irq = irq;
                                dev_info(&dev->dev, "PCI->APIC IRQ transform: "
                                         "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
@@ -1254,3 +1253,12 @@ static int pirq_enable_irq(struct pci_dev *dev)
        }
        return 0;
 }
+
+static void pirq_disable_irq(struct pci_dev *dev)
+{
+       if (io_apic_assign_pci_irqs && !dev->dev.power.is_prepared &&
+           dev->irq) {
+               mp_unmap_irq(dev->irq);
+               dev->irq = 0;
+       }
+}
index 905956f16465ccdbd8377082d348de36e43a4b05..093f5f4272d340be4307fd93f3d6e2b7d34515f0 100644 (file)
@@ -23,6 +23,7 @@
 #include <xen/features.h>
 #include <xen/events.h>
 #include <asm/xen/pci.h>
+#include <asm/i8259.h>
 
 static int xen_pcifront_enable_irq(struct pci_dev *dev)
 {
@@ -40,7 +41,7 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev)
        /* In PV DomU the Xen PCI backend puts the PIRQ in the interrupt line.*/
        pirq = gsi;
 
-       if (gsi < NR_IRQS_LEGACY)
+       if (gsi < nr_legacy_irqs())
                share = 0;
 
        rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
@@ -511,7 +512,7 @@ int __init pci_xen_initial_domain(void)
        xen_setup_acpi_sci();
        __acpi_register_gsi = acpi_register_gsi_xen;
        /* Pre-allocate legacy irqs */
-       for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
+       for (irq = 0; irq < nr_legacy_irqs(); irq++) {
                int trigger, polarity;
 
                if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
@@ -522,7 +523,7 @@ int __init pci_xen_initial_domain(void)
                        true /* Map GSI to PIRQ */);
        }
        if (0 == nr_ioapics) {
-               for (irq = 0; irq < NR_IRQS_LEGACY; irq++)
+               for (irq = 0; irq < nr_legacy_irqs(); irq++)
                        xen_bind_pirq_gsi_to_irq(irq, irq, 0, "xt-pic");
        }
        return 0;
index 8244f5ec2f4c7520284ff2388c328a17c10e3440..701fd5843c879fb4529b3f009d0580d55a15e4a9 100644 (file)
@@ -135,14 +135,10 @@ static void __init sdv_arch_setup(void)
        sdv_serial_fixup();
 }
 
-#ifdef CONFIG_X86_IO_APIC
 static void sdv_pci_init(void)
 {
        x86_of_pci_init();
-       /* We can't set this earlier, because we need to calibrate the timer */
-       legacy_pic = &null_legacy_pic;
 }
-#endif
 
 /*
  * CE4100 specific x86_init function overrides and early setup
@@ -155,7 +151,9 @@ void __init x86_ce4100_early_setup(void)
        x86_init.resources.probe_roms = x86_init_noop;
        x86_init.mpparse.get_smp_config = x86_init_uint_noop;
        x86_init.mpparse.find_smp_config = x86_init_noop;
+       x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck;
        x86_init.pci.init = ce4100_pci_init;
+       x86_init.pci.init_irq = sdv_pci_init;
 
        /*
         * By default, the reboot method is ACPI which is supported by the
@@ -166,10 +164,5 @@ void __init x86_ce4100_early_setup(void)
         */
        reboot_type = BOOT_KBD;
 
-#ifdef CONFIG_X86_IO_APIC
-       x86_init.pci.init_irq = sdv_pci_init;
-       x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck;
-#endif
-
        pm_power_off = ce4100_power_off;
 }
index 973cf3bfa9fdebb18e4b8710e1bc082695a9f3df..0b283d4d0ad770d945f407c4921d09ecacb34db4 100644 (file)
@@ -26,28 +26,18 @@ static struct platform_device wdt_dev = {
 
 static int tangier_probe(struct platform_device *pdev)
 {
-       int ioapic;
-       int irq;
+       int gsi;
        struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data;
-       struct io_apic_irq_attr irq_attr = { 0 };
 
        if (!pdata)
                return -EINVAL;
 
-       irq = pdata->irq;
-       ioapic = mp_find_ioapic(irq);
-       if (ioapic >= 0) {
-               int ret;
-               irq_attr.ioapic = ioapic;
-               irq_attr.ioapic_pin = irq;
-               irq_attr.trigger = 1;
-               /* irq_attr.polarity = 0; -> Active high */
-               ret = io_apic_set_pci_routing(NULL, irq, &irq_attr);
-               if (ret)
-                       return ret;
-       } else {
+       /* The IOAPIC builds an identity mapping between GSI and IRQ on MID */
+       gsi = pdata->irq;
+       if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) ||
+           mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC) <= 0) {
                dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n",
-                        irq);
+                        gsi);
                return -EINVAL;
        }
 
index 994c40bd7cb7c8cc54dc4f9fec2be2ea7f74a4af..3c53a90fdb18b8d446bcb6ef3cc711ab543974d8 100644 (file)
@@ -432,9 +432,8 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
        struct sfi_table_simple *sb;
        struct sfi_device_table_entry *pentry;
        struct devs_id *dev = NULL;
-       int num, i;
-       int ioapic;
-       struct io_apic_irq_attr irq_attr;
+       int num, i, ret;
+       int polarity;
 
        sb = (struct sfi_table_simple *)table;
        num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
@@ -448,35 +447,30 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
                         * devices, but they have separate RTE entries in the
                         * IOAPIC, so we have to enable them one by one here.
                         */
-                       ioapic = mp_find_ioapic(irq);
-                       if (ioapic >= 0) {
-                               irq_attr.ioapic = ioapic;
-                               irq_attr.ioapic_pin = irq;
-                               irq_attr.trigger = 1;
-                               if (intel_mid_identify_cpu() ==
-                                               INTEL_MID_CPU_CHIP_TANGIER) {
-                                       if (!strncmp(pentry->name,
-                                                       "r69001-ts-i2c", 13))
-                                               /* active low */
-                                               irq_attr.polarity = 1;
-                                       else if (!strncmp(pentry->name,
-                                                       "synaptics_3202", 14))
-                                               /* active low */
-                                               irq_attr.polarity = 1;
-                                       else if (irq == 41)
-                                               /* fast_int_1 */
-                                               irq_attr.polarity = 1;
-                                       else
-                                               /* active high */
-                                               irq_attr.polarity = 0;
-                               } else {
-                                       /* PNW and CLV go with active low */
-                                       irq_attr.polarity = 1;
-                               }
-                               io_apic_set_pci_routing(NULL, irq, &irq_attr);
+                       if (intel_mid_identify_cpu() ==
+                                       INTEL_MID_CPU_CHIP_TANGIER) {
+                               if (!strncmp(pentry->name, "r69001-ts-i2c", 13))
+                                       /* active low */
+                                       polarity = 1;
+                               else if (!strncmp(pentry->name,
+                                               "synaptics_3202", 14))
+                                       /* active low */
+                                       polarity = 1;
+                               else if (irq == 41)
+                                       /* fast_int_1 */
+                                       polarity = 1;
+                               else
+                                       /* active high */
+                                       polarity = 0;
+                       } else {
+                               /* PNW and CLV go with active low */
+                               polarity = 1;
                        }
-               } else {
-                       irq = 0; /* No irq */
+
+                       ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE);
+                       if (ret == 0)
+                               ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC);
+                       WARN_ON(ret < 0);
                }
 
                dev = get_device_id(pentry->type, pentry->name);
index bcd1a703e3e60f0dcc9ebad4bcb1d8cf27d3fd5f..2a8a74f3bd76c1338e2378c26bc231cc5adf9bea 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/init.h>
 #include <linux/sfi.h>
 #include <linux/io.h>
+#include <linux/irqdomain.h>
 
 #include <asm/io_apic.h>
 #include <asm/mpspec.h>
@@ -70,19 +71,26 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
 #endif /* CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_IO_APIC
+static struct irq_domain_ops sfi_ioapic_irqdomain_ops = {
+       .map = mp_irqdomain_map,
+};
 
 static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 {
        struct sfi_table_simple *sb;
        struct sfi_apic_table_entry *pentry;
        int i, num;
+       struct ioapic_domain_cfg cfg = {
+               .type = IOAPIC_DOMAIN_STRICT,
+               .ops = &sfi_ioapic_irqdomain_ops,
+       };
 
        sb = (struct sfi_table_simple *)table;
        num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
        pentry = (struct sfi_apic_table_entry *)sb->pentry;
 
        for (i = 0; i < num; i++) {
-               mp_register_ioapic(i, pentry->phys_addr, gsi_top);
+               mp_register_ioapic(i, pentry->phys_addr, gsi_top, &cfg);
                pentry++;
        }
 
index c0413046483ae9862c1d487348b3a701a2045e3a..1580e7a5a4cf7600d0f424ddd5a66d76efef6b2f 100644 (file)
@@ -118,6 +118,7 @@ static int __init xlated_setup_gnttab_pages(void)
 {
        struct page **pages;
        xen_pfn_t *pfns;
+       void *vaddr;
        int rc;
        unsigned int i;
        unsigned long nr_grant_frames = gnttab_max_grant_frames();
@@ -143,21 +144,20 @@ static int __init xlated_setup_gnttab_pages(void)
        for (i = 0; i < nr_grant_frames; i++)
                pfns[i] = page_to_pfn(pages[i]);
 
-       rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames,
-                                   &xen_auto_xlat_grant_frames.vaddr);
-
-       if (rc) {
+       vaddr = vmap(pages, nr_grant_frames, 0, PAGE_KERNEL);
+       if (!vaddr) {
-               pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__,
-                       nr_grant_frames, rc);
+               pr_warn("%s: couldn't map %ld pfns\n", __func__,
+                       nr_grant_frames);
                free_xenballooned_pages(nr_grant_frames, pages);
                kfree(pages);
                kfree(pfns);
-               return rc;
+               return -ENOMEM;
        }
        kfree(pages);
 
        xen_auto_xlat_grant_frames.pfn = pfns;
        xen_auto_xlat_grant_frames.count = nr_grant_frames;
+       xen_auto_xlat_grant_frames.vaddr = vaddr;
 
        return 0;
 }
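
The change above drops the arch-specific mapping helper in favor of a plain vmap() of the ballooned pages, and only publishes pfns/count/vaddr once the mapping exists. The underlying idiom, sketched (this boot-time path never unmaps, but vunmap() is the matching teardown):

/* Map an array of struct page pointers into one contiguous
 * kernel-virtual range. */
void *vaddr = vmap(pages, nr_pages, 0, PAGE_KERNEL);
if (!vaddr)
	return -ENOMEM;

/* ... access nr_pages * PAGE_SIZE bytes starting at vaddr ... */

vunmap(vaddr);			/* teardown when the mapping is done */
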
index 7b78f88c1707b994b34f20408bc1b6046447eb22..5718b0b58b60f663a845207af02d930962e96b8f 100644 (file)
@@ -444,7 +444,7 @@ void xen_setup_timer(int cpu)
 
        irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
                                      IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
-                                     IRQF_FORCE_RESUME,
+                                     IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
                                      name, NULL);
        (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
 
index 9e241063a616f2c4ad023a08c8b6b06d676adb56..bc423f7b02da856a6987089b6750322c9b7cbfe4 100644 (file)
@@ -70,8 +70,10 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
                                          bs->bvec_integrity_pool);
                if (!bip->bip_vec)
                        goto err;
+               bip->bip_max_vcnt = bvec_nr_vecs(idx);
        } else {
                bip->bip_vec = bip->bip_inline_vecs;
+               bip->bip_max_vcnt = inline_vecs;
        }
 
        bip->bip_slab = idx;
@@ -114,14 +116,6 @@ void bio_integrity_free(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_integrity_free);
 
-static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip)
-{
-       if (bip->bip_slab == BIO_POOL_NONE)
-               return BIP_INLINE_VECS;
-
-       return bvec_nr_vecs(bip->bip_slab);
-}
-
 /**
  * bio_integrity_add_page - Attach integrity metadata
  * @bio:       bio to update
@@ -137,7 +131,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
        struct bio_integrity_payload *bip = bio->bi_integrity;
        struct bio_vec *iv;
 
-       if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
+       if (bip->bip_vcnt >= bip->bip_max_vcnt) {
                printk(KERN_ERR "%s: bip_vec full\n", __func__);
                return 0;
        }
index 0ec61c9e536c2032778db165c7b31d98bdc5e735..3e6331d25d90c6aa507acd43bb90f2001c9e6641 100644 (file)
@@ -112,7 +112,8 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
        bslab = &bio_slabs[entry];
 
        snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
-       slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
+       slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN,
+                                SLAB_HWCACHE_ALIGN, NULL);
        if (!slab)
                goto out_unlock;
 
index 6f8dba161bfe1fbc50ee9d1091bd1ff6513e0aad..c359d72e9d76f24a44b7c4b6b8c36c6677436191 100644 (file)
@@ -438,14 +438,17 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all)
  */
 void blk_queue_bypass_start(struct request_queue *q)
 {
-       bool drain;
-
        spin_lock_irq(q->queue_lock);
-       drain = !q->bypass_depth++;
+       q->bypass_depth++;
        queue_flag_set(QUEUE_FLAG_BYPASS, q);
        spin_unlock_irq(q->queue_lock);
 
-       if (drain) {
+       /*
+        * Queues start drained.  Skip actual draining until init is
+        * complete.  This avoids lengthy delays during queue init, which
+        * can happen many times during boot.
+        */
+       if (blk_queue_init_done(q)) {
                spin_lock_irq(q->queue_lock);
                __blk_drain_queue(q, false);
                spin_unlock_irq(q->queue_lock);
@@ -511,7 +514,7 @@ void blk_cleanup_queue(struct request_queue *q)
         * prevent that q->request_fn() gets invoked after draining finished.
         */
        if (q->mq_ops) {
-               blk_mq_drain_queue(q);
+               blk_mq_freeze_queue(q);
                spin_lock_irq(lock);
        } else {
                spin_lock_irq(lock);
index ad69ef657e850cc79c6379667c0c501f9400a551..5189cb1e478a6b283609006364c01f9f8e31d082 100644 (file)
@@ -78,68 +78,47 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 
 static int blk_mq_queue_enter(struct request_queue *q)
 {
-       int ret;
-
-       __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
-       smp_wmb();
-
-       /* we have problems freezing the queue if it's initializing */
-       if (!blk_queue_dying(q) &&
-           (!blk_queue_bypass(q) || !blk_queue_init_done(q)))
-               return 0;
-
-       __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
+       while (true) {
+               int ret;
 
-       spin_lock_irq(q->queue_lock);
-       ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
-               !blk_queue_bypass(q) || blk_queue_dying(q),
-               *q->queue_lock);
-       /* inc usage with lock hold to avoid freeze_queue runs here */
-       if (!ret && !blk_queue_dying(q))
-               __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
-       else if (blk_queue_dying(q))
-               ret = -ENODEV;
-       spin_unlock_irq(q->queue_lock);
+               if (percpu_ref_tryget_live(&q->mq_usage_counter))
+                       return 0;
 
-       return ret;
+               ret = wait_event_interruptible(q->mq_freeze_wq,
+                               !q->mq_freeze_depth || blk_queue_dying(q));
+               if (blk_queue_dying(q))
+                       return -ENODEV;
+               if (ret)
+                       return ret;
+       }
 }
 
 static void blk_mq_queue_exit(struct request_queue *q)
 {
-       __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
+       percpu_ref_put(&q->mq_usage_counter);
 }
 
-void blk_mq_drain_queue(struct request_queue *q)
+static void blk_mq_usage_counter_release(struct percpu_ref *ref)
 {
-       while (true) {
-               s64 count;
-
-               spin_lock_irq(q->queue_lock);
-               count = percpu_counter_sum(&q->mq_usage_counter);
-               spin_unlock_irq(q->queue_lock);
+       struct request_queue *q =
+               container_of(ref, struct request_queue, mq_usage_counter);
 
-               if (count == 0)
-                       break;
-               blk_mq_start_hw_queues(q);
-               msleep(10);
-       }
+       wake_up_all(&q->mq_freeze_wq);
 }
 
 /*
  * Guarantee no request is in use, so we can change any data structure of
  * the queue afterward.
  */
-static void blk_mq_freeze_queue(struct request_queue *q)
+void blk_mq_freeze_queue(struct request_queue *q)
 {
-       bool drain;
-
        spin_lock_irq(q->queue_lock);
-       drain = !q->bypass_depth++;
-       queue_flag_set(QUEUE_FLAG_BYPASS, q);
+       q->mq_freeze_depth++;
        spin_unlock_irq(q->queue_lock);
 
-       if (drain)
-               blk_mq_drain_queue(q);
+       percpu_ref_kill(&q->mq_usage_counter);
+       blk_mq_run_queues(q, false);
+       wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
 }
 
 static void blk_mq_unfreeze_queue(struct request_queue *q)
@@ -147,14 +126,13 @@ static void blk_mq_unfreeze_queue(struct request_queue *q)
        bool wake = false;
 
        spin_lock_irq(q->queue_lock);
-       if (!--q->bypass_depth) {
-               queue_flag_clear(QUEUE_FLAG_BYPASS, q);
-               wake = true;
-       }
-       WARN_ON_ONCE(q->bypass_depth < 0);
+       wake = !--q->mq_freeze_depth;
+       WARN_ON_ONCE(q->mq_freeze_depth < 0);
        spin_unlock_irq(q->queue_lock);
-       if (wake)
+       if (wake) {
+               percpu_ref_reinit(&q->mq_usage_counter);
                wake_up_all(&q->mq_freeze_wq);
+       }
 }
 
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
@@ -1798,7 +1776,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
        if (!q)
                goto err_hctxs;
 
-       if (percpu_counter_init(&q->mq_usage_counter, 0))
+       if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release))
                goto err_map;
 
        setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
@@ -1891,7 +1869,7 @@ void blk_mq_free_queue(struct request_queue *q)
        blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
        blk_mq_free_hw_queues(q, set);
 
-       percpu_counter_destroy(&q->mq_usage_counter);
+       percpu_ref_exit(&q->mq_usage_counter);
 
        free_percpu(q->queue_ctx);
        kfree(q->queue_hw_ctx);
@@ -2050,8 +2028,7 @@ static int __init blk_mq_init(void)
 {
        blk_mq_cpu_init();
 
-       /* Must be called after percpu_counter_hotcpu_callback() */
-       hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
+       hotcpu_notifier(blk_mq_queue_reinit_notify, 0);
 
        return 0;
 }
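
The freeze rework above replaces an open-coded percpu counter plus the queue's bypass flag with a percpu_ref, whose kill/reinit cycle gives the same guarantee (no requests in flight) with a much cheaper per-request fast path. The lifecycle, condensed from the hunks above (error handling elided):

/* Setup: the release callback fires when the last ref drops post-kill. */
percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release);

/* Per-request fast path: fails only after the ref has been killed. */
if (percpu_ref_tryget_live(&q->mq_usage_counter)) {
	/* ... issue the request ... */
	percpu_ref_put(&q->mq_usage_counter);
}

/* Freeze: switch the ref to atomic mode, then wait out all users. */
percpu_ref_kill(&q->mq_usage_counter);
wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));

/* Unfreeze: flip the ref back into live percpu mode. */
percpu_ref_reinit(&q->mq_usage_counter);
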
index 26460884c6cd835202e90b2d5985c827495ca626..ca4964a6295d48490f22e25f513c39165af190a4 100644 (file)
@@ -28,7 +28,7 @@ struct blk_mq_ctx {
 void __blk_mq_complete_request(struct request *rq);
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_init_flush(struct request_queue *q);
-void blk_mq_drain_queue(struct request_queue *q);
+void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 void blk_mq_clone_flush_request(struct request *flush_rq,
                struct request *orig_rq);
index 23321fbab29318ae5b550216c66eb9ae2d026c52..4db5abf96b9ec1595a822efebfb9df9776af6477 100644 (file)
@@ -554,8 +554,8 @@ int blk_register_queue(struct gendisk *disk)
         * Initialization must be complete by now.  Finish the initial
         * bypass from queue allocation.
         */
-       blk_queue_bypass_end(q);
        queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
+       blk_queue_bypass_end(q);
 
        ret = blk_trace_init_sysfs(dev);
        if (ret)
index a0926a6094b28a7e4e67b3a88afc993719294405..18b282ce361e12b20ac8952b1bfb1e7c178d1649 100644 (file)
@@ -663,6 +663,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        fmode_t mode = file->f_mode;
        struct backing_dev_info *bdi;
        loff_t size;
+       unsigned int max_sectors;
 
        /*
         * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
@@ -719,8 +720,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        case BLKSSZGET: /* get block device hardware sector size */
                return compat_put_int(arg, bdev_logical_block_size(bdev));
        case BLKSECTGET:
-               return compat_put_ushort(arg,
-                                        queue_max_sectors(bdev_get_queue(bdev)));
+               max_sectors = min_t(unsigned int, USHRT_MAX,
+                                   queue_max_sectors(bdev_get_queue(bdev)));
+               return compat_put_ushort(arg, max_sectors);
        case BLKROTATIONAL:
                return compat_put_ushort(arg,
                                         !blk_queue_nonrot(bdev_get_queue(bdev)));
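
The BLKSECTGET clamp matters because the ioctl returns the limit through a u16: without it, a queue limit above 65535 sectors is silently truncated, so 65536 would read back as 0. A standalone sketch of the difference:

#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned int max_sectors = 65536;	/* illustrative queue limit */

	unsigned short truncated = (unsigned short)max_sectors;	/* old */
	unsigned short clamped = max_sectors > USHRT_MAX
				 ? USHRT_MAX : max_sectors;	/* new */

	printf("truncated=%u clamped=%u\n", truncated, clamped);
	return 0;	/* prints: truncated=0 clamped=65535 */
}
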
index 7d5c3b20af451a111834efbabbdd83269085f123..d6cda8147c91ea828ea0dcd45f21d4ec6773abd6 100644 (file)
@@ -278,6 +278,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
        struct backing_dev_info *bdi;
        loff_t size;
        int ret, n;
+       unsigned int max_sectors;
 
        switch(cmd) {
        case BLKFLSBUF:
@@ -375,7 +376,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
        case BLKDISCARDZEROES:
                return put_uint(arg, bdev_discard_zeroes_data(bdev));
        case BLKSECTGET:
-               return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
+               max_sectors = min_t(unsigned int, USHRT_MAX,
+                                   queue_max_sectors(bdev_get_queue(bdev)));
+               return put_ushort(arg, max_sectors);
        case BLKROTATIONAL:
                return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev)));
        case BLKRASET:
index 43be471d9b1dd8e6771b31328be679f0b53821d9..f3ed7b2d89bf5eae8e4c128472c03fb174330e24 100644 (file)
@@ -215,7 +215,7 @@ int aix_partition(struct parsed_partitions *state)
                numlvs = be16_to_cpu(p->numlvs);
                put_dev_sector(sect);
        }
-       lvip = kzalloc(sizeof(struct lv_info) * state->limit, GFP_KERNEL);
+       lvip = kcalloc(state->limit, sizeof(struct lv_info), GFP_KERNEL);
        if (!lvip)
                return 0;
        if (numlvs && (d = read_part_sector(state, vgda_sector + 1, &sect))) {
@@ -253,7 +253,7 @@ int aix_partition(struct parsed_partitions *state)
                                continue;
                        }
                        lv_ix = be16_to_cpu(p->lv_ix) - 1;
-                       if (lv_ix > state->limit) {
+                       if (lv_ix >= state->limit) {
                                cur_lv_ix = -1;
                                continue;
                        }
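
Two separate hardenings in this file: kcalloc() rejects allocations whose n * size product overflows, which the open-coded kzalloc(sizeof(...) * n, ...) did not, and the slot check becomes >= because valid indices into an array of state->limit entries run from 0 to limit - 1. A userspace sketch of both checks; calloc() performs the same overflow test that kcalloc() does:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct lv_info { uint32_t pps_found, pps_total; };

int main(void)
{
        size_t limit = 256;     /* plays the role of state->limit */
        size_t lv_ix = limit;   /* one past the last valid index */

        /* indices run 0..limit-1, so the test must be '>=', not '>' */
        if (lv_ix >= limit)
                puts("out-of-range index rejected");

        /* calloc(), like kcalloc(), detects n * size overflow and
         * fails cleanly instead of returning a too-small buffer */
        struct lv_info *lvip = calloc(SIZE_MAX / 2, sizeof(*lvip));
        if (!lvip)
                puts("overflowing allocation rejected");
        free(lvip);
        return 0;
}
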
index 70cbf44a1560d5f78a1f0cf0c8b340ec315e4899..2b13533d60a294e43f460fe1276256e34ef6b256 100644 (file)
@@ -7,6 +7,8 @@
  *  Re-organised Feb 1998 Russell King
  */
 
+#define pr_fmt(fmt) fmt
+
 #include <linux/types.h>
 #include <linux/affs_hardblocks.h>
 
@@ -40,7 +42,7 @@ int amiga_partition(struct parsed_partitions *state)
                data = read_part_sector(state, blk, &sect);
                if (!data) {
                        if (warn_no_part)
-                               printk("Dev %s: unable to read RDB block %d\n",
+                               pr_err("Dev %s: unable to read RDB block %d\n",
                                       bdevname(state->bdev, b), blk);
                        res = -1;
                        goto rdb_done;
@@ -57,12 +59,12 @@ int amiga_partition(struct parsed_partitions *state)
                *(__be32 *)(data+0xdc) = 0;
                if (checksum_block((__be32 *)data,
                                be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) {
-                       printk("Warning: Trashed word at 0xd0 in block %d "
-                               "ignored in checksum calculation\n",blk);
+                       pr_err("Trashed word at 0xd0 in block %d ignored in checksum calculation\n",
+                              blk);
                        break;
                }
 
-               printk("Dev %s: RDB in block %d has bad checksum\n",
+               pr_err("Dev %s: RDB in block %d has bad checksum\n",
                       bdevname(state->bdev, b), blk);
        }
 
@@ -83,7 +85,7 @@ int amiga_partition(struct parsed_partitions *state)
                data = read_part_sector(state, blk, &sect);
                if (!data) {
                        if (warn_no_part)
-                               printk("Dev %s: unable to read partition block %d\n",
+                               pr_err("Dev %s: unable to read partition block %d\n",
                                       bdevname(state->bdev, b), blk);
                        res = -1;
                        goto rdb_done;
index dc51f467a560558ab4812d339fe2bd24f83a00b2..56d08fd75b1a9511152eb120b6439f4afe01a00a 100644 (file)
@@ -121,7 +121,7 @@ __setup("gpt", force_gpt_fn);
 /**
  * efi_crc32() - EFI version of crc32 function
  * @buf: buffer to calculate crc32 of
- * @len - length of buf
+ * @len: length of buf
  *
  * Description: Returns EFI-style CRC32 value for @buf
  * 
@@ -240,10 +240,10 @@ done:
 
 /**
  * read_lba(): Read bytes from disk, starting at given LBA
- * @state
- * @lba
- * @buffer
- * @size_t
+ * @state: disk parsed partitions
+ * @lba: the Logical Block Address of the partition table
+ * @buffer: destination buffer
+ * @count: bytes to read
  *
  * Description: Reads @count bytes from @state->bdev into @buffer.
  * Returns number of bytes read on success, 0 on error.
@@ -277,8 +277,8 @@ static size_t read_lba(struct parsed_partitions *state,
 
 /**
  * alloc_read_gpt_entries(): reads partition entries from disk
- * @state
- * @gpt - GPT header
+ * @state: disk parsed partitions
+ * @gpt: GPT header
  * 
  * Description: Returns ptes on success,  NULL on error.
  * Allocates space for PTEs based on information found in @gpt.
@@ -312,8 +312,8 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
 
 /**
  * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
- * @state
- * @lba is the Logical Block Address of the partition table
+ * @state: disk parsed partitions
+ * @lba: the Logical Block Address of the partition table
  * 
  * Description: returns GPT header on success, NULL on error.   Allocates
 * and fills a GPT header starting at @lba from @state->bdev.
@@ -340,10 +340,10 @@ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
 
 /**
  * is_gpt_valid() - tests one GPT header and PTEs for validity
- * @state
- * @lba is the logical block address of the GPT header to test
- * @gpt is a GPT header ptr, filled on return.
- * @ptes is a PTEs ptr, filled on return.
+ * @state: disk parsed partitions
+ * @lba: logical block address of the GPT header to test
+ * @gpt: GPT header ptr, filled on return.
+ * @ptes: PTEs ptr, filled on return.
  *
  * Description: returns 1 if valid,  0 on error.
  * If valid, returns pointers to newly allocated GPT header and PTEs.
@@ -461,8 +461,8 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
 
 /**
  * is_pte_valid() - tests one PTE for validity
- * @pte is the pte to check
- * @lastlba is last lba of the disk
+ * @pte: pte to check
+ * @lastlba: last lba of the disk
  *
  * Description: returns 1 if valid,  0 on error.
  */
@@ -478,9 +478,10 @@ is_pte_valid(const gpt_entry *pte, const u64 lastlba)
 
 /**
  * compare_gpts() - Search disk for valid GPT headers and PTEs
- * @pgpt is the primary GPT header
- * @agpt is the alternate GPT header
- * @lastlba is the last LBA number
+ * @pgpt: primary GPT header
+ * @agpt: alternate GPT header
+ * @lastlba: last LBA number
+ *
  * Description: Returns nothing.  Sanity checks pgpt and agpt fields
  * and prints warnings on discrepancies.
  * 
@@ -572,9 +573,10 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
 
 /**
  * find_valid_gpt() - Search disk for valid GPT headers and PTEs
- * @state
- * @gpt is a GPT header ptr, filled on return.
- * @ptes is a PTEs ptr, filled on return.
+ * @state: disk parsed partitions
+ * @gpt: GPT header ptr, filled on return.
+ * @ptes: PTEs ptr, filled on return.
+ *
  * Description: Returns 1 if valid, 0 on error.
  * If valid, returns pointers to newly allocated GPT header and PTEs.
  * Validity depends on PMBR being valid (or being overridden by the
@@ -663,7 +665,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
 
 /**
  * efi_partition(struct parsed_partitions *state)
- * @state
+ * @state: disk parsed partitions
  *
  * Description: called from check.c, if the disk contains GPT
  * partitions, sets up partition entries in the kernel.
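
Every hunk in this file repairs kernel-doc syntax: scripts/kernel-doc only recognizes parameters written as "@name: description", so the bare "@state" and "@lba is ..." forms produced build warnings and empty parameter entries in the generated documentation. The expected shape, shown on a hypothetical helper:

/**
 * read_sector() - read one sector from a block device
 * @state: partition-parsing state for the device
 * @lba: logical block address to read
 * @buf: destination buffer, at least one sector in size
 *
 * Description: Returns the number of bytes read on success, 0 on error.
 */
static size_t read_sector(struct parsed_partitions *state, u64 lba, void *buf);
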
index 9123f250b425170b325943c351f0c34fac3562ff..93e7c1b32eddd5aa27fc8c96f5f581f712541a53 100644 (file)
@@ -159,8 +159,9 @@ static void parse_extended(struct parsed_partitions *state,
                /*
                 * First process the data partition(s)
                 */
-               for (i=0; i<4; i++, p++) {
+               for (i = 0; i < 4; i++, p++) {
                        sector_t offs, size, next;
+
                        if (!nr_sects(p) || is_extended_partition(p))
                                continue;
 
@@ -194,7 +195,7 @@ static void parse_extended(struct parsed_partitions *state,
                 * It should be a link to the next logical partition.
                 */
                p -= 4;
-               for (i=0; i<4; i++, p++)
+               for (i = 0; i < 4; i++, p++)
                        if (nr_sects(p) && is_extended_partition(p))
                                break;
                if (i == 4)
@@ -243,8 +244,8 @@ static void parse_solaris_x86(struct parsed_partitions *state,
                return;
        }
        /* Ensure we can handle previous case of VTOC with 8 entries gracefully */
-       max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
-       for (i=0; i<max_nparts && state->next<state->limit; i++) {
+       max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
+       for (i = 0; i < max_nparts && state->next < state->limit; i++) {
                struct solaris_x86_slice *s = &v->v_slice[i];
                char tmp[3 + 10 + 1 + 1];
 
@@ -409,7 +410,7 @@ static void parse_minix(struct parsed_partitions *state,
        /* The first sector of a Minix partition can have either
         * a secondary MBR describing its subpartitions, or
         * the normal boot sector. */
-       if (msdos_magic_present (data + 510) &&
+       if (msdos_magic_present(data + 510) &&
            SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
                char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
 
@@ -527,6 +528,7 @@ int msdos_partition(struct parsed_partitions *state)
        for (slot = 1 ; slot <= 4 ; slot++, p++) {
                sector_t start = start_sect(p)*sector_size;
                sector_t size = nr_sects(p)*sector_size;
+
                if (!size)
                        continue;
                if (is_extended_partition(p)) {
@@ -537,6 +539,7 @@ int msdos_partition(struct parsed_partitions *state)
                         * sector, although it may not be enough/proper.
                         */
                        sector_t n = 2;
+
                        n = min(size, max(sector_size, n));
                        put_partition(state, slot, start, n);
 
index 14695c6221c821588592f65fb72507daf230a358..51bf5155ee756a4ac479e9c49fcf88824b0aeedc 100644 (file)
@@ -82,9 +82,18 @@ static int sg_set_timeout(struct request_queue *q, int __user *p)
        return err;
 }
 
+static int max_sectors_bytes(struct request_queue *q)
+{
+       unsigned int max_sectors = queue_max_sectors(q);
+
+       max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9);
+
+       return max_sectors << 9;
+}
+
 static int sg_get_reserved_size(struct request_queue *q, int __user *p)
 {
-       unsigned val = min(q->sg_reserved_size, queue_max_sectors(q) << 9);
+       int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q));
 
        return put_user(val, p);
 }
@@ -98,10 +107,8 @@ static int sg_set_reserved_size(struct request_queue *q, int __user *p)
 
        if (size < 0)
                return -EINVAL;
-       if (size > (queue_max_sectors(q) << 9))
-               size = queue_max_sectors(q) << 9;
 
-       q->sg_reserved_size = size;
+       q->sg_reserved_size = min(size, max_sectors_bytes(q));
        return 0;
 }
 
@@ -283,6 +290,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
        unsigned long start_time;
        ssize_t ret = 0;
        int writing = 0;
+       int at_head = 0;
        struct request *rq;
        char sense[SCSI_SENSE_BUFFERSIZE];
        struct bio *bio;
@@ -306,6 +314,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
                case SG_DXFER_FROM_DEV:
                        break;
                }
+       if (hdr->flags & SG_FLAG_Q_AT_HEAD)
+               at_head = 1;
 
        rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
        if (!rq)
@@ -362,7 +372,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
         * (if he doesn't check that is his problem).
         * N.B. a non-zero SCSI status is _not_ necessarily an error.
         */
-       blk_execute_rq(q, bd_disk, rq, 0);
+       blk_execute_rq(q, bd_disk, rq, at_head);
 
        hdr->duration = jiffies_to_msecs(jiffies - start_time);
 
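
Two independent fixes land in the SG_IO path. max_sectors_bytes() guards the sectors-to-bytes conversion: queue_max_sectors() counts 512-byte sectors, and shifting a large count left by 9 in int arithmetic can overflow, so the count is capped at INT_MAX >> 9 before the shift. Separately, sg_io() now honours SG_FLAG_Q_AT_HEAD instead of always queueing at the tail. A compilable sketch of the overflow guard; the stub value is invented purely to trigger the wrap-around:

#include <stdio.h>
#include <limits.h>

static unsigned int queue_max_sectors_stub(void)
{
        return 8u * 1024 * 1024;        /* 8M sectors: << 9 would wrap */
}

static int max_sectors_bytes(void)
{
        unsigned int max_sectors = queue_max_sectors_stub();

        /* cap before shifting, as the new kernel helper does */
        if (max_sectors > INT_MAX >> 9)
                max_sectors = INT_MAX >> 9;

        return max_sectors << 9;
}

int main(void)
{
        printf("clamped byte limit: %d\n", max_sectors_bytes());
        return 0;
}
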
index 9c62340c2360b5960bdde09dc5172ca763506473..c96887d5289eaed997e29eaab738f8bb4bb488ee 100644 (file)
@@ -481,6 +481,10 @@ void acpi_pci_irq_disable(struct pci_dev *dev)
        if (!pin)
                return;
 
+       /* Keep IOAPIC pin configuration when suspending */
+       if (dev->dev.power.is_prepared)
+               return;
+
        entry = acpi_pci_irq_lookup(dev, pin);
        if (!entry)
                return;
@@ -498,5 +502,6 @@ void acpi_pci_irq_disable(struct pci_dev *dev)
         */
 
        dev_dbg(&dev->dev, "PCI INT %c disabled\n", pin_name(pin));
-       acpi_unregister_gsi(gsi);
+       if (gsi >= 0 && dev->irq > 0)
+               acpi_unregister_gsi(gsi);
 }
index 0e3f8f9dcd294f5a79db29b6752808959f3e6719..480fa6ffbc090a48ae32496ef788c8e63d92e6be 100644 (file)
@@ -299,6 +299,7 @@ static int atmtcp_c_send(struct atm_vcc *vcc,struct sk_buff *skb)
        out_vcc = find_vcc(dev, ntohs(hdr->vpi), ntohs(hdr->vci));
        read_unlock(&vcc_sklist_lock);
        if (!out_vcc) {
+               result = -EUNATCH;
                atomic_inc(&vcc->stats->tx_err);
                goto done;
        }
index 943cf0d6abaf8b959e2a879ca901fedaea693b1e..7652e8dc188f93036e03a23a99ac7aee3b543811 100644 (file)
@@ -1278,6 +1278,7 @@ static int fpga_probe(struct pci_dev *dev, const struct pci_device_id *id)
                        card->dma_bounce = kmalloc(card->nr_ports * BUF_SIZE, GFP_KERNEL);
                        if (!card->dma_bounce) {
                                dev_warn(&card->dev->dev, "Failed to allocate DMA bounce buffers\n");
+                               err = -ENOMEM;
                                /* Fallback to MMIO doesn't work */
                                goto out_unmap_both;
                        }
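
The bug pattern behind this one-liner: on the kmalloc() failure branch the function jumped to the shared unwind label with err still holding the value from an earlier, successful step, so a failed probe could return 0. The rule is that every failure branch sets its own error code before the goto. A hedged skeleton of the idiom, with invented names:

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static int probe(size_t nr_ports, size_t buf_size)
{
        int err;
        void *regs, *bounce;

        regs = malloc(4096);
        if (!regs)
                return -ENOMEM;

        bounce = malloc(nr_ports * buf_size);
        if (!bounce) {
                err = -ENOMEM;          /* set it here, not earlier */
                goto out_unmap;
        }

        free(bounce);
        free(regs);
        return 0;

out_unmap:
        free(regs);
        return err;
}

int main(void)
{
        printf("probe: %d\n", probe(4, 2048));
        return 0;
}
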
index 8b450338075eca905b91c04e16ce3567e54f73d9..4464e353c1e81fd2faf3fe823bf244807fa0afd3 100644 (file)
@@ -3,5 +3,6 @@ drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
 drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
 drbd-y += drbd_interval.o drbd_state.o
 drbd-y += drbd_nla.o
+drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o
 
 obj-$(CONFIG_BLK_DEV_DRBD)     += drbd.o
index 05a1780ffa850483cdf4d73a89b3a9a78964ed4c..d26a3fa6368849ce95d59a7847c8853c11861026 100644 (file)
@@ -92,34 +92,26 @@ struct __packed al_transaction_on_disk {
        __be32  context[AL_CONTEXT_PER_TRANSACTION];
 };
 
-struct update_odbm_work {
-       struct drbd_work w;
-       struct drbd_device *device;
-       unsigned int enr;
-};
-
-struct update_al_work {
-       struct drbd_work w;
-       struct drbd_device *device;
-       struct completion event;
-       int err;
-};
-
-
-void *drbd_md_get_buffer(struct drbd_device *device)
+void *drbd_md_get_buffer(struct drbd_device *device, const char *intent)
 {
        int r;
 
        wait_event(device->misc_wait,
-                  (r = atomic_cmpxchg(&device->md_io_in_use, 0, 1)) == 0 ||
+                  (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 ||
                   device->state.disk <= D_FAILED);
 
-       return r ? NULL : page_address(device->md_io_page);
+       if (r)
+               return NULL;
+
+       device->md_io.current_use = intent;
+       device->md_io.start_jif = jiffies;
+       device->md_io.submit_jif = device->md_io.start_jif - 1;
+       return page_address(device->md_io.page);
 }
 
 void drbd_md_put_buffer(struct drbd_device *device)
 {
-       if (atomic_dec_and_test(&device->md_io_in_use))
+       if (atomic_dec_and_test(&device->md_io.in_use))
                wake_up(&device->misc_wait);
 }
 
@@ -145,10 +137,11 @@ void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_b
 
 static int _drbd_md_sync_page_io(struct drbd_device *device,
                                 struct drbd_backing_dev *bdev,
-                                struct page *page, sector_t sector,
-                                int rw, int size)
+                                sector_t sector, int rw)
 {
        struct bio *bio;
+       /* we do all our meta data IO in aligned 4k blocks. */
+       const int size = 4096;
        int err;
 
        device->md_io.done = 0;
@@ -156,15 +149,15 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 
        if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags))
                rw |= REQ_FUA | REQ_FLUSH;
-       rw |= REQ_SYNC;
+       rw |= REQ_SYNC | REQ_NOIDLE;
 
        bio = bio_alloc_drbd(GFP_NOIO);
        bio->bi_bdev = bdev->md_bdev;
        bio->bi_iter.bi_sector = sector;
        err = -EIO;
-       if (bio_add_page(bio, page, size, 0) != size)
+       if (bio_add_page(bio, device->md_io.page, size, 0) != size)
                goto out;
-       bio->bi_private = &device->md_io;
+       bio->bi_private = device;
        bio->bi_end_io = drbd_md_io_complete;
        bio->bi_rw = rw;
 
@@ -179,7 +172,8 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
        }
 
        bio_get(bio); /* one bio_put() is in the completion handler */
-       atomic_inc(&device->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
+       atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */
+       device->md_io.submit_jif = jiffies;
        if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
                bio_endio(bio, -EIO);
        else
@@ -197,9 +191,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
                         sector_t sector, int rw)
 {
        int err;
-       struct page *iop = device->md_io_page;
-
-       D_ASSERT(device, atomic_read(&device->md_io_in_use) == 1);
+       D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1);
 
        BUG_ON(!bdev->md_bdev);
 
@@ -214,8 +206,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
                     current->comm, current->pid, __func__,
                     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
 
-       /* we do all our meta data IO in aligned 4k blocks. */
-       err = _drbd_md_sync_page_io(device, bdev, iop, sector, rw, 4096);
+       err = _drbd_md_sync_page_io(device, bdev, sector, rw);
        if (err) {
                drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
                    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
@@ -297,26 +288,12 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *
        return need_transaction;
 }
 
-static int al_write_transaction(struct drbd_device *device, bool delegate);
-
-/* When called through generic_make_request(), we must delegate
- * activity log I/O to the worker thread: a further request
- * submitted via generic_make_request() within the same task
- * would be queued on current->bio_list, and would only start
- * after this function returns (see generic_make_request()).
- *
- * However, if we *are* the worker, we must not delegate to ourselves.
- */
+static int al_write_transaction(struct drbd_device *device);
 
-/*
- * @delegate:   delegate activity log I/O to the worker thread
- */
-void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
+void drbd_al_begin_io_commit(struct drbd_device *device)
 {
        bool locked = false;
 
-       BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);
-
        /* Serialize multiple transactions.
         * This uses test_and_set_bit, memory barrier is implicit.
         */
@@ -335,7 +312,7 @@ void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
                        rcu_read_unlock();
 
                        if (write_al_updates)
-                               al_write_transaction(device, delegate);
+                               al_write_transaction(device);
                        spin_lock_irq(&device->al_lock);
                        /* FIXME
                        if (err)
@@ -352,12 +329,7 @@ void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
-/*
- * @delegate:   delegate activity log I/O to the worker thread
- */
-void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate)
+void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i)
 {
-       BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);
-
        if (drbd_al_begin_io_prepare(device, i))
-               drbd_al_begin_io_commit(device, delegate);
+               drbd_al_begin_io_commit(device);
 }
 
 int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i)
@@ -380,8 +355,19 @@ int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *
        /* We want all necessary updates for a given request within the same transaction
         * We could first check how many updates are *actually* needed,
         * and use that instead of the worst-case nr_al_extents */
-       if (available_update_slots < nr_al_extents)
-               return -EWOULDBLOCK;
+       if (available_update_slots < nr_al_extents) {
+               /* Too many activity log extents are currently "hot".
+                *
+                * If we have accumulated pending changes already,
+                * we have made progress.
+                *
+                * If we cannot get even a single pending change through,
+                * stop the fast path until we have made some progress,
+                * or requests to "cold" extents could be starved. */
+               if (!al->pending_changes)
+                       __set_bit(__LC_STARVING, &device->act_log->flags);
+               return -ENOBUFS;
+       }
 
        /* Is resync active in this area? */
        for (enr = first; enr <= last; enr++) {
@@ -452,15 +438,6 @@ static unsigned int al_extent_to_bm_page(unsigned int al_enr)
                 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
 }
 
-static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
-{
-       return rs_enr >>
-               /* bit to page */
-               ((PAGE_SHIFT + 3) -
-               /* resync extent number to bit */
-                (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
-}
-
 static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
 {
        const unsigned int stripes = device->ldev->md.al_stripes;
@@ -479,8 +456,7 @@ static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
        return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
 }
 
-static int
-_al_write_transaction(struct drbd_device *device)
+int al_write_transaction(struct drbd_device *device)
 {
        struct al_transaction_on_disk *buffer;
        struct lc_element *e;
@@ -505,7 +481,8 @@ _al_write_transaction(struct drbd_device *device)
                return -EIO;
        }
 
-       buffer = drbd_md_get_buffer(device); /* protects md_io_buffer, al_tr_cycle, ... */
+       /* protects md_io_buffer, al_tr_cycle, ... */
+       buffer = drbd_md_get_buffer(device, __func__);
        if (!buffer) {
                drbd_err(device, "disk failed while waiting for md_io buffer\n");
                put_ldev(device);
@@ -590,38 +567,6 @@ _al_write_transaction(struct drbd_device *device)
        return err;
 }
 
-
-static int w_al_write_transaction(struct drbd_work *w, int unused)
-{
-       struct update_al_work *aw = container_of(w, struct update_al_work, w);
-       struct drbd_device *device = aw->device;
-       int err;
-
-       err = _al_write_transaction(device);
-       aw->err = err;
-       complete(&aw->event);
-
-       return err != -EIO ? err : 0;
-}
-
-/* Calls from worker context (see w_restart_disk_io()) need to write the
-   transaction directly. Others came through generic_make_request(),
-   those need to delegate it to the worker. */
-static int al_write_transaction(struct drbd_device *device, bool delegate)
-{
-       if (delegate) {
-               struct update_al_work al_work;
-               init_completion(&al_work.event);
-               al_work.w.cb = w_al_write_transaction;
-               al_work.device = device;
-               drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
-                                     &al_work.w);
-               wait_for_completion(&al_work.event);
-               return al_work.err;
-       } else
-               return _al_write_transaction(device);
-}
-
 static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
 {
        int rv;
@@ -682,72 +627,56 @@ int drbd_initialize_al(struct drbd_device *device, void *buffer)
        return 0;
 }
 
-static int w_update_odbm(struct drbd_work *w, int unused)
-{
-       struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
-       struct drbd_device *device = udw->device;
-       struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
-
-       if (!get_ldev(device)) {
-               if (__ratelimit(&drbd_ratelimit_state))
-                       drbd_warn(device, "Can not update on disk bitmap, local IO disabled.\n");
-               kfree(udw);
-               return 0;
-       }
-
-       drbd_bm_write_page(device, rs_extent_to_bm_page(udw->enr));
-       put_ldev(device);
-
-       kfree(udw);
-
-       if (drbd_bm_total_weight(device) <= device->rs_failed) {
-               switch (device->state.conn) {
-               case C_SYNC_SOURCE:  case C_SYNC_TARGET:
-               case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
-                       drbd_resync_finished(device);
-               default:
-                       /* nothing to do */
-                       break;
-               }
-       }
-       drbd_bcast_event(device, &sib);
-
-       return 0;
-}
-
+static const char *drbd_change_sync_fname[] = {
+       [RECORD_RS_FAILED] = "drbd_rs_failed_io",
+       [SET_IN_SYNC] = "drbd_set_in_sync",
+       [SET_OUT_OF_SYNC] = "drbd_set_out_of_sync"
+};
 
 /* ATTENTION. The AL's extents are 4MB each, while the extents in the
  * resync LRU-cache are 16MB each.
  * The caller of this function has to hold a get_ldev() reference.
  *
+ * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success),
+ * potentially pulling this resync extent into the resync extent lru cache
+ * (and recounting the corresponding bits).
+ *
+ * Returns whether all bits have been cleared for this resync extent,
+ * precisely: (rs_left <= rs_failed)
+ *
  * TODO will be obsoleted once we have a caching lru of the on disk bitmap
  */
-static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t sector,
-                                     int count, int success)
+static bool update_rs_extent(struct drbd_device *device,
+               unsigned int enr, int count,
+               enum update_sync_bits_mode mode)
 {
        struct lc_element *e;
-       struct update_odbm_work *udw;
-
-       unsigned int enr;
 
        D_ASSERT(device, atomic_read(&device->local_cnt));
 
-       /* I simply assume that a sector/size pair never crosses
-        * a 16 MB extent border. (Currently this is true...) */
-       enr = BM_SECT_TO_EXT(sector);
-
-       e = lc_get(device->resync, enr);
+       /* When setting out-of-sync bits,
+        * we don't need it cached (lc_find).
+        * But if it is present in the cache,
+        * we should update the cached bit count.
+        * Otherwise, that extent should be in the resync extent lru cache
+        * already -- or we want to pull it in if necessary -- (lc_get),
+        * then update and check rs_left and rs_failed. */
+       if (mode == SET_OUT_OF_SYNC)
+               e = lc_find(device->resync, enr);
+       else
+               e = lc_get(device->resync, enr);
        if (e) {
                struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
                if (ext->lce.lc_number == enr) {
-                       if (success)
+                       if (mode == SET_IN_SYNC)
                                ext->rs_left -= count;
+                       else if (mode == SET_OUT_OF_SYNC)
+                               ext->rs_left += count;
                        else
                                ext->rs_failed += count;
                        if (ext->rs_left < ext->rs_failed) {
-                               drbd_warn(device, "BAD! sector=%llus enr=%u rs_left=%d "
+                               drbd_warn(device, "BAD! enr=%u rs_left=%d "
                                    "rs_failed=%d count=%d cstate=%s\n",
-                                    (unsigned long long)sector,
                                     ext->lce.lc_number, ext->rs_left,
                                     ext->rs_failed, count,
                                     drbd_conn_str(device->state.conn));
@@ -781,34 +710,27 @@ static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t secto
                                     ext->lce.lc_number, ext->rs_failed);
                        }
                        ext->rs_left = rs_left;
-                       ext->rs_failed = success ? 0 : count;
+                       ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0;
                        /* we don't keep a persistent log of the resync lru,
                         * we can commit any change right away. */
                        lc_committed(device->resync);
                }
-               lc_put(device->resync, &ext->lce);
+               if (mode != SET_OUT_OF_SYNC)
+                       lc_put(device->resync, &ext->lce);
                /* no race, we are within the al_lock! */
 
-               if (ext->rs_left == ext->rs_failed) {
+               if (ext->rs_left <= ext->rs_failed) {
                        ext->rs_failed = 0;
-
-                       udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
-                       if (udw) {
-                               udw->enr = ext->lce.lc_number;
-                               udw->w.cb = w_update_odbm;
-                               udw->device = device;
-                               drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
-                                                     &udw->w);
-                       } else {
-                               drbd_warn(device, "Could not kmalloc an udw\n");
-                       }
+                       return true;
                }
-       } else {
+       } else if (mode != SET_OUT_OF_SYNC) {
+               /* be quiet if lc_find() did not find it. */
                drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n",
                    device->resync_locked,
                    device->resync->nr_elements,
                    device->resync->flags);
        }
+       return false;
 }
 
 void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go)
@@ -827,105 +749,105 @@ void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go
        }
 }
 
-/* clear the bit corresponding to the piece of storage in question:
- * size byte of data starting from sector.  Only clear a bits of the affected
- * one ore more _aligned_ BM_BLOCK_SIZE blocks.
- *
- * called by worker on C_SYNC_TARGET and receiver on SyncSource.
- *
- */
-void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size,
-                      const char *file, const unsigned int line)
+/* This is called a lazy update, so don't do the write-out too often. */
+static bool lazy_bitmap_update_due(struct drbd_device *device)
 {
-       /* Is called from worker and receiver context _only_ */
-       unsigned long sbnr, ebnr, lbnr;
-       unsigned long count = 0;
-       sector_t esector, nr_sectors;
-       int wake_up = 0;
-       unsigned long flags;
+       return time_after(jiffies, device->rs_last_bcast + 2*HZ);
+}
 
-       if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
-               drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
-                               (unsigned long long)sector, size);
+static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done)
+{
+       if (rs_done)
+               set_bit(RS_DONE, &device->flags);
+               /* and also set RS_PROGRESS below */
+       else if (!lazy_bitmap_update_due(device))
                return;
-       }
-
-       if (!get_ldev(device))
-               return; /* no disk, no metadata, no bitmap to clear bits in */
-
-       nr_sectors = drbd_get_capacity(device->this_bdev);
-       esector = sector + (size >> 9) - 1;
-
-       if (!expect(sector < nr_sectors))
-               goto out;
-       if (!expect(esector < nr_sectors))
-               esector = nr_sectors - 1;
-
-       lbnr = BM_SECT_TO_BIT(nr_sectors-1);
-
-       /* we clear it (in sync).
-        * round up start sector, round down end sector.  we make sure we only
-        * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
-       if (unlikely(esector < BM_SECT_PER_BIT-1))
-               goto out;
-       if (unlikely(esector == (nr_sectors-1)))
-               ebnr = lbnr;
-       else
-               ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
-       sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
 
-       if (sbnr > ebnr)
-               goto out;
+       drbd_device_post_work(device, RS_PROGRESS);
+}
 
+static int update_sync_bits(struct drbd_device *device,
+               unsigned long sbnr, unsigned long ebnr,
+               enum update_sync_bits_mode mode)
+{
        /*
-        * ok, (capacity & 7) != 0 sometimes, but who cares...
-        * we count rs_{total,left} in bits, not sectors.
+        * We keep a count of set bits per resync-extent in the ->rs_left
+        * caching member, so we need to loop and work within the resync extent
+        * alignment. Typically this loop will execute exactly once.
         */
-       count = drbd_bm_clear_bits(device, sbnr, ebnr);
-       if (count) {
-               drbd_advance_rs_marks(device, drbd_bm_total_weight(device));
-               spin_lock_irqsave(&device->al_lock, flags);
-               drbd_try_clear_on_disk_bm(device, sector, count, true);
-               spin_unlock_irqrestore(&device->al_lock, flags);
-
-               /* just wake_up unconditional now, various lc_chaged(),
-                * lc_put() in drbd_try_clear_on_disk_bm(). */
-               wake_up = 1;
+       unsigned long flags;
+       unsigned long count = 0;
+       unsigned int cleared = 0;
+       while (sbnr <= ebnr) {
+               /* set temporary boundary bit number to last bit number within
+                * the resync extent of the current start bit number,
+                * but cap at provided end bit number */
+               unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK);
+               unsigned long c;
+
+               if (mode == RECORD_RS_FAILED)
+                       /* Only called from drbd_rs_failed_io(), bits
+                        * supposedly still set.  Recount, maybe some
+                        * of the bits have been successfully cleared
+                        * by application IO meanwhile.
+                        */
+                       c = drbd_bm_count_bits(device, sbnr, tbnr);
+               else if (mode == SET_IN_SYNC)
+                       c = drbd_bm_clear_bits(device, sbnr, tbnr);
+               else /* if (mode == SET_OUT_OF_SYNC) */
+                       c = drbd_bm_set_bits(device, sbnr, tbnr);
+
+               if (c) {
+                       spin_lock_irqsave(&device->al_lock, flags);
+                       cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode);
+                       spin_unlock_irqrestore(&device->al_lock, flags);
+                       count += c;
+               }
+               sbnr = tbnr + 1;
        }
-out:
-       put_ldev(device);
-       if (wake_up)
+       if (count) {
+               if (mode == SET_IN_SYNC) {
+                       unsigned long still_to_go = drbd_bm_total_weight(device);
+                       bool rs_is_done = (still_to_go <= device->rs_failed);
+                       drbd_advance_rs_marks(device, still_to_go);
+                       if (cleared || rs_is_done)
+                               maybe_schedule_on_disk_bitmap_update(device, rs_is_done);
+               } else if (mode == RECORD_RS_FAILED)
+                       device->rs_failed += count;
                wake_up(&device->al_wait);
+       }
+       return count;
 }
 
-/*
- * this is intended to set one request worth of data out of sync.
- * affects at least 1 bit,
- * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.
+/* clear the bit corresponding to the piece of storage in question:
+ * size bytes of data starting from sector.  Only clear the bits of the
+ * affected one or more _aligned_ BM_BLOCK_SIZE blocks.
+ *
+ * called by worker on C_SYNC_TARGET and receiver on SyncSource.
  *
- * called by tl_clear and drbd_send_dblock (==drbd_make_request).
- * so this can be _any_ process.
  */
-int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size,
-                           const char *file, const unsigned int line)
+int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+               enum update_sync_bits_mode mode,
+               const char *file, const unsigned int line)
 {
-       unsigned long sbnr, ebnr, flags;
+       /* Is called from worker and receiver context _only_ */
+       unsigned long sbnr, ebnr, lbnr;
+       unsigned long count = 0;
        sector_t esector, nr_sectors;
-       unsigned int enr, count = 0;
-       struct lc_element *e;
 
-       /* this should be an empty REQ_FLUSH */
-       if (size == 0)
+       /* This would be an empty REQ_FLUSH, be silent. */
+       if ((mode == SET_OUT_OF_SYNC) && size == 0)
                return 0;
 
-       if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
-               drbd_err(device, "sector: %llus, size: %d\n",
-                       (unsigned long long)sector, size);
+       if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+               drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
+                               drbd_change_sync_fname[mode],
+                               (unsigned long long)sector, size);
                return 0;
        }
 
        if (!get_ldev(device))
-               return 0; /* no disk, no metadata, no bitmap to set bits in */
+               return 0; /* no disk, no metadata, no bitmap to manipulate bits in */
 
        nr_sectors = drbd_get_capacity(device->this_bdev);
        esector = sector + (size >> 9) - 1;
@@ -935,25 +857,28 @@ int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size
        if (!expect(esector < nr_sectors))
                esector = nr_sectors - 1;
 
-       /* we set it out of sync,
-        * we do not need to round anything here */
-       sbnr = BM_SECT_TO_BIT(sector);
-       ebnr = BM_SECT_TO_BIT(esector);
-
-       /* ok, (capacity & 7) != 0 sometimes, but who cares...
-        * we count rs_{total,left} in bits, not sectors.  */
-       spin_lock_irqsave(&device->al_lock, flags);
-       count = drbd_bm_set_bits(device, sbnr, ebnr);
+       lbnr = BM_SECT_TO_BIT(nr_sectors-1);
 
-       enr = BM_SECT_TO_EXT(sector);
-       e = lc_find(device->resync, enr);
-       if (e)
-               lc_entry(e, struct bm_extent, lce)->rs_left += count;
-       spin_unlock_irqrestore(&device->al_lock, flags);
+       if (mode == SET_IN_SYNC) {
+               /* Round up start sector, round down end sector.  We make sure
+                * we only clear full, aligned, BM_BLOCK_SIZE blocks. */
+               if (unlikely(esector < BM_SECT_PER_BIT-1))
+                       goto out;
+               if (unlikely(esector == (nr_sectors-1)))
+                       ebnr = lbnr;
+               else
+                       ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+               sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+       } else {
+               /* We set it out of sync, or record resync failure.
+                * Should not round anything here. */
+               sbnr = BM_SECT_TO_BIT(sector);
+               ebnr = BM_SECT_TO_BIT(esector);
+       }
 
+       count = update_sync_bits(device, sbnr, ebnr, mode);
 out:
        put_ldev(device);
-
        return count;
 }
 
@@ -1075,6 +1000,15 @@ int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
        struct lc_element *e;
        struct bm_extent *bm_ext;
        int i;
+       bool throttle = drbd_rs_should_slow_down(device, sector, true);
+
+       /* If we need to throttle, a half-locked (only marked BME_NO_WRITES,
+        * not yet BME_LOCKED) extent needs to be kicked out explicitly.
+        * There is at most one such half-locked extent,
+        * which is remembered in resync_wenr. */
+
+       if (throttle && device->resync_wenr != enr)
+               return -EAGAIN;
 
        spin_lock_irq(&device->al_lock);
        if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) {
@@ -1098,8 +1032,10 @@ int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
                        D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
                        clear_bit(BME_NO_WRITES, &bm_ext->flags);
                        device->resync_wenr = LC_FREE;
-                       if (lc_put(device->resync, &bm_ext->lce) == 0)
+                       if (lc_put(device->resync, &bm_ext->lce) == 0) {
+                               bm_ext->flags = 0;
                                device->resync_locked--;
+                       }
                        wake_up(&device->al_wait);
                } else {
                        drbd_alert(device, "LOGIC BUG\n");
@@ -1161,8 +1097,20 @@ proceed:
        return 0;
 
 try_again:
-       if (bm_ext)
-               device->resync_wenr = enr;
+       if (bm_ext) {
+               if (throttle) {
+                       D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+                       D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+                       clear_bit(BME_NO_WRITES, &bm_ext->flags);
+                       device->resync_wenr = LC_FREE;
+                       if (lc_put(device->resync, &bm_ext->lce) == 0) {
+                               bm_ext->flags = 0;
+                               device->resync_locked--;
+                       }
+                       wake_up(&device->al_wait);
+               } else
+                       device->resync_wenr = enr;
+       }
        spin_unlock_irq(&device->al_lock);
        return -EAGAIN;
 }
@@ -1270,69 +1218,3 @@ int drbd_rs_del_all(struct drbd_device *device)
 
        return 0;
 }
-
-/**
- * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
- * @device:    DRBD device.
- * @sector:    The sector number.
- * @size:      Size of failed IO operation, in byte.
- */
-void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size)
-{
-       /* Is called from worker and receiver context _only_ */
-       unsigned long sbnr, ebnr, lbnr;
-       unsigned long count;
-       sector_t esector, nr_sectors;
-       int wake_up = 0;
-
-       if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
-               drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
-                               (unsigned long long)sector, size);
-               return;
-       }
-       nr_sectors = drbd_get_capacity(device->this_bdev);
-       esector = sector + (size >> 9) - 1;
-
-       if (!expect(sector < nr_sectors))
-               return;
-       if (!expect(esector < nr_sectors))
-               esector = nr_sectors - 1;
-
-       lbnr = BM_SECT_TO_BIT(nr_sectors-1);
-
-       /*
-        * round up start sector, round down end sector.  we make sure we only
-        * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
-       if (unlikely(esector < BM_SECT_PER_BIT-1))
-               return;
-       if (unlikely(esector == (nr_sectors-1)))
-               ebnr = lbnr;
-       else
-               ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
-       sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
-
-       if (sbnr > ebnr)
-               return;
-
-       /*
-        * ok, (capacity & 7) != 0 sometimes, but who cares...
-        * we count rs_{total,left} in bits, not sectors.
-        */
-       spin_lock_irq(&device->al_lock);
-       count = drbd_bm_count_bits(device, sbnr, ebnr);
-       if (count) {
-               device->rs_failed += count;
-
-               if (get_ldev(device)) {
-                       drbd_try_clear_on_disk_bm(device, sector, count, false);
-                       put_ldev(device);
-               }
-
-               /* just wake_up unconditional now, various lc_chaged(),
-                * lc_put() in drbd_try_clear_on_disk_bm(). */
-               wake_up = 1;
-       }
-       spin_unlock_irq(&device->al_lock);
-       if (wake_up)
-               wake_up(&device->al_wait);
-}
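
Taken together, these hunks fold __drbd_set_in_sync(), __drbd_set_out_of_sync() and drbd_rs_failed_io() into one __drbd_change_sync() dispatched on enum update_sync_bits_mode, and update_sync_bits() walks the bit range one resync extent at a time so the per-extent rs_left/rs_failed counters stay exact. The walk rests on a small idiom: OR-ing the start bit number with BM_BLOCKS_PER_BM_EXT_MASK yields the last bit of the current extent. A standalone sketch with an invented 8-bits-per-extent mask:

#include <stdio.h>

#define BITS_PER_EXT_MASK 7UL   /* invented: 8 bits per "extent" */

int main(void)
{
        unsigned long sbnr = 5, ebnr = 21;      /* inclusive bit range */

        while (sbnr <= ebnr) {
                /* last bit inside sbnr's extent, capped at the range end */
                unsigned long tbnr = sbnr | BITS_PER_EXT_MASK;

                if (tbnr > ebnr)
                        tbnr = ebnr;
                printf("extent %lu: bits %lu..%lu\n",
                       sbnr / (BITS_PER_EXT_MASK + 1), sbnr, tbnr);
                sbnr = tbnr + 1;
        }
        return 0;
}
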
index 1aa29f8fdfe1feb0f6d5d534cbdaf3fac90a530c..426c97aef9002193c6c7790040c95bdb2ec71eac 100644 (file)
@@ -22,6 +22,8 @@
    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#define pr_fmt(fmt)    KBUILD_MODNAME ": " fmt
+
 #include <linux/bitops.h>
 #include <linux/vmalloc.h>
 #include <linux/string.h>
@@ -353,9 +355,8 @@ static void bm_free_pages(struct page **pages, unsigned long number)
 
        for (i = 0; i < number; i++) {
                if (!pages[i]) {
-                       printk(KERN_ALERT "drbd: bm_free_pages tried to free "
-                                         "a NULL pointer; i=%lu n=%lu\n",
-                                         i, number);
+                       pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n",
+                                i, number);
                        continue;
                }
                __free_page(pages[i]);
@@ -592,7 +593,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
        end = offset + len;
 
        if (end > b->bm_words) {
-               printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
+               pr_alert("bm_memset end > bm_words\n");
                return;
        }
 
@@ -602,7 +603,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
                p_addr = bm_map_pidx(b, idx);
                bm = p_addr + MLPP(offset);
                if (bm+do_now > p_addr + LWPP) {
-                       printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
+                       pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
                               p_addr, bm, (int)do_now);
                } else
                        memset(bm, c, do_now * sizeof(long));
@@ -927,22 +928,14 @@ void drbd_bm_clear_all(struct drbd_device *device)
        spin_unlock_irq(&b->bm_lock);
 }
 
-struct bm_aio_ctx {
-       struct drbd_device *device;
-       atomic_t in_flight;
-       unsigned int done;
-       unsigned flags;
-#define BM_AIO_COPY_PAGES      1
-#define BM_AIO_WRITE_HINTED    2
-#define BM_WRITE_ALL_PAGES     4
-       int error;
-       struct kref kref;
-};
-
-static void bm_aio_ctx_destroy(struct kref *kref)
+static void drbd_bm_aio_ctx_destroy(struct kref *kref)
 {
-       struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref);
+       struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref);
+       unsigned long flags;
 
+       spin_lock_irqsave(&ctx->device->resource->req_lock, flags);
+       list_del(&ctx->list);
+       spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags);
        put_ldev(ctx->device);
        kfree(ctx);
 }
@@ -950,7 +943,7 @@ static void bm_aio_ctx_destroy(struct kref *kref)
 /* bv_page may be a copy, or may be the original */
 static void bm_async_io_complete(struct bio *bio, int error)
 {
-       struct bm_aio_ctx *ctx = bio->bi_private;
+       struct drbd_bm_aio_ctx *ctx = bio->bi_private;
        struct drbd_device *device = ctx->device;
        struct drbd_bitmap *b = device->bitmap;
        unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
@@ -993,17 +986,18 @@ static void bm_async_io_complete(struct bio *bio, int error)
        if (atomic_dec_and_test(&ctx->in_flight)) {
                ctx->done = 1;
                wake_up(&device->misc_wait);
-               kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+               kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
        }
 }
 
-static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
+static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
 {
        struct bio *bio = bio_alloc_drbd(GFP_NOIO);
        struct drbd_device *device = ctx->device;
        struct drbd_bitmap *b = device->bitmap;
        struct page *page;
        unsigned int len;
+       unsigned int rw = (ctx->flags & BM_AIO_READ) ? READ : WRITE;
 
        sector_t on_disk_sector =
                device->ldev->md.md_offset + device->ldev->md.bm_offset;
@@ -1049,9 +1043,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
 /*
  * bm_rw: read/write the whole bitmap from/to its on disk location.
  */
-static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
+static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
 {
-       struct bm_aio_ctx *ctx;
+       struct drbd_bm_aio_ctx *ctx;
        struct drbd_bitmap *b = device->bitmap;
        int num_pages, i, count = 0;
        unsigned long now;
@@ -1067,12 +1061,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
         * as we submit copies of pages anyways.
         */
 
-       ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+       ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO);
        if (!ctx)
                return -ENOMEM;
 
-       *ctx = (struct bm_aio_ctx) {
+       *ctx = (struct drbd_bm_aio_ctx) {
                .device = device,
+               .start_jif = jiffies,
                .in_flight = ATOMIC_INIT(1),
                .done = 0,
                .flags = flags,
@@ -1080,15 +1075,21 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
                .kref = { ATOMIC_INIT(2) },
        };
 
-       if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
+       if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in drbd_bm_aio_ctx_destroy() */
                drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
                kfree(ctx);
                return -ENODEV;
        }
+       /* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from
+          drbd_adm_attach(), after device->ldev was assigned. */
 
-       if (!ctx->flags)
+       if (0 == (ctx->flags & ~BM_AIO_READ))
                WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
 
+       spin_lock_irq(&device->resource->req_lock);
+       list_add_tail(&ctx->list, &device->pending_bitmap_io);
+       spin_unlock_irq(&device->resource->req_lock);
+
        num_pages = b->bm_number_of_pages;
 
        now = jiffies;
@@ -1098,13 +1099,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
                /* ignore completely unchanged pages */
                if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
                        break;
-               if (rw & WRITE) {
+               if (!(flags & BM_AIO_READ)) {
                        if ((flags & BM_AIO_WRITE_HINTED) &&
                            !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
                                    &page_private(b->bm_pages[i])))
                                continue;
 
-                       if (!(flags & BM_WRITE_ALL_PAGES) &&
+                       if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
                            bm_test_page_unchanged(b->bm_pages[i])) {
                                dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
                                continue;
@@ -1118,7 +1119,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
                        }
                }
                atomic_inc(&ctx->in_flight);
-               bm_page_io_async(ctx, i, rw);
+               bm_page_io_async(ctx, i);
                ++count;
                cond_resched();
        }
@@ -1134,12 +1135,12 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
        if (!atomic_dec_and_test(&ctx->in_flight))
                wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
        else
-               kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+               kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
 
        /* summary for global bitmap IO */
        if (flags == 0)
                drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n",
-                        rw == WRITE ? "WRITE" : "READ",
+                        (flags & BM_AIO_READ) ? "READ" : "WRITE",
                         count, jiffies - now);
 
        if (ctx->error) {
@@ -1152,20 +1153,18 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
                err = -EIO; /* Disk timeout/force-detach during IO... */
 
        now = jiffies;
-       if (rw == WRITE) {
-               drbd_md_flush(device);
-       } else /* rw == READ */ {
+       if (flags & BM_AIO_READ) {
                b->bm_set = bm_count_bits(b);
                drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
                     jiffies - now);
        }
        now = b->bm_set;
 
-       if (flags == 0)
+       if ((flags & ~BM_AIO_READ) == 0)
                drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
                     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
 
-       kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+       kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
        return err;
 }
 
@@ -1175,7 +1174,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
  */
 int drbd_bm_read(struct drbd_device *device) __must_hold(local)
 {
-       return bm_rw(device, READ, 0, 0);
+       return bm_rw(device, BM_AIO_READ, 0);
 }
 
 /**
@@ -1186,7 +1185,7 @@ int drbd_bm_read(struct drbd_device *device) __must_hold(local)
  */
 int drbd_bm_write(struct drbd_device *device) __must_hold(local)
 {
-       return bm_rw(device, WRITE, 0, 0);
+       return bm_rw(device, 0, 0);
 }
 
 /**
@@ -1197,7 +1196,17 @@ int drbd_bm_write(struct drbd_device *device) __must_hold(local)
  */
 int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
 {
-       return bm_rw(device, WRITE, BM_WRITE_ALL_PAGES, 0);
+       return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0);
+}
+
+/**
+ * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
+ * @device:    DRBD device.
+ * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages
+ */
+int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local)
+{
+       return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx);
 }
 
 /**
@@ -1213,7 +1222,7 @@ int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
  */
 int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
 {
-       return bm_rw(device, WRITE, BM_AIO_COPY_PAGES, 0);
+       return bm_rw(device, BM_AIO_COPY_PAGES, 0);
 }
 
 /**
@@ -1222,62 +1231,7 @@ int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
  */
 int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
 {
-       return bm_rw(device, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
-}
-
-/**
- * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
- * @device:    DRBD device.
- * @idx:       bitmap page index
- *
- * We don't want to special case on logical_block_size of the backend device,
- * so we submit PAGE_SIZE aligned pieces.
- * Note that on "most" systems, PAGE_SIZE is 4k.
- *
- * In case this becomes an issue on systems with larger PAGE_SIZE,
- * we may want to change this again to write 4k aligned 4k pieces.
- */
-int drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local)
-{
-       struct bm_aio_ctx *ctx;
-       int err;
-
-       if (bm_test_page_unchanged(device->bitmap->bm_pages[idx])) {
-               dynamic_drbd_dbg(device, "skipped bm page write for idx %u\n", idx);
-               return 0;
-       }
-
-       ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
-       if (!ctx)
-               return -ENOMEM;
-
-       *ctx = (struct bm_aio_ctx) {
-               .device = device,
-               .in_flight = ATOMIC_INIT(1),
-               .done = 0,
-               .flags = BM_AIO_COPY_PAGES,
-               .error = 0,
-               .kref = { ATOMIC_INIT(2) },
-       };
-
-       if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
-               drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
-               kfree(ctx);
-               return -ENODEV;
-       }
-
-       bm_page_io_async(ctx, idx, WRITE_SYNC);
-       wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
-
-       if (ctx->error)
-               drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
-               /* that causes us to detach, so the in memory bitmap will be
-                * gone in a moment as well. */
-
-       device->bm_writ_cnt++;
-       err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;
-       kref_put(&ctx->kref, &bm_aio_ctx_destroy);
-       return err;
+       return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
 }
 
 /* NOTE
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
new file mode 100644 (file)
index 0000000..5c20b18
--- /dev/null
@@ -0,0 +1,958 @@
+#define pr_fmt(fmt) "drbd debugfs: " fmt
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+
+#include "drbd_int.h"
+#include "drbd_req.h"
+#include "drbd_debugfs.h"
+
+
+/**********************************************************************
+ * Whenever you change the file format, remember to bump the version. *
+ **********************************************************************/
+
+static struct dentry *drbd_debugfs_root;
+static struct dentry *drbd_debugfs_version;
+static struct dentry *drbd_debugfs_resources;
+static struct dentry *drbd_debugfs_minors;
+
+static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long dt)
+{
+       if (valid)
+               seq_printf(m, "\t%d", jiffies_to_msecs(dt));
+       else
+               seq_printf(m, "\t-");
+}
+
+static void __seq_print_rq_state_bit(struct seq_file *m,
+       bool is_set, char *sep, const char *set_name, const char *unset_name)
+{
+       if (is_set && set_name) {
+               seq_putc(m, *sep);
+               seq_puts(m, set_name);
+               *sep = '|';
+       } else if (!is_set && unset_name) {
+               seq_putc(m, *sep);
+               seq_puts(m, unset_name);
+               *sep = '|';
+       }
+}
+
+static void seq_print_rq_state_bit(struct seq_file *m,
+       bool is_set, char *sep, const char *set_name)
+{
+       __seq_print_rq_state_bit(m, is_set, sep, set_name, NULL);
+}
+
+/* pretty print enum drbd_req_state_bits req->rq_state */
+static void seq_print_request_state(struct seq_file *m, struct drbd_request *req)
+{
+       unsigned int s = req->rq_state;
+       char sep = ' ';
+       seq_printf(m, "\t0x%08x", s);
+       seq_printf(m, "\tmaster: %s", req->master_bio ? "pending" : "completed");
+
+       /* RQ_WRITE ignored, already reported */
+       seq_puts(m, "\tlocal:");
+       seq_print_rq_state_bit(m, s & RQ_IN_ACT_LOG, &sep, "in-AL");
+       seq_print_rq_state_bit(m, s & RQ_POSTPONED, &sep, "postponed");
+       seq_print_rq_state_bit(m, s & RQ_COMPLETION_SUSP, &sep, "suspended");
+       sep = ' ';
+       seq_print_rq_state_bit(m, s & RQ_LOCAL_PENDING, &sep, "pending");
+       seq_print_rq_state_bit(m, s & RQ_LOCAL_COMPLETED, &sep, "completed");
+       seq_print_rq_state_bit(m, s & RQ_LOCAL_ABORTED, &sep, "aborted");
+       seq_print_rq_state_bit(m, s & RQ_LOCAL_OK, &sep, "ok");
+       if (sep == ' ')
+               seq_puts(m, " -");
+
+       /* for_each_connection ... */
+       seq_printf(m, "\tnet:");
+       sep = ' ';
+       seq_print_rq_state_bit(m, s & RQ_NET_PENDING, &sep, "pending");
+       seq_print_rq_state_bit(m, s & RQ_NET_QUEUED, &sep, "queued");
+       seq_print_rq_state_bit(m, s & RQ_NET_SENT, &sep, "sent");
+       seq_print_rq_state_bit(m, s & RQ_NET_DONE, &sep, "done");
+       seq_print_rq_state_bit(m, s & RQ_NET_SIS, &sep, "sis");
+       seq_print_rq_state_bit(m, s & RQ_NET_OK, &sep, "ok");
+       if (sep == ' ')
+               seq_puts(m, " -");
+
+       seq_printf(m, " :");
+       sep = ' ';
+       seq_print_rq_state_bit(m, s & RQ_EXP_RECEIVE_ACK, &sep, "B");
+       seq_print_rq_state_bit(m, s & RQ_EXP_WRITE_ACK, &sep, "C");
+       seq_print_rq_state_bit(m, s & RQ_EXP_BARR_ACK, &sep, "barr");
+       if (sep == ' ')
+               seq_puts(m, " -");
+       seq_printf(m, "\n");
+}
+
+static void seq_print_one_request(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+       /* change anything here, fixup header below! */
+       unsigned int s = req->rq_state;
+
+#define RQ_HDR_1 "epoch\tsector\tsize\trw"
+       seq_printf(m, "0x%x\t%llu\t%u\t%s",
+               req->epoch,
+               (unsigned long long)req->i.sector, req->i.size >> 9,
+               (s & RQ_WRITE) ? "W" : "R");
+
+#define RQ_HDR_2 "\tstart\tin AL\tsubmit"
+       seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif));
+       seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif);
+       seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif);
+
+#define RQ_HDR_3 "\tsent\tacked\tdone"
+       seq_print_age_or_dash(m, s & RQ_NET_SENT, now - req->pre_send_jif);
+       seq_print_age_or_dash(m, (s & RQ_NET_SENT) && !(s & RQ_NET_PENDING), now - req->acked_jif);
+       seq_print_age_or_dash(m, s & RQ_NET_DONE, now - req->net_done_jif);
+
+#define RQ_HDR_4 "\tstate\n"
+       seq_print_request_state(m, req);
+}
+#define RQ_HDR RQ_HDR_1 RQ_HDR_2 RQ_HDR_3 RQ_HDR_4
+
+static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+       seq_printf(m, "%u\t%u\t", req->device->minor, req->device->vnr);
+       seq_print_one_request(m, req, now);
+}
+
+static void seq_print_resource_pending_meta_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+       struct drbd_device *device;
+       unsigned int i;
+
+       seq_puts(m, "minor\tvnr\tstart\tsubmit\tintent\n");
+       rcu_read_lock();
+       idr_for_each_entry(&resource->devices, device, i) {
+               struct drbd_md_io tmp;
+               /* In theory this is racy,
+                * in the sense that a drbd_md_put_buffer();
+                * drbd_md_get_buffer(); pair could have happened
+                * between our accesses to these members.  */
+               tmp = device->md_io;
+               if (atomic_read(&tmp.in_use)) {
+                       seq_printf(m, "%u\t%u\t%d\t",
+                               device->minor, device->vnr,
+                               jiffies_to_msecs(now - tmp.start_jif));
+                       if (time_before(tmp.submit_jif, tmp.start_jif))
+                               seq_puts(m, "-\t");
+                       else
+                               seq_printf(m, "%d\t", jiffies_to_msecs(now - tmp.submit_jif));
+                       seq_printf(m, "%s\n", tmp.current_use);
+               }
+       }
+       rcu_read_unlock();
+}
+
+static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+       struct drbd_device *device;
+       unsigned int i;
+
+       seq_puts(m, "minor\tvnr\tage\t#waiting\n");
+       rcu_read_lock();
+       idr_for_each_entry(&resource->devices, device, i) {
+               unsigned long jif;
+               struct drbd_request *req;
+               int n = atomic_read(&device->ap_actlog_cnt);
+               if (n) {
+                       spin_lock_irq(&device->resource->req_lock);
+                       req = list_first_entry_or_null(&device->pending_master_completion[1],
+                               struct drbd_request, req_pending_master_completion);
+                       /* if the oldest request does not wait for the activity log,
+                        * it is not interesting for us here */
+                       if (req && !(req->rq_state & RQ_IN_ACT_LOG))
+                               jif = req->start_jif;
+                       else
+                               req = NULL;
+                       spin_unlock_irq(&device->resource->req_lock);
+               }
+               if (n) {
+                       seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+                       if (req)
+                               seq_printf(m, "%u\t", jiffies_to_msecs(now - jif));
+                       else
+                               seq_puts(m, "-\t");
+                       seq_printf(m, "%u\n", n);
+               }
+       }
+       rcu_read_unlock();
+}
+
+static void seq_print_device_bitmap_io(struct seq_file *m, struct drbd_device *device, unsigned long now)
+{
+       struct drbd_bm_aio_ctx *ctx;
+       unsigned long start_jif;
+       unsigned int in_flight;
+       unsigned int flags;
+       spin_lock_irq(&device->resource->req_lock);
+       ctx = list_first_entry_or_null(&device->pending_bitmap_io, struct drbd_bm_aio_ctx, list);
+       if (ctx && ctx->done)
+               ctx = NULL;
+       if (ctx) {
+               start_jif = ctx->start_jif;
+               in_flight = atomic_read(&ctx->in_flight);
+               flags = ctx->flags;
+       }
+       spin_unlock_irq(&device->resource->req_lock);
+       if (ctx) {
+               seq_printf(m, "%u\t%u\t%c\t%u\t%u\n",
+                       device->minor, device->vnr,
+                       (flags & BM_AIO_READ) ? 'R' : 'W',
+                       jiffies_to_msecs(now - start_jif),
+                       in_flight);
+       }
+}
+
+static void seq_print_resource_pending_bitmap_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+       struct drbd_device *device;
+       unsigned int i;
+
+       seq_puts(m, "minor\tvnr\trw\tage\t#in-flight\n");
+       rcu_read_lock();
+       idr_for_each_entry(&resource->devices, device, i) {
+               seq_print_device_bitmap_io(m, device, now);
+       }
+       rcu_read_unlock();
+}
+
+/* pretty print enum peer_req->flags */
+static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_request *peer_req)
+{
+       unsigned long f = peer_req->flags;
+       char sep = ' ';
+
+       __seq_print_rq_state_bit(m, f & EE_SUBMITTED, &sep, "submitted", "preparing");
+       __seq_print_rq_state_bit(m, f & EE_APPLICATION, &sep, "application", "internal");
+       seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL");
+       seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
+       seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
+
+       if (f & EE_IS_TRIM) {
+               seq_putc(m, sep);
+               sep = '|';
+               if (f & EE_IS_TRIM_USE_ZEROOUT)
+                       seq_puts(m, "zero-out");
+               else
+                       seq_puts(m, "trim");
+       }
+       seq_putc(m, '\n');
+}
+
+static void seq_print_peer_request(struct seq_file *m,
+       struct drbd_device *device, struct list_head *lh,
+       unsigned long now)
+{
+       bool reported_preparing = false;
+       struct drbd_peer_request *peer_req;
+       list_for_each_entry(peer_req, lh, w.list) {
+               if (reported_preparing && !(peer_req->flags & EE_SUBMITTED))
+                       continue;
+
+               if (device)
+                       seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+
+               seq_printf(m, "%llu\t%u\t%c\t%u\t",
+                       (unsigned long long)peer_req->i.sector, peer_req->i.size >> 9,
+                       (peer_req->flags & EE_WRITE) ? 'W' : 'R',
+                       jiffies_to_msecs(now - peer_req->submit_jif));
+               seq_print_peer_request_flags(m, peer_req);
+               if (peer_req->flags & EE_SUBMITTED)
+                       break;
+               else
+                       reported_preparing = true;
+       }
+}
+
+static void seq_print_device_peer_requests(struct seq_file *m,
+       struct drbd_device *device, unsigned long now)
+{
+       seq_puts(m, "minor\tvnr\tsector\tsize\trw\tage\tflags\n");
+       spin_lock_irq(&device->resource->req_lock);
+       seq_print_peer_request(m, device, &device->active_ee, now);
+       seq_print_peer_request(m, device, &device->read_ee, now);
+       seq_print_peer_request(m, device, &device->sync_ee, now);
+       spin_unlock_irq(&device->resource->req_lock);
+       if (test_bit(FLUSH_PENDING, &device->flags)) {
+               seq_printf(m, "%u\t%u\t-\t-\tF\t%u\tflush\n",
+                       device->minor, device->vnr,
+                       jiffies_to_msecs(now - device->flush_jif));
+       }
+}
+
+static void seq_print_resource_pending_peer_requests(struct seq_file *m,
+       struct drbd_resource *resource, unsigned long now)
+{
+       struct drbd_device *device;
+       unsigned int i;
+
+       rcu_read_lock();
+       idr_for_each_entry(&resource->devices, device, i) {
+               seq_print_device_peer_requests(m, device, now);
+       }
+       rcu_read_unlock();
+}
+
+static void seq_print_resource_transfer_log_summary(struct seq_file *m,
+       struct drbd_resource *resource,
+       struct drbd_connection *connection,
+       unsigned long now)
+{
+       struct drbd_request *req;
+       unsigned int count = 0;
+       unsigned int show_state = 0;
+
+       seq_puts(m, "n\tdevice\tvnr\t" RQ_HDR);
+       spin_lock_irq(&resource->req_lock);
+       list_for_each_entry(req, &connection->transfer_log, tl_requests) {
+               unsigned int tmp = 0;
+               unsigned int s;
+               ++count;
+
+               /* don't disable irq "forever" */
+               if (!(count & 0x1ff)) {
+                       struct drbd_request *req_next;
+                       kref_get(&req->kref);
+                       spin_unlock_irq(&resource->req_lock);
+                       cond_resched();
+                       spin_lock_irq(&resource->req_lock);
+                       req_next = list_next_entry(req, tl_requests);
+                       if (kref_put(&req->kref, drbd_req_destroy))
+                               req = req_next;
+                       if (&req->tl_requests == &connection->transfer_log)
+                               break;
+               }
+
+               s = req->rq_state;
+
+               /* This is meant to summarize timing issues, to be able to tell
+                * local disk problems from network problems.
+                * Skip requests if we have already shown an even older
+                * request with similar aspects.  */
+               if (req->master_bio == NULL)
+                       tmp |= 1;
+               if ((s & RQ_LOCAL_MASK) && (s & RQ_LOCAL_PENDING))
+                       tmp |= 2;
+               if (s & RQ_NET_MASK) {
+                       if (!(s & RQ_NET_SENT))
+                               tmp |= 4;
+                       if (s & RQ_NET_PENDING)
+                               tmp |= 8;
+                       if (!(s & RQ_NET_DONE))
+                               tmp |= 16;
+               }
+               if ((tmp & show_state) == tmp)
+                       continue;
+               show_state |= tmp;
+               seq_printf(m, "%u\t", count);
+               seq_print_minor_vnr_req(m, req, now);
+               if (show_state == 0x1f)
+                       break;
+       }
+       spin_unlock_irq(&resource->req_lock);
+}
+
+/* TODO: transfer_log and friends should be moved to resource */
+static int in_flight_summary_show(struct seq_file *m, void *pos)
+{
+       struct drbd_resource *resource = m->private;
+       struct drbd_connection *connection;
+       unsigned long jif = jiffies;
+
+       connection = first_connection(resource);
+       /* This does not happen, actually.
+        * But be robust and prepare for future code changes. */
+       if (!connection || !kref_get_unless_zero(&connection->kref))
+               return -ESTALE;
+
+       /* BUMP me if you change the file format/content/presentation */
+       seq_printf(m, "v: %u\n\n", 0);
+
+       seq_puts(m, "oldest bitmap IO\n");
+       seq_print_resource_pending_bitmap_io(m, resource, jif);
+       seq_putc(m, '\n');
+
+       seq_puts(m, "meta data IO\n");
+       seq_print_resource_pending_meta_io(m, resource, jif);
+       seq_putc(m, '\n');
+
+       seq_puts(m, "socket buffer stats\n");
+       /* for each connection ... once we have more than one */
+       rcu_read_lock();
+       if (connection->data.socket) {
+               /* open coded SIOCINQ, the "relevant" part */
+               struct tcp_sock *tp = tcp_sk(connection->data.socket->sk);
+               int answ = tp->rcv_nxt - tp->copied_seq;
+               seq_printf(m, "unread receive buffer: %u Byte\n", answ);
+               /* open coded SIOCOUTQ, the "relevant" part */
+               answ = tp->write_seq - tp->snd_una;
+               seq_printf(m, "unacked send buffer: %u Byte\n", answ);
+       }
+       rcu_read_unlock();
+       seq_putc(m, '\n');
+
+       seq_puts(m, "oldest peer requests\n");
+       seq_print_resource_pending_peer_requests(m, resource, jif);
+       seq_putc(m, '\n');
+
+       seq_puts(m, "application requests waiting for activity log\n");
+       seq_print_waiting_for_AL(m, resource, jif);
+       seq_putc(m, '\n');
+
+       seq_puts(m, "oldest application requests\n");
+       seq_print_resource_transfer_log_summary(m, resource, connection, jif);
+       seq_putc(m, '\n');
+
+       jif = jiffies - jif;
+       if (jif)
+               seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif));
+       kref_put(&connection->kref, drbd_destroy_connection);
+       return 0;
+}
+
+/* simple_positive(file->f_dentry) respectively debugfs_positive(),
+ * but neither is "reachable" from here.
+ * So we carry our own inline version of it here.  :-( */
+static inline int debugfs_positive(struct dentry *dentry)
+{
+        return dentry->d_inode && !d_unhashed(dentry);
+}
+
+/* make sure at *open* time that the respective object won't go away. */
+static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *),
+                               void *data, struct kref *kref,
+                               void (*release)(struct kref *))
+{
+       struct dentry *parent;
+       int ret = -ESTALE;
+
+       /* Are we still linked,
+        * or has debugfs_remove() already been called? */
+       parent = file->f_dentry->d_parent;
+       /* not sure if this can happen: */
+       if (!parent || !parent->d_inode)
+               goto out;
+       /* serialize with d_delete() */
+       mutex_lock(&parent->d_inode->i_mutex);
+       /* Make sure the object is still alive */
+       if (debugfs_positive(file->f_dentry)
+       && kref_get_unless_zero(kref))
+               ret = 0;
+       mutex_unlock(&parent->d_inode->i_mutex);
+       if (!ret) {
+               ret = single_open(file, show, data);
+               if (ret)
+                       kref_put(kref, release);
+       }
+out:
+       return ret;
+}
+
+static int in_flight_summary_open(struct inode *inode, struct file *file)
+{
+       struct drbd_resource *resource = inode->i_private;
+       return drbd_single_open(file, in_flight_summary_show, resource,
+                               &resource->kref, drbd_destroy_resource);
+}
+
+static int in_flight_summary_release(struct inode *inode, struct file *file)
+{
+       struct drbd_resource *resource = inode->i_private;
+       kref_put(&resource->kref, drbd_destroy_resource);
+       return single_release(inode, file);
+}
+
+static const struct file_operations in_flight_summary_fops = {
+       .owner          = THIS_MODULE,
+       .open           = in_flight_summary_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = in_flight_summary_release,
+};
+
+void drbd_debugfs_resource_add(struct drbd_resource *resource)
+{
+       struct dentry *dentry;
+       if (!drbd_debugfs_resources)
+               return;
+
+       dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       resource->debugfs_res = dentry;
+
+       dentry = debugfs_create_dir("volumes", resource->debugfs_res);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       resource->debugfs_res_volumes = dentry;
+
+       dentry = debugfs_create_dir("connections", resource->debugfs_res);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       resource->debugfs_res_connections = dentry;
+
+       dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP,
+                       resource->debugfs_res, resource,
+                       &in_flight_summary_fops);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       resource->debugfs_res_in_flight_summary = dentry;
+       return;
+
+fail:
+       drbd_debugfs_resource_cleanup(resource);
+       drbd_err(resource, "failed to create debugfs dentry\n");
+}
+
+static void drbd_debugfs_remove(struct dentry **dp)
+{
+       debugfs_remove(*dp);
+       *dp = NULL;
+}
+
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource)
+{
+       /* it is ok to call debugfs_remove(NULL) */
+       drbd_debugfs_remove(&resource->debugfs_res_in_flight_summary);
+       drbd_debugfs_remove(&resource->debugfs_res_connections);
+       drbd_debugfs_remove(&resource->debugfs_res_volumes);
+       drbd_debugfs_remove(&resource->debugfs_res);
+}
+
+static void seq_print_one_timing_detail(struct seq_file *m,
+       const struct drbd_thread_timing_details *tdp,
+       unsigned long now)
+{
+       struct drbd_thread_timing_details td;
+       /* No locking...
+        * use temporary assignment to get at consistent data. */
+       do {
+               td = *tdp;
+       } while (td.cb_nr != tdp->cb_nr);
+       if (!td.cb_addr)
+               return;
+       seq_printf(m, "%u\t%d\t%s:%u\t%ps\n",
+                       td.cb_nr,
+                       jiffies_to_msecs(now - td.start_jif),
+                       td.caller_fn, td.line,
+                       td.cb_addr);
+}
+
+static void seq_print_timing_details(struct seq_file *m,
+               const char *title,
+               unsigned int cb_nr, struct drbd_thread_timing_details *tdp, unsigned long now)
+{
+       unsigned int start_idx;
+       unsigned int i;
+
+       seq_printf(m, "%s\n", title);
+       /* If not much is going on, this will result in natural ordering.
+        * If it is very busy, we will possibly skip events, or even see
+        * wrap-arounds, which could only be avoided with locking.
+        */
+       start_idx = cb_nr % DRBD_THREAD_DETAILS_HIST;
+       for (i = start_idx; i < DRBD_THREAD_DETAILS_HIST; i++)
+               seq_print_one_timing_detail(m, tdp+i, now);
+       for (i = 0; i < start_idx; i++)
+               seq_print_one_timing_detail(m, tdp+i, now);
+}
+
+static int callback_history_show(struct seq_file *m, void *ignored)
+{
+       struct drbd_connection *connection = m->private;
+       unsigned long jif = jiffies;
+
+       /* BUMP me if you change the file format/content/presentation */
+       seq_printf(m, "v: %u\n\n", 0);
+
+       seq_puts(m, "n\tage\tcallsite\tfn\n");
+       seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif);
+       seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif);
+       return 0;
+}
+
+static int callback_history_open(struct inode *inode, struct file *file)
+{
+       struct drbd_connection *connection = inode->i_private;
+       return drbd_single_open(file, callback_history_show, connection,
+                               &connection->kref, drbd_destroy_connection);
+}
+
+static int callback_history_release(struct inode *inode, struct file *file)
+{
+       struct drbd_connection *connection = inode->i_private;
+       kref_put(&connection->kref, drbd_destroy_connection);
+       return single_release(inode, file);
+}
+
+static const struct file_operations connection_callback_history_fops = {
+       .owner          = THIS_MODULE,
+       .open           = callback_history_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = callback_history_release,
+};
+
+static int connection_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+       struct drbd_connection *connection = m->private;
+       unsigned long now = jiffies;
+       struct drbd_request *r1, *r2;
+
+       /* BUMP me if you change the file format/content/presentation */
+       seq_printf(m, "v: %u\n\n", 0);
+
+       spin_lock_irq(&connection->resource->req_lock);
+       r1 = connection->req_next;
+       if (r1)
+               seq_print_minor_vnr_req(m, r1, now);
+       r2 = connection->req_ack_pending;
+       if (r2 && r2 != r1) {
+               r1 = r2;
+               seq_print_minor_vnr_req(m, r1, now);
+       }
+       r2 = connection->req_not_net_done;
+       if (r2 && r2 != r1)
+               seq_print_minor_vnr_req(m, r2, now);
+       spin_unlock_irq(&connection->resource->req_lock);
+       return 0;
+}
+
+static int connection_oldest_requests_open(struct inode *inode, struct file *file)
+{
+       struct drbd_connection *connection = inode->i_private;
+       return drbd_single_open(file, connection_oldest_requests_show, connection,
+                               &connection->kref, drbd_destroy_connection);
+}
+
+static int connection_oldest_requests_release(struct inode *inode, struct file *file)
+{
+       struct drbd_connection *connection = inode->i_private;
+       kref_put(&connection->kref, drbd_destroy_connection);
+       return single_release(inode, file);
+}
+
+static const struct file_operations connection_oldest_requests_fops = {
+       .owner          = THIS_MODULE,
+       .open           = connection_oldest_requests_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = connection_oldest_requests_release,
+};
+
+void drbd_debugfs_connection_add(struct drbd_connection *connection)
+{
+       struct dentry *conns_dir = connection->resource->debugfs_res_connections;
+       struct dentry *dentry;
+       if (!conns_dir)
+               return;
+
+       /* Once we enable multiple peers,
+        * these connections will have descriptive names.
+        * For now, it is just the one connection to the (only) "peer". */
+       dentry = debugfs_create_dir("peer", conns_dir);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       connection->debugfs_conn = dentry;
+
+       dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP,
+                       connection->debugfs_conn, connection,
+                       &connection_callback_history_fops);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       connection->debugfs_conn_callback_history = dentry;
+
+       dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP,
+                       connection->debugfs_conn, connection,
+                       &connection_oldest_requests_fops);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       connection->debugfs_conn_oldest_requests = dentry;
+       return;
+
+fail:
+       drbd_debugfs_connection_cleanup(connection);
+       drbd_err(connection, "failed to create debugfs dentry\n");
+}
+
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection)
+{
+       drbd_debugfs_remove(&connection->debugfs_conn_callback_history);
+       drbd_debugfs_remove(&connection->debugfs_conn_oldest_requests);
+       drbd_debugfs_remove(&connection->debugfs_conn);
+}
+
+static void resync_dump_detail(struct seq_file *m, struct lc_element *e)
+{
+       struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
+
+       seq_printf(m, "%5d %s %s %s\n", bme->rs_left,
+                 test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------",
+                 test_bit(BME_LOCKED, &bme->flags) ? "LOCKED" : "------",
+                 test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------"
+                 );
+}
+
+static int device_resync_extents_show(struct seq_file *m, void *ignored)
+{
+       struct drbd_device *device = m->private;
+
+       /* BUMP me if you change the file format/content/presentation */
+       seq_printf(m, "v: %u\n\n", 0);
+
+       if (get_ldev_if_state(device, D_FAILED)) {
+               lc_seq_printf_stats(m, device->resync);
+               lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail);
+               put_ldev(device);
+       }
+       return 0;
+}
+
+static int device_act_log_extents_show(struct seq_file *m, void *ignored)
+{
+       struct drbd_device *device = m->private;
+
+       /* BUMP me if you change the file format/content/presentation */
+       seq_printf(m, "v: %u\n\n", 0);
+
+       if (get_ldev_if_state(device, D_FAILED)) {
+               lc_seq_printf_stats(m, device->act_log);
+               lc_seq_dump_details(m, device->act_log, "", NULL);
+               put_ldev(device);
+       }
+       return 0;
+}
+
+static int device_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+       struct drbd_device *device = m->private;
+       struct drbd_resource *resource = device->resource;
+       unsigned long now = jiffies;
+       struct drbd_request *r1, *r2;
+       int i;
+
+       /* BUMP me if you change the file format/content/presentation */
+       seq_printf(m, "v: %u\n\n", 0);
+
+       seq_puts(m, RQ_HDR);
+       spin_lock_irq(&resource->req_lock);
+       /* WRITE, then READ */
+       for (i = 1; i >= 0; --i) {
+               r1 = list_first_entry_or_null(&device->pending_master_completion[i],
+                       struct drbd_request, req_pending_master_completion);
+               r2 = list_first_entry_or_null(&device->pending_completion[i],
+                       struct drbd_request, req_pending_local);
+               if (r1)
+                       seq_print_one_request(m, r1, now);
+               if (r2 && r2 != r1)
+                       seq_print_one_request(m, r2, now);
+       }
+       spin_unlock_irq(&resource->req_lock);
+       return 0;
+}
+
+static int device_data_gen_id_show(struct seq_file *m, void *ignored)
+{
+       struct drbd_device *device = m->private;
+       struct drbd_md *md;
+       enum drbd_uuid_index idx;
+
+       if (!get_ldev_if_state(device, D_FAILED))
+               return -ENODEV;
+
+       md = &device->ldev->md;
+       spin_lock_irq(&md->uuid_lock);
+       for (idx = UI_CURRENT; idx <= UI_HISTORY_END; idx++) {
+               seq_printf(m, "0x%016llX\n", md->uuid[idx]);
+       }
+       spin_unlock_irq(&md->uuid_lock);
+       put_ldev(device);
+       return 0;
+}
+
+#define drbd_debugfs_device_attr(name)                                         \
+static int device_ ## name ## _open(struct inode *inode, struct file *file)    \
+{                                                                              \
+       struct drbd_device *device = inode->i_private;                          \
+       return drbd_single_open(file, device_ ## name ## _show, device,         \
+                               &device->kref, drbd_destroy_device);            \
+}                                                                              \
+static int device_ ## name ## _release(struct inode *inode, struct file *file) \
+{                                                                              \
+       struct drbd_device *device = inode->i_private;                          \
+       kref_put(&device->kref, drbd_destroy_device);                           \
+       return single_release(inode, file);                                     \
+}                                                                              \
+static const struct file_operations device_ ## name ## _fops = {               \
+       .owner          = THIS_MODULE,                                          \
+       .open           = device_ ## name ## _open,                             \
+       .read           = seq_read,                                             \
+       .llseek         = seq_lseek,                                            \
+       .release        = device_ ## name ## _release,                          \
+};
+
+drbd_debugfs_device_attr(oldest_requests)
+drbd_debugfs_device_attr(act_log_extents)
+drbd_debugfs_device_attr(resync_extents)
+drbd_debugfs_device_attr(data_gen_id)
+
+void drbd_debugfs_device_add(struct drbd_device *device)
+{
+       struct dentry *vols_dir = device->resource->debugfs_res_volumes;
+       char minor_buf[8]; /* MINORMASK, MINORBITS == 20; */
+       char vnr_buf[8];   /* the volume number vnr is only 16 bit anyway */
+       char *slink_name = NULL;
+
+       struct dentry *dentry;
+       if (!vols_dir || !drbd_debugfs_minors)
+               return;
+
+       snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr);
+       dentry = debugfs_create_dir(vnr_buf, vols_dir);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       device->debugfs_vol = dentry;
+
+       snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor);
+       slink_name = kasprintf(GFP_KERNEL, "../resources/%s/volumes/%u",
+                       device->resource->name, device->vnr);
+       if (!slink_name)
+               goto fail;
+       dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name);
+       kfree(slink_name);
+       slink_name = NULL;
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       device->debugfs_minor = dentry;
+
+#define DCF(name)      do {                                    \
+       dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP,    \
+                       device->debugfs_vol, device,            \
+                       &device_ ## name ## _fops);             \
+       if (IS_ERR_OR_NULL(dentry))                             \
+               goto fail;                                      \
+       device->debugfs_vol_ ## name = dentry;                  \
+       } while (0)
+
+       DCF(oldest_requests);
+       DCF(act_log_extents);
+       DCF(resync_extents);
+       DCF(data_gen_id);
+#undef DCF
+       return;
+
+fail:
+       drbd_debugfs_device_cleanup(device);
+       drbd_err(device, "failed to create debugfs entries\n");
+}
+
+void drbd_debugfs_device_cleanup(struct drbd_device *device)
+{
+       drbd_debugfs_remove(&device->debugfs_minor);
+       drbd_debugfs_remove(&device->debugfs_vol_oldest_requests);
+       drbd_debugfs_remove(&device->debugfs_vol_act_log_extents);
+       drbd_debugfs_remove(&device->debugfs_vol_resync_extents);
+       drbd_debugfs_remove(&device->debugfs_vol_data_gen_id);
+       drbd_debugfs_remove(&device->debugfs_vol);
+}
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device)
+{
+       struct dentry *conn_dir = peer_device->connection->debugfs_conn;
+       struct dentry *dentry;
+       char vnr_buf[8];
+
+       if (!conn_dir)
+               return;
+
+       snprintf(vnr_buf, sizeof(vnr_buf), "%u", peer_device->device->vnr);
+       dentry = debugfs_create_dir(vnr_buf, conn_dir);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       peer_device->debugfs_peer_dev = dentry;
+       return;
+
+fail:
+       drbd_debugfs_peer_device_cleanup(peer_device);
+       drbd_err(peer_device, "failed to create debugfs entries\n");
+}
+
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device)
+{
+       drbd_debugfs_remove(&peer_device->debugfs_peer_dev);
+}
+
+static int drbd_version_show(struct seq_file *m, void *ignored)
+{
+       seq_printf(m, "# %s\n", drbd_buildtag());
+       seq_printf(m, "VERSION=%s\n", REL_VERSION);
+       seq_printf(m, "API_VERSION=%u\n", API_VERSION);
+       seq_printf(m, "PRO_VERSION_MIN=%u\n", PRO_VERSION_MIN);
+       seq_printf(m, "PRO_VERSION_MAX=%u\n", PRO_VERSION_MAX);
+       return 0;
+}
+
+static int drbd_version_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, drbd_version_show, NULL);
+}
+
+static const struct file_operations drbd_version_fops = {
+       .owner = THIS_MODULE,
+       .open = drbd_version_open,
+       .llseek = seq_lseek,
+       .read = seq_read,
+       .release = single_release,
+};
+
+/* not __exit, may be indirectly called
+ * from the module-load-failure path as well. */
+void drbd_debugfs_cleanup(void)
+{
+       drbd_debugfs_remove(&drbd_debugfs_resources);
+       drbd_debugfs_remove(&drbd_debugfs_minors);
+       drbd_debugfs_remove(&drbd_debugfs_version);
+       drbd_debugfs_remove(&drbd_debugfs_root);
+}
+
+int __init drbd_debugfs_init(void)
+{
+       struct dentry *dentry;
+
+       dentry = debugfs_create_dir("drbd", NULL);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       drbd_debugfs_root = dentry;
+
+       dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       drbd_debugfs_version = dentry;
+
+       dentry = debugfs_create_dir("resources", drbd_debugfs_root);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       drbd_debugfs_resources = dentry;
+
+       dentry = debugfs_create_dir("minors", drbd_debugfs_root);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       drbd_debugfs_minors = dentry;
+       return 0;
+
+fail:
+       drbd_debugfs_cleanup();
+       if (dentry)
+               return PTR_ERR(dentry);
+       else
+               return -EINVAL;
+}
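
One pattern recurs throughout this file: each per-object debugfs file pins its backing object at open time via drbd_single_open(), which only takes the kref if the dentry is still positive, and drops that reference again in its release handler. A condensed sketch with hypothetical names (the concrete instances are the *_open/*_release pairs above):

    static int obj_open(struct inode *inode, struct file *file)
    {
            struct obj *o = inode->i_private;

            /* fails with -ESTALE if debugfs_remove() already ran */
            return drbd_single_open(file, obj_show, o, &o->kref, obj_destroy);
    }

    static int obj_release(struct inode *inode, struct file *file)
    {
            struct obj *o = inode->i_private;

            kref_put(&o->kref, obj_destroy); /* drop the open-time reference */
            return single_release(inode, file);
    }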
diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h
new file mode 100644 (file)
index 0000000..8bee213
--- /dev/null
@@ -0,0 +1,39 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+
+#include "drbd_int.h"
+
+#ifdef CONFIG_DEBUG_FS
+int __init drbd_debugfs_init(void);
+void drbd_debugfs_cleanup(void);
+
+void drbd_debugfs_resource_add(struct drbd_resource *resource);
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource);
+
+void drbd_debugfs_connection_add(struct drbd_connection *connection);
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection);
+
+void drbd_debugfs_device_add(struct drbd_device *device);
+void drbd_debugfs_device_cleanup(struct drbd_device *device);
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device);
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device);
+#else
+
+static inline int __init drbd_debugfs_init(void) { return -ENODEV; }
+static inline void drbd_debugfs_cleanup(void) { }
+
+static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { }
+static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { }
+
+static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { }
+static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { }
+
+static inline void drbd_debugfs_device_add(struct drbd_device *device) { }
+static inline void drbd_debugfs_device_cleanup(struct drbd_device *device) { }
+
+static inline void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) { }
+static inline void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) { }
+
+#endif
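
The empty inline stubs exist so call sites need no #ifdef CONFIG_DEBUG_FS of their own; debugfs being unavailable is not fatal. A hypothetical caller (the actual hook-up in drbd_main.c is not part of this hunk):

    static int __init example_init(void)
    {
            /* returns -ENODEV when built without CONFIG_DEBUG_FS */
            if (drbd_debugfs_init())
                    pr_notice("drbd: debugfs entries not available\n");
            return 0;
    }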
index a76ceb344d64e9411bbfd39c4db9189e519fac72..1a000016ccdfb8bfecf6769a09bb9c48a6aab215 100644 (file)
@@ -317,7 +317,63 @@ struct drbd_request {
 
        struct list_head tl_requests; /* ring list in the transfer log */
        struct bio *master_bio;       /* master bio pointer */
-       unsigned long start_time;
+
+       /* see struct drbd_device */
+       struct list_head req_pending_master_completion;
+       struct list_head req_pending_local;
+
+       /* for generic IO accounting */
+       unsigned long start_jif;
+
+       /* for DRBD internal statistics */
+
+       /* Minimal set of time stamps to determine if we wait for activity log
+        * transactions, local disk or peer.  32 bit "jiffies" are good enough;
+        * we don't expect a DRBD request to be stalled for several months.
+        */
+
+       /* before actual request processing */
+       unsigned long in_actlog_jif;
+
+       /* local disk */
+       unsigned long pre_submit_jif;
+
+       /* per connection */
+       unsigned long pre_send_jif;
+       unsigned long acked_jif;
+       unsigned long net_done_jif;
+
+       /* Possibly even more detail to track each phase:
+        *  master_completion_jif
+        *      how long did it take to complete the master bio
+        *      (application visible latency)
+        *  allocated_jif
+        *      how long the master bio was blocked until we finally allocated
+        *      a tracking struct
+        *  in_actlog_jif
+        *      how long did we wait for activity log transactions
+        *
+        *  net_queued_jif
+        *      when did we finally queue it for sending
+        *  pre_send_jif
+        *      when did we start sending it
+        *  post_send_jif
+        *      how long did we block in the network stack trying to send it
+        *  acked_jif
+        *      when did we receive (or fake, in protocol A) a remote ACK
+        *  net_done_jif
+        *      when did we receive final acknowledgement (P_BARRIER_ACK),
+        *      or decide, e.g. on connection loss, that we no longer expect
+        *      anything from this peer for this request.
+        *
+        *  pre_submit_jif
+        *  post_sub_jif
+        *      when did we start submitting to the lower level device,
+        *      and how long did we block in that submit function
+        *  local_completion_jif
+        *      how long did it take the lower level device to complete this request
+        */
+
 
        /* once it hits 0, we may complete the master_bio */
        atomic_t completion_ref;
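
Each of these stamps is only meaningful while the corresponding rq_state bit is set, which is why the debugfs output prints either an age or a dash. For illustration, mirroring seq_print_age_or_dash() in drbd_debugfs.c above:

    /* sketch: age of the network send, valid only once RQ_NET_SENT is set */
    unsigned long now = jiffies;

    if (req->rq_state & RQ_NET_SENT)
            seq_printf(m, "\t%u", jiffies_to_msecs(now - req->pre_send_jif));
    else
            seq_puts(m, "\t-");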
@@ -366,6 +422,7 @@ struct drbd_peer_request {
        struct drbd_interval i;
        /* see comments on ee flag bits below */
        unsigned long flags;
+       unsigned long submit_jif;
        union {
                u64 block_id;
                struct digest_info *digest;
@@ -408,6 +465,17 @@ enum {
 
        /* Is set when net_conf had two_primaries set while creating this peer_req */
        __EE_IN_INTERVAL_TREE,
+
+       /* for debugfs: */
+       /* has this been submitted, or does it still wait for something else? */
+       __EE_SUBMITTED,
+
+       /* this is/was a write request */
+       __EE_WRITE,
+
+       /* this originates from application on peer
+        * (not some resync or verify or other DRBD internal request) */
+       __EE_APPLICATION,
 };
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
@@ -419,6 +487,9 @@ enum {
 #define EE_RESTART_REQUESTS    (1<<__EE_RESTART_REQUESTS)
 #define EE_SEND_WRITE_ACK      (1<<__EE_SEND_WRITE_ACK)
 #define EE_IN_INTERVAL_TREE    (1<<__EE_IN_INTERVAL_TREE)
+#define EE_SUBMITTED           (1<<__EE_SUBMITTED)
+#define EE_WRITE               (1<<__EE_WRITE)
+#define EE_APPLICATION         (1<<__EE_APPLICATION)
 
 /* flag bits per device */
 enum {
@@ -433,11 +504,11 @@ enum {
        CONSIDER_RESYNC,
 
        MD_NO_FUA,              /* Users wants us to not use FUA/FLUSH on meta data dev */
+
        SUSPEND_IO,             /* suspend application io */
        BITMAP_IO,              /* suspend application io;
                                   once no more io in flight, start bitmap io */
        BITMAP_IO_QUEUED,       /* Started bitmap IO */
-       GO_DISKLESS,            /* Disk is being detached, on io-error or admin request. */
        WAS_IO_ERROR,           /* Local disk failed, returned IO error */
        WAS_READ_ERROR,         /* Local disk READ failed (set additionally to the above) */
        FORCE_DETACH,           /* Force-detach from local disk, aborting any pending local IO */
@@ -450,6 +521,20 @@ enum {
        B_RS_H_DONE,            /* Before resync handler done (already executed) */
        DISCARD_MY_DATA,        /* discard_my_data flag per volume */
        READ_BALANCE_RR,
+
+       FLUSH_PENDING,          /* if set, device->flush_jif is when we submitted that flush
+                                * from drbd_flush_after_epoch() */
+
+       /* cleared only after backing device related structures have been destroyed. */
+       GOING_DISKLESS,         /* Disk is being detached, because of io-error, or admin request. */
+
+       /* to be used in drbd_device_post_work() */
+       GO_DISKLESS,            /* tell worker to schedule cleanup before detach */
+       DESTROY_DISK,           /* tell worker to close backing devices and destroy related structures. */
+       MD_SYNC,                /* tell worker to call drbd_md_sync() */
+       RS_START,               /* tell worker to start resync/OV */
+       RS_PROGRESS,            /* tell worker that resync made significant progress */
+       RS_DONE,                /* tell worker that resync is done */
 };
 
 struct drbd_bitmap; /* opaque for drbd_device */
@@ -531,6 +616,11 @@ struct drbd_backing_dev {
 };
 
 struct drbd_md_io {
+       struct page *page;
+       unsigned long start_jif;        /* last call to drbd_md_get_buffer */
+       unsigned long submit_jif;       /* last _drbd_md_sync_page_io() submit */
+       const char *current_use;
+       atomic_t in_use;
        unsigned int done;
        int error;
 };
@@ -577,10 +667,18 @@ enum {
                                 * and potentially deadlock on, this drbd worker.
                                 */
        DISCONNECT_SENT,
+
+       DEVICE_WORK_PENDING,    /* tell worker that some device has pending work */
 };
 
 struct drbd_resource {
        char *name;
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *debugfs_res;
+       struct dentry *debugfs_res_volumes;
+       struct dentry *debugfs_res_connections;
+       struct dentry *debugfs_res_in_flight_summary;
+#endif
        struct kref kref;
        struct idr devices;             /* volume number to device mapping */
        struct list_head connections;
@@ -594,12 +692,28 @@ struct drbd_resource {
        unsigned susp_nod:1;            /* IO suspended because no data */
        unsigned susp_fen:1;            /* IO suspended because fence peer handler runs */
 
+       enum write_ordering_e write_ordering;
+
        cpumask_var_t cpu_mask;
 };
 
+struct drbd_thread_timing_details
+{
+       unsigned long start_jif;
+       void *cb_addr;
+       const char *caller_fn;
+       unsigned int line;
+       unsigned int cb_nr;
+};
+
 struct drbd_connection {
        struct list_head connections;
        struct drbd_resource *resource;
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *debugfs_conn;
+       struct dentry *debugfs_conn_callback_history;
+       struct dentry *debugfs_conn_oldest_requests;
+#endif
        struct kref kref;
        struct idr peer_devices;        /* volume number to peer device mapping */
        enum drbd_conns cstate;         /* Only C_STANDALONE to C_WF_REPORT_PARAMS */
@@ -636,7 +750,6 @@ struct drbd_connection {
        struct drbd_epoch *current_epoch;
        spinlock_t epoch_lock;
        unsigned int epochs;
-       enum write_ordering_e write_ordering;
        atomic_t current_tle_nr;        /* transfer log epoch number */
        unsigned current_tle_writes;    /* writes seen within this tl epoch */
 
@@ -645,9 +758,22 @@ struct drbd_connection {
        struct drbd_thread worker;
        struct drbd_thread asender;
 
+       /* cached pointers,
+        * so we can look up the oldest pending requests more quickly.
+        * protected by resource->req_lock */
+       struct drbd_request *req_next; /* DRBD 9: todo.req_next */
+       struct drbd_request *req_ack_pending;
+       struct drbd_request *req_not_net_done;
+
        /* sender side */
        struct drbd_work_queue sender_work;
 
+#define DRBD_THREAD_DETAILS_HIST       16
+       unsigned int w_cb_nr; /* keeps counting up */
+       unsigned int r_cb_nr; /* keeps counting up */
+       struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST];
+       struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];
+
        struct {
                /* whether this sender thread
                 * has processed a single write yet. */
@@ -663,11 +789,22 @@ struct drbd_connection {
        } send;
 };
 
+void __update_timing_details(
+               struct drbd_thread_timing_details *tdp,
+               unsigned int *cb_nr,
+               void *cb,
+               const char *fn, const unsigned int line);
+
+#define update_worker_timing_details(c, cb) \
+       __update_timing_details(c->w_timing_details, &c->w_cb_nr, cb, __func__ , __LINE__ )
+#define update_receiver_timing_details(c, cb) \
+       __update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__ , __LINE__ )
+
 struct submit_worker {
        struct workqueue_struct *wq;
        struct work_struct worker;
 
-       spinlock_t lock;
+       /* protected by ..->resource->req_lock */
        struct list_head writes;
 };
 
@@ -675,12 +812,29 @@ struct drbd_peer_device {
        struct list_head peer_devices;
        struct drbd_device *device;
        struct drbd_connection *connection;
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *debugfs_peer_dev;
+#endif
 };
 
 struct drbd_device {
        struct drbd_resource *resource;
        struct list_head peer_devices;
-       int vnr;                        /* volume number within the connection */
+       struct list_head pending_bitmap_io;
+
+       unsigned long flush_jif;
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *debugfs_minor;
+       struct dentry *debugfs_vol;
+       struct dentry *debugfs_vol_oldest_requests;
+       struct dentry *debugfs_vol_act_log_extents;
+       struct dentry *debugfs_vol_resync_extents;
+       struct dentry *debugfs_vol_data_gen_id;
+#endif
+
+       unsigned int vnr;       /* volume number within the connection */
+       unsigned int minor;     /* device minor number */
+
        struct kref kref;
 
        /* things that are stored as / read from meta data on disk */
@@ -697,19 +851,10 @@ struct drbd_device {
        unsigned long last_reattach_jif;
        struct drbd_work resync_work;
        struct drbd_work unplug_work;
-       struct drbd_work go_diskless;
-       struct drbd_work md_sync_work;
-       struct drbd_work start_resync_work;
        struct timer_list resync_timer;
        struct timer_list md_sync_timer;
        struct timer_list start_resync_timer;
        struct timer_list request_timer;
-#ifdef DRBD_DEBUG_MD_SYNC
-       struct {
-               unsigned int line;
-               const char* func;
-       } last_md_mark_dirty;
-#endif
 
        /* Used after attach while negotiating new disk state. */
        union drbd_state new_state_tmp;
@@ -724,6 +869,7 @@ struct drbd_device {
        unsigned int al_writ_cnt;
        unsigned int bm_writ_cnt;
        atomic_t ap_bio_cnt;     /* Requests we need to complete */
+       atomic_t ap_actlog_cnt;  /* Requests waiting for activity log */
        atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
        atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
        atomic_t unacked_cnt;    /* Need to send replies for */
@@ -733,6 +879,13 @@ struct drbd_device {
        struct rb_root read_requests;
        struct rb_root write_requests;
 
+       /* for statistics and timeouts */
+       /* [0] read, [1] write */
+       struct list_head pending_master_completion[2];
+       struct list_head pending_completion[2];
+
+       /* use checksums for *this* resync */
+       bool use_csums;
        /* blocks to resync in this run [unit BM_BLOCK_SIZE] */
        unsigned long rs_total;
        /* number of resync blocks that failed in this run */
@@ -788,9 +941,7 @@ struct drbd_device {
        atomic_t pp_in_use;             /* allocated from page pool */
        atomic_t pp_in_use_by_net;      /* sendpage()d, still referenced by tcp */
        wait_queue_head_t ee_wait;
-       struct page *md_io_page;        /* one page buffer for md_io */
        struct drbd_md_io md_io;
-       atomic_t md_io_in_use;          /* protects the md_io, md_io_page and md_io_tmpp */
        spinlock_t al_lock;
        wait_queue_head_t al_wait;
        struct lru_cache *act_log;      /* activity log */
@@ -800,7 +951,6 @@ struct drbd_device {
        atomic_t packet_seq;
        unsigned int peer_seq;
        spinlock_t peer_seq_lock;
-       unsigned int minor;
        unsigned long comm_bm_set; /* communicated number of set bits. */
        struct bm_io_work bm_io_work;
        u64 ed_uuid; /* UUID of the exposed data */
@@ -824,6 +974,21 @@ struct drbd_device {
        struct submit_worker submit;
 };
 
+struct drbd_bm_aio_ctx {
+       struct drbd_device *device;
+       struct list_head list; /* on device->pending_bitmap_io */
+       unsigned long start_jif;
+       atomic_t in_flight;
+       unsigned int done;
+       unsigned flags;
+#define BM_AIO_COPY_PAGES      1
+#define BM_AIO_WRITE_HINTED    2
+#define BM_AIO_WRITE_ALL_PAGES 4
+#define BM_AIO_READ            8
+       int error;
+       struct kref kref;
+};
+
 struct drbd_config_context {
        /* assigned from drbd_genlmsghdr */
        unsigned int minor;
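
Taken together with the ATOMIC_INIT(2) kref seen in the removed drbd_bitmap.c code above, the assumed lifetime of this context is: one reference for the submitting/waiting path, one for the IO completion path, and whichever kref_put() runs last frees the context via drbd_bm_aio_ctx_destroy(). Sketch of the waiter's side (error handling elided):

    /* sketch: the waiter collects the result, then drops its reference;
     * the completion side does its own kref_put() once in_flight hits 0 */
    err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;
    kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);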
@@ -949,7 +1114,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
 extern int drbd_send_bitmap(struct drbd_device *device);
 extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
 extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
-extern void drbd_free_bc(struct drbd_backing_dev *ldev);
+extern void drbd_free_ldev(struct drbd_backing_dev *ldev);
 extern void drbd_device_cleanup(struct drbd_device *device);
 void drbd_print_uuids(struct drbd_device *device, const char *text);
 
@@ -966,13 +1131,7 @@ extern void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must
 extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local);
 extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold(local);
 extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
-#ifndef DRBD_DEBUG_MD_SYNC
 extern void drbd_md_mark_dirty(struct drbd_device *device);
-#else
-#define drbd_md_mark_dirty(m)  drbd_md_mark_dirty_(m, __LINE__ , __func__ )
-extern void drbd_md_mark_dirty_(struct drbd_device *device,
-               unsigned int line, const char *func);
-#endif
 extern void drbd_queue_bitmap_io(struct drbd_device *device,
                                 int (*io_fn)(struct drbd_device *),
                                 void (*done)(struct drbd_device *, int),
@@ -983,9 +1142,8 @@ extern int drbd_bitmap_io(struct drbd_device *device,
 extern int drbd_bitmap_io_from_worker(struct drbd_device *device,
                int (*io_fn)(struct drbd_device *),
                char *why, enum bm_flag flags);
-extern int drbd_bmio_set_n_write(struct drbd_device *device);
-extern int drbd_bmio_clear_n_write(struct drbd_device *device);
-extern void drbd_ldev_destroy(struct drbd_device *device);
+extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local);
+extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local);
 
 /* Meta data layout
  *
@@ -1105,17 +1263,21 @@ struct bm_extent {
 /* in which _bitmap_ extent (resp. sector) the bit for a certain
  * _storage_ sector is located in */
 #define BM_SECT_TO_EXT(x)   ((x)>>(BM_EXT_SHIFT-9))
+#define BM_BIT_TO_EXT(x)    ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
 
-/* how much _storage_ sectors we have per bitmap sector */
+/* first storage sector a bitmap extent corresponds to */
 #define BM_EXT_TO_SECT(x)   ((sector_t)(x) << (BM_EXT_SHIFT-9))
+/* how much _storage_ sectors we have per bitmap extent */
 #define BM_SECT_PER_EXT     BM_EXT_TO_SECT(1)
+/* how many bits are covered by one bitmap extent (resync extent) */
+#define BM_BITS_PER_EXT     (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
+
+#define BM_BLOCKS_PER_BM_EXT_MASK  (BM_BITS_PER_EXT - 1)
+
 
 /* in one sector of the bitmap, we have this many activity_log extents. */
 #define AL_EXT_PER_BM_SECT  (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
 
-#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
-#define BM_BLOCKS_PER_BM_EXT_MASK  ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
-
 /* the extent in "PER_EXTENT" below is an activity log extent
  * we need that many (long words/bytes) to store the bitmap
  *                  of one AL_EXTENT_SIZE chunk of storage.
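For orientation, with DRBD's customary constants (BM_BLOCK_SHIFT = 12, one bitmap bit per 4 KiB block, and BM_EXT_SHIFT = 24, i.e. 16 MiB resync extents; both values are assumptions, they are not visible in this hunk), the new macros work out to:

        /* assuming BM_BLOCK_SHIFT == 12 and BM_EXT_SHIFT == 24 */
        /* BM_BITS_PER_EXT   == 1UL << (24 - 12) == 4096 bits per extent    */
        /* BM_SECT_PER_EXT   == 1 << (24 - 9)    == 32768 sectors == 16 MiB */
        /* BM_BIT_TO_EXT(8191)   == 8191  >> 12  == 1                       */
        /* BM_SECT_TO_EXT(65535) == 65535 >> 15  == 1                       */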
@@ -1195,11 +1357,11 @@ extern void _drbd_bm_set_bits(struct drbd_device *device,
                const unsigned long s, const unsigned long e);
 extern int  drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr);
 extern int  drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
-extern int  drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local);
 extern int  drbd_bm_read(struct drbd_device *device) __must_hold(local);
 extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
 extern int  drbd_bm_write(struct drbd_device *device) __must_hold(local);
 extern int  drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
+extern int  drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local);
 extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
 extern int  drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local);
 extern size_t       drbd_bm_words(struct drbd_device *device);
@@ -1213,7 +1375,6 @@ extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned lon
 extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo);
 extern unsigned long _drbd_bm_total_weight(struct drbd_device *device);
 extern unsigned long drbd_bm_total_weight(struct drbd_device *device);
-extern int drbd_bm_rs_done(struct drbd_device *device);
 /* for receive_bitmap */
 extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset,
                size_t number, unsigned long *buffer);
@@ -1312,7 +1473,7 @@ enum determine_dev_size {
 extern enum determine_dev_size
 drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
 extern void resync_after_online_grow(struct drbd_device *);
-extern void drbd_reconsider_max_bio_size(struct drbd_device *device);
+extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev);
 extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
                                        enum drbd_role new_role,
                                        int force);
@@ -1333,7 +1494,7 @@ extern void resume_next_sg(struct drbd_device *device);
 extern void suspend_other_sg(struct drbd_device *device);
 extern int drbd_resync_finished(struct drbd_device *device);
 /* maybe rather drbd_main.c ? */
-extern void *drbd_md_get_buffer(struct drbd_device *device);
+extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent);
 extern void drbd_md_put_buffer(struct drbd_device *device);
 extern int drbd_md_sync_page_io(struct drbd_device *device,
                struct drbd_backing_dev *bdev, sector_t sector, int rw);
@@ -1380,7 +1541,8 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
 extern int drbd_receiver(struct drbd_thread *thi);
 extern int drbd_asender(struct drbd_thread *thi);
 extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
-extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector);
+extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+               bool throttle_if_app_is_waiting);
 extern int drbd_submit_peer_request(struct drbd_device *,
                                    struct drbd_peer_request *, const unsigned,
                                    const int);
@@ -1464,10 +1626,7 @@ static inline void drbd_generic_make_request(struct drbd_device *device,
 {
        __release(local);
        if (!bio->bi_bdev) {
-               printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
-                               "bio->bi_bdev == NULL\n",
-                      device_to_minor(device));
-               dump_stack();
+               drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n");
                bio_endio(bio, -ENODEV);
                return;
        }
@@ -1478,7 +1637,8 @@ static inline void drbd_generic_make_request(struct drbd_device *device,
                generic_make_request(bio);
 }
 
-void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo);
+void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
+                             enum write_ordering_e wo);
 
 /* drbd_proc.c */
 extern struct proc_dir_entry *drbd_proc;
@@ -1489,9 +1649,9 @@ extern const char *drbd_role_str(enum drbd_role s);
 /* drbd_actlog.c */
 extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
 extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i);
-extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate);
+extern void drbd_al_begin_io_commit(struct drbd_device *device);
 extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i);
-extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate);
+extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i);
 extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i);
 extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector);
 extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector);
@@ -1501,14 +1661,17 @@ extern int drbd_rs_del_all(struct drbd_device *device);
 extern void drbd_rs_failed_io(struct drbd_device *device,
                sector_t sector, int size);
 extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go);
-extern void __drbd_set_in_sync(struct drbd_device *device, sector_t sector,
-               int size, const char *file, const unsigned int line);
+
+enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC };
+extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+               enum update_sync_bits_mode mode,
+               const char *file, const unsigned int line);
 #define drbd_set_in_sync(device, sector, size) \
-       __drbd_set_in_sync(device, sector, size, __FILE__, __LINE__)
-extern int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector,
-               int size, const char *file, const unsigned int line);
+       __drbd_change_sync(device, sector, size, SET_IN_SYNC, __FILE__, __LINE__)
 #define drbd_set_out_of_sync(device, sector, size) \
-       __drbd_set_out_of_sync(device, sector, size, __FILE__, __LINE__)
+       __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC, __FILE__, __LINE__)
+#define drbd_rs_failed_io(device, sector, size) \
+       __drbd_change_sync(device, sector, size, RECORD_RS_FAILED, __FILE__, __LINE__)
 extern void drbd_al_shrink(struct drbd_device *device);
 extern int drbd_initialize_al(struct drbd_device *, void *);
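All three wrappers now funnel into the single __drbd_change_sync(), differing only in the mode they pass; for example:

        /* both calls reach the same function, only the mode differs */
        drbd_set_in_sync(device, sector, size);
        /* -> __drbd_change_sync(device, sector, size, SET_IN_SYNC, __FILE__, __LINE__) */
        drbd_rs_failed_io(device, sector, size);
        /* -> __drbd_change_sync(device, sector, size, RECORD_RS_FAILED, __FILE__, __LINE__) */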
 
@@ -1764,25 +1927,38 @@ static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
 }
 
 static inline void
-drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
+drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
 {
        unsigned long flags;
        spin_lock_irqsave(&q->q_lock, flags);
-       list_add(&w->list, &q->q);
+       list_add_tail(&w->list, &q->q);
        spin_unlock_irqrestore(&q->q_lock, flags);
        wake_up(&q->q_wait);
 }
 
 static inline void
-drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
+drbd_queue_work_if_unqueued(struct drbd_work_queue *q, struct drbd_work *w)
 {
        unsigned long flags;
        spin_lock_irqsave(&q->q_lock, flags);
-       list_add_tail(&w->list, &q->q);
+       if (list_empty_careful(&w->list))
+               list_add_tail(&w->list, &q->q);
        spin_unlock_irqrestore(&q->q_lock, flags);
        wake_up(&q->q_wait);
 }
 
+static inline void
+drbd_device_post_work(struct drbd_device *device, int work_bit)
+{
+       if (!test_and_set_bit(work_bit, &device->flags)) {
+               struct drbd_connection *connection =
+                       first_peer_device(device)->connection;
+               struct drbd_work_queue *q = &connection->sender_work;
+               if (!test_and_set_bit(DEVICE_WORK_PENDING, &connection->flags))
+                       wake_up(&q->q_wait);
+       }
+}
+
 extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
 
 static inline void wake_asender(struct drbd_connection *connection)
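drbd_device_post_work() replaces the dedicated work structs removed later in this patch with per-device flag bits: test_and_set_bit() makes posting idempotent, and the shared DEVICE_WORK_PENDING bit on the connection ensures the worker is woken at most once. Typical use, as seen in the md_sync timer callback further down:

        /* safe from timer/atomic context: flag the work, the worker runs it */
        drbd_device_post_work(device, MD_SYNC);
        /* posting the same bit again before the worker ran is a no-op */
        drbd_device_post_work(device, MD_SYNC);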
@@ -1859,7 +2035,7 @@ static inline void inc_ap_pending(struct drbd_device *device)
                        func, line,                                     \
                        atomic_read(&device->which))
 
-#define dec_ap_pending(device) _dec_ap_pending(device, __FUNCTION__, __LINE__)
+#define dec_ap_pending(device) _dec_ap_pending(device, __func__, __LINE__)
 static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line)
 {
        if (atomic_dec_and_test(&device->ap_pending_cnt))
@@ -1878,7 +2054,7 @@ static inline void inc_rs_pending(struct drbd_device *device)
        atomic_inc(&device->rs_pending_cnt);
 }
 
-#define dec_rs_pending(device) _dec_rs_pending(device, __FUNCTION__, __LINE__)
+#define dec_rs_pending(device) _dec_rs_pending(device, __func__, __LINE__)
 static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line)
 {
        atomic_dec(&device->rs_pending_cnt);
@@ -1899,20 +2075,29 @@ static inline void inc_unacked(struct drbd_device *device)
        atomic_inc(&device->unacked_cnt);
 }
 
-#define dec_unacked(device) _dec_unacked(device, __FUNCTION__, __LINE__)
+#define dec_unacked(device) _dec_unacked(device, __func__, __LINE__)
 static inline void _dec_unacked(struct drbd_device *device, const char *func, int line)
 {
        atomic_dec(&device->unacked_cnt);
        ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
 }
 
-#define sub_unacked(device, n) _sub_unacked(device, n, __FUNCTION__, __LINE__)
+#define sub_unacked(device, n) _sub_unacked(device, n, __func__, __LINE__)
 static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line)
 {
        atomic_sub(n, &device->unacked_cnt);
        ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
 }
 
+static inline bool is_sync_state(enum drbd_conns connection_state)
+{
+       return
+          (connection_state == C_SYNC_SOURCE
+       ||  connection_state == C_SYNC_TARGET
+       ||  connection_state == C_PAUSED_SYNC_S
+       ||  connection_state == C_PAUSED_SYNC_T);
+}
+
 /**
  * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev
  * @M:         DRBD device.
@@ -1924,6 +2109,11 @@ static inline void _sub_unacked(struct drbd_device *device, int n, const char *f
 
 static inline void put_ldev(struct drbd_device *device)
 {
+       enum drbd_disk_state ds = device->state.disk;
+       /* We must check the state *before* the atomic_dec becomes visible,
+        * or we have a theoretical race where someone hitting zero
+        * while the state is still D_FAILED would then see D_DISKLESS in the
+        * condition below and call into destroy, where it must not, yet. */
        int i = atomic_dec_return(&device->local_cnt);
 
        /* This may be called from some endio handler,
@@ -1932,15 +2122,13 @@ static inline void put_ldev(struct drbd_device *device)
        __release(local);
        D_ASSERT(device, i >= 0);
        if (i == 0) {
-               if (device->state.disk == D_DISKLESS)
+               if (ds == D_DISKLESS)
                        /* even internal references gone, safe to destroy */
-                       drbd_ldev_destroy(device);
-               if (device->state.disk == D_FAILED) {
+                       drbd_device_post_work(device, DESTROY_DISK);
+               if (ds == D_FAILED)
                        /* all application IO references gone. */
-                       if (!test_and_set_bit(GO_DISKLESS, &device->flags))
-                               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
-                                               &device->go_diskless);
-               }
+                       if (!test_and_set_bit(GOING_DISKLESS, &device->flags))
+                               drbd_device_post_work(device, GO_DISKLESS);
                wake_up(&device->misc_wait);
        }
 }
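The ordering in put_ldev() is deliberate: the disk state is snapshotted before the atomic_dec, so whichever thread drops local_cnt to zero acts on the state that was current when it entered, instead of racing with a concurrent D_FAILED -> D_DISKLESS transition. The bare pattern, sketched:

        /* sketch of snapshot-then-decrement; destroy_it() is hypothetical */
        enum drbd_disk_state ds = device->state.disk;   /* snapshot first */
        if (atomic_dec_return(&device->local_cnt) == 0) {
                if (ds == D_DISKLESS)   /* decide on the snapshot, */
                        destroy_it();   /* not on a re-read state  */
        }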
@@ -1964,54 +2152,6 @@ static inline int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_
 extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins);
 #endif
 
-/* you must have an "get_ldev" reference */
-static inline void drbd_get_syncer_progress(struct drbd_device *device,
-               unsigned long *bits_left, unsigned int *per_mil_done)
-{
-       /* this is to break it at compile time when we change that, in case we
-        * want to support more than (1<<32) bits on a 32bit arch. */
-       typecheck(unsigned long, device->rs_total);
-
-       /* note: both rs_total and rs_left are in bits, i.e. in
-        * units of BM_BLOCK_SIZE.
-        * for the percentage, we don't care. */
-
-       if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
-               *bits_left = device->ov_left;
-       else
-               *bits_left = drbd_bm_total_weight(device) - device->rs_failed;
-       /* >> 10 to prevent overflow,
-        * +1 to prevent division by zero */
-       if (*bits_left > device->rs_total) {
-               /* doh. maybe a logic bug somewhere.
-                * may also be just a race condition
-                * between this and a disconnect during sync.
-                * for now, just prevent in-kernel buffer overflow.
-                */
-               smp_rmb();
-               drbd_warn(device, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
-                               drbd_conn_str(device->state.conn),
-                               *bits_left, device->rs_total, device->rs_failed);
-               *per_mil_done = 0;
-       } else {
-               /* Make sure the division happens in long context.
-                * We allow up to one petabyte storage right now,
-                * at a granularity of 4k per bit that is 2**38 bits.
-                * After shift right and multiplication by 1000,
-                * this should still fit easily into a 32bit long,
-                * so we don't need a 64bit division on 32bit arch.
-                * Note: currently we don't support such large bitmaps on 32bit
-                * arch anyways, but no harm done to be prepared for it here.
-                */
-               unsigned int shift = device->rs_total > UINT_MAX ? 16 : 10;
-               unsigned long left = *bits_left >> shift;
-               unsigned long total = 1UL + (device->rs_total >> shift);
-               unsigned long tmp = 1000UL - left * 1000UL/total;
-               *per_mil_done = tmp;
-       }
-}
-
-
 /* this throttles on-the-fly application requests
  * according to max_buffers settings;
  * maybe re-implement using semaphores? */
@@ -2201,25 +2341,6 @@ static inline int drbd_queue_order_type(struct drbd_device *device)
        return QUEUE_ORDERED_NONE;
 }
 
-static inline void drbd_md_flush(struct drbd_device *device)
-{
-       int r;
-
-       if (device->ldev == NULL) {
-               drbd_warn(device, "device->ldev == NULL in drbd_md_flush\n");
-               return;
-       }
-
-       if (test_bit(MD_NO_FUA, &device->flags))
-               return;
-
-       r = blkdev_issue_flush(device->ldev->md_bdev, GFP_NOIO, NULL);
-       if (r) {
-               set_bit(MD_NO_FUA, &device->flags);
-               drbd_err(device, "meta data flush failed with status %d, disabling md-flushes\n", r);
-       }
-}
-
 static inline struct drbd_connection *first_connection(struct drbd_resource *resource)
 {
        return list_first_entry_or_null(&resource->connections,
index f38fcb00c10d6c39b09c334483d80296b463e608..f210543f05f4782674de8abb0b0053e0e4833bfb 100644 (file)
@@ -10,7 +10,9 @@ struct drbd_interval {
        unsigned int size;      /* size in bytes */
        sector_t end;           /* highest interval end in subtree */
        int local:1             /* local or remote request? */;
-       int waiting:1;
+       int waiting:1;          /* someone is waiting for this to complete */
+       int completed:1;        /* this has been completed already;
+                                * ignore for conflict detection */
 };
 
 static inline void drbd_clear_interval(struct drbd_interval *i)
index 960645c26e6fc1b107e0db1ae016fd15800d3514..9b465bb68487b5c0e5a51f72d161ae7ade749453 100644 (file)
 
  */
 
+#define pr_fmt(fmt)    KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
+#include <linux/jiffies.h>
 #include <linux/drbd.h>
 #include <asm/uaccess.h>
 #include <asm/types.h>
 #include "drbd_int.h"
 #include "drbd_protocol.h"
 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
-
 #include "drbd_vli.h"
+#include "drbd_debugfs.h"
 
 static DEFINE_MUTEX(drbd_main_mutex);
 static int drbd_open(struct block_device *bdev, fmode_t mode);
 static void drbd_release(struct gendisk *gd, fmode_t mode);
-static int w_md_sync(struct drbd_work *w, int unused);
 static void md_sync_timer_fn(unsigned long data);
 static int w_bitmap_io(struct drbd_work *w, int unused);
-static int w_go_diskless(struct drbd_work *w, int unused);
 
 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
              "Lars Ellenberg <lars@linbit.com>");
@@ -264,7 +265,7 @@ bail:
 
 /**
  * _tl_restart() - Walks the transfer log, and applies an action to all requests
- * @device:    DRBD device.
+ * @connection:        DRBD connection to operate on.
  * @what:       The action/event to perform with all request objects
  *
  * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
@@ -662,6 +663,11 @@ static int __send_command(struct drbd_connection *connection, int vnr,
                            msg_flags);
        if (data && !err)
                err = drbd_send_all(connection, sock->socket, data, size, 0);
+       /* DRBD protocol "pings" are latency critical.
+        * This is supposed to trigger tcp_push_pending_frames() */
+       if (!err && (cmd == P_PING || cmd == P_PING_ACK))
+               drbd_tcp_nodelay(sock->socket);
+
        return err;
 }
 
@@ -1636,7 +1642,10 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
        if (peer_device->connection->agreed_pro_version >= 100) {
                if (req->rq_state & RQ_EXP_RECEIVE_ACK)
                        dp_flags |= DP_SEND_RECEIVE_ACK;
-               if (req->rq_state & RQ_EXP_WRITE_ACK)
+               /* During resync, request an explicit write ack,
+                * even in protocol != C */
+               if (req->rq_state & RQ_EXP_WRITE_ACK
+               || (dp_flags & DP_MAY_SET_IN_SYNC))
                        dp_flags |= DP_SEND_WRITE_ACK;
        }
        p->dp_flags = cpu_to_be32(dp_flags);
@@ -1900,6 +1909,7 @@ void drbd_init_set_defaults(struct drbd_device *device)
        drbd_set_defaults(device);
 
        atomic_set(&device->ap_bio_cnt, 0);
+       atomic_set(&device->ap_actlog_cnt, 0);
        atomic_set(&device->ap_pending_cnt, 0);
        atomic_set(&device->rs_pending_cnt, 0);
        atomic_set(&device->unacked_cnt, 0);
@@ -1908,7 +1918,7 @@ void drbd_init_set_defaults(struct drbd_device *device)
        atomic_set(&device->rs_sect_in, 0);
        atomic_set(&device->rs_sect_ev, 0);
        atomic_set(&device->ap_in_flight, 0);
-       atomic_set(&device->md_io_in_use, 0);
+       atomic_set(&device->md_io.in_use, 0);
 
        mutex_init(&device->own_state_mutex);
        device->state_mutex = &device->own_state_mutex;
@@ -1924,17 +1934,15 @@ void drbd_init_set_defaults(struct drbd_device *device)
        INIT_LIST_HEAD(&device->resync_reads);
        INIT_LIST_HEAD(&device->resync_work.list);
        INIT_LIST_HEAD(&device->unplug_work.list);
-       INIT_LIST_HEAD(&device->go_diskless.list);
-       INIT_LIST_HEAD(&device->md_sync_work.list);
-       INIT_LIST_HEAD(&device->start_resync_work.list);
        INIT_LIST_HEAD(&device->bm_io_work.w.list);
+       INIT_LIST_HEAD(&device->pending_master_completion[0]);
+       INIT_LIST_HEAD(&device->pending_master_completion[1]);
+       INIT_LIST_HEAD(&device->pending_completion[0]);
+       INIT_LIST_HEAD(&device->pending_completion[1]);
 
        device->resync_work.cb  = w_resync_timer;
        device->unplug_work.cb  = w_send_write_hint;
-       device->go_diskless.cb  = w_go_diskless;
-       device->md_sync_work.cb = w_md_sync;
        device->bm_io_work.w.cb = w_bitmap_io;
-       device->start_resync_work.cb = w_start_resync;
 
        init_timer(&device->resync_timer);
        init_timer(&device->md_sync_timer);
@@ -1992,7 +2000,7 @@ void drbd_device_cleanup(struct drbd_device *device)
                drbd_bm_cleanup(device);
        }
 
-       drbd_free_bc(device->ldev);
+       drbd_free_ldev(device->ldev);
        device->ldev = NULL;
 
        clear_bit(AL_SUSPENDED, &device->flags);
@@ -2006,7 +2014,6 @@ void drbd_device_cleanup(struct drbd_device *device)
        D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
        D_ASSERT(device, list_empty(&device->resync_work.list));
        D_ASSERT(device, list_empty(&device->unplug_work.list));
-       D_ASSERT(device, list_empty(&device->go_diskless.list));
 
        drbd_set_defaults(device);
 }
@@ -2129,20 +2136,6 @@ Enomem:
        return -ENOMEM;
 }
 
-static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
-       void *unused)
-{
-       /* just so we have it.  you never know what interesting things we
-        * might want to do here some day...
-        */
-
-       return NOTIFY_DONE;
-}
-
-static struct notifier_block drbd_notifier = {
-       .notifier_call = drbd_notify_sys,
-};
-
 static void drbd_release_all_peer_reqs(struct drbd_device *device)
 {
        int rr;
@@ -2173,7 +2166,7 @@ void drbd_destroy_device(struct kref *kref)
 {
        struct drbd_device *device = container_of(kref, struct drbd_device, kref);
        struct drbd_resource *resource = device->resource;
-       struct drbd_connection *connection;
+       struct drbd_peer_device *peer_device, *tmp_peer_device;
 
        del_timer_sync(&device->request_timer);
 
@@ -2187,7 +2180,7 @@ void drbd_destroy_device(struct kref *kref)
        if (device->this_bdev)
                bdput(device->this_bdev);
 
-       drbd_free_bc(device->ldev);
+       drbd_free_ldev(device->ldev);
        device->ldev = NULL;
 
        drbd_release_all_peer_reqs(device);
@@ -2200,15 +2193,20 @@ void drbd_destroy_device(struct kref *kref)
 
        if (device->bitmap) /* should no longer be there. */
                drbd_bm_cleanup(device);
-       __free_page(device->md_io_page);
+       __free_page(device->md_io.page);
        put_disk(device->vdisk);
        blk_cleanup_queue(device->rq_queue);
        kfree(device->rs_plan_s);
-       kfree(first_peer_device(device));
-       kfree(device);
 
-       for_each_connection(connection, resource)
-               kref_put(&connection->kref, drbd_destroy_connection);
+       /* not for_each_connection(connection, resource):
+        * those may have been cleaned up and disassociated already.
+        */
+       for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
+               kref_put(&peer_device->connection->kref, drbd_destroy_connection);
+               kfree(peer_device);
+       }
+       memset(device, 0xfd, sizeof(*device));
+       kfree(device);
        kref_put(&resource->kref, drbd_destroy_resource);
 }
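The memset() with a distinctive byte just before kfree() (0xfd for devices here, 0xf2 for resources and 0xfc for connections below) poisons the freed object, so a use-after-free surfaces as obviously bogus 0xfdfdfdfd... pointers instead of silently reading stale data:

        /* poison-before-free: stale users trip over the pattern immediately */
        memset(device, 0xfd, sizeof(*device));
        kfree(device);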
 
@@ -2236,7 +2234,7 @@ static void do_retry(struct work_struct *ws)
        list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
                struct drbd_device *device = req->device;
                struct bio *bio = req->master_bio;
-               unsigned long start_time = req->start_time;
+               unsigned long start_jif = req->start_jif;
                bool expected;
 
                expected =
@@ -2271,10 +2269,12 @@ static void do_retry(struct work_struct *ws)
                /* We are not just doing generic_make_request(),
                 * as we want to keep the start_time information. */
                inc_ap_bio(device);
-               __drbd_make_request(device, bio, start_time);
+               __drbd_make_request(device, bio, start_jif);
        }
 }
 
+/* called via drbd_req_put_completion_ref(),
+ * holds resource->req_lock */
 void drbd_restart_request(struct drbd_request *req)
 {
        unsigned long flags;
@@ -2298,6 +2298,7 @@ void drbd_destroy_resource(struct kref *kref)
        idr_destroy(&resource->devices);
        free_cpumask_var(resource->cpu_mask);
        kfree(resource->name);
+       memset(resource, 0xf2, sizeof(*resource));
        kfree(resource);
 }
 
@@ -2307,8 +2308,10 @@ void drbd_free_resource(struct drbd_resource *resource)
 
        for_each_connection_safe(connection, tmp, resource) {
                list_del(&connection->connections);
+               drbd_debugfs_connection_cleanup(connection);
                kref_put(&connection->kref, drbd_destroy_connection);
        }
+       drbd_debugfs_resource_cleanup(resource);
        kref_put(&resource->kref, drbd_destroy_resource);
 }
 
@@ -2318,8 +2321,6 @@ static void drbd_cleanup(void)
        struct drbd_device *device;
        struct drbd_resource *resource, *tmp;
 
-       unregister_reboot_notifier(&drbd_notifier);
-
        /* first remove proc,
         * drbdsetup uses its presence to detect
         * whether DRBD is loaded.
@@ -2335,6 +2336,7 @@ static void drbd_cleanup(void)
                destroy_workqueue(retry.wq);
 
        drbd_genl_unregister();
+       drbd_debugfs_cleanup();
 
        idr_for_each_entry(&drbd_devices, device, i)
                drbd_delete_device(device);
@@ -2350,7 +2352,7 @@ static void drbd_cleanup(void)
 
        idr_destroy(&drbd_devices);
 
-       printk(KERN_INFO "drbd: module cleanup done.\n");
+       pr_info("module cleanup done.\n");
 }
 
 /**
@@ -2539,6 +2541,20 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op
        if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
                err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE,
                                   cpumask_bits(new_cpu_mask), nr_cpu_ids);
+               if (err == -EOVERFLOW) {
+                       /* So what. mask it out. */
+                       cpumask_var_t tmp_cpu_mask;
+                       if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) {
+                               cpumask_setall(tmp_cpu_mask);
+                               cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask);
+                               drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n",
+                                       res_opts->cpu_mask,
+                                       strlen(res_opts->cpu_mask) > 12 ? "..." : "",
+                                       nr_cpu_ids);
+                               free_cpumask_var(tmp_cpu_mask);
+                               err = 0;
+                       }
+               }
                if (err) {
                        drbd_warn(resource, "bitmap_parse() failed with %d\n", err);
                        /* retcode = ERR_CPU_MASK_PARSE; */
@@ -2579,10 +2595,12 @@ struct drbd_resource *drbd_create_resource(const char *name)
        kref_init(&resource->kref);
        idr_init(&resource->devices);
        INIT_LIST_HEAD(&resource->connections);
+       resource->write_ordering = WO_bdev_flush;
        list_add_tail_rcu(&resource->resources, &drbd_resources);
        mutex_init(&resource->conf_update);
        mutex_init(&resource->adm_mutex);
        spin_lock_init(&resource->req_lock);
+       drbd_debugfs_resource_add(resource);
        return resource;
 
 fail_free_name:
@@ -2593,7 +2611,7 @@ fail:
        return NULL;
 }
 
-/* caller must be under genl_lock() */
+/* caller must be under adm_mutex */
 struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
 {
        struct drbd_resource *resource;
@@ -2617,7 +2635,6 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
        INIT_LIST_HEAD(&connection->current_epoch->list);
        connection->epochs = 1;
        spin_lock_init(&connection->epoch_lock);
-       connection->write_ordering = WO_bdev_flush;
 
        connection->send.seen_any_write_yet = false;
        connection->send.current_epoch_nr = 0;
@@ -2652,6 +2669,7 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
 
        kref_get(&resource->kref);
        list_add_tail_rcu(&connection->connections, &resource->connections);
+       drbd_debugfs_connection_add(connection);
        return connection;
 
 fail_resource:
@@ -2680,6 +2698,7 @@ void drbd_destroy_connection(struct kref *kref)
        drbd_free_socket(&connection->data);
        kfree(connection->int_dig_in);
        kfree(connection->int_dig_vv);
+       memset(connection, 0xfc, sizeof(*connection));
        kfree(connection);
        kref_put(&resource->kref, drbd_destroy_resource);
 }
@@ -2694,7 +2713,6 @@ static int init_submitter(struct drbd_device *device)
                return -ENOMEM;
 
        INIT_WORK(&device->submit.worker, do_submit);
-       spin_lock_init(&device->submit.lock);
        INIT_LIST_HEAD(&device->submit.writes);
        return 0;
 }
@@ -2764,8 +2782,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
        blk_queue_merge_bvec(q, drbd_merge_bvec);
        q->queue_lock = &resource->req_lock;
 
-       device->md_io_page = alloc_page(GFP_KERNEL);
-       if (!device->md_io_page)
+       device->md_io.page = alloc_page(GFP_KERNEL);
+       if (!device->md_io.page)
                goto out_no_io_page;
 
        if (drbd_bm_init(device))
@@ -2794,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
        kref_get(&device->kref);
 
        INIT_LIST_HEAD(&device->peer_devices);
+       INIT_LIST_HEAD(&device->pending_bitmap_io);
        for_each_connection(connection, resource) {
                peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL);
                if (!peer_device)
@@ -2829,7 +2848,10 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
                for_each_peer_device(peer_device, device)
                        drbd_connected(peer_device);
        }
-
+       /* move to create_peer_device() */
+       for_each_peer_device(peer_device, device)
+               drbd_debugfs_peer_device_add(peer_device);
+       drbd_debugfs_device_add(device);
        return NO_ERROR;
 
 out_idr_remove_vol:
@@ -2853,7 +2875,7 @@ out_idr_remove_minor:
 out_no_minor_idr:
        drbd_bm_cleanup(device);
 out_no_bitmap:
-       __free_page(device->md_io_page);
+       __free_page(device->md_io.page);
 out_no_io_page:
        put_disk(disk);
 out_no_disk:
@@ -2868,8 +2890,13 @@ void drbd_delete_device(struct drbd_device *device)
 {
        struct drbd_resource *resource = device->resource;
        struct drbd_connection *connection;
+       struct drbd_peer_device *peer_device;
        int refs = 3;
 
+       /* move to free_peer_device() */
+       for_each_peer_device(peer_device, device)
+               drbd_debugfs_peer_device_cleanup(peer_device);
+       drbd_debugfs_device_cleanup(device);
        for_each_connection(connection, resource) {
                idr_remove(&connection->peer_devices, device->vnr);
                refs++;
@@ -2881,13 +2908,12 @@ void drbd_delete_device(struct drbd_device *device)
        kref_sub(&device->kref, refs, drbd_destroy_device);
 }
 
-int __init drbd_init(void)
+static int __init drbd_init(void)
 {
        int err;
 
        if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
-               printk(KERN_ERR
-                      "drbd: invalid minor_count (%d)\n", minor_count);
+               pr_err("invalid minor_count (%d)\n", minor_count);
 #ifdef MODULE
                return -EINVAL;
 #else
@@ -2897,14 +2923,11 @@ int __init drbd_init(void)
 
        err = register_blkdev(DRBD_MAJOR, "drbd");
        if (err) {
-               printk(KERN_ERR
-                      "drbd: unable to register block device major %d\n",
+               pr_err("unable to register block device major %d\n",
                       DRBD_MAJOR);
                return err;
        }
 
-       register_reboot_notifier(&drbd_notifier);
-
        /*
         * allocate all necessary structs
         */
@@ -2918,7 +2941,7 @@ int __init drbd_init(void)
 
        err = drbd_genl_register();
        if (err) {
-               printk(KERN_ERR "drbd: unable to register generic netlink family\n");
+               pr_err("unable to register generic netlink family\n");
                goto fail;
        }
 
@@ -2929,38 +2952,39 @@ int __init drbd_init(void)
        err = -ENOMEM;
        drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
        if (!drbd_proc) {
-               printk(KERN_ERR "drbd: unable to register proc file\n");
+               pr_err("unable to register proc file\n");
                goto fail;
        }
 
        retry.wq = create_singlethread_workqueue("drbd-reissue");
        if (!retry.wq) {
-               printk(KERN_ERR "drbd: unable to create retry workqueue\n");
+               pr_err("unable to create retry workqueue\n");
                goto fail;
        }
        INIT_WORK(&retry.worker, do_retry);
        spin_lock_init(&retry.lock);
        INIT_LIST_HEAD(&retry.writes);
 
-       printk(KERN_INFO "drbd: initialized. "
+       if (drbd_debugfs_init())
+               pr_notice("failed to initialize debugfs -- will not be available\n");
+
+       pr_info("initialized. "
               "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
               API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
-       printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
-       printk(KERN_INFO "drbd: registered as block device major %d\n",
-               DRBD_MAJOR);
-
+       pr_info("%s\n", drbd_buildtag());
+       pr_info("registered as block device major %d\n", DRBD_MAJOR);
        return 0; /* Success! */
 
 fail:
        drbd_cleanup();
        if (err == -ENOMEM)
-               printk(KERN_ERR "drbd: ran out of memory\n");
+               pr_err("ran out of memory\n");
        else
-               printk(KERN_ERR "drbd: initialization failure\n");
+               pr_err("initialization failure\n");
        return err;
 }
 
-void drbd_free_bc(struct drbd_backing_dev *ldev)
+void drbd_free_ldev(struct drbd_backing_dev *ldev)
 {
        if (ldev == NULL)
                return;
@@ -2972,24 +2996,29 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
        kfree(ldev);
 }
 
-void drbd_free_sock(struct drbd_connection *connection)
+static void drbd_free_one_sock(struct drbd_socket *ds)
 {
-       if (connection->data.socket) {
-               mutex_lock(&connection->data.mutex);
-               kernel_sock_shutdown(connection->data.socket, SHUT_RDWR);
-               sock_release(connection->data.socket);
-               connection->data.socket = NULL;
-               mutex_unlock(&connection->data.mutex);
-       }
-       if (connection->meta.socket) {
-               mutex_lock(&connection->meta.mutex);
-               kernel_sock_shutdown(connection->meta.socket, SHUT_RDWR);
-               sock_release(connection->meta.socket);
-               connection->meta.socket = NULL;
-               mutex_unlock(&connection->meta.mutex);
+       struct socket *s;
+       mutex_lock(&ds->mutex);
+       s = ds->socket;
+       ds->socket = NULL;
+       mutex_unlock(&ds->mutex);
+       if (s) {
+               /* so debugfs does not need to mutex_lock() */
+               synchronize_rcu();
+               kernel_sock_shutdown(s, SHUT_RDWR);
+               sock_release(s);
        }
 }
 
+void drbd_free_sock(struct drbd_connection *connection)
+{
+       if (connection->data.socket)
+               drbd_free_one_sock(&connection->data);
+       if (connection->meta.socket)
+               drbd_free_one_sock(&connection->meta);
+}
+
 /* meta data management */
 
 void conn_md_sync(struct drbd_connection *connection)
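drbd_free_one_sock() clears the socket pointer under the mutex and then calls synchronize_rcu() before shutting the socket down; once synchronize_rcu() returns, no reader that entered an RCU read-side section with the old pointer can still be running, so the debugfs code added by this series can inspect the socket locklessly. A hedged reader-side sketch:

        /* lockless reader paired with the teardown above (sketch) */
        struct socket *s;

        rcu_read_lock();
        s = connection->data.socket;
        if (s)
                seq_printf(m, "rcvbuf: %d\n", s->sk->sk_rcvbuf);
        rcu_read_unlock();      /* socket is not released while we hold this */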
@@ -3093,7 +3122,7 @@ void drbd_md_sync(struct drbd_device *device)
        if (!get_ldev_if_state(device, D_FAILED))
                return;
 
-       buffer = drbd_md_get_buffer(device);
+       buffer = drbd_md_get_buffer(device, __func__);
        if (!buffer)
                goto out;
 
@@ -3253,7 +3282,7 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
        if (device->state.disk != D_DISKLESS)
                return ERR_DISK_CONFIGURED;
 
-       buffer = drbd_md_get_buffer(device);
+       buffer = drbd_md_get_buffer(device, __func__);
        if (!buffer)
                return ERR_NOMEM;
 
@@ -3466,23 +3495,19 @@ void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local)
  *
  * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
  */
-int drbd_bmio_set_n_write(struct drbd_device *device)
+int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local)
 {
        int rv = -EIO;
 
-       if (get_ldev_if_state(device, D_ATTACHING)) {
-               drbd_md_set_flag(device, MDF_FULL_SYNC);
-               drbd_md_sync(device);
-               drbd_bm_set_all(device);
-
-               rv = drbd_bm_write(device);
+       drbd_md_set_flag(device, MDF_FULL_SYNC);
+       drbd_md_sync(device);
+       drbd_bm_set_all(device);
 
-               if (!rv) {
-                       drbd_md_clear_flag(device, MDF_FULL_SYNC);
-                       drbd_md_sync(device);
-               }
+       rv = drbd_bm_write(device);
 
-               put_ldev(device);
+       if (!rv) {
+               drbd_md_clear_flag(device, MDF_FULL_SYNC);
+               drbd_md_sync(device);
        }
 
        return rv;
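With the get_ldev_if_state()/put_ldev() pair hoisted out of the function bodies, holding a local-disk reference is now the caller's responsibility, which the added __must_hold(local) annotations let sparse check. The calling convention, sketched (and visible in the drbd_adm_invalidate* changes below):

        /* caller-side sketch: the reference now brackets the call */
        if (get_ldev(device)) {
                rv = drbd_bmio_set_n_write(device);
                put_ldev(device);
        }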
@@ -3494,18 +3519,11 @@ int drbd_bmio_set_n_write(struct drbd_device *device)
  *
  * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
  */
-int drbd_bmio_clear_n_write(struct drbd_device *device)
+int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local)
 {
-       int rv = -EIO;
-
        drbd_resume_al(device);
-       if (get_ldev_if_state(device, D_ATTACHING)) {
-               drbd_bm_clear_all(device);
-               rv = drbd_bm_write(device);
-               put_ldev(device);
-       }
-
-       return rv;
+       drbd_bm_clear_all(device);
+       return drbd_bm_write(device);
 }
 
 static int w_bitmap_io(struct drbd_work *w, int unused)
@@ -3537,61 +3555,6 @@ static int w_bitmap_io(struct drbd_work *w, int unused)
        return 0;
 }
 
-void drbd_ldev_destroy(struct drbd_device *device)
-{
-       lc_destroy(device->resync);
-       device->resync = NULL;
-       lc_destroy(device->act_log);
-       device->act_log = NULL;
-       __no_warn(local,
-               drbd_free_bc(device->ldev);
-               device->ldev = NULL;);
-
-       clear_bit(GO_DISKLESS, &device->flags);
-}
-
-static int w_go_diskless(struct drbd_work *w, int unused)
-{
-       struct drbd_device *device =
-               container_of(w, struct drbd_device, go_diskless);
-
-       D_ASSERT(device, device->state.disk == D_FAILED);
-       /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
-        * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
-        * the protected members anymore, though, so once put_ldev reaches zero
-        * again, it will be safe to free them. */
-
-       /* Try to write changed bitmap pages, read errors may have just
-        * set some bits outside the area covered by the activity log.
-        *
-        * If we have an IO error during the bitmap writeout,
-        * we will want a full sync next time, just in case.
-        * (Do we want a specific meta data flag for this?)
-        *
-        * If that does not make it to stable storage either,
-        * we cannot do anything about that anymore.
-        *
-        * We still need to check if both bitmap and ldev are present, we may
-        * end up here after a failed attach, before ldev was even assigned.
-        */
-       if (device->bitmap && device->ldev) {
-               /* An interrupted resync or similar is allowed to recounts bits
-                * while we detach.
-                * Any modifications would not be expected anymore, though.
-                */
-               if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
-                                       "detach", BM_LOCKED_TEST_ALLOWED)) {
-                       if (test_bit(WAS_READ_ERROR, &device->flags)) {
-                               drbd_md_set_flag(device, MDF_FULL_SYNC);
-                               drbd_md_sync(device);
-                       }
-               }
-       }
-
-       drbd_force_state(device, NS(disk, D_DISKLESS));
-       return 0;
-}
-
 /**
  * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
  * @device:    DRBD device.
@@ -3603,6 +3566,9 @@ static int w_go_diskless(struct drbd_work *w, int unused)
  * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
  * called from worker context. It MUST NOT be used while a previous such
  * work is still pending!
+ *
+ * Its worker function brackets the call to io_fn() with get_ldev() and
+ * put_ldev().
  */
 void drbd_queue_bitmap_io(struct drbd_device *device,
                          int (*io_fn)(struct drbd_device *),
@@ -3685,25 +3651,7 @@ int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
 static void md_sync_timer_fn(unsigned long data)
 {
        struct drbd_device *device = (struct drbd_device *) data;
-
-       /* must not double-queue! */
-       if (list_empty(&device->md_sync_work.list))
-               drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
-                                     &device->md_sync_work);
-}
-
-static int w_md_sync(struct drbd_work *w, int unused)
-{
-       struct drbd_device *device =
-               container_of(w, struct drbd_device, md_sync_work);
-
-       drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
-#ifdef DEBUG
-       drbd_warn(device, "last md_mark_dirty: %s:%u\n",
-               device->last_md_mark_dirty.func, device->last_md_mark_dirty.line);
-#endif
-       drbd_md_sync(device);
-       return 0;
+       drbd_device_post_work(device, MD_SYNC);
 }
 
 const char *cmdname(enum drbd_packet cmd)
index 3f2e1673808053a4de70077735a67723f9955b64..1cd47df44bdaf57d74c1ff3adede40835f1b1f29 100644 (file)
@@ -23,6 +23,8 @@
 
  */
 
+#define pr_fmt(fmt)    KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/drbd.h>
 #include <linux/in.h>
@@ -85,7 +87,7 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
 {
        genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
        if (genlmsg_reply(skb, info))
-               printk(KERN_ERR "drbd: error sending genl reply\n");
+               pr_err("error sending genl reply\n");
 }
 
 /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
@@ -558,8 +560,10 @@ void conn_try_outdate_peer_async(struct drbd_connection *connection)
 }
 
 enum drbd_state_rv
-drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
+drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int force)
 {
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
        const int max_tries = 4;
        enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
        struct net_conf *nc;
@@ -607,7 +611,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
                    device->state.disk == D_CONSISTENT && mask.pdsk == 0) {
                        D_ASSERT(device, device->state.pdsk == D_UNKNOWN);
 
-                       if (conn_try_outdate_peer(first_peer_device(device)->connection)) {
+                       if (conn_try_outdate_peer(connection)) {
                                val.disk = D_UP_TO_DATE;
                                mask.disk = D_MASK;
                        }
@@ -617,7 +621,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
                if (rv == SS_NOTHING_TO_DO)
                        goto out;
                if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
-                       if (!conn_try_outdate_peer(first_peer_device(device)->connection) && force) {
+                       if (!conn_try_outdate_peer(connection) && force) {
                                drbd_warn(device, "Forced into split brain situation!\n");
                                mask.pdsk = D_MASK;
                                val.pdsk  = D_OUTDATED;
@@ -630,7 +634,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
                           retry at most once more in this case. */
                        int timeo;
                        rcu_read_lock();
-                       nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+                       nc = rcu_dereference(connection->net_conf);
                        timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
                        rcu_read_unlock();
                        schedule_timeout_interruptible(timeo);
@@ -659,19 +663,17 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
        /* FIXME also wait for all pending P_BARRIER_ACK? */
 
        if (new_role == R_SECONDARY) {
-               set_disk_ro(device->vdisk, true);
                if (get_ldev(device)) {
                        device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
                        put_ldev(device);
                }
        } else {
-               /* Called from drbd_adm_set_role only.
-                * We are still holding the conf_update mutex. */
-               nc = first_peer_device(device)->connection->net_conf;
+               mutex_lock(&device->resource->conf_update);
+               nc = connection->net_conf;
                if (nc)
                        nc->discard_my_data = 0; /* without copy; single bit op is atomic */
+               mutex_unlock(&device->resource->conf_update);
 
-               set_disk_ro(device->vdisk, false);
                if (get_ldev(device)) {
                        if (((device->state.conn < C_CONNECTED ||
                               device->state.pdsk <= D_FAILED)
@@ -689,12 +691,12 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
        if (device->state.conn >= C_WF_REPORT_PARAMS) {
                /* if this was forced, we should consider sync */
                if (forced)
-                       drbd_send_uuids(first_peer_device(device));
-               drbd_send_current_state(first_peer_device(device));
+                       drbd_send_uuids(peer_device);
+               drbd_send_current_state(peer_device);
        }
 
        drbd_md_sync(device);
-
+       set_disk_ro(device->vdisk, new_role == R_SECONDARY);
        kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
 out:
        mutex_unlock(device->state_mutex);
@@ -891,7 +893,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
         * still lock the act_log to not trigger ASSERTs there.
         */
        drbd_suspend_io(device);
-       buffer = drbd_md_get_buffer(device); /* Lock meta-data IO */
+       buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */
        if (!buffer) {
                drbd_resume_io(device);
                return DS_ERROR;
@@ -971,6 +973,10 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
        if (la_size_changed || md_moved || rs) {
                u32 prev_flags;
 
+               /* We do some synchronous IO below, which may take some time.
+                * Clear the timer, to avoid scary "timer expired!" messages;
+                * the "Superblock" is written out at least twice below anyway. */
+               del_timer(&device->md_sync_timer);
                drbd_al_shrink(device); /* All extents inactive. */
 
                prev_flags = md->flags;
@@ -1116,15 +1122,16 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
        return 0;
 }
 
-static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_bio_size)
+static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
+                                  unsigned int max_bio_size)
 {
        struct request_queue * const q = device->rq_queue;
        unsigned int max_hw_sectors = max_bio_size >> 9;
        unsigned int max_segments = 0;
        struct request_queue *b = NULL;
 
-       if (get_ldev_if_state(device, D_ATTACHING)) {
-               b = device->ldev->backing_bdev->bd_disk->queue;
+       if (bdev) {
+               b = bdev->backing_bdev->bd_disk->queue;
 
                max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
                rcu_read_lock();
@@ -1169,11 +1176,10 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_
                                 b->backing_dev_info.ra_pages);
                        q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
                }
-               put_ldev(device);
        }
 }
 
-void drbd_reconsider_max_bio_size(struct drbd_device *device)
+void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev)
 {
        unsigned int now, new, local, peer;
 
@@ -1181,10 +1187,9 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device)
        local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
        peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
 
-       if (get_ldev_if_state(device, D_ATTACHING)) {
-               local = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
+       if (bdev) {
+               local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9;
                device->local_max_bio_size = local;
-               put_ldev(device);
        }
        local = min(local, DRBD_MAX_BIO_SIZE);
 
@@ -1217,7 +1222,7 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device)
        if (new != now)
                drbd_info(device, "max BIO size = %u\n", new);
 
-       drbd_setup_queue_param(device, new);
+       drbd_setup_queue_param(device, bdev, new);
 }
 
 /* Starts the worker thread */
@@ -1299,6 +1304,13 @@ static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
        return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
 }
 
+static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b)
+{
+       return  a->disk_barrier != b->disk_barrier ||
+               a->disk_flushes != b->disk_flushes ||
+               a->disk_drain != b->disk_drain;
+}
+
 int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 {
        struct drbd_config_context adm_ctx;
@@ -1405,7 +1417,8 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
        else
                set_bit(MD_NO_FUA, &device->flags);
 
-       drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
+       if (write_ordering_changed(old_disk_conf, new_disk_conf))
+               drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush);
 
        drbd_md_sync(device);
 
@@ -1440,6 +1453,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 {
        struct drbd_config_context adm_ctx;
        struct drbd_device *device;
+       struct drbd_peer_device *peer_device;
+       struct drbd_connection *connection;
        int err;
        enum drbd_ret_code retcode;
        enum determine_dev_size dd;
@@ -1462,7 +1477,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 
        device = adm_ctx.device;
        mutex_lock(&adm_ctx.resource->adm_mutex);
-       conn_reconfig_start(first_peer_device(device)->connection);
+       peer_device = first_peer_device(device);
+       connection = peer_device ? peer_device->connection : NULL;
+       conn_reconfig_start(connection);
 
        /* if you want to reconfigure, please tear down first */
        if (device->state.disk > D_DISKLESS) {
@@ -1473,7 +1490,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
         * drbd_ldev_destroy is done already, we may end up here very fast,
         * e.g. if someone calls attach from the on-io-error handler,
         * to realize a "hot spare" feature (not that I'd recommend that) */
-       wait_event(device->misc_wait, !atomic_read(&device->local_cnt));
+       wait_event(device->misc_wait, !test_bit(GOING_DISKLESS, &device->flags));
 
        /* make sure there is no leftover from previous force-detach attempts */
        clear_bit(FORCE_DETACH, &device->flags);
@@ -1529,7 +1546,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
                goto fail;
 
        rcu_read_lock();
-       nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+       nc = rcu_dereference(connection->net_conf);
        if (nc) {
                if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
                        rcu_read_unlock();
@@ -1649,7 +1666,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
         */
        wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device));
        /* and for any other previously queued work */
-       drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
+       drbd_flush_workqueue(&connection->sender_work);
 
        rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE);
        retcode = rv;  /* FIXME: Type mismatch. */
@@ -1710,7 +1727,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        new_disk_conf = NULL;
        new_plan = NULL;
 
-       drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
+       drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush);
 
        if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
                set_bit(CRASHED_PRIMARY, &device->flags);
@@ -1726,7 +1743,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        device->read_cnt = 0;
        device->writ_cnt = 0;
 
-       drbd_reconsider_max_bio_size(device);
+       drbd_reconsider_max_bio_size(device, device->ldev);
 
        /* If I am currently not R_PRIMARY,
         * but meta data primary indicator is set,
@@ -1845,7 +1862,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 
        kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
        put_ldev(device);
-       conn_reconfig_done(first_peer_device(device)->connection);
+       conn_reconfig_done(connection);
        mutex_unlock(&adm_ctx.resource->adm_mutex);
        drbd_adm_finish(&adm_ctx, info, retcode);
        return 0;
@@ -1856,7 +1873,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        drbd_force_state(device, NS(disk, D_DISKLESS));
        drbd_md_sync(device);
  fail:
-       conn_reconfig_done(first_peer_device(device)->connection);
+       conn_reconfig_done(connection);
        if (nbc) {
                if (nbc->backing_bdev)
                        blkdev_put(nbc->backing_bdev,
@@ -1888,7 +1905,7 @@ static int adm_detach(struct drbd_device *device, int force)
        }
 
        drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
-       drbd_md_get_buffer(device); /* make sure there is no in-flight meta-data IO */
+       drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */
        retcode = drbd_request_state(device, NS(disk, D_FAILED));
        drbd_md_put_buffer(device);
        /* D_FAILED will transition to DISKLESS. */
@@ -2654,8 +2671,13 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
        if (retcode != NO_ERROR)
                goto out;
 
-       mutex_lock(&adm_ctx.resource->adm_mutex);
        device = adm_ctx.device;
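+       /* Invalidating the local disk only makes sense while we have one:
+        * grab the ldev reference first (dropped via put_ldev() below),
+        * before taking the adm_mutex. */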
+       if (!get_ldev(device)) {
+               retcode = ERR_NO_DISK;
+               goto out;
+       }
+
+       mutex_lock(&adm_ctx.resource->adm_mutex);
 
        /* If there is still bitmap IO pending, probably because of a previous
         * resync just being finished, wait for it before requesting a new resync.
@@ -2679,6 +2701,7 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
                retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T));
        drbd_resume_io(device);
        mutex_unlock(&adm_ctx.resource->adm_mutex);
+       put_ldev(device);
 out:
        drbd_adm_finish(&adm_ctx, info, retcode);
        return 0;
@@ -2704,7 +2727,7 @@ out:
        return 0;
 }
 
-static int drbd_bmio_set_susp_al(struct drbd_device *device)
+static int drbd_bmio_set_susp_al(struct drbd_device *device) __must_hold(local)
 {
        int rv;
 
@@ -2725,8 +2748,13 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
        if (retcode != NO_ERROR)
                goto out;
 
-       mutex_lock(&adm_ctx.resource->adm_mutex);
        device = adm_ctx.device;
+       if (!get_ldev(device)) {
+               retcode = ERR_NO_DISK;
+               goto out;
+       }
+
+       mutex_lock(&adm_ctx.resource->adm_mutex);
 
        /* If there is still bitmap IO pending, probably because of a previous
         * resync just being finished, wait for it before requesting a new resync.
@@ -2753,6 +2781,7 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
                retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S));
        drbd_resume_io(device);
        mutex_unlock(&adm_ctx.resource->adm_mutex);
+       put_ldev(device);
 out:
        drbd_adm_finish(&adm_ctx, info, retcode);
        return 0;
@@ -2892,7 +2921,7 @@ static struct drbd_connection *the_only_connection(struct drbd_resource *resourc
        return list_first_entry(&resource->connections, struct drbd_connection, connections);
 }
 
-int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
+static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
                const struct sib_info *sib)
 {
        struct drbd_resource *resource = device->resource;
@@ -3622,13 +3651,6 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
        unsigned seq;
        int err = -ENOMEM;
 
-       if (sib->sib_reason == SIB_SYNC_PROGRESS) {
-               if (time_after(jiffies, device->rs_last_bcast + HZ))
-                       device->rs_last_bcast = jiffies;
-               else
-                       return;
-       }
-
        seq = atomic_inc_return(&drbd_genl_seq);
        msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
        if (!msg)
index 89736bdbbc7044aedaaacbd5dc9c858ca3933596..06e6147c76013602d2591cab96bb239afb073d17 100644 (file)
@@ -60,20 +60,65 @@ static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
                seq_printf(seq, "%ld", v);
 }
 
+static void drbd_get_syncer_progress(struct drbd_device *device,
+               union drbd_dev_state state, unsigned long *rs_total,
+               unsigned long *bits_left, unsigned int *per_mil_done)
+{
+       /* this typecheck breaks the build if we ever change the type of
+        * rs_total, e.g. to support more than (1<<32) bits on a 32bit arch. */
+       typecheck(unsigned long, device->rs_total);
+       *rs_total = device->rs_total;
+
+       /* note: both rs_total and rs_left are in bits, i.e. in
+        * units of BM_BLOCK_SIZE.
+        * for the percentage, we don't care. */
+
+       if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
+               *bits_left = device->ov_left;
+       else
+               *bits_left = drbd_bm_total_weight(device) - device->rs_failed;
+       /* >> 10 to prevent overflow,
+        * +1 to prevent division by zero */
+       if (*bits_left > *rs_total) {
+               /* D'oh. Maybe a logic bug somewhere.  More likely just a race
+                * between state change and reset of rs_total.
+                */
+               *bits_left = *rs_total;
+               *per_mil_done = *rs_total ? 0 : 1000;
+       } else {
+               /* Make sure the division happens in long context.
+                * We allow up to one petabyte storage right now,
+                * at a granularity of 4k per bit that is 2**38 bits.
+                * After shift right and multiplication by 1000,
+                * this should still fit easily into a 32bit long,
+                * so we don't need a 64bit division on 32bit arch.
+                * Note: currently we don't support such large bitmaps on 32bit
+                * arch anyways, but no harm done to be prepared for it here.
+                */
+               unsigned int shift = *rs_total > UINT_MAX ? 16 : 10;
+               unsigned long left = *bits_left >> shift;
+               unsigned long total = 1UL + (*rs_total >> shift);
+               unsigned long tmp = 1000UL - left * 1000UL/total;
+               *per_mil_done = tmp;
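+               /* e.g. (illustrative): rs_total = 2^30 bits (4 TiB at 4 KiB
+                * per bit) gives shift 10, total <= 2^20 + 1, and
+                * left * 1000 < 2^30, so the math fits a 32bit long. */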
+       }
+}
+
+
 /*lge
  * progress bars shamelessly adapted from drivers/md/md.c
  * output looks like
  *     [=====>..............] 33.5% (23456/123456)
  *     finish: 2:20:20 speed: 6,345 (6,456) K/sec
  */
-static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq)
+static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq,
+               union drbd_dev_state state)
 {
-       unsigned long db, dt, dbdt, rt, rs_left;
+       unsigned long db, dt, dbdt, rt, rs_total, rs_left;
        unsigned int res;
        int i, x, y;
        int stalled = 0;
 
-       drbd_get_syncer_progress(device, &rs_left, &res);
+       drbd_get_syncer_progress(device, state, &rs_total, &rs_left, &res);
 
        x = res/50;
        y = 20-x;
@@ -85,21 +130,21 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
                seq_printf(seq, ".");
        seq_printf(seq, "] ");
 
-       if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+       if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
                seq_printf(seq, "verified:");
        else
                seq_printf(seq, "sync'ed:");
        seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
 
        /* if more than a few GB, display in MB */
-       if (device->rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
+       if (rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
                seq_printf(seq, "(%lu/%lu)M",
                            (unsigned long) Bit2KB(rs_left >> 10),
-                           (unsigned long) Bit2KB(device->rs_total >> 10));
+                           (unsigned long) Bit2KB(rs_total >> 10));
        else
                seq_printf(seq, "(%lu/%lu)K\n\t",
                            (unsigned long) Bit2KB(rs_left),
-                           (unsigned long) Bit2KB(device->rs_total));
+                           (unsigned long) Bit2KB(rs_total));
 
        /* see drivers/md/md.c
         * We do not want to overflow, so the order of operands and
@@ -150,13 +195,13 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
        dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
        if (dt == 0)
                dt = 1;
-       db = device->rs_total - rs_left;
+       db = rs_total - rs_left;
        dbdt = Bit2KB(db/dt);
        seq_printf_with_thousands_grouping(seq, dbdt);
        seq_printf(seq, ")");
 
-       if (device->state.conn == C_SYNC_TARGET ||
-           device->state.conn == C_VERIFY_S) {
+       if (state.conn == C_SYNC_TARGET ||
+           state.conn == C_VERIFY_S) {
                seq_printf(seq, " want: ");
                seq_printf_with_thousands_grouping(seq, device->c_sync_rate);
        }
@@ -168,8 +213,8 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
                unsigned long bm_bits = drbd_bm_bits(device);
                unsigned long bit_pos;
                unsigned long long stop_sector = 0;
-               if (device->state.conn == C_VERIFY_S ||
-                   device->state.conn == C_VERIFY_T) {
+               if (state.conn == C_VERIFY_S ||
+                   state.conn == C_VERIFY_T) {
                        bit_pos = bm_bits - device->ov_left;
                        if (verify_can_do_stop_sector(device))
                                stop_sector = device->ov_stop_sector;
@@ -188,22 +233,13 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
        }
 }
 
-static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
-{
-       struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
-
-       seq_printf(seq, "%5d %s %s\n", bme->rs_left,
-                  bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------",
-                  bme->flags & BME_LOCKED ? "LOCKED" : "------"
-                  );
-}
-
 static int drbd_seq_show(struct seq_file *seq, void *v)
 {
        int i, prev_i = -1;
        const char *sn;
        struct drbd_device *device;
        struct net_conf *nc;
+       union drbd_dev_state state;
        char wp;
 
        static char write_ordering_chars[] = {
@@ -241,11 +277,12 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
                        seq_printf(seq, "\n");
                prev_i = i;
 
-               sn = drbd_conn_str(device->state.conn);
+               state = device->state;
+               sn = drbd_conn_str(state.conn);
 
-               if (device->state.conn == C_STANDALONE &&
-                   device->state.disk == D_DISKLESS &&
-                   device->state.role == R_SECONDARY) {
+               if (state.conn == C_STANDALONE &&
+                   state.disk == D_DISKLESS &&
+                   state.role == R_SECONDARY) {
                        seq_printf(seq, "%2d: cs:Unconfigured\n", i);
                } else {
                        /* reset device->congestion_reason */
@@ -258,15 +295,15 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
                           "    ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
                           "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
                           i, sn,
-                          drbd_role_str(device->state.role),
-                          drbd_role_str(device->state.peer),
-                          drbd_disk_str(device->state.disk),
-                          drbd_disk_str(device->state.pdsk),
+                          drbd_role_str(state.role),
+                          drbd_role_str(state.peer),
+                          drbd_disk_str(state.disk),
+                          drbd_disk_str(state.pdsk),
                           wp,
                           drbd_suspended(device) ? 's' : 'r',
-                          device->state.aftr_isp ? 'a' : '-',
-                          device->state.peer_isp ? 'p' : '-',
-                          device->state.user_isp ? 'u' : '-',
+                          state.aftr_isp ? 'a' : '-',
+                          state.peer_isp ? 'p' : '-',
+                          state.user_isp ? 'u' : '-',
                           device->congestion_reason ?: '-',
                           test_bit(AL_SUSPENDED, &device->flags) ? 's' : '-',
                           device->send_cnt/2,
@@ -281,17 +318,17 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
                           atomic_read(&device->unacked_cnt),
                           atomic_read(&device->ap_bio_cnt),
                           first_peer_device(device)->connection->epochs,
-                          write_ordering_chars[first_peer_device(device)->connection->write_ordering]
+                          write_ordering_chars[device->resource->write_ordering]
                        );
                        seq_printf(seq, " oos:%llu\n",
                                   Bit2KB((unsigned long long)
                                           drbd_bm_total_weight(device)));
                }
-               if (device->state.conn == C_SYNC_SOURCE ||
-                   device->state.conn == C_SYNC_TARGET ||
-                   device->state.conn == C_VERIFY_S ||
-                   device->state.conn == C_VERIFY_T)
-                       drbd_syncer_progress(device, seq);
+               if (state.conn == C_SYNC_SOURCE ||
+                   state.conn == C_SYNC_TARGET ||
+                   state.conn == C_VERIFY_S ||
+                   state.conn == C_VERIFY_T)
+                       drbd_syncer_progress(device, seq, state);
 
                if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) {
                        lc_seq_printf_stats(seq, device->resync);
@@ -299,12 +336,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
                        put_ldev(device);
                }
 
-               if (proc_details >= 2) {
-                       if (device->resync) {
-                               lc_seq_dump_details(seq, device->resync, "rs_left",
-                                       resync_dump_detail);
-                       }
-               }
+               if (proc_details >= 2)
+                       seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt));
        }
        rcu_read_unlock();
 
@@ -316,7 +349,7 @@ static int drbd_proc_open(struct inode *inode, struct file *file)
        int err;
 
        if (try_module_get(THIS_MODULE)) {
-               err = single_open(file, drbd_seq_show, PDE_DATA(inode));
+               err = single_open(file, drbd_seq_show, NULL);
                if (err)
                        module_put(THIS_MODULE);
                return err;
index 5b17ec88ea058e766071e66eeadf3d8fca3f4940..9342b8da73ab517620dda3b38f9852e6c1219853 100644 (file)
@@ -362,17 +362,14 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
                        goto fail;
        }
 
+       memset(peer_req, 0, sizeof(*peer_req));
+       INIT_LIST_HEAD(&peer_req->w.list);
        drbd_clear_interval(&peer_req->i);
        peer_req->i.size = data_size;
        peer_req->i.sector = sector;
-       peer_req->i.local = false;
-       peer_req->i.waiting = false;
-
-       peer_req->epoch = NULL;
+       peer_req->submit_jif = jiffies;
        peer_req->peer_device = peer_device;
        peer_req->pages = page;
-       atomic_set(&peer_req->pending_bios, 0);
-       peer_req->flags = 0;
        /*
         * The block_id is opaque to the receiver.  It is not endianness
         * converted, and sent back to the sender unchanged.
@@ -389,11 +386,16 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
 void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
                       int is_net)
 {
+       might_sleep();
        if (peer_req->flags & EE_HAS_DIGEST)
                kfree(peer_req->digest);
        drbd_free_pages(device, peer_req->pages, is_net);
        D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
        D_ASSERT(device, drbd_interval_empty(&peer_req->i));
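+       /* The activity log reference should already have been put by the
+        * completion path; if it was not, expect() warns and we put it here
+        * rather than leak it. */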
+       if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
+               peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
+               drbd_al_complete_io(device, &peer_req->i);
+       }
        mempool_free(peer_req, drbd_ee_mempool);
 }
 
@@ -791,8 +793,18 @@ static int receive_first_packet(struct drbd_connection *connection, struct socke
 {
        unsigned int header_size = drbd_header_size(connection);
        struct packet_info pi;
+       struct net_conf *nc;
        int err;
 
+       rcu_read_lock();
+       nc = rcu_dereference(connection->net_conf);
+       if (!nc) {
+               rcu_read_unlock();
+               return -EIO;
+       }
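+       /* ping_timeo is configured in tenths of a second: allow four ping timeouts */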
+       sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
+       rcu_read_unlock();
+
        err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
        if (err != header_size) {
                if (err >= 0)
@@ -809,7 +821,7 @@ static int receive_first_packet(struct drbd_connection *connection, struct socke
  * drbd_socket_okay() - Free the socket if its connection is not okay
  * @sock:      pointer to the pointer to the socket.
  */
-static int drbd_socket_okay(struct socket **sock)
+static bool drbd_socket_okay(struct socket **sock)
 {
        int rr;
        char tb[4];
@@ -827,6 +839,30 @@ static int drbd_socket_okay(struct socket **sock)
                return false;
        }
 }
+
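+/* After both sockets have been set up, wait out the configured settle time
+ * (sock_check_timeo, defaulting to ping_timeo, in tenths of a second), then
+ * verify that the data and the meta-data socket are both still usable. */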
+static bool connection_established(struct drbd_connection *connection,
+                                  struct socket **sock1,
+                                  struct socket **sock2)
+{
+       struct net_conf *nc;
+       int timeout;
+       bool ok;
+
+       if (!*sock1 || !*sock2)
+               return false;
+
+       rcu_read_lock();
+       nc = rcu_dereference(connection->net_conf);
+       timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
+       rcu_read_unlock();
+       schedule_timeout_interruptible(timeout);
+
+       ok = drbd_socket_okay(sock1);
+       ok = drbd_socket_okay(sock2) && ok;
+
+       return ok;
+}
+
 /* Gets called if a connection is established, or if a new minor gets created
    in a connection */
 int drbd_connected(struct drbd_peer_device *peer_device)
@@ -868,8 +904,8 @@ static int conn_connect(struct drbd_connection *connection)
        struct drbd_socket sock, msock;
        struct drbd_peer_device *peer_device;
        struct net_conf *nc;
-       int vnr, timeout, h, ok;
-       bool discard_my_data;
+       int vnr, timeout, h;
+       bool discard_my_data, ok;
        enum drbd_state_rv rv;
        struct accept_wait_data ad = {
                .connection = connection,
@@ -913,17 +949,8 @@ static int conn_connect(struct drbd_connection *connection)
                        }
                }
 
-               if (sock.socket && msock.socket) {
-                       rcu_read_lock();
-                       nc = rcu_dereference(connection->net_conf);
-                       timeout = nc->ping_timeo * HZ / 10;
-                       rcu_read_unlock();
-                       schedule_timeout_interruptible(timeout);
-                       ok = drbd_socket_okay(&sock.socket);
-                       ok = drbd_socket_okay(&msock.socket) && ok;
-                       if (ok)
-                               break;
-               }
+               if (connection_established(connection, &sock.socket, &msock.socket))
+                       break;
 
 retry:
                s = drbd_wait_for_connect(connection, &ad);
@@ -969,8 +996,7 @@ randomize:
                                goto out_release_sockets;
                }
 
-               ok = drbd_socket_okay(&sock.socket);
-               ok = drbd_socket_okay(&msock.socket) && ok;
+               ok = connection_established(connection, &sock.socket, &msock.socket);
        } while (!ok);
 
        if (ad.s_listen)
@@ -1151,7 +1177,7 @@ static void drbd_flush(struct drbd_connection *connection)
        struct drbd_peer_device *peer_device;
        int vnr;
 
-       if (connection->write_ordering >= WO_bdev_flush) {
+       if (connection->resource->write_ordering >= WO_bdev_flush) {
                rcu_read_lock();
                idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
                        struct drbd_device *device = peer_device->device;
@@ -1161,14 +1187,22 @@ static void drbd_flush(struct drbd_connection *connection)
                        kref_get(&device->kref);
                        rcu_read_unlock();
 
+                       /* Right now, we have only this one synchronous code path
+                        * for flushes between request epochs.
+                        * We may want to make those asynchronous,
+                        * or at least parallelize the flushes to the volume devices.
+                        */
+                       device->flush_jif = jiffies;
+                       set_bit(FLUSH_PENDING, &device->flags);
                        rv = blkdev_issue_flush(device->ldev->backing_bdev,
                                        GFP_NOIO, NULL);
+                       clear_bit(FLUSH_PENDING, &device->flags);
                        if (rv) {
                                drbd_info(device, "local disk flush failed with status %d\n", rv);
                                /* would rather check on EOPNOTSUPP, but that is not reliable.
                                 * don't try again for ANY return value != 0
                                 * if (rv == -EOPNOTSUPP) */
-                               drbd_bump_write_ordering(connection, WO_drain_io);
+                               drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io);
                        }
                        put_ldev(device);
                        kref_put(&device->kref, drbd_destroy_device);
@@ -1257,15 +1291,30 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connectio
        return rv;
 }
 
+static enum write_ordering_e
+max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
+{
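+       /* called inside rcu_read_lock(): we rcu_dereference() the disk_conf.
+        * Clamp the requested write ordering to what this backing device's
+        * configuration allows (disk_flushes, disk_drain). */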
+       struct disk_conf *dc;
+
+       dc = rcu_dereference(bdev->disk_conf);
+
+       if (wo == WO_bdev_flush && !dc->disk_flushes)
+               wo = WO_drain_io;
+       if (wo == WO_drain_io && !dc->disk_drain)
+               wo = WO_none;
+
+       return wo;
+}
+
 /**
  * drbd_bump_write_ordering() - Fall back to another write ordering method
  * @resource:  DRBD resource.
  * @bdev:      Backing device whose configuration to also honor; may be NULL.
  * @wo:                Write ordering method to try.
  */
-void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo)
+void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
+                             enum write_ordering_e wo)
 {
-       struct disk_conf *dc;
-       struct drbd_peer_device *peer_device;
+       struct drbd_device *device;
        enum write_ordering_e pwo;
        int vnr;
        static char *write_ordering_str[] = {
@@ -1274,26 +1323,27 @@ void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ord
                [WO_bdev_flush] = "flush",
        };
 
-       pwo = connection->write_ordering;
-       wo = min(pwo, wo);
+       pwo = resource->write_ordering;
+       if (wo != WO_bdev_flush)
+               wo = min(pwo, wo);
        rcu_read_lock();
-       idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-               struct drbd_device *device = peer_device->device;
+       idr_for_each_entry(&resource->devices, device, vnr) {
+               if (get_ldev(device)) {
+                       wo = max_allowed_wo(device->ldev, wo);
+                       if (device->ldev == bdev)
+                               bdev = NULL;
+                       put_ldev(device);
+               }
+       }
 
-               if (!get_ldev_if_state(device, D_ATTACHING))
-                       continue;
-               dc = rcu_dereference(device->ldev->disk_conf);
+       if (bdev)
+               wo = max_allowed_wo(bdev, wo);
 
-               if (wo == WO_bdev_flush && !dc->disk_flushes)
-                       wo = WO_drain_io;
-               if (wo == WO_drain_io && !dc->disk_drain)
-                       wo = WO_none;
-               put_ldev(device);
-       }
        rcu_read_unlock();
-       connection->write_ordering = wo;
-       if (pwo != connection->write_ordering || wo == WO_bdev_flush)
-               drbd_info(connection, "Method to ensure write ordering: %s\n", write_ordering_str[connection->write_ordering]);
+
+       resource->write_ordering = wo;
+       if (pwo != resource->write_ordering || wo == WO_bdev_flush)
+               drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
 }
 
 /**
@@ -1330,6 +1380,13 @@ int drbd_submit_peer_request(struct drbd_device *device,
                /* wait for all pending IO completions, before we start
                 * zeroing things out. */
                conn_wait_active_ee_empty(first_peer_device(device)->connection);
+               /* add it to the active list now,
+                * so we can find it to present it in debugfs */
+               peer_req->submit_jif = jiffies;
+               peer_req->flags |= EE_SUBMITTED;
+               spin_lock_irq(&device->resource->req_lock);
+               list_add_tail(&peer_req->w.list, &device->active_ee);
+               spin_unlock_irq(&device->resource->req_lock);
                if (blkdev_issue_zeroout(device->ldev->backing_bdev,
                        sector, ds >> 9, GFP_NOIO))
                        peer_req->flags |= EE_WAS_ERROR;
@@ -1398,6 +1455,9 @@ submit:
        D_ASSERT(device, page == NULL);
 
        atomic_set(&peer_req->pending_bios, n_bios);
+       /* for debugfs: update timestamp, mark as submitted */
+       peer_req->submit_jif = jiffies;
+       peer_req->flags |= EE_SUBMITTED;
        do {
                bio = bios;
                bios = bios->bi_next;
@@ -1471,7 +1531,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
         * R_PRIMARY crashes now.
         * Therefore we must send the barrier_ack after the barrier request was
         * completed. */
-       switch (connection->write_ordering) {
+       switch (connection->resource->write_ordering) {
        case WO_none:
                if (rv == FE_RECYCLED)
                        return 0;
@@ -1498,7 +1558,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
 
                return 0;
        default:
-               drbd_err(connection, "Strangeness in connection->write_ordering %d\n", connection->write_ordering);
+               drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
+                        connection->resource->write_ordering);
                return -EIO;
        }
 
@@ -1531,7 +1592,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
        struct drbd_peer_request *peer_req;
        struct page *page;
        int dgs, ds, err;
-       int data_size = pi->size;
+       unsigned int data_size = pi->size;
        void *dig_in = peer_device->connection->int_dig_in;
        void *dig_vv = peer_device->connection->int_dig_vv;
        unsigned long *data;
@@ -1578,6 +1639,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
        if (!peer_req)
                return NULL;
 
+       peer_req->flags |= EE_WRITE;
        if (trim)
                return peer_req;
 
@@ -1734,9 +1796,10 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
         * respective _drbd_clear_done_ee */
 
        peer_req->w.cb = e_end_resync_block;
+       peer_req->submit_jif = jiffies;
 
        spin_lock_irq(&device->resource->req_lock);
-       list_add(&peer_req->w.list, &device->sync_ee);
+       list_add_tail(&peer_req->w.list, &device->sync_ee);
        spin_unlock_irq(&device->resource->req_lock);
 
        atomic_add(pi->size >> 9, &device->rs_sect_ev);
@@ -1889,6 +1952,7 @@ static int e_end_block(struct drbd_work *w, int cancel)
                }
                dec_unacked(device);
        }
+
        /* we delete from the conflict detection hash _after_ we sent out the
         * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
        if (peer_req->flags & EE_IN_INTERVAL_TREE) {
@@ -2115,6 +2179,8 @@ static int handle_write_conflicts(struct drbd_device *device,
        drbd_for_each_overlap(i, &device->write_requests, sector, size) {
                if (i == &peer_req->i)
                        continue;
+               if (i->completed)
+                       continue;
 
                if (!i->local) {
                        /*
@@ -2147,7 +2213,6 @@ static int handle_write_conflicts(struct drbd_device *device,
                                          (unsigned long long)sector, size,
                                          superseded ? "local" : "remote");
 
-                       inc_unacked(device);
                        peer_req->w.cb = superseded ? e_send_superseded :
                                                   e_send_retry_write;
                        list_add_tail(&peer_req->w.list, &device->done_ee);
@@ -2206,6 +2271,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 {
        struct drbd_peer_device *peer_device;
        struct drbd_device *device;
+       struct net_conf *nc;
        sector_t sector;
        struct drbd_peer_request *peer_req;
        struct p_data *p = pi->data;
@@ -2245,6 +2311,8 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
        }
 
        peer_req->w.cb = e_end_block;
+       peer_req->submit_jif = jiffies;
+       peer_req->flags |= EE_APPLICATION;
 
        dp_flags = be32_to_cpu(p->dp_flags);
        rw |= wire_flags_to_bio(dp_flags);
@@ -2271,9 +2339,36 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
        spin_unlock(&connection->epoch_lock);
 
        rcu_read_lock();
-       tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
+       nc = rcu_dereference(peer_device->connection->net_conf);
+       tp = nc->two_primaries;
+       if (peer_device->connection->agreed_pro_version < 100) {
+               switch (nc->wire_protocol) {
+               case DRBD_PROT_C:
+                       dp_flags |= DP_SEND_WRITE_ACK;
+                       break;
+               case DRBD_PROT_B:
+                       dp_flags |= DP_SEND_RECEIVE_ACK;
+                       break;
+               }
+       }
        rcu_read_unlock();
+
+       if (dp_flags & DP_SEND_WRITE_ACK) {
+               peer_req->flags |= EE_SEND_WRITE_ACK;
+               inc_unacked(device);
+               /* corresponding dec_unacked() in e_end_block()
+                * respective _drbd_clear_done_ee */
+       }
+
+       if (dp_flags & DP_SEND_RECEIVE_ACK) {
+               /* I really don't like it that the receiver thread
+                * sends on the msock, but anyways */
+               drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
+       }
+
        if (tp) {
+               /* two primaries implies protocol C */
+               D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
                peer_req->flags |= EE_IN_INTERVAL_TREE;
                err = wait_for_and_update_peer_seq(peer_device, peer_seq);
                if (err)
@@ -2297,44 +2392,18 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
         * active_ee to become empty in drbd_submit_peer_request();
         * better not add ourselves here. */
        if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
-               list_add(&peer_req->w.list, &device->active_ee);
+               list_add_tail(&peer_req->w.list, &device->active_ee);
        spin_unlock_irq(&device->resource->req_lock);
 
        if (device->state.conn == C_SYNC_TARGET)
                wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
 
-       if (peer_device->connection->agreed_pro_version < 100) {
-               rcu_read_lock();
-               switch (rcu_dereference(peer_device->connection->net_conf)->wire_protocol) {
-               case DRBD_PROT_C:
-                       dp_flags |= DP_SEND_WRITE_ACK;
-                       break;
-               case DRBD_PROT_B:
-                       dp_flags |= DP_SEND_RECEIVE_ACK;
-                       break;
-               }
-               rcu_read_unlock();
-       }
-
-       if (dp_flags & DP_SEND_WRITE_ACK) {
-               peer_req->flags |= EE_SEND_WRITE_ACK;
-               inc_unacked(device);
-               /* corresponding dec_unacked() in e_end_block()
-                * respective _drbd_clear_done_ee */
-       }
-
-       if (dp_flags & DP_SEND_RECEIVE_ACK) {
-               /* I really don't like it that the receiver thread
-                * sends on the msock, but anyways */
-               drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
-       }
-
        if (device->state.pdsk < D_INCONSISTENT) {
                /* In case we have the only disk of the cluster, */
                drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
-               peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
                peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
-               drbd_al_begin_io(device, &peer_req->i, true);
+               drbd_al_begin_io(device, &peer_req->i);
+               peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
        }
 
        err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR);
@@ -2347,8 +2416,10 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
        list_del(&peer_req->w.list);
        drbd_remove_epoch_entry_interval(device, peer_req);
        spin_unlock_irq(&device->resource->req_lock);
-       if (peer_req->flags & EE_CALL_AL_COMPLETE_IO)
+       if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
+               peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
                drbd_al_complete_io(device, &peer_req->i);
+       }
 
 out_interrupted:
        drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
@@ -2368,13 +2439,14 @@ out_interrupted:
  * The current sync rate used here uses only the most recent two step marks,
  * to have a short time average so we can react faster.
  */
-bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
+bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+               bool throttle_if_app_is_waiting)
 {
        struct lc_element *tmp;
-       bool throttle = true;
+       bool throttle = drbd_rs_c_min_rate_throttle(device);
 
-       if (!drbd_rs_c_min_rate_throttle(device))
-               return false;
+       if (!throttle || throttle_if_app_is_waiting)
+               return throttle;
 
        spin_lock_irq(&device->al_lock);
        tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
@@ -2382,7 +2454,8 @@ bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
                struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
                if (test_bit(BME_PRIORITY, &bm_ext->flags))
                        throttle = false;
-               /* Do not slow down if app IO is already waiting for this extent */
+               /* Do not slow down if app IO is already waiting for this extent,
+                * and our progress is necessary for application IO to complete. */
        }
        spin_unlock_irq(&device->al_lock);
 
@@ -2407,7 +2480,9 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
        curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
                      (int)part_stat_read(&disk->part0, sectors[1]) -
                        atomic_read(&device->rs_sect_ev);
-       if (!device->rs_last_events || curr_events - device->rs_last_events > 64) {
+
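+       /* re-evaluate (and likely throttle) whenever application IO is
+        * blocked on the activity log, not only after enough sector activity */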
+       if (atomic_read(&device->ap_actlog_cnt) ||
+           !device->rs_last_events || curr_events - device->rs_last_events > 64) {
                unsigned long rs_left;
                int i;
 
@@ -2508,6 +2583,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
                peer_req->w.cb = w_e_end_data_req;
                fault_type = DRBD_FAULT_DT_RD;
                /* application IO, don't drbd_rs_begin_io */
+               peer_req->flags |= EE_APPLICATION;
                goto submit;
 
        case P_RS_DATA_REQUEST:
@@ -2538,6 +2614,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
                        peer_req->w.cb = w_e_end_csum_rs_req;
                        /* used in the sector offset progress display */
                        device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+                       /* remember to report stats in drbd_resync_finished */
+                       device->use_csums = true;
                } else if (pi->cmd == P_OV_REPLY) {
                        /* track progress, we may need to throttle */
                        atomic_add(size >> 9, &device->rs_sect_in);
@@ -2595,8 +2673,20 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
         * we would also throttle its application reads.
         * In that case, throttling is done on the SyncTarget only.
         */
-       if (device->state.peer != R_PRIMARY && drbd_rs_should_slow_down(device, sector))
+
+       /* Even though this may be a resync request, we do add to "read_ee";
+        * "sync_ee" is only used for resync WRITEs.
+        * Add to list early, so debugfs can find this request
+        * even if we have to sleep below. */
+       spin_lock_irq(&device->resource->req_lock);
+       list_add_tail(&peer_req->w.list, &device->read_ee);
+       spin_unlock_irq(&device->resource->req_lock);
+
+       update_receiver_timing_details(connection, drbd_rs_should_slow_down);
+       if (device->state.peer != R_PRIMARY &&
+           drbd_rs_should_slow_down(device, sector, false))
                schedule_timeout_uninterruptible(HZ/10);
+       update_receiver_timing_details(connection, drbd_rs_begin_io);
        if (drbd_rs_begin_io(device, sector))
                goto out_free_e;
 
@@ -2604,22 +2694,20 @@ submit_for_resync:
        atomic_add(size >> 9, &device->rs_sect_ev);
 
 submit:
+       update_receiver_timing_details(connection, drbd_submit_peer_request);
        inc_unacked(device);
-       spin_lock_irq(&device->resource->req_lock);
-       list_add_tail(&peer_req->w.list, &device->read_ee);
-       spin_unlock_irq(&device->resource->req_lock);
-
        if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
                return 0;
 
        /* don't care for the reason here */
        drbd_err(device, "submit failed, triggering re-connect\n");
+
+out_free_e:
        spin_lock_irq(&device->resource->req_lock);
        list_del(&peer_req->w.list);
        spin_unlock_irq(&device->resource->req_lock);
        /* no drbd_rs_complete_io(), we are dropping the connection anyways */
 
-out_free_e:
        put_ldev(device);
        drbd_free_peer_req(device, peer_req);
        return -EIO;
@@ -2842,8 +2930,10 @@ static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
 -1091   requires proto 91
 -1096   requires proto 96
  */
-static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_hold(local)
+static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local)
 {
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
        u64 self, peer;
        int i, j;
 
@@ -2869,7 +2959,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
 
                if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
 
-                       if (first_peer_device(device)->connection->agreed_pro_version < 91)
+                       if (connection->agreed_pro_version < 91)
                                return -1091;
 
                        if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
@@ -2892,7 +2982,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
 
                if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
 
-                       if (first_peer_device(device)->connection->agreed_pro_version < 91)
+                       if (connection->agreed_pro_version < 91)
                                return -1091;
 
                        if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
@@ -2925,7 +3015,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
                case 1: /*  self_pri && !peer_pri */ return 1;
                case 2: /* !self_pri &&  peer_pri */ return -1;
                case 3: /*  self_pri &&  peer_pri */
-                       dc = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags);
+                       dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
                        return dc ? -1 : 1;
                }
        }
@@ -2938,14 +3028,14 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
        *rule_nr = 51;
        peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
        if (self == peer) {
-               if (first_peer_device(device)->connection->agreed_pro_version < 96 ?
+               if (connection->agreed_pro_version < 96 ?
                    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
                    (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
                    peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
                        /* The last P_SYNC_UUID did not get through. Undo the
                           sync-source modifications of the peer's UUIDs from
                           the last start of resync. */
 
-                       if (first_peer_device(device)->connection->agreed_pro_version < 91)
+                       if (connection->agreed_pro_version < 91)
                                return -1091;
 
                        device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
@@ -2975,14 +3065,14 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
        *rule_nr = 71;
        self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
        if (self == peer) {
-               if (first_peer_device(device)->connection->agreed_pro_version < 96 ?
+               if (connection->agreed_pro_version < 96 ?
                    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
                    (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
                    self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
                        /* The last P_SYNC_UUID did not get through. Undo the
                           sync-source modifications of our UUIDs from the
                           last start of resync. */
 
-                       if (first_peer_device(device)->connection->agreed_pro_version < 91)
+                       if (connection->agreed_pro_version < 91)
                                return -1091;
 
                        __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
@@ -3352,8 +3442,7 @@ disconnect:
  * return: NULL (alg name was "")
  *         ERR_PTR(error) if something goes wrong
  *         or the crypto hash ptr, if it worked out ok. */
-static
-struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
+static struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
                const char *alg, const char *name)
 {
        struct crypto_hash *tfm;
@@ -3639,7 +3728,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
        struct drbd_device *device;
        struct p_sizes *p = pi->data;
        enum determine_dev_size dd = DS_UNCHANGED;
-       sector_t p_size, p_usize, my_usize;
+       sector_t p_size, p_usize, p_csize, my_usize;
        int ldsc = 0; /* local disk size changed */
        enum dds_flags ddsf;
 
@@ -3650,6 +3739,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 
        p_size = be64_to_cpu(p->d_size);
        p_usize = be64_to_cpu(p->u_size);
+       p_csize = be64_to_cpu(p->c_size);
 
        /* just store the peer's disk size for now.
         * we still need to figure out whether we accept that. */
@@ -3710,7 +3800,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
        }
 
        device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
-       drbd_reconsider_max_bio_size(device);
        /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
           In case we cleared the QUEUE_FLAG_DISCARD from our queue in
           drbd_reconsider_max_bio_size(), we can be sure that after
           drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
@@ -3718,14 +3807,28 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 
        ddsf = be16_to_cpu(p->dds_flags);
        if (get_ldev(device)) {
+               drbd_reconsider_max_bio_size(device, device->ldev);
                dd = drbd_determine_dev_size(device, ddsf, NULL);
                put_ldev(device);
                if (dd == DS_ERROR)
                        return -EIO;
                drbd_md_sync(device);
        } else {
-               /* I am diskless, need to accept the peer's size. */
-               drbd_set_my_capacity(device, p_size);
+               /*
+                * I am diskless, need to accept the peer's *current* size.
+                * I must NOT accept the peer's backing disk size,
+                * it may have been larger than mine all along...
+                *
+                * At this point, the peer knows more about my disk, or at
+                * least about what we last agreed upon, than myself.
+                * So if his c_size is less than his d_size, the most likely
+                * reason is that *my* d_size was smaller last time we checked.
+                *
+                * However, if he sends a zero current size,
+                * take his (user-capped or) backing disk size anyways.
+                */
+               drbd_reconsider_max_bio_size(device, NULL);
+               drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
        }
 
        if (get_ldev(device)) {
@@ -4501,6 +4604,7 @@ static void drbdd(struct drbd_connection *connection)
                struct data_cmd *cmd;
 
                drbd_thread_current_set_cpu(&connection->receiver);
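+               /* record each step, so a hung receiver can be told apart by
+                * what it was last doing (receiver timing details) */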
+               update_receiver_timing_details(connection, drbd_recv_header);
                if (drbd_recv_header(connection, &pi))
                        goto err_out;
 
@@ -4519,12 +4623,14 @@ static void drbdd(struct drbd_connection *connection)
                }
 
                if (shs) {
+                       update_receiver_timing_details(connection, drbd_recv_all_warn);
                        err = drbd_recv_all_warn(connection, pi.data, shs);
                        if (err)
                                goto err_out;
                        pi.size -= shs;
                }
 
+               update_receiver_timing_details(connection, cmd->fn);
                err = cmd->fn(connection, &pi);
                if (err) {
                        drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
index 09803d0d5207ce7fccffc5c4a3cb0229071566cd..c67717d572d16c89b1a4f74701b25c0a7ffeafbe 100644 (file)
@@ -52,7 +52,7 @@ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request
 static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req)
 {
        int rw = bio_data_dir(req->master_bio);
-       unsigned long duration = jiffies - req->start_time;
+       unsigned long duration = jiffies - req->start_jif;
        int cpu;
        cpu = part_stat_lock();
        part_stat_add(cpu, &device->vdisk->part0, ticks[rw], duration);
@@ -66,7 +66,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
 {
        struct drbd_request *req;
 
-       req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
+       req = mempool_alloc(drbd_request_mempool, GFP_NOIO | __GFP_ZERO);
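+       /* zeroed allocation: any field not explicitly set up below starts out 0 */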
        if (!req)
                return NULL;
 
@@ -84,6 +84,8 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
 
        INIT_LIST_HEAD(&req->tl_requests);
        INIT_LIST_HEAD(&req->w.list);
+       INIT_LIST_HEAD(&req->req_pending_master_completion);
+       INIT_LIST_HEAD(&req->req_pending_local);
 
        /* one reference to be put by __drbd_make_request */
        atomic_set(&req->completion_ref, 1);
@@ -92,6 +94,19 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
        return req;
 }
 
+static void drbd_remove_request_interval(struct rb_root *root,
+                                        struct drbd_request *req)
+{
+       struct drbd_device *device = req->device;
+       struct drbd_interval *i = &req->i;
+
+       drbd_remove_interval(root, i);
+
+       /* Wake up any processes waiting for this request to complete.  */
+       if (i->waiting)
+               wake_up(&device->misc_wait);
+}
+
 void drbd_req_destroy(struct kref *kref)
 {
        struct drbd_request *req = container_of(kref, struct drbd_request, kref);
@@ -107,14 +122,30 @@ void drbd_req_destroy(struct kref *kref)
                return;
        }
 
-       /* remove it from the transfer log.
-        * well, only if it had been there in the first
-        * place... if it had not (local only or conflicting
-        * and never sent), it should still be "empty" as
-        * initialized in drbd_req_new(), so we can list_del() it
-        * here unconditionally */
+       /* If called from mod_rq_state (expected normal case) or
+        * drbd_send_and_submit (the less likely normal path), this holds the
+        * req_lock, and req->tl_requests will typically be on ->transfer_log,
+        * though it may be still empty (never added to the transfer log).
+        *
+        * If called from do_retry(), we do NOT hold the req_lock, but we are
+        * still allowed to unconditionally list_del(&req->tl_requests),
+        * because it will be on a local on-stack list only. */
        list_del_init(&req->tl_requests);
 
+       /* finally remove the request from the conflict detection
+        * respective block_id verification interval tree. */
+       if (!drbd_interval_empty(&req->i)) {
+               struct rb_root *root;
+
+               if (s & RQ_WRITE)
+                       root = &device->write_requests;
+               else
+                       root = &device->read_requests;
+               drbd_remove_request_interval(root, req);
+       } else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0)
+               drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n",
+                       s, (unsigned long long)req->i.sector, req->i.size);
+
        /* if it was a write, we may have to set the corresponding
         * bit(s) out-of-sync first. If it had a local part, we need to
         * release the reference to the activity log. */
@@ -188,19 +219,6 @@ void complete_master_bio(struct drbd_device *device,
 }
 
 
-static void drbd_remove_request_interval(struct rb_root *root,
-                                        struct drbd_request *req)
-{
-       struct drbd_device *device = req->device;
-       struct drbd_interval *i = &req->i;
-
-       drbd_remove_interval(root, i);
-
-       /* Wake up any processes waiting for this request to complete.  */
-       if (i->waiting)
-               wake_up(&device->misc_wait);
-}
-
 /* Helper for __req_mod().
  * Set m->bio to the master bio, if it is fit to be completed,
  * or leave it alone (it is initialized to NULL in __req_mod),
@@ -254,18 +272,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
        ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
        error = PTR_ERR(req->private_bio);
 
-       /* remove the request from the conflict detection
-        * respective block_id verification hash */
-       if (!drbd_interval_empty(&req->i)) {
-               struct rb_root *root;
-
-               if (rw == WRITE)
-                       root = &device->write_requests;
-               else
-                       root = &device->read_requests;
-               drbd_remove_request_interval(root, req);
-       }
-
        /* Before we can signal completion to the upper layers,
         * we may need to close the current transfer log epoch.
         * We are within the request lock, so we can simply compare
@@ -301,9 +307,24 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
                m->error = ok ? 0 : (error ?: -EIO);
                m->bio = req->master_bio;
                req->master_bio = NULL;
+               /* We leave it in the tree, to be able to verify later
+                * write-acks in protocol != C during resync.
+                * But we mark it as "complete", so it won't be counted as
+                * conflict in a multi-primary setup. */
+               req->i.completed = true;
        }
+
+       if (req->i.waiting)
+               wake_up(&device->misc_wait);
+
+       /* Either we are about to complete to upper layers,
+        * or we will restart this request.
+        * In either case, the request object will be destroyed soon,
+        * so better remove it from all lists. */
+       list_del_init(&req->req_pending_master_completion);
 }
 
+/* still holds resource->req_lock */
 static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put)
 {
        struct drbd_device *device = req->device;
@@ -324,12 +345,91 @@ static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_
        return 1;
 }
 
+static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
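+       /* The connection caches the oldest request in certain states
+        * (req_next, req_ack_pending, req_not_net_done), so lookups need not
+        * scan the whole transfer log: set_if_null_*() seeds an empty cache
+        * slot, advance_*() moves it on to the next request still in the
+        * relevant state (or NULL). */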
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+       if (!connection)
+               return;
+       if (connection->req_next == NULL)
+               connection->req_next = req;
+}
+
+static void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+       if (!connection)
+               return;
+       if (connection->req_next != req)
+               return;
+       list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+               const unsigned s = req->rq_state;
+               if (s & RQ_NET_QUEUED)
+                       break;
+       }
+       if (&req->tl_requests == &connection->transfer_log)
+               req = NULL;
+       connection->req_next = req;
+}
+
+static void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+       if (!connection)
+               return;
+       if (connection->req_ack_pending == NULL)
+               connection->req_ack_pending = req;
+}
+
+static void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+       if (!connection)
+               return;
+       if (connection->req_ack_pending != req)
+               return;
+       list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+               const unsigned s = req->rq_state;
+               if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING))
+                       break;
+       }
+       if (&req->tl_requests == &connection->transfer_log)
+               req = NULL;
+       connection->req_ack_pending = req;
+}
+
+static void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+       if (!connection)
+               return;
+       if (connection->req_not_net_done == NULL)
+               connection->req_not_net_done = req;
+}
+
+static void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+       if (!connection)
+               return;
+       if (connection->req_not_net_done != req)
+               return;
+       list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+               const unsigned s = req->rq_state;
+               if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE))
+                       break;
+       }
+       if (&req->tl_requests == &connection->transfer_log)
+               req = NULL;
+       connection->req_not_net_done = req;
+}
+
 /* I'd like this to be the only place that manipulates
  * req->completion_ref and req->kref. */
 static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
                int clear, int set)
 {
        struct drbd_device *device = req->device;
+       struct drbd_peer_device *peer_device = first_peer_device(device);
        unsigned s = req->rq_state;
        int c_put = 0;
        int k_put = 0;
@@ -356,14 +456,23 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
                atomic_inc(&req->completion_ref);
        }
 
-       if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED))
+       if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) {
                atomic_inc(&req->completion_ref);
+               set_if_null_req_next(peer_device, req);
+       }
 
        if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
                kref_get(&req->kref); /* wait for the DONE */
 
-       if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT))
-               atomic_add(req->i.size >> 9, &device->ap_in_flight);
+       if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
+               /* potentially already completed in the asender thread */
+               if (!(s & RQ_NET_DONE)) {
+                       atomic_add(req->i.size >> 9, &device->ap_in_flight);
+                       set_if_null_req_not_net_done(peer_device, req);
+               }
+               if (s & RQ_NET_PENDING)
+                       set_if_null_req_ack_pending(peer_device, req);
+       }
 
        if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
                atomic_inc(&req->completion_ref);
@@ -386,20 +495,34 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
                        ++k_put;
                else
                        ++c_put;
+               list_del_init(&req->req_pending_local);
        }
 
        if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
                dec_ap_pending(device);
                ++c_put;
+               req->acked_jif = jiffies;
+               advance_conn_req_ack_pending(peer_device, req);
        }
 
-       if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED))
+       if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) {
                ++c_put;
+               advance_conn_req_next(peer_device, req);
+       }
 
-       if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
-               if (req->rq_state & RQ_NET_SENT)
+       if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
+               if (s & RQ_NET_SENT)
                        atomic_sub(req->i.size >> 9, &device->ap_in_flight);
-               ++k_put;
+               if (s & RQ_EXP_BARR_ACK)
+                       ++k_put;
+               req->net_done_jif = jiffies;
+
+               /* in ahead/behind mode, or just in case,
+                * before we finally destroy this request,
+                * the caching pointers must not reference it anymore */
+               advance_conn_req_next(peer_device, req);
+               advance_conn_req_ack_pending(peer_device, req);
+               advance_conn_req_not_net_done(peer_device, req);
        }
 
        /* potentially complete and destroy */
@@ -439,6 +562,19 @@ static void drbd_report_io_error(struct drbd_device *device, struct drbd_request
                        bdevname(device->ldev->backing_bdev, b));
 }
 
+/* Helper for HANDED_OVER_TO_NETWORK.
+ * Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)?
+ * Is it also still "PENDING"?
+ * --> If so, clear PENDING and set NET_OK below.
+ * If it is a protocol A write, but not RQ_NET_PENDING anymore, the neg-ack was faster
+ * (and we must not set RQ_NET_OK) */
+static inline bool is_pending_write_protocol_A(struct drbd_request *req)
+{
+       return (req->rq_state &
+                  (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK))
+               == (RQ_WRITE|RQ_NET_PENDING);
+}
+
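
The helper tests four flag bits with a single masked compare: mask out the bits of interest, then require exactly RQ_WRITE|RQ_NET_PENDING to remain. A runnable sketch of the idiom, not part of the patch; the flag values here are made up for illustration and do not match DRBD's real definitions:

#include <assert.h>

#define RQ_WRITE           (1u << 0)
#define RQ_NET_PENDING     (1u << 1)
#define RQ_EXP_RECEIVE_ACK (1u << 2)    /* protocol B */
#define RQ_EXP_WRITE_ACK   (1u << 3)    /* protocol C */

static int is_pending_write_protocol_A(unsigned int rq_state)
{
        return (rq_state &
                (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK))
                == (RQ_WRITE|RQ_NET_PENDING);
}

int main(void)
{
        /* plain protocol A write, still pending: matches */
        assert(is_pending_write_protocol_A(RQ_WRITE|RQ_NET_PENDING));
        /* protocol C write: a WRITE_ACK is expected, so no match */
        assert(!is_pending_write_protocol_A(RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK));
        /* neg-ack already cleared PENDING: must not set RQ_NET_OK */
        assert(!is_pending_write_protocol_A(RQ_WRITE));
        return 0;
}
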
 /* obviously this could be coded as many single functions
  * instead of one huge switch,
  * or by putting the code directly in the respective locations
@@ -454,7 +590,9 @@ static void drbd_report_io_error(struct drbd_device *device, struct drbd_request
 int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                struct bio_and_error *m)
 {
-       struct drbd_device *device = req->device;
+       struct drbd_device *const device = req->device;
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
        struct net_conf *nc;
        int p, rv = 0;
 
@@ -477,7 +615,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                 * and from w_read_retry_remote */
                D_ASSERT(device, !(req->rq_state & RQ_NET_MASK));
                rcu_read_lock();
-               nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+               nc = rcu_dereference(connection->net_conf);
                p = nc->wire_protocol;
                rcu_read_unlock();
                req->rq_state |=
@@ -549,7 +687,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0);
                mod_rq_state(req, m, 0, RQ_NET_QUEUED);
                req->w.cb = w_send_read_req;
-               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+               drbd_queue_work(&connection->sender_work,
                                &req->w);
                break;
 
@@ -585,23 +723,23 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
                mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK);
                req->w.cb =  w_send_dblock;
-               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+               drbd_queue_work(&connection->sender_work,
                                &req->w);
 
                /* close the epoch, in case it outgrew the limit */
                rcu_read_lock();
-               nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+               nc = rcu_dereference(connection->net_conf);
                p = nc->max_epoch_size;
                rcu_read_unlock();
-               if (first_peer_device(device)->connection->current_tle_writes >= p)
-                       start_new_tl_epoch(first_peer_device(device)->connection);
+               if (connection->current_tle_writes >= p)
+                       start_new_tl_epoch(connection);
 
                break;
 
        case QUEUE_FOR_SEND_OOS:
                mod_rq_state(req, m, 0, RQ_NET_QUEUED);
                req->w.cb =  w_send_out_of_sync;
-               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+               drbd_queue_work(&connection->sender_work,
                                &req->w);
                break;
 
@@ -615,18 +753,16 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
        case HANDED_OVER_TO_NETWORK:
                /* assert something? */
-               if (bio_data_dir(req->master_bio) == WRITE &&
-                   !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) {
+               if (is_pending_write_protocol_A(req))
                        /* this is what is dangerous about protocol A:
                         * pretend it was successfully written on the peer. */
-                       if (req->rq_state & RQ_NET_PENDING)
-                               mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
-                       /* else: neg-ack was faster... */
-                       /* it is still not yet RQ_NET_DONE until the
-                        * corresponding epoch barrier got acked as well,
-                        * so we know what to dirty on connection loss */
-               }
-               mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
+                       mod_rq_state(req, m, RQ_NET_QUEUED|RQ_NET_PENDING,
+                                               RQ_NET_SENT|RQ_NET_OK);
+               else
+                       mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
+               /* It is still not yet RQ_NET_DONE until the
+                * corresponding epoch barrier got acked as well,
+                * so we know what to dirty on connection loss. */
                break;
 
        case OOS_HANDED_TO_NETWORK:
@@ -658,12 +794,13 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
        case WRITE_ACKED_BY_PEER_AND_SIS:
                req->rq_state |= RQ_NET_SIS;
        case WRITE_ACKED_BY_PEER:
-               D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
-               /* protocol C; successfully written on peer.
+               /* Normal operation protocol C: successfully written on peer.
+                * During resync, even in protocol != C,
+                * we requested an explicit write ack anyway.
+                * Which means we cannot even assert anything here.
                 * Nothing more to do here.
                 * We want to keep the tl in place for all protocols, to cater
                 * for volatile write-back caches on lower level devices. */
-
                goto ack_common;
        case RECV_ACKED_BY_PEER:
                D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK);
@@ -671,7 +808,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                 * see also notes above in HANDED_OVER_TO_NETWORK about
                 * protocol != C */
        ack_common:
-               D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
                mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
                break;
 
@@ -714,7 +850,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
                get_ldev(device); /* always succeeds in this call path */
                req->w.cb = w_restart_disk_io;
-               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+               drbd_queue_work(&connection->sender_work,
                                &req->w);
                break;
 
@@ -736,7 +872,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
                        mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING);
                        if (req->w.cb) {
-                               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+                               /* w.cb expected to be w_send_dblock, or w_send_read_req */
+                               drbd_queue_work(&connection->sender_work,
                                                &req->w);
                                rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
                        } /* else: FIXME can this happen? */
@@ -769,7 +906,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                break;
 
        case QUEUE_AS_DRBD_BARRIER:
-               start_new_tl_epoch(first_peer_device(device)->connection);
+               start_new_tl_epoch(connection);
                mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
                break;
        };
@@ -886,6 +1023,9 @@ static void maybe_pull_ahead(struct drbd_device *device)
            connection->agreed_pro_version < 96)
                return;
 
+       if (on_congestion == OC_PULL_AHEAD && device->state.conn == C_AHEAD)
+               return; /* nothing to do ... */
+
        /* If I don't even have good local storage, we can not reasonably try
         * to pull ahead of the peer. We also need the local reference to make
         * sure device->act_log is there.
@@ -1021,6 +1161,7 @@ drbd_submit_req_private_bio(struct drbd_request *req)
         * stable storage, and this is a WRITE, we may not even submit
         * this bio. */
        if (get_ldev(device)) {
+               req->pre_submit_jif = jiffies;
                if (drbd_insert_fault(device,
                                      rw == WRITE ? DRBD_FAULT_DT_WR
                                    : rw == READ  ? DRBD_FAULT_DT_RD
@@ -1035,10 +1176,14 @@ drbd_submit_req_private_bio(struct drbd_request *req)
 
 static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req)
 {
-       spin_lock(&device->submit.lock);
+       spin_lock_irq(&device->resource->req_lock);
        list_add_tail(&req->tl_requests, &device->submit.writes);
-       spin_unlock(&device->submit.lock);
+       list_add_tail(&req->req_pending_master_completion,
+                       &device->pending_master_completion[1 /* WRITE */]);
+       spin_unlock_irq(&device->resource->req_lock);
        queue_work(device->submit.wq, &device->submit.worker);
+       /* do_submit() may sleep internally on al_wait, too */
+       wake_up(&device->al_wait);
 }
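
Note the ordering in drbd_queue_write(): the request is linked onto both lists under resource->req_lock, the submitter workqueue is kicked, and al_wait is woken as well, because do_submit() may be sleeping there instead of waiting for new work. A userspace analogue of that enqueue-then-wake ordering, not part of the patch, with pthreads standing in for the kernel primitives:

#include <pthread.h>
#include <stdio.h>

static int writes;              /* stands in for device->submit.writes */
static pthread_mutex_t req_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t al_wait = PTHREAD_COND_INITIALIZER;

static void *do_submit_thread(void *arg)
{
        pthread_mutex_lock(&req_lock);
        while (writes == 0)             /* sleep until woken */
                pthread_cond_wait(&al_wait, &req_lock);
        printf("submitting %d write(s)\n", writes);
        writes = 0;
        pthread_mutex_unlock(&req_lock);
        return NULL;
}

int main(void)
{
        pthread_t worker;
        pthread_create(&worker, NULL, do_submit_thread, NULL);

        pthread_mutex_lock(&req_lock);
        writes++;                       /* list_add_tail() under the lock */
        pthread_mutex_unlock(&req_lock);
        pthread_cond_signal(&al_wait);  /* queue_work() + wake_up() */

        pthread_join(worker, NULL);
        return 0;
}
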
 
 /* returns the new drbd_request pointer, if the caller is expected to
@@ -1047,7 +1192,7 @@ static void drbd_queue_write(struct drbd_device *device, struct drbd_request *re
  * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
  */
 static struct drbd_request *
-drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_time)
+drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
 {
        const int rw = bio_data_dir(bio);
        struct drbd_request *req;
@@ -1062,7 +1207,7 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
                bio_endio(bio, -ENOMEM);
                return ERR_PTR(-ENOMEM);
        }
-       req->start_time = start_time;
+       req->start_jif = start_jif;
 
        if (!get_ldev(device)) {
                bio_put(req->private_bio);
@@ -1075,10 +1220,12 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
        if (rw == WRITE && req->private_bio && req->i.size
        && !test_bit(AL_SUSPENDED, &device->flags)) {
                if (!drbd_al_begin_io_fastpath(device, &req->i)) {
+                       atomic_inc(&device->ap_actlog_cnt);
                        drbd_queue_write(device, req);
                        return NULL;
                }
                req->rq_state |= RQ_IN_ACT_LOG;
+               req->in_actlog_jif = jiffies;
        }
 
        return req;
@@ -1086,11 +1233,13 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
 
 static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req)
 {
+       struct drbd_resource *resource = device->resource;
        const int rw = bio_rw(req->master_bio);
        struct bio_and_error m = { NULL, };
        bool no_remote = false;
+       bool submit_private_bio = false;
 
-       spin_lock_irq(&device->resource->req_lock);
+       spin_lock_irq(&resource->req_lock);
        if (rw == WRITE) {
                /* This may temporarily give up the req_lock,
                 * but will re-acquire it before it returns here.
@@ -1148,13 +1297,18 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
                        no_remote = true;
        }
 
+       /* If it took the fast path in drbd_request_prepare, add it here.
+        * The slow path has added it already. */
+       if (list_empty(&req->req_pending_master_completion))
+               list_add_tail(&req->req_pending_master_completion,
+                       &device->pending_master_completion[rw == WRITE]);
        if (req->private_bio) {
                /* needs to be marked within the same spinlock */
+               list_add_tail(&req->req_pending_local,
+                       &device->pending_completion[rw == WRITE]);
                _req_mod(req, TO_BE_SUBMITTED);
                /* but we need to give up the spinlock to submit */
-               spin_unlock_irq(&device->resource->req_lock);
-               drbd_submit_req_private_bio(req);
-               spin_lock_irq(&device->resource->req_lock);
+               submit_private_bio = true;
        } else if (no_remote) {
 nodata:
                if (__ratelimit(&drbd_ratelimit_state))
@@ -1167,15 +1321,23 @@ nodata:
 out:
        if (drbd_req_put_completion_ref(req, &m, 1))
                kref_put(&req->kref, drbd_req_destroy);
-       spin_unlock_irq(&device->resource->req_lock);
-
+       spin_unlock_irq(&resource->req_lock);
+
+       /* Even though the above is a kref_put(), this is safe.
+        * As long as we still need to submit our private bio,
+        * we hold a completion ref, and the request cannot disappear.
+        * If however this request did not even have a private bio to submit
+        * (e.g. remote read), req may already be invalid now.
+        * That's why we cannot check on req->private_bio. */
+       if (submit_private_bio)
+               drbd_submit_req_private_bio(req);
        if (m.bio)
                complete_master_bio(device, &m);
 }
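
The rewritten tail of drbd_send_and_submit() no longer drops and re-takes req_lock around the private-bio submission; it records the decision in submit_private_bio and acts only after the final unlock, trusting the completion ref to keep the request alive. A small sketch of that "decide under the lock, act after the unlock" pattern, not part of the patch, with illustrative names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t req_lock = PTHREAD_MUTEX_INITIALIZER;

static void submit_private_bio(void)
{
        printf("submitting outside the lock\n");        /* may sleep */
}

static void send_and_submit(bool have_private_bio)
{
        bool submit = false;

        pthread_mutex_lock(&req_lock);
        if (have_private_bio)
                submit = true;          /* only remember the decision */
        pthread_mutex_unlock(&req_lock);

        if (submit)                     /* a held reference keeps the object alive */
                submit_private_bio();
}

int main(void)
{
        send_and_submit(true);
        return 0;
}
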
 
-void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_time)
+void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
 {
-       struct drbd_request *req = drbd_request_prepare(device, bio, start_time);
+       struct drbd_request *req = drbd_request_prepare(device, bio, start_jif);
        if (IS_ERR_OR_NULL(req))
                return;
        drbd_send_and_submit(device, req);
@@ -1194,6 +1356,8 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom
                                continue;
 
                        req->rq_state |= RQ_IN_ACT_LOG;
+                       req->in_actlog_jif = jiffies;
+                       atomic_dec(&device->ap_actlog_cnt);
                }
 
                list_del_init(&req->tl_requests);
@@ -1203,7 +1367,8 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom
 
 static bool prepare_al_transaction_nonblock(struct drbd_device *device,
                                            struct list_head *incoming,
-                                           struct list_head *pending)
+                                           struct list_head *pending,
+                                           struct list_head *later)
 {
        struct drbd_request *req, *tmp;
        int wake = 0;
@@ -1212,45 +1377,105 @@ static bool prepare_al_transaction_nonblock(struct drbd_device *device,
        spin_lock_irq(&device->al_lock);
        list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
                err = drbd_al_begin_io_nonblock(device, &req->i);
+               if (err == -ENOBUFS)
+                       break;
                if (err == -EBUSY)
                        wake = 1;
                if (err)
-                       continue;
-               req->rq_state |= RQ_IN_ACT_LOG;
-               list_move_tail(&req->tl_requests, pending);
+                       list_move_tail(&req->tl_requests, later);
+               else
+                       list_move_tail(&req->tl_requests, pending);
        }
        spin_unlock_irq(&device->al_lock);
        if (wake)
                wake_up(&device->al_wait);
-
        return !list_empty(pending);
 }
 
+void send_and_submit_pending(struct drbd_device *device, struct list_head *pending)
+{
+       struct drbd_request *req, *tmp;
+
+       list_for_each_entry_safe(req, tmp, pending, tl_requests) {
+               req->rq_state |= RQ_IN_ACT_LOG;
+               req->in_actlog_jif = jiffies;
+               atomic_dec(&device->ap_actlog_cnt);
+               list_del_init(&req->tl_requests);
+               drbd_send_and_submit(device, req);
+       }
+}
+
 void do_submit(struct work_struct *ws)
 {
        struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker);
-       LIST_HEAD(incoming);
-       LIST_HEAD(pending);
-       struct drbd_request *req, *tmp;
+       LIST_HEAD(incoming);    /* from drbd_make_request() */
+       LIST_HEAD(pending);     /* to be submitted after next AL-transaction commit */
+       LIST_HEAD(busy);        /* blocked by resync requests */
+
+       /* grab new incoming requests */
+       spin_lock_irq(&device->resource->req_lock);
+       list_splice_tail_init(&device->submit.writes, &incoming);
+       spin_unlock_irq(&device->resource->req_lock);
 
        for (;;) {
-               spin_lock(&device->submit.lock);
-               list_splice_tail_init(&device->submit.writes, &incoming);
-               spin_unlock(&device->submit.lock);
+               DEFINE_WAIT(wait);
 
+               /* move used-to-be-busy back to front of incoming */
+               list_splice_init(&busy, &incoming);
                submit_fast_path(device, &incoming);
                if (list_empty(&incoming))
                        break;
 
-skip_fast_path:
-               wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending));
-               /* Maybe more was queued, while we prepared the transaction?
-                * Try to stuff them into this transaction as well.
-                * Be strictly non-blocking here, no wait_event, we already
-                * have something to commit.
-                * Stop if we don't make any more progres.
-                */
                for (;;) {
+                       prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE);
+
+                       list_splice_init(&busy, &incoming);
+                       prepare_al_transaction_nonblock(device, &incoming, &pending, &busy);
+                       if (!list_empty(&pending))
+                               break;
+
+                       schedule();
+
+                       /* If all currently "hot" activity log extents are kept busy by
+                        * incoming requests, we still must not totally starve new
+                        * requests to "cold" extents.
+                        * Something left on &incoming means there had not been
+                        * enough update slots available, and the activity log
+                        * has been marked as "starving".
+                        *
+                        * Try again now, without looking for new requests,
+                        * effectively blocking all new requests until we made
+                        * at least _some_ progress with what we currently have.
+                        */
+                       if (!list_empty(&incoming))
+                               continue;
+
+                       /* Nothing moved to pending, but nothing left
+                        * on incoming: all moved to busy!
+                        * Grab new and iterate. */
+                       spin_lock_irq(&device->resource->req_lock);
+                       list_splice_tail_init(&device->submit.writes, &incoming);
+                       spin_unlock_irq(&device->resource->req_lock);
+               }
+               finish_wait(&device->al_wait, &wait);
+
+               /* If the transaction was full, before all incoming requests
+                * had been processed, skip ahead to commit, and iterate
+                * without splicing in more incoming requests from upper layers.
+                *
+                * Else, if all incoming have been processed,
+                * they have become either "pending" (to be submitted after
+                * next transaction commit) or "busy" (blocked by resync).
+                *
+                * Maybe more was queued, while we prepared the transaction?
+                * Try to stuff those into this transaction as well.
+                * Be strictly non-blocking here,
+                * we already have something to commit.
+                *
+                * Commit if we don't make any more progress.
+                */
+
+               while (list_empty(&incoming)) {
                        LIST_HEAD(more_pending);
                        LIST_HEAD(more_incoming);
                        bool made_progress;
@@ -1260,55 +1485,32 @@ skip_fast_path:
                        if (list_empty(&device->submit.writes))
                                break;
 
-                       spin_lock(&device->submit.lock);
+                       spin_lock_irq(&device->resource->req_lock);
                        list_splice_tail_init(&device->submit.writes, &more_incoming);
-                       spin_unlock(&device->submit.lock);
+                       spin_unlock_irq(&device->resource->req_lock);
 
                        if (list_empty(&more_incoming))
                                break;
 
-                       made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending);
+                       made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy);
 
                        list_splice_tail_init(&more_pending, &pending);
                        list_splice_tail_init(&more_incoming, &incoming);
-
                        if (!made_progress)
                                break;
                }
-               drbd_al_begin_io_commit(device, false);
-
-               list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
-                       list_del_init(&req->tl_requests);
-                       drbd_send_and_submit(device, req);
-               }
 
-               /* If all currently hot activity log extents are kept busy by
-                * incoming requests, we still must not totally starve new
-                * requests to cold extents. In that case, prepare one request
-                * in blocking mode. */
-               list_for_each_entry_safe(req, tmp, &incoming, tl_requests) {
-                       list_del_init(&req->tl_requests);
-                       req->rq_state |= RQ_IN_ACT_LOG;
-                       if (!drbd_al_begin_io_prepare(device, &req->i)) {
-                               /* Corresponding extent was hot after all? */
-                               drbd_send_and_submit(device, req);
-                       } else {
-                               /* Found a request to a cold extent.
-                                * Put on "pending" list,
-                                * and try to cumulate with more. */
-                               list_add(&req->tl_requests, &pending);
-                               goto skip_fast_path;
-                       }
-               }
+               drbd_al_begin_io_commit(device);
+               send_and_submit_pending(device, &pending);
        }
 }
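
The loop above trades wait_event() for an open-coded sleep so that the busy/incoming/pending lists can be re-shuffled between wake-ups. Reduced to its skeleton, the idiom looks like the following kernel-style sketch; made_progress() is a hypothetical stand-in for "something moved to pending", and this is schematic, not part of the patch:

DEFINE_WAIT(wait);

for (;;) {
        prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE);
        if (made_progress())            /* e.g. !list_empty(&pending) */
                break;                  /* keep the work we claimed */
        schedule();                     /* until wake_up(&device->al_wait) */
        /* re-examine "busy" and "incoming" before sleeping again */
}
finish_wait(&device->al_wait, &wait);

Because prepare_to_wait() sets the task state before the condition is re-checked, a wake_up() racing between the check and schedule() is not lost, which is the same guarantee wait_event() provides internally.
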
 
 void drbd_make_request(struct request_queue *q, struct bio *bio)
 {
        struct drbd_device *device = (struct drbd_device *) q->queuedata;
-       unsigned long start_time;
+       unsigned long start_jif;
 
-       start_time = jiffies;
+       start_jif = jiffies;
 
        /*
         * what we "blindly" assume:
@@ -1316,7 +1518,7 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
        D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512));
 
        inc_ap_bio(device);
-       __drbd_make_request(device, bio, start_time);
+       __drbd_make_request(device, bio, start_jif);
 }
 
 /* This is called by bio_add_page().
@@ -1353,36 +1555,13 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
        return limit;
 }
 
-static void find_oldest_requests(
-               struct drbd_connection *connection,
-               struct drbd_device *device,
-               struct drbd_request **oldest_req_waiting_for_peer,
-               struct drbd_request **oldest_req_waiting_for_disk)
-{
-       struct drbd_request *r;
-       *oldest_req_waiting_for_peer = NULL;
-       *oldest_req_waiting_for_disk = NULL;
-       list_for_each_entry(r, &connection->transfer_log, tl_requests) {
-               const unsigned s = r->rq_state;
-               if (!*oldest_req_waiting_for_peer
-               && ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE)))
-                       *oldest_req_waiting_for_peer = r;
-
-               if (!*oldest_req_waiting_for_disk
-               && (s & RQ_LOCAL_PENDING) && r->device == device)
-                       *oldest_req_waiting_for_disk = r;
-
-               if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk)
-                       break;
-       }
-}
-
 void request_timer_fn(unsigned long data)
 {
        struct drbd_device *device = (struct drbd_device *) data;
        struct drbd_connection *connection = first_peer_device(device)->connection;
-       struct drbd_request *req_disk, *req_peer; /* oldest request */
+       struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */
        struct net_conf *nc;
+       unsigned long oldest_submit_jif;
        unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
        unsigned long now;
 
@@ -1403,14 +1582,31 @@ void request_timer_fn(unsigned long data)
                return; /* Recurring timer stopped */
 
        now = jiffies;
+       nt = now + et;
 
        spin_lock_irq(&device->resource->req_lock);
-       find_oldest_requests(connection, device, &req_peer, &req_disk);
-       if (req_peer == NULL && req_disk == NULL) {
-               spin_unlock_irq(&device->resource->req_lock);
-               mod_timer(&device->request_timer, now + et);
-               return;
-       }
+       req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
+       req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
+       req_peer = connection->req_not_net_done;
+       /* maybe the oldest request waiting for the peer is in fact still
+        * blocking in TCP sendmsg */
+       if (!req_peer && connection->req_next && connection->req_next->pre_send_jif)
+               req_peer = connection->req_next;
+
+       /* evaluate the oldest peer request only in one timer! */
+       if (req_peer && req_peer->device != device)
+               req_peer = NULL;
+
+       /* do we have something to evaluate? */
+       if (req_peer == NULL && req_write == NULL && req_read == NULL)
+               goto out;
+
+       oldest_submit_jif =
+               (req_write && req_read)
+               ? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif)
+                 ? req_write->pre_submit_jif : req_read->pre_submit_jif )
+               : req_write ? req_write->pre_submit_jif
+               : req_read ? req_read->pre_submit_jif : now;
 
        /* The request is considered timed out, if
         * - we have some effective timeout from the configuration,
@@ -1429,13 +1625,13 @@ void request_timer_fn(unsigned long data)
         * to expire twice (worst case) to become effective. Good enough.
         */
        if (ent && req_peer &&
-                time_after(now, req_peer->start_time + ent) &&
+                time_after(now, req_peer->pre_send_jif + ent) &&
                !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
                drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
                _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
        }
-       if (dt && req_disk &&
-                time_after(now, req_disk->start_time + dt) &&
+       if (dt && oldest_submit_jif != now &&
+                time_after(now, oldest_submit_jif + dt) &&
                !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
                drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
                __drbd_chk_io_error(device, DRBD_FORCE_DETACH);
@@ -1443,11 +1639,12 @@ void request_timer_fn(unsigned long data)
 
        /* Reschedule timer for the nearest not already expired timeout.
         * Fallback to now + min(effective network timeout, disk timeout). */
-       ent = (ent && req_peer && time_before(now, req_peer->start_time + ent))
-               ? req_peer->start_time + ent : now + et;
-       dt = (dt && req_disk && time_before(now, req_disk->start_time + dt))
-               ? req_disk->start_time + dt : now + et;
+       ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent))
+               ? req_peer->pre_send_jif + ent : now + et;
+       dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt))
+               ? oldest_submit_jif + dt : now + et;
        nt = time_before(ent, dt) ? ent : dt;
+out:
        spin_unlock_irq(&connection->resource->req_lock);
        mod_timer(&device->request_timer, nt);
 }
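
request_timer_fn() leans entirely on time_after()/time_before()/time_in_range(), which remain correct when the jiffies counter wraps because they compare through a signed difference rather than with a plain < or >. A userspace sketch of that trick, not part of the patch:

#include <assert.h>

typedef unsigned long jiffies_t;

/* true if a is after b, even if the counter wrapped in between */
static int time_after_sketch(jiffies_t a, jiffies_t b)
{
        return (long)(b - a) < 0;
}

int main(void)
{
        jiffies_t before_wrap = (jiffies_t)-10;  /* 10 ticks below the wrap */
        jiffies_t after_wrap = 5;                /* counter wrapped around to 5 */

        assert(time_after_sketch(after_wrap, before_wrap));
        assert(!time_after_sketch(before_wrap, after_wrap));
        return 0;
}
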
index 8566cd5866b4e2388cdb441439f25eecf6071443..9f6a04080e9f76aadfdfedf8d0e1cb408dbcba2a 100644
@@ -288,6 +288,7 @@ extern void complete_master_bio(struct drbd_device *device,
 extern void request_timer_fn(unsigned long data);
 extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
 extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
+extern void tl_abort_disk_io(struct drbd_device *device);
 
 /* this is in drbd_main.c */
 extern void drbd_restart_request(struct drbd_request *req);
index a5d8aae00e04c9515d4a684caaeabb079ada6dc5..c35c0f001bb74333887d0e47c23a0598cd203bfe 100644
@@ -410,7 +410,7 @@ _drbd_request_state(struct drbd_device *device, union drbd_state mask,
        return rv;
 }
 
-static void print_st(struct drbd_device *device, char *name, union drbd_state ns)
+static void print_st(struct drbd_device *device, const char *name, union drbd_state ns)
 {
        drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n",
            name,
@@ -952,11 +952,12 @@ enum drbd_state_rv
 __drbd_set_state(struct drbd_device *device, union drbd_state ns,
                 enum chg_state_flags flags, struct completion *done)
 {
+       struct drbd_peer_device *peer_device = first_peer_device(device);
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
        union drbd_state os;
        enum drbd_state_rv rv = SS_SUCCESS;
        enum sanitize_state_warnings ssw;
        struct after_state_chg_work *ascw;
-       bool did_remote, should_do_remote;
 
        os = drbd_read_state(device);
 
@@ -978,9 +979,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
                           this happen...*/
 
                        if (is_valid_state(device, os) == rv)
-                               rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+                               rv = is_valid_soft_transition(os, ns, connection);
                } else
-                       rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+                       rv = is_valid_soft_transition(os, ns, connection);
        }
 
        if (rv < SS_SUCCESS) {
@@ -997,7 +998,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
           sanitize_state(). Only display it here if we were not called from
           _conn_request_state() */
        if (!(flags & CS_DC_SUSP))
-               conn_pr_state_change(first_peer_device(device)->connection, os, ns,
+               conn_pr_state_change(connection, os, ns,
                                     (flags & ~CS_DC_MASK) | CS_DC_SUSP);
 
        /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
@@ -1008,28 +1009,35 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
            (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
                atomic_inc(&device->local_cnt);
 
-       did_remote = drbd_should_do_remote(device->state);
+       if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
+               clear_bit(RS_DONE, &device->flags);
+
+       /* changes to local_cnt and device flags should be visible before
+        * changes to state, which again should be visible before anything
+        * else that depends on the state change happens. */
+       smp_wmb();
        device->state.i = ns.i;
-       should_do_remote = drbd_should_do_remote(device->state);
        device->resource->susp = ns.susp;
        device->resource->susp_nod = ns.susp_nod;
        device->resource->susp_fen = ns.susp_fen;
+       smp_wmb();
 
        /* put replicated vs not-replicated requests in separate epochs */
-       if (did_remote != should_do_remote)
-               start_new_tl_epoch(first_peer_device(device)->connection);
+       if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
+           drbd_should_do_remote((union drbd_dev_state)ns.i))
+               start_new_tl_epoch(connection);
 
        if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
                drbd_print_uuids(device, "attached to UUIDs");
 
        /* Wake up role changes, that were delayed because of connection establishing */
        if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS &&
-           no_peer_wf_report_params(first_peer_device(device)->connection))
-               clear_bit(STATE_SENT, &first_peer_device(device)->connection->flags);
+           no_peer_wf_report_params(connection))
+               clear_bit(STATE_SENT, &connection->flags);
 
        wake_up(&device->misc_wait);
        wake_up(&device->state_wait);
-       wake_up(&first_peer_device(device)->connection->ping_wait);
+       wake_up(&connection->ping_wait);
 
        /* Aborted verify run, or we reached the stop sector.
         * Log the last position, unless end-of-device. */
@@ -1118,21 +1126,21 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 
        /* Receiver should clean up itself */
        if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
-               drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver);
+               drbd_thread_stop_nowait(&connection->receiver);
 
        /* Now the receiver finished cleaning up itself, it should die */
        if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
-               drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver);
+               drbd_thread_stop_nowait(&connection->receiver);
 
        /* Upon network failure, we need to restart the receiver. */
        if (os.conn > C_WF_CONNECTION &&
            ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
-               drbd_thread_restart_nowait(&first_peer_device(device)->connection->receiver);
+               drbd_thread_restart_nowait(&connection->receiver);
 
        /* Resume AL writing if we get a connection */
        if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
                drbd_resume_al(device);
-               first_peer_device(device)->connection->connect_cnt++;
+               connection->connect_cnt++;
        }
 
        /* remember last attach time so request_timer_fn() won't
@@ -1150,7 +1158,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
                ascw->w.cb = w_after_state_ch;
                ascw->device = device;
                ascw->done = done;
-               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+               drbd_queue_work(&connection->sender_work,
                                &ascw->w);
        } else {
                drbd_err(device, "Could not kmalloc an ascw\n");
@@ -1222,13 +1230,16 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
                           union drbd_state ns, enum chg_state_flags flags)
 {
        struct drbd_resource *resource = device->resource;
+       struct drbd_peer_device *peer_device = first_peer_device(device);
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
        struct sib_info sib;
 
        sib.sib_reason = SIB_STATE_CHANGE;
        sib.os = os;
        sib.ns = ns;
 
-       if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
+       if ((os.disk != D_UP_TO_DATE || os.pdsk != D_UP_TO_DATE)
+       &&  (ns.disk == D_UP_TO_DATE && ns.pdsk == D_UP_TO_DATE)) {
                clear_bit(CRASHED_PRIMARY, &device->flags);
                if (device->p_uuid)
                        device->p_uuid[UI_FLAGS] &= ~((u64)2);
@@ -1245,7 +1256,6 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
           state change. This function might sleep */
 
        if (ns.susp_nod) {
-               struct drbd_connection *connection = first_peer_device(device)->connection;
                enum drbd_req_event what = NOTHING;
 
                spin_lock_irq(&device->resource->req_lock);
@@ -1267,8 +1277,6 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
        }
 
        if (ns.susp_fen) {
-               struct drbd_connection *connection = first_peer_device(device)->connection;
-
                spin_lock_irq(&device->resource->req_lock);
                if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) {
                        /* case2: The connection was established again: */
@@ -1294,8 +1302,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
         * which is unexpected. */
        if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
            (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
-           first_peer_device(device)->connection->agreed_pro_version >= 96 && get_ldev(device)) {
-               drbd_gen_and_send_sync_uuid(first_peer_device(device));
+           connection->agreed_pro_version >= 96 && get_ldev(device)) {
+               drbd_gen_and_send_sync_uuid(peer_device);
                put_ldev(device);
        }
 
@@ -1309,8 +1317,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
                atomic_set(&device->rs_pending_cnt, 0);
                drbd_rs_cancel_all(device);
 
-               drbd_send_uuids(first_peer_device(device));
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_uuids(peer_device);
+               drbd_send_state(peer_device, ns);
        }
        /* No point in queuing send_bitmap if we don't have a connection
         * anymore, so check also the _current_ state, not only the new state
@@ -1335,7 +1343,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
                                        set_bit(NEW_CUR_UUID, &device->flags);
                                } else {
                                        drbd_uuid_new_current(device);
-                                       drbd_send_uuids(first_peer_device(device));
+                                       drbd_send_uuids(peer_device);
                                }
                        }
                        put_ldev(device);
@@ -1346,7 +1354,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
                if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
                    device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
                        drbd_uuid_new_current(device);
-                       drbd_send_uuids(first_peer_device(device));
+                       drbd_send_uuids(peer_device);
                }
                /* D_DISKLESS Peer becomes secondary */
                if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
@@ -1373,16 +1381,16 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
        /* Last part of the attaching process ... */
        if (ns.conn >= C_CONNECTED &&
            os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
-               drbd_send_sizes(first_peer_device(device), 0, 0);  /* to start sync... */
-               drbd_send_uuids(first_peer_device(device));
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_sizes(peer_device, 0, 0);  /* to start sync... */
+               drbd_send_uuids(peer_device);
+               drbd_send_state(peer_device, ns);
        }
 
        /* We want to pause/continue resync, tell peer. */
        if (ns.conn >= C_CONNECTED &&
             ((os.aftr_isp != ns.aftr_isp) ||
              (os.user_isp != ns.user_isp)))
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_state(peer_device, ns);
 
        /* In case one of the isp bits got set, suspend other devices. */
        if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
@@ -1392,10 +1400,10 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
        /* Make sure the peer gets informed about possible state
           changes (ISP bits) that happened while we were in WFReportParams. */
        if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_state(peer_device, ns);
 
        if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_state(peer_device, ns);
 
        /* We are in the progress to start a full sync... */
        if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
@@ -1449,7 +1457,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
                                        drbd_disk_str(device->state.disk));
 
                        if (ns.conn >= C_CONNECTED)
-                               drbd_send_state(first_peer_device(device), ns);
+                               drbd_send_state(peer_device, ns);
 
                        drbd_rs_cancel_all(device);
 
@@ -1473,7 +1481,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
                                 drbd_disk_str(device->state.disk));
 
                if (ns.conn >= C_CONNECTED)
-                       drbd_send_state(first_peer_device(device), ns);
+                       drbd_send_state(peer_device, ns);
                /* corresponding get_ldev in __drbd_set_state
                 * this may finally trigger drbd_ldev_destroy. */
                put_ldev(device);
@@ -1481,7 +1489,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 
        /* Notify peer that I had a local IO error and did not detach. */
        if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_state(peer_device, ns);
 
        /* Disks got bigger while they were detached */
        if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
@@ -1499,14 +1507,14 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
        /* sync target done with resync.  Explicitly notify peer, even though
         * it should (at least for non-empty resyncs) already know itself. */
        if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_state(peer_device, ns);
 
        /* Verify finished, or reached stop sector.  Peer did not know about
         * the stop sector, and we may even have changed the stop sector during
         * verify to interrupt/stop early.  Send the new state. */
        if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
        && verify_can_do_stop_sector(device))
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_state(peer_device, ns);
 
        /* This triggers bitmap writeout of potentially still unwritten pages
         * if the resync finished cleanly, or aborted because of peer disk
@@ -1563,7 +1571,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
                old_conf = connection->net_conf;
                connection->my_addr_len = 0;
                connection->peer_addr_len = 0;
-               rcu_assign_pointer(connection->net_conf, NULL);
+               RCU_INIT_POINTER(connection->net_conf, NULL);
                conn_free_crypto(connection);
                mutex_unlock(&connection->resource->conf_update);
 
@@ -1599,7 +1607,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
        return 0;
 }
 
-void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf)
+static void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf)
 {
        enum chg_state_flags flags = ~0;
        struct drbd_peer_device *peer_device;
@@ -1688,7 +1696,7 @@ conn_is_valid_transition(struct drbd_connection *connection, union drbd_state ma
        return rv;
 }
 
-void
+static void
 conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
               union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags)
 {
index d8f57b6305cd6f84ec0be24309512fe5b25a8e92..50776b36282868415d7b48ebed66fbf1f5dec09f 100644
@@ -67,13 +67,10 @@ rwlock_t global_state_lock;
  */
 void drbd_md_io_complete(struct bio *bio, int error)
 {
-       struct drbd_md_io *md_io;
        struct drbd_device *device;
 
-       md_io = (struct drbd_md_io *)bio->bi_private;
-       device = container_of(md_io, struct drbd_device, md_io);
-
-       md_io->error = error;
+       device = bio->bi_private;
+       device->md_io.error = error;
 
        /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
         * to timeout on the lower level device, and eventually detach from it.
@@ -87,7 +84,7 @@ void drbd_md_io_complete(struct bio *bio, int error)
         * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
         */
        drbd_md_put_buffer(device);
-       md_io->done = 1;
+       device->md_io.done = 1;
        wake_up(&device->misc_wait);
        bio_put(bio);
        if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
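
The md_io hunk above removes a container_of() step: bio->bi_private now holds the drbd_device itself rather than a pointer to its embedded md_io member. For reference, a runnable sketch of what the old lookup did, recovering the outer object from a member pointer; the structure layout here is illustrative, not DRBD's:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct md_io { int error; };
struct device { int id; struct md_io md_io; };

int main(void)
{
        struct device dev = { .id = 42 };
        struct md_io *m = &dev.md_io;   /* what bi_private used to hold */
        struct device *d = container_of(m, struct device, md_io);

        printf("device id = %d\n", d->id);      /* prints 42 */
        return 0;
}
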
@@ -135,6 +132,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
        i = peer_req->i;
        do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
        block_id = peer_req->block_id;
+       peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
 
        spin_lock_irqsave(&device->resource->req_lock, flags);
        device->writ_cnt += peer_req->i.size >> 9;
@@ -398,9 +396,6 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
        if (!get_ldev(device))
                return -EIO;
 
-       if (drbd_rs_should_slow_down(device, sector))
-               goto defer;
-
        /* GFP_TRY, because if there is no memory available right now, this may
         * be rescheduled for later. It is "only" background resync, after all. */
        peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
@@ -410,7 +405,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
 
        peer_req->w.cb = w_e_send_csum;
        spin_lock_irq(&device->resource->req_lock);
-       list_add(&peer_req->w.list, &device->read_ee);
+       list_add_tail(&peer_req->w.list, &device->read_ee);
        spin_unlock_irq(&device->resource->req_lock);
 
        atomic_add(size >> 9, &device->rs_sect_ev);
@@ -452,9 +447,9 @@ void resync_timer_fn(unsigned long data)
 {
        struct drbd_device *device = (struct drbd_device *) data;
 
-       if (list_empty(&device->resync_work.list))
-               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
-                               &device->resync_work);
+       drbd_queue_work_if_unqueued(
+               &first_peer_device(device)->connection->sender_work,
+               &device->resync_work);
 }
 
 static void fifo_set(struct fifo_buffer *fb, int value)
@@ -504,9 +499,9 @@ struct fifo_buffer *fifo_alloc(int fifo_size)
 static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
 {
        struct disk_conf *dc;
-       unsigned int want;     /* The number of sectors we want in the proxy */
+       unsigned int want;     /* The number of sectors we want in-flight */
        int req_sect; /* Number of sectors to request in this turn */
-       int correction; /* Number of sectors more we need in the proxy*/
+       int correction; /* Number of sectors more we need in-flight */
        int cps; /* correction per invocation of drbd_rs_controller() */
        int steps; /* Number of time steps to plan ahead */
        int curr_corr;
@@ -577,20 +572,27 @@ static int drbd_rs_number_requests(struct drbd_device *device)
         * potentially causing a distributed deadlock on congestion during
         * online-verify or (checksum-based) resync, if max-buffers,
         * socket buffer sizes and resync rate settings are mis-configured. */
-       if (mxb - device->rs_in_flight < number)
-               number = mxb - device->rs_in_flight;
+
+       /* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k),
+        * mxb (as used here, and in drbd_alloc_pages on the peer) is
+        * "number of pages" (typically also 4k),
+        * but "rs_in_flight" is in "sectors" (512 Byte). */
+       if (mxb - device->rs_in_flight/8 < number)
+               number = mxb - device->rs_in_flight/8;
 
        return number;
 }
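
The comment above spells out the unit mismatch that the /8 fixes: rs_in_flight is accounted in 512-byte sectors, while number and mxb count 4 KiB units, and 4096 / 512 = 8. A worked example with made-up values, not part of the patch:

#include <stdio.h>

int main(void)
{
        int mxb = 1000;                 /* max-buffers, in 4 KiB pages */
        int rs_in_flight = 4000;        /* in 512-byte sectors = 500 pages */
        int number = 600;               /* resync requests wanted, 4 KiB each */

        if (mxb - rs_in_flight / 8 < number)
                number = mxb - rs_in_flight / 8;
        printf("number = %d\n", number);        /* 1000 - 500 = 500 */
        return 0;
}

Without the /8, the old code would have computed 1000 - 4000 here and clamped number to a large negative value, throttling resync far more aggressively than intended.
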
 
-static int make_resync_request(struct drbd_device *device, int cancel)
+static int make_resync_request(struct drbd_device *const device, int cancel)
 {
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
        unsigned long bit;
        sector_t sector;
        const sector_t capacity = drbd_get_capacity(device->this_bdev);
        int max_bio_size;
        int number, rollback_i, size;
-       int align, queued, sndbuf;
+       int align, requeue = 0;
        int i = 0;
 
        if (unlikely(cancel))
@@ -617,17 +619,22 @@ static int make_resync_request(struct drbd_device *device, int cancel)
                goto requeue;
 
        for (i = 0; i < number; i++) {
-               /* Stop generating RS requests, when half of the send buffer is filled */
-               mutex_lock(&first_peer_device(device)->connection->data.mutex);
-               if (first_peer_device(device)->connection->data.socket) {
-                       queued = first_peer_device(device)->connection->data.socket->sk->sk_wmem_queued;
-                       sndbuf = first_peer_device(device)->connection->data.socket->sk->sk_sndbuf;
-               } else {
-                       queued = 1;
-                       sndbuf = 0;
-               }
-               mutex_unlock(&first_peer_device(device)->connection->data.mutex);
-               if (queued > sndbuf / 2)
+               /* Stop generating RS requests when half of the send buffer is filled,
+                * but notify TCP that we'd like to have more space. */
+               mutex_lock(&connection->data.mutex);
+               if (connection->data.socket) {
+                       struct sock *sk = connection->data.socket->sk;
+                       int queued = sk->sk_wmem_queued;
+                       int sndbuf = sk->sk_sndbuf;
+                       if (queued > sndbuf / 2) {
+                               requeue = 1;
+                               if (sk->sk_socket)
+                                       set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+                       }
+               } else
+                       requeue = 1;
+               mutex_unlock(&connection->data.mutex);
+               if (requeue)
                        goto requeue;
 
 next_sector:
@@ -642,8 +649,7 @@ next_sector:
 
                sector = BM_BIT_TO_SECT(bit);
 
-               if (drbd_rs_should_slow_down(device, sector) ||
-                   drbd_try_rs_begin_io(device, sector)) {
+               if (drbd_try_rs_begin_io(device, sector)) {
                        device->bm_resync_fo = bit;
                        goto requeue;
                }
@@ -696,9 +702,9 @@ next_sector:
                /* adjust very last sectors, in case we are oddly sized */
                if (sector + (size>>9) > capacity)
                        size = (capacity-sector)<<9;
-               if (first_peer_device(device)->connection->agreed_pro_version >= 89 &&
-                   first_peer_device(device)->connection->csums_tfm) {
-                       switch (read_for_csum(first_peer_device(device), sector, size)) {
+
+               if (device->use_csums) {
+                       switch (read_for_csum(peer_device, sector, size)) {
                        case -EIO: /* Disk failure */
                                put_ldev(device);
                                return -EIO;
@@ -717,7 +723,7 @@ next_sector:
                        int err;
 
                        inc_rs_pending(device);
-                       err = drbd_send_drequest(first_peer_device(device), P_RS_DATA_REQUEST,
+                       err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST,
                                                 sector, size, ID_SYNCER);
                        if (err) {
                                drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
@@ -774,8 +780,7 @@ static int make_ov_request(struct drbd_device *device, int cancel)
 
                size = BM_BLOCK_SIZE;
 
-               if (drbd_rs_should_slow_down(device, sector) ||
-                   drbd_try_rs_begin_io(device, sector)) {
+               if (drbd_try_rs_begin_io(device, sector)) {
                        device->ov_position = sector;
                        goto requeue;
                }
@@ -911,7 +916,7 @@ int drbd_resync_finished(struct drbd_device *device)
                if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
                        khelper_cmd = "after-resync-target";
 
-               if (first_peer_device(device)->connection->csums_tfm && device->rs_total) {
+               if (device->use_csums && device->rs_total) {
                        const unsigned long s = device->rs_same_csum;
                        const unsigned long t = device->rs_total;
                        const int ratio =
@@ -1351,13 +1356,15 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel)
 {
        struct drbd_request *req = container_of(w, struct drbd_request, w);
        struct drbd_device *device = req->device;
-       struct drbd_connection *connection = first_peer_device(device)->connection;
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *const connection = peer_device->connection;
        int err;
 
        if (unlikely(cancel)) {
                req_mod(req, SEND_CANCELED);
                return 0;
        }
+       req->pre_send_jif = jiffies;
 
        /* this time, no connection->send.current_epoch_writes++;
         * If it was sent, it was the closing barrier for the last
@@ -1365,7 +1372,7 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel)
         * No more barriers will be sent, until we leave AHEAD mode again. */
        maybe_send_barrier(connection, req->epoch);
 
-       err = drbd_send_out_of_sync(first_peer_device(device), req);
+       err = drbd_send_out_of_sync(peer_device, req);
        req_mod(req, OOS_HANDED_TO_NETWORK);
 
        return err;
@@ -1380,19 +1387,21 @@ int w_send_dblock(struct drbd_work *w, int cancel)
 {
        struct drbd_request *req = container_of(w, struct drbd_request, w);
        struct drbd_device *device = req->device;
-       struct drbd_connection *connection = first_peer_device(device)->connection;
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *connection = peer_device->connection;
        int err;
 
        if (unlikely(cancel)) {
                req_mod(req, SEND_CANCELED);
                return 0;
        }
+       req->pre_send_jif = jiffies;
 
        re_init_if_first_write(connection, req->epoch);
        maybe_send_barrier(connection, req->epoch);
        connection->send.current_epoch_writes++;
 
-       err = drbd_send_dblock(first_peer_device(device), req);
+       err = drbd_send_dblock(peer_device, req);
        req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
 
        return err;
@@ -1407,19 +1416,21 @@ int w_send_read_req(struct drbd_work *w, int cancel)
 {
        struct drbd_request *req = container_of(w, struct drbd_request, w);
        struct drbd_device *device = req->device;
-       struct drbd_connection *connection = first_peer_device(device)->connection;
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *connection = peer_device->connection;
        int err;
 
        if (unlikely(cancel)) {
                req_mod(req, SEND_CANCELED);
                return 0;
        }
+       req->pre_send_jif = jiffies;
 
        /* Even read requests may close a write epoch,
         * if there was any yet. */
        maybe_send_barrier(connection, req->epoch);
 
-       err = drbd_send_drequest(first_peer_device(device), P_DATA_REQUEST, req->i.sector, req->i.size,
+       err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size,
                                 (unsigned long)req);
 
        req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
@@ -1433,7 +1444,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel)
        struct drbd_device *device = req->device;
 
        if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
-               drbd_al_begin_io(device, &req->i, false);
+               drbd_al_begin_io(device, &req->i);
 
        drbd_req_make_private_bio(req, req->master_bio);
        req->private_bio->bi_bdev = device->ldev->backing_bdev;
@@ -1601,26 +1612,32 @@ void drbd_rs_controller_reset(struct drbd_device *device)
 void start_resync_timer_fn(unsigned long data)
 {
        struct drbd_device *device = (struct drbd_device *) data;
-
-       drbd_queue_work(&first_peer_device(device)->connection->sender_work,
-                       &device->start_resync_work);
+       drbd_device_post_work(device, RS_START);
 }
 
-int w_start_resync(struct drbd_work *w, int cancel)
+static void do_start_resync(struct drbd_device *device)
 {
-       struct drbd_device *device =
-               container_of(w, struct drbd_device, start_resync_work);
-
        if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
-               drbd_warn(device, "w_start_resync later...\n");
+               drbd_warn(device, "postponing start_resync ...\n");
                device->start_resync_timer.expires = jiffies + HZ/10;
                add_timer(&device->start_resync_timer);
-               return 0;
+               return;
        }
 
        drbd_start_resync(device, C_SYNC_SOURCE);
        clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
-       return 0;
+}
+
+static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device)
+{
+       bool csums_after_crash_only;
+       rcu_read_lock();
+       csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only;
+       rcu_read_unlock();
+       return connection->agreed_pro_version >= 89 &&          /* supported? */
+               connection->csums_tfm &&                        /* configured? */
+               (csums_after_crash_only == 0                    /* use for each resync? */
+                || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */
 }
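
use_checksum_based_resync() snapshots a single field out of the RCU-protected net_conf before deciding. A rough userspace analogue of that copy-out-under-the-read-side pattern can be written with a C11 acquire load; this is a sketch only, since unlike RCU it says nothing about when an old conf may be freed (assume conf objects are never reclaimed here):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct conf { bool csums_after_crash_only; };

    /* Writers publish a fully initialized struct with a release store;
     * readers take an acquire snapshot and copy out what they need. */
    static _Atomic(struct conf *) current_conf;

    static bool snapshot_csums_after_crash_only(void)
    {
            struct conf *c = atomic_load_explicit(&current_conf,
                                                  memory_order_acquire);
            return c ? c->csums_after_crash_only : false;
    }
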
 
 /**
@@ -1633,6 +1650,8 @@ int w_start_resync(struct drbd_work *w, int cancel)
  */
 void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 {
+       struct drbd_peer_device *peer_device = first_peer_device(device);
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
        union drbd_state ns;
        int r;
 
@@ -1651,7 +1670,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                        if (r > 0) {
                                drbd_info(device, "before-resync-target handler returned %d, "
                                         "dropping connection.\n", r);
-                               conn_request_state(first_peer_device(device)->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+                               conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
                                return;
                        }
                } else /* C_SYNC_SOURCE */ {
@@ -1664,7 +1683,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                                } else {
                                        drbd_info(device, "before-resync-source handler returned %d, "
                                                 "dropping connection.\n", r);
-                                       conn_request_state(first_peer_device(device)->connection,
+                                       conn_request_state(connection,
                                                           NS(conn, C_DISCONNECTING), CS_HARD);
                                        return;
                                }
@@ -1672,7 +1691,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                }
        }
 
-       if (current == first_peer_device(device)->connection->worker.task) {
+       if (current == connection->worker.task) {
                /* The worker should not sleep waiting for state_mutex,
                   that can take long */
                if (!mutex_trylock(device->state_mutex)) {
@@ -1733,11 +1752,20 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                        device->rs_mark_time[i] = now;
                }
                _drbd_pause_after(device);
+               /* Forget potentially stale cached per-resync-extent bit counts.
+                * Open coded drbd_rs_cancel_all(device), we already have IRQs
+                * disabled, and know the disk state is ok. */
+               spin_lock(&device->al_lock);
+               lc_reset(device->resync);
+               device->resync_locked = 0;
+               device->resync_wenr = LC_FREE;
+               spin_unlock(&device->al_lock);
        }
        write_unlock(&global_state_lock);
        spin_unlock_irq(&device->resource->req_lock);
 
        if (r == SS_SUCCESS) {
+               wake_up(&device->al_wait); /* for lc_reset() above */
                /* reset rs_last_bcast when a resync or verify is started,
                 * to deal with potential jiffies wrap. */
                device->rs_last_bcast = jiffies - HZ;
@@ -1746,8 +1774,12 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                     drbd_conn_str(ns.conn),
                     (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
                     (unsigned long) device->rs_total);
-               if (side == C_SYNC_TARGET)
+               if (side == C_SYNC_TARGET) {
                        device->bm_resync_fo = 0;
+                       device->use_csums = use_checksum_based_resync(connection, device);
+               } else {
+                       device->use_csums = 0;
+               }
 
                /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
                 * with w_send_oos, or the sync target will get confused as to
@@ -1756,12 +1788,10 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                 * drbd_resync_finished from here in that case.
                 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
                 * and from after_state_ch otherwise. */
-               if (side == C_SYNC_SOURCE &&
-                   first_peer_device(device)->connection->agreed_pro_version < 96)
-                       drbd_gen_and_send_sync_uuid(first_peer_device(device));
+               if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96)
+                       drbd_gen_and_send_sync_uuid(peer_device);
 
-               if (first_peer_device(device)->connection->agreed_pro_version < 95 &&
-                   device->rs_total == 0) {
+               if (connection->agreed_pro_version < 95 && device->rs_total == 0) {
                        /* This still has a race (about when exactly the peers
                         * detect connection loss) that can lead to a full sync
                         * on next handshake. In 8.3.9 we fixed this with explicit
@@ -1777,7 +1807,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                                int timeo;
 
                                rcu_read_lock();
-                               nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+                               nc = rcu_dereference(connection->net_conf);
                                timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
                                rcu_read_unlock();
                                schedule_timeout_interruptible(timeo);
@@ -1799,10 +1829,165 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
        mutex_unlock(device->state_mutex);
 }
 
+static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done)
+{
+       struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
+       device->rs_last_bcast = jiffies;
+
+       if (!get_ldev(device))
+               return;
+
+       drbd_bm_write_lazy(device, 0);
+       if (resync_done && is_sync_state(device->state.conn))
+               drbd_resync_finished(device);
+
+       drbd_bcast_event(device, &sib);
+       /* update timestamp, in case it took a while to write out stuff */
+       device->rs_last_bcast = jiffies;
+       put_ldev(device);
+}
+
+static void drbd_ldev_destroy(struct drbd_device *device)
+{
+       lc_destroy(device->resync);
+       device->resync = NULL;
+       lc_destroy(device->act_log);
+       device->act_log = NULL;
+       __no_warn(local,
+               drbd_free_ldev(device->ldev);
+               device->ldev = NULL;);
+       clear_bit(GOING_DISKLESS, &device->flags);
+       wake_up(&device->misc_wait);
+}
+
+static void go_diskless(struct drbd_device *device)
+{
+       D_ASSERT(device, device->state.disk == D_FAILED);
+       /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
+        * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
+        * the protected members anymore, though, so once put_ldev reaches zero
+        * again, it will be safe to free them. */
+
+       /* Try to write changed bitmap pages; read errors may have just
+        * set some bits outside the area covered by the activity log.
+        *
+        * If we have an IO error during the bitmap writeout,
+        * we will want a full sync next time, just in case.
+        * (Do we want a specific meta data flag for this?)
+        *
+        * If that does not make it to stable storage either,
+        * we cannot do anything about that anymore.
+        *
+        * We still need to check if both bitmap and ldev are present, we may
+        * end up here after a failed attach, before ldev was even assigned.
+        */
+       if (device->bitmap && device->ldev) {
+               /* An interrupted resync or similar is allowed to recount bits
+                * while we detach.
+                * No modifications are expected anymore, though.
+                */
+               if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
+                                       "detach", BM_LOCKED_TEST_ALLOWED)) {
+                       if (test_bit(WAS_READ_ERROR, &device->flags)) {
+                               drbd_md_set_flag(device, MDF_FULL_SYNC);
+                               drbd_md_sync(device);
+                       }
+               }
+       }
+
+       drbd_force_state(device, NS(disk, D_DISKLESS));
+}
+
+static int do_md_sync(struct drbd_device *device)
+{
+       drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+       drbd_md_sync(device);
+       return 0;
+}
+
+/* only called from drbd_worker thread, no locking */
+void __update_timing_details(
+               struct drbd_thread_timing_details *tdp,
+               unsigned int *cb_nr,
+               void *cb,
+               const char *fn, const unsigned int line)
+{
+       unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST;
+       struct drbd_thread_timing_details *td = tdp + i;
+
+       td->start_jif = jiffies;
+       td->cb_addr = cb;
+       td->caller_fn = fn;
+       td->line = line;
+       td->cb_nr = *cb_nr;
+
+       i = (i+1) % DRBD_THREAD_DETAILS_HIST;
+       td = tdp + i;
+       memset(td, 0, sizeof(*td));
+
+       ++(*cb_nr);
+}
+
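
__update_timing_details() keeps a fixed-size ring of recent worker callbacks and zeroes the slot ahead of the cursor, so the single all-zero entry always marks where the history wraps. A standalone version of that ring, with hypothetical names and an illustrative size of 16 entries:

    #include <string.h>
    #include <time.h>

    #define HIST 16                         /* illustrative ring size */

    struct timing_detail {
            unsigned long start;
            const char *caller;
            unsigned int line;
            unsigned int nr;
    };

    /* Record one callback and clear the next slot, so the one all-zero
     * entry marks the wrap point of the history. */
    static void record(struct timing_detail *ring, unsigned int *nr,
                       const char *caller, unsigned int line)
    {
            unsigned int i = *nr % HIST;

            ring[i] = (struct timing_detail){
                    .start = (unsigned long)time(NULL),
                    .caller = caller,
                    .line = line,
                    .nr = *nr,
            };
            memset(&ring[(i + 1) % HIST], 0, sizeof(ring[0]));
            ++(*nr);
    }
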
+#define WORK_PENDING(work_bit, todo)   ((todo) & (1UL << (work_bit)))
+static void do_device_work(struct drbd_device *device, const unsigned long todo)
+{
+       if (WORK_PENDING(MD_SYNC, todo))
+               do_md_sync(device);
+       if (WORK_PENDING(RS_DONE, todo) ||
+           WORK_PENDING(RS_PROGRESS, todo))
+               update_on_disk_bitmap(device, WORK_PENDING(RS_DONE, todo));
+       if (WORK_PENDING(GO_DISKLESS, todo))
+               go_diskless(device);
+       if (WORK_PENDING(DESTROY_DISK, todo))
+               drbd_ldev_destroy(device);
+       if (WORK_PENDING(RS_START, todo))
+               do_start_resync(device);
+}
+
+#define DRBD_DEVICE_WORK_MASK  \
+       ((1UL << GO_DISKLESS)   \
+       |(1UL << DESTROY_DISK)  \
+       |(1UL << MD_SYNC)       \
+       |(1UL << RS_START)      \
+       |(1UL << RS_PROGRESS)   \
+       |(1UL << RS_DONE)       \
+       )
+
+static unsigned long get_work_bits(unsigned long *flags)
+{
+       unsigned long old, new;
+       do {
+               old = *flags;
+               new = old & ~DRBD_DEVICE_WORK_MASK;
+       } while (cmpxchg(flags, old, new) != old);
+       return old & DRBD_DEVICE_WORK_MASK;
+}
+
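
get_work_bits() claims and clears every pending device-work bit in one atomic step, so no bit can be handled twice even if two paths race; the kernel open-codes this as a cmpxchg() loop over the plain unsigned long flags word. In portable C11 the same claim-and-clear can be expressed with a single atomic_fetch_and (a sketch with illustrative bit names):

    #include <stdatomic.h>

    /* Illustrative work bits; stand-ins for MD_SYNC, RS_START, ... */
    #define WORK_MASK ((1UL << 0) | (1UL << 1) | (1UL << 2))

    /* Atomically clear the work bits and return those that were set,
     * so exactly one caller ends up owning each pending bit. */
    static unsigned long claim_work_bits(_Atomic unsigned long *flags)
    {
            return atomic_fetch_and(flags, ~WORK_MASK) & WORK_MASK;
    }
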
+static void do_unqueued_work(struct drbd_connection *connection)
+{
+       struct drbd_peer_device *peer_device;
+       int vnr;
+
+       rcu_read_lock();
+       idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+               struct drbd_device *device = peer_device->device;
+               unsigned long todo = get_work_bits(&device->flags);
+               if (!todo)
+                       continue;
+
+               kref_get(&device->kref);
+               rcu_read_unlock();
+               do_device_work(device, todo);
+               kref_put(&device->kref, drbd_destroy_device);
+               rcu_read_lock();
+       }
+       rcu_read_unlock();
+}
+
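
do_unqueued_work() pins each device with a kref so it can drop the RCU read lock around the actual work, then re-enters the read side to continue the walk. The shape of that pattern with POSIX primitives is sketched below; unlike the real code, where RCU plus the final kref_put() make removal safe, this sketch assumes objects are never removed from the list:

    #include <pthread.h>
    #include <stdatomic.h>

    struct obj {
            atomic_int refs;
            struct obj *next;
    };

    static pthread_rwlock_t list_lock = PTHREAD_RWLOCK_INITIALIZER;
    static struct obj *objects;

    /* Walk the list under the read lock, but pin each object with a
     * refcount and drop the lock around the (possibly slow) work. */
    static void for_each_object(void (*work)(struct obj *))
    {
            pthread_rwlock_rdlock(&list_lock);
            for (struct obj *o = objects; o; o = o->next) {
                    atomic_fetch_add(&o->refs, 1);
                    pthread_rwlock_unlock(&list_lock);
                    work(o);
                    atomic_fetch_sub(&o->refs, 1);
                    pthread_rwlock_rdlock(&list_lock);
            }
            pthread_rwlock_unlock(&list_lock);
    }
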
 static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
 {
        spin_lock_irq(&queue->q_lock);
-       list_splice_init(&queue->q, work_list);
+       list_splice_tail_init(&queue->q, work_list);
        spin_unlock_irq(&queue->q_lock);
        return !list_empty(work_list);
 }
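
Switching dequeue_work_batch() to list_splice_tail_init() means a freshly drained batch is appended after whatever the caller already holds, preserving strict FIFO order across batches. A minimal re-implementation of that O(1) splice outside the kernel, with illustrative names:

    #include <pthread.h>

    struct list_head { struct list_head *prev, *next; };
    #define LIST_HEAD_INIT(name) { &(name), &(name) }

    static int list_empty(const struct list_head *h)
    {
            return h->next == h;
    }

    /* Move every entry of @src to the tail of @dst and reinitialize
     * @src, in O(1): a minimal list_splice_tail_init(). */
    static void splice_tail_init(struct list_head *src, struct list_head *dst)
    {
            if (list_empty(src))
                    return;
            src->next->prev = dst->prev;
            dst->prev->next = src->next;
            src->prev->next = dst;
            dst->prev = src->prev;
            src->next = src->prev = src;
    }

    static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct list_head queue = LIST_HEAD_INIT(queue);

    /* Drain the shared queue into a caller-private batch in one step. */
    static int dequeue_batch(struct list_head *batch)
    {
            pthread_mutex_lock(&q_lock);
            splice_tail_init(&queue, batch);
            pthread_mutex_unlock(&q_lock);
            return !list_empty(batch);
    }
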
@@ -1851,7 +2036,7 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
                /* dequeue single item only,
                 * we still use drbd_queue_work_front() in some places */
                if (!list_empty(&connection->sender_work.q))
-                       list_move(connection->sender_work.q.next, work_list);
+                       list_splice_tail_init(&connection->sender_work.q, work_list);
                spin_unlock(&connection->sender_work.q_lock);   /* FIXME get rid of this one? */
                if (!list_empty(work_list) || signal_pending(current)) {
                        spin_unlock_irq(&connection->resource->req_lock);
@@ -1873,6 +2058,14 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
                if (send_barrier)
                        maybe_send_barrier(connection,
                                        connection->send.current_epoch_nr + 1);
+
+               if (test_bit(DEVICE_WORK_PENDING, &connection->flags))
+                       break;
+
+               /* drbd_send() may have called flush_signals() */
+               if (get_t_state(&connection->worker) != RUNNING)
+                       break;
+
                schedule();
                /* may be woken up for things other than new work, too,
                 * e.g. if the current epoch got closed.
@@ -1906,10 +2099,15 @@ int drbd_worker(struct drbd_thread *thi)
        while (get_t_state(thi) == RUNNING) {
                drbd_thread_current_set_cpu(thi);
 
-               /* as long as we use drbd_queue_work_front(),
-                * we may only dequeue single work items here, not batches. */
-               if (list_empty(&work_list))
+               if (list_empty(&work_list)) {
+                       update_worker_timing_details(connection, wait_for_work);
                        wait_for_work(connection, &work_list);
+               }
+
+               if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+                       update_worker_timing_details(connection, do_unqueued_work);
+                       do_unqueued_work(connection);
+               }
 
                if (signal_pending(current)) {
                        flush_signals(current);
@@ -1926,6 +2124,7 @@ int drbd_worker(struct drbd_thread *thi)
                while (!list_empty(&work_list)) {
                        w = list_first_entry(&work_list, struct drbd_work, list);
                        list_del_init(&w->list);
+                       update_worker_timing_details(connection, w->cb);
                        if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
                                continue;
                        if (connection->cstate >= C_WF_REPORT_PARAMS)
@@ -1934,13 +2133,18 @@ int drbd_worker(struct drbd_thread *thi)
        }
 
        do {
+               if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+                       update_worker_timing_details(connection, do_unqueued_work);
+                       do_unqueued_work(connection);
+               }
                while (!list_empty(&work_list)) {
                        w = list_first_entry(&work_list, struct drbd_work, list);
                        list_del_init(&w->list);
+                       update_worker_timing_details(connection, w->cb);
                        w->cb(w, 1);
                }
                dequeue_work_batch(&connection->sender_work, &work_list);
-       } while (!list_empty(&work_list));
+       } while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags));
 
        rcu_read_lock();
        idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
index b2c98c1bc037e8c7674722cba6b56f5e4a7383aa..623c84145b792b9ddaa852e45c2cdbf80b1f08d5 100644 (file)
@@ -42,6 +42,7 @@
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/workqueue.h>
 
 #include "rbd_types.h"
 
@@ -332,7 +333,10 @@ struct rbd_device {
 
        char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
 
+       struct list_head        rq_queue;       /* incoming rq queue */
        spinlock_t              lock;           /* queue, flags, open_count */
+       struct workqueue_struct *rq_wq;
+       struct work_struct      rq_work;
 
        struct rbd_image_header header;
        unsigned long           flags;          /* possibly lock protected */
@@ -514,7 +518,8 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
 
 static int rbd_dev_refresh(struct rbd_device *rbd_dev);
 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
-static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev);
+static int rbd_dev_header_info(struct rbd_device *rbd_dev);
+static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
                                        u64 snap_id);
 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
@@ -971,12 +976,6 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev,
        header->snap_names = snap_names;
        header->snap_sizes = snap_sizes;
 
-       /* Make sure mapping size is consistent with header info */
-
-       if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time)
-               if (rbd_dev->mapping.size != header->image_size)
-                       rbd_dev->mapping.size = header->image_size;
-
        return 0;
 out_2big:
        ret = -EIO;
@@ -1139,6 +1138,13 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
        rbd_dev->mapping.features = 0;
 }
 
+static void rbd_segment_name_free(const char *name)
+{
+       /* The explicit cast here is needed to drop the const qualifier */
+
+       kmem_cache_free(rbd_segment_name_cache, (void *)name);
+}
+
 static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
 {
        char *name;
@@ -1158,20 +1164,13 @@ static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
        if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
                pr_err("error formatting segment name for #%llu (%d)\n",
                        segment, ret);
-               kfree(name);
+               rbd_segment_name_free(name);
                name = NULL;
        }
 
        return name;
 }
 
-static void rbd_segment_name_free(const char *name)
-{
-       /* The explicit cast here is needed to drop the const qualifier */
-
-       kmem_cache_free(rbd_segment_name_cache, (void *)name);
-}
-
 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
 {
        u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
@@ -1371,7 +1370,7 @@ static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
                struct rbd_device *rbd_dev;
 
                rbd_dev = obj_request->img_request->rbd_dev;
-               rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
+               rbd_warn(rbd_dev, "obj_request %p already marked img_data",
                        obj_request);
        }
 }
@@ -1389,7 +1388,7 @@ static void obj_request_done_set(struct rbd_obj_request *obj_request)
 
                if (obj_request_img_data_test(obj_request))
                        rbd_dev = obj_request->img_request->rbd_dev;
-               rbd_warn(rbd_dev, "obj_request %p already marked done\n",
+               rbd_warn(rbd_dev, "obj_request %p already marked done",
                        obj_request);
        }
 }
@@ -1527,11 +1526,37 @@ static bool obj_request_type_valid(enum obj_request_type type)
 static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
                                struct rbd_obj_request *obj_request)
 {
-       dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
-
+       dout("%s %p\n", __func__, obj_request);
        return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
 }
 
+static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
+{
+       dout("%s %p\n", __func__, obj_request);
+       ceph_osdc_cancel_request(obj_request->osd_req);
+}
+
+/*
+ * Wait for an object request to complete.  If interrupted, cancel the
+ * underlying osd request.
+ */
+static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
+{
+       int ret;
+
+       dout("%s %p\n", __func__, obj_request);
+
+       ret = wait_for_completion_interruptible(&obj_request->completion);
+       if (ret < 0) {
+               dout("%s %p interrupted\n", __func__, obj_request);
+               rbd_obj_request_end(obj_request);
+               return ret;
+       }
+
+       dout("%s %p done\n", __func__, obj_request);
+       return 0;
+}
+
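
The reworked rbd_obj_request_wait() above turns an interrupted wait into an explicit cancellation instead of leaking an in-flight OSD request. The same wait-or-cancel shape with POSIX semaphores, as a sketch (cancel_op() is a hypothetical stand-in for ceph_osdc_cancel_request()):

    #include <errno.h>
    #include <semaphore.h>

    /* Wait for completion; if a signal interrupts the wait, cancel the
     * in-flight operation instead of leaking it. */
    static int wait_or_cancel(sem_t *done, void (*cancel_op)(void))
    {
            if (sem_wait(done) != 0) {
                    if (errno == EINTR) {
                            cancel_op();
                            return -EINTR;
                    }
                    return -errno;
            }
            return 0;
    }
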
 static void rbd_img_request_complete(struct rbd_img_request *img_request)
 {
 
@@ -1558,15 +1583,6 @@ static void rbd_img_request_complete(struct rbd_img_request *img_request)
                rbd_img_request_put(img_request);
 }
 
-/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
-
-static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
-{
-       dout("%s: obj %p\n", __func__, obj_request);
-
-       return wait_for_completion_interruptible(&obj_request->completion);
-}
-
 /*
  * The default/initial value for all image request flags is 0.  Each
  * is conditionally set to 1 at image request initialization time
@@ -1763,7 +1779,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
                rbd_osd_trivial_callback(obj_request);
                break;
        default:
-               rbd_warn(NULL, "%s: unsupported op %hu\n",
+               rbd_warn(NULL, "%s: unsupported op %hu",
                        obj_request->object_name, (unsigned short) opcode);
                break;
        }
@@ -1998,7 +2014,7 @@ static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
        if (!counter)
                rbd_dev_unparent(rbd_dev);
        else
-               rbd_warn(rbd_dev, "parent reference underflow\n");
+               rbd_warn(rbd_dev, "parent reference underflow");
 }
 
 /*
@@ -2028,7 +2044,7 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
        /* Image was flattened, but parent is not yet torn down */
 
        if (counter < 0)
-               rbd_warn(rbd_dev, "parent reference overflow\n");
+               rbd_warn(rbd_dev, "parent reference overflow");
 
        return false;
 }
@@ -2045,7 +2061,7 @@ static struct rbd_img_request *rbd_img_request_create(
 {
        struct rbd_img_request *img_request;
 
-       img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
+       img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
        if (!img_request)
                return NULL;
 
@@ -2161,11 +2177,11 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
        if (result) {
                struct rbd_device *rbd_dev = img_request->rbd_dev;
 
-               rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
+               rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
                        img_request_write_test(img_request) ? "write" : "read",
                        obj_request->length, obj_request->img_offset,
                        obj_request->offset);
-               rbd_warn(rbd_dev, "  result %d xferred %x\n",
+               rbd_warn(rbd_dev, "  result %d xferred %x",
                        result, xferred);
                if (!img_request->result)
                        img_request->result = result;
@@ -2946,154 +2962,135 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
        dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
                rbd_dev->header_name, (unsigned long long)notify_id,
                (unsigned int)opcode);
+
+       /*
+        * Until adequate refresh error handling is in place, there is
+        * not much we can do here, except warn.
+        *
+        * See http://tracker.ceph.com/issues/5040
+        */
        ret = rbd_dev_refresh(rbd_dev);
        if (ret)
-               rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
+               rbd_warn(rbd_dev, "refresh failed: %d", ret);
 
-       rbd_obj_notify_ack_sync(rbd_dev, notify_id);
+       ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
+       if (ret)
+               rbd_warn(rbd_dev, "notify_ack ret %d", ret);
 }
 
 /*
- * Initiate a watch request, synchronously.
+ * Send a watch or unwatch request and wait for the ack.  Return a
+ * request with a ref held on success, or an ERR_PTR on error.
  */
-static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
+static struct rbd_obj_request *rbd_obj_watch_request_helper(
+                                               struct rbd_device *rbd_dev,
+                                               bool watch)
 {
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct rbd_obj_request *obj_request;
        int ret;
 
-       rbd_assert(!rbd_dev->watch_event);
-       rbd_assert(!rbd_dev->watch_request);
-
-       ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
-                                    &rbd_dev->watch_event);
-       if (ret < 0)
-               return ret;
-
-       rbd_assert(rbd_dev->watch_event);
-
        obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
                                             OBJ_REQUEST_NODATA);
-       if (!obj_request) {
-               ret = -ENOMEM;
-               goto out_cancel;
-       }
+       if (!obj_request)
+               return ERR_PTR(-ENOMEM);
 
        obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
                                                  obj_request);
        if (!obj_request->osd_req) {
                ret = -ENOMEM;
-               goto out_put;
+               goto out;
        }
 
-       ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
-
        osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
-                             rbd_dev->watch_event->cookie, 0, 1);
+                             rbd_dev->watch_event->cookie, 0, watch);
        rbd_osd_req_format_write(obj_request);
 
+       if (watch)
+               ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
+
        ret = rbd_obj_request_submit(osdc, obj_request);
        if (ret)
-               goto out_linger;
+               goto out;
 
        ret = rbd_obj_request_wait(obj_request);
        if (ret)
-               goto out_linger;
+               goto out;
 
        ret = obj_request->result;
-       if (ret)
-               goto out_linger;
-
-       /*
-        * A watch request is set to linger, so the underlying osd
-        * request won't go away until we unregister it.  We retain
-        * a pointer to the object request during that time (in
-        * rbd_dev->watch_request), so we'll keep a reference to
-        * it.  We'll drop that reference (below) after we've
-        * unregistered it.
-        */
-       rbd_dev->watch_request = obj_request;
+       if (ret) {
+               if (watch)
+                       rbd_obj_request_end(obj_request);
+               goto out;
+       }
 
-       return 0;
+       return obj_request;
 
-out_linger:
-       ceph_osdc_unregister_linger_request(osdc, obj_request->osd_req);
-out_put:
+out:
        rbd_obj_request_put(obj_request);
-out_cancel:
-       ceph_osdc_cancel_event(rbd_dev->watch_event);
-       rbd_dev->watch_event = NULL;
-
-       return ret;
+       return ERR_PTR(ret);
 }
 
 /*
- * Tear down a watch request, synchronously.
+ * Initiate a watch request, synchronously.
  */
-static int __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
 {
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct rbd_obj_request *obj_request;
        int ret;
 
-       rbd_assert(rbd_dev->watch_event);
-       rbd_assert(rbd_dev->watch_request);
+       rbd_assert(!rbd_dev->watch_event);
+       rbd_assert(!rbd_dev->watch_request);
 
-       obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
-                                            OBJ_REQUEST_NODATA);
-       if (!obj_request) {
-               ret = -ENOMEM;
-               goto out_cancel;
-       }
+       ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
+                                    &rbd_dev->watch_event);
+       if (ret < 0)
+               return ret;
 
-       obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
-                                                 obj_request);
-       if (!obj_request->osd_req) {
-               ret = -ENOMEM;
-               goto out_put;
+       obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
+       if (IS_ERR(obj_request)) {
+               ceph_osdc_cancel_event(rbd_dev->watch_event);
+               rbd_dev->watch_event = NULL;
+               return PTR_ERR(obj_request);
        }
 
-       osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
-                             rbd_dev->watch_event->cookie, 0, 0);
-       rbd_osd_req_format_write(obj_request);
-
-       ret = rbd_obj_request_submit(osdc, obj_request);
-       if (ret)
-               goto out_put;
+       /*
+        * A watch request is set to linger, so the underlying osd
+        * request won't go away until we unregister it.  We retain
+        * a pointer to the object request during that time (in
+        * rbd_dev->watch_request), so we'll keep a reference to it.
+        * We'll drop that reference after we've unregistered it in
+        * rbd_dev_header_unwatch_sync().
+        */
+       rbd_dev->watch_request = obj_request;
 
-       ret = rbd_obj_request_wait(obj_request);
-       if (ret)
-               goto out_put;
+       return 0;
+}
 
-       ret = obj_request->result;
-       if (ret)
-               goto out_put;
+/*
+ * Tear down a watch request, synchronously.
+ */
+static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+{
+       struct rbd_obj_request *obj_request;
 
-       /* We have successfully torn down the watch request */
+       rbd_assert(rbd_dev->watch_event);
+       rbd_assert(rbd_dev->watch_request);
 
-       ceph_osdc_unregister_linger_request(osdc,
-                                           rbd_dev->watch_request->osd_req);
+       rbd_obj_request_end(rbd_dev->watch_request);
        rbd_obj_request_put(rbd_dev->watch_request);
        rbd_dev->watch_request = NULL;
 
-out_put:
-       rbd_obj_request_put(obj_request);
-out_cancel:
+       obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
+       if (!IS_ERR(obj_request))
+               rbd_obj_request_put(obj_request);
+       else
+               rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
+                        PTR_ERR(obj_request));
+
        ceph_osdc_cancel_event(rbd_dev->watch_event);
        rbd_dev->watch_event = NULL;
-
-       return ret;
-}
-
-static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
-{
-       int ret;
-
-       ret = __rbd_dev_header_unwatch_sync(rbd_dev);
-       if (ret) {
-               rbd_warn(rbd_dev, "unable to tear down watch request: %d\n",
-                        ret);
-       }
 }
 
 /*
@@ -3183,102 +3180,129 @@ out:
        return ret;
 }
 
-static void rbd_request_fn(struct request_queue *q)
-               __releases(q->queue_lock) __acquires(q->queue_lock)
+static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
 {
-       struct rbd_device *rbd_dev = q->queuedata;
-       struct request *rq;
+       struct rbd_img_request *img_request;
+       u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
+       u64 length = blk_rq_bytes(rq);
+       bool wr = rq_data_dir(rq) == WRITE;
        int result;
 
-       while ((rq = blk_fetch_request(q))) {
-               bool write_request = rq_data_dir(rq) == WRITE;
-               struct rbd_img_request *img_request;
-               u64 offset;
-               u64 length;
+       /* Ignore/skip any zero-length requests */
 
-               /* Ignore any non-FS requests that filter through. */
+       if (!length) {
+               dout("%s: zero-length request\n", __func__);
+               result = 0;
+               goto err_rq;
+       }
 
-               if (rq->cmd_type != REQ_TYPE_FS) {
-                       dout("%s: non-fs request type %d\n", __func__,
-                               (int) rq->cmd_type);
-                       __blk_end_request_all(rq, 0);
-                       continue;
+       /* Disallow writes to a read-only device */
+
+       if (wr) {
+               if (rbd_dev->mapping.read_only) {
+                       result = -EROFS;
+                       goto err_rq;
                }
+               rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
+       }
 
-               /* Ignore/skip any zero-length requests */
+       /*
+        * Quit early if the mapped snapshot no longer exists.  It's
+        * still possible the snapshot will have disappeared by the
+        * time our request arrives at the osd, but there's no sense in
+        * sending it if we already know.
+        */
+       if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
+               dout("request for non-existent snapshot");
+               rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
+               result = -ENXIO;
+               goto err_rq;
+       }
 
-               offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
-               length = (u64) blk_rq_bytes(rq);
+       if (offset && length > U64_MAX - offset + 1) {
+               rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
+                        length);
+               result = -EINVAL;
+               goto err_rq;    /* Shouldn't happen */
+       }
 
-               if (!length) {
-                       dout("%s: zero-length request\n", __func__);
-                       __blk_end_request_all(rq, 0);
-                       continue;
-               }
+       if (offset + length > rbd_dev->mapping.size) {
+               rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
+                        length, rbd_dev->mapping.size);
+               result = -EIO;
+               goto err_rq;
+       }
 
-               spin_unlock_irq(q->queue_lock);
+       img_request = rbd_img_request_create(rbd_dev, offset, length, wr);
+       if (!img_request) {
+               result = -ENOMEM;
+               goto err_rq;
+       }
+       img_request->rq = rq;
 
-               /* Disallow writes to a read-only device */
+       result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, rq->bio);
+       if (result)
+               goto err_img_request;
 
-               if (write_request) {
-                       result = -EROFS;
-                       if (rbd_dev->mapping.read_only)
-                               goto end_request;
-                       rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
-               }
+       result = rbd_img_request_submit(img_request);
+       if (result)
+               goto err_img_request;
 
-               /*
-                * Quit early if the mapped snapshot no longer
-                * exists.  It's still possible the snapshot will
-                * have disappeared by the time our request arrives
-                * at the osd, but there's no sense in sending it if
-                * we already know.
-                */
-               if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
-                       dout("request for non-existent snapshot");
-                       rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
-                       result = -ENXIO;
-                       goto end_request;
-               }
+       return;
 
-               result = -EINVAL;
-               if (offset && length > U64_MAX - offset + 1) {
-                       rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
-                               offset, length);
-                       goto end_request;       /* Shouldn't happen */
-               }
+err_img_request:
+       rbd_img_request_put(img_request);
+err_rq:
+       if (result)
+               rbd_warn(rbd_dev, "%s %llx at %llx result %d",
+                        wr ? "write" : "read", length, offset, result);
+       blk_end_request_all(rq, result);
+}
 
-               result = -EIO;
-               if (offset + length > rbd_dev->mapping.size) {
-                       rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
-                               offset, length, rbd_dev->mapping.size);
-                       goto end_request;
-               }
+static void rbd_request_workfn(struct work_struct *work)
+{
+       struct rbd_device *rbd_dev =
+           container_of(work, struct rbd_device, rq_work);
+       struct request *rq, *next;
+       LIST_HEAD(requests);
 
-               result = -ENOMEM;
-               img_request = rbd_img_request_create(rbd_dev, offset, length,
-                                                       write_request);
-               if (!img_request)
-                       goto end_request;
+       spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
+       list_splice_init(&rbd_dev->rq_queue, &requests);
+       spin_unlock_irq(&rbd_dev->lock);
 
-               img_request->rq = rq;
+       list_for_each_entry_safe(rq, next, &requests, queuelist) {
+               list_del_init(&rq->queuelist);
+               rbd_handle_request(rbd_dev, rq);
+       }
+}
 
-               result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
-                                               rq->bio);
-               if (!result)
-                       result = rbd_img_request_submit(img_request);
-               if (result)
-                       rbd_img_request_put(img_request);
-end_request:
-               spin_lock_irq(q->queue_lock);
-               if (result < 0) {
-                       rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
-                               write_request ? "write" : "read",
-                               length, offset, result);
-
-                       __blk_end_request_all(rq, result);
+/*
+ * Called with q->queue_lock held and interrupts disabled, possibly on
+ * the way to schedule().  Do not sleep here!
+ */
+static void rbd_request_fn(struct request_queue *q)
+{
+       struct rbd_device *rbd_dev = q->queuedata;
+       struct request *rq;
+       int queued = 0;
+
+       rbd_assert(rbd_dev);
+
+       while ((rq = blk_fetch_request(q))) {
+               /* Ignore any non-FS requests that filter through. */
+               if (rq->cmd_type != REQ_TYPE_FS) {
+                       dout("%s: non-fs request type %d\n", __func__,
+                               (int) rq->cmd_type);
+                       __blk_end_request_all(rq, 0);
+                       continue;
                }
+
+               list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
+               queued++;
        }
+
+       if (queued)
+               queue_work(rbd_dev->rq_wq, &rbd_dev->rq_work);
 }
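
rbd_request_fn() runs with queue_lock held and interrupts off, so all it may do is move requests onto rq_queue and kick the workqueue; rbd_request_workfn() then does the sleepable work. That split in miniature with pthreads (illustrative; handle() stands in for rbd_handle_request()):

    #include <pthread.h>

    struct req { struct req *next; };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t kick = PTHREAD_COND_INITIALIZER;
    static struct req *pending;             /* LIFO, for brevity */

    static void handle(struct req *rq)
    {
            (void)rq;                       /* sleepable per-request work */
    }

    /* Fast path: only enqueue and signal, never block or allocate. */
    static void submit(struct req *rq)
    {
            pthread_mutex_lock(&lock);
            rq->next = pending;
            pending = rq;
            pthread_cond_signal(&kick);
            pthread_mutex_unlock(&lock);
    }

    /* Worker: pick requests up outside the producer's critical section. */
    static void *worker(void *unused)
    {
            (void)unused;
            for (;;) {
                    struct req *rq;

                    pthread_mutex_lock(&lock);
                    while (!pending)
                            pthread_cond_wait(&kick, &lock);
                    rq = pending;
                    pending = rq->next;
                    pthread_mutex_unlock(&lock);

                    handle(rq);
            }
            return NULL;
    }
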
 
 /*
@@ -3517,24 +3541,37 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
        u64 mapping_size;
        int ret;
 
-       rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
        down_write(&rbd_dev->header_rwsem);
        mapping_size = rbd_dev->mapping.size;
-       if (rbd_dev->image_format == 1)
-               ret = rbd_dev_v1_header_info(rbd_dev);
-       else
-               ret = rbd_dev_v2_header_info(rbd_dev);
 
-       /* If it's a mapped snapshot, validate its EXISTS flag */
+       ret = rbd_dev_header_info(rbd_dev);
+       if (ret)
+               goto out;
+
+       /*
+        * If there is a parent, see if it has disappeared due to the
+        * mapped image getting flattened.
+        */
+       if (rbd_dev->parent) {
+               ret = rbd_dev_v2_parent_info(rbd_dev);
+               if (ret)
+                       goto out;
+       }
+
+       if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
+               if (rbd_dev->mapping.size != rbd_dev->header.image_size)
+                       rbd_dev->mapping.size = rbd_dev->header.image_size;
+       } else {
+               /* validate mapped snapshot's EXISTS flag */
+               rbd_exists_validate(rbd_dev);
+       }
 
-       rbd_exists_validate(rbd_dev);
+out:
        up_write(&rbd_dev->header_rwsem);
 
-       if (mapping_size != rbd_dev->mapping.size) {
+       if (mapping_size != rbd_dev->mapping.size)
                rbd_dev_update_size(rbd_dev);
-       }
 
-       return ret;
+       return ret;
 }
 
 static int rbd_init_disk(struct rbd_device *rbd_dev)
@@ -3696,46 +3733,36 @@ static ssize_t rbd_snap_show(struct device *dev,
 }
 
 /*
- * For an rbd v2 image, shows the pool id, image id, and snapshot id
- * for the parent image.  If there is no parent, simply shows
- * "(no parent image)".
+ * For a v2 image, shows the chain of parent images, separated by empty
+ * lines.  For v1 images or if there is no parent, shows "(no parent
+ * image)".
  */
 static ssize_t rbd_parent_show(struct device *dev,
-                            struct device_attribute *attr,
-                            char *buf)
+                              struct device_attribute *attr,
+                              char *buf)
 {
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
-       struct rbd_spec *spec = rbd_dev->parent_spec;
-       int count;
-       char *bufp = buf;
+       ssize_t count = 0;
 
-       if (!spec)
+       if (!rbd_dev->parent)
                return sprintf(buf, "(no parent image)\n");
 
-       count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
-                       (unsigned long long) spec->pool_id, spec->pool_name);
-       if (count < 0)
-               return count;
-       bufp += count;
-
-       count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
-                       spec->image_name ? spec->image_name : "(unknown)");
-       if (count < 0)
-               return count;
-       bufp += count;
-
-       count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
-                       (unsigned long long) spec->snap_id, spec->snap_name);
-       if (count < 0)
-               return count;
-       bufp += count;
-
-       count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
-       if (count < 0)
-               return count;
-       bufp += count;
+       for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
+               struct rbd_spec *spec = rbd_dev->parent_spec;
+
+               count += sprintf(&buf[count], "%s"
+                           "pool_id %llu\npool_name %s\n"
+                           "image_id %s\nimage_name %s\n"
+                           "snap_id %llu\nsnap_name %s\n"
+                           "overlap %llu\n",
+                           !count ? "" : "\n", /* first? */
+                           spec->pool_id, spec->pool_name,
+                           spec->image_id, spec->image_name ?: "(unknown)",
+                           spec->snap_id, spec->snap_name,
+                           rbd_dev->parent_overlap);
+       }
 
-       return (ssize_t) (bufp - buf);
+       return count;
 }
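
The new rbd_parent_show() walks the whole parent chain, appending one stanza per ancestor and separating stanzas with a blank line. The same accumulation idiom, sketched with snprintf() rather than the kernel's bare sprintf() into the page-sized sysfs buffer (hypothetical struct; assumes the chain fits in the buffer):

    #include <stdio.h>

    struct img { struct img *parent; const char *name; };

    /* Append one stanza per ancestor, with a blank line between
     * stanzas, the way rbd_parent_show() builds its output. */
    static int show_parents(const struct img *dev, char *buf, size_t size)
    {
            int count = 0;

            for (const struct img *i = dev->parent;
                 i && (size_t)count < size; i = i->parent)
                    count += snprintf(buf + count, size - (size_t)count,
                                      "%sname %s\n",
                                      count ? "\n" : "", i->name);
            return count;
    }
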
 
 static ssize_t rbd_image_refresh(struct device *dev,
@@ -3748,9 +3775,9 @@ static ssize_t rbd_image_refresh(struct device *dev,
 
        ret = rbd_dev_refresh(rbd_dev);
        if (ret)
-               rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret);
+               return ret;
 
-       return ret < 0 ? ret : size;
+       return size;
 }
 
 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
@@ -3822,6 +3849,9 @@ static struct rbd_spec *rbd_spec_alloc(void)
        spec = kzalloc(sizeof (*spec), GFP_KERNEL);
        if (!spec)
                return NULL;
+
+       spec->pool_id = CEPH_NOPOOL;
+       spec->snap_id = CEPH_NOSNAP;
        kref_init(&spec->kref);
 
        return spec;
@@ -3848,6 +3878,8 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
                return NULL;
 
        spin_lock_init(&rbd_dev->lock);
+       INIT_LIST_HEAD(&rbd_dev->rq_queue);
+       INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn);
        rbd_dev->flags = 0;
        atomic_set(&rbd_dev->parent_ref, 0);
        INIT_LIST_HEAD(&rbd_dev->node);
@@ -4021,7 +4053,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
                goto out_err;
        }
 
-       snapid = cpu_to_le64(CEPH_NOSNAP);
+       snapid = cpu_to_le64(rbd_dev->spec->snap_id);
        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
                                "rbd", "get_parent",
                                &snapid, sizeof (snapid),
@@ -4059,7 +4091,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 
        ret = -EIO;
        if (pool_id > (u64)U32_MAX) {
-               rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
+               rbd_warn(NULL, "parent pool id too large (%llu > %u)",
                        (unsigned long long)pool_id, U32_MAX);
                goto out_err;
        }
@@ -4083,6 +4115,8 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
                parent_spec->snap_id = snap_id;
                rbd_dev->parent_spec = parent_spec;
                parent_spec = NULL;     /* rbd_dev now owns this */
+       } else {
+               kfree(image_id);
        }
 
        /*
@@ -4110,8 +4144,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
                         * overlap is zero we just pretend there was
                         * no parent image.
                         */
-                       rbd_warn(rbd_dev, "ignoring parent of "
-                                               "clone with overlap 0\n");
+                       rbd_warn(rbd_dev, "ignoring parent with overlap 0");
                }
        }
 out:
@@ -4279,18 +4312,38 @@ static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
 }
 
 /*
- * When an rbd image has a parent image, it is identified by the
- * pool, image, and snapshot ids (not names).  This function fills
- * in the names for those ids.  (It's OK if we can't figure out the
- * name for an image id, but the pool and snapshot ids should always
- * exist and have names.)  All names in an rbd spec are dynamically
- * allocated.
+ * An image being mapped will have everything but the snap id.
+ */
+static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
+{
+       struct rbd_spec *spec = rbd_dev->spec;
+
+       rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
+       rbd_assert(spec->image_id && spec->image_name);
+       rbd_assert(spec->snap_name);
+
+       if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
+               u64 snap_id;
+
+               snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
+               if (snap_id == CEPH_NOSNAP)
+                       return -ENOENT;
+
+               spec->snap_id = snap_id;
+       } else {
+               spec->snap_id = CEPH_NOSNAP;
+       }
+
+       return 0;
+}
+
+/*
+ * A parent image will have all ids but none of the names.
  *
- * When an image being mapped (not a parent) is probed, we have the
- * pool name and pool id, image name and image id, and the snapshot
- * name.  The only thing we're missing is the snapshot id.
+ * All names in an rbd spec are dynamically allocated.  It's OK if we
+ * can't figure out the name for an image id.
  */
-static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
+static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
 {
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct rbd_spec *spec = rbd_dev->spec;
@@ -4299,24 +4352,9 @@ static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
        const char *snap_name;
        int ret;
 
-       /*
-        * An image being mapped will have the pool name (etc.), but
-        * we need to look up the snapshot id.
-        */
-       if (spec->pool_name) {
-               if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
-                       u64 snap_id;
-
-                       snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
-                       if (snap_id == CEPH_NOSNAP)
-                               return -ENOENT;
-                       spec->snap_id = snap_id;
-               } else {
-                       spec->snap_id = CEPH_NOSNAP;
-               }
-
-               return 0;
-       }
+       rbd_assert(spec->pool_id != CEPH_NOPOOL);
+       rbd_assert(spec->image_id);
+       rbd_assert(spec->snap_id != CEPH_NOSNAP);
 
        /* Get the pool name; we have to make our own copy of this */
 
@@ -4335,7 +4373,7 @@ static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
        if (!image_name)
                rbd_warn(rbd_dev, "unable to get image name");
 
-       /* Look up the snapshot name, and make a copy */
+       /* Fetch the snapshot name */
 
        snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
        if (IS_ERR(snap_name)) {
@@ -4348,10 +4386,10 @@ static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
        spec->snap_name = snap_name;
 
        return 0;
+
 out_err:
        kfree(image_name);
        kfree(pool_name);
-
        return ret;
 }
 
@@ -4483,43 +4521,22 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
                        return ret;
        }
 
-       /*
-        * If the image supports layering, get the parent info.  We
-        * need to probe the first time regardless.  Thereafter we
-        * only need to if there's a parent, to see if it has
-        * disappeared due to the mapped image getting flattened.
-        */
-       if (rbd_dev->header.features & RBD_FEATURE_LAYERING &&
-                       (first_time || rbd_dev->parent_spec)) {
-               bool warn;
-
-               ret = rbd_dev_v2_parent_info(rbd_dev);
-               if (ret)
-                       return ret;
-
-               /*
-                * Print a warning if this is the initial probe and
-                * the image has a parent.  Don't print it if the
-                * image now being probed is itself a parent.  We
-                * can tell at this point because we won't know its
-                * pool name yet (just its pool id).
-                */
-               warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name;
-               if (first_time && warn)
-                       rbd_warn(rbd_dev, "WARNING: kernel layering "
-                                       "is EXPERIMENTAL!");
-       }
-
-       if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
-               if (rbd_dev->mapping.size != rbd_dev->header.image_size)
-                       rbd_dev->mapping.size = rbd_dev->header.image_size;
-
        ret = rbd_dev_v2_snap_context(rbd_dev);
        dout("rbd_dev_v2_snap_context returned %d\n", ret);
 
        return ret;
 }
 
+static int rbd_dev_header_info(struct rbd_device *rbd_dev)
+{
+       rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+
+       if (rbd_dev->image_format == 1)
+               return rbd_dev_v1_header_info(rbd_dev);
+
+       return rbd_dev_v2_header_info(rbd_dev);
+}
+
 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
 {
        struct device *dev;
@@ -5066,12 +5083,17 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
        ret = rbd_dev_mapping_set(rbd_dev);
        if (ret)
                goto err_out_disk;
+
        set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
        set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
 
+       rbd_dev->rq_wq = alloc_workqueue(rbd_dev->disk->disk_name, 0, 0);
+       if (!rbd_dev->rq_wq) {
+               ret = -ENOMEM;
+               goto err_out_mapping;
+       }
+
        ret = rbd_bus_add_dev(rbd_dev);
        if (ret)
-               goto err_out_mapping;
+               goto err_out_workqueue;
 
        /* Everything's ready.  Announce the disk to the world. */
 
@@ -5083,6 +5105,9 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
 
        return ret;
 
+err_out_workqueue:
+       destroy_workqueue(rbd_dev->rq_wq);
+       rbd_dev->rq_wq = NULL;
 err_out_mapping:
        rbd_dev_mapping_clear(rbd_dev);
 err_out_disk:
@@ -5155,8 +5180,6 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
        ret = rbd_dev_image_id(rbd_dev);
        if (ret)
                return ret;
-       rbd_assert(rbd_dev->spec->image_id);
-       rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 
        ret = rbd_dev_header_name(rbd_dev);
        if (ret)
@@ -5168,25 +5191,45 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
                        goto out_header_name;
        }
 
-       if (rbd_dev->image_format == 1)
-               ret = rbd_dev_v1_header_info(rbd_dev);
-       else
-               ret = rbd_dev_v2_header_info(rbd_dev);
+       ret = rbd_dev_header_info(rbd_dev);
        if (ret)
                goto err_out_watch;
 
-       ret = rbd_dev_spec_update(rbd_dev);
+       /*
+        * If this image is the one being mapped, we have pool name and
+        * id, image name and id, and snap name - need to fill snap id.
+        * Otherwise this is a parent image, identified by pool, image
+        * and snap ids - need to fill in names for those ids.
+        */
+       if (mapping)
+               ret = rbd_spec_fill_snap_id(rbd_dev);
+       else
+               ret = rbd_spec_fill_names(rbd_dev);
        if (ret)
                goto err_out_probe;
 
+       if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
+               ret = rbd_dev_v2_parent_info(rbd_dev);
+               if (ret)
+                       goto err_out_probe;
+
+               /*
+                * Need to warn users if this image is the one being
+                * mapped and has a parent.
+                */
+               if (mapping && rbd_dev->parent_spec)
+                       rbd_warn(rbd_dev,
+                                "WARNING: kernel layering is EXPERIMENTAL!");
+       }
+
        ret = rbd_dev_probe_parent(rbd_dev);
        if (ret)
                goto err_out_probe;
 
        dout("discovered format %u image, header name is %s\n",
                rbd_dev->image_format, rbd_dev->header_name);
-
        return 0;
+
 err_out_probe:
        rbd_dev_unprobe(rbd_dev);
 err_out_watch:
@@ -5199,9 +5242,6 @@ err_out_format:
        rbd_dev->image_format = 0;
        kfree(rbd_dev->spec->image_id);
        rbd_dev->spec->image_id = NULL;
-
-       dout("probe failed, returning %d\n", ret);
-
        return ret;
 }
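
The mapping/parent split above reduces to one dispatch. A minimal sketch, assuming only the two helpers this patch adds (the wrapper itself is hypothetical, not part of the patch):

/*
 * Hypothetical summary of the probe-time rule: a mapped image knows its
 * names and needs the snap id filled in; a parent knows its ids and
 * needs the names filled in.
 */
static int rbd_spec_fill(struct rbd_device *rbd_dev, bool mapping)
{
	return mapping ? rbd_spec_fill_snap_id(rbd_dev)
		       : rbd_spec_fill_names(rbd_dev);
}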
 
@@ -5243,7 +5283,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
        /* The ceph file layout needs to fit pool id in 32 bits */
 
        if (spec->pool_id > (u64)U32_MAX) {
-               rbd_warn(NULL, "pool id too large (%llu > %u)\n",
+               rbd_warn(NULL, "pool id too large (%llu > %u)",
                                (unsigned long long)spec->pool_id, U32_MAX);
                rc = -EIO;
                goto err_out_client;
@@ -5314,6 +5354,7 @@ static void rbd_dev_device_release(struct device *dev)
 {
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
+       destroy_workqueue(rbd_dev->rq_wq);
        rbd_free_disk(rbd_dev);
        clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
        rbd_dev_mapping_clear(rbd_dev);
index f63d358f3d933be8b2ec7a7eec0dbaec296796be..0a581400de0f4006e95de9e0f289d382608eb517 100644 (file)
 #include <linux/numa.h>
 
 #define PART_BITS 4
+#define VQ_NAME_LEN 16
 
 static int major;
 static DEFINE_IDA(vd_index_ida);
 
 static struct workqueue_struct *virtblk_wq;
 
+struct virtio_blk_vq {
+       struct virtqueue *vq;
+       spinlock_t lock;
+       char name[VQ_NAME_LEN];
+} ____cacheline_aligned_in_smp;
+
 struct virtio_blk
 {
        struct virtio_device *vdev;
-       struct virtqueue *vq;
-       spinlock_t vq_lock;
 
        /* The disk structure for the kernel. */
        struct gendisk *disk;
@@ -47,6 +52,10 @@ struct virtio_blk
 
        /* Ida index - used to track minor number allocations. */
        int index;
+
+       /* num of vqs */
+       int num_vqs;
+       struct virtio_blk_vq *vqs;
 };
 
 struct virtblk_req
@@ -133,14 +142,15 @@ static void virtblk_done(struct virtqueue *vq)
 {
        struct virtio_blk *vblk = vq->vdev->priv;
        bool req_done = false;
+       int qid = vq->index;
        struct virtblk_req *vbr;
        unsigned long flags;
        unsigned int len;
 
-       spin_lock_irqsave(&vblk->vq_lock, flags);
+       spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
        do {
                virtqueue_disable_cb(vq);
-               while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
+               while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
                        blk_mq_complete_request(vbr->req);
                        req_done = true;
                }
@@ -151,7 +161,7 @@ static void virtblk_done(struct virtqueue *vq)
        /* In case queue is stopped waiting for more buffers. */
        if (req_done)
                blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
-       spin_unlock_irqrestore(&vblk->vq_lock, flags);
+       spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 }
 
 static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
@@ -160,6 +170,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
        struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
        unsigned long flags;
        unsigned int num;
+       int qid = hctx->queue_num;
        const bool last = (req->cmd_flags & REQ_END) != 0;
        int err;
        bool notify = false;
@@ -202,12 +213,12 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
                        vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
        }
 
-       spin_lock_irqsave(&vblk->vq_lock, flags);
-       err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num);
+       spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
+       err = __virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
        if (err) {
-               virtqueue_kick(vblk->vq);
+               virtqueue_kick(vblk->vqs[qid].vq);
                blk_mq_stop_hw_queue(hctx);
-               spin_unlock_irqrestore(&vblk->vq_lock, flags);
+               spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
                /* Out of mem doesn't actually happen, since we fall back
                 * to direct descriptors */
                if (err == -ENOMEM || err == -ENOSPC)
@@ -215,12 +226,12 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
                return BLK_MQ_RQ_QUEUE_ERROR;
        }
 
-       if (last && virtqueue_kick_prepare(vblk->vq))
+       if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
                notify = true;
-       spin_unlock_irqrestore(&vblk->vq_lock, flags);
+       spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 
        if (notify)
-               virtqueue_notify(vblk->vq);
+               virtqueue_notify(vblk->vqs[qid].vq);
        return BLK_MQ_RQ_QUEUE_OK;
 }
 
@@ -377,12 +388,64 @@ static void virtblk_config_changed(struct virtio_device *vdev)
 static int init_vq(struct virtio_blk *vblk)
 {
        int err = 0;
+       int i;
+       vq_callback_t **callbacks;
+       const char **names;
+       struct virtqueue **vqs;
+       unsigned short num_vqs;
+       struct virtio_device *vdev = vblk->vdev;
+
+       err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
+                                  struct virtio_blk_config, num_queues,
+                                  &num_vqs);
+       if (err)
+               num_vqs = 1;
+
+       vblk->vqs = kmalloc(sizeof(*vblk->vqs) * num_vqs, GFP_KERNEL);
+       if (!vblk->vqs) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       err = -ENOMEM;
+       names = kmalloc(sizeof(*names) * num_vqs, GFP_KERNEL);
+       if (!names)
+               goto err_names;
+
+       callbacks = kmalloc(sizeof(*callbacks) * num_vqs, GFP_KERNEL);
+       if (!callbacks)
+               goto err_callbacks;
+
+       vqs = kmalloc(sizeof(*vqs) * num_vqs, GFP_KERNEL);
+       if (!vqs)
+               goto err_vqs;
 
-       /* We expect one virtqueue, for output. */
-       vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done, "requests");
-       if (IS_ERR(vblk->vq))
-               err = PTR_ERR(vblk->vq);
+       for (i = 0; i < num_vqs; i++) {
+               callbacks[i] = virtblk_done;
+               snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
+               names[i] = vblk->vqs[i].name;
+       }
+
+       /* Discover the virtqueues and register our completion callbacks. */
+       err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
+       if (err)
+               goto err_find_vqs;
 
+       for (i = 0; i < num_vqs; i++) {
+               spin_lock_init(&vblk->vqs[i].lock);
+               vblk->vqs[i].vq = vqs[i];
+       }
+       vblk->num_vqs = num_vqs;
+
+ err_find_vqs:
+       kfree(vqs);
+ err_vqs:
+       kfree(callbacks);
+ err_callbacks:
+       kfree(names);
+ err_names:
+       if (err)
+               kfree(vblk->vqs);
+ out:
        return err;
 }
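
With one virtqueue per blk-mq hardware context, queue lookup in the I/O paths is a plain index. A minimal sketch under that assumption (the helper name is hypothetical, not from this patch):

/*
 * hctx->queue_num is always a valid index here because probe sets
 * tag_set.nr_hw_queues to vblk->num_vqs.
 */
static inline struct virtio_blk_vq *virtblk_vq(struct virtio_blk *vblk,
					       struct blk_mq_hw_ctx *hctx)
{
	return &vblk->vqs[hctx->queue_num];
}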
 
@@ -551,7 +614,6 @@ static int virtblk_probe(struct virtio_device *vdev)
        err = init_vq(vblk);
        if (err)
                goto out_free_vblk;
-       spin_lock_init(&vblk->vq_lock);
 
        /* FIXME: How many partitions?  How long is a piece of string? */
        vblk->disk = alloc_disk(1 << PART_BITS);
@@ -562,7 +624,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 
        /* Default queue sizing is to fill the ring. */
        if (!virtblk_queue_depth) {
-               virtblk_queue_depth = vblk->vq->num_free;
+               virtblk_queue_depth = vblk->vqs[0].vq->num_free;
                /* ... but without indirect descs, we use 2 descs per req */
                if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
                        virtblk_queue_depth /= 2;
@@ -570,7 +632,6 @@ static int virtblk_probe(struct virtio_device *vdev)
 
        memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
        vblk->tag_set.ops = &virtio_mq_ops;
-       vblk->tag_set.nr_hw_queues = 1;
        vblk->tag_set.queue_depth = virtblk_queue_depth;
        vblk->tag_set.numa_node = NUMA_NO_NODE;
        vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
@@ -578,6 +639,7 @@ static int virtblk_probe(struct virtio_device *vdev)
                sizeof(struct virtblk_req) +
                sizeof(struct scatterlist) * sg_elems;
        vblk->tag_set.driver_data = vblk;
+       vblk->tag_set.nr_hw_queues = vblk->num_vqs;
 
        err = blk_mq_alloc_tag_set(&vblk->tag_set);
        if (err)
@@ -727,6 +789,7 @@ static void virtblk_remove(struct virtio_device *vdev)
        refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
        put_disk(vblk->disk);
        vdev->config->del_vqs(vdev);
+       kfree(vblk->vqs);
        kfree(vblk);
 
        /* Only free device id if we don't have any users */
@@ -777,7 +840,8 @@ static const struct virtio_device_id id_table[] = {
 static unsigned int features[] = {
        VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
        VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
-       VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE
+       VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
+       VIRTIO_BLK_F_MQ,
 };
 
 static struct virtio_driver virtio_blk = {
index 8bc422977b5b88120ad93c08ac27cbea22cc80ec..4ff86878727fc0130ebf9be394f3f832c50c315d 100644 (file)
@@ -499,8 +499,7 @@ static int __init g5_pm72_cpufreq_init(struct device_node *cpunode)
        }
 
        /* Lookup the i2c hwclock */
-       for (hwclock = NULL;
-            (hwclock = of_find_node_by_name(hwclock, "i2c-hwclock")) != NULL;){
+       for_each_node_by_name(hwclock, "i2c-hwclock") {
                const char *loc = of_get_property(hwclock,
                                "hwctrl-location", NULL);
                if (loc == NULL)
index 544f6d327ede5512d287bcc87e28663200f873a5..061407d5952052add3424f2978267d49e949af3a 100644 (file)
@@ -936,28 +936,14 @@ static int nx842_OF_upd(struct property *new_prop)
                goto error_out;
        }
 
-       /* Set ptr to new property if provided */
-       if (new_prop) {
-               /* Single property */
-               if (!strncmp(new_prop->name, "status", new_prop->length)) {
-                       status = new_prop;
-
-               } else if (!strncmp(new_prop->name, "ibm,max-sg-len",
-                                       new_prop->length)) {
-                       maxsglen = new_prop;
-
-               } else if (!strncmp(new_prop->name, "ibm,max-sync-cop",
-                                       new_prop->length)) {
-                       maxsyncop = new_prop;
-
-               } else {
-                       /*
-                        * Skip the update, the property being updated
-                        * has no impact.
-                        */
-                       goto out;
-               }
-       }
+       /*
+        * If this is a property update, there are only certain properties
+        * that we care about.  Bail if it isn't in the list below; the
+        * name must fail to match all three for the update to be skipped.
+        */
+       if (new_prop && (strncmp(new_prop->name, "status", new_prop->length) &&
+                        strncmp(new_prop->name, "ibm,max-sg-len", new_prop->length) &&
+                        strncmp(new_prop->name, "ibm,max-sync-cop", new_prop->length)))
+               goto out;
 
        /* Perform property updates */
        ret = nx842_OF_upd_status(new_devdata, status);
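
The filter above reads more naturally as a predicate. A hedged sketch (the helper is hypothetical and assumes property names are NUL-terminated, as device-tree names are):

static bool nx842_prop_is_relevant(const struct property *prop)
{
	return !strcmp(prop->name, "status") ||
	       !strcmp(prop->name, "ibm,max-sg-len") ||
	       !strcmp(prop->name, "ibm,max-sync-cop");
}
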
index 8f6afbf9ba54103b88dc888ee724e3ff459eb16c..9b1ea0ef59af655d4fc5fda6427fd1f5579dc041 100644 (file)
@@ -393,6 +393,22 @@ config XILINX_VDMA
          channels, Memory Mapped to Stream (MM2S) and Stream to
          Memory Mapped (S2MM) for the data transfers.
 
+config DMA_SUN6I
+       tristate "Allwinner A31 SoCs DMA support"
+       depends on MACH_SUN6I || COMPILE_TEST
+       depends on RESET_CONTROLLER
+       select DMA_ENGINE
+       select DMA_VIRTUAL_CHANNELS
+       help
+         Support for the DMA engine for Allwinner A31 SoCs.
+
+config NBPFAXI_DMA
+       tristate "Renesas Type-AXI NBPF DMA support"
+       select DMA_ENGINE
+       depends on ARM || COMPILE_TEST
+       help
+         Support for "Type-AXI" NBPF DMA IPs from Renesas.
+
 config DMA_ENGINE
        bool
 
@@ -406,6 +422,7 @@ config DMA_ACPI
 config DMA_OF
        def_bool y
        depends on OF
+       select DMA_ENGINE
 
 comment "DMA Clients"
        depends on DMA_ENGINE
index bd9e7fa928bd24a6b160a693e0b2c321492b7ee2..c6adb925f0b9c3f8b4178a3c079096960c884a69 100644 (file)
@@ -1,5 +1,5 @@
-ccflags-$(CONFIG_DMADEVICES_DEBUG)  := -DDEBUG
-ccflags-$(CONFIG_DMADEVICES_VDEBUG) += -DVERBOSE_DEBUG
+subdir-ccflags-$(CONFIG_DMADEVICES_DEBUG)  := -DDEBUG
+subdir-ccflags-$(CONFIG_DMADEVICES_VDEBUG) += -DVERBOSE_DEBUG
 
 obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
 obj-$(CONFIG_DMA_VIRTUAL_CHANNELS) += virt-dma.o
@@ -48,3 +48,5 @@ obj-$(CONFIG_FSL_EDMA) += fsl-edma.o
 obj-$(CONFIG_QCOM_BAM_DMA) += qcom_bam_dma.o
 obj-y += xilinx/
 obj-$(CONFIG_INTEL_MIC_X100_DMA) += mic_x100_dma.o
+obj-$(CONFIG_NBPFAXI_DMA) += nbpfaxi.o
+obj-$(CONFIG_DMA_SUN6I) += sun6i-dma.o
index 734ed0206cd5ead8f9cc554e94ff168b59fc3dae..b8045cd42ee13f67bf54169443b8b61a83062e36 100644 (file)
@@ -7,7 +7,6 @@ TODO for slave dma
        - imx-dma
        - imx-sdma
        - mxs-dma.c
-       - dw_dmac
        - intel_mid_dma
 4. Check other subsystems for dma drivers and merge/move to dmaengine
 5. Remove dma_slave_config's dma direction.
index 8114731a1c62d6450cd5f8bbd490f87c7800b1ea..e34024b000a4b42952d67ac02b84c6b6593112d7 100644 (file)
@@ -1040,7 +1040,7 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x,
 
                if (early_bytes) {
                        dev_vdbg(&pl08x->adev->dev,
-                               "%s byte width LLIs (remain 0x%08x)\n",
+                               "%s byte width LLIs (remain 0x%08zx)\n",
                                __func__, bd.remainder);
                        prep_byte_width_lli(pl08x, &bd, &cctl, early_bytes,
                                num_llis++, &total_bytes);
@@ -1653,7 +1653,7 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg(
 static struct dma_async_tx_descriptor *pl08x_prep_dma_cyclic(
                struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
                size_t period_len, enum dma_transfer_direction direction,
-               unsigned long flags, void *context)
+               unsigned long flags)
 {
        struct pl08x_dma_chan *plchan = to_pl08x_chan(chan);
        struct pl08x_driver_data *pl08x = plchan->host;
@@ -1662,7 +1662,7 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_cyclic(
        dma_addr_t slave_addr;
 
        dev_dbg(&pl08x->adev->dev,
-               "%s prepare cyclic transaction of %d/%d bytes %s %s\n",
+               "%s prepare cyclic transaction of %zd/%zd bytes %s %s\n",
                __func__, period_len, buf_len,
                direction == DMA_MEM_TO_DEV ? "to" : "from",
                plchan->name);
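
This hunk is one instance of a tree-wide cleanup: the unused void *context argument is dropped from every cyclic-prep callback touched below. For orientation, the callback shape after the change (the typedef name is ours, not the kernel's):

typedef struct dma_async_tx_descriptor *(*prep_dma_cyclic_fn)(
	struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
	size_t period_len, enum dma_transfer_direction direction,
	unsigned long flags);
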
index c13a3bb0f5943234fac845d3214309a3c754c63a..ca9dd261328357d079466b9d0659e43d57a0da4c 100644 (file)
@@ -294,14 +294,16 @@ static int atc_get_bytes_left(struct dma_chan *chan)
                        ret = -EINVAL;
                        goto out;
                }
-               atchan->remain_desc -= (desc_cur->lli.ctrla & ATC_BTSIZE_MAX)
-                                               << (desc_first->tx_width);
-               if (atchan->remain_desc < 0) {
+
+               count = (desc_cur->lli.ctrla & ATC_BTSIZE_MAX)
+                       << desc_first->tx_width;
+               if (atchan->remain_desc < count) {
                        ret = -EINVAL;
                        goto out;
-               } else {
-                       ret = atchan->remain_desc;
                }
+
+               atchan->remain_desc -= count;
+               ret = atchan->remain_desc;
        } else {
                /*
                 * Get residual bytes when current
@@ -893,12 +895,11 @@ atc_dma_cyclic_fill_desc(struct dma_chan *chan, struct at_desc *desc,
  * @period_len: number of bytes for each period
  * @direction: transfer direction, to or from device
  * @flags: tx descriptor status flags
- * @context: transfer context (ignored)
  */
 static struct dma_async_tx_descriptor *
 atc_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
                size_t period_len, enum dma_transfer_direction direction,
-               unsigned long flags, void *context)
+               unsigned long flags)
 {
        struct at_dma_chan      *atchan = to_at_dma_chan(chan);
        struct at_dma_slave     *atslave = chan->private;
index a03602164e3e1aa5912f3ba1d74095fb5dbe9f74..68007974961a5ee63087c07cf7833404b09b7d49 100644 (file)
@@ -335,7 +335,7 @@ static void bcm2835_dma_issue_pending(struct dma_chan *chan)
 static struct dma_async_tx_descriptor *bcm2835_dma_prep_dma_cyclic(
        struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
        size_t period_len, enum dma_transfer_direction direction,
-       unsigned long flags, void *context)
+       unsigned long flags)
 {
        struct bcm2835_chan *c = to_bcm2835_dma_chan(chan);
        enum dma_slave_buswidth dev_width;
index 94c380f0753860c4c4002c19f4217332fa2460f0..6a9d89c93b1fdd0f7dfb4f8a68dee807efc5f29a 100644 (file)
@@ -433,7 +433,7 @@ static struct dma_async_tx_descriptor *jz4740_dma_prep_slave_sg(
 static struct dma_async_tx_descriptor *jz4740_dma_prep_dma_cyclic(
        struct dma_chan *c, dma_addr_t buf_addr, size_t buf_len,
        size_t period_len, enum dma_transfer_direction direction,
-       unsigned long flags, void *context)
+       unsigned long flags)
 {
        struct jz4740_dmaengine_chan *chan = to_jz4740_dma_chan(c);
        struct jz4740_dma_desc *desc;
@@ -614,4 +614,4 @@ module_platform_driver(jz4740_dma_driver);
 
 MODULE_AUTHOR("Lars-Peter Clausen <lars@metafoo.de>");
 MODULE_DESCRIPTION("JZ4740 DMA driver");
-MODULE_LICENSE("GPLv2");
+MODULE_LICENSE("GPL v2");
index a27ded53ab4f0e5eb2fa312e891da3adf8a49ef9..1af731b83b3f56f284fd5655f85d90cda09bf2ca 100644 (file)
@@ -279,6 +279,19 @@ static void dwc_dostart(struct dw_dma_chan *dwc, struct dw_desc *first)
        channel_set_bit(dw, CH_EN, dwc->mask);
 }
 
+static void dwc_dostart_first_queued(struct dw_dma_chan *dwc)
+{
+       struct dw_desc *desc;
+
+       if (list_empty(&dwc->queue))
+               return;
+
+       list_move(dwc->queue.next, &dwc->active_list);
+       desc = dwc_first_active(dwc);
+       dev_vdbg(chan2dev(&dwc->chan), "%s: started %u\n", __func__,
+                desc->txd.cookie);
+       dwc_dostart(dwc, desc);
+}
+
 /*----------------------------------------------------------------------*/
 
 static void
@@ -335,10 +348,7 @@ static void dwc_complete_all(struct dw_dma *dw, struct dw_dma_chan *dwc)
         * the completed ones.
         */
        list_splice_init(&dwc->active_list, &list);
-       if (!list_empty(&dwc->queue)) {
-               list_move(dwc->queue.next, &dwc->active_list);
-               dwc_dostart(dwc, dwc_first_active(dwc));
-       }
+       dwc_dostart_first_queued(dwc);
 
        spin_unlock_irqrestore(&dwc->lock, flags);
 
@@ -467,10 +477,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc)
        /* Try to continue after resetting the channel... */
        dwc_chan_disable(dw, dwc);
 
-       if (!list_empty(&dwc->queue)) {
-               list_move(dwc->queue.next, &dwc->active_list);
-               dwc_dostart(dwc, dwc_first_active(dwc));
-       }
+       dwc_dostart_first_queued(dwc);
        spin_unlock_irqrestore(&dwc->lock, flags);
 }
 
@@ -677,17 +684,9 @@ static dma_cookie_t dwc_tx_submit(struct dma_async_tx_descriptor *tx)
         * possible, perhaps even appending to those already submitted
         * for DMA. But this is hard to do in a race-free manner.
         */
-       if (list_empty(&dwc->active_list)) {
-               dev_vdbg(chan2dev(tx->chan), "%s: started %u\n", __func__,
-                               desc->txd.cookie);
-               list_add_tail(&desc->desc_node, &dwc->active_list);
-               dwc_dostart(dwc, dwc_first_active(dwc));
-       } else {
-               dev_vdbg(chan2dev(tx->chan), "%s: queued %u\n", __func__,
-                               desc->txd.cookie);
 
-               list_add_tail(&desc->desc_node, &dwc->queue);
-       }
+       dev_vdbg(chan2dev(tx->chan), "%s: queued %u\n", __func__,
+                desc->txd.cookie);
+       list_add_tail(&desc->desc_node, &dwc->queue);
 
        spin_unlock_irqrestore(&dwc->lock, flags);
 
@@ -1092,9 +1091,12 @@ dwc_tx_status(struct dma_chan *chan,
 static void dwc_issue_pending(struct dma_chan *chan)
 {
        struct dw_dma_chan      *dwc = to_dw_dma_chan(chan);
+       unsigned long           flags;
 
-       if (!list_empty(&dwc->queue))
-               dwc_scan_descriptors(to_dw_dma(chan->device), dwc);
+       spin_lock_irqsave(&dwc->lock, flags);
+       if (list_empty(&dwc->active_list))
+               dwc_dostart_first_queued(dwc);
+       spin_unlock_irqrestore(&dwc->lock, flags);
 }
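
After this change tx_submit() only queues, and issue_pending() starts the head of the queue when the channel is idle. A sketch of the client-side sequence that exercises the split (standard dmaengine API; the wrapper is illustrative only):

static void dw_example_kick(struct dma_chan *chan,
			    struct dma_async_tx_descriptor *desc)
{
	dma_cookie_t cookie = dmaengine_submit(desc);	/* lands on dwc->queue */

	if (dma_submit_error(cookie))
		return;
	dma_async_issue_pending(chan);	/* kicks dwc_dostart_first_queued() */
}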
 
 static int dwc_alloc_chan_resources(struct dma_chan *chan)
index b512caf46944ba594a7d612446e9b5af0a82adc6..7b65633f495ed28ca77842231ea65a01c8870f4c 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/of.h>
 
 #include <linux/platform_data/edma.h>
 
@@ -256,8 +257,13 @@ static int edma_terminate_all(struct edma_chan *echan)
         * echan->edesc is NULL and exit.)
         */
        if (echan->edesc) {
+               int cyclic = echan->edesc->cyclic;
                echan->edesc = NULL;
                edma_stop(echan->ch_num);
+               /* Move the cyclic channel back to default queue */
+               if (cyclic)
+                       edma_assign_channel_eventq(echan->ch_num,
+                                                  EVENTQ_DEFAULT);
        }
 
        vchan_get_all_descriptors(&echan->vchan, &head);
@@ -592,7 +598,7 @@ struct dma_async_tx_descriptor *edma_prep_dma_memcpy(
 static struct dma_async_tx_descriptor *edma_prep_dma_cyclic(
        struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
        size_t period_len, enum dma_transfer_direction direction,
-       unsigned long tx_flags, void *context)
+       unsigned long tx_flags)
 {
        struct edma_chan *echan = to_edma_chan(chan);
        struct device *dev = chan->device->dev;
@@ -718,12 +724,15 @@ static struct dma_async_tx_descriptor *edma_prep_dma_cyclic(
                edesc->absync = ret;
 
                /*
-                * Enable interrupts for every period because callback
-                * has to be called for every period.
+                * Enable the period interrupt only if the client requested it
                 */
-               edesc->pset[i].param.opt |= TCINTEN;
+               if (tx_flags & DMA_PREP_INTERRUPT)
+                       edesc->pset[i].param.opt |= TCINTEN;
        }
 
+       /* Place the cyclic channel to highest priority queue */
+       edma_assign_channel_eventq(echan->ch_num, EVENTQ_0);
+
        return vchan_tx_prep(&echan->vchan, &edesc->vdesc, tx_flags);
 }
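
Since TCINTEN is now set only when DMA_PREP_INTERRUPT is passed, clients must request per-period callbacks explicitly. A minimal sketch (standard dmaengine API; the wrapper name is hypothetical):

static struct dma_async_tx_descriptor *
edma_example_cyclic(struct dma_chan *chan, dma_addr_t buf,
		    size_t buf_len, size_t period_len)
{
	/* Without DMA_PREP_INTERRUPT the transfer now runs callback-free */
	return dmaengine_prep_dma_cyclic(chan, buf, buf_len, period_len,
					 DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT);
}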
 
@@ -993,7 +1002,7 @@ static int edma_dma_device_slave_caps(struct dma_chan *dchan,
        caps->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV);
        caps->cmd_pause = true;
        caps->cmd_terminate = true;
-       caps->residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR;
+       caps->residue_granularity = DMA_RESIDUE_GRANULARITY_BURST;
 
        return 0;
 }
@@ -1040,7 +1049,7 @@ static int edma_probe(struct platform_device *pdev)
        ecc->dummy_slot = edma_alloc_slot(ecc->ctlr, EDMA_SLOT_ANY);
        if (ecc->dummy_slot < 0) {
                dev_err(&pdev->dev, "Can't allocate PaRAM dummy slot\n");
-               return -EIO;
+               return ecc->dummy_slot;
        }
 
        dma_cap_zero(ecc->dma_slave.cap_mask);
@@ -1125,7 +1134,7 @@ static int edma_init(void)
                }
        }
 
-       if (EDMA_CTLRS == 2) {
+       if (!of_have_populated_dt() && EDMA_CTLRS == 2) {
                pdev1 = platform_device_register_full(&edma_dev_info1);
                if (IS_ERR(pdev1)) {
                        platform_driver_unregister(&edma_driver);
index cb4bf682a70863e6253396eb717cc7e9ae86e453..7650470196c46c6b0d26d26184d613d11f040712 100644 (file)
@@ -1092,7 +1092,6 @@ fail:
  * @period_len: length of a single period
  * @dir: direction of the operation
  * @flags: tx descriptor status flags
- * @context: operation context (ignored)
  *
  * Prepares a descriptor for cyclic DMA operation. This means that once the
  * descriptor is submitted, we will be submitting in a @period_len sized
@@ -1105,8 +1104,7 @@ fail:
 static struct dma_async_tx_descriptor *
 ep93xx_dma_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t dma_addr,
                           size_t buf_len, size_t period_len,
-                          enum dma_transfer_direction dir, unsigned long flags,
-                          void *context)
+                          enum dma_transfer_direction dir, unsigned long flags)
 {
        struct ep93xx_dma_chan *edmac = to_ep93xx_dma_chan(chan);
        struct ep93xx_dma_desc *desc, *first;
index b396a7fb53abb5df611e4fbd551000758f876100..3c5711d5fe97c99a71c1504cd6d7e174ea27bf32 100644 (file)
@@ -248,11 +248,12 @@ static void fsl_edma_chan_mux(struct fsl_edma_chan *fsl_chan,
                        unsigned int slot, bool enable)
 {
        u32 ch = fsl_chan->vchan.chan.chan_id;
-       void __iomem *muxaddr = fsl_chan->edma->muxbase[ch / DMAMUX_NR];
+       void __iomem *muxaddr;
        unsigned chans_per_mux, ch_off;
 
        chans_per_mux = fsl_chan->edma->n_chans / DMAMUX_NR;
        ch_off = fsl_chan->vchan.chan.chan_id % chans_per_mux;
+       muxaddr = fsl_chan->edma->muxbase[ch / chans_per_mux];
 
        if (enable)
                edma_writeb(fsl_chan->edma,
@@ -516,7 +517,7 @@ err:
 static struct dma_async_tx_descriptor *fsl_edma_prep_dma_cyclic(
                struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len,
                size_t period_len, enum dma_transfer_direction direction,
-               unsigned long flags, void *context)
+               unsigned long flags)
 {
        struct fsl_edma_chan *fsl_chan = to_fsl_edma_chan(chan);
        struct fsl_edma_desc *fsl_desc;
@@ -724,6 +725,7 @@ static struct dma_chan *fsl_edma_xlate(struct of_phandle_args *dma_spec,
 {
        struct fsl_edma_engine *fsl_edma = ofdma->of_dma_data;
        struct dma_chan *chan, *_chan;
+       unsigned long chans_per_mux = fsl_edma->n_chans / DMAMUX_NR;
 
        if (dma_spec->args_count != 2)
                return NULL;
@@ -732,7 +734,7 @@ static struct dma_chan *fsl_edma_xlate(struct of_phandle_args *dma_spec,
        list_for_each_entry_safe(chan, _chan, &fsl_edma->dma_dev.channels, device_node) {
                if (chan->client_count)
                        continue;
-               if ((chan->chan_id / DMAMUX_NR) == dma_spec->args[0]) {
+               if ((chan->chan_id / chans_per_mux) == dma_spec->args[0]) {
                        chan = dma_get_slave_channel(chan);
                        if (chan) {
                                chan->device->privatecnt++;
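
Both hunks replace a divide by DMAMUX_NR with a divide by channels-per-mux. The corrected arithmetic, spelled out (hypothetical helper, same math as the patch):

/* With n_chans spread evenly over DMAMUX_NR muxes, channel ch sits on
 * mux ch / (n_chans / DMAMUX_NR), at offset ch % (n_chans / DMAMUX_NR). */
static unsigned int fsl_edma_example_mux(unsigned int ch, unsigned int n_chans)
{
	return ch / (n_chans / DMAMUX_NR);
}
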
index e0fec68aed2511220206960a1b0930cf65e696b8..d5d6885ab341bc24119e056733be1443ba91a614 100644 (file)
@@ -396,10 +396,17 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
        struct fsldma_chan *chan = to_fsl_chan(tx->chan);
        struct fsl_desc_sw *desc = tx_to_fsl_desc(tx);
        struct fsl_desc_sw *child;
-       unsigned long flags;
        dma_cookie_t cookie = -EINVAL;
 
-       spin_lock_irqsave(&chan->desc_lock, flags);
+       spin_lock_bh(&chan->desc_lock);
+
+#ifdef CONFIG_PM
+       if (unlikely(chan->pm_state != RUNNING)) {
+               chan_dbg(chan, "cannot submit due to suspend\n");
+               spin_unlock_bh(&chan->desc_lock);
+               return -1;
+       }
+#endif
 
        /*
         * assign cookies to all of the software descriptors
@@ -412,7 +419,7 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
        /* put this transaction onto the tail of the pending queue */
        append_ld_queue(chan, desc);
 
-       spin_unlock_irqrestore(&chan->desc_lock, flags);
+       spin_unlock_bh(&chan->desc_lock);
 
        return cookie;
 }
@@ -458,6 +465,88 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor(struct fsldma_chan *chan)
        return desc;
 }
 
+/**
+ * fsldma_clean_completed_descriptor - free all descriptors which
+ * have been completed and acked
+ * @chan: Freescale DMA channel
+ *
+ * This function is used on all completed and acked descriptors.
+ * All descriptors should only be freed in this function.
+ */
+static void fsldma_clean_completed_descriptor(struct fsldma_chan *chan)
+{
+       struct fsl_desc_sw *desc, *_desc;
+
+       /* Run the callback for each descriptor, in order */
+       list_for_each_entry_safe(desc, _desc, &chan->ld_completed, node)
+               if (async_tx_test_ack(&desc->async_tx))
+                       fsl_dma_free_descriptor(chan, desc);
+}
+
+/**
+ * fsldma_run_tx_complete_actions - cleanup a single link descriptor
+ * @chan: Freescale DMA channel
+ * @desc: descriptor to cleanup and free
+ * @cookie: Freescale DMA transaction identifier
+ *
+ * This function is used on a descriptor which has been executed by the DMA
+ * controller. It will run any callbacks, submit any dependencies.
+ */
+static dma_cookie_t fsldma_run_tx_complete_actions(struct fsldma_chan *chan,
+               struct fsl_desc_sw *desc, dma_cookie_t cookie)
+{
+       struct dma_async_tx_descriptor *txd = &desc->async_tx;
+       dma_cookie_t ret = cookie;
+
+       BUG_ON(txd->cookie < 0);
+
+       if (txd->cookie > 0) {
+               ret = txd->cookie;
+
+               /* Run the link descriptor callback function */
+               if (txd->callback) {
+                       chan_dbg(chan, "LD %p callback\n", desc);
+                       txd->callback(txd->callback_param);
+               }
+       }
+
+       /* Run any dependencies */
+       dma_run_dependencies(txd);
+
+       return ret;
+}
+
+/**
+ * fsldma_clean_running_descriptor - move the completed descriptor from
+ * ld_running to ld_completed
+ * @chan: Freescale DMA channel
+ * @desc: the descriptor which is completed
+ *
+ * Free the descriptor directly if it has already been acked by the
+ * async_tx API, otherwise move it to the ld_completed queue.
+ */
+static void fsldma_clean_running_descriptor(struct fsldma_chan *chan,
+               struct fsl_desc_sw *desc)
+{
+       /* Remove from the list of transactions */
+       list_del(&desc->node);
+
+       /*
+        * the client is allowed to attach dependent operations
+        * until 'ack' is set
+        */
+       if (!async_tx_test_ack(&desc->async_tx)) {
+               /*
+                * Move this descriptor to the list of descriptors which is
+                * completed, but still awaiting the 'ack' bit to be set.
+                */
+               list_add_tail(&desc->node, &chan->ld_completed);
+               return;
+       }
+
+       dma_pool_free(chan->desc_pool, desc, desc->async_tx.phys);
+}
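
The helpers above implement a small descriptor lifecycle. A sketch of the state machine (the list names are the patch's; the predicate is hypothetical):

/*
 *   ld_pending ---(fsl_chan_xfer_ld_queue)---> ld_running
 *   ld_running ---(fsldma_cleanup_descriptors)---> ld_completed
 *   ld_completed ---(client ACK)---> dma_pool_free()
 */
static bool fsldma_example_can_free(struct fsl_desc_sw *desc)
{
	return async_tx_test_ack(&desc->async_tx);
}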
+
 /**
  * fsl_chan_xfer_ld_queue - transfer any pending transactions
  * @chan : Freescale DMA channel
@@ -526,31 +615,58 @@ static void fsl_chan_xfer_ld_queue(struct fsldma_chan *chan)
 }
 
 /**
- * fsldma_cleanup_descriptor - cleanup and free a single link descriptor
+ * fsldma_cleanup_descriptors - clean up link descriptors which have completed
+ * and move them to ld_completed, where they wait until the 'ack' flag is set
  * @chan: Freescale DMA channel
- * @desc: descriptor to cleanup and free
  *
- * This function is used on a descriptor which has been executed by the DMA
- * controller. It will run any callbacks, submit any dependencies, and then
- * free the descriptor.
+ * This function is used on descriptors which have been executed by the DMA
+ * controller.  It runs any callbacks, submits any dependencies, and then
+ * frees those descriptors whose 'ack' flag is already set.
  */
-static void fsldma_cleanup_descriptor(struct fsldma_chan *chan,
-                                     struct fsl_desc_sw *desc)
+static void fsldma_cleanup_descriptors(struct fsldma_chan *chan)
 {
-       struct dma_async_tx_descriptor *txd = &desc->async_tx;
+       struct fsl_desc_sw *desc, *_desc;
+       dma_cookie_t cookie = 0;
+       dma_addr_t curr_phys = get_cdar(chan);
+       int seen_current = 0;
+
+       fsldma_clean_completed_descriptor(chan);
+
+       /* Run the callback for each descriptor, in order */
+       list_for_each_entry_safe(desc, _desc, &chan->ld_running, node) {
+               /*
+                * Do not advance past the current descriptor loaded into the
+                * hardware channel; subsequent descriptors are either in
+                * progress or have not yet been submitted.
+                */
+               if (seen_current)
+                       break;
+
+               /*
+                * stop the search if we reach the current descriptor and the
+                * channel is busy
+                */
+               if (desc->async_tx.phys == curr_phys) {
+                       seen_current = 1;
+                       if (!dma_is_idle(chan))
+                               break;
+               }
+
+               cookie = fsldma_run_tx_complete_actions(chan, desc, cookie);
 
-       /* Run the link descriptor callback function */
-       if (txd->callback) {
-               chan_dbg(chan, "LD %p callback\n", desc);
-               txd->callback(txd->callback_param);
+               fsldma_clean_running_descriptor(chan, desc);
        }
 
-       /* Run any dependencies */
-       dma_run_dependencies(txd);
+       /*
+        * Start any pending transactions automatically
+        *
+        * In the ideal case, we keep the DMA controller busy while we go
+        * ahead and free the descriptors below.
+        */
+       fsl_chan_xfer_ld_queue(chan);
 
-       dma_descriptor_unmap(txd);
-       chan_dbg(chan, "LD %p free\n", desc);
-       dma_pool_free(chan->desc_pool, desc, txd->phys);
+       if (cookie > 0)
+               chan->common.completed_cookie = cookie;
 }
 
 /**
@@ -617,13 +733,14 @@ static void fsldma_free_desc_list_reverse(struct fsldma_chan *chan,
 static void fsl_dma_free_chan_resources(struct dma_chan *dchan)
 {
        struct fsldma_chan *chan = to_fsl_chan(dchan);
-       unsigned long flags;
 
        chan_dbg(chan, "free all channel resources\n");
-       spin_lock_irqsave(&chan->desc_lock, flags);
+       spin_lock_bh(&chan->desc_lock);
+       fsldma_cleanup_descriptors(chan);
        fsldma_free_desc_list(chan, &chan->ld_pending);
        fsldma_free_desc_list(chan, &chan->ld_running);
-       spin_unlock_irqrestore(&chan->desc_lock, flags);
+       fsldma_free_desc_list(chan, &chan->ld_completed);
+       spin_unlock_bh(&chan->desc_lock);
 
        dma_pool_destroy(chan->desc_pool);
        chan->desc_pool = NULL;
@@ -842,7 +959,6 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
 {
        struct dma_slave_config *config;
        struct fsldma_chan *chan;
-       unsigned long flags;
        int size;
 
        if (!dchan)
@@ -852,7 +968,7 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
 
        switch (cmd) {
        case DMA_TERMINATE_ALL:
-               spin_lock_irqsave(&chan->desc_lock, flags);
+               spin_lock_bh(&chan->desc_lock);
 
                /* Halt the DMA engine */
                dma_halt(chan);
@@ -860,9 +976,10 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
                /* Remove and free all of the descriptors in the LD queue */
                fsldma_free_desc_list(chan, &chan->ld_pending);
                fsldma_free_desc_list(chan, &chan->ld_running);
+               fsldma_free_desc_list(chan, &chan->ld_completed);
                chan->idle = true;
 
-               spin_unlock_irqrestore(&chan->desc_lock, flags);
+               spin_unlock_bh(&chan->desc_lock);
                return 0;
 
        case DMA_SLAVE_CONFIG:
@@ -904,11 +1021,10 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
 static void fsl_dma_memcpy_issue_pending(struct dma_chan *dchan)
 {
        struct fsldma_chan *chan = to_fsl_chan(dchan);
-       unsigned long flags;
 
-       spin_lock_irqsave(&chan->desc_lock, flags);
+       spin_lock_bh(&chan->desc_lock);
        fsl_chan_xfer_ld_queue(chan);
-       spin_unlock_irqrestore(&chan->desc_lock, flags);
+       spin_unlock_bh(&chan->desc_lock);
 }
 
 /**
@@ -919,6 +1035,17 @@ static enum dma_status fsl_tx_status(struct dma_chan *dchan,
                                        dma_cookie_t cookie,
                                        struct dma_tx_state *txstate)
 {
+       struct fsldma_chan *chan = to_fsl_chan(dchan);
+       enum dma_status ret;
+
+       ret = dma_cookie_status(dchan, cookie, txstate);
+       if (ret == DMA_COMPLETE)
+               return ret;
+
+       spin_lock_bh(&chan->desc_lock);
+       fsldma_cleanup_descriptors(chan);
+       spin_unlock_bh(&chan->desc_lock);
+
        return dma_cookie_status(dchan, cookie, txstate);
 }
 
@@ -996,52 +1123,18 @@ static irqreturn_t fsldma_chan_irq(int irq, void *data)
 static void dma_do_tasklet(unsigned long data)
 {
        struct fsldma_chan *chan = (struct fsldma_chan *)data;
-       struct fsl_desc_sw *desc, *_desc;
-       LIST_HEAD(ld_cleanup);
-       unsigned long flags;
 
        chan_dbg(chan, "tasklet entry\n");
 
-       spin_lock_irqsave(&chan->desc_lock, flags);
-
-       /* update the cookie if we have some descriptors to cleanup */
-       if (!list_empty(&chan->ld_running)) {
-               dma_cookie_t cookie;
-
-               desc = to_fsl_desc(chan->ld_running.prev);
-               cookie = desc->async_tx.cookie;
-               dma_cookie_complete(&desc->async_tx);
-
-               chan_dbg(chan, "completed_cookie=%d\n", cookie);
-       }
-
-       /*
-        * move the descriptors to a temporary list so we can drop the lock
-        * during the entire cleanup operation
-        */
-       list_splice_tail_init(&chan->ld_running, &ld_cleanup);
+       spin_lock_bh(&chan->desc_lock);
 
        /* the hardware is now idle and ready for more */
        chan->idle = true;
 
-       /*
-        * Start any pending transactions automatically
-        *
-        * In the ideal case, we keep the DMA controller busy while we go
-        * ahead and free the descriptors below.
-        */
-       fsl_chan_xfer_ld_queue(chan);
-       spin_unlock_irqrestore(&chan->desc_lock, flags);
-
-       /* Run the callback for each descriptor, in order */
-       list_for_each_entry_safe(desc, _desc, &ld_cleanup, node) {
-
-               /* Remove from the list of transactions */
-               list_del(&desc->node);
+       /* Run all cleanup for descriptors which have been completed */
+       fsldma_cleanup_descriptors(chan);
 
-               /* Run all cleanup for this descriptor */
-               fsldma_cleanup_descriptor(chan, desc);
-       }
+       spin_unlock_bh(&chan->desc_lock);
 
        chan_dbg(chan, "tasklet exit\n");
 }
@@ -1225,7 +1318,11 @@ static int fsl_dma_chan_probe(struct fsldma_device *fdev,
        spin_lock_init(&chan->desc_lock);
        INIT_LIST_HEAD(&chan->ld_pending);
        INIT_LIST_HEAD(&chan->ld_running);
+       INIT_LIST_HEAD(&chan->ld_completed);
        chan->idle = true;
+#ifdef CONFIG_PM
+       chan->pm_state = RUNNING;
+#endif
 
        chan->common.device = &fdev->common;
        dma_cookie_init(&chan->common);
@@ -1365,6 +1462,69 @@ static int fsldma_of_remove(struct platform_device *op)
        return 0;
 }
 
+#ifdef CONFIG_PM
+static int fsldma_suspend_late(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct fsldma_device *fdev = platform_get_drvdata(pdev);
+       struct fsldma_chan *chan;
+       int i;
+
+       for (i = 0; i < FSL_DMA_MAX_CHANS_PER_DEVICE; i++) {
+               chan = fdev->chan[i];
+               if (!chan)
+                       continue;
+
+               spin_lock_bh(&chan->desc_lock);
+               if (unlikely(!chan->idle))
+                       goto out;
+               chan->regs_save.mr = get_mr(chan);
+               chan->pm_state = SUSPENDED;
+               spin_unlock_bh(&chan->desc_lock);
+       }
+       return 0;
+
+out:
+       /* Only the busy channel's lock is still held at this point */
+       chan->pm_state = RUNNING;
+       spin_unlock_bh(&chan->desc_lock);
+
+       while (--i >= 0) {
+               chan = fdev->chan[i];
+               if (!chan)
+                       continue;
+               spin_lock_bh(&chan->desc_lock);
+               chan->pm_state = RUNNING;
+               spin_unlock_bh(&chan->desc_lock);
+       }
+       return -EBUSY;
+}
+
+static int fsldma_resume_early(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct fsldma_device *fdev = platform_get_drvdata(pdev);
+       struct fsldma_chan *chan;
+       u32 mode;
+       int i;
+
+       for (i = 0; i < FSL_DMA_MAX_CHANS_PER_DEVICE; i++) {
+               chan = fdev->chan[i];
+               if (!chan)
+                       continue;
+
+               spin_lock_bh(&chan->desc_lock);
+               mode = chan->regs_save.mr
+                       & ~FSL_DMA_MR_CS & ~FSL_DMA_MR_CC & ~FSL_DMA_MR_CA;
+               set_mr(chan, mode);
+               chan->pm_state = RUNNING;
+               spin_unlock_bh(&chan->desc_lock);
+       }
+
+       return 0;
+}
+
+static const struct dev_pm_ops fsldma_pm_ops = {
+       .suspend_late   = fsldma_suspend_late,
+       .resume_early   = fsldma_resume_early,
+};
+#endif
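
The RUNNING/SUSPENDED gate that fsl_dma_tx_submit() checks can be read as one predicate. A hedged sketch (hypothetical helper, mirroring the #ifdef above):

static inline bool fsldma_example_can_submit(struct fsldma_chan *chan)
{
#ifdef CONFIG_PM
	return chan->pm_state == RUNNING;
#else
	return true;
#endif
}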
+
 static const struct of_device_id fsldma_of_ids[] = {
        { .compatible = "fsl,elo3-dma", },
        { .compatible = "fsl,eloplus-dma", },
@@ -1377,6 +1537,9 @@ static struct platform_driver fsldma_of_driver = {
                .name = "fsl-elo-dma",
                .owner = THIS_MODULE,
                .of_match_table = fsldma_of_ids,
+#ifdef CONFIG_PM
+               .pm = &fsldma_pm_ops,
+#endif
        },
        .probe = fsldma_of_probe,
        .remove = fsldma_of_remove,
index d56e83599825b16666960f78b91ea08620be4cef..239c20c84382ce4c724858f5af5ed5c92a5c1416 100644 (file)
@@ -134,12 +134,36 @@ struct fsldma_device {
 #define FSL_DMA_CHAN_PAUSE_EXT 0x00001000
 #define FSL_DMA_CHAN_START_EXT 0x00002000
 
+#ifdef CONFIG_PM
+struct fsldma_chan_regs_save {
+       u32 mr;
+};
+
+enum fsldma_pm_state {
+       RUNNING = 0,
+       SUSPENDED,
+};
+#endif
+
 struct fsldma_chan {
        char name[8];                   /* Channel name */
        struct fsldma_chan_regs __iomem *regs;
        spinlock_t desc_lock;           /* Descriptor operation lock */
-       struct list_head ld_pending;    /* Link descriptors queue */
-       struct list_head ld_running;    /* Link descriptors queue */
+       /*
+        * Descriptors which are queued to run, but have not yet been
+        * submitted to the hardware for execution
+        */
+       struct list_head ld_pending;
+       /*
+        * Descriptors which are currently being executed by the hardware
+        */
+       struct list_head ld_running;
+       /*
+        * Descriptors which have finished execution by the hardware. These
+        * descriptors have already had their cleanup actions run. They are
+        * waiting for the ACK bit to be set by the async_tx API.
+        */
+       struct list_head ld_completed;
        struct dma_chan common;         /* DMA common channel */
        struct dma_pool *desc_pool;     /* Descriptors pool */
        struct device *dev;             /* Channel device */
@@ -148,6 +172,10 @@ struct fsldma_chan {
        struct tasklet_struct tasklet;
        u32 feature;
        bool idle;                      /* DMA controller is idle */
+#ifdef CONFIG_PM
+       struct fsldma_chan_regs_save regs_save;
+       enum fsldma_pm_state pm_state;
+#endif
 
        void (*toggle_ext_pause)(struct fsldma_chan *fsl_chan, int enable);
        void (*toggle_ext_start)(struct fsldma_chan *fsl_chan, int enable);
index 286660a12cc695b699b1e319968f89e148431dad..9d2c9e7374dcef910d1b444a61b43ce560520dbc 100644 (file)
@@ -866,7 +866,7 @@ static struct dma_async_tx_descriptor *imxdma_prep_slave_sg(
 static struct dma_async_tx_descriptor *imxdma_prep_dma_cyclic(
                struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len,
                size_t period_len, enum dma_transfer_direction direction,
-               unsigned long flags, void *context)
+               unsigned long flags)
 {
        struct imxdma_channel *imxdmac = to_imxdma_chan(chan);
        struct imxdma_engine *imxdma = imxdmac->imxdma;
index 14867e3ac8ffadc4f2c803d1c927fd33dc46d189..f7626e37d0b824ec077f75e0a2d323060c38052b 100644 (file)
@@ -271,6 +271,7 @@ struct sdma_channel {
        unsigned int                    chn_count;
        unsigned int                    chn_real_count;
        struct tasklet_struct           tasklet;
+       struct imx_dma_data             data;
 };
 
 #define IMX_DMA_SG_LOOP                BIT(0)
@@ -749,6 +750,11 @@ static void sdma_get_pc(struct sdma_channel *sdmac,
                emi_2_per = sdma->script_addrs->asrc_2_mcu_addr;
                per_2_per = sdma->script_addrs->per_2_per_addr;
                break;
+       case IMX_DMATYPE_ASRC_SP:
+               per_2_emi = sdma->script_addrs->shp_2_mcu_addr;
+               emi_2_per = sdma->script_addrs->mcu_2_shp_addr;
+               per_2_per = sdma->script_addrs->per_2_per_addr;
+               break;
        case IMX_DMATYPE_MSHC:
                per_2_emi = sdma->script_addrs->mshc_2_mcu_addr;
                emi_2_per = sdma->script_addrs->mcu_2_mshc_addr;
@@ -911,14 +917,13 @@ static int sdma_request_channel(struct sdma_channel *sdmac)
        int channel = sdmac->channel;
        int ret = -EBUSY;
 
-       sdmac->bd = dma_alloc_coherent(NULL, PAGE_SIZE, &sdmac->bd_phys, GFP_KERNEL);
+       sdmac->bd = dma_zalloc_coherent(NULL, PAGE_SIZE, &sdmac->bd_phys,
+                                       GFP_KERNEL);
        if (!sdmac->bd) {
                ret = -ENOMEM;
                goto out;
        }
 
-       memset(sdmac->bd, 0, PAGE_SIZE);
-
        sdma->channel_control[channel].base_bd_ptr = sdmac->bd_phys;
        sdma->channel_control[channel].current_bd_ptr = sdmac->bd_phys;
 
@@ -1120,7 +1125,7 @@ err_out:
 static struct dma_async_tx_descriptor *sdma_prep_dma_cyclic(
                struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len,
                size_t period_len, enum dma_transfer_direction direction,
-               unsigned long flags, void *context)
+               unsigned long flags)
 {
        struct sdma_channel *sdmac = to_sdma_chan(chan);
        struct sdma_engine *sdma = sdmac->sdma;
@@ -1414,12 +1419,14 @@ err_dma_alloc:
 
 static bool sdma_filter_fn(struct dma_chan *chan, void *fn_param)
 {
+       struct sdma_channel *sdmac = to_sdma_chan(chan);
        struct imx_dma_data *data = fn_param;
 
        if (!imx_dma_is_general_purpose(chan))
                return false;
 
-       chan->private = data;
+       sdmac->data = *data;
+       chan->private = &sdmac->data;
 
        return true;
 }
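
The copy matters because filter parameters usually live on the requesting driver's stack. A typical client sequence showing why (standard dmaengine API; the wrapper is illustrative):

static struct dma_chan *sdma_example_request(struct imx_dma_data *data)
{
	dma_cap_mask_t mask;

	dma_cap_zero(mask);
	dma_cap_set(DMA_SLAVE, mask);
	/* data may go out of scope right after this returns, hence the copy */
	return dma_request_channel(mask, sdma_filter_fn, data);
}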
index 128ca143486d1b59c0106cb8a177b5a2e1b803fd..bbf62927bd72f4caa20b6fb19f4bae64042864b1 100644 (file)
@@ -1532,11 +1532,17 @@ static int idmac_alloc_chan_resources(struct dma_chan *chan)
 #ifdef DEBUG
        if (chan->chan_id == IDMAC_IC_7) {
                ic_sof = ipu_irq_map(69);
-               if (ic_sof > 0)
-                       request_irq(ic_sof, ic_sof_irq, 0, "IC SOF", ichan);
+               if (ic_sof > 0) {
+                       ret = request_irq(ic_sof, ic_sof_irq, 0, "IC SOF", ichan);
+                       if (ret)
+                               dev_err(&chan->dev->device, "request irq failed for IC SOF\n");
+               }
                ic_eof = ipu_irq_map(70);
-               if (ic_eof > 0)
-                       request_irq(ic_eof, ic_eof_irq, 0, "IC EOF", ichan);
+               if (ic_eof > 0) {
+                       ret = request_irq(ic_eof, ic_eof_irq, 0, "IC EOF", ichan);
+                       if (ret)
+                               dev_err(&chan->dev->device, "request irq failed for IC EOF\n");
+               }
        }
 #endif
 
index a7b186d536b3ad2eede1ff4bd639206f6c249768..a1a4db5721b84cb905588cf9320cd8af27da0675 100644 (file)
@@ -601,7 +601,7 @@ static struct dma_async_tx_descriptor *
 mmp_pdma_prep_dma_cyclic(struct dma_chan *dchan,
                         dma_addr_t buf_addr, size_t len, size_t period_len,
                         enum dma_transfer_direction direction,
-                        unsigned long flags, void *context)
+                        unsigned long flags)
 {
        struct mmp_pdma_chan *chan;
        struct mmp_pdma_desc_sw *first = NULL, *prev = NULL, *new;
index 724f7f4c9720dba720691911a3eb96d68fbe76a0..6ad30e2c5038351ed80e6abc8b965c9b1efa38f5 100644 (file)
@@ -389,7 +389,7 @@ struct mmp_tdma_desc *mmp_tdma_alloc_descriptor(struct mmp_tdma_chan *tdmac)
 static struct dma_async_tx_descriptor *mmp_tdma_prep_dma_cyclic(
                struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len,
                size_t period_len, enum dma_transfer_direction direction,
-               unsigned long flags, void *context)
+               unsigned long flags)
 {
        struct mmp_tdma_chan *tdmac = to_mmp_tdma_chan(chan);
        struct mmp_tdma_desc *desc;
index 2ad43738ac8b71051f496327400fced644d5dbd3..881db2bcb48b89cb1b3896dc42da69c65d517347 100644 (file)
@@ -53,6 +53,7 @@
 #include <linux/of_address.h>
 #include <linux/of_device.h>
 #include <linux/of_irq.h>
+#include <linux/of_dma.h>
 #include <linux/of_platform.h>
 
 #include <linux/random.h>
@@ -1036,7 +1037,15 @@ static int mpc_dma_probe(struct platform_device *op)
        if (retval)
                goto err_free2;
 
-       return retval;
+       /* Register with OF helpers for DMA lookups (nonfatal) */
+       if (dev->of_node) {
+               retval = of_dma_controller_register(dev->of_node,
+                                               of_dma_xlate_by_chan_id, mdma);
+               if (retval)
+                       dev_warn(dev, "Could not register for OF lookup\n");
+       }
+
+       return 0;
 
 err_free2:
        if (mdma->is_mpc8308)
@@ -1057,6 +1066,8 @@ static int mpc_dma_remove(struct platform_device *op)
        struct device *dev = &op->dev;
        struct mpc_dma *mdma = dev_get_drvdata(dev);
 
+       if (dev->of_node)
+               of_dma_controller_free(dev->of_node);
        dma_async_device_unregister(&mdma->dma);
        if (mdma->is_mpc8308) {
                free_irq(mdma->irq2, mdma);
index ead491346da70183a3a235687de975f85f89e677..5ea61201dbf02c9aa69682d93c40263727ef699e 100644 (file)
@@ -413,16 +413,14 @@ static int mxs_dma_alloc_chan_resources(struct dma_chan *chan)
        struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma;
        int ret;
 
-       mxs_chan->ccw = dma_alloc_coherent(mxs_dma->dma_device.dev,
-                               CCW_BLOCK_SIZE, &mxs_chan->ccw_phys,
-                               GFP_KERNEL);
+       mxs_chan->ccw = dma_zalloc_coherent(mxs_dma->dma_device.dev,
+                                           CCW_BLOCK_SIZE,
+                                           &mxs_chan->ccw_phys, GFP_KERNEL);
        if (!mxs_chan->ccw) {
                ret = -ENOMEM;
                goto err_alloc;
        }
 
-       memset(mxs_chan->ccw, 0, CCW_BLOCK_SIZE);
-
        if (mxs_chan->chan_irq != NO_IRQ) {
                ret = request_irq(mxs_chan->chan_irq, mxs_dma_int_handler,
                                        0, "mxs-dma", mxs_dma);
@@ -591,7 +589,7 @@ err_out:
 static struct dma_async_tx_descriptor *mxs_dma_prep_dma_cyclic(
                struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len,
                size_t period_len, enum dma_transfer_direction direction,
-               unsigned long flags, void *context)
+               unsigned long flags)
 {
        struct mxs_dma_chan *mxs_chan = to_mxs_dma_chan(chan);
        struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma;
diff --git a/drivers/dma/nbpfaxi.c b/drivers/dma/nbpfaxi.c
new file mode 100644 (file)
index 0000000..5aeada5
--- /dev/null
@@ -0,0 +1,1517 @@
+/*
+ * Copyright (C) 2013-2014 Renesas Electronics Europe Ltd.
+ * Author: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/clk.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmaengine.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/log2.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_dma.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+#include <dt-bindings/dma/nbpfaxi.h>
+
+#include "dmaengine.h"
+
+#define NBPF_REG_CHAN_OFFSET   0
+#define NBPF_REG_CHAN_SIZE     0x40
+
+/* Channel Current Transaction Byte register */
+#define NBPF_CHAN_CUR_TR_BYTE  0x20
+
+/* Channel Status register */
+#define NBPF_CHAN_STAT 0x24
+#define NBPF_CHAN_STAT_EN      1
+#define NBPF_CHAN_STAT_TACT    4
+#define NBPF_CHAN_STAT_ERR     0x10
+#define NBPF_CHAN_STAT_END     0x20
+#define NBPF_CHAN_STAT_TC      0x40
+#define NBPF_CHAN_STAT_DER     0x400
+
+/* Channel Control register */
+#define NBPF_CHAN_CTRL 0x28
+#define NBPF_CHAN_CTRL_SETEN   1
+#define NBPF_CHAN_CTRL_CLREN   2
+#define NBPF_CHAN_CTRL_STG     4
+#define NBPF_CHAN_CTRL_SWRST   8
+#define NBPF_CHAN_CTRL_CLRRQ   0x10
+#define NBPF_CHAN_CTRL_CLREND  0x20
+#define NBPF_CHAN_CTRL_CLRTC   0x40
+#define NBPF_CHAN_CTRL_SETSUS  0x100
+#define NBPF_CHAN_CTRL_CLRSUS  0x200
+
+/* Channel Configuration register */
+#define NBPF_CHAN_CFG  0x2c
+#define NBPF_CHAN_CFG_SEL      7               /* terminal SELect: 0..7 */
+#define NBPF_CHAN_CFG_REQD     8               /* REQuest Direction: DMAREQ is 0: input, 1: output */
+#define NBPF_CHAN_CFG_LOEN     0x10            /* LOw ENable: low DMA request line is: 0: inactive, 1: active */
+#define NBPF_CHAN_CFG_HIEN     0x20            /* HIgh ENable: high DMA request line is: 0: inactive, 1: active */
+#define NBPF_CHAN_CFG_LVL      0x40            /* LeVeL: DMA request line is sensed as 0: edge, 1: level */
+#define NBPF_CHAN_CFG_AM       0x700           /* ACK Mode: 0: Pulse mode, 1: Level mode, b'1x: Bus Cycle */
+#define NBPF_CHAN_CFG_SDS      0xf000          /* Source Data Size: 0: 8 bits,... , 7: 1024 bits */
+#define NBPF_CHAN_CFG_DDS      0xf0000         /* Destination Data Size: as above */
+#define NBPF_CHAN_CFG_SAD      0x100000        /* Source ADdress counting: 0: increment, 1: fixed */
+#define NBPF_CHAN_CFG_DAD      0x200000        /* Destination ADdress counting: 0: increment, 1: fixed */
+#define NBPF_CHAN_CFG_TM       0x400000        /* Transfer Mode: 0: single, 1: block TM */
+#define NBPF_CHAN_CFG_DEM      0x1000000       /* DMAEND interrupt Mask */
+#define NBPF_CHAN_CFG_TCM      0x2000000       /* DMATCO interrupt Mask */
+#define NBPF_CHAN_CFG_SBE      0x8000000       /* Sweep Buffer Enable */
+#define NBPF_CHAN_CFG_RSEL     0x10000000      /* RM: Register Set sELect */
+#define NBPF_CHAN_CFG_RSW      0x20000000      /* RM: Register Select sWitch */
+#define NBPF_CHAN_CFG_REN      0x40000000      /* RM: Register Set Enable */
+#define NBPF_CHAN_CFG_DMS      0x80000000      /* 0: register mode (RM), 1: link mode (LM) */
+
+#define NBPF_CHAN_NXLA 0x38
+#define NBPF_CHAN_CRLA 0x3c
+
+/* Link Header field */
+#define NBPF_HEADER_LV 1
+#define NBPF_HEADER_LE 2
+#define NBPF_HEADER_WBD        4
+#define NBPF_HEADER_DIM        8
+
+#define NBPF_CTRL      0x300
+#define NBPF_CTRL_PR   1               /* 0: fixed priority, 1: round robin */
+#define NBPF_CTRL_LVINT        2               /* DMAEND and DMAERR signalling: 0: pulse, 1: level */
+
+#define NBPF_DSTAT_ER  0x314
+#define NBPF_DSTAT_END 0x318
+
+#define NBPF_DMA_BUSWIDTHS \
+       (BIT(DMA_SLAVE_BUSWIDTH_UNDEFINED) | \
+        BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) | \
+        BIT(DMA_SLAVE_BUSWIDTH_2_BYTES) | \
+        BIT(DMA_SLAVE_BUSWIDTH_4_BYTES) | \
+        BIT(DMA_SLAVE_BUSWIDTH_8_BYTES))
+
+struct nbpf_config {
+       int num_channels;
+       int buffer_size;
+};
+
+/*
+ * We've got 3 types of objects, used to describe DMA transfers:
+ * 1. high-level descriptor, containing a struct dma_async_tx_descriptor object
+ *     in it, used to communicate with the user
+ * 2. hardware DMA link descriptors, which we pass to the DMAC for DMA transfer
+ *     queuing - these must be DMAable, using either the streaming DMA API or
+ *     allocated from coherent memory - one per SG segment
+ * 3. one descriptor per SG segment, used to manage the HW link descriptors
+ *     from (2). These do not have to be DMAable. They can either be (a)
+ *     allocated together with link descriptors as mixed (DMA / CPU) objects,
+ *     or (b) separately. Even if allocated separately, it is best to link them
+ *     to the link descriptors once during channel resource allocation and to
+ *     always use them as a single object.
+ * Therefore for both cases (a) and (b) at run-time objects (2) and (3) shall
+ * be treated as a single SG segment descriptor.
+ */
+
+struct nbpf_link_reg {
+       u32     header;
+       u32     src_addr;
+       u32     dst_addr;
+       u32     transaction_size;
+       u32     config;
+       u32     interval;
+       u32     extension;
+       u32     next;
+} __packed;
+
+struct nbpf_device;
+struct nbpf_channel;
+struct nbpf_desc;
+
+struct nbpf_link_desc {
+       struct nbpf_link_reg *hwdesc;
+       dma_addr_t hwdesc_dma_addr;
+       struct nbpf_desc *desc;
+       struct list_head node;
+};
+
+/**
+ * struct nbpf_desc - DMA transfer descriptor
+ * @async_tx:  dmaengine object
+ * @user_wait: waiting for a user ack
+ * @length:    total transfer length
+ * @chan:      associated DMAC channel
+ * @sg:                list of hardware descriptors, represented by struct nbpf_link_desc
+ * @node:      member in channel descriptor lists
+ */
+struct nbpf_desc {
+       struct dma_async_tx_descriptor async_tx;
+       bool user_wait;
+       size_t length;
+       struct nbpf_channel *chan;
+       struct list_head sg;
+       struct list_head node;
+};
+
+/* Take a wild guess: allocate 4 segments per descriptor */
+#define NBPF_SEGMENTS_PER_DESC 4
+#define NBPF_DESCS_PER_PAGE ((PAGE_SIZE - sizeof(struct list_head)) /  \
+       (sizeof(struct nbpf_desc) +                                     \
+        NBPF_SEGMENTS_PER_DESC *                                       \
+        (sizeof(struct nbpf_link_desc) + sizeof(struct nbpf_link_reg))))
+#define NBPF_SEGMENTS_PER_PAGE (NBPF_SEGMENTS_PER_DESC * NBPF_DESCS_PER_PAGE)
+
+struct nbpf_desc_page {
+       struct list_head node;
+       struct nbpf_desc desc[NBPF_DESCS_PER_PAGE];
+       struct nbpf_link_desc ldesc[NBPF_SEGMENTS_PER_PAGE];
+       struct nbpf_link_reg hwdesc[NBPF_SEGMENTS_PER_PAGE];
+};
+
+/**
+ * struct nbpf_channel - one DMAC channel
+ * @dma_chan:  standard dmaengine channel object
+ * @tasklet:   bottom half to process completed descriptors and run callbacks
+ * @base:      register address base
+ * @nbpf:      DMAC
+ * @name:      IRQ name
+ * @irq:       IRQ number
+ * @slave_src_addr:    slave source address, for device-to-memory DMA
+ * @slave_src_width:   slave source data size in bytes
+ * @slave_src_burst:   maximum slave source burst size in bytes
+ * @slave_dst_addr:    slave destination address, for memory-to-device DMA
+ * @slave_dst_width:   slave destination data size in bytes
+ * @slave_dst_burst:   maximum slave destination burst size in bytes
+ * @terminal:  DMA terminal, assigned to this channel
+ * @dmarq_cfg: DMA request line configuration - high / low, edge / level for NBPF_CHAN_CFG
+ * @flags:     configuration flags from DT
+ * @lock:      protect descriptor lists
+ * @free_links:        list of free link descriptors
+ * @free:      list of free descriptors
+ * @queued:    list of queued descriptors
+ * @active:    list of descriptors, scheduled for processing
+ * @done:      list of completed descriptors, waiting post-processing
+ * @desc_page: list of additionally allocated descriptor pages - if any
+ * @running:   descriptor of the currently running transaction, if any
+ * @paused:    is the channel paused?
+ */
+struct nbpf_channel {
+       struct dma_chan dma_chan;
+       struct tasklet_struct tasklet;
+       void __iomem *base;
+       struct nbpf_device *nbpf;
+       char name[16];
+       int irq;
+       dma_addr_t slave_src_addr;
+       size_t slave_src_width;
+       size_t slave_src_burst;
+       dma_addr_t slave_dst_addr;
+       size_t slave_dst_width;
+       size_t slave_dst_burst;
+       unsigned int terminal;
+       u32 dmarq_cfg;
+       unsigned long flags;
+       spinlock_t lock;
+       struct list_head free_links;
+       struct list_head free;
+       struct list_head queued;
+       struct list_head active;
+       struct list_head done;
+       struct list_head desc_page;
+       struct nbpf_desc *running;
+       bool paused;
+};
+
+struct nbpf_device {
+       struct dma_device dma_dev;
+       void __iomem *base;
+       struct clk *clk;
+       const struct nbpf_config *config;
+       struct nbpf_channel chan[];
+};
+
+enum nbpf_model {
+       NBPF1B4,
+       NBPF1B8,
+       NBPF1B16,
+       NBPF4B4,
+       NBPF4B8,
+       NBPF4B16,
+       NBPF8B4,
+       NBPF8B8,
+       NBPF8B16,
+};
+
+static struct nbpf_config nbpf_cfg[] = {
+       [NBPF1B4] = {
+               .num_channels = 1,
+               .buffer_size = 4,
+       },
+       [NBPF1B8] = {
+               .num_channels = 1,
+               .buffer_size = 8,
+       },
+       [NBPF1B16] = {
+               .num_channels = 1,
+               .buffer_size = 16,
+       },
+       [NBPF4B4] = {
+               .num_channels = 4,
+               .buffer_size = 4,
+       },
+       [NBPF4B8] = {
+               .num_channels = 4,
+               .buffer_size = 8,
+       },
+       [NBPF4B16] = {
+               .num_channels = 4,
+               .buffer_size = 16,
+       },
+       [NBPF8B4] = {
+               .num_channels = 8,
+               .buffer_size = 4,
+       },
+       [NBPF8B8] = {
+               .num_channels = 8,
+               .buffer_size = 8,
+       },
+       [NBPF8B16] = {
+               .num_channels = 8,
+               .buffer_size = 16,
+       },
+};
+
+#define nbpf_to_chan(d) container_of(d, struct nbpf_channel, dma_chan)
+
+/*
+ * dmaengine drivers seem to have a lot in common and instead of sharing more
+ * code, they reimplement those common algorithms independently. In this driver
+ * we try to separate the hardware-specific part from the (largely) generic
+ * part. This improves code readability and makes it possible in the future to
+ * reuse the generic code in the form of a helper library. That generic code
+ * should be suitable for various DMA controllers, using transfer descriptors
+ * in RAM and pushing one SG list at a time to the DMA controller.
+ */
+
+/*             Hardware-specific part          */
+
+static inline u32 nbpf_chan_read(struct nbpf_channel *chan,
+                                unsigned int offset)
+{
+       u32 data = ioread32(chan->base + offset);
+       dev_dbg(chan->dma_chan.device->dev, "%s(0x%p + 0x%x) = 0x%x\n",
+               __func__, chan->base, offset, data);
+       return data;
+}
+
+static inline void nbpf_chan_write(struct nbpf_channel *chan,
+                                  unsigned int offset, u32 data)
+{
+       iowrite32(data, chan->base + offset);
+       dev_dbg(chan->dma_chan.device->dev, "%s(0x%p + 0x%x) = 0x%x\n",
+               __func__, chan->base, offset, data);
+}
+
+static inline u32 nbpf_read(struct nbpf_device *nbpf,
+                           unsigned int offset)
+{
+       u32 data = ioread32(nbpf->base + offset);
+       dev_dbg(nbpf->dma_dev.dev, "%s(0x%p + 0x%x) = 0x%x\n",
+               __func__, nbpf->base, offset, data);
+       return data;
+}
+
+static inline void nbpf_write(struct nbpf_device *nbpf,
+                             unsigned int offset, u32 data)
+{
+       iowrite32(data, nbpf->base + offset);
+       dev_dbg(nbpf->dma_dev.dev, "%s(0x%p + 0x%x) = 0x%x\n",
+               __func__, nbpf->base, offset, data);
+}
+
+static void nbpf_chan_halt(struct nbpf_channel *chan)
+{
+       nbpf_chan_write(chan, NBPF_CHAN_CTRL, NBPF_CHAN_CTRL_CLREN);
+}
+
+static bool nbpf_status_get(struct nbpf_channel *chan)
+{
+       u32 status = nbpf_read(chan->nbpf, NBPF_DSTAT_END);
+
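+       /* DSTAT_END has one bit per channel; the channel index selects the bit */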
+       return status & BIT(chan - chan->nbpf->chan);
+}
+
+static void nbpf_status_ack(struct nbpf_channel *chan)
+{
+       nbpf_chan_write(chan, NBPF_CHAN_CTRL, NBPF_CHAN_CTRL_CLREND);
+}
+
+static u32 nbpf_error_get(struct nbpf_device *nbpf)
+{
+       return nbpf_read(nbpf, NBPF_DSTAT_ER);
+}
+
+static struct nbpf_channel *nbpf_error_get_channel(struct nbpf_device *nbpf, u32 error)
+{
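+       /* DSTAT_ER has one bit per channel: pick the lowest-numbered faulting one */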
+       return nbpf->chan + __ffs(error);
+}
+
+static void nbpf_error_clear(struct nbpf_channel *chan)
+{
+       u32 status;
+       int i;
+
+       /* Stop the channel, make sure DMA has been aborted */
+       nbpf_chan_halt(chan);
+
+       for (i = 1000; i; i--) {
+               status = nbpf_chan_read(chan, NBPF_CHAN_STAT);
+               if (!(status & NBPF_CHAN_STAT_TACT))
+                       break;
+               cpu_relax();
+       }
+
+       if (!i)
+               dev_err(chan->dma_chan.device->dev,
+                       "%s(): abort timeout, channel status 0x%x\n", __func__, status);
+
+       nbpf_chan_write(chan, NBPF_CHAN_CTRL, NBPF_CHAN_CTRL_SWRST);
+}
+
+static int nbpf_start(struct nbpf_desc *desc)
+{
+       struct nbpf_channel *chan = desc->chan;
+       struct nbpf_link_desc *ldesc = list_first_entry(&desc->sg, struct nbpf_link_desc, node);
+
+       nbpf_chan_write(chan, NBPF_CHAN_NXLA, (u32)ldesc->hwdesc_dma_addr);
+       nbpf_chan_write(chan, NBPF_CHAN_CTRL, NBPF_CHAN_CTRL_SETEN | NBPF_CHAN_CTRL_CLRSUS);
+       chan->paused = false;
+
+       /* Software trigger MEMCPY - only MEMCPY uses the block mode */
+       if (ldesc->hwdesc->config & NBPF_CHAN_CFG_TM)
+               nbpf_chan_write(chan, NBPF_CHAN_CTRL, NBPF_CHAN_CTRL_STG);
+
+       dev_dbg(chan->nbpf->dma_dev.dev, "%s(): next 0x%x, cur 0x%x\n", __func__,
+               nbpf_chan_read(chan, NBPF_CHAN_NXLA), nbpf_chan_read(chan, NBPF_CHAN_CRLA));
+
+       return 0;
+}
+
+static void nbpf_chan_prepare(struct nbpf_channel *chan)
+{
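+       /* Build the DMA request line configuration from the DT-provided flags */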
+       chan->dmarq_cfg = (chan->flags & NBPF_SLAVE_RQ_HIGH ? NBPF_CHAN_CFG_HIEN : 0) |
+               (chan->flags & NBPF_SLAVE_RQ_LOW ? NBPF_CHAN_CFG_LOEN : 0) |
+               (chan->flags & NBPF_SLAVE_RQ_LEVEL ?
+                NBPF_CHAN_CFG_LVL | (NBPF_CHAN_CFG_AM & 0x200) : 0) |
+               chan->terminal;
+}
+
+static void nbpf_chan_prepare_default(struct nbpf_channel *chan)
+{
+       /* Don't output DMAACK */
+       chan->dmarq_cfg = NBPF_CHAN_CFG_AM & 0x400;
+       chan->terminal = 0;
+       chan->flags = 0;
+}
+
+static void nbpf_chan_configure(struct nbpf_channel *chan)
+{
+       /*
+        * We assume that only the link mode and DMA request line configuration
+        * have to be set in the configuration register manually. Dynamic
+        * per-transfer configuration will be loaded from transfer descriptors.
+        */
+       nbpf_chan_write(chan, NBPF_CHAN_CFG, NBPF_CHAN_CFG_DMS | chan->dmarq_cfg);
+}
+
+static u32 nbpf_xfer_ds(struct nbpf_device *nbpf, size_t size)
+{
+       /* Maximum supported bursts depend on the buffer size */
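+       /*
+        * The SDS / DDS register fields encode the size as a power of two,
+        * e.g. 0 = 1 byte, 7 = 128 bytes, hence the log2 arithmetic below
+        */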
+       return min_t(int, __ffs(size), ilog2(nbpf->config->buffer_size * 8));
+}
+
+static size_t nbpf_xfer_size(struct nbpf_device *nbpf,
+                            enum dma_slave_buswidth width, u32 burst)
+{
+       size_t size;
+
+       if (!burst)
+               burst = 1;
+
+       switch (width) {
+       case DMA_SLAVE_BUSWIDTH_8_BYTES:
+               size = 8 * burst;
+               break;
+
+       case DMA_SLAVE_BUSWIDTH_4_BYTES:
+               size = 4 * burst;
+               break;
+
+       case DMA_SLAVE_BUSWIDTH_2_BYTES:
+               size = 2 * burst;
+               break;
+
+       default:
+               pr_warn("%s(): invalid bus width %u\n", __func__, width);
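+               /* fall through - treat an invalid width as 1 byte */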
+       case DMA_SLAVE_BUSWIDTH_1_BYTE:
+               size = burst;
+       }
+
+       return nbpf_xfer_ds(nbpf, size);
+}
+
+/*
+ * We need a way to recognise slaves whose data is sent "raw" over the bus,
+ * i.e. when it isn't known in advance how many bytes will be received. In that
+ * case the slave driver has to provide a "large enough" buffer and either read
+ * it out when it is full, or detect that some data has arrived and then wait
+ * for a timeout; if no more data arrives - receive what's already there. We
+ * want to handle such slaves in a special way to allow an optimised mode for
+ * other users, for whom the amount of data is known in advance. So far there's
+ * no way to recognise such slaves, so we use a data-width check to distinguish
+ * between the SD host and the PL011 UART.
+ */
+
+static int nbpf_prep_one(struct nbpf_link_desc *ldesc,
+                        enum dma_transfer_direction direction,
+                        dma_addr_t src, dma_addr_t dst, size_t size, bool last)
+{
+       struct nbpf_link_reg *hwdesc = ldesc->hwdesc;
+       struct nbpf_desc *desc = ldesc->desc;
+       struct nbpf_channel *chan = desc->chan;
+       struct device *dev = chan->dma_chan.device->dev;
+       size_t mem_xfer, slave_xfer;
+       bool can_burst;
+
+       hwdesc->header = NBPF_HEADER_WBD | NBPF_HEADER_LV |
+               (last ? NBPF_HEADER_LE : 0);
+
+       hwdesc->src_addr = src;
+       hwdesc->dst_addr = dst;
+       hwdesc->transaction_size = size;
+
+       /*
+        * set config: SAD, DAD, DDS, SDS, etc.
+        * Note on transfer sizes: the DMAC can perform unaligned DMA transfers,
+        * but it is important that the transaction size be a multiple of both
+        * receiver and transmitter transfer sizes. It is also possible to use
+        * different RAM and device transfer sizes, and it does work well with
+        * some devices, e.g. with V08R07S01E SD host controllers, which can use
+        * 128 byte transfers. But this doesn't work with other devices,
+        * especially when the transaction size is unknown. This is the case,
+        * e.g. with serial drivers like amba-pl011.c. For reception it sets up
+        * a transaction size of 4K, and if fewer bytes are received, it
+        * pauses DMA and reads out data received via DMA as well as those left
+        * in the Rx FIFO. For this to work with the RAM side using burst
+        * transfers we enable the SBE bit and terminate the transfer in our
+        * DMA_PAUSE handler.
+        */
+       mem_xfer = nbpf_xfer_ds(chan->nbpf, size);
+
+       switch (direction) {
+       case DMA_DEV_TO_MEM:
+               can_burst = chan->slave_src_width >= 3;
+               slave_xfer = min(mem_xfer, can_burst ?
+                                chan->slave_src_burst : chan->slave_src_width);
+               /*
+                * Is the slave narrower than 64 bits, i.e. not using the full
+                * bus width, and therefore unable to use bursts?
+                */
+               if (mem_xfer > chan->slave_src_burst && !can_burst)
+                       mem_xfer = chan->slave_src_burst;
+               /* Device-to-RAM DMA is unreliable without REQD set */
+               hwdesc->config = NBPF_CHAN_CFG_SAD | (NBPF_CHAN_CFG_DDS & (mem_xfer << 16)) |
+                       (NBPF_CHAN_CFG_SDS & (slave_xfer << 12)) | NBPF_CHAN_CFG_REQD |
+                       NBPF_CHAN_CFG_SBE;
+               break;
+
+       case DMA_MEM_TO_DEV:
+               slave_xfer = min(mem_xfer, chan->slave_dst_width >= 3 ?
+                                chan->slave_dst_burst : chan->slave_dst_width);
+               hwdesc->config = NBPF_CHAN_CFG_DAD | (NBPF_CHAN_CFG_SDS & (mem_xfer << 12)) |
+                       (NBPF_CHAN_CFG_DDS & (slave_xfer << 16)) | NBPF_CHAN_CFG_REQD;
+               break;
+
+       case DMA_MEM_TO_MEM:
+               hwdesc->config = NBPF_CHAN_CFG_TCM | NBPF_CHAN_CFG_TM |
+                       (NBPF_CHAN_CFG_SDS & (mem_xfer << 12)) |
+                       (NBPF_CHAN_CFG_DDS & (mem_xfer << 16));
+               break;
+
+       default:
+               return -EINVAL;
+       }
+
+       hwdesc->config |= chan->dmarq_cfg | (last ? 0 : NBPF_CHAN_CFG_DEM) |
+               NBPF_CHAN_CFG_DMS;
+
+       dev_dbg(dev, "%s(): desc @ %pad: hdr 0x%x, cfg 0x%x, %zu @ %pad -> %pad\n",
+               __func__, &ldesc->hwdesc_dma_addr, hwdesc->header,
+               hwdesc->config, size, &src, &dst);
+
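+       /* The hwdesc is streaming-mapped: flush CPU writes before the DMAC reads it */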
+       dma_sync_single_for_device(dev, ldesc->hwdesc_dma_addr, sizeof(*hwdesc),
+                                  DMA_TO_DEVICE);
+
+       return 0;
+}
+
+static size_t nbpf_bytes_left(struct nbpf_channel *chan)
+{
+       return nbpf_chan_read(chan, NBPF_CHAN_CUR_TR_BYTE);
+}
+
+static void nbpf_configure(struct nbpf_device *nbpf)
+{
+       nbpf_write(nbpf, NBPF_CTRL, NBPF_CTRL_LVINT);
+}
+
+static void nbpf_pause(struct nbpf_channel *chan)
+{
+       nbpf_chan_write(chan, NBPF_CHAN_CTRL, NBPF_CHAN_CTRL_SETSUS);
+       /* See comment in nbpf_prep_one() */
+       nbpf_chan_write(chan, NBPF_CHAN_CTRL, NBPF_CHAN_CTRL_CLREN);
+}
+
+/*             Generic part                    */
+
+/* DMA ENGINE functions */
+static void nbpf_issue_pending(struct dma_chan *dchan)
+{
+       struct nbpf_channel *chan = nbpf_to_chan(dchan);
+       unsigned long flags;
+
+       dev_dbg(dchan->device->dev, "Entry %s()\n", __func__);
+
+       spin_lock_irqsave(&chan->lock, flags);
+       if (list_empty(&chan->queued))
+               goto unlock;
+
+       list_splice_tail_init(&chan->queued, &chan->active);
+
+       if (!chan->running) {
+               struct nbpf_desc *desc = list_first_entry(&chan->active,
+                                               struct nbpf_desc, node);
+               if (!nbpf_start(desc))
+                       chan->running = desc;
+       }
+
+unlock:
+       spin_unlock_irqrestore(&chan->lock, flags);
+}
+
+static enum dma_status nbpf_tx_status(struct dma_chan *dchan,
+               dma_cookie_t cookie, struct dma_tx_state *state)
+{
+       struct nbpf_channel *chan = nbpf_to_chan(dchan);
+       enum dma_status status = dma_cookie_status(dchan, cookie, state);
+
+       if (state) {
+               dma_cookie_t running;
+               unsigned long flags;
+
+               spin_lock_irqsave(&chan->lock, flags);
+               running = chan->running ? chan->running->async_tx.cookie : -EINVAL;
+
+               if (cookie == running) {
+                       state->residue = nbpf_bytes_left(chan);
+                       dev_dbg(dchan->device->dev, "%s(): residue %u\n", __func__,
+                               state->residue);
+               } else if (status == DMA_IN_PROGRESS) {
+                       struct nbpf_desc *desc;
+                       bool found = false;
+
+                       list_for_each_entry(desc, &chan->active, node)
+                               if (desc->async_tx.cookie == cookie) {
+                                       found = true;
+                                       break;
+                               }
+
+                       if (!found)
+                               list_for_each_entry(desc, &chan->queued, node)
+                                       if (desc->async_tx.cookie == cookie) {
+                                               found = true;
+                                               break;
+
+                                       }
+
+                       state->residue = found ? desc->length : 0;
+               }
+
+               spin_unlock_irqrestore(&chan->lock, flags);
+       }
+
+       if (chan->paused)
+               status = DMA_PAUSED;
+
+       return status;
+}
+
+static dma_cookie_t nbpf_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+       struct nbpf_desc *desc = container_of(tx, struct nbpf_desc, async_tx);
+       struct nbpf_channel *chan = desc->chan;
+       unsigned long flags;
+       dma_cookie_t cookie;
+
+       spin_lock_irqsave(&chan->lock, flags);
+       cookie = dma_cookie_assign(tx);
+       list_add_tail(&desc->node, &chan->queued);
+       spin_unlock_irqrestore(&chan->lock, flags);
+
+       dev_dbg(chan->dma_chan.device->dev, "Entry %s(%d)\n", __func__, cookie);
+
+       return cookie;
+}
+
+static int nbpf_desc_page_alloc(struct nbpf_channel *chan)
+{
+       struct dma_chan *dchan = &chan->dma_chan;
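+       /* One page holds the CPU descriptors plus the DMA-able HW link descriptors */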
+       struct nbpf_desc_page *dpage = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+       struct nbpf_link_desc *ldesc;
+       struct nbpf_link_reg *hwdesc;
+       struct nbpf_desc *desc;
+       LIST_HEAD(head);
+       LIST_HEAD(lhead);
+       int i;
+       struct device *dev = dchan->device->dev;
+
+       if (!dpage)
+               return -ENOMEM;
+
+       dev_dbg(dev, "%s(): alloc %lu descriptors, %lu segments, total alloc %zu\n",
+               __func__, NBPF_DESCS_PER_PAGE, NBPF_SEGMENTS_PER_PAGE, sizeof(*dpage));
+
+       for (i = 0, ldesc = dpage->ldesc, hwdesc = dpage->hwdesc;
+            i < ARRAY_SIZE(dpage->ldesc);
+            i++, ldesc++, hwdesc++) {
+               ldesc->hwdesc = hwdesc;
+               list_add_tail(&ldesc->node, &lhead);
+               ldesc->hwdesc_dma_addr = dma_map_single(dchan->device->dev,
+                                       hwdesc, sizeof(*hwdesc), DMA_TO_DEVICE);
+
+               dev_dbg(dev, "%s(): mapped 0x%p to %pad\n", __func__,
+                       hwdesc, &ldesc->hwdesc_dma_addr);
+       }
+
+       for (i = 0, desc = dpage->desc;
+            i < ARRAY_SIZE(dpage->desc);
+            i++, desc++) {
+               dma_async_tx_descriptor_init(&desc->async_tx, dchan);
+               desc->async_tx.tx_submit = nbpf_tx_submit;
+               desc->chan = chan;
+               INIT_LIST_HEAD(&desc->sg);
+               list_add_tail(&desc->node, &head);
+       }
+
+       /*
+        * This function cannot be called from interrupt context, so there is no
+        * need to save flags
+        */
+       spin_lock_irq(&chan->lock);
+       list_splice_tail(&lhead, &chan->free_links);
+       list_splice_tail(&head, &chan->free);
+       list_add(&dpage->node, &chan->desc_page);
+       spin_unlock_irq(&chan->lock);
+
+       return ARRAY_SIZE(dpage->desc);
+}
+
+static void nbpf_desc_put(struct nbpf_desc *desc)
+{
+       struct nbpf_channel *chan = desc->chan;
+       struct nbpf_link_desc *ldesc, *tmp;
+       unsigned long flags;
+
+       spin_lock_irqsave(&chan->lock, flags);
+       list_for_each_entry_safe(ldesc, tmp, &desc->sg, node)
+               list_move(&ldesc->node, &chan->free_links);
+
+       list_add(&desc->node, &chan->free);
+       spin_unlock_irqrestore(&chan->lock, flags);
+}
+
+static void nbpf_scan_acked(struct nbpf_channel *chan)
+{
+       struct nbpf_desc *desc, *tmp;
+       unsigned long flags;
+       LIST_HEAD(head);
+
+       spin_lock_irqsave(&chan->lock, flags);
+       list_for_each_entry_safe(desc, tmp, &chan->done, node)
+               if (async_tx_test_ack(&desc->async_tx) && desc->user_wait) {
+                       list_move(&desc->node, &head);
+                       desc->user_wait = false;
+               }
+       spin_unlock_irqrestore(&chan->lock, flags);
+
+       list_for_each_entry_safe(desc, tmp, &head, node) {
+               list_del(&desc->node);
+               nbpf_desc_put(desc);
+       }
+}
+
+/*
+ * We have to allocate descriptors with the channel lock dropped. This means
+ * that, before we re-acquire the lock, free descriptors can already have been
+ * taken by others, so we have to re-check after re-acquiring the lock and
+ * possibly retry, if they are gone again.
+ */
+static struct nbpf_desc *nbpf_desc_get(struct nbpf_channel *chan, size_t len)
+{
+       struct nbpf_desc *desc = NULL;
+       struct nbpf_link_desc *ldesc, *prev = NULL;
+
+       nbpf_scan_acked(chan);
+
+       spin_lock_irq(&chan->lock);
+
+       do {
+               int i = 0, ret;
+
+               if (list_empty(&chan->free)) {
+                       /* No more free descriptors */
+                       spin_unlock_irq(&chan->lock);
+                       ret = nbpf_desc_page_alloc(chan);
+                       if (ret < 0)
+                               return NULL;
+                       spin_lock_irq(&chan->lock);
+                       continue;
+               }
+               desc = list_first_entry(&chan->free, struct nbpf_desc, node);
+               list_del(&desc->node);
+
+               do {
+                       if (list_empty(&chan->free_links)) {
+                               /* No more free link descriptors */
+                               spin_unlock_irq(&chan->lock);
+                               ret = nbpf_desc_page_alloc(chan);
+                               if (ret < 0) {
+                                       nbpf_desc_put(desc);
+                                       return NULL;
+                               }
+                               spin_lock_irq(&chan->lock);
+                               continue;
+                       }
+
+                       ldesc = list_first_entry(&chan->free_links,
+                                                struct nbpf_link_desc, node);
+                       ldesc->desc = desc;
+                       if (prev)
+                               prev->hwdesc->next = (u32)ldesc->hwdesc_dma_addr;
+
+                       prev = ldesc;
+                       list_move_tail(&ldesc->node, &desc->sg);
+
+                       i++;
+               } while (i < len);
+       } while (!desc);
+
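+       /* Terminate the hardware link-descriptor chain */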
+       prev->hwdesc->next = 0;
+
+       spin_unlock_irq(&chan->lock);
+
+       return desc;
+}
+
+static void nbpf_chan_idle(struct nbpf_channel *chan)
+{
+       struct nbpf_desc *desc, *tmp;
+       unsigned long flags;
+       LIST_HEAD(head);
+
+       spin_lock_irqsave(&chan->lock, flags);
+
+       list_splice_init(&chan->done, &head);
+       list_splice_init(&chan->active, &head);
+       list_splice_init(&chan->queued, &head);
+
+       chan->running = NULL;
+
+       spin_unlock_irqrestore(&chan->lock, flags);
+
+       list_for_each_entry_safe(desc, tmp, &head, node) {
+               dev_dbg(chan->nbpf->dma_dev.dev, "%s(): force-free desc %p cookie %d\n",
+                       __func__, desc, desc->async_tx.cookie);
+               list_del(&desc->node);
+               nbpf_desc_put(desc);
+       }
+}
+
+static int nbpf_control(struct dma_chan *dchan, enum dma_ctrl_cmd cmd,
+                       unsigned long arg)
+{
+       struct nbpf_channel *chan = nbpf_to_chan(dchan);
+       struct dma_slave_config *config;
+
+       dev_dbg(dchan->device->dev, "Entry %s(%d)\n", __func__, cmd);
+
+       switch (cmd) {
+       case DMA_TERMINATE_ALL:
+               dev_dbg(dchan->device->dev, "Terminating\n");
+               nbpf_chan_halt(chan);
+               nbpf_chan_idle(chan);
+               break;
+
+       case DMA_SLAVE_CONFIG:
+               if (!arg)
+                       return -EINVAL;
+               config = (struct dma_slave_config *)arg;
+
+               /*
+                * We could check config->slave_id to match chan->terminal here,
+                * but with DT they would be coming from the same source, so
+                * such a check would be superfluous
+                */
+
+               chan->slave_dst_addr = config->dst_addr;
+               chan->slave_dst_width = nbpf_xfer_size(chan->nbpf,
+                                                      config->dst_addr_width, 1);
+               chan->slave_dst_burst = nbpf_xfer_size(chan->nbpf,
+                                                      config->dst_addr_width,
+                                                      config->dst_maxburst);
+               chan->slave_src_addr = config->src_addr;
+               chan->slave_src_width = nbpf_xfer_size(chan->nbpf,
+                                                      config->src_addr_width, 1);
+               chan->slave_src_burst = nbpf_xfer_size(chan->nbpf,
+                                                      config->src_addr_width,
+                                                      config->src_maxburst);
+               break;
+
+       case DMA_PAUSE:
+               chan->paused = true;
+               nbpf_pause(chan);
+               break;
+
+       default:
+               return -ENXIO;
+       }
+
+       return 0;
+}
+
+static struct dma_async_tx_descriptor *nbpf_prep_sg(struct nbpf_channel *chan,
+               struct scatterlist *src_sg, struct scatterlist *dst_sg,
+               size_t len, enum dma_transfer_direction direction,
+               unsigned long flags)
+{
+       struct nbpf_link_desc *ldesc;
+       struct scatterlist *mem_sg;
+       struct nbpf_desc *desc;
+       bool inc_src, inc_dst;
+       size_t data_len = 0;
+       int i = 0;
+
+       switch (direction) {
+       case DMA_DEV_TO_MEM:
+               mem_sg = dst_sg;
+               inc_src = false;
+               inc_dst = true;
+               break;
+
+       case DMA_MEM_TO_DEV:
+               mem_sg = src_sg;
+               inc_src = true;
+               inc_dst = false;
+               break;
+
+       default:
+       case DMA_MEM_TO_MEM:
+               mem_sg = src_sg;
+               inc_src = true;
+               inc_dst = true;
+       }
+
+       desc = nbpf_desc_get(chan, len);
+       if (!desc)
+               return NULL;
+
+       desc->async_tx.flags = flags;
+       desc->async_tx.cookie = -EBUSY;
+       desc->user_wait = false;
+
+       /*
+        * This is a private descriptor list, and we own the descriptor. No need
+        * to lock.
+        */
+       list_for_each_entry(ldesc, &desc->sg, node) {
+               int ret = nbpf_prep_one(ldesc, direction,
+                                       sg_dma_address(src_sg),
+                                       sg_dma_address(dst_sg),
+                                       sg_dma_len(mem_sg),
+                                       i == len - 1);
+               if (ret < 0) {
+                       nbpf_desc_put(desc);
+                       return NULL;
+               }
+               data_len += sg_dma_len(mem_sg);
+               if (inc_src)
+                       src_sg = sg_next(src_sg);
+               if (inc_dst)
+                       dst_sg = sg_next(dst_sg);
+               mem_sg = direction == DMA_DEV_TO_MEM ? dst_sg : src_sg;
+               i++;
+       }
+
+       desc->length = data_len;
+
+       /* The user has to return the descriptor to us ASAP via .tx_submit() */
+       return &desc->async_tx;
+}
+
+static struct dma_async_tx_descriptor *nbpf_prep_memcpy(
+       struct dma_chan *dchan, dma_addr_t dst, dma_addr_t src,
+       size_t len, unsigned long flags)
+{
+       struct nbpf_channel *chan = nbpf_to_chan(dchan);
+       struct scatterlist dst_sg;
+       struct scatterlist src_sg;
+
+       sg_init_table(&dst_sg, 1);
+       sg_init_table(&src_sg, 1);
+
+       sg_dma_address(&dst_sg) = dst;
+       sg_dma_address(&src_sg) = src;
+
+       sg_dma_len(&dst_sg) = len;
+       sg_dma_len(&src_sg) = len;
+
+       dev_dbg(dchan->device->dev, "%s(): %zu @ %pad -> %pad\n",
+               __func__, len, &src, &dst);
+
+       return nbpf_prep_sg(chan, &src_sg, &dst_sg, 1,
+                           DMA_MEM_TO_MEM, flags);
+}
+
+static struct dma_async_tx_descriptor *nbpf_prep_memcpy_sg(
+       struct dma_chan *dchan,
+       struct scatterlist *dst_sg, unsigned int dst_nents,
+       struct scatterlist *src_sg, unsigned int src_nents,
+       unsigned long flags)
+{
+       struct nbpf_channel *chan = nbpf_to_chan(dchan);
+
+       if (dst_nents != src_nents)
+               return NULL;
+
+       return nbpf_prep_sg(chan, src_sg, dst_sg, src_nents,
+                           DMA_MEM_TO_MEM, flags);
+}
+
+static struct dma_async_tx_descriptor *nbpf_prep_slave_sg(
+       struct dma_chan *dchan, struct scatterlist *sgl, unsigned int sg_len,
+       enum dma_transfer_direction direction, unsigned long flags, void *context)
+{
+       struct nbpf_channel *chan = nbpf_to_chan(dchan);
+       struct scatterlist slave_sg;
+
+       dev_dbg(dchan->device->dev, "Entry %s()\n", __func__);
+
+       sg_init_table(&slave_sg, 1);
+
+       switch (direction) {
+       case DMA_MEM_TO_DEV:
+               sg_dma_address(&slave_sg) = chan->slave_dst_addr;
+               return nbpf_prep_sg(chan, sgl, &slave_sg, sg_len,
+                                   direction, flags);
+
+       case DMA_DEV_TO_MEM:
+               sg_dma_address(&slave_sg) = chan->slave_src_addr;
+               return nbpf_prep_sg(chan, &slave_sg, sgl, sg_len,
+                                   direction, flags);
+
+       default:
+               return NULL;
+       }
+}
+
+static int nbpf_alloc_chan_resources(struct dma_chan *dchan)
+{
+       struct nbpf_channel *chan = nbpf_to_chan(dchan);
+       int ret;
+
+       INIT_LIST_HEAD(&chan->free);
+       INIT_LIST_HEAD(&chan->free_links);
+       INIT_LIST_HEAD(&chan->queued);
+       INIT_LIST_HEAD(&chan->active);
+       INIT_LIST_HEAD(&chan->done);
+
+       ret = nbpf_desc_page_alloc(chan);
+       if (ret < 0)
+               return ret;
+
+       dev_dbg(dchan->device->dev, "Entry %s(): terminal %u\n", __func__,
+               chan->terminal);
+
+       nbpf_chan_configure(chan);
+
+       return ret;
+}
+
+static void nbpf_free_chan_resources(struct dma_chan *dchan)
+{
+       struct nbpf_channel *chan = nbpf_to_chan(dchan);
+       struct nbpf_desc_page *dpage, *tmp;
+
+       dev_dbg(dchan->device->dev, "Entry %s()\n", __func__);
+
+       nbpf_chan_halt(chan);
+       nbpf_chan_idle(chan);
+       /* Clean up in case the channel is re-used for MEMCPY after slave DMA */
+       nbpf_chan_prepare_default(chan);
+
+       list_for_each_entry_safe(dpage, tmp, &chan->desc_page, node) {
+               struct nbpf_link_desc *ldesc;
+               int i;
+               list_del(&dpage->node);
+               for (i = 0, ldesc = dpage->ldesc;
+                    i < ARRAY_SIZE(dpage->ldesc);
+                    i++, ldesc++)
+                       dma_unmap_single(dchan->device->dev, ldesc->hwdesc_dma_addr,
+                                        sizeof(*ldesc->hwdesc), DMA_TO_DEVICE);
+               free_page((unsigned long)dpage);
+       }
+}
+
+static int nbpf_slave_caps(struct dma_chan *dchan,
+                          struct dma_slave_caps *caps)
+{
+       caps->src_addr_widths = NBPF_DMA_BUSWIDTHS;
+       caps->dstn_addr_widths = NBPF_DMA_BUSWIDTHS;
+       caps->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV);
+       caps->cmd_pause = false;
+       caps->cmd_terminate = true;
+
+       return 0;
+}
+
+static struct dma_chan *nbpf_of_xlate(struct of_phandle_args *dma_spec,
+                                     struct of_dma *ofdma)
+{
+       struct nbpf_device *nbpf = ofdma->of_dma_data;
+       struct dma_chan *dchan;
+       struct nbpf_channel *chan;
+
+       if (dma_spec->args_count != 2)
+               return NULL;
+
+       dchan = dma_get_any_slave_channel(&nbpf->dma_dev);
+       if (!dchan)
+               return NULL;
+
+       dev_dbg(dchan->device->dev, "Entry %s(%s)\n", __func__,
+               dma_spec->np->name);
+
+       chan = nbpf_to_chan(dchan);
+
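+       /* Two DT cells: cell 0 selects the terminal, cell 1 carries NBPF_SLAVE_RQ_* flags */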
+       chan->terminal = dma_spec->args[0];
+       chan->flags = dma_spec->args[1];
+
+       nbpf_chan_prepare(chan);
+       nbpf_chan_configure(chan);
+
+       return dchan;
+}
+
+static void nbpf_chan_tasklet(unsigned long data)
+{
+       struct nbpf_channel *chan = (struct nbpf_channel *)data;
+       struct nbpf_desc *desc, *tmp;
+       dma_async_tx_callback callback;
+       void *param;
+
+       while (!list_empty(&chan->done)) {
+               bool found = false, must_put, recycling = false;
+
+               spin_lock_irq(&chan->lock);
+
+               list_for_each_entry_safe(desc, tmp, &chan->done, node) {
+                       if (!desc->user_wait) {
+                               /* Newly completed descriptor, have to process */
+                               found = true;
+                               break;
+                       } else if (async_tx_test_ack(&desc->async_tx)) {
+                               /*
+                                * This descriptor was waiting for a user ACK,
+                                * it can be recycled now.
+                                */
+                               list_del(&desc->node);
+                               spin_unlock_irq(&chan->lock);
+                               nbpf_desc_put(desc);
+                               recycling = true;
+                               break;
+                       }
+               }
+
+               if (recycling)
+                       continue;
+
+               if (!found) {
+                       /* This can happen if TERMINATE_ALL has been called */
+                       spin_unlock_irq(&chan->lock);
+                       break;
+               }
+
+               dma_cookie_complete(&desc->async_tx);
+
+               /*
+                * Once the lock is released we must not dereference desc if it
+                * is still on the "done" list, so decide everything now
+                */
+               if (async_tx_test_ack(&desc->async_tx)) {
+                       list_del(&desc->node);
+                       must_put = true;
+               } else {
+                       desc->user_wait = true;
+                       must_put = false;
+               }
+
+               callback = desc->async_tx.callback;
+               param = desc->async_tx.callback_param;
+
+               /* ack and callback completed descriptor */
+               spin_unlock_irq(&chan->lock);
+
+               if (callback)
+                       callback(param);
+
+               if (must_put)
+                       nbpf_desc_put(desc);
+       }
+}
+
+static irqreturn_t nbpf_chan_irq(int irq, void *dev)
+{
+       struct nbpf_channel *chan = dev;
+       bool done = nbpf_status_get(chan);
+       struct nbpf_desc *desc;
+       irqreturn_t ret;
+       bool bh = false;
+
+       if (!done)
+               return IRQ_NONE;
+
+       nbpf_status_ack(chan);
+
+       dev_dbg(&chan->dma_chan.dev->device, "%s()\n", __func__);
+
+       spin_lock(&chan->lock);
+       desc = chan->running;
+       if (WARN_ON(!desc)) {
+               ret = IRQ_NONE;
+               goto unlock;
+       } else {
+               ret = IRQ_HANDLED;
+               bh = true;
+       }
+
+       list_move_tail(&desc->node, &chan->done);
+       chan->running = NULL;
+
+       if (!list_empty(&chan->active)) {
+               desc = list_first_entry(&chan->active,
+                                       struct nbpf_desc, node);
+               if (!nbpf_start(desc))
+                       chan->running = desc;
+       }
+
+unlock:
+       spin_unlock(&chan->lock);
+
+       if (bh)
+               tasklet_schedule(&chan->tasklet);
+
+       return ret;
+}
+
+static irqreturn_t nbpf_err_irq(int irq, void *dev)
+{
+       struct nbpf_device *nbpf = dev;
+       u32 error = nbpf_error_get(nbpf);
+
+       dev_warn(nbpf->dma_dev.dev, "DMA error IRQ %u\n", irq);
+
+       if (!error)
+               return IRQ_NONE;
+
+       do {
+               struct nbpf_channel *chan = nbpf_error_get_channel(nbpf, error);
+               /* On error: abort all queued transfers, no callback */
+               nbpf_error_clear(chan);
+               nbpf_chan_idle(chan);
+               error = nbpf_error_get(nbpf);
+       } while (error);
+
+       return IRQ_HANDLED;
+}
+
+static int nbpf_chan_probe(struct nbpf_device *nbpf, int n)
+{
+       struct dma_device *dma_dev = &nbpf->dma_dev;
+       struct nbpf_channel *chan = nbpf->chan + n;
+       int ret;
+
+       chan->nbpf = nbpf;
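+       /* Each channel has its own 0x40-byte register window within the DMAC */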
+       chan->base = nbpf->base + NBPF_REG_CHAN_OFFSET + NBPF_REG_CHAN_SIZE * n;
+       INIT_LIST_HEAD(&chan->desc_page);
+       spin_lock_init(&chan->lock);
+       chan->dma_chan.device = dma_dev;
+       dma_cookie_init(&chan->dma_chan);
+       nbpf_chan_prepare_default(chan);
+
+       dev_dbg(dma_dev->dev, "%s(): channel %d: -> %p\n", __func__, n, chan->base);
+
+       snprintf(chan->name, sizeof(chan->name), "nbpf %d", n);
+
+       tasklet_init(&chan->tasklet, nbpf_chan_tasklet, (unsigned long)chan);
+       ret = devm_request_irq(dma_dev->dev, chan->irq,
+                       nbpf_chan_irq, IRQF_SHARED,
+                       chan->name, chan);
+       if (ret < 0)
+               return ret;
+
+       /* Add the channel to DMA device channel list */
+       list_add_tail(&chan->dma_chan.device_node,
+                     &dma_dev->channels);
+
+       return 0;
+}
+
+static const struct of_device_id nbpf_match[] = {
+       {.compatible = "renesas,nbpfaxi64dmac1b4",      .data = &nbpf_cfg[NBPF1B4]},
+       {.compatible = "renesas,nbpfaxi64dmac1b8",      .data = &nbpf_cfg[NBPF1B8]},
+       {.compatible = "renesas,nbpfaxi64dmac1b16",     .data = &nbpf_cfg[NBPF1B16]},
+       {.compatible = "renesas,nbpfaxi64dmac4b4",      .data = &nbpf_cfg[NBPF4B4]},
+       {.compatible = "renesas,nbpfaxi64dmac4b8",      .data = &nbpf_cfg[NBPF4B8]},
+       {.compatible = "renesas,nbpfaxi64dmac4b16",     .data = &nbpf_cfg[NBPF4B16]},
+       {.compatible = "renesas,nbpfaxi64dmac8b4",      .data = &nbpf_cfg[NBPF8B4]},
+       {.compatible = "renesas,nbpfaxi64dmac8b8",      .data = &nbpf_cfg[NBPF8B8]},
+       {.compatible = "renesas,nbpfaxi64dmac8b16",     .data = &nbpf_cfg[NBPF8B16]},
+       {}
+};
+MODULE_DEVICE_TABLE(of, nbpf_match);
+
+static int nbpf_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       const struct of_device_id *of_id = of_match_device(nbpf_match, dev);
+       struct device_node *np = dev->of_node;
+       struct nbpf_device *nbpf;
+       struct dma_device *dma_dev;
+       struct resource *iomem, *irq_res;
+       const struct nbpf_config *cfg;
+       int num_channels;
+       int ret, irq, eirq, i;
+       int irqbuf[9] /* maximum 8 channels + error IRQ */;
+       unsigned int irqs = 0;
+
+       BUILD_BUG_ON(sizeof(struct nbpf_desc_page) > PAGE_SIZE);
+
+       /* DT only */
+       if (!np || !of_id || !of_id->data)
+               return -ENODEV;
+
+       cfg = of_id->data;
+       num_channels = cfg->num_channels;
+
+       nbpf = devm_kzalloc(dev, sizeof(*nbpf) + num_channels *
+                           sizeof(nbpf->chan[0]), GFP_KERNEL);
+       if (!nbpf) {
+               dev_err(dev, "Memory allocation failed\n");
+               return -ENOMEM;
+       }
+       dma_dev = &nbpf->dma_dev;
+       dma_dev->dev = dev;
+
+       iomem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       nbpf->base = devm_ioremap_resource(dev, iomem);
+       if (IS_ERR(nbpf->base))
+               return PTR_ERR(nbpf->base);
+
+       nbpf->clk = devm_clk_get(dev, NULL);
+       if (IS_ERR(nbpf->clk))
+               return PTR_ERR(nbpf->clk);
+
+       nbpf->config = cfg;
+
+       for (i = 0; irqs < ARRAY_SIZE(irqbuf); i++) {
+               irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, i);
+               if (!irq_res)
+                       break;
+
+               for (irq = irq_res->start; irq <= irq_res->end;
+                    irq++, irqs++)
+                       irqbuf[irqs] = irq;
+       }
+
+       /*
+        * 3 IRQ resource schemes are supported:
+        * 1. 1 shared IRQ for error and all channels
+        * 2. 2 IRQs: one for error and one shared for all channels
+        * 3. 1 IRQ for error and a dedicated IRQ for each channel
+        */
+       if (irqs != 1 && irqs != 2 && irqs != num_channels + 1)
+               return -ENXIO;
+
+       if (irqs == 1) {
+               eirq = irqbuf[0];
+
+               for (i = 0; i < num_channels; i++)
+                       nbpf->chan[i].irq = irqbuf[0];
+       } else {
+               eirq = platform_get_irq_byname(pdev, "error");
+               if (eirq < 0)
+                       return eirq;
+
+               if (irqs == num_channels + 1) {
+                       struct nbpf_channel *chan;
+
+                       for (i = 0, chan = nbpf->chan; i <= num_channels;
+                            i++, chan++) {
+                               /* Skip the error IRQ */
+                               if (irqbuf[i] == eirq)
+                                       i++;
+                               chan->irq = irqbuf[i];
+                       }
+
+                       if (chan != nbpf->chan + num_channels)
+                               return -EINVAL;
+               } else {
+                       /* 2 IRQs and more than one channel */
+                       if (irqbuf[0] == eirq)
+                               irq = irqbuf[1];
+                       else
+                               irq = irqbuf[0];
+
+                       for (i = 0; i < num_channels; i++)
+                               nbpf->chan[i].irq = irq;
+               }
+       }
+
+       ret = devm_request_irq(dev, eirq, nbpf_err_irq,
+                              IRQF_SHARED, "dma error", nbpf);
+       if (ret < 0)
+               return ret;
+
+       INIT_LIST_HEAD(&dma_dev->channels);
+
+       /* Create DMA Channel */
+       for (i = 0; i < num_channels; i++) {
+               ret = nbpf_chan_probe(nbpf, i);
+               if (ret < 0)
+                       return ret;
+       }
+
+       dma_cap_set(DMA_MEMCPY, dma_dev->cap_mask);
+       dma_cap_set(DMA_SLAVE, dma_dev->cap_mask);
+       dma_cap_set(DMA_PRIVATE, dma_dev->cap_mask);
+       dma_cap_set(DMA_SG, dma_dev->cap_mask);
+
+       /* Common and MEMCPY operations */
+       dma_dev->device_alloc_chan_resources
+               = nbpf_alloc_chan_resources;
+       dma_dev->device_free_chan_resources = nbpf_free_chan_resources;
+       dma_dev->device_prep_dma_sg = nbpf_prep_memcpy_sg;
+       dma_dev->device_prep_dma_memcpy = nbpf_prep_memcpy;
+       dma_dev->device_tx_status = nbpf_tx_status;
+       dma_dev->device_issue_pending = nbpf_issue_pending;
+       dma_dev->device_slave_caps = nbpf_slave_caps;
+
+       /*
+        * If we drop support for unaligned MEMCPY buffer addresses and / or
+        * lengths by setting
+        * dma_dev->copy_align = 4;
+        * then we can set transfer length to 4 bytes in nbpf_prep_one() for
+        * DMA_MEM_TO_MEM
+        */
+
+       /* Fields compulsory for DMA_SLAVE */
+       dma_dev->device_prep_slave_sg = nbpf_prep_slave_sg;
+       dma_dev->device_control = nbpf_control;
+
+       platform_set_drvdata(pdev, nbpf);
+
+       ret = clk_prepare_enable(nbpf->clk);
+       if (ret < 0)
+               return ret;
+
+       nbpf_configure(nbpf);
+
+       ret = dma_async_device_register(dma_dev);
+       if (ret < 0)
+               goto e_clk_off;
+
+       ret = of_dma_controller_register(np, nbpf_of_xlate, nbpf);
+       if (ret < 0)
+               goto e_dma_dev_unreg;
+
+       return 0;
+
+e_dma_dev_unreg:
+       dma_async_device_unregister(dma_dev);
+e_clk_off:
+       clk_disable_unprepare(nbpf->clk);
+
+       return ret;
+}
+
+static int nbpf_remove(struct platform_device *pdev)
+{
+       struct nbpf_device *nbpf = platform_get_drvdata(pdev);
+
+       of_dma_controller_free(pdev->dev.of_node);
+       dma_async_device_unregister(&nbpf->dma_dev);
+       clk_disable_unprepare(nbpf->clk);
+
+       return 0;
+}
+
+static struct platform_device_id nbpf_ids[] = {
+       {"nbpfaxi64dmac1b4",    (kernel_ulong_t)&nbpf_cfg[NBPF1B4]},
+       {"nbpfaxi64dmac1b8",    (kernel_ulong_t)&nbpf_cfg[NBPF1B8]},
+       {"nbpfaxi64dmac1b16",   (kernel_ulong_t)&nbpf_cfg[NBPF1B16]},
+       {"nbpfaxi64dmac4b4",    (kernel_ulong_t)&nbpf_cfg[NBPF4B4]},
+       {"nbpfaxi64dmac4b8",    (kernel_ulong_t)&nbpf_cfg[NBPF4B8]},
+       {"nbpfaxi64dmac4b16",   (kernel_ulong_t)&nbpf_cfg[NBPF4B16]},
+       {"nbpfaxi64dmac8b4",    (kernel_ulong_t)&nbpf_cfg[NBPF8B4]},
+       {"nbpfaxi64dmac8b8",    (kernel_ulong_t)&nbpf_cfg[NBPF8B8]},
+       {"nbpfaxi64dmac8b16",   (kernel_ulong_t)&nbpf_cfg[NBPF8B16]},
+       {},
+};
+MODULE_DEVICE_TABLE(platform, nbpf_ids);
+
+#ifdef CONFIG_PM_RUNTIME
+static int nbpf_runtime_suspend(struct device *dev)
+{
+       struct nbpf_device *nbpf = platform_get_drvdata(to_platform_device(dev));
+       clk_disable_unprepare(nbpf->clk);
+       return 0;
+}
+
+static int nbpf_runtime_resume(struct device *dev)
+{
+       struct nbpf_device *nbpf = platform_get_drvdata(to_platform_device(dev));
+       return clk_prepare_enable(nbpf->clk);
+}
+#endif
+
+static const struct dev_pm_ops nbpf_pm_ops = {
+       SET_RUNTIME_PM_OPS(nbpf_runtime_suspend, nbpf_runtime_resume, NULL)
+};
+
+static struct platform_driver nbpf_driver = {
+       .driver = {
+               .owner = THIS_MODULE,
+               .name = "dma-nbpf",
+               .of_match_table = nbpf_match,
+               .pm = &nbpf_pm_ops,
+       },
+       .id_table = nbpf_ids,
+       .probe = nbpf_probe,
+       .remove = nbpf_remove,
+};
+
+module_platform_driver(nbpf_driver);
+
+MODULE_AUTHOR("Guennadi Liakhovetski <g.liakhovetski@gmx.de>");
+MODULE_DESCRIPTION("dmaengine driver for NBPFAXI64* DMACs");
+MODULE_LICENSE("GPL v2");
index e8fe9dc455f4d8989e6d75618fecdd7ce351647f..d5fbeaa1e7ba76f25a20b7d266a26db3c3204c38 100644 (file)
@@ -218,3 +218,38 @@ struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec,
                        &dma_spec->args[0]);
 }
 EXPORT_SYMBOL_GPL(of_dma_simple_xlate);
+
+/**
+ * of_dma_xlate_by_chan_id - Translate dt property to DMA channel by channel id
+ * @dma_spec:  pointer to DMA specifier as found in the device tree
+ * @ofdma:     pointer to DMA controller data
+ *
+ * This function can be used as the OF xlate callback for a DMA driver which
+ * wants to match the channel based on the channel id. When using this xlate
+ * function the #dma-cells property of the DMA controller dt node needs to be
+ * set to 1. The data parameter of of_dma_controller_register must be a
+ * pointer to the dma_device struct the function should match upon.
+ *
+ * Returns a pointer to the matching DMA channel on success or NULL on error.
+ */
+struct dma_chan *of_dma_xlate_by_chan_id(struct of_phandle_args *dma_spec,
+                                        struct of_dma *ofdma)
+{
+       struct dma_device *dev = ofdma->of_dma_data;
+       struct dma_chan *chan, *candidate = NULL;
+
+       if (!dev || dma_spec->args_count != 1)
+               return NULL;
+
+       list_for_each_entry(chan, &dev->channels, device_node)
+               if (chan->chan_id == dma_spec->args[0]) {
+                       candidate = chan;
+                       break;
+               }
+
+       if (!candidate)
+               return NULL;
+
+       return dma_get_slave_channel(candidate);
+}
+EXPORT_SYMBOL_GPL(of_dma_xlate_by_chan_id);
index b19f04f4390bc1907c7e4a181e2bf53cc967bfa3..4cf7d9a950d71b2116ca96c032c6599b9dfdb1f2 100644 (file)
@@ -853,8 +853,7 @@ static struct dma_async_tx_descriptor *omap_dma_prep_slave_sg(
 
 static struct dma_async_tx_descriptor *omap_dma_prep_dma_cyclic(
        struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
-       size_t period_len, enum dma_transfer_direction dir, unsigned long flags,
-       void *context)
+       size_t period_len, enum dma_transfer_direction dir, unsigned long flags)
 {
        struct omap_dmadev *od = to_omap_dma_dev(chan->device);
        struct omap_chan *c = to_omap_dma_chan(chan);
index 73fa9b7a10ab36b05dbc54c2850325c1e1a0b566..d5149aacd2feefdb9a3d516187e6f5b03f084ada 100644 (file)
 #define PL330_MAX_IRQS         32
 #define PL330_MAX_PERI         32
 
-enum pl330_srccachectrl {
-       SCCTRL0,        /* Noncacheable and nonbufferable */
-       SCCTRL1,        /* Bufferable only */
-       SCCTRL2,        /* Cacheable, but do not allocate */
-       SCCTRL3,        /* Cacheable and bufferable, but do not allocate */
-       SINVALID1,
-       SINVALID2,
-       SCCTRL6,        /* Cacheable write-through, allocate on reads only */
-       SCCTRL7,        /* Cacheable write-back, allocate on reads only */
-};
-
-enum pl330_dstcachectrl {
-       DCCTRL0,        /* Noncacheable and nonbufferable */
-       DCCTRL1,        /* Bufferable only */
-       DCCTRL2,        /* Cacheable, but do not allocate */
-       DCCTRL3,        /* Cacheable and bufferable, but do not allocate */
-       DINVALID1,      /* AWCACHE = 0x1000 */
-       DINVALID2,
-       DCCTRL6,        /* Cacheable write-through, allocate on writes only */
-       DCCTRL7,        /* Cacheable write-back, allocate on writes only */
+enum pl330_cachectrl {
+       CCTRL0,         /* Noncacheable and nonbufferable */
+       CCTRL1,         /* Bufferable only */
+       CCTRL2,         /* Cacheable, but do not allocate */
+       CCTRL3,         /* Cacheable and bufferable, but do not allocate */
+       INVALID1,       /* AWCACHE = 0x1000 */
+       INVALID2,
+       CCTRL6,         /* Cacheable write-through, allocate on writes only */
+       CCTRL7,         /* Cacheable write-back, allocate on writes only */
 };
 
 enum pl330_byteswap {
@@ -63,13 +52,6 @@ enum pl330_byteswap {
        SWAP_16,
 };
 
-enum pl330_reqtype {
-       MEMTOMEM,
-       MEMTODEV,
-       DEVTOMEM,
-       DEVTODEV,
-};
-
 /* Register and Bit field Definitions */
 #define DS                     0x0
 #define DS_ST_STOP             0x0
@@ -263,9 +245,6 @@ enum pl330_reqtype {
  */
 #define MCODE_BUFF_PER_REQ     256
 
-/* If the _pl330_req is available to the client */
-#define IS_FREE(req)   (*((u8 *)((req)->mc_cpu)) == CMD_DMAEND)
-
 /* Use this _only_ to wait on transient states */
 #define UNTIL(t, s)    while (!(_state(t) & (s))) cpu_relax();
 
@@ -300,27 +279,6 @@ struct pl330_config {
        u32             irq_ns;
 };
 
-/* Handle to the DMAC provided to the PL330 core */
-struct pl330_info {
-       /* Owning device */
-       struct device *dev;
-       /* Size of MicroCode buffers for each channel. */
-       unsigned mcbufsz;
-       /* ioremap'ed address of PL330 registers. */
-       void __iomem    *base;
-       /* Client can freely use it. */
-       void    *client_data;
-       /* PL330 core data, Client must not touch it. */
-       void    *pl330_data;
-       /* Populated by the PL330 core driver during pl330_add */
-       struct pl330_config     pcfg;
-       /*
-        * If the DMAC has some reset mechanism, then the
-        * client may want to provide pointer to the method.
-        */
-       void (*dmac_reset)(struct pl330_info *pi);
-};
-
 /**
  * Request Configuration.
  * The PL330 core does not modify this and uses the last
@@ -344,8 +302,8 @@ struct pl330_reqcfg {
        unsigned brst_len:5;
        unsigned brst_size:3; /* in power of 2 */
 
-       enum pl330_dstcachectrl dcctl;
-       enum pl330_srccachectrl scctl;
+       enum pl330_cachectrl dcctl;
+       enum pl330_cachectrl scctl;
        enum pl330_byteswap swap;
        struct pl330_config *pcfg;
 };
@@ -359,11 +317,6 @@ struct pl330_xfer {
        u32 dst_addr;
        /* Size to xfer */
        u32 bytes;
-       /*
-        * Pointer to next xfer in the list.
-        * The last xfer in the req must point to NULL.
-        */
-       struct pl330_xfer *next;
 };
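With the next pointer gone, a pl330_xfer now describes exactly one
contiguous transfer. Scatter-gather is expressed as one dma_pl330_desc per
segment (see pl330_prep_slave_sg() further down) instead of a linked list of
xfers hanging off a single request.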
 
 /* The xfer callbacks are made with one of these arguments. */
@@ -376,67 +329,6 @@ enum pl330_op_err {
        PL330_ERR_FAIL,
 };
 
-/* A request defining Scatter-Gather List ending with NULL xfer. */
-struct pl330_req {
-       enum pl330_reqtype rqtype;
-       /* Index of peripheral for the xfer. */
-       unsigned peri:5;
-       /* Unique token for this xfer, set by the client. */
-       void *token;
-       /* Callback to be called after xfer. */
-       void (*xfer_cb)(void *token, enum pl330_op_err err);
-       /* If NULL, req will be done at last set parameters. */
-       struct pl330_reqcfg *cfg;
-       /* Pointer to first xfer in the request. */
-       struct pl330_xfer *x;
-       /* Hook to attach to DMAC's list of reqs with due callback */
-       struct list_head rqd;
-};
-
-/*
- * To know the status of the channel and DMAC, the client
- * provides a pointer to this structure. The PL330 core
- * fills it with current information.
- */
-struct pl330_chanstatus {
-       /*
-        * If the DMAC engine halted due to some error,
-        * the client should remove-add DMAC.
-        */
-       bool dmac_halted;
-       /*
-        * If channel is halted due to some error,
-        * the client should ABORT/FLUSH and START the channel.
-        */
-       bool faulting;
-       /* Location of last load */
-       u32 src_addr;
-       /* Location of last store */
-       u32 dst_addr;
-       /*
-        * Pointer to the currently active req, NULL if channel is
-        * inactive, even though the requests may be present.
-        */
-       struct pl330_req *top_req;
-       /* Pointer to req waiting second in the queue if any. */
-       struct pl330_req *wait_req;
-};
-
-enum pl330_chan_op {
-       /* Start the channel */
-       PL330_OP_START,
-       /* Abort the active xfer */
-       PL330_OP_ABORT,
-       /* Stop xfer and flush queue */
-       PL330_OP_FLUSH,
-};
-
-struct _xfer_spec {
-       u32 ccr;
-       struct pl330_req *r;
-       struct pl330_xfer *x;
-};
-
 enum dmamov_dst {
        SAR = 0,
        CCR,
@@ -454,12 +346,12 @@ enum pl330_cond {
        ALWAYS,
 };
 
+struct dma_pl330_desc;
+
 struct _pl330_req {
        u32 mc_bus;
        void *mc_cpu;
-       /* Number of bytes taken to setup MC for the req */
-       u32 mc_len;
-       struct pl330_req *r;
+       struct dma_pl330_desc *desc;
 };
 
 /* ToBeDone for tasklet */
@@ -491,30 +383,6 @@ enum pl330_dmac_state {
        DYING,
 };
 
-/* A DMAC */
-struct pl330_dmac {
-       spinlock_t              lock;
-       /* Holds list of reqs with due callbacks */
-       struct list_head        req_done;
-       /* Pointer to platform specific stuff */
-       struct pl330_info       *pinfo;
-       /* Maximum possible events/irqs */
-       int                     events[32];
-       /* BUS address of MicroCode buffer */
-       dma_addr_t              mcode_bus;
-       /* CPU address of MicroCode buffer */
-       void                    *mcode_cpu;
-       /* List of all Channel threads */
-       struct pl330_thread     *channels;
-       /* Pointer to the MANAGER thread */
-       struct pl330_thread     *manager;
-       /* To handle bad news in interrupt */
-       struct tasklet_struct   tasks;
-       struct _pl330_tbd       dmac_tbd;
-       /* State of DMAC operation */
-       enum pl330_dmac_state   state;
-};
-
 enum desc_status {
        /* In the DMAC pool */
        FREE,
@@ -555,15 +423,16 @@ struct dma_pl330_chan {
         * As the parent, this DMAC also provides descriptors
         * to the channel.
         */
-       struct dma_pl330_dmac *dmac;
+       struct pl330_dmac *dmac;
 
        /* To protect channel manipulation */
        spinlock_t lock;
 
-       /* Token of a hardware channel thread of PL330 DMAC
-        * NULL if the channel is available to be acquired.
+       /*
+        * Hardware channel thread of PL330 DMAC. NULL if the channel is
+        * available.
         */
-       void *pl330_chid;
+       struct pl330_thread *thread;
 
        /* For D-to-M and M-to-D channels */
        int burst_sz; /* the peripheral fifo width */
@@ -574,9 +443,7 @@ struct dma_pl330_chan {
        bool cyclic;
 };
 
-struct dma_pl330_dmac {
-       struct pl330_info pif;
-
+struct pl330_dmac {
        /* DMA-Engine Device */
        struct dma_device ddma;
 
@@ -588,6 +455,32 @@ struct dma_pl330_dmac {
        /* To protect desc_pool manipulation */
        spinlock_t pool_lock;
 
+       /* Size of MicroCode buffers for each channel. */
+       unsigned mcbufsz;
+       /* ioremap'ed address of PL330 registers. */
+       void __iomem    *base;
+       /* Populated by the PL330 core driver during pl330_add */
+       struct pl330_config     pcfg;
+
+       spinlock_t              lock;
+       /* Maximum possible events/irqs */
+       int                     events[32];
+       /* BUS address of MicroCode buffer */
+       dma_addr_t              mcode_bus;
+       /* CPU address of MicroCode buffer */
+       void                    *mcode_cpu;
+       /* List of all Channel threads */
+       struct pl330_thread     *channels;
+       /* Pointer to the MANAGER thread */
+       struct pl330_thread     *manager;
+       /* To handle bad news in interrupt */
+       struct tasklet_struct   tasks;
+       struct _pl330_tbd       dmac_tbd;
+       /* State of DMAC operation */
+       enum pl330_dmac_state   state;
+       /* Holds list of reqs with due callbacks */
+       struct list_head        req_done;
+
        /* Peripheral channels connected to this DMAC */
        unsigned int num_peripherals;
        struct dma_pl330_chan *peripherals; /* keep at end */
@@ -604,49 +497,43 @@ struct dma_pl330_desc {
        struct pl330_xfer px;
 
        struct pl330_reqcfg rqcfg;
-       struct pl330_req req;
 
        enum desc_status status;
 
        /* The channel which currently holds this desc */
        struct dma_pl330_chan *pchan;
+
+       enum dma_transfer_direction rqtype;
+       /* Index of peripheral for the xfer. */
+       unsigned peri:5;
+       /* Hook to attach to DMAC's list of reqs with due callback */
+       struct list_head rqd;
 };
 
-static inline void _callback(struct pl330_req *r, enum pl330_op_err err)
-{
-       if (r && r->xfer_cb)
-               r->xfer_cb(r->token, err);
-}
+struct _xfer_spec {
+       u32 ccr;
+       struct dma_pl330_desc *desc;
+};
 
 static inline bool _queue_empty(struct pl330_thread *thrd)
 {
-       return (IS_FREE(&thrd->req[0]) && IS_FREE(&thrd->req[1]))
-               ? true : false;
+       return thrd->req[0].desc == NULL && thrd->req[1].desc == NULL;
 }
 
 static inline bool _queue_full(struct pl330_thread *thrd)
 {
-       return (IS_FREE(&thrd->req[0]) || IS_FREE(&thrd->req[1]))
-               ? false : true;
+       return thrd->req[0].desc != NULL && thrd->req[1].desc != NULL;
 }
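With IS_FREE() gone, occupancy of a thread's two-deep request queue is
tracked purely by the desc pointers: a slot is free exactly when its desc is
NULL. The enqueue side, as it appears in pl330_submit_req() below, reduces
to:

	idx = thrd->req[0].desc == NULL ? 0 : 1;	/* pick a free slot */
	thrd->lstenq = idx;				/* remember for _trigger() */
	thrd->req[idx].desc = desc;			/* mark the slot busy */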
 
 static inline bool is_manager(struct pl330_thread *thrd)
 {
-       struct pl330_dmac *pl330 = thrd->dmac;
-
-       /* MANAGER is indexed at the end */
-       if (thrd->id == pl330->pinfo->pcfg.num_chan)
-               return true;
-       else
-               return false;
+       return thrd->dmac->manager == thrd;
 }
 
 /* If manager of the thread is in Non-Secure mode */
 static inline bool _manager_ns(struct pl330_thread *thrd)
 {
-       struct pl330_dmac *pl330 = thrd->dmac;
-
-       return (pl330->pinfo->pcfg.mode & DMAC_MODE_NS) ? true : false;
+       return (thrd->dmac->pcfg.mode & DMAC_MODE_NS) ? true : false;
 }
 
 static inline u32 get_revision(u32 periph_id)
@@ -1004,7 +891,7 @@ static inline u32 _emit_GO(unsigned dry_run, u8 buf[],
 /* Returns Time-Out */
 static bool _until_dmac_idle(struct pl330_thread *thrd)
 {
-       void __iomem *regs = thrd->dmac->pinfo->base;
+       void __iomem *regs = thrd->dmac->base;
        unsigned long loops = msecs_to_loops(5);
 
        do {
@@ -1024,7 +911,7 @@ static bool _until_dmac_idle(struct pl330_thread *thrd)
 static inline void _execute_DBGINSN(struct pl330_thread *thrd,
                u8 insn[], bool as_manager)
 {
-       void __iomem *regs = thrd->dmac->pinfo->base;
+       void __iomem *regs = thrd->dmac->base;
        u32 val;
 
        val = (insn[0] << 16) | (insn[1] << 24);
@@ -1039,7 +926,7 @@ static inline void _execute_DBGINSN(struct pl330_thread *thrd,
 
        /* If timed out due to halted state-machine */
        if (_until_dmac_idle(thrd)) {
-               dev_err(thrd->dmac->pinfo->dev, "DMAC halted!\n");
+               dev_err(thrd->dmac->ddma.dev, "DMAC halted!\n");
                return;
        }
 
@@ -1047,25 +934,9 @@ static inline void _execute_DBGINSN(struct pl330_thread *thrd,
        writel(0, regs + DBGCMD);
 }
 
-/*
- * Mark a _pl330_req as free.
- * We do it by writing DMAEND as the first instruction
- * because no valid request is going to have DMAEND as
- * its first instruction to execute.
- */
-static void mark_free(struct pl330_thread *thrd, int idx)
-{
-       struct _pl330_req *req = &thrd->req[idx];
-
-       _emit_END(0, req->mc_cpu);
-       req->mc_len = 0;
-
-       thrd->req_running = -1;
-}
-
 static inline u32 _state(struct pl330_thread *thrd)
 {
-       void __iomem *regs = thrd->dmac->pinfo->base;
+       void __iomem *regs = thrd->dmac->base;
        u32 val;
 
        if (is_manager(thrd))
@@ -1123,7 +994,7 @@ static inline u32 _state(struct pl330_thread *thrd)
 
 static void _stop(struct pl330_thread *thrd)
 {
-       void __iomem *regs = thrd->dmac->pinfo->base;
+       void __iomem *regs = thrd->dmac->base;
        u8 insn[6] = {0, 0, 0, 0, 0, 0};
 
        if (_state(thrd) == PL330_STATE_FAULT_COMPLETING)
@@ -1146,9 +1017,9 @@ static void _stop(struct pl330_thread *thrd)
 /* Start doing req 'idx' of thread 'thrd' */
 static bool _trigger(struct pl330_thread *thrd)
 {
-       void __iomem *regs = thrd->dmac->pinfo->base;
+       void __iomem *regs = thrd->dmac->base;
        struct _pl330_req *req;
-       struct pl330_req *r;
+       struct dma_pl330_desc *desc;
        struct _arg_GO go;
        unsigned ns;
        u8 insn[6] = {0, 0, 0, 0, 0, 0};
@@ -1159,32 +1030,27 @@ static bool _trigger(struct pl330_thread *thrd)
                return true;
 
        idx = 1 - thrd->lstenq;
-       if (!IS_FREE(&thrd->req[idx]))
+       if (thrd->req[idx].desc != NULL) {
                req = &thrd->req[idx];
-       else {
+       } else {
                idx = thrd->lstenq;
-               if (!IS_FREE(&thrd->req[idx]))
+               if (thrd->req[idx].desc != NULL)
                        req = &thrd->req[idx];
                else
                        req = NULL;
        }
 
        /* Return if no request */
-       if (!req || !req->r)
+       if (!req)
                return true;
 
-       r = req->r;
+       desc = req->desc;
 
-       if (r->cfg)
-               ns = r->cfg->nonsecure ? 1 : 0;
-       else if (readl(regs + CS(thrd->id)) & CS_CNS)
-               ns = 1;
-       else
-               ns = 0;
+       ns = desc->rqcfg.nonsecure ? 1 : 0;
 
        /* See 'Abort Sources' point-4 at Page 2-25 */
        if (_manager_ns(thrd) && !ns)
-               dev_info(thrd->dmac->pinfo->dev, "%s:%d Recipe for ABORT!\n",
+               dev_info(thrd->dmac->ddma.dev, "%s:%d Recipe for ABORT!\n",
                        __func__, __LINE__);
 
        go.chan = thrd->id;
@@ -1240,7 +1106,7 @@ static inline int _ldst_memtomem(unsigned dry_run, u8 buf[],
                const struct _xfer_spec *pxs, int cyc)
 {
        int off = 0;
-       struct pl330_config *pcfg = pxs->r->cfg->pcfg;
+       struct pl330_config *pcfg = pxs->desc->rqcfg.pcfg;
 
        /* check lock-up free version */
        if (get_revision(pcfg->periph_id) >= PERIPH_REV_R1P0) {
@@ -1266,10 +1132,10 @@ static inline int _ldst_devtomem(unsigned dry_run, u8 buf[],
        int off = 0;
 
        while (cyc--) {
-               off += _emit_WFP(dry_run, &buf[off], SINGLE, pxs->r->peri);
-               off += _emit_LDP(dry_run, &buf[off], SINGLE, pxs->r->peri);
+               off += _emit_WFP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
+               off += _emit_LDP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
                off += _emit_ST(dry_run, &buf[off], ALWAYS);
-               off += _emit_FLUSHP(dry_run, &buf[off], pxs->r->peri);
+               off += _emit_FLUSHP(dry_run, &buf[off], pxs->desc->peri);
        }
 
        return off;
@@ -1281,10 +1147,10 @@ static inline int _ldst_memtodev(unsigned dry_run, u8 buf[],
        int off = 0;
 
        while (cyc--) {
-               off += _emit_WFP(dry_run, &buf[off], SINGLE, pxs->r->peri);
+               off += _emit_WFP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
                off += _emit_LD(dry_run, &buf[off], ALWAYS);
-               off += _emit_STP(dry_run, &buf[off], SINGLE, pxs->r->peri);
-               off += _emit_FLUSHP(dry_run, &buf[off], pxs->r->peri);
+               off += _emit_STP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
+               off += _emit_FLUSHP(dry_run, &buf[off], pxs->desc->peri);
        }
 
        return off;
@@ -1295,14 +1161,14 @@ static int _bursts(unsigned dry_run, u8 buf[],
 {
        int off = 0;
 
-       switch (pxs->r->rqtype) {
-       case MEMTODEV:
+       switch (pxs->desc->rqtype) {
+       case DMA_MEM_TO_DEV:
                off += _ldst_memtodev(dry_run, &buf[off], pxs, cyc);
                break;
-       case DEVTOMEM:
+       case DMA_DEV_TO_MEM:
                off += _ldst_devtomem(dry_run, &buf[off], pxs, cyc);
                break;
-       case MEMTOMEM:
+       case DMA_MEM_TO_MEM:
                off += _ldst_memtomem(dry_run, &buf[off], pxs, cyc);
                break;
        default:
@@ -1395,7 +1261,7 @@ static inline int _loop(unsigned dry_run, u8 buf[],
 static inline int _setup_loops(unsigned dry_run, u8 buf[],
                const struct _xfer_spec *pxs)
 {
-       struct pl330_xfer *x = pxs->x;
+       struct pl330_xfer *x = &pxs->desc->px;
        u32 ccr = pxs->ccr;
        unsigned long c, bursts = BYTE_TO_BURST(x->bytes, ccr);
        int off = 0;
@@ -1412,7 +1278,7 @@ static inline int _setup_loops(unsigned dry_run, u8 buf[],
 static inline int _setup_xfer(unsigned dry_run, u8 buf[],
                const struct _xfer_spec *pxs)
 {
-       struct pl330_xfer *x = pxs->x;
+       struct pl330_xfer *x = &pxs->desc->px;
        int off = 0;
 
        /* DMAMOV SAR, x->src_addr */
@@ -1443,17 +1309,12 @@ static int _setup_req(unsigned dry_run, struct pl330_thread *thrd,
        /* DMAMOV CCR, ccr */
        off += _emit_MOV(dry_run, &buf[off], CCR, pxs->ccr);
 
-       x = pxs->r->x;
-       do {
-               /* Error if xfer length is not aligned at burst size */
-               if (x->bytes % (BRST_SIZE(pxs->ccr) * BRST_LEN(pxs->ccr)))
-                       return -EINVAL;
-
-               pxs->x = x;
-               off += _setup_xfer(dry_run, &buf[off], pxs);
+       x = &pxs->desc->px;
+       /* Error if xfer length is not aligned at burst size */
+       if (x->bytes % (BRST_SIZE(pxs->ccr) * BRST_LEN(pxs->ccr)))
+               return -EINVAL;
 
-               x = x->next;
-       } while (x);
+       off += _setup_xfer(dry_run, &buf[off], pxs);
 
        /* DMASEV peripheral/event */
        off += _emit_SEV(dry_run, &buf[off], thrd->ev);
@@ -1495,31 +1356,15 @@ static inline u32 _prepare_ccr(const struct pl330_reqcfg *rqc)
        return ccr;
 }
 
-static inline bool _is_valid(u32 ccr)
-{
-       enum pl330_dstcachectrl dcctl;
-       enum pl330_srccachectrl scctl;
-
-       dcctl = (ccr >> CC_DSTCCTRL_SHFT) & CC_DRCCCTRL_MASK;
-       scctl = (ccr >> CC_SRCCCTRL_SHFT) & CC_SRCCCTRL_MASK;
-
-       if (dcctl == DINVALID1 || dcctl == DINVALID2
-                       || scctl == SINVALID1 || scctl == SINVALID2)
-               return false;
-       else
-               return true;
-}
-
 /*
  * Submit a list of xfers after which the client wants notification.
  * Client is not notified after each xfer unit, just once after all
  * xfer units are done or some error occurs.
  */
-static int pl330_submit_req(void *ch_id, struct pl330_req *r)
+static int pl330_submit_req(struct pl330_thread *thrd,
+       struct dma_pl330_desc *desc)
 {
-       struct pl330_thread *thrd = ch_id;
-       struct pl330_dmac *pl330;
-       struct pl330_info *pi;
+       struct pl330_dmac *pl330 = thrd->dmac;
        struct _xfer_spec xs;
        unsigned long flags;
        void __iomem *regs;
@@ -1528,25 +1373,24 @@ static int pl330_submit_req(void *ch_id, struct pl330_req *r)
        int ret = 0;
 
        /* No Req or Unacquired Channel or DMAC */
-       if (!r || !thrd || thrd->free)
+       if (!desc || !thrd || thrd->free)
                return -EINVAL;
 
-       pl330 = thrd->dmac;
-       pi = pl330->pinfo;
-       regs = pi->base;
+       regs = thrd->dmac->base;
 
        if (pl330->state == DYING
                || pl330->dmac_tbd.reset_chan & (1 << thrd->id)) {
-               dev_info(thrd->dmac->pinfo->dev, "%s:%d\n",
+               dev_info(thrd->dmac->ddma.dev, "%s:%d\n",
                        __func__, __LINE__);
                return -EAGAIN;
        }
 
        /* If request for non-existing peripheral */
-       if (r->rqtype != MEMTOMEM && r->peri >= pi->pcfg.num_peri) {
-               dev_info(thrd->dmac->pinfo->dev,
+       if (desc->rqtype != DMA_MEM_TO_MEM &&
+           desc->peri >= pl330->pcfg.num_peri) {
+               dev_info(thrd->dmac->ddma.dev,
                                "%s:%d Invalid peripheral(%u)!\n",
-                               __func__, __LINE__, r->peri);
+                               __func__, __LINE__, desc->peri);
                return -EINVAL;
        }
 
@@ -1557,41 +1401,26 @@ static int pl330_submit_req(void *ch_id, struct pl330_req *r)
                goto xfer_exit;
        }
 
+       /* Prefer Secure Channel */
+       if (!_manager_ns(thrd))
+               desc->rqcfg.nonsecure = 0;
+       else
+               desc->rqcfg.nonsecure = 1;
 
-       /* Use last settings, if not provided */
-       if (r->cfg) {
-               /* Prefer Secure Channel */
-               if (!_manager_ns(thrd))
-                       r->cfg->nonsecure = 0;
-               else
-                       r->cfg->nonsecure = 1;
-
-               ccr = _prepare_ccr(r->cfg);
-       } else {
-               ccr = readl(regs + CC(thrd->id));
-       }
-
-       /* If this req doesn't have valid xfer settings */
-       if (!_is_valid(ccr)) {
-               ret = -EINVAL;
-               dev_info(thrd->dmac->pinfo->dev, "%s:%d Invalid CCR(%x)!\n",
-                       __func__, __LINE__, ccr);
-               goto xfer_exit;
-       }
+       ccr = _prepare_ccr(&desc->rqcfg);
 
-       idx = IS_FREE(&thrd->req[0]) ? 0 : 1;
+       idx = thrd->req[0].desc == NULL ? 0 : 1;
 
        xs.ccr = ccr;
-       xs.r = r;
+       xs.desc = desc;
 
        /* First dry run to check if req is acceptable */
        ret = _setup_req(1, thrd, idx, &xs);
        if (ret < 0)
                goto xfer_exit;
 
-       if (ret > pi->mcbufsz / 2) {
-               dev_info(thrd->dmac->pinfo->dev,
-                       "%s:%d Trying increasing mcbufsz\n",
+       if (ret > pl330->mcbufsz / 2) {
+               dev_info(pl330->ddma.dev, "%s:%d Try increasing mcbufsz\n",
                                __func__, __LINE__);
                ret = -ENOMEM;
                goto xfer_exit;
@@ -1599,8 +1428,8 @@ static int pl330_submit_req(void *ch_id, struct pl330_req *r)
 
        /* Hook the request */
        thrd->lstenq = idx;
-       thrd->req[idx].mc_len = _setup_req(0, thrd, idx, &xs);
-       thrd->req[idx].r = r;
+       thrd->req[idx].desc = desc;
+       _setup_req(0, thrd, idx, &xs);
 
        ret = 0;
 
@@ -1610,10 +1439,32 @@ xfer_exit:
        return ret;
 }
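The two _setup_req() calls above are the driver's dry-run idiom: the first
pass (dry_run = 1) only sizes and validates the generated program, since the
_emit_*() helpers skip the store and just return instruction lengths when
dry_run is set; the microcode is emitted for real only if it fits in half of
mcbufsz:

	ret = _setup_req(1, thrd, idx, &xs);	/* dry run: size and validate */
	...					/* bail out unless ret fits in mcbufsz / 2 */
	_setup_req(0, thrd, idx, &xs);		/* real run: emit the microcode */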
 
+static void dma_pl330_rqcb(struct dma_pl330_desc *desc, enum pl330_op_err err)
+{
+       struct dma_pl330_chan *pch;
+       unsigned long flags;
+
+       if (!desc)
+               return;
+
+       pch = desc->pchan;
+
+       /* If desc aborted */
+       if (!pch)
+               return;
+
+       spin_lock_irqsave(&pch->lock, flags);
+
+       desc->status = DONE;
+
+       spin_unlock_irqrestore(&pch->lock, flags);
+
+       tasklet_schedule(&pch->task);
+}
+
 static void pl330_dotask(unsigned long data)
 {
        struct pl330_dmac *pl330 = (struct pl330_dmac *) data;
-       struct pl330_info *pi = pl330->pinfo;
        unsigned long flags;
        int i;
 
@@ -1631,16 +1482,16 @@ static void pl330_dotask(unsigned long data)
        if (pl330->dmac_tbd.reset_mngr) {
                _stop(pl330->manager);
                /* Reset all channels */
-               pl330->dmac_tbd.reset_chan = (1 << pi->pcfg.num_chan) - 1;
+               pl330->dmac_tbd.reset_chan = (1 << pl330->pcfg.num_chan) - 1;
                /* Clear the reset flag */
                pl330->dmac_tbd.reset_mngr = false;
        }
 
-       for (i = 0; i < pi->pcfg.num_chan; i++) {
+       for (i = 0; i < pl330->pcfg.num_chan; i++) {
 
                if (pl330->dmac_tbd.reset_chan & (1 << i)) {
                        struct pl330_thread *thrd = &pl330->channels[i];
-                       void __iomem *regs = pi->base;
+                       void __iomem *regs = pl330->base;
                        enum pl330_op_err err;
 
                        _stop(thrd);
@@ -1651,16 +1502,13 @@ static void pl330_dotask(unsigned long data)
                                err = PL330_ERR_ABORT;
 
                        spin_unlock_irqrestore(&pl330->lock, flags);
-
-                       _callback(thrd->req[1 - thrd->lstenq].r, err);
-                       _callback(thrd->req[thrd->lstenq].r, err);
-
+                       dma_pl330_rqcb(thrd->req[1 - thrd->lstenq].desc, err);
+                       dma_pl330_rqcb(thrd->req[thrd->lstenq].desc, err);
                        spin_lock_irqsave(&pl330->lock, flags);
 
-                       thrd->req[0].r = NULL;
-                       thrd->req[1].r = NULL;
-                       mark_free(thrd, 0);
-                       mark_free(thrd, 1);
+                       thrd->req[0].desc = NULL;
+                       thrd->req[1].desc = NULL;
+                       thrd->req_running = -1;
 
                        /* Clear the reset flag */
                        pl330->dmac_tbd.reset_chan &= ~(1 << i);
@@ -1673,20 +1521,15 @@ static void pl330_dotask(unsigned long data)
 }
 
 /* Returns 1 if state was updated, 0 otherwise */
-static int pl330_update(const struct pl330_info *pi)
+static int pl330_update(struct pl330_dmac *pl330)
 {
-       struct pl330_req *rqdone, *tmp;
-       struct pl330_dmac *pl330;
+       struct dma_pl330_desc *descdone, *tmp;
        unsigned long flags;
        void __iomem *regs;
        u32 val;
        int id, ev, ret = 0;
 
-       if (!pi || !pi->pl330_data)
-               return 0;
-
-       regs = pi->base;
-       pl330 = pi->pl330_data;
+       regs = pl330->base;
 
        spin_lock_irqsave(&pl330->lock, flags);
 
@@ -1696,13 +1539,13 @@ static int pl330_update(const struct pl330_info *pi)
        else
                pl330->dmac_tbd.reset_mngr = false;
 
-       val = readl(regs + FSC) & ((1 << pi->pcfg.num_chan) - 1);
+       val = readl(regs + FSC) & ((1 << pl330->pcfg.num_chan) - 1);
        pl330->dmac_tbd.reset_chan |= val;
        if (val) {
                int i = 0;
-               while (i < pi->pcfg.num_chan) {
+               while (i < pl330->pcfg.num_chan) {
                        if (val & (1 << i)) {
-                               dev_info(pi->dev,
+                               dev_info(pl330->ddma.dev,
                                        "Reset Channel-%d\t CS-%x FTC-%x\n",
                                                i, readl(regs + CS(i)),
                                                readl(regs + FTC(i)));
@@ -1714,15 +1557,16 @@ static int pl330_update(const struct pl330_info *pi)
 
        /* Check which event happened i.e, thread notified */
        val = readl(regs + ES);
-       if (pi->pcfg.num_events < 32
-                       && val & ~((1 << pi->pcfg.num_events) - 1)) {
+       if (pl330->pcfg.num_events < 32
+                       && val & ~((1 << pl330->pcfg.num_events) - 1)) {
                pl330->dmac_tbd.reset_dmac = true;
-               dev_err(pi->dev, "%s:%d Unexpected!\n", __func__, __LINE__);
+               dev_err(pl330->ddma.dev, "%s:%d Unexpected!\n", __func__,
+                       __LINE__);
                ret = 1;
                goto updt_exit;
        }
 
-       for (ev = 0; ev < pi->pcfg.num_events; ev++) {
+       for (ev = 0; ev < pl330->pcfg.num_events; ev++) {
                if (val & (1 << ev)) { /* Event occurred */
                        struct pl330_thread *thrd;
                        u32 inten = readl(regs + INTEN);
@@ -1743,25 +1587,22 @@ static int pl330_update(const struct pl330_info *pi)
                                continue;
 
                        /* Detach the req */
-                       rqdone = thrd->req[active].r;
-                       thrd->req[active].r = NULL;
-
-                       mark_free(thrd, active);
+                       descdone = thrd->req[active].desc;
+                       thrd->req[active].desc = NULL;
 
                        /* Get going again ASAP */
                        _start(thrd);
 
                        /* For now, just make a list of callbacks to be done */
-                       list_add_tail(&rqdone->rqd, &pl330->req_done);
+                       list_add_tail(&descdone->rqd, &pl330->req_done);
                }
        }
 
        /* Now that we are in no hurry, do the callbacks */
-       list_for_each_entry_safe(rqdone, tmp, &pl330->req_done, rqd) {
-               list_del(&rqdone->rqd);
-
+       list_for_each_entry_safe(descdone, tmp, &pl330->req_done, rqd) {
+               list_del(&descdone->rqd);
                spin_unlock_irqrestore(&pl330->lock, flags);
-               _callback(rqdone, PL330_ERR_NONE);
+               dma_pl330_rqcb(descdone, PL330_ERR_NONE);
                spin_lock_irqsave(&pl330->lock, flags);
        }
 
@@ -1778,65 +1619,13 @@ updt_exit:
        return ret;
 }
 
-static int pl330_chan_ctrl(void *ch_id, enum pl330_chan_op op)
-{
-       struct pl330_thread *thrd = ch_id;
-       struct pl330_dmac *pl330;
-       unsigned long flags;
-       int ret = 0, active;
-
-       if (!thrd || thrd->free || thrd->dmac->state == DYING)
-               return -EINVAL;
-
-       pl330 = thrd->dmac;
-       active = thrd->req_running;
-
-       spin_lock_irqsave(&pl330->lock, flags);
-
-       switch (op) {
-       case PL330_OP_FLUSH:
-               /* Make sure the channel is stopped */
-               _stop(thrd);
-
-               thrd->req[0].r = NULL;
-               thrd->req[1].r = NULL;
-               mark_free(thrd, 0);
-               mark_free(thrd, 1);
-               break;
-
-       case PL330_OP_ABORT:
-               /* Make sure the channel is stopped */
-               _stop(thrd);
-
-               /* ABORT is only for the active req */
-               if (active == -1)
-                       break;
-
-               thrd->req[active].r = NULL;
-               mark_free(thrd, active);
-
-               /* Start the next */
-       case PL330_OP_START:
-               if ((active == -1) && !_start(thrd))
-                       ret = -EIO;
-               break;
-
-       default:
-               ret = -EINVAL;
-       }
-
-       spin_unlock_irqrestore(&pl330->lock, flags);
-       return ret;
-}
-
 /* Reserve an event */
 static inline int _alloc_event(struct pl330_thread *thrd)
 {
        struct pl330_dmac *pl330 = thrd->dmac;
-       struct pl330_info *pi = pl330->pinfo;
        int ev;
 
-       for (ev = 0; ev < pi->pcfg.num_events; ev++)
+       for (ev = 0; ev < pl330->pcfg.num_events; ev++)
                if (pl330->events[ev] == -1) {
                        pl330->events[ev] = thrd->id;
                        return ev;
@@ -1845,45 +1634,38 @@ static inline int _alloc_event(struct pl330_thread *thrd)
        return -1;
 }
 
-static bool _chan_ns(const struct pl330_info *pi, int i)
+static bool _chan_ns(const struct pl330_dmac *pl330, int i)
 {
-       return pi->pcfg.irq_ns & (1 << i);
+       return pl330->pcfg.irq_ns & (1 << i);
 }
 
 /* Upon success, returns a pointer to the allocated
  * channel thread, NULL otherwise.
  */
-static void *pl330_request_channel(const struct pl330_info *pi)
+static struct pl330_thread *pl330_request_channel(struct pl330_dmac *pl330)
 {
        struct pl330_thread *thrd = NULL;
-       struct pl330_dmac *pl330;
        unsigned long flags;
        int chans, i;
 
-       if (!pi || !pi->pl330_data)
-               return NULL;
-
-       pl330 = pi->pl330_data;
-
        if (pl330->state == DYING)
                return NULL;
 
-       chans = pi->pcfg.num_chan;
+       chans = pl330->pcfg.num_chan;
 
        spin_lock_irqsave(&pl330->lock, flags);
 
        for (i = 0; i < chans; i++) {
                thrd = &pl330->channels[i];
                if ((thrd->free) && (!_manager_ns(thrd) ||
-                                       _chan_ns(pi, i))) {
+                                       _chan_ns(pl330, i))) {
                        thrd->ev = _alloc_event(thrd);
                        if (thrd->ev >= 0) {
                                thrd->free = false;
                                thrd->lstenq = 1;
-                               thrd->req[0].r = NULL;
-                               mark_free(thrd, 0);
-                               thrd->req[1].r = NULL;
-                               mark_free(thrd, 1);
+                               thrd->req[0].desc = NULL;
+                               thrd->req[1].desc = NULL;
+                               thrd->req_running = -1;
                                break;
                        }
                }
@@ -1899,17 +1681,15 @@ static void *pl330_request_channel(const struct pl330_info *pi)
 static inline void _free_event(struct pl330_thread *thrd, int ev)
 {
        struct pl330_dmac *pl330 = thrd->dmac;
-       struct pl330_info *pi = pl330->pinfo;
 
        /* If the event is valid and was held by the thread */
-       if (ev >= 0 && ev < pi->pcfg.num_events
+       if (ev >= 0 && ev < pl330->pcfg.num_events
                        && pl330->events[ev] == thrd->id)
                pl330->events[ev] = -1;
 }
 
-static void pl330_release_channel(void *ch_id)
+static void pl330_release_channel(struct pl330_thread *thrd)
 {
-       struct pl330_thread *thrd = ch_id;
        struct pl330_dmac *pl330;
        unsigned long flags;
 
@@ -1918,8 +1698,8 @@ static void pl330_release_channel(void *ch_id)
 
        _stop(thrd);
 
-       _callback(thrd->req[1 - thrd->lstenq].r, PL330_ERR_ABORT);
-       _callback(thrd->req[thrd->lstenq].r, PL330_ERR_ABORT);
+       dma_pl330_rqcb(thrd->req[1 - thrd->lstenq].desc, PL330_ERR_ABORT);
+       dma_pl330_rqcb(thrd->req[thrd->lstenq].desc, PL330_ERR_ABORT);
 
        pl330 = thrd->dmac;
 
@@ -1932,72 +1712,70 @@ static void pl330_release_channel(void *ch_id)
 /* Initialize the structure for PL330 configuration, that can be used
  * by the client driver to make best use of the DMAC
  */
-static void read_dmac_config(struct pl330_info *pi)
+static void read_dmac_config(struct pl330_dmac *pl330)
 {
-       void __iomem *regs = pi->base;
+       void __iomem *regs = pl330->base;
        u32 val;
 
        val = readl(regs + CRD) >> CRD_DATA_WIDTH_SHIFT;
        val &= CRD_DATA_WIDTH_MASK;
-       pi->pcfg.data_bus_width = 8 * (1 << val);
+       pl330->pcfg.data_bus_width = 8 * (1 << val);
 
        val = readl(regs + CRD) >> CRD_DATA_BUFF_SHIFT;
        val &= CRD_DATA_BUFF_MASK;
-       pi->pcfg.data_buf_dep = val + 1;
+       pl330->pcfg.data_buf_dep = val + 1;
 
        val = readl(regs + CR0) >> CR0_NUM_CHANS_SHIFT;
        val &= CR0_NUM_CHANS_MASK;
        val += 1;
-       pi->pcfg.num_chan = val;
+       pl330->pcfg.num_chan = val;
 
        val = readl(regs + CR0);
        if (val & CR0_PERIPH_REQ_SET) {
                val = (val >> CR0_NUM_PERIPH_SHIFT) & CR0_NUM_PERIPH_MASK;
                val += 1;
-               pi->pcfg.num_peri = val;
-               pi->pcfg.peri_ns = readl(regs + CR4);
+               pl330->pcfg.num_peri = val;
+               pl330->pcfg.peri_ns = readl(regs + CR4);
        } else {
-               pi->pcfg.num_peri = 0;
+               pl330->pcfg.num_peri = 0;
        }
 
        val = readl(regs + CR0);
        if (val & CR0_BOOT_MAN_NS)
-               pi->pcfg.mode |= DMAC_MODE_NS;
+               pl330->pcfg.mode |= DMAC_MODE_NS;
        else
-               pi->pcfg.mode &= ~DMAC_MODE_NS;
+               pl330->pcfg.mode &= ~DMAC_MODE_NS;
 
        val = readl(regs + CR0) >> CR0_NUM_EVENTS_SHIFT;
        val &= CR0_NUM_EVENTS_MASK;
        val += 1;
-       pi->pcfg.num_events = val;
+       pl330->pcfg.num_events = val;
 
-       pi->pcfg.irq_ns = readl(regs + CR3);
+       pl330->pcfg.irq_ns = readl(regs + CR3);
 }
 
 static inline void _reset_thread(struct pl330_thread *thrd)
 {
        struct pl330_dmac *pl330 = thrd->dmac;
-       struct pl330_info *pi = pl330->pinfo;
 
        thrd->req[0].mc_cpu = pl330->mcode_cpu
-                               + (thrd->id * pi->mcbufsz);
+                               + (thrd->id * pl330->mcbufsz);
        thrd->req[0].mc_bus = pl330->mcode_bus
-                               + (thrd->id * pi->mcbufsz);
-       thrd->req[0].r = NULL;
-       mark_free(thrd, 0);
+                               + (thrd->id * pl330->mcbufsz);
+       thrd->req[0].desc = NULL;
 
        thrd->req[1].mc_cpu = thrd->req[0].mc_cpu
-                               + pi->mcbufsz / 2;
+                               + pl330->mcbufsz / 2;
        thrd->req[1].mc_bus = thrd->req[0].mc_bus
-                               + pi->mcbufsz / 2;
-       thrd->req[1].r = NULL;
-       mark_free(thrd, 1);
+                               + pl330->mcbufsz / 2;
+       thrd->req[1].desc = NULL;
+
+       thrd->req_running = -1;
 }
 
 static int dmac_alloc_threads(struct pl330_dmac *pl330)
 {
-       struct pl330_info *pi = pl330->pinfo;
-       int chans = pi->pcfg.num_chan;
+       int chans = pl330->pcfg.num_chan;
        struct pl330_thread *thrd;
        int i;
 
@@ -2028,29 +1806,28 @@ static int dmac_alloc_threads(struct pl330_dmac *pl330)
 
 static int dmac_alloc_resources(struct pl330_dmac *pl330)
 {
-       struct pl330_info *pi = pl330->pinfo;
-       int chans = pi->pcfg.num_chan;
+       int chans = pl330->pcfg.num_chan;
        int ret;
 
        /*
         * Alloc MicroCode buffer for 'chans' Channel threads.
         * A channel's buffer offset is (Channel_Id * MCODE_BUFF_PERCHAN)
         */
-       pl330->mcode_cpu = dma_alloc_coherent(pi->dev,
-                               chans * pi->mcbufsz,
+       pl330->mcode_cpu = dma_alloc_coherent(pl330->ddma.dev,
+                               chans * pl330->mcbufsz,
                                &pl330->mcode_bus, GFP_KERNEL);
        if (!pl330->mcode_cpu) {
-               dev_err(pi->dev, "%s:%d Can't allocate memory!\n",
+               dev_err(pl330->ddma.dev, "%s:%d Can't allocate memory!\n",
                        __func__, __LINE__);
                return -ENOMEM;
        }
 
        ret = dmac_alloc_threads(pl330);
        if (ret) {
-               dev_err(pi->dev, "%s:%d Can't to create channels for DMAC!\n",
+               dev_err(pl330->ddma.dev, "%s:%d Can't to create channels for DMAC!\n",
                        __func__, __LINE__);
-               dma_free_coherent(pi->dev,
-                               chans * pi->mcbufsz,
+               dma_free_coherent(pl330->ddma.dev,
+                               chans * pl330->mcbufsz,
                                pl330->mcode_cpu, pl330->mcode_bus);
                return ret;
        }
@@ -2058,71 +1835,45 @@ static int dmac_alloc_resources(struct pl330_dmac *pl330)
        return 0;
 }
 
-static int pl330_add(struct pl330_info *pi)
+static int pl330_add(struct pl330_dmac *pl330)
 {
-       struct pl330_dmac *pl330;
        void __iomem *regs;
        int i, ret;
 
-       if (!pi || !pi->dev)
-               return -EINVAL;
-
-       /* If already added */
-       if (pi->pl330_data)
-               return -EINVAL;
-
-       /*
-        * If the SoC can perform reset on the DMAC, then do it
-        * before reading its configuration.
-        */
-       if (pi->dmac_reset)
-               pi->dmac_reset(pi);
-
-       regs = pi->base;
+       regs = pl330->base;
 
        /* Check if we can handle this DMAC */
-       if ((pi->pcfg.periph_id & 0xfffff) != PERIPH_ID_VAL) {
-               dev_err(pi->dev, "PERIPH_ID 0x%x !\n", pi->pcfg.periph_id);
+       if ((pl330->pcfg.periph_id & 0xfffff) != PERIPH_ID_VAL) {
+               dev_err(pl330->ddma.dev, "PERIPH_ID 0x%x !\n",
+                       pl330->pcfg.periph_id);
                return -EINVAL;
        }
 
        /* Read the configuration of the DMAC */
-       read_dmac_config(pi);
+       read_dmac_config(pl330);
 
-       if (pi->pcfg.num_events == 0) {
-               dev_err(pi->dev, "%s:%d Can't work without events!\n",
+       if (pl330->pcfg.num_events == 0) {
+               dev_err(pl330->ddma.dev, "%s:%d Can't work without events!\n",
                        __func__, __LINE__);
                return -EINVAL;
        }
 
-       pl330 = kzalloc(sizeof(*pl330), GFP_KERNEL);
-       if (!pl330) {
-               dev_err(pi->dev, "%s:%d Can't allocate memory!\n",
-                       __func__, __LINE__);
-               return -ENOMEM;
-       }
-
-       /* Assign the info structure and private data */
-       pl330->pinfo = pi;
-       pi->pl330_data = pl330;
-
        spin_lock_init(&pl330->lock);
 
        INIT_LIST_HEAD(&pl330->req_done);
 
        /* Use default MC buffer size if not provided */
-       if (!pi->mcbufsz)
-               pi->mcbufsz = MCODE_BUFF_PER_REQ * 2;
+       if (!pl330->mcbufsz)
+               pl330->mcbufsz = MCODE_BUFF_PER_REQ * 2;
 
        /* Mark all events as free */
-       for (i = 0; i < pi->pcfg.num_events; i++)
+       for (i = 0; i < pl330->pcfg.num_events; i++)
                pl330->events[i] = -1;
 
        /* Allocate resources needed by the DMAC */
        ret = dmac_alloc_resources(pl330);
        if (ret) {
-               dev_err(pi->dev, "Unable to create channels for DMAC\n");
-               kfree(pl330);
+               dev_err(pl330->ddma.dev, "Unable to create channels for DMAC\n");
                return ret;
        }
 
@@ -2135,15 +1886,13 @@ static int pl330_add(struct pl330_info *pi)
 
 static int dmac_free_threads(struct pl330_dmac *pl330)
 {
-       struct pl330_info *pi = pl330->pinfo;
-       int chans = pi->pcfg.num_chan;
        struct pl330_thread *thrd;
        int i;
 
        /* Release Channel threads */
-       for (i = 0; i < chans; i++) {
+       for (i = 0; i < pl330->pcfg.num_chan; i++) {
                thrd = &pl330->channels[i];
-               pl330_release_channel((void *)thrd);
+               pl330_release_channel(thrd);
        }
 
        /* Free memory */
@@ -2152,35 +1901,18 @@ static int dmac_free_threads(struct pl330_dmac *pl330)
        return 0;
 }
 
-static void dmac_free_resources(struct pl330_dmac *pl330)
+static void pl330_del(struct pl330_dmac *pl330)
 {
-       struct pl330_info *pi = pl330->pinfo;
-       int chans = pi->pcfg.num_chan;
-
-       dmac_free_threads(pl330);
-
-       dma_free_coherent(pi->dev, chans * pi->mcbufsz,
-                               pl330->mcode_cpu, pl330->mcode_bus);
-}
-
-static void pl330_del(struct pl330_info *pi)
-{
-       struct pl330_dmac *pl330;
-
-       if (!pi || !pi->pl330_data)
-               return;
-
-       pl330 = pi->pl330_data;
-
        pl330->state = UNINIT;
 
        tasklet_kill(&pl330->tasks);
 
        /* Free DMAC resources */
-       dmac_free_resources(pl330);
+       dmac_free_threads(pl330);
 
-       kfree(pl330);
-       pi->pl330_data = NULL;
+       dma_free_coherent(pl330->ddma.dev,
+               pl330->pcfg.num_chan * pl330->mcbufsz, pl330->mcode_cpu,
+               pl330->mcode_bus);
 }
 
 /* forward declaration */
@@ -2212,8 +1944,7 @@ static inline void fill_queue(struct dma_pl330_chan *pch)
                if (desc->status == BUSY)
                        continue;
 
-               ret = pl330_submit_req(pch->pl330_chid,
-                                               &desc->req);
+               ret = pl330_submit_req(pch->thread, desc);
                if (!ret) {
                        desc->status = BUSY;
                } else if (ret == -EAGAIN) {
@@ -2222,7 +1953,7 @@ static inline void fill_queue(struct dma_pl330_chan *pch)
                } else {
                        /* Unacceptable request */
                        desc->status = DONE;
-                       dev_err(pch->dmac->pif.dev, "%s:%d Bad Desc(%d)\n",
+                       dev_err(pch->dmac->ddma.dev, "%s:%d Bad Desc(%d)\n",
                                        __func__, __LINE__, desc->txd.cookie);
                        tasklet_schedule(&pch->task);
                }
@@ -2249,7 +1980,9 @@ static void pl330_tasklet(unsigned long data)
        fill_queue(pch);
 
        /* Make sure the PL330 Channel thread is active */
-       pl330_chan_ctrl(pch->pl330_chid, PL330_OP_START);
+       spin_lock(&pch->thread->dmac->lock);
+       _start(pch->thread);
+       spin_unlock(&pch->thread->dmac->lock);
 
        while (!list_empty(&pch->completed_list)) {
                dma_async_tx_callback callback;
@@ -2280,25 +2013,6 @@ static void pl330_tasklet(unsigned long data)
        spin_unlock_irqrestore(&pch->lock, flags);
 }
 
-static void dma_pl330_rqcb(void *token, enum pl330_op_err err)
-{
-       struct dma_pl330_desc *desc = token;
-       struct dma_pl330_chan *pch = desc->pchan;
-       unsigned long flags;
-
-       /* If desc aborted */
-       if (!pch)
-               return;
-
-       spin_lock_irqsave(&pch->lock, flags);
-
-       desc->status = DONE;
-
-       spin_unlock_irqrestore(&pch->lock, flags);
-
-       tasklet_schedule(&pch->task);
-}
-
 bool pl330_filter(struct dma_chan *chan, void *param)
 {
        u8 *peri_id;
@@ -2315,23 +2029,26 @@ static struct dma_chan *of_dma_pl330_xlate(struct of_phandle_args *dma_spec,
                                                struct of_dma *ofdma)
 {
        int count = dma_spec->args_count;
-       struct dma_pl330_dmac *pdmac = ofdma->of_dma_data;
+       struct pl330_dmac *pl330 = ofdma->of_dma_data;
        unsigned int chan_id;
 
+       if (!pl330)
+               return NULL;
+
        if (count != 1)
                return NULL;
 
        chan_id = dma_spec->args[0];
-       if (chan_id >= pdmac->num_peripherals)
+       if (chan_id >= pl330->num_peripherals)
                return NULL;
 
-       return dma_get_slave_channel(&pdmac->peripherals[chan_id].chan);
+       return dma_get_slave_channel(&pl330->peripherals[chan_id].chan);
 }
 
 static int pl330_alloc_chan_resources(struct dma_chan *chan)
 {
        struct dma_pl330_chan *pch = to_pchan(chan);
-       struct dma_pl330_dmac *pdmac = pch->dmac;
+       struct pl330_dmac *pl330 = pch->dmac;
        unsigned long flags;
 
        spin_lock_irqsave(&pch->lock, flags);
@@ -2339,8 +2056,8 @@ static int pl330_alloc_chan_resources(struct dma_chan *chan)
        dma_cookie_init(chan);
        pch->cyclic = false;
 
-       pch->pl330_chid = pl330_request_channel(&pdmac->pif);
-       if (!pch->pl330_chid) {
+       pch->thread = pl330_request_channel(pl330);
+       if (!pch->thread) {
                spin_unlock_irqrestore(&pch->lock, flags);
                return -ENOMEM;
        }
@@ -2357,7 +2074,7 @@ static int pl330_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, unsigned
        struct dma_pl330_chan *pch = to_pchan(chan);
        struct dma_pl330_desc *desc;
        unsigned long flags;
-       struct dma_pl330_dmac *pdmac = pch->dmac;
+       struct pl330_dmac *pl330 = pch->dmac;
        struct dma_slave_config *slave_config;
        LIST_HEAD(list);
 
@@ -2365,8 +2082,13 @@ static int pl330_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, unsigned
        case DMA_TERMINATE_ALL:
                spin_lock_irqsave(&pch->lock, flags);
 
-               /* FLUSH the PL330 Channel thread */
-               pl330_chan_ctrl(pch->pl330_chid, PL330_OP_FLUSH);
+               spin_lock(&pl330->lock);
+               _stop(pch->thread);
+               spin_unlock(&pl330->lock);
+
+               pch->thread->req[0].desc = NULL;
+               pch->thread->req[1].desc = NULL;
+               pch->thread->req_running = -1;
 
                /* Mark all desc done */
                list_for_each_entry(desc, &pch->submitted_list, node) {
@@ -2384,9 +2106,9 @@ static int pl330_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, unsigned
                        dma_cookie_complete(&desc->txd);
                }
 
-               list_splice_tail_init(&pch->submitted_list, &pdmac->desc_pool);
-               list_splice_tail_init(&pch->work_list, &pdmac->desc_pool);
-               list_splice_tail_init(&pch->completed_list, &pdmac->desc_pool);
+               list_splice_tail_init(&pch->submitted_list, &pl330->desc_pool);
+               list_splice_tail_init(&pch->work_list, &pl330->desc_pool);
+               list_splice_tail_init(&pch->completed_list, &pl330->desc_pool);
                spin_unlock_irqrestore(&pch->lock, flags);
                break;
        case DMA_SLAVE_CONFIG:
@@ -2409,7 +2131,7 @@ static int pl330_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, unsigned
                }
                break;
        default:
-               dev_err(pch->dmac->pif.dev, "Not supported command.\n");
+               dev_err(pch->dmac->ddma.dev, "Unsupported command.\n");
                return -ENXIO;
        }
 
@@ -2425,8 +2147,8 @@ static void pl330_free_chan_resources(struct dma_chan *chan)
 
        spin_lock_irqsave(&pch->lock, flags);
 
-       pl330_release_channel(pch->pl330_chid);
-       pch->pl330_chid = NULL;
+       pl330_release_channel(pch->thread);
+       pch->thread = NULL;
 
        if (pch->cyclic)
                list_splice_tail_init(&pch->work_list, &pch->dmac->desc_pool);
@@ -2489,57 +2211,46 @@ static dma_cookie_t pl330_tx_submit(struct dma_async_tx_descriptor *tx)
 
 static inline void _init_desc(struct dma_pl330_desc *desc)
 {
-       desc->req.x = &desc->px;
-       desc->req.token = desc;
        desc->rqcfg.swap = SWAP_NO;
-       desc->rqcfg.scctl = SCCTRL0;
-       desc->rqcfg.dcctl = DCCTRL0;
-       desc->req.cfg = &desc->rqcfg;
-       desc->req.xfer_cb = dma_pl330_rqcb;
+       desc->rqcfg.scctl = CCTRL0;
+       desc->rqcfg.dcctl = CCTRL0;
        desc->txd.tx_submit = pl330_tx_submit;
 
        INIT_LIST_HEAD(&desc->node);
 }
 
 /* Returns the number of descriptors added to the DMAC pool */
-static int add_desc(struct dma_pl330_dmac *pdmac, gfp_t flg, int count)
+static int add_desc(struct pl330_dmac *pl330, gfp_t flg, int count)
 {
        struct dma_pl330_desc *desc;
        unsigned long flags;
        int i;
 
-       if (!pdmac)
-               return 0;
-
        desc = kcalloc(count, sizeof(*desc), flg);
        if (!desc)
                return 0;
 
-       spin_lock_irqsave(&pdmac->pool_lock, flags);
+       spin_lock_irqsave(&pl330->pool_lock, flags);
 
        for (i = 0; i < count; i++) {
                _init_desc(&desc[i]);
-               list_add_tail(&desc[i].node, &pdmac->desc_pool);
+               list_add_tail(&desc[i].node, &pl330->desc_pool);
        }
 
-       spin_unlock_irqrestore(&pdmac->pool_lock, flags);
+       spin_unlock_irqrestore(&pl330->pool_lock, flags);
 
        return count;
 }
 
-static struct dma_pl330_desc *
-pluck_desc(struct dma_pl330_dmac *pdmac)
+static struct dma_pl330_desc *pluck_desc(struct pl330_dmac *pl330)
 {
        struct dma_pl330_desc *desc = NULL;
        unsigned long flags;
 
-       if (!pdmac)
-               return NULL;
-
-       spin_lock_irqsave(&pdmac->pool_lock, flags);
+       spin_lock_irqsave(&pl330->pool_lock, flags);
 
-       if (!list_empty(&pdmac->desc_pool)) {
-               desc = list_entry(pdmac->desc_pool.next,
+       if (!list_empty(&pl330->desc_pool)) {
+               desc = list_entry(pl330->desc_pool.next,
                                struct dma_pl330_desc, node);
 
                list_del_init(&desc->node);
@@ -2548,29 +2259,29 @@ pluck_desc(struct dma_pl330_dmac *pdmac)
                desc->txd.callback = NULL;
        }
 
-       spin_unlock_irqrestore(&pdmac->pool_lock, flags);
+       spin_unlock_irqrestore(&pl330->pool_lock, flags);
 
        return desc;
 }
 
 static struct dma_pl330_desc *pl330_get_desc(struct dma_pl330_chan *pch)
 {
-       struct dma_pl330_dmac *pdmac = pch->dmac;
+       struct pl330_dmac *pl330 = pch->dmac;
        u8 *peri_id = pch->chan.private;
        struct dma_pl330_desc *desc;
 
        /* Pluck one desc from the pool of DMAC */
-       desc = pluck_desc(pdmac);
+       desc = pluck_desc(pl330);
 
        /* If the DMAC pool is empty, alloc new */
        if (!desc) {
-               if (!add_desc(pdmac, GFP_ATOMIC, 1))
+               if (!add_desc(pl330, GFP_ATOMIC, 1))
                        return NULL;
 
                /* Try again */
-               desc = pluck_desc(pdmac);
+               desc = pluck_desc(pl330);
                if (!desc) {
-                       dev_err(pch->dmac->pif.dev,
+                       dev_err(pch->dmac->ddma.dev,
                                "%s:%d ALERT!\n", __func__, __LINE__);
                        return NULL;
                }
@@ -2581,8 +2292,8 @@ static struct dma_pl330_desc *pl330_get_desc(struct dma_pl330_chan *pch)
        desc->txd.cookie = 0;
        async_tx_ack(&desc->txd);
 
-       desc->req.peri = peri_id ? pch->chan.chan_id : 0;
-       desc->rqcfg.pcfg = &pch->dmac->pif.pcfg;
+       desc->peri = peri_id ? pch->chan.chan_id : 0;
+       desc->rqcfg.pcfg = &pch->dmac->pcfg;
 
        dma_async_tx_descriptor_init(&desc->txd, &pch->chan);
 
@@ -2592,7 +2303,6 @@ static struct dma_pl330_desc *pl330_get_desc(struct dma_pl330_chan *pch)
 static inline void fill_px(struct pl330_xfer *px,
                dma_addr_t dst, dma_addr_t src, size_t len)
 {
-       px->next = NULL;
        px->bytes = len;
        px->dst_addr = dst;
        px->src_addr = src;
@@ -2605,7 +2315,7 @@ __pl330_prep_dma_memcpy(struct dma_pl330_chan *pch, dma_addr_t dst,
        struct dma_pl330_desc *desc = pl330_get_desc(pch);
 
        if (!desc) {
-               dev_err(pch->dmac->pif.dev, "%s:%d Unable to fetch desc\n",
+               dev_err(pch->dmac->ddma.dev, "%s:%d Unable to fetch desc\n",
                        __func__, __LINE__);
                return NULL;
        }
@@ -2629,11 +2339,11 @@ __pl330_prep_dma_memcpy(struct dma_pl330_chan *pch, dma_addr_t dst,
 static inline int get_burst_len(struct dma_pl330_desc *desc, size_t len)
 {
        struct dma_pl330_chan *pch = desc->pchan;
-       struct pl330_info *pi = &pch->dmac->pif;
+       struct pl330_dmac *pl330 = pch->dmac;
        int burst_len;
 
-       burst_len = pi->pcfg.data_bus_width / 8;
-       burst_len *= pi->pcfg.data_buf_dep;
+       burst_len = pl330->pcfg.data_bus_width / 8;
+       burst_len *= pl330->pcfg.data_buf_dep;
        burst_len >>= desc->rqcfg.brst_size;
 
        /* src/dst_burst_len can't be more than 16 */
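A worked example of this computation with hypothetical hardware numbers: a
64-bit data bus (data_bus_width = 64, i.e. 8 bytes) and a 16-entry data
buffer (data_buf_dep = 16) give 8 * 16 = 128; with 4-byte bursts
(brst_size = 2) that is 128 >> 2 = 32, which the cap in the comment just
above then reduces to the PL330 maximum of 16.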
@@ -2652,11 +2362,11 @@ static inline int get_burst_len(struct dma_pl330_desc *desc, size_t len)
 static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic(
                struct dma_chan *chan, dma_addr_t dma_addr, size_t len,
                size_t period_len, enum dma_transfer_direction direction,
-               unsigned long flags, void *context)
+               unsigned long flags)
 {
        struct dma_pl330_desc *desc = NULL, *first = NULL;
        struct dma_pl330_chan *pch = to_pchan(chan);
-       struct dma_pl330_dmac *pdmac = pch->dmac;
+       struct pl330_dmac *pl330 = pch->dmac;
        unsigned int i;
        dma_addr_t dst;
        dma_addr_t src;
@@ -2665,7 +2375,7 @@ static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic(
                return NULL;
 
        if (!is_slave_direction(direction)) {
-               dev_err(pch->dmac->pif.dev, "%s:%d Invalid dma direction\n",
+               dev_err(pch->dmac->ddma.dev, "%s:%d Invalid dma direction\n",
                __func__, __LINE__);
                return NULL;
        }
@@ -2673,23 +2383,23 @@ static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic(
        for (i = 0; i < len / period_len; i++) {
                desc = pl330_get_desc(pch);
                if (!desc) {
-                       dev_err(pch->dmac->pif.dev, "%s:%d Unable to fetch desc\n",
+                       dev_err(pch->dmac->ddma.dev, "%s:%d Unable to fetch desc\n",
                                __func__, __LINE__);
 
                        if (!first)
                                return NULL;
 
-                       spin_lock_irqsave(&pdmac->pool_lock, flags);
+                       spin_lock_irqsave(&pl330->pool_lock, flags);
 
                        while (!list_empty(&first->node)) {
                                desc = list_entry(first->node.next,
                                                struct dma_pl330_desc, node);
-                               list_move_tail(&desc->node, &pdmac->desc_pool);
+                               list_move_tail(&desc->node, &pl330->desc_pool);
                        }
 
-                       list_move_tail(&first->node, &pdmac->desc_pool);
+                       list_move_tail(&first->node, &pl330->desc_pool);
 
-                       spin_unlock_irqrestore(&pdmac->pool_lock, flags);
+                       spin_unlock_irqrestore(&pl330->pool_lock, flags);
 
                        return NULL;
                }
@@ -2698,14 +2408,12 @@ static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic(
                case DMA_MEM_TO_DEV:
                        desc->rqcfg.src_inc = 1;
                        desc->rqcfg.dst_inc = 0;
-                       desc->req.rqtype = MEMTODEV;
                        src = dma_addr;
                        dst = pch->fifo_addr;
                        break;
                case DMA_DEV_TO_MEM:
                        desc->rqcfg.src_inc = 0;
                        desc->rqcfg.dst_inc = 1;
-                       desc->req.rqtype = DEVTOMEM;
                        src = pch->fifo_addr;
                        dst = dma_addr;
                        break;
@@ -2713,6 +2421,7 @@ static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic(
                        break;
                }
 
+               desc->rqtype = direction;
                desc->rqcfg.brst_size = pch->burst_sz;
                desc->rqcfg.brst_len = 1;
                fill_px(&desc->px, dst, src, period_len);
@@ -2740,24 +2449,22 @@ pl330_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dst,
 {
        struct dma_pl330_desc *desc;
        struct dma_pl330_chan *pch = to_pchan(chan);
-       struct pl330_info *pi;
+       struct pl330_dmac *pl330 = pch->dmac;
        int burst;
 
        if (unlikely(!pch || !len))
                return NULL;
 
-       pi = &pch->dmac->pif;
-
        desc = __pl330_prep_dma_memcpy(pch, dst, src, len);
        if (!desc)
                return NULL;
 
        desc->rqcfg.src_inc = 1;
        desc->rqcfg.dst_inc = 1;
-       desc->req.rqtype = MEMTOMEM;
+       desc->rqtype = DMA_MEM_TO_MEM;
 
        /* Select max possible burst size */
-       burst = pi->pcfg.data_bus_width / 8;
+       burst = pl330->pcfg.data_bus_width / 8;
 
        while (burst > 1) {
                if (!(len % burst))
@@ -2776,7 +2483,7 @@ pl330_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dst,
        return &desc->txd;
 }
 
-static void __pl330_giveback_desc(struct dma_pl330_dmac *pdmac,
+static void __pl330_giveback_desc(struct pl330_dmac *pl330,
                                  struct dma_pl330_desc *first)
 {
        unsigned long flags;
@@ -2785,17 +2492,17 @@ static void __pl330_giveback_desc(struct dma_pl330_dmac *pdmac,
        if (!first)
                return;
 
-       spin_lock_irqsave(&pdmac->pool_lock, flags);
+       spin_lock_irqsave(&pl330->pool_lock, flags);
 
        while (!list_empty(&first->node)) {
                desc = list_entry(first->node.next,
                                struct dma_pl330_desc, node);
-               list_move_tail(&desc->node, &pdmac->desc_pool);
+               list_move_tail(&desc->node, &pl330->desc_pool);
        }
 
-       list_move_tail(&first->node, &pdmac->desc_pool);
+       list_move_tail(&first->node, &pl330->desc_pool);
 
-       spin_unlock_irqrestore(&pdmac->pool_lock, flags);
+       spin_unlock_irqrestore(&pl330->pool_lock, flags);
 }
 
 static struct dma_async_tx_descriptor *
@@ -2820,12 +2527,12 @@ pl330_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 
                desc = pl330_get_desc(pch);
                if (!desc) {
-                       struct dma_pl330_dmac *pdmac = pch->dmac;
+                       struct pl330_dmac *pl330 = pch->dmac;
 
-                       dev_err(pch->dmac->pif.dev,
+                       dev_err(pch->dmac->ddma.dev,
                                "%s:%d Unable to fetch desc\n",
                                __func__, __LINE__);
-                       __pl330_giveback_desc(pdmac, first);
+                       __pl330_giveback_desc(pl330, first);
 
                        return NULL;
                }
@@ -2838,19 +2545,18 @@ pl330_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                if (direction == DMA_MEM_TO_DEV) {
                        desc->rqcfg.src_inc = 1;
                        desc->rqcfg.dst_inc = 0;
-                       desc->req.rqtype = MEMTODEV;
                        fill_px(&desc->px,
                                addr, sg_dma_address(sg), sg_dma_len(sg));
                } else {
                        desc->rqcfg.src_inc = 0;
                        desc->rqcfg.dst_inc = 1;
-                       desc->req.rqtype = DEVTOMEM;
                        fill_px(&desc->px,
                                sg_dma_address(sg), addr, sg_dma_len(sg));
                }
 
                desc->rqcfg.brst_size = pch->burst_sz;
                desc->rqcfg.brst_len = 1;
+               desc->rqtype = direction;
        }
 
        /* Return the last desc in the chain */
@@ -2890,9 +2596,9 @@ static int
 pl330_probe(struct amba_device *adev, const struct amba_id *id)
 {
        struct dma_pl330_platdata *pdat;
-       struct dma_pl330_dmac *pdmac;
+       struct pl330_config *pcfg;
+       struct pl330_dmac *pl330;
        struct dma_pl330_chan *pch, *_p;
-       struct pl330_info *pi;
        struct dma_device *pd;
        struct resource *res;
        int i, ret, irq;
@@ -2905,30 +2611,27 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
                return ret;
 
        /* Allocate a new DMAC and its Channels */
-       pdmac = devm_kzalloc(&adev->dev, sizeof(*pdmac), GFP_KERNEL);
-       if (!pdmac) {
+       pl330 = devm_kzalloc(&adev->dev, sizeof(*pl330), GFP_KERNEL);
+       if (!pl330) {
                dev_err(&adev->dev, "unable to allocate mem\n");
                return -ENOMEM;
        }
 
-       pi = &pdmac->pif;
-       pi->dev = &adev->dev;
-       pi->pl330_data = NULL;
-       pi->mcbufsz = pdat ? pdat->mcbuf_sz : 0;
+       pl330->mcbufsz = pdat ? pdat->mcbuf_sz : 0;
 
        res = &adev->res;
-       pi->base = devm_ioremap_resource(&adev->dev, res);
-       if (IS_ERR(pi->base))
-               return PTR_ERR(pi->base);
+       pl330->base = devm_ioremap_resource(&adev->dev, res);
+       if (IS_ERR(pl330->base))
+               return PTR_ERR(pl330->base);
 
-       amba_set_drvdata(adev, pdmac);
+       amba_set_drvdata(adev, pl330);
 
        for (i = 0; i < AMBA_NR_IRQS; i++) {
                irq = adev->irq[i];
                if (irq) {
                        ret = devm_request_irq(&adev->dev, irq,
                                               pl330_irq_handler, 0,
-                                              dev_name(&adev->dev), pi);
+                                              dev_name(&adev->dev), pl330);
                        if (ret)
                                return ret;
                } else {
@@ -2936,38 +2639,40 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
                }
        }
 
-       pi->pcfg.periph_id = adev->periphid;
-       ret = pl330_add(pi);
+       pcfg = &pl330->pcfg;
+
+       pcfg->periph_id = adev->periphid;
+       ret = pl330_add(pl330);
        if (ret)
                return ret;
 
-       INIT_LIST_HEAD(&pdmac->desc_pool);
-       spin_lock_init(&pdmac->pool_lock);
+       INIT_LIST_HEAD(&pl330->desc_pool);
+       spin_lock_init(&pl330->pool_lock);
 
        /* Create a descriptor pool of default size */
-       if (!add_desc(pdmac, GFP_KERNEL, NR_DEFAULT_DESC))
+       if (!add_desc(pl330, GFP_KERNEL, NR_DEFAULT_DESC))
                dev_warn(&adev->dev, "unable to allocate desc\n");
 
-       pd = &pdmac->ddma;
+       pd = &pl330->ddma;
        INIT_LIST_HEAD(&pd->channels);
 
        /* Initialize channel parameters */
        if (pdat)
-               num_chan = max_t(int, pdat->nr_valid_peri, pi->pcfg.num_chan);
+               num_chan = max_t(int, pdat->nr_valid_peri, pcfg->num_chan);
        else
-               num_chan = max_t(int, pi->pcfg.num_peri, pi->pcfg.num_chan);
+               num_chan = max_t(int, pcfg->num_peri, pcfg->num_chan);
 
-       pdmac->num_peripherals = num_chan;
+       pl330->num_peripherals = num_chan;
 
-       pdmac->peripherals = kzalloc(num_chan * sizeof(*pch), GFP_KERNEL);
-       if (!pdmac->peripherals) {
+       pl330->peripherals = kzalloc(num_chan * sizeof(*pch), GFP_KERNEL);
+       if (!pl330->peripherals) {
                ret = -ENOMEM;
-               dev_err(&adev->dev, "unable to allocate pdmac->peripherals\n");
+               dev_err(&adev->dev, "unable to allocate pl330->peripherals\n");
                goto probe_err2;
        }
 
        for (i = 0; i < num_chan; i++) {
-               pch = &pdmac->peripherals[i];
+               pch = &pl330->peripherals[i];
                if (!adev->dev.of_node)
                        pch->chan.private = pdat ? &pdat->peri_id[i] : NULL;
                else
@@ -2977,9 +2682,9 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
                INIT_LIST_HEAD(&pch->work_list);
                INIT_LIST_HEAD(&pch->completed_list);
                spin_lock_init(&pch->lock);
-               pch->pl330_chid = NULL;
+               pch->thread = NULL;
                pch->chan.device = pd;
-               pch->dmac = pdmac;
+               pch->dmac = pl330;
 
                /* Add the channel to the DMAC list */
                list_add_tail(&pch->chan.device_node, &pd->channels);
@@ -2990,7 +2695,7 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
                pd->cap_mask = pdat->cap_mask;
        } else {
                dma_cap_set(DMA_MEMCPY, pd->cap_mask);
-               if (pi->pcfg.num_peri) {
+               if (pcfg->num_peri) {
                        dma_cap_set(DMA_SLAVE, pd->cap_mask);
                        dma_cap_set(DMA_CYCLIC, pd->cap_mask);
                        dma_cap_set(DMA_PRIVATE, pd->cap_mask);
@@ -3015,14 +2720,14 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
 
        if (adev->dev.of_node) {
                ret = of_dma_controller_register(adev->dev.of_node,
-                                        of_dma_pl330_xlate, pdmac);
+                                        of_dma_pl330_xlate, pl330);
                if (ret) {
                        dev_err(&adev->dev,
                        "unable to register DMA to the generic DT DMA helpers\n");
                }
        }
 
-       adev->dev.dma_parms = &pdmac->dma_parms;
+       adev->dev.dma_parms = &pl330->dma_parms;
 
        /*
         * This is the limit for transfers with a buswidth of 1, larger
@@ -3037,14 +2742,13 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
                "Loaded driver for PL330 DMAC-%d\n", adev->periphid);
        dev_info(&adev->dev,
                "\tDBUFF-%ux%ubytes Num_Chans-%u Num_Peri-%u Num_Events-%u\n",
-               pi->pcfg.data_buf_dep,
-               pi->pcfg.data_bus_width / 8, pi->pcfg.num_chan,
-               pi->pcfg.num_peri, pi->pcfg.num_events);
+               pcfg->data_buf_dep, pcfg->data_bus_width / 8, pcfg->num_chan,
+               pcfg->num_peri, pcfg->num_events);
 
        return 0;
 probe_err3:
        /* Idle the DMAC */
-       list_for_each_entry_safe(pch, _p, &pdmac->ddma.channels,
+       list_for_each_entry_safe(pch, _p, &pl330->ddma.channels,
                        chan.device_node) {
 
                /* Remove the channel */
@@ -3055,27 +2759,23 @@ probe_err3:
                pl330_free_chan_resources(&pch->chan);
        }
 probe_err2:
-       pl330_del(pi);
+       pl330_del(pl330);
 
        return ret;
 }
 
 static int pl330_remove(struct amba_device *adev)
 {
-       struct dma_pl330_dmac *pdmac = amba_get_drvdata(adev);
+       struct pl330_dmac *pl330 = amba_get_drvdata(adev);
        struct dma_pl330_chan *pch, *_p;
-       struct pl330_info *pi;
-
-       if (!pdmac)
-               return 0;
 
        if (adev->dev.of_node)
                of_dma_controller_free(adev->dev.of_node);
 
-       dma_async_device_unregister(&pdmac->ddma);
+       dma_async_device_unregister(&pl330->ddma);
 
        /* Idle the DMAC */
-       list_for_each_entry_safe(pch, _p, &pdmac->ddma.channels,
+       list_for_each_entry_safe(pch, _p, &pl330->ddma.channels,
                        chan.device_node) {
 
                /* Remove the channel */
@@ -3086,9 +2786,7 @@ static int pl330_remove(struct amba_device *adev)
                pl330_free_chan_resources(&pch->chan);
        }
 
-       pi = &pdmac->pif;
-
-       pl330_del(pi);
+       pl330_del(pl330);
 
        return 0;
 }
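
Taken together, the pl330 hunks above collapse the old two-object split (a struct pl330_info embedded in a struct dma_pl330_dmac) into a single struct pl330_dmac, and drop the driver-private MEMTODEV/DEVTOMEM request types in favour of the generic enum dma_transfer_direction stored in desc->rqtype. A rough sketch of the consolidated structure, limited to the fields these hunks actually touch (member order and anything omitted are assumptions):

	struct pl330_dmac {
		/* manager state, formerly struct pl330_info */
		void __iomem			*base;		/* ioremapped registers */
		unsigned int			mcbufsz;	/* microcode buffer size */
		struct pl330_config		pcfg;		/* periph_id, num_chan, ... */
		/* dmaengine state, formerly struct dma_pl330_dmac */
		struct dma_device		ddma;
		struct device_dma_parameters	dma_parms;
		spinlock_t			pool_lock;
		struct list_head		desc_pool;
		struct dma_pl330_chan		*peripherals;
		unsigned int			num_peripherals;
	};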
index 82c923146e49fff03afc9ce87ad0c41e2da7af7b..7a4bbb0f80a5677161211c90fca6e53bea7c2129 100644 (file)
@@ -61,12 +61,17 @@ struct bam_desc_hw {
 #define DESC_FLAG_INT BIT(15)
 #define DESC_FLAG_EOT BIT(14)
 #define DESC_FLAG_EOB BIT(13)
+#define DESC_FLAG_NWD BIT(12)
 
 struct bam_async_desc {
        struct virt_dma_desc vd;
 
        u32 num_desc;
        u32 xfer_len;
+
+       /* transaction flags, EOT|EOB|NWD */
+       u16 flags;
+
        struct bam_desc_hw *curr_desc;
 
        enum dma_transfer_direction dir;
@@ -490,6 +495,14 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
        if (!async_desc)
                goto err_out;
 
+       if (flags & DMA_PREP_FENCE)
+               async_desc->flags |= DESC_FLAG_NWD;
+
+       if (flags & DMA_PREP_INTERRUPT)
+               async_desc->flags |= DESC_FLAG_EOT;
+       else
+               async_desc->flags |= DESC_FLAG_INT;
+
        async_desc->num_desc = num_alloc;
        async_desc->curr_desc = async_desc->desc;
        async_desc->dir = direction;
@@ -793,8 +806,11 @@ static void bam_start_dma(struct bam_chan *bchan)
        else
                async_desc->xfer_len = async_desc->num_desc;
 
-       /* set INT on last descriptor */
-       desc[async_desc->xfer_len - 1].flags |= DESC_FLAG_INT;
+       /* set any special flags on the last descriptor */
+       if (async_desc->num_desc == async_desc->xfer_len)
+               desc[async_desc->xfer_len - 1].flags = async_desc->flags;
+       else
+               desc[async_desc->xfer_len - 1].flags |= DESC_FLAG_INT;
 
        if (bchan->tail + async_desc->xfer_len > MAX_DESCRIPTORS) {
                u32 partial = MAX_DESCRIPTORS - bchan->tail;
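
The bam_dma hunks above map two generic dmaengine prep flags onto BAM descriptor bits: DMA_PREP_FENCE sets NWD and DMA_PREP_INTERRUPT selects EOT on the last hardware descriptor, with INT kept as the fallback. A minimal client-side sketch, assuming chan, sgl and sg_len have been set up elsewhere:

	struct dma_async_tx_descriptor *txd;

	/* request an end-of-transfer interrupt plus a fence on the last descriptor */
	txd = dmaengine_prep_slave_sg(chan, sgl, sg_len, DMA_MEM_TO_DEV,
				      DMA_PREP_INTERRUPT | DMA_PREP_FENCE);
	if (txd) {
		dmaengine_submit(txd);
		dma_async_issue_pending(chan);
	}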
index 012520c9fd79174a44392fa3e2a448db2bccdce7..7416572d1e4081866baf61c05411159163a109a9 100644 (file)
@@ -889,8 +889,7 @@ static struct dma_async_tx_descriptor *s3c24xx_dma_prep_memcpy(
 
 static struct dma_async_tx_descriptor *s3c24xx_dma_prep_dma_cyclic(
        struct dma_chan *chan, dma_addr_t addr, size_t size, size_t period,
-       enum dma_transfer_direction direction, unsigned long flags,
-       void *context)
+       enum dma_transfer_direction direction, unsigned long flags)
 {
        struct s3c24xx_dma_chan *s3cchan = to_s3c24xx_dma_chan(chan);
        struct s3c24xx_dma_engine *s3cdma = s3cchan->host;
index 5ebdfbc1051ea7ed6e4dd948f9cac1e9b04760fc..4b0ef043729a1dc250f53894b58ef2dac31a4f19 100644 (file)
@@ -612,7 +612,7 @@ static struct dma_async_tx_descriptor *sa11x0_dma_prep_slave_sg(
 
 static struct dma_async_tx_descriptor *sa11x0_dma_prep_dma_cyclic(
        struct dma_chan *chan, dma_addr_t addr, size_t size, size_t period,
-       enum dma_transfer_direction dir, unsigned long flags, void *context)
+       enum dma_transfer_direction dir, unsigned long flags)
 {
        struct sa11x0_dma_chan *c = to_sa11x0_dma_chan(chan);
        struct sa11x0_dma_desc *txd;
index 0f719816c91bf9277cdb19a3b6ac67449aa579e2..0349125a2e20af8b4af6a85ca3ed5d9708b662c8 100644 (file)
@@ -2,21 +2,39 @@
 # DMA engine configuration for sh
 #
 
+#
+# DMA Engine Helpers
+#
+
 config SH_DMAE_BASE
        bool "Renesas SuperH DMA Engine support"
-       depends on (SUPERH && SH_DMA) || ARCH_SHMOBILE || COMPILE_TEST
+       depends on SUPERH || ARCH_SHMOBILE || COMPILE_TEST
+       depends on !SUPERH || SH_DMA
        depends on !SH_DMA_API
        default y
        select DMA_ENGINE
        help
          Enable support for the Renesas SuperH DMA controllers.
 
+#
+# DMA Controllers
+#
+
 config SH_DMAE
        tristate "Renesas SuperH DMAC support"
        depends on SH_DMAE_BASE
        help
          Enable support for the Renesas SuperH DMA controllers.
 
+if SH_DMAE
+
+config SH_DMAE_R8A73A4
+       def_bool y
+       depends on ARCH_R8A73A4
+       depends on OF
+
+endif
+
 config SUDMAC
        tristate "Renesas SUDMAC support"
        depends on SH_DMAE_BASE
@@ -34,7 +52,3 @@ config RCAR_AUDMAC_PP
        depends on SH_DMAE_BASE
        help
          Enable support for the Renesas R-Car Audio DMAC Peripheral Peripheral controllers.
-
-config SHDMA_R8A73A4
-       def_bool y
-       depends on ARCH_R8A73A4 && SH_DMAE != n
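
Note that the rewritten SH_DMAE_BASE dependency is stricter than the old parenthesised form, not merely a reformatting of it:

	old: (SUPERH && SH_DMA) || ARCH_SHMOBILE || COMPILE_TEST
	new: (SUPERH || ARCH_SHMOBILE || COMPILE_TEST) && (!SUPERH || SH_DMA)

Under the old expression a SUPERH configuration without SH_DMA could still enable the helper through ARCH_SHMOBILE or COMPILE_TEST; the new pair demands SH_DMA whenever SUPERH is set.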
index 1ce88b28cfc62595240b7d54c79d731d4ec4fc22..0a5cfdb76e45cab3326967c28f3f231a154a16eb 100644 (file)
@@ -1,10 +1,18 @@
+#
+# DMA Engine Helpers
+#
+
 obj-$(CONFIG_SH_DMAE_BASE) += shdma-base.o shdma-of.o
-obj-$(CONFIG_SH_DMAE) += shdma.o
+
+#
+# DMA Controllers
+#
+
 shdma-y := shdmac.o
-ifeq ($(CONFIG_OF),y)
-shdma-$(CONFIG_SHDMA_R8A73A4) += shdma-r8a73a4.o
-endif
+shdma-$(CONFIG_SH_DMAE_R8A73A4) += shdma-r8a73a4.o
 shdma-objs := $(shdma-y)
+obj-$(CONFIG_SH_DMAE) += shdma.o
+
 obj-$(CONFIG_SUDMAC) += sudmac.o
 obj-$(CONFIG_RCAR_HPB_DMAE) += rcar-hpbdma.o
 obj-$(CONFIG_RCAR_AUDMAC_PP) += rcar-audmapp.o
index 2de77289a2e978a4cd65247e38ff1f96a0aac739..dabbf0aba2e9b5195625ffee4e8508bef3e087b4 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/dmaengine.h>
+#include <linux/of_dma.h>
 #include <linux/platform_data/dma-rcar-audmapp.h>
 #include <linux/platform_device.h>
 #include <linux/shdma-base.h>
@@ -45,8 +46,9 @@
 
 struct audmapp_chan {
        struct shdma_chan shdma_chan;
-       struct audmapp_slave_config *config;
        void __iomem *base;
+       dma_addr_t slave_addr;
+       u32 chcr;
 };
 
 struct audmapp_device {
@@ -56,7 +58,16 @@ struct audmapp_device {
        void __iomem *chan_reg;
 };
 
+struct audmapp_desc {
+       struct shdma_desc shdma_desc;
+       dma_addr_t src;
+       dma_addr_t dst;
+};
+
+#define to_shdma_chan(c) container_of(c, struct shdma_chan, dma_chan)
+
 #define to_chan(chan) container_of(chan, struct audmapp_chan, shdma_chan)
+#define to_desc(sdesc) container_of(sdesc, struct audmapp_desc, shdma_desc)
 #define to_dev(chan) container_of(chan->shdma_chan.dma_chan.device,    \
                                  struct audmapp_device, shdma_dev.dma_dev)
 
@@ -90,70 +101,82 @@ static void audmapp_halt(struct shdma_chan *schan)
 }
 
 static void audmapp_start_xfer(struct shdma_chan *schan,
-                              struct shdma_desc *sdecs)
+                              struct shdma_desc *sdesc)
 {
        struct audmapp_chan *auchan = to_chan(schan);
        struct audmapp_device *audev = to_dev(auchan);
-       struct audmapp_slave_config *cfg = auchan->config;
+       struct audmapp_desc *desc = to_desc(sdesc);
        struct device *dev = audev->dev;
-       u32 chcr = cfg->chcr | PDMACHCR_DE;
+       u32 chcr = auchan->chcr | PDMACHCR_DE;
 
-       dev_dbg(dev, "src/dst/chcr = %pad/%pad/%x\n",
-               &cfg->src, &cfg->dst, cfg->chcr);
+       dev_dbg(dev, "src/dst/chcr = %pad/%pad/%08x\n",
+               &desc->src, &desc->dst, chcr);
 
-       audmapp_write(auchan, cfg->src, PDMASAR);
-       audmapp_write(auchan, cfg->dst, PDMADAR);
+       audmapp_write(auchan, desc->src,        PDMASAR);
+       audmapp_write(auchan, desc->dst,        PDMADAR);
        audmapp_write(auchan, chcr,     PDMACHCR);
 }
 
-static struct audmapp_slave_config *
-audmapp_find_slave(struct audmapp_chan *auchan, int slave_id)
+static void audmapp_get_config(struct audmapp_chan *auchan, int slave_id,
+                             u32 *chcr, dma_addr_t *dst)
 {
        struct audmapp_device *audev = to_dev(auchan);
        struct audmapp_pdata *pdata = audev->pdata;
        struct audmapp_slave_config *cfg;
        int i;
 
+       *chcr   = 0;
+       *dst    = 0;
+
+       if (!pdata) { /* DT */
+               *chcr = ((u32)slave_id) << 16;
+               auchan->shdma_chan.slave_id = (slave_id) >> 8;
+               return;
+       }
+
+       /* non-DT */
+
        if (slave_id >= AUDMAPP_SLAVE_NUMBER)
-               return NULL;
+               return;
 
        for (i = 0, cfg = pdata->slave; i < pdata->slave_num; i++, cfg++)
-               if (cfg->slave_id == slave_id)
-                       return cfg;
-
-       return NULL;
+               if (cfg->slave_id == slave_id) {
+                       *chcr   = cfg->chcr;
+                       *dst    = cfg->dst;
+                       break;
+               }
 }
 
 static int audmapp_set_slave(struct shdma_chan *schan, int slave_id,
                             dma_addr_t slave_addr, bool try)
 {
        struct audmapp_chan *auchan = to_chan(schan);
-       struct audmapp_slave_config *cfg =
-               audmapp_find_slave(auchan, slave_id);
+       u32 chcr;
+       dma_addr_t dst;
+
+       audmapp_get_config(auchan, slave_id, &chcr, &dst);
 
-       if (!cfg)
-               return -ENODEV;
        if (try)
                return 0;
 
-       auchan->config  = cfg;
+       auchan->chcr            = chcr;
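+       /* GNU "x ?: y" shorthand: fall back to the config's dst when slave_addr is 0 */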
+       auchan->slave_addr      = slave_addr ? : dst;
 
        return 0;
 }
 
 static int audmapp_desc_setup(struct shdma_chan *schan,
-                             struct shdma_desc *sdecs,
+                             struct shdma_desc *sdesc,
                              dma_addr_t src, dma_addr_t dst, size_t *len)
 {
-       struct audmapp_chan *auchan = to_chan(schan);
-       struct audmapp_slave_config *cfg = auchan->config;
-
-       if (!cfg)
-               return -ENODEV;
+       struct audmapp_desc *desc = to_desc(sdesc);
 
        if (*len > (size_t)AUDMAPP_LEN_MAX)
                *len = (size_t)AUDMAPP_LEN_MAX;
 
+       desc->src = src;
+       desc->dst = dst;
+
        return 0;
 }
 
@@ -164,7 +187,9 @@ static void audmapp_setup_xfer(struct shdma_chan *schan,
 
 static dma_addr_t audmapp_slave_addr(struct shdma_chan *schan)
 {
-       return 0; /* always fixed address */
+       struct audmapp_chan *auchan = to_chan(schan);
+
+       return auchan->slave_addr;
 }
 
 static bool audmapp_channel_busy(struct shdma_chan *schan)
@@ -183,7 +208,7 @@ static bool audmapp_desc_completed(struct shdma_chan *schan,
 
 static struct shdma_desc *audmapp_embedded_desc(void *buf, int i)
 {
-       return &((struct shdma_desc *)buf)[i];
+       return &((struct audmapp_desc *)buf)[i].shdma_desc;
 }
 
 static const struct shdma_ops audmapp_shdma_ops = {
@@ -234,16 +259,39 @@ static void audmapp_chan_remove(struct audmapp_device *audev)
        dma_dev->chancnt = 0;
 }
 
+static struct dma_chan *audmapp_of_xlate(struct of_phandle_args *dma_spec,
+                                        struct of_dma *ofdma)
+{
+       dma_cap_mask_t mask;
+       struct dma_chan *chan;
+       u32 chcr = dma_spec->args[0];
+
+       if (dma_spec->args_count != 1)
+               return NULL;
+
+       dma_cap_zero(mask);
+       dma_cap_set(DMA_SLAVE, mask);
+
+       chan = dma_request_channel(mask, shdma_chan_filter, NULL);
+       if (chan)
+               to_shdma_chan(chan)->hw_req = chcr;
+
+       return chan;
+}
+
 static int audmapp_probe(struct platform_device *pdev)
 {
        struct audmapp_pdata *pdata = pdev->dev.platform_data;
+       struct device_node *np = pdev->dev.of_node;
        struct audmapp_device *audev;
        struct shdma_dev *sdev;
        struct dma_device *dma_dev;
        struct resource *res;
        int err, i;
 
-       if (!pdata)
+       if (np)
+               of_dma_controller_register(np, audmapp_of_xlate, pdev);
+       else if (!pdata)
                return -ENODEV;
 
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
@@ -260,7 +308,7 @@ static int audmapp_probe(struct platform_device *pdev)
 
        sdev            = &audev->shdma_dev;
        sdev->ops       = &audmapp_shdma_ops;
-       sdev->desc_size = sizeof(struct shdma_desc);
+       sdev->desc_size = sizeof(struct audmapp_desc);
 
        dma_dev                 = &sdev->dma_dev;
        dma_dev->copy_align     = LOG2_DEFAULT_XFER_SIZE;
@@ -305,12 +353,18 @@ static int audmapp_remove(struct platform_device *pdev)
        return 0;
 }
 
+static const struct of_device_id audmapp_of_match[] = {
+       { .compatible = "renesas,rcar-audmapp", },
+       {},
+};
+
 static struct platform_driver audmapp_driver = {
        .probe          = audmapp_probe,
        .remove         = audmapp_remove,
        .driver         = {
                .owner  = THIS_MODULE,
                .name   = "rcar-audmapp-engine",
+               .of_match_table = audmapp_of_match,
        },
 };
 module_platform_driver(audmapp_driver);
index a2b8258426c940f0a813c8d854a634cc212e0289..a1b0ef45d6a2f664262f009df1d108099aac5ffc 100644 (file)
@@ -45,7 +45,7 @@ enum {
        ((((i) & TS_LOW_BIT) << TS_LOW_SHIFT) |\
         (((i) & TS_HI_BIT)  << TS_HI_SHIFT))
 
-#define CHCR_TX(xmit_sz) (DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL((xmit_sz)))
-#define CHCR_RX(xmit_sz) (DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL((xmit_sz)))
+#define CHCR_TX(xmit_sz) (DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL((xmit_sz)))
+#define CHCR_RX(xmit_sz) (DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL((xmit_sz)))
 
 #endif
index b35007e21e6b3cc0318126de473d99b8d917289c..42d49741619672b34147d2741d457365533c15f4 100644 (file)
@@ -206,45 +206,6 @@ static int shdma_setup_slave(struct shdma_chan *schan, int slave_id,
        return 0;
 }
 
-/*
- * This is the standard shdma filter function to be used as a replacement for the
- * "old" method, using the .private pointer. If for some reason you allocate a
- * channel without slave data, use something like ERR_PTR(-EINVAL) as a filter
- * parameter. If this filter is used, the slave driver, after calling
- * dma_request_channel(), will also have to call dmaengine_slave_config() with
- * .slave_id, .direction, and either .src_addr or .dst_addr set.
- * NOTE: this filter doesn't support multiple DMAC drivers with the DMA_SLAVE
- * capability! If this becomes a requirement, hardware glue drivers, using this
- * services would have to provide their own filters, which first would check
- * the device driver, similar to how other DMAC drivers, e.g., sa11x0-dma.c, do
- * this, and only then, in case of a match, call this common filter.
- * NOTE 2: This filter function is also used in the DT case by shdma_of_xlate().
- * In that case the MID-RID value is used for slave channel filtering and is
- * passed to this function in the "arg" parameter.
- */
-bool shdma_chan_filter(struct dma_chan *chan, void *arg)
-{
-       struct shdma_chan *schan = to_shdma_chan(chan);
-       struct shdma_dev *sdev = to_shdma_dev(schan->dma_chan.device);
-       const struct shdma_ops *ops = sdev->ops;
-       int match = (long)arg;
-       int ret;
-
-       if (match < 0)
-               /* No slave requested - arbitrary channel */
-               return true;
-
-       if (!schan->dev->of_node && match >= slave_num)
-               return false;
-
-       ret = ops->set_slave(schan, match, 0, true);
-       if (ret < 0)
-               return false;
-
-       return true;
-}
-EXPORT_SYMBOL(shdma_chan_filter);
-
 static int shdma_alloc_chan_resources(struct dma_chan *chan)
 {
        struct shdma_chan *schan = to_shdma_chan(chan);
@@ -295,6 +256,51 @@ esetslave:
        return ret;
 }
 
+/*
+ * This is the standard shdma filter function to be used as a replacement for the
+ * "old" method, using the .private pointer. If for some reason you allocate a
+ * channel without slave data, use something like ERR_PTR(-EINVAL) as a filter
+ * parameter. If this filter is used, the slave driver, after calling
+ * dma_request_channel(), will also have to call dmaengine_slave_config() with
+ * .slave_id, .direction, and either .src_addr or .dst_addr set.
+ * NOTE: this filter doesn't support multiple DMAC drivers with the DMA_SLAVE
+ * capability! If this becomes a requirement, hardware glue drivers, using this
+ * services would have to provide their own filters, which first would check
+ * the device driver, similar to how other DMAC drivers, e.g., sa11x0-dma.c, do
+ * this, and only then, in case of a match, call this common filter.
+ * NOTE 2: This filter function is also used in the DT case by shdma_of_xlate().
+ * In that case the MID-RID value is used for slave channel filtering and is
+ * passed to this function in the "arg" parameter.
+ */
+bool shdma_chan_filter(struct dma_chan *chan, void *arg)
+{
+       struct shdma_chan *schan;
+       struct shdma_dev *sdev;
+       int match = (long)arg;
+       int ret;
+
+       /* Only support channels handled by this driver. */
+       if (chan->device->device_alloc_chan_resources !=
+           shdma_alloc_chan_resources)
+               return false;
+
+       if (match < 0)
+               /* No slave requested - arbitrary channel */
+               return true;
+
+       schan = to_shdma_chan(chan);
+       if (!schan->dev->of_node && match >= slave_num)
+               return false;
+
+       sdev = to_shdma_dev(schan->dma_chan.device);
+       ret = sdev->ops->set_slave(schan, match, 0, true);
+       if (ret < 0)
+               return false;
+
+       return true;
+}
+EXPORT_SYMBOL(shdma_chan_filter);
+
 static dma_async_tx_callback __ld_cleanup(struct shdma_chan *schan, bool all)
 {
        struct shdma_desc *desc, *_desc;
@@ -662,15 +668,16 @@ static struct dma_async_tx_descriptor *shdma_prep_slave_sg(
 static struct dma_async_tx_descriptor *shdma_prep_dma_cyclic(
        struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
        size_t period_len, enum dma_transfer_direction direction,
-       unsigned long flags, void *context)
+       unsigned long flags)
 {
        struct shdma_chan *schan = to_shdma_chan(chan);
        struct shdma_dev *sdev = to_shdma_dev(schan->dma_chan.device);
+       struct dma_async_tx_descriptor *desc;
        const struct shdma_ops *ops = sdev->ops;
        unsigned int sg_len = buf_len / period_len;
        int slave_id = schan->slave_id;
        dma_addr_t slave_addr;
-       struct scatterlist sgl[SHDMA_MAX_SG_LEN];
+       struct scatterlist *sgl;
        int i;
 
        if (!chan)
@@ -694,7 +701,16 @@ static struct dma_async_tx_descriptor *shdma_prep_dma_cyclic(
 
        slave_addr = ops->slave_addr(schan);
 
+       /*
+        * Allocate the sg list dynamically as it would consume too much stack
+        * space.
+        */
+       sgl = kcalloc(sg_len, sizeof(*sgl), GFP_KERNEL);
+       if (!sgl)
+               return NULL;
+
        sg_init_table(sgl, sg_len);
+
        for (i = 0; i < sg_len; i++) {
                dma_addr_t src = buf_addr + (period_len * i);
 
@@ -704,8 +720,11 @@ static struct dma_async_tx_descriptor *shdma_prep_dma_cyclic(
                sg_dma_len(&sgl[i]) = period_len;
        }
 
-       return shdma_prep_sg(schan, sgl, sg_len, &slave_addr,
+       desc = shdma_prep_sg(schan, sgl, sg_len, &slave_addr,
                             direction, flags, true);
+
+       kfree(sgl);
+       return desc;
 }
 
 static int shdma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
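
The relocated comment above spells out the intended client flow for shdma_chan_filter(); a minimal sketch of that usage, where MY_SLAVE_ID and fifo_addr are hypothetical placeholders:

	dma_cap_mask_t mask;
	struct dma_chan *chan;
	struct dma_slave_config cfg = {
		.slave_id	= MY_SLAVE_ID,
		.direction	= DMA_MEM_TO_DEV,
		.dst_addr	= fifo_addr,
	};

	dma_cap_zero(mask);
	dma_cap_set(DMA_SLAVE, mask);

	chan = dma_request_channel(mask, shdma_chan_filter,
				   (void *)(long)MY_SLAVE_ID);
	if (chan)
		dmaengine_slave_config(chan, &cfg);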
index 758a57b51875b38d84b999775d306a84ca85bcad..2c0a969adc9fb73e7795c1bbdd7b2aa072037827 100644 (file)
@@ -62,7 +62,7 @@ struct sh_dmae_desc {
 #define to_sh_dev(chan) container_of(chan->shdma_chan.dma_chan.device,\
                                     struct sh_dmae_device, shdma_dev.dma_dev)
 
-#ifdef CONFIG_SHDMA_R8A73A4
+#ifdef CONFIG_SH_DMAE_R8A73A4
 extern const struct sh_dmae_pdata r8a73a4_dma_pdata;
 #define r8a73a4_shdma_devid (&r8a73a4_dma_pdata)
 #else
index 146d5df926db00364429cbe75ce9ff84e9dddf46..58eb85770eba8e07e16b9dab1da321bf318369e3 100644 (file)
 #include "../dmaengine.h"
 #include "shdma.h"
 
-/* DMA register */
-#define SAR    0x00
-#define DAR    0x04
-#define TCR    0x08
-#define CHCR   0x0C
-#define DMAOR  0x40
+/* DMA registers */
+#define SAR    0x00    /* Source Address Register */
+#define DAR    0x04    /* Destination Address Register */
+#define TCR    0x08    /* Transfer Count Register */
+#define CHCR   0x0C    /* Channel Control Register */
+#define DMAOR  0x40    /* DMA Operation Register */
 
 #define TEND   0x18 /* USB-DMAC */
 
@@ -239,9 +239,8 @@ static void dmae_init(struct sh_dmae_chan *sh_chan)
 {
        /*
         * Default configuration for dual address memory-memory transfer.
-        * 0x400 represents auto-request.
         */
-       u32 chcr = DM_INC | SM_INC | 0x400 | log2size_to_chcr(sh_chan,
+       u32 chcr = DM_INC | SM_INC | RS_AUTO | log2size_to_chcr(sh_chan,
                                                   LOG2_DEFAULT_XFER_SIZE);
        sh_chan->xmit_shift = calc_xmit_shift(sh_chan, chcr);
        chcr_write(sh_chan, chcr);
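
Both CHCR hunks trade magic numbers for named resource-select bits. Going only by the literals replaced in this diff (the macro definitions themselves sit outside the shown hunks):

	RS_AUTO = 0x400		/* auto request, the memory-to-memory default in dmae_init() */
	RS_ERS  = 0x800		/* peripheral-request mode used by the CHCR_TX/CHCR_RX templates */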
index 03f7820fa333b89a445dc3831386ba371b93a1e1..aac03ab10c54809275460aecde286ef9d5742ff9 100644 (file)
@@ -580,7 +580,7 @@ err_dir:
 static struct dma_async_tx_descriptor *
 sirfsoc_dma_prep_cyclic(struct dma_chan *chan, dma_addr_t addr,
        size_t buf_len, size_t period_len,
-       enum dma_transfer_direction direction, unsigned long flags, void *context)
+       enum dma_transfer_direction direction, unsigned long flags)
 {
        struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
        struct sirfsoc_dma_desc *sdesc = NULL;
index c7984459ede76bb1dd50b00f55c639d2d88ff741..5fe59335e247ef81518a55fff496256009eafbcf 100644 (file)
@@ -2531,8 +2531,7 @@ d40_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 static struct dma_async_tx_descriptor *
 dma40_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t dma_addr,
                     size_t buf_len, size_t period_len,
-                    enum dma_transfer_direction direction, unsigned long flags,
-                    void *context)
+                    enum dma_transfer_direction direction, unsigned long flags)
 {
        unsigned int periods = buf_len / period_len;
        struct dma_async_tx_descriptor *txd;
diff --git a/drivers/dma/sun6i-dma.c b/drivers/dma/sun6i-dma.c
new file mode 100644 (file)
index 0000000..1f92a56
--- /dev/null
@@ -0,0 +1,1053 @@
+/*
+ * Copyright (C) 2013-2014 Allwinner Tech Co., Ltd
+ * Author: Sugar <shuge@allwinnertech.com>
+ *
+ * Copyright (C) 2014 Maxime Ripard
+ * Maxime Ripard <maxime.ripard@free-electrons.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/dmaengine.h>
+#include <linux/dmapool.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/of_dma.h>
+#include <linux/platform_device.h>
+#include <linux/reset.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include "virt-dma.h"
+
+/*
+ * There are 16 physical channels that can work in parallel.
+ *
+ * However, we have 30 different endpoints for our requests.
+ *
+ * Since each channel can only handle a unidirectional transfer, we
+ * need to allocate more virtual channels so that every endpoint can
+ * grab one.
+ *
+ * Some devices can't work in both directions (mostly because it
+ * wouldn't make sense), so we end up with a bit fewer virtual channels
+ * than 2 per endpoint: 53 rather than the full 2 * 30 = 60.
+ */
+
+#define NR_MAX_CHANNELS                16
+#define NR_MAX_REQUESTS                30
+#define NR_MAX_VCHANS          53
+
+/*
+ * Common registers
+ */
+#define DMA_IRQ_EN(x)          ((x) * 0x04)
+#define DMA_IRQ_HALF                   BIT(0)
+#define DMA_IRQ_PKG                    BIT(1)
+#define DMA_IRQ_QUEUE                  BIT(2)
+
+#define DMA_IRQ_CHAN_NR                        8
+#define DMA_IRQ_CHAN_WIDTH             4
+
+
+#define DMA_IRQ_STAT(x)                ((x) * 0x04 + 0x10)
+
+#define DMA_STAT               0x30
+
+/*
+ * Channels specific registers
+ */
+#define DMA_CHAN_ENABLE                0x00
+#define DMA_CHAN_ENABLE_START          BIT(0)
+#define DMA_CHAN_ENABLE_STOP           0
+
+#define DMA_CHAN_PAUSE         0x04
+#define DMA_CHAN_PAUSE_PAUSE           BIT(1)
+#define DMA_CHAN_PAUSE_RESUME          0
+
+#define DMA_CHAN_LLI_ADDR      0x08
+
+#define DMA_CHAN_CUR_CFG       0x0c
+#define DMA_CHAN_CFG_SRC_DRQ(x)                ((x) & 0x1f)
+#define DMA_CHAN_CFG_SRC_IO_MODE       BIT(5)
+#define DMA_CHAN_CFG_SRC_LINEAR_MODE   (0 << 5)
+#define DMA_CHAN_CFG_SRC_BURST(x)      (((x) & 0x3) << 7)
+#define DMA_CHAN_CFG_SRC_WIDTH(x)      (((x) & 0x3) << 9)
+
+#define DMA_CHAN_CFG_DST_DRQ(x)                (DMA_CHAN_CFG_SRC_DRQ(x) << 16)
+#define DMA_CHAN_CFG_DST_IO_MODE       (DMA_CHAN_CFG_SRC_IO_MODE << 16)
+#define DMA_CHAN_CFG_DST_LINEAR_MODE   (DMA_CHAN_CFG_SRC_LINEAR_MODE << 16)
+#define DMA_CHAN_CFG_DST_BURST(x)      (DMA_CHAN_CFG_SRC_BURST(x) << 16)
+#define DMA_CHAN_CFG_DST_WIDTH(x)      (DMA_CHAN_CFG_SRC_WIDTH(x) << 16)
+
+#define DMA_CHAN_CUR_SRC       0x10
+
+#define DMA_CHAN_CUR_DST       0x14
+
+#define DMA_CHAN_CUR_CNT       0x18
+
+#define DMA_CHAN_CUR_PARA      0x1c
+
+
+/*
+ * Various hardware related defines
+ */
+#define LLI_LAST_ITEM  0xfffff800
+#define NORMAL_WAIT    8
+#define DRQ_SDRAM      1
+
+/*
+ * Hardware representation of the LLI
+ *
+ * The hardware will be fed the physical address of this structure,
+ * and read its content in order to start the transfer.
+ */
+struct sun6i_dma_lli {
+       u32                     cfg;
+       u32                     src;
+       u32                     dst;
+       u32                     len;
+       u32                     para;
+       u32                     p_lli_next;
+
+       /*
+        * This field is not used by the DMA controller, but will be
+        * used by the CPU to go through the list (mostly for dumping
+        * or freeing it).
+        */
+       struct sun6i_dma_lli    *v_lli_next;
+};
+
+
+struct sun6i_desc {
+       struct virt_dma_desc    vd;
+       dma_addr_t              p_lli;
+       struct sun6i_dma_lli    *v_lli;
+};
+
+struct sun6i_pchan {
+       u32                     idx;
+       void __iomem            *base;
+       struct sun6i_vchan      *vchan;
+       struct sun6i_desc       *desc;
+       struct sun6i_desc       *done;
+};
+
+struct sun6i_vchan {
+       struct virt_dma_chan    vc;
+       struct list_head        node;
+       struct dma_slave_config cfg;
+       struct sun6i_pchan      *phy;
+       u8                      port;
+};
+
+struct sun6i_dma_dev {
+       struct dma_device       slave;
+       void __iomem            *base;
+       struct clk              *clk;
+       int                     irq;
+       spinlock_t              lock;
+       struct reset_control    *rstc;
+       struct tasklet_struct   task;
+       atomic_t                tasklet_shutdown;
+       struct list_head        pending;
+       struct dma_pool         *pool;
+       struct sun6i_pchan      *pchans;
+       struct sun6i_vchan      *vchans;
+};
+
+static struct device *chan2dev(struct dma_chan *chan)
+{
+       return &chan->dev->device;
+}
+
+static inline struct sun6i_dma_dev *to_sun6i_dma_dev(struct dma_device *d)
+{
+       return container_of(d, struct sun6i_dma_dev, slave);
+}
+
+static inline struct sun6i_vchan *to_sun6i_vchan(struct dma_chan *chan)
+{
+       return container_of(chan, struct sun6i_vchan, vc.chan);
+}
+
+static inline struct sun6i_desc *
+to_sun6i_desc(struct dma_async_tx_descriptor *tx)
+{
+       return container_of(tx, struct sun6i_desc, vd.tx);
+}
+
+static inline void sun6i_dma_dump_com_regs(struct sun6i_dma_dev *sdev)
+{
+       dev_dbg(sdev->slave.dev, "Common register:\n"
+               "\tmask0(%04x): 0x%08x\n"
+               "\tmask1(%04x): 0x%08x\n"
+               "\tpend0(%04x): 0x%08x\n"
+               "\tpend1(%04x): 0x%08x\n"
+               "\tstats(%04x): 0x%08x\n",
+               DMA_IRQ_EN(0), readl(sdev->base + DMA_IRQ_EN(0)),
+               DMA_IRQ_EN(1), readl(sdev->base + DMA_IRQ_EN(1)),
+               DMA_IRQ_STAT(0), readl(sdev->base + DMA_IRQ_STAT(0)),
+               DMA_IRQ_STAT(1), readl(sdev->base + DMA_IRQ_STAT(1)),
+               DMA_STAT, readl(sdev->base + DMA_STAT));
+}
+
+static inline void sun6i_dma_dump_chan_regs(struct sun6i_dma_dev *sdev,
+                                           struct sun6i_pchan *pchan)
+{
+       phys_addr_t reg = virt_to_phys(pchan->base);
+
+       dev_dbg(sdev->slave.dev, "Chan %d reg: %pa\n"
+               "\t___en(%04x): \t0x%08x\n"
+               "\tpause(%04x): \t0x%08x\n"
+               "\tstart(%04x): \t0x%08x\n"
+               "\t__cfg(%04x): \t0x%08x\n"
+               "\t__src(%04x): \t0x%08x\n"
+               "\t__dst(%04x): \t0x%08x\n"
+               "\tcount(%04x): \t0x%08x\n"
+               "\t_para(%04x): \t0x%08x\n\n",
+               pchan->idx, &reg,
+               DMA_CHAN_ENABLE,
+               readl(pchan->base + DMA_CHAN_ENABLE),
+               DMA_CHAN_PAUSE,
+               readl(pchan->base + DMA_CHAN_PAUSE),
+               DMA_CHAN_LLI_ADDR,
+               readl(pchan->base + DMA_CHAN_LLI_ADDR),
+               DMA_CHAN_CUR_CFG,
+               readl(pchan->base + DMA_CHAN_CUR_CFG),
+               DMA_CHAN_CUR_SRC,
+               readl(pchan->base + DMA_CHAN_CUR_SRC),
+               DMA_CHAN_CUR_DST,
+               readl(pchan->base + DMA_CHAN_CUR_DST),
+               DMA_CHAN_CUR_CNT,
+               readl(pchan->base + DMA_CHAN_CUR_CNT),
+               DMA_CHAN_CUR_PARA,
+               readl(pchan->base + DMA_CHAN_CUR_PARA));
+}
+
+static inline int convert_burst(u32 maxburst, u8 *burst)
+{
+       switch (maxburst) {
+       case 1:
+               *burst = 0;
+               break;
+       case 8:
+               *burst = 2;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static inline int convert_buswidth(enum dma_slave_buswidth addr_width, u8 *width)
+{
+       if ((addr_width < DMA_SLAVE_BUSWIDTH_1_BYTE) ||
+           (addr_width > DMA_SLAVE_BUSWIDTH_4_BYTES))
+               return -EINVAL;
+
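+       /* the 1-, 2- and 4-byte bus widths encode as 0, 1 and 2, hence the shift */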
+       *width = addr_width >> 1;
+       return 0;
+}
+
+static void *sun6i_dma_lli_add(struct sun6i_dma_lli *prev,
+                              struct sun6i_dma_lli *next,
+                              dma_addr_t next_phy,
+                              struct sun6i_desc *txd)
+{
+       if ((!prev && !txd) || !next)
+               return NULL;
+
+       if (!prev) {
+               txd->p_lli = next_phy;
+               txd->v_lli = next;
+       } else {
+               prev->p_lli_next = next_phy;
+               prev->v_lli_next = next;
+       }
+
+       next->p_lli_next = LLI_LAST_ITEM;
+       next->v_lli_next = NULL;
+
+       return next;
+}
+
+static inline int sun6i_dma_cfg_lli(struct sun6i_dma_lli *lli,
+                                   dma_addr_t src,
+                                   dma_addr_t dst, u32 len,
+                                   struct dma_slave_config *config)
+{
+       u8 src_width, dst_width, src_burst, dst_burst;
+       int ret;
+
+       if (!config)
+               return -EINVAL;
+
+       ret = convert_burst(config->src_maxburst, &src_burst);
+       if (ret)
+               return ret;
+
+       ret = convert_burst(config->dst_maxburst, &dst_burst);
+       if (ret)
+               return ret;
+
+       ret = convert_buswidth(config->src_addr_width, &src_width);
+       if (ret)
+               return ret;
+
+       ret = convert_buswidth(config->dst_addr_width, &dst_width);
+       if (ret)
+               return ret;
+
+       lli->cfg = DMA_CHAN_CFG_SRC_BURST(src_burst) |
+               DMA_CHAN_CFG_SRC_WIDTH(src_width) |
+               DMA_CHAN_CFG_DST_BURST(dst_burst) |
+               DMA_CHAN_CFG_DST_WIDTH(dst_width);
+
+       lli->src = src;
+       lli->dst = dst;
+       lli->len = len;
+       lli->para = NORMAL_WAIT;
+
+       return 0;
+}
+
+static inline void sun6i_dma_dump_lli(struct sun6i_vchan *vchan,
+                                     struct sun6i_dma_lli *lli)
+{
+       phys_addr_t p_lli = virt_to_phys(lli);
+
+       dev_dbg(chan2dev(&vchan->vc.chan),
+               "\n\tdesc:   p - %pa v - 0x%p\n"
+               "\t\tc - 0x%08x s - 0x%08x d - 0x%08x\n"
+               "\t\tl - 0x%08x p - 0x%08x n - 0x%08x\n",
+               &p_lli, lli,
+               lli->cfg, lli->src, lli->dst,
+               lli->len, lli->para, lli->p_lli_next);
+}
+
+static void sun6i_dma_free_desc(struct virt_dma_desc *vd)
+{
+       struct sun6i_desc *txd = to_sun6i_desc(&vd->tx);
+       struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(vd->tx.chan->device);
+       struct sun6i_dma_lli *v_lli, *v_next;
+       dma_addr_t p_lli, p_next;
+
+       if (unlikely(!txd))
+               return;
+
+       p_lli = txd->p_lli;
+       v_lli = txd->v_lli;
+
+       while (v_lli) {
+               v_next = v_lli->v_lli_next;
+               p_next = v_lli->p_lli_next;
+
+               dma_pool_free(sdev->pool, v_lli, p_lli);
+
+               v_lli = v_next;
+               p_lli = p_next;
+       }
+
+       kfree(txd);
+}
+
+static int sun6i_dma_terminate_all(struct sun6i_vchan *vchan)
+{
+       struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(vchan->vc.chan.device);
+       struct sun6i_pchan *pchan = vchan->phy;
+       unsigned long flags;
+       LIST_HEAD(head);
+
+       spin_lock(&sdev->lock);
+       list_del_init(&vchan->node);
+       spin_unlock(&sdev->lock);
+
+       spin_lock_irqsave(&vchan->vc.lock, flags);
+
+       vchan_get_all_descriptors(&vchan->vc, &head);
+
+       if (pchan) {
+               writel(DMA_CHAN_ENABLE_STOP, pchan->base + DMA_CHAN_ENABLE);
+               writel(DMA_CHAN_PAUSE_RESUME, pchan->base + DMA_CHAN_PAUSE);
+
+               vchan->phy = NULL;
+               pchan->vchan = NULL;
+               pchan->desc = NULL;
+               pchan->done = NULL;
+       }
+
+       spin_unlock_irqrestore(&vchan->vc.lock, flags);
+
+       vchan_dma_desc_free_list(&vchan->vc, &head);
+
+       return 0;
+}
+
+static int sun6i_dma_start_desc(struct sun6i_vchan *vchan)
+{
+       struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(vchan->vc.chan.device);
+       struct virt_dma_desc *desc = vchan_next_desc(&vchan->vc);
+       struct sun6i_pchan *pchan = vchan->phy;
+       u32 irq_val, irq_reg, irq_offset;
+
+       if (!pchan)
+               return -EAGAIN;
+
+       if (!desc) {
+               pchan->desc = NULL;
+               pchan->done = NULL;
+               return -EAGAIN;
+       }
+
+       list_del(&desc->node);
+
+       pchan->desc = to_sun6i_desc(&desc->tx);
+       pchan->done = NULL;
+
+       sun6i_dma_dump_lli(vchan, pchan->desc->v_lli);
+
+       irq_reg = pchan->idx / DMA_IRQ_CHAN_NR;
+       irq_offset = pchan->idx % DMA_IRQ_CHAN_NR;
+
+       irq_val = readl(sdev->base + DMA_IRQ_EN(irq_reg));
+       irq_val |= DMA_IRQ_QUEUE << (irq_offset * DMA_IRQ_CHAN_WIDTH);
+       writel(irq_val, sdev->base + DMA_IRQ_EN(irq_reg));
+
+       writel(pchan->desc->p_lli, pchan->base + DMA_CHAN_LLI_ADDR);
+       writel(DMA_CHAN_ENABLE_START, pchan->base + DMA_CHAN_ENABLE);
+
+       sun6i_dma_dump_com_regs(sdev);
+       sun6i_dma_dump_chan_regs(sdev, pchan);
+
+       return 0;
+}
+
+static void sun6i_dma_tasklet(unsigned long data)
+{
+       struct sun6i_dma_dev *sdev = (struct sun6i_dma_dev *)data;
+       struct sun6i_vchan *vchan;
+       struct sun6i_pchan *pchan;
+       unsigned int pchan_alloc = 0;
+       unsigned int pchan_idx;
+
+       list_for_each_entry(vchan, &sdev->slave.channels, vc.chan.device_node) {
+               spin_lock_irq(&vchan->vc.lock);
+
+               pchan = vchan->phy;
+
+               if (pchan && pchan->done) {
+                       if (sun6i_dma_start_desc(vchan)) {
+                               /*
+                                * No current txd associated with this channel
+                                */
+                               dev_dbg(sdev->slave.dev, "pchan %u: free\n",
+                                       pchan->idx);
+
+                               /* Mark this channel free */
+                               vchan->phy = NULL;
+                               pchan->vchan = NULL;
+                       }
+               }
+               spin_unlock_irq(&vchan->vc.lock);
+       }
+
+       spin_lock_irq(&sdev->lock);
+       for (pchan_idx = 0; pchan_idx < NR_MAX_CHANNELS; pchan_idx++) {
+               pchan = &sdev->pchans[pchan_idx];
+
+               if (pchan->vchan || list_empty(&sdev->pending))
+                       continue;
+
+               vchan = list_first_entry(&sdev->pending,
+                                        struct sun6i_vchan, node);
+
+               /* Remove from pending channels */
+               list_del_init(&vchan->node);
+               pchan_alloc |= BIT(pchan_idx);
+
+               /* Mark this channel allocated */
+               pchan->vchan = vchan;
+               vchan->phy = pchan;
+               dev_dbg(sdev->slave.dev, "pchan %u: alloc vchan %p\n",
+                       pchan->idx, &vchan->vc);
+       }
+       spin_unlock_irq(&sdev->lock);
+
+       for (pchan_idx = 0; pchan_idx < NR_MAX_CHANNELS; pchan_idx++) {
+               if (!(pchan_alloc & BIT(pchan_idx)))
+                       continue;
+
+               pchan = sdev->pchans + pchan_idx;
+               vchan = pchan->vchan;
+               if (vchan) {
+                       spin_lock_irq(&vchan->vc.lock);
+                       sun6i_dma_start_desc(vchan);
+                       spin_unlock_irq(&vchan->vc.lock);
+               }
+       }
+}
+
+static irqreturn_t sun6i_dma_interrupt(int irq, void *dev_id)
+{
+       struct sun6i_dma_dev *sdev = dev_id;
+       struct sun6i_vchan *vchan;
+       struct sun6i_pchan *pchan;
+       int i, j, ret = IRQ_NONE;
+       u32 status;
+
+       for (i = 0; i < 2; i++) {
+               status = readl(sdev->base + DMA_IRQ_STAT(i));
+               if (!status)
+                       continue;
+
+               dev_dbg(sdev->slave.dev, "DMA irq status %s: 0x%x\n",
+                       i ? "high" : "low", status);
+
+               writel(status, sdev->base + DMA_IRQ_STAT(i));
+
+               for (j = 0; (j < DMA_IRQ_CHAN_NR) && status; j++) {
+                       if (status & DMA_IRQ_QUEUE) {
+                               pchan = sdev->pchans + i * DMA_IRQ_CHAN_NR + j;
+                               vchan = pchan->vchan;
+
+                               if (vchan) {
+                                       spin_lock(&vchan->vc.lock);
+                                       vchan_cookie_complete(&pchan->desc->vd);
+                                       pchan->done = pchan->desc;
+                                       spin_unlock(&vchan->vc.lock);
+                               }
+                       }
+
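+                       /* each channel owns DMA_IRQ_CHAN_WIDTH (4) status bits */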
+                       status = status >> 4;
+               }
+
+               if (!atomic_read(&sdev->tasklet_shutdown))
+                       tasklet_schedule(&sdev->task);
+               ret = IRQ_HANDLED;
+       }
+
+       return ret;
+}
+
+static struct dma_async_tx_descriptor *sun6i_dma_prep_dma_memcpy(
+               struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
+               size_t len, unsigned long flags)
+{
+       struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(chan->device);
+       struct sun6i_vchan *vchan = to_sun6i_vchan(chan);
+       struct dma_slave_config *sconfig = &vchan->cfg;
+       struct sun6i_dma_lli *v_lli;
+       struct sun6i_desc *txd;
+       dma_addr_t p_lli;
+       int ret;
+
+       dev_dbg(chan2dev(chan),
+               "%s; chan: %d, dest: %pad, src: %pad, len: %zu. flags: 0x%08lx\n",
+               __func__, vchan->vc.chan.chan_id, &dest, &src, len, flags);
+
+       if (!len)
+               return NULL;
+
+       txd = kzalloc(sizeof(*txd), GFP_NOWAIT);
+       if (!txd)
+               return NULL;
+
+       v_lli = dma_pool_alloc(sdev->pool, GFP_NOWAIT, &p_lli);
+       if (!v_lli) {
+               dev_err(sdev->slave.dev, "Failed to alloc lli memory\n");
+               goto err_txd_free;
+       }
+
+       ret = sun6i_dma_cfg_lli(v_lli, src, dest, len, sconfig);
+       if (ret)
+               goto err_dma_free;
+
+       v_lli->cfg |= DMA_CHAN_CFG_SRC_DRQ(DRQ_SDRAM) |
+               DMA_CHAN_CFG_DST_DRQ(DRQ_SDRAM) |
+               DMA_CHAN_CFG_DST_LINEAR_MODE |
+               DMA_CHAN_CFG_SRC_LINEAR_MODE;
+
+       sun6i_dma_lli_add(NULL, v_lli, p_lli, txd);
+
+       sun6i_dma_dump_lli(vchan, v_lli);
+
+       return vchan_tx_prep(&vchan->vc, &txd->vd, flags);
+
+err_dma_free:
+       dma_pool_free(sdev->pool, v_lli, p_lli);
+err_txd_free:
+       kfree(txd);
+       return NULL;
+}
+
+static struct dma_async_tx_descriptor *sun6i_dma_prep_slave_sg(
+               struct dma_chan *chan, struct scatterlist *sgl,
+               unsigned int sg_len, enum dma_transfer_direction dir,
+               unsigned long flags, void *context)
+{
+       struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(chan->device);
+       struct sun6i_vchan *vchan = to_sun6i_vchan(chan);
+       struct dma_slave_config *sconfig = &vchan->cfg;
+       struct sun6i_dma_lli *v_lli, *prev = NULL;
+       struct sun6i_desc *txd;
+       struct scatterlist *sg;
+       dma_addr_t p_lli;
+       int i, ret;
+
+       if (!sgl)
+               return NULL;
+
+       if (!is_slave_direction(dir)) {
+               dev_err(chan2dev(chan), "Invalid DMA direction\n");
+               return NULL;
+       }
+
+       txd = kzalloc(sizeof(*txd), GFP_NOWAIT);
+       if (!txd)
+               return NULL;
+
+       for_each_sg(sgl, sg, sg_len, i) {
+               v_lli = dma_pool_alloc(sdev->pool, GFP_NOWAIT, &p_lli);
+               if (!v_lli)
+                       goto err_lli_free;
+
+               if (dir == DMA_MEM_TO_DEV) {
+                       ret = sun6i_dma_cfg_lli(v_lli, sg_dma_address(sg),
+                                               sconfig->dst_addr, sg_dma_len(sg),
+                                               sconfig);
+                       if (ret)
+                               goto err_cur_lli_free;
+
+                       v_lli->cfg |= DMA_CHAN_CFG_DST_IO_MODE |
+                               DMA_CHAN_CFG_SRC_LINEAR_MODE |
+                               DMA_CHAN_CFG_SRC_DRQ(DRQ_SDRAM) |
+                               DMA_CHAN_CFG_DST_DRQ(vchan->port);
+
+                       dev_dbg(chan2dev(chan),
+                               "%s; chan: %d, dest: %pad, src: %pad, len: %u. flags: 0x%08lx\n",
+                               __func__, vchan->vc.chan.chan_id,
+                               &sconfig->dst_addr, &sg_dma_address(sg),
+                               sg_dma_len(sg), flags);
+
+               } else {
+                       ret = sun6i_dma_cfg_lli(v_lli, sconfig->src_addr,
+                                               sg_dma_address(sg), sg_dma_len(sg),
+                                               sconfig);
+                       if (ret)
+                               goto err_cur_lli_free;
+
+                       v_lli->cfg |= DMA_CHAN_CFG_DST_LINEAR_MODE |
+                               DMA_CHAN_CFG_SRC_IO_MODE |
+                               DMA_CHAN_CFG_DST_DRQ(DRQ_SDRAM) |
+                               DMA_CHAN_CFG_SRC_DRQ(vchan->port);
+
+                       dev_dbg(chan2dev(chan),
+                               "%s; chan: %d, dest: %pad, src: %pad, len: %u. flags: 0x%08lx\n",
+                               __func__, vchan->vc.chan.chan_id,
+                               &sg_dma_address(sg), &sconfig->src_addr,
+                               sg_dma_len(sg), flags);
+               }
+
+               prev = sun6i_dma_lli_add(prev, v_lli, p_lli, txd);
+       }
+
+       dev_dbg(chan2dev(chan), "First: %pad\n", &txd->p_lli);
+       for (prev = txd->v_lli; prev; prev = prev->v_lli_next)
+               sun6i_dma_dump_lli(vchan, prev);
+
+       return vchan_tx_prep(&vchan->vc, &txd->vd, flags);
+
+err_cur_lli_free:
+       dma_pool_free(sdev->pool, v_lli, p_lli);
+err_lli_free:
+       for (prev = txd->v_lli; prev; prev = prev->v_lli_next)
+               dma_pool_free(sdev->pool, prev, virt_to_phys(prev));
+       kfree(txd);
+       return NULL;
+}
+
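+/*
+ * A minimal client-side sketch (chan and cfg are hypothetical): the
+ * generic dmaengine wrappers reach the switch below via device_control:
+ *
+ *	dmaengine_pause(chan);			DMA_PAUSE
+ *	dmaengine_resume(chan);			DMA_RESUME
+ *	dmaengine_slave_config(chan, &cfg);	DMA_SLAVE_CONFIG
+ *	dmaengine_terminate_all(chan);		DMA_TERMINATE_ALL
+ */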
+static int sun6i_dma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
+                      unsigned long arg)
+{
+       struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(chan->device);
+       struct sun6i_vchan *vchan = to_sun6i_vchan(chan);
+       struct sun6i_pchan *pchan = vchan->phy;
+       unsigned long flags;
+       int ret = 0;
+
+       switch (cmd) {
+       case DMA_RESUME:
+               dev_dbg(chan2dev(chan), "vchan %p: resume\n", &vchan->vc);
+
+               spin_lock_irqsave(&vchan->vc.lock, flags);
+
+               if (pchan) {
+                       writel(DMA_CHAN_PAUSE_RESUME,
+                              pchan->base + DMA_CHAN_PAUSE);
+               } else if (!list_empty(&vchan->vc.desc_issued)) {
+                       spin_lock(&sdev->lock);
+                       list_add_tail(&vchan->node, &sdev->pending);
+                       spin_unlock(&sdev->lock);
+               }
+
+               spin_unlock_irqrestore(&vchan->vc.lock, flags);
+               break;
+
+       case DMA_PAUSE:
+               dev_dbg(chan2dev(chan), "vchan %p: pause\n", &vchan->vc);
+
+               if (pchan) {
+                       writel(DMA_CHAN_PAUSE_PAUSE,
+                              pchan->base + DMA_CHAN_PAUSE);
+               } else {
+                       spin_lock(&sdev->lock);
+                       list_del_init(&vchan->node);
+                       spin_unlock(&sdev->lock);
+               }
+               break;
+
+       case DMA_TERMINATE_ALL:
+               ret = sun6i_dma_terminate_all(vchan);
+               break;
+       case DMA_SLAVE_CONFIG:
+               memcpy(&vchan->cfg, (void *)arg, sizeof(struct dma_slave_config));
+               break;
+       default:
+               ret = -ENXIO;
+               break;
+       }
+       return ret;
+}
+
+static enum dma_status sun6i_dma_tx_status(struct dma_chan *chan,
+                                          dma_cookie_t cookie,
+                                          struct dma_tx_state *state)
+{
+       struct sun6i_vchan *vchan = to_sun6i_vchan(chan);
+       struct sun6i_pchan *pchan = vchan->phy;
+       struct sun6i_dma_lli *lli;
+       struct virt_dma_desc *vd;
+       struct sun6i_desc *txd;
+       enum dma_status ret;
+       unsigned long flags;
+       size_t bytes = 0;
+
+       ret = dma_cookie_status(chan, cookie, state);
+       if (ret == DMA_COMPLETE)
+               return ret;
+
+       spin_lock_irqsave(&vchan->vc.lock, flags);
+
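+       /*
+        * Three cases for the residue: the descriptor is still queued
+        * (sum the length of every LLI in its chain), nothing is in
+        * flight on a physical channel (zero), or the transfer is
+        * active (read the remaining byte count from the channel).
+        */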
+       vd = vchan_find_desc(&vchan->vc, cookie);
+
+       if (vd) {
+               txd = to_sun6i_desc(&vd->tx);
+               for (lli = txd->v_lli; lli != NULL; lli = lli->v_lli_next)
+                       bytes += lli->len;
+       } else if (!pchan || !pchan->desc) {
+               bytes = 0;
+       } else {
+               bytes = readl(pchan->base + DMA_CHAN_CUR_CNT);
+       }
+
+       spin_unlock_irqrestore(&vchan->vc.lock, flags);
+
+       dma_set_residue(state, bytes);
+
+       return ret;
+}
+
+static void sun6i_dma_issue_pending(struct dma_chan *chan)
+{
+       struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(chan->device);
+       struct sun6i_vchan *vchan = to_sun6i_vchan(chan);
+       unsigned long flags;
+
+       spin_lock_irqsave(&vchan->vc.lock, flags);
+
+       if (vchan_issue_pending(&vchan->vc)) {
+               spin_lock(&sdev->lock);
+
+               if (!vchan->phy && list_empty(&vchan->node)) {
+                       list_add_tail(&vchan->node, &sdev->pending);
+                       tasklet_schedule(&sdev->task);
+                       dev_dbg(chan2dev(chan), "vchan %p: issued\n",
+                               &vchan->vc);
+               }
+
+               spin_unlock(&sdev->lock);
+       } else {
+               dev_dbg(chan2dev(chan), "vchan %p: nothing to issue\n",
+                       &vchan->vc);
+       }
+
+       spin_unlock_irqrestore(&vchan->vc.lock, flags);
+}
+
+static int sun6i_dma_alloc_chan_resources(struct dma_chan *chan)
+{
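+       /*
+        * Descriptors are carved out of the device's dma_pool at prep
+        * time, so there is nothing to preallocate per channel.
+        */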
+       return 0;
+}
+
+static void sun6i_dma_free_chan_resources(struct dma_chan *chan)
+{
+       struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(chan->device);
+       struct sun6i_vchan *vchan = to_sun6i_vchan(chan);
+       unsigned long flags;
+
+       spin_lock_irqsave(&sdev->lock, flags);
+       list_del_init(&vchan->node);
+       spin_unlock_irqrestore(&sdev->lock, flags);
+
+       vchan_free_chan_resources(&vchan->vc);
+}
+
+static struct dma_chan *sun6i_dma_of_xlate(struct of_phandle_args *dma_spec,
+                                          struct of_dma *ofdma)
+{
+       struct sun6i_dma_dev *sdev = ofdma->of_dma_data;
+       struct sun6i_vchan *vchan;
+       struct dma_chan *chan;
+       u8 port = dma_spec->args[0];
+
+       if (port > NR_MAX_REQUESTS)
+               return NULL;
+
+       chan = dma_get_any_slave_channel(&sdev->slave);
+       if (!chan)
+               return NULL;
+
+       vchan = to_sun6i_vchan(chan);
+       vchan->port = port;
+
+       return chan;
+}
+
+static inline void sun6i_kill_tasklet(struct sun6i_dma_dev *sdev)
+{
+       /* Disable all interrupts from DMA */
+       writel(0, sdev->base + DMA_IRQ_EN(0));
+       writel(0, sdev->base + DMA_IRQ_EN(1));
+
+       /* Prevent spurious interrupts from scheduling the tasklet */
+       atomic_inc(&sdev->tasklet_shutdown);
+
+       /* Make sure we won't have any further interrupts */
+       devm_free_irq(sdev->slave.dev, sdev->irq, sdev);
+
+       /* Actually prevent the tasklet from being scheduled */
+       tasklet_kill(&sdev->task);
+}
+
+static inline void sun6i_dma_free(struct sun6i_dma_dev *sdev)
+{
+       int i;
+
+       for (i = 0; i < NR_MAX_VCHANS; i++) {
+               struct sun6i_vchan *vchan = &sdev->vchans[i];
+
+               list_del(&vchan->vc.chan.device_node);
+               tasklet_kill(&vchan->vc.task);
+       }
+}
+
+static int sun6i_dma_probe(struct platform_device *pdev)
+{
+       struct sun6i_dma_dev *sdc;
+       struct resource *res;
+       struct clk *mux, *pll6;
+       int ret, i;
+
+       sdc = devm_kzalloc(&pdev->dev, sizeof(*sdc), GFP_KERNEL);
+       if (!sdc)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       sdc->base = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(sdc->base))
+               return PTR_ERR(sdc->base);
+
+       sdc->irq = platform_get_irq(pdev, 0);
+       if (sdc->irq < 0) {
+               dev_err(&pdev->dev, "Cannot claim IRQ\n");
+               return sdc->irq;
+       }
+
+       sdc->clk = devm_clk_get(&pdev->dev, NULL);
+       if (IS_ERR(sdc->clk)) {
+               dev_err(&pdev->dev, "No clock specified\n");
+               return PTR_ERR(sdc->clk);
+       }
+
+       mux = clk_get(NULL, "ahb1_mux");
+       if (IS_ERR(mux)) {
+               dev_err(&pdev->dev, "Couldn't get AHB1 Mux\n");
+               return PTR_ERR(mux);
+       }
+
+       pll6 = clk_get(NULL, "pll6");
+       if (IS_ERR(pll6)) {
+               dev_err(&pdev->dev, "Couldn't get PLL6\n");
+               clk_put(mux);
+               return PTR_ERR(pll6);
+       }
+
+       ret = clk_set_parent(mux, pll6);
+       clk_put(pll6);
+       clk_put(mux);
+
+       if (ret) {
+               dev_err(&pdev->dev, "Couldn't reparent AHB1 on PLL6\n");
+               return ret;
+       }
+
+       sdc->rstc = devm_reset_control_get(&pdev->dev, NULL);
+       if (IS_ERR(sdc->rstc)) {
+               dev_err(&pdev->dev, "No reset controller specified\n");
+               return PTR_ERR(sdc->rstc);
+       }
+
+       sdc->pool = dmam_pool_create(dev_name(&pdev->dev), &pdev->dev,
+                                    sizeof(struct sun6i_dma_lli), 4, 0);
+       if (!sdc->pool) {
+               dev_err(&pdev->dev, "No memory for descriptors dma pool\n");
+               return -ENOMEM;
+       }
+
+       platform_set_drvdata(pdev, sdc);
+       INIT_LIST_HEAD(&sdc->pending);
+       spin_lock_init(&sdc->lock);
+
+       dma_cap_set(DMA_PRIVATE, sdc->slave.cap_mask);
+       dma_cap_set(DMA_MEMCPY, sdc->slave.cap_mask);
+       dma_cap_set(DMA_SLAVE, sdc->slave.cap_mask);
+
+       INIT_LIST_HEAD(&sdc->slave.channels);
+       sdc->slave.device_alloc_chan_resources  = sun6i_dma_alloc_chan_resources;
+       sdc->slave.device_free_chan_resources   = sun6i_dma_free_chan_resources;
+       sdc->slave.device_tx_status             = sun6i_dma_tx_status;
+       sdc->slave.device_issue_pending         = sun6i_dma_issue_pending;
+       sdc->slave.device_prep_slave_sg         = sun6i_dma_prep_slave_sg;
+       sdc->slave.device_prep_dma_memcpy       = sun6i_dma_prep_dma_memcpy;
+       sdc->slave.device_control               = sun6i_dma_control;
+       sdc->slave.chancnt                      = NR_MAX_VCHANS;
+
+       sdc->slave.dev = &pdev->dev;
+
+       sdc->pchans = devm_kcalloc(&pdev->dev, NR_MAX_CHANNELS,
+                                  sizeof(struct sun6i_pchan), GFP_KERNEL);
+       if (!sdc->pchans)
+               return -ENOMEM;
+
+       sdc->vchans = devm_kcalloc(&pdev->dev, NR_MAX_VCHANS,
+                                  sizeof(struct sun6i_vchan), GFP_KERNEL);
+       if (!sdc->vchans)
+               return -ENOMEM;
+
+       tasklet_init(&sdc->task, sun6i_dma_tasklet, (unsigned long)sdc);
+
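+       /* Per-channel register blocks: 0x40 bytes each, starting at 0x100. */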
+       for (i = 0; i < NR_MAX_CHANNELS; i++) {
+               struct sun6i_pchan *pchan = &sdc->pchans[i];
+
+               pchan->idx = i;
+               pchan->base = sdc->base + 0x100 + i * 0x40;
+       }
+
+       for (i = 0; i < NR_MAX_VCHANS; i++) {
+               struct sun6i_vchan *vchan = &sdc->vchans[i];
+
+               INIT_LIST_HEAD(&vchan->node);
+               vchan->vc.desc_free = sun6i_dma_free_desc;
+               vchan_init(&vchan->vc, &sdc->slave);
+       }
+
+       ret = reset_control_deassert(sdc->rstc);
+       if (ret) {
+               dev_err(&pdev->dev, "Couldn't deassert the device from reset\n");
+               goto err_chan_free;
+       }
+
+       ret = clk_prepare_enable(sdc->clk);
+       if (ret) {
+               dev_err(&pdev->dev, "Couldn't enable the clock\n");
+               goto err_reset_assert;
+       }
+
+       ret = devm_request_irq(&pdev->dev, sdc->irq, sun6i_dma_interrupt, 0,
+                              dev_name(&pdev->dev), sdc);
+       if (ret) {
+               dev_err(&pdev->dev, "Cannot request IRQ\n");
+               goto err_clk_disable;
+       }
+
+       ret = dma_async_device_register(&sdc->slave);
+       if (ret) {
+               dev_warn(&pdev->dev, "Failed to register DMA engine device\n");
+               goto err_irq_disable;
+       }
+
+       ret = of_dma_controller_register(pdev->dev.of_node, sun6i_dma_of_xlate,
+                                        sdc);
+       if (ret) {
+               dev_err(&pdev->dev, "of_dma_controller_register failed\n");
+               goto err_dma_unregister;
+       }
+
+       return 0;
+
+err_dma_unregister:
+       dma_async_device_unregister(&sdc->slave);
+err_irq_disable:
+       sun6i_kill_tasklet(sdc);
+err_clk_disable:
+       clk_disable_unprepare(sdc->clk);
+err_reset_assert:
+       reset_control_assert(sdc->rstc);
+err_chan_free:
+       sun6i_dma_free(sdc);
+       return ret;
+}
+
+static int sun6i_dma_remove(struct platform_device *pdev)
+{
+       struct sun6i_dma_dev *sdc = platform_get_drvdata(pdev);
+
+       of_dma_controller_free(pdev->dev.of_node);
+       dma_async_device_unregister(&sdc->slave);
+
+       sun6i_kill_tasklet(sdc);
+
+       clk_disable_unprepare(sdc->clk);
+       reset_control_assert(sdc->rstc);
+
+       sun6i_dma_free(sdc);
+
+       return 0;
+}
+
+static struct of_device_id sun6i_dma_match[] = {
+       { .compatible = "allwinner,sun6i-a31-dma" },
+       { /* sentinel */ }
+};
+
+static struct platform_driver sun6i_dma_driver = {
+       .probe          = sun6i_dma_probe,
+       .remove         = sun6i_dma_remove,
+       .driver = {
+               .name           = "sun6i-dma",
+               .of_match_table = sun6i_dma_match,
+       },
+};
+module_platform_driver(sun6i_dma_driver);
+
+MODULE_DESCRIPTION("Allwinner A31 DMA Controller Driver");
+MODULE_AUTHOR("Sugar <shuge@allwinnertech.com>");
+MODULE_AUTHOR("Maxime Ripard <maxime.ripard@free-electrons.com>");
+MODULE_LICENSE("GPL");
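
For orientation: a peripheral driver exercises the callbacks registered above through the generic dmaengine client API. A minimal consumer sketch (the device pointer, channel name "tx", scatterlist and FIFO address are hypothetical, not part of this patch):

        /* hypothetical slave-DMA client; error handling omitted */
        struct dma_chan *chan = dma_request_slave_channel(dev, "tx");
        struct dma_slave_config cfg = {
                .direction      = DMA_MEM_TO_DEV,
                .dst_addr       = fifo_phys,    /* hypothetical device FIFO */
                .dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES,
                .dst_maxburst   = 8,
        };
        struct dma_async_tx_descriptor *desc;

        dmaengine_slave_config(chan, &cfg);     /* sun6i_dma_control(DMA_SLAVE_CONFIG) */
        desc = dmaengine_prep_slave_sg(chan, sgl, nents,
                                       DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT);
        dmaengine_submit(desc);
        dma_async_issue_pending(chan);          /* sun6i_dma_issue_pending() */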
index 03ad64ecaaf043a4325dd6d7d325e676672a16b4..16efa603ff65da05a2cfae6cc0b1b34f4786b525 100644 (file)
@@ -1055,7 +1055,7 @@ static struct dma_async_tx_descriptor *tegra_dma_prep_slave_sg(
 static struct dma_async_tx_descriptor *tegra_dma_prep_dma_cyclic(
        struct dma_chan *dc, dma_addr_t buf_addr, size_t buf_len,
        size_t period_len, enum dma_transfer_direction direction,
-       unsigned long flags, void *context)
+       unsigned long flags)
 {
        struct tegra_dma_channel *tdc = to_tegra_dma_chan(dc);
        struct tegra_dma_desc *dma_desc = NULL;
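
The removed `void *context` parameter is part of a tree-wide cleanup of the cyclic prep callback in this merge window; clients are unaffected because they call the wrapper in include/linux/dmaengine.h, which after this series reads roughly:

        struct dma_async_tx_descriptor *desc =
                dmaengine_prep_dma_cyclic(chan, buf, buf_len, period_len,
                                          DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT);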
index 374b57fc596d87db7e9e252c2ca4fbe57cb423f6..a12c8552f6a6bbe74791865e84299bb74e5e52bc 100644 (file)
@@ -134,8 +134,7 @@ static void cell_edac_init_csrows(struct mem_ctl_info *mci)
        int                             j;
        u32                             nr_pages;
 
-       for (np = NULL;
-            (np = of_find_node_by_name(np, "memory")) != NULL;) {
+       for_each_node_by_name(np, "memory") {
                struct resource r;
 
                /* We "know" that the Cell firmware only creates one entry
index d3d0e8cf27b4beddcfbb19f70ea8b5f1f1d5c8bb..d6c767ace9166d1c0ee54105d69f7ecd551516e8 100644 (file)
@@ -382,6 +382,9 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr,
        if (err)
                return err;
 
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
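
The same `val > 255` guard recurs throughout the hwmon hunks below: `data->vrm` is a u8 in these drivers, so without the check an out-of-range write would be silently truncated instead of rejected. A sketch of the failure mode:

        unsigned long val = 256;        /* accepted by kstrtoul() */
        u8 vrm = val;                   /* silently wraps to 0 */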
index ca8430f925643725733014cf47f98f782b017aba..e67b9a50ac7cbcda9e820117940ab294f5eb2dea 100644 (file)
@@ -1085,6 +1085,9 @@ static ssize_t store_vrm_reg(struct device *dev, struct device_attribute *attr,
        if (err)
                return err;
 
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
index 22e0c926989dca8dfe2644418049d56ae99ef2d9..126516414c114f309161924e49d5dbef3d60d3bb 100644 (file)
@@ -212,6 +212,7 @@ static int ads1015_get_channels_config_of(struct i2c_client *client)
                                dev_err(&client->dev,
                                        "invalid gain on %s\n",
                                        node->full_name);
+                               return -EINVAL;
                        }
                }
 
@@ -222,6 +223,7 @@ static int ads1015_get_channels_config_of(struct i2c_client *client)
                                dev_err(&client->dev,
                                        "invalid data_rate on %s\n",
                                        node->full_name);
+                               return -EINVAL;
                        }
                }
 
index f96063680e584ff5f7cbb1d6dec8147a0592e186..272fcc837ecc0ad62e3b5dc3b40aefc4c8472bec 100644 (file)
@@ -510,6 +510,10 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr,
        err = kstrtoul(buf, 10, &val);
        if (err)
                return err;
+
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
index 4ae3fff13f4498dbef26281679cfc6b9dc0bfe48..bea0a344fab57b4f39b855d2997256b5d1e01b93 100644 (file)
@@ -247,8 +247,8 @@ struct dme1737_data {
        u8  pwm_acz[3];
        u8  pwm_freq[6];
        u8  pwm_rr[2];
-       u8  zone_low[3];
-       u8  zone_abs[3];
+       s8  zone_low[3];
+       s8  zone_abs[3];
        u8  zone_hyst[2];
        u32 alarms;
 };
@@ -277,7 +277,7 @@ static inline int IN_FROM_REG(int reg, int nominal, int res)
        return (reg * nominal + (3 << (res - 3))) / (3 << (res - 2));
 }
 
-static inline int IN_TO_REG(int val, int nominal)
+static inline int IN_TO_REG(long val, int nominal)
 {
        return clamp_val((val * 192 + nominal / 2) / nominal, 0, 255);
 }
@@ -293,7 +293,7 @@ static inline int TEMP_FROM_REG(int reg, int res)
        return (reg * 1000) >> (res - 8);
 }
 
-static inline int TEMP_TO_REG(int val)
+static inline int TEMP_TO_REG(long val)
 {
        return clamp_val((val < 0 ? val - 500 : val + 500) / 1000, -128, 127);
 }
@@ -308,7 +308,7 @@ static inline int TEMP_RANGE_FROM_REG(int reg)
        return TEMP_RANGE[(reg >> 4) & 0x0f];
 }
 
-static int TEMP_RANGE_TO_REG(int val, int reg)
+static int TEMP_RANGE_TO_REG(long val, int reg)
 {
        int i;
 
@@ -331,7 +331,7 @@ static inline int TEMP_HYST_FROM_REG(int reg, int ix)
        return (((ix == 1) ? reg : reg >> 4) & 0x0f) * 1000;
 }
 
-static inline int TEMP_HYST_TO_REG(int val, int ix, int reg)
+static inline int TEMP_HYST_TO_REG(long val, int ix, int reg)
 {
        int hyst = clamp_val((val + 500) / 1000, 0, 15);
 
@@ -347,7 +347,7 @@ static inline int FAN_FROM_REG(int reg, int tpc)
                return (reg == 0 || reg == 0xffff) ? 0 : 90000 * 60 / reg;
 }
 
-static inline int FAN_TO_REG(int val, int tpc)
+static inline int FAN_TO_REG(long val, int tpc)
 {
        if (tpc) {
                return clamp_val(val / tpc, 0, 0xffff);
@@ -379,7 +379,7 @@ static inline int FAN_TYPE_FROM_REG(int reg)
        return (edge > 0) ? 1 << (edge - 1) : 0;
 }
 
-static inline int FAN_TYPE_TO_REG(int val, int reg)
+static inline int FAN_TYPE_TO_REG(long val, int reg)
 {
        int edge = (val == 4) ? 3 : val;
 
@@ -402,7 +402,7 @@ static int FAN_MAX_FROM_REG(int reg)
        return 1000 + i * 500;
 }
 
-static int FAN_MAX_TO_REG(int val)
+static int FAN_MAX_TO_REG(long val)
 {
        int i;
 
@@ -460,7 +460,7 @@ static inline int PWM_ACZ_FROM_REG(int reg)
        return acz[(reg >> 5) & 0x07];
 }
 
-static inline int PWM_ACZ_TO_REG(int val, int reg)
+static inline int PWM_ACZ_TO_REG(long val, int reg)
 {
        int acz = (val == 4) ? 2 : val - 1;
 
@@ -476,7 +476,7 @@ static inline int PWM_FREQ_FROM_REG(int reg)
        return PWM_FREQ[reg & 0x0f];
 }
 
-static int PWM_FREQ_TO_REG(int val, int reg)
+static int PWM_FREQ_TO_REG(long val, int reg)
 {
        int i;
 
@@ -510,7 +510,7 @@ static inline int PWM_RR_FROM_REG(int reg, int ix)
        return (rr & 0x08) ? PWM_RR[rr & 0x07] : 0;
 }
 
-static int PWM_RR_TO_REG(int val, int ix, int reg)
+static int PWM_RR_TO_REG(long val, int ix, int reg)
 {
        int i;
 
@@ -528,7 +528,7 @@ static inline int PWM_RR_EN_FROM_REG(int reg, int ix)
        return PWM_RR_FROM_REG(reg, ix) ? 1 : 0;
 }
 
-static inline int PWM_RR_EN_TO_REG(int val, int ix, int reg)
+static inline int PWM_RR_EN_TO_REG(long val, int ix, int reg)
 {
        int en = (ix == 1) ? 0x80 : 0x08;
 
@@ -1481,13 +1481,16 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr,
                       const char *buf, size_t count)
 {
        struct dme1737_data *data = dev_get_drvdata(dev);
-       long val;
+       unsigned long val;
        int err;
 
-       err = kstrtol(buf, 10, &val);
+       err = kstrtoul(buf, 10, &val);
        if (err)
                return err;
 
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
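
The `int` to `long` parameter changes above share one rationale: the store callbacks parse user input with kstrtol() into a long, and passing that through an int parameter truncates it before clamp_val() can act. A sketch of the failure on a 64-bit build:

        long val = 5000000000L;                 /* as parsed by kstrtol() */
        int truncated = val;                    /* wraps to 705032704 */
        clamp_val(truncated, -128, 127);        /* clamps the wrapped value, not the input */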
index e87da902f3ae011c99a4622debf8784c5f57af0b..ada90716448d196f13cdb75251522f5877d08d99 100644 (file)
@@ -252,12 +252,12 @@ static ssize_t set_temp(struct device *dev, struct device_attribute *devattr,
        if (err < 0)
                return err;
 
-       val /= 1000;
+       val = DIV_ROUND_CLOSEST(val, 1000);
        reg = (sf == min) ? EMC6W201_REG_TEMP_LOW(nr)
                          : EMC6W201_REG_TEMP_HIGH(nr);
 
        mutex_lock(&data->update_lock);
-       data->temp[sf][nr] = clamp_val(val, -127, 128);
+       data->temp[sf][nr] = clamp_val(val, -127, 127);
        err = emc6w201_write8(client, reg, data->temp[sf][nr]);
        mutex_unlock(&data->update_lock);
 
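
Two fixes in one hunk: DIV_ROUND_CLOSEST() rounds to the nearest degree instead of truncating toward zero, and the upper clamp drops from 128 to 127 because the limit register is a signed 8-bit value, where 128 would wrap. For example:

        int r = DIV_ROUND_CLOSEST(1500, 1000);  /* 2, where 1500 / 1000 == 1 */
        s8 reg = clamp_val(128, -127, 128);     /* old bound: 128 stored in s8 is -128 */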
index 0e01c4e13e3350f586d2f85e469d372a4c1af636..7b73d2002d3ef19ed2f3d9a042b51c346d2f979c 100644 (file)
@@ -238,6 +238,9 @@ static int hih6130_probe(struct i2c_client *client,
        hih6130->client = client;
        mutex_init(&hih6130->lock);
 
+       if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_QUICK))
+               hih6130->write_length = 1;
+
        hwmon_dev = devm_hwmon_device_register_with_groups(dev, client->name,
                                                           hih6130,
                                                           hih6130_groups);
index ba1d83d480563a14c8bafae0c38df8bb6edd2740..a5e295826aeae10fdbc1e517445f953d644f8811 100644 (file)
@@ -617,6 +617,10 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr,
        err = kstrtoul(buf, 10, &val);
        if (err)
                return err;
+
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
index d2060e245ff589f206fbf57c12e62d0038cae0fa..cfaf70b9cba72951e670f16b8700dd4a5ca152cc 100644 (file)
@@ -74,12 +74,9 @@ static inline int TEMP_FROM_REG(s16 reg)
        return reg / 8 * 625 / 10;
 }
 
-static inline s16 TEMP_TO_REG(int val)
+static inline s16 TEMP_TO_REG(long val)
 {
-       if (val <= -60000)
-               return -60000 * 10 / 625 * 8;
-       if (val >= 160000)
-               return 160000 * 10 / 625 * 8;
+       val = clamp_val(val, -60000, 160000);
        return val * 10 / 625 * 8;
 }
 
@@ -206,10 +203,12 @@ static ssize_t set_temp_hyst(struct device *dev,
        if (err)
                return err;
 
+       val = clamp_val(val, -120000, 220000);
        mutex_lock(&data->update_lock);
-       data->temp[t_hyst] = TEMP_FROM_REG(data->temp[attr->index]) - val;
+       data->temp[t_hyst] =
+               TEMP_TO_REG(TEMP_FROM_REG(data->temp[attr->index]) - val);
        i2c_smbus_write_word_swapped(client, LM92_REG_TEMP_HYST,
-                                    TEMP_TO_REG(data->temp[t_hyst]));
+                                    data->temp[t_hyst]);
        mutex_unlock(&data->update_lock);
        return count;
 }
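
The hysteresis fix makes set_temp_hyst() cache the register encoding, which is what every other reader of data->temp[] expects, rather than a millidegree value. With the helpers above (0.0625 degree steps, left-shifted by 3) the round-trip works out as:

        TEMP_TO_REG(25000);     /* 25000 * 10 / 625 * 8 == 3200 */
        TEMP_FROM_REG(3200);    /* 3200 / 8 * 625 / 10 == 25000 */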
index 988181e4cfcdc8ab7e40f02ab2977b38ee231120..145f674c1d8722afecf3e125245eafd3b96c6e34 100644 (file)
@@ -615,6 +615,9 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr,
        if (err)
                return err;
 
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
index c74d2da389d960ad13b77e1a7c583381ca7efda2..e42964f07f67ddcb3cba6b8d091bb22d61488609 100644 (file)
@@ -131,13 +131,6 @@ static int tmp103_probe(struct i2c_client *client,
        struct regmap *regmap;
        int ret;
 
-       if (!i2c_check_functionality(client->adapter,
-                                    I2C_FUNC_SMBUS_BYTE_DATA)) {
-               dev_err(&client->dev,
-                       "adapter doesn't support SMBus byte transactions\n");
-               return -ENODEV;
-       }
-
        regmap = devm_regmap_init_i2c(client, &tmp103_regmap_config);
        if (IS_ERR(regmap)) {
                dev_err(dev, "failed to allocate register map\n");
index 344b22ec25533e0e39770d36aa0c87af29c815fd..3ea57c3504e24cf005c200bb5c093e1a80cef80b 100644 (file)
@@ -879,6 +879,9 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr,
        if (err)
                return err;
 
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
 
        return count;
index c1726be3654c156ede9bdb965bb04e1d2ad6c926..2f55973a8c4c37989750905d1fd2b209b0bbd902 100644 (file)
@@ -820,6 +820,9 @@ store_vrm_reg(struct device *dev, struct device_attribute *attr, const char *buf
        err = kstrtoul(buf, 10, &val);
        if (err)
                return err;
+
+       if (val > 255)
+               return -EINVAL;
        data->vrm = val;
 
        return count;
index cb3765fec98c079e121472830bd576e37e17fab9..001df856913feba93790bf9383b2023115defad3 100644 (file)
@@ -1181,6 +1181,9 @@ static ssize_t store_vrm_reg(struct device *dev,
        if (err)
                return err;
 
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
index 9d63d71214cade1cddad0b8bfbbbbae726c4dda5..816aa6caf5d553ef2538eb9ebdcddab40f96c588 100644 (file)
@@ -353,6 +353,9 @@ store_vrm(struct device *dev, struct device_attribute *attr,
        if (err)
                return err;
 
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
index 443d03fbac4705bd97f5acf77f622e0562b7fc55..8eeab72b93e2c4fcf6573e6a63d768f40f8e5320 100644 (file)
@@ -331,7 +331,7 @@ static int bch_allocator_thread(void *arg)
                                mutex_unlock(&ca->set->bucket_lock);
                                blkdev_issue_discard(ca->bdev,
                                        bucket_to_sector(ca->set, bucket),
-                                       ca->sb.block_size, GFP_KERNEL, 0);
+                                       ca->sb.bucket_size, GFP_KERNEL, 0);
                                mutex_lock(&ca->set->bucket_lock);
                        }
 
index d2ebcf3230942ab7872025c9fbf177ae515aded4..04f7bc28ef832b6dded6d10e810ddbfbfada4fca 100644 (file)
@@ -477,9 +477,13 @@ struct gc_stat {
  * CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
  * we'll continue to run normally for a while with CACHE_SET_STOPPING set (i.e.
  * flushing dirty data).
+ *
+ * CACHE_SET_RUNNING means all cache devices have been registered and journal
+ * replay is complete.
  */
 #define CACHE_SET_UNREGISTERING                0
 #define        CACHE_SET_STOPPING              1
+#define        CACHE_SET_RUNNING               2
 
 struct cache_set {
        struct closure          cl;
index 54541641530569c442f7113b687428fad4bb18d6..646fe85261c17bcfb43ff89b54586838a00c39e7 100644 (file)
@@ -1182,7 +1182,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
 {
        uint64_t start_time;
        bool used_mempool = false;
-       struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
+       struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT,
                                                     order);
        if (!out) {
                struct page *outp;
index 5f6728d5d4ddb0f406b0b7296025f412f193793b..ae964624efb248d59f9af530c44b68175ac4324e 100644 (file)
@@ -453,7 +453,7 @@ static inline bool bch_bkey_equal_header(const struct bkey *l,
 {
        return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
                KEY_PTRS(l) == KEY_PTRS(r) &&
-               KEY_CSUM(l) == KEY_CSUM(l));
+               KEY_CSUM(l) == KEY_CSUM(r));
 }
 
 /* Keylists */
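
The old condition compared KEY_CSUM(l) against itself, so it held for any r and keys with differing checksum fields were wrongly reported as having equal headers:

        KEY_CSUM(l) == KEY_CSUM(l)      /* tautology: true regardless of r */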
index 7347b61009615089e307fba6b5964fcb83ad2040..00cde40db57269bb173104ab3632b8a99e95c4f6 100644 (file)
@@ ... @@
 ({                                                                     \
        int _r, l = (b)->level - 1;                                     \
        bool _w = l <= (op)->lock;                                      \
-       struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
+       struct btree *_child = bch_btree_node_get((b)->c, op, key, l,   \
+                                                 _w, b);               \
        if (!IS_ERR(_child)) {                                          \
-               _child->parent = (b);                                   \
                _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);       \
                rw_unlock(_w, _child);                                  \
        } else                                                          \
@@ ... @@
                rw_lock(_w, _b, _b->level);                             \
                if (_b == (c)->root &&                                  \
                    _w == insert_lock(op, _b)) {                        \
-                       _b->parent = NULL;                              \
                        _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);   \
                }                                                       \
                rw_unlock(_w, _b);                                      \
@@ -202,7 +201,7 @@ void bch_btree_node_read_done(struct btree *b)
        struct bset *i = btree_bset_first(b);
        struct btree_iter *iter;
 
-       iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
+       iter = mempool_alloc(b->c->fill_iter, GFP_NOIO);
        iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
        iter->used = 0;
 
@@ -421,7 +420,7 @@ static void do_btree_node_write(struct btree *b)
        SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
                       bset_sector_offset(&b->keys, i));
 
-       if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
+       if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
                int j;
                struct bio_vec *bv;
                void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -967,7 +966,8 @@ err:
  * level and op->lock.
  */
 struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
-                                struct bkey *k, int level, bool write)
+                                struct bkey *k, int level, bool write,
+                                struct btree *parent)
 {
        int i = 0;
        struct btree *b;
@@ -1002,6 +1002,7 @@ retry:
                BUG_ON(b->level != level);
        }
 
+       b->parent = parent;
        b->accessed = 1;
 
        for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
@@ -1022,15 +1023,16 @@ retry:
        return b;
 }
 
-static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
+static void btree_node_prefetch(struct btree *parent, struct bkey *k)
 {
        struct btree *b;
 
-       mutex_lock(&c->bucket_lock);
-       b = mca_alloc(c, NULL, k, level);
-       mutex_unlock(&c->bucket_lock);
+       mutex_lock(&parent->c->bucket_lock);
+       b = mca_alloc(parent->c, NULL, k, parent->level - 1);
+       mutex_unlock(&parent->c->bucket_lock);
 
        if (!IS_ERR_OR_NULL(b)) {
+               b->parent = parent;
                bch_btree_node_read(b);
                rw_unlock(true, b);
        }
@@ -1060,15 +1062,16 @@ static void btree_node_free(struct btree *b)
        mutex_unlock(&b->c->bucket_lock);
 }
 
-struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
-                                  int level)
+struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+                                    int level, bool wait,
+                                    struct btree *parent)
 {
        BKEY_PADDED(key) k;
        struct btree *b = ERR_PTR(-EAGAIN);
 
        mutex_lock(&c->bucket_lock);
 retry:
-       if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL))
+       if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
                goto err;
 
        bkey_put(c, &k.key);
@@ -1085,6 +1088,7 @@ retry:
        }
 
        b->accessed = 1;
+       b->parent = parent;
        bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
 
        mutex_unlock(&c->bucket_lock);
@@ -1096,14 +1100,21 @@ err_free:
 err:
        mutex_unlock(&c->bucket_lock);
 
-       trace_bcache_btree_node_alloc_fail(b);
+       trace_bcache_btree_node_alloc_fail(c);
        return b;
 }
 
+static struct btree *bch_btree_node_alloc(struct cache_set *c,
+                                         struct btree_op *op, int level,
+                                         struct btree *parent)
+{
+       return __bch_btree_node_alloc(c, op, level, op != NULL, parent);
+}
+
 static struct btree *btree_node_alloc_replacement(struct btree *b,
                                                  struct btree_op *op)
 {
-       struct btree *n = bch_btree_node_alloc(b->c, op, b->level);
+       struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
        if (!IS_ERR_OR_NULL(n)) {
                mutex_lock(&n->write_lock);
                bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
@@ -1403,6 +1414,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
        BUG_ON(btree_bset_first(new_nodes[0])->keys);
        btree_node_free(new_nodes[0]);
        rw_unlock(true, new_nodes[0]);
+       new_nodes[0] = NULL;
 
        for (i = 0; i < nodes; i++) {
                if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
@@ -1516,7 +1528,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
                k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
                if (k) {
                        r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
-                                                 true);
+                                                 true, b);
                        if (IS_ERR(r->b)) {
                                ret = PTR_ERR(r->b);
                                break;
@@ -1811,7 +1823,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
                        k = bch_btree_iter_next_filter(&iter, &b->keys,
                                                       bch_ptr_bad);
                        if (k)
-                               btree_node_prefetch(b->c, k, b->level - 1);
+                               btree_node_prefetch(b, k);
 
                        if (p)
                                ret = btree(check_recurse, p, b, op);
@@ -1976,12 +1988,12 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
                trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
 
-               n2 = bch_btree_node_alloc(b->c, op, b->level);
+               n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent);
                if (IS_ERR(n2))
                        goto err_free1;
 
                if (!b->parent) {
-                       n3 = bch_btree_node_alloc(b->c, op, b->level + 1);
+                       n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL);
                        if (IS_ERR(n3))
                                goto err_free2;
                }
index 91dfa5e696857ded36b3de7b66a8f862d9540cea..5c391fa01bedbfba3f1dea062605460ccadc1c6a 100644 (file)
@@ -242,9 +242,10 @@ void __bch_btree_node_write(struct btree *, struct closure *);
 void bch_btree_node_write(struct btree *, struct closure *);
 
 void bch_btree_set_root(struct btree *);
-struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int);
+struct btree *__bch_btree_node_alloc(struct cache_set *, struct btree_op *,
+                                    int, bool, struct btree *);
 struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
-                                struct bkey *, int, bool);
+                                struct bkey *, int, bool, struct btree *);
 
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
                               struct bkey *);
index 3a0de4cf9771031e9d4707fc49f356aee48b50e7..243de0bf15cdbf359965dff55cac4762b804362f 100644 (file)
@@ -474,9 +474,8 @@ out:
        return false;
 }
 
-static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
+bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k)
 {
-       struct btree *b = container_of(bk, struct btree, keys);
        char buf[80];
 
        if (!KEY_SIZE(k))
@@ -485,16 +484,22 @@ static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
        if (KEY_SIZE(k) > KEY_OFFSET(k))
                goto bad;
 
-       if (__ptr_invalid(b->c, k))
+       if (__ptr_invalid(c, k))
                goto bad;
 
        return false;
 bad:
        bch_extent_to_text(buf, sizeof(buf), k);
-       cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
+       cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
        return true;
 }
 
+static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
+{
+       struct btree *b = container_of(bk, struct btree, keys);
+       return __bch_extent_invalid(b->c, k);
+}
+
 static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
                                     unsigned ptr)
 {
index e4e23409782d1957baacc8022e525d65aba7f094..e2ed54054e7a9106b50d0d2a3f6eb7967ce4e494 100644 (file)
@@ -9,5 +9,6 @@ struct cache_set;
 
 void bch_extent_to_text(char *, size_t, const struct bkey *);
 bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
+bool __bch_extent_invalid(struct cache_set *, const struct bkey *);
 
 #endif /* _BCACHE_EXTENTS_H */
index 59e82021b5bb320d9c03606e35e0d5347417f1ff..fe080ad0e55841e5c95bfcb3dcf3a0f1a703b76c 100644 (file)
@@ -7,6 +7,7 @@
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
+#include "extents.h"
 
 #include <trace/events/bcache.h>
 
@@ -189,11 +190,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
                        if (read_bucket(l))
                                goto bsearch;
 
-               if (list_empty(list))
+               /* no journal entries on this device? */
+               if (l == ca->sb.njournal_buckets)
                        continue;
 bsearch:
+               BUG_ON(list_empty(list));
+
                /* Binary search */
-               m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
+               m = l;
+               r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
                pr_debug("starting binary search, l %u r %u", l, r);
 
                while (l + 1 < r) {
@@ -291,15 +296,16 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
 
                for (k = i->j.start;
                     k < bset_bkey_last(&i->j);
-                    k = bkey_next(k)) {
-                       unsigned j;
+                    k = bkey_next(k))
+                       if (!__bch_extent_invalid(c, k)) {
+                               unsigned j;
 
-                       for (j = 0; j < KEY_PTRS(k); j++)
-                               if (ptr_available(c, k, j))
-                                       atomic_inc(&PTR_BUCKET(c, k, j)->pin);
+                               for (j = 0; j < KEY_PTRS(k); j++)
+                                       if (ptr_available(c, k, j))
+                                               atomic_inc(&PTR_BUCKET(c, k, j)->pin);
 
-                       bch_initial_mark_key(c, 0, k);
-               }
+                               bch_initial_mark_key(c, 0, k);
+                       }
        }
 }
 
index 15fff4f68a7ce75f441a1e429d961eac2d2b0d6e..62e6e98186b5cd536d75a9bec14a2c374102abbe 100644 (file)
@@ -311,7 +311,8 @@ void bch_data_insert(struct closure *cl)
 {
        struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 
-       trace_bcache_write(op->bio, op->writeback, op->bypass);
+       trace_bcache_write(op->c, op->inode, op->bio,
+                          op->writeback, op->bypass);
 
        bch_keylist_init(&op->insert_keys);
        bio_get(op->bio);
index 926ded8ccbf58c39788a471dffb4ed7f876028bc..d4713d098a397c2f1b124f9fdd240d9907fa16d6 100644 (file)
@@ -733,8 +733,6 @@ static void bcache_device_detach(struct bcache_device *d)
 static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
                                 unsigned id)
 {
-       BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags));
-
        d->id = id;
        d->c = c;
        c->devices[id] = d;
@@ -927,6 +925,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
        list_move(&dc->list, &uncached_devices);
 
        clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
+       clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
 
        mutex_unlock(&bch_register_lock);
 
@@ -1041,6 +1040,9 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
         */
        atomic_set(&dc->count, 1);
 
+       if (bch_cached_dev_writeback_start(dc))
+               return -ENOMEM;
+
        if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
                bch_sectors_dirty_init(dc);
                atomic_set(&dc->has_dirty, 1);
@@ -1070,7 +1072,8 @@ static void cached_dev_free(struct closure *cl)
        struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
 
        cancel_delayed_work_sync(&dc->writeback_rate_update);
-       kthread_stop(dc->writeback_thread);
+       if (!IS_ERR_OR_NULL(dc->writeback_thread))
+               kthread_stop(dc->writeback_thread);
 
        mutex_lock(&bch_register_lock);
 
@@ -1081,12 +1084,8 @@ static void cached_dev_free(struct closure *cl)
 
        mutex_unlock(&bch_register_lock);
 
-       if (!IS_ERR_OR_NULL(dc->bdev)) {
-               if (dc->bdev->bd_disk)
-                       blk_sync_queue(bdev_get_queue(dc->bdev));
-
+       if (!IS_ERR_OR_NULL(dc->bdev))
                blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-       }
 
        wake_up(&unregister_wait);
 
@@ -1213,7 +1212,9 @@ void bch_flash_dev_release(struct kobject *kobj)
 static void flash_dev_free(struct closure *cl)
 {
        struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+       mutex_lock(&bch_register_lock);
        bcache_device_free(d);
+       mutex_unlock(&bch_register_lock);
        kobject_put(&d->kobj);
 }
 
@@ -1221,7 +1222,9 @@ static void flash_dev_flush(struct closure *cl)
 {
        struct bcache_device *d = container_of(cl, struct bcache_device, cl);
 
+       mutex_lock(&bch_register_lock);
        bcache_device_unlink(d);
+       mutex_unlock(&bch_register_lock);
        kobject_del(&d->kobj);
        continue_at(cl, flash_dev_free, system_wq);
 }
@@ -1277,6 +1280,9 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
        if (test_bit(CACHE_SET_STOPPING, &c->flags))
                return -EINTR;
 
+       if (!test_bit(CACHE_SET_RUNNING, &c->flags))
+               return -EPERM;
+
        u = uuid_find_empty(c);
        if (!u) {
                pr_err("Can't create volume, no room for UUID");
@@ -1346,8 +1352,11 @@ static void cache_set_free(struct closure *cl)
        bch_journal_free(c);
 
        for_each_cache(ca, c, i)
-               if (ca)
+               if (ca) {
+                       ca->set = NULL;
+                       c->cache[ca->sb.nr_this_dev] = NULL;
                        kobject_put(&ca->kobj);
+               }
 
        bch_bset_sort_state_free(&c->sort);
        free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
@@ -1405,9 +1414,11 @@ static void cache_set_flush(struct closure *cl)
                if (ca->alloc_thread)
                        kthread_stop(ca->alloc_thread);
 
-       cancel_delayed_work_sync(&c->journal.work);
-       /* flush last journal entry if needed */
-       c->journal.work.work.func(&c->journal.work.work);
+       if (c->journal.cur) {
+               cancel_delayed_work_sync(&c->journal.work);
+               /* flush last journal entry if needed */
+               c->journal.work.work.func(&c->journal.work.work);
+       }
 
        closure_return(cl);
 }
@@ -1586,7 +1597,7 @@ static void run_cache_set(struct cache_set *c)
                        goto err;
 
                err = "error reading btree root";
-               c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true);
+               c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL);
                if (IS_ERR_OR_NULL(c->root))
                        goto err;
 
@@ -1661,7 +1672,7 @@ static void run_cache_set(struct cache_set *c)
                        goto err;
 
                err = "cannot allocate new btree root";
-               c->root = bch_btree_node_alloc(c, NULL, 0);
+               c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
                if (IS_ERR_OR_NULL(c->root))
                        goto err;
 
@@ -1697,6 +1708,7 @@ static void run_cache_set(struct cache_set *c)
 
        flash_devs_run(c);
 
+       set_bit(CACHE_SET_RUNNING, &c->flags);
        return;
 err:
        closure_sync(&cl);
@@ -1760,6 +1772,7 @@ found:
                pr_debug("set version = %llu", c->sb.version);
        }
 
+       kobject_get(&ca->kobj);
        ca->set = c;
        ca->set->cache[ca->sb.nr_this_dev] = ca;
        c->cache_by_alloc[c->caches_loaded++] = ca;
@@ -1780,8 +1793,10 @@ void bch_cache_release(struct kobject *kobj)
        struct cache *ca = container_of(kobj, struct cache, kobj);
        unsigned i;
 
-       if (ca->set)
+       if (ca->set) {
+               BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
                ca->set->cache[ca->sb.nr_this_dev] = NULL;
+       }
 
        bio_split_pool_free(&ca->bio_split_hook);
 
@@ -1798,10 +1813,8 @@ void bch_cache_release(struct kobject *kobj)
        if (ca->sb_bio.bi_inline_vecs[0].bv_page)
                put_page(ca->sb_bio.bi_io_vec[0].bv_page);
 
-       if (!IS_ERR_OR_NULL(ca->bdev)) {
-               blk_sync_queue(bdev_get_queue(ca->bdev));
+       if (!IS_ERR_OR_NULL(ca->bdev))
                blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-       }
 
        kfree(ca);
        module_put(THIS_MODULE);
@@ -1844,7 +1857,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
 }
 
 static void register_cache(struct cache_sb *sb, struct page *sb_page,
-                                 struct block_device *bdev, struct cache *ca)
+                               struct block_device *bdev, struct cache *ca)
 {
        char name[BDEVNAME_SIZE];
        const char *err = "cannot allocate memory";
@@ -1877,10 +1890,12 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page,
                goto err;
 
        pr_info("registered cache device %s", bdevname(bdev, name));
+out:
+       kobject_put(&ca->kobj);
        return;
 err:
        pr_notice("error opening %s: %s", bdevname(bdev, name), err);
-       kobject_put(&ca->kobj);
+       goto out;
 }
 
 /* Global interfaces/init */
@@ -1945,10 +1960,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
        if (IS_ERR(bdev)) {
                if (bdev == ERR_PTR(-EBUSY)) {
                        bdev = lookup_bdev(strim(path));
+                       mutex_lock(&bch_register_lock);
                        if (!IS_ERR(bdev) && bch_is_open(bdev))
                                err = "device already registered";
                        else
                                err = "device busy";
+                       mutex_unlock(&bch_register_lock);
                }
                goto err;
        }
index ac7d0d1f70d7be9ae818a51c6d77c461d770eb4b..98df7572b5f7f82b9091965e199a301159374bee 100644 (file)
@@ -416,8 +416,8 @@ do {                                                                        \
                          average_frequency,    frequency_units);       \
        __print_time_stat(stats, name,                                  \
                          average_duration,     duration_units);        \
-       __print_time_stat(stats, name,                                  \
-                         max_duration,         duration_units);        \
+       sysfs_print(name ## _ ## max_duration ## _ ## duration_units,   \
+                       div_u64((stats)->max_duration, NSEC_PER_ ## duration_units));\
                                                                        \
        sysfs_print(name ## _last_ ## frequency_units, (stats)->last    \
                    ? div_s64(local_clock() - (stats)->last,            \
index f4300e4c0114a0cc1abc3b90f757a03666d2637b..f1986bcd1bf05e1058e26946c600a6e2c1f5991d 100644 (file)
@@ -239,7 +239,7 @@ static void read_dirty(struct cached_dev *dc)
                if (KEY_START(&w->key) != dc->last_read ||
                    jiffies_to_msecs(delay) > 50)
                        while (!kthread_should_stop() && delay)
-                               delay = schedule_timeout_uninterruptible(delay);
+                               delay = schedule_timeout_interruptible(delay);
 
                dc->last_read   = KEY_OFFSET(&w->key);
 
@@ -436,7 +436,7 @@ static int bch_writeback_thread(void *arg)
                        while (delay &&
                               !kthread_should_stop() &&
                               !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
-                               delay = schedule_timeout_uninterruptible(delay);
+                               delay = schedule_timeout_interruptible(delay);
                }
        }
 
@@ -478,7 +478,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
        dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
 }
 
-int bch_cached_dev_writeback_init(struct cached_dev *dc)
+void bch_cached_dev_writeback_init(struct cached_dev *dc)
 {
        sema_init(&dc->in_flight, 64);
        init_rwsem(&dc->writeback_lock);
@@ -494,14 +494,20 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc)
        dc->writeback_rate_d_term       = 30;
        dc->writeback_rate_p_term_inverse = 6000;
 
+       INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
+}
+
+int bch_cached_dev_writeback_start(struct cached_dev *dc)
+{
        dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
                                              "bcache_writeback");
        if (IS_ERR(dc->writeback_thread))
                return PTR_ERR(dc->writeback_thread);
 
-       INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
        schedule_delayed_work(&dc->writeback_rate_update,
                              dc->writeback_rate_update_seconds * HZ);
 
+       bch_writeback_queue(dc);
+
        return 0;
 }
index e2f8598937ac41ff5c7577bc5e65aeb39de95386..0a9dab187b79c7ef0a4429c4616a6985d320b964 100644 (file)
@@ -85,6 +85,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
 void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
 
 void bch_sectors_dirty_init(struct cached_dev *dc);
-int bch_cached_dev_writeback_init(struct cached_dev *);
+void bch_cached_dev_writeback_init(struct cached_dev *);
+int bch_cached_dev_writeback_start(struct cached_dev *);
 
 #endif
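
Splitting init from start means the rate-control fields and the delayed work are always initialised when the cached device is created, while the writeback kthread is only spawned once the device attaches to a cache set (see the bch_cached_dev_attach() hunk above). The resulting call order, sketched:

        bch_cached_dev_writeback_init(dc);      /* at creation: no kthread yet */

        /* later, from bch_cached_dev_attach(): */
        if (bch_cached_dev_writeback_start(dc)) /* spawn kthread, start rate   */
                return -ENOMEM;                 /* updates, kick writeback     */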
index d2899e7eb3aaf317a93d91978936dac5e3c7f132..06709257adde39e84c3ddd2aa0def729b974ba73 100644 (file)
@@ -330,7 +330,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
        disk_super->discard_root = cpu_to_le64(cmd->discard_root);
        disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
        disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
-       disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+       disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE);
        disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
        disk_super->cache_blocks = cpu_to_le32(0);
 
@@ -478,7 +478,7 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
                                            bool may_format_device)
 {
        int r;
-       cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE,
+       cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
                                          CACHE_METADATA_CACHE_SIZE,
                                          CACHE_MAX_CONCURRENT_LOCKS);
        if (IS_ERR(cmd->bm)) {
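
The unit juggling is consistent if DM_CACHE_METADATA_BLOCK_SIZE is now expressed in 512-byte sectors (the dm-cache-metadata.h hunk below redefines it as DM_SM_METADATA_BLOCK_SIZE, assumed here to be 4096 >> SECTOR_SHIFT): the superblock field stores sectors, while dm_block_manager_create() takes bytes:

        /* with SECTOR_SHIFT == 9 */
        4096 >> SECTOR_SHIFT;   /*  8 sectors, as stored in the superblock */
        8 << SECTOR_SHIFT;      /* 4096 bytes, as the block manager wants  */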
index cd70a78623a336956a6366a4de1b2a912821dd9a..7383c90ccdb809e72d904e4a14a642a3d34e1ed1 100644 (file)
@@ -9,19 +9,17 @@
 
 #include "dm-cache-block-types.h"
 #include "dm-cache-policy-internal.h"
+#include "persistent-data/dm-space-map-metadata.h"
 
 /*----------------------------------------------------------------*/
 
-#define DM_CACHE_METADATA_BLOCK_SIZE 4096
+#define DM_CACHE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
 
 /* FIXME: remove this restriction */
 /*
  * The metadata device is currently limited in size.
- *
- * We have one block of index, which can hold 255 index entries.  Each
- * index entry contains allocation info about 16k metadata blocks.
  */
-#define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
+#define DM_CACHE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
 
 /*
  * A metadata device larger than 16GB triggers a warning.
index 2c63326638b6d4d54af4499643ac10dd9d8ee33b..1af40ee209e2b9c0d46b9873f3bc32fd78234cd7 100644 (file)
@@ -718,6 +718,22 @@ static int bio_triggers_commit(struct cache *cache, struct bio *bio)
        return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
 }
 
+/*
+ * You must increment the deferred set whilst the prison cell is held.  To
+ * encourage this, we ask for 'cell' to be passed in.
+ */
+static void inc_ds(struct cache *cache, struct bio *bio,
+                  struct dm_bio_prison_cell *cell)
+{
+       size_t pb_data_size = get_per_bio_data_size(cache);
+       struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+       BUG_ON(!cell);
+       BUG_ON(pb->all_io_entry);
+
+       pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+}
+
 static void issue(struct cache *cache, struct bio *bio)
 {
        unsigned long flags;
@@ -737,6 +753,12 @@ static void issue(struct cache *cache, struct bio *bio)
        spin_unlock_irqrestore(&cache->lock, flags);
 }
 
+static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
+{
+       inc_ds(cache, bio, cell);
+       issue(cache, bio);
+}
+
 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
 {
        unsigned long flags;
@@ -1015,6 +1037,11 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
 
        dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
        remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
+
+       /*
+        * No need to inc_ds() here, since the cell will be held for the
+        * duration of the io.
+        */
        generic_make_request(bio);
 }
 
@@ -1115,8 +1142,7 @@ static void check_for_quiesced_migrations(struct cache *cache,
                return;
 
        INIT_LIST_HEAD(&work);
-       if (pb->all_io_entry)
-               dm_deferred_entry_dec(pb->all_io_entry, &work);
+       dm_deferred_entry_dec(pb->all_io_entry, &work);
 
        if (!list_empty(&work))
                queue_quiesced_migrations(cache, &work);
@@ -1252,6 +1278,11 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
        else
                remap_to_cache(cache, bio, 0);
 
+       /*
+        * REQ_FLUSH is not directed at any particular block so we don't
+        * need to inc_ds().  REQ_FUA's are split into a write + REQ_FLUSH
+        * by dm-core.
+        */
        issue(cache, bio);
 }
 
@@ -1301,15 +1332,6 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
                   &cache->stats.read_miss : &cache->stats.write_miss);
 }
 
-static void issue_cache_bio(struct cache *cache, struct bio *bio,
-                           struct per_bio_data *pb,
-                           dm_oblock_t oblock, dm_cblock_t cblock)
-{
-       pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-       remap_to_cache_dirty(cache, bio, oblock, cblock);
-       issue(cache, bio);
-}
-
 static void process_bio(struct cache *cache, struct prealloc *structs,
                        struct bio *bio)
 {
@@ -1318,8 +1340,6 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
        dm_oblock_t block = get_bio_block(cache, bio);
        struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
        struct policy_result lookup_result;
-       size_t pb_data_size = get_per_bio_data_size(cache);
-       struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
        bool discarded_block = is_discarded_oblock(cache, block);
        bool passthrough = passthrough_mode(&cache->features);
        bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
@@ -1359,9 +1379,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 
                        } else {
                                /* FIXME: factor out issue_origin() */
-                               pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
                                remap_to_origin_clear_discard(cache, bio, block);
-                               issue(cache, bio);
+                               inc_and_issue(cache, bio, new_ocell);
                        }
                } else {
                        inc_hit_counter(cache, bio);
@@ -1369,20 +1388,21 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
                        if (bio_data_dir(bio) == WRITE &&
                            writethrough_mode(&cache->features) &&
                            !is_dirty(cache, lookup_result.cblock)) {
-                               pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
                                remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
-                               issue(cache, bio);
-                       } else
-                               issue_cache_bio(cache, bio, pb, block, lookup_result.cblock);
+                               inc_and_issue(cache, bio, new_ocell);
+
+                       } else {
+                               remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+                               inc_and_issue(cache, bio, new_ocell);
+                       }
                }
 
                break;
 
        case POLICY_MISS:
                inc_miss_counter(cache, bio);
-               pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
                remap_to_origin_clear_discard(cache, bio, block);
-               issue(cache, bio);
+               inc_and_issue(cache, bio, new_ocell);
                break;
 
        case POLICY_NEW:
@@ -1501,6 +1521,9 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
        bio_list_init(&cache->deferred_flush_bios);
        spin_unlock_irqrestore(&cache->lock, flags);
 
+       /*
+        * These bios have already been through inc_ds()
+        */
        while ((bio = bio_list_pop(&bios)))
                submit_bios ? generic_make_request(bio) : bio_io_error(bio);
 }
@@ -1518,6 +1541,9 @@ static void process_deferred_writethrough_bios(struct cache *cache)
        bio_list_init(&cache->deferred_writethrough_bios);
        spin_unlock_irqrestore(&cache->lock, flags);
 
+       /*
+        * These bios have already been through inc_ds()
+        */
        while ((bio = bio_list_pop(&bios)))
                generic_make_request(bio);
 }
@@ -1694,6 +1720,7 @@ static void do_worker(struct work_struct *ws)
 
                if (commit_if_needed(cache)) {
                        process_deferred_flush_bios(cache, false);
+                       process_migrations(cache, &cache->need_commit_migrations, migration_failure);
 
                        /*
                         * FIXME: rollback metadata or just go into a
@@ -2406,16 +2433,13 @@ out:
        return r;
 }
 
-static int cache_map(struct dm_target *ti, struct bio *bio)
+static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
 {
-       struct cache *cache = ti->private;
-
        int r;
        dm_oblock_t block = get_bio_block(cache, bio);
        size_t pb_data_size = get_per_bio_data_size(cache);
        bool can_migrate = false;
        bool discarded_block;
-       struct dm_bio_prison_cell *cell;
        struct policy_result lookup_result;
        struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
 
@@ -2437,15 +2461,15 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
        /*
         * Check to see if that block is currently migrating.
         */
-       cell = alloc_prison_cell(cache);
-       if (!cell) {
+       *cell = alloc_prison_cell(cache);
+       if (!*cell) {
                defer_bio(cache, bio);
                return DM_MAPIO_SUBMITTED;
        }
 
-       r = bio_detain(cache, block, bio, cell,
+       r = bio_detain(cache, block, bio, *cell,
                       (cell_free_fn) free_prison_cell,
-                      cache, &cell);
+                      cache, cell);
        if (r) {
                if (r < 0)
                        defer_bio(cache, bio);
@@ -2458,11 +2482,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
        r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
                       bio, &lookup_result);
        if (r == -EWOULDBLOCK) {
-               cell_defer(cache, cell, true);
+               cell_defer(cache, *cell, true);
                return DM_MAPIO_SUBMITTED;
 
        } else if (r) {
                DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
+               cell_defer(cache, *cell, false);
                bio_io_error(bio);
                return DM_MAPIO_SUBMITTED;
        }
@@ -2476,52 +2501,44 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
                                 * We need to invalidate this block, so
                                 * defer for the worker thread.
                                 */
-                               cell_defer(cache, cell, true);
+                               cell_defer(cache, *cell, true);
                                r = DM_MAPIO_SUBMITTED;
 
                        } else {
-                               pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
                                inc_miss_counter(cache, bio);
                                remap_to_origin_clear_discard(cache, bio, block);
-
-                               cell_defer(cache, cell, false);
                        }
 
                } else {
                        inc_hit_counter(cache, bio);
-                       pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-
                        if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
                            !is_dirty(cache, lookup_result.cblock))
                                remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
                        else
                                remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
-
-                       cell_defer(cache, cell, false);
                }
                break;
 
        case POLICY_MISS:
                inc_miss_counter(cache, bio);
-               pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-
                if (pb->req_nr != 0) {
                        /*
                         * This is a duplicate writethrough io that is no
                         * longer needed because the block has been demoted.
                         */
                        bio_endio(bio, 0);
-                       cell_defer(cache, cell, false);
-                       return DM_MAPIO_SUBMITTED;
-               } else {
+                       cell_defer(cache, *cell, false);
+                       r = DM_MAPIO_SUBMITTED;
+
+               } else
                        remap_to_origin_clear_discard(cache, bio, block);
-                       cell_defer(cache, cell, false);
-               }
+
                break;
 
        default:
                DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
                            (unsigned) lookup_result.op);
+               cell_defer(cache, *cell, false);
                bio_io_error(bio);
                r = DM_MAPIO_SUBMITTED;
        }
@@ -2529,6 +2546,21 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
        return r;
 }
 
+static int cache_map(struct dm_target *ti, struct bio *bio)
+{
+       int r;
+       struct dm_bio_prison_cell *cell;
+       struct cache *cache = ti->private;
+
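+       /*
+        * Bump the deferred set and release the cell here, once
+        * __cache_map() has decided the bio really is being remapped.
+        */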
+       r = __cache_map(cache, bio, &cell);
+       if (r == DM_MAPIO_REMAPPED) {
+               inc_ds(cache, bio, cell);
+               cell_defer(cache, cell, false);
+       }
+
+       return r;
+}
+
 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
 {
        struct cache *cache = ti->private;
@@ -2808,7 +2840,7 @@ static void cache_status(struct dm_target *ti, status_type_t type,
                residency = policy_residency(cache->policy);
 
                DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
-                      (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
+                      (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
                       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
                       (unsigned long long)nr_blocks_metadata,
                       cache->sectors_per_block,
@@ -3062,7 +3094,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
         */
        if (io_opt_sectors < cache->sectors_per_block ||
            do_div(io_opt_sectors, cache->sectors_per_block)) {
-               blk_limits_io_min(limits, 0);
+               blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
                blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
        }
        set_discard_limits(cache, limits);
@@ -3072,7 +3104,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type cache_target = {
        .name = "cache",
-       .version = {1, 4, 0},
+       .version = {1, 5, 0},
        .module = THIS_MODULE,
        .ctr = cache_ctr,
        .dtr = cache_dtr,
index 4cba2d808afb451109cfbf4602790f4011eae191..2785007e0e462597d3b8e74839a8bfe25aa90b9c 100644 (file)
@@ -59,7 +59,7 @@ struct dm_crypt_io {
        int error;
        sector_t sector;
        struct dm_crypt_io *base_io;
-};
+} CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
        struct convert_context *ctx;
@@ -162,6 +162,8 @@ struct crypt_config {
         */
        unsigned int dmreq_start;
 
+       unsigned int per_bio_data_size;
+
        unsigned long flags;
        unsigned int key_size;
        unsigned int key_parts;      /* independent parts in key buffer */
@@ -895,6 +897,15 @@ static void crypt_alloc_req(struct crypt_config *cc,
            kcryptd_async_done, dmreq_of_req(cc, ctx->req));
 }
 
+static void crypt_free_req(struct crypt_config *cc,
+                          struct ablkcipher_request *req, struct bio *base_bio)
+{
+       struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
+
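+       /*
+        * The request embedded directly after the dm_crypt_io in the
+        * per-bio data was never taken from the mempool, so only free
+        * requests that were.
+        */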
+       if ((struct ablkcipher_request *)(io + 1) != req)
+               mempool_free(req, cc->req_pool);
+}
+
 /*
  * Encrypt / decrypt data from one bio to another one (can be the same one)
  */
@@ -1008,12 +1019,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
        }
 }
 
-static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
-                                         struct bio *bio, sector_t sector)
+static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
+                         struct bio *bio, sector_t sector)
 {
-       struct dm_crypt_io *io;
-
-       io = mempool_alloc(cc->io_pool, GFP_NOIO);
        io->cc = cc;
        io->base_bio = bio;
        io->sector = sector;
@@ -1021,8 +1029,6 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
        io->base_io = NULL;
        io->ctx.req = NULL;
        atomic_set(&io->io_pending, 0);
-
-       return io;
 }
 
 static void crypt_inc_pending(struct dm_crypt_io *io)
@@ -1046,8 +1052,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
                return;
 
        if (io->ctx.req)
-               mempool_free(io->ctx.req, cc->req_pool);
-       mempool_free(io, cc->io_pool);
+               crypt_free_req(cc, io->ctx.req, base_bio);
+       if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
+               mempool_free(io, cc->io_pool);
 
        if (likely(!base_io))
                bio_endio(base_bio, error);
@@ -1255,8 +1262,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
                 * between fragments, so switch to a new dm_crypt_io structure.
                 */
                if (unlikely(!crypt_finished && remaining)) {
-                       new_io = crypt_io_alloc(io->cc, io->base_bio,
-                                               sector);
+                       new_io = mempool_alloc(cc->io_pool, GFP_NOIO);
+                       crypt_io_init(new_io, io->cc, io->base_bio, sector);
                        crypt_inc_pending(new_io);
                        crypt_convert_init(cc, &new_io->ctx, NULL,
                                           io->base_bio, sector);
@@ -1325,7 +1332,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
        if (error < 0)
                io->error = -EIO;
 
-       mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
+       crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
 
        if (!atomic_dec_and_test(&ctx->cc_pending))
                return;
@@ -1728,6 +1735,10 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                goto bad;
        }
 
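+       /*
+        * Reserve per-bio data for the dm_crypt_io, the crypto request
+        * and the IV, so the common path avoids separate mempool
+        * allocations.
+        */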
+       cc->per_bio_data_size = ti->per_bio_data_size =
+                               sizeof(struct dm_crypt_io) + cc->dmreq_start +
+                               sizeof(struct dm_crypt_request) + cc->iv_size;
+
        cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
        if (!cc->page_pool) {
                ti->error = "Cannot allocate page mempool";
@@ -1824,7 +1835,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
                return DM_MAPIO_REMAPPED;
        }
 
-       io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
+       io = dm_per_bio_data(bio, cc->per_bio_data_size);
+       crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
+       io->ctx.req = (struct ablkcipher_request *)(io + 1);
 
        if (bio_data_dir(io->base_bio) == READ) {
                if (kcryptd_io_read(io, GFP_NOWAIT))
index db404a0f7e2c83ead70bbf32e2346ecd60aa2edf..c09359db3a90730dbd32b3bd733709f3c6444192 100644 (file)
@@ -33,7 +33,6 @@ struct dm_io_client {
 struct io {
        unsigned long error_bits;
        atomic_t count;
-       struct completion *wait;
        struct dm_io_client *client;
        io_notify_fn callback;
        void *context;
@@ -112,28 +111,27 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
  * We need an io object to keep track of the number of bios that
  * have been dispatched for a particular io.
  *---------------------------------------------------------------*/
-static void dec_count(struct io *io, unsigned int region, int error)
+static void complete_io(struct io *io)
 {
-       if (error)
-               set_bit(region, &io->error_bits);
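+       /*
+        * Snapshot everything the callback needs before the io object is
+        * returned to the mempool below.
+        */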
+       unsigned long error_bits = io->error_bits;
+       io_notify_fn fn = io->callback;
+       void *context = io->context;
 
-       if (atomic_dec_and_test(&io->count)) {
-               if (io->vma_invalidate_size)
-                       invalidate_kernel_vmap_range(io->vma_invalidate_address,
-                                                    io->vma_invalidate_size);
+       if (io->vma_invalidate_size)
+               invalidate_kernel_vmap_range(io->vma_invalidate_address,
+                                            io->vma_invalidate_size);
 
-               if (io->wait)
-                       complete(io->wait);
+       mempool_free(io, io->client->pool);
+       fn(error_bits, context);
+}
 
-               else {
-                       unsigned long r = io->error_bits;
-                       io_notify_fn fn = io->callback;
-                       void *context = io->context;
+static void dec_count(struct io *io, unsigned int region, int error)
+{
+       if (error)
+               set_bit(region, &io->error_bits);
 
-                       mempool_free(io, io->client->pool);
-                       fn(r, context);
-               }
-       }
+       if (atomic_dec_and_test(&io->count))
+               complete_io(io);
 }
 
 static void endio(struct bio *bio, int error)
@@ -376,41 +374,51 @@ static void dispatch_io(int rw, unsigned int num_regions,
        dec_count(io, 0, 0);
 }
 
+struct sync_io {
+       unsigned long error_bits;
+       struct completion wait;
+};
+
+static void sync_io_complete(unsigned long error, void *context)
+{
+       struct sync_io *sio = context;
+
+       sio->error_bits = error;
+       complete(&sio->wait);
+}
+
 static int sync_io(struct dm_io_client *client, unsigned int num_regions,
                   struct dm_io_region *where, int rw, struct dpages *dp,
                   unsigned long *error_bits)
 {
-       /*
-        * gcc <= 4.3 can't do the alignment for stack variables, so we must
-        * align it on our own.
-        * volatile prevents the optimizer from removing or reusing
-        * "io_" field from the stack frame (allowed in ANSI C).
-        */
-       volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
-       struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
-       DECLARE_COMPLETION_ONSTACK(wait);
+       struct io *io;
+       struct sync_io sio;
 
        if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
                WARN_ON(1);
                return -EIO;
        }
 
+       init_completion(&sio.wait);
+
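+       /*
+        * The io is mempool-allocated rather than stack-allocated now,
+        * since completion always goes through the async callback, which
+        * frees it.
+        */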
+       io = mempool_alloc(client->pool, GFP_NOIO);
        io->error_bits = 0;
        atomic_set(&io->count, 1); /* see dispatch_io() */
-       io->wait = &wait;
        io->client = client;
+       io->callback = sync_io_complete;
+       io->context = &sio;
 
        io->vma_invalidate_address = dp->vma_invalidate_address;
        io->vma_invalidate_size = dp->vma_invalidate_size;
 
        dispatch_io(rw, num_regions, where, dp, io, 1);
 
-       wait_for_completion_io(&wait);
+       wait_for_completion_io(&sio.wait);
 
        if (error_bits)
-               *error_bits = io->error_bits;
+               *error_bits = sio.error_bits;
 
-       return io->error_bits ? -EIO : 0;
+       return sio.error_bits ? -EIO : 0;
 }
 
 static int async_io(struct dm_io_client *client, unsigned int num_regions,
@@ -428,7 +436,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
        io = mempool_alloc(client->pool, GFP_NOIO);
        io->error_bits = 0;
        atomic_set(&io->count, 1); /* see dispatch_io() */
-       io->wait = NULL;
        io->client = client;
        io->callback = fn;
        io->context = context;
@@ -481,9 +488,9 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
  * New collapsed (a)synchronous interface.
  *
  * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
- * the queue with blk_unplug() some time later or set REQ_SYNC in
-io_req->bi_rw. If you fail to do one of these, the IO will be submitted to
- * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c.
+ * the queue with blk_unplug() some time later or set REQ_SYNC in io_req->bi_rw.
+ * If you fail to do one of these, the IO will be submitted to the disk after
+ * q->unplug_delay, which defaults to 3ms in blk-settings.c.
  */
 int dm_io(struct dm_io_request *io_req, unsigned num_regions,
          struct dm_io_region *where, unsigned long *sync_error_bits)
index f4167b013d990c3fc25f185ee467952824f0ae5d..833d7e752f0633a1586fcefb613e501d777efb30 100644 (file)
@@ -373,8 +373,6 @@ static int __must_push_back(struct multipath *m)
                 dm_noflush_suspending(m->ti)));
 }
 
-#define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required)
-
 /*
  * Map cloned requests
  */
@@ -402,11 +400,11 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
                if (!__must_push_back(m))
                        r = -EIO;       /* Failed */
                goto out_unlock;
-       }
-       if (!pg_ready(m)) {
+       } else if (m->queue_io || m->pg_init_required) {
                __pg_init_all_paths(m);
                goto out_unlock;
        }
+
        if (set_mapinfo(m, map_context) < 0)
                /* ENOMEM, requeue */
                goto out_unlock;
index 09a688b3d48ca1445e136544321a54b112b280e1..50fca469cafd92b3dac8c4455391cf24419de7ac 100644 (file)
@@ -137,13 +137,23 @@ static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr
        *bit *= sctx->region_table_entry_bits;
 }
 
+static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
+{
+       unsigned long region_index;
+       unsigned bit;
+
+       switch_get_position(sctx, region_nr, &region_index, &bit);
+
+       return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
+               ((1 << sctx->region_table_entry_bits) - 1);
+}
+
 /*
  * Find which path to use at given offset.
  */
 static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
 {
-       unsigned long region_index;
-       unsigned bit, path_nr;
+       unsigned path_nr;
        sector_t p;
 
        p = offset;
@@ -152,9 +162,7 @@ static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
        else
                sector_div(p, sctx->region_size);
 
-       switch_get_position(sctx, p, &region_index, &bit);
-       path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
-              ((1 << sctx->region_table_entry_bits) - 1);
+       path_nr = switch_region_table_read(sctx, p);
 
        /* This can only happen if the processor uses non-atomic stores. */
        if (unlikely(path_nr >= sctx->nr_paths))
@@ -363,7 +371,7 @@ static __always_inline unsigned long parse_hex(const char **string)
 }
 
 static int process_set_region_mappings(struct switch_ctx *sctx,
-                            unsigned argc, char **argv)
+                                      unsigned argc, char **argv)
 {
        unsigned i;
        unsigned long region_index = 0;
@@ -372,6 +380,51 @@ static int process_set_region_mappings(struct switch_ctx *sctx,
                unsigned long path_nr;
                const char *string = argv[i];
 
+               if ((*string & 0xdf) == 'R') {
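+                       /*
+                        * "R<n>,<m>" (case-insensitive): append <m>
+                        * entries that continue the preceding <n>-entry
+                        * cycle of mappings.
+                        */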
+                       unsigned long cycle_length, num_write;
+
+                       string++;
+                       if (unlikely(*string == ',')) {
+                               DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+                               return -EINVAL;
+                       }
+                       cycle_length = parse_hex(&string);
+                       if (unlikely(*string != ',')) {
+                               DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+                               return -EINVAL;
+                       }
+                       string++;
+                       if (unlikely(!*string)) {
+                               DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+                               return -EINVAL;
+                       }
+                       num_write = parse_hex(&string);
+                       if (unlikely(*string)) {
+                               DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+                               return -EINVAL;
+                       }
+
+                       if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
+                               DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
+                                      cycle_length - 1, region_index);
+                               return -EINVAL;
+                       }
+                       if (unlikely(region_index + num_write < region_index) ||
+                           unlikely(region_index + num_write >= sctx->nr_regions)) {
+                               DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
+                                      region_index, num_write, sctx->nr_regions);
+                               return -EINVAL;
+                       }
+
+                       while (num_write--) {
+                               region_index++;
+                               path_nr = switch_region_table_read(sctx, region_index - cycle_length);
+                               switch_region_table_write(sctx, region_index, path_nr);
+                       }
+
+                       continue;
+               }
+
                if (*string == ':')
                        region_index++;
                else {
@@ -500,7 +553,7 @@ static int switch_iterate_devices(struct dm_target *ti,
 
 static struct target_type switch_target = {
        .name = "switch",
-       .version = {1, 0, 0},
+       .version = {1, 1, 0},
        .module = THIS_MODULE,
        .ctr = switch_ctr,
        .dtr = switch_dtr,
index 5f59f1e3e5b11de3156eef9e5bcd68b6a46c3b82..f9c6cb8dbcf8c493723f5ddc46af8d4e6fed892b 100644 (file)
@@ -1386,6 +1386,14 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
        return q && !blk_queue_add_random(q);
 }
 
+static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
+                                  sector_t start, sector_t len, void *data)
+{
+       struct request_queue *q = bdev_get_queue(dev->bdev);
+
+       return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
+}
+
 static bool dm_table_all_devices_attribute(struct dm_table *t,
                                           iterate_devices_callout_fn func)
 {
@@ -1430,6 +1438,43 @@ static bool dm_table_supports_write_same(struct dm_table *t)
        return true;
 }
 
+static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
+                                 sector_t start, sector_t len, void *data)
+{
+       struct request_queue *q = bdev_get_queue(dev->bdev);
+
+       return q && blk_queue_discard(q);
+}
+
+static bool dm_table_supports_discards(struct dm_table *t)
+{
+       struct dm_target *ti;
+       unsigned i = 0;
+
+       /*
+        * Unless any target used by the table sets discards_supported,
+        * require at least one underlying device to support discards.
+        * t->devices includes internal dm devices such as mirror logs
+        * so we need to use iterate_devices here, which targets
+        * supporting discard selectively must provide.
+        */
+       while (i < dm_table_get_num_targets(t)) {
+               ti = dm_table_get_target(t, i++);
+
+               if (!ti->num_discard_bios)
+                       continue;
+
+               if (ti->discards_supported)
+                       return 1;
+
+               if (ti->type->iterate_devices &&
+                   ti->type->iterate_devices(ti, device_discard_capable, NULL))
+                       return 1;
+       }
+
+       return 0;
+}
+
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
                               struct queue_limits *limits)
 {
@@ -1464,6 +1509,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
        if (!dm_table_supports_write_same(t))
                q->limits.max_write_same_sectors = 0;
 
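+       /*
+        * Only permit SG merging on the dm queue if every underlying
+        * device supports it.
+        */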
+       if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
+               queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+       else
+               queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+
        dm_table_set_integrity(t);
 
        /*
@@ -1636,39 +1686,3 @@ void dm_table_run_md_queue_async(struct dm_table *t)
 }
 EXPORT_SYMBOL(dm_table_run_md_queue_async);
 
-static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
-                                 sector_t start, sector_t len, void *data)
-{
-       struct request_queue *q = bdev_get_queue(dev->bdev);
-
-       return q && blk_queue_discard(q);
-}
-
-bool dm_table_supports_discards(struct dm_table *t)
-{
-       struct dm_target *ti;
-       unsigned i = 0;
-
-       /*
-        * Unless any target used by the table set discards_supported,
-        * require at least one underlying device to support discards.
-        * t->devices includes internal dm devices such as mirror logs
-        * so we need to use iterate_devices here, which targets
-        * supporting discard selectively must provide.
-        */
-       while (i < dm_table_get_num_targets(t)) {
-               ti = dm_table_get_target(t, i++);
-
-               if (!ti->num_discard_bios)
-                       continue;
-
-               if (ti->discards_supported)
-                       return 1;
-
-               if (ti->type->iterate_devices &&
-                   ti->type->iterate_devices(ti, device_discard_capable, NULL))
-                       return 1;
-       }
-
-       return 0;
-}
index fc9c848a60c9267a44296b54656bbdda60f40fd0..4843801173fe11a99519b59dd808e46e02425ee4 100644 (file)
@@ -227,6 +227,7 @@ struct thin_c {
        struct list_head list;
        struct dm_dev *pool_dev;
        struct dm_dev *origin_dev;
+       sector_t origin_size;
        dm_thin_id dev_id;
 
        struct pool *pool;
@@ -554,11 +555,16 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
 struct dm_thin_new_mapping {
        struct list_head list;
 
-       bool quiesced:1;
-       bool prepared:1;
        bool pass_discard:1;
        bool definitely_not_shared:1;
 
+       /*
+        * Track quiescing, copying and zeroing preparation actions.  When this
+        * counter hits zero the block is prepared and can be inserted into the
+        * btree.
+        */
+       atomic_t prepare_actions;
+
        int err;
        struct thin_c *tc;
        dm_block_t virt_block;
@@ -575,43 +581,41 @@ struct dm_thin_new_mapping {
        bio_end_io_t *saved_bi_end_io;
 };
 
-static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
+static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
 {
        struct pool *pool = m->tc->pool;
 
-       if (m->quiesced && m->prepared) {
+       if (atomic_dec_and_test(&m->prepare_actions)) {
                list_add_tail(&m->list, &pool->prepared_mappings);
                wake_worker(pool);
        }
 }
 
-static void copy_complete(int read_err, unsigned long write_err, void *context)
+static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
 {
        unsigned long flags;
-       struct dm_thin_new_mapping *m = context;
        struct pool *pool = m->tc->pool;
 
-       m->err = read_err || write_err ? -EIO : 0;
-
        spin_lock_irqsave(&pool->lock, flags);
-       m->prepared = true;
-       __maybe_add_mapping(m);
+       __complete_mapping_preparation(m);
        spin_unlock_irqrestore(&pool->lock, flags);
 }
 
+static void copy_complete(int read_err, unsigned long write_err, void *context)
+{
+       struct dm_thin_new_mapping *m = context;
+
+       m->err = read_err || write_err ? -EIO : 0;
+       complete_mapping_preparation(m);
+}
+
 static void overwrite_endio(struct bio *bio, int err)
 {
-       unsigned long flags;
        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
        struct dm_thin_new_mapping *m = h->overwrite_mapping;
-       struct pool *pool = m->tc->pool;
 
        m->err = err;
-
-       spin_lock_irqsave(&pool->lock, flags);
-       m->prepared = true;
-       __maybe_add_mapping(m);
-       spin_unlock_irqrestore(&pool->lock, flags);
+       complete_mapping_preparation(m);
 }
 
 /*----------------------------------------------------------------*/
@@ -821,10 +825,31 @@ static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
        return m;
 }
 
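+/*
+ * Zero the sector range [begin, end) on the pool device.  If submission
+ * fails we still go through copy_complete() so the error is recorded and
+ * the mapping's prepare_actions count drops.
+ */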
+static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
+                   sector_t begin, sector_t end)
+{
+       int r;
+       struct dm_io_region to;
+
+       to.bdev = tc->pool_dev->bdev;
+       to.sector = begin;
+       to.count = end - begin;
+
+       r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
+       if (r < 0) {
+               DMERR_LIMIT("dm_kcopyd_zero() failed");
+               copy_complete(1, 1, m);
+       }
+}
+
+/*
+ * A partial copy also needs to zero the uncopied region.
+ */
 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
                          struct dm_dev *origin, dm_block_t data_origin,
                          dm_block_t data_dest,
-                         struct dm_bio_prison_cell *cell, struct bio *bio)
+                         struct dm_bio_prison_cell *cell, struct bio *bio,
+                         sector_t len)
 {
        int r;
        struct pool *pool = tc->pool;
@@ -835,8 +860,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
        m->data_block = data_dest;
        m->cell = cell;
 
+       /*
+        * quiesce action + copy action + an extra reference held for the
+        * duration of this function (we may need to inc later for a
+        * partial zero).
+        */
+       atomic_set(&m->prepare_actions, 3);
+
        if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
-               m->quiesced = true;
+               complete_mapping_preparation(m); /* already quiesced */
 
        /*
         * IO to pool_dev remaps to the pool target's data_dev.
@@ -857,20 +889,38 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 
                from.bdev = origin->bdev;
                from.sector = data_origin * pool->sectors_per_block;
-               from.count = pool->sectors_per_block;
+               from.count = len;
 
                to.bdev = tc->pool_dev->bdev;
                to.sector = data_dest * pool->sectors_per_block;
-               to.count = pool->sectors_per_block;
+               to.count = len;
 
                r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
                                   0, copy_complete, m);
                if (r < 0) {
-                       mempool_free(m, pool->mapping_pool);
                        DMERR_LIMIT("dm_kcopyd_copy() failed");
-                       cell_error(pool, cell);
+                       copy_complete(1, 1, m);
+
+                       /*
+                        * We allow the zero to be issued, to simplify the
+                        * error path.  Otherwise we'd need to start
+                        * worrying about decrementing the prepare_actions
+                        * counter.
+                        */
+               }
+
+               /*
+                * Do we need to zero a tail region?
+                */
+               if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
+                       atomic_inc(&m->prepare_actions);
+                       ll_zero(tc, m,
+                               data_dest * pool->sectors_per_block + len,
+                               (data_dest + 1) * pool->sectors_per_block);
                }
        }
+
+       complete_mapping_preparation(m); /* drop our ref */
 }
 
 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -878,15 +928,8 @@ static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
                                   struct dm_bio_prison_cell *cell, struct bio *bio)
 {
        schedule_copy(tc, virt_block, tc->pool_dev,
-                     data_origin, data_dest, cell, bio);
-}
-
-static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
-                                  dm_block_t data_dest,
-                                  struct dm_bio_prison_cell *cell, struct bio *bio)
-{
-       schedule_copy(tc, virt_block, tc->origin_dev,
-                     virt_block, data_dest, cell, bio);
+                     data_origin, data_dest, cell, bio,
+                     tc->pool->sectors_per_block);
 }
 
 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
@@ -896,8 +939,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
        struct pool *pool = tc->pool;
        struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
-       m->quiesced = true;
-       m->prepared = false;
+       atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
        m->tc = tc;
        m->virt_block = virt_block;
        m->data_block = data_block;
@@ -919,21 +961,33 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
                save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
                inc_all_io_entry(pool, bio);
                remap_and_issue(tc, bio, data_block);
-       } else {
-               int r;
-               struct dm_io_region to;
 
-               to.bdev = tc->pool_dev->bdev;
-               to.sector = data_block * pool->sectors_per_block;
-               to.count = pool->sectors_per_block;
+       } else
+               ll_zero(tc, m,
+                       data_block * pool->sectors_per_block,
+                       (data_block + 1) * pool->sectors_per_block);
+}
 
-               r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
-               if (r < 0) {
-                       mempool_free(m, pool->mapping_pool);
-                       DMERR_LIMIT("dm_kcopyd_zero() failed");
-                       cell_error(pool, cell);
-               }
-       }
+static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
+                                  dm_block_t data_dest,
+                                  struct dm_bio_prison_cell *cell, struct bio *bio)
+{
+       struct pool *pool = tc->pool;
+       sector_t virt_block_begin = virt_block * pool->sectors_per_block;
+       sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
+
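+       /*
+        * Copy only the part of the block that the origin device still
+        * covers; zero the remainder.  Blocks wholly beyond the end of
+        * the origin are simply zeroed.
+        */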
+       if (virt_block_end <= tc->origin_size)
+               schedule_copy(tc, virt_block, tc->origin_dev,
+                             virt_block, data_dest, cell, bio,
+                             pool->sectors_per_block);
+
+       else if (virt_block_begin < tc->origin_size)
+               schedule_copy(tc, virt_block, tc->origin_dev,
+                             virt_block, data_dest, cell, bio,
+                             tc->origin_size - virt_block_begin);
+
+       else
+               schedule_zero(tc, virt_block, data_dest, cell, bio);
 }
 
 /*
@@ -1315,7 +1369,18 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
                        inc_all_io_entry(pool, bio);
                        cell_defer_no_holder(tc, cell);
 
-                       remap_to_origin_and_issue(tc, bio);
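+                       /*
+                        * Reads entirely beyond the origin end return
+                        * zeroes; reads that straddle it are zero-filled
+                        * and then trimmed to the covered prefix before
+                        * being remapped to the origin.
+                        */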
+                       if (bio_end_sector(bio) <= tc->origin_size)
+                               remap_to_origin_and_issue(tc, bio);
+
+                       else if (bio->bi_iter.bi_sector < tc->origin_size) {
+                               zero_fill_bio(bio);
+                               bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
+                               remap_to_origin_and_issue(tc, bio);
+
+                       } else {
+                               zero_fill_bio(bio);
+                               bio_endio(bio, 0);
+                       }
                } else
                        provision_block(tc, bio, block, cell);
                break;
@@ -3112,7 +3177,7 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
         */
        if (io_opt_sectors < pool->sectors_per_block ||
            do_div(io_opt_sectors, pool->sectors_per_block)) {
-               blk_limits_io_min(limits, 0);
+               blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
                blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
        }
 
@@ -3141,7 +3206,7 @@ static struct target_type pool_target = {
        .name = "thin-pool",
        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                    DM_TARGET_IMMUTABLE,
-       .version = {1, 12, 0},
+       .version = {1, 13, 0},
        .module = THIS_MODULE,
        .ctr = pool_ctr,
        .dtr = pool_dtr,
@@ -3361,8 +3426,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
                spin_lock_irqsave(&pool->lock, flags);
                list_for_each_entry_safe(m, tmp, &work, list) {
                        list_del(&m->list);
-                       m->quiesced = true;
-                       __maybe_add_mapping(m);
+                       __complete_mapping_preparation(m);
                }
                spin_unlock_irqrestore(&pool->lock, flags);
        }
@@ -3401,6 +3465,16 @@ static void thin_postsuspend(struct dm_target *ti)
        noflush_work(tc, do_noflush_stop);
 }
 
+static int thin_preresume(struct dm_target *ti)
+{
+       struct thin_c *tc = ti->private;
+
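+       /*
+        * The origin may have been resized while the target was
+        * suspended, so re-read its size on every resume.
+        */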
+       if (tc->origin_dev)
+               tc->origin_size = get_dev_size(tc->origin_dev->bdev);
+
+       return 0;
+}
+
 /*
  * <nr mapped sectors> <highest mapped sector>
  */
@@ -3483,12 +3557,13 @@ static int thin_iterate_devices(struct dm_target *ti,
 
 static struct target_type thin_target = {
        .name = "thin",
-       .version = {1, 12, 0},
+       .version = {1, 13, 0},
        .module = THIS_MODULE,
        .ctr = thin_ctr,
        .dtr = thin_dtr,
        .map = thin_map,
        .end_io = thin_endio,
+       .preresume = thin_preresume,
        .presuspend = thin_presuspend,
        .postsuspend = thin_postsuspend,
        .status = thin_status,
index ed76126aac542e57d092a1d50a00135d8119c2ec..e81d2152fa684198899b1998efe3e5b19554b778 100644 (file)
@@ -72,7 +72,6 @@ int dm_table_any_busy_target(struct dm_table *t);
 unsigned dm_table_get_type(struct dm_table *t);
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
-bool dm_table_supports_discards(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 
index 32fc19c540d426a95f11ace984db7195e34a870c..1294238610df0237cfe524d200278f37c9232bb3 100644 (file)
@@ -5961,7 +5961,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
        int err = 0;
 
        if (mddev->pers) {
-               if (!mddev->pers->quiesce)
+               if (!mddev->pers->quiesce || !mddev->thread)
                        return -EBUSY;
                if (mddev->recovery || mddev->sync_thread)
                        return -EBUSY;
@@ -6263,7 +6263,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
                rv = update_raid_disks(mddev, info->raid_disks);
 
        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
-               if (mddev->pers->quiesce == NULL)
+               if (mddev->pers->quiesce == NULL || mddev->thread == NULL)
                        return -EINVAL;
                if (mddev->recovery || mddev->sync_thread)
                        return -EBUSY;
@@ -7376,7 +7376,7 @@ void md_do_sync(struct md_thread *thread)
        struct mddev *mddev2;
        unsigned int currspeed = 0,
                 window;
-       sector_t max_sectors,j, io_sectors;
+       sector_t max_sectors,j, io_sectors, recovery_done;
        unsigned long mark[SYNC_MARKS];
        unsigned long update_time;
        sector_t mark_cnt[SYNC_MARKS];
@@ -7652,7 +7652,8 @@ void md_do_sync(struct md_thread *thread)
                 */
                cond_resched();
 
-               currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
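+               /*
+                * Base the speed estimate on sectors that have actually
+                * completed, excluding I/O still in flight.
+                */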
+               recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
+               currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
                        /((jiffies-mddev->resync_mark)/HZ +1) +1;
 
                if (currspeed > speed_min(mddev)) {
@@ -8592,7 +8593,7 @@ static int __init md_init(void)
                goto err_mdp;
        mdp_major = ret;
 
-       blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
+       blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
                            md_probe, NULL, NULL);
        blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
                            md_probe, NULL, NULL);
@@ -8687,7 +8688,7 @@ static __exit void md_exit(void)
        struct list_head *tmp;
        int delay = 1;
 
-       blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
+       blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
        blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
 
        unregister_blkdev(MD_MAJOR,"md");
index 407a99e46f6993a770c21fdfff1972b7f64063b6..cf91f5910c7c8e9422ef1beb73001c0f76cbdaf1 100644 (file)
@@ -685,6 +685,12 @@ static void *raid0_takeover(struct mddev *mddev)
         *  raid10 - assuming we have all necessary active disks
         *  raid1 - with (N -1) mirror drives faulty
         */
+
+       if (mddev->bitmap) {
+               printk(KERN_ERR "md/raid0: %s: cannot takeover array with bitmap\n",
+                      mdname(mddev));
+               return ERR_PTR(-EBUSY);
+       }
        if (mddev->level == 4)
                return raid0_takeover_raid45(mddev);
 
index 56e24c072b629324ec382037ab17fd43310ec3ef..d7690f86fdb9a6202edfa6d8fa5f95517fe8e75b 100644 (file)
@@ -1501,12 +1501,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
                mddev->degraded++;
                set_bit(Faulty, &rdev->flags);
                spin_unlock_irqrestore(&conf->device_lock, flags);
-               /*
-                * if recovery is running, make sure it aborts.
-                */
-               set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        } else
                set_bit(Faulty, &rdev->flags);
+       /*
+        * if recovery is running, make sure it aborts.
+        */
+       set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
        printk(KERN_ALERT
               "md/raid1:%s: Disk failure on %s, disabling device.\n"
index cb882aae9e20d4f7032a400884f45f50de57528f..b08c18871323c904f3964e9c8b650366ad2979ef 100644 (file)
@@ -1684,13 +1684,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
                spin_unlock_irqrestore(&conf->device_lock, flags);
                return;
        }
-       if (test_and_clear_bit(In_sync, &rdev->flags)) {
+       if (test_and_clear_bit(In_sync, &rdev->flags))
                mddev->degraded++;
-                       /*
-                * if recovery is running, make sure it aborts.
-                */
-               set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-       }
+       /*
+        * If recovery is running, make sure it aborts.
+        */
+       set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        set_bit(Blocked, &rdev->flags);
        set_bit(Faulty, &rdev->flags);
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
index 452782bffebcfd5977a2935155cbe2b1e143f302..ede41f05c392d499542dac45f9228f37364ecf30 100644 (file)
@@ -2028,8 +2028,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
                /* complete ongoing async transfer before issuing discard */
                if (card->host->areq)
                        mmc_blk_issue_rw_rq(mq, NULL);
-               if (req->cmd_flags & REQ_SECURE &&
-                       !(card->quirks & MMC_QUIRK_SEC_ERASE_TRIM_BROKEN))
+               if (req->cmd_flags & REQ_SECURE)
                        ret = mmc_blk_issue_secdiscard_rq(mq, req);
                else
                        ret = mmc_blk_issue_discard_rq(mq, req);
@@ -2432,6 +2431,8 @@ static int mmc_blk_probe(struct mmc_card *card)
        if (!(card->csd.cmdclass & CCC_BLOCK_READ))
                return -ENODEV;
 
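+       /*
+        * Apply card quirks before mmc_blk_alloc() so they can influence
+        * how the block device and its queue are set up.
+        */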
+       mmc_fixup_device(card, blk_fixups);
+
        md = mmc_blk_alloc(card);
        if (IS_ERR(md))
                return PTR_ERR(md);
@@ -2446,7 +2447,6 @@ static int mmc_blk_probe(struct mmc_card *card)
                goto out;
 
        mmc_set_drvdata(card, md);
-       mmc_fixup_device(card, blk_fixups);
 
        if (mmc_add_disk(md))
                goto out;
index d2dbf02022bd05803cda6ad5b0e9b4d201721b32..8a1f1240e05802f326986afdf093cd75f303496c 100644 (file)
@@ -180,7 +180,6 @@ static int mmc_bus_resume(struct device *dev)
 #endif
 
 #ifdef CONFIG_PM_RUNTIME
-
 static int mmc_runtime_suspend(struct device *dev)
 {
        struct mmc_card *card = mmc_dev_to_card(dev);
@@ -196,17 +195,10 @@ static int mmc_runtime_resume(struct device *dev)
 
        return host->bus_ops->runtime_resume(host);
 }
-
-static int mmc_runtime_idle(struct device *dev)
-{
-       return 0;
-}
-
 #endif /* !CONFIG_PM_RUNTIME */
 
 static const struct dev_pm_ops mmc_bus_pm_ops = {
-       SET_RUNTIME_PM_OPS(mmc_runtime_suspend, mmc_runtime_resume,
-                       mmc_runtime_idle)
+       SET_RUNTIME_PM_OPS(mmc_runtime_suspend, mmc_runtime_resume, NULL)
        SET_SYSTEM_SLEEP_PM_OPS(mmc_bus_suspend, mmc_bus_resume)
 };
 
index 7dc0c85fdb6067b980b7cfd636e4c23a98d1803c..d03a080fb9cd35ff0b29bf50380f08fdd8818c23 100644 (file)
@@ -2102,7 +2102,8 @@ EXPORT_SYMBOL(mmc_can_sanitize);
 
 int mmc_can_secure_erase_trim(struct mmc_card *card)
 {
-       if (card->ext_csd.sec_feature_support & EXT_CSD_SEC_ER_EN)
+       if ((card->ext_csd.sec_feature_support & EXT_CSD_SEC_ER_EN) &&
+           !(card->quirks & MMC_QUIRK_SEC_ERASE_TRIM_BROKEN))
                return 1;
        return 0;
 }
index 793c6f7ddb049a735916eae87585b5cfb0f3b452..1eda8dd8c867228b5643e40f7655b513643eb26f 100644 (file)
@@ -324,13 +324,12 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
                }
        }
 
+       /*
+        * The EXT_CSD format is meant to be forward compatible. As long
+        * as CSD_STRUCTURE does not change, all values for EXT_CSD_REV
+        * are authorized, see JEDEC JESD84-B50 section B.8.
+        */
        card->ext_csd.rev = ext_csd[EXT_CSD_REV];
-       if (card->ext_csd.rev > 7) {
-               pr_err("%s: unrecognised EXT_CSD revision %d\n",
-                       mmc_hostname(card->host), card->ext_csd.rev);
-               err = -EINVAL;
-               goto out;
-       }
 
        card->ext_csd.raw_sectors[0] = ext_csd[EXT_CSD_SEC_CNT + 0];
        card->ext_csd.raw_sectors[1] = ext_csd[EXT_CSD_SEC_CNT + 1];
index 6c36fccaa1ec70d6e7f8dc5ac6396a77b90cde19..dd1d1e0fe32227edb4a184e3f8947ecbe27991b2 100644 (file)
@@ -91,7 +91,7 @@ void mmc_fixup_device(struct mmc_card *card, const struct mmc_fixup *table)
                    (f->cis_device == card->cis.device ||
                     f->cis_device == (u16) SDIO_ANY_ID) &&
                    rev >= f->rev_start && rev <= f->rev_end) {
-                       dev_dbg(&card->dev, "calling %pF\n", f->vendor_fixup);
+                       dev_dbg(&card->dev, "calling %pf\n", f->vendor_fixup);
                        f->vendor_fixup(card, f->data);
                }
        }
index 274ef00b44639149c3d43c24effffd400fa5ba69..48d0c93ba25a36590ac6e2c786c4ca923d35cde1 100644 (file)
@@ -184,6 +184,9 @@ int mmc_send_app_op_cond(struct mmc_host *host, u32 ocr, u32 *rocr)
                mmc_delay(10);
        }
 
+       if (!i)
+               pr_err("%s: card never left busy state\n", mmc_hostname(host));
+
        if (rocr && !mmc_host_is_spi(host))
                *rocr = cmd.resp[0];
 
index a5652548230a9457812a8badc69d7001a9df1f9c..45113582246427eae3f9c6d893cdd6e305a1631b 100644 (file)
@@ -290,6 +290,18 @@ config MMC_MOXART
          be found on some embedded hardware such as UC-7112-LX.
          If you have a controller with this interface, say Y here.
 
+config MMC_SDHCI_ST
+       tristate "SDHCI support on STMicroelectronics SoC"
+       depends on ARCH_STI
+       depends on MMC_SDHCI_PLTFM
+       select MMC_SDHCI_IO_ACCESSORS
+       help
+         This selects the Secure Digital Host Controller Interface in
+         STMicroelectronics SoCs.
+
+         If you have a controller with this interface, say Y or M here.
+         If unsure, say N.
+
 config MMC_OMAP
        tristate "TI OMAP Multimedia Card Interface support"
        depends on ARCH_OMAP
@@ -303,6 +315,7 @@ config MMC_OMAP
 
 config MMC_OMAP_HS
        tristate "TI OMAP High Speed Multimedia Card Interface support"
+       depends on HAS_DMA
        depends on ARCH_OMAP2PLUS || COMPILE_TEST
        help
          This selects the TI OMAP High Speed Multimedia card Interface.
@@ -343,7 +356,7 @@ config MMC_ATMELMCI
 
 config MMC_SDHCI_MSM
        tristate "Qualcomm SDHCI Controller Support"
-       depends on ARCH_QCOM
+       depends on ARCH_QCOM || (ARM && COMPILE_TEST)
        depends on MMC_SDHCI_PLTFM
        help
          This selects the Secure Digital Host Controller Interface (SDHCI)
@@ -440,6 +453,7 @@ config MMC_SPI
 config MMC_S3C
        tristate "Samsung S3C SD/MMC Card Interface support"
        depends on ARCH_S3C24XX
+       depends on S3C24XX_DMAC
        help
          This selects a driver for the MCI interface found in
           Samsung's S3C2410, S3C2412, S3C2440, S3C2442 CPUs.
@@ -477,15 +491,6 @@ config MMC_S3C_DMA
          working properly and needs to be debugged before this
          option is useful.
 
-config MMC_S3C_PIODMA
-       bool "Support for both PIO and DMA"
-       help
-         Compile both the PIO and DMA transfer routines into the
-         driver and let the platform select at run-time which one
-         is best.
-
-         See notes for the DMA option.
-
 endchoice
 
 config MMC_SDRICOH_CS
@@ -623,7 +628,7 @@ config MMC_DW_PCI
 
 config MMC_SH_MMCIF
        tristate "SuperH Internal MMCIF support"
-       depends on MMC_BLOCK
+       depends on MMC_BLOCK && HAS_DMA
        depends on SUPERH || ARCH_SHMOBILE || COMPILE_TEST
        help
          This selects the MMC Host Interface controller (MMCIF).
@@ -697,6 +702,7 @@ config MMC_WMT
 
 config MMC_USDHI6ROL0
        tristate "Renesas USDHI6ROL0 SD/SDIO Host Controller support"
+       depends on HAS_DMA
        help
          This selects support for the Renesas USDHI6ROL0 SD/SDIO
          Host Controller
index 7f81ddf1dd2c9f87f64beb3c54a91d1a8faa7169..f211eede8db58d48887d5619d96c4cca6bdbdbab 100644 (file)
@@ -68,6 +68,7 @@ obj-$(CONFIG_MMC_SDHCI_OF_HLWD)               += sdhci-of-hlwd.o
 obj-$(CONFIG_MMC_SDHCI_BCM_KONA)       += sdhci-bcm-kona.o
 obj-$(CONFIG_MMC_SDHCI_BCM2835)                += sdhci-bcm2835.o
 obj-$(CONFIG_MMC_SDHCI_MSM)            += sdhci-msm.o
+obj-$(CONFIG_MMC_SDHCI_ST)             += sdhci-st.o
 
 ifeq ($(CONFIG_CB710_DEBUG),y)
        CFLAGS-cb710-mmc        += -DDEBUG
index 1ac227c603b7e13a687b631eff298ce16222707d..8f216edbdf080d0c51e3eedcf83a75a1886fc9e3 100644 (file)
@@ -111,8 +111,7 @@ static const u8 tuning_blk_pattern_8bit[] = {
        0xff, 0x77, 0x77, 0xff, 0x77, 0xbb, 0xdd, 0xee,
 };
 
-static inline bool dw_mci_fifo_reset(struct dw_mci *host);
-static inline bool dw_mci_ctrl_all_reset(struct dw_mci *host);
+static bool dw_mci_reset(struct dw_mci *host);
 
 #if defined(CONFIG_DEBUG_FS)
 static int dw_mci_req_show(struct seq_file *s, void *v)
@@ -997,7 +996,8 @@ static int dw_mci_get_ro(struct mmc_host *mmc)
        int gpio_ro = mmc_gpio_get_ro(mmc);
 
        /* Use platform get_ro function, else try on board write protect */
-       if (slot->quirks & DW_MCI_SLOT_QUIRK_NO_WRITE_PROTECT)
+       if ((slot->quirks & DW_MCI_SLOT_QUIRK_NO_WRITE_PROTECT) ||
+                       (slot->host->quirks & DW_MCI_QUIRK_NO_WRITE_PROTECT))
                read_only = 0;
        else if (!IS_ERR_VALUE(gpio_ro))
                read_only = gpio_ro;
@@ -1235,7 +1235,7 @@ static int dw_mci_data_complete(struct dw_mci *host, struct mmc_data *data)
                 * After an error, there may be data lingering
                 * in the FIFO
                 */
-               dw_mci_fifo_reset(host);
+               dw_mci_reset(host);
        } else {
                data->bytes_xfered = data->blocks * data->blksz;
                data->error = 0;
@@ -1352,7 +1352,7 @@ static void dw_mci_tasklet_func(unsigned long priv)
 
                        /* CMD error in data command */
                        if (mrq->cmd->error && mrq->data)
-                               dw_mci_fifo_reset(host);
+                               dw_mci_reset(host);
 
                        host->cmd = NULL;
                        host->data = NULL;
@@ -1963,14 +1963,8 @@ static void dw_mci_work_routine_card(struct work_struct *work)
                        }
 
                        /* Power down slot */
-                       if (present == 0) {
-                               /* Clear down the FIFO */
-                               dw_mci_fifo_reset(host);
-#ifdef CONFIG_MMC_DW_IDMAC
-                               dw_mci_idmac_reset(host);
-#endif
-
-                       }
+                       if (present == 0)
+                               dw_mci_reset(host);
 
                        spin_unlock_bh(&host->lock);
 
@@ -2021,8 +2015,11 @@ static int dw_mci_of_get_slot_quirks(struct device *dev, u8 slot)
 
        /* get quirks */
        for (idx = 0; idx < ARRAY_SIZE(of_slot_quirks); idx++)
-               if (of_get_property(np, of_slot_quirks[idx].quirk, NULL))
+               if (of_get_property(np, of_slot_quirks[idx].quirk, NULL)) {
+                       dev_warn(dev, "Slot quirk %s is deprecated\n",
+                                       of_slot_quirks[idx].quirk);
                        quirks |= of_slot_quirks[idx].id;
+               }
 
        return quirks;
 }
@@ -2208,8 +2205,11 @@ static bool dw_mci_ctrl_reset(struct dw_mci *host, u32 reset)
        return false;
 }
 
-static inline bool dw_mci_fifo_reset(struct dw_mci *host)
+static bool dw_mci_reset(struct dw_mci *host)
 {
+       u32 flags = SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET;
+       bool ret = false;
+
        /*
         * Resetting generates a block interrupt, hence setting
         * the scatter-gather pointer to NULL.
@@ -2219,15 +2219,60 @@ static inline bool dw_mci_fifo_reset(struct dw_mci *host)
                host->sg = NULL;
        }
 
-       return dw_mci_ctrl_reset(host, SDMMC_CTRL_FIFO_RESET);
-}
+       if (host->use_dma)
+               flags |= SDMMC_CTRL_DMA_RESET;
 
-static inline bool dw_mci_ctrl_all_reset(struct dw_mci *host)
-{
-       return dw_mci_ctrl_reset(host,
-                                SDMMC_CTRL_FIFO_RESET |
-                                SDMMC_CTRL_RESET |
-                                SDMMC_CTRL_DMA_RESET);
+       if (dw_mci_ctrl_reset(host, flags)) {
+               /*
+                * In all cases we clear the RAWINTS register to clear any
+                * interrupts.
+                */
+               mci_writel(host, RINTSTS, 0xFFFFFFFF);
+
+               /* if using dma we wait for dma_req to clear */
+               if (host->use_dma) {
+                       unsigned long timeout = jiffies + msecs_to_jiffies(500);
+                       u32 status;
+                       do {
+                               status = mci_readl(host, STATUS);
+                               if (!(status & SDMMC_STATUS_DMA_REQ))
+                                       break;
+                               cpu_relax();
+                       } while (time_before(jiffies, timeout));
+
+                       if (status & SDMMC_STATUS_DMA_REQ) {
+                               dev_err(host->dev,
+                                       "%s: Timeout waiting for dma_req to "
+                                       "clear during reset\n", __func__);
+                               goto ciu_out;
+                       }
+
+                       /* when using DMA, we next reset the FIFO again */
+                       if (!dw_mci_ctrl_reset(host, SDMMC_CTRL_FIFO_RESET))
+                               goto ciu_out;
+               }
+       } else {
+               /* if the controller reset bit did clear, then set clock regs */
+               if (!(mci_readl(host, CTRL) & SDMMC_CTRL_RESET)) {
+                       dev_err(host->dev, "%s: fifo/dma reset bits didn't "
+                               "clear but ciu was reset, doing clock update\n",
+                               __func__);
+                       goto ciu_out;
+               }
+       }
+
+#if IS_ENABLED(CONFIG_MMC_DW_IDMAC)
+       /* It is also recommended that we reset and reprogram idmac */
+       dw_mci_idmac_reset(host);
+#endif
+
+       ret = true;
+
+ciu_out:
+       /* After a CTRL reset we need to have CIU set clock registers  */
+       mci_send_cmd(host->cur_slot, SDMMC_CMD_UPD_CLK, 0);
+
+       return ret;
 }
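
Editor's note: the new dw_mci_reset() above unifies the old FIFO-only and all-block resets into one sequence: assert the CTRL/FIFO (plus DMA, when in use) reset bits, clear RINTSTS, wait up to 500 ms for dma_req to deassert, re-reset the FIFO, optionally reset the IDMAC, and finish with a CIU clock update. Below is a minimal user-space sketch of the bounded poll at the heart of that sequence; mock_read_status() stands in for mci_readl(host, STATUS) and the timing is purely illustrative:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    #define STATUS_DMA_REQ (1u << 31)

    static uint32_t mock_read_status(void)
    {
            static int calls;
            /* pretend the DMA request deasserts after a few polls */
            return (++calls < 4) ? STATUS_DMA_REQ : 0;
    }

    static bool wait_dma_req_clear(long timeout_ms)
    {
            struct timespec start, now;
            long elapsed_ms = 0;

            clock_gettime(CLOCK_MONOTONIC, &start);
            do {
                    if (!(mock_read_status() & STATUS_DMA_REQ))
                            return true;            /* controller went idle */
                    clock_gettime(CLOCK_MONOTONIC, &now);
                    elapsed_ms = (now.tv_sec - start.tv_sec) * 1000L +
                                 (now.tv_nsec - start.tv_nsec) / 1000000L;
            } while (elapsed_ms < timeout_ms);

            return false;                           /* timed out: the dev_err path */
    }

    int main(void)
    {
            printf("dma_req cleared: %s\n",
                   wait_dma_req_clear(500) ? "yes" : "no");
            return 0;
    }

Built with a plain cc, this reports success after a few simulated polls; on timeout the driver instead logs an error and skips straight to the CIU clock update.
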
 
 #ifdef CONFIG_OF
@@ -2238,6 +2283,9 @@ static struct dw_mci_of_quirks {
        {
                .quirk  = "broken-cd",
                .id     = DW_MCI_QUIRK_BROKEN_CARD_DETECTION,
+       }, {
+               .quirk  = "disable-wp",
+               .id     = DW_MCI_QUIRK_NO_WRITE_PROTECT,
        },
 };
 
@@ -2425,7 +2473,7 @@ int dw_mci_probe(struct dw_mci *host)
        }
 
        /* Reset all blocks */
-       if (!dw_mci_ctrl_all_reset(host))
+       if (!dw_mci_ctrl_reset(host, SDMMC_CTRL_ALL_RESET_FLAGS))
                return -ENODEV;
 
        host->dma_ops = host->pdata->dma_ops;
@@ -2612,7 +2660,7 @@ int dw_mci_resume(struct dw_mci *host)
                }
        }
 
-       if (!dw_mci_ctrl_all_reset(host)) {
+       if (!dw_mci_ctrl_reset(host, SDMMC_CTRL_ALL_RESET_FLAGS)) {
                ret = -ENODEV;
                return ret;
        }
index 738fa241d05882258958107348a75b511c76fc08..08fd956d81f3bc687162e333cb612c37f06ad025 100644 (file)
 #define SDMMC_CMD_INDX(n)              ((n) & 0x1F)
 /* Status register defines */
 #define SDMMC_GET_FCNT(x)              (((x)>>17) & 0x1FFF)
+#define SDMMC_STATUS_DMA_REQ           BIT(31)
 /* FIFOTH register defines */
 #define SDMMC_SET_FIFOTH(m, r, t)      (((m) & 0x7) << 28 | \
                                         ((r) & 0xFFF) << 16 | \
 /* Card read threshold */
 #define SDMMC_SET_RD_THLD(v, x)                (((v) & 0x1FFF) << 16 | (x))
 
+/* All ctrl reset bits */
+#define SDMMC_CTRL_ALL_RESET_FLAGS \
+       (SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET | SDMMC_CTRL_DMA_RESET)
+
 /* Register access macros */
 #define mci_readl(dev, reg)                    \
        __raw_readl((dev)->regs + SDMMC_##reg)
index 7ad463e9741c0e4359b0c2c10a43f83ea3683995..e4d470704150c257bdf0e535cefa05f4bc41a059 100644 (file)
@@ -52,34 +52,53 @@ static unsigned int fmax = 515633;
  * struct variant_data - MMCI variant-specific quirks
  * @clkreg: default value for MCICLOCK register
  * @clkreg_enable: enable value for MMCICLOCK register
+ * @clkreg_8bit_bus_enable: enable value for 8 bit bus
+ * @clkreg_neg_edge_enable: enable value for inverted data/cmd output
  * @datalength_bits: number of bits in the MMCIDATALENGTH register
  * @fifosize: number of bytes that can be written when MMCI_TXFIFOEMPTY
  *           is asserted (likewise for RX)
  * @fifohalfsize: number of bytes that can be written when MCI_TXFIFOHALFEMPTY
  *               is asserted (likewise for RX)
+ * @data_cmd_enable: enable value for data commands.
  * @sdio: variant supports SDIO
  * @st_clkdiv: true if using a ST-specific clock divider algorithm
+ * @datactrl_mask_ddrmode: ddr mode mask in datactrl register.
  * @blksz_datactrl16: true if Block size is at b16..b30 position in datactrl register
+ * @blksz_datactrl4: true if Block size is at b4..b16 position in datactrl
+ *                  register
  * @pwrreg_powerup: power up value for MMCIPOWER register
+ * @f_max: maximum clk frequency supported by the controller.
  * @signal_direction: input/out direction of bus signals can be indicated
  * @pwrreg_clkgate: MMCIPOWER register must be used to gate the clock
  * @busy_detect: true if busy detection on dat0 is supported
  * @pwrreg_nopower: bits in MMCIPOWER don't control ext. power supply
+ * @explicit_mclk_control: enable explicit mclk control in driver.
+ * @qcom_fifo: enables qcom specific fifo pio read logic.
+ * @reversed_irq_handling: handle data irq before cmd irq.
  */
 struct variant_data {
        unsigned int            clkreg;
        unsigned int            clkreg_enable;
+       unsigned int            clkreg_8bit_bus_enable;
+       unsigned int            clkreg_neg_edge_enable;
        unsigned int            datalength_bits;
        unsigned int            fifosize;
        unsigned int            fifohalfsize;
+       unsigned int            data_cmd_enable;
+       unsigned int            datactrl_mask_ddrmode;
        bool                    sdio;
        bool                    st_clkdiv;
        bool                    blksz_datactrl16;
+       bool                    blksz_datactrl4;
        u32                     pwrreg_powerup;
+       u32                     f_max;
        bool                    signal_direction;
        bool                    pwrreg_clkgate;
        bool                    busy_detect;
        bool                    pwrreg_nopower;
+       bool                    explicit_mclk_control;
+       bool                    qcom_fifo;
+       bool                    reversed_irq_handling;
 };
 
 static struct variant_data variant_arm = {
@@ -87,6 +106,8 @@ static struct variant_data variant_arm = {
        .fifohalfsize           = 8 * 4,
        .datalength_bits        = 16,
        .pwrreg_powerup         = MCI_PWR_UP,
+       .f_max                  = 100000000,
+       .reversed_irq_handling  = true,
 };
 
 static struct variant_data variant_arm_extended_fifo = {
@@ -94,6 +115,7 @@ static struct variant_data variant_arm_extended_fifo = {
        .fifohalfsize           = 64 * 4,
        .datalength_bits        = 16,
        .pwrreg_powerup         = MCI_PWR_UP,
+       .f_max                  = 100000000,
 };
 
 static struct variant_data variant_arm_extended_fifo_hwfc = {
@@ -102,15 +124,18 @@ static struct variant_data variant_arm_extended_fifo_hwfc = {
        .clkreg_enable          = MCI_ARM_HWFCEN,
        .datalength_bits        = 16,
        .pwrreg_powerup         = MCI_PWR_UP,
+       .f_max                  = 100000000,
 };
 
 static struct variant_data variant_u300 = {
        .fifosize               = 16 * 4,
        .fifohalfsize           = 8 * 4,
        .clkreg_enable          = MCI_ST_U300_HWFCEN,
+       .clkreg_8bit_bus_enable = MCI_ST_8BIT_BUS,
        .datalength_bits        = 16,
        .sdio                   = true,
        .pwrreg_powerup         = MCI_PWR_ON,
+       .f_max                  = 100000000,
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
        .pwrreg_nopower         = true,
@@ -124,6 +149,7 @@ static struct variant_data variant_nomadik = {
        .sdio                   = true,
        .st_clkdiv              = true,
        .pwrreg_powerup         = MCI_PWR_ON,
+       .f_max                  = 100000000,
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
        .pwrreg_nopower         = true,
@@ -134,10 +160,13 @@ static struct variant_data variant_ux500 = {
        .fifohalfsize           = 8 * 4,
        .clkreg                 = MCI_CLK_ENABLE,
        .clkreg_enable          = MCI_ST_UX500_HWFCEN,
+       .clkreg_8bit_bus_enable = MCI_ST_8BIT_BUS,
+       .clkreg_neg_edge_enable = MCI_ST_UX500_NEG_EDGE,
        .datalength_bits        = 24,
        .sdio                   = true,
        .st_clkdiv              = true,
        .pwrreg_powerup         = MCI_PWR_ON,
+       .f_max                  = 100000000,
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
        .busy_detect            = true,
@@ -149,17 +178,38 @@ static struct variant_data variant_ux500v2 = {
        .fifohalfsize           = 8 * 4,
        .clkreg                 = MCI_CLK_ENABLE,
        .clkreg_enable          = MCI_ST_UX500_HWFCEN,
+       .clkreg_8bit_bus_enable = MCI_ST_8BIT_BUS,
+       .clkreg_neg_edge_enable = MCI_ST_UX500_NEG_EDGE,
+       .datactrl_mask_ddrmode  = MCI_ST_DPSM_DDRMODE,
        .datalength_bits        = 24,
        .sdio                   = true,
        .st_clkdiv              = true,
        .blksz_datactrl16       = true,
        .pwrreg_powerup         = MCI_PWR_ON,
+       .f_max                  = 100000000,
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
        .busy_detect            = true,
        .pwrreg_nopower         = true,
 };
 
+static struct variant_data variant_qcom = {
+       .fifosize               = 16 * 4,
+       .fifohalfsize           = 8 * 4,
+       .clkreg                 = MCI_CLK_ENABLE,
+       .clkreg_enable          = MCI_QCOM_CLK_FLOWENA |
+                                 MCI_QCOM_CLK_SELECT_IN_FBCLK,
+       .clkreg_8bit_bus_enable = MCI_QCOM_CLK_WIDEBUS_8,
+       .datactrl_mask_ddrmode  = MCI_QCOM_CLK_SELECT_IN_DDR_MODE,
+       .data_cmd_enable        = MCI_QCOM_CSPM_DATCMD,
+       .blksz_datactrl4        = true,
+       .datalength_bits        = 24,
+       .pwrreg_powerup         = MCI_PWR_UP,
+       .f_max                  = 208000000,
+       .explicit_mclk_control  = true,
+       .qcom_fifo              = true,
+};
+
 static int mmci_card_busy(struct mmc_host *mmc)
 {
        struct mmci_host *host = mmc_priv(mmc);
@@ -260,7 +310,9 @@ static void mmci_set_clkreg(struct mmci_host *host, unsigned int desired)
        host->cclk = 0;
 
        if (desired) {
-               if (desired >= host->mclk) {
+               if (variant->explicit_mclk_control) {
+                       host->cclk = host->mclk;
+               } else if (desired >= host->mclk) {
                        clk = MCI_CLK_BYPASS;
                        if (variant->st_clkdiv)
                                clk |= MCI_ST_UX500_NEG_EDGE;
@@ -299,11 +351,11 @@ static void mmci_set_clkreg(struct mmci_host *host, unsigned int desired)
        if (host->mmc->ios.bus_width == MMC_BUS_WIDTH_4)
                clk |= MCI_4BIT_BUS;
        if (host->mmc->ios.bus_width == MMC_BUS_WIDTH_8)
-               clk |= MCI_ST_8BIT_BUS;
+               clk |= variant->clkreg_8bit_bus_enable;
 
        if (host->mmc->ios.timing == MMC_TIMING_UHS_DDR50 ||
            host->mmc->ios.timing == MMC_TIMING_MMC_DDR52)
-               clk |= MCI_ST_UX500_NEG_EDGE;
+               clk |= variant->clkreg_neg_edge_enable;
 
        mmci_write_clkreg(host, clk);
 }
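
Editor's note: mmci_set_clkreg() now ORs in per-variant register bits instead of hard-coded ST-specific ones. Because a variant that lacks a feature simply leaves the field zero, the OR becomes a no-op and no extra conditionals are needed. A small sketch of that default-zero trick, with made-up bit positions:

    #include <stdio.h>

    struct variant {
            unsigned int clkreg_8bit_bus_enable;    /* 0 when unsupported */
            unsigned int clkreg_neg_edge_enable;    /* 0 when unsupported */
    };

    static unsigned int build_clkreg(const struct variant *v,
                                     int bus_width_8, int ddr)
    {
            unsigned int clk = 0;

            if (bus_width_8)
                    clk |= v->clkreg_8bit_bus_enable;   /* no-op if field is 0 */
            if (ddr)
                    clk |= v->clkreg_neg_edge_enable;
            return clk;
    }

    int main(void)
    {
            struct variant st = { 1u << 11, 1u << 13 }, plain = { 0, 0 };

            printf("st: %#x plain: %#x\n",
                   build_clkreg(&st, 1, 1), build_clkreg(&plain, 1, 1));
            return 0;
    }
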
@@ -719,7 +771,7 @@ static void mmci_start_data(struct mmci_host *host, struct mmc_data *data)
        data->bytes_xfered = 0;
 
        clks = (unsigned long long)data->timeout_ns * host->cclk;
-       do_div(clks, 1000000000UL);
+       do_div(clks, NSEC_PER_SEC);
 
        timeout = data->timeout_clks + (unsigned int)clks;
 
@@ -732,6 +784,8 @@ static void mmci_start_data(struct mmci_host *host, struct mmc_data *data)
 
        if (variant->blksz_datactrl16)
                datactrl = MCI_DPSM_ENABLE | (data->blksz << 16);
+       else if (variant->blksz_datactrl4)
+               datactrl = MCI_DPSM_ENABLE | (data->blksz << 4);
        else
                datactrl = MCI_DPSM_ENABLE | blksz_bits << 4;
 
@@ -767,7 +821,7 @@ static void mmci_start_data(struct mmci_host *host, struct mmc_data *data)
 
        if (host->mmc->ios.timing == MMC_TIMING_UHS_DDR50 ||
            host->mmc->ios.timing == MMC_TIMING_MMC_DDR52)
-               datactrl |= MCI_ST_DPSM_DDRMODE;
+               datactrl |= variant->datactrl_mask_ddrmode;
 
        /*
         * Attempt to use DMA operation mode, if this
@@ -812,7 +866,7 @@ mmci_start_command(struct mmci_host *host, struct mmc_command *cmd, u32 c)
 
        if (readl(base + MMCICOMMAND) & MCI_CPSM_ENABLE) {
                writel(0, base + MMCICOMMAND);
-               udelay(1);
+               mmci_reg_delay(host);
        }
 
        c |= cmd->opcode | MCI_CPSM_ENABLE;
@@ -824,6 +878,9 @@ mmci_start_command(struct mmci_host *host, struct mmc_command *cmd, u32 c)
        if (/*interrupt*/0)
                c |= MCI_CPSM_INTERRUPT;
 
+       if (mmc_cmd_type(cmd) == MMC_CMD_ADTC)
+               c |= host->variant->data_cmd_enable;
+
        host->cmd = cmd;
 
        writel(cmd->arg, base + MMCIARGUMENT);
@@ -834,6 +891,10 @@ static void
 mmci_data_irq(struct mmci_host *host, struct mmc_data *data,
              unsigned int status)
 {
+       /* Make sure we have data to handle */
+       if (!data)
+               return;
+
        /* First check for errors */
        if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_STARTBITERR|
                      MCI_TXUNDERRUN|MCI_RXOVERRUN)) {
@@ -902,9 +963,17 @@ mmci_cmd_irq(struct mmci_host *host, struct mmc_command *cmd,
             unsigned int status)
 {
        void __iomem *base = host->base;
-       bool sbc = (cmd == host->mrq->sbc);
-       bool busy_resp = host->variant->busy_detect &&
-                       (cmd->flags & MMC_RSP_BUSY);
+       bool sbc, busy_resp;
+
+       if (!cmd)
+               return;
+
+       sbc = (cmd == host->mrq->sbc);
+       busy_resp = host->variant->busy_detect && (cmd->flags & MMC_RSP_BUSY);
+
+       if (!((status|host->busy_status) & (MCI_CMDCRCFAIL|MCI_CMDTIMEOUT|
+               MCI_CMDSENT|MCI_CMDRESPEND)))
+               return;
 
        /* Check if we need to wait for busy completion. */
        if (host->busy_status && (status & MCI_ST_CARDBUSY))
@@ -957,15 +1026,34 @@ mmci_cmd_irq(struct mmci_host *host, struct mmc_command *cmd,
        }
 }
 
+static int mmci_get_rx_fifocnt(struct mmci_host *host, u32 status, int remain)
+{
+       return remain - (readl(host->base + MMCIFIFOCNT) << 2);
+}
+
+static int mmci_qcom_get_rx_fifocnt(struct mmci_host *host, u32 status, int r)
+{
+       /*
+        * On Qcom SDCC4 only 8 words are used in each burst, so only 8
+        * addresses from the FIFO range should be used.
+        */
+       if (status & MCI_RXFIFOHALFFULL)
+               return host->variant->fifohalfsize;
+       else if (status & MCI_RXDATAAVLBL)
+               return 4;
+
+       return 0;
+}
+
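
Editor's note: the PIO read path is now dispatched through host->get_rx_fifocnt, chosen once at probe time, so the per-word loop stays branch-free. A toy sketch of that probe-time function-pointer selection; the status bits and counts are illustrative stand-ins, not the real MMCI register layout:

    #include <stdio.h>

    struct host {
            int fifohalfsize;
            int (*get_rx_fifocnt)(struct host *h, unsigned int status, int remain);
    };

    static int generic_fifocnt(struct host *h, unsigned int status, int remain)
    {
            (void)h;
            (void)status;
            return remain - 8;          /* stand-in for the MMCIFIFOCNT math */
    }

    static int qcom_fifocnt(struct host *h, unsigned int status, int remain)
    {
            (void)remain;
            if (status & (1u << 4))     /* MCI_RXFIFOHALFFULL stand-in */
                    return h->fifohalfsize;
            if (status & (1u << 5))     /* MCI_RXDATAAVLBL stand-in */
                    return 4;
            return 0;
    }

    int main(void)
    {
            struct host h = { .fifohalfsize = 32 };
            int qcom_fifo = 1;          /* would come from the variant data */

            h.get_rx_fifocnt = qcom_fifo ? qcom_fifocnt : generic_fifocnt;
            printf("count = %d\n", h.get_rx_fifocnt(&h, 1u << 4, 64));
            return 0;
    }
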
 static int mmci_pio_read(struct mmci_host *host, char *buffer, unsigned int remain)
 {
        void __iomem *base = host->base;
        char *ptr = buffer;
-       u32 status;
+       u32 status = readl(host->base + MMCISTATUS);
        int host_remain = host->size;
 
        do {
-               int count = host_remain - (readl(base + MMCIFIFOCNT) << 2);
+               int count = host->get_rx_fifocnt(host, status, host_remain);
 
                if (count > remain)
                        count = remain;
@@ -1132,9 +1220,6 @@ static irqreturn_t mmci_irq(int irq, void *dev_id)
        spin_lock(&host->lock);
 
        do {
-               struct mmc_command *cmd;
-               struct mmc_data *data;
-
                status = readl(host->base + MMCISTATUS);
 
                if (host->singleirq) {
@@ -1154,16 +1239,13 @@ static irqreturn_t mmci_irq(int irq, void *dev_id)
 
                dev_dbg(mmc_dev(host->mmc), "irq0 (data+cmd) %08x\n", status);
 
-               cmd = host->cmd;
-               if ((status|host->busy_status) & (MCI_CMDCRCFAIL|MCI_CMDTIMEOUT|
-                       MCI_CMDSENT|MCI_CMDRESPEND) && cmd)
-                       mmci_cmd_irq(host, cmd, status);
-
-               data = host->data;
-               if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_STARTBITERR|
-                             MCI_TXUNDERRUN|MCI_RXOVERRUN|MCI_DATAEND|
-                             MCI_DATABLOCKEND) && data)
-                       mmci_data_irq(host, data, status);
+               if (host->variant->reversed_irq_handling) {
+                       mmci_data_irq(host, host->data, status);
+                       mmci_cmd_irq(host, host->cmd, status);
+               } else {
+                       mmci_cmd_irq(host, host->cmd, status);
+                       mmci_data_irq(host, host->data, status);
+               }
 
                /* Don't poll for busy completion in irq context. */
                if (host->busy_status)
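
Editor's note: hoisting the NULL and status checks into mmci_cmd_irq()/mmci_data_irq() lets the interrupt handler above call both unconditionally, in whichever order the variant requires. A compact sketch of the guard-clause pattern that makes the order swap safe, using hypothetical types and no real hardware state:

    #include <stdio.h>

    struct cmd;
    struct data;

    static void cmd_irq(struct cmd *cmd, unsigned int status)
    {
            if (!cmd)
                    return;             /* nothing in flight */
            printf("cmd irq, status=%#x\n", status);
    }

    static void data_irq(struct data *data, unsigned int status)
    {
            if (!data)
                    return;
            printf("data irq, status=%#x\n", status);
    }

    static void irq(struct cmd *cmd, struct data *data,
                    unsigned int status, int reversed)
    {
            if (reversed) {             /* some variants want data first */
                    data_irq(data, status);
                    cmd_irq(cmd, status);
            } else {
                    cmd_irq(cmd, status);
                    data_irq(data, status);
            }
    }

    int main(void)
    {
            irq(NULL, NULL, 0x1, 0);    /* both guards trip, nothing printed */
            return 0;
    }
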
@@ -1296,6 +1378,17 @@ static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
        if (!ios->clock && variant->pwrreg_clkgate)
                pwr &= ~MCI_PWR_ON;
 
+       if (host->variant->explicit_mclk_control &&
+           ios->clock != host->clock_cache) {
+               ret = clk_set_rate(host->clk, ios->clock);
+               if (ret < 0)
+                       dev_err(mmc_dev(host->mmc),
+                               "Error setting clock rate (%d)\n", ret);
+               else
+                       host->mclk = clk_get_rate(host->clk);
+       }
+       host->clock_cache = ios->clock;
+
        spin_lock_irqsave(&host->lock, flags);
 
        mmci_set_clkreg(host, ios->clock);
@@ -1443,6 +1536,11 @@ static int mmci_probe(struct amba_device *dev,
        if (ret)
                goto host_free;
 
+       if (variant->qcom_fifo)
+               host->get_rx_fifocnt = mmci_qcom_get_rx_fifocnt;
+       else
+               host->get_rx_fifocnt = mmci_get_rx_fifocnt;
+
        host->plat = plat;
        host->variant = variant;
        host->mclk = clk_get_rate(host->clk);
@@ -1451,8 +1549,8 @@ static int mmci_probe(struct amba_device *dev,
         * so we try to adjust the clock down to this,
         * (if possible).
         */
-       if (host->mclk > 100000000) {
-               ret = clk_set_rate(host->clk, 100000000);
+       if (host->mclk > variant->f_max) {
+               ret = clk_set_rate(host->clk, variant->f_max);
                if (ret < 0)
                        goto clk_disable;
                host->mclk = clk_get_rate(host->clk);
@@ -1471,9 +1569,12 @@ static int mmci_probe(struct amba_device *dev,
         * The ARM and ST versions of the block have slightly different
         * clock divider equations which means that the minimum divider
         * differs too.
+        * On Qualcomm-like controllers, get the nearest minimum clock to 100 kHz.
         */
        if (variant->st_clkdiv)
                mmc->f_min = DIV_ROUND_UP(host->mclk, 257);
+       else if (variant->explicit_mclk_control)
+               mmc->f_min = clk_round_rate(host->clk, 100000);
        else
                mmc->f_min = DIV_ROUND_UP(host->mclk, 512);
        /*
@@ -1483,9 +1584,14 @@ static int mmci_probe(struct amba_device *dev,
         * the block, of course.
         */
        if (mmc->f_max)
-               mmc->f_max = min(host->mclk, mmc->f_max);
+               mmc->f_max = variant->explicit_mclk_control ?
+                               min(variant->f_max, mmc->f_max) :
+                               min(host->mclk, mmc->f_max);
        else
-               mmc->f_max = min(host->mclk, fmax);
+               mmc->f_max = variant->explicit_mclk_control ?
+                               fmax : min(host->mclk, fmax);
+
        dev_dbg(mmc_dev(mmc), "clocking block at %u Hz\n", mmc->f_max);
 
        /* Get regulators and the supported OCR mask */
@@ -1752,6 +1858,12 @@ static struct amba_id mmci_ids[] = {
                .mask   = 0xf0ffffff,
                .data   = &variant_ux500v2,
        },
+       /* Qualcomm variants */
+       {
+               .id     = 0x00051180,
+               .mask   = 0x000fffff,
+               .data   = &variant_qcom,
+       },
        { 0, 0 },
 };
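
Editor's note: the Qualcomm variant is matched through the AMBA periphid table, where each entry pairs an id/mask with a pointer to its variant data. A standalone sketch of that masked-id lookup; the qcom id and mask are taken from the entry above, while the arm entry is illustrative:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct variant { const char *name; unsigned int f_max; };
    struct id_entry { uint32_t id; uint32_t mask; const struct variant *data; };

    static const struct variant variant_arm  = { "arm",  100000000 };
    static const struct variant variant_qcom = { "qcom", 208000000 };

    static const struct id_entry ids[] = {
            { 0x00041180, 0x000fffff, &variant_arm },   /* illustrative */
            { 0x00051180, 0x000fffff, &variant_qcom },  /* as in the table above */
    };

    static const struct variant *match(uint32_t periphid)
    {
            size_t i;

            for (i = 0; i < sizeof(ids) / sizeof(ids[0]); i++)
                    if ((periphid & ids[i].mask) == ids[i].id)
                            return ids[i].data;
            return NULL;
    }

    int main(void)
    {
            const struct variant *v = match(0x00051180);

            if (v)
                    printf("matched %s, f_max=%u\n", v->name, v->f_max);
            return 0;
    }
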
 
index 347d942d740bfd91d0e4aa9feef9ef1f1fd8aab1..a1f5e4f49e2a3367038268f5bfc7ed43961482e4 100644 (file)
 /* Modified PL180 on Versatile Express platform */
 #define MCI_ARM_HWFCEN         (1 << 12)
 
+/* Modified on Qualcomm Integrations */
+#define MCI_QCOM_CLK_WIDEBUS_8 (BIT(10) | BIT(11))
+#define MCI_QCOM_CLK_FLOWENA   BIT(12)
+#define MCI_QCOM_CLK_INVERTOUT BIT(13)
+
+/* select in latch data and command in */
+#define MCI_QCOM_CLK_SELECT_IN_FBCLK   BIT(15)
+#define MCI_QCOM_CLK_SELECT_IN_DDR_MODE        (BIT(14) | BIT(15))
+
 #define MMCIARGUMENT           0x008
 #define MMCICOMMAND            0x00c
 #define MCI_CPSM_RESPONSE      (1 << 6)
 #define MCI_ST_NIEN            (1 << 13)
 #define MCI_ST_CE_ATACMD       (1 << 14)
 
+/* Modified on Qualcomm Integrations */
+#define MCI_QCOM_CSPM_DATCMD           BIT(12)
+#define MCI_QCOM_CSPM_MCIABORT         BIT(13)
+#define MCI_QCOM_CSPM_CCSENABLE                BIT(14)
+#define MCI_QCOM_CSPM_CCSDISABLE       BIT(15)
+#define MCI_QCOM_CSPM_AUTO_CMD19       BIT(16)
+#define MCI_QCOM_CSPM_AUTO_CMD21       BIT(21)
+
 #define MMCIRESPCMD            0x010
 #define MMCIRESPONSE0          0x014
 #define MMCIRESPONSE1          0x018
@@ -191,6 +208,8 @@ struct mmci_host {
        spinlock_t              lock;
 
        unsigned int            mclk;
+       /* cached value of requested clk in set_ios */
+       unsigned int            clock_cache;
        unsigned int            cclk;
        u32                     pwr_reg;
        u32                     pwr_reg_add;
@@ -210,6 +229,7 @@ struct mmci_host {
        /* pio stuff */
        struct sg_mapping_iter  sg_miter;
        unsigned int            size;
+       int (*get_rx_fifocnt)(struct mmci_host *h, u32 status, int remain);
 
 #ifdef CONFIG_DMA_ENGINE
        /* DMA stuff */
index 74924a04026ea58d53e72855ae94dbd181ea7dab..b4b1efbf6c165c21aa1d8f1fe13e6bc9efb57f09 100644 (file)
@@ -13,7 +13,6 @@
  * warranty of any kind, whether express or implied.
  */
 
-#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
index babfea03ba8a0ce50b64fae4e0bcbcc96ef0e60e..140885a5a4e734bbcb3806e39e7601503e1cf599 100644 (file)
@@ -86,7 +86,8 @@ static int mxs_mmc_get_cd(struct mmc_host *mmc)
        if (ret >= 0)
                return ret;
 
-       present = !(readl(ssp->base + HW_SSP_STATUS(ssp)) &
+       present = mmc->caps & MMC_CAP_NEEDS_POLL ||
+               !(readl(ssp->base + HW_SSP_STATUS(ssp)) &
                        BM_SSP_STATUS_CARD_DETECT);
 
        if (mmc->caps2 & MMC_CAP2_CD_ACTIVE_HIGH)
index 6b7b75585926c44d2fc39fdda4f0d58e73ec392a..965672663ef066a3b2e58c5f1a2478c5becdfdda 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/timer.h>
 #include <linux/clk.h>
 #include <linux/of.h>
+#include <linux/of_irq.h>
 #include <linux/of_gpio.h>
 #include <linux/of_device.h>
 #include <linux/omap-dmaengine.h>
@@ -36,6 +37,7 @@
 #include <linux/mmc/core.h>
 #include <linux/mmc/mmc.h>
 #include <linux/io.h>
+#include <linux/irq.h>
 #include <linux/gpio.h>
 #include <linux/regulator/consumer.h>
 #include <linux/pinctrl/consumer.h>
@@ -54,6 +56,7 @@
 #define OMAP_HSMMC_RSP54       0x0118
 #define OMAP_HSMMC_RSP76       0x011C
 #define OMAP_HSMMC_DATA                0x0120
+#define OMAP_HSMMC_PSTATE      0x0124
 #define OMAP_HSMMC_HCTL                0x0128
 #define OMAP_HSMMC_SYSCTL      0x012C
 #define OMAP_HSMMC_STAT                0x0130
 #define BCE                    (1 << 1)
 #define FOUR_BIT               (1 << 1)
 #define HSPE                   (1 << 2)
+#define IWE                    (1 << 24)
 #define DDR                    (1 << 19)
+#define CLKEXTFREE             (1 << 16)
+#define CTPL                   (1 << 11)
 #define DW8                    (1 << 5)
 #define OD                     0x1
 #define STAT_CLEAR             0xFFFFFFFF
 #define SRD                    (1 << 26)
 #define SOFTRESET              (1 << 1)
 
+/* PSTATE */
+#define DLEV_DAT(x)            (1 << (20 + (x)))
+
 /* Interrupt masks for IE and ISE register */
 #define CC_EN                  (1 << 0)
 #define TC_EN                  (1 << 1)
 #define BWR_EN                 (1 << 4)
 #define BRR_EN                 (1 << 5)
+#define CIRQ_EN                        (1 << 8)
 #define ERR_EN                 (1 << 15)
 #define CTO_EN                 (1 << 16)
 #define CCRC_EN                        (1 << 17)
 #define VDD_3V0                        3000000         /* 300000 uV */
 #define VDD_165_195            (ffs(MMC_VDD_165_195) - 1)
 
-#define AUTO_CMD23             (1 << 1)        /* Auto CMD23 support */
 /*
  * One controller can have multiple slots, like on some omap boards using
  * omap.c controller driver. Luckily this is not currently done on any known
@@ -194,6 +203,7 @@ struct omap_hsmmc_host {
        u32                     sysctl;
        u32                     capa;
        int                     irq;
+       int                     wake_irq;
        int                     use_dma, dma_ch;
        struct dma_chan         *tx_chan;
        struct dma_chan         *rx_chan;
@@ -206,6 +216,9 @@ struct omap_hsmmc_host {
        int                     req_in_progress;
        unsigned long           clk_rate;
        unsigned int            flags;
+#define AUTO_CMD23             (1 << 0)        /* Auto CMD23 support */
+#define HSMMC_SDIO_IRQ_ENABLED (1 << 1)        /* SDIO irq enabled */
+#define HSMMC_WAKE_IRQ_ENABLED (1 << 2)
        struct omap_hsmmc_next  next_data;
        struct  omap_mmc_platform_data  *pdata;
 };
@@ -510,27 +523,40 @@ static void omap_hsmmc_stop_clock(struct omap_hsmmc_host *host)
 static void omap_hsmmc_enable_irq(struct omap_hsmmc_host *host,
                                  struct mmc_command *cmd)
 {
-       unsigned int irq_mask;
+       u32 irq_mask = INT_EN_MASK;
+       unsigned long flags;
 
        if (host->use_dma)
-               irq_mask = INT_EN_MASK & ~(BRR_EN | BWR_EN);
-       else
-               irq_mask = INT_EN_MASK;
+               irq_mask &= ~(BRR_EN | BWR_EN);
 
        /* Disable timeout for erases */
        if (cmd->opcode == MMC_ERASE)
                irq_mask &= ~DTO_EN;
 
+       spin_lock_irqsave(&host->irq_lock, flags);
        OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
        OMAP_HSMMC_WRITE(host->base, ISE, irq_mask);
+
+       /* latch pending CIRQ, but don't signal MMC core */
+       if (host->flags & HSMMC_SDIO_IRQ_ENABLED)
+               irq_mask |= CIRQ_EN;
        OMAP_HSMMC_WRITE(host->base, IE, irq_mask);
+       spin_unlock_irqrestore(&host->irq_lock, flags);
 }
 
 static void omap_hsmmc_disable_irq(struct omap_hsmmc_host *host)
 {
-       OMAP_HSMMC_WRITE(host->base, ISE, 0);
-       OMAP_HSMMC_WRITE(host->base, IE, 0);
+       u32 irq_mask = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&host->irq_lock, flags);
+       /* no transfer running but need to keep cirq if enabled */
+       if (host->flags & HSMMC_SDIO_IRQ_ENABLED)
+               irq_mask |= CIRQ_EN;
+       OMAP_HSMMC_WRITE(host->base, ISE, irq_mask);
+       OMAP_HSMMC_WRITE(host->base, IE, irq_mask);
        OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
+       spin_unlock_irqrestore(&host->irq_lock, flags);
 }
 
 /* Calculate divisor for the given clock frequency */
@@ -667,6 +693,9 @@ static int omap_hsmmc_context_restore(struct omap_hsmmc_host *host)
                capa = VS18;
        }
 
+       if (host->mmc->caps & MMC_CAP_SDIO_IRQ)
+               hctl |= IWE;
+
        OMAP_HSMMC_WRITE(host->base, HCTL,
                        OMAP_HSMMC_READ(host->base, HCTL) | hctl);
 
@@ -681,7 +710,9 @@ static int omap_hsmmc_context_restore(struct omap_hsmmc_host *host)
                && time_before(jiffies, timeout))
                ;
 
-       omap_hsmmc_disable_irq(host);
+       OMAP_HSMMC_WRITE(host->base, ISE, 0);
+       OMAP_HSMMC_WRITE(host->base, IE, 0);
+       OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
 
        /* Do not initialize card-specific things if the power is off */
        if (host->power_mode == MMC_POWER_OFF)
@@ -1118,8 +1149,12 @@ static irqreturn_t omap_hsmmc_irq(int irq, void *dev_id)
        int status;
 
        status = OMAP_HSMMC_READ(host->base, STAT);
-       while (status & INT_EN_MASK && host->req_in_progress) {
-               omap_hsmmc_do_irq(host, status);
+       while (status & (INT_EN_MASK | CIRQ_EN)) {
+               if (host->req_in_progress)
+                       omap_hsmmc_do_irq(host, status);
+
+               if (status & CIRQ_EN)
+                       mmc_signal_sdio_irq(host->mmc);
 
                /* Flush posted write */
                status = OMAP_HSMMC_READ(host->base, STAT);
@@ -1128,6 +1163,22 @@ static irqreturn_t omap_hsmmc_irq(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
+static irqreturn_t omap_hsmmc_wake_irq(int irq, void *dev_id)
+{
+       struct omap_hsmmc_host *host = dev_id;
+
+       /* cirq is level triggered, disable to avoid infinite loop */
+       spin_lock(&host->irq_lock);
+       if (host->flags & HSMMC_WAKE_IRQ_ENABLED) {
+               disable_irq_nosync(host->wake_irq);
+               host->flags &= ~HSMMC_WAKE_IRQ_ENABLED;
+       }
+       spin_unlock(&host->irq_lock);
+       pm_request_resume(host->dev); /* no use counter */
+
+       return IRQ_HANDLED;
+}
+
 static void set_sd_bus_power(struct omap_hsmmc_host *host)
 {
        unsigned long i;
@@ -1639,6 +1690,103 @@ static void omap_hsmmc_init_card(struct mmc_host *mmc, struct mmc_card *card)
                mmc_slot(host).init_card(card);
 }
 
+static void omap_hsmmc_enable_sdio_irq(struct mmc_host *mmc, int enable)
+{
+       struct omap_hsmmc_host *host = mmc_priv(mmc);
+       u32 irq_mask, con;
+       unsigned long flags;
+
+       spin_lock_irqsave(&host->irq_lock, flags);
+
+       con = OMAP_HSMMC_READ(host->base, CON);
+       irq_mask = OMAP_HSMMC_READ(host->base, ISE);
+       if (enable) {
+               host->flags |= HSMMC_SDIO_IRQ_ENABLED;
+               irq_mask |= CIRQ_EN;
+               con |= CTPL | CLKEXTFREE;
+       } else {
+               host->flags &= ~HSMMC_SDIO_IRQ_ENABLED;
+               irq_mask &= ~CIRQ_EN;
+               con &= ~(CTPL | CLKEXTFREE);
+       }
+       OMAP_HSMMC_WRITE(host->base, CON, con);
+       OMAP_HSMMC_WRITE(host->base, IE, irq_mask);
+
+       /*
+        * If enabling while a request is in progress, piggy-back CIRQ
+        * detection on that request; disabling always takes effect
+        * immediately.
+        */
+       if (!host->req_in_progress || !enable)
+               OMAP_HSMMC_WRITE(host->base, ISE, irq_mask);
+
+       /* flush posted write */
+       OMAP_HSMMC_READ(host->base, IE);
+
+       spin_unlock_irqrestore(&host->irq_lock, flags);
+}
+
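
Editor's note: omap_hsmmc_enable_sdio_irq() above defers the ISE write while a request is in flight: the wanted mask is always latched in IE, but the live signalling mask only follows once it is safe. A toy model of that deferred-unmask idea; the register names are stand-ins for the real OMAP registers:

    #include <stdbool.h>
    #include <stdio.h>

    #define CIRQ_EN (1u << 8)

    struct host {
            bool         req_in_progress;
            unsigned int ie;    /* latched "wanted" mask */
            unsigned int ise;   /* live, signalled mask */
    };

    static void enable_sdio_irq(struct host *h, bool enable)
    {
            unsigned int mask = h->ise;

            if (enable)
                    mask |= CIRQ_EN;
            else
                    mask &= ~CIRQ_EN;

            h->ie = mask;                   /* always latch pending CIRQ */
            if (!h->req_in_progress || !enable)
                    h->ise = mask;          /* only signal when safe */
    }

    int main(void)
    {
            struct host h = { .req_in_progress = true };

            enable_sdio_irq(&h, true);
            /* ie already carries CIRQ_EN, ise follows when the request ends */
            printf("ie=%#x ise=%#x\n", h.ie, h.ise);
            return 0;
    }
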
+static int omap_hsmmc_configure_wake_irq(struct omap_hsmmc_host *host)
+{
+       struct mmc_host *mmc = host->mmc;
+       int ret;
+
+       /*
+        * For omaps with wake-up path, wakeirq will be irq from pinctrl and
+        * for other omaps, wakeirq will be from GPIO (dat line remuxed to
+        * gpio). wakeirq is needed to detect sdio irq in runtime suspend state
+        * with functional clock disabled.
+        */
+       if (!host->dev->of_node || !host->wake_irq)
+               return -ENODEV;
+
+       /* Prevent auto-enabling of IRQ */
+       irq_set_status_flags(host->wake_irq, IRQ_NOAUTOEN);
+       ret = devm_request_irq(host->dev, host->wake_irq, omap_hsmmc_wake_irq,
+                              IRQF_TRIGGER_LOW | IRQF_ONESHOT,
+                              mmc_hostname(mmc), host);
+       if (ret) {
+               dev_err(mmc_dev(host->mmc), "Unable to request wake IRQ\n");
+               goto err;
+       }
+
+       /*
+        * Some omaps don't have wake-up path from deeper idle states
+        * and need to remux SDIO DAT1 to GPIO for wake-up from idle.
+        */
+       if (host->pdata->controller_flags & OMAP_HSMMC_SWAKEUP_MISSING) {
+               struct pinctrl *p = devm_pinctrl_get(host->dev);
+               if (!p) {
+                       ret = -ENODEV;
+                       goto err_free_irq;
+               }
+               if (IS_ERR(pinctrl_lookup_state(p, PINCTRL_STATE_DEFAULT))) {
+                       dev_info(host->dev, "missing default pinctrl state\n");
+                       devm_pinctrl_put(p);
+                       ret = -EINVAL;
+                       goto err_free_irq;
+               }
+
+               if (IS_ERR(pinctrl_lookup_state(p, PINCTRL_STATE_IDLE))) {
+                       dev_info(host->dev, "missing idle pinctrl state\n");
+                       devm_pinctrl_put(p);
+                       ret = -EINVAL;
+                       goto err_free_irq;
+               }
+               devm_pinctrl_put(p);
+       }
+
+       OMAP_HSMMC_WRITE(host->base, HCTL,
+                        OMAP_HSMMC_READ(host->base, HCTL) | IWE);
+       return 0;
+
+err_free_irq:
+       devm_free_irq(host->dev, host->wake_irq, host);
+err:
+       dev_warn(host->dev, "no SDIO IRQ support, falling back to polling\n");
+       host->wake_irq = 0;
+       return ret;
+}
+
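
Editor's note: omap_hsmmc_configure_wake_irq() treats the wake interrupt as optional; on any failure the driver logs a warning and falls back to polled SDIO. A trivial sketch of that degrade-gracefully probe flow, with a plain int standing in for MMC_CAP_SDIO_IRQ:

    #include <stdio.h>

    /* returns 0 on success, nonzero when no usable wake irq exists */
    static int configure_wake_irq(int wake_irq)
    {
            if (!wake_irq)
                    return -1;      /* -ENODEV in the driver */
            /* request the irq, validate pinctrl states, set IWE ... */
            return 0;
    }

    int main(void)
    {
            int caps = 0;

            if (!configure_wake_irq(0))
                    caps |= 1;      /* MMC_CAP_SDIO_IRQ stand-in */
            printf("sdio irq: %s\n", caps ? "enabled" : "polling fallback");
            return 0;
    }
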
 static void omap_hsmmc_conf_bus_power(struct omap_hsmmc_host *host)
 {
        u32 hctl, capa, value;
@@ -1691,7 +1839,7 @@ static const struct mmc_host_ops omap_hsmmc_ops = {
        .get_cd = omap_hsmmc_get_cd,
        .get_ro = omap_hsmmc_get_ro,
        .init_card = omap_hsmmc_init_card,
-       /* NYET -- enable_sdio_irq */
+       .enable_sdio_irq = omap_hsmmc_enable_sdio_irq,
 };
 
 #ifdef CONFIG_DEBUG_FS
@@ -1701,13 +1849,23 @@ static int omap_hsmmc_regs_show(struct seq_file *s, void *data)
        struct mmc_host *mmc = s->private;
        struct omap_hsmmc_host *host = mmc_priv(mmc);
 
-       seq_printf(s, "mmc%d:\n ctx_loss:\t%d\n\nregs:\n",
-                       mmc->index, host->context_loss);
+       seq_printf(s, "mmc%d:\n", mmc->index);
+       seq_printf(s, "sdio irq mode\t%s\n",
+                  (mmc->caps & MMC_CAP_SDIO_IRQ) ? "interrupt" : "polling");
 
-       pm_runtime_get_sync(host->dev);
+       if (mmc->caps & MMC_CAP_SDIO_IRQ) {
+               seq_printf(s, "sdio irq \t%s\n",
+                          (host->flags & HSMMC_SDIO_IRQ_ENABLED) ?  "enabled"
+                          : "disabled");
+       }
+       seq_printf(s, "ctx_loss:\t%d\n", host->context_loss);
 
+       pm_runtime_get_sync(host->dev);
+       seq_puts(s, "\nregs:\n");
        seq_printf(s, "CON:\t\t0x%08x\n",
                        OMAP_HSMMC_READ(host->base, CON));
+       seq_printf(s, "PSTATE:\t\t0x%08x\n",
+                  OMAP_HSMMC_READ(host->base, PSTATE));
        seq_printf(s, "HCTL:\t\t0x%08x\n",
                        OMAP_HSMMC_READ(host->base, HCTL));
        seq_printf(s, "SYSCTL:\t\t0x%08x\n",
@@ -1761,6 +1919,10 @@ static const struct omap_mmc_of_data omap3_pre_es3_mmc_of_data = {
 static const struct omap_mmc_of_data omap4_mmc_of_data = {
        .reg_offset = 0x100,
 };
+static const struct omap_mmc_of_data am33xx_mmc_of_data = {
+       .reg_offset = 0x100,
+       .controller_flags = OMAP_HSMMC_SWAKEUP_MISSING,
+};
 
 static const struct of_device_id omap_mmc_of_match[] = {
        {
@@ -1777,6 +1939,10 @@ static const struct of_device_id omap_mmc_of_match[] = {
                .compatible = "ti,omap4-hsmmc",
                .data = &omap4_mmc_of_data,
        },
+       {
+               .compatible = "ti,am33xx-hsmmc",
+               .data = &am33xx_mmc_of_data,
+       },
        {},
 };
 MODULE_DEVICE_TABLE(of, omap_mmc_of_match);
@@ -1850,7 +2016,6 @@ static int omap_hsmmc_probe(struct platform_device *pdev)
        const struct of_device_id *match;
        dma_cap_mask_t mask;
        unsigned tx_req, rx_req;
-       struct pinctrl *pinctrl;
        const struct omap_mmc_of_data *data;
        void __iomem *base;
 
@@ -1913,6 +2078,9 @@ static int omap_hsmmc_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, host);
 
+       if (pdev->dev.of_node)
+               host->wake_irq = irq_of_parse_and_map(pdev->dev.of_node, 1);
+
        mmc->ops        = &omap_hsmmc_ops;
 
        mmc->f_min = OMAP_MMC_MIN_CLOCK;
@@ -2061,10 +2229,17 @@ static int omap_hsmmc_probe(struct platform_device *pdev)
 
        omap_hsmmc_disable_irq(host);
 
-       pinctrl = devm_pinctrl_get_select_default(&pdev->dev);
-       if (IS_ERR(pinctrl))
-               dev_warn(&pdev->dev,
-                       "pins are not configured from the driver\n");
+       /*
+        * For now, only support SDIO interrupt if we have a separate
+        * wake-up interrupt configured from device tree. This is because
+        * the wake-up interrupt is needed for idle state and some
+        * platforms need special quirks. And we don't want to add new
+        * legacy mux platform init code callbacks any longer as we
+        * are moving to DT-based booting anyway.
+        */
+       ret = omap_hsmmc_configure_wake_irq(host);
+       if (!ret)
+               mmc->caps |= MMC_CAP_SDIO_IRQ;
 
        omap_hsmmc_protect_card(host);
 
@@ -2170,11 +2345,18 @@ static int omap_hsmmc_suspend(struct device *dev)
        pm_runtime_get_sync(host->dev);
 
        if (!(host->mmc->pm_flags & MMC_PM_KEEP_POWER)) {
-               omap_hsmmc_disable_irq(host);
+               OMAP_HSMMC_WRITE(host->base, ISE, 0);
+               OMAP_HSMMC_WRITE(host->base, IE, 0);
+               OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
                OMAP_HSMMC_WRITE(host->base, HCTL,
                                OMAP_HSMMC_READ(host->base, HCTL) & ~SDBP);
        }
 
+       /* do not wake up due to sdio irq */
+       if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) &&
+           !(host->mmc->pm_flags & MMC_PM_WAKE_SDIO_IRQ))
+               disable_irq(host->wake_irq);
+
        if (host->dbclk)
                clk_disable_unprepare(host->dbclk);
 
@@ -2200,6 +2382,10 @@ static int omap_hsmmc_resume(struct device *dev)
 
        omap_hsmmc_protect_card(host);
 
+       if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) &&
+           !(host->mmc->pm_flags & MMC_PM_WAKE_SDIO_IRQ))
+               enable_irq(host->wake_irq);
+
        pm_runtime_mark_last_busy(host->dev);
        pm_runtime_put_autosuspend(host->dev);
        return 0;
@@ -2215,22 +2401,77 @@ static int omap_hsmmc_resume(struct device *dev)
 static int omap_hsmmc_runtime_suspend(struct device *dev)
 {
        struct omap_hsmmc_host *host;
+       unsigned long flags;
+       int ret = 0;
 
        host = platform_get_drvdata(to_platform_device(dev));
        omap_hsmmc_context_save(host);
        dev_dbg(dev, "disabled\n");
 
-       return 0;
+       spin_lock_irqsave(&host->irq_lock, flags);
+       if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) &&
+           (host->flags & HSMMC_SDIO_IRQ_ENABLED)) {
+               /* disable sdio irq handling to prevent race */
+               OMAP_HSMMC_WRITE(host->base, ISE, 0);
+               OMAP_HSMMC_WRITE(host->base, IE, 0);
+
+               if (!(OMAP_HSMMC_READ(host->base, PSTATE) & DLEV_DAT(1))) {
+                       /*
+                        * dat1 line low, pending sdio irq
+                        * race condition: possible irq handler running on
+                        * multi-core, abort
+                        */
+                       dev_dbg(dev, "pending sdio irq, abort suspend\n");
+                       OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
+                       OMAP_HSMMC_WRITE(host->base, ISE, CIRQ_EN);
+                       OMAP_HSMMC_WRITE(host->base, IE, CIRQ_EN);
+                       pm_runtime_mark_last_busy(dev);
+                       ret = -EBUSY;
+                       goto abort;
+               }
+
+               pinctrl_pm_select_idle_state(dev);
+
+               WARN_ON(host->flags & HSMMC_WAKE_IRQ_ENABLED);
+               enable_irq(host->wake_irq);
+               host->flags |= HSMMC_WAKE_IRQ_ENABLED;
+       } else {
+               pinctrl_pm_select_idle_state(dev);
+       }
+
+abort:
+       spin_unlock_irqrestore(&host->irq_lock, flags);
+       return ret;
 }
 
 static int omap_hsmmc_runtime_resume(struct device *dev)
 {
        struct omap_hsmmc_host *host;
+       unsigned long flags;
 
        host = platform_get_drvdata(to_platform_device(dev));
        omap_hsmmc_context_restore(host);
        dev_dbg(dev, "enabled\n");
 
+       spin_lock_irqsave(&host->irq_lock, flags);
+       if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) &&
+           (host->flags & HSMMC_SDIO_IRQ_ENABLED)) {
+               /* sdio irq flag can't change while in runtime suspend */
+               if (host->flags & HSMMC_WAKE_IRQ_ENABLED) {
+                       disable_irq_nosync(host->wake_irq);
+                       host->flags &= ~HSMMC_WAKE_IRQ_ENABLED;
+               }
+
+               pinctrl_pm_select_default_state(host->dev);
+
+               /* an irq may have been lost if the pinmux was incorrect */
+               OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
+               OMAP_HSMMC_WRITE(host->base, ISE, CIRQ_EN);
+               OMAP_HSMMC_WRITE(host->base, IE, CIRQ_EN);
+       } else {
+               pinctrl_pm_select_default_state(host->dev);
+       }
+       spin_unlock_irqrestore(&host->irq_lock, flags);
        return 0;
 }
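
Editor's note: the runtime-suspend path above closes a race with a pending SDIO interrupt: after masking, it re-checks the DAT1 level and aborts with -EBUSY if an IRQ is already asserted, letting the handler run instead of suspending. A user-space model of that mask, re-check, abort sequence; dat1_low is a stand-in for reading PSTATE DLEV_DAT(1):

    #include <stdbool.h>
    #include <stdio.h>

    #define EBUSY 16

    static bool dat1_low;   /* true models a pending SDIO irq on DAT1 */

    static int runtime_suspend(void)
    {
            /* 1. mask the controller interrupt */
            /* 2. re-check: a low DAT1 means an sdio irq is already pending */
            if (dat1_low) {
                    /* unmask again and bail out; the handler will run */
                    return -EBUSY;
            }
            /* 3. safe to hand wake-up duty to the GPIO/pinctrl wake irq */
            return 0;
    }

    int main(void)
    {
            dat1_low = true;
            printf("suspend -> %d (expected -EBUSY)\n", runtime_suspend());
            dat1_low = false;
            printf("suspend -> %d (expected 0)\n", runtime_suspend());
            return 0;
    }
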
 
index f23782683a7c223dfa22db288cf308ef8c21ed1b..e5516a226362dc0c4e98c39215bfe2c55f87b992 100644 (file)
@@ -12,6 +12,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
 #include <linux/clk.h>
 #include <linux/mmc/host.h>
@@ -27,6 +28,7 @@
 #include <mach/dma.h>
 #include <mach/gpio-samsung.h>
 
+#include <linux/platform_data/dma-s3c24xx.h>
 #include <linux/platform_data/mmc-s3cmci.h>
 
 #include "s3cmci.h"
@@ -140,10 +142,6 @@ static const int dbgmap_debug = dbg_err | dbg_debug;
                dev_dbg(&host->pdev->dev, args);  \
        } while (0)
 
-static struct s3c2410_dma_client s3cmci_dma_client = {
-       .name           = "s3c-mci",
-};
-
 static void finalize_request(struct s3cmci_host *host);
 static void s3cmci_send_request(struct mmc_host *mmc);
 static void s3cmci_reset(struct s3cmci_host *host);
@@ -256,25 +254,8 @@ static inline bool s3cmci_host_usedma(struct s3cmci_host *host)
 {
 #ifdef CONFIG_MMC_S3C_PIO
        return false;
-#elif defined(CONFIG_MMC_S3C_DMA)
+#else /* CONFIG_MMC_S3C_DMA */
        return true;
-#else
-       return host->dodma;
-#endif
-}
-
-/**
- * s3cmci_host_canpio - return true if host has pio code available
- *
- * Return true if the driver has been compiled with the PIO support code
- * available.
- */
-static inline bool s3cmci_host_canpio(void)
-{
-#ifdef CONFIG_MMC_S3C_PIO
-       return true;
-#else
-       return false;
 #endif
 }
 
@@ -841,60 +822,24 @@ static irqreturn_t s3cmci_irq_cd(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
-static void s3cmci_dma_done_callback(struct s3c2410_dma_chan *dma_ch,
-                                    void *buf_id, int size,
-                                    enum s3c2410_dma_buffresult result)
+static void s3cmci_dma_done_callback(void *arg)
 {
-       struct s3cmci_host *host = buf_id;
+       struct s3cmci_host *host = arg;
        unsigned long iflags;
-       u32 mci_csta, mci_dsta, mci_fsta, mci_dcnt;
-
-       mci_csta = readl(host->base + S3C2410_SDICMDSTAT);
-       mci_dsta = readl(host->base + S3C2410_SDIDSTA);
-       mci_fsta = readl(host->base + S3C2410_SDIFSTA);
-       mci_dcnt = readl(host->base + S3C2410_SDIDCNT);
 
        BUG_ON(!host->mrq);
        BUG_ON(!host->mrq->data);
-       BUG_ON(!host->dmatogo);
 
        spin_lock_irqsave(&host->complete_lock, iflags);
 
-       if (result != S3C2410_RES_OK) {
-               dbg(host, dbg_fail, "DMA FAILED: csta=0x%08x dsta=0x%08x "
-                       "fsta=0x%08x dcnt:0x%08x result:0x%08x toGo:%u\n",
-                       mci_csta, mci_dsta, mci_fsta,
-                       mci_dcnt, result, host->dmatogo);
-
-               goto fail_request;
-       }
-
-       host->dmatogo--;
-       if (host->dmatogo) {
-               dbg(host, dbg_dma, "DMA DONE  Size:%i DSTA:[%08x] "
-                       "DCNT:[%08x] toGo:%u\n",
-                       size, mci_dsta, mci_dcnt, host->dmatogo);
-
-               goto out;
-       }
-
-       dbg(host, dbg_dma, "DMA FINISHED Size:%i DSTA:%08x DCNT:%08x\n",
-               size, mci_dsta, mci_dcnt);
+       dbg(host, dbg_dma, "DMA FINISHED\n");
 
        host->dma_complete = 1;
        host->complete_what = COMPLETION_FINALIZE;
 
-out:
        tasklet_schedule(&host->pio_tasklet);
        spin_unlock_irqrestore(&host->complete_lock, iflags);
-       return;
 
-fail_request:
-       host->mrq->data->error = -EINVAL;
-       host->complete_what = COMPLETION_FINALIZE;
-       clear_imask(host);
-
-       goto out;
 }
 
 static void finalize_request(struct s3cmci_host *host)
@@ -966,7 +911,7 @@ static void finalize_request(struct s3cmci_host *host)
         * DMA channel and the fifo to clear out any garbage. */
        if (mrq->data->error != 0) {
                if (s3cmci_host_usedma(host))
-                       s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_FLUSH);
+                       dmaengine_terminate_all(host->dma);
 
                if (host->is2440) {
                        /* Clear failure register and reset fifo. */
@@ -992,29 +937,6 @@ request_done:
        mmc_request_done(host->mmc, mrq);
 }
 
-static void s3cmci_dma_setup(struct s3cmci_host *host,
-                            enum dma_data_direction source)
-{
-       static enum dma_data_direction last_source = -1;
-       static int setup_ok;
-
-       if (last_source == source)
-               return;
-
-       last_source = source;
-
-       s3c2410_dma_devconfig(host->dma, source,
-                             host->mem->start + host->sdidata);
-
-       if (!setup_ok) {
-               s3c2410_dma_config(host->dma, 4);
-               s3c2410_dma_set_buffdone_fn(host->dma,
-                                           s3cmci_dma_done_callback);
-               s3c2410_dma_setflags(host->dma, S3C2410_DMAF_AUTOSTART);
-               setup_ok = 1;
-       }
-}
-
 static void s3cmci_send_command(struct s3cmci_host *host,
                                        struct mmc_command *cmd)
 {
@@ -1162,43 +1084,45 @@ static int s3cmci_prepare_pio(struct s3cmci_host *host, struct mmc_data *data)
 
 static int s3cmci_prepare_dma(struct s3cmci_host *host, struct mmc_data *data)
 {
-       int dma_len, i;
        int rw = data->flags & MMC_DATA_WRITE;
+       struct dma_async_tx_descriptor *desc;
+       struct dma_slave_config conf = {
+               .src_addr = host->mem->start + host->sdidata,
+               .dst_addr = host->mem->start + host->sdidata,
+               .src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES,
+               .dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES,
+       };
 
        BUG_ON((data->flags & BOTH_DIR) == BOTH_DIR);
 
-       s3cmci_dma_setup(host, rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-       s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_FLUSH);
-
-       dma_len = dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
-                            rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-
-       if (dma_len == 0)
-               return -ENOMEM;
-
-       host->dma_complete = 0;
-       host->dmatogo = dma_len;
-
-       for (i = 0; i < dma_len; i++) {
-               int res;
-
-               dbg(host, dbg_dma, "enqueue %i: %08x@%u\n", i,
-                   sg_dma_address(&data->sg[i]),
-                   sg_dma_len(&data->sg[i]));
+       /* Restore prescaler value */
+       writel(host->prescaler, host->base + S3C2410_SDIPRE);
 
-               res = s3c2410_dma_enqueue(host->dma, host,
-                                         sg_dma_address(&data->sg[i]),
-                                         sg_dma_len(&data->sg[i]));
+       if (!rw)
+               conf.direction = DMA_DEV_TO_MEM;
+       else
+               conf.direction = DMA_MEM_TO_DEV;
 
-               if (res) {
-                       s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_FLUSH);
-                       return -EBUSY;
-               }
-       }
+       dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
+                            rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 
-       s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_START);
+       dmaengine_slave_config(host->dma, &conf);
+       desc = dmaengine_prep_slave_sg(host->dma, data->sg, data->sg_len,
+               conf.direction,
+               DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
+       if (!desc)
+               goto unmap_exit;
+       desc->callback = s3cmci_dma_done_callback;
+       desc->callback_param = host;
+       dmaengine_submit(desc);
+       dma_async_issue_pending(host->dma);
 
        return 0;
+
+unmap_exit:
+       dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
+                            rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+       return -ENOMEM;
 }
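
Editor's note: s3cmci_prepare_dma() now follows the canonical dmaengine slave flow: configure the channel, prep_slave_sg, attach a completion callback, submit, then issue_pending. A toy model of that submit-and-callback shape, with the "hardware" completing synchronously; none of this is the real dmaengine API:

    #include <stdio.h>

    typedef void (*dma_cb)(void *arg);

    struct desc {
            dma_cb callback;
            void  *param;
    };

    static struct desc pending;

    static void submit(struct desc *d)
    {
            pending = *d;               /* queue the prepared descriptor */
    }

    static void issue_pending(void)
    {
            /* the modeled hardware "finishes" immediately */
            if (pending.callback)
                    pending.callback(pending.param);
    }

    static void done(void *arg)
    {
            printf("transfer complete for %s\n", (const char *)arg);
    }

    int main(void)
    {
            struct desc d = { .callback = done, .param = "s3cmci" };

            submit(&d);
            issue_pending();
            return 0;
    }
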
 
 static void s3cmci_send_request(struct mmc_host *mmc)
@@ -1676,10 +1600,6 @@ static int s3cmci_probe(struct platform_device *pdev)
        host->complete_what     = COMPLETION_NONE;
        host->pio_active        = XFER_NONE;
 
-#ifdef CONFIG_MMC_S3C_PIODMA
-       host->dodma             = host->pdata->use_dma;
-#endif
-
        host->mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        if (!host->mem) {
                dev_err(&pdev->dev,
@@ -1765,17 +1685,17 @@ static int s3cmci_probe(struct platform_device *pdev)
        /* depending on the dma state, get a dma channel to use. */
 
        if (s3cmci_host_usedma(host)) {
-               host->dma = s3c2410_dma_request(DMACH_SDI, &s3cmci_dma_client,
-                                               host);
-               if (host->dma < 0) {
+               dma_cap_mask_t mask;
+
+               dma_cap_zero(mask);
+               dma_cap_set(DMA_SLAVE, mask);
+
+               host->dma = dma_request_slave_channel_compat(mask,
+                       s3c24xx_dma_filter, (void *)DMACH_SDI, &pdev->dev, "rx-tx");
+               if (!host->dma) {
                        dev_err(&pdev->dev, "cannot get DMA channel.\n");
-                       if (!s3cmci_host_canpio()) {
-                               ret = -EBUSY;
-                               goto probe_free_gpio_wp;
-                       } else {
-                               dev_warn(&pdev->dev, "falling back to PIO.\n");
-                               host->dodma = 0;
-                       }
+                       ret = -EBUSY;
+                       goto probe_free_gpio_wp;
                }
        }
 
@@ -1787,7 +1707,7 @@ static int s3cmci_probe(struct platform_device *pdev)
                goto probe_free_dma;
        }
 
-       ret = clk_enable(host->clk);
+       ret = clk_prepare_enable(host->clk);
        if (ret) {
                dev_err(&pdev->dev, "failed to enable clock source.\n");
                goto clk_free;
@@ -1816,7 +1736,7 @@ static int s3cmci_probe(struct platform_device *pdev)
        mmc->max_segs           = 128;
 
        dbg(host, dbg_debug,
-           "probe: mode:%s mapped mci_base:%p irq:%u irq_cd:%u dma:%u.\n",
+           "probe: mode:%s mapped mci_base:%p irq:%u irq_cd:%u dma:%p.\n",
            (host->is2440?"2440":""),
            host->base, host->irq, host->irq_cd, host->dma);
 
@@ -1845,14 +1765,14 @@ static int s3cmci_probe(struct platform_device *pdev)
        s3cmci_cpufreq_deregister(host);
 
  free_dmabuf:
-       clk_disable(host->clk);
+       clk_disable_unprepare(host->clk);
 
  clk_free:
        clk_put(host->clk);
 
  probe_free_dma:
        if (s3cmci_host_usedma(host))
-               s3c2410_dma_free(host->dma, &s3cmci_dma_client);
+               dma_release_channel(host->dma);
 
  probe_free_gpio_wp:
        if (!host->pdata->no_wprotect)
@@ -1897,7 +1817,7 @@ static void s3cmci_shutdown(struct platform_device *pdev)
        s3cmci_debugfs_remove(host);
        s3cmci_cpufreq_deregister(host);
        mmc_remove_host(mmc);
-       clk_disable(host->clk);
+       clk_disable_unprepare(host->clk);
 }
 
 static int s3cmci_remove(struct platform_device *pdev)
@@ -1914,7 +1834,7 @@ static int s3cmci_remove(struct platform_device *pdev)
        tasklet_disable(&host->pio_tasklet);
 
        if (s3cmci_host_usedma(host))
-               s3c2410_dma_free(host->dma, &s3cmci_dma_client);
+               dma_release_channel(host->dma);
 
        free_irq(host->irq, host);
 
index c76b53dbeb6179a3545bbb26b7aa87d50302477e..cc2e46cb5c643b07c791543852cd62f7eb29f1ad 100644 (file)
@@ -26,7 +26,7 @@ struct s3cmci_host {
        void __iomem            *base;
        int                     irq;
        int                     irq_cd;
-       int                     dma;
+       struct dma_chan         *dma;
 
        unsigned long           clk_rate;
        unsigned long           clk_div;
@@ -36,8 +36,6 @@ struct s3cmci_host {
        int                     is2440;
        unsigned                sdiimsk;
        unsigned                sdidata;
-       int                     dodma;
-       int                     dmatogo;
 
        bool                    irq_disabled;
        bool                    irq_enabled;
index 8ce3c28cb76ed503e9ea16605e059fa93780ea1a..8c5337002c5137ec5658cc79f929c25f378ab0f8 100644 (file)
@@ -124,9 +124,11 @@ static const struct sdhci_acpi_chip sdhci_acpi_chip_int = {
 
 static const struct sdhci_acpi_slot sdhci_acpi_slot_int_emmc = {
        .chip    = &sdhci_acpi_chip_int,
-       .caps    = MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE | MMC_CAP_HW_RESET,
+       .caps    = MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE |
+                  MMC_CAP_HW_RESET | MMC_CAP_1_8V_DDR,
        .caps2   = MMC_CAP2_HC_ERASE_SZ,
        .flags   = SDHCI_ACPI_RUNTIME_PM,
+       .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN,
 };
 
 static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sdio = {
index 40573a58486a165e659853b327b7c8e8f886863d..1a6661ed62050fda39cea0a57428abd2bb09e9e0 100644 (file)
@@ -16,7 +16,6 @@
 
 #include <linux/module.h>
 #include <linux/of_device.h>
-#include <linux/regulator/consumer.h>
 #include <linux/delay.h>
 #include <linux/mmc/mmc.h>
 #include <linux/slab.h>
index 52c42fcc284c4b3dca6cd889b562954d9bfc6352..c3a1debc9289860755a207cf1315ef544bdda21c 100644 (file)
@@ -103,6 +103,10 @@ static const struct sdhci_pci_fixes sdhci_cafe = {
                          SDHCI_QUIRK_BROKEN_TIMEOUT_VAL,
 };
 
+static const struct sdhci_pci_fixes sdhci_intel_qrk = {
+       .quirks         = SDHCI_QUIRK_NO_HISPD_BIT,
+};
+
 static int mrst_hc_probe_slot(struct sdhci_pci_slot *slot)
 {
        slot->host->mmc->caps |= MMC_CAP_8_BIT_DATA;
@@ -264,7 +268,7 @@ static void sdhci_pci_int_hw_reset(struct sdhci_host *host)
 static int byt_emmc_probe_slot(struct sdhci_pci_slot *slot)
 {
        slot->host->mmc->caps |= MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE |
-                                MMC_CAP_HW_RESET;
+                                MMC_CAP_HW_RESET | MMC_CAP_1_8V_DDR;
        slot->host->mmc->caps2 |= MMC_CAP2_HC_ERASE_SZ;
        slot->hw_reset = sdhci_pci_int_hw_reset;
        return 0;
@@ -279,6 +283,7 @@ static int byt_sdio_probe_slot(struct sdhci_pci_slot *slot)
 static const struct sdhci_pci_fixes sdhci_intel_byt_emmc = {
        .allow_runtime_pm = true,
        .probe_slot     = byt_emmc_probe_slot,
+       .quirks2        = SDHCI_QUIRK2_PRESET_VALUE_BROKEN,
 };
 
 static const struct sdhci_pci_fixes sdhci_intel_byt_sdio = {
@@ -751,6 +756,14 @@ static const struct pci_device_id pci_ids[] = {
                .driver_data    = (kernel_ulong_t)&sdhci_rtsx,
        },
 
+       {
+               .vendor         = PCI_VENDOR_ID_INTEL,
+               .device         = PCI_DEVICE_ID_INTEL_QRK_SD,
+               .subvendor      = PCI_ANY_ID,
+               .subdevice      = PCI_ANY_ID,
+               .driver_data    = (kernel_ulong_t)&sdhci_intel_qrk,
+       },
+
        {
                .vendor         = PCI_VENDOR_ID_INTEL,
                .device         = PCI_DEVICE_ID_INTEL_MRST_SD0,
@@ -1130,18 +1143,13 @@ static int sdhci_pci_suspend(struct device *dev)
                        goto err_pci_suspend;
        }
 
-       pci_save_state(pdev);
        if (pm_flags & MMC_PM_KEEP_POWER) {
-               if (pm_flags & MMC_PM_WAKE_SDIO_IRQ) {
-                       pci_pme_active(pdev, true);
-                       pci_enable_wake(pdev, PCI_D3hot, 1);
-               }
-               pci_set_power_state(pdev, PCI_D3hot);
-       } else {
-               pci_enable_wake(pdev, PCI_D3hot, 0);
-               pci_disable_device(pdev);
-               pci_set_power_state(pdev, PCI_D3hot);
-       }
+               if (pm_flags & MMC_PM_WAKE_SDIO_IRQ)
+                       device_init_wakeup(dev, true);
+               else
+                       device_init_wakeup(dev, false);
+       } else
+               device_init_wakeup(dev, false);
 
        return 0;
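
The suspend-path rewrite above leans on the PCI core: for drivers bound through dev_pm_ops, the core already performs pci_save_state(), selects the sleep state and arms platform wake-up, so the open-coded pci_set_power_state()/pci_enable_device() sequences were redundant (the matching block disappears from sdhci_pci_resume() below). All that is left for the driver is wakeup policy; incidentally, the nested if/else here reduces to a single device_init_wakeup() call with the combined condition. A minimal sketch of the resulting shape, with hypothetical names, not this driver's code:

	static int foo_suspend(struct device *dev)
	{
		struct foo_chip *chip = dev_get_drvdata(dev);	/* hypothetical */
		int ret;

		ret = foo_quiesce(chip);	/* device-specific work, hypothetical */
		if (ret)
			return ret;

		/* Only state whether the device may wake the system; the PCI
		 * core handles the state save and the D-state transition. */
		device_init_wakeup(dev, chip->keep_power && chip->wants_wake);
		return 0;
	}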
 
@@ -1162,12 +1170,6 @@ static int sdhci_pci_resume(struct device *dev)
        if (!chip)
                return 0;
 
-       pci_set_power_state(pdev, PCI_D0);
-       pci_restore_state(pdev);
-       ret = pci_enable_device(pdev);
-       if (ret)
-               return ret;
-
        if (chip->fixes && chip->fixes->resume) {
                ret = chip->fixes->resume(chip);
                if (ret)
index 6d718719659e48abbc63ff7376f26b0dd6696d3b..c101477ef3be28364b31b50c92c88dd72909e3d6 100644
@@ -17,6 +17,7 @@
 #define PCI_DEVICE_ID_INTEL_CLV_SDIO2  0x08fb
 #define PCI_DEVICE_ID_INTEL_CLV_EMMC0  0x08e5
 #define PCI_DEVICE_ID_INTEL_CLV_EMMC1  0x08e6
+#define PCI_DEVICE_ID_INTEL_QRK_SD     0x08A7
 
 /*
  * PCI registers
index f4f128947561266e63b47f74c3af458c22aca27b..6f842fb8e6b81834de13f0ffef3e19af2b9cb3a0 100644
@@ -288,15 +288,13 @@ static int sdhci_pxav3_probe(struct platform_device *pdev)
        int ret;
        struct clk *clk;
 
-       pxa = kzalloc(sizeof(struct sdhci_pxa), GFP_KERNEL);
+       pxa = devm_kzalloc(&pdev->dev, sizeof(struct sdhci_pxa), GFP_KERNEL);
        if (!pxa)
                return -ENOMEM;
 
        host = sdhci_pltfm_init(pdev, &sdhci_pxav3_pdata, 0);
-       if (IS_ERR(host)) {
-               kfree(pxa);
+       if (IS_ERR(host))
                return PTR_ERR(host);
-       }
 
        if (of_device_is_compatible(np, "marvell,armada-380-sdhci")) {
                ret = mv_conf_mbus_windows(pdev, mv_mbus_dram_info());
@@ -308,7 +306,7 @@ static int sdhci_pxav3_probe(struct platform_device *pdev)
        pltfm_host = sdhci_priv(host);
        pltfm_host->priv = pxa;
 
-       clk = clk_get(dev, NULL);
+       clk = devm_clk_get(dev, NULL);
        if (IS_ERR(clk)) {
                dev_err(dev, "failed to get io clock\n");
                ret = PTR_ERR(clk);
@@ -389,11 +387,9 @@ err_add_host:
        pm_runtime_put_sync(&pdev->dev);
        pm_runtime_disable(&pdev->dev);
        clk_disable_unprepare(clk);
-       clk_put(clk);
 err_clk_get:
 err_mbus_win:
        sdhci_pltfm_free(pdev);
-       kfree(pxa);
        return ret;
 }
 
@@ -401,17 +397,14 @@ static int sdhci_pxav3_remove(struct platform_device *pdev)
 {
        struct sdhci_host *host = platform_get_drvdata(pdev);
        struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
-       struct sdhci_pxa *pxa = pltfm_host->priv;
 
        pm_runtime_get_sync(&pdev->dev);
        sdhci_remove_host(host, 1);
        pm_runtime_disable(&pdev->dev);
 
        clk_disable_unprepare(pltfm_host->clk);
-       clk_put(pltfm_host->clk);
 
        sdhci_pltfm_free(pdev);
-       kfree(pxa);
 
        return 0;
 }
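
This conversion is the standard managed-resource (devm_*) cleanup: the private data and the clock reference become tied to the device's lifetime, so the probe error paths and remove() lose their kfree()/clk_put() pairs and cannot leak on a forgotten unwind. A minimal sketch of the idiom under hypothetical names:

	#include <linux/clk.h>
	#include <linux/device.h>
	#include <linux/err.h>
	#include <linux/platform_device.h>
	#include <linux/slab.h>

	struct demo_priv {
		struct clk *clk;
	};

	static int demo_probe(struct platform_device *pdev)
	{
		struct demo_priv *priv;

		/* Freed automatically on probe failure or driver unbind. */
		priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
		if (!priv)
			return -ENOMEM;

		/* Reference dropped automatically as well: no clk_put() in
		 * any error path or in remove(). */
		priv->clk = devm_clk_get(&pdev->dev, NULL);
		if (IS_ERR(priv->clk))
			return PTR_ERR(priv->clk);

		platform_set_drvdata(pdev, priv);
		return 0;
	}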
diff --git a/drivers/mmc/host/sdhci-st.c b/drivers/mmc/host/sdhci-st.c
new file mode 100644
index 0000000..328f348
--- /dev/null
+++ b/drivers/mmc/host/sdhci-st.c
@@ -0,0 +1,176 @@
+/*
+ * Support for SDHCI on STMicroelectronics SoCs
+ *
+ * Copyright (C) 2014 STMicroelectronics Ltd
+ * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
+ * Contributors: Peter Griffin <peter.griffin@linaro.org>
+ *
+ * Based on sdhci-cns3xxx.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/io.h>
+#include <linux/of.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/mmc/host.h>
+
+#include "sdhci-pltfm.h"
+
+static u32 sdhci_st_readl(struct sdhci_host *host, int reg)
+{
+       u32 ret;
+
+       switch (reg) {
+       case SDHCI_CAPABILITIES:
+               ret = readl_relaxed(host->ioaddr + reg);
+               /* Support 3.3V and 1.8V */
+               ret &= ~SDHCI_CAN_VDD_300;
+               break;
+       default:
+               ret = readl_relaxed(host->ioaddr + reg);
+       }
+       return ret;
+}
+
+static const struct sdhci_ops sdhci_st_ops = {
+       .get_max_clock = sdhci_pltfm_clk_get_max_clock,
+       .set_clock = sdhci_set_clock,
+       .set_bus_width = sdhci_set_bus_width,
+       .read_l = sdhci_st_readl,
+       .reset = sdhci_reset,
+};
+
+static const struct sdhci_pltfm_data sdhci_st_pdata = {
+       .ops = &sdhci_st_ops,
+       .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC |
+           SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN,
+};
+
+
+static int sdhci_st_probe(struct platform_device *pdev)
+{
+       struct sdhci_host *host;
+       struct sdhci_pltfm_host *pltfm_host;
+       struct clk *clk;
+       int ret = 0;
+       u16 host_version;
+
+       clk =  devm_clk_get(&pdev->dev, "mmc");
+       if (IS_ERR(clk)) {
+               dev_err(&pdev->dev, "Peripheral clk not found\n");
+               return PTR_ERR(clk);
+       }
+
+       host = sdhci_pltfm_init(pdev, &sdhci_st_pdata, 0);
+       if (IS_ERR(host)) {
+               dev_err(&pdev->dev, "Failed sdhci_pltfm_init\n");
+               return PTR_ERR(host);
+       }
+
+       ret = mmc_of_parse(host->mmc);
+
+       if (ret) {
+               dev_err(&pdev->dev, "Failed mmc_of_parse\n");
+               return ret;
+       }
+
+       clk_prepare_enable(clk);
+
+       pltfm_host = sdhci_priv(host);
+       pltfm_host->clk = clk;
+
+       ret = sdhci_add_host(host);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed sdhci_add_host\n");
+               goto err_out;
+       }
+
+       platform_set_drvdata(pdev, host);
+
+       host_version = readw_relaxed((host->ioaddr + SDHCI_HOST_VERSION));
+
+       dev_info(&pdev->dev, "SDHCI ST Initialised: Host Version: 0x%x Vendor Version 0x%x\n",
+               ((host_version & SDHCI_SPEC_VER_MASK) >> SDHCI_SPEC_VER_SHIFT),
+               ((host_version & SDHCI_VENDOR_VER_MASK) >>
+               SDHCI_VENDOR_VER_SHIFT));
+
+       return 0;
+
+err_out:
+       clk_disable_unprepare(clk);
+       sdhci_pltfm_free(pdev);
+
+       return ret;
+}
+
+static int sdhci_st_remove(struct platform_device *pdev)
+{
+       struct sdhci_host *host = platform_get_drvdata(pdev);
+       struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+
+       clk_disable_unprepare(pltfm_host->clk);
+
+       return sdhci_pltfm_unregister(pdev);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int sdhci_st_suspend(struct device *dev)
+{
+       struct sdhci_host *host = dev_get_drvdata(dev);
+       struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+       int ret = sdhci_suspend_host(host);
+
+       if (ret)
+               goto out;
+
+       clk_disable_unprepare(pltfm_host->clk);
+out:
+       return ret;
+}
+
+static int sdhci_st_resume(struct device *dev)
+{
+       struct sdhci_host *host = dev_get_drvdata(dev);
+       struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+
+       clk_prepare_enable(pltfm_host->clk);
+
+       return sdhci_resume_host(host);
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(sdhci_st_pmops, sdhci_st_suspend, sdhci_st_resume);
+
+static const struct of_device_id st_sdhci_match[] = {
+       { .compatible = "st,sdhci" },
+       {},
+};
+
+MODULE_DEVICE_TABLE(of, st_sdhci_match);
+
+static struct platform_driver sdhci_st_driver = {
+       .probe = sdhci_st_probe,
+       .remove = sdhci_st_remove,
+       .driver = {
+                  .name = "sdhci-st",
+                  .pm = &sdhci_st_pmops,
+                  .of_match_table = of_match_ptr(st_sdhci_match),
+                 },
+};
+
+module_platform_driver(sdhci_st_driver);
+
+MODULE_DESCRIPTION("SDHCI driver for STMicroelectronics SoCs");
+MODULE_AUTHOR("Giuseppe Cavallaro <peppe.cavallaro@st.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:st-sdhci");
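
Two review notes on the new driver. First, the .read_l hook masks SDHCI_CAN_VDD_300 out of the capabilities register so the core advertises only 3.3V and 1.8V operation. Second, two probe paths are slightly fragile: clk_prepare_enable() can fail but its return value is ignored, and the mmc_of_parse() error path returns without sdhci_pltfm_free(), leaking the host it just allocated. A stricter shape for the clock step, sketched:

	ret = clk_prepare_enable(clk);
	if (ret) {
		dev_err(&pdev->dev, "Failed to enable clock\n");
		sdhci_pltfm_free(pdev);
		return ret;
	}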
index d93a063a36f37bc5d25c6f6c1b3beba5c8d544a9..33100d10d17685b732279dc5c2d42b3f8950ee97 100644
@@ -26,8 +26,6 @@
 #include <linux/mmc/host.h>
 #include <linux/mmc/slot-gpio.h>
 
-#include <asm/gpio.h>
-
 #include "sdhci-pltfm.h"
 
 /* Tegra SDHOST controller vendor register definitions */
index 47055f3f01b8580e01ff147232d106bc14db3667..37b2a9ae52eff16cd44649f42fb4822ff05c89db 100644
@@ -1223,8 +1223,16 @@ EXPORT_SYMBOL_GPL(sdhci_set_clock);
 static void sdhci_set_power(struct sdhci_host *host, unsigned char mode,
                            unsigned short vdd)
 {
+       struct mmc_host *mmc = host->mmc;
        u8 pwr = 0;
 
+       if (!IS_ERR(mmc->supply.vmmc)) {
+               spin_unlock_irq(&host->lock);
+               mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, vdd);
+               spin_lock_irq(&host->lock);
+               return;
+       }
+
        if (mode != MMC_POWER_OFF) {
                switch (1 << vdd) {
                case MMC_VDD_165_195:
@@ -1283,12 +1291,6 @@ static void sdhci_set_power(struct sdhci_host *host, unsigned char mode,
                if (host->quirks & SDHCI_QUIRK_DELAY_AFTER_POWER)
                        mdelay(10);
        }
-
-       if (host->vmmc) {
-               spin_unlock_irq(&host->lock);
-               mmc_regulator_set_ocr(host->mmc, host->vmmc, vdd);
-               spin_lock_irq(&host->lock);
-       }
 }
 
 /*****************************************************************************\
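
Hoisting the regulator branch to the top of sdhci_set_power() makes an external vmmc supply authoritative: when one exists, the OCR is programmed through the regulator and the controller's power register is never touched, instead of being programmed and then overridden as before. The unlock/lock bracket is the standard pattern for calling a sleeping API from a section that otherwise holds a spinlock with IRQs off; the caller must tolerate host state changing across the gap. The shape, as a fragment:

	spin_unlock_irq(&host->lock);
	/* may sleep: the regulator core takes mutexes and may hit I2C */
	mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, vdd);
	spin_lock_irq(&host->lock);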
@@ -1440,13 +1442,15 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios)
 {
        unsigned long flags;
        u8 ctrl;
+       struct mmc_host *mmc = host->mmc;
 
        spin_lock_irqsave(&host->lock, flags);
 
        if (host->flags & SDHCI_DEVICE_DEAD) {
                spin_unlock_irqrestore(&host->lock, flags);
-               if (host->vmmc && ios->power_mode == MMC_POWER_OFF)
-                       mmc_regulator_set_ocr(host->mmc, host->vmmc, 0);
+               if (!IS_ERR(mmc->supply.vmmc) &&
+                   ios->power_mode == MMC_POWER_OFF)
+                       mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, 0);
                return;
        }
 
@@ -1530,7 +1534,6 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios)
                        host->ops->set_clock(host, host->clock);
                }
 
-
                /* Reset SD Clock Enable */
                clk = sdhci_readw(host, SDHCI_CLOCK_CONTROL);
                clk &= ~SDHCI_CLOCK_CARD_EN;
@@ -1707,6 +1710,7 @@ static void sdhci_enable_sdio_irq(struct mmc_host *mmc, int enable)
 static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host,
                                                struct mmc_ios *ios)
 {
+       struct mmc_host *mmc = host->mmc;
        u16 ctrl;
        int ret;
 
@@ -1725,11 +1729,12 @@ static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host,
                ctrl &= ~SDHCI_CTRL_VDD_180;
                sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2);
 
-               if (host->vqmmc) {
-                       ret = regulator_set_voltage(host->vqmmc, 2700000, 3600000);
+               if (!IS_ERR(mmc->supply.vqmmc)) {
+                       ret = regulator_set_voltage(mmc->supply.vqmmc, 2700000,
+                                                   3600000);
                        if (ret) {
                                pr_warning("%s: Switching to 3.3V signalling voltage "
-                                               " failed\n", mmc_hostname(host->mmc));
+                                               " failed\n", mmc_hostname(mmc));
                                return -EIO;
                        }
                }
@@ -1742,16 +1747,16 @@ static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host,
                        return 0;
 
                pr_warning("%s: 3.3V regulator output did not became stable\n",
-                               mmc_hostname(host->mmc));
+                               mmc_hostname(mmc));
 
                return -EAGAIN;
        case MMC_SIGNAL_VOLTAGE_180:
-               if (host->vqmmc) {
-                       ret = regulator_set_voltage(host->vqmmc,
+               if (!IS_ERR(mmc->supply.vqmmc)) {
+                       ret = regulator_set_voltage(mmc->supply.vqmmc,
                                        1700000, 1950000);
                        if (ret) {
                                pr_warning("%s: Switching to 1.8V signalling voltage "
-                                               " failed\n", mmc_hostname(host->mmc));
+                                               " failed\n", mmc_hostname(mmc));
                                return -EIO;
                        }
                }
@@ -1763,24 +1768,22 @@ static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host,
                ctrl |= SDHCI_CTRL_VDD_180;
                sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2);
 
-               /* Wait for 5ms */
-               usleep_range(5000, 5500);
-
                /* 1.8V regulator output should be stable within 5 ms */
                ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2);
                if (ctrl & SDHCI_CTRL_VDD_180)
                        return 0;
 
                pr_warning("%s: 1.8V regulator output did not became stable\n",
-                               mmc_hostname(host->mmc));
+                               mmc_hostname(mmc));
 
                return -EAGAIN;
        case MMC_SIGNAL_VOLTAGE_120:
-               if (host->vqmmc) {
-                       ret = regulator_set_voltage(host->vqmmc, 1100000, 1300000);
+               if (!IS_ERR(mmc->supply.vqmmc)) {
+                       ret = regulator_set_voltage(mmc->supply.vqmmc, 1100000,
+                                                   1300000);
                        if (ret) {
                                pr_warning("%s: Switching to 1.2V signalling voltage "
-                                               " failed\n", mmc_hostname(host->mmc));
+                                               " failed\n", mmc_hostname(mmc));
                                return -EIO;
                        }
                }
@@ -2643,7 +2646,6 @@ static void sdhci_runtime_pm_bus_off(struct sdhci_host *host)
 int sdhci_runtime_suspend_host(struct sdhci_host *host)
 {
        unsigned long flags;
-       int ret = 0;
 
        /* Disable tuning since we are suspending */
        if (host->flags & SDHCI_USING_RETUNING_TIMER) {
@@ -2663,14 +2665,14 @@ int sdhci_runtime_suspend_host(struct sdhci_host *host)
        host->runtime_suspended = true;
        spin_unlock_irqrestore(&host->lock, flags);
 
-       return ret;
+       return 0;
 }
 EXPORT_SYMBOL_GPL(sdhci_runtime_suspend_host);
 
 int sdhci_runtime_resume_host(struct sdhci_host *host)
 {
        unsigned long flags;
-       int ret = 0, host_flags = host->flags;
+       int host_flags = host->flags;
 
        if (host_flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA)) {
                if (host->ops->enable_dma)
@@ -2709,7 +2711,7 @@ int sdhci_runtime_resume_host(struct sdhci_host *host)
 
        spin_unlock_irqrestore(&host->lock, flags);
 
-       return ret;
+       return 0;
 }
 EXPORT_SYMBOL_GPL(sdhci_runtime_resume_host);
 
@@ -2820,12 +2822,12 @@ int sdhci_add_host(struct sdhci_host *host)
                 * (128) and potentially one alignment transfer for
                 * each of those entries.
                 */
-               host->adma_desc = dma_alloc_coherent(mmc_dev(host->mmc),
+               host->adma_desc = dma_alloc_coherent(mmc_dev(mmc),
                                                     ADMA_SIZE, &host->adma_addr,
                                                     GFP_KERNEL);
                host->align_buffer = kmalloc(128 * 4, GFP_KERNEL);
                if (!host->adma_desc || !host->align_buffer) {
-                       dma_free_coherent(mmc_dev(host->mmc), ADMA_SIZE,
+                       dma_free_coherent(mmc_dev(mmc), ADMA_SIZE,
                                          host->adma_desc, host->adma_addr);
                        kfree(host->align_buffer);
                        pr_warning("%s: Unable to allocate ADMA "
@@ -2838,7 +2840,7 @@ int sdhci_add_host(struct sdhci_host *host)
                        pr_warning("%s: unable to allocate aligned ADMA descriptor\n",
                                   mmc_hostname(mmc));
                        host->flags &= ~SDHCI_USE_ADMA;
-                       dma_free_coherent(mmc_dev(host->mmc), ADMA_SIZE,
+                       dma_free_coherent(mmc_dev(mmc), ADMA_SIZE,
                                          host->adma_desc, host->adma_addr);
                        kfree(host->align_buffer);
                        host->adma_desc = NULL;
@@ -2853,7 +2855,7 @@ int sdhci_add_host(struct sdhci_host *host)
         */
        if (!(host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA))) {
                host->dma_mask = DMA_BIT_MASK(64);
-               mmc_dev(host->mmc)->dma_mask = &host->dma_mask;
+               mmc_dev(mmc)->dma_mask = &host->dma_mask;
        }
 
        if (host->version >= SDHCI_SPEC_300)
@@ -2959,28 +2961,25 @@ int sdhci_add_host(struct sdhci_host *host)
                mmc->caps |= MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED;
 
        if ((host->quirks & SDHCI_QUIRK_BROKEN_CARD_DETECTION) &&
-           !(host->mmc->caps & MMC_CAP_NONREMOVABLE))
+           !(mmc->caps & MMC_CAP_NONREMOVABLE))
                mmc->caps |= MMC_CAP_NEEDS_POLL;
 
+       /* If there are external regulators, get them */
+       if (mmc_regulator_get_supply(mmc) == -EPROBE_DEFER)
+               return -EPROBE_DEFER;
+
        /* If vqmmc regulator and no 1.8V signalling, then there's no UHS */
-       host->vqmmc = regulator_get_optional(mmc_dev(mmc), "vqmmc");
-       if (IS_ERR_OR_NULL(host->vqmmc)) {
-               if (PTR_ERR(host->vqmmc) < 0) {
-                       pr_info("%s: no vqmmc regulator found\n",
-                               mmc_hostname(mmc));
-                       host->vqmmc = NULL;
-               }
-       } else {
-               ret = regulator_enable(host->vqmmc);
-               if (!regulator_is_supported_voltage(host->vqmmc, 1700000,
-                       1950000))
+       if (!IS_ERR(mmc->supply.vqmmc)) {
+               ret = regulator_enable(mmc->supply.vqmmc);
+               if (!regulator_is_supported_voltage(mmc->supply.vqmmc, 1700000,
+                                                   1950000))
                        caps[1] &= ~(SDHCI_SUPPORT_SDR104 |
                                        SDHCI_SUPPORT_SDR50 |
                                        SDHCI_SUPPORT_DDR50);
                if (ret) {
                        pr_warn("%s: Failed to enable vqmmc regulator: %d\n",
                                mmc_hostname(mmc), ret);
-                       host->vqmmc = NULL;
+                       mmc->supply.vqmmc = NULL;
                }
        }
 
@@ -3041,34 +3040,6 @@ int sdhci_add_host(struct sdhci_host *host)
 
        ocr_avail = 0;
 
-       host->vmmc = regulator_get_optional(mmc_dev(mmc), "vmmc");
-       if (IS_ERR_OR_NULL(host->vmmc)) {
-               if (PTR_ERR(host->vmmc) < 0) {
-                       pr_info("%s: no vmmc regulator found\n",
-                               mmc_hostname(mmc));
-                       host->vmmc = NULL;
-               }
-       }
-
-#ifdef CONFIG_REGULATOR
-       /*
-        * Voltage range check makes sense only if regulator reports
-        * any voltage value.
-        */
-       if (host->vmmc && regulator_get_voltage(host->vmmc) > 0) {
-               ret = regulator_is_supported_voltage(host->vmmc, 2700000,
-                       3600000);
-               if ((ret <= 0) || (!(caps[0] & SDHCI_CAN_VDD_330)))
-                       caps[0] &= ~SDHCI_CAN_VDD_330;
-               if ((ret <= 0) || (!(caps[0] & SDHCI_CAN_VDD_300)))
-                       caps[0] &= ~SDHCI_CAN_VDD_300;
-               ret = regulator_is_supported_voltage(host->vmmc, 1700000,
-                       1950000);
-               if ((ret <= 0) || (!(caps[0] & SDHCI_CAN_VDD_180)))
-                       caps[0] &= ~SDHCI_CAN_VDD_180;
-       }
-#endif /* CONFIG_REGULATOR */
-
        /*
         * According to SD Host Controller spec v3.00, if the Host System
         * can afford more than 150mA, Host Driver should set XPC to 1. Also
@@ -3077,8 +3048,8 @@ int sdhci_add_host(struct sdhci_host *host)
         * value.
         */
        max_current_caps = sdhci_readl(host, SDHCI_MAX_CURRENT);
-       if (!max_current_caps && host->vmmc) {
-               u32 curr = regulator_get_current_limit(host->vmmc);
+       if (!max_current_caps && !IS_ERR(mmc->supply.vmmc)) {
+               u32 curr = regulator_get_current_limit(mmc->supply.vmmc);
                if (curr > 0) {
 
                        /* convert to SDHCI_MAX_CURRENT format */
@@ -3118,8 +3089,12 @@ int sdhci_add_host(struct sdhci_host *host)
                                   SDHCI_MAX_CURRENT_MULTIPLIER;
        }
 
+       /* If OCR set by external regulators, use it instead */
+       if (mmc->ocr_avail)
+               ocr_avail = mmc->ocr_avail;
+
        if (host->ocr_mask)
-               ocr_avail = host->ocr_mask;
+               ocr_avail &= host->ocr_mask;
 
        mmc->ocr_avail = ocr_avail;
        mmc->ocr_avail_sdio = ocr_avail;
@@ -3273,6 +3248,7 @@ EXPORT_SYMBOL_GPL(sdhci_add_host);
 
 void sdhci_remove_host(struct sdhci_host *host, int dead)
 {
+       struct mmc_host *mmc = host->mmc;
        unsigned long flags;
 
        if (dead) {
@@ -3282,7 +3258,7 @@ void sdhci_remove_host(struct sdhci_host *host, int dead)
 
                if (host->mrq) {
                        pr_err("%s: Controller removed during "
-                               " transfer!\n", mmc_hostname(host->mmc));
+                               " transfer!\n", mmc_hostname(mmc));
 
                        host->mrq->cmd->error = -ENOMEDIUM;
                        tasklet_schedule(&host->finish_tasklet);
@@ -3293,7 +3269,7 @@ void sdhci_remove_host(struct sdhci_host *host, int dead)
 
        sdhci_disable_card_detection(host);
 
-       mmc_remove_host(host->mmc);
+       mmc_remove_host(mmc);
 
 #ifdef SDHCI_USE_LEDS_CLASS
        led_classdev_unregister(&host->led);
@@ -3310,18 +3286,14 @@ void sdhci_remove_host(struct sdhci_host *host, int dead)
 
        tasklet_kill(&host->finish_tasklet);
 
-       if (host->vmmc) {
-               regulator_disable(host->vmmc);
-               regulator_put(host->vmmc);
-       }
+       if (!IS_ERR(mmc->supply.vmmc))
+               regulator_disable(mmc->supply.vmmc);
 
-       if (host->vqmmc) {
-               regulator_disable(host->vqmmc);
-               regulator_put(host->vqmmc);
-       }
+       if (!IS_ERR(mmc->supply.vqmmc))
+               regulator_disable(mmc->supply.vqmmc);
 
        if (host->adma_desc)
-               dma_free_coherent(mmc_dev(host->mmc), ADMA_SIZE,
+               dma_free_coherent(mmc_dev(mmc), ADMA_SIZE,
                                  host->adma_desc, host->adma_addr);
        kfree(host->align_buffer);
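
The sdhci.c changes in this file all serve one migration: the driver-private host->vmmc/host->vqmmc pointers are replaced by the core-managed mmc->supply pair. mmc_regulator_get_supply() fetches both as devm regulators (hence no more regulator_put() in sdhci_remove_host()), returns -EPROBE_DEFER when a supply is declared but not yet registered, and fills mmc->ocr_avail from the vmmc voltage ranges, which is what replaces the deleted CONFIG_REGULATOR voltage-check block; host->ocr_mask now restricts (&=) that result rather than overriding it. Absent supplies are left as ERR_PTR() values, which is why every NULL test becomes an IS_ERR() test. One wrinkle: on regulator_enable() failure the code sets mmc->supply.vqmmc = NULL, which the later IS_ERR() checks treat as a valid regulator; an ERR_PTR() value would match the new convention. Consumer-side sketch of the pattern (fragment, context hypothetical):

	ret = mmc_regulator_get_supply(mmc);
	if (ret == -EPROBE_DEFER)
		return ret;	/* retry probe once the regulator shows up */

	if (!IS_ERR(mmc->supply.vmmc))
		ocr = mmc->ocr_avail;	/* already derived from vmmc's ranges */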
 
index 656fbba4c4223f275dffba85ebbc29ffb4574174..d11708c815d721ba7dd9626ff21a523331ac4392 100644
@@ -386,7 +386,7 @@ sh_mmcif_request_dma_one(struct sh_mmcif_host *host,
                         struct sh_mmcif_plat_data *pdata,
                         enum dma_transfer_direction direction)
 {
-       struct dma_slave_config cfg;
+       struct dma_slave_config cfg = { 0, };
        struct dma_chan *chan;
        unsigned int slave_id;
        struct resource *res;
@@ -417,8 +417,15 @@ sh_mmcif_request_dma_one(struct sh_mmcif_host *host,
        /* In the OF case the driver will get the slave ID from the DT */
        cfg.slave_id = slave_id;
        cfg.direction = direction;
-       cfg.dst_addr = res->start + MMCIF_CE_DATA;
-       cfg.src_addr = 0;
+
+       if (direction == DMA_DEV_TO_MEM) {
+               cfg.src_addr = res->start + MMCIF_CE_DATA;
+               cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
+       } else {
+               cfg.dst_addr = res->start + MMCIF_CE_DATA;
+               cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
+       }
+
        ret = dmaengine_slave_config(chan, &cfg);
        if (ret < 0) {
                dma_release_channel(chan);
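
Two fixes share this hunk. The config struct is now zero-initialized, which matters because dmaengine providers treat any non-zero address or width field as a real request, so unset members must be zero. And only the FIFO-side address plus bus width for the active direction is programmed, where the old code always set dst_addr. The general shape, as a fragment with a hypothetical fifo_phys:

	struct dma_slave_config cfg = { 0, };	/* unset fields must be zero */

	cfg.direction = direction;
	if (direction == DMA_DEV_TO_MEM) {	/* device -> memory (read) */
		cfg.src_addr = fifo_phys;
		cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
	} else {				/* memory -> device (write) */
		cfg.dst_addr = fifo_phys;
		cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
	}
	ret = dmaengine_slave_config(chan, &cfg);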
@@ -1378,26 +1385,19 @@ static int sh_mmcif_probe(struct platform_device *pdev)
                dev_err(&pdev->dev, "Get irq error\n");
                return -ENXIO;
        }
+
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       if (!res) {
-               dev_err(&pdev->dev, "platform_get_resource error.\n");
-               return -ENXIO;
-       }
-       reg = ioremap(res->start, resource_size(res));
-       if (!reg) {
-               dev_err(&pdev->dev, "ioremap error.\n");
-               return -ENOMEM;
-       }
+       reg = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(reg))
+               return PTR_ERR(reg);
 
        mmc = mmc_alloc_host(sizeof(struct sh_mmcif_host), &pdev->dev);
-       if (!mmc) {
-               ret = -ENOMEM;
-               goto ealloch;
-       }
+       if (!mmc)
+               return -ENOMEM;
 
        ret = mmc_of_parse(mmc);
        if (ret < 0)
-               goto eofparse;
+               goto err_host;
 
        host            = mmc_priv(mmc);
        host->mmc       = mmc;
@@ -1427,19 +1427,19 @@ static int sh_mmcif_probe(struct platform_device *pdev)
        pm_runtime_enable(&pdev->dev);
        host->power = false;
 
-       host->hclk = clk_get(&pdev->dev, NULL);
+       host->hclk = devm_clk_get(&pdev->dev, NULL);
        if (IS_ERR(host->hclk)) {
                ret = PTR_ERR(host->hclk);
                dev_err(&pdev->dev, "cannot get clock: %d\n", ret);
-               goto eclkget;
+               goto err_pm;
        }
        ret = sh_mmcif_clk_update(host);
        if (ret < 0)
-               goto eclkupdate;
+               goto err_pm;
 
        ret = pm_runtime_resume(&pdev->dev);
        if (ret < 0)
-               goto eresume;
+               goto err_clk;
 
        INIT_DELAYED_WORK(&host->timeout_work, mmcif_timeout_work);
 
@@ -1447,65 +1447,55 @@ static int sh_mmcif_probe(struct platform_device *pdev)
        sh_mmcif_writel(host->addr, MMCIF_CE_INT_MASK, MASK_ALL);
 
        name = irq[1] < 0 ? dev_name(&pdev->dev) : "sh_mmc:error";
-       ret = request_threaded_irq(irq[0], sh_mmcif_intr, sh_mmcif_irqt, 0, name, host);
+       ret = devm_request_threaded_irq(&pdev->dev, irq[0], sh_mmcif_intr,
+                                       sh_mmcif_irqt, 0, name, host);
        if (ret) {
                dev_err(&pdev->dev, "request_irq error (%s)\n", name);
-               goto ereqirq0;
+               goto err_clk;
        }
        if (irq[1] >= 0) {
-               ret = request_threaded_irq(irq[1], sh_mmcif_intr, sh_mmcif_irqt,
-                                          0, "sh_mmc:int", host);
+               ret = devm_request_threaded_irq(&pdev->dev, irq[1],
+                                               sh_mmcif_intr, sh_mmcif_irqt,
+                                               0, "sh_mmc:int", host);
                if (ret) {
                        dev_err(&pdev->dev, "request_irq error (sh_mmc:int)\n");
-                       goto ereqirq1;
+                       goto err_clk;
                }
        }
 
        if (pd && pd->use_cd_gpio) {
                ret = mmc_gpio_request_cd(mmc, pd->cd_gpio, 0);
                if (ret < 0)
-                       goto erqcd;
+                       goto err_clk;
        }
 
        mutex_init(&host->thread_lock);
 
-       clk_disable_unprepare(host->hclk);
        ret = mmc_add_host(mmc);
        if (ret < 0)
-               goto emmcaddh;
+               goto err_clk;
 
        dev_pm_qos_expose_latency_limit(&pdev->dev, 100);
 
-       dev_info(&pdev->dev, "driver version %s\n", DRIVER_VERSION);
-       dev_dbg(&pdev->dev, "chip ver H'%04x\n",
-               sh_mmcif_readl(host->addr, MMCIF_CE_VERSION) & 0x0000ffff);
+       dev_info(&pdev->dev, "Chip version 0x%04x, clock rate %luMHz\n",
+                sh_mmcif_readl(host->addr, MMCIF_CE_VERSION) & 0xffff,
+                clk_get_rate(host->hclk) / 1000000UL);
+
+       clk_disable_unprepare(host->hclk);
        return ret;
 
-emmcaddh:
-erqcd:
-       if (irq[1] >= 0)
-               free_irq(irq[1], host);
-ereqirq1:
-       free_irq(irq[0], host);
-ereqirq0:
-       pm_runtime_suspend(&pdev->dev);
-eresume:
+err_clk:
        clk_disable_unprepare(host->hclk);
-eclkupdate:
-       clk_put(host->hclk);
-eclkget:
+err_pm:
        pm_runtime_disable(&pdev->dev);
-eofparse:
+err_host:
        mmc_free_host(mmc);
-ealloch:
-       iounmap(reg);
        return ret;
 }
 
 static int sh_mmcif_remove(struct platform_device *pdev)
 {
        struct sh_mmcif_host *host = platform_get_drvdata(pdev);
-       int irq[2];
 
        host->dying = true;
        clk_prepare_enable(host->hclk);
@@ -1523,16 +1513,6 @@ static int sh_mmcif_remove(struct platform_device *pdev)
         */
        cancel_delayed_work_sync(&host->timeout_work);
 
-       if (host->addr)
-               iounmap(host->addr);
-
-       irq[0] = platform_get_irq(pdev, 0);
-       irq[1] = platform_get_irq(pdev, 1);
-
-       free_irq(irq[0], host);
-       if (irq[1] >= 0)
-               free_irq(irq[1], host);
-
        clk_disable_unprepare(host->hclk);
        mmc_free_host(host->mmc);
        pm_runtime_put_sync(&pdev->dev);
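
The probe/remove rewrite is the same devm story as sdhci-pxav3 above: devm_ioremap_resource() folds in the platform_get_resource() NULL check and the iounmap(), while devm_clk_get() and devm_request_threaded_irq() retire clk_put() and the free_irq() unwinding. The error labels are also renamed for what they undo rather than where they were reached from; the rule they follow, sketched: unwind in reverse order of acquisition, each goto targeting the first resource that still needs manual release.

	err_clk:
		clk_disable_unprepare(host->hclk);	/* undoes sh_mmcif_clk_update() */
	err_pm:
		pm_runtime_disable(&pdev->dev);		/* undoes pm_runtime_enable() */
	err_host:
		mmc_free_host(mmc);			/* undoes mmc_alloc_host() */
		return ret;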
index 03e7b280cb4c71addd6d10fe52989a3206dd6c18..eb8f1d5c34b157f51c01210a273bf2c81257ac2d 100644
@@ -294,6 +294,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat
                        cfg.slave_id = pdata->dma->slave_id_tx;
                cfg.direction = DMA_MEM_TO_DEV;
                cfg.dst_addr = res->start + (CTL_SD_DATA_PORT << host->pdata->bus_shift);
+               cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES;
                cfg.src_addr = 0;
                ret = dmaengine_slave_config(host->chan_tx, &cfg);
                if (ret < 0)
@@ -312,6 +313,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat
                        cfg.slave_id = pdata->dma->slave_id_rx;
                cfg.direction = DMA_DEV_TO_MEM;
                cfg.src_addr = cfg.dst_addr;
+               cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES;
                cfg.dst_addr = 0;
                ret = dmaengine_slave_config(host->chan_rx, &cfg);
                if (ret < 0)
index 282891a8e451e77040faaf26e5267e70595cc4ba..54181b4f6e9eab58cdb057e6d3d85b92ee2cadf0 100644
@@ -72,7 +72,6 @@
 #define BM_SPI_CS                      0x20
 #define BM_SD_POWER                    0x40
 #define BM_SOFT_RESET                  0x80
-#define BM_ONEBIT_MASK                 0xFD
 
 /* SDMMC_BLKLEN bit fields */
 #define BLKL_CRCERR_ABORT              0x0800
 #define STS2_DATARSP_BUSY              0x20
 #define STS2_DIS_FORCECLK              0x80
 
+/* SDMMC_EXTCTRL bit fields */
+#define EXT_EIGHTBIT                   0x04
 
 /* MMC/SD DMA Controller Registers */
 #define SDDMA_GCR                      0x100
@@ -672,7 +673,7 @@ static void wmt_mci_request(struct mmc_host *mmc, struct mmc_request *req)
 static void wmt_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 {
        struct wmt_mci_priv *priv;
-       u32 reg_tmp;
+       u32 busmode, extctrl;
 
        priv = mmc_priv(mmc);
 
@@ -687,28 +688,26 @@ static void wmt_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
        if (ios->clock != 0)
                clk_set_rate(priv->clk_sdmmc, ios->clock);
 
+       busmode = readb(priv->sdmmc_base + SDMMC_BUSMODE);
+       extctrl = readb(priv->sdmmc_base + SDMMC_EXTCTRL);
+
+       busmode &= ~(BM_EIGHTBIT_MODE | BM_FOURBIT_MODE);
+       extctrl &= ~EXT_EIGHTBIT;
+
        switch (ios->bus_width) {
        case MMC_BUS_WIDTH_8:
-               reg_tmp = readb(priv->sdmmc_base + SDMMC_EXTCTRL);
-               writeb(reg_tmp | 0x04, priv->sdmmc_base + SDMMC_EXTCTRL);
+               busmode |= BM_EIGHTBIT_MODE;
+               extctrl |= EXT_EIGHTBIT;
                break;
        case MMC_BUS_WIDTH_4:
-               reg_tmp = readb(priv->sdmmc_base + SDMMC_BUSMODE);
-               writeb(reg_tmp | BM_FOURBIT_MODE, priv->sdmmc_base +
-                      SDMMC_BUSMODE);
-
-               reg_tmp = readb(priv->sdmmc_base + SDMMC_EXTCTRL);
-               writeb(reg_tmp & 0xFB, priv->sdmmc_base + SDMMC_EXTCTRL);
+               busmode |= BM_FOURBIT_MODE;
                break;
        case MMC_BUS_WIDTH_1:
-               reg_tmp = readb(priv->sdmmc_base + SDMMC_BUSMODE);
-               writeb(reg_tmp & BM_ONEBIT_MASK, priv->sdmmc_base +
-                      SDMMC_BUSMODE);
-
-               reg_tmp = readb(priv->sdmmc_base + SDMMC_EXTCTRL);
-               writeb(reg_tmp & 0xFB, priv->sdmmc_base + SDMMC_EXTCTRL);
                break;
        }
+
+       writeb(busmode, priv->sdmmc_base + SDMMC_BUSMODE);
+       writeb(extctrl, priv->sdmmc_base + SDMMC_EXTCTRL);
 }
 
 static int wmt_mci_get_ro(struct mmc_host *mmc)
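
The set_ios rewrite trades the per-case readb()/writeb() pairs and magic masks (0x04, 0xFB, and the now-deleted BM_ONEBIT_MASK) for one read-modify-write pass: read both registers, clear every width bit down to the 1-bit default, set only what the requested width needs, then write each register once. The 8-bit control bit also gains a name, EXT_EIGHTBIT. The idiom, as a fragment with hypothetical register and bit names:

	u8 reg = readb(base + WIDTH_REG);

	reg &= ~(WIDTH_8BIT | WIDTH_4BIT);	/* known state: 1-bit default */
	if (bus_width == MMC_BUS_WIDTH_8)
		reg |= WIDTH_8BIT;
	else if (bus_width == MMC_BUS_WIDTH_4)
		reg |= WIDTH_4BIT;
	writeb(reg, base + WIDTH_REG);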
@@ -830,7 +829,7 @@ static int wmt_mci_probe(struct platform_device *pdev)
                goto fail3;
        }
 
-       ret = request_irq(dma_irq, wmt_mci_dma_isr, 32, "sdmmc", priv);
+       ret = request_irq(dma_irq, wmt_mci_dma_isr, 0, "sdmmc", priv);
        if (ret) {
                dev_err(&pdev->dev, "Register DMA IRQ fail\n");
                goto fail4;
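
The DMA interrupt request previously passed a bare 32 as the flags argument, which happens to be the numeric value of the long-defunct IRQF_DISABLED (0x20); since that flag has been a no-op for years, 0 is the honest spelling:

	ret = request_irq(dma_irq, wmt_mci_dma_isr, 0 /* no IRQF_* flags */,
			  "sdmmc", priv);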
index 8457df7ec5afe548d29f6faaa4560dd5db6fb815..33c64955d4d7b3e4bd1d9fc09b08b42b1113db34 100644
@@ -378,9 +378,11 @@ int ubiblock_create(struct ubi_volume_info *vi)
 {
        struct ubiblock *dev;
        struct gendisk *gd;
-       int disk_capacity;
+       u64 disk_capacity = ((u64)vi->size * vi->usable_leb_size) >> 9;
        int ret;
 
+       if ((sector_t)disk_capacity != disk_capacity)
+               return -EFBIG;
        /* Check that the volume isn't already handled */
        mutex_lock(&devices_mutex);
        if (find_dev_nolock(vi->ubi_num, vi->vol_id)) {
@@ -412,7 +414,6 @@ int ubiblock_create(struct ubi_volume_info *vi)
        gd->first_minor = dev->ubi_num * UBI_MAX_VOLUMES + dev->vol_id;
        gd->private_data = dev;
        sprintf(gd->disk_name, "ubiblock%d_%d", dev->ubi_num, dev->vol_id);
-       disk_capacity = (vi->size * vi->usable_leb_size) >> 9;
        set_capacity(gd, disk_capacity);
        dev->gd = gd;
 
@@ -498,11 +499,16 @@ int ubiblock_remove(struct ubi_volume_info *vi)
        return 0;
 }
 
-static void ubiblock_resize(struct ubi_volume_info *vi)
+static int ubiblock_resize(struct ubi_volume_info *vi)
 {
        struct ubiblock *dev;
-       int disk_capacity;
+       u64 disk_capacity = ((u64)vi->size * vi->usable_leb_size) >> 9;
 
+       if ((sector_t)disk_capacity != disk_capacity) {
+               ubi_warn("%s: the volume is too big, cannot resize (%d LEBs)",
+                        dev->gd->disk_name, vi->size);
+               return -EFBIG;
+       }
        /*
         * Need to lock the device list until we stop using the device,
         * otherwise the device struct might get released in
@@ -512,15 +518,15 @@ static void ubiblock_resize(struct ubi_volume_info *vi)
        dev = find_dev_nolock(vi->ubi_num, vi->vol_id);
        if (!dev) {
                mutex_unlock(&devices_mutex);
-               return;
+               return -ENODEV;
        }
 
        mutex_lock(&dev->dev_mutex);
-       disk_capacity = (vi->size * vi->usable_leb_size) >> 9;
        set_capacity(dev->gd, disk_capacity);
        ubi_msg("%s resized to %d LEBs", dev->gd->disk_name, vi->size);
        mutex_unlock(&dev->dev_mutex);
        mutex_unlock(&devices_mutex);
+       return 0;
 }
 
 static int ubiblock_notify(struct notifier_block *nb,
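
Both hunks guard the same overflow: the capacity is computed as a 64-bit product of vi->size and vi->usable_leb_size and then round-tripped through sector_t, which is only 32 bits wide without CONFIG_LBDAF, so an oversized volume now fails with -EFBIG instead of silently truncating; the resize path gains a real return code for the same reason. One caveat for a follow-up: the new ubi_warn() dereferences dev->gd before dev has been looked up, so that message path reads an uninitialized pointer. The truncation test itself, as standalone C:

	#include <stdint.h>

	typedef uint32_t demo_sector_t;	/* sector_t without CONFIG_LBDAF */

	/* Returns 1 when the size in 512-byte sectors survives narrowing. */
	static int capacity_fits(uint64_t lebs, uint64_t leb_size)
	{
		uint64_t cap = (lebs * leb_size) >> 9;	/* bytes -> sectors */

		return (demo_sector_t)cap == cap;
	}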
index d77b1c1d7c7267c3186635497eeba492f539d735..07cac5f9ffb801f4d3acc8c2c5638d9617638097 100644
@@ -591,7 +591,7 @@ static int init_volumes(struct ubi_device *ubi,
 
                /* Static volumes only */
                av = ubi_find_av(ai, i);
-               if (!av) {
+               if (!av || !av->leb_count) {
                        /*
                         * No eraseblocks belonging to this volume found. We
                         * don't actually know whether this static volume is
index 0f3425dac91046300f93587d4f341e080c98e322..20f4917131450f844332183756a25bce6f28d9a9 100644
@@ -1718,12 +1718,12 @@ int ubi_wl_flush(struct ubi_device *ubi, int vol_id, int lnum)
               vol_id, lnum, ubi->works_count);
 
        while (found) {
-               struct ubi_work *wrk;
+               struct ubi_work *wrk, *tmp;
                found = 0;
 
                down_read(&ubi->work_sem);
                spin_lock(&ubi->wl_lock);
-               list_for_each_entry(wrk, &ubi->works, list) {
+               list_for_each_entry_safe(wrk, tmp, &ubi->works, list) {
                        if ((vol_id == UBI_ALL || wrk->vol_id == vol_id) &&
                            (lnum == UBI_ALL || wrk->lnum == lnum)) {
                                list_del(&wrk->list);
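
Plain list_for_each_entry() reads the next pointer from the current node after the loop body runs, so the body must not unlink that node; the _safe variant caches the follower first, which is what makes the list_del() above legal. As a fragment:

	struct ubi_work *wrk, *tmp;

	/* tmp already holds the next entry before the body touches wrk */
	list_for_each_entry_safe(wrk, tmp, &ubi->works, list) {
		if (should_cancel(wrk)) {	/* should_cancel(): hypothetical */
			list_del(&wrk->list);
			/* wrk may now be freed or requeued elsewhere */
		}
	}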
index 326a612a27305d773a65467e420f8a758026c68a..1a790a20210d9bc6103c2c7401ed89695e5a336a 100644
@@ -112,20 +112,20 @@ static void com20020_detach(struct pcmcia_device *p_dev);
 
 /*====================================================================*/
 
-typedef struct com20020_dev_t {
+struct com20020_dev {
     struct net_device       *dev;
-} com20020_dev_t;
+};
 
 static int com20020_probe(struct pcmcia_device *p_dev)
 {
-    com20020_dev_t *info;
+    struct com20020_dev *info;
     struct net_device *dev;
     struct arcnet_local *lp;
 
     dev_dbg(&p_dev->dev, "com20020_attach()\n");
 
     /* Create new network device */
-    info = kzalloc(sizeof(struct com20020_dev_t), GFP_KERNEL);
+    info = kzalloc(sizeof(*info), GFP_KERNEL);
     if (!info)
        goto fail_alloc_info;
 
@@ -160,7 +160,7 @@ fail_alloc_info:
 
 static void com20020_detach(struct pcmcia_device *link)
 {
-    struct com20020_dev_t *info = link->priv;
+    struct com20020_dev *info = link->priv;
     struct net_device *dev = info->dev;
 
     dev_dbg(&link->dev, "detach...\n");
@@ -199,7 +199,7 @@ static void com20020_detach(struct pcmcia_device *link)
 static int com20020_config(struct pcmcia_device *link)
 {
     struct arcnet_local *lp;
-    com20020_dev_t *info;
+    struct com20020_dev *info;
     struct net_device *dev;
     int i, ret;
     int ioaddr;
@@ -291,7 +291,7 @@ static void com20020_release(struct pcmcia_device *link)
 
 static int com20020_suspend(struct pcmcia_device *link)
 {
-       com20020_dev_t *info = link->priv;
+       struct com20020_dev *info = link->priv;
        struct net_device *dev = info->dev;
 
        if (link->open)
@@ -302,7 +302,7 @@ static int com20020_suspend(struct pcmcia_device *link)
 
 static int com20020_resume(struct pcmcia_device *link)
 {
-       com20020_dev_t *info = link->priv;
+       struct com20020_dev *info = link->priv;
        struct net_device *dev = info->dev;
 
        if (link->open) {
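
This and the matching axnet/pcnet hunks below are one mechanical cleanup: dropping typedef'd struct names (com20020_dev_t, axnet_dev_t, hw_info_t, pcnet_dev_t), per the CodingStyle rule that reserves typedefs for genuinely opaque types. Allocations move to the sizeof(*ptr) form at the same time, which stays correct if the pointee type is ever renamed. Before and after, sketched:

	/* discouraged: the typedef hides a plain struct */
	typedef struct foo_dev_t {
		struct net_device *dev;
	} foo_dev_t;
	foo_dev_t *info = kzalloc(sizeof(struct foo_dev_t), GFP_KERNEL);

	/* preferred */
	struct foo_dev {
		struct net_device *dev;
	};
	struct foo_dev *info = kzalloc(sizeof(*info), GFP_KERNEL);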
index 0988811f4e40e2e4effd4fa92a665b67d34291b5..2d89bd00de6190071e3d26fe81e68545511932f2 100644
@@ -91,7 +91,8 @@ config MCF8390
 
 config NE2000
        tristate "NE2000/NE1000 support"
-       depends on (ISA || (Q40 && m) || M32R || MACH_TX49XX)
+       depends on (ISA || (Q40 && m) || M32R || MACH_TX49XX || \
+                   ATARI_ETHERNEC)
        select CRC32
        ---help---
          If you have a network (Ethernet) card of this type, say Y and read
index 73c57a4a7b9e517fd407943eb134d40545468338..7769c05543f17fcc8432cac6145ebdd70c4fc5da 100644
@@ -108,7 +108,7 @@ static u32 axnet_msg_enable;
 
 /*====================================================================*/
 
-typedef struct axnet_dev_t {
+struct axnet_dev {
        struct pcmcia_device    *p_dev;
        caddr_t base;
        struct timer_list       watchdog;
@@ -118,9 +118,9 @@ typedef struct axnet_dev_t {
        int     phy_id;
        int     flags;
        int     active_low;
-} axnet_dev_t;
+};
 
-static inline axnet_dev_t *PRIV(struct net_device *dev)
+static inline struct axnet_dev *PRIV(struct net_device *dev)
 {
        void *p = (char *)netdev_priv(dev) + sizeof(struct ei_device);
        return p;
@@ -141,13 +141,13 @@ static const struct net_device_ops axnet_netdev_ops = {
 
 static int axnet_probe(struct pcmcia_device *link)
 {
-    axnet_dev_t *info;
+    struct axnet_dev *info;
     struct net_device *dev;
     struct ei_device *ei_local;
 
     dev_dbg(&link->dev, "axnet_attach()\n");
 
-    dev = alloc_etherdev(sizeof(struct ei_device) + sizeof(axnet_dev_t));
+    dev = alloc_etherdev(sizeof(struct ei_device) + sizeof(struct axnet_dev));
     if (!dev)
        return -ENOMEM;
 
@@ -274,7 +274,7 @@ static int axnet_configcheck(struct pcmcia_device *p_dev, void *priv_data)
 static int axnet_config(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
-    axnet_dev_t *info = PRIV(dev);
+    struct axnet_dev *info = PRIV(dev);
     int i, j, j2, ret;
 
     dev_dbg(&link->dev, "axnet_config(0x%p)\n", link);
@@ -389,7 +389,7 @@ static int axnet_suspend(struct pcmcia_device *link)
 static int axnet_resume(struct pcmcia_device *link)
 {
        struct net_device *dev = link->priv;
-       axnet_dev_t *info = PRIV(dev);
+       struct axnet_dev *info = PRIV(dev);
 
        if (link->open) {
                if (info->active_low == 1)
@@ -467,7 +467,7 @@ static void mdio_write(unsigned int addr, int phy_id, int loc, int value)
 static int axnet_open(struct net_device *dev)
 {
     int ret;
-    axnet_dev_t *info = PRIV(dev);
+    struct axnet_dev *info = PRIV(dev);
     struct pcmcia_device *link = info->p_dev;
     unsigned int nic_base = dev->base_addr;
     
@@ -497,7 +497,7 @@ static int axnet_open(struct net_device *dev)
 
 static int axnet_close(struct net_device *dev)
 {
-    axnet_dev_t *info = PRIV(dev);
+    struct axnet_dev *info = PRIV(dev);
     struct pcmcia_device *link = info->p_dev;
 
     dev_dbg(&link->dev, "axnet_close('%s')\n", dev->name);
@@ -554,7 +554,7 @@ static irqreturn_t ei_irq_wrapper(int irq, void *dev_id)
 static void ei_watchdog(u_long arg)
 {
     struct net_device *dev = (struct net_device *)(arg);
-    axnet_dev_t *info = PRIV(dev);
+    struct axnet_dev *info = PRIV(dev);
     unsigned int nic_base = dev->base_addr;
     unsigned int mii_addr = nic_base + AXNET_MII_EEP;
     u_short link;
@@ -610,7 +610,7 @@ reschedule:
 
 static int axnet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
-    axnet_dev_t *info = PRIV(dev);
+    struct axnet_dev *info = PRIV(dev);
     struct mii_ioctl_data *data = if_mii(rq);
     unsigned int mii_addr = dev->base_addr + AXNET_MII_EEP;
     switch (cmd) {
@@ -1452,7 +1452,7 @@ static void ei_receive(struct net_device *dev)
 
 static void ei_rx_overrun(struct net_device *dev)
 {
-       axnet_dev_t *info = PRIV(dev);
+       struct axnet_dev *info = PRIV(dev);
        long e8390_base = dev->base_addr;
        unsigned char was_txing, must_resend = 0;
        struct ei_device *ei_local = netdev_priv(dev);
@@ -1624,7 +1624,7 @@ static void set_multicast_list(struct net_device *dev)
 
 static void AX88190_init(struct net_device *dev, int startp)
 {
-       axnet_dev_t *info = PRIV(dev);
+       struct axnet_dev *info = PRIV(dev);
        long e8390_base = dev->base_addr;
        struct ei_device *ei_local = netdev_priv(dev);
        int i;
index 58eaa8f34942cac2dfe416e0d6086b7103afb83c..de566fb6e0f7ba50ec78b83426eaf0718a147de7 100644
@@ -169,6 +169,8 @@ bad_clone_list[] __initdata = {
 #elif defined(CONFIG_PLAT_OAKS32R)  || \
    defined(CONFIG_MACH_TX49XX)
 #  define DCR_VAL 0x48         /* 8-bit mode */
+#elif defined(CONFIG_ATARI)    /* 8-bit mode on Atari, normal on Q40 */
+#  define DCR_VAL (MACH_IS_ATARI ? 0x48 : 0x49)
 #else
 #  define DCR_VAL 0x49
 #endif
index ca3c2b921cf612be0913663445ae8864373b0c9b..9fb7b9d4fd6c6595f7642d859678bc3097998750 100644
@@ -111,11 +111,11 @@ static void pcnet_detach(struct pcmcia_device *p_dev);
 
 /*====================================================================*/
 
-typedef struct hw_info_t {
+struct hw_info {
     u_int      offset;
     u_char     a0, a1, a2;
     u_int      flags;
-} hw_info_t;
+};
 
 #define DELAY_OUTPUT   0x01
 #define HAS_MISC_REG   0x02
@@ -132,7 +132,7 @@ typedef struct hw_info_t {
 #define MII_PHYID_REG1         0x02
 #define MII_PHYID_REG2         0x03
 
-static hw_info_t hw_info[] = {
+static struct hw_info hw_info[] = {
     { /* Accton EN2212 */ 0x0ff0, 0x00, 0x00, 0xe8, DELAY_OUTPUT },
     { /* Allied Telesis LA-PCM */ 0x0ff0, 0x00, 0x00, 0xf4, 0 },
     { /* APEX MultiCard */ 0x03f4, 0x00, 0x20, 0xe5, 0 },
@@ -196,11 +196,11 @@ static hw_info_t hw_info[] = {
 
 #define NR_INFO                ARRAY_SIZE(hw_info)
 
-static hw_info_t default_info = { 0, 0, 0, 0, 0 };
-static hw_info_t dl10019_info = { 0, 0, 0, 0, IS_DL10019|HAS_MII };
-static hw_info_t dl10022_info = { 0, 0, 0, 0, IS_DL10022|HAS_MII };
+static struct hw_info default_info = { 0, 0, 0, 0, 0 };
+static struct hw_info dl10019_info = { 0, 0, 0, 0, IS_DL10019|HAS_MII };
+static struct hw_info dl10022_info = { 0, 0, 0, 0, IS_DL10022|HAS_MII };
 
-typedef struct pcnet_dev_t {
+struct pcnet_dev {
        struct pcmcia_device    *p_dev;
     u_int              flags;
     void               __iomem *base;
@@ -210,12 +210,12 @@ typedef struct pcnet_dev_t {
     u_char             eth_phy, pna_phy;
     u_short            link_status;
     u_long             mii_reset;
-} pcnet_dev_t;
+};
 
-static inline pcnet_dev_t *PRIV(struct net_device *dev)
+static inline struct pcnet_dev *PRIV(struct net_device *dev)
 {
        char *p = netdev_priv(dev);
-       return (pcnet_dev_t *)(p + sizeof(struct ei_device));
+       return (struct pcnet_dev *)(p + sizeof(struct ei_device));
 }
 
 static const struct net_device_ops pcnet_netdev_ops = {
@@ -237,13 +237,13 @@ static const struct net_device_ops pcnet_netdev_ops = {
 
 static int pcnet_probe(struct pcmcia_device *link)
 {
-    pcnet_dev_t *info;
+    struct pcnet_dev *info;
     struct net_device *dev;
 
     dev_dbg(&link->dev, "pcnet_attach()\n");
 
     /* Create new ethernet device */
-    dev = __alloc_ei_netdev(sizeof(pcnet_dev_t));
+    dev = __alloc_ei_netdev(sizeof(struct pcnet_dev));
     if (!dev) return -ENOMEM;
     info = PRIV(dev);
     info->p_dev = link;
@@ -276,7 +276,7 @@ static void pcnet_detach(struct pcmcia_device *link)
 
 ======================================================================*/
 
-static hw_info_t *get_hwinfo(struct pcmcia_device *link)
+static struct hw_info *get_hwinfo(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
     u_char __iomem *base, *virt;
@@ -317,7 +317,7 @@ static hw_info_t *get_hwinfo(struct pcmcia_device *link)
 
 ======================================================================*/
 
-static hw_info_t *get_prom(struct pcmcia_device *link)
+static struct hw_info *get_prom(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
     unsigned int ioaddr = dev->base_addr;
@@ -371,7 +371,7 @@ static hw_info_t *get_prom(struct pcmcia_device *link)
 
 ======================================================================*/
 
-static hw_info_t *get_dl10019(struct pcmcia_device *link)
+static struct hw_info *get_dl10019(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
     int i;
@@ -393,7 +393,7 @@ static hw_info_t *get_dl10019(struct pcmcia_device *link)
 
 ======================================================================*/
 
-static hw_info_t *get_ax88190(struct pcmcia_device *link)
+static struct hw_info *get_ax88190(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
     unsigned int ioaddr = dev->base_addr;
@@ -424,7 +424,7 @@ static hw_info_t *get_ax88190(struct pcmcia_device *link)
 
 ======================================================================*/
 
-static hw_info_t *get_hwired(struct pcmcia_device *link)
+static struct hw_info *get_hwired(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
     int i;
@@ -489,12 +489,12 @@ static int pcnet_confcheck(struct pcmcia_device *p_dev, void *priv_data)
        return try_io_port(p_dev);
 }
 
-static hw_info_t *pcnet_try_config(struct pcmcia_device *link,
-                                  int *has_shmem, int try)
+static struct hw_info *pcnet_try_config(struct pcmcia_device *link,
+                                       int *has_shmem, int try)
 {
        struct net_device *dev = link->priv;
-       hw_info_t *local_hw_info;
-       pcnet_dev_t *info = PRIV(dev);
+       struct hw_info *local_hw_info;
+       struct pcnet_dev *info = PRIV(dev);
        int priv = try;
        int ret;
 
@@ -553,10 +553,10 @@ static hw_info_t *pcnet_try_config(struct pcmcia_device *link,
 static int pcnet_config(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     int start_pg, stop_pg, cm_offset;
     int has_shmem = 0;
-    hw_info_t *local_hw_info;
+    struct hw_info *local_hw_info;
     struct ei_device *ei_local;
 
     dev_dbg(&link->dev, "pcnet_config\n");
@@ -639,7 +639,7 @@ failed:
 
 static void pcnet_release(struct pcmcia_device *link)
 {
-       pcnet_dev_t *info = PRIV(link->priv);
+       struct pcnet_dev *info = PRIV(link->priv);
 
        dev_dbg(&link->dev, "pcnet_release\n");
 
@@ -836,7 +836,7 @@ static void write_asic(unsigned int ioaddr, int location, short asic_data)
 static void set_misc_reg(struct net_device *dev)
 {
     unsigned int nic_base = dev->base_addr;
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     u_char tmp;
 
     if (info->flags & HAS_MISC_REG) {
@@ -873,7 +873,7 @@ static void set_misc_reg(struct net_device *dev)
 
 static void mii_phy_probe(struct net_device *dev)
 {
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     unsigned int mii_addr = dev->base_addr + DLINK_GPIO;
     int i;
     u_int tmp, phyid;
@@ -898,7 +898,7 @@ static void mii_phy_probe(struct net_device *dev)
 static int pcnet_open(struct net_device *dev)
 {
     int ret;
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     struct pcmcia_device *link = info->p_dev;
     unsigned int nic_base = dev->base_addr;
 
@@ -931,7 +931,7 @@ static int pcnet_open(struct net_device *dev)
 
 static int pcnet_close(struct net_device *dev)
 {
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     struct pcmcia_device *link = info->p_dev;
 
     dev_dbg(&link->dev, "pcnet_close('%s')\n", dev->name);
@@ -982,7 +982,7 @@ static void pcnet_reset_8390(struct net_device *dev)
 
 static int set_config(struct net_device *dev, struct ifmap *map)
 {
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     if ((map->port != (u_char)(-1)) && (map->port != dev->if_port)) {
        if (!(info->flags & HAS_MISC_REG))
            return -EOPNOTSUPP;
@@ -1000,7 +1000,7 @@ static int set_config(struct net_device *dev, struct ifmap *map)
 static irqreturn_t ei_irq_wrapper(int irq, void *dev_id)
 {
     struct net_device *dev = dev_id;
-    pcnet_dev_t *info;
+    struct pcnet_dev *info;
     irqreturn_t ret = ei_interrupt(irq, dev_id);
 
     if (ret == IRQ_HANDLED) {
@@ -1013,7 +1013,7 @@ static irqreturn_t ei_irq_wrapper(int irq, void *dev_id)
 static void ei_watchdog(u_long arg)
 {
     struct net_device *dev = (struct net_device *)arg;
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     unsigned int nic_base = dev->base_addr;
     unsigned int mii_addr = nic_base + DLINK_GPIO;
     u_short link;
@@ -1101,7 +1101,7 @@ reschedule:
 
 static int ei_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     struct mii_ioctl_data *data = if_mii(rq);
     unsigned int mii_addr = dev->base_addr + DLINK_GPIO;
 
@@ -1214,7 +1214,7 @@ static void dma_block_output(struct net_device *dev, int count,
                             const u_char *buf, const int start_page)
 {
     unsigned int nic_base = dev->base_addr;
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
 #ifdef PCMCIA_DEBUG
     int retries = 0;
     struct ei_device *ei_local = netdev_priv(dev);
@@ -1403,7 +1403,7 @@ static int setup_shmem_window(struct pcmcia_device *link, int start_pg,
                              int stop_pg, int cm_offset)
 {
     struct net_device *dev = link->priv;
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     int i, window_size, offset, ret;
 
     window_size = (stop_pg - start_pg) << 8;
index edb718661850f350711e73ada6368d2e080fe486..dc7406c81c452cacb2d83cb42d5ec5181ade9326 100644
@@ -24,6 +24,7 @@ source "drivers/net/ethernet/allwinner/Kconfig"
 source "drivers/net/ethernet/alteon/Kconfig"
 source "drivers/net/ethernet/altera/Kconfig"
 source "drivers/net/ethernet/amd/Kconfig"
+source "drivers/net/ethernet/apm/Kconfig"
 source "drivers/net/ethernet/apple/Kconfig"
 source "drivers/net/ethernet/arc/Kconfig"
 source "drivers/net/ethernet/atheros/Kconfig"
index 58de3339ab3c7a443d591a37425bb032c5b93ab5..224a018771499f7d6e03dee2fd913978772725f4 100644
@@ -10,6 +10,7 @@ obj-$(CONFIG_NET_VENDOR_ALLWINNER) += allwinner/
 obj-$(CONFIG_NET_VENDOR_ALTEON) += alteon/
 obj-$(CONFIG_ALTERA_TSE) += altera/
 obj-$(CONFIG_NET_VENDOR_AMD) += amd/
+obj-$(CONFIG_NET_XGENE) += apm/
 obj-$(CONFIG_NET_VENDOR_APPLE) += apple/
 obj-$(CONFIG_NET_VENDOR_ARC) += arc/
 obj-$(CONFIG_NET_VENDOR_ATHEROS) += atheros/
index 1f5487f4888c8c1e5a4d11fa9646fbd86f8d3f1c..dc84f7193c2db62aaf5d922f9beb409b13a975c3 100644
 #include <linux/spinlock.h>
 #include <linux/tcp.h>
 #include <linux/if_vlan.h>
-#include <linux/phy.h>
 #include <net/busy_poll.h>
 #include <linux/clk.h>
 #include <linux/if_ether.h>
diff --git a/drivers/net/ethernet/apm/Kconfig b/drivers/net/ethernet/apm/Kconfig
new file mode 100644
index 0000000..ec63d70
--- /dev/null
+++ b/drivers/net/ethernet/apm/Kconfig
@@ -0,0 +1 @@
+source "drivers/net/ethernet/apm/xgene/Kconfig"
diff --git a/drivers/net/ethernet/apm/Makefile b/drivers/net/ethernet/apm/Makefile
new file mode 100644
index 0000000..65ce32a
--- /dev/null
+++ b/drivers/net/ethernet/apm/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for APM X-GENE Ethernet driver.
+#
+
+obj-$(CONFIG_NET_XGENE) += xgene/
diff --git a/drivers/net/ethernet/apm/xgene/Kconfig b/drivers/net/ethernet/apm/xgene/Kconfig
new file mode 100644
index 0000000..616dff6
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene/Kconfig
@@ -0,0 +1,9 @@
+config NET_XGENE
+       tristate "APM X-Gene SoC Ethernet Driver"
+       select PHYLIB
+       help
+         This is the Ethernet driver for the on-chip ethernet interface on the
+         APM X-Gene SoC.
+
+         To compile this driver as a module, choose M here. This module will
+         be called xgene_enet.
diff --git a/drivers/net/ethernet/apm/xgene/Makefile b/drivers/net/ethernet/apm/xgene/Makefile
new file mode 100644
index 0000000..c643e8a
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for APM X-Gene Ethernet Driver.
+#
+
+xgene-enet-objs := xgene_enet_hw.o xgene_enet_main.o xgene_enet_ethtool.o
+obj-$(CONFIG_NET_XGENE) += xgene-enet.o
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c b/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
new file mode 100644
index 0000000..63f2aa5
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
@@ -0,0 +1,125 @@
+/* Applied Micro X-Gene SoC Ethernet Driver
+ *
+ * Copyright (c) 2014, Applied Micro Circuits Corporation
+ * Authors: Iyappan Subramanian <isubramanian@apm.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/ethtool.h>
+#include "xgene_enet_main.h"
+
+struct xgene_gstrings_stats {
+       char name[ETH_GSTRING_LEN];
+       int offset;
+};
+
+#define XGENE_STAT(m) { #m, offsetof(struct xgene_enet_pdata, stats.m) }
+
+static const struct xgene_gstrings_stats gstrings_stats[] = {
+       XGENE_STAT(rx_packets),
+       XGENE_STAT(tx_packets),
+       XGENE_STAT(rx_bytes),
+       XGENE_STAT(tx_bytes),
+       XGENE_STAT(rx_errors),
+       XGENE_STAT(tx_errors),
+       XGENE_STAT(rx_length_errors),
+       XGENE_STAT(rx_crc_errors),
+       XGENE_STAT(rx_frame_errors),
+       XGENE_STAT(rx_fifo_errors)
+};
+
+#define XGENE_STATS_LEN                ARRAY_SIZE(gstrings_stats)
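
The stats table above records, for each counter name, its byte offset inside struct xgene_enet_pdata, so a single generic loop can service all of ethtool's stats callbacks. A minimal sketch of the lookup the handlers below perform (same names as in this file, shown only for illustration):

        /* fetch counter i from the private area returned by netdev_priv() */
        static u64 xgene_read_stat(void *pdata, int i)
        {
                return *(u64 *)((char *)pdata + gstrings_stats[i].offset);
        }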
+
+static void xgene_get_drvinfo(struct net_device *ndev,
+                             struct ethtool_drvinfo *info)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+       struct platform_device *pdev = pdata->pdev;
+
+       strcpy(info->driver, "xgene_enet");
+       strcpy(info->version, XGENE_DRV_VERSION);
+       snprintf(info->fw_version, ETHTOOL_FWVERS_LEN, "N/A");
+       snprintf(info->bus_info, sizeof(info->bus_info), "%s", pdev->name);
+}
+
+static int xgene_get_settings(struct net_device *ndev, struct ethtool_cmd *cmd)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+       struct phy_device *phydev = pdata->phy_dev;
+
+       if (phydev == NULL)
+               return -ENODEV;
+
+       return phy_ethtool_gset(phydev, cmd);
+}
+
+static int xgene_set_settings(struct net_device *ndev, struct ethtool_cmd *cmd)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+       struct phy_device *phydev = pdata->phy_dev;
+
+       if (phydev == NULL)
+               return -ENODEV;
+
+       return phy_ethtool_sset(phydev, cmd);
+}
+
+static void xgene_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
+{
+       int i;
+       u8 *p = data;
+
+       if (stringset != ETH_SS_STATS)
+               return;
+
+       for (i = 0; i < XGENE_STATS_LEN; i++) {
+               memcpy(p, gstrings_stats[i].name, ETH_GSTRING_LEN);
+               p += ETH_GSTRING_LEN;
+       }
+}
+
+static int xgene_get_sset_count(struct net_device *ndev, int sset)
+{
+       if (sset != ETH_SS_STATS)
+               return -EINVAL;
+
+       return XGENE_STATS_LEN;
+}
+
+static void xgene_get_ethtool_stats(struct net_device *ndev,
+                                   struct ethtool_stats *dummy,
+                                   u64 *data)
+{
+       void *pdata = netdev_priv(ndev);
+       int i;
+
+       for (i = 0; i < XGENE_STATS_LEN; i++)
+               *data++ = *(u64 *)(pdata + gstrings_stats[i].offset);
+}
+
+static const struct ethtool_ops xgene_ethtool_ops = {
+       .get_drvinfo = xgene_get_drvinfo,
+       .get_settings = xgene_get_settings,
+       .set_settings = xgene_set_settings,
+       .get_link = ethtool_op_get_link,
+       .get_strings = xgene_get_strings,
+       .get_sset_count = xgene_get_sset_count,
+       .get_ethtool_stats = xgene_get_ethtool_stats
+};
+
+void xgene_enet_set_ethtool_ops(struct net_device *ndev)
+{
+       ndev->ethtool_ops = &xgene_ethtool_ops;
+}
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
new file mode 100644 (file)
index 0000000..812d8d6
--- /dev/null
@@ -0,0 +1,728 @@
+/* Applied Micro X-Gene SoC Ethernet Driver
+ *
+ * Copyright (c) 2014, Applied Micro Circuits Corporation
+ * Authors: Iyappan Subramanian <isubramanian@apm.com>
+ *         Ravi Patel <rapatel@apm.com>
+ *         Keyur Chudgar <kchudgar@apm.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "xgene_enet_main.h"
+#include "xgene_enet_hw.h"
+
+static void xgene_enet_ring_init(struct xgene_enet_desc_ring *ring)
+{
+       u32 *ring_cfg = ring->state;
+       u64 addr = ring->dma;
+       enum xgene_enet_ring_cfgsize cfgsize = ring->cfgsize;
+
+       ring_cfg[4] |= (1 << SELTHRSH_POS) &
+                       CREATE_MASK(SELTHRSH_POS, SELTHRSH_LEN);
+       ring_cfg[3] |= ACCEPTLERR;
+       ring_cfg[2] |= QCOHERENT;
+
+       addr >>= 8;
+       ring_cfg[2] |= (addr << RINGADDRL_POS) &
+                       CREATE_MASK_ULL(RINGADDRL_POS, RINGADDRL_LEN);
+       addr >>= RINGADDRL_LEN;
+       ring_cfg[3] |= addr & CREATE_MASK_ULL(RINGADDRH_POS, RINGADDRH_LEN);
+       ring_cfg[3] |= ((u32)cfgsize << RINGSIZE_POS) &
+                       CREATE_MASK(RINGSIZE_POS, RINGSIZE_LEN);
+}
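
Note that the DMA address is shifted right by 8 before being packed, implying the ring base must be 256-byte aligned; the low 27 address bits land in state word 2 and the high 6 bits in word 3. A quick sanity sketch of the mask macros (defined in xgene_enet_hw.h later in this patch; these checks would have to live inside a function body):

        BUILD_BUG_ON(CREATE_MASK(SELTHRSH_POS, SELTHRSH_LEN) != 0x38);
        BUILD_BUG_ON(CREATE_MASK_ULL(RINGADDRL_POS, RINGADDRL_LEN) !=
                     0xffffffe0ULL);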
+
+static void xgene_enet_ring_set_type(struct xgene_enet_desc_ring *ring)
+{
+       u32 *ring_cfg = ring->state;
+       bool is_bufpool;
+       u32 val;
+
+       is_bufpool = xgene_enet_is_bufpool(ring->id);
+       val = (is_bufpool) ? RING_BUFPOOL : RING_REGULAR;
+       ring_cfg[4] |= (val << RINGTYPE_POS) &
+                       CREATE_MASK(RINGTYPE_POS, RINGTYPE_LEN);
+
+       if (is_bufpool) {
+               ring_cfg[3] |= (BUFPOOL_MODE << RINGMODE_POS) &
+                               CREATE_MASK(RINGMODE_POS, RINGMODE_LEN);
+       }
+}
+
+static void xgene_enet_ring_set_recombbuf(struct xgene_enet_desc_ring *ring)
+{
+       u32 *ring_cfg = ring->state;
+
+       ring_cfg[3] |= RECOMBBUF;
+       ring_cfg[3] |= (0xf << RECOMTIMEOUTL_POS) &
+                       CREATE_MASK(RECOMTIMEOUTL_POS, RECOMTIMEOUTL_LEN);
+       ring_cfg[4] |= 0x7 & CREATE_MASK(RECOMTIMEOUTH_POS, RECOMTIMEOUTH_LEN);
+}
+
+static void xgene_enet_ring_wr32(struct xgene_enet_desc_ring *ring,
+                                u32 offset, u32 data)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ring->ndev);
+
+       iowrite32(data, pdata->ring_csr_addr + offset);
+}
+
+static void xgene_enet_ring_rd32(struct xgene_enet_desc_ring *ring,
+                                u32 offset, u32 *data)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ring->ndev);
+
+       *data = ioread32(pdata->ring_csr_addr + offset);
+}
+
+static void xgene_enet_write_ring_state(struct xgene_enet_desc_ring *ring)
+{
+       int i;
+
+       xgene_enet_ring_wr32(ring, CSR_RING_CONFIG, ring->num);
+       for (i = 0; i < NUM_RING_CONFIG; i++) {
+               xgene_enet_ring_wr32(ring, CSR_RING_WR_BASE + (i * 4),
+                                    ring->state[i]);
+       }
+}
+
+static void xgene_enet_clr_ring_state(struct xgene_enet_desc_ring *ring)
+{
+       memset(ring->state, 0, sizeof(u32) * NUM_RING_CONFIG);
+       xgene_enet_write_ring_state(ring);
+}
+
+static void xgene_enet_set_ring_state(struct xgene_enet_desc_ring *ring)
+{
+       xgene_enet_ring_set_type(ring);
+
+       if (xgene_enet_ring_owner(ring->id) == RING_OWNER_ETH0)
+               xgene_enet_ring_set_recombbuf(ring);
+
+       xgene_enet_ring_init(ring);
+       xgene_enet_write_ring_state(ring);
+}
+
+static void xgene_enet_set_ring_id(struct xgene_enet_desc_ring *ring)
+{
+       u32 ring_id_val, ring_id_buf;
+       bool is_bufpool;
+
+       is_bufpool = xgene_enet_is_bufpool(ring->id);
+
+       ring_id_val = ring->id & GENMASK(9, 0);
+       ring_id_val |= OVERWRITE;
+
+       ring_id_buf = (ring->num << 9) & GENMASK(18, 9);
+       ring_id_buf |= PREFETCH_BUF_EN;
+       if (is_bufpool)
+               ring_id_buf |= IS_BUFFER_POOL;
+
+       xgene_enet_ring_wr32(ring, CSR_RING_ID, ring_id_val);
+       xgene_enet_ring_wr32(ring, CSR_RING_ID_BUF, ring_id_buf);
+}
+
+static void xgene_enet_clr_desc_ring_id(struct xgene_enet_desc_ring *ring)
+{
+       u32 ring_id;
+
+       ring_id = ring->id | OVERWRITE;
+       xgene_enet_ring_wr32(ring, CSR_RING_ID, ring_id);
+       xgene_enet_ring_wr32(ring, CSR_RING_ID_BUF, 0);
+}
+
+struct xgene_enet_desc_ring *xgene_enet_setup_ring(
+                                       struct xgene_enet_desc_ring *ring)
+{
+       u32 size = ring->size;
+       u32 i, data;
+       bool is_bufpool;
+
+       xgene_enet_clr_ring_state(ring);
+       xgene_enet_set_ring_state(ring);
+       xgene_enet_set_ring_id(ring);
+
+       ring->slots = xgene_enet_get_numslots(ring->id, size);
+
+       is_bufpool = xgene_enet_is_bufpool(ring->id);
+       if (is_bufpool || xgene_enet_ring_owner(ring->id) != RING_OWNER_CPU)
+               return ring;
+
+       for (i = 0; i < ring->slots; i++)
+               xgene_enet_mark_desc_slot_empty(&ring->raw_desc[i]);
+
+       xgene_enet_ring_rd32(ring, CSR_RING_NE_INT_MODE, &data);
+       data |= BIT(31 - xgene_enet_ring_bufnum(ring->id));
+       xgene_enet_ring_wr32(ring, CSR_RING_NE_INT_MODE, data);
+
+       return ring;
+}
+
+void xgene_enet_clear_ring(struct xgene_enet_desc_ring *ring)
+{
+       u32 data;
+       bool is_bufpool;
+
+       is_bufpool = xgene_enet_is_bufpool(ring->id);
+       if (is_bufpool || xgene_enet_ring_owner(ring->id) != RING_OWNER_CPU)
+               goto out;
+
+       xgene_enet_ring_rd32(ring, CSR_RING_NE_INT_MODE, &data);
+       data &= ~BIT(31 - xgene_enet_ring_bufnum(ring->id));
+       xgene_enet_ring_wr32(ring, CSR_RING_NE_INT_MODE, data);
+
+out:
+       xgene_enet_clr_desc_ring_id(ring);
+       xgene_enet_clr_ring_state(ring);
+}
+
+void xgene_enet_parse_error(struct xgene_enet_desc_ring *ring,
+                           struct xgene_enet_pdata *pdata,
+                           enum xgene_enet_err_code status)
+{
+       struct rtnl_link_stats64 *stats = &pdata->stats;
+
+       switch (status) {
+       case INGRESS_CRC:
+               stats->rx_crc_errors++;
+               break;
+       case INGRESS_CHECKSUM:
+       case INGRESS_CHECKSUM_COMPUTE:
+               stats->rx_errors++;
+               break;
+       case INGRESS_TRUNC_FRAME:
+               stats->rx_frame_errors++;
+               break;
+       case INGRESS_PKT_LEN:
+               stats->rx_length_errors++;
+               break;
+       case INGRESS_PKT_UNDER:
+               stats->rx_frame_errors++;
+               break;
+       case INGRESS_FIFO_OVERRUN:
+               stats->rx_fifo_errors++;
+               break;
+       default:
+               break;
+       }
+}
+
+static void xgene_enet_wr_csr(struct xgene_enet_pdata *pdata,
+                             u32 offset, u32 val)
+{
+       void __iomem *addr = pdata->eth_csr_addr + offset;
+
+       iowrite32(val, addr);
+}
+
+static void xgene_enet_wr_ring_if(struct xgene_enet_pdata *pdata,
+                                 u32 offset, u32 val)
+{
+       void __iomem *addr = pdata->eth_ring_if_addr + offset;
+
+       iowrite32(val, addr);
+}
+
+static void xgene_enet_wr_diag_csr(struct xgene_enet_pdata *pdata,
+                                  u32 offset, u32 val)
+{
+       void __iomem *addr = pdata->eth_diag_csr_addr + offset;
+
+       iowrite32(val, addr);
+}
+
+static void xgene_enet_wr_mcx_csr(struct xgene_enet_pdata *pdata,
+                                 u32 offset, u32 val)
+{
+       void __iomem *addr = pdata->mcx_mac_csr_addr + offset;
+
+       iowrite32(val, addr);
+}
+
+static bool xgene_enet_wr_indirect(void __iomem *addr, void __iomem *wr,
+                                  void __iomem *cmd, void __iomem *cmd_done,
+                                  u32 wr_addr, u32 wr_data)
+{
+       u32 done;
+       u8 wait = 10;
+
+       iowrite32(wr_addr, addr);
+       iowrite32(wr_data, wr);
+       iowrite32(XGENE_ENET_WR_CMD, cmd);
+
+       /* wait for write command to complete */
+       while (!(done = ioread32(cmd_done)) && wait--)
+               udelay(1);
+
+       if (!done)
+               return false;
+
+       iowrite32(0, cmd);
+
+       return true;
+}
+
+static void xgene_enet_wr_mcx_mac(struct xgene_enet_pdata *pdata,
+                                 u32 wr_addr, u32 wr_data)
+{
+       void __iomem *addr, *wr, *cmd, *cmd_done;
+
+       addr = pdata->mcx_mac_addr + MAC_ADDR_REG_OFFSET;
+       wr = pdata->mcx_mac_addr + MAC_WRITE_REG_OFFSET;
+       cmd = pdata->mcx_mac_addr + MAC_COMMAND_REG_OFFSET;
+       cmd_done = pdata->mcx_mac_addr + MAC_COMMAND_DONE_REG_OFFSET;
+
+       if (!xgene_enet_wr_indirect(addr, wr, cmd, cmd_done, wr_addr, wr_data))
+               netdev_err(pdata->ndev, "MCX mac write failed, addr: %04x\n",
+                          wr_addr);
+}
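
The MCX MAC block is reached through an indirect window: latch the target register address, write the data, issue XGENE_ENET_WR_CMD, then poll the done flag for up to roughly 10 us before clearing the command. The read path below mirrors the same handshake. Usage is a single call, e.g. the soft reset issued by xgene_gmac_reset() further down:

        xgene_enet_wr_mcx_mac(pdata, MAC_CONFIG_1_ADDR, SOFT_RESET1);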
+
+static void xgene_enet_rd_csr(struct xgene_enet_pdata *pdata,
+                             u32 offset, u32 *val)
+{
+       void __iomem *addr = pdata->eth_csr_addr + offset;
+
+       *val = ioread32(addr);
+}
+
+static void xgene_enet_rd_diag_csr(struct xgene_enet_pdata *pdata,
+                                  u32 offset, u32 *val)
+{
+       void __iomem *addr = pdata->eth_diag_csr_addr + offset;
+
+       *val = ioread32(addr);
+}
+
+static void xgene_enet_rd_mcx_csr(struct xgene_enet_pdata *pdata,
+                                 u32 offset, u32 *val)
+{
+       void __iomem *addr = pdata->mcx_mac_csr_addr + offset;
+
+       *val = ioread32(addr);
+}
+
+static bool xgene_enet_rd_indirect(void __iomem *addr, void __iomem *rd,
+                                  void __iomem *cmd, void __iomem *cmd_done,
+                                  u32 rd_addr, u32 *rd_data)
+{
+       u32 done;
+       u8 wait = 10;
+
+       iowrite32(rd_addr, addr);
+       iowrite32(XGENE_ENET_RD_CMD, cmd);
+
+       /* wait for read command to complete */
+       while (!(done = ioread32(cmd_done)) && wait--)
+               udelay(1);
+
+       if (!done)
+               return false;
+
+       *rd_data = ioread32(rd);
+       iowrite32(0, cmd);
+
+       return true;
+}
+
+static void xgene_enet_rd_mcx_mac(struct xgene_enet_pdata *pdata,
+                                 u32 rd_addr, u32 *rd_data)
+{
+       void __iomem *addr, *rd, *cmd, *cmd_done;
+
+       addr = pdata->mcx_mac_addr + MAC_ADDR_REG_OFFSET;
+       rd = pdata->mcx_mac_addr + MAC_READ_REG_OFFSET;
+       cmd = pdata->mcx_mac_addr + MAC_COMMAND_REG_OFFSET;
+       cmd_done = pdata->mcx_mac_addr + MAC_COMMAND_DONE_REG_OFFSET;
+
+       if (!xgene_enet_rd_indirect(addr, rd, cmd, cmd_done, rd_addr, rd_data))
+               netdev_err(pdata->ndev, "MCX mac read failed, addr: %04x\n",
+                          rd_addr);
+}
+
+static int xgene_mii_phy_write(struct xgene_enet_pdata *pdata, int phy_id,
+                              u32 reg, u16 data)
+{
+       u32 addr = 0, wr_data = 0;
+       u32 done;
+       u8 wait = 10;
+
+       PHY_ADDR_SET(&addr, phy_id);
+       REG_ADDR_SET(&addr, reg);
+       xgene_enet_wr_mcx_mac(pdata, MII_MGMT_ADDRESS_ADDR, addr);
+
+       PHY_CONTROL_SET(&wr_data, data);
+       xgene_enet_wr_mcx_mac(pdata, MII_MGMT_CONTROL_ADDR, wr_data);
+       do {
+               usleep_range(5, 10);
+               xgene_enet_rd_mcx_mac(pdata, MII_MGMT_INDICATORS_ADDR, &done);
+       } while ((done & BUSY_MASK) && wait--);
+
+       if (done & BUSY_MASK) {
+               netdev_err(pdata->ndev, "MII_MGMT write failed\n");
+               return -EBUSY;
+       }
+
+       return 0;
+}
+
+static int xgene_mii_phy_read(struct xgene_enet_pdata *pdata,
+                             u8 phy_id, u32 reg)
+{
+       u32 addr = 0;
+       u32 data, done;
+       u8 wait = 10;
+
+       PHY_ADDR_SET(&addr, phy_id);
+       REG_ADDR_SET(&addr, reg);
+       xgene_enet_wr_mcx_mac(pdata, MII_MGMT_ADDRESS_ADDR, addr);
+       xgene_enet_wr_mcx_mac(pdata, MII_MGMT_COMMAND_ADDR, READ_CYCLE_MASK);
+       do {
+               usleep_range(5, 10);
+               xgene_enet_rd_mcx_mac(pdata, MII_MGMT_INDICATORS_ADDR, &done);
+       } while ((done & BUSY_MASK) && wait--);
+
+       if (done & BUSY_MASK) {
+               netdev_err(pdata->ndev, "MII_MGMT read failed\n");
+               return -EBUSY;
+       }
+
+       xgene_enet_rd_mcx_mac(pdata, MII_MGMT_STATUS_ADDR, &data);
+       xgene_enet_wr_mcx_mac(pdata, MII_MGMT_COMMAND_ADDR, 0);
+
+       return data;
+}
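
Together these two helpers implement the usual MDIO management sequence: set the PHY/register address, kick a read or write cycle, and poll the BUSY indicator. A usage sketch for a probe-time helper, where MII_PHYSID1 comes from the generic <linux/mii.h> header rather than this patch:

        int id1 = xgene_mii_phy_read(pdata, phy_id, MII_PHYSID1);

        if (id1 < 0)
                return id1;     /* management bus timed out (-EBUSY) */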
+
+void xgene_gmac_set_mac_addr(struct xgene_enet_pdata *pdata)
+{
+       u32 addr0, addr1;
+       u8 *dev_addr = pdata->ndev->dev_addr;
+
+       addr0 = (dev_addr[3] << 24) | (dev_addr[2] << 16) |
+               (dev_addr[1] << 8) | dev_addr[0];
+       addr1 = (dev_addr[5] << 24) | (dev_addr[4] << 16);
+       addr1 |= pdata->phy_addr & 0xFFFF;
+
+       xgene_enet_wr_mcx_mac(pdata, STATION_ADDR0_ADDR, addr0);
+       xgene_enet_wr_mcx_mac(pdata, STATION_ADDR1_ADDR, addr1);
+}
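
The station address registers take the MAC bytes low-first. For example (illustration only):

        /* dev_addr = 00:11:22:33:44:55
         *   STATION_ADDR0 = 0x33221100
         *   STATION_ADDR1 = 0x55440000 | (pdata->phy_addr & 0xffff)
         */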
+
+static int xgene_enet_ecc_init(struct xgene_enet_pdata *pdata)
+{
+       struct net_device *ndev = pdata->ndev;
+       u32 data;
+       u8 wait = 10;
+
+       xgene_enet_wr_diag_csr(pdata, ENET_CFG_MEM_RAM_SHUTDOWN_ADDR, 0x0);
+       do {
+               usleep_range(100, 110);
+               xgene_enet_rd_diag_csr(pdata, ENET_BLOCK_MEM_RDY_ADDR, &data);
+       } while ((data != 0xffffffff) && wait--);
+
+       if (data != 0xffffffff) {
+               netdev_err(ndev, "Failed to release memory from shutdown\n");
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
+void xgene_gmac_reset(struct xgene_enet_pdata *pdata)
+{
+       xgene_enet_wr_mcx_mac(pdata, MAC_CONFIG_1_ADDR, SOFT_RESET1);
+       xgene_enet_wr_mcx_mac(pdata, MAC_CONFIG_1_ADDR, 0);
+}
+
+void xgene_gmac_init(struct xgene_enet_pdata *pdata, int speed)
+{
+       u32 value, mc2;
+       u32 intf_ctl, rgmii;
+       u32 icm0, icm2;
+
+       xgene_gmac_reset(pdata);
+
+       xgene_enet_rd_mcx_csr(pdata, ICM_CONFIG0_REG_0_ADDR, &icm0);
+       xgene_enet_rd_mcx_csr(pdata, ICM_CONFIG2_REG_0_ADDR, &icm2);
+       xgene_enet_rd_mcx_mac(pdata, MAC_CONFIG_2_ADDR, &mc2);
+       xgene_enet_rd_mcx_mac(pdata, INTERFACE_CONTROL_ADDR, &intf_ctl);
+       xgene_enet_rd_csr(pdata, RGMII_REG_0_ADDR, &rgmii);
+
+       switch (speed) {
+       case SPEED_10:
+               ENET_INTERFACE_MODE2_SET(&mc2, 1);
+               CFG_MACMODE_SET(&icm0, 0);
+               CFG_WAITASYNCRD_SET(&icm2, 500);
+               rgmii &= ~CFG_SPEED_1250;
+               break;
+       case SPEED_100:
+               ENET_INTERFACE_MODE2_SET(&mc2, 1);
+               intf_ctl |= ENET_LHD_MODE;
+               CFG_MACMODE_SET(&icm0, 1);
+               CFG_WAITASYNCRD_SET(&icm2, 80);
+               rgmii &= ~CFG_SPEED_1250;
+               break;
+       default:
+               ENET_INTERFACE_MODE2_SET(&mc2, 2);
+               intf_ctl |= ENET_GHD_MODE;
+               CFG_TXCLK_MUXSEL0_SET(&rgmii, 4);
+               xgene_enet_rd_csr(pdata, DEBUG_REG_ADDR, &value);
+               value |= CFG_BYPASS_UNISEC_TX | CFG_BYPASS_UNISEC_RX;
+               xgene_enet_wr_csr(pdata, DEBUG_REG_ADDR, value);
+               break;
+       }
+
+       mc2 |= FULL_DUPLEX2;
+       xgene_enet_wr_mcx_mac(pdata, MAC_CONFIG_2_ADDR, mc2);
+       xgene_enet_wr_mcx_mac(pdata, INTERFACE_CONTROL_ADDR, intf_ctl);
+
+       xgene_gmac_set_mac_addr(pdata);
+
+       /* Adjust MDC clock frequency */
+       xgene_enet_rd_mcx_mac(pdata, MII_MGMT_CONFIG_ADDR, &value);
+       MGMT_CLOCK_SEL_SET(&value, 7);
+       xgene_enet_wr_mcx_mac(pdata, MII_MGMT_CONFIG_ADDR, value);
+
+       /* Enable drop if bufpool not available */
+       xgene_enet_rd_csr(pdata, RSIF_CONFIG_REG_ADDR, &value);
+       value |= CFG_RSIF_FPBUFF_TIMEOUT_EN;
+       xgene_enet_wr_csr(pdata, RSIF_CONFIG_REG_ADDR, value);
+
+       /* Rtype should be copied from FP */
+       xgene_enet_wr_csr(pdata, RSIF_RAM_DBG_REG0_ADDR, 0);
+       xgene_enet_wr_csr(pdata, RGMII_REG_0_ADDR, rgmii);
+
+       /* Rx-Tx traffic resume */
+       xgene_enet_wr_csr(pdata, CFG_LINK_AGGR_RESUME_0_ADDR, TX_PORT0);
+
+       xgene_enet_wr_mcx_csr(pdata, ICM_CONFIG0_REG_0_ADDR, icm0);
+       xgene_enet_wr_mcx_csr(pdata, ICM_CONFIG2_REG_0_ADDR, icm2);
+
+       xgene_enet_rd_mcx_csr(pdata, RX_DV_GATE_REG_0_ADDR, &value);
+       value &= ~TX_DV_GATE_EN0;
+       value &= ~RX_DV_GATE_EN0;
+       value |= RESUME_RX0;
+       xgene_enet_wr_mcx_csr(pdata, RX_DV_GATE_REG_0_ADDR, value);
+
+       xgene_enet_wr_csr(pdata, CFG_BYPASS_ADDR, RESUME_TX);
+}
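
The speed argument selects the interface mode and the inter-clock-module timings in one place; summarizing the switch above as a comment sketch:

        /* SPEED_10  : INTERFACE_MODE2 = 1, MACMODE = 0, WAITASYNCRD = 500,
         *             CFG_SPEED_1250 cleared
         * SPEED_100 : INTERFACE_MODE2 = 1, ENET_LHD_MODE, MACMODE = 1,
         *             WAITASYNCRD = 80, CFG_SPEED_1250 cleared
         * default   : INTERFACE_MODE2 = 2, ENET_GHD_MODE, TXCLK_MUXSEL0 = 4,
         *             UNISEC TX/RX bypass enabled
         */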
+
+static void xgene_enet_config_ring_if_assoc(struct xgene_enet_pdata *pdata)
+{
+       u32 val = 0xffffffff;
+
+       xgene_enet_wr_ring_if(pdata, ENET_CFGSSQMIWQASSOC_ADDR, val);
+       xgene_enet_wr_ring_if(pdata, ENET_CFGSSQMIFPQASSOC_ADDR, val);
+       xgene_enet_wr_ring_if(pdata, ENET_CFGSSQMIQMLITEWQASSOC_ADDR, val);
+       xgene_enet_wr_ring_if(pdata, ENET_CFGSSQMIQMLITEFPQASSOC_ADDR, val);
+}
+
+void xgene_enet_cle_bypass(struct xgene_enet_pdata *pdata,
+                          u32 dst_ring_num, u16 bufpool_id)
+{
+       u32 cb;
+       u32 fpsel;
+
+       fpsel = xgene_enet_ring_bufnum(bufpool_id) - 0x20;
+
+       xgene_enet_rd_csr(pdata, CLE_BYPASS_REG0_0_ADDR, &cb);
+       cb |= CFG_CLE_BYPASS_EN0;
+       CFG_CLE_IP_PROTOCOL0_SET(&cb, 3);
+       xgene_enet_wr_csr(pdata, CLE_BYPASS_REG0_0_ADDR, cb);
+
+       xgene_enet_rd_csr(pdata, CLE_BYPASS_REG1_0_ADDR, &cb);
+       CFG_CLE_DSTQID0_SET(&cb, dst_ring_num);
+       CFG_CLE_FPSEL0_SET(&cb, fpsel);
+       xgene_enet_wr_csr(pdata, CLE_BYPASS_REG1_0_ADDR, cb);
+}
+
+void xgene_gmac_rx_enable(struct xgene_enet_pdata *pdata)
+{
+       u32 data;
+
+       xgene_enet_rd_mcx_mac(pdata, MAC_CONFIG_1_ADDR, &data);
+       xgene_enet_wr_mcx_mac(pdata, MAC_CONFIG_1_ADDR, data | RX_EN);
+}
+
+void xgene_gmac_tx_enable(struct xgene_enet_pdata *pdata)
+{
+       u32 data;
+
+       xgene_enet_rd_mcx_mac(pdata, MAC_CONFIG_1_ADDR, &data);
+       xgene_enet_wr_mcx_mac(pdata, MAC_CONFIG_1_ADDR, data | TX_EN);
+}
+
+void xgene_gmac_rx_disable(struct xgene_enet_pdata *pdata)
+{
+       u32 data;
+
+       xgene_enet_rd_mcx_mac(pdata, MAC_CONFIG_1_ADDR, &data);
+       xgene_enet_wr_mcx_mac(pdata, MAC_CONFIG_1_ADDR, data & ~RX_EN);
+}
+
+void xgene_gmac_tx_disable(struct xgene_enet_pdata *pdata)
+{
+       u32 data;
+
+       xgene_enet_rd_mcx_mac(pdata, MAC_CONFIG_1_ADDR, &data);
+       xgene_enet_wr_mcx_mac(pdata, MAC_CONFIG_1_ADDR, data & ~TX_EN);
+}
+
+void xgene_enet_reset(struct xgene_enet_pdata *pdata)
+{
+       u32 val;
+
+       clk_prepare_enable(pdata->clk);
+       clk_disable_unprepare(pdata->clk);
+       clk_prepare_enable(pdata->clk);
+       xgene_enet_ecc_init(pdata);
+       xgene_enet_config_ring_if_assoc(pdata);
+
+       /* Enable auto-incr for scanning */
+       xgene_enet_rd_mcx_mac(pdata, MII_MGMT_CONFIG_ADDR, &val);
+       val |= SCAN_AUTO_INCR;
+       MGMT_CLOCK_SEL_SET(&val, 1);
+       xgene_enet_wr_mcx_mac(pdata, MII_MGMT_CONFIG_ADDR, val);
+}
+
+void xgene_gport_shutdown(struct xgene_enet_pdata *pdata)
+{
+       clk_disable_unprepare(pdata->clk);
+}
+
+static int xgene_enet_mdio_read(struct mii_bus *bus, int mii_id, int regnum)
+{
+       struct xgene_enet_pdata *pdata = bus->priv;
+       u32 val;
+
+       val = xgene_mii_phy_read(pdata, mii_id, regnum);
+       netdev_dbg(pdata->ndev, "mdio_rd: bus=%d reg=%d val=%x\n",
+                  mii_id, regnum, val);
+
+       return val;
+}
+
+static int xgene_enet_mdio_write(struct mii_bus *bus, int mii_id, int regnum,
+                                u16 val)
+{
+       struct xgene_enet_pdata *pdata = bus->priv;
+
+       netdev_dbg(pdata->ndev, "mdio_wr: bus=%d reg=%d val=%x\n",
+                  mii_id, regnum, val);
+       return xgene_mii_phy_write(pdata, mii_id, regnum, val);
+}
+
+static void xgene_enet_adjust_link(struct net_device *ndev)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+       struct phy_device *phydev = pdata->phy_dev;
+
+       if (phydev->link) {
+               if (pdata->phy_speed != phydev->speed) {
+                       xgene_gmac_init(pdata, phydev->speed);
+                       xgene_gmac_rx_enable(pdata);
+                       xgene_gmac_tx_enable(pdata);
+                       pdata->phy_speed = phydev->speed;
+                       phy_print_status(phydev);
+               }
+       } else {
+               xgene_gmac_rx_disable(pdata);
+               xgene_gmac_tx_disable(pdata);
+               pdata->phy_speed = SPEED_UNKNOWN;
+               phy_print_status(phydev);
+       }
+}
+
+static int xgene_enet_phy_connect(struct net_device *ndev)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+       struct device_node *phy_np;
+       struct phy_device *phy_dev;
+       struct device *dev = &pdata->pdev->dev;
+
+       phy_np = of_parse_phandle(dev->of_node, "phy-handle", 0);
+       if (!phy_np) {
+               netdev_dbg(ndev, "No phy-handle found\n");
+               return -ENODEV;
+       }
+
+       phy_dev = of_phy_connect(ndev, phy_np, &xgene_enet_adjust_link,
+                                0, pdata->phy_mode);
+       if (!phy_dev) {
+               netdev_err(ndev, "Could not connect to PHY\n");
+               return  -ENODEV;
+       }
+
+       pdata->phy_speed = SPEED_UNKNOWN;
+       phy_dev->supported &= ~SUPPORTED_10baseT_Half &
+                             ~SUPPORTED_100baseT_Half &
+                             ~SUPPORTED_1000baseT_Half;
+       phy_dev->advertising = phy_dev->supported;
+       pdata->phy_dev = phy_dev;
+
+       return 0;
+}
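
The half-duplex mask-out above is correct even though it reads oddly: by De Morgan, ~A & ~B & ~C equals ~(A | B | C), so the statement clears all three half-duplex capability bits at once. An equivalent, arguably clearer spelling:

        phy_dev->supported &= ~(SUPPORTED_10baseT_Half |
                                SUPPORTED_100baseT_Half |
                                SUPPORTED_1000baseT_Half);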
+
+int xgene_enet_mdio_config(struct xgene_enet_pdata *pdata)
+{
+       struct net_device *ndev = pdata->ndev;
+       struct device *dev = &pdata->pdev->dev;
+       struct device_node *child_np;
+       struct device_node *mdio_np = NULL;
+       struct mii_bus *mdio_bus;
+       int ret;
+
+       for_each_child_of_node(dev->of_node, child_np) {
+               if (of_device_is_compatible(child_np, "apm,xgene-mdio")) {
+                       mdio_np = child_np;
+                       break;
+               }
+       }
+
+       if (!mdio_np) {
+               netdev_dbg(ndev, "No mdio node in the dts\n");
+               return -ENXIO;
+       }
+
+       mdio_bus = mdiobus_alloc();
+       if (!mdio_bus)
+               return -ENOMEM;
+
+       mdio_bus->name = "APM X-Gene MDIO bus";
+       mdio_bus->read = xgene_enet_mdio_read;
+       mdio_bus->write = xgene_enet_mdio_write;
+       snprintf(mdio_bus->id, MII_BUS_ID_SIZE, "%s-%s", "xgene-mii",
+                ndev->name);
+
+       mdio_bus->priv = pdata;
+       mdio_bus->parent = &ndev->dev;
+
+       ret = of_mdiobus_register(mdio_bus, mdio_np);
+       if (ret) {
+               netdev_err(ndev, "Failed to register MDIO bus\n");
+               mdiobus_free(mdio_bus);
+               return ret;
+       }
+       pdata->mdio_bus = mdio_bus;
+
+       ret = xgene_enet_phy_connect(ndev);
+       if (ret)
+               xgene_enet_mdio_remove(pdata);
+
+       return ret;
+}
+
+void xgene_enet_mdio_remove(struct xgene_enet_pdata *pdata)
+{
+       mdiobus_unregister(pdata->mdio_bus);
+       mdiobus_free(pdata->mdio_bus);
+       pdata->mdio_bus = NULL;
+}
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
new file mode 100644 (file)
index 0000000..371e7a5
--- /dev/null
@@ -0,0 +1,337 @@
+/* Applied Micro X-Gene SoC Ethernet Driver
+ *
+ * Copyright (c) 2014, Applied Micro Circuits Corporation
+ * Authors: Iyappan Subramanian <isubramanian@apm.com>
+ *         Ravi Patel <rapatel@apm.com>
+ *         Keyur Chudgar <kchudgar@apm.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __XGENE_ENET_HW_H__
+#define __XGENE_ENET_HW_H__
+
+#include "xgene_enet_main.h"
+
+struct xgene_enet_pdata;
+struct xgene_enet_stats;
+
+/* clears and then sets the specified bit field */
+static inline void xgene_set_bits(u32 *dst, u32 val, u32 start, u32 len)
+{
+       u32 end = start + len - 1;
+       u32 mask = GENMASK(end, start);
+
+       *dst &= ~mask;
+       *dst |= (val << start) & mask;
+}
+
+static inline u32 xgene_get_bits(u32 val, u32 start, u32 end)
+{
+       return (val & GENMASK(end, start)) >> start;
+}
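
Mind the asymmetric conventions here: xgene_set_bits() takes (start, len) while xgene_get_bits() takes (start, end). A round-trip sketch:

        u32 v = 0;

        xgene_set_bits(&v, 0x5, 8, 3);            /* v == 0x500            */
        WARN_ON(xgene_get_bits(v, 8, 10) != 0x5); /* end = 8 + 3 - 1 = 10  */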
+
+#define CSR_RING_ID            0x0008
+#define OVERWRITE              BIT(31)
+#define IS_BUFFER_POOL         BIT(20)
+#define PREFETCH_BUF_EN                BIT(21)
+#define CSR_RING_ID_BUF                0x000c
+#define CSR_RING_NE_INT_MODE   0x017c
+#define CSR_RING_CONFIG                0x006c
+#define CSR_RING_WR_BASE       0x0070
+#define NUM_RING_CONFIG                5
+#define BUFPOOL_MODE           3
+#define RM3                    3
+#define INC_DEC_CMD_ADDR       0x002c
+#define UDP_HDR_SIZE           2
+#define BUF_LEN_CODE_2K                0x5000
+
+#define CREATE_MASK(pos, len)          GENMASK((pos)+(len)-1, (pos))
+#define CREATE_MASK_ULL(pos, len)      GENMASK_ULL((pos)+(len)-1, (pos))
+
+/* Empty slot soft signature */
+#define EMPTY_SLOT_INDEX       1
+#define EMPTY_SLOT             ~0ULL
+
+#define WORK_DESC_SIZE         32
+#define BUFPOOL_DESC_SIZE      16
+
+#define RING_OWNER_MASK                GENMASK(9, 6)
+#define RING_BUFNUM_MASK       GENMASK(5, 0)
+
+#define SELTHRSH_POS           3
+#define SELTHRSH_LEN           3
+#define RINGADDRL_POS          5
+#define RINGADDRL_LEN          27
+#define RINGADDRH_POS          0
+#define RINGADDRH_LEN          6
+#define RINGSIZE_POS           23
+#define RINGSIZE_LEN           3
+#define RINGTYPE_POS           19
+#define RINGTYPE_LEN           2
+#define RINGMODE_POS           20
+#define RINGMODE_LEN           3
+#define RECOMTIMEOUTL_POS      28
+#define RECOMTIMEOUTL_LEN      3
+#define RECOMTIMEOUTH_POS      0
+#define RECOMTIMEOUTH_LEN      2
+#define NUMMSGSINQ_POS         1
+#define NUMMSGSINQ_LEN         16
+#define ACCEPTLERR             BIT(19)
+#define QCOHERENT              BIT(4)
+#define RECOMBBUF              BIT(27)
+
+#define BLOCK_ETH_CSR_OFFSET           0x2000
+#define BLOCK_ETH_RING_IF_OFFSET       0x9000
+#define BLOCK_ETH_CLKRST_CSR_OFFSET    0xC000
+#define BLOCK_ETH_DIAG_CSR_OFFSET      0xD000
+
+#define BLOCK_ETH_MAC_OFFSET           0x0000
+#define BLOCK_ETH_STATS_OFFSET         0x0014
+#define BLOCK_ETH_MAC_CSR_OFFSET       0x2800
+
+#define MAC_ADDR_REG_OFFSET            0x00
+#define MAC_COMMAND_REG_OFFSET         0x04
+#define MAC_WRITE_REG_OFFSET           0x08
+#define MAC_READ_REG_OFFSET            0x0c
+#define MAC_COMMAND_DONE_REG_OFFSET    0x10
+
+#define STAT_ADDR_REG_OFFSET           0x00
+#define STAT_COMMAND_REG_OFFSET                0x04
+#define STAT_WRITE_REG_OFFSET          0x08
+#define STAT_READ_REG_OFFSET           0x0c
+#define STAT_COMMAND_DONE_REG_OFFSET   0x10
+
+#define MII_MGMT_CONFIG_ADDR           0x20
+#define MII_MGMT_COMMAND_ADDR          0x24
+#define MII_MGMT_ADDRESS_ADDR          0x28
+#define MII_MGMT_CONTROL_ADDR          0x2c
+#define MII_MGMT_STATUS_ADDR           0x30
+#define MII_MGMT_INDICATORS_ADDR       0x34
+
+#define BUSY_MASK                      BIT(0)
+#define READ_CYCLE_MASK                        BIT(0)
+#define PHY_CONTROL_SET(dst, val)      xgene_set_bits(dst, val, 0, 16)
+
+#define ENET_SPARE_CFG_REG_ADDR                0x0750
+#define RSIF_CONFIG_REG_ADDR           0x0010
+#define RSIF_RAM_DBG_REG0_ADDR         0x0048
+#define RGMII_REG_0_ADDR               0x07e0
+#define CFG_LINK_AGGR_RESUME_0_ADDR    0x07c8
+#define DEBUG_REG_ADDR                 0x0700
+#define CFG_BYPASS_ADDR                        0x0294
+#define CLE_BYPASS_REG0_0_ADDR         0x0490
+#define CLE_BYPASS_REG1_0_ADDR         0x0494
+#define CFG_RSIF_FPBUFF_TIMEOUT_EN     BIT(31)
+#define RESUME_TX                      BIT(0)
+#define CFG_SPEED_1250                 BIT(24)
+#define TX_PORT0                       BIT(0)
+#define CFG_BYPASS_UNISEC_TX           BIT(2)
+#define CFG_BYPASS_UNISEC_RX           BIT(1)
+#define CFG_CLE_BYPASS_EN0             BIT(31)
+#define CFG_TXCLK_MUXSEL0_SET(dst, val)        xgene_set_bits(dst, val, 29, 3)
+
+#define CFG_CLE_IP_PROTOCOL0_SET(dst, val)     xgene_set_bits(dst, val, 16, 2)
+#define CFG_CLE_DSTQID0_SET(dst, val)          xgene_set_bits(dst, val, 0, 12)
+#define CFG_CLE_FPSEL0_SET(dst, val)           xgene_set_bits(dst, val, 16, 4)
+#define CFG_MACMODE_SET(dst, val)              xgene_set_bits(dst, val, 18, 2)
+#define CFG_WAITASYNCRD_SET(dst, val)          xgene_set_bits(dst, val, 0, 16)
+#define ICM_CONFIG0_REG_0_ADDR         0x0400
+#define ICM_CONFIG2_REG_0_ADDR         0x0410
+#define RX_DV_GATE_REG_0_ADDR          0x05fc
+#define TX_DV_GATE_EN0                 BIT(2)
+#define RX_DV_GATE_EN0                 BIT(1)
+#define RESUME_RX0                     BIT(0)
+#define ENET_CFGSSQMIWQASSOC_ADDR              0xe0
+#define ENET_CFGSSQMIFPQASSOC_ADDR             0xdc
+#define ENET_CFGSSQMIQMLITEFPQASSOC_ADDR       0xf0
+#define ENET_CFGSSQMIQMLITEWQASSOC_ADDR                0xf4
+#define ENET_CFG_MEM_RAM_SHUTDOWN_ADDR         0x70
+#define ENET_BLOCK_MEM_RDY_ADDR                        0x74
+#define MAC_CONFIG_1_ADDR                      0x00
+#define MAC_CONFIG_2_ADDR                      0x04
+#define MAX_FRAME_LEN_ADDR                     0x10
+#define INTERFACE_CONTROL_ADDR                 0x38
+#define STATION_ADDR0_ADDR                     0x40
+#define STATION_ADDR1_ADDR                     0x44
+#define PHY_ADDR_SET(dst, val)                 xgene_set_bits(dst, val, 8, 5)
+#define REG_ADDR_SET(dst, val)                 xgene_set_bits(dst, val, 0, 5)
+#define ENET_INTERFACE_MODE2_SET(dst, val)     xgene_set_bits(dst, val, 8, 2)
+#define MGMT_CLOCK_SEL_SET(dst, val)           xgene_set_bits(dst, val, 0, 3)
+#define SOFT_RESET1                    BIT(31)
+#define TX_EN                          BIT(0)
+#define RX_EN                          BIT(2)
+#define ENET_LHD_MODE                  BIT(25)
+#define ENET_GHD_MODE                  BIT(26)
+#define FULL_DUPLEX2                   BIT(0)
+#define SCAN_AUTO_INCR                 BIT(5)
+#define TBYT_ADDR                      0x38
+#define TPKT_ADDR                      0x39
+#define TDRP_ADDR                      0x45
+#define TFCS_ADDR                      0x47
+#define TUND_ADDR                      0x4a
+
+#define TSO_IPPROTO_TCP                        1
+#define FULL_DUPLEX                    2
+
+#define USERINFO_POS                   0
+#define USERINFO_LEN                   32
+#define FPQNUM_POS                     32
+#define FPQNUM_LEN                     12
+#define LERR_POS                       60
+#define LERR_LEN                       3
+#define STASH_POS                      52
+#define STASH_LEN                      2
+#define BUFDATALEN_POS                 48
+#define BUFDATALEN_LEN                 12
+#define DATAADDR_POS                   0
+#define DATAADDR_LEN                   42
+#define COHERENT_POS                   63
+#define HENQNUM_POS                    48
+#define HENQNUM_LEN                    12
+#define TYPESEL_POS                    44
+#define TYPESEL_LEN                    4
+#define ETHHDR_POS                     12
+#define ETHHDR_LEN                     8
+#define IC_POS                         35      /* Insert CRC */
+#define TCPHDR_POS                     0
+#define TCPHDR_LEN                     6
+#define IPHDR_POS                      6
+#define IPHDR_LEN                      6
+#define EC_POS                         22      /* Enable checksum */
+#define EC_LEN                         1
+#define IS_POS                         24      /* IP protocol select */
+#define IS_LEN                         1
+#define TYPE_ETH_WORK_MESSAGE_POS      44
+
+struct xgene_enet_raw_desc {
+       __le64 m0;
+       __le64 m1;
+       __le64 m2;
+       __le64 m3;
+};
+
+struct xgene_enet_raw_desc16 {
+       __le64 m0;
+       __le64 m1;
+};
+
+static inline void xgene_enet_mark_desc_slot_empty(void *desc_slot_ptr)
+{
+       __le64 *desc_slot = desc_slot_ptr;
+
+       desc_slot[EMPTY_SLOT_INDEX] = cpu_to_le64(EMPTY_SLOT);
+}
+
+static inline bool xgene_enet_is_desc_slot_empty(void *desc_slot_ptr)
+{
+       __le64 *desc_slot = desc_slot_ptr;
+
+       return (desc_slot[EMPTY_SLOT_INDEX] == cpu_to_le64(EMPTY_SLOT));
+}
+
+enum xgene_enet_ring_cfgsize {
+       RING_CFGSIZE_512B,
+       RING_CFGSIZE_2KB,
+       RING_CFGSIZE_16KB,
+       RING_CFGSIZE_64KB,
+       RING_CFGSIZE_512KB,
+       RING_CFGSIZE_INVALID
+};
+
+enum xgene_enet_ring_type {
+       RING_DISABLED,
+       RING_REGULAR,
+       RING_BUFPOOL
+};
+
+enum xgene_ring_owner {
+       RING_OWNER_ETH0,
+       RING_OWNER_CPU = 15,
+       RING_OWNER_INVALID
+};
+
+enum xgene_enet_ring_bufnum {
+       RING_BUFNUM_REGULAR = 0x0,
+       RING_BUFNUM_BUFPOOL = 0x20,
+       RING_BUFNUM_INVALID
+};
+
+enum xgene_enet_cmd {
+       XGENE_ENET_WR_CMD = BIT(31),
+       XGENE_ENET_RD_CMD = BIT(30)
+};
+
+enum xgene_enet_err_code {
+       HBF_READ_DATA = 3,
+       HBF_LL_READ = 4,
+       BAD_WORK_MSG = 6,
+       BUFPOOL_TIMEOUT = 15,
+       INGRESS_CRC = 16,
+       INGRESS_CHECKSUM = 17,
+       INGRESS_TRUNC_FRAME = 18,
+       INGRESS_PKT_LEN = 19,
+       INGRESS_PKT_UNDER = 20,
+       INGRESS_FIFO_OVERRUN = 21,
+       INGRESS_CHECKSUM_COMPUTE = 26,
+       ERR_CODE_INVALID
+};
+
+static inline enum xgene_ring_owner xgene_enet_ring_owner(u16 id)
+{
+       return (id & RING_OWNER_MASK) >> 6;
+}
+
+static inline u8 xgene_enet_ring_bufnum(u16 id)
+{
+       return id & RING_BUFNUM_MASK;
+}
+
+static inline bool xgene_enet_is_bufpool(u16 id)
+{
+       return ((id & RING_BUFNUM_MASK) >= 0x20) ? true : false;
+}
+
+static inline u16 xgene_enet_get_numslots(u16 id, u32 size)
+{
+       bool is_bufpool = xgene_enet_is_bufpool(id);
+
+       return (is_bufpool) ? size / BUFPOOL_DESC_SIZE :
+                     size / WORK_DESC_SIZE;
+}
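
Ring ids pack the owner into bits 9..6 and a buffer number into bits 5..0, and buffer-pool rings are those with a bufnum of 0x20 or above; their descriptors are 16 bytes versus 32 for work rings, which is what get_numslots divides by. For example, the buffer pool created in xgene_enet_create_desc_rings() in this patch:

        /* id = (RING_OWNER_ETH0 << 6) | 0x20 = 0x020
         *   xgene_enet_ring_owner(0x020)  == RING_OWNER_ETH0
         *   xgene_enet_ring_bufnum(0x020) == 0x20 -> is_bufpool() == true
         */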
+
+struct xgene_enet_desc_ring *xgene_enet_setup_ring(
+               struct xgene_enet_desc_ring *ring);
+void xgene_enet_clear_ring(struct xgene_enet_desc_ring *ring);
+void xgene_enet_parse_error(struct xgene_enet_desc_ring *ring,
+                           struct xgene_enet_pdata *pdata,
+                           enum xgene_enet_err_code status);
+
+void xgene_enet_reset(struct xgene_enet_pdata *priv);
+void xgene_gmac_reset(struct xgene_enet_pdata *priv);
+void xgene_gmac_init(struct xgene_enet_pdata *priv, int speed);
+void xgene_gmac_tx_enable(struct xgene_enet_pdata *priv);
+void xgene_gmac_rx_enable(struct xgene_enet_pdata *priv);
+void xgene_gmac_tx_disable(struct xgene_enet_pdata *priv);
+void xgene_gmac_rx_disable(struct xgene_enet_pdata *priv);
+void xgene_gmac_set_mac_addr(struct xgene_enet_pdata *pdata);
+void xgene_enet_cle_bypass(struct xgene_enet_pdata *pdata,
+                          u32 dst_ring_num, u16 bufpool_id);
+void xgene_gport_shutdown(struct xgene_enet_pdata *priv);
+void xgene_gmac_get_tx_stats(struct xgene_enet_pdata *pdata);
+
+int xgene_enet_mdio_config(struct xgene_enet_pdata *pdata);
+void xgene_enet_mdio_remove(struct xgene_enet_pdata *pdata);
+
+#endif /* __XGENE_ENET_HW_H__ */
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
new file mode 100644 (file)
index 0000000..af7c40a
--- /dev/null
@@ -0,0 +1,951 @@
+/* Applied Micro X-Gene SoC Ethernet Driver
+ *
+ * Copyright (c) 2014, Applied Micro Circuits Corporation
+ * Authors: Iyappan Subramanian <isubramanian@apm.com>
+ *         Ravi Patel <rapatel@apm.com>
+ *         Keyur Chudgar <kchudgar@apm.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "xgene_enet_main.h"
+#include "xgene_enet_hw.h"
+
+static void xgene_enet_init_bufpool(struct xgene_enet_desc_ring *buf_pool)
+{
+       struct xgene_enet_raw_desc16 *raw_desc;
+       int i;
+
+       for (i = 0; i < buf_pool->slots; i++) {
+               raw_desc = &buf_pool->raw_desc16[i];
+
+               /* Hardware expects descriptor in little endian format */
+               raw_desc->m0 = cpu_to_le64(i |
+                               SET_VAL(FPQNUM, buf_pool->dst_ring_num) |
+                               SET_VAL(STASH, 3));
+       }
+}
+
+static int xgene_enet_refill_bufpool(struct xgene_enet_desc_ring *buf_pool,
+                                    u32 nbuf)
+{
+       struct sk_buff *skb;
+       struct xgene_enet_raw_desc16 *raw_desc;
+       struct net_device *ndev;
+       struct device *dev;
+       dma_addr_t dma_addr;
+       u32 tail = buf_pool->tail;
+       u32 slots = buf_pool->slots - 1;
+       u16 bufdatalen, len;
+       int i;
+
+       ndev = buf_pool->ndev;
+       dev = ndev_to_dev(buf_pool->ndev);
+       bufdatalen = BUF_LEN_CODE_2K | (SKB_BUFFER_SIZE & GENMASK(11, 0));
+       len = XGENE_ENET_MAX_MTU;
+
+       for (i = 0; i < nbuf; i++) {
+               raw_desc = &buf_pool->raw_desc16[tail];
+
+               skb = netdev_alloc_skb_ip_align(ndev, len);
+               if (unlikely(!skb))
+                       return -ENOMEM;
+               buf_pool->rx_skb[tail] = skb;
+
+               dma_addr = dma_map_single(dev, skb->data, len, DMA_FROM_DEVICE);
+               if (dma_mapping_error(dev, dma_addr)) {
+                       netdev_err(ndev, "DMA mapping error\n");
+                       dev_kfree_skb_any(skb);
+                       return -EINVAL;
+               }
+
+               raw_desc->m1 = cpu_to_le64(SET_VAL(DATAADDR, dma_addr) |
+                                          SET_VAL(BUFDATALEN, bufdatalen) |
+                                          SET_BIT(COHERENT));
+               tail = (tail + 1) & slots;
+       }
+
+       iowrite32(nbuf, buf_pool->cmd);
+       buf_pool->tail = tail;
+
+       return 0;
+}
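
The tail arithmetic assumes the slot count is a power of two (slots - 1 doubles as the wrap mask), and the final iowrite32 credits the hardware with the number of buffers just added; xgene_enet_delete_bufpool() below writes a negative count to take credits back. Wrap sketch, assuming a 256-entry pool purely for illustration:

        u32 mask = 256 - 1;             /* buf_pool->slots - 1 */
        u32 tail = 255;

        tail = (tail + 1) & mask;       /* wraps to 0 */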
+
+static u16 xgene_enet_dst_ring_num(struct xgene_enet_desc_ring *ring)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ring->ndev);
+
+       return ((u16)pdata->rm << 10) | ring->num;
+}
+
+static u8 xgene_enet_hdr_len(const void *data)
+{
+       const struct ethhdr *eth = data;
+
+       return (eth->h_proto == htons(ETH_P_8021Q)) ? VLAN_ETH_HLEN : ETH_HLEN;
+}
+
+static u32 xgene_enet_ring_len(struct xgene_enet_desc_ring *ring)
+{
+       u32 __iomem *cmd_base = ring->cmd_base;
+       u32 ring_state, num_msgs;
+
+       ring_state = ioread32(&cmd_base[1]);
+       num_msgs = ring_state & CREATE_MASK(NUMMSGSINQ_POS, NUMMSGSINQ_LEN);
+
+       return num_msgs >> NUMMSGSINQ_POS;
+}
+
+static void xgene_enet_delete_bufpool(struct xgene_enet_desc_ring *buf_pool)
+{
+       struct xgene_enet_raw_desc16 *raw_desc;
+       u32 slots = buf_pool->slots - 1;
+       u32 tail = buf_pool->tail;
+       u32 userinfo;
+       int i, len;
+
+       len = xgene_enet_ring_len(buf_pool);
+       for (i = 0; i < len; i++) {
+               tail = (tail - 1) & slots;
+               raw_desc = &buf_pool->raw_desc16[tail];
+
+               /* Hardware stores descriptor in little endian format */
+               userinfo = GET_VAL(USERINFO, le64_to_cpu(raw_desc->m0));
+               dev_kfree_skb_any(buf_pool->rx_skb[userinfo]);
+       }
+
+       iowrite32(-len, buf_pool->cmd);
+       buf_pool->tail = tail;
+}
+
+static irqreturn_t xgene_enet_rx_irq(const int irq, void *data)
+{
+       struct xgene_enet_desc_ring *rx_ring = data;
+
+       if (napi_schedule_prep(&rx_ring->napi)) {
+               disable_irq_nosync(irq);
+               __napi_schedule(&rx_ring->napi);
+       }
+
+       return IRQ_HANDLED;
+}
+
+static int xgene_enet_tx_completion(struct xgene_enet_desc_ring *cp_ring,
+                                   struct xgene_enet_raw_desc *raw_desc)
+{
+       struct sk_buff *skb;
+       struct device *dev;
+       u16 skb_index;
+       u8 status;
+       int ret = 0;
+
+       skb_index = GET_VAL(USERINFO, le64_to_cpu(raw_desc->m0));
+       skb = cp_ring->cp_skb[skb_index];
+
+       dev = ndev_to_dev(cp_ring->ndev);
+       dma_unmap_single(dev, GET_VAL(DATAADDR, le64_to_cpu(raw_desc->m1)),
+                        GET_VAL(BUFDATALEN, le64_to_cpu(raw_desc->m1)),
+                        DMA_TO_DEVICE);
+
+       /* check the LERR field for a transmit error */
+       status = GET_VAL(LERR, le64_to_cpu(raw_desc->m0));
+       if (unlikely(status > 2)) {
+               xgene_enet_parse_error(cp_ring, netdev_priv(cp_ring->ndev),
+                                      status);
+               ret = -EIO;
+       }
+
+       if (likely(skb)) {
+               dev_kfree_skb_any(skb);
+       } else {
+               netdev_err(cp_ring->ndev, "completion skb is NULL\n");
+               ret = -EIO;
+       }
+
+       return ret;
+}
+
+static u64 xgene_enet_work_msg(struct sk_buff *skb)
+{
+       struct iphdr *iph;
+       u8 l3hlen, l4hlen = 0;
+       u8 csum_enable = 0;
+       u8 proto = 0;
+       u8 ethhdr;
+       u64 hopinfo;
+
+       if (unlikely(skb->protocol != htons(ETH_P_IP)) &&
+           unlikely(skb->protocol != htons(ETH_P_8021Q)))
+               goto out;
+
+       if (unlikely(!(skb->dev->features & NETIF_F_IP_CSUM)))
+               goto out;
+
+       iph = ip_hdr(skb);
+       if (unlikely(ip_is_fragment(iph)))
+               goto out;
+
+       if (likely(iph->protocol == IPPROTO_TCP)) {
+               l4hlen = tcp_hdrlen(skb) >> 2;
+               csum_enable = 1;
+               proto = TSO_IPPROTO_TCP;
+       } else if (iph->protocol == IPPROTO_UDP) {
+               l4hlen = UDP_HDR_SIZE;
+               csum_enable = 1;
+       }
+out:
+       l3hlen = ip_hdrlen(skb) >> 2;
+       ethhdr = xgene_enet_hdr_len(skb->data);
+       hopinfo = SET_VAL(TCPHDR, l4hlen) |
+                 SET_VAL(IPHDR, l3hlen) |
+                 SET_VAL(ETHHDR, ethhdr) |
+                 SET_VAL(EC, csum_enable) |
+                 SET_VAL(IS, proto) |
+                 SET_BIT(IC) |
+                 SET_BIT(TYPE_ETH_WORK_MESSAGE);
+
+       return hopinfo;
+}
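
The returned hopinfo word carries the header lengths in 32-bit words plus the checksum-offload enables. SET_VAL/SET_BIT come from xgene_enet_main.h, which is not part of this hunk; assuming the usual shift-and-mask definition, a plain TCP frame without a VLAN tag would encode as:

        /* 20-byte IP header, 20-byte TCP header:
         *   TCPHDR = 5, IPHDR = 5, ETHHDR = 14 (ETH_HLEN),
         *   EC = 1, IS = TSO_IPPROTO_TCP, plus the IC and
         *   TYPE_ETH_WORK_MESSAGE bits
         */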
+
+static int xgene_enet_setup_tx_desc(struct xgene_enet_desc_ring *tx_ring,
+                                   struct sk_buff *skb)
+{
+       struct device *dev = ndev_to_dev(tx_ring->ndev);
+       struct xgene_enet_raw_desc *raw_desc;
+       dma_addr_t dma_addr;
+       u16 tail = tx_ring->tail;
+       u64 hopinfo;
+
+       raw_desc = &tx_ring->raw_desc[tail];
+       memset(raw_desc, 0, sizeof(struct xgene_enet_raw_desc));
+
+       dma_addr = dma_map_single(dev, skb->data, skb->len, DMA_TO_DEVICE);
+       if (dma_mapping_error(dev, dma_addr)) {
+               netdev_err(tx_ring->ndev, "DMA mapping error\n");
+               return -EINVAL;
+       }
+
+       /* Hardware expects descriptor in little endian format */
+       raw_desc->m0 = cpu_to_le64(tail);
+       raw_desc->m1 = cpu_to_le64(SET_VAL(DATAADDR, dma_addr) |
+                                  SET_VAL(BUFDATALEN, skb->len) |
+                                  SET_BIT(COHERENT));
+       hopinfo = xgene_enet_work_msg(skb);
+       raw_desc->m3 = cpu_to_le64(SET_VAL(HENQNUM, tx_ring->dst_ring_num) |
+                                  hopinfo);
+       tx_ring->cp_ring->cp_skb[tail] = skb;
+
+       return 0;
+}
+
+static netdev_tx_t xgene_enet_start_xmit(struct sk_buff *skb,
+                                        struct net_device *ndev)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+       struct xgene_enet_desc_ring *tx_ring = pdata->tx_ring;
+       struct xgene_enet_desc_ring *cp_ring = tx_ring->cp_ring;
+       u32 tx_level, cq_level;
+
+       tx_level = xgene_enet_ring_len(tx_ring);
+       cq_level = xgene_enet_ring_len(cp_ring);
+       if (unlikely(tx_level > pdata->tx_qcnt_hi ||
+                    cq_level > pdata->cp_qcnt_hi)) {
+               netif_stop_queue(ndev);
+               return NETDEV_TX_BUSY;
+       }
+
+       if (xgene_enet_setup_tx_desc(tx_ring, skb)) {
+               dev_kfree_skb_any(skb);
+               return NETDEV_TX_OK;
+       }
+
+       iowrite32(1, tx_ring->cmd);
+       skb_tx_timestamp(skb);
+       tx_ring->tail = (tx_ring->tail + 1) & (tx_ring->slots - 1);
+
+       pdata->stats.tx_packets++;
+       pdata->stats.tx_bytes += skb->len;
+
+       return NETDEV_TX_OK;
+}
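
Transmit backpressure is purely software here: the queue is stopped once either the TX ring or its completion ring crosses its high watermark, and woken again from xgene_enet_process_ring() once the completion ring drains. The single-descriptor doorbell reuses the same INC_DEC_CMD_ADDR credit register as the buffer pool. The pairing, as a sketch:

        /* stop: tx_level > tx_qcnt_hi || cq_level > cp_qcnt_hi   (here)
         * wake: xgene_enet_ring_len(ring) < cp_qcnt_low
         *       (in xgene_enet_process_ring())
         */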
+
+static void xgene_enet_skip_csum(struct sk_buff *skb)
+{
+       struct iphdr *iph = ip_hdr(skb);
+
+       if (!ip_is_fragment(iph) ||
+           (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP)) {
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
+       }
+}
+
+static int xgene_enet_rx_frame(struct xgene_enet_desc_ring *rx_ring,
+                              struct xgene_enet_raw_desc *raw_desc)
+{
+       struct net_device *ndev;
+       struct xgene_enet_pdata *pdata;
+       struct device *dev;
+       struct xgene_enet_desc_ring *buf_pool;
+       u32 datalen, skb_index;
+       struct sk_buff *skb;
+       u8 status;
+       int ret = 0;
+
+       ndev = rx_ring->ndev;
+       pdata = netdev_priv(ndev);
+       dev = ndev_to_dev(rx_ring->ndev);
+       buf_pool = rx_ring->buf_pool;
+
+       dma_unmap_single(dev, GET_VAL(DATAADDR, le64_to_cpu(raw_desc->m1)),
+                        XGENE_ENET_MAX_MTU, DMA_FROM_DEVICE);
+       skb_index = GET_VAL(USERINFO, le64_to_cpu(raw_desc->m0));
+       skb = buf_pool->rx_skb[skb_index];
+
+       /* check the LERR field for a receive error */
+       status = GET_VAL(LERR, le64_to_cpu(raw_desc->m0));
+       if (unlikely(status > 2)) {
+               dev_kfree_skb_any(skb);
+               xgene_enet_parse_error(rx_ring, netdev_priv(rx_ring->ndev),
+                                      status);
+               pdata->stats.rx_dropped++;
+               ret = -EIO;
+               goto out;
+       }
+
+       /* strip off CRC as HW isn't doing this */
+       datalen = GET_VAL(BUFDATALEN, le64_to_cpu(raw_desc->m1));
+       datalen -= 4;
+       prefetch(skb->data - NET_IP_ALIGN);
+       skb_put(skb, datalen);
+
+       skb_checksum_none_assert(skb);
+       skb->protocol = eth_type_trans(skb, ndev);
+       if (likely((ndev->features & NETIF_F_IP_CSUM) &&
+                  skb->protocol == htons(ETH_P_IP))) {
+               xgene_enet_skip_csum(skb);
+       }
+
+       pdata->stats.rx_packets++;
+       pdata->stats.rx_bytes += datalen;
+       napi_gro_receive(&rx_ring->napi, skb);
+out:
+       if (--rx_ring->nbufpool == 0) {
+               ret = xgene_enet_refill_bufpool(buf_pool, NUM_BUFPOOL);
+               rx_ring->nbufpool = NUM_BUFPOOL;
+       }
+
+       return ret;
+}
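
Two details worth noting: the reported BUFDATALEN still includes the 4-byte FCS, hence the explicit trim, and the buffer pool is refilled in batches of NUM_BUFPOOL completions (the constant lives in xgene_enet_main.h, outside this hunk) rather than per packet. For instance:

        /* a minimum 64-byte frame arrives with BUFDATALEN = 64;
         * after the CRC trim the skb carries 60 bytes */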
+
+static bool is_rx_desc(struct xgene_enet_raw_desc *raw_desc)
+{
+       return GET_VAL(FPQNUM, le64_to_cpu(raw_desc->m0)) ? true : false;
+}
+
+static int xgene_enet_process_ring(struct xgene_enet_desc_ring *ring,
+                                  int budget)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ring->ndev);
+       struct xgene_enet_raw_desc *raw_desc;
+       u16 head = ring->head;
+       u16 slots = ring->slots - 1;
+       int ret, count = 0;
+
+       do {
+               raw_desc = &ring->raw_desc[head];
+               if (unlikely(xgene_enet_is_desc_slot_empty(raw_desc)))
+                       break;
+
+               if (is_rx_desc(raw_desc))
+                       ret = xgene_enet_rx_frame(ring, raw_desc);
+               else
+                       ret = xgene_enet_tx_completion(ring, raw_desc);
+               xgene_enet_mark_desc_slot_empty(raw_desc);
+
+               head = (head + 1) & slots;
+               count++;
+
+               if (ret)
+                       break;
+       } while (--budget);
+
+       if (likely(count)) {
+               iowrite32(-count, ring->cmd);
+               ring->head = head;
+
+               if (netif_queue_stopped(ring->ndev)) {
+                       if (xgene_enet_ring_len(ring) < pdata->cp_qcnt_low)
+                               netif_wake_queue(ring->ndev);
+               }
+       }
+
+       return count;
+}
+
+static int xgene_enet_napi(struct napi_struct *napi, const int budget)
+{
+       struct xgene_enet_desc_ring *ring;
+       int processed;
+
+       ring = container_of(napi, struct xgene_enet_desc_ring, napi);
+       processed = xgene_enet_process_ring(ring, budget);
+
+       if (processed != budget) {
+               napi_complete(napi);
+               enable_irq(ring->irq);
+       }
+
+       return processed;
+}
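
This follows the standard NAPI contract: a poll that returns fewer descriptors than its budget has drained the ring, so the instance is completed and the (previously masked) interrupt line is re-enabled; returning the full budget keeps the poll scheduled with the line still masked. In short:

        /* processed <  budget -> napi_complete() + enable_irq()
         * processed == budget -> stay scheduled, poll again
         */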
+
+static void xgene_enet_timeout(struct net_device *ndev)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+
+       xgene_gmac_reset(pdata);
+}
+
+static int xgene_enet_register_irq(struct net_device *ndev)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+       struct device *dev = ndev_to_dev(ndev);
+       int ret;
+
+       ret = devm_request_irq(dev, pdata->rx_ring->irq, xgene_enet_rx_irq,
+                              IRQF_SHARED, ndev->name, pdata->rx_ring);
+       if (ret) {
+               netdev_err(ndev, "rx%d interrupt request failed\n",
+                          pdata->rx_ring->irq);
+       }
+
+       return ret;
+}
+
+static void xgene_enet_free_irq(struct net_device *ndev)
+{
+       struct xgene_enet_pdata *pdata;
+       struct device *dev;
+
+       pdata = netdev_priv(ndev);
+       dev = ndev_to_dev(ndev);
+       devm_free_irq(dev, pdata->rx_ring->irq, pdata->rx_ring);
+}
+
+static int xgene_enet_open(struct net_device *ndev)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+       int ret;
+
+       xgene_gmac_tx_enable(pdata);
+       xgene_gmac_rx_enable(pdata);
+
+       ret = xgene_enet_register_irq(ndev);
+       if (ret)
+               return ret;
+       napi_enable(&pdata->rx_ring->napi);
+
+       if (pdata->phy_dev)
+               phy_start(pdata->phy_dev);
+
+       netif_start_queue(ndev);
+
+       return ret;
+}
+
+static int xgene_enet_close(struct net_device *ndev)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+
+       netif_stop_queue(ndev);
+
+       if (pdata->phy_dev)
+               phy_stop(pdata->phy_dev);
+
+       napi_disable(&pdata->rx_ring->napi);
+       xgene_enet_free_irq(ndev);
+       xgene_enet_process_ring(pdata->rx_ring, -1);
+
+       xgene_gmac_tx_disable(pdata);
+       xgene_gmac_rx_disable(pdata);
+
+       return 0;
+}
+
+static void xgene_enet_delete_ring(struct xgene_enet_desc_ring *ring)
+{
+       struct xgene_enet_pdata *pdata;
+       struct device *dev;
+
+       pdata = netdev_priv(ring->ndev);
+       dev = ndev_to_dev(ring->ndev);
+
+       xgene_enet_clear_ring(ring);
+       dma_free_coherent(dev, ring->size, ring->desc_addr, ring->dma);
+}
+
+static void xgene_enet_delete_desc_rings(struct xgene_enet_pdata *pdata)
+{
+       struct xgene_enet_desc_ring *buf_pool;
+
+       if (pdata->tx_ring) {
+               xgene_enet_delete_ring(pdata->tx_ring);
+               pdata->tx_ring = NULL;
+       }
+
+       if (pdata->rx_ring) {
+               buf_pool = pdata->rx_ring->buf_pool;
+               xgene_enet_delete_bufpool(buf_pool);
+               xgene_enet_delete_ring(buf_pool);
+               xgene_enet_delete_ring(pdata->rx_ring);
+               pdata->rx_ring = NULL;
+       }
+}
+
+static int xgene_enet_get_ring_size(struct device *dev,
+                                   enum xgene_enet_ring_cfgsize cfgsize)
+{
+       int size = -EINVAL;
+
+       switch (cfgsize) {
+       case RING_CFGSIZE_512B:
+               size = 0x200;
+               break;
+       case RING_CFGSIZE_2KB:
+               size = 0x800;
+               break;
+       case RING_CFGSIZE_16KB:
+               size = 0x4000;
+               break;
+       case RING_CFGSIZE_64KB:
+               size = 0x10000;
+               break;
+       case RING_CFGSIZE_512KB:
+               size = 0x80000;
+               break;
+       default:
+               dev_err(dev, "Unsupported cfg ring size %d\n", cfgsize);
+               break;
+       }
+
+       return size;
+}
+
+static void xgene_enet_free_desc_ring(struct xgene_enet_desc_ring *ring)
+{
+       struct device *dev;
+
+       if (!ring)
+               return;
+
+       dev = ndev_to_dev(ring->ndev);
+
+       if (ring->desc_addr) {
+               xgene_enet_clear_ring(ring);
+               dma_free_coherent(dev, ring->size, ring->desc_addr, ring->dma);
+       }
+       devm_kfree(dev, ring);
+}
+
+static void xgene_enet_free_desc_rings(struct xgene_enet_pdata *pdata)
+{
+       struct device *dev = &pdata->pdev->dev;
+       struct xgene_enet_desc_ring *ring;
+
+       ring = pdata->tx_ring;
+       if (ring && ring->cp_ring && ring->cp_ring->cp_skb)
+               devm_kfree(dev, ring->cp_ring->cp_skb);
+       xgene_enet_free_desc_ring(ring);
+
+       /* rx_ring may be NULL when called from the create_desc_rings
+        * error path, so guard the buf_pool dereference
+        */
+       ring = pdata->rx_ring;
+       if (ring) {
+               if (ring->buf_pool && ring->buf_pool->rx_skb)
+                       devm_kfree(dev, ring->buf_pool->rx_skb);
+               xgene_enet_free_desc_ring(ring->buf_pool);
+               xgene_enet_free_desc_ring(ring);
+       }
+}
+
+static struct xgene_enet_desc_ring *xgene_enet_create_desc_ring(
+                       struct net_device *ndev, u32 ring_num,
+                       enum xgene_enet_ring_cfgsize cfgsize, u32 ring_id)
+{
+       struct xgene_enet_desc_ring *ring;
+       struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+       struct device *dev = ndev_to_dev(ndev);
+       int size;
+
+       ring = devm_kzalloc(dev, sizeof(struct xgene_enet_desc_ring),
+                           GFP_KERNEL);
+       if (!ring)
+               return NULL;
+
+       ring->ndev = ndev;
+       ring->num = ring_num;
+       ring->cfgsize = cfgsize;
+       ring->id = ring_id;
+
+       size = xgene_enet_get_ring_size(dev, cfgsize);
+       if (size < 0) {
+               devm_kfree(dev, ring);
+               return NULL;
+       }
+
+       ring->desc_addr = dma_zalloc_coherent(dev, size, &ring->dma,
+                                             GFP_KERNEL);
+       if (!ring->desc_addr) {
+               devm_kfree(dev, ring);
+               return NULL;
+       }
+       ring->size = size;
+
+       ring->cmd_base = pdata->ring_cmd_addr + (ring->num << 6);
+       ring->cmd = ring->cmd_base + INC_DEC_CMD_ADDR;
+       pdata->rm = RM3;
+       ring = xgene_enet_setup_ring(ring);
+       netdev_dbg(ndev, "ring info: num=%d  size=%d  id=%d  slots=%d\n",
+                  ring->num, ring->size, ring->id, ring->slots);
+
+       return ring;
+}
+
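+/* A ring ID packs the owner into the bits above bit 5 and the buffer
+ * number into the low six bits (GENMASK(5, 0)).
+ */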
+static u16 xgene_enet_get_ring_id(enum xgene_ring_owner owner, u8 bufnum)
+{
+       return (owner << 6) | (bufnum & GENMASK(5, 0));
+}
+
+static int xgene_enet_create_desc_rings(struct net_device *ndev)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+       struct device *dev = ndev_to_dev(ndev);
+       struct xgene_enet_desc_ring *rx_ring, *tx_ring, *cp_ring;
+       struct xgene_enet_desc_ring *buf_pool = NULL;
+       u8 cpu_bufnum = 0, eth_bufnum = 0;
+       u8 bp_bufnum = 0x20;
+       u16 ring_id, ring_num = 0;
+       int ret;
+
+       /* allocate rx descriptor ring */
+       ring_id = xgene_enet_get_ring_id(RING_OWNER_CPU, cpu_bufnum++);
+       rx_ring = xgene_enet_create_desc_ring(ndev, ring_num++,
+                                             RING_CFGSIZE_16KB, ring_id);
+       if (!rx_ring) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       /* allocate buffer pool for receiving packets */
+       ring_id = xgene_enet_get_ring_id(RING_OWNER_ETH0, bp_bufnum++);
+       buf_pool = xgene_enet_create_desc_ring(ndev, ring_num++,
+                                              RING_CFGSIZE_2KB, ring_id);
+       if (!buf_pool) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       rx_ring->nbufpool = NUM_BUFPOOL;
+       rx_ring->buf_pool = buf_pool;
+       rx_ring->irq = pdata->rx_irq;
+       buf_pool->rx_skb = devm_kcalloc(dev, buf_pool->slots,
+                                       sizeof(struct sk_buff *), GFP_KERNEL);
+       if (!buf_pool->rx_skb) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       buf_pool->dst_ring_num = xgene_enet_dst_ring_num(buf_pool);
+       pdata->rx_ring = rx_ring;
+
+       /* allocate tx descriptor ring */
+       ring_id = xgene_enet_get_ring_id(RING_OWNER_ETH0, eth_bufnum++);
+       tx_ring = xgene_enet_create_desc_ring(ndev, ring_num++,
+                                             RING_CFGSIZE_16KB, ring_id);
+       if (!tx_ring) {
+               ret = -ENOMEM;
+               goto err;
+       }
+       pdata->tx_ring = tx_ring;
+
+       cp_ring = pdata->rx_ring;
+       cp_ring->cp_skb = devm_kcalloc(dev, tx_ring->slots,
+                                      sizeof(struct sk_buff *), GFP_KERNEL);
+       if (!cp_ring->cp_skb) {
+               ret = -ENOMEM;
+               goto err;
+       }
+       pdata->tx_ring->cp_ring = cp_ring;
+       pdata->tx_ring->dst_ring_num = xgene_enet_dst_ring_num(cp_ring);
+
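+       /* queue-depth watermarks, in ring slots: the xmit and completion
+        * paths use these to stop the queue at half-full and wake it
+        * again at a quarter
+        */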
+       pdata->tx_qcnt_hi = pdata->tx_ring->slots / 2;
+       pdata->cp_qcnt_hi = pdata->rx_ring->slots / 2;
+       pdata->cp_qcnt_low = pdata->cp_qcnt_hi / 2;
+
+       return 0;
+
+err:
+       xgene_enet_free_desc_rings(pdata);
+       return ret;
+}
+
+static struct rtnl_link_stats64 *xgene_enet_get_stats64(
+                       struct net_device *ndev,
+                       struct rtnl_link_stats64 *storage)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+       struct rtnl_link_stats64 *stats = &pdata->stats;
+
+       memcpy(storage, stats, sizeof(struct rtnl_link_stats64));
+       /* fold the individual error counters into rx_errors on the copy,
+        * so repeated queries do not accumulate into the stored stats
+        */
+       storage->rx_errors += stats->rx_length_errors +
+                             stats->rx_crc_errors +
+                             stats->rx_frame_errors +
+                             stats->rx_fifo_errors;
+
+       return storage;
+}
+
+static int xgene_enet_set_mac_address(struct net_device *ndev, void *addr)
+{
+       struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+       int ret;
+
+       ret = eth_mac_addr(ndev, addr);
+       if (ret)
+               return ret;
+       xgene_gmac_set_mac_addr(pdata);
+
+       return ret;
+}
+
+static const struct net_device_ops xgene_ndev_ops = {
+       .ndo_open = xgene_enet_open,
+       .ndo_stop = xgene_enet_close,
+       .ndo_start_xmit = xgene_enet_start_xmit,
+       .ndo_tx_timeout = xgene_enet_timeout,
+       .ndo_get_stats64 = xgene_enet_get_stats64,
+       .ndo_change_mtu = eth_change_mtu,
+       .ndo_set_mac_address = xgene_enet_set_mac_address,
+};
+
+static int xgene_enet_get_resources(struct xgene_enet_pdata *pdata)
+{
+       struct platform_device *pdev;
+       struct net_device *ndev;
+       struct device *dev;
+       struct resource *res;
+       void __iomem *base_addr;
+       const char *mac;
+       int ret;
+
+       pdev = pdata->pdev;
+       dev = &pdev->dev;
+       ndev = pdata->ndev;
+
+       res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "enet_csr");
+       if (!res) {
+               dev_err(dev, "Resource enet_csr not defined\n");
+               return -ENODEV;
+       }
+       pdata->base_addr = devm_ioremap_resource(dev, res);
+       if (IS_ERR(pdata->base_addr)) {
+               dev_err(dev, "Unable to retrieve ENET Port CSR region\n");
+               return PTR_ERR(pdata->base_addr);
+       }
+
+       res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ring_csr");
+       if (!res) {
+               dev_err(dev, "Resource ring_csr not defined\n");
+               return -ENODEV;
+       }
+       pdata->ring_csr_addr = devm_ioremap_resource(dev, res);
+       if (IS_ERR(pdata->ring_csr_addr)) {
+               dev_err(dev, "Unable to retrieve ENET Ring CSR region\n");
+               return PTR_ERR(pdata->ring_csr_addr);
+       }
+
+       res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ring_cmd");
+       if (!res) {
+               dev_err(dev, "Resource ring_cmd not defined\n");
+               return -ENODEV;
+       }
+       pdata->ring_cmd_addr = devm_ioremap_resource(dev, res);
+       if (IS_ERR(pdata->ring_cmd_addr)) {
+               dev_err(dev, "Unable to retrieve ENET Ring command region\n");
+               return PTR_ERR(pdata->ring_cmd_addr);
+       }
+
+       ret = platform_get_irq(pdev, 0);
+       if (ret <= 0) {
+               dev_err(dev, "Unable to get ENET Rx IRQ\n");
+               ret = ret ? : -ENXIO;
+               return ret;
+       }
+       pdata->rx_irq = ret;
+
+       mac = of_get_mac_address(dev->of_node);
+       if (mac)
+               memcpy(ndev->dev_addr, mac, ndev->addr_len);
+       else
+               eth_hw_addr_random(ndev);
+       memcpy(ndev->perm_addr, ndev->dev_addr, ndev->addr_len);
+
+       pdata->phy_mode = of_get_phy_mode(pdev->dev.of_node);
+       if (pdata->phy_mode < 0) {
+               dev_err(dev, "Incorrect phy-connection-type in DTS\n");
+               return -EINVAL;
+       }
+
+       pdata->clk = devm_clk_get(&pdev->dev, NULL);
+       if (IS_ERR(pdata->clk)) {
+               dev_err(&pdev->dev, "can't get clock\n");
+               return PTR_ERR(pdata->clk);
+       }
+
+       base_addr = pdata->base_addr;
+       pdata->eth_csr_addr = base_addr + BLOCK_ETH_CSR_OFFSET;
+       pdata->eth_ring_if_addr = base_addr + BLOCK_ETH_RING_IF_OFFSET;
+       pdata->eth_diag_csr_addr = base_addr + BLOCK_ETH_DIAG_CSR_OFFSET;
+       pdata->mcx_mac_addr = base_addr + BLOCK_ETH_MAC_OFFSET;
+       pdata->mcx_stats_addr = base_addr + BLOCK_ETH_STATS_OFFSET;
+       pdata->mcx_mac_csr_addr = base_addr + BLOCK_ETH_MAC_CSR_OFFSET;
+       pdata->rx_buff_cnt = NUM_PKT_BUF;
+
+       return 0;
+}
+
+static int xgene_enet_init_hw(struct xgene_enet_pdata *pdata)
+{
+       struct net_device *ndev = pdata->ndev;
+       struct xgene_enet_desc_ring *buf_pool;
+       u16 dst_ring_num;
+       int ret;
+
+       xgene_gmac_tx_disable(pdata);
+       xgene_gmac_rx_disable(pdata);
+
+       ret = xgene_enet_create_desc_rings(ndev);
+       if (ret) {
+               netdev_err(ndev, "Error in ring configuration\n");
+               return ret;
+       }
+
+       /* setup buffer pool */
+       buf_pool = pdata->rx_ring->buf_pool;
+       xgene_enet_init_bufpool(buf_pool);
+       ret = xgene_enet_refill_bufpool(buf_pool, pdata->rx_buff_cnt);
+       if (ret) {
+               xgene_enet_delete_desc_rings(pdata);
+               return ret;
+       }
+
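+       /* bypass the classifier engine (CLE) so that all ingress frames
+        * are steered to the single RX ring and its buffer pool
+        */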
+       dst_ring_num = xgene_enet_dst_ring_num(pdata->rx_ring);
+       xgene_enet_cle_bypass(pdata, dst_ring_num, buf_pool->id);
+
+       return ret;
+}
+
+static int xgene_enet_probe(struct platform_device *pdev)
+{
+       struct net_device *ndev;
+       struct xgene_enet_pdata *pdata;
+       struct device *dev = &pdev->dev;
+       struct napi_struct *napi;
+       int ret;
+
+       ndev = alloc_etherdev(sizeof(struct xgene_enet_pdata));
+       if (!ndev)
+               return -ENOMEM;
+
+       pdata = netdev_priv(ndev);
+
+       pdata->pdev = pdev;
+       pdata->ndev = ndev;
+       SET_NETDEV_DEV(ndev, dev);
+       platform_set_drvdata(pdev, pdata);
+       ndev->netdev_ops = &xgene_ndev_ops;
+       xgene_enet_set_ethtool_ops(ndev);
+       ndev->features |= NETIF_F_IP_CSUM |
+                         NETIF_F_GSO |
+                         NETIF_F_GRO;
+
+       ret = xgene_enet_get_resources(pdata);
+       if (ret)
+               goto err;
+
+       xgene_enet_reset(pdata);
+       xgene_gmac_init(pdata, SPEED_1000);
+
+       ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
+       if (ret) {
+               netdev_err(ndev, "No usable DMA configuration\n");
+               goto err;
+       }
+
+       ret = xgene_enet_init_hw(pdata);
+       if (ret)
+               goto err;
+
+       napi = &pdata->rx_ring->napi;
+       netif_napi_add(ndev, napi, xgene_enet_napi, NAPI_POLL_WEIGHT);
+
+       ret = xgene_enet_mdio_config(pdata);
+       if (ret)
+               goto err;
+
+       /* register last so no ndo callback can run against a half
+        * initialized device
+        */
+       ret = register_netdev(ndev);
+       if (ret) {
+               netdev_err(ndev, "Failed to register netdev\n");
+               goto err;
+       }
+
+       return 0;
+err:
+       free_netdev(ndev);
+       return ret;
+}
+
+static int xgene_enet_remove(struct platform_device *pdev)
+{
+       struct xgene_enet_pdata *pdata;
+       struct net_device *ndev;
+
+       pdata = platform_get_drvdata(pdev);
+       ndev = pdata->ndev;
+
+       /* unregister first so no netdev callback can race with teardown */
+       unregister_netdev(ndev);
+
+       xgene_gmac_rx_disable(pdata);
+       xgene_gmac_tx_disable(pdata);
+
+       netif_napi_del(&pdata->rx_ring->napi);
+       xgene_enet_mdio_remove(pdata);
+       xgene_enet_delete_desc_rings(pdata);
+       xgene_gport_shutdown(pdata);
+       free_netdev(ndev);
+
+       return 0;
+}
+
+static const struct of_device_id xgene_enet_match[] = {
+       {.compatible = "apm,xgene-enet",},
+       {},
+};
+
+MODULE_DEVICE_TABLE(of, xgene_enet_match);
+
+static struct platform_driver xgene_enet_driver = {
+       .driver = {
+                  .name = "xgene-enet",
+                  .of_match_table = xgene_enet_match,
+       },
+       .probe = xgene_enet_probe,
+       .remove = xgene_enet_remove,
+};
+
+module_platform_driver(xgene_enet_driver);
+
+MODULE_DESCRIPTION("APM X-Gene SoC Ethernet driver");
+MODULE_VERSION(XGENE_DRV_VERSION);
+MODULE_AUTHOR("Keyur Chudgar <kchudgar@apm.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h
new file mode 100644 (file)
index 0000000..0815866
--- /dev/null
@@ -0,0 +1,143 @@
+/* Applied Micro X-Gene SoC Ethernet Driver
+ *
+ * Copyright (c) 2014, Applied Micro Circuits Corporation
+ * Authors: Iyappan Subramanian <isubramanian@apm.com>
+ *         Ravi Patel <rapatel@apm.com>
+ *         Keyur Chudgar <kchudgar@apm.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __XGENE_ENET_MAIN_H__
+#define __XGENE_ENET_MAIN_H__
+
+#include <linux/clk.h>
+#include <linux/of_platform.h>
+#include <linux/of_net.h>
+#include <linux/of_mdio.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <linux/prefetch.h>
+#include <linux/if_vlan.h>
+#include <linux/phy.h>
+#include "xgene_enet_hw.h"
+
+#define XGENE_DRV_VERSION      "v1.0"
+#define XGENE_ENET_MAX_MTU     1536
+#define SKB_BUFFER_SIZE                (XGENE_ENET_MAX_MTU - NET_IP_ALIGN)
+#define NUM_PKT_BUF    64
+#define NUM_BUFPOOL    32
+
+/* software context of a descriptor ring */
+struct xgene_enet_desc_ring {
+       struct net_device *ndev;
+       u16 id;
+       u16 num;
+       u16 head;
+       u16 tail;
+       u16 slots;
+       u16 irq;
+       u32 size;
+       u32 state[NUM_RING_CONFIG];
+       void __iomem *cmd_base;
+       void __iomem *cmd;
+       dma_addr_t dma;
+       u16 dst_ring_num;
+       u8 nbufpool;
+       struct sk_buff **rx_skb;
+       struct sk_buff **cp_skb;
+       enum xgene_enet_ring_cfgsize cfgsize;
+       struct xgene_enet_desc_ring *cp_ring;
+       struct xgene_enet_desc_ring *buf_pool;
+       struct napi_struct napi;
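+       /* ring memory viewed either as full-size raw descriptors or as
+        * the smaller raw_desc16 layout used by buffer pool rings
+        */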
+       union {
+               void *desc_addr;
+               struct xgene_enet_raw_desc *raw_desc;
+               struct xgene_enet_raw_desc16 *raw_desc16;
+       };
+};
+
+/* ethernet private data */
+struct xgene_enet_pdata {
+       struct net_device *ndev;
+       struct mii_bus *mdio_bus;
+       struct phy_device *phy_dev;
+       int phy_speed;
+       struct clk *clk;
+       struct platform_device *pdev;
+       struct xgene_enet_desc_ring *tx_ring;
+       struct xgene_enet_desc_ring *rx_ring;
+       char *dev_name;
+       u32 rx_buff_cnt;
+       u32 tx_qcnt_hi;
+       u32 cp_qcnt_hi;
+       u32 cp_qcnt_low;
+       u32 rx_irq;
+       void __iomem *eth_csr_addr;
+       void __iomem *eth_ring_if_addr;
+       void __iomem *eth_diag_csr_addr;
+       void __iomem *mcx_mac_addr;
+       void __iomem *mcx_stats_addr;
+       void __iomem *mcx_mac_csr_addr;
+       void __iomem *base_addr;
+       void __iomem *ring_csr_addr;
+       void __iomem *ring_cmd_addr;
+       u32 phy_addr;
+       int phy_mode;
+       u32 speed;
+       u16 rm;
+       struct rtnl_link_stats64 stats;
+};
+
+/* Set the specified value into a bit-field defined by its starting position
+ * and length within a single u64.
+ */
+static inline u64 xgene_enet_set_field_value(int pos, int len, u64 val)
+{
+       return (val & ((1ULL << len) - 1)) << pos;
+}
+
+#define SET_VAL(field, val) \
+               xgene_enet_set_field_value(field ## _POS, field ## _LEN, val)
+
+#define SET_BIT(field) \
+               xgene_enet_set_field_value(field ## _POS, 1, 1)
+
+/* Get the value from a bit-field defined by its starting position
+ * and length within the specified u64.
+ */
+static inline u64 xgene_enet_get_field_value(int pos, int len, u64 src)
+{
+       return (src >> pos) & ((1ULL << len) - 1);
+}
+
+#define GET_VAL(field, src) \
+               xgene_enet_get_field_value(field ## _POS, field ## _LEN, src)
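+
+/* Usage sketch: for a hypothetical field FOO described by FOO_POS and
+ * FOO_LEN, SET_VAL(FOO, v) encodes v into the field and GET_VAL(FOO, w)
+ * extracts it back out of the word w.
+ */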
+
+static inline struct device *ndev_to_dev(struct net_device *ndev)
+{
+       return ndev->dev.parent;
+}
+
+void xgene_enet_set_ethtool_ops(struct net_device *netdev);
+
+#endif /* __XGENE_ENET_MAIN_H__ */
index ce455aed5a2f557067e479632553ca461450a7dd..3f9d4de8173cdd0f86589925cf48a78d0ab7648d 100644 (file)
@@ -739,7 +739,6 @@ static void bcmgenet_power_down(struct bcmgenet_priv *priv,
 
        case GENET_POWER_PASSIVE:
                /* Power down LED */
-               bcmgenet_mii_reset(priv->dev);
                if (priv->hw_params->flags & GENET_HAS_EXT) {
                        reg = bcmgenet_ext_readl(priv, EXT_EXT_PWR_MGMT);
                        reg |= (EXT_PWR_DOWN_PHY |
@@ -779,7 +778,9 @@ static void bcmgenet_power_up(struct bcmgenet_priv *priv,
        }
 
        bcmgenet_ext_writel(priv, reg, EXT_EXT_PWR_MGMT);
-       bcmgenet_mii_reset(priv->dev);
+
+       if (mode == GENET_POWER_PASSIVE)
+               bcmgenet_mii_reset(priv->dev);
 }
 
 /* ioctl handle special commands that are not present in ethtool. */
@@ -1961,7 +1962,8 @@ static void bcmgenet_set_hw_addr(struct bcmgenet_priv *priv,
 static int bcmgenet_wol_resume(struct bcmgenet_priv *priv)
 {
        /* From WOL-enabled suspend, switch to regular clock */
-       clk_disable_unprepare(priv->clk_wol);
+       if (priv->wolopts)
+               clk_disable_unprepare(priv->clk_wol);
 
        phy_init_hw(priv->phydev);
        /* Speed settings must be restored */
@@ -2164,6 +2166,10 @@ static void bcmgenet_netif_stop(struct net_device *dev)
         * disabled no new work will be scheduled.
         */
        cancel_work_sync(&priv->bcmgenet_irq_work);
+
+       priv->old_pause = -1;
+       priv->old_link = -1;
+       priv->old_duplex = -1;
 }
 
 static int bcmgenet_close(struct net_device *dev)
@@ -2533,6 +2539,13 @@ static int bcmgenet_probe(struct platform_device *pdev)
        priv->pdev = pdev;
        priv->version = (enum bcmgenet_version)of_id->data;
 
+       priv->clk = devm_clk_get(&priv->pdev->dev, "enet");
+       if (IS_ERR(priv->clk))
+               dev_warn(&priv->pdev->dev, "failed to get enet clock\n");
+
+       if (!IS_ERR(priv->clk))
+               clk_prepare_enable(priv->clk);
+
        bcmgenet_set_hw_params(priv);
 
        /* Mii wait queue */
@@ -2541,17 +2554,10 @@ static int bcmgenet_probe(struct platform_device *pdev)
        priv->rx_buf_len = RX_BUF_LENGTH;
        INIT_WORK(&priv->bcmgenet_irq_work, bcmgenet_irq_task);
 
-       priv->clk = devm_clk_get(&priv->pdev->dev, "enet");
-       if (IS_ERR(priv->clk))
-               dev_warn(&priv->pdev->dev, "failed to get enet clock\n");
-
        priv->clk_wol = devm_clk_get(&priv->pdev->dev, "enet-wol");
        if (IS_ERR(priv->clk_wol))
                dev_warn(&priv->pdev->dev, "failed to get enet-wol clock\n");
 
-       if (!IS_ERR(priv->clk))
-               clk_prepare_enable(priv->clk);
-
        err = reset_umac(priv);
        if (err)
                goto err_clk_disable;
@@ -2611,6 +2617,8 @@ static int bcmgenet_suspend(struct device *d)
 
        bcmgenet_netif_stop(dev);
 
+       phy_suspend(priv->phydev);
+
        netif_device_detach(dev);
 
        /* Disable MAC receive */
@@ -2661,9 +2669,7 @@ static int bcmgenet_resume(struct device *d)
        if (ret)
                goto out_clk_disable;
 
-       if (priv->wolopts)
-               ret = bcmgenet_wol_resume(priv);
-
+       ret = bcmgenet_wol_resume(priv);
        if (ret)
                goto out_clk_disable;
 
@@ -2678,6 +2684,9 @@ static int bcmgenet_resume(struct device *d)
                bcmgenet_ext_writel(priv, reg, EXT_EXT_PWR_MGMT);
        }
 
+       if (priv->wolopts)
+               bcmgenet_power_up(priv, GENET_POWER_WOL_MAGIC);
+
        /* Disable RX/TX DMA and flush TX queues */
        dma_ctrl = bcmgenet_dma_disable(priv);
 
@@ -2693,6 +2702,8 @@ static int bcmgenet_resume(struct device *d)
 
        netif_device_attach(dev);
 
+       phy_resume(priv->phydev);
+
        bcmgenet_netif_start(dev);
 
        return 0;
index 18961613d385aa5e8e0cd99cb54e0027f2f8ced1..c88f7ae9963698b97837dc094f96d8b163c512c7 100644 (file)
@@ -129,7 +129,10 @@ static void bcmgenet_mii_setup(struct net_device *dev)
                        cmd_bits |= CMD_RX_PAUSE_IGNORE | CMD_TX_PAUSE_IGNORE;
        }
 
-       if (status_changed) {
+       if (!status_changed)
+               return;
+
+       if (phydev->link) {
                reg = bcmgenet_umac_readl(priv, UMAC_CMD);
                reg &= ~((CMD_SPEED_MASK << CMD_SPEED_SHIFT) |
                               CMD_HD_EN |
@@ -137,8 +140,9 @@ static void bcmgenet_mii_setup(struct net_device *dev)
                reg |= cmd_bits;
                bcmgenet_umac_writel(priv, reg, UMAC_CMD);
 
-               phy_print_status(phydev);
        }
+
+       phy_print_status(phydev);
 }
 
 void bcmgenet_mii_reset(struct net_device *dev)
@@ -303,12 +307,12 @@ static int bcmgenet_mii_probe(struct net_device *dev)
        /* In the case of a fixed PHY, the DT node associated
         * to the PHY is the Ethernet MAC DT node.
         */
-       if (of_phy_is_fixed_link(dn)) {
+       if (!priv->phy_dn && of_phy_is_fixed_link(dn)) {
                ret = of_phy_register_fixed_link(dn);
                if (ret)
                        return ret;
 
-               priv->phy_dn = dn;
+               priv->phy_dn = of_node_get(dn);
        }
 
        phydev = of_phy_connect(dev, priv->phy_dn, bcmgenet_mii_setup, 0,
@@ -444,6 +448,7 @@ int bcmgenet_mii_init(struct net_device *dev)
        return 0;
 
 out:
+       of_node_put(priv->phy_dn);
        mdiobus_unregister(priv->mii_bus);
 out_free:
        kfree(priv->mii_bus->irq);
@@ -455,6 +460,7 @@ void bcmgenet_mii_exit(struct net_device *dev)
 {
        struct bcmgenet_priv *priv = netdev_priv(dev);
 
+       of_node_put(priv->phy_dn);
        mdiobus_unregister(priv->mii_bus);
        kfree(priv->mii_bus->irq);
        mdiobus_free(priv->mii_bus);
index c9b922cc3e67765838ab0376b9d6441fba932edb..d57282172ea5497610dbb27185b41869372b9f13 100644 (file)
 #include "cxgb4_uld.h"
 
 #define T4FW_VERSION_MAJOR 0x01
-#define T4FW_VERSION_MINOR 0x09
-#define T4FW_VERSION_MICRO 0x17
+#define T4FW_VERSION_MINOR 0x0B
+#define T4FW_VERSION_MICRO 0x1B
 #define T4FW_VERSION_BUILD 0x00
 
 #define T5FW_VERSION_MAJOR 0x01
-#define T5FW_VERSION_MINOR 0x09
-#define T5FW_VERSION_MICRO 0x17
+#define T5FW_VERSION_MINOR 0x0B
+#define T5FW_VERSION_MICRO 0x1B
 #define T5FW_VERSION_BUILD 0x00
 
 #define CH_WARN(adap, fmt, ...) dev_warn(adap->pdev_dev, fmt, ## __VA_ARGS__)
@@ -522,6 +522,9 @@ struct sge_txq {
 struct sge_eth_txq {                /* state for an SGE Ethernet Tx queue */
        struct sge_txq q;
        struct netdev_queue *txq;   /* associated netdev TX queue */
+#ifdef CONFIG_CHELSIO_T4_DCB
+       u8 dcb_prio;                /* DCB Priority bound to queue */
+#endif
        unsigned long tso;          /* # of TSO requests */
        unsigned long tx_cso;       /* # of Tx checksum offloads */
        unsigned long vlan_ins;     /* # of Tx VLAN insertions */
index 0d3a9df5be36a49a1e992e010b918c76dd37eb82..8edf0f5bd679bf24b366887ff973a5d454c3af73 100644 (file)
 
 #include "cxgb4.h"
 
+/* DCBx version control; indexed by enum fw_port_dcb_versions, where
+ * FW_PORT_DCB_VER_AUTO == 7 (hence the empty padding entries).
+ */
+char *dcb_ver_array[] = {
+       "Unknown",
+       "DCBx-CIN",
+       "DCBx-CEE 1.01",
+       "DCBx-IEEE",
+       "", "", "",
+       "Auto Negotiated"
+};
+
 /* Initialize a port's Data Center Bridging state.  Typically used after a
  * Link Down event.
  */
@@ -27,25 +38,45 @@ void cxgb4_dcb_state_init(struct net_device *dev)
 {
        struct port_info *pi = netdev2pinfo(dev);
        struct port_dcb_info *dcb = &pi->dcb;
+       int version_temp = dcb->dcb_version;
 
        memset(dcb, 0, sizeof(struct port_dcb_info));
        dcb->state = CXGB4_DCB_STATE_START;
+       if (version_temp)
+               dcb->dcb_version = version_temp;
+
+       netdev_dbg(dev, "%s: Initializing DCB state for port[%d]\n",
+                   __func__, pi->port_id);
+}
+
+void cxgb4_dcb_version_init(struct net_device *dev)
+{
+       struct port_info *pi = netdev2pinfo(dev);
+       struct port_dcb_info *dcb = &pi->dcb;
+
+       /* Any writes here are only done on kernels that explicitly need
+        * a specific version, say < 2.6.38, which only supports CEE
+        */
+       dcb->dcb_version = FW_PORT_DCB_VER_AUTO;
 }
 
 /* Finite State machine for Data Center Bridging.
  */
 void cxgb4_dcb_state_fsm(struct net_device *dev,
-                        enum cxgb4_dcb_state_input input)
+                        enum cxgb4_dcb_state_input transition_to)
 {
        struct port_info *pi = netdev2pinfo(dev);
        struct port_dcb_info *dcb = &pi->dcb;
        struct adapter *adap = pi->adapter;
+       enum cxgb4_dcb_state current_state = dcb->state;
 
-       switch (input) {
-       case CXGB4_DCB_INPUT_FW_DISABLED: {
-               /* Firmware tells us it's not doing DCB */
-               switch (dcb->state) {
-               case CXGB4_DCB_STATE_START: {
+       netdev_dbg(dev, "%s: State change from %d to %d for %s\n",
+                   __func__, dcb->state, transition_to, dev->name);
+
+       switch (current_state) {
+       case CXGB4_DCB_STATE_START: {
+               switch (transition_to) {
+               case CXGB4_DCB_INPUT_FW_DISABLED: {
                        /* we're going to use Host DCB */
                        dcb->state = CXGB4_DCB_STATE_HOST;
                        dcb->supported = CXGB4_DCBX_HOST_SUPPORT;
@@ -53,48 +84,62 @@ void cxgb4_dcb_state_fsm(struct net_device *dev,
                        break;
                }
 
-               case CXGB4_DCB_STATE_HOST: {
-                       /* we're alreaady in Host DCB mode */
+               case CXGB4_DCB_INPUT_FW_ENABLED: {
+                       /* we're going to use Firmware DCB */
+                       dcb->state = CXGB4_DCB_STATE_FW_INCOMPLETE;
+                       dcb->supported = CXGB4_DCBX_FW_SUPPORT;
+                       break;
+               }
+
+               case CXGB4_DCB_INPUT_FW_INCOMPLETE: {
+                       /* expected transition */
+                       break;
+               }
+
+               case CXGB4_DCB_INPUT_FW_ALLSYNCED: {
+                       dcb->state = CXGB4_DCB_STATE_FW_ALLSYNCED;
                        break;
                }
 
                default:
-                       goto bad_state_transition;
+                       goto bad_state_input;
                }
                break;
        }
 
-       case CXGB4_DCB_INPUT_FW_ENABLED: {
-               /* Firmware tells us that it is doing DCB */
-               switch (dcb->state) {
-               case CXGB4_DCB_STATE_START: {
-                       /* we're going to use Firmware DCB */
-                       dcb->state = CXGB4_DCB_STATE_FW_INCOMPLETE;
-                       dcb->supported = CXGB4_DCBX_FW_SUPPORT;
+       case CXGB4_DCB_STATE_FW_INCOMPLETE: {
+               switch (transition_to) {
+               case CXGB4_DCB_INPUT_FW_ENABLED: {
+                       /* we're already in firmware DCB mode */
                        break;
                }
 
-               case CXGB4_DCB_STATE_FW_INCOMPLETE:
-               case CXGB4_DCB_STATE_FW_ALLSYNCED: {
-                       /* we're alreaady in firmware DCB mode */
+               case CXGB4_DCB_INPUT_FW_INCOMPLETE: {
+                       /* we're already incomplete */
+                       break;
+               }
+
+               case CXGB4_DCB_INPUT_FW_ALLSYNCED: {
+                       dcb->state = CXGB4_DCB_STATE_FW_ALLSYNCED;
+                       dcb->enabled = 1;
+                       linkwatch_fire_event(dev);
                        break;
                }
 
                default:
-                       goto bad_state_transition;
+                       goto bad_state_input;
                }
                break;
        }
 
-       case CXGB4_DCB_INPUT_FW_INCOMPLETE: {
-               /* Firmware tells us that its DCB state is incomplete */
-               switch (dcb->state) {
-               case CXGB4_DCB_STATE_FW_INCOMPLETE: {
-                       /* we're already incomplete */
+       case CXGB4_DCB_STATE_FW_ALLSYNCED: {
+               switch (transition_to) {
+               case CXGB4_DCB_INPUT_FW_ENABLED: {
+                       /* we're already in firmware DCB mode */
                        break;
                }
 
-               case CXGB4_DCB_STATE_FW_ALLSYNCED: {
+               case CXGB4_DCB_INPUT_FW_INCOMPLETE: {
                        /* We were successfully running with firmware DCB but
                         * now it's telling us that it's in an "incomplete
                         * state.  We need to reset back to a ground state
@@ -107,46 +152,48 @@ void cxgb4_dcb_state_fsm(struct net_device *dev,
                        break;
                }
 
-               default:
-                       goto bad_state_transition;
-               }
-               break;
-       }
-
-       case CXGB4_DCB_INPUT_FW_ALLSYNCED: {
-               /* Firmware tells us that its DCB state is complete */
-               switch (dcb->state) {
-               case CXGB4_DCB_STATE_FW_INCOMPLETE: {
-                       dcb->state = CXGB4_DCB_STATE_FW_ALLSYNCED;
+               case CXGB4_DCB_INPUT_FW_ALLSYNCED: {
+                       /* we're already all sync'ed
+                        * this is only applicable for IEEE or
+                        * when another VI already completed negotiation
+                        */
                        dcb->enabled = 1;
                        linkwatch_fire_event(dev);
                        break;
                }
 
-               case CXGB4_DCB_STATE_FW_ALLSYNCED: {
-                       /* we're already all sync'ed */
+               default:
+                       goto bad_state_input;
+               }
+               break;
+       }
+
+       case CXGB4_DCB_STATE_HOST: {
+               switch (transition_to) {
+               case CXGB4_DCB_INPUT_FW_DISABLED: {
+                       /* we're already in Host DCB mode */
                        break;
                }
 
                default:
-                       goto bad_state_transition;
+                       goto bad_state_input;
                }
                break;
        }
 
        default:
-               goto  bad_state_input;
+               goto bad_state_transition;
        }
        return;
 
 bad_state_input:
        dev_err(adap->pdev_dev, "cxgb4_dcb_state_fsm: illegal input symbol %d\n",
-               input);
+               transition_to);
        return;
 
 bad_state_transition:
        dev_err(adap->pdev_dev, "cxgb4_dcb_state_fsm: bad state transition, state = %d, input = %d\n",
-               dcb->state, input);
+               current_state, transition_to);
 }
 
 /* Handle a DCB/DCBX update message from the firmware.
@@ -160,6 +207,7 @@ void cxgb4_dcb_handle_fw_update(struct adapter *adap,
        struct port_info *pi = netdev_priv(dev);
        struct port_dcb_info *dcb = &pi->dcb;
        int dcb_type = pcmd->u.dcb.pgid.type;
+       int dcb_running_version;
 
        /* Handle Firmware DCB Control messages separately since they drive
         * our state machine.
@@ -171,6 +219,25 @@ void cxgb4_dcb_handle_fw_update(struct adapter *adap,
                         ? CXGB4_DCB_STATE_FW_ALLSYNCED
                         : CXGB4_DCB_STATE_FW_INCOMPLETE);
 
+               if (dcb->dcb_version != FW_PORT_DCB_VER_UNKNOWN) {
+                       dcb_running_version = FW_PORT_CMD_DCB_VERSION_GET(
+                               be16_to_cpu(
+                               pcmd->u.dcb.control.dcb_version_to_app_state));
+                       if (dcb_running_version == FW_PORT_DCB_VER_CEE1D01 ||
+                           dcb_running_version == FW_PORT_DCB_VER_IEEE) {
+                               dcb->dcb_version = dcb_running_version;
+                               dev_warn(adap->pdev_dev, "Interface %s is running %s\n",
+                                        dev->name,
+                                        dcb_ver_array[dcb->dcb_version]);
+                       } else {
+                               dev_warn(adap->pdev_dev,
+                                        "Requested DCB version %s from firmware, but firmware returned %s instead\n",
+                                        dcb_ver_array[dcb->dcb_version],
+                                        dcb_ver_array[dcb_running_version]);
+                               dcb->dcb_version = FW_PORT_DCB_VER_UNKNOWN;
+                       }
+               }
+
                cxgb4_dcb_state_fsm(dev, input);
                return;
        }
@@ -199,7 +266,11 @@ void cxgb4_dcb_handle_fw_update(struct adapter *adap,
                dcb->pg_num_tcs_supported = fwdcb->pgrate.num_tcs_supported;
                memcpy(dcb->pgrate, &fwdcb->pgrate.pgrate,
                       sizeof(dcb->pgrate));
+               memcpy(dcb->tsa, &fwdcb->pgrate.tsa,
+                      sizeof(dcb->tsa));
                dcb->msgs |= CXGB4_DCB_FW_PGRATE;
+               if (dcb->msgs & CXGB4_DCB_FW_PGID)
+                       IEEE_FAUX_SYNC(dev, dcb);
                break;
 
        case FW_PORT_DCB_TYPE_PRIORATE:
@@ -212,6 +283,7 @@ void cxgb4_dcb_handle_fw_update(struct adapter *adap,
                dcb->pfcen = fwdcb->pfc.pfcen;
                dcb->pfc_num_tcs_supported = fwdcb->pfc.max_pfc_tcs;
                dcb->msgs |= CXGB4_DCB_FW_PFC;
+               IEEE_FAUX_SYNC(dev, dcb);
                break;
 
        case FW_PORT_DCB_TYPE_APP_ID: {
@@ -220,13 +292,25 @@ void cxgb4_dcb_handle_fw_update(struct adapter *adap,
                struct app_priority *ap = &dcb->app_priority[idx];
 
                struct dcb_app app = {
-                       .selector = fwap->sel_field,
                        .protocol = be16_to_cpu(fwap->protocolid),
-                       .priority = fwap->user_prio_map,
                };
                int err;
 
-               err = dcb_setapp(dev, &app);
+               /* Convert from firmware format to relevant format
+                * when using app selector
+                */
+               if (dcb->dcb_version == FW_PORT_DCB_VER_IEEE) {
+                       app.selector = (fwap->sel_field + 1);
+                       app.priority = ffs(fwap->user_prio_map) - 1;
+                       err = dcb_ieee_setapp(dev, &app);
+                       IEEE_FAUX_SYNC(dev, dcb);
+               } else {
+                       /* Default is CEE */
+                       app.selector = !!(fwap->sel_field);
+                       app.priority = fwap->user_prio_map;
+                       err = dcb_setapp(dev, &app);
+               }
+
                if (err)
                        dev_err(adap->pdev_dev,
                                "Failed DCB Set Application Priority: sel=%d, prot=%d, prio=%d, err=%d\n",
@@ -408,9 +492,10 @@ static void cxgb4_getpgbwgcfg(struct net_device *dev, int pgid, u8 *bw_per,
        if (err != FW_PORT_DCB_CFG_SUCCESS) {
                dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n",
                        -err);
-       } else {
-               *bw_per = pcmd.u.dcb.pgrate.pgrate[pgid];
+               return;
        }
+
+       *bw_per = pcmd.u.dcb.pgrate.pgrate[pgid];
 }
 
 static void cxgb4_getpgbwgcfg_tx(struct net_device *dev, int pgid, u8 *bw_per)
@@ -637,7 +722,8 @@ static int __cxgb4_getapp(struct net_device *dev, u8 app_idtype, u16 app_id,
                        return err;
                }
                if (be16_to_cpu(pcmd.u.dcb.app_priority.protocolid) == app_id)
-                       return pcmd.u.dcb.app_priority.user_prio_map;
+                       if (pcmd.u.dcb.app_priority.sel_field == app_idtype)
+                               return pcmd.u.dcb.app_priority.user_prio_map;
 
                /* exhausted app list */
                if (!pcmd.u.dcb.app_priority.protocolid)
@@ -657,8 +743,8 @@ static int cxgb4_getapp(struct net_device *dev, u8 app_idtype, u16 app_id)
 
 /* Write a new Application User Priority Map for the specified Application ID
  */
-static int cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id,
-                       u8 app_prio)
+static int __cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id,
+                         u8 app_prio)
 {
        struct fw_port_cmd pcmd;
        struct port_info *pi = netdev2pinfo(dev);
@@ -673,10 +759,6 @@ static int cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id,
        if (!netif_carrier_ok(dev))
                return -ENOLINK;
 
-       if (app_idtype != DCB_APP_IDTYPE_ETHTYPE &&
-           app_idtype != DCB_APP_IDTYPE_PORTNUM)
-               return -EINVAL;
-
        for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) {
                INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id);
                pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID;
@@ -725,6 +807,30 @@ static int cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id,
        return 0;
 }
 
+/* Priority for CEE inside dcb_app is bitmask, with 0 being an invalid value */
+static int cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id,
+                       u8 app_prio)
+{
+       int ret;
+       struct dcb_app app = {
+               .selector = app_idtype,
+               .protocol = app_id,
+               .priority = app_prio,
+       };
+
+       if (app_idtype != DCB_APP_IDTYPE_ETHTYPE &&
+           app_idtype != DCB_APP_IDTYPE_PORTNUM)
+               return -EINVAL;
+
+       /* Convert app_idtype to a format that firmware understands */
+       ret = __cxgb4_setapp(dev, app_idtype == DCB_APP_IDTYPE_ETHTYPE ?
+                             app_idtype : 3, app_id, app_prio);
+       if (ret)
+               return ret;
+
+       return dcb_setapp(dev, &app);
+}
+
 /* Return whether IEEE Data Center Bridging has been negotiated.
  */
 static inline int cxgb4_ieee_negotiation_complete(struct net_device *dev)
@@ -738,6 +844,7 @@ static inline int cxgb4_ieee_negotiation_complete(struct net_device *dev)
 
 /* Fill in the Application User Priority Map associated with the
  * specified Application.
+ * Priority for IEEE dcb_app is an integer, with 0 being a valid value
  */
 static int cxgb4_ieee_getapp(struct net_device *dev, struct dcb_app *app)
 {
@@ -748,28 +855,39 @@ static int cxgb4_ieee_getapp(struct net_device *dev, struct dcb_app *app)
        if (!(app->selector && app->protocol))
                return -EINVAL;
 
-       prio = dcb_getapp(dev, app);
-       if (prio == 0) {
-               /* If app doesn't exist in dcb_app table, try firmware
-                * directly.
-                */
-               prio = __cxgb4_getapp(dev, app->selector, app->protocol, 0);
-       }
+       /* Try querying firmware first, use firmware format */
+       prio = __cxgb4_getapp(dev, app->selector - 1, app->protocol, 0);
+
+       if (prio < 0)
+               prio = dcb_ieee_getapp_mask(dev, app);
 
-       app->priority = prio;
+       app->priority = ffs(prio) - 1;
        return 0;
 }
 
-/* Write a new Application User Priority Map for the specified App id. */
+/* Write a new Application User Priority Map for the specified Application ID.
+ * Priority for IEEE dcb_app is an integer, with 0 being a valid value
+ */
 static int cxgb4_ieee_setapp(struct net_device *dev, struct dcb_app *app)
 {
+       int ret;
+
        if (!cxgb4_ieee_negotiation_complete(dev))
                return -EINVAL;
-       if (!(app->selector && app->protocol && app->priority))
+       if (!(app->selector && app->protocol))
+               return -EINVAL;
+
+       if (!(app->selector > IEEE_8021QAZ_APP_SEL_ETHERTYPE  &&
+             app->selector < IEEE_8021QAZ_APP_SEL_ANY))
                return -EINVAL;
 
-       cxgb4_setapp(dev, app->selector, app->protocol, app->priority);
-       return dcb_setapp(dev, app);
+       /* change selector to a format that firmware understands */
+       ret = __cxgb4_setapp(dev, app->selector - 1, app->protocol,
+                            (1 << app->priority));
+       if (ret)
+               return ret;
+
+       return dcb_ieee_setapp(dev, app);
 }
 
 /* Return our DCBX parameters.
@@ -794,8 +912,9 @@ static u8 cxgb4_setdcbx(struct net_device *dev, u8 dcb_request)
            != dcb_request)
                return 1;
 
-       /* Can't set DCBX capabilities if DCBX isn't enabled. */
-       if (!pi->dcb.state)
+       /* Can't enable DCB if we haven't successfully negotiated it.
+        */
+       if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED)
                return 1;
 
        /* There's currently no mechanism to allow for the firmware DCBX
@@ -874,7 +993,8 @@ static int cxgb4_getpeerapp_tbl(struct net_device *dev, struct dcb_app *table)
                table[i].selector = pcmd.u.dcb.app_priority.sel_field;
                table[i].protocol =
                        be16_to_cpu(pcmd.u.dcb.app_priority.protocolid);
-               table[i].priority = pcmd.u.dcb.app_priority.user_prio_map;
+               table[i].priority =
+                       ffs(pcmd.u.dcb.app_priority.user_prio_map) - 1;
        }
        return err;
 }
index 1ec1d834e25778ef81b0414e1315a452d3084a5a..2a6aa88984f44f31d34cc55664705ac6b8dc4a90 100644 (file)
 #define INIT_PORT_DCB_WRITE_CMD(__pcmd, __port) \
        INIT_PORT_DCB_CMD(__pcmd, __port, EXEC, FW_PORT_ACTION_L2_DCB_CFG)
 
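+/* The firmware raises no explicit "all synced" event for IEEE DCBx, so
+ * the driver fakes one whenever an IEEE parameter update lands.
+ */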
+#define IEEE_FAUX_SYNC(__dev, __dcb) \
+       do { \
+               if ((__dcb)->dcb_version == FW_PORT_DCB_VER_IEEE) \
+                       cxgb4_dcb_state_fsm((__dev), \
+                                           CXGB4_DCB_STATE_FW_ALLSYNCED); \
+       } while (0)
+
 /* States we can be in for a port's Data Center Bridging.
  */
 enum cxgb4_dcb_state {
@@ -108,11 +115,13 @@ struct port_dcb_info {
         * Native Endian format).
         */
        u32     pgid;                   /* Priority Group[0..7] */
+       u8      dcb_version;            /* Running DCBx version */
        u8      pfcen;                  /* Priority Flow Control[0..7] */
        u8      pg_num_tcs_supported;   /* max PG Traffic Classes */
        u8      pfc_num_tcs_supported;  /* max PFC Traffic Classes */
        u8      pgrate[8];              /* Priority Group Rate[0..7] */
        u8      priorate[8];            /* Priority Rate[0..7] */
+       u8      tsa[8];                 /* TSA Algorithm[0..7] */
        struct app_priority { /* Application Information */
                u8      user_prio_map;  /* Priority Map bitfield */
                u8      sel_field;      /* Protocol ID interpretation */
@@ -121,6 +130,7 @@ struct port_dcb_info {
 };
 
 void cxgb4_dcb_state_init(struct net_device *);
+void cxgb4_dcb_version_init(struct net_device *);
 void cxgb4_dcb_state_fsm(struct net_device *, enum cxgb4_dcb_state_input);
 void cxgb4_dcb_handle_fw_update(struct adapter *, const struct fw_port_cmd *);
 void cxgb4_dcb_set_caps(struct adapter *, const struct fw_port_cmd *);
index 1a162d21d8ace587476789609be1d7326ebb3305..a62d3f468c524767a9d511057caf49fb03216805 100644 (file)
@@ -522,6 +522,8 @@ static void dcb_tx_queue_prio_enable(struct net_device *dev, int enable)
                        dev_err(adap->pdev_dev,
                                "Can't %s DCB Priority on port %d, TX Queue %d: err=%d\n",
                                enable ? "set" : "unset", pi->port_id, i, -err);
+               else
+                       txq->dcb_prio = value;
        }
 }
 #endif /* CONFIG_CHELSIO_T4_DCB */
index ff709e3b3e7e43371518222d93a5119dcd3b185d..0549170d7e2ed2b60f68322b4ef44887471bbc52 100644 (file)
@@ -1629,6 +1629,14 @@ enum fw_port_l2cfg_ctlbf {
        FW_PORT_L2_CTLBF_TXIPG  = 0x20
 };
 
+enum fw_port_dcb_versions {
+       FW_PORT_DCB_VER_UNKNOWN,
+       FW_PORT_DCB_VER_CEE1D0,
+       FW_PORT_DCB_VER_CEE1D01,
+       FW_PORT_DCB_VER_IEEE,
+       FW_PORT_DCB_VER_AUTO = 7
+};
+
 enum fw_port_dcb_cfg {
        FW_PORT_DCB_CFG_PG      = 0x01,
        FW_PORT_DCB_CFG_PFC     = 0x02,
@@ -1709,6 +1717,7 @@ struct fw_port_cmd {
                                __u8   r10_lo[5];
                                __u8   num_tcs_supported;
                                __u8   pgrate[8];
+                               __u8   tsa[8];
                        } pgrate;
                        struct fw_port_dcb_priorate {
                                __u8   type;
@@ -1735,7 +1744,7 @@ struct fw_port_cmd {
                        struct fw_port_dcb_control {
                                __u8   type;
                                __u8   all_syncd_pkd;
-                               __be16 pfc_state_to_app_state;
+                               __be16 dcb_version_to_app_state;
                                __be32 r11;
                                __be64 r12;
                        } control;
@@ -1778,6 +1787,7 @@ struct fw_port_cmd {
 #define FW_PORT_CMD_DCBXDIS (1U << 7)
 #define FW_PORT_CMD_APPLY (1U <<  7)
 #define FW_PORT_CMD_ALL_SYNCD (1U << 7)
+#define FW_PORT_CMD_DCB_VERSION_GET(x) (((x) >> 8) & 0xf)
 
 #define FW_PORT_CMD_PPPEN(x) ((x) << 31)
 #define FW_PORT_CMD_TPSRC(x) ((x) << 28)
index 23084fb2090e3689bef788c99923045f5acc46ea..9b33057a94779f239b2515269444dd8c4617a018 100644 (file)
@@ -93,7 +93,7 @@ enum dm9000_type {
 };
 
 /* Structure/enum declaration ------------------------------- */
-typedef struct board_info {
+struct board_info {
 
        void __iomem    *io_addr;       /* Register I/O base address */
        void __iomem    *io_data;       /* Data I/O address */
@@ -141,7 +141,7 @@ typedef struct board_info {
        u32             wake_state;
 
        int             ip_summed;
-} board_info_t;
+};
 
 /* debug code */
 
@@ -151,7 +151,7 @@ typedef struct board_info {
        }                                               \
 } while (0)
 
-static inline board_info_t *to_dm9000_board(struct net_device *dev)
+static inline struct board_info *to_dm9000_board(struct net_device *dev)
 {
        return netdev_priv(dev);
 }
@@ -162,7 +162,7 @@ static inline board_info_t *to_dm9000_board(struct net_device *dev)
  *   Read a byte from I/O port
  */
 static u8
-ior(board_info_t *db, int reg)
+ior(struct board_info *db, int reg)
 {
        writeb(reg, db->io_addr);
        return readb(db->io_data);
@@ -173,14 +173,14 @@ ior(board_info_t *db, int reg)
  */
 
 static void
-iow(board_info_t *db, int reg, int value)
+iow(struct board_info *db, int reg, int value)
 {
        writeb(reg, db->io_addr);
        writeb(value, db->io_data);
 }
 
 static void
-dm9000_reset(board_info_t *db)
+dm9000_reset(struct board_info *db)
 {
        dev_dbg(db->dev, "resetting device\n");
 
@@ -272,7 +272,7 @@ static void dm9000_dumpblk_32bit(void __iomem *reg, int count)
  * Sleep, either by using msleep() or if we are suspending, then
  * use mdelay() to sleep.
  */
-static void dm9000_msleep(board_info_t *db, unsigned int ms)
+static void dm9000_msleep(struct board_info *db, unsigned int ms)
 {
        if (db->in_suspend || db->in_timeout)
                mdelay(ms);
@@ -284,7 +284,7 @@ static void dm9000_msleep(board_info_t *db, unsigned int ms)
 static int
 dm9000_phy_read(struct net_device *dev, int phy_reg_unused, int reg)
 {
-       board_info_t *db = netdev_priv(dev);
+       struct board_info *db = netdev_priv(dev);
        unsigned long flags;
        unsigned int reg_save;
        int ret;
@@ -330,7 +330,7 @@ static void
 dm9000_phy_write(struct net_device *dev,
                 int phyaddr_unused, int reg, int value)
 {
-       board_info_t *db = netdev_priv(dev);
+       struct board_info *db = netdev_priv(dev);
        unsigned long flags;
        unsigned long reg_save;
 
@@ -408,7 +408,7 @@ static void dm9000_set_io(struct board_info *db, int byte_width)
        }
 }
 
-static void dm9000_schedule_poll(board_info_t *db)
+static void dm9000_schedule_poll(struct board_info *db)
 {
        if (db->type == TYPE_DM9000E)
                schedule_delayed_work(&db->phy_poll, HZ * 2);
@@ -416,7 +416,7 @@ static void dm9000_schedule_poll(board_info_t *db)
 
 static int dm9000_ioctl(struct net_device *dev, struct ifreq *req, int cmd)
 {
-       board_info_t *dm = to_dm9000_board(dev);
+       struct board_info *dm = to_dm9000_board(dev);
 
        if (!netif_running(dev))
                return -EINVAL;
@@ -425,7 +425,7 @@ static int dm9000_ioctl(struct net_device *dev, struct ifreq *req, int cmd)
 }
 
 static unsigned int
-dm9000_read_locked(board_info_t *db, int reg)
+dm9000_read_locked(struct board_info *db, int reg)
 {
        unsigned long flags;
        unsigned int ret;
@@ -437,7 +437,7 @@ dm9000_read_locked(board_info_t *db, int reg)
        return ret;
 }
 
-static int dm9000_wait_eeprom(board_info_t *db)
+static int dm9000_wait_eeprom(struct board_info *db)
 {
        unsigned int status;
        int timeout = 8;        /* wait max 8msec */
@@ -474,7 +474,7 @@ static int dm9000_wait_eeprom(board_info_t *db)
  *  Read a word data from EEPROM
  */
 static void
-dm9000_read_eeprom(board_info_t *db, int offset, u8 *to)
+dm9000_read_eeprom(struct board_info *db, int offset, u8 *to)
 {
        unsigned long flags;
 
@@ -514,7 +514,7 @@ dm9000_read_eeprom(board_info_t *db, int offset, u8 *to)
  * Write a word data to SROM
  */
 static void
-dm9000_write_eeprom(board_info_t *db, int offset, u8 *data)
+dm9000_write_eeprom(struct board_info *db, int offset, u8 *data)
 {
        unsigned long flags;
 
@@ -546,7 +546,7 @@ dm9000_write_eeprom(board_info_t *db, int offset, u8 *data)
 static void dm9000_get_drvinfo(struct net_device *dev,
                               struct ethtool_drvinfo *info)
 {
-       board_info_t *dm = to_dm9000_board(dev);
+       struct board_info *dm = to_dm9000_board(dev);
 
        strlcpy(info->driver, CARDNAME, sizeof(info->driver));
        strlcpy(info->version, DRV_VERSION, sizeof(info->version));
@@ -556,21 +556,21 @@ static void dm9000_get_drvinfo(struct net_device *dev,
 
 static u32 dm9000_get_msglevel(struct net_device *dev)
 {
-       board_info_t *dm = to_dm9000_board(dev);
+       struct board_info *dm = to_dm9000_board(dev);
 
        return dm->msg_enable;
 }
 
 static void dm9000_set_msglevel(struct net_device *dev, u32 value)
 {
-       board_info_t *dm = to_dm9000_board(dev);
+       struct board_info *dm = to_dm9000_board(dev);
 
        dm->msg_enable = value;
 }
 
 static int dm9000_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
-       board_info_t *dm = to_dm9000_board(dev);
+       struct board_info *dm = to_dm9000_board(dev);
 
        mii_ethtool_gset(&dm->mii, cmd);
        return 0;
@@ -578,21 +578,21 @@ static int dm9000_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 
 static int dm9000_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
-       board_info_t *dm = to_dm9000_board(dev);
+       struct board_info *dm = to_dm9000_board(dev);
 
        return mii_ethtool_sset(&dm->mii, cmd);
 }
 
 static int dm9000_nway_reset(struct net_device *dev)
 {
-       board_info_t *dm = to_dm9000_board(dev);
+       struct board_info *dm = to_dm9000_board(dev);
        return mii_nway_restart(&dm->mii);
 }
 
 static int dm9000_set_features(struct net_device *dev,
        netdev_features_t features)
 {
-       board_info_t *dm = to_dm9000_board(dev);
+       struct board_info *dm = to_dm9000_board(dev);
        netdev_features_t changed = dev->features ^ features;
        unsigned long flags;
 
@@ -608,7 +608,7 @@ static int dm9000_set_features(struct net_device *dev,
 
 static u32 dm9000_get_link(struct net_device *dev)
 {
-       board_info_t *dm = to_dm9000_board(dev);
+       struct board_info *dm = to_dm9000_board(dev);
        u32 ret;
 
        if (dm->flags & DM9000_PLATF_EXT_PHY)
@@ -629,7 +629,7 @@ static int dm9000_get_eeprom_len(struct net_device *dev)
 static int dm9000_get_eeprom(struct net_device *dev,
                             struct ethtool_eeprom *ee, u8 *data)
 {
-       board_info_t *dm = to_dm9000_board(dev);
+       struct board_info *dm = to_dm9000_board(dev);
        int offset = ee->offset;
        int len = ee->len;
        int i;
@@ -653,7 +653,7 @@ static int dm9000_get_eeprom(struct net_device *dev,
 static int dm9000_set_eeprom(struct net_device *dev,
                             struct ethtool_eeprom *ee, u8 *data)
 {
-       board_info_t *dm = to_dm9000_board(dev);
+       struct board_info *dm = to_dm9000_board(dev);
        int offset = ee->offset;
        int len = ee->len;
        int done;
@@ -691,7 +691,7 @@ static int dm9000_set_eeprom(struct net_device *dev,
 
 static void dm9000_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 {
-       board_info_t *dm = to_dm9000_board(dev);
+       struct board_info *dm = to_dm9000_board(dev);
 
        memset(w, 0, sizeof(struct ethtool_wolinfo));
 
@@ -702,7 +702,7 @@ static void dm9000_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 
 static int dm9000_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 {
-       board_info_t *dm = to_dm9000_board(dev);
+       struct board_info *dm = to_dm9000_board(dev);
        unsigned long flags;
        u32 opts = w->wolopts;
        u32 wcr = 0;
@@ -752,7 +752,7 @@ static const struct ethtool_ops dm9000_ethtool_ops = {
        .set_eeprom             = dm9000_set_eeprom,
 };
 
-static void dm9000_show_carrier(board_info_t *db,
+static void dm9000_show_carrier(struct board_info *db,
                                unsigned carrier, unsigned nsr)
 {
        int lpa;
@@ -775,7 +775,7 @@ static void
 dm9000_poll_work(struct work_struct *w)
 {
        struct delayed_work *dw = to_delayed_work(w);
-       board_info_t *db = container_of(dw, board_info_t, phy_poll);
+       struct board_info *db = container_of(dw, struct board_info, phy_poll);
        struct net_device *ndev = db->ndev;
 
        if (db->flags & DM9000_PLATF_SIMPLE_PHY &&
@@ -843,7 +843,7 @@ static unsigned char dm9000_type_to_char(enum dm9000_type type)
 static void
 dm9000_hash_table_unlocked(struct net_device *dev)
 {
-       board_info_t *db = netdev_priv(dev);
+       struct board_info *db = netdev_priv(dev);
        struct netdev_hw_addr *ha;
        int i, oft;
        u32 hash_val;
@@ -879,7 +879,7 @@ dm9000_hash_table_unlocked(struct net_device *dev)
 static void
 dm9000_hash_table(struct net_device *dev)
 {
-       board_info_t *db = netdev_priv(dev);
+       struct board_info *db = netdev_priv(dev);
        unsigned long flags;
 
        spin_lock_irqsave(&db->lock, flags);
@@ -888,13 +888,13 @@ dm9000_hash_table(struct net_device *dev)
 }
 
 static void
-dm9000_mask_interrupts(board_info_t *db)
+dm9000_mask_interrupts(struct board_info *db)
 {
        iow(db, DM9000_IMR, IMR_PAR);
 }
 
 static void
-dm9000_unmask_interrupts(board_info_t *db)
+dm9000_unmask_interrupts(struct board_info *db)
 {
        iow(db, DM9000_IMR, db->imr_all);
 }
@@ -905,7 +905,7 @@ dm9000_unmask_interrupts(board_info_t *db)
 static void
 dm9000_init_dm9000(struct net_device *dev)
 {
-       board_info_t *db = netdev_priv(dev);
+       struct board_info *db = netdev_priv(dev);
        unsigned int imr;
        unsigned int ncr;
 
@@ -970,7 +970,7 @@ dm9000_init_dm9000(struct net_device *dev)
 /* Our watchdog timed out. Called by the networking layer */
 static void dm9000_timeout(struct net_device *dev)
 {
-       board_info_t *db = netdev_priv(dev);
+       struct board_info *db = netdev_priv(dev);
        u8 reg_save;
        unsigned long flags;
 
@@ -996,7 +996,7 @@ static void dm9000_send_packet(struct net_device *dev,
                               int ip_summed,
                               u16 pkt_len)
 {
-       board_info_t *dm = to_dm9000_board(dev);
+       struct board_info *dm = to_dm9000_board(dev);
 
        /* The DM9000 is not smart enough to leave fragmented packets alone. */
        if (dm->ip_summed != ip_summed) {
@@ -1023,7 +1023,7 @@ static int
 dm9000_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
        unsigned long flags;
-       board_info_t *db = netdev_priv(dev);
+       struct board_info *db = netdev_priv(dev);
 
        dm9000_dbg(db, 3, "%s:\n", __func__);
 
@@ -1062,7 +1062,7 @@ dm9000_start_xmit(struct sk_buff *skb, struct net_device *dev)
  * receive the packet to upper layer, free the transmitted packet
  */
 
-static void dm9000_tx_done(struct net_device *dev, board_info_t *db)
+static void dm9000_tx_done(struct net_device *dev, struct board_info *db)
 {
        int tx_status = ior(db, DM9000_NSR);    /* Got TX status */
 
@@ -1094,7 +1094,7 @@ struct dm9000_rxhdr {
 static void
 dm9000_rx(struct net_device *dev)
 {
-       board_info_t *db = netdev_priv(dev);
+       struct board_info *db = netdev_priv(dev);
        struct dm9000_rxhdr rxhdr;
        struct sk_buff *skb;
        u8 rxbyte, *rdptr;
@@ -1196,7 +1196,7 @@ dm9000_rx(struct net_device *dev)
 static irqreturn_t dm9000_interrupt(int irq, void *dev_id)
 {
        struct net_device *dev = dev_id;
-       board_info_t *db = netdev_priv(dev);
+       struct board_info *db = netdev_priv(dev);
        int int_status;
        unsigned long flags;
        u8 reg_save;
@@ -1246,7 +1246,7 @@ static irqreturn_t dm9000_interrupt(int irq, void *dev_id)
 static irqreturn_t dm9000_wol_interrupt(int irq, void *dev_id)
 {
        struct net_device *dev = dev_id;
-       board_info_t *db = netdev_priv(dev);
+       struct board_info *db = netdev_priv(dev);
        unsigned long flags;
        unsigned nsr, wcr;
 
@@ -1296,7 +1296,7 @@ static void dm9000_poll_controller(struct net_device *dev)
 static int
 dm9000_open(struct net_device *dev)
 {
-       board_info_t *db = netdev_priv(dev);
+       struct board_info *db = netdev_priv(dev);
        unsigned long irqflags = db->irq_res->flags & IRQF_TRIGGER_MASK;
 
        if (netif_msg_ifup(db))
@@ -1342,7 +1342,7 @@ dm9000_open(struct net_device *dev)
 static void
 dm9000_shutdown(struct net_device *dev)
 {
-       board_info_t *db = netdev_priv(dev);
+       struct board_info *db = netdev_priv(dev);
 
        /* RESET device */
        dm9000_phy_write(dev, 0, MII_BMCR, BMCR_RESET); /* PHY RESET */
@@ -1358,7 +1358,7 @@ dm9000_shutdown(struct net_device *dev)
 static int
 dm9000_stop(struct net_device *ndev)
 {
-       board_info_t *db = netdev_priv(ndev);
+       struct board_info *db = netdev_priv(ndev);
 
        if (netif_msg_ifdown(db))
                dev_dbg(db->dev, "shutting down %s\n", ndev->name);
@@ -1681,7 +1681,7 @@ dm9000_drv_suspend(struct device *dev)
 {
        struct platform_device *pdev = to_platform_device(dev);
        struct net_device *ndev = platform_get_drvdata(pdev);
-       board_info_t *db;
+       struct board_info *db;
 
        if (ndev) {
                db = netdev_priv(ndev);
@@ -1704,7 +1704,7 @@ dm9000_drv_resume(struct device *dev)
 {
        struct platform_device *pdev = to_platform_device(dev);
        struct net_device *ndev = platform_get_drvdata(pdev);
-       board_info_t *db = netdev_priv(ndev);
+       struct board_info *db = netdev_priv(ndev);
 
        if (ndev) {
                if (netif_running(ndev)) {
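
The dm9000 hunks above are a mechanical application of the kernel style rule
(Documentation/CodingStyle, chapter 5) that typedefs should not be used to
alias struct tags. A minimal sketch of the pattern, with an illustrative
member rather than the driver's real field list:

        /* Before: the alias hides that this is an aggregate type. */
        typedef struct board_info {
                int msg_enable;                 /* illustrative member */
        } board_info_t;

        /* After: every user spells out the struct tag directly. */
        struct board_info {
                int msg_enable;
        };
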
index bd53caf1c1eb6b735be414aecd11a4e3ff732747..9f7fa644a397a57c21bfa317b9154f1bc5b8e2ee 100644 (file)
@@ -310,6 +310,7 @@ struct fec_enet_private {
        int     mii_timeout;
        uint    phy_speed;
        phy_interface_t phy_interface;
+       struct device_node *phy_node;
        int     link;
        int     full_duplex;
        int     speed;
index 66fe1f6724996b57eba5e30b4e8b2bbc908ca38e..4f87dffcb9b26688ba7ef51145138c88b8d6ad8a 100644 (file)
@@ -52,6 +52,7 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
+#include <linux/of_mdio.h>
 #include <linux/of_net.h>
 #include <linux/regulator/consumer.h>
 #include <linux/if_vlan.h>
@@ -1648,29 +1649,37 @@ static int fec_enet_mii_probe(struct net_device *ndev)
 
        fep->phy_dev = NULL;
 
-       /* check for attached phy */
-       for (phy_id = 0; (phy_id < PHY_MAX_ADDR); phy_id++) {
-               if ((fep->mii_bus->phy_mask & (1 << phy_id)))
-                       continue;
-               if (fep->mii_bus->phy_map[phy_id] == NULL)
-                       continue;
-               if (fep->mii_bus->phy_map[phy_id]->phy_id == 0)
-                       continue;
-               if (dev_id--)
-                       continue;
-               strncpy(mdio_bus_id, fep->mii_bus->id, MII_BUS_ID_SIZE);
-               break;
-       }
+       if (fep->phy_node) {
+               phy_dev = of_phy_connect(ndev, fep->phy_node,
+                                        &fec_enet_adjust_link, 0,
+                                        fep->phy_interface);
+       } else {
+               /* check for attached phy */
+               for (phy_id = 0; (phy_id < PHY_MAX_ADDR); phy_id++) {
+                       if ((fep->mii_bus->phy_mask & (1 << phy_id)))
+                               continue;
+                       if (fep->mii_bus->phy_map[phy_id] == NULL)
+                               continue;
+                       if (fep->mii_bus->phy_map[phy_id]->phy_id == 0)
+                               continue;
+                       if (dev_id--)
+                               continue;
+                       strncpy(mdio_bus_id, fep->mii_bus->id, MII_BUS_ID_SIZE);
+                       break;
+               }
 
-       if (phy_id >= PHY_MAX_ADDR) {
-               netdev_info(ndev, "no PHY, assuming direct connection to switch\n");
-               strncpy(mdio_bus_id, "fixed-0", MII_BUS_ID_SIZE);
-               phy_id = 0;
+               if (phy_id >= PHY_MAX_ADDR) {
+                       netdev_info(ndev, "no PHY, assuming direct connection to switch\n");
+                       strncpy(mdio_bus_id, "fixed-0", MII_BUS_ID_SIZE);
+                       phy_id = 0;
+               }
+
+               snprintf(phy_name, sizeof(phy_name),
+                        PHY_ID_FMT, mdio_bus_id, phy_id);
+               phy_dev = phy_connect(ndev, phy_name, &fec_enet_adjust_link,
+                                     fep->phy_interface);
        }
 
-       snprintf(phy_name, sizeof(phy_name), PHY_ID_FMT, mdio_bus_id, phy_id);
-       phy_dev = phy_connect(ndev, phy_name, &fec_enet_adjust_link,
-                             fep->phy_interface);
        if (IS_ERR(phy_dev)) {
                netdev_err(ndev, "could not attach to PHY\n");
                return PTR_ERR(phy_dev);
@@ -1707,6 +1716,7 @@ static int fec_enet_mii_init(struct platform_device *pdev)
        struct fec_enet_private *fep = netdev_priv(ndev);
        const struct platform_device_id *id_entry =
                                platform_get_device_id(fep->pdev);
+       struct device_node *node;
        int err = -ENXIO, i;
 
        /*
@@ -1774,7 +1784,15 @@ static int fec_enet_mii_init(struct platform_device *pdev)
        for (i = 0; i < PHY_MAX_ADDR; i++)
                fep->mii_bus->irq[i] = PHY_POLL;
 
-       if (mdiobus_register(fep->mii_bus))
+       node = of_get_child_by_name(pdev->dev.of_node, "mdio");
+       if (node) {
+               err = of_mdiobus_register(fep->mii_bus, node);
+               of_node_put(node);
+       } else {
+               err = mdiobus_register(fep->mii_bus);
+       }
+
+       if (err)
                goto err_out_free_mdio_irq;
 
        mii_cnt++;
@@ -2527,6 +2545,7 @@ fec_probe(struct platform_device *pdev)
        struct resource *r;
        const struct of_device_id *of_id;
        static int dev_id;
+       struct device_node *np = pdev->dev.of_node, *phy_node;
 
        of_id = of_match_device(fec_dt_ids, &pdev->dev);
        if (of_id)
@@ -2566,6 +2585,18 @@ fec_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, ndev);
 
+       phy_node = of_parse_phandle(np, "phy-handle", 0);
+       if (!phy_node && of_phy_is_fixed_link(np)) {
+               ret = of_phy_register_fixed_link(np);
+               if (ret < 0) {
+                       dev_err(&pdev->dev,
+                               "broken fixed-link specification\n");
+                       goto failed_phy;
+               }
+               phy_node = of_node_get(np);
+       }
+       fep->phy_node = phy_node;
+
        ret = of_get_phy_mode(pdev->dev.of_node);
        if (ret < 0) {
                pdata = dev_get_platdata(&pdev->dev);
@@ -2670,6 +2701,8 @@ failed_init:
 failed_regulator:
        fec_enet_clk_enable(ndev, false);
 failed_clk:
+failed_phy:
+       of_node_put(phy_node);
 failed_ioremap:
        free_netdev(ndev);
 
@@ -2691,6 +2724,7 @@ fec_drv_remove(struct platform_device *pdev)
        if (fep->ptp_clock)
                ptp_clock_unregister(fep->ptp_clock);
        fec_enet_clk_enable(ndev, false);
+       of_node_put(fep->phy_node);
        free_netdev(ndev);
 
        return 0;
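
Taken together, the fec hunks above let the MAC bind to a PHY named by a
"phy-handle" phandle (or to a fixed-link stub registered at probe time)
instead of scanning the MDIO bus for the first live address. Condensing the
connect-time decision from the hunks, with error handling trimmed:

        struct phy_device *phy_dev;

        if (fep->phy_node)                      /* device tree named the PHY */
                phy_dev = of_phy_connect(ndev, fep->phy_node,
                                         &fec_enet_adjust_link, 0,
                                         fep->phy_interface);
        else                                    /* legacy bus-scan fallback */
                phy_dev = phy_connect(ndev, phy_name,
                                      &fec_enet_adjust_link,
                                      fep->phy_interface);

Note that the of_node_put() added to the failed_phy error path and to
fec_drv_remove() balances the reference taken by of_parse_phandle() or
of_node_get() in fec_probe().
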
index 9947765e90c547e97eee26edbc55aa06fd436329..ff55fbb20a75c774f72418bdca4e14d9e956a047 100644 (file)
@@ -1015,8 +1015,7 @@ mpc52xx_fec_remove(struct platform_device *op)
 
        unregister_netdev(ndev);
 
-       if (priv->phy_node)
-               of_node_put(priv->phy_node);
+       of_node_put(priv->phy_node);
        priv->phy_node = NULL;
 
        irq_dispose_mapping(ndev->irq);
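
The guard removed here is redundant: of_node_put() already treats a NULL
argument as a no-op, as the base.c hunk near the end of this series shows.
Schematically:

        void of_node_put(struct device_node *node)
        {
                if (node)                       /* NULL is silently ignored */
                        kobject_put(&node->kobj);
        }

The same simplification recurs below in the gianfar, ll_temac and axienet
hunks.
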
index cfaf17b70f3fc5d6ab7a11a81266d67267646f33..748fd24d3d9ec3fa26ff80b528c9f9ff19fafbb4 100644 (file)
@@ -1033,7 +1033,7 @@ static int fs_enet_probe(struct platform_device *ofdev)
                /* In the case of a fixed PHY, the DT node associated
                 * to the PHY is the Ethernet MAC DT node.
                 */
-               fpi->phy_node = ofdev->dev.of_node;
+               fpi->phy_node = of_node_get(ofdev->dev.of_node);
        }
 
        if (of_device_is_compatible(ofdev->dev.of_node, "fsl,mpc5125-fec")) {
index a6cf40e62f3a9c531f2997c34590fbe89d0ddc80..fb29d049f4e12b8df13bbe21ad44c8e73de1ea01 100644 (file)
@@ -892,12 +892,12 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev)
        /* In the case of a fixed PHY, the DT node associated
         * to the PHY is the Ethernet MAC DT node.
         */
-       if (of_phy_is_fixed_link(np)) {
+       if (!priv->phy_node && of_phy_is_fixed_link(np)) {
                err = of_phy_register_fixed_link(np);
                if (err)
                        goto err_grp_init;
 
-               priv->phy_node = np;
+               priv->phy_node = of_node_get(np);
        }
 
        /* Find the TBI PHY.  If it's not there, we don't support SGMII */
@@ -1435,10 +1435,8 @@ register_fail:
        unmap_group_regs(priv);
        gfar_free_rx_queues(priv);
        gfar_free_tx_queues(priv);
-       if (priv->phy_node)
-               of_node_put(priv->phy_node);
-       if (priv->tbi_node)
-               of_node_put(priv->tbi_node);
+       of_node_put(priv->phy_node);
+       of_node_put(priv->tbi_node);
        free_gfar_dev(priv);
        return err;
 }
@@ -1447,10 +1445,8 @@ static int gfar_remove(struct platform_device *ofdev)
 {
        struct gfar_private *priv = platform_get_drvdata(ofdev);
 
-       if (priv->phy_node)
-               of_node_put(priv->phy_node);
-       if (priv->tbi_node)
-               of_node_put(priv->tbi_node);
+       of_node_put(priv->phy_node);
+       of_node_put(priv->tbi_node);
 
        unregister_netdev(priv->ndev);
        unmap_group_regs(priv);
index 8ceaf7a2660c8f3f38dc27ce71701a3290ce14d2..3cf0478b3728e51a6ba7e0b1d0e12743f539e016 100644 (file)
@@ -3785,16 +3785,15 @@ static int ucc_geth_probe(struct platform_device* ofdev)
        ug_info->uf_info.irq = irq_of_parse_and_map(np, 0);
 
        ug_info->phy_node = of_parse_phandle(np, "phy-handle", 0);
-       if (!ug_info->phy_node) {
-               /* In the case of a fixed PHY, the DT node associated
+       if (!ug_info->phy_node && of_phy_is_fixed_link(np)) {
+               /*
+                * In the case of a fixed PHY, the DT node associated
                 * to the PHY is the Ethernet MAC DT node.
                 */
-               if (of_phy_is_fixed_link(np)) {
-                       err = of_phy_register_fixed_link(np);
-                       if (err)
-                               return err;
-               }
-               ug_info->phy_node = np;
+               err = of_phy_register_fixed_link(np);
+               if (err)
+                       return err;
+               ug_info->phy_node = of_node_get(np);
        }
 
        /* Find the TBI PHY node.  If it's not there, we don't support SGMII */
@@ -3862,8 +3861,11 @@ static int ucc_geth_probe(struct platform_device* ofdev)
        /* Create an ethernet device instance */
        dev = alloc_etherdev(sizeof(*ugeth));
 
-       if (dev == NULL)
+       if (dev == NULL) {
+               of_node_put(ug_info->tbi_node);
+               of_node_put(ug_info->phy_node);
                return -ENOMEM;
+       }
 
        ugeth = netdev_priv(dev);
        spin_lock_init(&ugeth->lock);
@@ -3897,6 +3899,8 @@ static int ucc_geth_probe(struct platform_device* ofdev)
                        pr_err("%s: Cannot register net device, aborting\n",
                               dev->name);
                free_netdev(dev);
+               of_node_put(ug_info->tbi_node);
+               of_node_put(ug_info->phy_node);
                return err;
        }
 
@@ -3920,6 +3924,8 @@ static int ucc_geth_remove(struct platform_device* ofdev)
        unregister_netdev(dev);
        free_netdev(dev);
        ucc_geth_memclean(ugeth);
+       of_node_put(ugeth->ug_info->tbi_node);
+       of_node_put(ugeth->ug_info->phy_node);
 
        return 0;
 }
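
The ucc_geth hunks establish the invariant that every reference taken with
of_parse_phandle() or of_node_get() in the probe path is dropped with
of_node_put() on each failure path and in remove. The shape of the fix,
condensed from the hunks above:

        ug_info->phy_node = of_parse_phandle(np, "phy-handle", 0); /* +1 ref */
        ...
        dev = alloc_etherdev(sizeof(*ugeth));
        if (dev == NULL) {
                of_node_put(ug_info->tbi_node); /* drop both references   */
                of_node_put(ug_info->phy_node); /* before any error return */
                return -ENOMEM;
        }
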
index cfe7a74317307f8ef1ef39acd2c78c97e903d423..a7139f588ad205e0a9441ce8e7d62a2513e4d814 100644 (file)
@@ -99,23 +99,23 @@ static const struct ethtool_ops netdev_ethtool_ops;
 /*
     card type
  */
-typedef enum { MBH10302, MBH10304, TDK, CONTEC, LA501, UNGERMANN, 
+enum cardtype { MBH10302, MBH10304, TDK, CONTEC, LA501, UNGERMANN,
               XXX10304, NEC, KME
-} cardtype_t;
+};
 
 /*
     driver specific data structure
 */
-typedef struct local_info_t {
+struct local_info {
        struct pcmcia_device    *p_dev;
     long open_time;
     uint tx_started:1;
     uint tx_queue;
     u_short tx_queue_len;
-    cardtype_t cardtype;
+    enum cardtype cardtype;
     u_short sent;
     u_char __iomem *base;
-} local_info_t;
+};
 
 #define MC_FILTERBREAK 64
 
@@ -232,13 +232,13 @@ static const struct net_device_ops fjn_netdev_ops = {
 
 static int fmvj18x_probe(struct pcmcia_device *link)
 {
-    local_info_t *lp;
+    struct local_info *lp;
     struct net_device *dev;
 
     dev_dbg(&link->dev, "fmvj18x_attach()\n");
 
     /* Make up a FMVJ18x specific data structure */
-    dev = alloc_etherdev(sizeof(local_info_t));
+    dev = alloc_etherdev(sizeof(struct local_info));
     if (!dev)
        return -ENOMEM;
     lp = netdev_priv(dev);
@@ -327,10 +327,10 @@ static int fmvj18x_ioprobe(struct pcmcia_device *p_dev, void *priv_data)
 static int fmvj18x_config(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     int i, ret;
     unsigned int ioaddr;
-    cardtype_t cardtype;
+    enum cardtype cardtype;
     char *card_name = "unknown";
     u8 *buf;
     size_t len;
@@ -584,7 +584,7 @@ static int fmvj18x_setup_mfc(struct pcmcia_device *link)
     int i;
     struct net_device *dev = link->priv;
     unsigned int ioaddr;
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
 
     /* Allocate a small memory window */
     link->resource[3]->flags = WIN_DATA_WIDTH_8|WIN_MEMORY_TYPE_AM|WIN_ENABLE;
@@ -626,7 +626,7 @@ static void fmvj18x_release(struct pcmcia_device *link)
 {
 
     struct net_device *dev = link->priv;
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     u_char __iomem *tmp;
 
     dev_dbg(&link->dev, "fmvj18x_release\n");
@@ -711,7 +711,7 @@ module_pcmcia_driver(fmvj18x_cs_driver);
 static irqreturn_t fjn_interrupt(int dummy, void *dev_id)
 {
     struct net_device *dev = dev_id;
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     unsigned int ioaddr;
     unsigned short tx_stat, rx_stat;
 
@@ -772,7 +772,7 @@ static irqreturn_t fjn_interrupt(int dummy, void *dev_id)
 
 static void fjn_tx_timeout(struct net_device *dev)
 {
-    struct local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
 
     netdev_notice(dev, "transmit timed out with status %04x, %s?\n",
@@ -802,7 +802,7 @@ static void fjn_tx_timeout(struct net_device *dev)
 static netdev_tx_t fjn_start_xmit(struct sk_buff *skb,
                                        struct net_device *dev)
 {
-    struct local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
     short length = skb->len;
     
@@ -874,7 +874,7 @@ static netdev_tx_t fjn_start_xmit(struct sk_buff *skb,
 
 static void fjn_reset(struct net_device *dev)
 {
-    struct local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
     int i;
 
@@ -1058,7 +1058,7 @@ static int fjn_config(struct net_device *dev, struct ifmap *map){
 
 static int fjn_open(struct net_device *dev)
 {
-    struct local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     struct pcmcia_device *link = lp->p_dev;
 
     pr_debug("fjn_open('%s').\n", dev->name);
@@ -1083,7 +1083,7 @@ static int fjn_open(struct net_device *dev)
 
 static int fjn_close(struct net_device *dev)
 {
-    struct local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     struct pcmcia_device *link = lp->p_dev;
     unsigned int ioaddr = dev->base_addr;
 
index dadd9a5f6323c5915be126aff248631e3fedc46e..c9f1d1b7ef378bef042b12c9e00f5bf2786594a8 100644 (file)
@@ -2969,14 +2969,14 @@ static int mvneta_probe(struct platform_device *pdev)
                /* In the case of a fixed PHY, the DT node associated
                 * to the PHY is the Ethernet MAC DT node.
                 */
-               phy_node = dn;
+               phy_node = of_node_get(dn);
        }
 
        phy_mode = of_get_phy_mode(dn);
        if (phy_mode < 0) {
                dev_err(&pdev->dev, "incorrect phy-mode\n");
                err = -EINVAL;
-               goto err_free_irq;
+               goto err_put_phy_node;
        }
 
        dev->tx_queue_len = MVNETA_MAX_TXD;
@@ -2992,7 +2992,7 @@ static int mvneta_probe(struct platform_device *pdev)
        pp->clk = devm_clk_get(&pdev->dev, NULL);
        if (IS_ERR(pp->clk)) {
                err = PTR_ERR(pp->clk);
-               goto err_free_irq;
+               goto err_put_phy_node;
        }
 
        clk_prepare_enable(pp->clk);
@@ -3071,6 +3071,8 @@ err_free_stats:
        free_percpu(pp->stats);
 err_clk:
        clk_disable_unprepare(pp->clk);
+err_put_phy_node:
+       of_node_put(phy_node);
 err_free_irq:
        irq_dispose_mapping(dev->irq);
 err_free_netdev:
@@ -3088,6 +3090,7 @@ static int mvneta_remove(struct platform_device *pdev)
        clk_disable_unprepare(pp->clk);
        free_percpu(pp->stats);
        irq_dispose_mapping(dev->irq);
+       of_node_put(pp->phy_node);
        free_netdev(dev);
 
        return 0;
index f3d5d79f1cd15de8dff66fa4aeab6fccaa25ab8e..69c26f04d8ce06aa0b74b944e12ed282e618e7fb 100644 (file)
@@ -574,6 +574,7 @@ myri10ge_validate_firmware(struct myri10ge_priv *mgp,
 
        /* save firmware version for ethtool */
        strncpy(mgp->fw_version, hdr->version, sizeof(mgp->fw_version));
+       mgp->fw_version[sizeof(mgp->fw_version) - 1] = '\0';
 
        sscanf(mgp->fw_version, "%d.%d.%d", &mgp->fw_ver_major,
               &mgp->fw_ver_minor, &mgp->fw_ver_tiny);
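
The one-line myri10ge fix above (and the matching cpmac hunk below) closes a
classic strncpy() pitfall: when the source string is at least as long as the
destination buffer, strncpy() writes no NUL terminator at all. A minimal,
self-contained illustration of the safe idiom:

        #include <string.h>

        static void save_version(char dst[32], const char *src)
        {
                strncpy(dst, src, 32);
                dst[31] = '\0';         /* strncpy() leaves dst unterminated
                                           whenever strlen(src) >= 32 */
        }
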
index a848d2979722486b4489cc479f2bae166f0c6038..3c2c2c7c155925f4dbf147a43dbbbee3f7dc12b5 100644 (file)
@@ -8,7 +8,7 @@ qlcnic-y := qlcnic_hw.o qlcnic_main.o qlcnic_init.o \
        qlcnic_ethtool.o qlcnic_ctx.o qlcnic_io.o \
        qlcnic_sysfs.o qlcnic_minidump.o qlcnic_83xx_hw.o \
        qlcnic_83xx_init.o qlcnic_83xx_vnic.o \
-       qlcnic_minidump.o qlcnic_sriov_common.o
+       qlcnic_sriov_common.o
 
 qlcnic-$(CONFIG_QLCNIC_SRIOV) += qlcnic_sriov_pf.o
 
index 23953957fed8a37e7e8daa81d6474e9d33b5413d..54d648920a1b9dfcb5d4f5a01271784e7df8723c 100644 (file)
@@ -51,7 +51,7 @@
 
 #ifdef CONFIG_DEBUG_SPINLOCK
 #define SMSC_ASSERT_MAC_LOCK(pdata) \
-               WARN_ON(!spin_is_locked(&pdata->mac_lock))
+               WARN_ON_SMP(!spin_is_locked(&pdata->mac_lock))
 #else
 #define SMSC_ASSERT_MAC_LOCK(pdata) do {} while (0)
 #endif                         /* CONFIG_DEBUG_SPINLOCK */
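
This smsc911x change matters on uniprocessor builds: with CONFIG_SMP=n,
spinlocks compile away, spin_is_locked() is always false, and the old
WARN_ON() would fire on every call even when the locking was correct.
WARN_ON_SMP() compiles to nothing in that configuration; its definition is
approximately (paraphrased from asm-generic/bug.h of this era):

        #ifdef CONFIG_SMP
        # define WARN_ON_SMP(x)         WARN_ON(x)
        #else
        # define WARN_ON_SMP(x)         ({ 0; })  /* x not evaluated on UP */
        #endif
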
index 3809f4ec28202db36e2e91bf15d5ee73633e1294..f9bcf7aa88ca3a2d6aa4b607bd134e0c36fc6d91 100644 (file)
@@ -1130,6 +1130,7 @@ static int cpmac_probe(struct platform_device *pdev)
                strncpy(mdio_bus_id, "fixed-0", MII_BUS_ID_SIZE); /* fixed phys bus */
                phy_id = pdev->id;
        }
+       mdio_bus_id[sizeof(mdio_bus_id) - 1] = '\0';
 
        dev = alloc_etherdev_mq(sizeof(*priv), CPMAC_QUEUES);
        if (!dev)
index 8a6e5c2d6f95a3f2f52f3d9b8148c81fffe7c5d3..36f4459520c366a88a33c9cf2663bf9e713955c3 100644 (file)
@@ -1148,8 +1148,7 @@ static int temac_of_remove(struct platform_device *op)
        temac_mdio_teardown(lp);
        unregister_netdev(ndev);
        sysfs_remove_group(&lp->dev->kobj, &temac_attr_group);
-       if (lp->phy_node)
-               of_node_put(lp->phy_node);
+       of_node_put(lp->phy_node);
        lp->phy_node = NULL;
        iounmap(lp->regs);
        if (lp->sdma_regs)
index 7b0a735562645cf042cfb312ace5018180d94559..30e8608ff05079c3083ba2a6783070e056e36571 100644 (file)
@@ -1630,8 +1630,7 @@ static int axienet_of_remove(struct platform_device *op)
        axienet_mdio_teardown(lp);
        unregister_netdev(ndev);
 
-       if (lp->phy_node)
-               of_node_put(lp->phy_node);
+       of_node_put(lp->phy_node);
        lp->phy_node = NULL;
 
        iounmap(lp->regs);
index 7c81ffb861e8cf99d8e66490e6b3026df71879cd..d56f8693202bb45ea79e6eedf5141cf40d97b53f 100644 (file)
@@ -266,7 +266,7 @@ static void xirc2ps_detach(struct pcmcia_device *p_dev);
 
 static irqreturn_t xirc2ps_interrupt(int irq, void *dev_id);
 
-typedef struct local_info_t {
+struct local_info {
        struct net_device       *dev;
        struct pcmcia_device    *p_dev;
 
@@ -281,7 +281,7 @@ typedef struct local_info_t {
     unsigned last_ptr_value; /* last packets transmitted value */
     const char *manf_str;
     struct work_struct tx_timeout_task;
-} local_info_t;
+};
 
 /****************
  * Some more prototypes
@@ -475,12 +475,12 @@ static int
 xirc2ps_probe(struct pcmcia_device *link)
 {
     struct net_device *dev;
-    local_info_t *local;
+    struct local_info *local;
 
     dev_dbg(&link->dev, "attach()\n");
 
     /* Allocate the device structure */
-    dev = alloc_etherdev(sizeof(local_info_t));
+    dev = alloc_etherdev(sizeof(struct local_info));
     if (!dev)
            return -ENOMEM;
     local = netdev_priv(dev);
@@ -536,7 +536,7 @@ static int
 set_card_type(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
-    local_info_t *local = netdev_priv(dev);
+    struct local_info *local = netdev_priv(dev);
     u8 *buf;
     unsigned int cisrev, mediaid, prodid;
     size_t len;
@@ -690,7 +690,7 @@ static int
 xirc2ps_config(struct pcmcia_device * link)
 {
     struct net_device *dev = link->priv;
-    local_info_t *local = netdev_priv(dev);
+    struct local_info *local = netdev_priv(dev);
     unsigned int ioaddr;
     int err;
     u8 *buf;
@@ -931,7 +931,7 @@ xirc2ps_release(struct pcmcia_device *link)
 
        if (link->resource[2]->end) {
                struct net_device *dev = link->priv;
-               local_info_t *local = netdev_priv(dev);
+               struct local_info *local = netdev_priv(dev);
                if (local->dingo)
                        iounmap(local->dingo_ccr - 0x0800);
        }
@@ -975,7 +975,7 @@ static irqreturn_t
 xirc2ps_interrupt(int irq, void *dev_id)
 {
     struct net_device *dev = (struct net_device *)dev_id;
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     unsigned int ioaddr;
     u_char saved_page;
     unsigned bytes_rcvd;
@@ -1194,8 +1194,8 @@ xirc2ps_interrupt(int irq, void *dev_id)
 static void
 xirc2ps_tx_timeout_task(struct work_struct *work)
 {
-       local_info_t *local =
-               container_of(work, local_info_t, tx_timeout_task);
+       struct local_info *local =
+               container_of(work, struct local_info, tx_timeout_task);
        struct net_device *dev = local->dev;
     /* reset the card */
     do_reset(dev,1);
@@ -1206,7 +1206,7 @@ xirc2ps_tx_timeout_task(struct work_struct *work)
 static void
 xirc_tx_timeout(struct net_device *dev)
 {
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     dev->stats.tx_errors++;
     netdev_notice(dev, "transmit timed out\n");
     schedule_work(&lp->tx_timeout_task);
@@ -1215,7 +1215,7 @@ xirc_tx_timeout(struct net_device *dev)
 static netdev_tx_t
 do_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
     int okay;
     unsigned freespace;
@@ -1300,7 +1300,7 @@ static void set_address(struct set_address_info *sa_info, char *addr)
 static void set_addresses(struct net_device *dev)
 {
        unsigned int ioaddr = dev->base_addr;
-       local_info_t *lp = netdev_priv(dev);
+       struct local_info *lp = netdev_priv(dev);
        struct netdev_hw_addr *ha;
        struct set_address_info sa_info;
        int i;
@@ -1362,7 +1362,7 @@ set_multicast_list(struct net_device *dev)
 static int
 do_config(struct net_device *dev, struct ifmap *map)
 {
-    local_info_t *local = netdev_priv(dev);
+    struct local_info *local = netdev_priv(dev);
 
     pr_debug("do_config(%p)\n", dev);
     if (map->port != 255 && map->port != dev->if_port) {
@@ -1387,7 +1387,7 @@ do_config(struct net_device *dev, struct ifmap *map)
 static int
 do_open(struct net_device *dev)
 {
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     struct pcmcia_device *link = lp->p_dev;
 
     dev_dbg(&link->dev, "do_open(%p)\n", dev);
@@ -1421,7 +1421,7 @@ static const struct ethtool_ops netdev_ethtool_ops = {
 static int
 do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
-    local_info_t *local = netdev_priv(dev);
+    struct local_info *local = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
     struct mii_ioctl_data *data = if_mii(rq);
 
@@ -1453,7 +1453,7 @@ do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 static void
 hardreset(struct net_device *dev)
 {
-    local_info_t *local = netdev_priv(dev);
+    struct local_info *local = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
 
     SelectPage(4);
@@ -1470,7 +1470,7 @@ hardreset(struct net_device *dev)
 static void
 do_reset(struct net_device *dev, int full)
 {
-    local_info_t *local = netdev_priv(dev);
+    struct local_info *local = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
     unsigned value;
 
@@ -1631,7 +1631,7 @@ do_reset(struct net_device *dev, int full)
 static int
 init_mii(struct net_device *dev)
 {
-    local_info_t *local = netdev_priv(dev);
+    struct local_info *local = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
     unsigned control, status, linkpartner;
     int i;
@@ -1715,7 +1715,7 @@ static int
 do_stop(struct net_device *dev)
 {
     unsigned int ioaddr = dev->base_addr;
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     struct pcmcia_device *link = lp->p_dev;
 
     dev_dbg(&link->dev, "do_stop(%p)\n", dev);
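
The tx_timeout_task hunk above keeps the usual work_struct recovery idiom
while dropping the typedef: the embedded member's address is mapped back to
the enclosing struct local_info. container_of() is pointer arithmetic over
offsetof(); a self-contained user-space rendering of the core (the kernel
macro adds a type check on top):

        #include <stddef.h>

        #define container_of(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))
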
index 7cc64eac0fa3b78b29d5cd667c143f2d9c24b79d..e5c7e6165a4b658046c0bba6173e14c7c62671f5 100644 (file)
@@ -90,7 +90,7 @@
 #define LMI_ANSI_LENGTH                  14
 
 
-typedef struct {
+struct fr_hdr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
        unsigned ea1:   1;
        unsigned cr:    1;
@@ -112,14 +112,14 @@ typedef struct {
        unsigned de:    1;
        unsigned ea2:   1;
 #endif
-}__packed fr_hdr;
+} __packed;
 
 
-typedef struct pvc_device_struct {
+struct pvc_device {
        struct net_device *frad;
        struct net_device *main;
        struct net_device *ether;       /* bridged Ethernet interface   */
-       struct pvc_device_struct *next; /* Sorted in ascending DLCI order */
+       struct pvc_device *next;        /* Sorted in ascending DLCI order */
        int dlci;
        int open_count;
 
@@ -132,11 +132,11 @@ typedef struct pvc_device_struct {
                unsigned int becn: 1;
                unsigned int bandwidth; /* Cisco LMI reporting only */
        }state;
-}pvc_device;
+};
 
 struct frad_state {
        fr_proto settings;
-       pvc_device *first_pvc;
+       struct pvc_device *first_pvc;
        int dce_pvc_count;
 
        struct timer_list timer;
@@ -174,9 +174,9 @@ static inline struct frad_state* state(hdlc_device *hdlc)
 }
 
 
-static inline pvc_device* find_pvc(hdlc_device *hdlc, u16 dlci)
+static inline struct pvc_device *find_pvc(hdlc_device *hdlc, u16 dlci)
 {
-       pvc_device *pvc = state(hdlc)->first_pvc;
+       struct pvc_device *pvc = state(hdlc)->first_pvc;
 
        while (pvc) {
                if (pvc->dlci == dlci)
@@ -190,10 +190,10 @@ static inline pvc_device* find_pvc(hdlc_device *hdlc, u16 dlci)
 }
 
 
-static pvc_device* add_pvc(struct net_device *dev, u16 dlci)
+static struct pvc_device *add_pvc(struct net_device *dev, u16 dlci)
 {
        hdlc_device *hdlc = dev_to_hdlc(dev);
-       pvc_device *pvc, **pvc_p = &state(hdlc)->first_pvc;
+       struct pvc_device *pvc, **pvc_p = &state(hdlc)->first_pvc;
 
        while (*pvc_p) {
                if ((*pvc_p)->dlci == dlci)
@@ -203,7 +203,7 @@ static pvc_device* add_pvc(struct net_device *dev, u16 dlci)
                pvc_p = &(*pvc_p)->next;
        }
 
-       pvc = kzalloc(sizeof(pvc_device), GFP_ATOMIC);
+       pvc = kzalloc(sizeof(*pvc), GFP_ATOMIC);
 #ifdef DEBUG_PVC
        printk(KERN_DEBUG "add_pvc: allocated pvc %p, frad %p\n", pvc, dev);
 #endif
@@ -218,13 +218,13 @@ static pvc_device* add_pvc(struct net_device *dev, u16 dlci)
 }
 
 
-static inline int pvc_is_used(pvc_device *pvc)
+static inline int pvc_is_used(struct pvc_device *pvc)
 {
        return pvc->main || pvc->ether;
 }
 
 
-static inline void pvc_carrier(int on, pvc_device *pvc)
+static inline void pvc_carrier(int on, struct pvc_device *pvc)
 {
        if (on) {
                if (pvc->main)
@@ -246,11 +246,11 @@ static inline void pvc_carrier(int on, pvc_device *pvc)
 
 static inline void delete_unused_pvcs(hdlc_device *hdlc)
 {
-       pvc_device **pvc_p = &state(hdlc)->first_pvc;
+       struct pvc_device **pvc_p = &state(hdlc)->first_pvc;
 
        while (*pvc_p) {
                if (!pvc_is_used(*pvc_p)) {
-                       pvc_device *pvc = *pvc_p;
+                       struct pvc_device *pvc = *pvc_p;
 #ifdef DEBUG_PVC
                        printk(KERN_DEBUG "freeing unused pvc: %p\n", pvc);
 #endif
@@ -263,7 +263,8 @@ static inline void delete_unused_pvcs(hdlc_device *hdlc)
 }
 
 
-static inline struct net_device** get_dev_p(pvc_device *pvc, int type)
+static inline struct net_device **get_dev_p(struct pvc_device *pvc,
+                                           int type)
 {
        if (type == ARPHRD_ETHER)
                return &pvc->ether;
@@ -342,7 +343,7 @@ static int fr_hard_header(struct sk_buff **skb_p, u16 dlci)
 
 static int pvc_open(struct net_device *dev)
 {
-       pvc_device *pvc = dev->ml_priv;
+       struct pvc_device *pvc = dev->ml_priv;
 
        if ((pvc->frad->flags & IFF_UP) == 0)
                return -EIO;  /* Frad must be UP in order to activate PVC */
@@ -362,7 +363,7 @@ static int pvc_open(struct net_device *dev)
 
 static int pvc_close(struct net_device *dev)
 {
-       pvc_device *pvc = dev->ml_priv;
+       struct pvc_device *pvc = dev->ml_priv;
 
        if (--pvc->open_count == 0) {
                hdlc_device *hdlc = dev_to_hdlc(pvc->frad);
@@ -381,7 +382,7 @@ static int pvc_close(struct net_device *dev)
 
 static int pvc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
-       pvc_device *pvc = dev->ml_priv;
+       struct pvc_device *pvc = dev->ml_priv;
        fr_proto_pvc_info info;
 
        if (ifr->ifr_settings.type == IF_GET_PROTO) {
@@ -409,7 +410,7 @@ static int pvc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 
 static netdev_tx_t pvc_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-       pvc_device *pvc = dev->ml_priv;
+       struct pvc_device *pvc = dev->ml_priv;
 
        if (pvc->state.active) {
                if (dev->type == ARPHRD_ETHER) {
@@ -444,7 +445,7 @@ static netdev_tx_t pvc_xmit(struct sk_buff *skb, struct net_device *dev)
        return NETDEV_TX_OK;
 }
 
-static inline void fr_log_dlci_active(pvc_device *pvc)
+static inline void fr_log_dlci_active(struct pvc_device *pvc)
 {
        netdev_info(pvc->frad, "DLCI %d [%s%s%s]%s %s\n",
                    pvc->dlci,
@@ -469,7 +470,7 @@ static void fr_lmi_send(struct net_device *dev, int fullrep)
 {
        hdlc_device *hdlc = dev_to_hdlc(dev);
        struct sk_buff *skb;
-       pvc_device *pvc = state(hdlc)->first_pvc;
+       struct pvc_device *pvc = state(hdlc)->first_pvc;
        int lmi = state(hdlc)->settings.lmi;
        int dce = state(hdlc)->settings.dce;
        int len = lmi == LMI_ANSI ? LMI_ANSI_LENGTH : LMI_CCITT_CISCO_LENGTH;
@@ -566,7 +567,7 @@ static void fr_lmi_send(struct net_device *dev, int fullrep)
 static void fr_set_link_state(int reliable, struct net_device *dev)
 {
        hdlc_device *hdlc = dev_to_hdlc(dev);
-       pvc_device *pvc = state(hdlc)->first_pvc;
+       struct pvc_device *pvc = state(hdlc)->first_pvc;
 
        state(hdlc)->reliable = reliable;
        if (reliable) {
@@ -652,7 +653,7 @@ static void fr_timer(unsigned long arg)
 static int fr_lmi_recv(struct net_device *dev, struct sk_buff *skb)
 {
        hdlc_device *hdlc = dev_to_hdlc(dev);
-       pvc_device *pvc;
+       struct pvc_device *pvc;
        u8 rxseq, txseq;
        int lmi = state(hdlc)->settings.lmi;
        int dce = state(hdlc)->settings.dce;
@@ -869,10 +870,10 @@ static int fr_rx(struct sk_buff *skb)
 {
        struct net_device *frad = skb->dev;
        hdlc_device *hdlc = dev_to_hdlc(frad);
-       fr_hdr *fh = (fr_hdr*)skb->data;
+       struct fr_hdr *fh = (struct fr_hdr *)skb->data;
        u8 *data = skb->data;
        u16 dlci;
-       pvc_device *pvc;
+       struct pvc_device *pvc;
        struct net_device *dev = NULL;
 
        if (skb->len <= 4 || fh->ea1 || data[2] != FR_UI)
@@ -1028,7 +1029,7 @@ static void fr_stop(struct net_device *dev)
 static void fr_close(struct net_device *dev)
 {
        hdlc_device *hdlc = dev_to_hdlc(dev);
-       pvc_device *pvc = state(hdlc)->first_pvc;
+       struct pvc_device *pvc = state(hdlc)->first_pvc;
 
        while (pvc) {           /* Shutdown all PVCs for this FRAD */
                if (pvc->main)
@@ -1060,7 +1061,7 @@ static const struct net_device_ops pvc_ops = {
 static int fr_add_pvc(struct net_device *frad, unsigned int dlci, int type)
 {
        hdlc_device *hdlc = dev_to_hdlc(frad);
-       pvc_device *pvc;
+       struct pvc_device *pvc;
        struct net_device *dev;
        int used;
 
@@ -1117,7 +1118,7 @@ static int fr_add_pvc(struct net_device *frad, unsigned int dlci, int type)
 
 static int fr_del_pvc(hdlc_device *hdlc, unsigned int dlci, int type)
 {
-       pvc_device *pvc;
+       struct pvc_device *pvc;
        struct net_device *dev;
 
        if ((pvc = find_pvc(hdlc, dlci)) == NULL)
@@ -1145,13 +1146,13 @@ static int fr_del_pvc(hdlc_device *hdlc, unsigned int dlci, int type)
 static void fr_destroy(struct net_device *frad)
 {
        hdlc_device *hdlc = dev_to_hdlc(frad);
-       pvc_device *pvc = state(hdlc)->first_pvc;
+       struct pvc_device *pvc = state(hdlc)->first_pvc;
        state(hdlc)->first_pvc = NULL; /* All PVCs destroyed */
        state(hdlc)->dce_pvc_count = 0;
        state(hdlc)->dce_changed = 1;
 
        while (pvc) {
-               pvc_device *next = pvc->next;
+               struct pvc_device *next = pvc->next;
                /* destructors will free_netdev() main and ether */
                if (pvc->main)
                        unregister_netdevice(pvc->main);
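
Among the hdlc_fr conversions, note the kzalloc() call switching from
sizeof(pvc_device) to sizeof(*pvc): sizing the allocation by the dereferenced
pointer stays correct if the variable's type ever changes, which is exactly
the hazard a tree-wide type rename creates.

        pvc = kzalloc(sizeof(struct pvc_device), GFP_ATOMIC);  /* brittle   */
        pvc = kzalloc(sizeof(*pvc), GFP_ATOMIC);               /* preferred */
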
index f76aa9081585542321a423162662109edd742e47..1287c3eb33c2fa67c3a9885ac84e592c22340b4d 100644 (file)
@@ -54,24 +54,24 @@ static const char* version = "wanXL serial card driver version: 0.48";
 #define MBX2_MEMSZ_MASK 0xFFFF0000 /* PUTS Memory Size Register mask */
 
 
-typedef struct {
+struct port {
        struct net_device *dev;
-       struct card_t *card;
+       struct card *card;
        spinlock_t lock;        /* for wanxl_xmit */
         int node;              /* physical port #0 - 3 */
        unsigned int clock_type;
        int tx_in, tx_out;
        struct sk_buff *tx_skbs[TX_BUFFERS];
-}port_t;
+};
 
 
-typedef struct {
+struct card_status {
        desc_t rx_descs[RX_QUEUE_LENGTH];
        port_status_t port_status[4];
-}card_status_t;
+};
 
 
-typedef struct card_t {
+struct card {
        int n_ports;            /* 1, 2 or 4 ports */
        u8 irq;
 
@@ -79,20 +79,20 @@ typedef struct card_t {
        struct pci_dev *pdev;   /* for pci_name(pdev) */
        int rx_in;
        struct sk_buff *rx_skbs[RX_QUEUE_LENGTH];
-       card_status_t *status;  /* shared between host and card */
+       struct card_status *status;     /* shared between host and card */
        dma_addr_t status_address;
-       port_t ports[0];        /* 1 - 4 port_t structures follow */
-}card_t;
+       struct port ports[0];   /* 1 - 4 port structures follow */
+};
 
 
 
-static inline port_t* dev_to_port(struct net_device *dev)
+static inline struct port *dev_to_port(struct net_device *dev)
 {
-        return (port_t *)dev_to_hdlc(dev)->priv;
+       return (struct port *)dev_to_hdlc(dev)->priv;
 }
 
 
-static inline port_status_t* get_status(port_t *port)
+static inline port_status_t *get_status(struct port *port)
 {
        return &port->card->status->port_status[port->node];
 }
@@ -115,7 +115,7 @@ static inline dma_addr_t pci_map_single_debug(struct pci_dev *pdev, void *ptr,
 
 
 /* Cable and/or personality module change interrupt service */
-static inline void wanxl_cable_intr(port_t *port)
+static inline void wanxl_cable_intr(struct port *port)
 {
        u32 value = get_status(port)->cable;
        int valid = 1;
@@ -160,7 +160,7 @@ static inline void wanxl_cable_intr(port_t *port)
 
 
 /* Transmit complete interrupt service */
-static inline void wanxl_tx_intr(port_t *port)
+static inline void wanxl_tx_intr(struct port *port)
 {
        struct net_device *dev = port->dev;
        while (1) {
@@ -193,7 +193,7 @@ static inline void wanxl_tx_intr(port_t *port)
 
 
 /* Receive complete interrupt service */
-static inline void wanxl_rx_intr(card_t *card)
+static inline void wanxl_rx_intr(struct card *card)
 {
        desc_t *desc;
        while (desc = &card->status->rx_descs[card->rx_in],
@@ -203,7 +203,7 @@ static inline void wanxl_rx_intr(card_t *card)
                                pci_name(card->pdev));
                else {
                        struct sk_buff *skb = card->rx_skbs[card->rx_in];
-                       port_t *port = &card->ports[desc->stat &
+                       struct port *port = &card->ports[desc->stat &
                                                    PACKET_PORT_MASK];
                        struct net_device *dev = port->dev;
 
@@ -245,7 +245,7 @@ static inline void wanxl_rx_intr(card_t *card)
 
 static irqreturn_t wanxl_intr(int irq, void* dev_id)
 {
-        card_t *card = dev_id;
+       struct card *card = dev_id;
         int i;
         u32 stat;
         int handled = 0;
@@ -272,7 +272,7 @@ static irqreturn_t wanxl_intr(int irq, void* dev_id)
 
 static netdev_tx_t wanxl_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-        port_t *port = dev_to_port(dev);
+       struct port *port = dev_to_port(dev);
        desc_t *desc;
 
         spin_lock(&port->lock);
@@ -319,7 +319,7 @@ static netdev_tx_t wanxl_xmit(struct sk_buff *skb, struct net_device *dev)
 static int wanxl_attach(struct net_device *dev, unsigned short encoding,
                        unsigned short parity)
 {
-       port_t *port = dev_to_port(dev);
+       struct port *port = dev_to_port(dev);
 
        if (encoding != ENCODING_NRZ &&
            encoding != ENCODING_NRZI)
@@ -343,7 +343,7 @@ static int wanxl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
        const size_t size = sizeof(sync_serial_settings);
        sync_serial_settings line;
-       port_t *port = dev_to_port(dev);
+       struct port *port = dev_to_port(dev);
 
        if (cmd != SIOCWANDEV)
                return hdlc_ioctl(dev, ifr, cmd);
@@ -393,7 +393,7 @@ static int wanxl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 
 static int wanxl_open(struct net_device *dev)
 {
-       port_t *port = dev_to_port(dev);
+       struct port *port = dev_to_port(dev);
        u8 __iomem *dbr = port->card->plx + PLX_DOORBELL_TO_CARD;
        unsigned long timeout;
        int i;
@@ -429,7 +429,7 @@ static int wanxl_open(struct net_device *dev)
 
 static int wanxl_close(struct net_device *dev)
 {
-       port_t *port = dev_to_port(dev);
+       struct port *port = dev_to_port(dev);
        unsigned long timeout;
        int i;
 
@@ -467,7 +467,7 @@ static int wanxl_close(struct net_device *dev)
 
 static struct net_device_stats *wanxl_get_stats(struct net_device *dev)
 {
-       port_t *port = dev_to_port(dev);
+       struct port *port = dev_to_port(dev);
 
        dev->stats.rx_over_errors = get_status(port)->rx_overruns;
        dev->stats.rx_frame_errors = get_status(port)->rx_frame_errors;
@@ -478,7 +478,7 @@ static struct net_device_stats *wanxl_get_stats(struct net_device *dev)
 
 
 
-static int wanxl_puts_command(card_t *card, u32 cmd)
+static int wanxl_puts_command(struct card *card, u32 cmd)
 {
        unsigned long timeout = jiffies + 5 * HZ;
 
@@ -495,7 +495,7 @@ static int wanxl_puts_command(card_t *card, u32 cmd)
 
 
 
-static void wanxl_reset(card_t *card)
+static void wanxl_reset(struct card *card)
 {
        u32 old_value = readl(card->plx + PLX_CONTROL) & ~PLX_CTL_RESET;
 
@@ -511,7 +511,7 @@ static void wanxl_reset(card_t *card)
 
 static void wanxl_pci_remove_one(struct pci_dev *pdev)
 {
-       card_t *card = pci_get_drvdata(pdev);
+       struct card *card = pci_get_drvdata(pdev);
        int i;
 
        for (i = 0; i < card->n_ports; i++) {
@@ -537,7 +537,7 @@ static void wanxl_pci_remove_one(struct pci_dev *pdev)
                iounmap(card->plx);
 
        if (card->status)
-               pci_free_consistent(pdev, sizeof(card_status_t),
+               pci_free_consistent(pdev, sizeof(struct card_status),
                                    card->status, card->status_address);
 
        pci_release_regions(pdev);
@@ -560,7 +560,7 @@ static const struct net_device_ops wanxl_ops = {
 static int wanxl_pci_init_one(struct pci_dev *pdev,
                              const struct pci_device_id *ent)
 {
-       card_t *card;
+       struct card *card;
        u32 ramsize, stat;
        unsigned long timeout;
        u32 plx_phy;            /* PLX PCI base address */
@@ -601,7 +601,7 @@ static int wanxl_pci_init_one(struct pci_dev *pdev,
        default: ports = 4;
        }
 
-       alloc_size = sizeof(card_t) + ports * sizeof(port_t);
+       alloc_size = sizeof(struct card) + ports * sizeof(struct port);
        card = kzalloc(alloc_size, GFP_KERNEL);
        if (card == NULL) {
                pci_release_regions(pdev);
@@ -612,7 +612,8 @@ static int wanxl_pci_init_one(struct pci_dev *pdev,
        pci_set_drvdata(pdev, card);
        card->pdev = pdev;
 
-       card->status = pci_alloc_consistent(pdev, sizeof(card_status_t),
+       card->status = pci_alloc_consistent(pdev,
+                                           sizeof(struct card_status),
                                            &card->status_address);
        if (card->status == NULL) {
                wanxl_pci_remove_one(pdev);
@@ -766,7 +767,7 @@ static int wanxl_pci_init_one(struct pci_dev *pdev,
 
        for (i = 0; i < ports; i++) {
                hdlc_device *hdlc;
-               port_t *port = &card->ports[i];
+               struct port *port = &card->ports[i];
                struct net_device *dev = alloc_hdlcdev(port);
                if (!dev) {
                        pr_err("%s: unable to allocate memory\n",
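
The wanXL driver allocates its per-card state and one to four trailing
struct port slots in a single block; ports[0] is the old GNU zero-length
spelling of what C99 calls a flexible array member. A sketch of the sizing
arithmetic, reusing the names from the hunks above:

        static struct card *card_alloc(int n_ports)
        {
                struct card *card;

                card = kzalloc(sizeof(*card) +
                               n_ports * sizeof(card->ports[0]), GFP_KERNEL);
                if (card)
                        card->n_ports = n_ports;
                return card;
        }
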
index 7e9ede6c5798a83c20c82a0e3ca087f74d80442b..d9ed22b4cc6b60dfe0a59d32ed44b78923ec483f 100644 (file)
@@ -56,18 +56,18 @@ static void airo_release(struct pcmcia_device *link);
 
 static void airo_detach(struct pcmcia_device *p_dev);
 
-typedef struct local_info_t {
+struct local_info {
        struct net_device *eth_dev;
-} local_info_t;
+};
 
 static int airo_probe(struct pcmcia_device *p_dev)
 {
-       local_info_t *local;
+       struct local_info *local;
 
        dev_dbg(&p_dev->dev, "airo_attach()\n");
 
        /* Allocate space for private device-specific data */
-       local = kzalloc(sizeof(local_info_t), GFP_KERNEL);
+       local = kzalloc(sizeof(*local), GFP_KERNEL);
        if (!local)
                return -ENOMEM;
 
@@ -82,10 +82,11 @@ static void airo_detach(struct pcmcia_device *link)
 
        airo_release(link);
 
-       if (((local_info_t *)link->priv)->eth_dev) {
-               stop_airo_card(((local_info_t *)link->priv)->eth_dev, 0);
+       if (((struct local_info *)link->priv)->eth_dev) {
+               stop_airo_card(((struct local_info *)link->priv)->eth_dev,
+                              0);
        }
-       ((local_info_t *)link->priv)->eth_dev = NULL;
+       ((struct local_info *)link->priv)->eth_dev = NULL;
 
        kfree(link->priv);
 } /* airo_detach */
@@ -101,7 +102,7 @@ static int airo_cs_config_check(struct pcmcia_device *p_dev, void *priv_data)
 
 static int airo_config(struct pcmcia_device *link)
 {
-       local_info_t *dev;
+       struct local_info *dev;
        int ret;
 
        dev = link->priv;
@@ -121,10 +122,10 @@ static int airo_config(struct pcmcia_device *link)
        ret = pcmcia_enable_device(link);
        if (ret)
                goto failed;
-       ((local_info_t *)link->priv)->eth_dev =
+       ((struct local_info *)link->priv)->eth_dev =
                init_airo_card(link->irq,
                               link->resource[0]->start, 1, &link->dev);
-       if (!((local_info_t *)link->priv)->eth_dev)
+       if (!((struct local_info *)link->priv)->eth_dev)
                goto failed;
 
        return 0;
@@ -142,7 +143,7 @@ static void airo_release(struct pcmcia_device *link)
 
 static int airo_suspend(struct pcmcia_device *link)
 {
-       local_info_t *local = link->priv;
+       struct local_info *local = link->priv;
 
        netif_device_detach(local->eth_dev);
 
@@ -151,7 +152,7 @@ static int airo_suspend(struct pcmcia_device *link)
 
 static int airo_resume(struct pcmcia_device *link)
 {
-       local_info_t *local = link->priv;
+       struct local_info *local = link->priv;
 
        if (link->open) {
                reset_airo_card(local->eth_dev);
index 1fe41af81a597a4a90c75aac5557567725b09fc5..9183f1cf89a76d9cce88a9da9b68dc570592bd58 100644 (file)
@@ -2598,11 +2598,11 @@ static const iw_handler atmel_private_handler[] =
        NULL,                           /* SIOCIWFIRSTPRIV */
 };
 
-typedef struct atmel_priv_ioctl {
+struct atmel_priv_ioctl {
        char id[32];
        unsigned char __user *data;
        unsigned short len;
-} atmel_priv_ioctl;
+};
 
 #define ATMELFWL       SIOCIWFIRSTPRIV
 #define ATMELIDIFC     ATMELFWL + 1
@@ -2615,7 +2615,7 @@ static const struct iw_priv_args atmel_private_args[] = {
                .cmd = ATMELFWL,
                .set_args = IW_PRIV_TYPE_BYTE
                                | IW_PRIV_SIZE_FIXED
-                               | sizeof (atmel_priv_ioctl),
+                               | sizeof(struct atmel_priv_ioctl),
                .get_args = IW_PRIV_TYPE_NONE,
                .name = "atmelfwl"
        }, {
@@ -2645,7 +2645,7 @@ static int atmel_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
        int i, rc = 0;
        struct atmel_private *priv = netdev_priv(dev);
-       atmel_priv_ioctl com;
+       struct atmel_priv_ioctl com;
        struct iwreq *wrq = (struct iwreq *) rq;
        unsigned char *new_firmware;
        char domain[REGDOMAINSZ + 1];
index 48a55cda979b0f31ed8c8f94f30c0616fda76538..bfd10cb9c8def782e412d6317f4b5b2cb23d0ef6 100644 (file)
@@ -78,12 +78,8 @@ int xenvif_poll(struct napi_struct *napi, int budget)
        /* This vif is rogue, we pretend there is nothing to do
         * for this vif to deschedule it from NAPI. But this interface
         * will be turned off in thread context later.
-        * Also, if a guest doesn't post enough slots to receive data on one of
-        * its queues, the carrier goes down and NAPI is descheduled here so
-        * the guest can't send more packets until it's ready to receive.
         */
-       if (unlikely(queue->vif->disabled ||
-                    !netif_carrier_ok(queue->vif->dev))) {
+       if (unlikely(queue->vif->disabled)) {
                napi_complete(napi);
                return 0;
        }
index aa2093325be1d625f34db77df00db6899d80304c..4734472aa6201a5dac4ac8c0f42e4faba30dcf88 100644 (file)
@@ -2025,9 +2025,15 @@ int xenvif_kthread_guest_rx(void *data)
                 * context so we defer it here, if this thread is
                 * associated with queue 0.
                 */
-               if (unlikely(queue->vif->disabled && queue->id == 0))
+               if (unlikely(queue->vif->disabled && queue->id == 0)) {
                        xenvif_carrier_off(queue->vif);
-               else if (unlikely(test_and_clear_bit(QUEUE_STATUS_RX_PURGE_EVENT,
+               } else if (unlikely(queue->vif->disabled)) {
+                       /* kthread_stop() will be called on this thread soon;
+                        * be a bit proactive
+                        */
+                       skb_queue_purge(&queue->rx_queue);
+                       queue->rx_last_skb_slots = 0;
+               } else if (unlikely(test_and_clear_bit(QUEUE_STATUS_RX_PURGE_EVENT,
                                                     &queue->status))) {
                        xenvif_rx_purge_event(queue);
                } else if (!netif_carrier_ok(queue->vif->dev)) {
index 28204bc4f369550bc816a81fcf6b933d71e52af4..ca82f545ec2ca12678ca4f4ce6669c0b564f8bb2 100644 (file)
@@ -628,9 +628,10 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
        slots = DIV_ROUND_UP(offset + len, PAGE_SIZE) +
                xennet_count_skb_frag_slots(skb);
        if (unlikely(slots > MAX_SKB_FRAGS + 1)) {
-               net_alert_ratelimited(
-                       "xennet: skb rides the rocket: %d slots\n", slots);
-               goto drop;
+               net_dbg_ratelimited("xennet: skb rides the rocket: %d slots, %d bytes\n",
+                                   slots, skb->len);
+               if (skb_linearize(skb))
+                       goto drop;
        }
 
        spin_lock_irqsave(&queue->tx_lock, flags);
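
The xen-netfront change turns an over-fragmented skb from a hard drop into a
recoverable case: skb_linearize() pulls the paged fragments into the linear
head so the frame no longer needs more ring slots than the frontend can post,
and only an allocation failure still drops the packet. The helper itself is
tiny (paraphrased from include/linux/skbuff.h):

        /* Copy any paged fragments into the linear data area.
         * Returns 0 on success or -ENOMEM. */
        static inline int skb_linearize(struct sk_buff *skb)
        {
                return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0;
        }
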
index 2dcb0541012d0133a417cc18ff2cd7ec2acbee2f..5160c4eb73c2e3788cd310693be3969b88642ea9 100644 (file)
@@ -9,7 +9,8 @@ menu "Device Tree and Open Firmware support"
 
 config OF_SELFTEST
        bool "Device Tree Runtime self tests"
-       depends on OF_IRQ
+       depends on OF_IRQ && OF_EARLY_FLATTREE
+       select OF_DYNAMIC
        help
          This option builds in test cases for the device tree infrastructure
          that are executed once at boot time, and the results dumped to the
index 099b1fb00af4ac6fd09be4341919cfeb84e45634..2b6a7b129d10b2092cb9e891158908c86c82fcfa 100644 (file)
@@ -1,11 +1,13 @@
 obj-y = base.o device.o platform.o
+obj-$(CONFIG_OF_DYNAMIC) += dynamic.o
 obj-$(CONFIG_OF_FLATTREE) += fdt.o
 obj-$(CONFIG_OF_EARLY_FLATTREE) += fdt_address.o
 obj-$(CONFIG_OF_PROMTREE) += pdt.o
 obj-$(CONFIG_OF_ADDRESS)  += address.o
 obj-$(CONFIG_OF_IRQ)    += irq.o
 obj-$(CONFIG_OF_NET)   += of_net.o
-obj-$(CONFIG_OF_SELFTEST) += selftest.o
+obj-$(CONFIG_OF_SELFTEST) += of_selftest.o
+of_selftest-objs := selftest.o testcase-data/testcases.dtb.o
 obj-$(CONFIG_OF_MDIO)  += of_mdio.o
 obj-$(CONFIG_OF_PCI)   += of_pci.o
 obj-$(CONFIG_OF_PCI_IRQ)  += of_pci_irq.o
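
These kbuild changes stage the refactoring visible in the base.c hunks below:
the dynamic-node lifetime helpers move into a new dynamic.o (built only for
CONFIG_OF_DYNAMIC), and the selftest becomes a composite object that links a
compiled device-tree blob (testcases.dtb.o) directly into the module. The
lifetime helpers themselves are thin kobject wrappers, as the removed base.c
lines show; schematically:

        struct device_node *of_node_get(struct device_node *node)
        {
                if (node)
                        kobject_get(&node->kobj); /* ref lives in the kobject */
                return node;
        }
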
index b9864806e9b811a0c3cc3b0e16404a19fad271eb..d8574adf0d62d446c02011970096e74c921ec6f1 100644 (file)
@@ -17,6 +17,7 @@
  *      as published by the Free Software Foundation; either version
  *      2 of the License, or (at your option) any later version.
  */
+#include <linux/console.h>
 #include <linux/ctype.h>
 #include <linux/cpu.h>
 #include <linux/module.h>
@@ -35,15 +36,17 @@ struct device_node *of_allnodes;
 EXPORT_SYMBOL(of_allnodes);
 struct device_node *of_chosen;
 struct device_node *of_aliases;
-static struct device_node *of_stdout;
+struct device_node *of_stdout;
 
-static struct kset *of_kset;
+struct kset *of_kset;
 
 /*
- * Used to protect the of_aliases; but also overloaded to hold off addition of
- * nodes to sysfs
+ * Used to protect the of_aliases, to hold off addition of nodes to sysfs.
+ * This mutex must be held whenever modifications are being made to the
+ * device tree. The of_{attach,detach}_node() and
+ * of_{add,remove,update}_property() helpers make sure this happens.
  */
-DEFINE_MUTEX(of_aliases_mutex);
+DEFINE_MUTEX(of_mutex);
 
 /* use when traversing tree through the allnext, child, sibling,
  * or parent members of struct device_node.
@@ -89,79 +92,7 @@ int __weak of_node_to_nid(struct device_node *np)
 }
 #endif
 
-#if defined(CONFIG_OF_DYNAMIC)
-/**
- *     of_node_get - Increment refcount of a node
- *     @node:  Node to inc refcount, NULL is supported to
- *             simplify writing of callers
- *
- *     Returns node.
- */
-struct device_node *of_node_get(struct device_node *node)
-{
-       if (node)
-               kobject_get(&node->kobj);
-       return node;
-}
-EXPORT_SYMBOL(of_node_get);
-
-static inline struct device_node *kobj_to_device_node(struct kobject *kobj)
-{
-       return container_of(kobj, struct device_node, kobj);
-}
-
-/**
- *     of_node_release - release a dynamically allocated node
- *     @kref:  kref element of the node to be released
- *
- *     In of_node_put() this function is passed to kref_put()
- *     as the destructor.
- */
-static void of_node_release(struct kobject *kobj)
-{
-       struct device_node *node = kobj_to_device_node(kobj);
-       struct property *prop = node->properties;
-
-       /* We should never be releasing nodes that haven't been detached. */
-       if (!of_node_check_flag(node, OF_DETACHED)) {
-               pr_err("ERROR: Bad of_node_put() on %s\n", node->full_name);
-               dump_stack();
-               return;
-       }
-
-       if (!of_node_check_flag(node, OF_DYNAMIC))
-               return;
-
-       while (prop) {
-               struct property *next = prop->next;
-               kfree(prop->name);
-               kfree(prop->value);
-               kfree(prop);
-               prop = next;
-
-               if (!prop) {
-                       prop = node->deadprops;
-                       node->deadprops = NULL;
-               }
-       }
-       kfree(node->full_name);
-       kfree(node->data);
-       kfree(node);
-}
-
-/**
- *     of_node_put - Decrement refcount of a node
- *     @node:  Node to dec refcount, NULL is supported to
- *             simplify writing of callers
- *
- */
-void of_node_put(struct device_node *node)
-{
-       if (node)
-               kobject_put(&node->kobj);
-}
-EXPORT_SYMBOL(of_node_put);
-#else
+#ifndef CONFIG_OF_DYNAMIC
 static void of_node_release(struct kobject *kobj)
 {
        /* Without CONFIG_OF_DYNAMIC, no nodes get freed */
@@ -200,13 +131,16 @@ static const char *safe_name(struct kobject *kobj, const char *orig_name)
        return name;
 }
 
-static int __of_add_property_sysfs(struct device_node *np, struct property *pp)
+int __of_add_property_sysfs(struct device_node *np, struct property *pp)
 {
        int rc;
 
        /* Important: Don't leak passwords */
        bool secure = strncmp(pp->name, "security-", 9) == 0;
 
+       if (!of_kset || !of_node_is_attached(np))
+               return 0;
+
        sysfs_bin_attr_init(&pp->attr);
        pp->attr.attr.name = safe_name(&np->kobj, pp->name);
        pp->attr.attr.mode = secure ? S_IRUSR : S_IRUGO;
@@ -218,12 +152,15 @@ static int __of_add_property_sysfs(struct device_node *np, struct property *pp)
        return rc;
 }
 
-static int __of_node_add(struct device_node *np)
+int __of_attach_node_sysfs(struct device_node *np)
 {
        const char *name;
        struct property *pp;
        int rc;
 
+       if (!of_kset)
+               return 0;
+
        np->kobj.kset = of_kset;
        if (!np->parent) {
                /* Nodes without parents are new top level trees */
@@ -245,59 +182,20 @@ static int __of_node_add(struct device_node *np)
        return 0;
 }
 
-int of_node_add(struct device_node *np)
-{
-       int rc = 0;
-
-       BUG_ON(!of_node_is_initialized(np));
-
-       /*
-        * Grab the mutex here so that in a race condition between of_init() and
-        * of_node_add(), node addition will still be consistent.
-        */
-       mutex_lock(&of_aliases_mutex);
-       if (of_kset)
-               rc = __of_node_add(np);
-       else
-               /* This scenario may be perfectly valid, but report it anyway */
-               pr_info("of_node_add(%s) before of_init()\n", np->full_name);
-       mutex_unlock(&of_aliases_mutex);
-       return rc;
-}
-
-#if defined(CONFIG_OF_DYNAMIC)
-static void of_node_remove(struct device_node *np)
-{
-       struct property *pp;
-
-       BUG_ON(!of_node_is_initialized(np));
-
-       /* only remove properties if on sysfs */
-       if (of_node_is_attached(np)) {
-               for_each_property_of_node(np, pp)
-                       sysfs_remove_bin_file(&np->kobj, &pp->attr);
-               kobject_del(&np->kobj);
-       }
-
-       /* finally remove the kobj_init ref */
-       of_node_put(np);
-}
-#endif
-
 static int __init of_init(void)
 {
        struct device_node *np;
 
        /* Create the kset, and register existing nodes */
-       mutex_lock(&of_aliases_mutex);
+       mutex_lock(&of_mutex);
        of_kset = kset_create_and_add("devicetree", NULL, firmware_kobj);
        if (!of_kset) {
-               mutex_unlock(&of_aliases_mutex);
+               mutex_unlock(&of_mutex);
                return -ENOMEM;
        }
        for_each_of_allnodes(np)
-               __of_node_add(np);
-       mutex_unlock(&of_aliases_mutex);
+               __of_attach_node_sysfs(np);
+       mutex_unlock(&of_mutex);
 
        /* Symlink in /proc as required by userspace ABI */
        if (of_allnodes)
@@ -369,8 +267,8 @@ EXPORT_SYMBOL(of_find_all_nodes);
  * Find a property with a given name for a given node
  * and return the value.
  */
-static const void *__of_get_property(const struct device_node *np,
-                                    const char *name, int *lenp)
+const void *__of_get_property(const struct device_node *np,
+                             const char *name, int *lenp)
 {
        struct property *pp = __of_find_property(np, name, lenp);
 
@@ -1748,32 +1646,10 @@ int of_count_phandle_with_args(const struct device_node *np, const char *list_na
 }
 EXPORT_SYMBOL(of_count_phandle_with_args);
 
-#if defined(CONFIG_OF_DYNAMIC)
-static int of_property_notify(int action, struct device_node *np,
-                             struct property *prop)
-{
-       struct of_prop_reconfig pr;
-
-       /* only call notifiers if the node is attached */
-       if (!of_node_is_attached(np))
-               return 0;
-
-       pr.dn = np;
-       pr.prop = prop;
-       return of_reconfig_notify(action, &pr);
-}
-#else
-static int of_property_notify(int action, struct device_node *np,
-                             struct property *prop)
-{
-       return 0;
-}
-#endif
-
 /**
  * __of_add_property - Add a property to a node without lock operations
  */
-static int __of_add_property(struct device_node *np, struct property *prop)
+int __of_add_property(struct device_node *np, struct property *prop)
 {
        struct property **next;
 
@@ -1799,22 +1675,49 @@ int of_add_property(struct device_node *np, struct property *prop)
        unsigned long flags;
        int rc;
 
-       rc = of_property_notify(OF_RECONFIG_ADD_PROPERTY, np, prop);
-       if (rc)
-               return rc;
+       mutex_lock(&of_mutex);
 
        raw_spin_lock_irqsave(&devtree_lock, flags);
        rc = __of_add_property(np, prop);
        raw_spin_unlock_irqrestore(&devtree_lock, flags);
-       if (rc)
-               return rc;
 
-       if (of_node_is_attached(np))
+       if (!rc)
                __of_add_property_sysfs(np, prop);
 
+       mutex_unlock(&of_mutex);
+
+       if (!rc)
+               of_property_notify(OF_RECONFIG_ADD_PROPERTY, np, prop, NULL);
+
        return rc;
 }
 
+int __of_remove_property(struct device_node *np, struct property *prop)
+{
+       struct property **next;
+
+       for (next = &np->properties; *next; next = &(*next)->next) {
+               if (*next == prop)
+                       break;
+       }
+       if (*next == NULL)
+               return -ENODEV;
+
+       /* found the node */
+       *next = prop->next;
+       prop->next = np->deadprops;
+       np->deadprops = prop;
+
+       return 0;
+}
+
+void __of_remove_property_sysfs(struct device_node *np, struct property *prop)
+{
+       /* at early boot, bail here and defer setup to of_init() */
+       if (of_kset && of_node_is_attached(np))
+               sysfs_remove_bin_file(&np->kobj, &prop->attr);
+}
+
 /**
  * of_remove_property - Remove a property from a node.
  *
@@ -1825,211 +1728,98 @@ int of_add_property(struct device_node *np, struct property *prop)
  */
 int of_remove_property(struct device_node *np, struct property *prop)
 {
-       struct property **next;
        unsigned long flags;
-       int found = 0;
        int rc;
 
-       rc = of_property_notify(OF_RECONFIG_REMOVE_PROPERTY, np, prop);
-       if (rc)
-               return rc;
+       mutex_lock(&of_mutex);
 
        raw_spin_lock_irqsave(&devtree_lock, flags);
-       next = &np->properties;
-       while (*next) {
-               if (*next == prop) {
-                       /* found the node */
-                       *next = prop->next;
-                       prop->next = np->deadprops;
-                       np->deadprops = prop;
-                       found = 1;
-                       break;
-               }
-               next = &(*next)->next;
-       }
+       rc = __of_remove_property(np, prop);
        raw_spin_unlock_irqrestore(&devtree_lock, flags);
 
-       if (!found)
-               return -ENODEV;
+       if (!rc)
+               __of_remove_property_sysfs(np, prop);
 
-       /* at early boot, bail hear and defer setup to of_init() */
-       if (!of_kset)
-               return 0;
+       mutex_unlock(&of_mutex);
 
-       sysfs_remove_bin_file(&np->kobj, &prop->attr);
+       if (!rc)
+               of_property_notify(OF_RECONFIG_REMOVE_PROPERTY, np, prop, NULL);
 
-       return 0;
+       return rc;
 }
 
-/*
- * of_update_property - Update a property in a node, if the property does
- * not exist, add it.
- *
- * Note that we don't actually remove it, since we have given out
- * who-knows-how-many pointers to the data using get-property.
- * Instead we just move the property to the "dead properties" list,
- * and add the new property to the property list
- */
-int of_update_property(struct device_node *np, struct property *newprop)
+int __of_update_property(struct device_node *np, struct property *newprop,
+               struct property **oldpropp)
 {
        struct property **next, *oldprop;
-       unsigned long flags;
-       int rc;
-
-       rc = of_property_notify(OF_RECONFIG_UPDATE_PROPERTY, np, newprop);
-       if (rc)
-               return rc;
 
-       if (!newprop->name)
-               return -EINVAL;
+       for (next = &np->properties; *next; next = &(*next)->next) {
+               if (of_prop_cmp((*next)->name, newprop->name) == 0)
+                       break;
+       }
+       *oldpropp = oldprop = *next;
 
-       raw_spin_lock_irqsave(&devtree_lock, flags);
-       next = &np->properties;
-       oldprop = __of_find_property(np, newprop->name, NULL);
-       if (!oldprop) {
-               /* add the new node */
-               rc = __of_add_property(np, newprop);
-       } else while (*next) {
+       if (oldprop) {
                /* replace the node */
-               if (*next == oldprop) {
-                       newprop->next = oldprop->next;
-                       *next = newprop;
-                       oldprop->next = np->deadprops;
-                       np->deadprops = oldprop;
-                       break;
-               }
-               next = &(*next)->next;
+               newprop->next = oldprop->next;
+               *next = newprop;
+               oldprop->next = np->deadprops;
+               np->deadprops = oldprop;
+       } else {
+               /* new node */
+               newprop->next = NULL;
+               *next = newprop;
        }
-       raw_spin_unlock_irqrestore(&devtree_lock, flags);
-       if (rc)
-               return rc;
 
+       return 0;
+}
+
+void __of_update_property_sysfs(struct device_node *np, struct property *newprop,
+               struct property *oldprop)
+{
        /* At early boot, bail out and defer setup to of_init() */
        if (!of_kset)
-               return 0;
+               return;
 
-       /* Update the sysfs attribute */
        if (oldprop)
                sysfs_remove_bin_file(&np->kobj, &oldprop->attr);
        __of_add_property_sysfs(np, newprop);
-
-       return 0;
 }
 
-#if defined(CONFIG_OF_DYNAMIC)
 /*
- * Support for dynamic device trees.
+ * of_update_property - Update a property in a node, if the property does
+ * not exist, add it.
  *
- * On some platforms, the device tree can be manipulated at runtime.
- * The routines in this section support adding, removing and changing
- * device tree nodes.
- */
-
-static BLOCKING_NOTIFIER_HEAD(of_reconfig_chain);
-
-int of_reconfig_notifier_register(struct notifier_block *nb)
-{
-       return blocking_notifier_chain_register(&of_reconfig_chain, nb);
-}
-EXPORT_SYMBOL_GPL(of_reconfig_notifier_register);
-
-int of_reconfig_notifier_unregister(struct notifier_block *nb)
-{
-       return blocking_notifier_chain_unregister(&of_reconfig_chain, nb);
-}
-EXPORT_SYMBOL_GPL(of_reconfig_notifier_unregister);
-
-int of_reconfig_notify(unsigned long action, void *p)
-{
-       int rc;
-
-       rc = blocking_notifier_call_chain(&of_reconfig_chain, action, p);
-       return notifier_to_errno(rc);
-}
-
-/**
- * of_attach_node - Plug a device node into the tree and global list.
+ * Note that we don't actually remove it, since we have given out
+ * who-knows-how-many pointers to the data using get-property.
+ * Instead we just move the property to the "dead properties" list,
+ * and add the new property to the property list
  */
-int of_attach_node(struct device_node *np)
+int of_update_property(struct device_node *np, struct property *newprop)
 {
+       struct property *oldprop;
        unsigned long flags;
        int rc;
 
-       rc = of_reconfig_notify(OF_RECONFIG_ATTACH_NODE, np);
-       if (rc)
-               return rc;
-
-       raw_spin_lock_irqsave(&devtree_lock, flags);
-       np->sibling = np->parent->child;
-       np->allnext = np->parent->allnext;
-       np->parent->allnext = np;
-       np->parent->child = np;
-       of_node_clear_flag(np, OF_DETACHED);
-       raw_spin_unlock_irqrestore(&devtree_lock, flags);
-
-       of_node_add(np);
-       return 0;
-}
-
-/**
- * of_detach_node - "Unplug" a node from the device tree.
- *
- * The caller must hold a reference to the node.  The memory associated with
- * the node is not freed until its refcount goes to zero.
- */
-int of_detach_node(struct device_node *np)
-{
-       struct device_node *parent;
-       unsigned long flags;
-       int rc = 0;
+       if (!newprop->name)
+               return -EINVAL;
 
-       rc = of_reconfig_notify(OF_RECONFIG_DETACH_NODE, np);
-       if (rc)
-               return rc;
+       mutex_lock(&of_mutex);
 
        raw_spin_lock_irqsave(&devtree_lock, flags);
+       rc = __of_update_property(np, newprop, &oldprop);
+       raw_spin_unlock_irqrestore(&devtree_lock, flags);
 
-       if (of_node_check_flag(np, OF_DETACHED)) {
-               /* someone already detached it */
-               raw_spin_unlock_irqrestore(&devtree_lock, flags);
-               return rc;
-       }
-
-       parent = np->parent;
-       if (!parent) {
-               raw_spin_unlock_irqrestore(&devtree_lock, flags);
-               return rc;
-       }
+       if (!rc)
+               __of_update_property_sysfs(np, newprop, oldprop);
 
-       if (of_allnodes == np)
-               of_allnodes = np->allnext;
-       else {
-               struct device_node *prev;
-               for (prev = of_allnodes;
-                    prev->allnext != np;
-                    prev = prev->allnext)
-                       ;
-               prev->allnext = np->allnext;
-       }
+       mutex_unlock(&of_mutex);
 
-       if (parent->child == np)
-               parent->child = np->sibling;
-       else {
-               struct device_node *prevsib;
-               for (prevsib = np->parent->child;
-                    prevsib->sibling != np;
-                    prevsib = prevsib->sibling)
-                       ;
-               prevsib->sibling = np->sibling;
-       }
-
-       of_node_set_flag(np, OF_DETACHED);
-       raw_spin_unlock_irqrestore(&devtree_lock, flags);
+       if (!rc)
+               of_property_notify(OF_RECONFIG_UPDATE_PROPERTY, np, newprop, oldprop);
 
-       of_node_remove(np);
        return rc;
 }
-#endif /* defined(CONFIG_OF_DYNAMIC) */
 
 static void of_alias_add(struct alias_prop *ap, struct device_node *np,
                         int id, const char *stem, int stem_len)
@@ -2062,9 +1852,12 @@ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align))
                of_chosen = of_find_node_by_path("/chosen@0");
 
        if (of_chosen) {
+               /* linux,stdout-path and /aliases/stdout are for legacy compatibility */
                const char *name = of_get_property(of_chosen, "stdout-path", NULL);
                if (!name)
                        name = of_get_property(of_chosen, "linux,stdout-path", NULL);
+               if (IS_ENABLED(CONFIG_PPC) && !name)
+                       name = of_get_property(of_aliases, "stdout", NULL);
                if (name)
                        of_stdout = of_find_node_by_path(name);
        }
@@ -2122,7 +1915,7 @@ int of_alias_get_id(struct device_node *np, const char *stem)
        struct alias_prop *app;
        int id = -ENODEV;
 
-       mutex_lock(&of_aliases_mutex);
+       mutex_lock(&of_mutex);
        list_for_each_entry(app, &aliases_lookup, link) {
                if (strcmp(app->stem, stem) != 0)
                        continue;
@@ -2132,7 +1925,7 @@ int of_alias_get_id(struct device_node *np, const char *stem)
                        break;
                }
        }
-       mutex_unlock(&of_aliases_mutex);
+       mutex_unlock(&of_mutex);
 
        return id;
 }
@@ -2180,20 +1973,22 @@ const char *of_prop_next_string(struct property *prop, const char *cur)
 EXPORT_SYMBOL_GPL(of_prop_next_string);
 
 /**
- * of_device_is_stdout_path - check if a device node matches the
- *                            linux,stdout-path property
- *
- * Check if this device node matches the linux,stdout-path property
- * in the chosen node. return true if yes, false otherwise.
+ * of_console_check() - Test and setup console for DT setup
+ * @dn - Pointer to device node
+ * @name - Name to use for preferred console without index, e.g. "ttyS"
+ * @index - Index to use for preferred console.
+ *
+ * Check if the given device node matches the stdout-path property in the
+ * /chosen node. If it does, register it as the preferred console and return
+ * true. Otherwise return false.
  */
-int of_device_is_stdout_path(struct device_node *dn)
+bool of_console_check(struct device_node *dn, char *name, int index)
 {
-       if (!of_stdout)
+       if (!dn || dn != of_stdout || console_set_on_cmdline)
                return false;
-
-       return of_stdout == dn;
+       return add_preferred_console(name, index, NULL);
 }
-EXPORT_SYMBOL_GPL(of_device_is_stdout_path);
+EXPORT_SYMBOL_GPL(of_console_check);
 
 /**
  *     of_find_next_cache_node - Find a node's subsidiary cache
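
As a usage sketch, a serial driver could invoke the new helper from its probe path roughly as below; the "ttyS" name and the port->line index are illustrative assumptions, not taken from this patch:

#include <linux/of.h>
#include <linux/serial_core.h>

/* Sketch: make this port the preferred console when its device node is
 * the /chosen stdout-path and no console= was given on the command line
 * (of_console_check() performs both checks). */
static void example_setup_console(struct uart_port *port)
{
	if (port->dev && port->dev->of_node)
		of_console_check(port->dev->of_node, "ttyS", port->line);
}
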
index dafb9736ab9b59bae3a8cd52e1daa303ea076907..46d6c75c14040903edf9b14f2ec2030c179b2c1e 100644 (file)
@@ -160,7 +160,7 @@ void of_device_uevent(struct device *dev, struct kobj_uevent_env *env)
        add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen);
 
        seen = 0;
-       mutex_lock(&of_aliases_mutex);
+       mutex_lock(&of_mutex);
        list_for_each_entry(app, &aliases_lookup, link) {
                if (dev->of_node == app->np) {
                        add_uevent_var(env, "OF_ALIAS_%d=%s", seen,
@@ -168,7 +168,7 @@ void of_device_uevent(struct device *dev, struct kobj_uevent_env *env)
                        seen++;
                }
        }
-       mutex_unlock(&of_aliases_mutex);
+       mutex_unlock(&of_mutex);
 }
 
 int of_device_uevent_modalias(struct device *dev, struct kobj_uevent_env *env)
diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
new file mode 100644 (file)
index 0000000..54fecc4
--- /dev/null
@@ -0,0 +1,660 @@
+/*
+ * Support for dynamic device trees.
+ *
+ * On some platforms, the device tree can be manipulated at runtime.
+ * The routines in this section support adding, removing and changing
+ * device tree nodes.
+ */
+
+#include <linux/of.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/proc_fs.h>
+
+#include "of_private.h"
+
+/**
+ * of_node_get() - Increment refcount of a node
+ * @node:      Node to inc refcount, NULL is supported to simplify writing of
+ *             callers
+ *
+ * Returns node.
+ */
+struct device_node *of_node_get(struct device_node *node)
+{
+       if (node)
+               kobject_get(&node->kobj);
+       return node;
+}
+EXPORT_SYMBOL(of_node_get);
+
+/**
+ * of_node_put() - Decrement refcount of a node
+ * @node:      Node to dec refcount, NULL is supported to simplify writing of
+ *             callers
+ */
+void of_node_put(struct device_node *node)
+{
+       if (node)
+               kobject_put(&node->kobj);
+}
+EXPORT_SYMBOL(of_node_put);
+
+void __of_detach_node_sysfs(struct device_node *np)
+{
+       struct property *pp;
+
+       BUG_ON(!of_node_is_initialized(np));
+       if (!of_kset)
+               return;
+
+       /* only remove properties if on sysfs */
+       if (of_node_is_attached(np)) {
+               for_each_property_of_node(np, pp)
+                       sysfs_remove_bin_file(&np->kobj, &pp->attr);
+               kobject_del(&np->kobj);
+       }
+
+       /* finally remove the kobj_init ref */
+       of_node_put(np);
+}
+
+static BLOCKING_NOTIFIER_HEAD(of_reconfig_chain);
+
+int of_reconfig_notifier_register(struct notifier_block *nb)
+{
+       return blocking_notifier_chain_register(&of_reconfig_chain, nb);
+}
+EXPORT_SYMBOL_GPL(of_reconfig_notifier_register);
+
+int of_reconfig_notifier_unregister(struct notifier_block *nb)
+{
+       return blocking_notifier_chain_unregister(&of_reconfig_chain, nb);
+}
+EXPORT_SYMBOL_GPL(of_reconfig_notifier_unregister);
+
+int of_reconfig_notify(unsigned long action, void *p)
+{
+       int rc;
+
+       rc = blocking_notifier_call_chain(&of_reconfig_chain, action, p);
+       return notifier_to_errno(rc);
+}
+
+int of_property_notify(int action, struct device_node *np,
+                      struct property *prop, struct property *oldprop)
+{
+       struct of_prop_reconfig pr;
+
+       /* only call notifiers if the node is attached */
+       if (!of_node_is_attached(np))
+               return 0;
+
+       pr.dn = np;
+       pr.prop = prop;
+       pr.old_prop = oldprop;
+       return of_reconfig_notify(action, &pr);
+}
+
+void __of_attach_node(struct device_node *np)
+{
+       const __be32 *phandle;
+       int sz;
+
+       np->name = __of_get_property(np, "name", NULL) ? : "<NULL>";
+       np->type = __of_get_property(np, "device_type", NULL) ? : "<NULL>";
+
+       phandle = __of_get_property(np, "phandle", &sz);
+       if (!phandle)
+               phandle = __of_get_property(np, "linux,phandle", &sz);
+       if (IS_ENABLED(CONFIG_PPC_PSERIES) && !phandle)
+               phandle = __of_get_property(np, "ibm,phandle", &sz);
+       np->phandle = (phandle && (sz >= 4)) ? be32_to_cpup(phandle) : 0;
+
+       np->child = NULL;
+       np->sibling = np->parent->child;
+       np->allnext = np->parent->allnext;
+       np->parent->allnext = np;
+       np->parent->child = np;
+       of_node_clear_flag(np, OF_DETACHED);
+}
+
+/**
+ * of_attach_node() - Plug a device node into the tree and global list.
+ */
+int of_attach_node(struct device_node *np)
+{
+       unsigned long flags;
+
+       mutex_lock(&of_mutex);
+       raw_spin_lock_irqsave(&devtree_lock, flags);
+       __of_attach_node(np);
+       raw_spin_unlock_irqrestore(&devtree_lock, flags);
+
+       __of_attach_node_sysfs(np);
+       mutex_unlock(&of_mutex);
+
+       of_reconfig_notify(OF_RECONFIG_ATTACH_NODE, np);
+
+       return 0;
+}
+
+void __of_detach_node(struct device_node *np)
+{
+       struct device_node *parent;
+
+       if (WARN_ON(of_node_check_flag(np, OF_DETACHED)))
+               return;
+
+       parent = np->parent;
+       if (WARN_ON(!parent))
+               return;
+
+       if (of_allnodes == np)
+               of_allnodes = np->allnext;
+       else {
+               struct device_node *prev;
+               for (prev = of_allnodes;
+                    prev->allnext != np;
+                    prev = prev->allnext)
+                       ;
+               prev->allnext = np->allnext;
+       }
+
+       if (parent->child == np)
+               parent->child = np->sibling;
+       else {
+               struct device_node *prevsib;
+               for (prevsib = np->parent->child;
+                    prevsib->sibling != np;
+                    prevsib = prevsib->sibling)
+                       ;
+               prevsib->sibling = np->sibling;
+       }
+
+       of_node_set_flag(np, OF_DETACHED);
+}
+
+/**
+ * of_detach_node() - "Unplug" a node from the device tree.
+ *
+ * The caller must hold a reference to the node.  The memory associated with
+ * the node is not freed until its refcount goes to zero.
+ */
+int of_detach_node(struct device_node *np)
+{
+       unsigned long flags;
+       int rc = 0;
+
+       mutex_lock(&of_mutex);
+       raw_spin_lock_irqsave(&devtree_lock, flags);
+       __of_detach_node(np);
+       raw_spin_unlock_irqrestore(&devtree_lock, flags);
+
+       __of_detach_node_sysfs(np);
+       mutex_unlock(&of_mutex);
+
+       of_reconfig_notify(OF_RECONFIG_DETACH_NODE, np);
+
+       return rc;
+}
+
+/**
+ * of_node_release() - release a dynamically allocated node
+ * @kref: kref element of the node to be released
+ *
+ * In of_node_put() this function is passed to kref_put() as the destructor.
+ */
+void of_node_release(struct kobject *kobj)
+{
+       struct device_node *node = kobj_to_device_node(kobj);
+       struct property *prop = node->properties;
+
+       /* We should never be releasing nodes that haven't been detached. */
+       if (!of_node_check_flag(node, OF_DETACHED)) {
+               pr_err("ERROR: Bad of_node_put() on %s\n", node->full_name);
+               dump_stack();
+               return;
+       }
+
+       if (!of_node_check_flag(node, OF_DYNAMIC))
+               return;
+
+       while (prop) {
+               struct property *next = prop->next;
+               kfree(prop->name);
+               kfree(prop->value);
+               kfree(prop);
+               prop = next;
+
+               if (!prop) {
+                       prop = node->deadprops;
+                       node->deadprops = NULL;
+               }
+       }
+       kfree(node->full_name);
+       kfree(node->data);
+       kfree(node);
+}
+
+/**
+ * __of_prop_dup - Copy a property dynamically.
+ * @prop:      Property to copy
+ * @allocflags:        Allocation flags (typically pass GFP_KERNEL)
+ *
+ * Copy a property by dynamically allocating the memory for both the
+ * property structure and the property name & contents. The property's
+ * flags have the OF_DYNAMIC bit set so that we can tell dynamically
+ * allocated properties apart from statically allocated ones.
+ * Returns the newly allocated property or NULL on out of memory error.
+ */
+struct property *__of_prop_dup(const struct property *prop, gfp_t allocflags)
+{
+       struct property *new;
+
+       new = kzalloc(sizeof(*new), allocflags);
+       if (!new)
+               return NULL;
+
+       /*
+        * NOTE: There is no check for zero length value.
+        * In case of a boolean property, this will allocate a value
+        * of zero bytes. We do this to work around the use
+        * of of_get_property() calls on boolean values.
+        */
+       new->name = kstrdup(prop->name, allocflags);
+       new->value = kmemdup(prop->value, prop->length, allocflags);
+       new->length = prop->length;
+       if (!new->name || !new->value)
+               goto err_free;
+
+       /* mark the property as dynamic */
+       of_property_set_flag(new, OF_DYNAMIC);
+
+       return new;
+
+ err_free:
+       kfree(new->name);
+       kfree(new->value);
+       kfree(new);
+       return NULL;
+}
+
+/**
+ * __of_node_alloc() - Create an empty device node dynamically.
+ * @full_name: Full name of the new device node
+ * @allocflags:        Allocation flags (typically pass GFP_KERNEL)
+ *
+ * Create an empty device tree node, suitable for further modification.
+ * The node data are dynamically allocated and all the node flags
+ * have the OF_DYNAMIC & OF_DETACHED bits set.
+ * Returns the newly allocated node or NULL on out of memory error.
+ */
+struct device_node *__of_node_alloc(const char *full_name, gfp_t allocflags)
+{
+       struct device_node *node;
+
+       node = kzalloc(sizeof(*node), allocflags);
+       if (!node)
+               return NULL;
+
+       node->full_name = kstrdup(full_name, allocflags);
+       of_node_set_flag(node, OF_DYNAMIC);
+       of_node_set_flag(node, OF_DETACHED);
+       if (!node->full_name)
+               goto err_free;
+
+       of_node_init(node);
+
+       return node;
+
+ err_free:
+       kfree(node->full_name);
+       kfree(node);
+       return NULL;
+}
+
+static void __of_changeset_entry_destroy(struct of_changeset_entry *ce)
+{
+       of_node_put(ce->np);
+       list_del(&ce->node);
+       kfree(ce);
+}
+
+#ifdef DEBUG
+static void __of_changeset_entry_dump(struct of_changeset_entry *ce)
+{
+       switch (ce->action) {
+       case OF_RECONFIG_ADD_PROPERTY:
+               pr_debug("%p: %s %s/%s\n",
+                       ce, "ADD_PROPERTY   ", ce->np->full_name,
+                       ce->prop->name);
+               break;
+       case OF_RECONFIG_REMOVE_PROPERTY:
+               pr_debug("%p: %s %s/%s\n",
+                       ce, "REMOVE_PROPERTY", ce->np->full_name,
+                       ce->prop->name);
+               break;
+       case OF_RECONFIG_UPDATE_PROPERTY:
+               pr_debug("%p: %s %s/%s\n",
+                       ce, "UPDATE_PROPERTY", ce->np->full_name,
+                       ce->prop->name);
+               break;
+       case OF_RECONFIG_ATTACH_NODE:
+               pr_debug("%p: %s %s\n",
+                       ce, "ATTACH_NODE    ", ce->np->full_name);
+               break;
+       case OF_RECONFIG_DETACH_NODE:
+               pr_debug("%p: %s %s\n",
+                       ce, "DETACH_NODE    ", ce->np->full_name);
+               break;
+       }
+}
+#else
+static inline void __of_changeset_entry_dump(struct of_changeset_entry *ce)
+{
+       /* empty */
+}
+#endif
+
+static void __of_changeset_entry_invert(struct of_changeset_entry *ce,
+                                         struct of_changeset_entry *rce)
+{
+       memcpy(rce, ce, sizeof(*rce));
+
+       switch (ce->action) {
+       case OF_RECONFIG_ATTACH_NODE:
+               rce->action = OF_RECONFIG_DETACH_NODE;
+               break;
+       case OF_RECONFIG_DETACH_NODE:
+               rce->action = OF_RECONFIG_ATTACH_NODE;
+               break;
+       case OF_RECONFIG_ADD_PROPERTY:
+               rce->action = OF_RECONFIG_REMOVE_PROPERTY;
+               break;
+       case OF_RECONFIG_REMOVE_PROPERTY:
+               rce->action = OF_RECONFIG_ADD_PROPERTY;
+               break;
+       case OF_RECONFIG_UPDATE_PROPERTY:
+               rce->old_prop = ce->prop;
+               rce->prop = ce->old_prop;
+               break;
+       }
+}
+
+static void __of_changeset_entry_notify(struct of_changeset_entry *ce, bool revert)
+{
+       struct of_changeset_entry ce_inverted;
+       int ret;
+
+       if (revert) {
+               __of_changeset_entry_invert(ce, &ce_inverted);
+               ce = &ce_inverted;
+       }
+
+       switch (ce->action) {
+       case OF_RECONFIG_ATTACH_NODE:
+       case OF_RECONFIG_DETACH_NODE:
+               ret = of_reconfig_notify(ce->action, ce->np);
+               break;
+       case OF_RECONFIG_ADD_PROPERTY:
+       case OF_RECONFIG_REMOVE_PROPERTY:
+       case OF_RECONFIG_UPDATE_PROPERTY:
+               ret = of_property_notify(ce->action, ce->np, ce->prop, ce->old_prop);
+               break;
+       default:
+               pr_err("%s: invalid devicetree changeset action: %i\n", __func__,
+                       (int)ce->action);
+               return;
+       }
+
+       if (ret)
+               pr_err("%s: notifier error @%s\n", __func__, ce->np->full_name);
+}
+
+static int __of_changeset_entry_apply(struct of_changeset_entry *ce)
+{
+       struct property *old_prop, **propp;
+       unsigned long flags;
+       int ret = 0;
+
+       __of_changeset_entry_dump(ce);
+
+       raw_spin_lock_irqsave(&devtree_lock, flags);
+       switch (ce->action) {
+       case OF_RECONFIG_ATTACH_NODE:
+               __of_attach_node(ce->np);
+               break;
+       case OF_RECONFIG_DETACH_NODE:
+               __of_detach_node(ce->np);
+               break;
+       case OF_RECONFIG_ADD_PROPERTY:
+               /* If the property is in deadprops then it must be removed */
+               for (propp = &ce->np->deadprops; *propp; propp = &(*propp)->next) {
+                       if (*propp == ce->prop) {
+                               *propp = ce->prop->next;
+                               ce->prop->next = NULL;
+                               break;
+                       }
+               }
+
+               ret = __of_add_property(ce->np, ce->prop);
+               if (ret) {
+                       pr_err("%s: add_property failed @%s/%s\n",
+                               __func__, ce->np->full_name,
+                               ce->prop->name);
+                       break;
+               }
+               break;
+       case OF_RECONFIG_REMOVE_PROPERTY:
+               ret = __of_remove_property(ce->np, ce->prop);
+               if (ret) {
+                       pr_err("%s: remove_property failed @%s/%s\n",
+                               __func__, ce->np->full_name,
+                               ce->prop->name);
+                       break;
+               }
+               break;
+
+       case OF_RECONFIG_UPDATE_PROPERTY:
+               /* If the property is in deadprops then it must be removed */
+               for (propp = &ce->np->deadprops; *propp; propp = &(*propp)->next) {
+                       if (*propp == ce->prop) {
+                               *propp = ce->prop->next;
+                               ce->prop->next = NULL;
+                               break;
+                       }
+               }
+
+               ret = __of_update_property(ce->np, ce->prop, &old_prop);
+               if (ret) {
+                       pr_err("%s: update_property failed @%s/%s\n",
+                               __func__, ce->np->full_name,
+                               ce->prop->name);
+                       break;
+               }
+               break;
+       default:
+               ret = -EINVAL;
+       }
+       raw_spin_unlock_irqrestore(&devtree_lock, flags);
+
+       if (ret)
+               return ret;
+
+       switch (ce->action) {
+       case OF_RECONFIG_ATTACH_NODE:
+               __of_attach_node_sysfs(ce->np);
+               break;
+       case OF_RECONFIG_DETACH_NODE:
+               __of_detach_node_sysfs(ce->np);
+               break;
+       case OF_RECONFIG_ADD_PROPERTY:
+               /* ignore duplicate names */
+               __of_add_property_sysfs(ce->np, ce->prop);
+               break;
+       case OF_RECONFIG_REMOVE_PROPERTY:
+               __of_remove_property_sysfs(ce->np, ce->prop);
+               break;
+       case OF_RECONFIG_UPDATE_PROPERTY:
+               __of_update_property_sysfs(ce->np, ce->prop, ce->old_prop);
+               break;
+       }
+
+       return 0;
+}
+
+static inline int __of_changeset_entry_revert(struct of_changeset_entry *ce)
+{
+       struct of_changeset_entry ce_inverted;
+
+       __of_changeset_entry_invert(ce, &ce_inverted);
+       return __of_changeset_entry_apply(&ce_inverted);
+}
+
+/**
+ * of_changeset_init - Initialize a changeset for use
+ *
+ * @ocs:       changeset pointer
+ *
+ * Initialize a changeset structure
+ */
+void of_changeset_init(struct of_changeset *ocs)
+{
+       memset(ocs, 0, sizeof(*ocs));
+       INIT_LIST_HEAD(&ocs->entries);
+}
+
+/**
+ * of_changeset_destroy - Destroy a changeset
+ *
+ * @ocs:       changeset pointer
+ *
+ * Destroys a changeset. Note that if a changeset is applied,
+ * its changes to the tree cannot be reverted.
+ */
+void of_changeset_destroy(struct of_changeset *ocs)
+{
+       struct of_changeset_entry *ce, *cen;
+
+       list_for_each_entry_safe_reverse(ce, cen, &ocs->entries, node)
+               __of_changeset_entry_destroy(ce);
+}
+
+/**
+ * of_changeset_apply - Applies a changeset
+ *
+ * @ocs:       changeset pointer
+ *
+ * Applies a changeset to the live tree.
+ * Any side effects of live tree state changes are applied here on
+ * success, such as creation/destruction of devices and creation of
+ * sysfs properties and directories.
+ * Returns 0 on success, a negative error value in case of an error.
+ * On error the partially applied effects are reverted.
+ */
+int of_changeset_apply(struct of_changeset *ocs)
+{
+       struct of_changeset_entry *ce;
+       int ret;
+
+       /* perform the rest of the work */
+       pr_debug("of_changeset: applying...\n");
+       list_for_each_entry(ce, &ocs->entries, node) {
+               ret = __of_changeset_entry_apply(ce);
+               if (ret) {
+                       pr_err("%s: Error applying changeset (%d)\n", __func__, ret);
+                       list_for_each_entry_continue_reverse(ce, &ocs->entries, node)
+                               __of_changeset_entry_revert(ce);
+                       return ret;
+               }
+       }
+       pr_debug("of_changeset: applied, emitting notifiers.\n");
+
+       /* drop the global lock while emitting notifiers */
+       mutex_unlock(&of_mutex);
+       list_for_each_entry(ce, &ocs->entries, node)
+               __of_changeset_entry_notify(ce, 0);
+       mutex_lock(&of_mutex);
+       pr_debug("of_changeset: notifiers sent.\n");
+
+       return 0;
+}
+
+/**
+ * of_changeset_revert - Reverts an applied changeset
+ *
+ * @ocs:       changeset pointer
+ *
+ * Reverts a changeset, returning the tree to the state it was in
+ * before the changeset was applied.
+ * Any side effects, such as destruction of devices and removal of
+ * sysfs properties and directories, are applied as well.
+ * Returns 0 on success, a negative error value in case of an error.
+ */
+int of_changeset_revert(struct of_changeset *ocs)
+{
+       struct of_changeset_entry *ce;
+       int ret;
+
+       pr_debug("of_changeset: reverting...\n");
+       list_for_each_entry_reverse(ce, &ocs->entries, node) {
+               ret = __of_changeset_entry_revert(ce);
+               if (ret) {
+                       pr_err("%s: Error reverting changeset (%d)\n", __func__, ret);
+                       list_for_each_entry_continue(ce, &ocs->entries, node)
+                               __of_changeset_entry_apply(ce);
+                       return ret;
+               }
+       }
+       pr_debug("of_changeset: reverted, emitting notifiers.\n");
+
+       /* drop the global lock while emitting notifiers */
+       mutex_unlock(&of_mutex);
+       list_for_each_entry_reverse(ce, &ocs->entries, node)
+               __of_changeset_entry_notify(ce, 1);
+       mutex_lock(&of_mutex);
+       pr_debug("of_changeset: notifiers sent.\n");
+
+       return 0;
+}
+
+/**
+ * of_changeset_action - Perform a changeset action
+ *
+ * @ocs:       changeset pointer
+ * @action:    action to perform
+ * @np:                Pointer to device node
+ * @prop:      Pointer to property
+ *
+ * @action must be one of:
+ * + OF_RECONFIG_ATTACH_NODE
+ * + OF_RECONFIG_DETACH_NODE
+ * + OF_RECONFIG_ADD_PROPERTY
+ * + OF_RECONFIG_REMOVE_PROPERTY
+ * + OF_RECONFIG_UPDATE_PROPERTY
+ * Returns 0 on success, a negative error value in case of an error.
+ */
+int of_changeset_action(struct of_changeset *ocs, unsigned long action,
+               struct device_node *np, struct property *prop)
+{
+       struct of_changeset_entry *ce;
+
+       ce = kzalloc(sizeof(*ce), GFP_KERNEL);
+       if (!ce) {
+               pr_err("%s: Failed to allocate\n", __func__);
+               return -ENOMEM;
+       }
+       /* get a reference to the node */
+       ce->action = action;
+       ce->np = of_node_get(np);
+       ce->prop = prop;
+
+       if (action == OF_RECONFIG_UPDATE_PROPERTY && prop)
+               ce->old_prop = of_find_property(np, prop->name, NULL);
+
+       /* add it to the list */
+       list_add_tail(&ce->node, &ocs->entries);
+       return 0;
+}
index 9aa012e6ea0a6ed988150624bacc0eb2b8066c06..f46a24ffa3fe7be040d488bc49bfba912848fde1 100644 (file)
@@ -923,24 +923,24 @@ int __init early_init_dt_scan_chosen(unsigned long node, const char *uname,
 }
 
 #ifdef CONFIG_HAVE_MEMBLOCK
+#define MAX_PHYS_ADDR  ((phys_addr_t)~0)
+
 void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size)
 {
        const u64 phys_offset = __pa(PAGE_OFFSET);
        base &= PAGE_MASK;
        size &= PAGE_MASK;
 
-       if (sizeof(phys_addr_t) < sizeof(u64)) {
-               if (base > ULONG_MAX) {
-                       pr_warning("Ignoring memory block 0x%llx - 0x%llx\n",
-                                       base, base + size);
-                       return;
-               }
+       if (base > MAX_PHYS_ADDR) {
+               pr_warning("Ignoring memory block 0x%llx - 0x%llx\n",
+                               base, base + size);
+               return;
+       }
 
-               if (base + size > ULONG_MAX) {
-                       pr_warning("Ignoring memory range 0x%lx - 0x%llx\n",
-                                       ULONG_MAX, base + size);
-                       size = ULONG_MAX - base;
-               }
+       if (base + size > MAX_PHYS_ADDR) {
+               pr_warning("Ignoring memory range 0x%lx - 0x%llx\n",
+                               ULONG_MAX, base + size);
+               size = MAX_PHYS_ADDR - base;
        }
 
        if (base + size < phys_offset) {
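
To make the clamping arithmetic concrete, here is a small user-space model of the two checks above, assuming a 32-bit phys_addr_t; the base and size values are illustrative:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t max_phys = 0xffffffffULL; /* (phys_addr_t)~0 on 32-bit */
	uint64_t base = 0xc0000000ULL;
	uint64_t size = 0x80000000ULL;           /* straddles the limit */

	if (base > max_phys)
		return 0;                        /* block ignored entirely */
	if (base + size > max_phys)
		size = max_phys - base;          /* clamped to 0x3fffffff */

	printf("kept 0x%llx bytes at 0x%llx\n",
	       (unsigned long long)size, (unsigned long long)base);
	return 0;
}
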
index ff350c8fa7acc4398c088ddc5d346906df3a5efd..858e0a5d9a115fc1a53b3d324207420d2e29ff40 100644 (file)
@@ -31,6 +31,63 @@ struct alias_prop {
        char stem[0];
 };
 
-extern struct mutex of_aliases_mutex;
+extern struct mutex of_mutex;
 extern struct list_head aliases_lookup;
+extern struct kset *of_kset;
+
+
+static inline struct device_node *kobj_to_device_node(struct kobject *kobj)
+{
+       return container_of(kobj, struct device_node, kobj);
+}
+
+#if defined(CONFIG_OF_DYNAMIC)
+extern int of_property_notify(int action, struct device_node *np,
+                             struct property *prop, struct property *old_prop);
+extern void of_node_release(struct kobject *kobj);
+#else /* CONFIG_OF_DYNAMIC */
+static inline int of_property_notify(int action, struct device_node *np,
+                                    struct property *prop, struct property *old_prop)
+{
+       return 0;
+}
+#endif /* CONFIG_OF_DYNAMIC */
+
+/**
+ * General utilities for working with live trees.
+ *
+ * All functions with two leading underscores operate
+ * without taking node references, so you either have to
+ * own the devtree lock or work on detached trees only.
+ */
+struct property *__of_prop_dup(const struct property *prop, gfp_t allocflags);
+struct device_node *__of_node_alloc(const char *full_name, gfp_t allocflags);
+
+extern const void *__of_get_property(const struct device_node *np,
+                                    const char *name, int *lenp);
+extern int __of_add_property(struct device_node *np, struct property *prop);
+extern int __of_add_property_sysfs(struct device_node *np,
+               struct property *prop);
+extern int __of_remove_property(struct device_node *np, struct property *prop);
+extern void __of_remove_property_sysfs(struct device_node *np,
+               struct property *prop);
+extern int __of_update_property(struct device_node *np,
+               struct property *newprop, struct property **oldprop);
+extern void __of_update_property_sysfs(struct device_node *np,
+               struct property *newprop, struct property *oldprop);
+
+extern void __of_attach_node(struct device_node *np);
+extern int __of_attach_node_sysfs(struct device_node *np);
+extern void __of_detach_node(struct device_node *np);
+extern void __of_detach_node_sysfs(struct device_node *np);
+
+/* iterators for transactions, used for overlays */
+/* forward iterator */
+#define for_each_transaction_entry(_oft, _te) \
+       list_for_each_entry(_te, &(_oft)->te_list, node)
+
+/* reverse iterator */
+#define for_each_transaction_entry_reverse(_oft, _te) \
+       list_for_each_entry_reverse(_te, &(_oft)->te_list, node)
+
 #endif /* _LINUX_OF_PRIVATE_H */
index 632aae8613756b6d1c433485e6ad0d075f11af66..59fb12e84e6bb8da22f62c6de25817cf3d743682 100644 (file)
@@ -206,8 +206,16 @@ void __init fdt_init_reserved_mem(void)
        for (i = 0; i < reserved_mem_count; i++) {
                struct reserved_mem *rmem = &reserved_mem[i];
                unsigned long node = rmem->fdt_node;
+               int len;
+               const __be32 *prop;
                int err = 0;
 
+               prop = of_get_flat_dt_prop(node, "phandle", &len);
+               if (!prop)
+                       prop = of_get_flat_dt_prop(node, "linux,phandle", &len);
+               if (prop)
+                       rmem->phandle = of_read_number(prop, len/4);
+
                if (rmem->size == 0)
                        err = __reserved_mem_alloc_size(node, rmem->name,
                                                 &rmem->base, &rmem->size);
@@ -215,3 +223,65 @@ void __init fdt_init_reserved_mem(void)
                        __reserved_mem_init_node(rmem);
        }
 }
+
+static inline struct reserved_mem *__find_rmem(struct device_node *node)
+{
+       unsigned int i;
+
+       if (!node->phandle)
+               return NULL;
+
+       for (i = 0; i < reserved_mem_count; i++)
+               if (reserved_mem[i].phandle == node->phandle)
+                       return &reserved_mem[i];
+       return NULL;
+}
+
+/**
+ * of_reserved_mem_device_init() - assign reserved memory region to given device
+ *
+ * This function assigns the memory region pointed to by the "memory-region"
+ * device tree property to the given device.
+ */
+void of_reserved_mem_device_init(struct device *dev)
+{
+       struct reserved_mem *rmem;
+       struct device_node *np;
+
+       np = of_parse_phandle(dev->of_node, "memory-region", 0);
+       if (!np)
+               return;
+
+       rmem = __find_rmem(np);
+       of_node_put(np);
+
+       if (!rmem || !rmem->ops || !rmem->ops->device_init)
+               return;
+
+       rmem->ops->device_init(rmem, dev);
+       dev_info(dev, "assigned reserved memory node %s\n", rmem->name);
+}
+
+/**
+ * of_reserved_mem_device_release() - release reserved memory device structures
+ *
+ * This function releases structures allocated for memory region handling for
+ * the given device.
+ */
+void of_reserved_mem_device_release(struct device *dev)
+{
+       struct reserved_mem *rmem;
+       struct device_node *np;
+
+       np = of_parse_phandle(dev->of_node, "memory-region", 0);
+       if (!np)
+               return;
+
+       rmem = __find_rmem(np);
+       of_node_put(np);
+
+       if (!rmem || !rmem->ops || !rmem->ops->device_release)
+               return;
+
+       rmem->ops->device_release(rmem, dev);
+}
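
On the consumer side, a driver whose node carries a "memory-region" phandle would pair the two helpers across probe and remove, roughly as in this sketch (driver and function names are illustrative):

#include <linux/of_reserved_mem.h>
#include <linux/platform_device.h>

static int example_probe(struct platform_device *pdev)
{
	/* Looks up "memory-region" on pdev->dev.of_node and, if the
	 * matching region has ops, runs its device_init hook. */
	of_reserved_mem_device_init(&pdev->dev);
	return 0;
}

static int example_remove(struct platform_device *pdev)
{
	of_reserved_mem_device_release(&pdev->dev);
	return 0;
}
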
index 500436f9be7f8257e17965e53eeb1dcf276221d3..0197725e033a6a8d667c6a9e1e593c60010b3608 100644 (file)
@@ -422,6 +422,7 @@ static int of_platform_bus_create(struct device_node *bus,
                        break;
                }
        }
+       of_node_set_flag(bus, OF_POPULATED_BUS);
        return rc;
 }
 
@@ -508,19 +509,13 @@ EXPORT_SYMBOL_GPL(of_platform_populate);
 
 static int of_platform_device_destroy(struct device *dev, void *data)
 {
-       bool *children_left = data;
-
        /* Do not touch devices not populated from the device tree */
-       if (!dev->of_node || !of_node_check_flag(dev->of_node, OF_POPULATED)) {
-               *children_left = true;
+       if (!dev->of_node || !of_node_check_flag(dev->of_node, OF_POPULATED))
                return 0;
-       }
 
-       /* Recurse, but don't touch this device if it has any children left */
-       if (of_platform_depopulate(dev) != 0) {
-               *children_left = true;
-               return 0;
-       }
+       /* Recurse for any nodes that were treated as busses */
+       if (of_node_check_flag(dev->of_node, OF_POPULATED_BUS))
+               device_for_each_child(dev, NULL, of_platform_device_destroy);
 
        if (dev->bus == &platform_bus_type)
                platform_device_unregister(to_platform_device(dev));
@@ -528,19 +523,15 @@ static int of_platform_device_destroy(struct device *dev, void *data)
        else if (dev->bus == &amba_bustype)
                amba_device_unregister(to_amba_device(dev));
 #endif
-       else {
-               *children_left = true;
-               return 0;
-       }
 
        of_node_clear_flag(dev->of_node, OF_POPULATED);
-
+       of_node_clear_flag(dev->of_node, OF_POPULATED_BUS);
        return 0;
 }
 
 /**
  * of_platform_depopulate() - Remove devices populated from device tree
- * @parent: device which childred will be removed
+ * @parent: device which children will be removed
  *
  * Complementary to of_platform_populate(), this function removes children
 * of the given device (and, recursively, their children) that have been
@@ -550,14 +541,9 @@ static int of_platform_device_destroy(struct device *dev, void *data)
 * All children populated from the device tree are removed; the function
 * returns no value.
  */
-int of_platform_depopulate(struct device *parent)
+void of_platform_depopulate(struct device *parent)
 {
-       bool children_left = false;
-
-       device_for_each_child(parent, &children_left,
-                             of_platform_device_destroy);
-
-       return children_left ? -EBUSY : 0;
+       device_for_each_child(parent, NULL, of_platform_device_destroy);
 }
 EXPORT_SYMBOL_GPL(of_platform_depopulate);
 
index 077314eebb95c785b1f1cb8f46790f6004464ee3..d410026678334885bae9e51893fde5f9873a7831 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/of_fdt.h>
 #include <linux/of_irq.h>
 #include <linux/of_platform.h>
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/device.h>
 
+#include "of_private.h"
+
 static struct selftest_results {
        int passed;
        int failed;
 } selftest_results;
 
+#define NO_OF_NODES 2
+static struct device_node *nodes[NO_OF_NODES];
+static int last_node_index;
+
 #define selftest(result, fmt, ...) { \
        if (!(result)) { \
                selftest_results.failed++; \
@@ -266,6 +273,81 @@ static void __init of_selftest_property_match_string(void)
        selftest(rc == -EILSEQ, "unterminated string; rc=%i", rc);
 }
 
+#define propcmp(p1, p2) (((p1)->length == (p2)->length) && \
+                       (p1)->value && (p2)->value && \
+                       !memcmp((p1)->value, (p2)->value, (p1)->length) && \
+                       !strcmp((p1)->name, (p2)->name))
+static void __init of_selftest_property_copy(void)
+{
+#ifdef CONFIG_OF_DYNAMIC
+       struct property p1 = { .name = "p1", .length = 0, .value = "" };
+       struct property p2 = { .name = "p2", .length = 5, .value = "abcd" };
+       struct property *new;
+
+       new = __of_prop_dup(&p1, GFP_KERNEL);
+       selftest(new && propcmp(&p1, new), "empty property didn't copy correctly\n");
+       kfree(new->value);
+       kfree(new->name);
+       kfree(new);
+
+       new = __of_prop_dup(&p2, GFP_KERNEL);
+       selftest(new && propcmp(&p2, new), "non-empty property didn't copy correctly\n");
+       kfree(new->value);
+       kfree(new->name);
+       kfree(new);
+#endif
+}
+
+static void __init of_selftest_changeset(void)
+{
+#ifdef CONFIG_OF_DYNAMIC
+       struct property *ppadd, padd = { .name = "prop-add", .length = 0, .value = "" };
+       struct property *ppupdate, pupdate = { .name = "prop-update", .length = 5, .value = "abcd" };
+       struct property *ppremove;
+       struct device_node *n1, *n2, *n21, *nremove, *parent;
+       struct of_changeset chgset;
+
+       n1 = __of_node_alloc("/testcase-data/changeset/n1", GFP_KERNEL);
+       selftest(n1, "testcase setup failure\n");
+       n2 = __of_node_alloc("/testcase-data/changeset/n2", GFP_KERNEL);
+       selftest(n2, "testcase setup failure\n");
+       n21 = __of_node_alloc("/testcase-data/changeset/n2/n21", GFP_KERNEL);
+       selftest(n21, "testcase setup failure %p\n", n21);
+       nremove = of_find_node_by_path("/testcase-data/changeset/node-remove");
+       selftest(nremove, "testcase setup failure\n");
+       ppadd = __of_prop_dup(&padd, GFP_KERNEL);
+       selftest(ppadd, "testcase setup failure\n");
+       ppupdate = __of_prop_dup(&pupdate, GFP_KERNEL);
+       selftest(ppupdate, "testcase setup failure\n");
+       parent = nremove->parent;
+       n1->parent = parent;
+       n2->parent = parent;
+       n21->parent = n2;
+       n2->child = n21;
+       ppremove = of_find_property(parent, "prop-remove", NULL);
+       selftest(ppremove, "failed to find removal prop\n");
+
+       of_changeset_init(&chgset);
+       selftest(!of_changeset_attach_node(&chgset, n1), "fail attach n1\n");
+       selftest(!of_changeset_attach_node(&chgset, n2), "fail attach n2\n");
+       selftest(!of_changeset_detach_node(&chgset, nremove), "fail remove node\n");
+       selftest(!of_changeset_attach_node(&chgset, n21), "fail attach n21\n");
+       selftest(!of_changeset_add_property(&chgset, parent, ppadd), "fail add prop\n");
+       selftest(!of_changeset_update_property(&chgset, parent, ppupdate), "fail update prop\n");
+       selftest(!of_changeset_remove_property(&chgset, parent, ppremove), "fail remove prop\n");
+       mutex_lock(&of_mutex);
+       selftest(!of_changeset_apply(&chgset), "apply failed\n");
+       mutex_unlock(&of_mutex);
+
+       mutex_lock(&of_mutex);
+       selftest(!of_changeset_revert(&chgset), "revert failed\n");
+       mutex_unlock(&of_mutex);
+
+       of_changeset_destroy(&chgset);
+#endif
+}
+
 static void __init of_selftest_parse_interrupts(void)
 {
        struct device_node *np;
@@ -517,9 +599,156 @@ static void __init of_selftest_platform_populate(void)
        }
 }
 
+/**
+ *     update_node_properties - adds the properties of np to the dup
+ *     node (present in the live tree) and updates the parent of each
+ *     child of np to dup.
+ *
+ *     @np:    node already present in live tree
+ *     @dup:   node present in live tree to be updated
+ */
+static void update_node_properties(struct device_node *np,
+                                       struct device_node *dup)
+{
+       struct property *prop;
+       struct device_node *child;
+
+       for_each_property_of_node(np, prop)
+               of_add_property(dup, prop);
+
+       for_each_child_of_node(np, child)
+               child->parent = dup;
+}
+
+/**
+ *     attach_node_and_children - attaches a node
+ *     and its children to the live tree
+ *
+ *     @np:    Node to attach to live tree
+ */
+static int attach_node_and_children(struct device_node *np)
+{
+       struct device_node *next, *root = np, *dup;
+
+       if (!np) {
+               pr_warn("%s: No tree to attach; not running tests\n",
+                       __func__);
+               return -ENODATA;
+       }
+
+       /* skip root node */
+       np = np->child;
+       /* record the top-level nodes so they can be removed later */
+       dup = np;
+
+       while (dup) {
+               nodes[last_node_index++] = dup;
+               dup = dup->sibling;
+       }
+       dup = NULL;
+
+       while (np) {
+               next = np->allnext;
+               dup = of_find_node_by_path(np->full_name);
+               if (dup)
+                       update_node_properties(np, dup);
+               else {
+                       np->child = NULL;
+                       if (np->parent == root)
+                               np->parent = of_allnodes;
+                       of_attach_node(np);
+               }
+               np = next;
+       }
+
+       return 0;
+}
+
+/**
+ *     selftest_data_add - Reads and copies data from the
+ *     linked-in testcase tree and attaches it to the live tree
+ */
+static int __init selftest_data_add(void)
+{
+       void *selftest_data;
+       struct device_node *selftest_data_node;
+       extern uint8_t __dtb_testcases_begin[];
+       extern uint8_t __dtb_testcases_end[];
+       const int size = __dtb_testcases_end - __dtb_testcases_begin;
+
+       if (!size || !of_allnodes) {
+               pr_warn("%s: No testcase data to attach; not running tests\n",
+                       __func__);
+               return -ENODATA;
+       }
+
+       /* creating copy */
+       selftest_data = kmemdup(__dtb_testcases_begin, size, GFP_KERNEL);
+
+       if (!selftest_data) {
+               pr_warn("%s: Failed to allocate memory for selftest_data; "
+                       "not running tests\n", __func__);
+               return -ENOMEM;
+       }
+       of_fdt_unflatten_tree(selftest_data, &selftest_data_node);
+
+       /* attach the sub-tree to live tree */
+       return attach_node_and_children(selftest_data_node);
+}
+
+/**
+ *     detach_node_and_children - detaches a node
+ *     and its children from the live tree
+ *
+ *     @np:    Node to detach from live tree
+ */
+static void detach_node_and_children(struct device_node *np)
+{
+       while (np->child)
+               detach_node_and_children(np->child);
+
+       while (np->sibling)
+               detach_node_and_children(np->sibling);
+
+       of_detach_node(np);
+}
+
+/**
+ *     selftest_data_remove - removes the selftest data
+ *     nodes from the live tree
+ */
+static void selftest_data_remove(void)
+{
+       struct device_node *np;
+       struct property *prop;
+
+       while (last_node_index >= 0) {
+               if (nodes[last_node_index]) {
+                       np = of_find_node_by_path(nodes[last_node_index]->full_name);
+                       if (strcmp(np->full_name, "/aliases") != 0) {
+                               detach_node_and_children(np->child);
+                               of_detach_node(np);
+                       } else {
+                               for_each_property_of_node(np, prop) {
+                                       if (strcmp(prop->name, "testcase-alias") == 0)
+                                               of_remove_property(np, prop);
+                               }
+                       }
+               }
+               last_node_index--;
+       }
+}
+
 static int __init of_selftest(void)
 {
        struct device_node *np;
+       int res;
+
+       /* add the selftest data to the live tree */
+       res = selftest_data_add();
+       if (res)
+               return res;
 
        np = of_find_node_by_path("/testcase-data/phandle-tests/consumer-a");
        if (!np) {
@@ -533,12 +762,18 @@ static int __init of_selftest(void)
        of_selftest_dynamic();
        of_selftest_parse_phandle_with_args();
        of_selftest_property_match_string();
+       of_selftest_property_copy();
+       of_selftest_changeset();
        of_selftest_parse_interrupts();
        of_selftest_parse_interrupts_extended();
        of_selftest_match_node();
        of_selftest_platform_populate();
        pr_info("end of selftest - %i passed, %i failed\n",
                selftest_results.passed, selftest_results.failed);
+
+       /* remove the selftest data from the live tree */
+       selftest_data_remove();
+
        return 0;
 }
 late_initcall(of_selftest);
diff --git a/drivers/of/testcase-data/testcases.dts b/drivers/of/testcase-data/testcases.dts
new file mode 100644 (file)
index 0000000..219ef93
--- /dev/null
@@ -0,0 +1,15 @@
+/dts-v1/;
+/ {
+       testcase-data {
+               changeset {
+                       prop-update = "hello";
+                       prop-remove = "world";
+                       node-remove {
+                       };
+               };
+       };
+};
+#include "tests-phandle.dtsi"
+#include "tests-interrupts.dtsi"
+#include "tests-match.dtsi"
+#include "tests-platform.dtsi"
diff --git a/drivers/of/testcase-data/testcases.dtsi b/drivers/of/testcase-data/testcases.dtsi
deleted file mode 100644 (file)
index 6d8d980..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "tests-phandle.dtsi"
-#include "tests-interrupts.dtsi"
-#include "tests-match.dtsi"
-#include "tests-platform.dtsi"
index 93aa29f6d39c5b3a06e699f13f72a8c960b7da34..f2945fa73d4ffeeb5010358b5cc63fce32b73a5a 100644 (file)
@@ -375,11 +375,11 @@ static void __exit cleanup_slots(void)
 
 static int __init rpaphp_init(void)
 {
-       struct device_node *dn = NULL;
+       struct device_node *dn;
 
        info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
 
-       while ((dn = of_find_node_by_name(dn, "pci")))
+       for_each_node_by_name(dn, "pci")
                rpaphp_add_slot(dn);
 
        return 0;
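for_each_node_by_name() preserves the reference counting of the open-coded loop it replaces; at the time of this merge it is defined in include/linux/of.h as, roughly:

	#define for_each_node_by_name(dn, name) \
		for (dn = of_find_node_by_name(NULL, name); dn; \
		     dn = of_find_node_by_name(dn, name))

so the conversion is behaviour-preserving (of_find_node_by_name() drops the reference on the node it is handed).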
index f9a13867cb70df87fa58aa48a86645eddbefd9c8..693208eb904721ad7e55bb009097377f685d1efa 100644 (file)
@@ -151,7 +151,7 @@ config KIRKWOOD_THERMAL
 
 config DOVE_THERMAL
        tristate "Temperature sensor on Marvell Dove SoCs"
-       depends on ARCH_DOVE
+       depends on ARCH_DOVE || MACH_DOVE
        depends on OF
        help
          Support for the Dove thermal sensor driver in the Linux thermal
@@ -243,4 +243,9 @@ depends on ARCH_EXYNOS
 source "drivers/thermal/samsung/Kconfig"
 endmenu
 
+menu "STMicroelectronics thermal drivers"
+depends on ARCH_STI && OF
+source "drivers/thermal/st/Kconfig"
+endmenu
+
 endif
index de0636a57a6470e33aa1f057770b43d4c3062358..31e232f84b6ba80fa082d0cfd8ca33a8640a05a8 100644 (file)
@@ -32,3 +32,4 @@ obj-$(CONFIG_X86_PKG_TEMP_THERMAL)    += x86_pkg_temp_thermal.o
 obj-$(CONFIG_INTEL_SOC_DTS_THERMAL)    += intel_soc_dts_thermal.o
 obj-$(CONFIG_TI_SOC_THERMAL)   += ti-soc-thermal/
 obj-$(CONFIG_ACPI_INT3403_THERMAL)     += int3403_thermal.o
+obj-$(CONFIG_ST_THERMAL)       += st/
index 84a75f89bf74d07786989cd643b1cf327224c2ff..1ab0018271c5c622c0b7556fb92a3a1d9ad38e5b 100644 (file)
@@ -305,7 +305,7 @@ static int cpufreq_apply_cooling(struct cpufreq_cooling_device *cpufreq_device,
  * @event: value showing cpufreq event for which this function invoked.
  * @data: callback-specific data
  *
- * Callback to highjack the notification on cpufreq policy transition.
+ * Callback to hijack the notification on cpufreq policy transition.
  * Every time there is a change in policy, we will intercept and
  * update the cpufreq policy with thermal constraints.
  *
index e93f0253f6ed2070c934570cbe5c62f400bdfcc1..17554eeb3953e2cf3c8a6190675be1a50e5387a7 100644 (file)
 struct int3403_sensor {
        struct thermal_zone_device *tzone;
        unsigned long *thresholds;
+       unsigned long   crit_temp;
+       int             crit_trip_id;
+       unsigned long   psv_temp;
+       int             psv_trip_id;
 };
 
 static int sys_get_curr_temp(struct thermal_zone_device *tzone,
@@ -79,12 +83,18 @@ static int sys_get_trip_temp(struct thermal_zone_device *tzone,
        struct acpi_device *device = tzone->devdata;
        struct int3403_sensor *obj = acpi_driver_data(device);
 
-       /*
-        * get_trip_temp is a mandatory callback but
-        * PATx method doesn't return any value, so return
-        * cached value, which was last set from user space.
-        */
-       *temp = obj->thresholds[trip];
+       if (trip == obj->crit_trip_id)
+               *temp = obj->crit_temp;
+       else if (trip == obj->psv_trip_id)
+               *temp = obj->psv_temp;
+       else {
+               /*
+                * get_trip_temp is a mandatory callback but
+                * PATx method doesn't return any value, so return
+                * cached value, which was last set from user space.
+                */
+               *temp = obj->thresholds[trip];
+       }
 
        return 0;
 }
@@ -92,8 +102,14 @@ static int sys_get_trip_temp(struct thermal_zone_device *tzone,
 static int sys_get_trip_type(struct thermal_zone_device *thermal,
                int trip, enum thermal_trip_type *type)
 {
+       struct acpi_device *device = thermal->devdata;
+       struct int3403_sensor *obj = acpi_driver_data(device);
+
        /* Mandatory callback, may not mean much here */
-       *type = THERMAL_TRIP_PASSIVE;
+       if (trip == obj->crit_trip_id)
+               *type = THERMAL_TRIP_CRITICAL;
+       else
+               *type = THERMAL_TRIP_PASSIVE;
 
        return 0;
 }
@@ -155,6 +171,34 @@ static void acpi_thermal_notify(struct acpi_device *device, u32 event)
        }
 }
 
+static int sys_get_trip_crt(struct acpi_device *device, unsigned long *temp)
+{
+       unsigned long long crt;
+       acpi_status status;
+
+       status = acpi_evaluate_integer(device->handle, "_CRT", NULL, &crt);
+       if (ACPI_FAILURE(status))
+               return -EIO;
+
+       *temp = DECI_KELVIN_TO_MILLI_CELSIUS(crt, KELVIN_OFFSET);
+
+       return 0;
+}
+
+static int sys_get_trip_psv(struct acpi_device *device, unsigned long *temp)
+{
+       unsigned long long psv;
+       acpi_status status;
+
+       status = acpi_evaluate_integer(device->handle, "_PSV", NULL, &psv);
+       if (ACPI_FAILURE(status))
+               return -EIO;
+
+       *temp = DECI_KELVIN_TO_MILLI_CELSIUS(psv, KELVIN_OFFSET);
+
+       return 0;
+}
+
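A quick worked example, assuming the driver's existing definitions (not visible in this hunk) of DECI_KELVIN_TO_MILLI_CELSIUS(t, off) as (((t) - (off)) * 100) and KELVIN_OFFSET as 2732:

	/* hypothetical firmware value: _CRT == 3732 deci-Kelvin */
	*temp = DECI_KELVIN_TO_MILLI_CELSIUS(3732, KELVIN_OFFSET);
		/* = (3732 - 2732) * 100 = 100000 m°C, a 100 °C critical trip */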
 static int acpi_int3403_add(struct acpi_device *device)
 {
        int result = 0;
@@ -194,6 +238,15 @@ static int acpi_int3403_add(struct acpi_device *device)
                        return -ENOMEM;
                trip_mask = BIT(trip_cnt) - 1;
        }
+
+       obj->psv_trip_id = -1;
+       if (!sys_get_trip_psv(device, &obj->psv_temp))
+               obj->psv_trip_id = trip_cnt++;
+
+       obj->crit_trip_id = -1;
+       if (!sys_get_trip_crt(device, &obj->crit_temp))
+               obj->crit_trip_id = trip_cnt++;
+
        obj->tzone = thermal_zone_device_register(acpi_device_bid(device),
                                trip_cnt, trip_mask, device, &tzone_ops,
                                NULL, 0, 0);
index d7ca9f49c9cb2201d41f55c4fe98f0fb04595173..acbff14da3a492b776c1e48c2c90461c82779f77 100644 (file)
@@ -504,6 +504,10 @@ static irqreturn_t exynos_tmu_irq(int irq, void *id)
 }
 
 static const struct of_device_id exynos_tmu_match[] = {
+       {
+               .compatible = "samsung,exynos3250-tmu",
+               .data = (void *)EXYNOS3250_TMU_DRV_DATA,
+       },
        {
                .compatible = "samsung,exynos4210-tmu",
                .data = (void *)EXYNOS4210_TMU_DRV_DATA,
@@ -677,7 +681,8 @@ static int exynos_tmu_probe(struct platform_device *pdev)
                goto err_clk_sec;
        }
 
-       if (pdata->type == SOC_ARCH_EXYNOS4210 ||
+       if (pdata->type == SOC_ARCH_EXYNOS3250 ||
+           pdata->type == SOC_ARCH_EXYNOS4210 ||
            pdata->type == SOC_ARCH_EXYNOS4412 ||
            pdata->type == SOC_ARCH_EXYNOS5250 ||
            pdata->type == SOC_ARCH_EXYNOS5260 ||
@@ -759,10 +764,10 @@ static int exynos_tmu_remove(struct platform_device *pdev)
 {
        struct exynos_tmu_data *data = platform_get_drvdata(pdev);
 
-       exynos_tmu_control(pdev, false);
-
        exynos_unregister_thermal(data->reg_conf);
 
+       exynos_tmu_control(pdev, false);
+
        clk_unprepare(data->clk);
        if (!IS_ERR(data->clk_sec))
                clk_unprepare(data->clk_sec);
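The reordering above is the substance of this hunk: unregistering the thermal zone before calling exynos_tmu_control(pdev, false) guarantees the thermal core can no longer invoke the sensor callbacks against a TMU that remove has already powered down.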
index edd08cf7672921bffd56f6986e125cb1684b0081..1b4a6444ea617574ae817b157f76d47329ddb2a7 100644 (file)
@@ -40,7 +40,8 @@ enum calibration_mode {
 };
 
 enum soc_type {
-       SOC_ARCH_EXYNOS4210 = 1,
+       SOC_ARCH_EXYNOS3250 = 1,
+       SOC_ARCH_EXYNOS4210,
        SOC_ARCH_EXYNOS4412,
        SOC_ARCH_EXYNOS5250,
        SOC_ARCH_EXYNOS5260,
index c1d81dcd781993858f2e6e17ab845c85c169fc42..aa8e0dee2055d73c74719a912ef0595a1739ab9a 100644 (file)
@@ -90,6 +90,95 @@ struct exynos_tmu_init_data const exynos4210_default_tmu_data = {
 };
 #endif
 
+#if defined(CONFIG_SOC_EXYNOS3250)
+static const struct exynos_tmu_registers exynos3250_tmu_registers = {
+       .triminfo_data = EXYNOS_TMU_REG_TRIMINFO,
+       .triminfo_25_shift = EXYNOS_TRIMINFO_25_SHIFT,
+       .triminfo_85_shift = EXYNOS_TRIMINFO_85_SHIFT,
+       .tmu_ctrl = EXYNOS_TMU_REG_CONTROL,
+       .test_mux_addr_shift = EXYNOS4412_MUX_ADDR_SHIFT,
+       .buf_vref_sel_shift = EXYNOS_TMU_REF_VOLTAGE_SHIFT,
+       .buf_vref_sel_mask = EXYNOS_TMU_REF_VOLTAGE_MASK,
+       .therm_trip_mode_shift = EXYNOS_TMU_TRIP_MODE_SHIFT,
+       .therm_trip_mode_mask = EXYNOS_TMU_TRIP_MODE_MASK,
+       .therm_trip_en_shift = EXYNOS_TMU_THERM_TRIP_EN_SHIFT,
+       .buf_slope_sel_shift = EXYNOS_TMU_BUF_SLOPE_SEL_SHIFT,
+       .buf_slope_sel_mask = EXYNOS_TMU_BUF_SLOPE_SEL_MASK,
+       .core_en_shift = EXYNOS_TMU_CORE_EN_SHIFT,
+       .tmu_status = EXYNOS_TMU_REG_STATUS,
+       .tmu_cur_temp = EXYNOS_TMU_REG_CURRENT_TEMP,
+       .threshold_th0 = EXYNOS_THD_TEMP_RISE,
+       .threshold_th1 = EXYNOS_THD_TEMP_FALL,
+       .tmu_inten = EXYNOS_TMU_REG_INTEN,
+       .inten_rise0_shift = EXYNOS_TMU_INTEN_RISE0_SHIFT,
+       .inten_rise1_shift = EXYNOS_TMU_INTEN_RISE1_SHIFT,
+       .inten_rise2_shift = EXYNOS_TMU_INTEN_RISE2_SHIFT,
+       .inten_fall0_shift = EXYNOS_TMU_INTEN_FALL0_SHIFT,
+       .tmu_intstat = EXYNOS_TMU_REG_INTSTAT,
+       .tmu_intclear = EXYNOS_TMU_REG_INTCLEAR,
+       .intclr_fall_shift = EXYNOS_TMU_CLEAR_FALL_INT_SHIFT,
+       .intclr_rise_shift = EXYNOS_TMU_RISE_INT_SHIFT,
+       .intclr_rise_mask = EXYNOS_TMU_RISE_INT_MASK,
+       .intclr_fall_mask = EXYNOS_TMU_FALL_INT_MASK,
+       .emul_con = EXYNOS_EMUL_CON,
+       .emul_temp_shift = EXYNOS_EMUL_DATA_SHIFT,
+       .emul_time_shift = EXYNOS_EMUL_TIME_SHIFT,
+       .emul_time_mask = EXYNOS_EMUL_TIME_MASK,
+};
+
+#define EXYNOS3250_TMU_DATA \
+       .threshold_falling = 10, \
+       .trigger_levels[0] = 70, \
+       .trigger_levels[1] = 95, \
+       .trigger_levels[2] = 110, \
+       .trigger_levels[3] = 120, \
+       .trigger_enable[0] = true, \
+       .trigger_enable[1] = true, \
+       .trigger_enable[2] = true, \
+       .trigger_enable[3] = false, \
+       .trigger_type[0] = THROTTLE_ACTIVE, \
+       .trigger_type[1] = THROTTLE_ACTIVE, \
+       .trigger_type[2] = SW_TRIP, \
+       .trigger_type[3] = HW_TRIP, \
+       .max_trigger_level = 4, \
+       .gain = 8, \
+       .reference_voltage = 16, \
+       .noise_cancel_mode = 4, \
+       .cal_type = TYPE_TWO_POINT_TRIMMING, \
+       .efuse_value = 55, \
+       .min_efuse_value = 40, \
+       .max_efuse_value = 100, \
+       .first_point_trim = 25, \
+       .second_point_trim = 85, \
+       .default_temp_offset = 50, \
+       .freq_tab[0] = { \
+               .freq_clip_max = 800 * 1000, \
+               .temp_level = 70, \
+       }, \
+       .freq_tab[1] = { \
+               .freq_clip_max = 400 * 1000, \
+               .temp_level = 95, \
+       }, \
+       .freq_tab_count = 2, \
+       .registers = &exynos3250_tmu_registers, \
+       .features = (TMU_SUPPORT_EMULATION | \
+                       TMU_SUPPORT_FALLING_TRIP | TMU_SUPPORT_READY_STATUS | \
+                       TMU_SUPPORT_EMUL_TIME)
+#endif
+
+#if defined(CONFIG_SOC_EXYNOS3250)
+struct exynos_tmu_init_data const exynos3250_default_tmu_data = {
+       .tmu_data = {
+               {
+                       EXYNOS3250_TMU_DATA,
+                       .type = SOC_ARCH_EXYNOS3250,
+                       .test_mux = EXYNOS4412_MUX_ADDR_VALUE,
+               },
+       },
+       .tmu_count = 1,
+};
+#endif
+
 #if defined(CONFIG_SOC_EXYNOS4412) || defined(CONFIG_SOC_EXYNOS5250)
 static const struct exynos_tmu_registers exynos4412_tmu_registers = {
        .triminfo_data = EXYNOS_TMU_REG_TRIMINFO,
index d268981b65e5f122ddc56baf0202e81d1334bc5c..f0979e598491cc80f3bd0c3782090b06b172e223 100644 (file)
 #define EXYNOS5440_TMU_TH_RISE4_SHIFT          24
 #define EXYNOS5440_EFUSE_SWAP_OFFSET           8
 
+#if defined(CONFIG_SOC_EXYNOS3250)
+extern struct exynos_tmu_init_data const exynos3250_default_tmu_data;
+#define EXYNOS3250_TMU_DRV_DATA (&exynos3250_default_tmu_data)
+#else
+#define EXYNOS3250_TMU_DRV_DATA (NULL)
+#endif
+
 #if defined(CONFIG_CPU_EXYNOS4210)
 extern struct exynos_tmu_init_data const exynos4210_default_tmu_data;
 #define EXYNOS4210_TMU_DRV_DATA (&exynos4210_default_tmu_data)
diff --git a/drivers/thermal/st/Kconfig b/drivers/thermal/st/Kconfig
new file mode 100644 (file)
index 0000000..490fdbe
--- /dev/null
@@ -0,0 +1,12 @@
+config ST_THERMAL
+       tristate "Thermal sensors on STMicroelectronics STi series of SoCs"
+       help
+         Support for thermal sensors on STMicroelectronics STi series of SoCs.
+
+config ST_THERMAL_SYSCFG
+       select ST_THERMAL
+       tristate "STi series syscfg register access based thermal sensors"
+
+config ST_THERMAL_MEMMAP
+       select ST_THERMAL
+       tristate "STi series memory mapped access based thermal sensors"
diff --git a/drivers/thermal/st/Makefile b/drivers/thermal/st/Makefile
new file mode 100644 (file)
index 0000000..b388789
--- /dev/null
@@ -0,0 +1,3 @@
+obj-$(CONFIG_ST_THERMAL)               := st_thermal.o
+obj-$(CONFIG_ST_THERMAL_SYSCFG)                += st_thermal_syscfg.o
+obj-$(CONFIG_ST_THERMAL_MEMMAP)                += st_thermal_memmap.o
diff --git a/drivers/thermal/st/st_thermal.c b/drivers/thermal/st/st_thermal.c
new file mode 100644 (file)
index 0000000..90163b3
--- /dev/null
@@ -0,0 +1,313 @@
+/*
+ * ST Thermal Sensor Driver core routines
+ * Author: Ajit Pal Singh <ajitpal.singh@st.com>
+ *
+ * Copyright (C) 2003-2014 STMicroelectronics (R&D) Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/clk.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+
+#include "st_thermal.h"
+
+/* The Thermal Framework expects millidegrees */
+#define mcelsius(temp)                 ((temp) * 1000)
+
+/*
+ * Allocate the regmap register fields that are common to
+ * the syscfg and memory-mapped sensors
+ */
+int st_thermal_alloc_regfields(struct st_thermal_sensor *sensor)
+{
+       struct device *dev = sensor->dev;
+       struct regmap *regmap = sensor->regmap;
+       const struct reg_field *reg_fields = sensor->cdata->reg_fields;
+
+       sensor->dcorrect = devm_regmap_field_alloc(dev, regmap,
+                                                  reg_fields[DCORRECT]);
+
+       sensor->overflow = devm_regmap_field_alloc(dev, regmap,
+                                                  reg_fields[OVERFLOW]);
+
+       sensor->temp_data = devm_regmap_field_alloc(dev, regmap,
+                                                   reg_fields[DATA]);
+
+       if (IS_ERR(sensor->dcorrect) ||
+           IS_ERR(sensor->overflow) ||
+           IS_ERR(sensor->temp_data)) {
+               dev_err(dev, "failed to allocate common regfields\n");
+               return -EINVAL;
+       }
+
+       return sensor->ops->alloc_regfields(sensor);
+}
+
+static int st_thermal_sensor_on(struct st_thermal_sensor *sensor)
+{
+       int ret;
+       struct device *dev = sensor->dev;
+
+       ret = clk_prepare_enable(sensor->clk);
+       if (ret) {
+               dev_err(dev, "failed to enable clk\n");
+               return ret;
+       }
+
+       ret = sensor->ops->power_ctrl(sensor, POWER_ON);
+       if (ret) {
+               dev_err(dev, "failed to power on sensor\n");
+               clk_disable_unprepare(sensor->clk);
+       }
+
+       return ret;
+}
+
+static int st_thermal_sensor_off(struct st_thermal_sensor *sensor)
+{
+       int ret;
+
+       ret = sensor->ops->power_ctrl(sensor, POWER_OFF);
+       if (ret)
+               return ret;
+
+       clk_disable_unprepare(sensor->clk);
+
+       return 0;
+}
+
+static int st_thermal_calibration(struct st_thermal_sensor *sensor)
+{
+       int ret;
+       unsigned int val;
+       struct device *dev = sensor->dev;
+
+       /* Check if sensor calibration data is already written */
+       ret = regmap_field_read(sensor->dcorrect, &val);
+       if (ret) {
+               dev_err(dev, "failed to read calibration data\n");
+               return ret;
+       }
+
+       if (!val) {
+               /*
+                * The bootloader has not written a calibration value,
+                * so fall back to the default calibration data.
+                */
+               ret = regmap_field_write(sensor->dcorrect,
+                                        sensor->cdata->calibration_val);
+               if (ret)
+                       dev_err(dev, "failed to set calibration data\n");
+       }
+
+       return ret;
+}
+
+/* Callback to read the current temperature from the hardware */
+static int st_thermal_get_temp(struct thermal_zone_device *th,
+               unsigned long *temperature)
+{
+       struct st_thermal_sensor *sensor = th->devdata;
+       struct device *dev = sensor->dev;
+       unsigned int temp;
+       unsigned int overflow;
+       int ret;
+
+       ret = regmap_field_read(sensor->overflow, &overflow);
+       if (ret)
+               return ret;
+       if (overflow)
+               return -EIO;
+
+       ret = regmap_field_read(sensor->temp_data, &temp);
+       if (ret)
+               return ret;
+
+       temp += sensor->cdata->temp_adjust_val;
+       temp = mcelsius(temp);
+
+       dev_dbg(dev, "temperature: %d\n", temp);
+
+       *temperature = temp;
+
+       return 0;
+}
+
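To make the conversion concrete, a hedged walk-through using the st_416mpe_cdata values defined later in this series (temp_adjust_val = -95): a raw DATA field reading of, say, 150 would be reported as

	temp = 150 + (-95);	/* 55 after the calibration adjust */
	temp = mcelsius(temp);	/* 55000, millidegrees for the framework */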
+static int st_thermal_get_trip_type(struct thermal_zone_device *th,
+                               int trip, enum thermal_trip_type *type)
+{
+       struct st_thermal_sensor *sensor = th->devdata;
+       struct device *dev = sensor->dev;
+
+       switch (trip) {
+       case 0:
+               *type = THERMAL_TRIP_CRITICAL;
+               break;
+       default:
+               dev_err(dev, "invalid trip point\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static int st_thermal_get_trip_temp(struct thermal_zone_device *th,
+                                   int trip, unsigned long *temp)
+{
+       struct st_thermal_sensor *sensor = th->devdata;
+       struct device *dev = sensor->dev;
+
+       switch (trip) {
+       case 0:
+               *temp = mcelsius(sensor->cdata->crit_temp);
+               break;
+       default:
+               dev_err(dev, "Invalid trip point\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static struct thermal_zone_device_ops st_tz_ops = {
+       .get_temp       = st_thermal_get_temp,
+       .get_trip_type  = st_thermal_get_trip_type,
+       .get_trip_temp  = st_thermal_get_trip_temp,
+};
+
+int st_thermal_register(struct platform_device *pdev,
+                       const struct of_device_id *st_thermal_of_match)
+{
+       struct st_thermal_sensor *sensor;
+       struct device *dev = &pdev->dev;
+       struct device_node *np = dev->of_node;
+       const struct of_device_id *match;
+
+       int polling_delay;
+       int ret;
+
+       if (!np) {
+               dev_err(dev, "device tree node not found\n");
+               return -EINVAL;
+       }
+
+       sensor = devm_kzalloc(dev, sizeof(*sensor), GFP_KERNEL);
+       if (!sensor)
+               return -ENOMEM;
+
+       sensor->dev = dev;
+
+       match = of_match_device(st_thermal_of_match, dev);
+       if (!(match && match->data))
+               return -EINVAL;
+
+       sensor->cdata = match->data;
+       if (!sensor->cdata->ops)
+               return -EINVAL;
+
+       sensor->ops = sensor->cdata->ops;
+
+       ret = sensor->ops->regmap_init(sensor);
+       if (ret)
+               return ret;
+
+       ret = st_thermal_alloc_regfields(sensor);
+       if (ret)
+               return ret;
+
+       sensor->clk = devm_clk_get(dev, "thermal");
+       if (IS_ERR(sensor->clk)) {
+               dev_err(dev, "failed to fetch clock\n");
+               return PTR_ERR(sensor->clk);
+       }
+
+       if (sensor->ops->register_enable_irq) {
+               ret = sensor->ops->register_enable_irq(sensor);
+               if (ret)
+                       return ret;
+       }
+
+       ret = st_thermal_sensor_on(sensor);
+       if (ret)
+               return ret;
+
+       ret = st_thermal_calibration(sensor);
+       if (ret)
+               goto sensor_off;
+
+       polling_delay = sensor->ops->register_enable_irq ? 0 : 1000;
+
+       sensor->thermal_dev =
+               thermal_zone_device_register(dev_name(dev), 1, 0, sensor,
+                                            &st_tz_ops, NULL, 0, polling_delay);
+       if (IS_ERR(sensor->thermal_dev)) {
+               dev_err(dev, "failed to register thermal zone device\n");
+               ret = PTR_ERR(sensor->thermal_dev);
+               goto sensor_off;
+       }
+
+       platform_set_drvdata(pdev, sensor);
+
+       return 0;
+
+sensor_off:
+       st_thermal_sensor_off(sensor);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(st_thermal_register);
+
+int st_thermal_unregister(struct platform_device *pdev)
+{
+       struct st_thermal_sensor *sensor = platform_get_drvdata(pdev);
+
+       st_thermal_sensor_off(sensor);
+       thermal_zone_device_unregister(sensor->thermal_dev);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(st_thermal_unregister);
+
+static int st_thermal_suspend(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct st_thermal_sensor *sensor = platform_get_drvdata(pdev);
+
+       return st_thermal_sensor_off(sensor);
+}
+
+static int st_thermal_resume(struct device *dev)
+{
+       int ret;
+       struct platform_device *pdev = to_platform_device(dev);
+       struct st_thermal_sensor *sensor = platform_get_drvdata(pdev);
+
+       ret = st_thermal_sensor_on(sensor);
+       if (ret)
+               return ret;
+
+       ret = st_thermal_calibration(sensor);
+       if (ret)
+               return ret;
+
+       if (sensor->ops->enable_irq) {
+               ret = sensor->ops->enable_irq(sensor);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+SIMPLE_DEV_PM_OPS(st_thermal_pm_ops, st_thermal_suspend, st_thermal_resume);
+EXPORT_SYMBOL_GPL(st_thermal_pm_ops);
+
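When CONFIG_PM_SLEEP is enabled, SIMPLE_DEV_PM_OPS() above expands to a const struct dev_pm_ops wiring the two callbacks into every system-sleep hook, roughly:

	const struct dev_pm_ops st_thermal_pm_ops = {
		.suspend  = st_thermal_suspend,
		.resume   = st_thermal_resume,
		.freeze   = st_thermal_suspend,
		.thaw     = st_thermal_resume,
		.poweroff = st_thermal_suspend,
		.restore  = st_thermal_resume,
	};

(and to an empty ops structure when CONFIG_PM_SLEEP is off).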
+MODULE_AUTHOR("STMicroelectronics (R&D) Limited <ajitpal.singh@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics STi SoC Thermal Sensor Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/st/st_thermal.h b/drivers/thermal/st/st_thermal.h
new file mode 100644 (file)
index 0000000..fecafbe
--- /dev/null
@@ -0,0 +1,104 @@
+/*
+ * ST Thermal Sensor Driver for STi series of SoCs
+ * Author: Ajit Pal Singh <ajitpal.singh@st.com>
+ *
+ * Copyright (C) 2003-2014 STMicroelectronics (R&D) Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __STI_THERMAL_SYSCFG_H
+#define __STI_THERMAL_SYSCFG_H
+
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/thermal.h>
+
+enum st_thermal_regfield_ids {
+       INT_THRESH_HI = 0, /* The first two regfield IDs are mutually exclusive */
+       TEMP_PWR = 0,
+       DCORRECT,
+       OVERFLOW,
+       DATA,
+       INT_ENABLE,
+
+       MAX_REGFIELDS
+};
+
+/* Thermal sensor power states */
+enum st_thermal_power_state {
+       POWER_OFF = 0,
+       POWER_ON
+};
+
+struct st_thermal_sensor;
+
+/**
+ * Description of private thermal sensor ops.
+ *
+ * @power_ctrl:                Function for powering a sensor on/off. The clock to
+ *                     the sensor is also controlled from this function.
+ * @alloc_regfields:   Allocate regmap register fields specific to a sensor.
+ * @regmap_init:       Memory map the thermal register space and initialise a
+ *                     regmap instance, or look up an existing syscon regmap.
+ * @register_enable_irq: Register and enable an interrupt handler for a sensor.
+ * @enable_irq:                Enable the interrupt for a sensor.
+ */
+struct st_thermal_sensor_ops {
+       int (*power_ctrl)(struct st_thermal_sensor *, enum st_thermal_power_state);
+       int (*alloc_regfields)(struct st_thermal_sensor *);
+       int (*regmap_init)(struct st_thermal_sensor *);
+       int (*register_enable_irq)(struct st_thermal_sensor *);
+       int (*enable_irq)(struct st_thermal_sensor *);
+};
+
+/**
+ * Description of thermal driver compatible data.
+ *
+ * @reg_fields:                Pointer to the regfields array for a sensor.
+ * @sys_compat:                Pointer to the syscon node compatible string.
+ * @ops:               Pointer to private thermal ops for a sensor.
+ * @calibration_val:   Default calibration value to be written to the DCORRECT
+ *                     register field for a sensor.
+ * @temp_adjust_val:   Signed value added to the raw data read from the
+ *                     sensor; use a negative value to subtract.
+ * @crit_temp:         The temperature beyond which the SoC should be shut
+ *                     down to prevent damage.
+ */
+struct st_thermal_compat_data {
+       char *sys_compat;
+       const struct reg_field *reg_fields;
+       const struct st_thermal_sensor_ops *ops;
+       unsigned int calibration_val;
+       int temp_adjust_val;
+       int crit_temp;
+};
+
+struct st_thermal_sensor {
+       struct device *dev;
+       struct thermal_zone_device *thermal_dev;
+       const struct st_thermal_sensor_ops *ops;
+       const struct st_thermal_compat_data *cdata;
+       struct clk *clk;
+       struct regmap *regmap;
+       struct regmap_field *pwr;
+       struct regmap_field *dcorrect;
+       struct regmap_field *overflow;
+       struct regmap_field *temp_data;
+       struct regmap_field *int_thresh_hi;
+       struct regmap_field *int_enable;
+       int irq;
+       void __iomem *mmio_base;
+};
+
+extern int st_thermal_register(struct platform_device *pdev,
+                              const struct of_device_id *st_thermal_of_match);
+extern int st_thermal_unregister(struct platform_device *pdev);
+extern const struct dev_pm_ops st_thermal_pm_ops;
+
+#endif /* __STI_THERMAL_SYSCFG_H */
diff --git a/drivers/thermal/st/st_thermal_memmap.c b/drivers/thermal/st/st_thermal_memmap.c
new file mode 100644 (file)
index 0000000..39896ce
--- /dev/null
@@ -0,0 +1,209 @@
+/*
+ * ST Thermal Sensor Driver for memory mapped sensors.
+ * Author: Ajit Pal Singh <ajitpal.singh@st.com>
+ *
+ * Copyright (C) 2003-2014 STMicroelectronics (R&D) Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/of.h>
+#include <linux/module.h>
+
+#include "st_thermal.h"
+
+#define STIH416_MPE_CONF                       0x0
+#define STIH416_MPE_STATUS                     0x4
+#define STIH416_MPE_INT_THRESH                 0x8
+#define STIH416_MPE_INT_EN                     0xC
+
+/* Power control bits for the memory mapped thermal sensor */
+#define THERMAL_PDN                            BIT(4)
+#define THERMAL_SRSTN                          BIT(10)
+
+static const struct reg_field st_mmap_thermal_regfields[MAX_REGFIELDS] = {
+       /*
+        * According to the STIH416 MPE temp sensor data sheet, the
+        * PDN (Power Down Bit) and SRSTN (Soft Reset Bit) need to be
+        * written simultaneously to power the temperature sensor on
+        * and off. regmap_update_bits() is used to update the register.
+        */
+       [INT_THRESH_HI] = REG_FIELD(STIH416_MPE_INT_THRESH,     0,  7),
+       [DCORRECT]      = REG_FIELD(STIH416_MPE_CONF,           5,  9),
+       [OVERFLOW]      = REG_FIELD(STIH416_MPE_STATUS,         9,  9),
+       [DATA]          = REG_FIELD(STIH416_MPE_STATUS,         11, 18),
+       [INT_ENABLE]    = REG_FIELD(STIH416_MPE_INT_EN,         0,  0),
+};
+
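Each REG_FIELD(reg, lsb, msb) entry describes a bit range that the regmap core masks and shifts automatically. For the DATA field above, regmap_field_read() is roughly equivalent to this sketch:

	unsigned int status, temp;

	regmap_read(sensor->regmap, STIH416_MPE_STATUS, &status);
	temp = (status >> 11) & GENMASK(7, 0);	/* bits 18:11, an 8-bit field */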
+static irqreturn_t st_mmap_thermal_trip_handler(int irq, void *sdata)
+{
+       struct st_thermal_sensor *sensor = sdata;
+
+       thermal_zone_device_update(sensor->thermal_dev);
+
+       return IRQ_HANDLED;
+}
+
+/* Private ops for the memory-mapped thermal sensors */
+static int st_mmap_power_ctrl(struct st_thermal_sensor *sensor,
+                             enum st_thermal_power_state power_state)
+{
+       const unsigned int mask = (THERMAL_PDN | THERMAL_SRSTN);
+       const unsigned int val = power_state ? mask : 0;
+
+       return regmap_update_bits(sensor->regmap, STIH416_MPE_CONF, mask, val);
+}
+
+static int st_mmap_alloc_regfields(struct st_thermal_sensor *sensor)
+{
+       struct device *dev = sensor->dev;
+       struct regmap *regmap = sensor->regmap;
+       const struct reg_field *reg_fields = sensor->cdata->reg_fields;
+
+       sensor->int_thresh_hi = devm_regmap_field_alloc(dev, regmap,
+                                               reg_fields[INT_THRESH_HI]);
+       sensor->int_enable = devm_regmap_field_alloc(dev, regmap,
+                                               reg_fields[INT_ENABLE]);
+
+       if (IS_ERR(sensor->int_thresh_hi) || IS_ERR(sensor->int_enable)) {
+               dev_err(dev, "failed to alloc mmap regfields\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static int st_mmap_enable_irq(struct st_thermal_sensor *sensor)
+{
+       int ret;
+
+       /* Set upper critical threshold */
+       ret = regmap_field_write(sensor->int_thresh_hi,
+                                sensor->cdata->crit_temp -
+                                sensor->cdata->temp_adjust_val);
+       if (ret)
+               return ret;
+
+       return regmap_field_write(sensor->int_enable, 1);
+}
+
+static int st_mmap_register_enable_irq(struct st_thermal_sensor *sensor)
+{
+       struct device *dev = sensor->dev;
+       struct platform_device *pdev = to_platform_device(dev);
+       int ret;
+
+       sensor->irq = platform_get_irq(pdev, 0);
+       if (sensor->irq < 0) {
+               dev_err(dev, "failed to register IRQ\n");
+               return sensor->irq;
+       }
+
+       ret = devm_request_threaded_irq(dev, sensor->irq,
+                                       NULL, st_mmap_thermal_trip_handler,
+                                       IRQF_TRIGGER_RISING | IRQF_ONESHOT,
+                                       dev->driver->name, sensor);
+       if (ret) {
+               dev_err(dev, "failed to register IRQ %d\n", sensor->irq);
+               return ret;
+       }
+
+       return st_mmap_enable_irq(sensor);
+}
+
+static const struct regmap_config st_416mpe_regmap_config = {
+       .reg_bits = 32,
+       .val_bits = 32,
+       .reg_stride = 4,
+};
+
+static int st_mmap_regmap_init(struct st_thermal_sensor *sensor)
+{
+       struct device *dev = sensor->dev;
+       struct platform_device *pdev = to_platform_device(dev);
+       struct resource *res;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       if (!res) {
+               dev_err(dev, "no memory resources defined\n");
+               return -ENODEV;
+       }
+
+       sensor->mmio_base = devm_ioremap_resource(dev, res);
+       if (IS_ERR(sensor->mmio_base)) {
+               dev_err(dev, "failed to remap IO\n");
+               return PTR_ERR(sensor->mmio_base);
+       }
+
+       sensor->regmap = devm_regmap_init_mmio(dev, sensor->mmio_base,
+                               &st_416mpe_regmap_config);
+       if (IS_ERR(sensor->regmap)) {
+               dev_err(dev, "failed to initialise regmap\n");
+               return PTR_ERR(sensor->regmap);
+       }
+
+       return 0;
+}
+
+static const struct st_thermal_sensor_ops st_mmap_sensor_ops = {
+       .power_ctrl             = st_mmap_power_ctrl,
+       .alloc_regfields        = st_mmap_alloc_regfields,
+       .regmap_init            = st_mmap_regmap_init,
+       .register_enable_irq    = st_mmap_register_enable_irq,
+       .enable_irq             = st_mmap_enable_irq,
+};
+
+/* Compatible device data for the stih416 mpe thermal sensor */
+const struct st_thermal_compat_data st_416mpe_cdata = {
+       .reg_fields             = st_mmap_thermal_regfields,
+       .ops                    = &st_mmap_sensor_ops,
+       .calibration_val        = 14,
+       .temp_adjust_val        = -95,
+       .crit_temp              = 120,
+};
+
+/* Compatible device data for the stih407 thermal sensor */
+const struct st_thermal_compat_data st_407_cdata = {
+       .reg_fields             = st_mmap_thermal_regfields,
+       .ops                    = &st_mmap_sensor_ops,
+       .calibration_val        = 16,
+       .temp_adjust_val        = -95,
+       .crit_temp              = 120,
+};
+
+static struct of_device_id st_mmap_thermal_of_match[] = {
+       { .compatible = "st,stih416-mpe-thermal", .data = &st_416mpe_cdata },
+       { .compatible = "st,stih407-thermal",     .data = &st_407_cdata },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, st_mmap_thermal_of_match);
+
+int st_mmap_probe(struct platform_device *pdev)
+{
+       return st_thermal_register(pdev, st_mmap_thermal_of_match);
+}
+
+int st_mmap_remove(struct platform_device *pdev)
+{
+       return st_thermal_unregister(pdev);
+}
+
+static struct platform_driver st_mmap_thermal_driver = {
+       .driver = {
+               .name   = "st_thermal_mmap",
+               .owner  = THIS_MODULE,
+               .pm     = &st_thermal_pm_ops,
+               .of_match_table = st_mmap_thermal_of_match,
+       },
+       .probe          = st_mmap_probe,
+       .remove         = st_mmap_remove,
+};
+
+module_platform_driver(st_mmap_thermal_driver);
+
+MODULE_AUTHOR("STMicroelectronics (R&D) Limited <ajitpal.singh@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics STi SoC Thermal Sensor Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/st/st_thermal_syscfg.c b/drivers/thermal/st/st_thermal_syscfg.c
new file mode 100644 (file)
index 0000000..888b58e
--- /dev/null
@@ -0,0 +1,179 @@
+/*
+ * ST Thermal Sensor Driver for syscfg based sensors.
+ * Author: Ajit Pal Singh <ajitpal.singh@st.com>
+ *
+ * Copyright (C) 2003-2014 STMicroelectronics (R&D) Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/of.h>
+#include <linux/module.h>
+#include <linux/mfd/syscon.h>
+
+#include "st_thermal.h"
+
+/* STiH415 */
+#define STIH415_SYSCFG_FRONT(num)              ((num - 100) * 4)
+#define STIH415_SAS_THSENS_CONF                        STIH415_SYSCFG_FRONT(178)
+#define STIH415_SAS_THSENS_STATUS              STIH415_SYSCFG_FRONT(198)
+#define STIH415_SYSCFG_MPE(num)                        ((num - 600) * 4)
+#define STIH415_MPE_THSENS_CONF                        STIH415_SYSCFG_MPE(607)
+#define STIH415_MPE_THSENS_STATUS              STIH415_SYSCFG_MPE(667)
+
+/* STiH416 */
+#define STIH416_SYSCFG_FRONT(num)              ((num - 1000) * 4)
+#define STIH416_SAS_THSENS_CONF                        STIH416_SYSCFG_FRONT(1552)
+#define STIH416_SAS_THSENS_STATUS1             STIH416_SYSCFG_FRONT(1554)
+#define STIH416_SAS_THSENS_STATUS2             STIH416_SYSCFG_FRONT(1594)
+
+/* STiD127 */
+#define STID127_SYSCFG_CPU(num)                        ((num - 700) * 4)
+#define STID127_THSENS_CONF                    STID127_SYSCFG_CPU(743)
+#define STID127_THSENS_STATUS                  STID127_SYSCFG_CPU(767)
+
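These helper macros translate the register numbers used in the SoC documentation into byte offsets within each syscon region; for example, STIH415_SYSCFG_FRONT(178) = (178 - 100) * 4 = 312 (0x138), and STID127_SYSCFG_CPU(743) = (743 - 700) * 4 = 172 (0xac).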
+static const struct reg_field st_415sas_regfields[MAX_REGFIELDS] = {
+       [TEMP_PWR] = REG_FIELD(STIH415_SAS_THSENS_CONF,   9,  9),
+       [DCORRECT] = REG_FIELD(STIH415_SAS_THSENS_CONF,   4,  8),
+       [OVERFLOW] = REG_FIELD(STIH415_SAS_THSENS_STATUS, 8,  8),
+       [DATA]     = REG_FIELD(STIH415_SAS_THSENS_STATUS, 10, 16),
+};
+
+static const struct reg_field st_415mpe_regfields[MAX_REGFIELDS] = {
+       [TEMP_PWR] = REG_FIELD(STIH415_MPE_THSENS_CONF,   8,  8),
+       [DCORRECT] = REG_FIELD(STIH415_MPE_THSENS_CONF,   3,  7),
+       [OVERFLOW] = REG_FIELD(STIH415_MPE_THSENS_STATUS, 9,  9),
+       [DATA]     = REG_FIELD(STIH415_MPE_THSENS_STATUS, 11, 18),
+};
+
+static const struct reg_field st_416sas_regfields[MAX_REGFIELDS] = {
+       [TEMP_PWR] = REG_FIELD(STIH416_SAS_THSENS_CONF,    9,  9),
+       [DCORRECT] = REG_FIELD(STIH416_SAS_THSENS_CONF,    4,  8),
+       [OVERFLOW] = REG_FIELD(STIH416_SAS_THSENS_STATUS1, 8,  8),
+       [DATA]     = REG_FIELD(STIH416_SAS_THSENS_STATUS2, 10, 16),
+};
+
+static const struct reg_field st_127_regfields[MAX_REGFIELDS] = {
+       [TEMP_PWR] = REG_FIELD(STID127_THSENS_CONF,   7,  7),
+       [DCORRECT] = REG_FIELD(STID127_THSENS_CONF,   2,  6),
+       [OVERFLOW] = REG_FIELD(STID127_THSENS_STATUS, 9,  9),
+       [DATA]     = REG_FIELD(STID127_THSENS_STATUS, 11, 18),
+};
+
+/* Private ops for the syscfg register based thermal sensors */
+static int st_syscfg_power_ctrl(struct st_thermal_sensor *sensor,
+                               enum st_thermal_power_state power_state)
+{
+       return regmap_field_write(sensor->pwr, power_state);
+}
+
+static int st_syscfg_alloc_regfields(struct st_thermal_sensor *sensor)
+{
+       struct device *dev = sensor->dev;
+
+       sensor->pwr = devm_regmap_field_alloc(dev, sensor->regmap,
+                                       sensor->cdata->reg_fields[TEMP_PWR]);
+
+       if (IS_ERR(sensor->pwr)) {
+               dev_err(dev, "failed to alloc syscfg regfields\n");
+               return PTR_ERR(sensor->pwr);
+       }
+
+       return 0;
+}
+
+static int st_syscfg_regmap_init(struct st_thermal_sensor *sensor)
+{
+       sensor->regmap =
+               syscon_regmap_lookup_by_compatible(sensor->cdata->sys_compat);
+       if (IS_ERR(sensor->regmap)) {
+               dev_err(sensor->dev, "failed to find syscfg regmap\n");
+               return PTR_ERR(sensor->regmap);
+       }
+
+       return 0;
+}
+
+static const struct st_thermal_sensor_ops st_syscfg_sensor_ops = {
+       .power_ctrl             = st_syscfg_power_ctrl,
+       .alloc_regfields        = st_syscfg_alloc_regfields,
+       .regmap_init            = st_syscfg_regmap_init,
+};
+
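Note that the syscfg variant deliberately provides no register_enable_irq/enable_irq ops; st_thermal_register() (in st_thermal.c above) therefore registers the zone with a 1000 ms polling_delay rather than running interrupt-driven, per its polling_delay = sensor->ops->register_enable_irq ? 0 : 1000 fallback.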
+/* Compatible device data for stih415 sas thermal sensor */
+const struct st_thermal_compat_data st_415sas_cdata = {
+       .sys_compat             = "st,stih415-front-syscfg",
+       .reg_fields             = st_415sas_regfields,
+       .ops                    = &st_syscfg_sensor_ops,
+       .calibration_val        = 16,
+       .temp_adjust_val        = 20,
+       .crit_temp              = 120,
+};
+
+/* Compatible device data for stih415 mpe thermal sensor */
+const struct st_thermal_compat_data st_415mpe_cdata = {
+       .sys_compat             = "st,stih415-system-syscfg",
+       .reg_fields             = st_415mpe_regfields,
+       .ops                    = &st_syscfg_sensor_ops,
+       .calibration_val        = 16,
+       .temp_adjust_val        = -103,
+       .crit_temp              = 120,
+};
+
+/* Compatible device data for stih416 sas thermal sensor */
+const struct st_thermal_compat_data st_416sas_cdata = {
+       .sys_compat             = "st,stih416-front-syscfg",
+       .reg_fields             = st_416sas_regfields,
+       .ops                    = &st_syscfg_sensor_ops,
+       .calibration_val        = 16,
+       .temp_adjust_val        = 20,
+       .crit_temp              = 120,
+};
+
+/* Compatible device data for stid127 thermal sensor */
+const struct st_thermal_compat_data st_127_cdata = {
+       .sys_compat             = "st,stid127-cpu-syscfg",
+       .reg_fields             = st_127_regfields,
+       .ops                    = &st_syscfg_sensor_ops,
+       .calibration_val        = 8,
+       .temp_adjust_val        = -103,
+       .crit_temp              = 120,
+};
+
+static struct of_device_id st_syscfg_thermal_of_match[] = {
+       { .compatible = "st,stih415-sas-thermal", .data = &st_415sas_cdata },
+       { .compatible = "st,stih415-mpe-thermal", .data = &st_415mpe_cdata },
+       { .compatible = "st,stih416-sas-thermal", .data = &st_416sas_cdata },
+       { .compatible = "st,stid127-thermal",     .data = &st_127_cdata },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, st_syscfg_thermal_of_match);
+
+int st_syscfg_probe(struct platform_device *pdev)
+{
+       return st_thermal_register(pdev, st_syscfg_thermal_of_match);
+}
+
+int st_syscfg_remove(struct platform_device *pdev)
+{
+       return st_thermal_unregister(pdev);
+}
+
+static struct platform_driver st_syscfg_thermal_driver = {
+       .driver = {
+               .name   = "st_syscfg_thermal",
+               .owner  = THIS_MODULE,
+               .pm     = &st_thermal_pm_ops,
+               .of_match_table = st_syscfg_thermal_of_match,
+       },
+       .probe          = st_syscfg_probe,
+       .remove         = st_syscfg_remove,
+};
+module_platform_driver(st_syscfg_thermal_driver);
+
+MODULE_AUTHOR("STMicroelectronics (R&D) Limited <ajitpal.singh@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics STi SoC Thermal Sensor Driver");
+MODULE_LICENSE("GPL v2");
index 0419b69e270fc7158613cbd78efbddb2e0d45d01..4f485e88f60c51213b96b8ebeac110b5364364da 100644 (file)
@@ -108,55 +108,23 @@ static void disable_tx_interrupt(struct ehv_bc_data *bc)
  *
  * The byte channel to be used for the console is specified via a "stdout"
  * property in the /chosen node.
- *
- * For compatible with legacy device trees, we also look for a "stdout" alias.
  */
 static int find_console_handle(void)
 {
-       struct device_node *np, *np2;
+       struct device_node *np = of_stdout;
        const char *sprop = NULL;
        const uint32_t *iprop;
 
-       np = of_find_node_by_path("/chosen");
-       if (np)
-               sprop = of_get_property(np, "stdout-path", NULL);
-
-       if (!np || !sprop) {
-               of_node_put(np);
-               np = of_find_node_by_name(NULL, "aliases");
-               if (np)
-                       sprop = of_get_property(np, "stdout", NULL);
-       }
-
-       if (!sprop) {
-               of_node_put(np);
-               return 0;
-       }
-
        /* We don't care what the aliased node is actually called.  We only
         * care if it's compatible with "epapr,hv-byte-channel", because that
-        * indicates that it's a byte channel node.  We use a temporary
-        * variable, 'np2', because we can't release 'np' until we're done with
-        * 'sprop'.
+        * indicates that it's a byte channel node.
         */
-       np2 = of_find_node_by_path(sprop);
-       of_node_put(np);
-       np = np2;
-       if (!np) {
-               pr_warning("ehv-bc: stdout node '%s' does not exist\n", sprop);
-               return 0;
-       }
-
-       /* Is it a byte channel? */
-       if (!of_device_is_compatible(np, "epapr,hv-byte-channel")) {
-               of_node_put(np);
+       if (!np || !of_device_is_compatible(np, "epapr,hv-byte-channel"))
                return 0;
-       }
 
        stdout_irq = irq_of_parse_and_map(np, 0);
        if (stdout_irq == NO_IRQ) {
-               pr_err("ehv-bc: no 'interrupts' property in %s node\n", sprop);
-               of_node_put(np);
+               pr_err("ehv-bc: no 'interrupts' property in %s node\n", np->full_name);
                return 0;
        }
 
@@ -167,12 +135,9 @@ static int find_console_handle(void)
        if (!iprop) {
                pr_err("ehv-bc: no 'hv-handle' property in %s node\n",
                       np->name);
-               of_node_put(np);
                return 0;
        }
        stdout_bc = be32_to_cpu(*iprop);
-
-       of_node_put(np);
        return 1;
 }
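This and the following hunks rely on the of_stdout pointer that the OF core now resolves once at early boot. Simplified, and hedged as our reading of the 3.17-era drivers/of/base.c, the core side looks like:

	struct device_node *of_stdout;	/* set while scanning /chosen */

	name = of_get_property(of_chosen, "stdout-path", NULL);
	if (!name)
		name = of_get_property(of_chosen, "linux,stdout-path", NULL);
	if (name)
		of_stdout = of_find_node_by_path(name);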
 
index a585079b4b38822a03770b6d3c6ab8fd0bbea036..a2cc5f834c633836065b215a07f597a4d458fba7 100644 (file)
@@ -342,22 +342,13 @@ static void udbg_init_opal_common(void)
 
 void __init hvc_opal_init_early(void)
 {
-       struct device_node *stdout_node = NULL;
+       struct device_node *stdout_node = of_node_get(of_stdout);
        const __be32 *termno;
-       const char *name = NULL;
        const struct hv_ops *ops;
        u32 index;
 
-       /* find the boot console from /chosen/stdout */
-       if (of_chosen)
-               name = of_get_property(of_chosen, "linux,stdout-path", NULL);
-       if (name) {
-               stdout_node = of_find_node_by_path(name);
-               if (!stdout_node) {
-                       pr_err("hvc_opal: Failed to locate default console!\n");
-                       return;
-               }
-       } else {
+       /* If the console wasn't in /chosen, try /ibm,opal */
+       if (!stdout_node) {
                struct device_node *opal, *np;
 
                /* Current OPAL takeover doesn't provide the stdout
index b594abfbf21e76d58acc1df534a30f603670fb5a..5618b5fc7500e149dfd79ac2515309749f905834 100644 (file)
@@ -404,42 +404,35 @@ module_exit(hvc_vio_exit);
 
 void __init hvc_vio_init_early(void)
 {
-       struct device_node *stdout_node;
        const __be32 *termno;
        const char *name;
        const struct hv_ops *ops;
 
        /* find the boot console from /chosen/stdout */
-       if (!of_chosen)
+       if (!of_stdout)
                return;
-       name = of_get_property(of_chosen, "linux,stdout-path", NULL);
-       if (name == NULL)
-               return;
-       stdout_node = of_find_node_by_path(name);
-       if (!stdout_node)
-               return;
-       name = of_get_property(stdout_node, "name", NULL);
+       name = of_get_property(of_stdout, "name", NULL);
        if (!name) {
                printk(KERN_WARNING "stdout node missing 'name' property!\n");
-               goto out;
+               return;
        }
 
        /* Check if it's a virtual terminal */
        if (strncmp(name, "vty", 3) != 0)
-               goto out;
-       termno = of_get_property(stdout_node, "reg", NULL);
+               return;
+       termno = of_get_property(of_stdout, "reg", NULL);
        if (termno == NULL)
-               goto out;
+               return;
        hvterm_priv0.termno = of_read_number(termno, 1);
        spin_lock_init(&hvterm_priv0.buf_lock);
        hvterm_privs[0] = &hvterm_priv0;
 
        /* Check the protocol */
-       if (of_device_is_compatible(stdout_node, "hvterm1")) {
+       if (of_device_is_compatible(of_stdout, "hvterm1")) {
                hvterm_priv0.proto = HV_PROTOCOL_RAW;
                ops = &hvterm_raw_ops;
        }
-       else if (of_device_is_compatible(stdout_node, "hvterm-protocol")) {
+       else if (of_device_is_compatible(of_stdout, "hvterm-protocol")) {
                hvterm_priv0.proto = HV_PROTOCOL_HVSI;
                ops = &hvterm_hvsi_ops;
                hvsilib_init(&hvterm_priv0.hvsi, hvc_get_chars, hvc_put_chars,
@@ -447,7 +440,7 @@ void __init hvc_vio_init_early(void)
                /* HVSI, perform the handshake now */
                hvsilib_establish(&hvterm_priv0.hvsi);
        } else
-               goto out;
+               return;
        udbg_putc = udbg_hvc_putc;
        udbg_getc = udbg_hvc_getc;
        udbg_getc_poll = udbg_hvc_getc_poll;
@@ -456,14 +449,12 @@ void __init hvc_vio_init_early(void)
         * backend for HVSI, only do udbg
         */
        if (hvterm_priv0.proto == HV_PROTOCOL_HVSI)
-               goto out;
+               return;
 #endif
        /* Check whether the user has requested a different console. */
        if (!strstr(cmd_line, "console="))
                add_preferred_console("hvc", 0, NULL);
        hvc_instantiate(0, 0, ops);
-out:
-       of_node_put(stdout_node);
 }
 
 /* call this from early_init() for a working debug console on
index f7ad5b903055852fad68d2a69a2f826e5f795172..abbfedb84901731793aeddef5a98910c72520b42 100644 (file)
@@ -1653,8 +1653,7 @@ static int __init pmz_probe(void)
        /*
         * Find all escc chips in the system
         */
-       node_p = of_find_node_by_name(NULL, "escc");
-       while (node_p) {
+       for_each_node_by_name(node_p, "escc") {
                /*
                 * First get channel A/B node pointers
                 * 
@@ -1672,7 +1671,7 @@ static int __init pmz_probe(void)
                        of_node_put(node_b);
                        printk(KERN_ERR "pmac_zilog: missing node %c for escc %s\n",
                                (!node_a) ? 'a' : 'b', node_p->full_name);
-                       goto next;
+                       continue;
                }
 
                /*
@@ -1699,11 +1698,9 @@ static int __init pmz_probe(void)
                        of_node_put(node_b);
                        memset(&pmz_ports[count], 0, sizeof(struct uart_pmac_port));
                        memset(&pmz_ports[count+1], 0, sizeof(struct uart_pmac_port));
-                       goto next;
+                       continue;
                }
                count += 2;
-next:
-               node_p = of_find_node_by_name(node_p, "escc");
        }
        pmz_ports_count = count;
 
index 8bb19da01639bb51cc2bc88f40cada8f686cc5a2..29a7be47389a9339fe2c049f7d3ee2cfd6160397 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/console.h>
+#include <linux/of.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/device.h>
@@ -2611,6 +2612,8 @@ int uart_add_one_port(struct uart_driver *drv, struct uart_port *uport)
                spin_lock_init(&uport->lock);
                lockdep_set_class(&uport->lock, &port_lock_key);
        }
+       if (uport->cons && uport->dev)
+               of_console_check(uport->dev->of_node, uport->cons->name, uport->line);
 
        uart_configure_port(drv, state, uport);
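of_console_check() is the new OF helper that nominates a UART as preferred console when its DT node is the chosen stdout. Its shape, simplified from our reading of drivers/of/base.c in this release, is roughly:

	bool of_console_check(struct device_node *dn, char *name, int index)
	{
		if (!dn || dn != of_stdout || console_set_on_cmdline)
			return false;
		return !add_preferred_console(name, index, NULL);
	}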
 
index af7b204b921555ec980d64b09b26543d8932c890..d8c57636b9ce8750d128169e85b1c9ce3094219a 100644 (file)
@@ -8,11 +8,17 @@ config VFIO_IOMMU_SPAPR_TCE
        depends on VFIO && SPAPR_TCE_IOMMU
        default n
 
+config VFIO_SPAPR_EEH
+       tristate
+       depends on EEH && VFIO_IOMMU_SPAPR_TCE
+       default n
+
 menuconfig VFIO
        tristate "VFIO Non-Privileged userspace driver framework"
        depends on IOMMU_API
        select VFIO_IOMMU_TYPE1 if X86
        select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
+       select VFIO_SPAPR_EEH if (PPC_POWERNV || PPC_PSERIES)
        select ANON_INODES
        help
          VFIO provides a framework for secure userspace device drivers.
index 50e30bc75e855f6564bd10a35d88a4040b6ee3e7..0b035b12600a7ba6aeaa8d16367fdfbe4f78c9da 100644 (file)
@@ -1,5 +1,5 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
 obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
-obj-$(CONFIG_EEH) += vfio_spapr_eeh.o
+obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
 obj-$(CONFIG_VFIO_PCI) += pci/
index e2ee80f36e3ea2d4a845e1e7b22ce47b070dc9c8..f7825332a3251b2001645b581eec09524562fbec 100644 (file)
@@ -37,6 +37,10 @@ module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(nointxmask,
                  "Disable support for PCI 2.3 style INTx masking.  If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
 
+static DEFINE_MUTEX(driver_lock);
+
+static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
+
 static int vfio_pci_enable(struct vfio_pci_device *vdev)
 {
        struct pci_dev *pdev = vdev->pdev;
@@ -44,6 +48,9 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
        u16 cmd;
        u8 msix_pos;
 
+       /* Don't allow our initial saved state to include busmaster */
+       pci_clear_master(pdev);
+
        ret = pci_enable_device(pdev);
        if (ret)
                return ret;
@@ -99,7 +106,8 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
        struct pci_dev *pdev = vdev->pdev;
        int bar;
 
-       pci_disable_device(pdev);
+       /* Stop the device from further DMA */
+       pci_clear_master(pdev);
 
        vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
                                VFIO_IRQ_SET_ACTION_TRIGGER,
@@ -117,6 +125,8 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
                vdev->barmap[bar] = NULL;
        }
 
+       vdev->needs_reset = true;
+
        /*
         * If we have saved state, restore it.  If we can reset the device,
         * even better.  Resetting with current state seems better than
@@ -128,7 +138,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
                        __func__, dev_name(&pdev->dev));
 
                if (!vdev->reset_works)
-                       return;
+                       goto out;
 
                pci_save_state(pdev);
        }
@@ -148,46 +158,55 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
                if (ret)
                        pr_warn("%s: Failed to reset device %s (%d)\n",
                                __func__, dev_name(&pdev->dev), ret);
+               else
+                       vdev->needs_reset = false;
        }
 
        pci_restore_state(pdev);
+out:
+       pci_disable_device(pdev);
+
+       vfio_pci_try_bus_reset(vdev);
 }
 
 static void vfio_pci_release(void *device_data)
 {
        struct vfio_pci_device *vdev = device_data;
 
-       if (atomic_dec_and_test(&vdev->refcnt)) {
+       mutex_lock(&driver_lock);
+
+       if (!(--vdev->refcnt)) {
                vfio_spapr_pci_eeh_release(vdev->pdev);
                vfio_pci_disable(vdev);
        }
 
+       mutex_unlock(&driver_lock);
+
        module_put(THIS_MODULE);
 }
 
 static int vfio_pci_open(void *device_data)
 {
        struct vfio_pci_device *vdev = device_data;
-       int ret;
+       int ret = 0;
 
        if (!try_module_get(THIS_MODULE))
                return -ENODEV;
 
-       if (atomic_inc_return(&vdev->refcnt) == 1) {
+       mutex_lock(&driver_lock);
+
+       if (!vdev->refcnt) {
                ret = vfio_pci_enable(vdev);
                if (ret)
                        goto error;
 
-               ret = vfio_spapr_pci_eeh_open(vdev->pdev);
-               if (ret) {
-                       vfio_pci_disable(vdev);
-                       goto error;
-               }
+               vfio_spapr_pci_eeh_open(vdev->pdev);
        }
-
-       return 0;
+       vdev->refcnt++;
 error:
-       module_put(THIS_MODULE);
+       mutex_unlock(&driver_lock);
+       if (ret)
+               module_put(THIS_MODULE);
        return ret;
 }
 
@@ -843,7 +862,6 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        vdev->irq_type = VFIO_PCI_NUM_IRQS;
        mutex_init(&vdev->igate);
        spin_lock_init(&vdev->irqlock);
-       atomic_set(&vdev->refcnt, 0);
 
        ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
        if (ret) {
@@ -858,12 +876,15 @@ static void vfio_pci_remove(struct pci_dev *pdev)
 {
        struct vfio_pci_device *vdev;
 
+       mutex_lock(&driver_lock);
+
        vdev = vfio_del_group_dev(&pdev->dev);
-       if (!vdev)
-               return;
+       if (vdev) {
+               iommu_group_put(pdev->dev.iommu_group);
+               kfree(vdev);
+       }
 
-       iommu_group_put(pdev->dev.iommu_group);
-       kfree(vdev);
+       mutex_unlock(&driver_lock);
 }
 
 static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
@@ -906,6 +927,110 @@ static struct pci_driver vfio_pci_driver = {
        .err_handler    = &vfio_err_handlers,
 };
 
+/*
+ * Test whether a reset is necessary and possible.  We mark devices as
+ * needs_reset when they are released but don't have a function-local reset
+ * available.  If any such device exists among the affected devices, we want
+ * to do a bus/slot reset.  We also need all of the affected devices to be
+ * unused, so we abort if any device has a non-zero refcnt.  driver_lock
+ * prevents a device from being opened during the scan or unbound from vfio-pci.
+ */
+static int vfio_pci_test_bus_reset(struct pci_dev *pdev, void *data)
+{
+       bool *needs_reset = data;
+       struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver);
+       int ret = -EBUSY;
+
+       if (pci_drv == &vfio_pci_driver) {
+               struct vfio_device *device;
+               struct vfio_pci_device *vdev;
+
+               device = vfio_device_get_from_dev(&pdev->dev);
+               if (!device)
+                       return ret;
+
+               vdev = vfio_device_data(device);
+               if (vdev) {
+                       if (vdev->needs_reset)
+                               *needs_reset = true;
+
+                       if (!vdev->refcnt)
+                               ret = 0;
+               }
+
+               vfio_device_put(device);
+       }
+
+       /*
+        * TODO: vfio-core considers groups to be viable even if some devices
+        * are attached to known drivers, like pci-stub or pcieport.  We can't
+        * prevent devices from being unbound from those drivers the way we
+        * can here, so it would be racy to test for them.  We also can't
+        * use device_lock() to prevent changes as that would interfere with
+        * PCI-core taking device_lock during bus reset.  For now, we require
+        * devices to be bound to vfio-pci to get a bus/slot reset on release.
+        */
+
+       return ret;
+}
+
+/* Clear needs_reset on all affected devices after successful bus/slot reset */
+static int vfio_pci_clear_needs_reset(struct pci_dev *pdev, void *data)
+{
+       struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver);
+
+       if (pci_drv == &vfio_pci_driver) {
+               struct vfio_device *device;
+               struct vfio_pci_device *vdev;
+
+               device = vfio_device_get_from_dev(&pdev->dev);
+               if (!device)
+                       return 0;
+
+               vdev = vfio_device_data(device);
+               if (vdev)
+                       vdev->needs_reset = false;
+
+               vfio_device_put(device);
+       }
+
+       return 0;
+}
+
+/*
+ * Attempt a bus/slot reset if any device affected by a reset of this device
+ * has needs_reset set and all of the affected devices are unused (!refcnt).
+ * Callers of this function are required to hold driver_lock such that
+ * devices cannot be unbound from vfio-pci or opened by a user while we
+ * test for and perform a bus/slot reset.
+ */
+static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
+{
+       bool needs_reset = false, slot = false;
+       int ret;
+
+       if (!pci_probe_reset_slot(vdev->pdev->slot))
+               slot = true;
+       else if (pci_probe_reset_bus(vdev->pdev->bus))
+               return;
+
+       if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
+                                         vfio_pci_test_bus_reset,
+                                         &needs_reset, slot) || !needs_reset)
+               return;
+
+       if (slot)
+               ret = pci_try_reset_slot(vdev->pdev->slot);
+       else
+               ret = pci_try_reset_bus(vdev->pdev->bus);
+
+       if (ret)
+               return;
+
+       vfio_pci_for_each_slot_or_bus(vdev->pdev,
+                                     vfio_pci_clear_needs_reset, NULL, slot);
+}
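
Taken together, the locking hunks above replace the atomic refcnt with a plain int serialized by the new driver_lock, which is what lets vfio_pci_test_bus_reset() read refcnt and needs_reset without per-device locking. A minimal sketch of the invariant from the release side, illustrative only (it condenses vfio_pci_release() above, minus the module refcounting):

/*
 * Illustrative sketch, not part of the patch: every path that changes
 * vdev->refcnt or walks the slot/bus holds driver_lock, so the scan in
 * vfio_pci_test_bus_reset() sees a stable view of every device.
 */
static void example_last_close(struct vfio_pci_device *vdev)
{
	mutex_lock(&driver_lock);
	if (!--vdev->refcnt)
		vfio_pci_disable(vdev);	/* ends in vfio_pci_try_bus_reset() */
	mutex_unlock(&driver_lock);
}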
+
 static void __exit vfio_pci_cleanup(void)
 {
        pci_unregister_driver(&vfio_pci_driver);
index 9c6d5d0f3b02db8cfb1acdd418b4a5abb44e12e5..671c17a6e6d029dfdffe5d7243150ed757e44cf4 100644 (file)
@@ -54,8 +54,9 @@ struct vfio_pci_device {
        bool                    extended_caps;
        bool                    bardirty;
        bool                    has_vga;
+       bool                    needs_reset;
        struct pci_saved_state  *pci_saved_state;
-       atomic_t                refcnt;
+       int                     refcnt;
        struct eventfd_ctx      *err_trigger;
 };
 
index f834b4ce1431b133c1fa5aa6c7f6187c420a0ad8..86dfceb9201f2503e2050dc71c8906fcd8abf281 100644 (file)
@@ -9,20 +9,27 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/module.h>
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
 #include <asm/eeh.h>
 
+#define DRIVER_VERSION "0.1"
+#define DRIVER_AUTHOR  "Gavin Shan, IBM Corporation"
+#define DRIVER_DESC    "VFIO IOMMU SPAPR EEH"
+
 /* We might build address mapping here for "fast" path later */
-int vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
+void vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
 {
-       return eeh_dev_open(pdev);
+       eeh_dev_open(pdev);
 }
+EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_open);
 
 void vfio_spapr_pci_eeh_release(struct pci_dev *pdev)
 {
        eeh_dev_release(pdev);
 }
+EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_release);
 
 long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
                                unsigned int cmd, unsigned long arg)
@@ -85,3 +92,9 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
 
        return ret;
 }
+EXPORT_SYMBOL(vfio_spapr_iommu_eeh_ioctl);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
index 4030cbfbc9af8572673eb06c997603ee529443e4..90c88529892bfc9723a9aec9ed6540daeb372f76 100644 (file)
@@ -11,7 +11,7 @@ obj-y :=      open.o read_write.o file_table.o super.o \
                attr.o bad_inode.o file.o filesystems.o namespace.o \
                seq_file.o xattr.o libfs.o fs-writeback.o \
                pnode.o splice.o sync.o utimes.o \
-               stack.o fs_struct.o statfs.o
+               stack.o fs_struct.o statfs.o fs_pin.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=       buffer.o block_dev.o direct-io.o mpage.o
index 7c93953030fbe5eda13d76b6a8c53d6f2a31902d..afd2b4408adf53d78c716043c847ba48cf5c3bef 100644 (file)
@@ -218,8 +218,9 @@ static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
        return -EIO;
 }
 
-static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry,
-               struct inode *new_dir, struct dentry *new_dentry)
+static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry,
+                            struct inode *new_dir, struct dentry *new_dentry,
+                            unsigned int flags)
 {
        return -EIO;
 }
@@ -279,7 +280,7 @@ static const struct inode_operations bad_inode_ops =
        .mkdir          = bad_inode_mkdir,
        .rmdir          = bad_inode_rmdir,
        .mknod          = bad_inode_mknod,
-       .rename         = bad_inode_rename,
+       .rename2        = bad_inode_rename2,
        .readlink       = bad_inode_readlink,
        /* follow_link must be no-op, otherwise unmounting this inode
           won't work */
index 3668048e16f8fa835c3ffff9ed85998ee201ec58..3183742d6f0d74d131c16e29dde59205f3e7f8a4 100644 (file)
@@ -8476,6 +8476,16 @@ out_notrans:
        return ret;
 }
 
+static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+                        struct inode *new_dir, struct dentry *new_dentry,
+                        unsigned int flags)
+{
+       if (flags & ~RENAME_NOREPLACE)
+               return -EINVAL;
+
+       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
 static void btrfs_run_delalloc_work(struct btrfs_work *work)
 {
        struct btrfs_delalloc_work *delalloc_work;
@@ -9019,7 +9029,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
        .link           = btrfs_link,
        .mkdir          = btrfs_mkdir,
        .rmdir          = btrfs_rmdir,
-       .rename         = btrfs_rename,
+       .rename2        = btrfs_rename2,
        .symlink        = btrfs_symlink,
        .setattr        = btrfs_setattr,
        .mknod          = btrfs_mknod,
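
The btrfs hunk above is one instance of the conversion pattern this series applies across filesystems: keep the existing rename implementation and expose it through a thin ->rename2 wrapper that rejects any flag the filesystem cannot honor. RENAME_NOREPLACE is safe to accept unchanged because the VFS fails the syscall with -EEXIST on an existing target before calling into the filesystem. A generic sketch of the pattern (the examplefs names are hypothetical):

static int examplefs_rename2(struct inode *old_dir, struct dentry *old_dentry,
			     struct inode *new_dir, struct dentry *new_dentry,
			     unsigned int flags)
{
	/* Refuse flags this filesystem does not implement. */
	if (flags & ~RENAME_NOREPLACE)
		return -EINVAL;

	/* NOREPLACE needs no extra work here: the VFS has already
	 * verified that the target does not exist. */
	return examplefs_rename(old_dir, old_dentry, new_dir, new_dentry);
}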
index 8e16bca69c56de7fa54c1680698b60d7319a03a9..67b48b9a03e044eb2d83ab28447c8ead9db5192b 100644 (file)
@@ -851,7 +851,6 @@ static struct dentry *get_default_root(struct super_block *sb,
        struct btrfs_path *path;
        struct btrfs_key location;
        struct inode *inode;
-       struct dentry *dentry;
        u64 dir_id;
        int new = 0;
 
@@ -922,13 +921,7 @@ setup_root:
                return dget(sb->s_root);
        }
 
-       dentry = d_obtain_alias(inode);
-       if (!IS_ERR(dentry)) {
-               spin_lock(&dentry->d_lock);
-               dentry->d_flags &= ~DCACHE_DISCONNECTED;
-               spin_unlock(&dentry->d_lock);
-       }
-       return dentry;
+       return d_obtain_root(inode);
 }
 
 static int btrfs_fill_super(struct super_block *sb,
index 469f2e8657e8426bfb3ad94fb12dde25e06ccb18..cebf2ebefb55dfeb79fe7b23bddd1165e4d4f344 100644 (file)
@@ -172,14 +172,24 @@ out:
 int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir)
 {
        struct posix_acl *default_acl, *acl;
+       umode_t new_mode = inode->i_mode;
        int error;
 
-       error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+       error = posix_acl_create(dir, &new_mode, &default_acl, &acl);
        if (error)
                return error;
 
-       if (!default_acl && !acl)
+       if (!default_acl && !acl) {
                cache_no_acl(inode);
+               if (new_mode != inode->i_mode) {
+                       struct iattr newattrs = {
+                               .ia_mode = new_mode,
+                               .ia_valid = ATTR_MODE,
+                       };
+                       error = ceph_setattr(dentry, &newattrs);
+               }
+               return error;
+       }
 
        if (default_acl) {
                error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
index 1fde164b74b54a258cdff8e7c89f52191713c986..6d1cd45dca890f9ab51a7087dbba207035792f20 100644 (file)
@@ -3277,7 +3277,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
                        rel->ino = cpu_to_le64(ceph_ino(inode));
                        rel->cap_id = cpu_to_le64(cap->cap_id);
                        rel->seq = cpu_to_le32(cap->seq);
-                       rel->issue_seq = cpu_to_le32(cap->issue_seq),
+                       rel->issue_seq = cpu_to_le32(cap->issue_seq);
                        rel->mseq = cpu_to_le32(cap->mseq);
                        rel->caps = cpu_to_le32(cap->implemented);
                        rel->wanted = cpu_to_le32(cap->mds_wanted);
index 302085100c28af1a2ed67269e955b3d0839539be..2eb02f80a0ab05de9071bc5d076f6c8fe952cbf5 100644 (file)
@@ -423,6 +423,9 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
        dout("sync_read on file %p %llu~%u %s\n", file, off,
             (unsigned)len,
             (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+
+       if (!len)
+               return 0;
        /*
         * flush any page cache pages in this range.  this
         * will make concurrent normal and sync io slow,
@@ -470,8 +473,11 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
                        size_t left = ret;
 
                        while (left) {
-                               int copy = min_t(size_t, PAGE_SIZE, left);
-                               l = copy_page_to_iter(pages[k++], 0, copy, i);
+                               size_t page_off = off & ~PAGE_MASK;
+                               size_t copy = min_t(size_t,
+                                                   PAGE_SIZE - page_off, left);
+                               l = copy_page_to_iter(pages[k++], page_off,
+                                                     copy, i);
                                off += l;
                                left -= l;
                                if (l < copy)
@@ -531,7 +537,7 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
  * objects, rollback on failure, etc.)
  */
 static ssize_t
-ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
+ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
@@ -547,7 +553,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
        int check_caps = 0;
        int ret;
        struct timespec mtime = CURRENT_TIME;
-       loff_t pos = iocb->ki_pos;
        size_t count = iov_iter_count(from);
 
        if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
@@ -646,7 +651,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
  * correct atomic write, we should e.g. take write locks on all
  * objects, rollback on failure, etc.)
  */
-static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t
+ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
@@ -663,7 +669,6 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from)
        int check_caps = 0;
        int ret;
        struct timespec mtime = CURRENT_TIME;
-       loff_t pos = iocb->ki_pos;
        size_t count = iov_iter_count(from);
 
        if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
@@ -918,9 +923,9 @@ retry_snap:
                /* we might need to revert back to that point */
                data = *from;
                if (file->f_flags & O_DIRECT)
-                       written = ceph_sync_direct_write(iocb, &data);
+                       written = ceph_sync_direct_write(iocb, &data, pos);
                else
-                       written = ceph_sync_write(iocb, &data);
+                       written = ceph_sync_write(iocb, &data, pos);
                if (written == -EOLDSNAPC) {
                        dout("aio_write %p %llx.%llx %llu~%u"
                                "got EOLDSNAPC, retrying\n",
@@ -1177,6 +1182,9 @@ static long ceph_fallocate(struct file *file, int mode,
        loff_t endoff = 0;
        loff_t size;
 
+       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+               return -EOPNOTSUPP;
+
        if (!S_ISREG(inode->i_mode))
                return -EOPNOTSUPP;
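
The ceph_sync_read() hunk above fixes the copy loop for reads that do not start on a page boundary: each iteration must begin at off & ~PAGE_MASK within the current page and copy at most PAGE_SIZE - page_off bytes, otherwise data from the wrong offset is returned. A standalone illustration of the same arithmetic (plain C; assumes 4K pages and that every copy completes in full, which the kernel loop does not assume):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long off = 5000, left = 10000;	/* unaligned read */

	while (left) {
		unsigned long page_off = off & ~PAGE_MASK;
		unsigned long copy = PAGE_SIZE - page_off;

		if (copy > left)
			copy = left;
		printf("copy %5lu bytes from page %lu at offset %lu\n",
		       copy, off / PAGE_SIZE, page_off);
		off += copy;
		left -= copy;
	}
	return 0;
}

The first iteration copies only 3192 bytes (4096 - 904), which is exactly the case the old min_t(size_t, PAGE_SIZE, left) computation got wrong.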
 
index 92a2548278fca0c52609d120db33cf070db621c6..bad07c09f91ead03419fe8e25fb1df18b4ac439e 100644 (file)
@@ -1904,6 +1904,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
             req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
 
        if (req->r_got_unsafe) {
+               void *p;
                /*
                 * Replay.  Do not regenerate message (and rebuild
                 * paths, etc.); just use the original message.
@@ -1924,8 +1925,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 
                /* remove cap/dentry releases from message */
                rhead->num_releases = 0;
-               msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
-               msg->front.iov_len = req->r_request_release_offset;
+
+               /* time stamp */
+               p = msg->front.iov_base + req->r_request_release_offset;
+               ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
+
+               msg->front.iov_len = p - msg->front.iov_base;
+               msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
                return 0;
        }
 
@@ -2061,11 +2067,12 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
 static void kick_requests(struct ceph_mds_client *mdsc, int mds)
 {
        struct ceph_mds_request *req;
-       struct rb_node *p;
+       struct rb_node *p = rb_first(&mdsc->request_tree);
 
        dout("kick_requests mds%d\n", mds);
-       for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
+       while (p) {
                req = rb_entry(p, struct ceph_mds_request, r_node);
+               p = rb_next(p);
                if (req->r_got_unsafe)
                        continue;
                if (req->r_session &&
@@ -2248,6 +2255,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
         */
        if (result == -ESTALE) {
                dout("got ESTALE on request %llu", req->r_tid);
+               req->r_resend_mds = -1;
                if (req->r_direct_mode != USE_AUTH_MDS) {
                        dout("not using auth, setting for that now");
                        req->r_direct_mode = USE_AUTH_MDS;
index 06150fd745ac65d72c456b1fe57f537336f1064a..f6e12377335c1a2aa030e86f1d903daa365792b1 100644 (file)
@@ -755,7 +755,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
                                goto out;
                        }
                } else {
-                       root = d_obtain_alias(inode);
+                       root = d_obtain_root(inode);
                }
                ceph_init_dentry(root);
                dout("open_root_inode success, root dentry is %p\n", root);
index c9c2b887381ec2504ee48c673cf113e396f3b04e..12f58d22e01798844d17d9b526cf19d1ba112a36 100644 (file)
@@ -592,12 +592,12 @@ start:
                xattr_version = ci->i_xattrs.version;
                spin_unlock(&ci->i_ceph_lock);
 
-               xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
+               xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
                                 GFP_NOFS);
                err = -ENOMEM;
                if (!xattrs)
                        goto bad_lock;
-               memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *));
+
                for (i = 0; i < numattr; i++) {
                        xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
                                            GFP_NOFS);
index 88839806742007ca8dfaf8d77754695d4924fef9..ac4f260155c875b80d9d6611af1139438e62fc91 100644 (file)
@@ -848,7 +848,7 @@ const struct inode_operations cifs_dir_inode_ops = {
        .link = cifs_hardlink,
        .mkdir = cifs_mkdir,
        .rmdir = cifs_rmdir,
-       .rename = cifs_rename,
+       .rename2 = cifs_rename2,
        .permission = cifs_permission,
 /*     revalidate:cifs_revalidate,   */
        .setattr = cifs_setattr,
index 560480263336ce23a4ffa456b76e02f59f71c0fd..b0fafa499505218fb0cecac43cfb9db4e9a77f26 100644 (file)
@@ -68,8 +68,8 @@ extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
 extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
 extern int cifs_mkdir(struct inode *, struct dentry *, umode_t);
 extern int cifs_rmdir(struct inode *, struct dentry *);
-extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
-                      struct dentry *);
+extern int cifs_rename2(struct inode *, struct dentry *, struct inode *,
+                       struct dentry *, unsigned int);
 extern int cifs_revalidate_file_attr(struct file *filp);
 extern int cifs_revalidate_dentry_attr(struct dentry *);
 extern int cifs_revalidate_file(struct file *filp);
index 41de3935caa0cff86e58e8c4a0db829e4ca88a7e..426d6c6ad8bfacfa188f2eb2dfb5385620559f44 100644 (file)
@@ -1627,8 +1627,9 @@ do_rename_exit:
 }
 
 int
-cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
-           struct inode *target_dir, struct dentry *target_dentry)
+cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
+            struct inode *target_dir, struct dentry *target_dentry,
+            unsigned int flags)
 {
        char *from_name = NULL;
        char *to_name = NULL;
@@ -1640,6 +1641,9 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
        unsigned int xid;
        int rc, tmprc;
 
+       if (flags & ~RENAME_NOREPLACE)
+               return -EINVAL;
+
        cifs_sb = CIFS_SB(source_dir->i_sb);
        tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink))
@@ -1667,6 +1671,12 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
        rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
                            to_name);
 
+       /*
+        * No-replace is the natural behavior for CIFS, so skip unlink hacks.
+        */
+       if (flags & RENAME_NOREPLACE)
+               goto cifs_rename_exit;
+
        if (rc == -EEXIST && tcon->unix_ext) {
                /*
                 * Are src and dst hardlinks of same inode? We can only tell
index 06f65857a855725247c1190d243c0e19cccd8570..d30ce699ae4b6ea4ac3ad1b8ec955fa271bdcb16 100644 (file)
@@ -731,8 +731,6 @@ EXPORT_SYMBOL(dget_parent);
 /**
  * d_find_alias - grab a hashed alias of inode
  * @inode: inode in question
- * @want_discon:  flag, used by d_splice_alias, to request
- *          that only a DISCONNECTED alias be returned.
  *
  * If inode has a hashed alias, or is a directory and has any alias,
  * acquire the reference to alias and return it. Otherwise return NULL.
@@ -741,10 +739,9 @@ EXPORT_SYMBOL(dget_parent);
  * of a filesystem.
  *
  * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
- * any other hashed alias over that one unless @want_discon is set,
- * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
+ * any other hashed alias over that one.
  */
-static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
+static struct dentry *__d_find_alias(struct inode *inode)
 {
        struct dentry *alias, *discon_alias;
 
@@ -756,7 +753,7 @@ again:
                        if (IS_ROOT(alias) &&
                            (alias->d_flags & DCACHE_DISCONNECTED)) {
                                discon_alias = alias;
-                       } else if (!want_discon) {
+                       } else {
                                __dget_dlock(alias);
                                spin_unlock(&alias->d_lock);
                                return alias;
@@ -768,12 +765,9 @@ again:
                alias = discon_alias;
                spin_lock(&alias->d_lock);
                if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
-                       if (IS_ROOT(alias) &&
-                           (alias->d_flags & DCACHE_DISCONNECTED)) {
-                               __dget_dlock(alias);
-                               spin_unlock(&alias->d_lock);
-                               return alias;
-                       }
+                       __dget_dlock(alias);
+                       spin_unlock(&alias->d_lock);
+                       return alias;
                }
                spin_unlock(&alias->d_lock);
                goto again;
@@ -787,7 +781,7 @@ struct dentry *d_find_alias(struct inode *inode)
 
        if (!hlist_empty(&inode->i_dentry)) {
                spin_lock(&inode->i_lock);
-               de = __d_find_alias(inode, 0);
+               de = __d_find_alias(inode);
                spin_unlock(&inode->i_lock);
        }
        return de;
@@ -1781,25 +1775,7 @@ struct dentry *d_find_any_alias(struct inode *inode)
 }
 EXPORT_SYMBOL(d_find_any_alias);
 
-/**
- * d_obtain_alias - find or allocate a dentry for a given inode
- * @inode: inode to allocate the dentry for
- *
- * Obtain a dentry for an inode resulting from NFS filehandle conversion or
- * similar open by handle operations.  The returned dentry may be anonymous,
- * or may have a full name (if the inode was already in the cache).
- *
- * When called on a directory inode, we must ensure that the inode only ever
- * has one dentry.  If a dentry is found, that is returned instead of
- * allocating a new one.
- *
- * On successful return, the reference to the inode has been transferred
- * to the dentry.  In case of an error the reference on the inode is released.
- * To make it easier to use in export operations a %NULL or IS_ERR inode may
- * be passed in and will be the error will be propagate to the return value,
- * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
- */
-struct dentry *d_obtain_alias(struct inode *inode)
+static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
 {
        static const struct qstr anonstring = QSTR_INIT("/", 1);
        struct dentry *tmp;
@@ -1830,7 +1806,10 @@ struct dentry *d_obtain_alias(struct inode *inode)
        }
 
        /* attach a disconnected dentry */
-       add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED;
+       add_flags = d_flags_for_inode(inode);
+
+       if (disconnected)
+               add_flags |= DCACHE_DISCONNECTED;
 
        spin_lock(&tmp->d_lock);
        tmp->d_inode = inode;
@@ -1851,59 +1830,51 @@ struct dentry *d_obtain_alias(struct inode *inode)
        iput(inode);
        return res;
 }
-EXPORT_SYMBOL(d_obtain_alias);
 
 /**
- * d_splice_alias - splice a disconnected dentry into the tree if one exists
- * @inode:  the inode which may have a disconnected dentry
- * @dentry: a negative dentry which we want to point to the inode.
- *
- * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and
- * DCACHE_DISCONNECTED), then d_move that in place of the given dentry
- * and return it, else simply d_add the inode to the dentry and return NULL.
+ * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
+ * @inode: inode to allocate the dentry for
  *
- * This is needed in the lookup routine of any filesystem that is exportable
- * (via knfsd) so that we can build dcache paths to directories effectively.
+ * Obtain a dentry for an inode resulting from NFS filehandle conversion or
+ * similar open by handle operations.  The returned dentry may be anonymous,
+ * or may have a full name (if the inode was already in the cache).
  *
- * If a dentry was found and moved, then it is returned.  Otherwise NULL
- * is returned.  This matches the expected return value of ->lookup.
+ * When called on a directory inode, we must ensure that the inode only ever
+ * has one dentry.  If a dentry is found, that is returned instead of
+ * allocating a new one.
  *
- * Cluster filesystems may call this function with a negative, hashed dentry.
- * In that case, we know that the inode will be a regular file, and also this
- * will only occur during atomic_open. So we need to check for the dentry
- * being already hashed only in the final case.
+ * On successful return, the reference to the inode has been transferred
+ * to the dentry.  In case of an error the reference on the inode is released.
+ * To make it easier to use in export operations a %NULL or IS_ERR inode may
+ * be passed in and the error will be propagated to the return value,
+ * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
  */
-struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+struct dentry *d_obtain_alias(struct inode *inode)
 {
-       struct dentry *new = NULL;
-
-       if (IS_ERR(inode))
-               return ERR_CAST(inode);
+       return __d_obtain_alias(inode, 1);
+}
+EXPORT_SYMBOL(d_obtain_alias);
 
-       if (inode && S_ISDIR(inode->i_mode)) {
-               spin_lock(&inode->i_lock);
-               new = __d_find_alias(inode, 1);
-               if (new) {
-                       BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
-                       spin_unlock(&inode->i_lock);
-                       security_d_instantiate(new, inode);
-                       d_move(new, dentry);
-                       iput(inode);
-               } else {
-                       /* already taking inode->i_lock, so d_add() by hand */
-                       __d_instantiate(dentry, inode);
-                       spin_unlock(&inode->i_lock);
-                       security_d_instantiate(dentry, inode);
-                       d_rehash(dentry);
-               }
-       } else {
-               d_instantiate(dentry, inode);
-               if (d_unhashed(dentry))
-                       d_rehash(dentry);
-       }
-       return new;
+/**
+ * d_obtain_root - find or allocate a dentry for a given inode
+ * @inode: inode to allocate the dentry for
+ *
+ * Obtain an IS_ROOT dentry for the root of a filesystem.
+ *
+ * We must ensure that directory inodes only ever have one dentry.  If a
+ * dentry is found, that is returned instead of allocating a new one.
+ *
+ * On successful return, the reference to the inode has been transferred
+ * to the dentry.  In case of an error the reference on the inode is
+ * released.  A %NULL or IS_ERR inode may be passed in and will be the
+ * error will be propagate to the return value, with a %NULL @inode
+ * replaced by ERR_PTR(-ESTALE).
+ */
+struct dentry *d_obtain_root(struct inode *inode)
+{
+       return __d_obtain_alias(inode, 0);
 }
-EXPORT_SYMBOL(d_splice_alias);
+EXPORT_SYMBOL(d_obtain_root);
 
 /**
  * d_add_ci - lookup or allocate new dentry with case-exact name
@@ -2696,6 +2667,75 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
        /* anon->d_lock still locked, returns locked */
 }
 
+/**
+ * d_splice_alias - splice a disconnected dentry into the tree if one exists
+ * @inode:  the inode which may have a disconnected dentry
+ * @dentry: a negative dentry which we want to point to the inode.
+ *
+ * If inode is a directory and has an IS_ROOT alias, then d_move that in
+ * place of the given dentry and return it, else simply d_add the inode
+ * to the dentry and return NULL.
+ *
+ * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
+ * we should error out: directories can't have multiple aliases.
+ *
+ * This is needed in the lookup routine of any filesystem that is exportable
+ * (via knfsd) so that we can build dcache paths to directories effectively.
+ *
+ * If a dentry was found and moved, then it is returned.  Otherwise NULL
+ * is returned.  This matches the expected return value of ->lookup.
+ *
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
+ */
+struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+{
+       struct dentry *new = NULL;
+
+       if (IS_ERR(inode))
+               return ERR_CAST(inode);
+
+       if (inode && S_ISDIR(inode->i_mode)) {
+               spin_lock(&inode->i_lock);
+               new = __d_find_any_alias(inode);
+               if (new) {
+                       if (!IS_ROOT(new)) {
+                               spin_unlock(&inode->i_lock);
+                               dput(new);
+                               return ERR_PTR(-EIO);
+                       }
+                       if (d_ancestor(new, dentry)) {
+                               spin_unlock(&inode->i_lock);
+                               dput(new);
+                               return ERR_PTR(-EIO);
+                       }
+                       write_seqlock(&rename_lock);
+                       __d_materialise_dentry(dentry, new);
+                       write_sequnlock(&rename_lock);
+                       __d_drop(new);
+                       _d_rehash(new);
+                       spin_unlock(&new->d_lock);
+                       spin_unlock(&inode->i_lock);
+                       security_d_instantiate(new, inode);
+                       iput(inode);
+               } else {
+                       /* already taking inode->i_lock, so d_add() by hand */
+                       __d_instantiate(dentry, inode);
+                       spin_unlock(&inode->i_lock);
+                       security_d_instantiate(dentry, inode);
+                       d_rehash(dentry);
+               }
+       } else {
+               d_instantiate(dentry, inode);
+               if (d_unhashed(dentry))
+                       d_rehash(dentry);
+       }
+       return new;
+}
+EXPORT_SYMBOL(d_splice_alias);
+
 /**
  * d_materialise_unique - introduce an inode into the tree
  * @dentry: candidate dentry
@@ -2724,7 +2764,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
                struct dentry *alias;
 
                /* Does an aliased dentry already exist? */
-               alias = __d_find_alias(inode, 0);
+               alias = __d_find_alias(inode);
                if (alias) {
                        actual = alias;
                        write_seqlock(&rename_lock);
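
The dcache changes above split the old d_obtain_alias() into a shared __d_obtain_alias() helper, so callers that want the root of a filesystem (btrfs and ceph in this series) can get an IS_ROOT dentry without the DCACHE_DISCONNECTED flag. A hedged sketch of the intended call site in a mount path (the examplefs names are hypothetical):

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	/* examplefs_get_root_inode() stands in for however the
	 * filesystem reads its root inode. */
	struct inode *root_inode = examplefs_get_root_inode(sb);

	/* d_obtain_root() accepts a NULL or IS_ERR inode and folds the
	 * error into its return value, so no separate check of
	 * root_inode is needed first. */
	sb->s_root = d_obtain_root(root_inode);
	if (IS_ERR(sb->s_root)) {
		int err = PTR_ERR(sb->s_root);

		sb->s_root = NULL;
		return err;
	}
	return 0;
}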
index 17e39b047de5b8a64349c86a5f9432efcb19f2e5..c3116404ab49a29c3b11fb53c175132f715750a5 100644 (file)
@@ -158,7 +158,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 {
        ssize_t ret;
 
-       ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE,
+       ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES,
                                &sdio->from);
 
        if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
index 3750031cfa2f796af344e19f4cc234df5f0c1eba..b88edc05c2300871ea70ed16b6c42ece6b3dcf2f 100644 (file)
@@ -161,7 +161,7 @@ static struct kmem_cache * ext2_inode_cachep;
 static struct inode *ext2_alloc_inode(struct super_block *sb)
 {
        struct ext2_inode_info *ei;
-       ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
+       ei = kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
        if (!ei)
                return NULL;
        ei->i_block_alloc_info = NULL;
index 3520ab8a66390c03d0fd95adccb6d7c8d74bb0c7..b147a67baa0d976d026857308350d85b8de72e01 100644 (file)
@@ -3455,7 +3455,6 @@ const struct inode_operations ext4_dir_inode_operations = {
        .rmdir          = ext4_rmdir,
        .mknod          = ext4_mknod,
        .tmpfile        = ext4_tmpfile,
-       .rename         = ext4_rename,
        .rename2        = ext4_rename2,
        .setattr        = ext4_setattr,
        .setxattr       = generic_setxattr,
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
new file mode 100644 (file)
index 0000000..9368236
--- /dev/null
@@ -0,0 +1,78 @@
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/fs_pin.h>
+#include "internal.h"
+#include "mount.h"
+
+static void pin_free_rcu(struct rcu_head *head)
+{
+       kfree(container_of(head, struct fs_pin, rcu));
+}
+
+static DEFINE_SPINLOCK(pin_lock);
+
+void pin_put(struct fs_pin *p)
+{
+       if (atomic_long_dec_and_test(&p->count))
+               call_rcu(&p->rcu, pin_free_rcu);
+}
+
+void pin_remove(struct fs_pin *pin)
+{
+       spin_lock(&pin_lock);
+       hlist_del(&pin->m_list);
+       hlist_del(&pin->s_list);
+       spin_unlock(&pin_lock);
+}
+
+void pin_insert(struct fs_pin *pin, struct vfsmount *m)
+{
+       spin_lock(&pin_lock);
+       hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins);
+       hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
+       spin_unlock(&pin_lock);
+}
+
+void mnt_pin_kill(struct mount *m)
+{
+       while (1) {
+               struct hlist_node *p;
+               struct fs_pin *pin;
+               rcu_read_lock();
+               p = ACCESS_ONCE(m->mnt_pins.first);
+               if (!p) {
+                       rcu_read_unlock();
+                       break;
+               }
+               pin = hlist_entry(p, struct fs_pin, m_list);
+               if (!atomic_long_inc_not_zero(&pin->count)) {
+                       rcu_read_unlock();
+                       cpu_relax();
+                       continue;
+               }
+               rcu_read_unlock();
+               pin->kill(pin);
+       }
+}
+
+void sb_pin_kill(struct super_block *sb)
+{
+       while (1) {
+               struct hlist_node *p;
+               struct fs_pin *pin;
+               rcu_read_lock();
+               p = ACCESS_ONCE(sb->s_pins.first);
+               if (!p) {
+                       rcu_read_unlock();
+                       break;
+               }
+               pin = hlist_entry(p, struct fs_pin, s_list);
+               if (!atomic_long_inc_not_zero(&pin->count)) {
+                       rcu_read_unlock();
+                       cpu_relax();
+                       continue;
+               }
+               rcu_read_unlock();
+               pin->kill(pin);
+       }
+}
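
fs_pin.c above only supplies the mechanism; it becomes useful once a subsystem embeds a struct fs_pin in its own object, takes the initial count, and provides the kill hook that mnt_pin_kill()/sb_pin_kill() invoke. A sketch of such a client follows; all example names are hypothetical, and the fs_pin fields (count, kill, the list nodes) are assumed from their use in the code above:

struct example_pin {
	struct fs_pin pin;	/* assumed first member: pin_free_rcu()
				 * kfree()s the fs_pin pointer itself */
	struct file *file;	/* whatever keeps the mount busy */
};

static void example_pin_kill(struct fs_pin *p)
{
	struct example_pin *ep = container_of(p, struct example_pin, pin);

	fput(ep->file);	/* release what pinned the mount */
	pin_remove(p);	/* unhook both lists so the kill loop advances */
	pin_put(p);	/* drop the reference mnt_pin_kill() took */
	pin_put(p);	/* drop the initial reference; frees via RCU */
}

static int example_pin_mount(struct vfsmount *m, struct file *file)
{
	struct example_pin *ep = kmalloc(sizeof(*ep), GFP_KERNEL);

	if (!ep)
		return -ENOMEM;
	atomic_long_set(&ep->pin.count, 1);
	ep->pin.kill = example_pin_kill;
	ep->file = file;
	pin_insert(&ep->pin, m);
	return 0;
}

The retry loop in mnt_pin_kill() relies on exactly this shape: atomic_long_inc_not_zero() either grabs a reference so ->kill can run safely, or observes a pin already on its way to pin_free_rcu() and spins until it drops off the list.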
index 0c6048247a34eb16a5146f7ea67479a21fec6173..de1d84af9f7c6baf27b64218fbff43604ab47c04 100644 (file)
@@ -845,12 +845,6 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
        return err;
 }
 
-static int fuse_rename(struct inode *olddir, struct dentry *oldent,
-                      struct inode *newdir, struct dentry *newent)
-{
-       return fuse_rename2(olddir, oldent, newdir, newent, 0);
-}
-
 static int fuse_link(struct dentry *entry, struct inode *newdir,
                     struct dentry *newent)
 {
@@ -2024,7 +2018,6 @@ static const struct inode_operations fuse_dir_inode_operations = {
        .symlink        = fuse_symlink,
        .unlink         = fuse_unlink,
        .rmdir          = fuse_rmdir,
-       .rename         = fuse_rename,
        .rename2        = fuse_rename2,
        .link           = fuse_link,
        .setattr        = fuse_setattr,
index 40ac2628ddcf46f3be8fe96ed46bb37ee6ef1c62..912061ac4baf9e6eeca15ea82394052e2583a809 100644 (file)
@@ -1303,10 +1303,10 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
        while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
                unsigned npages;
                size_t start;
-               unsigned n = req->max_pages - req->num_pages;
                ssize_t ret = iov_iter_get_pages(ii,
                                        &req->pages[req->num_pages],
-                                       n * PAGE_SIZE, &start);
+                                       req->max_pages - req->num_pages,
+                                       &start);
                if (ret < 0)
                        return ret;
 
index 9c88da0e855a1001fd50e9979c3cc487d7bc80dc..4fcd40d6f30820817892e4908477aefc06073e85 100644 (file)
@@ -89,6 +89,7 @@ extern int do_mknod(const char *file, int mode, unsigned int major,
 extern int link_file(const char *from, const char *to);
 extern int hostfs_do_readlink(char *file, char *buf, int size);
 extern int rename_file(char *from, char *to);
+extern int rename2_file(char *from, char *to, unsigned int flags);
 extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
                     long long *bfree_out, long long *bavail_out,
                     long long *files_out, long long *ffree_out,
index bb529f3b7f2bf8a119ec5f2eba3c260e09c361c7..fd62cae0fdcb66db03712d419c25014312112546 100644 (file)
@@ -741,21 +741,31 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
        return err;
 }
 
-static int hostfs_rename(struct inode *from_ino, struct dentry *from,
-                        struct inode *to_ino, struct dentry *to)
+static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+                         struct inode *new_dir, struct dentry *new_dentry,
+                         unsigned int flags)
 {
-       char *from_name, *to_name;
+       char *old_name, *new_name;
        int err;
 
-       if ((from_name = dentry_name(from)) == NULL)
+       if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+               return -EINVAL;
+
+       old_name = dentry_name(old_dentry);
+       if (old_name == NULL)
                return -ENOMEM;
-       if ((to_name = dentry_name(to)) == NULL) {
-               __putname(from_name);
+       new_name = dentry_name(new_dentry);
+       if (new_name == NULL) {
+               __putname(old_name);
                return -ENOMEM;
        }
-       err = rename_file(from_name, to_name);
-       __putname(from_name);
-       __putname(to_name);
+       if (!flags)
+               err = rename_file(old_name, new_name);
+       else
+               err = rename2_file(old_name, new_name, flags);
+
+       __putname(old_name);
+       __putname(new_name);
        return err;
 }
 
@@ -867,7 +877,7 @@ static const struct inode_operations hostfs_dir_iops = {
        .mkdir          = hostfs_mkdir,
        .rmdir          = hostfs_rmdir,
        .mknod          = hostfs_mknod,
-       .rename         = hostfs_rename,
+       .rename2        = hostfs_rename2,
        .permission     = hostfs_permission,
        .setattr        = hostfs_setattr,
 };
index 67838f3aa20a8fa6937eb6e48b39a9bef15d51aa..9765dab95cbdaefa4e53d8fdd51a72ffca4e6061 100644 (file)
@@ -14,6 +14,7 @@
 #include <sys/time.h>
 #include <sys/types.h>
 #include <sys/vfs.h>
+#include <sys/syscall.h>
 #include "hostfs.h"
 #include <utime.h>
 
@@ -360,6 +361,33 @@ int rename_file(char *from, char *to)
        return 0;
 }
 
+int rename2_file(char *from, char *to, unsigned int flags)
+{
+       int err;
+
+#ifndef SYS_renameat2
+#  ifdef __x86_64__
+#    define SYS_renameat2 316
+#  endif
+#  ifdef __i386__
+#    define SYS_renameat2 353
+#  endif
+#endif
+
+#ifdef SYS_renameat2
+       err = syscall(SYS_renameat2, AT_FDCWD, from, AT_FDCWD, to, flags);
+       if (err < 0) {
+               if (errno != ENOSYS)
+                       return -errno;
+               else
+                       return -EINVAL;
+       }
+       return 0;
+#else
+       return -EINVAL;
+#endif
+}
+
 int do_statfs(char *root, long *bsize_out, long long *blocks_out,
              long long *bfree_out, long long *bavail_out,
              long long *files_out, long long *ffree_out,
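
Because hostfs runs as a userspace process, rename2_file() above has to invoke renameat2 by syscall number; host glibc at this point has no wrapper. The same path can be exercised from any host program, which also demonstrates the -ENOSYS fallback the hunk handles on pre-3.15 hosts. A minimal host-side sketch (assumes the x86-64 syscall number from the #ifdef ladder above; the flags value 1 is RENAME_NOREPLACE):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SYS_renameat2
#define SYS_renameat2 316	/* x86_64 */
#endif

int main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s FROM TO\n", argv[0]);
		return 2;
	}
	/* Fails with EEXIST instead of silently overwriting TO. */
	if (syscall(SYS_renameat2, AT_FDCWD, argv[1], AT_FDCWD, argv[2],
		    1 /* RENAME_NOREPLACE */) < 0) {
		fprintf(stderr, "renameat2: %s\n", strerror(errno));
		return 1;
	}
	return 0;
}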
index 4657424074668dcb933c8d375244b1d3b9d72561..e325b4f9c799669ebfa02e0ee143231e98510ba6 100644 (file)
@@ -131,7 +131,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
 /*
  * read_write.c
  */
-extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
 extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
 
 /*
@@ -144,3 +143,9 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
  * pipe.c
  */
 extern const struct file_operations pipefifo_fops;
+
+/*
+ * fs_pin.c
+ */
+extern void sb_pin_kill(struct super_block *sb);
+extern void mnt_pin_kill(struct mount *m);
index d55297f2fa058c18512d0136b8a5437b29bc481c..6740a621552950e7a795b249f635b5eeea7065d1 100644 (file)
@@ -55,7 +55,7 @@ struct mount {
        int mnt_id;                     /* mount identifier */
        int mnt_group_id;               /* peer group identifier */
        int mnt_expiry_mark;            /* true if marked for expiry */
-       int mnt_pinned;
+       struct hlist_head mnt_pins;
        struct path mnt_ex_mountpoint;
 };
 
index 9eb787e5c167fb601845590f0181d249bf515fb0..a996bb48dfabf4f645dced56f527ed3d5568bb2c 100644 (file)
@@ -1091,10 +1091,10 @@ int follow_down_one(struct path *path)
 }
 EXPORT_SYMBOL(follow_down_one);
 
-static inline bool managed_dentry_might_block(struct dentry *dentry)
+static inline int managed_dentry_rcu(struct dentry *dentry)
 {
-       return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
-               dentry->d_op->d_manage(dentry, true) < 0);
+       return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
+               dentry->d_op->d_manage(dentry, true) : 0;
 }
 
 /*
@@ -1110,11 +1110,18 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
                 * Don't forget we might have a non-mountpoint managed dentry
                 * that wants to block transit.
                 */
-               if (unlikely(managed_dentry_might_block(path->dentry)))
+               switch (managed_dentry_rcu(path->dentry)) {
+               case -ECHILD:
+               default:
                        return false;
+               case -EISDIR:
+                       return true;
+               case 0:
+                       break;
+               }
 
                if (!d_mountpoint(path->dentry))
-                       return true;
+                       return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
 
                mounted = __lookup_mnt(path->mnt, path->dentry);
                if (!mounted)
@@ -1130,7 +1137,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
                 */
                *inode = path->dentry->d_inode;
        }
-       return read_seqretry(&mount_lock, nd->m_seq);
+       return read_seqretry(&mount_lock, nd->m_seq) &&
+               !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
 }
 
 static int follow_dotdot_rcu(struct nameidata *nd)
@@ -1402,11 +1410,8 @@ static int lookup_fast(struct nameidata *nd,
                }
                path->mnt = mnt;
                path->dentry = dentry;
-               if (unlikely(!__follow_mount_rcu(nd, path, inode)))
-                       goto unlazy;
-               if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-                       goto unlazy;
-               return 0;
+               if (likely(__follow_mount_rcu(nd, path, inode)))
+                       return 0;
 unlazy:
                if (unlazy_walk(nd, dentry))
                        return -ECHILD;
@@ -4019,7 +4024,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  * The worst of all namespace operations - renaming directory. "Perverted"
  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
  * Problems:
- *     a) we can get into loop creation. Check is done in is_subdir().
+ *     a) we can get into loop creation.
  *     b) race potential - two innocent renames can create a loop together.
  *        That's where 4.4 screws up. Current fix: serialization on
  *        sb->s_vfs_rename_mutex. We might be more accurate, but that's another
@@ -4075,7 +4080,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (error)
                return error;
 
-       if (!old_dir->i_op->rename)
+       if (!old_dir->i_op->rename && !old_dir->i_op->rename2)
                return -EPERM;
 
        if (flags && !old_dir->i_op->rename2)
@@ -4134,10 +4139,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                if (error)
                        goto out;
        }
-       if (!flags) {
+       if (!old_dir->i_op->rename2) {
                error = old_dir->i_op->rename(old_dir, old_dentry,
                                              new_dir, new_dentry);
        } else {
+               WARN_ON(old_dir->i_op->rename != NULL);
                error = old_dir->i_op->rename2(old_dir, old_dentry,
                                               new_dir, new_dentry, flags);
        }
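
Two protocol changes in the namei.c hunks above deserve a note. First, managed_dentry_rcu() turns d_manage(dentry, true) into a three-way return: -EISDIR tells RCU-walk to treat the dentry as a plain directory and skip mount/automount transit, any other negative value (typically -ECHILD) forces a drop to ref-walk, and 0 continues normal transit handling. Second, vfs_rename() now prefers ->rename2 whenever it exists, even for flags == 0, and warns if a filesystem defines both methods, which is why the ext4 and fuse hunks earlier in this section delete their .rename entries. A hedged sketch of a d_manage() implementation speaking the new RCU-walk protocol (the examplefs names, predicate, and wait queue are hypothetical):

static int examplefs_d_manage(struct dentry *dentry, bool rcu_walk)
{
	if (rcu_walk) {
		/* Nothing to wait for: tell RCU-walk to use the dentry
		 * as an ordinary directory rather than bailing out. */
		if (examplefs_nothing_to_transit(dentry))
			return -EISDIR;
		return -ECHILD;	/* would need to sleep: use ref-walk */
	}

	/* Ref-walk may block until transit is allowed again. */
	wait_event(examplefs_wq, examplefs_may_transit(dentry));
	return 0;
}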
index 0acabea583191aaa427fab317429210756e1a643..a01c7730e9af3ad07f3e993d2f7f27f04f615604 100644 (file)
@@ -16,7 +16,6 @@
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/idr.h>
-#include <linux/acct.h>                /* acct_auto_close_mnt */
 #include <linux/init.h>                /* init_rootfs */
 #include <linux/fs_struct.h>   /* get_fs_root et.al. */
 #include <linux/fsnotify.h>    /* fsnotify_vfsmount_delete */
@@ -779,6 +778,20 @@ static void attach_mnt(struct mount *mnt,
        list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
 }
 
+static void attach_shadowed(struct mount *mnt,
+                       struct mount *parent,
+                       struct mount *shadows)
+{
+       if (shadows) {
+               hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
+               list_add(&mnt->mnt_child, &shadows->mnt_child);
+       } else {
+               hlist_add_head_rcu(&mnt->mnt_hash,
+                               m_hash(&parent->mnt, mnt->mnt_mountpoint));
+               list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+       }
+}
+
 /*
  * vfsmount lock must be held for write
  */
@@ -797,12 +810,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
 
        list_splice(&head, n->list.prev);
 
-       if (shadows)
-               hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
-       else
-               hlist_add_head_rcu(&mnt->mnt_hash,
-                               m_hash(&parent->mnt, mnt->mnt_mountpoint));
-       list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+       attach_shadowed(mnt, parent, shadows);
        touch_mnt_namespace(n);
 }
 
@@ -951,7 +959,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 
 static void mntput_no_expire(struct mount *mnt)
 {
-put_again:
        rcu_read_lock();
        mnt_add_count(mnt, -1);
        if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
@@ -964,14 +971,6 @@ put_again:
                unlock_mount_hash();
                return;
        }
-       if (unlikely(mnt->mnt_pinned)) {
-               mnt_add_count(mnt, mnt->mnt_pinned + 1);
-               mnt->mnt_pinned = 0;
-               rcu_read_unlock();
-               unlock_mount_hash();
-               acct_auto_close_mnt(&mnt->mnt);
-               goto put_again;
-       }
        if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
                rcu_read_unlock();
                unlock_mount_hash();
@@ -994,6 +993,8 @@ put_again:
         * so mnt_get_writers() below is safe.
         */
        WARN_ON(mnt_get_writers(mnt));
+       if (unlikely(mnt->mnt_pins.first))
+               mnt_pin_kill(mnt);
        fsnotify_vfsmount_delete(&mnt->mnt);
        dput(mnt->mnt.mnt_root);
        deactivate_super(mnt->mnt.mnt_sb);
@@ -1021,25 +1022,15 @@ struct vfsmount *mntget(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL(mntget);
 
-void mnt_pin(struct vfsmount *mnt)
-{
-       lock_mount_hash();
-       real_mount(mnt)->mnt_pinned++;
-       unlock_mount_hash();
-}
-EXPORT_SYMBOL(mnt_pin);
-
-void mnt_unpin(struct vfsmount *m)
+struct vfsmount *mnt_clone_internal(struct path *path)
 {
-       struct mount *mnt = real_mount(m);
-       lock_mount_hash();
-       if (mnt->mnt_pinned) {
-               mnt_add_count(mnt, 1);
-               mnt->mnt_pinned--;
-       }
-       unlock_mount_hash();
+       struct mount *p;
+       p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
+       if (IS_ERR(p))
+               return ERR_CAST(p);
+       p->mnt.mnt_flags |= MNT_INTERNAL;
+       return &p->mnt;
 }
-EXPORT_SYMBOL(mnt_unpin);
 
 static inline void mangle(struct seq_file *m, const char *s)
 {
@@ -1505,6 +1496,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
                        continue;
 
                for (s = r; s; s = next_mnt(s, r)) {
+                       struct mount *t = NULL;
                        if (!(flag & CL_COPY_UNBINDABLE) &&
                            IS_MNT_UNBINDABLE(s)) {
                                s = skip_mnt_tree(s);
@@ -1526,7 +1518,14 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
                                goto out;
                        lock_mount_hash();
                        list_add_tail(&q->mnt_list, &res->mnt_list);
-                       attach_mnt(q, parent, p->mnt_mp);
+                       mnt_set_mountpoint(parent, p->mnt_mp, q);
+                       if (!list_empty(&parent->mnt_mounts)) {
+                               t = list_last_entry(&parent->mnt_mounts,
+                                       struct mount, mnt_child);
+                               if (t->mnt_mp != p->mnt_mp)
+                                       t = NULL;
+                       }
+                       attach_shadowed(q, parent, t);
                        unlock_mount_hash();
                }
        }
index 9b431f44fad9d19a31d6c61ca3986453b137f4d4..cbb1797149d5731a77f22b97fc4a3ba2cdfbacfd 100644 (file)
@@ -210,8 +210,7 @@ static void bl_end_io_read(struct bio *bio, int err)
                        SetPageUptodate(bvec->bv_page);
 
        if (err) {
-               struct nfs_pgio_data *rdata = par->data;
-               struct nfs_pgio_header *header = rdata->header;
+               struct nfs_pgio_header *header = par->data;
 
                if (!header->pnfs_error)
                        header->pnfs_error = -EIO;
@@ -224,43 +223,44 @@ static void bl_end_io_read(struct bio *bio, int err)
 static void bl_read_cleanup(struct work_struct *work)
 {
        struct rpc_task *task;
-       struct nfs_pgio_data *rdata;
+       struct nfs_pgio_header *hdr;
        dprintk("%s enter\n", __func__);
        task = container_of(work, struct rpc_task, u.tk_work);
-       rdata = container_of(task, struct nfs_pgio_data, task);
-       pnfs_ld_read_done(rdata);
+       hdr = container_of(task, struct nfs_pgio_header, task);
+       pnfs_ld_read_done(hdr);
 }
 
 static void
 bl_end_par_io_read(void *data, int unused)
 {
-       struct nfs_pgio_data *rdata = data;
+       struct nfs_pgio_header *hdr = data;
 
-       rdata->task.tk_status = rdata->header->pnfs_error;
-       INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
-       schedule_work(&rdata->task.u.tk_work);
+       hdr->task.tk_status = hdr->pnfs_error;
+       INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup);
+       schedule_work(&hdr->task.u.tk_work);
 }
 
 static enum pnfs_try_status
-bl_read_pagelist(struct nfs_pgio_data *rdata)
+bl_read_pagelist(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *header = rdata->header;
+       struct nfs_pgio_header *header = hdr;
        int i, hole;
        struct bio *bio = NULL;
        struct pnfs_block_extent *be = NULL, *cow_read = NULL;
        sector_t isect, extent_length = 0;
        struct parallel_io *par;
-       loff_t f_offset = rdata->args.offset;
-       size_t bytes_left = rdata->args.count;
+       loff_t f_offset = hdr->args.offset;
+       size_t bytes_left = hdr->args.count;
        unsigned int pg_offset, pg_len;
-       struct page **pages = rdata->args.pages;
-       int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+       struct page **pages = hdr->args.pages;
+       int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT;
        const bool is_dio = (header->dreq != NULL);
 
        dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
-              rdata->pages.npages, f_offset, (unsigned int)rdata->args.count);
+               hdr->page_array.npages, f_offset,
+               (unsigned int)hdr->args.count);
 
-       par = alloc_parallel(rdata);
+       par = alloc_parallel(hdr);
        if (!par)
                goto use_mds;
        par->pnfs_callback = bl_end_par_io_read;
@@ -268,7 +268,7 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
 
        isect = (sector_t) (f_offset >> SECTOR_SHIFT);
        /* Code assumes extents are page-aligned */
-       for (i = pg_index; i < rdata->pages.npages; i++) {
+       for (i = pg_index; i < hdr->page_array.npages; i++) {
                if (!extent_length) {
                        /* We've used up the previous extent */
                        bl_put_extent(be);
@@ -317,7 +317,8 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
                        struct pnfs_block_extent *be_read;
 
                        be_read = (hole && cow_read) ? cow_read : be;
-                       bio = do_add_page_to_bio(bio, rdata->pages.npages - i,
+                       bio = do_add_page_to_bio(bio,
+                                                hdr->page_array.npages - i,
                                                 READ,
                                                 isect, pages[i], be_read,
                                                 bl_end_io_read, par,
@@ -332,10 +333,10 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
                extent_length -= PAGE_CACHE_SECTORS;
        }
        if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
-               rdata->res.eof = 1;
-               rdata->res.count = header->inode->i_size - rdata->args.offset;
+               hdr->res.eof = 1;
+               hdr->res.count = header->inode->i_size - hdr->args.offset;
        } else {
-               rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset;
+               hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset;
        }
 out:
        bl_put_extent(be);
@@ -390,8 +391,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
        }
 
        if (unlikely(err)) {
-               struct nfs_pgio_data *data = par->data;
-               struct nfs_pgio_header *header = data->header;
+               struct nfs_pgio_header *header = par->data;
 
                if (!header->pnfs_error)
                        header->pnfs_error = -EIO;
@@ -405,8 +405,7 @@ static void bl_end_io_write(struct bio *bio, int err)
 {
        struct parallel_io *par = bio->bi_private;
        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-       struct nfs_pgio_data *data = par->data;
-       struct nfs_pgio_header *header = data->header;
+       struct nfs_pgio_header *header = par->data;
 
        if (!uptodate) {
                if (!header->pnfs_error)
@@ -423,32 +422,32 @@ static void bl_end_io_write(struct bio *bio, int err)
 static void bl_write_cleanup(struct work_struct *work)
 {
        struct rpc_task *task;
-       struct nfs_pgio_data *wdata;
+       struct nfs_pgio_header *hdr;
        dprintk("%s enter\n", __func__);
        task = container_of(work, struct rpc_task, u.tk_work);
-       wdata = container_of(task, struct nfs_pgio_data, task);
-       if (likely(!wdata->header->pnfs_error)) {
+       hdr = container_of(task, struct nfs_pgio_header, task);
+       if (likely(!hdr->pnfs_error)) {
                /* Marks for LAYOUTCOMMIT */
-               mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg),
-                                    wdata->args.offset, wdata->args.count);
+               mark_extents_written(BLK_LSEG2EXT(hdr->lseg),
+                                    hdr->args.offset, hdr->args.count);
        }
-       pnfs_ld_write_done(wdata);
+       pnfs_ld_write_done(hdr);
 }
 
 /* Called when last of bios associated with a bl_write_pagelist call finishes */
 static void bl_end_par_io_write(void *data, int num_se)
 {
-       struct nfs_pgio_data *wdata = data;
+       struct nfs_pgio_header *hdr = data;
 
-       if (unlikely(wdata->header->pnfs_error)) {
-               bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval,
+       if (unlikely(hdr->pnfs_error)) {
+               bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval,
                                        num_se);
        }
 
-       wdata->task.tk_status = wdata->header->pnfs_error;
-       wdata->verf.committed = NFS_FILE_SYNC;
-       INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
-       schedule_work(&wdata->task.u.tk_work);
+       hdr->task.tk_status = hdr->pnfs_error;
+       hdr->verf.committed = NFS_FILE_SYNC;
+       INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
+       schedule_work(&hdr->task.u.tk_work);
 }
 
 /* FIXME STUB - mark intersection of layout and page as bad, so is not
@@ -673,18 +672,17 @@ check_page:
 }
 
 static enum pnfs_try_status
-bl_write_pagelist(struct nfs_pgio_data *wdata, int sync)
+bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 {
-       struct nfs_pgio_header *header = wdata->header;
        int i, ret, npg_zero, pg_index, last = 0;
        struct bio *bio = NULL;
        struct pnfs_block_extent *be = NULL, *cow_read = NULL;
        sector_t isect, last_isect = 0, extent_length = 0;
        struct parallel_io *par = NULL;
-       loff_t offset = wdata->args.offset;
-       size_t count = wdata->args.count;
+       loff_t offset = header->args.offset;
+       size_t count = header->args.count;
        unsigned int pg_offset, pg_len, saved_len;
-       struct page **pages = wdata->args.pages;
+       struct page **pages = header->args.pages;
        struct page *page;
        pgoff_t index;
        u64 temp;
@@ -699,11 +697,11 @@ bl_write_pagelist(struct nfs_pgio_data *wdata, int sync)
                dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
                goto out_mds;
        }
-       /* At this point, wdata->pages is a (sequential) list of nfs_pages.
+       /* At this point, header->page_array is a (sequential) list of nfs_pages.
         * We want to write each, and if there is an error set pnfs_error
         * to have it redone using nfs.
         */
-       par = alloc_parallel(wdata);
+       par = alloc_parallel(header);
        if (!par)
                goto out_mds;
        par->pnfs_callback = bl_end_par_io_write;
@@ -790,8 +788,8 @@ next_page:
        bio = bl_submit_bio(WRITE, bio);
 
        /* Middle pages */
-       pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
-       for (i = pg_index; i < wdata->pages.npages; i++) {
+       pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+       for (i = pg_index; i < header->page_array.npages; i++) {
                if (!extent_length) {
                        /* We've used up the previous extent */
                        bl_put_extent(be);
@@ -862,7 +860,8 @@ next_page:
                }
 
 
-               bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
+               bio = do_add_page_to_bio(bio, header->page_array.npages - i,
+                                        WRITE,
                                         isect, pages[i], be,
                                         bl_end_io_write, par,
                                         pg_offset, pg_len);
@@ -890,7 +889,7 @@ next_page:
        }
 
 write_done:
-       wdata->res.count = wdata->args.count;
+       header->res.count = header->args.count;
 out:
        bl_put_extent(be);
        bl_put_extent(cow_read);
@@ -1063,7 +1062,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
                return ERR_PTR(-ENOMEM);
        }
 
-       pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
+       pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS);
        if (pages == NULL) {
                kfree(dev);
                return ERR_PTR(-ENOMEM);
index 073b4cf67ed9d39690626add5517708debeab021..54de482143cc0708638e12bde9fbbfbd66d59224 100644 (file)
@@ -428,6 +428,18 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
        if (p == NULL)
                return 0;
 
+       /*
+        * Did we get the acceptor from userland during the SETCLIENTID
+        * negotiation?
+        */
+       if (clp->cl_acceptor)
+               return !strcmp(p, clp->cl_acceptor);
+
+       /*
+        * Otherwise try to verify it using the cl_hostname. Note that this
+        * doesn't work if a non-canonical hostname was used in the devname.
+        */
+
        /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
 
        if (memcmp(p, "nfs@", 4) != 0)
index 180d1ec9c32ed511f3ce33f72088902ad050fffc..1c5ff6d5838585c4b6b0806aa64b579e58a64bc0 100644 (file)
@@ -110,8 +110,8 @@ struct nfs_subversion *get_nfs_version(unsigned int version)
                mutex_unlock(&nfs_version_mutex);
        }
 
-       if (!IS_ERR(nfs))
-               try_module_get(nfs->owner);
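+       /* Pin the version module while the caller uses it; if it is
+        * already being unloaded, report it as temporarily unavailable.
+        */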
+       if (!IS_ERR(nfs) && !try_module_get(nfs->owner))
+               return ERR_PTR(-EAGAIN);
        return nfs;
 }
 
@@ -158,7 +158,8 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
                goto error_0;
 
        clp->cl_nfs_mod = cl_init->nfs_mod;
-       try_module_get(clp->cl_nfs_mod->owner);
+       if (!try_module_get(clp->cl_nfs_mod->owner))
+               goto error_dealloc;
 
        clp->rpc_ops = clp->cl_nfs_mod->rpc_ops;
 
@@ -190,6 +191,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
 
 error_cleanup:
        put_nfs_version(clp->cl_nfs_mod);
+error_dealloc:
        kfree(clp);
 error_0:
        return ERR_PTR(err);
@@ -252,6 +254,7 @@ void nfs_free_client(struct nfs_client *clp)
        put_net(clp->cl_net);
        put_nfs_version(clp->cl_nfs_mod);
        kfree(clp->cl_hostname);
+       kfree(clp->cl_acceptor);
        kfree(clp);
 
        dprintk("<-- nfs_free_client()\n");
@@ -482,8 +485,13 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
        struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
        const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops;
 
+       if (cl_init->hostname == NULL) {
+               WARN_ON(1);
+               return NULL;
+       }
+
        dprintk("--> nfs_get_client(%s,v%u)\n",
-               cl_init->hostname ?: "", rpc_ops->version);
+               cl_init->hostname, rpc_ops->version);
 
        /* see if the client already exists */
        do {
@@ -510,7 +518,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
        } while (!IS_ERR(new));
 
        dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n",
-               cl_init->hostname ?: "", PTR_ERR(new));
+               cl_init->hostname, PTR_ERR(new));
        return new;
 }
 EXPORT_SYMBOL_GPL(nfs_get_client);
index 5d8ccecf5f5caada2de94bf30689ecd9e725a15a..5853f53db73246df670ce9daedb73e10d62d2da3 100644 (file)
@@ -41,14 +41,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
        set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
 }
 
-/**
- * nfs_have_delegation - check if inode has a delegation
- * @inode: inode to check
- * @flags: delegation types to check for
- *
- * Returns one if inode has the indicated delegation, otherwise zero.
- */
-int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+static int
+nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
 {
        struct nfs_delegation *delegation;
        int ret = 0;
@@ -58,12 +52,34 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags)
        delegation = rcu_dereference(NFS_I(inode)->delegation);
        if (delegation != NULL && (delegation->type & flags) == flags &&
            !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
-               nfs_mark_delegation_referenced(delegation);
+               if (mark)
+                       nfs_mark_delegation_referenced(delegation);
                ret = 1;
        }
        rcu_read_unlock();
        return ret;
 }
+
+/**
+ * nfs4_have_delegation - check if inode has a delegation, mark it
+ * NFS_DELEGATION_REFERENCED if there is one.
+ * @inode: inode to check
+ * @flags: delegation types to check for
+ *
+ * Returns one if inode has the indicated delegation, otherwise zero.
+ */
+int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+{
+       return nfs4_do_check_delegation(inode, flags, true);
+}
+
+/*
+ * nfs4_check_delegation - check if inode has a delegation, do not mark
+ * NFS_DELEGATION_REFERENCED if it has one.
+ */
+int nfs4_check_delegation(struct inode *inode, fmode_t flags)
+{
+       return nfs4_do_check_delegation(inode, flags, false);
+}
 
 static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
 {
index 9a79c7a99d6d6dd64b03f58481dc8d0fb4b32818..5c1cce39297f68fb178b4a125a83001aeac4afde 100644 (file)
@@ -59,6 +59,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_
 
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
+int nfs4_check_delegation(struct inode *inode, fmode_t flags);
 
 #endif
 
index 4a3d4ef76127bc716028d3d9df25791d91ff76ce..36d921f0c6026c27170b565f46eb4e26999ea812 100644 (file)
@@ -988,9 +988,13 @@ EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
  * A check for whether or not the parent directory has changed.
  * In the case it has, we assume that the dentries are untrustworthy
  * and may need to be looked up again.
+ * If rcu_walk prevents us from performing a full check, return 0.
  */
-static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
+static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
+                             int rcu_walk)
 {
+       int ret;
+
        if (IS_ROOT(dentry))
                return 1;
        if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
@@ -998,7 +1002,11 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
        if (!nfs_verify_change_attribute(dir, dentry->d_time))
                return 0;
        /* Revalidate nfsi->cache_change_attribute before we declare a match */
-       if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+       if (rcu_walk)
+               ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir);
+       else
+               ret = nfs_revalidate_inode(NFS_SERVER(dir), dir);
+       if (ret < 0)
                return 0;
        if (!nfs_verify_change_attribute(dir, dentry->d_time))
                return 0;
@@ -1042,6 +1050,8 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
 out:
        return (inode->i_nlink == 0) ? -ENOENT : 0;
 out_force:
+       if (flags & LOOKUP_RCU)
+               return -ECHILD;
        ret = __nfs_revalidate_inode(server, inode);
        if (ret != 0)
                return ret;
@@ -1054,6 +1064,9 @@ out_force:
  *
  * If parent mtime has changed, we revalidate, else we wait for a
  * period corresponding to the parent's attribute cache timeout value.
+ *
+ * If LOOKUP_RCU prevents us from performing a full check, return 1
+ * suggesting a reval is needed.
  */
 static inline
 int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
@@ -1064,7 +1077,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
                return 0;
        if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
                return 1;
-       return !nfs_check_verifier(dir, dentry);
+       return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
 }
 
 /*
@@ -1088,21 +1101,30 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
        struct nfs4_label *label = NULL;
        int error;
 
-       if (flags & LOOKUP_RCU)
-               return -ECHILD;
-
-       parent = dget_parent(dentry);
-       dir = parent->d_inode;
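+       /* In RCU-walk mode we may neither sleep nor take references, so
+        * sample the parent locklessly and bail out with -ECHILD if it
+        * is already going away.
+        */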
+       if (flags & LOOKUP_RCU) {
+               parent = ACCESS_ONCE(dentry->d_parent);
+               dir = ACCESS_ONCE(parent->d_inode);
+               if (!dir)
+                       return -ECHILD;
+       } else {
+               parent = dget_parent(dentry);
+               dir = parent->d_inode;
+       }
        nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
        inode = dentry->d_inode;
 
        if (!inode) {
-               if (nfs_neg_need_reval(dir, dentry, flags))
+               if (nfs_neg_need_reval(dir, dentry, flags)) {
+                       if (flags & LOOKUP_RCU)
+                               return -ECHILD;
                        goto out_bad;
+               }
                goto out_valid_noent;
        }
 
        if (is_bad_inode(inode)) {
+               if (flags & LOOKUP_RCU)
+                       return -ECHILD;
                dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
                                __func__, dentry);
                goto out_bad;
@@ -1112,12 +1134,20 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
                goto out_set_verifier;
 
        /* Force a full look up iff the parent directory has changed */
-       if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) {
-               if (nfs_lookup_verify_inode(inode, flags))
+       if (!nfs_is_exclusive_create(dir, flags) &&
+           nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
+               if (nfs_lookup_verify_inode(inode, flags)) {
+                       if (flags & LOOKUP_RCU)
+                               return -ECHILD;
                        goto out_zap_parent;
+               }
                goto out_valid;
        }
 
+       if (flags & LOOKUP_RCU)
+               return -ECHILD;
+
        if (NFS_STALE(inode))
                goto out_bad;
 
@@ -1153,13 +1183,18 @@ out_set_verifier:
        /* Success: notify readdir to use READDIRPLUS */
        nfs_advise_use_readdirplus(dir);
  out_valid_noent:
-       dput(parent);
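+       /* We hold no reference on the parent during a lockless walk, so
+        * recheck that the dentry was not moved before declaring it valid.
+        */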
+       if (flags & LOOKUP_RCU) {
+               if (parent != ACCESS_ONCE(dentry->d_parent))
+                       return -ECHILD;
+       } else
+               dput(parent);
        dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
                        __func__, dentry);
        return 1;
 out_zap_parent:
        nfs_zap_caches(dir);
  out_bad:
+       WARN_ON(flags & LOOKUP_RCU);
        nfs_free_fattr(fattr);
        nfs_free_fhandle(fhandle);
        nfs4_label_free(label);
@@ -1185,6 +1220,7 @@ out_zap_parent:
                        __func__, dentry);
        return 0;
 out_error:
+       WARN_ON(flags & LOOKUP_RCU);
        nfs_free_fattr(fattr);
        nfs_free_fhandle(fhandle);
        nfs4_label_free(label);
@@ -1529,14 +1565,9 @@ EXPORT_SYMBOL_GPL(nfs_atomic_open);
 
 static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 {
-       struct dentry *parent = NULL;
        struct inode *inode;
-       struct inode *dir;
        int ret = 0;
 
-       if (flags & LOOKUP_RCU)
-               return -ECHILD;
-
        if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
                goto no_open;
        if (d_mountpoint(dentry))
@@ -1545,34 +1576,47 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
                goto no_open;
 
        inode = dentry->d_inode;
-       parent = dget_parent(dentry);
-       dir = parent->d_inode;
 
        /* We can't create new files in nfs_open_revalidate(), so we
         * optimize away revalidation of negative dentries.
         */
        if (inode == NULL) {
+               struct dentry *parent;
+               struct inode *dir;
+
+               if (flags & LOOKUP_RCU) {
+                       parent = ACCESS_ONCE(dentry->d_parent);
+                       dir = ACCESS_ONCE(parent->d_inode);
+                       if (!dir)
+                               return -ECHILD;
+               } else {
+                       parent = dget_parent(dentry);
+                       dir = parent->d_inode;
+               }
                if (!nfs_neg_need_reval(dir, dentry, flags))
                        ret = 1;
+               else if (flags & LOOKUP_RCU)
+                       ret = -ECHILD;
+               if (!(flags & LOOKUP_RCU))
+                       dput(parent);
+               else if (parent != ACCESS_ONCE(dentry->d_parent))
+                       return -ECHILD;
                goto out;
        }
 
        /* NFS only supports OPEN on regular files */
        if (!S_ISREG(inode->i_mode))
-               goto no_open_dput;
+               goto no_open;
        /* We cannot do exclusive creation on a positive dentry */
        if (flags & LOOKUP_EXCL)
-               goto no_open_dput;
+               goto no_open;
 
        /* Let f_op->open() actually open (and revalidate) the file */
        ret = 1;
 
 out:
-       dput(parent);
        return ret;
 
-no_open_dput:
-       dput(parent);
 no_open:
        return nfs_lookup_revalidate(dentry, flags);
 }
@@ -2028,10 +2072,14 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock);
 static LIST_HEAD(nfs_access_lru_list);
 static atomic_long_t nfs_access_nr_entries;
 
+static unsigned long nfs_access_max_cachesize = ULONG_MAX;
+module_param(nfs_access_max_cachesize, ulong, 0644);
+MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length");
+
 static void nfs_access_free_entry(struct nfs_access_entry *entry)
 {
        put_rpccred(entry->cred);
-       kfree(entry);
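+       /* Lockless readers in nfs_access_get_cached_rcu() may still see
+        * this entry, so defer freeing it until a grace period elapses.
+        */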
+       kfree_rcu(entry, rcu_head);
        smp_mb__before_atomic();
        atomic_long_dec(&nfs_access_nr_entries);
        smp_mb__after_atomic();
@@ -2048,19 +2096,14 @@ static void nfs_access_free_list(struct list_head *head)
        }
 }
 
-unsigned long
-nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+nfs_do_access_cache_scan(unsigned int nr_to_scan)
 {
        LIST_HEAD(head);
        struct nfs_inode *nfsi, *next;
        struct nfs_access_entry *cache;
-       int nr_to_scan = sc->nr_to_scan;
-       gfp_t gfp_mask = sc->gfp_mask;
        long freed = 0;
 
-       if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
-               return SHRINK_STOP;
-
        spin_lock(&nfs_access_lru_lock);
        list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
                struct inode *inode;
@@ -2093,12 +2136,40 @@ remove_lru_entry:
        return freed;
 }
 
+unsigned long
+nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+       int nr_to_scan = sc->nr_to_scan;
+       gfp_t gfp_mask = sc->gfp_mask;
+
+       if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+               return SHRINK_STOP;
+       return nfs_do_access_cache_scan(nr_to_scan);
+}
+
 unsigned long
 nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc)
 {
        return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries));
 }
 
+static void
+nfs_access_cache_enforce_limit(void)
+{
+       long nr_entries = atomic_long_read(&nfs_access_nr_entries);
+       unsigned long diff;
+       unsigned int nr_to_scan;
+
+       if (nr_entries < 0 || nr_entries <= nfs_access_max_cachesize)
+               return;
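+       /* Reclaim in modest batches: scan at most 100 entries, or just
+        * the excess over the configured limit if that is smaller.
+        */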
+       nr_to_scan = 100;
+       diff = nr_entries - nfs_access_max_cachesize;
+       if (diff < nr_to_scan)
+               nr_to_scan = diff;
+       nfs_do_access_cache_scan(nr_to_scan);
+}
+
 static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
 {
        struct rb_root *root_node = &nfsi->access_cache;
@@ -2186,6 +2257,38 @@ out_zap:
        return -ENOENT;
 }
 
+static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+{
+       /* Only check the most recently returned cache entry,
+        * but do it without locking.
+        */
+       struct nfs_inode *nfsi = NFS_I(inode);
+       struct nfs_access_entry *cache;
+       int err = -ECHILD;
+       struct list_head *lh;
+
+       rcu_read_lock();
+       if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+               goto out;
+       lh = rcu_dereference(nfsi->access_cache_entry_lru.prev);
+       cache = list_entry(lh, struct nfs_access_entry, lru);
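+       /* An empty lru hands back the list head itself rather than a
+        * real entry; treat that (or a credential mismatch) as a miss.
+        */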
+       if (lh == &nfsi->access_cache_entry_lru ||
+           cred != cache->cred)
+               cache = NULL;
+       if (cache == NULL)
+               goto out;
+       if (!nfs_have_delegated_attributes(inode) &&
+           !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+               goto out;
+       res->jiffies = cache->jiffies;
+       res->cred = cache->cred;
+       res->mask = cache->mask;
+       err = 0;
+out:
+       rcu_read_unlock();
+       return err;
+}
+
 static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
@@ -2229,6 +2332,11 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
        cache->cred = get_rpccred(set->cred);
        cache->mask = set->mask;
 
+       /* The above field assignments must be visible
+        * before this item appears on the lru.  We cannot easily
+        * use rcu_assign_pointer, so just force the memory barrier.
+        */
+       smp_wmb();
        nfs_access_add_rbtree(inode, cache);
 
        /* Update accounting */
@@ -2244,6 +2352,7 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
                                        &nfs_access_lru_list);
                spin_unlock(&nfs_access_lru_lock);
        }
+       nfs_access_cache_enforce_limit();
 }
 EXPORT_SYMBOL_GPL(nfs_access_add_cache);
 
@@ -2267,10 +2376,16 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
 
        trace_nfs_access_enter(inode);
 
-       status = nfs_access_get_cached(inode, cred, &cache);
+       status = nfs_access_get_cached_rcu(inode, cred, &cache);
+       if (status != 0)
+               status = nfs_access_get_cached(inode, cred, &cache);
        if (status == 0)
                goto out_cached;
 
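+       /* A cache miss cannot be filled without an RPC; in nonblocking
+        * (RCU-walk) mode punt the access check back to the caller.
+        */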
+       status = -ECHILD;
+       if (mask & MAY_NOT_BLOCK)
+               goto out;
+
        /* Be clever: ask server to check for all possible rights */
        cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
        cache.cred = cred;
@@ -2321,9 +2436,6 @@ int nfs_permission(struct inode *inode, int mask)
        struct rpc_cred *cred;
        int res = 0;
 
-       if (mask & MAY_NOT_BLOCK)
-               return -ECHILD;
-
        nfs_inc_stats(inode, NFSIOS_VFSACCESS);
 
        if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -2350,12 +2462,23 @@ force_lookup:
        if (!NFS_PROTO(inode)->access)
                goto out_notsup;
 
-       cred = rpc_lookup_cred();
-       if (!IS_ERR(cred)) {
-               res = nfs_do_access(inode, cred, mask);
-               put_rpccred(cred);
-       } else
+       /* Always try fast lookups first */
+       rcu_read_lock();
+       cred = rpc_lookup_cred_nonblock();
+       if (!IS_ERR(cred))
+               res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
+       else
                res = PTR_ERR(cred);
+       rcu_read_unlock();
+       if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
+               /* Fast lookup failed, try the slow way */
+               cred = rpc_lookup_cred();
+               if (!IS_ERR(cred)) {
+                       res = nfs_do_access(inode, cred, mask);
+                       put_rpccred(cred);
+               } else
+                       res = PTR_ERR(cred);
+       }
 out:
        if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
                res = -EACCES;
@@ -2364,6 +2487,9 @@ out:
                inode->i_sb->s_id, inode->i_ino, mask, res);
        return res;
 out_notsup:
+       if (mask & MAY_NOT_BLOCK)
+               return -ECHILD;
+
        res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
        if (res == 0)
                res = generic_permission(inode, mask);
index f11b9eed0de109d057cd86ef42c577400698992c..65ef6e00deee428a601e5534e2d4db6489199bf9 100644 (file)
@@ -148,8 +148,8 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
 {
        struct nfs_writeverf *verfp;
 
-       verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
-                                     hdr->data->ds_idx);
+       verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
+                                     hdr->ds_idx);
        WARN_ON_ONCE(verfp->committed >= 0);
        memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
        WARN_ON_ONCE(verfp->committed < 0);
@@ -169,8 +169,8 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
 {
        struct nfs_writeverf *verfp;
 
-       verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
-                                        hdr->data->ds_idx);
+       verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
+                                        hdr->ds_idx);
        if (verfp->committed < 0) {
                nfs_direct_set_hdr_verf(dreq, hdr);
                return 0;
@@ -715,7 +715,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 {
        struct nfs_direct_req *dreq = hdr->dreq;
        struct nfs_commit_info cinfo;
-       int bit = -1;
+       bool request_commit = false;
        struct nfs_page *req = nfs_list_entry(hdr->pages.next);
 
        if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
@@ -729,27 +729,20 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
                dreq->flags = 0;
                dreq->error = hdr->error;
        }
-       if (dreq->error != 0)
-               bit = NFS_IOHDR_ERROR;
-       else {
+       if (dreq->error == 0) {
                dreq->count += hdr->good_bytes;
-               if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
-                       dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
-                       bit = NFS_IOHDR_NEED_RESCHED;
-               } else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+               if (nfs_write_need_commit(hdr)) {
                        if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
-                               bit = NFS_IOHDR_NEED_RESCHED;
+                               request_commit = true;
                        else if (dreq->flags == 0) {
                                nfs_direct_set_hdr_verf(dreq, hdr);
-                               bit = NFS_IOHDR_NEED_COMMIT;
+                               request_commit = true;
                                dreq->flags = NFS_ODIRECT_DO_COMMIT;
                        } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
-                               if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) {
+                               request_commit = true;
+                               if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
                                        dreq->flags =
                                                NFS_ODIRECT_RESCHED_WRITES;
-                                       bit = NFS_IOHDR_NEED_RESCHED;
-                               } else
-                                       bit = NFS_IOHDR_NEED_COMMIT;
                        }
                }
        }
@@ -759,9 +752,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 
                req = nfs_list_entry(hdr->pages.next);
                nfs_list_remove_request(req);
-               switch (bit) {
-               case NFS_IOHDR_NEED_RESCHED:
-               case NFS_IOHDR_NEED_COMMIT:
+               if (request_commit) {
                        kref_get(&req->wb_kref);
                        nfs_mark_request_commit(req, hdr->lseg, &cinfo);
                }
index d2eba1c13b7eeab12f5eb5e53d3acb878f4f6220..1359c4a27393a6723fc3b22c244dd6422da4a9f3 100644 (file)
@@ -84,45 +84,37 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
        BUG();
 }
 
-static void filelayout_reset_write(struct nfs_pgio_data *data)
+static void filelayout_reset_write(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-       struct rpc_task *task = &data->task;
+       struct rpc_task *task = &hdr->task;
 
        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                dprintk("%s Reset task %5u for i/o through MDS "
                        "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
-                       data->task.tk_pid,
+                       hdr->task.tk_pid,
                        hdr->inode->i_sb->s_id,
                        (unsigned long long)NFS_FILEID(hdr->inode),
-                       data->args.count,
-                       (unsigned long long)data->args.offset);
+                       hdr->args.count,
+                       (unsigned long long)hdr->args.offset);
 
-               task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
-                                                       &hdr->pages,
-                                                       hdr->completion_ops,
-                                                       hdr->dreq);
+               task->tk_status = pnfs_write_done_resend_to_mds(hdr);
        }
 }
 
-static void filelayout_reset_read(struct nfs_pgio_data *data)
+static void filelayout_reset_read(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-       struct rpc_task *task = &data->task;
+       struct rpc_task *task = &hdr->task;
 
        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                dprintk("%s Reset task %5u for i/o through MDS "
                        "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
-                       data->task.tk_pid,
+                       hdr->task.tk_pid,
                        hdr->inode->i_sb->s_id,
                        (unsigned long long)NFS_FILEID(hdr->inode),
-                       data->args.count,
-                       (unsigned long long)data->args.offset);
+                       hdr->args.count,
+                       (unsigned long long)hdr->args.offset);
 
-               task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
-                                                       &hdr->pages,
-                                                       hdr->completion_ops,
-                                                       hdr->dreq);
+               task->tk_status = pnfs_read_done_resend_to_mds(hdr);
        }
 }
 
@@ -243,18 +235,17 @@ wait_on_recovery:
 /* NFS_PROTO call done callback routines */
 
 static int filelayout_read_done_cb(struct rpc_task *task,
-                               struct nfs_pgio_data *data)
+                               struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
        int err;
 
-       trace_nfs4_pnfs_read(data, task->tk_status);
-       err = filelayout_async_handle_error(task, data->args.context->state,
-                                           data->ds_clp, hdr->lseg);
+       trace_nfs4_pnfs_read(hdr, task->tk_status);
+       err = filelayout_async_handle_error(task, hdr->args.context->state,
+                                           hdr->ds_clp, hdr->lseg);
 
        switch (err) {
        case -NFS4ERR_RESET_TO_MDS:
-               filelayout_reset_read(data);
+               filelayout_reset_read(hdr);
                return task->tk_status;
        case -EAGAIN:
                rpc_restart_call_prepare(task);
@@ -270,15 +261,14 @@ static int filelayout_read_done_cb(struct rpc_task *task,
  * rfc5661 is not clear about which credential should be used.
  */
 static void
-filelayout_set_layoutcommit(struct nfs_pgio_data *wdata)
+filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = wdata->header;
 
        if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
-           wdata->res.verf->committed == NFS_FILE_SYNC)
+           hdr->res.verf->committed == NFS_FILE_SYNC)
                return;
 
-       pnfs_set_layoutcommit(wdata);
+       pnfs_set_layoutcommit(hdr);
        dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
                (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
 }
@@ -305,83 +295,82 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
  */
 static void filelayout_read_prepare(struct rpc_task *task, void *data)
 {
-       struct nfs_pgio_data *rdata = data;
+       struct nfs_pgio_header *hdr = data;
 
-       if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) {
+       if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return;
        }
-       if (filelayout_reset_to_mds(rdata->header->lseg)) {
+       if (filelayout_reset_to_mds(hdr->lseg)) {
                dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
-               filelayout_reset_read(rdata);
+               filelayout_reset_read(hdr);
                rpc_exit(task, 0);
                return;
        }
-       rdata->pgio_done_cb = filelayout_read_done_cb;
+       hdr->pgio_done_cb = filelayout_read_done_cb;
 
-       if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
-                       &rdata->args.seq_args,
-                       &rdata->res.seq_res,
+       if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
+                       &hdr->args.seq_args,
+                       &hdr->res.seq_res,
                        task))
                return;
-       if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context,
-                       rdata->args.lock_context, FMODE_READ) == -EIO)
+       if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+                       hdr->args.lock_context, FMODE_READ) == -EIO)
                rpc_exit(task, -EIO); /* lost lock, terminate I/O */
 }
 
 static void filelayout_read_call_done(struct rpc_task *task, void *data)
 {
-       struct nfs_pgio_data *rdata = data;
+       struct nfs_pgio_header *hdr = data;
 
        dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
-       if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) &&
+       if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
-               nfs41_sequence_done(task, &rdata->res.seq_res);
+               nfs41_sequence_done(task, &hdr->res.seq_res);
                return;
        }
 
        /* Note this may cause RPC to be resent */
-       rdata->header->mds_ops->rpc_call_done(task, data);
+       hdr->mds_ops->rpc_call_done(task, data);
 }
 
 static void filelayout_read_count_stats(struct rpc_task *task, void *data)
 {
-       struct nfs_pgio_data *rdata = data;
+       struct nfs_pgio_header *hdr = data;
 
-       rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics);
+       rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
 }
 
 static void filelayout_read_release(void *data)
 {
-       struct nfs_pgio_data *rdata = data;
-       struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout;
+       struct nfs_pgio_header *hdr = data;
+       struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
 
        filelayout_fenceme(lo->plh_inode, lo);
-       nfs_put_client(rdata->ds_clp);
-       rdata->header->mds_ops->rpc_release(data);
+       nfs_put_client(hdr->ds_clp);
+       hdr->mds_ops->rpc_release(data);
 }
 
 static int filelayout_write_done_cb(struct rpc_task *task,
-                               struct nfs_pgio_data *data)
+                               struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
        int err;
 
-       trace_nfs4_pnfs_write(data, task->tk_status);
-       err = filelayout_async_handle_error(task, data->args.context->state,
-                                           data->ds_clp, hdr->lseg);
+       trace_nfs4_pnfs_write(hdr, task->tk_status);
+       err = filelayout_async_handle_error(task, hdr->args.context->state,
+                                           hdr->ds_clp, hdr->lseg);
 
        switch (err) {
        case -NFS4ERR_RESET_TO_MDS:
-               filelayout_reset_write(data);
+               filelayout_reset_write(hdr);
                return task->tk_status;
        case -EAGAIN:
                rpc_restart_call_prepare(task);
                return -EAGAIN;
        }
 
-       filelayout_set_layoutcommit(data);
+       filelayout_set_layoutcommit(hdr);
        return 0;
 }
 
@@ -419,57 +408,57 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 
 static void filelayout_write_prepare(struct rpc_task *task, void *data)
 {
-       struct nfs_pgio_data *wdata = data;
+       struct nfs_pgio_header *hdr = data;
 
-       if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) {
+       if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return;
        }
-       if (filelayout_reset_to_mds(wdata->header->lseg)) {
+       if (filelayout_reset_to_mds(hdr->lseg)) {
                dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
-               filelayout_reset_write(wdata);
+               filelayout_reset_write(hdr);
                rpc_exit(task, 0);
                return;
        }
-       if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
-                       &wdata->args.seq_args,
-                       &wdata->res.seq_res,
+       if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
+                       &hdr->args.seq_args,
+                       &hdr->res.seq_res,
                        task))
                return;
-       if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context,
-                       wdata->args.lock_context, FMODE_WRITE) == -EIO)
+       if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+                       hdr->args.lock_context, FMODE_WRITE) == -EIO)
                rpc_exit(task, -EIO); /* lost lock, terminate I/O */
 }
 
 static void filelayout_write_call_done(struct rpc_task *task, void *data)
 {
-       struct nfs_pgio_data *wdata = data;
+       struct nfs_pgio_header *hdr = data;
 
-       if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) &&
+       if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
-               nfs41_sequence_done(task, &wdata->res.seq_res);
+               nfs41_sequence_done(task, &hdr->res.seq_res);
                return;
        }
 
        /* Note this may cause RPC to be resent */
-       wdata->header->mds_ops->rpc_call_done(task, data);
+       hdr->mds_ops->rpc_call_done(task, data);
 }
 
 static void filelayout_write_count_stats(struct rpc_task *task, void *data)
 {
-       struct nfs_pgio_data *wdata = data;
+       struct nfs_pgio_header *hdr = data;
 
-       rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics);
+       rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
 }
 
 static void filelayout_write_release(void *data)
 {
-       struct nfs_pgio_data *wdata = data;
-       struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout;
+       struct nfs_pgio_header *hdr = data;
+       struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
 
        filelayout_fenceme(lo->plh_inode, lo);
-       nfs_put_client(wdata->ds_clp);
-       wdata->header->mds_ops->rpc_release(data);
+       nfs_put_client(hdr->ds_clp);
+       hdr->mds_ops->rpc_release(data);
 }
 
 static void filelayout_commit_prepare(struct rpc_task *task, void *data)
@@ -529,19 +518,18 @@ static const struct rpc_call_ops filelayout_commit_call_ops = {
 };
 
 static enum pnfs_try_status
-filelayout_read_pagelist(struct nfs_pgio_data *data)
+filelayout_read_pagelist(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
        struct pnfs_layout_segment *lseg = hdr->lseg;
        struct nfs4_pnfs_ds *ds;
        struct rpc_clnt *ds_clnt;
-       loff_t offset = data->args.offset;
+       loff_t offset = hdr->args.offset;
        u32 j, idx;
        struct nfs_fh *fh;
 
        dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
                __func__, hdr->inode->i_ino,
-               data->args.pgbase, (size_t)data->args.count, offset);
+               hdr->args.pgbase, (size_t)hdr->args.count, offset);
 
        /* Retrieve the correct rpc_client for the byte range */
        j = nfs4_fl_calc_j_index(lseg, offset);
@@ -559,30 +547,29 @@ filelayout_read_pagelist(struct nfs_pgio_data *data)
 
        /* No multipath support. Use first DS */
        atomic_inc(&ds->ds_clp->cl_count);
-       data->ds_clp = ds->ds_clp;
-       data->ds_idx = idx;
+       hdr->ds_clp = ds->ds_clp;
+       hdr->ds_idx = idx;
        fh = nfs4_fl_select_ds_fh(lseg, j);
        if (fh)
-               data->args.fh = fh;
+               hdr->args.fh = fh;
 
-       data->args.offset = filelayout_get_dserver_offset(lseg, offset);
-       data->mds_offset = offset;
+       hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
+       hdr->mds_offset = offset;
 
        /* Perform an asynchronous read to ds */
-       nfs_initiate_pgio(ds_clnt, data,
+       nfs_initiate_pgio(ds_clnt, hdr,
                            &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN);
        return PNFS_ATTEMPTED;
 }
 
 /* Perform async writes. */
 static enum pnfs_try_status
-filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
+filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
 {
-       struct nfs_pgio_header *hdr = data->header;
        struct pnfs_layout_segment *lseg = hdr->lseg;
        struct nfs4_pnfs_ds *ds;
        struct rpc_clnt *ds_clnt;
-       loff_t offset = data->args.offset;
+       loff_t offset = hdr->args.offset;
        u32 j, idx;
        struct nfs_fh *fh;
 
@@ -598,21 +585,20 @@ filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
                return PNFS_NOT_ATTEMPTED;
 
        dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n",
-               __func__, hdr->inode->i_ino, sync, (size_t) data->args.count,
+               __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
                offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
 
-       data->pgio_done_cb = filelayout_write_done_cb;
+       hdr->pgio_done_cb = filelayout_write_done_cb;
        atomic_inc(&ds->ds_clp->cl_count);
-       data->ds_clp = ds->ds_clp;
-       data->ds_idx = idx;
+       hdr->ds_clp = ds->ds_clp;
+       hdr->ds_idx = idx;
        fh = nfs4_fl_select_ds_fh(lseg, j);
        if (fh)
-               data->args.fh = fh;
-
-       data->args.offset = filelayout_get_dserver_offset(lseg, offset);
+               hdr->args.fh = fh;
+       hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
 
        /* Perform an asynchronous write */
-       nfs_initiate_pgio(ds_clnt, data,
+       nfs_initiate_pgio(ds_clnt, hdr,
                                    &filelayout_write_call_ops, sync,
                                    RPC_TASK_SOFTCONN);
        return PNFS_ATTEMPTED;
@@ -1023,6 +1009,7 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
 
 /* The generic layer is about to remove the req from the commit list.
  * If this will make the bucket empty, it will need to put the lseg reference.
+ * Note this must be called while holding the inode (/cinfo) lock.
  */
 static void
 filelayout_clear_request_commit(struct nfs_page *req,
@@ -1030,7 +1017,6 @@ filelayout_clear_request_commit(struct nfs_page *req,
 {
        struct pnfs_layout_segment *freeme = NULL;
 
-       spin_lock(cinfo->lock);
        if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
                goto out;
        cinfo->ds->nwritten--;
@@ -1045,22 +1031,25 @@ filelayout_clear_request_commit(struct nfs_page *req,
        }
 out:
        nfs_request_remove_commit_list(req, cinfo);
-       spin_unlock(cinfo->lock);
-       pnfs_put_lseg(freeme);
+       pnfs_put_lseg_async(freeme);
 }
 
-static struct list_head *
-filelayout_choose_commit_list(struct nfs_page *req,
-                             struct pnfs_layout_segment *lseg,
-                             struct nfs_commit_info *cinfo)
+static void
+filelayout_mark_request_commit(struct nfs_page *req,
+                              struct pnfs_layout_segment *lseg,
+                              struct nfs_commit_info *cinfo)
 {
        struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
        u32 i, j;
        struct list_head *list;
        struct pnfs_commit_bucket *buckets;
 
-       if (fl->commit_through_mds)
-               return &cinfo->mds->list;
+       if (fl->commit_through_mds) {
+               list = &cinfo->mds->list;
+               spin_lock(cinfo->lock);
+               goto mds_commit;
+       }
 
        /* Note that we are calling nfs4_fl_calc_j_index on each page
         * that ends up being committed to a data server.  An attractive
@@ -1084,19 +1073,22 @@ filelayout_choose_commit_list(struct nfs_page *req,
        }
        set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
        cinfo->ds->nwritten++;
-       spin_unlock(cinfo->lock);
-       return list;
-}
 
-static void
-filelayout_mark_request_commit(struct nfs_page *req,
-                              struct pnfs_layout_segment *lseg,
-                              struct nfs_commit_info *cinfo)
-{
-       struct list_head *list;
-
-       list = filelayout_choose_commit_list(req, lseg, cinfo);
-       nfs_request_add_commit_list(req, list, cinfo);
+mds_commit:
+       /* Open-coded nfs_request_add_commit_list(): we need to add req to
+        * the list without dropping the cinfo lock.
+        */
+       set_bit(PG_CLEAN, &(req)->wb_flags);
+       nfs_list_add_request(req, list);
+       cinfo->mds->ncommit++;
+       spin_unlock(cinfo->lock);
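+       /* For buffered writes, account the page as unstable and mark the
+        * inode dirty so that a COMMIT is eventually sent.
+        */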
+       if (!cinfo->dreq) {
+               inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+               inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+                            BDI_RECLAIMABLE);
+               __mark_inode_dirty(req->wb_context->dentry->d_inode,
+                                  I_DIRTY_DATASYNC);
+       }
 }
 
 static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
@@ -1244,15 +1236,63 @@ restart:
        spin_unlock(cinfo->lock);
 }
 
+/* filelayout_search_commit_reqs - Search lists in @cinfo for the head request
+ *                                for @page
+ * @cinfo - commit info for current inode
+ * @page - page to search for matching head request
+ *
+ * Returns the head request if one is found, otherwise returns NULL.
+ */
+static struct nfs_page *
+filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
+{
+       struct nfs_page *freq, *t;
+       struct pnfs_commit_bucket *b;
+       int i;
+
+       /* Linearly search the commit lists for each bucket until a matching
+        * request is found */
+       for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
+               list_for_each_entry_safe(freq, t, &b->written, wb_list) {
+                       if (freq->wb_page == page)
+                               return freq->wb_head;
+               }
+               list_for_each_entry_safe(freq, t, &b->committing, wb_list) {
+                       if (freq->wb_page == page)
+                               return freq->wb_head;
+               }
+       }
+
+       return NULL;
+}
+
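+/* Hand the requests in each remaining committing bucket (from @idx on)
+ * back for another commit attempt and drop the lseg reference that the
+ * bucket was holding.
+ */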
+static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
+{
+       struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+       struct pnfs_commit_bucket *bucket = fl_cinfo->buckets;
+       struct pnfs_layout_segment *freeme;
+       int i;
+
+       for (i = idx; i < fl_cinfo->nbuckets; i++, bucket++) {
+               if (list_empty(&bucket->committing))
+                       continue;
+               nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
+               spin_lock(cinfo->lock);
+               freeme = bucket->clseg;
+               bucket->clseg = NULL;
+               spin_unlock(cinfo->lock);
+               pnfs_put_lseg(freeme);
+       }
+}
+
 static unsigned int
 alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
 {
        struct pnfs_ds_commit_info *fl_cinfo;
        struct pnfs_commit_bucket *bucket;
        struct nfs_commit_data *data;
-       int i, j;
+       int i;
        unsigned int nreq = 0;
-       struct pnfs_layout_segment *freeme;
 
        fl_cinfo = cinfo->ds;
        bucket = fl_cinfo->buckets;
@@ -1272,16 +1312,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
        }
 
        /* Clean up on error */
-       for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) {
-               if (list_empty(&bucket->committing))
-                       continue;
-               nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
-               spin_lock(cinfo->lock);
-               freeme = bucket->clseg;
-               bucket->clseg = NULL;
-               spin_unlock(cinfo->lock);
-               pnfs_put_lseg(freeme);
-       }
+       filelayout_retry_commit(cinfo, i);
        /* Caller will clean up entries put on list */
        return nreq;
 }
@@ -1301,8 +1332,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
                        data->lseg = NULL;
                        list_add(&data->pages, &list);
                        nreq++;
-               } else
+               } else {
                        nfs_retry_commit(mds_pages, NULL, cinfo);
+                       filelayout_retry_commit(cinfo, 0);
+                       cinfo->completion_ops->error_cleanup(NFS_I(inode));
+                       return -ENOMEM;
+               }
        }
 
        nreq += alloc_ds_commits(cinfo, &list);
@@ -1380,6 +1415,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
        .clear_request_commit   = filelayout_clear_request_commit,
        .scan_commit_lists      = filelayout_scan_commit_lists,
        .recover_commit_reqs    = filelayout_recover_commit_reqs,
+       .search_commit_reqs     = filelayout_search_commit_reqs,
        .commit_pagelist        = filelayout_commit_pagelist,
        .read_pagelist          = filelayout_read_pagelist,
        .write_pagelist         = filelayout_write_pagelist,
index e2a0361e24c680165a93597ca3226b0fde5fd373..8540516f4d719bff7d80c98068ee77a634485ec9 100644 (file)
@@ -695,7 +695,7 @@ filelayout_get_device_info(struct inode *inode,
        if (pdev == NULL)
                return NULL;
 
-       pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags);
+       pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
        if (pages == NULL) {
                kfree(pdev);
                return NULL;
index b94f80420a584e596789fd048489463728a58789..880618a8b0489da1580e7cea0058de74ae2ceece 100644 (file)
@@ -112,7 +112,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
         * if the dentry tree reaches them; however if the dentry already
         * exists, we'll pick it up at this point and use it as the root
         */
-       ret = d_obtain_alias(inode);
+       ret = d_obtain_root(inode);
        if (IS_ERR(ret)) {
                dprintk("nfs_get_root: get root dentry failed\n");
                goto out;
index 68921b01b792634be15cb8c213319b0ae1ecd02d..577a36f0a510b27cefe15429523750b6d21ca45a 100644 (file)
@@ -1002,6 +1002,15 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 }
 EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
 
+int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode)
+{
+       if (!(NFS_I(inode)->cache_validity &
+                       (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
+                       && !nfs_attribute_cache_expired(inode))
+               return NFS_STALE(inode) ? -ESTALE : 0;
+       return -ECHILD;
+}
+
 static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
index e2a45ae5014e0d4dc21c4bed16d1602f02fd88be..9056622d223005c087caac590ed8139743665277 100644 (file)
@@ -247,11 +247,11 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
 int nfs_iocounter_wait(struct nfs_io_counter *c);
 
 extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
-struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *);
-void nfs_rw_header_free(struct nfs_pgio_header *);
-void nfs_pgio_data_release(struct nfs_pgio_data *);
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
+void nfs_pgio_header_free(struct nfs_pgio_header *);
+void nfs_pgio_data_destroy(struct nfs_pgio_header *);
 int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
-int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *,
+int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *,
                      const struct rpc_call_ops *, int, int);
 void nfs_free_request(struct nfs_page *req);
 
@@ -451,6 +451,7 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst,
 void nfs_mark_request_commit(struct nfs_page *req,
                             struct pnfs_layout_segment *lseg,
                             struct nfs_commit_info *cinfo);
+int nfs_write_need_commit(struct nfs_pgio_header *);
 int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
                            int how, struct nfs_commit_info *cinfo);
 void nfs_retry_commit(struct list_head *page_list,
@@ -491,7 +492,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
 extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
 
 /* nfs4proc.c */
-extern void __nfs4_read_done_cb(struct nfs_pgio_data *);
+extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
 extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
                            const struct rpc_timeout *timeparms,
                            const char *ip_addr);
index 8f854dde4150e1f3dc2ace238d2ddda44e580d63..d0fec260132add4ce0d8917cd1bb47c15408be90 100644 (file)
@@ -256,7 +256,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
        char *p = data + *result;
 
        acl = get_acl(inode, type);
-       if (!acl)
+       if (IS_ERR_OR_NULL(acl))
                return 0;
 
        posix_acl_release(acl);
index f0afa291fd5883278783f846e6b2770ef69232d8..809670eba52a7b2111c159f2a42bd2e1fd86b213 100644 (file)
@@ -795,41 +795,44 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
        return status;
 }
 
-static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-       struct inode *inode = data->header->inode;
+       struct inode *inode = hdr->inode;
 
        if (nfs3_async_handle_jukebox(task, inode))
                return -EAGAIN;
 
        nfs_invalidate_atime(inode);
-       nfs_refresh_inode(inode, &data->fattr);
+       nfs_refresh_inode(inode, &hdr->fattr);
        return 0;
 }
 
-static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr,
+                                struct rpc_message *msg)
 {
        msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
 }
 
-static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task,
+                                     struct nfs_pgio_header *hdr)
 {
        rpc_call_start(task);
        return 0;
 }
 
-static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-       struct inode *inode = data->header->inode;
+       struct inode *inode = hdr->inode;
 
        if (nfs3_async_handle_jukebox(task, inode))
                return -EAGAIN;
        if (task->tk_status >= 0)
-               nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+               nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
        return 0;
 }
 
-static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs3_proc_write_setup(struct nfs_pgio_header *hdr,
+                                 struct rpc_message *msg)
 {
        msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
 }
index ba2affa51941bc5ff304fce2288fe50b9c84b1e5..92193eddb41dc315868af5f437f083a3a4c0302a 100644 (file)
@@ -54,7 +54,7 @@ struct nfs4_minor_version_ops {
                        const nfs4_stateid *);
        int     (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
                        struct nfs_fsinfo *);
-       int     (*free_lock_state)(struct nfs_server *,
+       void    (*free_lock_state)(struct nfs_server *,
                        struct nfs4_lock_state *);
        const struct rpc_call_ops *call_sync_ops;
        const struct nfs4_state_recovery_ops *reboot_recovery_ops;
@@ -129,27 +129,17 @@ enum {
  * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
  */
 
-struct nfs4_lock_owner {
-       unsigned int lo_type;
-#define NFS4_ANY_LOCK_TYPE     (0U)
-#define NFS4_FLOCK_LOCK_TYPE   (1U << 0)
-#define NFS4_POSIX_LOCK_TYPE   (1U << 1)
-       union {
-               fl_owner_t posix_owner;
-               pid_t flock_owner;
-       } lo_u;
-};
-
 struct nfs4_lock_state {
-       struct list_head        ls_locks;       /* Other lock stateids */
-       struct nfs4_state *     ls_state;       /* Pointer to open state */
+       struct list_head                ls_locks;   /* Other lock stateids */
+       struct nfs4_state *             ls_state;   /* Pointer to open state */
 #define NFS_LOCK_INITIALIZED 0
 #define NFS_LOCK_LOST        1
-       unsigned long           ls_flags;
+       unsigned long                   ls_flags;
        struct nfs_seqid_counter        ls_seqid;
-       nfs4_stateid            ls_stateid;
-       atomic_t                ls_count;
-       struct nfs4_lock_owner  ls_owner;
+       nfs4_stateid                    ls_stateid;
+       atomic_t                        ls_count;
+       fl_owner_t                      ls_owner;
+       struct work_struct              ls_release;
 };
 
 /* bits for nfs4_state->flags */
@@ -337,11 +327,11 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
  */
 static inline void
 nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
-                        struct rpc_message *msg, struct nfs_pgio_data *wdata)
+                        struct rpc_message *msg, struct nfs_pgio_header *hdr)
 {
        if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) &&
            !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags))
-               wdata->args.stable = NFS_FILE_SYNC;
+               hdr->args.stable = NFS_FILE_SYNC;
 }
 #else /* CONFIG_NFS_v4_1 */
 static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -369,7 +359,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,
 
 static inline void
 nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
-                        struct rpc_message *msg, struct nfs_pgio_data *wdata)
+                        struct rpc_message *msg, struct nfs_pgio_header *hdr)
 {
 }
 #endif /* CONFIG_NFS_V4_1 */
index aa9ef4876046aa17cc43c201d33ba6dbe49f1b8f..53e435a952602aa5037cc8a9605b399f646edc81 100644 (file)
@@ -855,6 +855,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
        };
        struct rpc_timeout ds_timeout;
        struct nfs_client *clp;
+       char buf[INET6_ADDRSTRLEN + 1];
+
+       if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+               return ERR_PTR(-EINVAL);
+       cl_init.hostname = buf;
 
        /*
         * Set an authflavor equal to the MDS value. Use the MDS nfs_client
index 4bf3d97cc5a094da598789ad58f05d92e10c3929..75ae8d22f067d55b7edfe77bbb44d2b0067880f8 100644 (file)
@@ -1952,6 +1952,14 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
        return status;
 }
 
+/*
+ * Additional permission checks in order to distinguish between an
+ * open for read and an open for execute. This works around the
+ * fact that NFSv4 OPEN treats read and execute permissions as being
+ * the same.
+ * Note that in the non-execute case, we want to turn off permission
+ * checking if we just created a new file (POSIX open() semantics).
+ */
 static int nfs4_opendata_access(struct rpc_cred *cred,
                                struct nfs4_opendata *opendata,
                                struct nfs4_state *state, fmode_t fmode,
@@ -1966,14 +1974,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred,
                return 0;
 
        mask = 0;
-       /* don't check MAY_WRITE - a newly created file may not have
-        * write mode bits, but POSIX allows the creating process to write.
-        * use openflags to check for exec, because fmode won't
-        * always have FMODE_EXEC set when file open for exec. */
+       /*
+        * Use openflags to check for exec, because fmode won't
+        * always have FMODE_EXEC set when a file is opened for exec.
+        */
        if (openflags & __FMODE_EXEC) {
                /* ONLY check for exec rights */
                mask = MAY_EXEC;
-       } else if (fmode & FMODE_READ)
+       } else if ((fmode & FMODE_READ) && !opendata->file_created)
                mask = MAY_READ;
 
        cache.cred = cred;
@@ -2216,8 +2224,15 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
        seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
 
        ret = _nfs4_proc_open(opendata);
-       if (ret != 0)
+       if (ret != 0) {
+               if (ret == -ENOENT) {
+                       d_drop(opendata->dentry);
+                       d_add(opendata->dentry, NULL);
+                       nfs_set_verifier(opendata->dentry,
+                                        nfs_save_change_attribute(opendata->dir->d_inode));
+               }
                goto out;
+       }
 
        state = nfs4_opendata_to_nfs4_state(opendata);
        ret = PTR_ERR(state);
@@ -2647,6 +2662,48 @@ static const struct rpc_call_ops nfs4_close_ops = {
        .rpc_release = nfs4_free_closedata,
 };
 
+static bool nfs4_state_has_opener(struct nfs4_state *state)
+{
+       /* first check existing openers */
+       if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 &&
+           state->n_rdonly != 0)
+               return true;
+
+       if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 &&
+           state->n_wronly != 0)
+               return true;
+
+       if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 &&
+           state->n_rdwr != 0)
+               return true;
+
+       return false;
+}
+
+static bool nfs4_roc(struct inode *inode)
+{
+       struct nfs_inode *nfsi = NFS_I(inode);
+       struct nfs_open_context *ctx;
+       struct nfs4_state *state;
+
+       spin_lock(&inode->i_lock);
+       list_for_each_entry(ctx, &nfsi->open_files, list) {
+               state = ctx->state;
+               if (state == NULL)
+                       continue;
+               if (nfs4_state_has_opener(state)) {
+                       spin_unlock(&inode->i_lock);
+                       return false;
+               }
+       }
+       spin_unlock(&inode->i_lock);
+
+       if (nfs4_check_delegation(inode, FMODE_READ))
+               return false;
+
+       return pnfs_roc(inode);
+}
+
 /* 
  * It is possible for data to be read/written from a mem-mapped file 
  * after the sys_close call (which hits the vfs layer as a flush).
@@ -2697,7 +2754,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
        calldata->res.fattr = &calldata->fattr;
        calldata->res.seqid = calldata->arg.seqid;
        calldata->res.server = server;
-       calldata->roc = pnfs_roc(state->inode);
+       calldata->roc = nfs4_roc(state->inode);
        nfs_sb_active(calldata->inode->i_sb);
 
        msg.rpc_argp = &calldata->arg;
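
The new nfs4_roc() gates pNFS return-on-close: while any open context still has active openers, or a read delegation is outstanding, returning the layout would be premature. A rough userspace miniature of that guard (types and field names invented):

    #include <pthread.h>
    #include <stdbool.h>

    struct open_ctx {
            int n_users;                    /* rdonly + wronly + rdwr openers */
            struct open_ctx *next;
    };

    struct mini_inode {
            pthread_mutex_t lock;           /* plays the role of i_lock */
            struct open_ctx *open_files;
            bool has_read_delegation;
    };

    /* Only worth returning the layout once nothing can still use it. */
    static bool can_return_on_close(struct mini_inode *ino)
    {
            bool busy = false;

            pthread_mutex_lock(&ino->lock);
            for (struct open_ctx *c = ino->open_files; c && !busy; c = c->next)
                    busy = c->n_users != 0;
            pthread_mutex_unlock(&ino->lock);

            return !busy && !ino->has_read_delegation;
    }
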
@@ -4033,24 +4090,25 @@ static bool nfs4_error_stateid_expired(int err)
        return false;
 }
 
-void __nfs4_read_done_cb(struct nfs_pgio_data *data)
+void __nfs4_read_done_cb(struct nfs_pgio_header *hdr)
 {
-       nfs_invalidate_atime(data->header->inode);
+       nfs_invalidate_atime(hdr->inode);
 }
 
-static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-       struct nfs_server *server = NFS_SERVER(data->header->inode);
+       struct nfs_server *server = NFS_SERVER(hdr->inode);
 
-       trace_nfs4_read(data, task->tk_status);
-       if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
+       trace_nfs4_read(hdr, task->tk_status);
+       if (nfs4_async_handle_error(task, server,
+                                   hdr->args.context->state) == -EAGAIN) {
                rpc_restart_call_prepare(task);
                return -EAGAIN;
        }
 
-       __nfs4_read_done_cb(data);
+       __nfs4_read_done_cb(hdr);
        if (task->tk_status > 0)
-               renew_lease(server, data->timestamp);
+               renew_lease(server, hdr->timestamp);
        return 0;
 }
 
@@ -4068,54 +4126,59 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task,
        return true;
 }
 
-static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
 
        dprintk("--> %s\n", __func__);
 
-       if (!nfs4_sequence_done(task, &data->res.seq_res))
+       if (!nfs4_sequence_done(task, &hdr->res.seq_res))
                return -EAGAIN;
-       if (nfs4_read_stateid_changed(task, &data->args))
+       if (nfs4_read_stateid_changed(task, &hdr->args))
                return -EAGAIN;
-       return data->pgio_done_cb ? data->pgio_done_cb(task, data) :
-                                   nfs4_read_done_cb(task, data);
+       return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
+                                   nfs4_read_done_cb(task, hdr);
 }
 
-static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
+                                struct rpc_message *msg)
 {
-       data->timestamp   = jiffies;
-       data->pgio_done_cb = nfs4_read_done_cb;
+       hdr->timestamp   = jiffies;
+       hdr->pgio_done_cb = nfs4_read_done_cb;
        msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
-       nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+       nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
 }
 
-static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task,
+                                     struct nfs_pgio_header *hdr)
 {
-       if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
-                       &data->args.seq_args,
-                       &data->res.seq_res,
+       if (nfs4_setup_sequence(NFS_SERVER(hdr->inode),
+                       &hdr->args.seq_args,
+                       &hdr->res.seq_res,
                        task))
                return 0;
-       if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
-                               data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO)
+       if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+                               hdr->args.lock_context,
+                               hdr->rw_ops->rw_mode) == -EIO)
                return -EIO;
-       if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
+       if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags)))
                return -EIO;
        return 0;
 }
 
-static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_write_done_cb(struct rpc_task *task,
+                             struct nfs_pgio_header *hdr)
 {
-       struct inode *inode = data->header->inode;
+       struct inode *inode = hdr->inode;
        
-       trace_nfs4_write(data, task->tk_status);
-       if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
+       trace_nfs4_write(hdr, task->tk_status);
+       if (nfs4_async_handle_error(task, NFS_SERVER(inode),
+                                   hdr->args.context->state) == -EAGAIN) {
                rpc_restart_call_prepare(task);
                return -EAGAIN;
        }
        if (task->tk_status >= 0) {
-               renew_lease(NFS_SERVER(inode), data->timestamp);
-               nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
+               renew_lease(NFS_SERVER(inode), hdr->timestamp);
+               nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr);
        }
        return 0;
 }
@@ -4134,23 +4197,21 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task,
        return true;
 }
 
-static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-       if (!nfs4_sequence_done(task, &data->res.seq_res))
+       if (!nfs4_sequence_done(task, &hdr->res.seq_res))
                return -EAGAIN;
-       if (nfs4_write_stateid_changed(task, &data->args))
+       if (nfs4_write_stateid_changed(task, &hdr->args))
                return -EAGAIN;
-       return data->pgio_done_cb ? data->pgio_done_cb(task, data) :
-               nfs4_write_done_cb(task, data);
+       return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
+               nfs4_write_done_cb(task, hdr);
 }
 
 static
-bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)
+bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
 {
-       const struct nfs_pgio_header *hdr = data->header;
-
        /* Don't request attributes for pNFS or O_DIRECT writes */
-       if (data->ds_clp != NULL || hdr->dreq != NULL)
+       if (hdr->ds_clp != NULL || hdr->dreq != NULL)
                return false;
        /* Otherwise, request attributes if and only if we don't hold
         * a delegation
@@ -4158,23 +4219,24 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)
        return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
 }
 
-static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
+                                 struct rpc_message *msg)
 {
-       struct nfs_server *server = NFS_SERVER(data->header->inode);
+       struct nfs_server *server = NFS_SERVER(hdr->inode);
 
-       if (!nfs4_write_need_cache_consistency_data(data)) {
-               data->args.bitmask = NULL;
-               data->res.fattr = NULL;
+       if (!nfs4_write_need_cache_consistency_data(hdr)) {
+               hdr->args.bitmask = NULL;
+               hdr->res.fattr = NULL;
        } else
-               data->args.bitmask = server->cache_consistency_bitmask;
+               hdr->args.bitmask = server->cache_consistency_bitmask;
 
-       if (!data->pgio_done_cb)
-               data->pgio_done_cb = nfs4_write_done_cb;
-       data->res.server = server;
-       data->timestamp   = jiffies;
+       if (!hdr->pgio_done_cb)
+               hdr->pgio_done_cb = nfs4_write_done_cb;
+       hdr->res.server = server;
+       hdr->timestamp   = jiffies;
 
        msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
-       nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+       nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1);
 }
 
 static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
@@ -4881,6 +4943,18 @@ nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
                return scnprintf(buf, len, "tcp");
 }
 
+static void nfs4_setclientid_done(struct rpc_task *task, void *calldata)
+{
+       struct nfs4_setclientid *sc = calldata;
+
+       if (task->tk_status == 0)
+               sc->sc_cred = get_rpccred(task->tk_rqstp->rq_cred);
+}
+
+static const struct rpc_call_ops nfs4_setclientid_ops = {
+       .rpc_call_done = nfs4_setclientid_done,
+};
+
 /**
  * nfs4_proc_setclientid - Negotiate client ID
  * @clp: state data structure
@@ -4907,6 +4981,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
                .rpc_resp = res,
                .rpc_cred = cred,
        };
+       struct rpc_task *task;
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = clp->cl_rpcclient,
+               .rpc_message = &msg,
+               .callback_ops = &nfs4_setclientid_ops,
+               .callback_data = &setclientid,
+               .flags = RPC_TASK_TIMEOUT,
+       };
        int status;
 
        /* nfs_client_id4 */
@@ -4933,7 +5015,18 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
        dprintk("NFS call  setclientid auth=%s, '%.*s'\n",
                clp->cl_rpcclient->cl_auth->au_ops->au_name,
                setclientid.sc_name_len, setclientid.sc_name);
-       status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+       task = rpc_run_task(&task_setup_data);
+       if (IS_ERR(task)) {
+               status = PTR_ERR(task);
+               goto out;
+       }
+       status = task->tk_status;
+       if (setclientid.sc_cred) {
+               clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
+               put_rpccred(setclientid.sc_cred);
+       }
+       rpc_put_task(task);
+out:
        trace_nfs4_setclientid(clp, status);
        dprintk("NFS reply setclientid: %d\n", status);
        return status;
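
A note on the rpc_run_task() conversion above: the credential that identifies the GSS acceptor lives only on the rpc_task, which rpc_call_sync() frees before returning, so a rpc_call_done callback takes a reference first and the caller then stringifies it into cl_acceptor. Reduced to a hedged userspace shape of "harvest transient task state in the completion callback" (every name below is invented):

    #include <stdio.h>

    struct mini_task {
            int status;
            const char *cred;       /* transient: valid only while the task lives */
    };

    struct setclientid_data {
            char acceptor[32];      /* must outlive the task */
    };

    static void call_done(struct mini_task *t, struct setclientid_data *d)
    {
            /* Runs before the task is torn down; copy what must survive it. */
            if (t->status == 0)
                    snprintf(d->acceptor, sizeof(d->acceptor), "%s", t->cred);
    }

    static int run_setclientid(struct setclientid_data *d)
    {
            struct mini_task task = { .status = 0, .cred = "nfs@server.example" };

            call_done(&task, d);    /* the completion callback */
            return task.status;     /* read status, then release the task */
    }
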
@@ -4975,6 +5068,9 @@ struct nfs4_delegreturndata {
        unsigned long timestamp;
        struct nfs_fattr fattr;
        int rpc_status;
+       struct inode *inode;
+       bool roc;
+       u32 roc_barrier;
 };
 
 static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
@@ -4988,7 +5084,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
        switch (task->tk_status) {
        case 0:
                renew_lease(data->res.server, data->timestamp);
-               break;
        case -NFS4ERR_ADMIN_REVOKED:
        case -NFS4ERR_DELEG_REVOKED:
        case -NFS4ERR_BAD_STATEID:
@@ -4996,6 +5091,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
        case -NFS4ERR_STALE_STATEID:
        case -NFS4ERR_EXPIRED:
                task->tk_status = 0;
+               if (data->roc)
+                       pnfs_roc_set_barrier(data->inode, data->roc_barrier);
                break;
        default:
                if (nfs4_async_handle_error(task, data->res.server, NULL) ==
@@ -5009,6 +5106,10 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 
 static void nfs4_delegreturn_release(void *calldata)
 {
+       struct nfs4_delegreturndata *data = calldata;
+
+       if (data->roc)
+               pnfs_roc_release(data->inode);
        kfree(calldata);
 }
 
@@ -5018,6 +5119,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
 
        d_data = (struct nfs4_delegreturndata *)data;
 
+       if (d_data->roc &&
+           pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task))
+               return;
+
        nfs4_setup_sequence(d_data->res.server,
                        &d_data->args.seq_args,
                        &d_data->res.seq_res,
@@ -5061,6 +5166,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
        nfs_fattr_init(data->res.fattr);
        data->timestamp = jiffies;
        data->rpc_status = 0;
+       data->inode = inode;
+       data->roc = list_empty(&NFS_I(inode)->open_files) ?
+                   pnfs_roc(inode) : false;
 
        task_setup_data.callback_data = data;
        msg.rpc_argp = &data->args;
@@ -5834,8 +5942,10 @@ struct nfs_release_lockowner_data {
 static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
 {
        struct nfs_release_lockowner_data *data = calldata;
-       nfs40_setup_sequence(data->server,
-                               &data->args.seq_args, &data->res.seq_res, task);
+       struct nfs_server *server = data->server;
+       nfs40_setup_sequence(server, &data->args.seq_args,
+                               &data->res.seq_res, task);
+       data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
        data->timestamp = jiffies;
 }
 
@@ -5852,6 +5962,8 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
                break;
        case -NFS4ERR_STALE_CLIENTID:
        case -NFS4ERR_EXPIRED:
+               nfs4_schedule_lease_recovery(server->nfs_client);
+               break;
        case -NFS4ERR_LEASE_MOVED:
        case -NFS4ERR_DELAY:
                if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN)
@@ -5872,7 +5984,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = {
        .rpc_release = nfs4_release_lockowner_release,
 };
 
-static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
+static void
+nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
 {
        struct nfs_release_lockowner_data *data;
        struct rpc_message msg = {
@@ -5880,11 +5993,11 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
        };
 
        if (server->nfs_client->cl_mvops->minor_version != 0)
-               return -EINVAL;
+               return;
 
        data = kmalloc(sizeof(*data), GFP_NOFS);
        if (!data)
-               return -ENOMEM;
+               return;
        data->lsp = lsp;
        data->server = server;
        data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
@@ -5895,7 +6008,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
        msg.rpc_resp = &data->res;
        nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
        rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
-       return 0;
 }
 
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
@@ -8182,7 +8294,8 @@ static int nfs41_free_stateid(struct nfs_server *server,
        return ret;
 }
 
-static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
+static void
+nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
 {
        struct rpc_task *task;
        struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
@@ -8190,9 +8303,8 @@ static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_sta
        task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
        nfs4_free_lock_state(server, lsp);
        if (IS_ERR(task))
-               return PTR_ERR(task);
+               return;
        rpc_put_task(task);
-       return 0;
 }
 
 static bool nfs41_match_stateid(const nfs4_stateid *s1,
index 42f12118216700d2cde7c6093b8470b1f20ab674..a043f618cd5a30ef35a8ec63d54ff12034a2387f 100644 (file)
@@ -787,33 +787,36 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
  * that is compatible with current->files
  */
 static struct nfs4_lock_state *
-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
+__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
 {
        struct nfs4_lock_state *pos;
        list_for_each_entry(pos, &state->lock_states, ls_locks) {
-               if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type)
+               if (pos->ls_owner != fl_owner)
                        continue;
-               switch (pos->ls_owner.lo_type) {
-               case NFS4_POSIX_LOCK_TYPE:
-                       if (pos->ls_owner.lo_u.posix_owner != fl_owner)
-                               continue;
-                       break;
-               case NFS4_FLOCK_LOCK_TYPE:
-                       if (pos->ls_owner.lo_u.flock_owner != fl_pid)
-                               continue;
-               }
                atomic_inc(&pos->ls_count);
                return pos;
        }
        return NULL;
 }
 
+static void
+free_lock_state_work(struct work_struct *work)
+{
+       struct nfs4_lock_state *lsp = container_of(work,
+                                       struct nfs4_lock_state, ls_release);
+       struct nfs4_state *state = lsp->ls_state;
+       struct nfs_server *server = state->owner->so_server;
+       struct nfs_client *clp = server->nfs_client;
+
+       clp->cl_mvops->free_lock_state(server, lsp);
+}
+
 /*
  * Return a compatible lock_state. If no initialized lock_state structure
  * exists, return an uninitialized one.
  *
  */
-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
 {
        struct nfs4_lock_state *lsp;
        struct nfs_server *server = state->owner->so_server;
@@ -824,21 +827,12 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
        nfs4_init_seqid_counter(&lsp->ls_seqid);
        atomic_set(&lsp->ls_count, 1);
        lsp->ls_state = state;
-       lsp->ls_owner.lo_type = type;
-       switch (lsp->ls_owner.lo_type) {
-       case NFS4_FLOCK_LOCK_TYPE:
-               lsp->ls_owner.lo_u.flock_owner = fl_pid;
-               break;
-       case NFS4_POSIX_LOCK_TYPE:
-               lsp->ls_owner.lo_u.posix_owner = fl_owner;
-               break;
-       default:
-               goto out_free;
-       }
+       lsp->ls_owner = fl_owner;
        lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
        if (lsp->ls_seqid.owner_id < 0)
                goto out_free;
        INIT_LIST_HEAD(&lsp->ls_locks);
+       INIT_WORK(&lsp->ls_release, free_lock_state_work);
        return lsp;
 out_free:
        kfree(lsp);
@@ -857,13 +851,13 @@ void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp
  * exists, return an uninitialized one.
  *
  */
-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type)
+static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
 {
        struct nfs4_lock_state *lsp, *new = NULL;
        
        for(;;) {
                spin_lock(&state->state_lock);
-               lsp = __nfs4_find_lock_state(state, owner, pid, type);
+               lsp = __nfs4_find_lock_state(state, owner);
                if (lsp != NULL)
                        break;
                if (new != NULL) {
@@ -874,7 +868,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
                        break;
                }
                spin_unlock(&state->state_lock);
-               new = nfs4_alloc_lock_state(state, owner, pid, type);
+               new = nfs4_alloc_lock_state(state, owner);
                if (new == NULL)
                        return NULL;
        }
@@ -902,13 +896,12 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
        if (list_empty(&state->lock_states))
                clear_bit(LK_STATE_IN_USE, &state->flags);
        spin_unlock(&state->state_lock);
-       server = state->owner->so_server;
-       if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
-               struct nfs_client *clp = server->nfs_client;
-
-               clp->cl_mvops->free_lock_state(server, lsp);
-       } else
+       if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
+               queue_work(nfsiod_workqueue, &lsp->ls_release);
+       else {
+               server = state->owner->so_server;
                nfs4_free_lock_state(server, lsp);
+       }
 }
 
 static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
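
Background for the queue_work() change above: nfs4_put_lock_state() may be invoked from contexts where synchronously issuing the minor-version free_lock_state RPC is undesirable, so nfs4_lock_state now embeds a work_struct and the actual free runs from nfsiod. The embed-plus-container_of idiom it uses, as a small illustrative module sketch (struct foo and its messages are made up, not NFS code):

    #include <linux/module.h>
    #include <linux/slab.h>
    #include <linux/workqueue.h>

    struct foo {
            int id;
            struct work_struct release;     /* embedded, like ls_release */
    };

    static void foo_release_work(struct work_struct *work)
    {
            /* Recover the containing object from the work_struct pointer. */
            struct foo *f = container_of(work, struct foo, release);

            pr_info("foo %d released in process context\n", f->id);
            kfree(f);
    }

    /* Instead of freeing inline, hand the object to a workqueue. */
    static void foo_put(struct foo *f)
    {
            schedule_work(&f->release);
    }

    static int __init foo_init(void)
    {
            struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

            if (!f)
                    return -ENOMEM;
            f->id = 1;
            INIT_WORK(&f->release, foo_release_work);
            foo_put(f);
            return 0;
    }
    module_init(foo_init);
    MODULE_LICENSE("GPL");
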
@@ -935,13 +928,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
 
        if (fl->fl_ops != NULL)
                return 0;
-       if (fl->fl_flags & FL_POSIX)
-               lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
-       else if (fl->fl_flags & FL_FLOCK)
-               lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid,
-                               NFS4_FLOCK_LOCK_TYPE);
-       else
-               return -EINVAL;
+       lsp = nfs4_get_lock_state(state, fl->fl_owner);
        if (lsp == NULL)
                return -ENOMEM;
        fl->fl_u.nfs4_fl.owner = lsp;
@@ -955,7 +942,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
 {
        struct nfs4_lock_state *lsp;
        fl_owner_t fl_owner;
-       pid_t fl_pid;
        int ret = -ENOENT;
 
 
@@ -966,9 +952,8 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
                goto out;
 
        fl_owner = lockowner->l_owner;
-       fl_pid = lockowner->l_pid;
        spin_lock(&state->state_lock);
-       lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
+       lsp = __nfs4_find_lock_state(state, fl_owner);
        if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
                ret = -EIO;
        else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
index 0a744f3a86f6f592c9913f60a4b99c5d715d6324..1c32adbe728df548bcc92ee175b4d6061b3b4489 100644 (file)
@@ -932,11 +932,11 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
 
 DECLARE_EVENT_CLASS(nfs4_read_event,
                TP_PROTO(
-                       const struct nfs_pgio_data *data,
+                       const struct nfs_pgio_header *hdr,
                        int error
                ),
 
-               TP_ARGS(data, error),
+               TP_ARGS(hdr, error),
 
                TP_STRUCT__entry(
                        __field(dev_t, dev)
@@ -948,12 +948,12 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
                ),
 
                TP_fast_assign(
-                       const struct inode *inode = data->header->inode;
+                       const struct inode *inode = hdr->inode;
                        __entry->dev = inode->i_sb->s_dev;
                        __entry->fileid = NFS_FILEID(inode);
                        __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
-                       __entry->offset = data->args.offset;
-                       __entry->count = data->args.count;
+                       __entry->offset = hdr->args.offset;
+                       __entry->count = hdr->args.count;
                        __entry->error = error;
                ),
 
@@ -972,10 +972,10 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
 #define DEFINE_NFS4_READ_EVENT(name) \
        DEFINE_EVENT(nfs4_read_event, name, \
                        TP_PROTO( \
-                               const struct nfs_pgio_data *data, \
+                               const struct nfs_pgio_header *hdr, \
                                int error \
                        ), \
-                       TP_ARGS(data, error))
+                       TP_ARGS(hdr, error))
 DEFINE_NFS4_READ_EVENT(nfs4_read);
 #ifdef CONFIG_NFS_V4_1
 DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
@@ -983,11 +983,11 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
 
 DECLARE_EVENT_CLASS(nfs4_write_event,
                TP_PROTO(
-                       const struct nfs_pgio_data *data,
+                       const struct nfs_pgio_header *hdr,
                        int error
                ),
 
-               TP_ARGS(data, error),
+               TP_ARGS(hdr, error),
 
                TP_STRUCT__entry(
                        __field(dev_t, dev)
@@ -999,12 +999,12 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
                ),
 
                TP_fast_assign(
-                       const struct inode *inode = data->header->inode;
+                       const struct inode *inode = hdr->inode;
                        __entry->dev = inode->i_sb->s_dev;
                        __entry->fileid = NFS_FILEID(inode);
                        __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
-                       __entry->offset = data->args.offset;
-                       __entry->count = data->args.count;
+                       __entry->offset = hdr->args.offset;
+                       __entry->count = hdr->args.count;
                        __entry->error = error;
                ),
 
@@ -1024,10 +1024,10 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
 #define DEFINE_NFS4_WRITE_EVENT(name) \
        DEFINE_EVENT(nfs4_write_event, name, \
                        TP_PROTO( \
-                               const struct nfs_pgio_data *data, \
+                               const struct nfs_pgio_header *hdr, \
                                int error \
                        ), \
-                       TP_ARGS(data, error))
+                       TP_ARGS(hdr, error))
 DEFINE_NFS4_WRITE_EVENT(nfs4_write);
 #ifdef CONFIG_NFS_V4_1
 DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write);
index 939ae606cfa4c96d4d607afd8886f5d45ed85896..e13b59d8d9aa1374990c5eee9acec623c1193537 100644 (file)
@@ -7092,7 +7092,7 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
        if (!status)
                status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (!status)
-               status = decode_reclaim_complete(xdr, (void *)NULL);
+               status = decode_reclaim_complete(xdr, NULL);
        return status;
 }
 
index 611320753db2117469765bbdee79f74b1d721424..ae05278b3761df60ed622195d011df5a1059ab8a 100644 (file)
@@ -439,22 +439,21 @@ static void _read_done(struct ore_io_state *ios, void *private)
        objlayout_read_done(&objios->oir, status, objios->sync);
 }
 
-int objio_read_pagelist(struct nfs_pgio_data *rdata)
+int objio_read_pagelist(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = rdata->header;
        struct objio_state *objios;
        int ret;
 
        ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true,
-                       hdr->lseg, rdata->args.pages, rdata->args.pgbase,
-                       rdata->args.offset, rdata->args.count, rdata,
+                       hdr->lseg, hdr->args.pages, hdr->args.pgbase,
+                       hdr->args.offset, hdr->args.count, hdr,
                        GFP_KERNEL, &objios);
        if (unlikely(ret))
                return ret;
 
        objios->ios->done = _read_done;
        dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
-               rdata->args.offset, rdata->args.count);
+               hdr->args.offset, hdr->args.count);
        ret = ore_read(objios->ios);
        if (unlikely(ret))
                objio_free_result(&objios->oir);
@@ -487,11 +486,11 @@ static void _write_done(struct ore_io_state *ios, void *private)
 static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
 {
        struct objio_state *objios = priv;
-       struct nfs_pgio_data *wdata = objios->oir.rpcdata;
-       struct address_space *mapping = wdata->header->inode->i_mapping;
+       struct nfs_pgio_header *hdr = objios->oir.rpcdata;
+       struct address_space *mapping = hdr->inode->i_mapping;
        pgoff_t index = offset / PAGE_SIZE;
        struct page *page;
-       loff_t i_size = i_size_read(wdata->header->inode);
+       loff_t i_size = i_size_read(hdr->inode);
 
        if (offset >= i_size) {
                *uptodate = true;
@@ -531,15 +530,14 @@ static const struct _ore_r4w_op _r4w_op = {
        .put_page = &__r4w_put_page,
 };
 
-int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)
+int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
 {
-       struct nfs_pgio_header *hdr = wdata->header;
        struct objio_state *objios;
        int ret;
 
        ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false,
-                       hdr->lseg, wdata->args.pages, wdata->args.pgbase,
-                       wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
+                       hdr->lseg, hdr->args.pages, hdr->args.pgbase,
+                       hdr->args.offset, hdr->args.count, hdr, GFP_NOFS,
                        &objios);
        if (unlikely(ret))
                return ret;
@@ -551,7 +549,7 @@ int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)
                objios->ios->done = _write_done;
 
        dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
-               wdata->args.offset, wdata->args.count);
+               hdr->args.offset, hdr->args.count);
        ret = ore_write(objios->ios);
        if (unlikely(ret)) {
                objio_free_result(&objios->oir);
index 765d3f54e9860b18404dea42a757f899966ccda8..697a16d11fac3204b0574c084965662988626f88 100644 (file)
@@ -229,36 +229,36 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
 static void _rpc_read_complete(struct work_struct *work)
 {
        struct rpc_task *task;
-       struct nfs_pgio_data *rdata;
+       struct nfs_pgio_header *hdr;
 
        dprintk("%s enter\n", __func__);
        task = container_of(work, struct rpc_task, u.tk_work);
-       rdata = container_of(task, struct nfs_pgio_data, task);
+       hdr = container_of(task, struct nfs_pgio_header, task);
 
-       pnfs_ld_read_done(rdata);
+       pnfs_ld_read_done(hdr);
 }
 
 void
 objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
 {
-       struct nfs_pgio_data *rdata = oir->rpcdata;
+       struct nfs_pgio_header *hdr = oir->rpcdata;
 
-       oir->status = rdata->task.tk_status = status;
+       oir->status = hdr->task.tk_status = status;
        if (status >= 0)
-               rdata->res.count = status;
+               hdr->res.count = status;
        else
-               rdata->header->pnfs_error = status;
+               hdr->pnfs_error = status;
        objlayout_iodone(oir);
        /* must not use oir after this point */
 
        dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
-               status, rdata->res.eof, sync);
+               status, hdr->res.eof, sync);
 
        if (sync)
-               pnfs_ld_read_done(rdata);
+               pnfs_ld_read_done(hdr);
        else {
-               INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
-               schedule_work(&rdata->task.u.tk_work);
+               INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete);
+               schedule_work(&hdr->task.u.tk_work);
        }
 }
 
@@ -266,12 +266,11 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
  * Perform sync or async reads.
  */
 enum pnfs_try_status
-objlayout_read_pagelist(struct nfs_pgio_data *rdata)
+objlayout_read_pagelist(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = rdata->header;
        struct inode *inode = hdr->inode;
-       loff_t offset = rdata->args.offset;
-       size_t count = rdata->args.count;
+       loff_t offset = hdr->args.offset;
+       size_t count = hdr->args.count;
        int err;
        loff_t eof;
 
@@ -279,23 +278,23 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
        if (unlikely(offset + count > eof)) {
                if (offset >= eof) {
                        err = 0;
-                       rdata->res.count = 0;
-                       rdata->res.eof = 1;
+                       hdr->res.count = 0;
+                       hdr->res.eof = 1;
                        /*FIXME: do we need to call pnfs_ld_read_done() */
                        goto out;
                }
                count = eof - offset;
        }
 
-       rdata->res.eof = (offset + count) >= eof;
-       _fix_verify_io_params(hdr->lseg, &rdata->args.pages,
-                             &rdata->args.pgbase,
-                             rdata->args.offset, rdata->args.count);
+       hdr->res.eof = (offset + count) >= eof;
+       _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
+                             &hdr->args.pgbase,
+                             hdr->args.offset, hdr->args.count);
 
        dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
-               __func__, inode->i_ino, offset, count, rdata->res.eof);
+               __func__, inode->i_ino, offset, count, hdr->res.eof);
 
-       err = objio_read_pagelist(rdata);
+       err = objio_read_pagelist(hdr);
  out:
        if (unlikely(err)) {
                hdr->pnfs_error = err;
@@ -312,38 +311,38 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
 static void _rpc_write_complete(struct work_struct *work)
 {
        struct rpc_task *task;
-       struct nfs_pgio_data *wdata;
+       struct nfs_pgio_header *hdr;
 
        dprintk("%s enter\n", __func__);
        task = container_of(work, struct rpc_task, u.tk_work);
-       wdata = container_of(task, struct nfs_pgio_data, task);
+       hdr = container_of(task, struct nfs_pgio_header, task);
 
-       pnfs_ld_write_done(wdata);
+       pnfs_ld_write_done(hdr);
 }
 
 void
 objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
 {
-       struct nfs_pgio_data *wdata = oir->rpcdata;
+       struct nfs_pgio_header *hdr = oir->rpcdata;
 
-       oir->status = wdata->task.tk_status = status;
+       oir->status = hdr->task.tk_status = status;
        if (status >= 0) {
-               wdata->res.count = status;
-               wdata->verf.committed = oir->committed;
+               hdr->res.count = status;
+               hdr->verf.committed = oir->committed;
        } else {
-               wdata->header->pnfs_error = status;
+               hdr->pnfs_error = status;
        }
        objlayout_iodone(oir);
        /* must not use oir after this point */
 
        dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
-               status, wdata->verf.committed, sync);
+               status, hdr->verf.committed, sync);
 
        if (sync)
-               pnfs_ld_write_done(wdata);
+               pnfs_ld_write_done(hdr);
        else {
-               INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
-               schedule_work(&wdata->task.u.tk_work);
+               INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete);
+               schedule_work(&hdr->task.u.tk_work);
        }
 }
 
@@ -351,17 +350,15 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
  * Perform sync or async writes.
  */
 enum pnfs_try_status
-objlayout_write_pagelist(struct nfs_pgio_data *wdata,
-                        int how)
+objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how)
 {
-       struct nfs_pgio_header *hdr = wdata->header;
        int err;
 
-       _fix_verify_io_params(hdr->lseg, &wdata->args.pages,
-                             &wdata->args.pgbase,
-                             wdata->args.offset, wdata->args.count);
+       _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
+                             &hdr->args.pgbase,
+                             hdr->args.offset, hdr->args.count);
 
-       err = objio_write_pagelist(wdata, how);
+       err = objio_write_pagelist(hdr, how);
        if (unlikely(err)) {
                hdr->pnfs_error = err;
                dprintk("%s: Returned Error %d\n", __func__, err);
index 01e041029a6ca6ab5be86062dbb54775c3ae87f4..fd13f1d2f136d6c35dd29b56b405a2815f17bef8 100644 (file)
@@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
  */
 extern void objio_free_result(struct objlayout_io_res *oir);
 
-extern int objio_read_pagelist(struct nfs_pgio_data *rdata);
-extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how);
+extern int objio_read_pagelist(struct nfs_pgio_header *rdata);
+extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how);
 
 /*
  * callback API
@@ -168,10 +168,10 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg(
 extern void objlayout_free_lseg(struct pnfs_layout_segment *);
 
 extern enum pnfs_try_status objlayout_read_pagelist(
-       struct nfs_pgio_data *);
+       struct nfs_pgio_header *);
 
 extern enum pnfs_try_status objlayout_write_pagelist(
-       struct nfs_pgio_data *,
+       struct nfs_pgio_header *,
        int how);
 
 extern void objlayout_encode_layoutcommit(
index 0be5050638f7c026f14eb8eb728530a6238160d0..ba491926df5f7df2db1e224c96e7e070bd36dcec 100644 (file)
@@ -141,16 +141,24 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
  * @req - request in group that is to be locked
  *
  * this lock must be held if modifying the page group list
+ *
+ * returns result from wait_on_bit_lock: 0 on success, < 0 on error
  */
-void
-nfs_page_group_lock(struct nfs_page *req)
+int
+nfs_page_group_lock(struct nfs_page *req, bool wait)
 {
        struct nfs_page *head = req->wb_head;
+       int ret;
 
        WARN_ON_ONCE(head != head->wb_head);
 
-       wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+       do {
+               ret = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
                        TASK_UNINTERRUPTIBLE);
+       } while (wait && ret != 0);
+
+       WARN_ON_ONCE(ret > 0);
+       return ret;
 }
 
 /*
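
The loop above makes nfs_page_group_lock() optionally blocking: with wait=true it retries wait_on_bit_lock() until PG_HEADLOCK is held, with wait=false the failure is surfaced to the caller. A userspace analogue of the same contract using C11 atomics (the -EAGAIN return is an illustrative stand-in; the kernel propagates whatever wait_on_bit_lock() reports):

    #include <errno.h>
    #include <sched.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_flag headlock = ATOMIC_FLAG_INIT;

    /* Returns 0 once the lock is held; -EAGAIN if !wait and it was busy. */
    static int group_lock(bool wait)
    {
            while (atomic_flag_test_and_set_explicit(&headlock,
                                                     memory_order_acquire)) {
                    if (!wait)
                            return -EAGAIN;
                    sched_yield();  /* the kernel sleeps on the bit instead */
            }
            return 0;
    }

    static void group_unlock(void)
    {
            atomic_flag_clear_explicit(&headlock, memory_order_release);
    }
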
@@ -211,7 +219,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
 {
        bool ret;
 
-       nfs_page_group_lock(req);
+       nfs_page_group_lock(req, true);
        ret = nfs_page_group_sync_on_bit_locked(req, bit);
        nfs_page_group_unlock(req);
 
@@ -454,123 +462,72 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 }
 EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
 
-static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr)
-{
-       return container_of(hdr, struct nfs_rw_header, header);
-}
-
-/**
- * nfs_rw_header_alloc - Allocate a header for a read or write
- * @ops: Read or write function vector
- */
-struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops)
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops)
 {
-       struct nfs_rw_header *header = ops->rw_alloc_header();
-
-       if (header) {
-               struct nfs_pgio_header *hdr = &header->header;
+       struct nfs_pgio_header *hdr = ops->rw_alloc_header();
 
+       if (hdr) {
                INIT_LIST_HEAD(&hdr->pages);
                spin_lock_init(&hdr->lock);
-               atomic_set(&hdr->refcnt, 0);
                hdr->rw_ops = ops;
        }
-       return header;
+       return hdr;
 }
-EXPORT_SYMBOL_GPL(nfs_rw_header_alloc);
+EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc);
 
 /*
- * nfs_rw_header_free - Free a read or write header
+ * nfs_pgio_header_free - Free a read or write header
  * @hdr: The header to free
  */
-void nfs_rw_header_free(struct nfs_pgio_header *hdr)
+void nfs_pgio_header_free(struct nfs_pgio_header *hdr)
 {
-       hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr));
+       hdr->rw_ops->rw_free_header(hdr);
 }
-EXPORT_SYMBOL_GPL(nfs_rw_header_free);
+EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
 
 /**
- * nfs_pgio_data_alloc - Allocate pageio data
- * @hdr: The header making a request
- * @pagecount: Number of pages to create
- */
-static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr,
-                                                unsigned int pagecount)
-{
-       struct nfs_pgio_data *data, *prealloc;
-
-       prealloc = &NFS_RW_HEADER(hdr)->rpc_data;
-       if (prealloc->header == NULL)
-               data = prealloc;
-       else
-               data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               goto out;
-
-       if (nfs_pgarray_set(&data->pages, pagecount)) {
-               data->header = hdr;
-               atomic_inc(&hdr->refcnt);
-       } else {
-               if (data != prealloc)
-                       kfree(data);
-               data = NULL;
-       }
-out:
-       return data;
-}
-
-/**
- * nfs_pgio_data_release - Properly free pageio data
- * @data: The data to release
+ * nfs_pgio_data_destroy - make @hdr suitable for reuse
+ *
+ * Frees memory and releases refs from nfs_generic_pgio, so that it may
+ * be called again.
+ *
+ * @hdr: A header that has had nfs_generic_pgio called
  */
-void nfs_pgio_data_release(struct nfs_pgio_data *data)
+void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-       struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr);
-
-       put_nfs_open_context(data->args.context);
-       if (data->pages.pagevec != data->pages.page_array)
-               kfree(data->pages.pagevec);
-       if (data == &pageio_header->rpc_data) {
-               data->header = NULL;
-               data = NULL;
-       }
-       if (atomic_dec_and_test(&hdr->refcnt))
-               hdr->completion_ops->completion(hdr);
-       /* Note: we only free the rpc_task after callbacks are done.
-        * See the comment in rpc_free_task() for why
-        */
-       kfree(data);
+       put_nfs_open_context(hdr->args.context);
+       if (hdr->page_array.pagevec != hdr->page_array.page_array)
+               kfree(hdr->page_array.pagevec);
 }
-EXPORT_SYMBOL_GPL(nfs_pgio_data_release);
+EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy);
 
 /**
  * nfs_pgio_rpcsetup - Set up arguments for a pageio call
- * @data: The pageio data
+ * @hdr: The pageio hdr
  * @count: Number of bytes to read
  * @offset: Initial offset
  * @how: How to commit data (writes only)
  * @cinfo: Commit information for the call (writes only)
  */
-static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data,
+static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
                              unsigned int count, unsigned int offset,
                              int how, struct nfs_commit_info *cinfo)
 {
-       struct nfs_page *req = data->header->req;
+       struct nfs_page *req = hdr->req;
 
        /* Set up the RPC argument and reply structs
-        * NB: take care not to mess about with data->commit et al. */
+        * NB: take care not to mess about with hdr->commit et al. */
 
-       data->args.fh     = NFS_FH(data->header->inode);
-       data->args.offset = req_offset(req) + offset;
+       hdr->args.fh     = NFS_FH(hdr->inode);
+       hdr->args.offset = req_offset(req) + offset;
        /* pnfs_set_layoutcommit needs this */
-       data->mds_offset = data->args.offset;
-       data->args.pgbase = req->wb_pgbase + offset;
-       data->args.pages  = data->pages.pagevec;
-       data->args.count  = count;
-       data->args.context = get_nfs_open_context(req->wb_context);
-       data->args.lock_context = req->wb_lock_context;
-       data->args.stable  = NFS_UNSTABLE;
+       hdr->mds_offset = hdr->args.offset;
+       hdr->args.pgbase = req->wb_pgbase + offset;
+       hdr->args.pages  = hdr->page_array.pagevec;
+       hdr->args.count  = count;
+       hdr->args.context = get_nfs_open_context(req->wb_context);
+       hdr->args.lock_context = req->wb_lock_context;
+       hdr->args.stable  = NFS_UNSTABLE;
        switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
        case 0:
                break;
@@ -578,59 +535,59 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data,
                if (nfs_reqs_to_commit(cinfo))
                        break;
        default:
-               data->args.stable = NFS_FILE_SYNC;
+               hdr->args.stable = NFS_FILE_SYNC;
        }
 
-       data->res.fattr   = &data->fattr;
-       data->res.count   = count;
-       data->res.eof     = 0;
-       data->res.verf    = &data->verf;
-       nfs_fattr_init(&data->fattr);
+       hdr->res.fattr   = &hdr->fattr;
+       hdr->res.count   = count;
+       hdr->res.eof     = 0;
+       hdr->res.verf    = &hdr->verf;
+       nfs_fattr_init(&hdr->fattr);
 }
 
 /**
- * nfs_pgio_prepare - Prepare pageio data to go over the wire
+ * nfs_pgio_prepare - Prepare pageio hdr to go over the wire
  * @task: The current task
- * @calldata: pageio data to prepare
+ * @calldata: pageio header to prepare
  */
 static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
 {
-       struct nfs_pgio_data *data = calldata;
+       struct nfs_pgio_header *hdr = calldata;
        int err;
-       err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data);
+       err = NFS_PROTO(hdr->inode)->pgio_rpc_prepare(task, hdr);
        if (err)
                rpc_exit(task, err);
 }
 
-int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data,
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
                      const struct rpc_call_ops *call_ops, int how, int flags)
 {
        struct rpc_task *task;
        struct rpc_message msg = {
-               .rpc_argp = &data->args,
-               .rpc_resp = &data->res,
-               .rpc_cred = data->header->cred,
+               .rpc_argp = &hdr->args,
+               .rpc_resp = &hdr->res,
+               .rpc_cred = hdr->cred,
        };
        struct rpc_task_setup task_setup_data = {
                .rpc_client = clnt,
-               .task = &data->task,
+               .task = &hdr->task,
                .rpc_message = &msg,
                .callback_ops = call_ops,
-               .callback_data = data,
+               .callback_data = hdr,
                .workqueue = nfsiod_workqueue,
                .flags = RPC_TASK_ASYNC | flags,
        };
        int ret = 0;
 
-       data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how);
+       hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how);
 
        dprintk("NFS: %5u initiated pgio call "
                "(req %s/%llu, %u bytes @ offset %llu)\n",
-               data->task.tk_pid,
-               data->header->inode->i_sb->s_id,
-               (unsigned long long)NFS_FILEID(data->header->inode),
-               data->args.count,
-               (unsigned long long)data->args.offset);
+               hdr->task.tk_pid,
+               hdr->inode->i_sb->s_id,
+               (unsigned long long)NFS_FILEID(hdr->inode),
+               hdr->args.count,
+               (unsigned long long)hdr->args.offset);
 
        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task)) {
@@ -657,22 +614,23 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
                          struct nfs_pgio_header *hdr)
 {
        set_bit(NFS_IOHDR_REDO, &hdr->flags);
-       nfs_pgio_data_release(hdr->data);
-       hdr->data = NULL;
+       nfs_pgio_data_destroy(hdr);
+       hdr->completion_ops->completion(hdr);
        desc->pg_completion_ops->error_cleanup(&desc->pg_list);
        return -ENOMEM;
 }
 
 /**
  * nfs_pgio_release - Release pageio data
- * @calldata: The pageio data to release
+ * @calldata: The pageio header to release
  */
 static void nfs_pgio_release(void *calldata)
 {
-       struct nfs_pgio_data *data = calldata;
-       if (data->header->rw_ops->rw_release)
-               data->header->rw_ops->rw_release(data);
-       nfs_pgio_data_release(data);
+       struct nfs_pgio_header *hdr = calldata;
+       if (hdr->rw_ops->rw_release)
+               hdr->rw_ops->rw_release(hdr);
+       nfs_pgio_data_destroy(hdr);
+       hdr->completion_ops->completion(hdr);
 }
 
 /**
@@ -713,22 +671,22 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init);
 /**
  * nfs_pgio_result - Basic pageio error handling
  * @task: The task that ran
- * @calldata: Pageio data to check
+ * @calldata: Pageio header to check
  */
 static void nfs_pgio_result(struct rpc_task *task, void *calldata)
 {
-       struct nfs_pgio_data *data = calldata;
-       struct inode *inode = data->header->inode;
+       struct nfs_pgio_header *hdr = calldata;
+       struct inode *inode = hdr->inode;
 
        dprintk("NFS: %s: %5u, (status %d)\n", __func__,
                task->tk_pid, task->tk_status);
 
-       if (data->header->rw_ops->rw_done(task, data, inode) != 0)
+       if (hdr->rw_ops->rw_done(task, hdr, inode) != 0)
                return;
        if (task->tk_status < 0)
-               nfs_set_pgio_error(data->header, task->tk_status, data->args.offset);
+               nfs_set_pgio_error(hdr, task->tk_status, hdr->args.offset);
        else
-               data->header->rw_ops->rw_result(task, data);
+               hdr->rw_ops->rw_result(task, hdr);
 }
 
 /*
@@ -744,17 +702,16 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
 {
        struct nfs_page         *req;
        struct page             **pages;
-       struct nfs_pgio_data    *data;
        struct list_head *head = &desc->pg_list;
        struct nfs_commit_info cinfo;
+       unsigned int pagecount;
 
-       data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base,
-                                                          desc->pg_count));
-       if (!data)
+       pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count);
+       if (!nfs_pgarray_set(&hdr->page_array, pagecount))
                return nfs_pgio_error(desc, hdr);
 
        nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
-       pages = data->pages.pagevec;
+       pages = hdr->page_array.pagevec;
        while (!list_empty(head)) {
                req = nfs_list_entry(head->next);
                nfs_list_remove_request(req);
@@ -767,8 +724,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
                desc->pg_ioflags &= ~FLUSH_COND_STABLE;
 
        /* Set up the argument struct */
-       nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
-       hdr->data = data;
+       nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
        desc->pg_rpc_callops = &nfs_pgio_common_ops;
        return 0;
 }
@@ -776,25 +732,20 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
 
 static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
 {
-       struct nfs_rw_header *rw_hdr;
        struct nfs_pgio_header *hdr;
        int ret;
 
-       rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops);
-       if (!rw_hdr) {
+       hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+       if (!hdr) {
                desc->pg_completion_ops->error_cleanup(&desc->pg_list);
                return -ENOMEM;
        }
-       hdr = &rw_hdr->header;
-       nfs_pgheader_init(desc, hdr, nfs_rw_header_free);
-       atomic_inc(&hdr->refcnt);
+       nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
        ret = nfs_generic_pgio(desc, hdr);
        if (ret == 0)
                ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
-                                       hdr->data, desc->pg_rpc_callops,
+                                       hdr, desc->pg_rpc_callops,
                                        desc->pg_ioflags, 0);
-       if (atomic_dec_and_test(&hdr->refcnt))
-               hdr->completion_ops->completion(hdr);
        return ret;
 }
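
With the merge, a header corresponds to exactly one I/O, so the hdr->refcnt juggling above becomes unnecessary: the completion callback now runs exactly once, from nfs_pgio_release() on the RPC release path or from nfs_pgio_error() when setup fails. The resulting lifetime, restated as a sketch:

        /* One I/O per header (sketch of the flow after this patch):
         *
         *   nfs_pgio_header_alloc()
         *     -> nfs_generic_pgio()      fill hdr->page_array and hdr->args
         *     -> nfs_initiate_pgio()     run the single RPC
         *     -> nfs_pgio_release()      rw_release + nfs_pgio_data_destroy()
         *                                + completion_ops->completion(hdr)
         *
         * nfs_pgio_error() performs the same destroy + completion when setup
         * fails before the RPC is launched, so completion fires exactly once.
         */
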
 
@@ -907,8 +858,13 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
        struct nfs_page *subreq;
        unsigned int bytes_left = 0;
        unsigned int offset, pgbase;
+       int ret;
 
-       nfs_page_group_lock(req);
+       ret = nfs_page_group_lock(req, false);
+       if (ret < 0) {
+               desc->pg_error = ret;
+               return 0;
+       }
 
        subreq = req;
        bytes_left = subreq->wb_bytes;
@@ -930,7 +886,11 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
                        if (desc->pg_recoalesce)
                                return 0;
                        /* retry add_request for this subreq */
-                       nfs_page_group_lock(req);
+                       ret = nfs_page_group_lock(req, false);
+                       if (ret < 0) {
+                               desc->pg_error = ret;
+                               return 0;
+                       }
                        continue;
                }
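
nfs_page_group_lock() grows a second argument and an error return in this series. The convention below is inferred from the call sites in this diff (the signature itself is not shown here): with true the lock is waited for unconditionally and the return value is ignored, with false the call may fail, and the failure is surfaced through desc->pg_error while __nfs_pageio_add_request() still returns 0:

        /* Hedged restatement of the locking convention used above:
         *
         *      ret = nfs_page_group_lock(req, false);  /* may fail */
         *      if (ret < 0) {
         *              desc->pg_error = ret;   /* reported via the descriptor, */
         *              return 0;               /* not via the return value     */
         *      }
         *
         *      nfs_page_group_lock(req, true);         /* blocks until held */
         */
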
 
@@ -1005,7 +965,38 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
        } while (ret);
        return ret;
 }
-EXPORT_SYMBOL_GPL(nfs_pageio_add_request);
+
+/**
+ * nfs_pageio_resend - Transfer requests to a new descriptor and resend
+ * @desc: the pageio descriptor to add requests to
+ * @hdr: the pgio header to move requests from
+ *
+ * Try to move each request (nfs_page) from @hdr to @desc, then attempt
+ * to send them.
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
+                     struct nfs_pgio_header *hdr)
+{
+       LIST_HEAD(failed);
+
+       desc->pg_dreq = hdr->dreq;
+       while (!list_empty(&hdr->pages)) {
+               struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+
+               nfs_list_remove_request(req);
+               if (!nfs_pageio_add_request(desc, req))
+                       nfs_list_add_request(req, &failed);
+       }
+       nfs_pageio_complete(desc);
+       if (!list_empty(&failed)) {
+               list_move(&failed, &hdr->pages);
+               return -EIO;
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_resend);
 
 /**
  * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
@@ -1021,7 +1012,6 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
                        break;
        }
 }
-EXPORT_SYMBOL_GPL(nfs_pageio_complete);
 
 /**
  * nfs_pageio_cond_complete - Conditional I/O completion
index a8914b3356174a5063369452bc73fab7c0b8db2e..a3851debf8a2f481435b3750f5cb1bec678ab4fe 100644 (file)
@@ -361,6 +361,23 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 }
 EXPORT_SYMBOL_GPL(pnfs_put_lseg);
 
+static void pnfs_put_lseg_async_work(struct work_struct *work)
+{
+       struct pnfs_layout_segment *lseg;
+
+       lseg = container_of(work, struct pnfs_layout_segment, pls_work);
+
+       pnfs_put_lseg(lseg);
+}
+
+void
+pnfs_put_lseg_async(struct pnfs_layout_segment *lseg)
+{
+       INIT_WORK(&lseg->pls_work, pnfs_put_lseg_async_work);
+       schedule_work(&lseg->pls_work);
+}
+EXPORT_SYMBOL_GPL(pnfs_put_lseg_async);
+
 static u64
 end_offset(u64 start, u64 len)
 {
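
pnfs_put_lseg_async() defers the put to the system workqueue, presumably for callers that must not sleep or take the locks a synchronous pnfs_put_lseg() can end up taking (it may acquire the inode lock when dropping the last reference). A hedged usage sketch with a hypothetical caller:

        /* Hypothetical caller: drop an lseg reference from a context where a
         * direct pnfs_put_lseg() would be unsafe. The actual put runs later
         * from pnfs_put_lseg_async_work() in workqueue context. */
        static void example_rpc_release(struct nfs_pgio_header *hdr)
        {
                pnfs_put_lseg_async(hdr->lseg);
                hdr->lseg = NULL;
        }

Note that pls_work is (re)initialized on every call, so callers presumably must not queue the same lseg again before the first work item has run.
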
@@ -1470,41 +1487,19 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
 
-int pnfs_write_done_resend_to_mds(struct inode *inode,
-                               struct list_head *head,
-                               const struct nfs_pgio_completion_ops *compl_ops,
-                               struct nfs_direct_req *dreq)
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
 {
        struct nfs_pageio_descriptor pgio;
-       LIST_HEAD(failed);
 
        /* Resend all requests through the MDS */
-       nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops);
-       pgio.pg_dreq = dreq;
-       while (!list_empty(head)) {
-               struct nfs_page *req = nfs_list_entry(head->next);
-
-               nfs_list_remove_request(req);
-               if (!nfs_pageio_add_request(&pgio, req))
-                       nfs_list_add_request(req, &failed);
-       }
-       nfs_pageio_complete(&pgio);
-
-       if (!list_empty(&failed)) {
-               /* For some reason our attempt to resend pages. Mark the
-                * overall send request as having failed, and let
-                * nfs_writeback_release_full deal with the error.
-                */
-               list_move(&failed, head);
-               return -EIO;
-       }
-       return 0;
+       nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
+                             hdr->completion_ops);
+       return nfs_pageio_resend(&pgio, hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
 
-static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)
+static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
 
        dprintk("pnfs write error = %d\n", hdr->pnfs_error);
        if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
@@ -1512,50 +1507,42 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)
                pnfs_return_layout(hdr->inode);
        }
        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
-               data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
-                                                       &hdr->pages,
-                                                       hdr->completion_ops,
-                                                       hdr->dreq);
+               hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
 }
 
 /*
  * Called by non rpc-based layout drivers
  */
-void pnfs_ld_write_done(struct nfs_pgio_data *data)
+void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-
-       trace_nfs4_pnfs_write(data, hdr->pnfs_error);
+       trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
        if (!hdr->pnfs_error) {
-               pnfs_set_layoutcommit(data);
-               hdr->mds_ops->rpc_call_done(&data->task, data);
+               pnfs_set_layoutcommit(hdr);
+               hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
        } else
-               pnfs_ld_handle_write_error(data);
-       hdr->mds_ops->rpc_release(data);
+               pnfs_ld_handle_write_error(hdr);
+       hdr->mds_ops->rpc_release(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
 
 static void
 pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
-               struct nfs_pgio_data *data)
+               struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-
        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                list_splice_tail_init(&hdr->pages, &desc->pg_list);
                nfs_pageio_reset_write_mds(desc);
                desc->pg_recoalesce = 1;
        }
-       nfs_pgio_data_release(data);
+       nfs_pgio_data_destroy(hdr);
 }
 
 static enum pnfs_try_status
-pnfs_try_to_write_data(struct nfs_pgio_data *wdata,
+pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
                        const struct rpc_call_ops *call_ops,
                        struct pnfs_layout_segment *lseg,
                        int how)
 {
-       struct nfs_pgio_header *hdr = wdata->header;
        struct inode *inode = hdr->inode;
        enum pnfs_try_status trypnfs;
        struct nfs_server *nfss = NFS_SERVER(inode);
@@ -1563,8 +1550,8 @@ pnfs_try_to_write_data(struct nfs_pgio_data *wdata,
        hdr->mds_ops = call_ops;
 
        dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
-               inode->i_ino, wdata->args.count, wdata->args.offset, how);
-       trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
+               inode->i_ino, hdr->args.count, hdr->args.offset, how);
+       trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
        if (trypnfs != PNFS_NOT_ATTEMPTED)
                nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
        dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1575,139 +1562,105 @@ static void
 pnfs_do_write(struct nfs_pageio_descriptor *desc,
              struct nfs_pgio_header *hdr, int how)
 {
-       struct nfs_pgio_data *data = hdr->data;
        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
        struct pnfs_layout_segment *lseg = desc->pg_lseg;
        enum pnfs_try_status trypnfs;
 
        desc->pg_lseg = NULL;
-       trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
+       trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
        if (trypnfs == PNFS_NOT_ATTEMPTED)
-               pnfs_write_through_mds(desc, data);
+               pnfs_write_through_mds(desc, hdr);
        pnfs_put_lseg(lseg);
 }
 
 static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
 {
        pnfs_put_lseg(hdr->lseg);
-       nfs_rw_header_free(hdr);
+       nfs_pgio_header_free(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
 
 int
 pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 {
-       struct nfs_rw_header *whdr;
        struct nfs_pgio_header *hdr;
        int ret;
 
-       whdr = nfs_rw_header_alloc(desc->pg_rw_ops);
-       if (!whdr) {
+       hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+       if (!hdr) {
                desc->pg_completion_ops->error_cleanup(&desc->pg_list);
                pnfs_put_lseg(desc->pg_lseg);
                desc->pg_lseg = NULL;
                return -ENOMEM;
        }
-       hdr = &whdr->header;
        nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
        hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
-       atomic_inc(&hdr->refcnt);
        ret = nfs_generic_pgio(desc, hdr);
        if (ret != 0) {
                pnfs_put_lseg(desc->pg_lseg);
                desc->pg_lseg = NULL;
        } else
                pnfs_do_write(desc, hdr, desc->pg_ioflags);
-       if (atomic_dec_and_test(&hdr->refcnt))
-               hdr->completion_ops->completion(hdr);
        return ret;
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
 
-int pnfs_read_done_resend_to_mds(struct inode *inode,
-                               struct list_head *head,
-                               const struct nfs_pgio_completion_ops *compl_ops,
-                               struct nfs_direct_req *dreq)
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
 {
        struct nfs_pageio_descriptor pgio;
-       LIST_HEAD(failed);
 
        /* Resend all requests through the MDS */
-       nfs_pageio_init_read(&pgio, inode, true, compl_ops);
-       pgio.pg_dreq = dreq;
-       while (!list_empty(head)) {
-               struct nfs_page *req = nfs_list_entry(head->next);
-
-               nfs_list_remove_request(req);
-               if (!nfs_pageio_add_request(&pgio, req))
-                       nfs_list_add_request(req, &failed);
-       }
-       nfs_pageio_complete(&pgio);
-
-       if (!list_empty(&failed)) {
-               list_move(&failed, head);
-               return -EIO;
-       }
-       return 0;
+       nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
+       return nfs_pageio_resend(&pgio, hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
 
-static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data)
+static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-
        dprintk("pnfs read error = %d\n", hdr->pnfs_error);
        if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
            PNFS_LAYOUTRET_ON_ERROR) {
                pnfs_return_layout(hdr->inode);
        }
        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
-               data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
-                                                       &hdr->pages,
-                                                       hdr->completion_ops,
-                                                       hdr->dreq);
+               hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
 }
 
 /*
  * Called by non rpc-based layout drivers
  */
-void pnfs_ld_read_done(struct nfs_pgio_data *data)
+void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-
-       trace_nfs4_pnfs_read(data, hdr->pnfs_error);
+       trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
        if (likely(!hdr->pnfs_error)) {
-               __nfs4_read_done_cb(data);
-               hdr->mds_ops->rpc_call_done(&data->task, data);
+               __nfs4_read_done_cb(hdr);
+               hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
        } else
-               pnfs_ld_handle_read_error(data);
-       hdr->mds_ops->rpc_release(data);
+               pnfs_ld_handle_read_error(hdr);
+       hdr->mds_ops->rpc_release(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
 
 static void
 pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
-               struct nfs_pgio_data *data)
+               struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-
        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                list_splice_tail_init(&hdr->pages, &desc->pg_list);
                nfs_pageio_reset_read_mds(desc);
                desc->pg_recoalesce = 1;
        }
-       nfs_pgio_data_release(data);
+       nfs_pgio_data_destroy(hdr);
 }
 
 /*
  * Call the appropriate parallel I/O subsystem read function.
  */
 static enum pnfs_try_status
-pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
+pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
                       const struct rpc_call_ops *call_ops,
                       struct pnfs_layout_segment *lseg)
 {
-       struct nfs_pgio_header *hdr = rdata->header;
        struct inode *inode = hdr->inode;
        struct nfs_server *nfss = NFS_SERVER(inode);
        enum pnfs_try_status trypnfs;
@@ -1715,9 +1668,9 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
        hdr->mds_ops = call_ops;
 
        dprintk("%s: Reading ino:%lu %u@%llu\n",
-               __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
+               __func__, inode->i_ino, hdr->args.count, hdr->args.offset);
 
-       trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
+       trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
        if (trypnfs != PNFS_NOT_ATTEMPTED)
                nfs_inc_stats(inode, NFSIOS_PNFS_READ);
        dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1727,52 +1680,46 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
 static void
 pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_data *data = hdr->data;
        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
        struct pnfs_layout_segment *lseg = desc->pg_lseg;
        enum pnfs_try_status trypnfs;
 
        desc->pg_lseg = NULL;
-       trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
+       trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
        if (trypnfs == PNFS_NOT_ATTEMPTED)
-               pnfs_read_through_mds(desc, data);
+               pnfs_read_through_mds(desc, hdr);
        pnfs_put_lseg(lseg);
 }
 
 static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
 {
        pnfs_put_lseg(hdr->lseg);
-       nfs_rw_header_free(hdr);
+       nfs_pgio_header_free(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
 
 int
 pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 {
-       struct nfs_rw_header *rhdr;
        struct nfs_pgio_header *hdr;
        int ret;
 
-       rhdr = nfs_rw_header_alloc(desc->pg_rw_ops);
-       if (!rhdr) {
+       hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+       if (!hdr) {
                desc->pg_completion_ops->error_cleanup(&desc->pg_list);
                ret = -ENOMEM;
                pnfs_put_lseg(desc->pg_lseg);
                desc->pg_lseg = NULL;
                return ret;
        }
-       hdr = &rhdr->header;
        nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
        hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
-       atomic_inc(&hdr->refcnt);
        ret = nfs_generic_pgio(desc, hdr);
        if (ret != 0) {
                pnfs_put_lseg(desc->pg_lseg);
                desc->pg_lseg = NULL;
        } else
                pnfs_do_read(desc, hdr);
-       if (atomic_dec_and_test(&hdr->refcnt))
-               hdr->completion_ops->completion(hdr);
        return ret;
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
@@ -1820,12 +1767,11 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
 EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
 
 void
-pnfs_set_layoutcommit(struct nfs_pgio_data *wdata)
+pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = wdata->header;
        struct inode *inode = hdr->inode;
        struct nfs_inode *nfsi = NFS_I(inode);
-       loff_t end_pos = wdata->mds_offset + wdata->res.count;
+       loff_t end_pos = hdr->mds_offset + hdr->res.count;
        bool mark_as_dirty = false;
 
        spin_lock(&inode->i_lock);
index 4fb309a2b4c48e871de3a13a0b60c8ce66e08c7e..aca3dff5dae63e3a5d41f3e43b7055e77372b841 100644 (file)
@@ -32,6 +32,7 @@
 
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
+#include <linux/workqueue.h>
 
 enum {
        NFS_LSEG_VALID = 0,     /* cleared when lseg is recalled/returned */
@@ -46,6 +47,7 @@ struct pnfs_layout_segment {
        atomic_t pls_refcount;
        unsigned long pls_flags;
        struct pnfs_layout_hdr *pls_layout;
+       struct work_struct pls_work;
 };
 
 enum pnfs_try_status {
@@ -104,6 +106,8 @@ struct pnfs_layoutdriver_type {
                                  int max);
        void (*recover_commit_reqs) (struct list_head *list,
                                     struct nfs_commit_info *cinfo);
+       struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
+                                               struct page *page);
        int (*commit_pagelist)(struct inode *inode,
                               struct list_head *mds_pages,
                               int how,
@@ -113,8 +117,8 @@ struct pnfs_layoutdriver_type {
         * Return PNFS_ATTEMPTED to indicate the layout code has attempted
         * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
         */
-       enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data);
-       enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how);
+       enum pnfs_try_status (*read_pagelist)(struct nfs_pgio_header *);
+       enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
 
        void (*free_deviceid_node) (struct nfs4_deviceid_node *);
 
@@ -179,6 +183,7 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 /* pnfs.c */
 void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
 void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
+void pnfs_put_lseg_async(struct pnfs_layout_segment *lseg);
 
 void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
 void unset_pnfs_layoutdriver(struct nfs_server *);
@@ -213,13 +218,13 @@ bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
-void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata);
+void pnfs_set_layoutcommit(struct nfs_pgio_header *);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
 int _pnfs_return_layout(struct inode *);
 int pnfs_commit_and_return_layout(struct inode *);
-void pnfs_ld_write_done(struct nfs_pgio_data *);
-void pnfs_ld_read_done(struct nfs_pgio_data *);
+void pnfs_ld_write_done(struct nfs_pgio_header *);
+void pnfs_ld_read_done(struct nfs_pgio_header *);
 struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
                                               struct nfs_open_context *ctx,
                                               loff_t pos,
@@ -228,12 +233,8 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
                                               gfp_t gfp_flags);
 
 void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
-int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head,
-                       const struct nfs_pgio_completion_ops *compl_ops,
-                       struct nfs_direct_req *dreq);
-int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head,
-                       const struct nfs_pgio_completion_ops *compl_ops,
-                       struct nfs_direct_req *dreq);
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
 
 /* nfs4_deviceid_flags */
@@ -345,6 +346,17 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
        NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
 }
 
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+                       struct page *page)
+{
+       struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+       if (ld == NULL || ld->search_commit_reqs == NULL)
+               return NULL;
+       return ld->search_commit_reqs(cinfo, page);
+}
+
 /* Should the pNFS client commit and return the layout upon a setattr */
 static inline bool
 pnfs_ld_layoutret_on_setattr(struct inode *inode)
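
search_commit_reqs is a new optional layoutdriver hook; the inline wrapper above returns NULL when the driver does not provide it, so callers need no per-driver checks. A hedged sketch of a driver-side implementation (driver name and private list invented for illustration; a real driver would walk its own commit buckets):

        /* Hypothetical layout driver hook: find the request covering @page
         * on the driver's private commit list. */
        static LIST_HEAD(mylayout_commit_list);         /* invented driver state */

        static struct nfs_page *
        mylayout_search_commit_reqs(struct nfs_commit_info *cinfo,
                                    struct page *page)
        {
                struct nfs_page *req;

                list_for_each_entry(req, &mylayout_commit_list, wb_list)
                        if (req->wb_page == page)
                                return req;
                return NULL;
        }

        /* wired up in the driver's pnfs_layoutdriver_type:
         *      .search_commit_reqs = mylayout_search_commit_reqs,
         */
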
@@ -410,6 +422,10 @@ static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 {
 }
 
+static inline void pnfs_put_lseg_async(struct pnfs_layout_segment *lseg)
+{
+}
+
 static inline int pnfs_return_layout(struct inode *ino)
 {
        return 0;
@@ -496,6 +512,13 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
 {
 }
 
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+                       struct page *page)
+{
+       return NULL;
+}
+
 static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
        return 0;
index c171ce1a8a3098f20f53d19c36f7dc35789efbe2..b09cc23d6f433bc5ea8aff6cfe68c3910c4f8319 100644 (file)
@@ -578,46 +578,49 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
        return 0;
 }
 
-static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-       struct inode *inode = data->header->inode;
+       struct inode *inode = hdr->inode;
 
        nfs_invalidate_atime(inode);
        if (task->tk_status >= 0) {
-               nfs_refresh_inode(inode, data->res.fattr);
+               nfs_refresh_inode(inode, hdr->res.fattr);
                /* Emulate the eof flag, which isn't normally needed in NFSv2
                 * as it is guaranteed to always return the file attributes
                 */
-               if (data->args.offset + data->res.count >= data->res.fattr->size)
-                       data->res.eof = 1;
+               if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)
+                       hdr->res.eof = 1;
        }
        return 0;
 }
 
-static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs_proc_read_setup(struct nfs_pgio_header *hdr,
+                               struct rpc_message *msg)
 {
        msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
 }
 
-static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task,
+                                    struct nfs_pgio_header *hdr)
 {
        rpc_call_start(task);
        return 0;
 }
 
-static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-       struct inode *inode = data->header->inode;
+       struct inode *inode = hdr->inode;
 
        if (task->tk_status >= 0)
-               nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+               nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
        return 0;
 }
 
-static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs_proc_write_setup(struct nfs_pgio_header *hdr,
+                                struct rpc_message *msg)
 {
        /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
-       data->args.stable = NFS_FILE_SYNC;
+       hdr->args.stable = NFS_FILE_SYNC;
        msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
 }
 
index e818a475ca64351f0ae00e2484c2e76a640b6bec..beff2769c5c587f9955ec55fd41444326a36c97e 100644 (file)
@@ -33,12 +33,12 @@ static const struct nfs_rw_ops nfs_rw_read_ops;
 
 static struct kmem_cache *nfs_rdata_cachep;
 
-static struct nfs_rw_header *nfs_readhdr_alloc(void)
+static struct nfs_pgio_header *nfs_readhdr_alloc(void)
 {
        return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
 }
 
-static void nfs_readhdr_free(struct nfs_rw_header *rhdr)
+static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
 {
        kmem_cache_free(nfs_rdata_cachep, rhdr);
 }
@@ -115,12 +115,6 @@ static void nfs_readpage_release(struct nfs_page *req)
 
                unlock_page(req->wb_page);
        }
-
-       dprintk("NFS: read done (%s/%Lu %d@%Ld)\n",
-                       req->wb_context->dentry->d_inode->i_sb->s_id,
-                       (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
-                       req->wb_bytes,
-                       (long long)req_offset(req));
        nfs_release_request(req);
 }
 
@@ -172,14 +166,15 @@ out:
        hdr->release(hdr);
 }
 
-static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg,
+static void nfs_initiate_read(struct nfs_pgio_header *hdr,
+                             struct rpc_message *msg,
                              struct rpc_task_setup *task_setup_data, int how)
 {
-       struct inode *inode = data->header->inode;
+       struct inode *inode = hdr->inode;
        int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
 
        task_setup_data->flags |= swap_flags;
-       NFS_PROTO(inode)->read_setup(data, msg);
+       NFS_PROTO(inode)->read_setup(hdr, msg);
 }
 
 static void
@@ -203,14 +198,15 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
  * This is the callback from RPC telling us whether a reply was
  * received or some error occurred (timeout or socket shutdown).
  */
-static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
+static int nfs_readpage_done(struct rpc_task *task,
+                            struct nfs_pgio_header *hdr,
                             struct inode *inode)
 {
-       int status = NFS_PROTO(inode)->read_done(task, data);
+       int status = NFS_PROTO(inode)->read_done(task, hdr);
        if (status != 0)
                return status;
 
-       nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, data->res.count);
+       nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count);
 
        if (task->tk_status == -ESTALE) {
                set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
@@ -219,34 +215,34 @@ static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
        return 0;
 }
 
-static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_readpage_retry(struct rpc_task *task,
+                              struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_args *argp = &data->args;
-       struct nfs_pgio_res  *resp = &data->res;
+       struct nfs_pgio_args *argp = &hdr->args;
+       struct nfs_pgio_res  *resp = &hdr->res;
 
        /* This is a short read! */
-       nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD);
+       nfs_inc_stats(hdr->inode, NFSIOS_SHORTREAD);
        /* Has the server at least made some progress? */
        if (resp->count == 0) {
-               nfs_set_pgio_error(data->header, -EIO, argp->offset);
+               nfs_set_pgio_error(hdr, -EIO, argp->offset);
                return;
        }
-       /* Yes, so retry the read at the end of the data */
-       data->mds_offset += resp->count;
+       /* Yes, so retry the read from where the server left off */
+       hdr->mds_offset += resp->count;
        argp->offset += resp->count;
        argp->pgbase += resp->count;
        argp->count -= resp->count;
        rpc_restart_call_prepare(task);
 }
 
-static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_readpage_result(struct rpc_task *task,
+                               struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-
-       if (data->res.eof) {
+       if (hdr->res.eof) {
                loff_t bound;
 
-               bound = data->args.offset + data->res.count;
+               bound = hdr->args.offset + hdr->res.count;
                spin_lock(&hdr->lock);
                if (bound < hdr->io_start + hdr->good_bytes) {
                        set_bit(NFS_IOHDR_EOF, &hdr->flags);
@@ -254,8 +250,8 @@ static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *dat
                        hdr->good_bytes = bound - hdr->io_start;
                }
                spin_unlock(&hdr->lock);
-       } else if (data->res.count != data->args.count)
-               nfs_readpage_retry(task, data);
+       } else if (hdr->res.count != hdr->args.count)
+               nfs_readpage_retry(task, hdr);
 }
 
 /*
@@ -404,7 +400,7 @@ out:
 int __init nfs_init_readpagecache(void)
 {
        nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
-                                            sizeof(struct nfs_rw_header),
+                                            sizeof(struct nfs_pgio_header),
                                             0, SLAB_HWCACHE_ALIGN,
                                             NULL);
        if (nfs_rdata_cachep == NULL)
index 084af1060d79e1b289f6989dd251eeab2e59f7f6..e4499d5b51e8f33a05fec43843c8648de834dfd7 100644 (file)
@@ -1027,8 +1027,7 @@ static bool nfs_auth_info_add(struct nfs_auth_info *auth_info,
                              rpc_authflavor_t flavor)
 {
        unsigned int i;
-       unsigned int max_flavor_len = (sizeof(auth_info->flavors) /
-                                      sizeof(auth_info->flavors[0]));
+       unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors);
 
        /* make sure this flavor isn't already in the list */
        for (i = 0; i < auth_info->flavor_len; i++) {
@@ -2180,7 +2179,7 @@ out_no_address:
        return -EINVAL;
 }
 
-#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
+#define NFS_REMOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
                | NFS_MOUNT_SECURE \
                | NFS_MOUNT_TCP \
                | NFS_MOUNT_VER3 \
@@ -2188,15 +2187,16 @@ out_no_address:
                | NFS_MOUNT_NONLM \
                | NFS_MOUNT_BROKEN_SUID \
                | NFS_MOUNT_STRICTLOCK \
-               | NFS_MOUNT_UNSHARED \
-               | NFS_MOUNT_NORESVPORT \
                | NFS_MOUNT_LEGACY_INTERFACE)
 
+#define NFS_MOUNT_CMP_FLAGMASK (NFS_REMOUNT_CMP_FLAGMASK & \
+               ~(NFS_MOUNT_UNSHARED | NFS_MOUNT_NORESVPORT))
+
 static int
 nfs_compare_remount_data(struct nfs_server *nfss,
                         struct nfs_parsed_mount_data *data)
 {
-       if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK ||
+       if ((data->flags ^ nfss->flags) & NFS_REMOUNT_CMP_FLAGMASK ||
            data->rsize != nfss->rsize ||
            data->wsize != nfss->wsize ||
            data->version != nfss->nfs_client->rpc_ops->version ||
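
The mask split makes the remount comparison stricter than the superblock-sharing comparison: NFS_MOUNT_UNSHARED and NFS_MOUNT_NORESVPORT stay ignored when deciding whether two mounts may share a super block, but they are now compared on remount, so an attempt to flip sharecache or resvport via remount fails the check above. Restated (hedged reading of the two masks):

        /* Bits compared = bits set in the mask:
         *
         *   NFS_REMOUNT_CMP_FLAGMASK   everything except the flags listed above
         *   NFS_MOUNT_CMP_FLAGMASK     the same, additionally ignoring
         *                              NFS_MOUNT_UNSHARED and NFS_MOUNT_NORESVPORT
         *
         * => remount rejects changes to "sharecache"/"resvport"; super-block
         *    sharing still tolerates them differing.
         */
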
index 962c9ee758be30e57141761bd29898a1df129009..e3b5cf28bdc5c2dbfba5d3c16b06b5724afe6c60 100644 (file)
@@ -47,6 +47,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
 static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
 static const struct nfs_rw_ops nfs_rw_write_ops;
 static void nfs_clear_request_commit(struct nfs_page *req);
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+                                     struct inode *inode);
 
 static struct kmem_cache *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
@@ -71,18 +73,18 @@ void nfs_commit_free(struct nfs_commit_data *p)
 }
 EXPORT_SYMBOL_GPL(nfs_commit_free);
 
-static struct nfs_rw_header *nfs_writehdr_alloc(void)
+static struct nfs_pgio_header *nfs_writehdr_alloc(void)
 {
-       struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
+       struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
 
        if (p)
                memset(p, 0, sizeof(*p));
        return p;
 }
 
-static void nfs_writehdr_free(struct nfs_rw_header *whdr)
+static void nfs_writehdr_free(struct nfs_pgio_header *hdr)
 {
-       mempool_free(whdr, nfs_wdata_mempool);
+       mempool_free(hdr, nfs_wdata_mempool);
 }
 
 static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
@@ -92,6 +94,38 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
        set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
 }
 
+/*
+ * nfs_page_search_commits_for_head_request_locked
+ *
+ * Search through the commit lists on @nfsi for the head request for @page.
+ * Must be called while holding the inode lock (which doubles as the cinfo
+ * lock).
+ *
+ * Returns the head request if found, or NULL if not found.
+ */
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+                                               struct page *page)
+{
+       struct nfs_page *freq, *t;
+       struct nfs_commit_info cinfo;
+       struct inode *inode = &nfsi->vfs_inode;
+
+       nfs_init_cinfo_from_inode(&cinfo, inode);
+
+       /* search through pnfs commit lists */
+       freq = pnfs_search_commit_reqs(inode, &cinfo, page);
+       if (freq)
+               return freq->wb_head;
+
+       /* Linearly search the commit list for the correct request */
+       list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
+               if (freq->wb_page == page)
+                       return freq->wb_head;
+       }
+
+       return NULL;
+}
+
 /*
  * nfs_page_find_head_request_locked - find head request associated with @page
  *
@@ -106,21 +140,12 @@ nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
 
        if (PagePrivate(page))
                req = (struct nfs_page *)page_private(page);
-       else if (unlikely(PageSwapCache(page))) {
-               struct nfs_page *freq, *t;
-
-               /* Linearly search the commit list for the correct req */
-               list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) {
-                       if (freq->wb_page == page) {
-                               req = freq->wb_head;
-                               break;
-                       }
-               }
-       }
+       else if (unlikely(PageSwapCache(page)))
+               req = nfs_page_search_commits_for_head_request_locked(nfsi,
+                       page);
 
        if (req) {
                WARN_ON_ONCE(req->wb_head != req);
-
                kref_get(&req->wb_kref);
        }
 
@@ -216,7 +241,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
        unsigned int pos = 0;
        unsigned int len = nfs_page_length(req->wb_page);
 
-       nfs_page_group_lock(req);
+       nfs_page_group_lock(req, true);
 
        do {
                tmp = nfs_page_group_search_locked(req->wb_head, pos);
@@ -379,8 +404,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
                subreq->wb_head = subreq;
                subreq->wb_this_page = subreq;
 
-               nfs_clear_request_commit(subreq);
-
                /* subreq is now totally disconnected from page group or any
                 * write / commit lists. last chance to wake any waiters */
                nfs_unlock_request(subreq);
@@ -456,7 +479,9 @@ try_again:
        }
 
        /* lock each request in the page group */
-       nfs_page_group_lock(head);
+       ret = nfs_page_group_lock(head, false);
+       if (ret < 0)
+               return ERR_PTR(ret);
        subreq = head;
        do {
                /*
@@ -488,7 +513,7 @@ try_again:
         * Commit list removal accounting is done after locks are dropped */
        subreq = head;
        do {
-               nfs_list_remove_request(subreq);
+               nfs_clear_request_commit(subreq);
                subreq = subreq->wb_this_page;
        } while (subreq != head);
 
@@ -518,15 +543,11 @@ try_again:
 
        nfs_page_group_unlock(head);
 
-       /* drop lock to clear_request_commit the head req and clean up
-        * requests on destroy list */
+       /* drop lock to clean up requests on destroy list */
        spin_unlock(&inode->i_lock);
 
        nfs_destroy_unlinked_subrequests(destroy_list, head);
 
-       /* clean up commit list state */
-       nfs_clear_request_commit(head);
-
        /* still holds ref on head from nfs_page_find_head_request_locked
         * and still has lock on head from lock loop */
        return head;
@@ -705,6 +726,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 
        if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
                nfs_release_request(req);
+       else
+               WARN_ON_ONCE(1);
 }
 
 static void
@@ -808,6 +831,7 @@ nfs_clear_page_commit(struct page *page)
        dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
 }
 
+/* Called with the inode lock held (it doubles as the cinfo lock) */
 static void
 nfs_clear_request_commit(struct nfs_page *req)
 {
@@ -817,20 +841,17 @@ nfs_clear_request_commit(struct nfs_page *req)
 
                nfs_init_cinfo_from_inode(&cinfo, inode);
                if (!pnfs_clear_request_commit(req, &cinfo)) {
-                       spin_lock(cinfo.lock);
                        nfs_request_remove_commit_list(req, &cinfo);
-                       spin_unlock(cinfo.lock);
                }
                nfs_clear_page_commit(req->wb_page);
        }
 }
 
-static inline
-int nfs_write_need_commit(struct nfs_pgio_data *data)
+int nfs_write_need_commit(struct nfs_pgio_header *hdr)
 {
-       if (data->verf.committed == NFS_DATA_SYNC)
-               return data->header->lseg == NULL;
-       return data->verf.committed != NFS_FILE_SYNC;
+       if (hdr->verf.committed == NFS_DATA_SYNC)
+               return hdr->lseg == NULL;
+       return hdr->verf.committed != NFS_FILE_SYNC;
 }
 
 #else
@@ -856,8 +877,7 @@ nfs_clear_request_commit(struct nfs_page *req)
 {
 }
 
-static inline
-int nfs_write_need_commit(struct nfs_pgio_data *data)
+int nfs_write_need_commit(struct nfs_pgio_header *hdr)
 {
        return 0;
 }
@@ -883,11 +903,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
                        nfs_context_set_write_error(req->wb_context, hdr->error);
                        goto remove_req;
                }
-               if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
-                       nfs_mark_request_dirty(req);
-                       goto next;
-               }
-               if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+               if (nfs_write_need_commit(hdr)) {
                        memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
                        nfs_mark_request_commit(req, hdr->lseg, &cinfo);
                        goto next;
@@ -1038,9 +1054,9 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
        else
                req->wb_bytes = rqend - req->wb_offset;
 out_unlock:
-       spin_unlock(&inode->i_lock);
        if (req)
                nfs_clear_request_commit(req);
+       spin_unlock(&inode->i_lock);
        return req;
 out_flushme:
        spin_unlock(&inode->i_lock);
@@ -1241,17 +1257,18 @@ static int flush_task_priority(int how)
        return RPC_PRIORITY_NORMAL;
 }
 
-static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg,
+static void nfs_initiate_write(struct nfs_pgio_header *hdr,
+                              struct rpc_message *msg,
                               struct rpc_task_setup *task_setup_data, int how)
 {
-       struct inode *inode = data->header->inode;
+       struct inode *inode = hdr->inode;
        int priority = flush_task_priority(how);
 
        task_setup_data->priority = priority;
-       NFS_PROTO(inode)->write_setup(data, msg);
+       NFS_PROTO(inode)->write_setup(hdr, msg);
 
        nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
-                                &task_setup_data->rpc_client, msg, data);
+                                &task_setup_data->rpc_client, msg, hdr);
 }
 
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -1313,21 +1330,9 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
        NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
 }
 
-static void nfs_writeback_release_common(struct nfs_pgio_data *data)
+static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-       int status = data->task.tk_status;
-
-       if ((status >= 0) && nfs_write_need_commit(data)) {
-               spin_lock(&hdr->lock);
-               if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags))
-                       ; /* Do nothing */
-               else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags))
-                       memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf));
-               else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf)))
-                       set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags);
-               spin_unlock(&hdr->lock);
-       }
+       /* do nothing! */
 }
 
 /*
@@ -1358,7 +1363,8 @@ static int nfs_should_remove_suid(const struct inode *inode)
 /*
  * This function is called when the WRITE call is complete.
  */
-static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
+static int nfs_writeback_done(struct rpc_task *task,
+                             struct nfs_pgio_header *hdr,
                              struct inode *inode)
 {
        int status;
@@ -1370,13 +1376,14 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
         * another writer had changed the file, but some applications
         * depend on tighter cache coherency when writing.
         */
-       status = NFS_PROTO(inode)->write_done(task, data);
+       status = NFS_PROTO(inode)->write_done(task, hdr);
        if (status != 0)
                return status;
-       nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count);
+       nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
 
 #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
-       if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) {
+       if (hdr->res.verf->committed < hdr->args.stable &&
+           task->tk_status >= 0) {
                /* We tried a write call, but the server did not
                 * commit data to stable storage even though we
                 * requested it.
@@ -1392,7 +1399,7 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
                        dprintk("NFS:       faulty NFS server %s:"
                                " (committed = %d) != (stable = %d)\n",
                                NFS_SERVER(inode)->nfs_client->cl_hostname,
-                               data->res.verf->committed, data->args.stable);
+                               hdr->res.verf->committed, hdr->args.stable);
                        complain = jiffies + 300 * HZ;
                }
        }
@@ -1407,16 +1414,17 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
 /*
  * This function is called when the WRITE call is complete.
  */
-static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_writeback_result(struct rpc_task *task,
+                                struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_args    *argp = &data->args;
-       struct nfs_pgio_res     *resp = &data->res;
+       struct nfs_pgio_args    *argp = &hdr->args;
+       struct nfs_pgio_res     *resp = &hdr->res;
 
        if (resp->count < argp->count) {
                static unsigned long    complain;
 
                /* This a short write! */
-               nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE);
+               nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE);
 
                /* Has the server at least made some progress? */
                if (resp->count == 0) {
@@ -1426,14 +1434,14 @@ static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *da
                                       argp->count);
                                complain = jiffies + 300 * HZ;
                        }
-                       nfs_set_pgio_error(data->header, -EIO, argp->offset);
+                       nfs_set_pgio_error(hdr, -EIO, argp->offset);
                        task->tk_status = -EIO;
                        return;
                }
                /* Was this an NFSv2 write or an NFSv3 stable write? */
                if (resp->verf->committed != NFS_UNSTABLE) {
                        /* Resend from where the server left off */
-                       data->mds_offset += resp->count;
+                       hdr->mds_offset += resp->count;
                        argp->offset += resp->count;
                        argp->pgbase += resp->count;
                        argp->count -= resp->count;
@@ -1884,7 +1892,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
 int __init nfs_init_writepagecache(void)
 {
        nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
-                                            sizeof(struct nfs_rw_header),
+                                            sizeof(struct nfs_pgio_header),
                                             0, SLAB_HWCACHE_ALIGN,
                                             NULL);
        if (nfs_wdata_cachep == NULL)
index ed628f71274c7d22a0cbc1fde941c79f710fc7ab..538f142935ea89b2c14c47e78119f61e56f3bc59 100644 (file)
@@ -30,9 +30,6 @@
 
 MODULE_LICENSE("GPL");
 
-EXPORT_SYMBOL_GPL(nfsacl_encode);
-EXPORT_SYMBOL_GPL(nfsacl_decode);
-
 struct nfsacl_encode_desc {
        struct xdr_array2_desc desc;
        unsigned int count;
@@ -136,6 +133,7 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
                          nfsacl_desc.desc.array_len;
        return err;
 }
+EXPORT_SYMBOL_GPL(nfsacl_encode);
 
 struct nfsacl_decode_desc {
        struct xdr_array2_desc desc;
@@ -295,3 +293,4 @@ int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
        return 8 + nfsacl_desc.desc.elem_size *
                   nfsacl_desc.desc.array_len;
 }
+EXPORT_SYMBOL_GPL(nfsacl_decode);
index c519927b7b5e8c955cf223cbd40924f648376ed2..228f5bdf07721dabae68349461eb1f24f99a2f33 100644 (file)
@@ -942,7 +942,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
                        iput(inode);
                }
        } else {
-               dentry = d_obtain_alias(inode);
+               dentry = d_obtain_root(inode);
                if (IS_ERR(dentry)) {
                        ret = PTR_ERR(dentry);
                        goto failed_dentry;
index 7f30bdc57d13be7a86bb2e06c20c0e78ea3aa056..f2d0eee9d1f1061399c30a5af36c204c2e3bf491 100644 (file)
  * Note that some things (eg. sb pointer, type, id) doesn't change during
  * the life of the dquot structure and so needn't to be protected by a lock
  *
- * Any operation working on dquots via inode pointers must hold dqptr_sem.  If
- * operation is just reading pointers from inode (or not using them at all) the
- * read lock is enough. If pointers are altered function must hold write lock.
+ * Operations accessing dquots via inode pointers are protected by dquot_srcu.
+ * Any operation reading the pointers needs srcu_read_lock(&dquot_srcu), and
+ * synchronize_srcu(&dquot_srcu) is called after clearing the pointers from
+ * an inode and before dropping the dquot references, so that dquots cannot
+ * be used after they are freed. dq_data_lock serializes the pointer setting
+ * and clearing operations.
  * Special care needs to be taken about S_NOQUOTA inode flag (marking that
  * inode is a quota file). Functions adding pointers from inode to dquots have
- * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they
- * have to do all pointer modifications before dropping dqptr_sem. This makes
+ * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they
+ * have to do all pointer modifications before dropping dq_data_lock. This makes
  * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
  * then drops all pointers to dquots from an inode.
  *
  * spinlock to internal buffers before writing.
  *
  * Lock ordering (including related VFS locks) is the following:
- *   dqonoff_mutex > i_mutex > journal_lock > dqptr_sem > dquot->dq_lock >
- *   dqio_mutex
+ *   dqonoff_mutex > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex
  * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc.
- * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem >
- * dqptr_sem. But filesystem has to count with the fact that functions such as
- * dquot_alloc_space() acquire dqptr_sem and they usually have to be called
- * from inside a transaction to keep filesystem consistency after a crash. Also
- * filesystems usually want to do some IO on dquot from ->mark_dirty which is
- * called with dqptr_sem held.
  */
 
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
 EXPORT_SYMBOL(dq_data_lock);
+DEFINE_STATIC_SRCU(dquot_srcu);
 
 void __quota_error(struct super_block *sb, const char *func,
                   const char *fmt, ...)
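
The rewritten comment describes a classic SRCU reader/updater split: readers dereference inode->i_dquot under srcu_read_lock(), updaters clear the pointers under dq_data_lock and then call synchronize_srcu() before dropping the references (drop_dquot_ref() below shows the updater side). A hedged sketch of the reader side, shape only and not a verbatim excerpt:

        /* Hedged reader-side sketch; a real reader would also take
         * dq_data_lock to read dq_dqb consistently. */
        static qsize_t example_read_cur_space(struct inode *inode)
        {
                struct dquot *dquot;
                qsize_t space = 0;
                int idx;

                idx = srcu_read_lock(&dquot_srcu);
                dquot = inode->i_dquot[USRQUOTA];
                if (dquot)      /* pointer cannot be freed until unlock */
                        space = dquot->dq_dqb.dqb_curspace;
                srcu_read_unlock(&dquot_srcu, idx);
                return space;
        }
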
@@ -733,7 +730,6 @@ static struct shrinker dqcache_shrinker = {
 
 /*
  * Put reference to dquot
- * NOTE: If you change this function please check whether dqput_blocks() works right...
  */
 void dqput(struct dquot *dquot)
 {
@@ -962,47 +958,34 @@ static void add_dquot_ref(struct super_block *sb, int type)
 #endif
 }
 
-/*
- * Return 0 if dqput() won't block.
- * (note that 1 doesn't necessarily mean blocking)
- */
-static inline int dqput_blocks(struct dquot *dquot)
-{
-       if (atomic_read(&dquot->dq_count) <= 1)
-               return 1;
-       return 0;
-}
-
 /*
  * Remove references to dquots from inode and add dquot to list for freeing
  * if we have the last reference to dquot
- * We can't race with anybody because we hold dqptr_sem for writing...
  */
-static int remove_inode_dquot_ref(struct inode *inode, int type,
-                                 struct list_head *tofree_head)
+static void remove_inode_dquot_ref(struct inode *inode, int type,
+                                  struct list_head *tofree_head)
 {
        struct dquot *dquot = inode->i_dquot[type];
 
        inode->i_dquot[type] = NULL;
-       if (dquot) {
-               if (dqput_blocks(dquot)) {
-#ifdef CONFIG_QUOTA_DEBUG
-                       if (atomic_read(&dquot->dq_count) != 1)
-                               quota_error(inode->i_sb, "Adding dquot with "
-                                           "dq_count %d to dispose list",
-                                           atomic_read(&dquot->dq_count));
-#endif
-                       spin_lock(&dq_list_lock);
-                       /* As dquot must have currently users it can't be on
-                        * the free list... */
-                       list_add(&dquot->dq_free, tofree_head);
-                       spin_unlock(&dq_list_lock);
-                       return 1;
-               }
-               else
-                       dqput(dquot);   /* We have guaranteed we won't block */
+       if (!dquot)
+               return;
+
+       if (list_empty(&dquot->dq_free)) {
+               /*
+                * The inode still holds a reference to the dquot, so it
+                * can't be on the free list
+                */
+               spin_lock(&dq_list_lock);
+               list_add(&dquot->dq_free, tofree_head);
+               spin_unlock(&dq_list_lock);
+       } else {
+               /*
+                * The dquot is already on a list to be put, so dqput() here
+                * won't drop the last reference.
+                */
+               dqput(dquot);
        }
-       return 0;
 }
 
 /*
@@ -1037,13 +1020,15 @@ static void remove_dquot_ref(struct super_block *sb, int type,
                 *  We have to scan also I_NEW inodes because they can already
                 *  have quota pointer initialized. Luckily, we need to touch
                 *  only quota pointers and these have separate locking
-                *  (dqptr_sem).
+                *  (dq_data_lock).
                 */
+               spin_lock(&dq_data_lock);
                if (!IS_NOQUOTA(inode)) {
                        if (unlikely(inode_get_rsv_space(inode) > 0))
                                reserved = 1;
                        remove_inode_dquot_ref(inode, type, tofree_head);
                }
+               spin_unlock(&dq_data_lock);
        }
        spin_unlock(&inode_sb_list_lock);
 #ifdef CONFIG_QUOTA_DEBUG
@@ -1061,9 +1046,8 @@ static void drop_dquot_ref(struct super_block *sb, int type)
        LIST_HEAD(tofree_head);
 
        if (sb->dq_op) {
-               down_write(&sb_dqopt(sb)->dqptr_sem);
                remove_dquot_ref(sb, type, &tofree_head);
-               up_write(&sb_dqopt(sb)->dqptr_sem);
+               synchronize_srcu(&dquot_srcu);
                put_dquot_list(&tofree_head);
        }
 }
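
The dispose-list handling above uses the dquot's own dq_free list_head as
an "already queued" flag: while a dquot is live, its dq_free stays unlinked
(empty). A generic sketch of the idiom, with placeholder names:

	if (list_empty(&obj->free_node)) {
		/* first encounter: park it and keep our reference */
		spin_lock(&free_lock);
		list_add(&obj->free_node, tofree_head);
		spin_unlock(&free_lock);
	} else {
		/* already parked, so this put cannot be the last one */
		put_obj(obj);
	}
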
@@ -1394,21 +1378,16 @@ static int dquot_active(const struct inode *inode)
 /*
  * Initialize quota pointers in inode
  *
- * We do things in a bit complicated way but by that we avoid calling
- * dqget() and thus filesystem callbacks under dqptr_sem.
- *
  * It is better to call this function outside of any transaction as it
  * might need a lot of space in journal for dquot structure allocation.
  */
 static void __dquot_initialize(struct inode *inode, int type)
 {
-       int cnt;
+       int cnt, init_needed = 0;
        struct dquot *got[MAXQUOTAS];
        struct super_block *sb = inode->i_sb;
        qsize_t rsv;
 
-       /* First test before acquiring mutex - solves deadlocks when we
-         * re-enter the quota code and are already holding the mutex */
        if (!dquot_active(inode))
                return;
 
@@ -1418,6 +1397,15 @@ static void __dquot_initialize(struct inode *inode, int type)
                got[cnt] = NULL;
                if (type != -1 && cnt != type)
                        continue;
+               /*
+                * In most cases i_dquot is already initialized; check it
+                * without locking to avoid unnecessary dqget()/dqput()
+                * calls.
+                */
+               if (inode->i_dquot[cnt])
+                       continue;
+               init_needed = 1;
+
                switch (cnt) {
                case USRQUOTA:
                        qid = make_kqid_uid(inode->i_uid);
@@ -1429,7 +1417,11 @@ static void __dquot_initialize(struct inode *inode, int type)
                got[cnt] = dqget(sb, qid);
        }
 
-       down_write(&sb_dqopt(sb)->dqptr_sem);
+       /* All required i_dquot entries have been initialized */
+       if (!init_needed)
+               return;
+
+       spin_lock(&dq_data_lock);
        if (IS_NOQUOTA(inode))
                goto out_err;
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1449,15 +1441,12 @@ static void __dquot_initialize(struct inode *inode, int type)
                         * did a write before quota was turned on
                         */
                        rsv = inode_get_rsv_space(inode);
-                       if (unlikely(rsv)) {
-                               spin_lock(&dq_data_lock);
+                       if (unlikely(rsv))
                                dquot_resv_space(inode->i_dquot[cnt], rsv);
-                               spin_unlock(&dq_data_lock);
-                       }
                }
        }
 out_err:
-       up_write(&sb_dqopt(sb)->dqptr_sem);
+       spin_unlock(&dq_data_lock);
        /* Drop unused references */
        dqput_all(got);
 }
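
__dquot_initialize() above now follows the usual optimistic-initialization
shape: test the pointer without the lock, allocate outside the lock,
recheck and install under the lock, and drop whatever proved unnecessary.
In outline (placeholder names, not code from the tree):

	obj = slot;			/* unlocked fast-path check */
	if (obj)
		return;
	newobj = alloc_obj();		/* may sleep; no lock held */
	spin_lock(&lock);
	if (!slot) {			/* recheck under the lock */
		slot = newobj;
		newobj = NULL;
	}
	spin_unlock(&lock);
	if (newobj)			/* somebody beat us to it */
		free_obj(newobj);
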
@@ -1469,19 +1458,24 @@ void dquot_initialize(struct inode *inode)
 EXPORT_SYMBOL(dquot_initialize);
 
 /*
- *     Release all quotas referenced by inode
+ * Release all quotas referenced by inode.
+ *
+ * This function can only be called when the inode is freed or when a
+ * file is being converted to a quota file. In both cases there are no
+ * other users of i_dquot, so we need not call synchronize_srcu() after
+ * clearing it.
  */
 static void __dquot_drop(struct inode *inode)
 {
        int cnt;
        struct dquot *put[MAXQUOTAS];
 
-       down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+       spin_lock(&dq_data_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                put[cnt] = inode->i_dquot[cnt];
                inode->i_dquot[cnt] = NULL;
        }
-       up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+       spin_unlock(&dq_data_lock);
        dqput_all(put);
 }
 
@@ -1599,15 +1593,11 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
  */
 int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 {
-       int cnt, ret = 0;
+       int cnt, ret = 0, index;
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot **dquots = inode->i_dquot;
        int reserve = flags & DQUOT_SPACE_RESERVE;
 
-       /*
-        * First test before acquiring mutex - solves deadlocks when we
-        * re-enter the quota code and are already holding the mutex
-        */
        if (!dquot_active(inode)) {
                inode_incr_space(inode, number, reserve);
                goto out;
@@ -1616,7 +1606,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                warn[cnt].w_type = QUOTA_NL_NOWARN;
 
-       down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+       index = srcu_read_lock(&dquot_srcu);
        spin_lock(&dq_data_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (!dquots[cnt])
@@ -1643,7 +1633,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
                goto out_flush_warn;
        mark_all_dquot_dirty(dquots);
 out_flush_warn:
-       up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+       srcu_read_unlock(&dquot_srcu, index);
        flush_warnings(warn);
 out:
        return ret;
@@ -1655,17 +1645,16 @@ EXPORT_SYMBOL(__dquot_alloc_space);
  */
 int dquot_alloc_inode(const struct inode *inode)
 {
-       int cnt, ret = 0;
+       int cnt, ret = 0, index;
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot * const *dquots = inode->i_dquot;
 
-       /* First test before acquiring mutex - solves deadlocks when we
-         * re-enter the quota code and are already holding the mutex */
        if (!dquot_active(inode))
                return 0;
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                warn[cnt].w_type = QUOTA_NL_NOWARN;
-       down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+
+       index = srcu_read_lock(&dquot_srcu);
        spin_lock(&dq_data_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (!dquots[cnt])
@@ -1685,7 +1674,7 @@ warn_put_all:
        spin_unlock(&dq_data_lock);
        if (ret == 0)
                mark_all_dquot_dirty(dquots);
-       up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+       srcu_read_unlock(&dquot_srcu, index);
        flush_warnings(warn);
        return ret;
 }
@@ -1696,14 +1685,14 @@ EXPORT_SYMBOL(dquot_alloc_inode);
  */
 int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
 {
-       int cnt;
+       int cnt, index;
 
        if (!dquot_active(inode)) {
                inode_claim_rsv_space(inode, number);
                return 0;
        }
 
-       down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+       index = srcu_read_lock(&dquot_srcu);
        spin_lock(&dq_data_lock);
        /* Claim reserved quotas to allocated quotas */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1715,7 +1704,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
        inode_claim_rsv_space(inode, number);
        spin_unlock(&dq_data_lock);
        mark_all_dquot_dirty(inode->i_dquot);
-       up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+       srcu_read_unlock(&dquot_srcu, index);
        return 0;
 }
 EXPORT_SYMBOL(dquot_claim_space_nodirty);
@@ -1725,14 +1714,14 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
  */
 void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
 {
-       int cnt;
+       int cnt, index;
 
        if (!dquot_active(inode)) {
                inode_reclaim_rsv_space(inode, number);
                return;
        }
 
-       down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+       index = srcu_read_lock(&dquot_srcu);
        spin_lock(&dq_data_lock);
        /* Return allocated quotas back to reserved ones */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1744,7 +1733,7 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
        inode_reclaim_rsv_space(inode, number);
        spin_unlock(&dq_data_lock);
        mark_all_dquot_dirty(inode->i_dquot);
-       up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+       srcu_read_unlock(&dquot_srcu, index);
        return;
 }
 EXPORT_SYMBOL(dquot_reclaim_space_nodirty);
@@ -1757,16 +1746,14 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
        unsigned int cnt;
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot **dquots = inode->i_dquot;
-       int reserve = flags & DQUOT_SPACE_RESERVE;
+       int reserve = flags & DQUOT_SPACE_RESERVE, index;
 
-       /* First test before acquiring mutex - solves deadlocks when we
-         * re-enter the quota code and are already holding the mutex */
        if (!dquot_active(inode)) {
                inode_decr_space(inode, number, reserve);
                return;
        }
 
-       down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+       index = srcu_read_lock(&dquot_srcu);
        spin_lock(&dq_data_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                int wtype;
@@ -1789,7 +1776,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
                goto out_unlock;
        mark_all_dquot_dirty(dquots);
 out_unlock:
-       up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+       srcu_read_unlock(&dquot_srcu, index);
        flush_warnings(warn);
 }
 EXPORT_SYMBOL(__dquot_free_space);
@@ -1802,13 +1789,12 @@ void dquot_free_inode(const struct inode *inode)
        unsigned int cnt;
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot * const *dquots = inode->i_dquot;
+       int index;
 
-       /* First test before acquiring mutex - solves deadlocks when we
-         * re-enter the quota code and are already holding the mutex */
        if (!dquot_active(inode))
                return;
 
-       down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+       index = srcu_read_lock(&dquot_srcu);
        spin_lock(&dq_data_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                int wtype;
@@ -1823,7 +1809,7 @@ void dquot_free_inode(const struct inode *inode)
        }
        spin_unlock(&dq_data_lock);
        mark_all_dquot_dirty(dquots);
-       up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+       srcu_read_unlock(&dquot_srcu, index);
        flush_warnings(warn);
 }
 EXPORT_SYMBOL(dquot_free_inode);
@@ -1837,6 +1823,8 @@ EXPORT_SYMBOL(dquot_free_inode);
  * This operation can block, but only after everything is updated
  * A transaction must be started when entering this function.
  *
+ * We hold references on transfer_from and transfer_to, so there is no
+ * need to protect them with srcu_read_lock().
  */
 int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 {
@@ -1849,8 +1837,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
        struct dquot_warn warn_from_inodes[MAXQUOTAS];
        struct dquot_warn warn_from_space[MAXQUOTAS];
 
-       /* First test before acquiring mutex - solves deadlocks when we
-         * re-enter the quota code and are already holding the mutex */
        if (IS_NOQUOTA(inode))
                return 0;
        /* Initialize the arrays */
@@ -1859,12 +1845,12 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
                warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN;
                warn_from_space[cnt].w_type = QUOTA_NL_NOWARN;
        }
-       down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+
+       spin_lock(&dq_data_lock);
        if (IS_NOQUOTA(inode)) {        /* File without quota accounting? */
-               up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+               spin_unlock(&dq_data_lock);
                return 0;
        }
-       spin_lock(&dq_data_lock);
        cur_space = inode_get_bytes(inode);
        rsv_space = inode_get_rsv_space(inode);
        space = cur_space + rsv_space;
@@ -1918,7 +1904,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
                inode->i_dquot[cnt] = transfer_to[cnt];
        }
        spin_unlock(&dq_data_lock);
-       up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 
        mark_all_dquot_dirty(transfer_from);
        mark_all_dquot_dirty(transfer_to);
@@ -1932,7 +1917,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
        return 0;
 over_quota:
        spin_unlock(&dq_data_lock);
-       up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
        flush_warnings(warn_to);
        return ret;
 }
index 2f97b0e2c501d37ab45d600ec0870bd4985f5546..ebc5e62858007d8f9ed6acee5f6aef2afb28c9c0 100644 (file)
@@ -55,7 +55,7 @@ EXPORT_SYMBOL(qid_lt);
 /**
  *     from_kqid - Create a qid from a kqid user-namespace pair.
  *     @targ: The user namespace we want a qid in.
- *     @kuid: The kernel internal quota identifier to start with.
+ *     @kqid: The kernel internal quota identifier to start with.
  *
  *     Map @kqid into the user-namespace specified by @targ and
  *     return the resulting qid.
index 72d29177998ebbf22e9888c9cd4cf43b3cc37e91..bb2869f5dfd89528f6986528369438a8cc565efd 100644 (file)
@@ -32,8 +32,7 @@ static struct genl_family quota_genl_family = {
 
 /**
  * quota_send_warning - Send warning to userspace about exceeded quota
- * @type: The quota type: USRQQUOTA, GRPQUOTA,...
- * @id: The user or group id of the quota that was exceeded
+ * @qid: The kernel internal quota identifier.
  * @dev: The device on which the fs is mounted (sb->s_dev)
  * @warntype: The type of the warning: QUOTA_NL_...
  *
index ff3f0b3cfdb31b97183d09e8d1428cd55fd61a12..75621649dbd76dd997aa96462d5d2b6c2124e5a9 100644 (file)
@@ -79,13 +79,13 @@ static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
 {
        __u32 fmt;
 
-       down_read(&sb_dqopt(sb)->dqptr_sem);
+       mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
        if (!sb_has_quota_active(sb, type)) {
-               up_read(&sb_dqopt(sb)->dqptr_sem);
+               mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
                return -ESRCH;
        }
        fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
-       up_read(&sb_dqopt(sb)->dqptr_sem);
+       mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
        if (copy_to_user(addr, &fmt, sizeof(fmt)))
                return -EFAULT;
        return 0;
index 5739cb99de7bdf7a1e3edf39a9e69d81016daa60..9c02d96d3a424cefd997471c1e0b1fdc0407495d 100644 (file)
@@ -286,12 +286,14 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
        return 0;
 }
 
-static void balance_leaf_insert_left(struct tree_balance *tb,
-                                    struct item_head *ih, const char *body)
+static unsigned int balance_leaf_insert_left(struct tree_balance *tb,
+                                            struct item_head *const ih,
+                                            const char * const body)
 {
        int ret;
        struct buffer_info bi;
        int n = B_NR_ITEMS(tb->L[0]);
+       unsigned body_shift_bytes = 0;
 
        if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
                /* part of new item falls into L[0] */
@@ -329,7 +331,7 @@ static void balance_leaf_insert_left(struct tree_balance *tb,
 
                put_ih_item_len(ih, new_item_len);
                if (tb->lbytes > tb->zeroes_num) {
-                       body += (tb->lbytes - tb->zeroes_num);
+                       body_shift_bytes = tb->lbytes - tb->zeroes_num;
                        tb->zeroes_num = 0;
                } else
                        tb->zeroes_num -= tb->lbytes;
@@ -349,11 +351,12 @@ static void balance_leaf_insert_left(struct tree_balance *tb,
                tb->insert_size[0] = 0;
                tb->zeroes_num = 0;
        }
+       return body_shift_bytes;
 }
 
 static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
-                                                struct item_head *ih,
-                                                const char *body)
+                                                struct item_head * const ih,
+                                                const char * const body)
 {
        int n = B_NR_ITEMS(tb->L[0]);
        struct buffer_info bi;
@@ -413,17 +416,18 @@ static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
        tb->pos_in_item -= tb->lbytes;
 }
 
-static void balance_leaf_paste_left_shift(struct tree_balance *tb,
-                                         struct item_head *ih,
-                                         const char *body)
+static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb,
+                                                 struct item_head * const ih,
+                                                 const char * const body)
 {
        struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
        int n = B_NR_ITEMS(tb->L[0]);
        struct buffer_info bi;
+       int body_shift_bytes = 0;
 
        if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
                balance_leaf_paste_left_shift_dirent(tb, ih, body);
-               return;
+               return 0;
        }
 
        RFALSE(tb->lbytes <= 0,
@@ -497,7 +501,7 @@ static void balance_leaf_paste_left_shift(struct tree_balance *tb,
                 * insert_size[0]
                 */
                if (l_n > tb->zeroes_num) {
-                       body += (l_n - tb->zeroes_num);
+                       body_shift_bytes = l_n - tb->zeroes_num;
                        tb->zeroes_num = 0;
                } else
                        tb->zeroes_num -= l_n;
@@ -526,13 +530,14 @@ static void balance_leaf_paste_left_shift(struct tree_balance *tb,
                 */
                leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
        }
+       return body_shift_bytes;
 }
 
 
 /* appended item will be in L[0] in whole */
 static void balance_leaf_paste_left_whole(struct tree_balance *tb,
-                                         struct item_head *ih,
-                                         const char *body)
+                                         struct item_head * const ih,
+                                         const char * const body)
 {
        struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
        int n = B_NR_ITEMS(tb->L[0]);
@@ -584,39 +589,44 @@ static void balance_leaf_paste_left_whole(struct tree_balance *tb,
        tb->zeroes_num = 0;
 }
 
-static void balance_leaf_paste_left(struct tree_balance *tb,
-                                   struct item_head *ih, const char *body)
+static unsigned int balance_leaf_paste_left(struct tree_balance *tb,
+                                           struct item_head * const ih,
+                                           const char * const body)
 {
        /* we must shift the part of the appended item */
        if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1)
-               balance_leaf_paste_left_shift(tb, ih, body);
+               return balance_leaf_paste_left_shift(tb, ih, body);
        else
                balance_leaf_paste_left_whole(tb, ih, body);
+       return 0;
 }
 
 /* Shift lnum[0] items from S[0] to the left neighbor L[0] */
-static void balance_leaf_left(struct tree_balance *tb, struct item_head *ih,
-                             const char *body, int flag)
+static unsigned int balance_leaf_left(struct tree_balance *tb,
+                                     struct item_head * const ih,
+                                     const char * const body, int flag)
 {
        if (tb->lnum[0] <= 0)
-               return;
+               return 0;
 
        /* new item or it part falls to L[0], shift it too */
        if (tb->item_pos < tb->lnum[0]) {
                BUG_ON(flag != M_INSERT && flag != M_PASTE);
 
                if (flag == M_INSERT)
-                       balance_leaf_insert_left(tb, ih, body);
+                       return balance_leaf_insert_left(tb, ih, body);
                else /* M_PASTE */
-                       balance_leaf_paste_left(tb, ih, body);
+                       return balance_leaf_paste_left(tb, ih, body);
        } else
                /* new item doesn't fall into L[0] */
                leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+       return 0;
 }
 
 
 static void balance_leaf_insert_right(struct tree_balance *tb,
-                                     struct item_head *ih, const char *body)
+                                     struct item_head * const ih,
+                                     const char * const body)
 {
 
        struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
@@ -704,7 +714,8 @@ static void balance_leaf_insert_right(struct tree_balance *tb,
 
 
 static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
-                                    struct item_head *ih, const char *body)
+                                    struct item_head * const ih,
+                                    const char * const body)
 {
        struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
        struct buffer_info bi;
@@ -754,7 +765,8 @@ static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
 }
 
 static void balance_leaf_paste_right_shift(struct tree_balance *tb,
-                                    struct item_head *ih, const char *body)
+                                    struct item_head * const ih,
+                                    const char * const body)
 {
        struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
        int n_shift, n_rem, r_zeroes_number, version;
@@ -831,7 +843,8 @@ static void balance_leaf_paste_right_shift(struct tree_balance *tb,
 }
 
 static void balance_leaf_paste_right_whole(struct tree_balance *tb,
-                                    struct item_head *ih, const char *body)
+                                    struct item_head * const ih,
+                                    const char * const body)
 {
        struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
        int n = B_NR_ITEMS(tbS0);
@@ -874,7 +887,8 @@ static void balance_leaf_paste_right_whole(struct tree_balance *tb,
 }
 
 static void balance_leaf_paste_right(struct tree_balance *tb,
-                                    struct item_head *ih, const char *body)
+                                    struct item_head * const ih,
+                                    const char * const body)
 {
        struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
        int n = B_NR_ITEMS(tbS0);
@@ -896,8 +910,9 @@ static void balance_leaf_paste_right(struct tree_balance *tb,
 }
 
 /* shift rnum[0] items from S[0] to the right neighbor R[0] */
-static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih,
-                              const char *body, int flag)
+static void balance_leaf_right(struct tree_balance *tb,
+                              struct item_head * const ih,
+                              const char * const body, int flag)
 {
        if (tb->rnum[0] <= 0)
                return;
@@ -911,8 +926,8 @@ static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih,
 }
 
 static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
-                                         struct item_head *ih,
-                                         const char *body,
+                                         struct item_head * const ih,
+                                         const char * const body,
                                          struct item_head *insert_key,
                                          struct buffer_head **insert_ptr,
                                          int i)
@@ -1003,8 +1018,8 @@ static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
 
 /* we append to directory item */
 static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
-                                        struct item_head *ih,
-                                        const char *body,
+                                        struct item_head * const ih,
+                                        const char * const body,
                                         struct item_head *insert_key,
                                         struct buffer_head **insert_ptr,
                                         int i)
@@ -1058,8 +1073,8 @@ static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
 }
 
 static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
-                                        struct item_head *ih,
-                                        const char *body,
+                                        struct item_head * const ih,
+                                        const char * const body,
                                         struct item_head *insert_key,
                                         struct buffer_head **insert_ptr,
                                         int i)
@@ -1131,8 +1146,8 @@ static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
 }
 
 static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
-                                              struct item_head *ih,
-                                              const char *body,
+                                              struct item_head * const ih,
+                                              const char * const body,
                                               struct item_head *insert_key,
                                               struct buffer_head **insert_ptr,
                                               int i)
@@ -1184,8 +1199,8 @@ static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
 
 }
 static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
-                                        struct item_head *ih,
-                                        const char *body,
+                                        struct item_head * const ih,
+                                        const char * const body,
                                         struct item_head *insert_key,
                                         struct buffer_head **insert_ptr,
                                         int i)
@@ -1214,8 +1229,8 @@ static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
 
 /* Fill new nodes that appear in place of S[0] */
 static void balance_leaf_new_nodes(struct tree_balance *tb,
-                                  struct item_head *ih,
-                                  const char *body,
+                                  struct item_head * const ih,
+                                  const char * const body,
                                   struct item_head *insert_key,
                                   struct buffer_head **insert_ptr,
                                   int flag)
@@ -1254,8 +1269,8 @@ static void balance_leaf_new_nodes(struct tree_balance *tb,
 }
 
 static void balance_leaf_finish_node_insert(struct tree_balance *tb,
-                                           struct item_head *ih,
-                                           const char *body)
+                                           struct item_head * const ih,
+                                           const char * const body)
 {
        struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
        struct buffer_info bi;
@@ -1271,8 +1286,8 @@ static void balance_leaf_finish_node_insert(struct tree_balance *tb,
 }
 
 static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
-                                                 struct item_head *ih,
-                                                 const char *body)
+                                                 struct item_head * const ih,
+                                                 const char * const body)
 {
        struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
        struct item_head *pasted = item_head(tbS0, tb->item_pos);
@@ -1305,8 +1320,8 @@ static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
 }
 
 static void balance_leaf_finish_node_paste(struct tree_balance *tb,
-                                          struct item_head *ih,
-                                          const char *body)
+                                          struct item_head * const ih,
+                                          const char * const body)
 {
        struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
        struct buffer_info bi;
@@ -1349,8 +1364,8 @@ static void balance_leaf_finish_node_paste(struct tree_balance *tb,
  * of the affected item which remains in S
  */
 static void balance_leaf_finish_node(struct tree_balance *tb,
-                                     struct item_head *ih,
-                                     const char *body, int flag)
+                                     struct item_head * const ih,
+                                     const char * const body, int flag)
 {
        /* if we must insert or append into buffer S[0] */
        if (0 <= tb->item_pos && tb->item_pos < tb->s0num) {
@@ -1402,7 +1417,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,
            && is_indirect_le_ih(item_head(tbS0, tb->item_pos)))
                tb->pos_in_item *= UNFM_P_SIZE;
 
-       balance_leaf_left(tb, ih, body, flag);
+       body += balance_leaf_left(tb, ih, body, flag);
 
        /* tb->lnum[0] > 0 */
        /* Calculate new item position */
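
The do_balance.c rework above lets "body" become const by making the
left-shift helpers report how many bytes of the item body they consumed
instead of advancing the pointer themselves; the single caller applies the
advance once. Condensed (placeholder names):

	/* helper: uses body[0..n) and tells the caller how far to skip */
	static unsigned int use_prefix(const char * const body, unsigned int n)
	{
		/* ... copy or insert body[0..n) ... */
		return n;
	}

	body += use_prefix(body, n);	/* the caller owns the cursor */
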
index e8870de4627e6d49cbbab6ed42ea5bd49709a236..a88b1b3e7db3e4f4f5c514062586dbf7a62fa21e 100644 (file)
@@ -1947,8 +1947,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
                }
        }
 
-       /* wait for all commits to finish */
-       cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
 
        /*
         * We must release the write lock here because
@@ -1956,8 +1954,14 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
         */
        reiserfs_write_unlock(sb);
 
+       /*
+        * Cancel flushing of old commits. Note that neither of these work
+        * items will be requeued because the superblock is being shut down
+        * and does not have MS_ACTIVE set.
+        */
        cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
-       flush_workqueue(REISERFS_SB(sb)->commit_wq);
+       /* wait for all commits to finish */
+       cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
 
        free_journal_ram(sb);
 
@@ -4292,9 +4296,15 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
        if (flush) {
                flush_commit_list(sb, jl, 1);
                flush_journal_list(sb, jl, 1);
-       } else if (!(jl->j_state & LIST_COMMIT_PENDING))
-               queue_delayed_work(REISERFS_SB(sb)->commit_wq,
-                                  &journal->j_work, HZ / 10);
+       } else if (!(jl->j_state & LIST_COMMIT_PENDING)) {
+               /*
+                * Avoid queueing work when the sb is being shut down. The
+                * transaction will be flushed on journal shutdown.
+                */
+               if (sb->s_flags & MS_ACTIVE)
+                       queue_delayed_work(REISERFS_SB(sb)->commit_wq,
+                                          &journal->j_work, HZ / 10);
+       }
 
        /*
         * if the next transaction has any chance of wrapping, flush
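
The journal shutdown fix relies on a small handshake: delayed work is only
(re)queued while the superblock still has MS_ACTIVE set, and teardown uses
the _sync cancel so a callback that has already started is waited for. In
outline (placeholder names; commit_wq stands in for the real workqueue):

	static void commit_fn(struct work_struct *w);
	static DECLARE_DELAYED_WORK(commit_work, commit_fn);

	static void maybe_queue(struct super_block *sb)
	{
		if (sb->s_flags & MS_ACTIVE)	/* no requeue during shutdown */
			queue_delayed_work(commit_wq, &commit_work, HZ / 10);
	}

	static void shutdown(void)
	{
		/* MS_ACTIVE is already clear: cancel, waiting if running */
		cancel_delayed_work_sync(&commit_work);
	}
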
index 814dda3ec998d770c9a0e50420599002b9353cc0..249594a821e0a5e8c60d96c7cad0cd841fccd70b 100644 (file)
@@ -899,8 +899,9 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
 
 /* insert item into the leaf node in position before */
 void leaf_insert_into_buf(struct buffer_info *bi, int before,
-                         struct item_head *inserted_item_ih,
-                         const char *inserted_item_body, int zeros_number)
+                         struct item_head * const inserted_item_ih,
+                         const char * const inserted_item_body,
+                         int zeros_number)
 {
        struct buffer_head *bh = bi->bi_bh;
        int nr, free_space;
index bf53888c7f59a677081c1e84425b9a236affccbc..735c2c2b4536b0546e728b38cf663e310b3ac45c 100644 (file)
@@ -3216,11 +3216,12 @@ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes);
 void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first,
                       int del_num, int del_bytes);
 void leaf_insert_into_buf(struct buffer_info *bi, int before,
-                         struct item_head *inserted_item_ih,
-                         const char *inserted_item_body, int zeros_number);
-void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
-                         int pos_in_item, int paste_size, const char *body,
+                         struct item_head * const inserted_item_ih,
+                         const char * const inserted_item_body,
                          int zeros_number);
+void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
+                         int pos_in_item, int paste_size,
+                         const char * const body, int zeros_number);
 void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
                          int pos_in_item, int cut_size);
 void leaf_paste_entries(struct buffer_info *bi, int item_num, int before,
index 709ea92d716f03ca78b7e45b6f2677ac50b3ba1d..d46e88a33b02451d2cffc955a64c6752fab7c2cd 100644 (file)
@@ -100,7 +100,11 @@ void reiserfs_schedule_old_flush(struct super_block *s)
        struct reiserfs_sb_info *sbi = REISERFS_SB(s);
        unsigned long delay;
 
-       if (s->s_flags & MS_RDONLY)
+       /*
+        * Avoid scheduling a flush while the sb is being shut down: it can
+        * race with journal shutdown and free still-queued delayed work.
+        */
+       if (s->s_flags & MS_RDONLY || !(s->s_flags & MS_ACTIVE))
                return;
 
        spin_lock(&sbi->old_work_lock);
index d20d5b11dedf80a7d6c2f39872e6a892b21d7206..b9a214d2fe98b8b37a7560ebf1a7d1dc7c5d83e0 100644 (file)
@@ -22,7 +22,6 @@
 
 #include <linux/export.h>
 #include <linux/slab.h>
-#include <linux/acct.h>
 #include <linux/blkdev.h>
 #include <linux/mount.h>
 #include <linux/security.h>
@@ -218,7 +217,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
        lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
        mutex_init(&s->s_dquot.dqio_mutex);
        mutex_init(&s->s_dquot.dqonoff_mutex);
-       init_rwsem(&s->s_dquot.dqptr_sem);
        s->s_maxbytes = MAX_NON_LFS;
        s->s_op = &default_op;
        s->s_time_gran = 1000000000;
@@ -702,12 +700,22 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
                return -EACCES;
 #endif
 
-       if (flags & MS_RDONLY)
-               acct_auto_close(sb);
-       shrink_dcache_sb(sb);
-
        remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
 
+       if (remount_ro) {
+               if (sb->s_pins.first) {
+                       up_write(&sb->s_umount);
+                       sb_pin_kill(sb);
+                       down_write(&sb->s_umount);
+                       if (!sb->s_root)
+                               return 0;
+                       if (sb->s_writers.frozen != SB_UNFROZEN)
+                               return -EBUSY;
+                       remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+               }
+       }
+       shrink_dcache_sb(sb);
+
        /* If we are remounting RDONLY and current sb is read/write,
           make sure there are no rw files opened */
        if (remount_ro) {
index ff8229340cd537286fb612efa7041fd125378617..aa13ad053b14075b8b5a9b0b62067ab0b9904c2d 100644 (file)
@@ -174,7 +174,6 @@ static int do_commit(struct ubifs_info *c)
        if (err)
                goto out;
 
-       mutex_lock(&c->mst_mutex);
        c->mst_node->cmt_no      = cpu_to_le64(c->cmt_no);
        c->mst_node->log_lnum    = cpu_to_le32(new_ltail_lnum);
        c->mst_node->root_lnum   = cpu_to_le32(zroot.lnum);
@@ -204,7 +203,6 @@ static int do_commit(struct ubifs_info *c)
        else
                c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS);
        err = ubifs_write_master(c);
-       mutex_unlock(&c->mst_mutex);
        if (err)
                goto out;
 
index 2290d5866725b1cb52543e25aca22680a13b54b1..fb08b0c514b68d5c226b986b6a946b77d14be255 100644 (file)
@@ -431,7 +431,7 @@ void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last)
 
 /**
  * wbuf_timer_callback - write-buffer timer callback function.
- * @data: timer data (write-buffer descriptor)
+ * @timer: timer data (write-buffer descriptor)
  *
  * This function is called when the write-buffer timer expires.
  */
index a902c5919e423ca619fcf0508345909b51ec99f8..a47ddfc9be6b2373c92e2b9eb8e5ab4454be581a 100644 (file)
@@ -240,6 +240,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
 
        if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
                c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+               ubifs_assert(c->lhead_lnum != c->ltail_lnum);
                c->lhead_offs = 0;
        }
 
@@ -404,15 +405,14 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
        /* Switch to the next log LEB */
        if (c->lhead_offs) {
                c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+               ubifs_assert(c->lhead_lnum != c->ltail_lnum);
                c->lhead_offs = 0;
        }
 
-       if (c->lhead_offs == 0) {
-               /* Must ensure next LEB has been unmapped */
-               err = ubifs_leb_unmap(c, c->lhead_lnum);
-               if (err)
-                       goto out;
-       }
+       /* Must ensure next LEB has been unmapped */
+       err = ubifs_leb_unmap(c, c->lhead_lnum);
+       if (err)
+               goto out;
 
        len = ALIGN(len, c->min_io_size);
        dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len);
index d46b19ec1815eb1532d4d8f848b610b4437bb63a..421bd0a80424d40942f04e8a6bdb0499ee9cc055 100644 (file)
@@ -1464,7 +1464,6 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
                        return ERR_CAST(nnode);
        }
        iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
-       shft -= UBIFS_LPT_FANOUT_SHIFT;
        pnode = ubifs_get_pnode(c, nnode, iip);
        if (IS_ERR(pnode))
                return ERR_CAST(pnode);
@@ -1604,7 +1603,6 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
                        return ERR_CAST(nnode);
        }
        iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
-       shft -= UBIFS_LPT_FANOUT_SHIFT;
        pnode = ubifs_get_pnode(c, nnode, iip);
        if (IS_ERR(pnode))
                return ERR_CAST(pnode);
@@ -1964,7 +1962,6 @@ again:
                }
        }
        iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
-       shft -= UBIFS_LPT_FANOUT_SHIFT;
        pnode = scan_get_pnode(c, path + h, nnode, iip);
        if (IS_ERR(pnode)) {
                err = PTR_ERR(pnode);
@@ -2198,6 +2195,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
                                          lprops->dirty);
                                return -EINVAL;
                        }
+                       break;
                case LPROPS_FREEABLE:
                case LPROPS_FRDI_IDX:
                        if (lprops->free + lprops->dirty != c->leb_size) {
@@ -2206,6 +2204,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
                                          lprops->dirty);
                                return -EINVAL;
                        }
+                       break;
                }
        }
        return 0;
index 45d4e96a6bac0fe36f67b268ada5fea066627eee..d9c02928e992dc9f00bdb7f2fd5ae75bdd32e4e6 100644 (file)
@@ -304,7 +304,6 @@ static int layout_cnodes(struct ubifs_info *c)
                        ubifs_assert(lnum >= c->lpt_first &&
                                     lnum <= c->lpt_last);
                }
-               done_ltab = 1;
                c->ltab_lnum = lnum;
                c->ltab_offs = offs;
                offs += c->ltab_sz;
@@ -514,7 +513,6 @@ static int write_cnodes(struct ubifs_info *c)
                        if (err)
                                return err;
                }
-               done_ltab = 1;
                ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
                offs += c->ltab_sz;
                dbg_chk_lpt_sz(c, 1, c->ltab_sz);
@@ -1941,6 +1939,11 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
                                pr_err("LEB %d:%d, nnode, ",
                                       lnum, offs);
                        err = ubifs_unpack_nnode(c, p, &nnode);
+                       if (err) {
+                               pr_err("failed to unpack_node, error %d\n",
+                                      err);
+                               break;
+                       }
                        for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
                                pr_cont("%d:%d", nnode.nbranch[i].lnum,
                                       nnode.nbranch[i].offs);
index ab83ace9910a0964544a41c7d289c84c2b075057..1a4bb9e8b3b8925b57be7a6c0a62c1675ddb9bac 100644 (file)
@@ -352,10 +352,9 @@ int ubifs_read_master(struct ubifs_info *c)
  * ubifs_write_master - write master node.
  * @c: UBIFS file-system description object
  *
- * This function writes the master node. The caller has to take the
- * @c->mst_mutex lock before calling this function. Returns zero in case of
- * success and a negative error code in case of failure. The master node is
- * written twice to enable recovery.
+ * This function writes the master node. Returns zero in case of success and a
+ * negative error code in case of failure. The master node is written twice to
+ * enable recovery.
  */
 int ubifs_write_master(struct ubifs_info *c)
 {
index f1c3e5a1b31554c4fbd9c9710d5925ad90682cb1..4409f486ecef3b1aa5dcbea949509d03c6f68ef9 100644 (file)
@@ -346,7 +346,6 @@ static int write_orph_nodes(struct ubifs_info *c, int atomic)
                int lnum;
 
                /* Unmap any unused LEBs after consolidation */
-               lnum = c->ohead_lnum + 1;
                for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) {
                        err = ubifs_leb_unmap(c, lnum);
                        if (err)
index c14adb2f420cb09c48fef049bf6d246b8d22cd28..c640938f62f0666cf8c4b2b0e3006ed01f5cd75d 100644 (file)
@@ -596,7 +596,6 @@ static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs)
  * drop_last_node - drop the last node.
  * @sleb: scanned LEB information
  * @offs: offset of dropped nodes is returned here
- * @grouped: non-zero if whole group of nodes have to be dropped
  *
  * This is a helper function for 'ubifs_recover_leb()' which drops the last
  * node of the scanned LEB.
@@ -629,8 +628,8 @@ static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
  *
  * This function does a scan of a LEB, but caters for errors that might have
  * been caused by the unclean unmount from which we are attempting to recover.
- * Returns %0 in case of success, %-EUCLEAN if an unrecoverable corruption is
- * found, and a negative error code in case of failure.
+ * Returns the scanned information on success and a negative error code on
+ * failure.
  */
 struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                                         int offs, void *sbuf, int jhead)
index 4c37607a958e037f7242b2f01ef9d69dd836512b..79c6dbbc0e04ddacc39aaa57e726314971c4e65b 100644 (file)
@@ -332,6 +332,8 @@ static int create_default_filesystem(struct ubifs_info *c)
        cs->ch.node_type = UBIFS_CS_NODE;
        err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0);
        kfree(cs);
+       if (err)
+               return err;
 
        ubifs_msg("default file-system created");
        return 0;
@@ -447,7 +449,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
                goto failed;
        }
 
-       if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) {
+       if (c->default_compr >= UBIFS_COMPR_TYPES_CNT) {
                err = 13;
                goto failed;
        }
index 58aa05df2bb66848513e3a11718049dc9cddd1f1..89adbc4d08ac1ac2defe2fa5293816fe579fcef6 100644 (file)
@@ -131,7 +131,8 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
  * @offs: offset to start at (usually zero)
  * @sbuf: scan buffer (must be c->leb_size)
  *
- * This function returns %0 on success and a negative error code on failure.
+ * This function returns the scanned information on success and a negative error
+ * code on failure.
  */
 struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
                                        int offs, void *sbuf)
@@ -157,9 +158,10 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
                return ERR_PTR(err);
        }
 
-       if (err == -EBADMSG)
-               sleb->ecc = 1;
-
+       /*
+        * Note, we ignore integrity errors (-EBADMSG) because all the nodes are
+        * protected by CRC checksums.
+        */
        return sleb;
 }
 
@@ -169,8 +171,6 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
  * @sleb: scanning information
  * @lnum: logical eraseblock number
  * @offs: offset to start at (usually zero)
- *
- * This function returns %0 on success and a negative error code on failure.
  */
 void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
                    int lnum, int offs)
@@ -257,7 +257,7 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
  * @quiet: print no messages
  *
  * This function scans LEB number @lnum and returns complete information about
- * its contents. Returns the scaned information in case of success and,
+ * its contents. Returns the scanned information in case of success and,
  * %-EUCLEAN if the LEB needs recovery, and other negative error codes in case
  * of failure.
  *
index 3904c8574ef931dda6473123ff6299f9c5c513b8..106bf20629ce6d01128b75680936082bdc0be83a 100644 (file)
@@ -75,7 +75,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode)
                return 1;
        }
 
-       if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
+       if (ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
                ubifs_err("unknown compression type %d", ui->compr_type);
                return 2;
        }
@@ -424,19 +424,19 @@ static int ubifs_show_options(struct seq_file *s, struct dentry *root)
        struct ubifs_info *c = root->d_sb->s_fs_info;
 
        if (c->mount_opts.unmount_mode == 2)
-               seq_printf(s, ",fast_unmount");
+               seq_puts(s, ",fast_unmount");
        else if (c->mount_opts.unmount_mode == 1)
-               seq_printf(s, ",norm_unmount");
+               seq_puts(s, ",norm_unmount");
 
        if (c->mount_opts.bulk_read == 2)
-               seq_printf(s, ",bulk_read");
+               seq_puts(s, ",bulk_read");
        else if (c->mount_opts.bulk_read == 1)
-               seq_printf(s, ",no_bulk_read");
+               seq_puts(s, ",no_bulk_read");
 
        if (c->mount_opts.chk_data_crc == 2)
-               seq_printf(s, ",chk_data_crc");
+               seq_puts(s, ",chk_data_crc");
        else if (c->mount_opts.chk_data_crc == 1)
-               seq_printf(s, ",no_chk_data_crc");
+               seq_puts(s, ",no_chk_data_crc");
 
        if (c->mount_opts.override_compr) {
                seq_printf(s, ",compr=%s",
@@ -796,8 +796,8 @@ static int alloc_wbufs(struct ubifs_info *c)
 {
        int i, err;
 
-       c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead),
-                          GFP_KERNEL);
+       c->jheads = kcalloc(c->jhead_cnt, sizeof(struct ubifs_jhead),
+                           GFP_KERNEL);
        if (!c->jheads)
                return -ENOMEM;
 
@@ -1963,7 +1963,6 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
                mutex_init(&c->lp_mutex);
                mutex_init(&c->tnc_mutex);
                mutex_init(&c->log_mutex);
-               mutex_init(&c->mst_mutex);
                mutex_init(&c->umount_mutex);
                mutex_init(&c->bu_mutex);
                mutex_init(&c->write_reserve_mutex);
index 8a40cf9c02d7057408a41b4215431a96f5171f04..6793db0754f6cc058f0d0b8f35ebb9f2eaa67bc8 100644 (file)
@@ -3294,7 +3294,6 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
                goto out_unlock;
 
        if (err) {
-               err = -EINVAL;
                key = &from_key;
                goto out_dump;
        }
index 3600994f84112e99b4b6d3a4a21209cd6f9738ff..7a205e0467768e573a98657b42ed7bd8ffb0be6b 100644 (file)
@@ -389,7 +389,6 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
                                ubifs_dump_lprops(c);
                        }
                        /* Try to commit anyway */
-                       err = 0;
                        break;
                }
                p++;
index c1f71fe17cc00e72212bf84033117581a5dd04f3..c4fe900c67ab277508f84f6f8eb951c0796f25c3 100644 (file)
@@ -314,7 +314,6 @@ struct ubifs_scan_node {
  * @nodes_cnt: number of nodes scanned
  * @nodes: list of struct ubifs_scan_node
  * @endpt: end point (and therefore the start of empty space)
- * @ecc: read returned -EBADMSG
  * @buf: buffer containing entire LEB scanned
  */
 struct ubifs_scan_leb {
@@ -322,7 +321,6 @@ struct ubifs_scan_leb {
        int nodes_cnt;
        struct list_head nodes;
        int endpt;
-       int ecc;
        void *buf;
 };
 
@@ -1051,7 +1049,6 @@ struct ubifs_debug_info;
  *
  * @mst_node: master node
  * @mst_offs: offset of valid master node
- * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
  *
  * @max_bu_buf_len: maximum bulk-read buffer length
  * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
@@ -1292,7 +1289,6 @@ struct ubifs_info {
 
        struct ubifs_mst_node *mst_node;
        int mst_offs;
-       struct mutex mst_mutex;
 
        int max_bu_buf_len;
        struct mutex bu_mutex;
index d80738fdf424cd61579a0544eab720bb8d7b0a64..86c6743ec1feb7eb99b17d6557c2ed1a1a90c1b8 100644 (file)
@@ -27,7 +27,7 @@
 
 #include "udfdecl.h"
 #include <linux/fs.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/string.h> /* memset */
 #include <linux/capability.h>
@@ -100,24 +100,6 @@ static int udf_adinicb_write_begin(struct file *file,
        return 0;
 }
 
-static int udf_adinicb_write_end(struct file *file,
-                       struct address_space *mapping,
-                       loff_t pos, unsigned len, unsigned copied,
-                       struct page *page, void *fsdata)
-{
-       struct inode *inode = mapping->host;
-       unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-       char *kaddr;
-       struct udf_inode_info *iinfo = UDF_I(inode);
-
-       kaddr = kmap_atomic(page);
-       memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset,
-               kaddr + offset, copied);
-       kunmap_atomic(kaddr);
-
-       return simple_write_end(file, mapping, pos, len, copied, page, fsdata);
-}
-
 static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb,
                                     struct iov_iter *iter,
                                     loff_t offset)
@@ -130,7 +112,7 @@ const struct address_space_operations udf_adinicb_aops = {
        .readpage       = udf_adinicb_readpage,
        .writepage      = udf_adinicb_writepage,
        .write_begin    = udf_adinicb_write_begin,
-       .write_end      = udf_adinicb_write_end,
+       .write_end      = simple_write_end,
        .direct_IO      = udf_adinicb_direct_IO,
 };
 
index 6583fe9b0645f6f4b5c417bc33ae5fea5bf99828..6ad5a453af97a60a9adfd12f38e788e887f8b417 100644 (file)
@@ -21,7 +21,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/cdrom.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #include "udf_sb.h"
 
index 3286db047a40230f50cb5e18a177b744f150342b..813da94d447b3b44418d2ce84bc70121cb62c92e 100644 (file)
@@ -63,7 +63,7 @@
 #include "udf_i.h"
 
 #include <linux/init.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #define VDS_POS_PRIMARY_VOL_DESC       0
 #define VDS_POS_UNALLOC_SPACE_DESC     1
index d7c6dbe4194bb33bbe37930c63cf23377e5d9343..6fb7945c1e6e8813afce2ad81aa41ea28a6f4565 100644 (file)
@@ -20,7 +20,7 @@
  */
 
 #include "udfdecl.h"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/time.h>
index 44b815e57f9439116199f91b67489a98272272bc..afd470e588ffbbd24ec886b3e8a619833a5e3e9a 100644 (file)
@@ -412,7 +412,6 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
        int extIndex = 0, newExtIndex = 0, hasExt = 0;
        unsigned short valueCRC;
        uint8_t curr;
-       const uint8_t hexChar[] = "0123456789ABCDEF";
 
        if (udfName[0] == '.' &&
            (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
@@ -477,10 +476,10 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
                        newIndex = 250;
                newName[newIndex++] = CRC_MARK;
                valueCRC = crc_itu_t(0, fidName, fidNameLen);
-               newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
-               newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
-               newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
-               newName[newIndex++] = hexChar[(valueCRC & 0x000f)];
+               newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
+               newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
+               newName[newIndex++] = hex_asc_upper_hi(valueCRC);
+               newName[newIndex++] = hex_asc_upper_lo(valueCRC);
 
                if (hasExt) {
                        newName[newIndex++] = EXT_MARK;
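
The udf change above drops the local hexChar[] table in favour of the
kernel's hex_asc_upper_hi()/hex_asc_upper_lo() helpers, which emit the high
and low nibble of a byte as uppercase ASCII hex. For a 16-bit CRC the four
output characters are produced as follows (sketch; buf is illustrative):

	buf[0] = hex_asc_upper_hi(valueCRC >> 8);	/* bits 15..12 */
	buf[1] = hex_asc_upper_lo(valueCRC >> 8);	/* bits 11..8  */
	buf[2] = hex_asc_upper_hi(valueCRC);		/* bits  7..4  */
	buf[3] = hex_asc_upper_lo(valueCRC);		/* bits  3..0  */
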
index 399e8cec6e603bb84df41b3d062e2a88e99a98a3..5d47b4df61eac13fe2176455fcb20c9f86de1dde 100644 (file)
@@ -1,6 +1,7 @@
 config XFS_FS
        tristate "XFS filesystem support"
        depends on BLOCK
+       depends on (64BIT || LBDAF)
        select EXPORTFS
        select LIBCRC32C
        help
index c21f4350666112c4222b33ad7c268e1d759d540c..d61799949580a497ccae45883cb2f4c3f8e34495 100644 (file)
@@ -17,6 +17,7 @@
 #
 
 ccflags-y += -I$(src)                  # needed for trace events
+ccflags-y += -I$(src)/libxfs
 
 ccflags-$(CONFIG_XFS_DEBUG) += -g
 
@@ -25,6 +26,39 @@ obj-$(CONFIG_XFS_FS)         += xfs.o
 # this one should be compiled first, as the tracing macros can easily blow up
 xfs-y                          += xfs_trace.o
 
+# build the libxfs code first
+xfs-y                          += $(addprefix libxfs/, \
+                                  xfs_alloc.o \
+                                  xfs_alloc_btree.o \
+                                  xfs_attr.o \
+                                  xfs_attr_leaf.o \
+                                  xfs_attr_remote.o \
+                                  xfs_bmap.o \
+                                  xfs_bmap_btree.o \
+                                  xfs_btree.o \
+                                  xfs_da_btree.o \
+                                  xfs_da_format.o \
+                                  xfs_dir2.o \
+                                  xfs_dir2_block.o \
+                                  xfs_dir2_data.o \
+                                  xfs_dir2_leaf.o \
+                                  xfs_dir2_node.o \
+                                  xfs_dir2_sf.o \
+                                  xfs_dquot_buf.o \
+                                  xfs_ialloc.o \
+                                  xfs_ialloc_btree.o \
+                                  xfs_inode_fork.o \
+                                  xfs_inode_buf.o \
+                                  xfs_log_rlimit.o \
+                                  xfs_sb.o \
+                                  xfs_symlink_remote.o \
+                                  xfs_trans_resv.o \
+                                  )
+# xfs_rtbitmap is shared with libxfs
+xfs-$(CONFIG_XFS_RT)           += $(addprefix libxfs/, \
+                                  xfs_rtbitmap.o \
+                                  )
+
 # highlevel code
 xfs-y                          += xfs_aops.o \
                                   xfs_attr_inactive.o \
@@ -45,53 +79,27 @@ xfs-y                               += xfs_aops.o \
                                   xfs_ioctl.o \
                                   xfs_iomap.o \
                                   xfs_iops.o \
+                                  xfs_inode.o \
                                   xfs_itable.o \
                                   xfs_message.o \
                                   xfs_mount.o \
                                   xfs_mru_cache.o \
                                   xfs_super.o \
                                   xfs_symlink.o \
+                                  xfs_sysfs.o \
                                   xfs_trans.o \
                                   xfs_xattr.o \
                                   kmem.o \
                                   uuid.o
 
-# code shared with libxfs
-xfs-y                          += xfs_alloc.o \
-                                  xfs_alloc_btree.o \
-                                  xfs_attr.o \
-                                  xfs_attr_leaf.o \
-                                  xfs_attr_remote.o \
-                                  xfs_bmap.o \
-                                  xfs_bmap_btree.o \
-                                  xfs_btree.o \
-                                  xfs_da_btree.o \
-                                  xfs_da_format.o \
-                                  xfs_dir2.o \
-                                  xfs_dir2_block.o \
-                                  xfs_dir2_data.o \
-                                  xfs_dir2_leaf.o \
-                                  xfs_dir2_node.o \
-                                  xfs_dir2_sf.o \
-                                  xfs_dquot_buf.o \
-                                  xfs_ialloc.o \
-                                  xfs_ialloc_btree.o \
-                                  xfs_icreate_item.o \
-                                  xfs_inode.o \
-                                  xfs_inode_fork.o \
-                                  xfs_inode_buf.o \
-                                  xfs_log_recover.o \
-                                  xfs_log_rlimit.o \
-                                  xfs_sb.o \
-                                  xfs_symlink_remote.o \
-                                  xfs_trans_resv.o
-
 # low-level transaction/log code
 xfs-y                          += xfs_log.o \
                                   xfs_log_cil.o \
                                   xfs_buf_item.o \
                                   xfs_extfree_item.o \
+                                  xfs_icreate_item.o \
                                   xfs_inode_item.o \
+                                  xfs_log_recover.o \
                                   xfs_trans_ail.o \
                                   xfs_trans_buf.o \
                                   xfs_trans_extfree.o \
@@ -107,8 +115,7 @@ xfs-$(CONFIG_XFS_QUOTA)             += xfs_dquot.o \
                                   xfs_quotaops.o
 
 # xfs_rtbitmap is shared with libxfs
-xfs-$(CONFIG_XFS_RT)           += xfs_rtalloc.o \
-                                  xfs_rtbitmap.o
+xfs-$(CONFIG_XFS_RT)           += xfs_rtalloc.o
 
 xfs-$(CONFIG_XFS_POSIX_ACL)    += xfs_acl.o
 xfs-$(CONFIG_PROC_FS)          += xfs_stats.o
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
new file mode 100644 (file)
index 0000000..6e247a9
--- /dev/null
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_AG_H__
+#define        __XFS_AG_H__
+
+/*
+ * Allocation group header
+ * This is divided into three structures, placed in sequential 512-byte
+ * buffers after a copy of the superblock (also in a 512-byte buffer).
+ */
+
+struct xfs_buf;
+struct xfs_mount;
+struct xfs_trans;
+
+#define        XFS_AGF_MAGIC   0x58414746      /* 'XAGF' */
+#define        XFS_AGI_MAGIC   0x58414749      /* 'XAGI' */
+#define        XFS_AGFL_MAGIC  0x5841464c      /* 'XAFL' */
+#define        XFS_AGF_VERSION 1
+#define        XFS_AGI_VERSION 1
+
+#define        XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION)
+#define        XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
+
+/*
+ * Btree number 0 is bno, 1 is cnt.  This value gives the size of the
+ * arrays below.
+ */
+#define        XFS_BTNUM_AGF   ((int)XFS_BTNUM_CNTi + 1)
+
+/*
+ * The second word of agf_levels in the first a.g. overlaps the EFS
+ * superblock's magic number.  Since the magic numbers valid for EFS
+ * are > 64k, our value cannot be confused for an EFS superblock's.
+ */
+
+typedef struct xfs_agf {
+       /*
+        * Common allocation group header information
+        */
+       __be32          agf_magicnum;   /* magic number == XFS_AGF_MAGIC */
+       __be32          agf_versionnum; /* header version == XFS_AGF_VERSION */
+       __be32          agf_seqno;      /* sequence # starting from 0 */
+       __be32          agf_length;     /* size in blocks of a.g. */
+       /*
+        * Freespace information
+        */
+       __be32          agf_roots[XFS_BTNUM_AGF];       /* root blocks */
+       __be32          agf_spare0;     /* spare field */
+       __be32          agf_levels[XFS_BTNUM_AGF];      /* btree levels */
+       __be32          agf_spare1;     /* spare field */
+
+       __be32          agf_flfirst;    /* first freelist block's index */
+       __be32          agf_fllast;     /* last freelist block's index */
+       __be32          agf_flcount;    /* count of blocks in freelist */
+       __be32          agf_freeblks;   /* total free blocks */
+
+       __be32          agf_longest;    /* longest free space */
+       __be32          agf_btreeblks;  /* # of blocks held in AGF btrees */
+       uuid_t          agf_uuid;       /* uuid of filesystem */
+
+       /*
+        * reserve some contiguous space for future logged fields before we add
+        * the unlogged fields. This makes the range logging via flags and
+        * structure offsets much simpler.
+        */
+       __be64          agf_spare64[16];
+
+       /* unlogged fields, written during buffer writeback. */
+       __be64          agf_lsn;        /* last write sequence */
+       __be32          agf_crc;        /* crc of agf sector */
+       __be32          agf_spare2;
+
+       /* structure must be padded to 64 bit alignment */
+} xfs_agf_t;
+
+#define XFS_AGF_CRC_OFF                offsetof(struct xfs_agf, agf_crc)
+
+#define        XFS_AGF_MAGICNUM        0x00000001
+#define        XFS_AGF_VERSIONNUM      0x00000002
+#define        XFS_AGF_SEQNO           0x00000004
+#define        XFS_AGF_LENGTH          0x00000008
+#define        XFS_AGF_ROOTS           0x00000010
+#define        XFS_AGF_LEVELS          0x00000020
+#define        XFS_AGF_FLFIRST         0x00000040
+#define        XFS_AGF_FLLAST          0x00000080
+#define        XFS_AGF_FLCOUNT         0x00000100
+#define        XFS_AGF_FREEBLKS        0x00000200
+#define        XFS_AGF_LONGEST         0x00000400
+#define        XFS_AGF_BTREEBLKS       0x00000800
+#define        XFS_AGF_UUID            0x00001000
+#define        XFS_AGF_NUM_BITS        13
+#define        XFS_AGF_ALL_BITS        ((1 << XFS_AGF_NUM_BITS) - 1)
+
+#define XFS_AGF_FLAGS \
+       { XFS_AGF_MAGICNUM,     "MAGICNUM" }, \
+       { XFS_AGF_VERSIONNUM,   "VERSIONNUM" }, \
+       { XFS_AGF_SEQNO,        "SEQNO" }, \
+       { XFS_AGF_LENGTH,       "LENGTH" }, \
+       { XFS_AGF_ROOTS,        "ROOTS" }, \
+       { XFS_AGF_LEVELS,       "LEVELS" }, \
+       { XFS_AGF_FLFIRST,      "FLFIRST" }, \
+       { XFS_AGF_FLLAST,       "FLLAST" }, \
+       { XFS_AGF_FLCOUNT,      "FLCOUNT" }, \
+       { XFS_AGF_FREEBLKS,     "FREEBLKS" }, \
+       { XFS_AGF_LONGEST,      "LONGEST" }, \
+       { XFS_AGF_BTREEBLKS,    "BTREEBLKS" }, \
+       { XFS_AGF_UUID,         "UUID" }
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGF_DADDR(mp)      ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
+#define        XFS_AGF_BLOCK(mp)       XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
+#define        XFS_BUF_TO_AGF(bp)      ((xfs_agf_t *)((bp)->b_addr))
+
+extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
+                       xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
+
+/*
+ * Size of the unlinked inode hash table in the agi.
+ */
+#define        XFS_AGI_UNLINKED_BUCKETS        64
+
+typedef struct xfs_agi {
+       /*
+        * Common allocation group header information
+        */
+       __be32          agi_magicnum;   /* magic number == XFS_AGI_MAGIC */
+       __be32          agi_versionnum; /* header version == XFS_AGI_VERSION */
+       __be32          agi_seqno;      /* sequence # starting from 0 */
+       __be32          agi_length;     /* size in blocks of a.g. */
+       /*
+        * Inode information
+        * Inodes are mapped by interpreting the inode number, so no
+        * mapping data is needed here.
+        */
+       __be32          agi_count;      /* count of allocated inodes */
+       __be32          agi_root;       /* root of inode btree */
+       __be32          agi_level;      /* levels in inode btree */
+       __be32          agi_freecount;  /* number of free inodes */
+
+       __be32          agi_newino;     /* new inode just allocated */
+       __be32          agi_dirino;     /* last directory inode chunk */
+       /*
+        * Hash table of inodes which have been unlinked but are
+        * still being referenced.
+        */
+       __be32          agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+       /*
+        * This marks the end of logging region 1 and start of logging region 2.
+        */
+       uuid_t          agi_uuid;       /* uuid of filesystem */
+       __be32          agi_crc;        /* crc of agi sector */
+       __be32          agi_pad32;
+       __be64          agi_lsn;        /* last write sequence */
+
+       __be32          agi_free_root; /* root of the free inode btree */
+       __be32          agi_free_level;/* levels in free inode btree */
+
+       /* structure must be padded to 64 bit alignment */
+} xfs_agi_t;
+
+#define XFS_AGI_CRC_OFF                offsetof(struct xfs_agi, agi_crc)
+
+#define        XFS_AGI_MAGICNUM        (1 << 0)
+#define        XFS_AGI_VERSIONNUM      (1 << 1)
+#define        XFS_AGI_SEQNO           (1 << 2)
+#define        XFS_AGI_LENGTH          (1 << 3)
+#define        XFS_AGI_COUNT           (1 << 4)
+#define        XFS_AGI_ROOT            (1 << 5)
+#define        XFS_AGI_LEVEL           (1 << 6)
+#define        XFS_AGI_FREECOUNT       (1 << 7)
+#define        XFS_AGI_NEWINO          (1 << 8)
+#define        XFS_AGI_DIRINO          (1 << 9)
+#define        XFS_AGI_UNLINKED        (1 << 10)
+#define        XFS_AGI_NUM_BITS_R1     11      /* end of the 1st agi logging region */
+#define        XFS_AGI_ALL_BITS_R1     ((1 << XFS_AGI_NUM_BITS_R1) - 1)
+#define        XFS_AGI_FREE_ROOT       (1 << 11)
+#define        XFS_AGI_FREE_LEVEL      (1 << 12)
+#define        XFS_AGI_NUM_BITS_R2     13
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGI_DADDR(mp)      ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
+#define        XFS_AGI_BLOCK(mp)       XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
+#define        XFS_BUF_TO_AGI(bp)      ((xfs_agi_t *)((bp)->b_addr))
+
+extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
+                               xfs_agnumber_t agno, struct xfs_buf **bpp);
+
+/*
+ * The third a.g. block contains the a.g. freelist, an array
+ * of block pointers to blocks owned by the allocation btree code.
+ */
+#define XFS_AGFL_DADDR(mp)     ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
+#define        XFS_AGFL_BLOCK(mp)      XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
+#define        XFS_BUF_TO_AGFL(bp)     ((xfs_agfl_t *)((bp)->b_addr))
+
+#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
+       (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+               &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
+               (__be32 *)(bp)->b_addr)
+
+/*
+ * Size of the AGFL.  For CRC-enabled filesystems we steal a couple of
+ * slots in the beginning of the block for a proper header with the
+ * location information and CRC.
+ */
+#define XFS_AGFL_SIZE(mp) \
+       (((mp)->m_sb.sb_sectsize - \
+        (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+               sizeof(struct xfs_agfl) : 0)) / \
+         sizeof(xfs_agblock_t))
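+
+/*
+ * For example, with 512-byte sectors and no CRCs every word of the
+ * sector holds an entry: 512 / 4 = 128 free list slots.  With CRCs
+ * enabled, sizeof(struct xfs_agfl) bytes of header come off the top
+ * first and only the remainder is divided into slots.
+ */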
+
+typedef struct xfs_agfl {
+       __be32          agfl_magicnum;
+       __be32          agfl_seqno;
+       uuid_t          agfl_uuid;
+       __be64          agfl_lsn;
+       __be32          agfl_crc;
+       __be32          agfl_bno[];     /* actually XFS_AGFL_SIZE(mp) */
+} xfs_agfl_t;
+
+#define XFS_AGFL_CRC_OFF       offsetof(struct xfs_agfl, agfl_crc)
+
+/*
+ * tags for inode radix tree
+ */
+#define XFS_ICI_NO_TAG         (-1)    /* special flag for an untagged lookup
+                                          in xfs_inode_ag_iterator */
+#define XFS_ICI_RECLAIM_TAG    0       /* inode is to be reclaimed */
+#define XFS_ICI_EOFBLOCKS_TAG  1       /* inode has blocks beyond EOF */
+
+#define        XFS_AG_MAXLEVELS(mp)            ((mp)->m_ag_maxlevels)
+#define        XFS_MIN_FREELIST_RAW(bl,cl,mp)  \
+       (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
+#define        XFS_MIN_FREELIST(a,mp)          \
+       (XFS_MIN_FREELIST_RAW(          \
+               be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
+               be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
+#define        XFS_MIN_FREELIST_PAG(pag,mp)    \
+       (XFS_MIN_FREELIST_RAW(          \
+               (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
+               (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
+
+#define XFS_AGB_TO_FSB(mp,agno,agbno)  \
+       (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
+#define        XFS_FSB_TO_AGNO(mp,fsbno)       \
+       ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
+#define        XFS_FSB_TO_AGBNO(mp,fsbno)      \
+       ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
+#define        XFS_AGB_TO_DADDR(mp,agno,agbno) \
+       ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
+               (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
+#define        XFS_AG_DADDR(mp,agno,d)         (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
+
+/*
+ * For checking for bad ranges of xfs_daddr_t's, covering multiple
+ * allocation groups or a single xfs_daddr_t that's a superblock copy.
+ */
+#define        XFS_AG_CHECK_DADDR(mp,d,len)    \
+       ((len) == 1 ? \
+           ASSERT((d) == XFS_SB_DADDR || \
+                  xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
+           ASSERT(xfs_daddr_to_agno(mp, d) == \
+                  xfs_daddr_to_agno(mp, (d) + (len) - 1)))
+
+#endif /* __XFS_AG_H__ */
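Before moving on to the allocator itself, it may help to see how the address-translation macros at the bottom of this header compose. A minimal sketch with hypothetical geometry (sb_agblklog = 16 and the agno/agbno values are assumptions for illustration, not values from the patch):

        #include <assert.h>
        #include <stdint.h>

        int main(void)
        {
                unsigned agblklog = 16;            /* hypothetical sb_agblklog */
                uint64_t agno = 3, agbno = 100;

                /* XFS_AGB_TO_FSB: pack (agno, agbno) into one fs block number */
                uint64_t fsb = (agno << agblklog) | agbno;

                /* XFS_FSB_TO_AGNO / XFS_FSB_TO_AGBNO: unpack it again */
                assert((fsb >> agblklog) == agno);
                assert((fsb & (((uint64_t)1 << agblklog) - 1)) == agbno);
                return 0;
        }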
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
new file mode 100644 (file)
index 0000000..4bffffe
--- /dev/null
@@ -0,0 +1,2630 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+
+struct workqueue_struct *xfs_alloc_wq;
+
+#define XFS_ABSDIFF(a,b)       (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
+
+#define        XFSA_FIXUP_BNO_OK       1
+#define        XFSA_FIXUP_CNT_OK       2
+
+STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
+               xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+
+/*
+ * Lookup the record equal to [bno, len] in the btree given by cur.
+ */
+STATIC int                             /* error */
+xfs_alloc_lookup_eq(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* starting block of extent */
+       xfs_extlen_t            len,    /* length of extent */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.a.ar_startblock = bno;
+       cur->bc_rec.a.ar_blockcount = len;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Lookup the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int                            /* error */
+xfs_alloc_lookup_ge(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* starting block of extent */
+       xfs_extlen_t            len,    /* length of extent */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.a.ar_startblock = bno;
+       cur->bc_rec.a.ar_blockcount = len;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Lookup the first record less than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int                                    /* error */
+xfs_alloc_lookup_le(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* starting block of extent */
+       xfs_extlen_t            len,    /* length of extent */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.a.ar_startblock = bno;
+       cur->bc_rec.a.ar_blockcount = len;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
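+
+/*
+ * All three lookup helpers above share one convention: the caller's
+ * [bno, len] pair is loaded into cur->bc_rec.a as the search key, and
+ * *stat is set to 1 if the lookup positioned the cursor on a matching
+ * record, 0 if not.  On the by-size btree, where records are ordered
+ * by length, xfs_alloc_lookup_ge(cur, 0, maxlen, &i) thus lands on the
+ * smallest free extent of at least maxlen blocks.
+ */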
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int                             /* error */
+xfs_alloc_update(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* starting block of extent */
+       xfs_extlen_t            len)    /* length of extent */
+{
+       union xfs_btree_rec     rec;
+
+       rec.alloc.ar_startblock = cpu_to_be32(bno);
+       rec.alloc.ar_blockcount = cpu_to_be32(len);
+       return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int                                    /* error */
+xfs_alloc_get_rec(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           *bno,   /* output: starting block of extent */
+       xfs_extlen_t            *len,   /* output: length of extent */
+       int                     *stat)  /* output: success/failure */
+{
+       union xfs_btree_rec     *rec;
+       int                     error;
+
+       error = xfs_btree_get_rec(cur, &rec, stat);
+       if (!error && *stat == 1) {
+               *bno = be32_to_cpu(rec->alloc.ar_startblock);
+               *len = be32_to_cpu(rec->alloc.ar_blockcount);
+       }
+       return error;
+}
+
+/*
+ * Compute aligned version of the found extent.
+ * Takes alignment and min length into account.
+ */
+STATIC void
+xfs_alloc_compute_aligned(
+       xfs_alloc_arg_t *args,          /* allocation argument structure */
+       xfs_agblock_t   foundbno,       /* starting block in found extent */
+       xfs_extlen_t    foundlen,       /* length in found extent */
+       xfs_agblock_t   *resbno,        /* result block number */
+       xfs_extlen_t    *reslen)        /* result length */
+{
+       xfs_agblock_t   bno;
+       xfs_extlen_t    len;
+
+       /* Trim busy sections out of found extent */
+       xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
+
+       if (args->alignment > 1 && len >= args->minlen) {
+               xfs_agblock_t   aligned_bno = roundup(bno, args->alignment);
+               xfs_extlen_t    diff = aligned_bno - bno;
+
+               *resbno = aligned_bno;
+               *reslen = diff >= len ? 0 : len - diff;
+       } else {
+               *resbno = bno;
+               *reslen = len;
+       }
+}
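+
+/*
+ * Worked example (made-up numbers): with alignment 4 and a busy-trimmed
+ * extent of [bno = 7, len = 10], aligned_bno = roundup(7, 4) = 8 and
+ * diff = 1, so the caller sees [resbno = 8, reslen = 9].  If rounding
+ * up consumes the whole extent (diff >= len), reslen is 0 and the
+ * extent is unusable at this alignment.
+ */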
+
+/*
+ * Compute best start block and diff for "near" allocations.
+ * freelen >= wantlen already checked by caller.
+ */
+STATIC xfs_extlen_t                    /* difference value (absolute) */
+xfs_alloc_compute_diff(
+       xfs_agblock_t   wantbno,        /* target starting block */
+       xfs_extlen_t    wantlen,        /* target length */
+       xfs_extlen_t    alignment,      /* target alignment */
+       char            userdata,       /* are we allocating data? */
+       xfs_agblock_t   freebno,        /* freespace's starting block */
+       xfs_extlen_t    freelen,        /* freespace's length */
+       xfs_agblock_t   *newbnop)       /* result: best start block from free */
+{
+       xfs_agblock_t   freeend;        /* end of freespace extent */
+       xfs_agblock_t   newbno1;        /* return block number */
+       xfs_agblock_t   newbno2;        /* other new block number */
+       xfs_extlen_t    newlen1=0;      /* length with newbno1 */
+       xfs_extlen_t    newlen2=0;      /* length with newbno2 */
+       xfs_agblock_t   wantend;        /* end of target extent */
+
+       ASSERT(freelen >= wantlen);
+       freeend = freebno + freelen;
+       wantend = wantbno + wantlen;
+       /*
+        * We want to allocate from the start of a free extent if it is past
+        * the desired block or if we are allocating user data and the free
+        * extent is before the desired block. The second case is there to allow
+        * for contiguous allocation from the remaining free space if the file
+        * grows in the short term.
+        */
+       if (freebno >= wantbno || (userdata && freeend < wantend)) {
+               if ((newbno1 = roundup(freebno, alignment)) >= freeend)
+                       newbno1 = NULLAGBLOCK;
+       } else if (freeend >= wantend && alignment > 1) {
+               newbno1 = roundup(wantbno, alignment);
+               newbno2 = newbno1 - alignment;
+               if (newbno1 >= freeend)
+                       newbno1 = NULLAGBLOCK;
+               else
+                       newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
+               if (newbno2 < freebno)
+                       newbno2 = NULLAGBLOCK;
+               else
+                       newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
+               if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
+                       if (newlen1 < newlen2 ||
+                           (newlen1 == newlen2 &&
+                            XFS_ABSDIFF(newbno1, wantbno) >
+                            XFS_ABSDIFF(newbno2, wantbno)))
+                               newbno1 = newbno2;
+               } else if (newbno2 != NULLAGBLOCK)
+                       newbno1 = newbno2;
+       } else if (freeend >= wantend) {
+               newbno1 = wantbno;
+       } else if (alignment > 1) {
+               newbno1 = roundup(freeend - wantlen, alignment);
+               if (newbno1 > freeend - wantlen &&
+                   newbno1 - alignment >= freebno)
+                       newbno1 -= alignment;
+               else if (newbno1 >= freeend)
+                       newbno1 = NULLAGBLOCK;
+       } else
+               newbno1 = freeend - wantlen;
+       *newbnop = newbno1;
+       return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
+}
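+
+/*
+ * Example of the common case (made-up numbers): wantbno = 100,
+ * wantlen = 10, alignment = 1, userdata = 0, and a free extent
+ * [freebno = 90, freelen = 30].  freebno < wantbno, but freeend = 120
+ * covers wantend = 110, so the unaligned branch fires and *newbnop is
+ * simply wantbno = 100 with a returned diff of 0: a perfect fit
+ * inside the free extent.
+ */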
+
+/*
+ * Fix up the length, based on mod and prod.
+ * len should be k * prod + mod for some k.
+ * If len is too small it is returned unchanged.
+ * If len hits maxlen it is left alone.
+ */
+STATIC void
+xfs_alloc_fix_len(
+       xfs_alloc_arg_t *args)          /* allocation argument structure */
+{
+       xfs_extlen_t    k;
+       xfs_extlen_t    rlen;
+
+       ASSERT(args->mod < args->prod);
+       rlen = args->len;
+       ASSERT(rlen >= args->minlen);
+       ASSERT(rlen <= args->maxlen);
+       if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
+           (args->mod == 0 && rlen < args->prod))
+               return;
+       k = rlen % args->prod;
+       if (k == args->mod)
+               return;
+       if (k > args->mod)
+               rlen = rlen - (k - args->mod);
+       else
+               rlen = rlen - args->prod + (args->mod - k);
+       if ((int)rlen < (int)args->minlen)
+               return;
+       ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
+       ASSERT(rlen % args->prod == args->mod);
+       args->len = rlen;
+}
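+
+/*
+ * Worked example (made-up numbers): prod = 4, mod = 1, len = 11.
+ * k = 11 % 4 = 3 > mod, so rlen = 11 - (3 - 1) = 9, and 9 % 4 == 1
+ * as required.  Had the trim dropped rlen below minlen, the length
+ * would instead have been left unchanged.
+ */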
+
+/*
+ * Fix up length if there is too little space left in the a.g.
+ * Return 1 if ok, 0 if too little, should give up.
+ */
+STATIC int
+xfs_alloc_fix_minleft(
+       xfs_alloc_arg_t *args)          /* allocation argument structure */
+{
+       xfs_agf_t       *agf;           /* a.g. freelist header */
+       int             diff;           /* free space difference */
+
+       if (args->minleft == 0)
+               return 1;
+       agf = XFS_BUF_TO_AGF(args->agbp);
+       diff = be32_to_cpu(agf->agf_freeblks)
+               - args->len - args->minleft;
+       if (diff >= 0)
+               return 1;
+       args->len += diff;              /* shrink the allocated space */
+       if (args->len >= args->minlen)
+               return 1;
+       args->agbno = NULLAGBLOCK;
+       return 0;
+}
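+
+/*
+ * Example (made-up numbers): agf_freeblks = 100, args->len = 60,
+ * args->minleft = 50.  diff = 100 - 60 - 50 = -10, so the allocation
+ * is shrunk to len = 50; that still succeeds as long as 50 >= minlen,
+ * otherwise the allocation is cancelled (agbno = NULLAGBLOCK).
+ */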
+
+/*
+ * Update the two btrees, logically removing from freespace the extent
+ * starting at rbno, rlen blocks.  The extent is contained within the
+ * actual (current) free extent fbno for flen blocks.
+ * Flags are passed in indicating whether the cursors are set to the
+ * relevant records.
+ */
+STATIC int                             /* error code */
+xfs_alloc_fixup_trees(
+       xfs_btree_cur_t *cnt_cur,       /* cursor for by-size btree */
+       xfs_btree_cur_t *bno_cur,       /* cursor for by-block btree */
+       xfs_agblock_t   fbno,           /* starting block of free extent */
+       xfs_extlen_t    flen,           /* length of free extent */
+       xfs_agblock_t   rbno,           /* starting block of returned extent */
+       xfs_extlen_t    rlen,           /* length of returned extent */
+       int             flags)          /* flags, XFSA_FIXUP_... */
+{
+       int             error;          /* error code */
+       int             i;              /* operation results */
+       xfs_agblock_t   nfbno1;         /* first new free startblock */
+       xfs_agblock_t   nfbno2;         /* second new free startblock */
+       xfs_extlen_t    nflen1=0;       /* first new free length */
+       xfs_extlen_t    nflen2=0;       /* second new free length */
+
+       /*
+        * Look up the record in the by-size tree if necessary.
+        */
+       if (flags & XFSA_FIXUP_CNT_OK) {
+#ifdef DEBUG
+               if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
+                       return error;
+               XFS_WANT_CORRUPTED_RETURN(
+                       i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+       } else {
+               if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
+                       return error;
+               XFS_WANT_CORRUPTED_RETURN(i == 1);
+       }
+       /*
+        * Look up the record in the by-block tree if necessary.
+        */
+       if (flags & XFSA_FIXUP_BNO_OK) {
+#ifdef DEBUG
+               if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
+                       return error;
+               XFS_WANT_CORRUPTED_RETURN(
+                       i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+       } else {
+               if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
+                       return error;
+               XFS_WANT_CORRUPTED_RETURN(i == 1);
+       }
+
+#ifdef DEBUG
+       if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
+               struct xfs_btree_block  *bnoblock;
+               struct xfs_btree_block  *cntblock;
+
+               bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
+               cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
+
+               XFS_WANT_CORRUPTED_RETURN(
+                       bnoblock->bb_numrecs == cntblock->bb_numrecs);
+       }
+#endif
+
+       /*
+        * Deal with all four cases: the allocated record is contained
+        * within the freespace record, so we can have new freespace
+        * at either (or both) end, or no freespace remaining.
+        */
+       if (rbno == fbno && rlen == flen)
+               nfbno1 = nfbno2 = NULLAGBLOCK;
+       else if (rbno == fbno) {
+               nfbno1 = rbno + rlen;
+               nflen1 = flen - rlen;
+               nfbno2 = NULLAGBLOCK;
+       } else if (rbno + rlen == fbno + flen) {
+               nfbno1 = fbno;
+               nflen1 = flen - rlen;
+               nfbno2 = NULLAGBLOCK;
+       } else {
+               nfbno1 = fbno;
+               nflen1 = rbno - fbno;
+               nfbno2 = rbno + rlen;
+               nflen2 = (fbno + flen) - nfbno2;
+       }
+       /*
+        * Delete the entry from the by-size btree.
+        */
+       if ((error = xfs_btree_delete(cnt_cur, &i)))
+               return error;
+       XFS_WANT_CORRUPTED_RETURN(i == 1);
+       /*
+        * Add new by-size btree entry(s).
+        */
+       if (nfbno1 != NULLAGBLOCK) {
+               if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
+                       return error;
+               XFS_WANT_CORRUPTED_RETURN(i == 0);
+               if ((error = xfs_btree_insert(cnt_cur, &i)))
+                       return error;
+               XFS_WANT_CORRUPTED_RETURN(i == 1);
+       }
+       if (nfbno2 != NULLAGBLOCK) {
+               if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
+                       return error;
+               XFS_WANT_CORRUPTED_RETURN(i == 0);
+               if ((error = xfs_btree_insert(cnt_cur, &i)))
+                       return error;
+               XFS_WANT_CORRUPTED_RETURN(i == 1);
+       }
+       /*
+        * Fix up the by-block btree entry(s).
+        */
+       if (nfbno1 == NULLAGBLOCK) {
+               /*
+                * No remaining freespace, just delete the by-block tree entry.
+                */
+               if ((error = xfs_btree_delete(bno_cur, &i)))
+                       return error;
+               XFS_WANT_CORRUPTED_RETURN(i == 1);
+       } else {
+               /*
+                * Update the by-block entry to start later|be shorter.
+                */
+               if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
+                       return error;
+       }
+       if (nfbno2 != NULLAGBLOCK) {
+               /*
+                * 2 resulting free entries, need to add one.
+                */
+               if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
+                       return error;
+               XFS_WANT_CORRUPTED_RETURN(i == 0);
+               if ((error = xfs_btree_insert(bno_cur, &i)))
+                       return error;
+               XFS_WANT_CORRUPTED_RETURN(i == 1);
+       }
+       return 0;
+}
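+
+/*
+ * Pictorially, the four cases carve [rbno, rbno + rlen) out of
+ * [fbno, fbno + flen):
+ *
+ *   exact:  |rrrrrrrr|   no freespace remains
+ *   front:  |rrrr....|   one new extent at rbno + rlen
+ *   back:   |....rrrr|   one new extent at fbno
+ *   middle: |..rrrr..|   two new extents, one on each side
+ */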
+
+static bool
+xfs_agfl_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+       int             i;
+
+       if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
+               return false;
+       if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+               return false;
+       /*
+        * During growfs operations the perag is not fully initialised, so
+        * it cannot be used for any useful checking. growfs sidesteps this
+        * by using uncached buffers, which never have a perag attached,
+        * letting us detect that case here and skip the check.
+        */
+       if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+               return false;
+
+       for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+               if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
+                   be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+                       return false;
+       }
+       return true;
+}
+
+static void
+xfs_agfl_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+
+       /*
+        * There is no verification of non-crc AGFLs because mkfs does not
+        * initialise the AGFL to zero or NULL. Hence the only valid part of the
+        * AGFL is what the AGF says is active. We can't get to the AGF, so we
+        * can't verify just those entries are valid.
+        */
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
+               xfs_buf_ioerror(bp, -EFSBADCRC);
+       else if (!xfs_agfl_verify(bp))
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+       if (bp->b_error)
+               xfs_verifier_error(bp);
+}
+
+static void
+xfs_agfl_write_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+       /* no verification of non-crc AGFLs */
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (!xfs_agfl_verify(bp)) {
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               return;
+       }
+
+       if (bip)
+               XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+       xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+       .verify_read = xfs_agfl_read_verify,
+       .verify_write = xfs_agfl_write_verify,
+};
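+
+/*
+ * The two verifiers above are intentionally asymmetric: the read side
+ * checks the on-disk CRC before looking at the structure, while the
+ * write side validates the structure first, stamps the buffer's LSN
+ * from the log item (if one is attached), and only then recomputes the
+ * CRC so it covers the updated LSN.  Both skip entirely on non-CRC
+ * filesystems, for the reason given in the read verifier's comment.
+ */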
+
+/*
+ * Read in the allocation group free block array.
+ */
+STATIC int                             /* error */
+xfs_alloc_read_agfl(
+       xfs_mount_t     *mp,            /* mount point structure */
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_agnumber_t  agno,           /* allocation group number */
+       xfs_buf_t       **bpp)          /* buffer for the ag free block array */
+{
+       xfs_buf_t       *bp;            /* return value */
+       int             error;
+
+       ASSERT(agno != NULLAGNUMBER);
+       error = xfs_trans_read_buf(
+                       mp, tp, mp->m_ddev_targp,
+                       XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+                       XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
+       if (error)
+               return error;
+       xfs_buf_set_ref(bp, XFS_AGFL_REF);
+       *bpp = bp;
+       return 0;
+}
+
+STATIC int
+xfs_alloc_update_counters(
+       struct xfs_trans        *tp,
+       struct xfs_perag        *pag,
+       struct xfs_buf          *agbp,
+       long                    len)
+{
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
+
+       pag->pagf_freeblks += len;
+       be32_add_cpu(&agf->agf_freeblks, len);
+
+       xfs_trans_agblocks_delta(tp, len);
+       if (unlikely(be32_to_cpu(agf->agf_freeblks) >
+                    be32_to_cpu(agf->agf_length)))
+               return -EFSCORRUPTED;
+
+       xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
+       return 0;
+}
+
+/*
+ * Allocation group level functions.
+ */
+
+/*
+ * Allocate a variable extent in the allocation group agno.
+ * Type and bno are used to determine where in the allocation group the
+ * extent will start.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int                     /* error */
+xfs_alloc_ag_vextent(
+       xfs_alloc_arg_t *args)  /* argument structure for allocation */
+{
+       int             error=0;
+
+       ASSERT(args->minlen > 0);
+       ASSERT(args->maxlen > 0);
+       ASSERT(args->minlen <= args->maxlen);
+       ASSERT(args->mod < args->prod);
+       ASSERT(args->alignment > 0);
+       /*
+        * Branch to correct routine based on the type.
+        */
+       args->wasfromfl = 0;
+       switch (args->type) {
+       case XFS_ALLOCTYPE_THIS_AG:
+               error = xfs_alloc_ag_vextent_size(args);
+               break;
+       case XFS_ALLOCTYPE_NEAR_BNO:
+               error = xfs_alloc_ag_vextent_near(args);
+               break;
+       case XFS_ALLOCTYPE_THIS_BNO:
+               error = xfs_alloc_ag_vextent_exact(args);
+               break;
+       default:
+               ASSERT(0);
+               /* NOTREACHED */
+       }
+
+       if (error || args->agbno == NULLAGBLOCK)
+               return error;
+
+       ASSERT(args->len >= args->minlen);
+       ASSERT(args->len <= args->maxlen);
+       ASSERT(!args->wasfromfl || !args->isfl);
+       ASSERT(args->agbno % args->alignment == 0);
+
+       if (!args->wasfromfl) {
+               error = xfs_alloc_update_counters(args->tp, args->pag,
+                                                 args->agbp,
+                                                 -((long)(args->len)));
+               if (error)
+                       return error;
+
+               ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
+                                             args->agbno, args->len));
+       }
+
+       if (!args->isfl) {
+               xfs_trans_mod_sb(args->tp, args->wasdel ?
+                                XFS_TRANS_SB_RES_FDBLOCKS :
+                                XFS_TRANS_SB_FDBLOCKS,
+                                -((long)(args->len)));
+       }
+
+       XFS_STATS_INC(xs_allocx);
+       XFS_STATS_ADD(xs_allocb, args->len);
+       return error;
+}
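+
+/*
+ * Note the division of labour above: the switch dispatches on strategy
+ * (XFS_ALLOCTYPE_THIS_AG allocates purely by size, _NEAR_BNO prefers
+ * locality to the hint block, _THIS_BNO must land exactly on the hint
+ * or fail), while the common bookkeeping (AGF counters, the busy-extent
+ * assertion, the superblock free-block delta) happens here once a
+ * strategy has produced an extent.
+ */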
+
+/*
+ * Allocate a variable extent at exactly agno/bno.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
+ */
+STATIC int                     /* error */
+xfs_alloc_ag_vextent_exact(
+       xfs_alloc_arg_t *args)  /* allocation argument structure */
+{
+       xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
+       xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
+       int             error;
+       xfs_agblock_t   fbno;   /* start block of found extent */
+       xfs_extlen_t    flen;   /* length of found extent */
+       xfs_agblock_t   tbno;   /* start block of trimmed extent */
+       xfs_extlen_t    tlen;   /* length of trimmed extent */
+       xfs_agblock_t   tend;   /* end block of trimmed extent */
+       int             i;      /* success/failure of operation */
+
+       ASSERT(args->alignment == 1);
+
+       /*
+        * Allocate/initialize a cursor for the by-number freespace btree.
+        */
+       bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+                                         args->agno, XFS_BTNUM_BNO);
+
+       /*
+        * Lookup bno and minlen in the btree (minlen is irrelevant, really).
+        * Look for the closest free block <= bno; it must contain bno
+        * if any free block does.
+        */
+       error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
+       if (error)
+               goto error0;
+       if (!i)
+               goto not_found;
+
+       /*
+        * Grab the freespace record.
+        */
+       error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
+       if (error)
+               goto error0;
+       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+       ASSERT(fbno <= args->agbno);
+
+       /*
+        * Check for overlapping busy extents.
+        */
+       xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
+
+       /*
+        * Give up if the start of the extent is busy, or the freespace isn't
+        * long enough for the minimum request.
+        */
+       if (tbno > args->agbno)
+               goto not_found;
+       if (tlen < args->minlen)
+               goto not_found;
+       tend = tbno + tlen;
+       if (tend < args->agbno + args->minlen)
+               goto not_found;
+
+       /*
+        * The end of the extent will be the smaller of the freespace
+        * end and the maximal requested end.
+        *
+        * Fix the length according to mod and prod if given.
+        */
+       args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
+                                               - args->agbno;
+       xfs_alloc_fix_len(args);
+       if (!xfs_alloc_fix_minleft(args))
+               goto not_found;
+
+       ASSERT(args->agbno + args->len <= tend);
+
+       /*
+        * We are allocating agbno for args->len
+        * Allocate/initialize a cursor for the by-size btree.
+        */
+       cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_CNT);
+       ASSERT(args->agbno + args->len <=
+               be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+       error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
+                                     args->len, XFSA_FIXUP_BNO_OK);
+       if (error) {
+               xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+               goto error0;
+       }
+
+       xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+
+       args->wasfromfl = 0;
+       trace_xfs_alloc_exact_done(args);
+       return 0;
+
+not_found:
+       /* Didn't find it, return null. */
+       xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+       args->agbno = NULLAGBLOCK;
+       trace_xfs_alloc_exact_notfound(args);
+       return 0;
+
+error0:
+       xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+       trace_xfs_alloc_exact_error(args);
+       return error;
+}
+
+/*
+ * Search the btree in a given direction via the search cursor and compare
+ * the records found against the good extent we've already found.
+ */
+STATIC int
+xfs_alloc_find_best_extent(
+       struct xfs_alloc_arg    *args,  /* allocation argument structure */
+       struct xfs_btree_cur    **gcur, /* good cursor */
+       struct xfs_btree_cur    **scur, /* searching cursor */
+       xfs_agblock_t           gdiff,  /* difference for search comparison */
+       xfs_agblock_t           *sbno,  /* extent found by search */
+       xfs_extlen_t            *slen,  /* extent length */
+       xfs_agblock_t           *sbnoa, /* aligned extent found by search */
+       xfs_extlen_t            *slena, /* aligned extent length */
+       int                     dir)    /* 0 = search right, 1 = search left */
+{
+       xfs_agblock_t           new;
+       xfs_agblock_t           sdiff;
+       int                     error;
+       int                     i;
+
+       /* The good extent is perfect, no need to search. */
+       if (!gdiff)
+               goto out_use_good;
+
+       /*
+        * Look until we find a better one, run out of space or run off the end.
+        */
+       do {
+               error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
+               if (error)
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
+
+               /*
+                * The good extent is closer than this one.
+                */
+               if (!dir) {
+                       if (*sbnoa >= args->agbno + gdiff)
+                               goto out_use_good;
+               } else {
+                       if (*sbnoa <= args->agbno - gdiff)
+                               goto out_use_good;
+               }
+
+               /*
+                * Same distance, compare length and pick the best.
+                */
+               if (*slena >= args->minlen) {
+                       args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
+                       xfs_alloc_fix_len(args);
+
+                       sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+                                                      args->alignment,
+                                                      args->userdata, *sbnoa,
+                                                      *slena, &new);
+
+                       /*
+                        * Choose closer size and invalidate other cursor.
+                        */
+                       if (sdiff < gdiff)
+                               goto out_use_search;
+                       goto out_use_good;
+               }
+
+               if (!dir)
+                       error = xfs_btree_increment(*scur, 0, &i);
+               else
+                       error = xfs_btree_decrement(*scur, 0, &i);
+               if (error)
+                       goto error0;
+       } while (i);
+
+out_use_good:
+       xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
+       *scur = NULL;
+       return 0;
+
+out_use_search:
+       xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
+       *gcur = NULL;
+       return 0;
+
+error0:
+       /* caller invalidates cursors */
+       return error;
+}
+
+/*
+ * Allocate a variable extent near bno in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int                             /* error */
+xfs_alloc_ag_vextent_near(
+       xfs_alloc_arg_t *args)          /* allocation argument structure */
+{
+       xfs_btree_cur_t *bno_cur_gt;    /* cursor for bno btree, right side */
+       xfs_btree_cur_t *bno_cur_lt;    /* cursor for bno btree, left side */
+       xfs_btree_cur_t *cnt_cur;       /* cursor for count btree */
+       xfs_agblock_t   gtbno;          /* start bno of right side entry */
+       xfs_agblock_t   gtbnoa;         /* aligned ... */
+       xfs_extlen_t    gtdiff;         /* difference to right side entry */
+       xfs_extlen_t    gtlen;          /* length of right side entry */
+       xfs_extlen_t    gtlena;         /* aligned ... */
+       xfs_agblock_t   gtnew;          /* useful start bno of right side */
+       int             error;          /* error code */
+       int             i;              /* result code, temporary */
+       int             j;              /* result code, temporary */
+       xfs_agblock_t   ltbno;          /* start bno of left side entry */
+       xfs_agblock_t   ltbnoa;         /* aligned ... */
+       xfs_extlen_t    ltdiff;         /* difference to left side entry */
+       xfs_extlen_t    ltlen;          /* length of left side entry */
+       xfs_extlen_t    ltlena;         /* aligned ... */
+       xfs_agblock_t   ltnew;          /* useful start bno of left side */
+       xfs_extlen_t    rlen;           /* length of returned extent */
+       int             forced = 0;
+#ifdef DEBUG
+       /*
+        * Randomly don't execute the first algorithm.
+        */
+       int             dofirst;        /* set to do first algorithm */
+
+       dofirst = prandom_u32() & 1;
+#endif
+
+restart:
+       bno_cur_lt = NULL;
+       bno_cur_gt = NULL;
+       ltlen = 0;
+       gtlena = 0;
+       ltlena = 0;
+
+       /*
+        * Get a cursor for the by-size btree.
+        */
+       cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_CNT);
+
+       /*
+        * See if there are any free extents as big as maxlen.
+        */
+       if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
+               goto error0;
+       /*
+        * If none, then pick up the last entry in the tree unless the
+        * tree is empty.
+        */
+       if (!i) {
+               if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno,
+                               &ltlen, &i)))
+                       goto error0;
+               if (i == 0 || ltlen == 0) {
+                       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+                       trace_xfs_alloc_near_noentry(args);
+                       return 0;
+               }
+               ASSERT(i == 1);
+       }
+       args->wasfromfl = 0;
+
+       /*
+        * First algorithm.
+        * If the requested extent is large wrt the freespaces available
+        * in this a.g., then the cursor will be pointing to a btree entry
+        * near the right edge of the tree.  If it's in the last btree leaf
+        * block, then we just examine all the entries in that block
+        * that are big enough, and pick the best one.
+        * This is written as a while loop so we can break out of it,
+        * but we never loop back to the top.
+        */
+       while (xfs_btree_islastblock(cnt_cur, 0)) {
+               xfs_extlen_t    bdiff;
+               int             besti=0;
+               xfs_extlen_t    blen=0;
+               xfs_agblock_t   bnew=0;
+
+#ifdef DEBUG
+               if (dofirst)
+                       break;
+#endif
+               /*
+                * Start from the entry that lookup found, sequence through
+                * all larger free blocks.  If we're actually pointing at a
+                * record smaller than maxlen, go to the start of this block,
+                * and skip all those smaller than minlen.
+                */
+               if (ltlen || args->alignment > 1) {
+                       cnt_cur->bc_ptrs[0] = 1;
+                       do {
+                               if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
+                                               &ltlen, &i)))
+                                       goto error0;
+                               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                               if (ltlen >= args->minlen)
+                                       break;
+                               if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
+                                       goto error0;
+                       } while (i);
+                       ASSERT(ltlen >= args->minlen);
+                       if (!i)
+                               break;
+               }
+               i = cnt_cur->bc_ptrs[0];
+               for (j = 1, blen = 0, bdiff = 0;
+                    !error && j && (blen < args->maxlen || bdiff > 0);
+                    error = xfs_btree_increment(cnt_cur, 0, &j)) {
+                       /*
+                        * For each entry, decide if it's better than
+                        * the previous best entry.
+                        */
+                       if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
+                               goto error0;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                       xfs_alloc_compute_aligned(args, ltbno, ltlen,
+                                                 &ltbnoa, &ltlena);
+                       if (ltlena < args->minlen)
+                               continue;
+                       args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+                       xfs_alloc_fix_len(args);
+                       ASSERT(args->len >= args->minlen);
+                       if (args->len < blen)
+                               continue;
+                       ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+                               args->alignment, args->userdata, ltbnoa,
+                               ltlena, &ltnew);
+                       if (ltnew != NULLAGBLOCK &&
+                           (args->len > blen || ltdiff < bdiff)) {
+                               bdiff = ltdiff;
+                               bnew = ltnew;
+                               blen = args->len;
+                               besti = cnt_cur->bc_ptrs[0];
+                       }
+               }
+               /*
+                * It didn't work.  We COULD be in a case where
+                * there's a good record somewhere, so try again.
+                */
+               if (blen == 0)
+                       break;
+               /*
+                * Point at the best entry, and retrieve it again.
+                */
+               cnt_cur->bc_ptrs[0] = besti;
+               if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+               args->len = blen;
+               if (!xfs_alloc_fix_minleft(args)) {
+                       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+                       trace_xfs_alloc_near_nominleft(args);
+                       return 0;
+               }
+               blen = args->len;
+               /*
+                * We are allocating starting at bnew for blen blocks.
+                */
+               args->agbno = bnew;
+               ASSERT(bnew >= ltbno);
+               ASSERT(bnew + blen <= ltbno + ltlen);
+               /*
+                * Set up a cursor for the by-bno tree.
+                */
+               bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
+                       args->agbp, args->agno, XFS_BTNUM_BNO);
+               /*
+                * Fix up the btree entries.
+                */
+               if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
+                               ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
+                       goto error0;
+               xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+               xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+
+               trace_xfs_alloc_near_first(args);
+               return 0;
+       }
+       /*
+        * Second algorithm.
+        * Search in the by-bno tree to the left and to the right
+        * simultaneously, until in each case we find a space big enough,
+        * or run into the edge of the tree.  When we run into the edge,
+        * we deallocate that cursor.
+        * If both searches succeed, we compare the two spaces and pick
+        * the better one.
+        * With alignment, it's possible for both to fail; the upper
+        * level algorithm that picks allocation groups for allocations
+        * is not supposed to do this.
+        */
+       /*
+        * Allocate and initialize the cursor for the leftward search.
+        */
+       bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_BNO);
+       /*
+        * Lookup <= bno to find the leftward search's starting point.
+        */
+       if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
+               goto error0;
+       if (!i) {
+               /*
+                * Didn't find anything; use this cursor for the rightward
+                * search.
+                */
+               bno_cur_gt = bno_cur_lt;
+               bno_cur_lt = NULL;
+       }
+       /*
+        * Found something.  Duplicate the cursor for the rightward search.
+        */
+       else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
+               goto error0;
+       /*
+        * Increment the cursor, so we will point at the entry just right
+        * of the leftward entry if any, or to the leftmost entry.
+        */
+       if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
+               goto error0;
+       if (!i) {
+               /*
+                * It failed, there are no rightward entries.
+                */
+               xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
+               bno_cur_gt = NULL;
+       }
+       /*
+        * Loop going left with the leftward cursor, right with the
+        * rightward cursor, until either both directions give up or
+        * we find an entry at least as big as minlen.
+        */
+       do {
+               if (bno_cur_lt) {
+                       if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
+                               goto error0;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                       xfs_alloc_compute_aligned(args, ltbno, ltlen,
+                                                 &ltbnoa, &ltlena);
+                       if (ltlena >= args->minlen)
+                               break;
+                       if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
+                               goto error0;
+                       if (!i) {
+                               xfs_btree_del_cursor(bno_cur_lt,
+                                                    XFS_BTREE_NOERROR);
+                               bno_cur_lt = NULL;
+                       }
+               }
+               if (bno_cur_gt) {
+                       if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
+                               goto error0;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                       xfs_alloc_compute_aligned(args, gtbno, gtlen,
+                                                 &gtbnoa, &gtlena);
+                       if (gtlena >= args->minlen)
+                               break;
+                       if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
+                               goto error0;
+                       if (!i) {
+                               xfs_btree_del_cursor(bno_cur_gt,
+                                                    XFS_BTREE_NOERROR);
+                               bno_cur_gt = NULL;
+                       }
+               }
+       } while (bno_cur_lt || bno_cur_gt);
+
+       /*
+        * Got both cursors still active, need to find better entry.
+        */
+       if (bno_cur_lt && bno_cur_gt) {
+               if (ltlena >= args->minlen) {
+                       /*
+                        * Left side is good, look for a right side entry.
+                        */
+                       args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+                       xfs_alloc_fix_len(args);
+                       ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+                               args->alignment, args->userdata, ltbnoa,
+                               ltlena, &ltnew);
+
+                       error = xfs_alloc_find_best_extent(args,
+                                               &bno_cur_lt, &bno_cur_gt,
+                                               ltdiff, &gtbno, &gtlen,
+                                               &gtbnoa, &gtlena,
+                                               0 /* search right */);
+               } else {
+                       ASSERT(gtlena >= args->minlen);
+
+                       /*
+                        * Right side is good, look for a left side entry.
+                        */
+                       args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
+                       xfs_alloc_fix_len(args);
+                       gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+                               args->alignment, args->userdata, gtbnoa,
+                               gtlena, &gtnew);
+
+                       error = xfs_alloc_find_best_extent(args,
+                                               &bno_cur_gt, &bno_cur_lt,
+                                               gtdiff, &ltbno, &ltlen,
+                                               &ltbnoa, &ltlena,
+                                               1 /* search left */);
+               }
+
+               if (error)
+                       goto error0;
+       }
+
+       /*
+        * If we couldn't get anything, give up.
+        */
+       if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
+               xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+
+               if (!forced++) {
+                       trace_xfs_alloc_near_busy(args);
+                       xfs_log_force(args->mp, XFS_LOG_SYNC);
+                       goto restart;
+               }
+               trace_xfs_alloc_size_neither(args);
+               args->agbno = NULLAGBLOCK;
+               return 0;
+       }
+
+       /*
+        * At this point we have selected a freespace entry, either to the
+        * left or to the right.  If it's on the right, copy all the
+        * useful variables to the "left" set so we only have one
+        * copy of this code.
+        */
+       if (bno_cur_gt) {
+               bno_cur_lt = bno_cur_gt;
+               bno_cur_gt = NULL;
+               ltbno = gtbno;
+               ltbnoa = gtbnoa;
+               ltlen = gtlen;
+               ltlena = gtlena;
+               j = 1;
+       } else
+               j = 0;
+
+       /*
+        * Fix up the length and compute the useful address.
+        */
+       args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+       xfs_alloc_fix_len(args);
+       if (!xfs_alloc_fix_minleft(args)) {
+               trace_xfs_alloc_near_nominleft(args);
+               xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+               xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+               return 0;
+       }
+       rlen = args->len;
+       (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
+                                    args->userdata, ltbnoa, ltlena, &ltnew);
+       ASSERT(ltnew >= ltbno);
+       ASSERT(ltnew + rlen <= ltbnoa + ltlena);
+       ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+       args->agbno = ltnew;
+
+       if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
+                       ltnew, rlen, XFSA_FIXUP_BNO_OK)))
+               goto error0;
+
+       if (j)
+               trace_xfs_alloc_near_greater(args);
+       else
+               trace_xfs_alloc_near_lesser(args);
+
+       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+       xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+       return 0;
+
+ error0:
+       trace_xfs_alloc_near_error(args);
+       if (cnt_cur != NULL)
+               xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+       if (bno_cur_lt != NULL)
+               xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
+       if (bno_cur_gt != NULL)
+               xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
+       return error;
+}
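+
+/*
+ * Illustrative sketch (editorial note, not part of this patch),
+ * modelled on the targs setup in xfs_alloc_fix_freelist() below: a
+ * caller of the per-ag allocators fills an xfs_alloc_arg_t, calls
+ * xfs_alloc_ag_vextent(), and checks agbno for NULLAGBLOCK:
+ *
+ *	memset(&args, 0, sizeof(args));
+ *	args.tp = tp;		// open transaction
+ *	args.mp = mp;		// mount structure
+ *	args.agbp = agbp;	// locked AGF buffer
+ *	args.agno = agno;	// AG to allocate from
+ *	args.minlen = 1;	// example sizes
+ *	args.maxlen = 16;
+ *	args.alignment = args.prod = 1;
+ *	args.type = XFS_ALLOCTYPE_THIS_AG;
+ *	error = xfs_alloc_ag_vextent(&args);
+ *	// !error && args.agbno != NULLAGBLOCK => got args.len blocks
+ *	// starting at AG block args.agbno
+ */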
+
+/*
+ * Allocate a variable extent anywhere in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int                             /* error */
+xfs_alloc_ag_vextent_size(
+       xfs_alloc_arg_t *args)          /* allocation argument structure */
+{
+       xfs_btree_cur_t *bno_cur;       /* cursor for bno btree */
+       xfs_btree_cur_t *cnt_cur;       /* cursor for cnt btree */
+       int             error;          /* error result */
+       xfs_agblock_t   fbno;           /* start of found freespace */
+       xfs_extlen_t    flen;           /* length of found freespace */
+       int             i;              /* temp status variable */
+       xfs_agblock_t   rbno;           /* returned block number */
+       xfs_extlen_t    rlen;           /* length of returned extent */
+       int             forced = 0;
+
+restart:
+       /*
+        * Allocate and initialize a cursor for the by-size btree.
+        */
+       cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_CNT);
+       bno_cur = NULL;
+
+       /*
+        * Look for an entry >= maxlen+alignment-1 blocks.
+        */
+       if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
+                       args->maxlen + args->alignment - 1, &i)))
+               goto error0;
+
+       /*
+        * If there are none, or we have busy extents that we cannot allocate
+        * from, we have to settle for a smaller extent. In the case that there are
+        * no large extents, this will return the last entry in the tree unless
+        * the tree is empty. In the case that there are only busy large
+        * extents, this will return the largest small extent unless there
+        * are no smaller extents available.
+        */
+       if (!i || forced > 1) {
+               error = xfs_alloc_ag_vextent_small(args, cnt_cur,
+                                                  &fbno, &flen, &i);
+               if (error)
+                       goto error0;
+               if (i == 0 || flen == 0) {
+                       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+                       trace_xfs_alloc_size_noentry(args);
+                       return 0;
+               }
+               ASSERT(i == 1);
+               xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+       } else {
+               /*
+                * Search for a non-busy extent that is large enough.
+                * If we are at low space, don't check; if we fall off
+                * the end of the btree, turn off the busy check and
+                * restart.
+                */
+               for (;;) {
+                       error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
+                       if (error)
+                               goto error0;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+                       xfs_alloc_compute_aligned(args, fbno, flen,
+                                                 &rbno, &rlen);
+
+                       if (rlen >= args->maxlen)
+                               break;
+
+                       error = xfs_btree_increment(cnt_cur, 0, &i);
+                       if (error)
+                               goto error0;
+                       if (i == 0) {
+                               /*
+                                * Our only valid extents must have been busy.
+                                * Make it unbusy by forcing the log out and
+                                * retrying. If we've been here before, forcing
+                                * the log isn't making the extents available,
+                                * which means they have probably been freed in
+                                * this transaction.  In that case, we have to
+                                * give up on them and we'll attempt a minlen
+                                * allocation the next time around.
+                                */
+                               xfs_btree_del_cursor(cnt_cur,
+                                                    XFS_BTREE_NOERROR);
+                               trace_xfs_alloc_size_busy(args);
+                               if (!forced++)
+                                       xfs_log_force(args->mp, XFS_LOG_SYNC);
+                               goto restart;
+                       }
+               }
+       }
+
+       /*
+        * In the first case above, we got the last entry in the
+        * by-size btree.  Now we check to see if the space hits maxlen
+        * once aligned; if not, we search left for something better.
+        * This can't happen in the second case above.
+        */
+       rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+       XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+                       (rlen <= flen && rbno + rlen <= fbno + flen), error0);
+       if (rlen < args->maxlen) {
+               xfs_agblock_t   bestfbno;
+               xfs_extlen_t    bestflen;
+               xfs_agblock_t   bestrbno;
+               xfs_extlen_t    bestrlen;
+
+               bestrlen = rlen;
+               bestrbno = rbno;
+               bestflen = flen;
+               bestfbno = fbno;
+               for (;;) {
+                       if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
+                               goto error0;
+                       if (i == 0)
+                               break;
+                       if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
+                                       &i)))
+                               goto error0;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                       if (flen < bestrlen)
+                               break;
+                       xfs_alloc_compute_aligned(args, fbno, flen,
+                                                 &rbno, &rlen);
+                       rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+                       XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+                               (rlen <= flen && rbno + rlen <= fbno + flen),
+                               error0);
+                       if (rlen > bestrlen) {
+                               bestrlen = rlen;
+                               bestrbno = rbno;
+                               bestflen = flen;
+                               bestfbno = fbno;
+                               if (rlen == args->maxlen)
+                                       break;
+                       }
+               }
+               if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
+                               &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               rlen = bestrlen;
+               rbno = bestrbno;
+               flen = bestflen;
+               fbno = bestfbno;
+       }
+       args->wasfromfl = 0;
+       /*
+        * Fix up the length.
+        */
+       args->len = rlen;
+       if (rlen < args->minlen) {
+               if (!forced++) {
+                       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+                       trace_xfs_alloc_size_busy(args);
+                       xfs_log_force(args->mp, XFS_LOG_SYNC);
+                       goto restart;
+               }
+               goto out_nominleft;
+       }
+       xfs_alloc_fix_len(args);
+
+       if (!xfs_alloc_fix_minleft(args))
+               goto out_nominleft;
+       rlen = args->len;
+       XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
+       /*
+        * Allocate and initialize a cursor for the by-block tree.
+        */
+       bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_BNO);
+       if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
+                       rbno, rlen, XFSA_FIXUP_CNT_OK)))
+               goto error0;
+       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+       xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+       cnt_cur = bno_cur = NULL;
+       args->len = rlen;
+       args->agbno = rbno;
+       XFS_WANT_CORRUPTED_GOTO(
+               args->agbno + args->len <=
+                       be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
+               error0);
+       trace_xfs_alloc_size_done(args);
+       return 0;
+
+error0:
+       trace_xfs_alloc_size_error(args);
+       if (cnt_cur)
+               xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+       if (bno_cur)
+               xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+       return error;
+
+out_nominleft:
+       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+       trace_xfs_alloc_size_nominleft(args);
+       args->agbno = NULLAGBLOCK;
+       return 0;
+}
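+
+/*
+ * Worked example for the by-size lookup above (editorial note, not
+ * part of this patch): with args->maxlen = 8 and args->alignment = 4,
+ * xfs_alloc_lookup_ge() is asked for an extent of at least
+ * 8 + 4 - 1 = 11 blocks, so that whatever it finds can be trimmed to
+ * an aligned 8-block allocation no matter where the extent starts
+ * relative to the alignment boundary.
+ */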
+
+/*
+ * Deal with the case where only small freespaces remain.
+ * Either return the contents of the last freespace record,
+ * or allocate space from the freelist if there is nothing in the tree.
+ */
+STATIC int                     /* error */
+xfs_alloc_ag_vextent_small(
+       xfs_alloc_arg_t *args,  /* allocation argument structure */
+       xfs_btree_cur_t *ccur,  /* by-size cursor */
+       xfs_agblock_t   *fbnop, /* result block number */
+       xfs_extlen_t    *flenp, /* result length */
+       int             *stat)  /* status: 0-freelist, 1-normal/none */
+{
+       int             error;
+       xfs_agblock_t   fbno;
+       xfs_extlen_t    flen;
+       int             i;
+
+       if ((error = xfs_btree_decrement(ccur, 0, &i)))
+               goto error0;
+       if (i) {
+               if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+       }
+       /*
+        * Nothing in the btree, try the freelist.  Make sure
+        * to respect minleft even when pulling from the
+        * freelist.
+        */
+       else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
+                (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
+                 > args->minleft)) {
+               error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+               if (error)
+                       goto error0;
+               if (fbno != NULLAGBLOCK) {
+                       xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
+                                            args->userdata);
+
+                       if (args->userdata) {
+                               xfs_buf_t       *bp;
+
+                               bp = xfs_btree_get_bufs(args->mp, args->tp,
+                                       args->agno, fbno, 0);
+                               xfs_trans_binval(args->tp, bp);
+                       }
+                       args->len = 1;
+                       args->agbno = fbno;
+                       XFS_WANT_CORRUPTED_GOTO(
+                               args->agbno + args->len <=
+                               be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
+                               error0);
+                       args->wasfromfl = 1;
+                       trace_xfs_alloc_small_freelist(args);
+                       *stat = 0;
+                       return 0;
+               }
+               /*
+                * Nothing in the freelist.
+                */
+               else
+                       flen = 0;
+       }
+       /*
+        * Can't allocate from the freelist for some reason.
+        */
+       else {
+               fbno = NULLAGBLOCK;
+               flen = 0;
+       }
+       /*
+        * Can't do the allocation, give up.
+        */
+       if (flen < args->minlen) {
+               args->agbno = NULLAGBLOCK;
+               trace_xfs_alloc_small_notenough(args);
+               flen = 0;
+       }
+       *fbnop = fbno;
+       *flenp = flen;
+       *stat = 1;
+       trace_xfs_alloc_small_done(args);
+       return 0;
+
+error0:
+       trace_xfs_alloc_small_error(args);
+       return error;
+}
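+
+/*
+ * Editorial note: the *stat convention above is easy to misread.
+ * *stat == 0 means the block came from the AGF freelist and
+ * args->agbno/args->len are already final; *stat == 1 means "normal":
+ * the caller must look at *flenp, which is 0 if nothing usable was
+ * found and otherwise describes a btree record at fbno/flen.
+ */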
+
+/*
+ * Free the extent starting at agno/bno for length.
+ */
+STATIC int                     /* error */
+xfs_free_ag_extent(
+       xfs_trans_t     *tp,    /* transaction pointer */
+       xfs_buf_t       *agbp,  /* buffer for a.g. freelist header */
+       xfs_agnumber_t  agno,   /* allocation group number */
+       xfs_agblock_t   bno,    /* starting block number */
+       xfs_extlen_t    len,    /* length of extent */
+       int             isfl)   /* set if is freelist blocks - no sb acctg */
+{
+       xfs_btree_cur_t *bno_cur;       /* cursor for by-block btree */
+       xfs_btree_cur_t *cnt_cur;       /* cursor for by-size btree */
+       int             error;          /* error return value */
+       xfs_agblock_t   gtbno;          /* start of right neighbor block */
+       xfs_extlen_t    gtlen;          /* length of right neighbor block */
+       int             haveleft;       /* have a left neighbor block */
+       int             haveright;      /* have a right neighbor block */
+       int             i;              /* temp, result code */
+       xfs_agblock_t   ltbno;          /* start of left neighbor block */
+       xfs_extlen_t    ltlen;          /* length of left neighbor block */
+       xfs_mount_t     *mp;            /* mount point struct for filesystem */
+       xfs_agblock_t   nbno;           /* new starting block of freespace */
+       xfs_extlen_t    nlen;           /* new length of freespace */
+       xfs_perag_t     *pag;           /* per allocation group data */
+
+       mp = tp->t_mountp;
+       /*
+        * Allocate and initialize a cursor for the by-block btree.
+        */
+       bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
+       cnt_cur = NULL;
+       /*
+        * Look for a neighboring block on the left (lower block numbers)
+        * that is contiguous with this space.
+        */
+       if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
+               goto error0;
+       if (haveleft) {
+               /*
+                * There is a block to our left.
+                */
+               if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               /*
+                * It's not contiguous, though.
+                */
+               if (ltbno + ltlen < bno)
+                       haveleft = 0;
+               else {
+                       /*
+                        * If this failure happens the request to free this
+                        * space was invalid, it's (partly) already free.
+                        * Very bad.
+                        */
+                       XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0);
+               }
+       }
+       /*
+        * Look for a neighboring block on the right (higher block numbers)
+        * that is contiguous with this space.
+        */
+       if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
+               goto error0;
+       if (haveright) {
+               /*
+                * There is a block to our right.
+                */
+               if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               /*
+                * It's not contiguous, though.
+                */
+               if (bno + len < gtbno)
+                       haveright = 0;
+               else {
+                       /*
+                        * If this failure happens the request to free this
+                        * space was invalid, it's (partly) already free.
+                        * Very bad.
+                        */
+                       XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0);
+               }
+       }
+       /*
+        * Now allocate and initialize a cursor for the by-size tree.
+        */
+       cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
+       /*
+        * Have both left and right contiguous neighbors.
+        * Merge all three into a single free block.
+        */
+       if (haveleft && haveright) {
+               /*
+                * Delete the old by-size entry on the left.
+                */
+               if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               if ((error = xfs_btree_delete(cnt_cur, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               /*
+                * Delete the old by-size entry on the right.
+                */
+               if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               if ((error = xfs_btree_delete(cnt_cur, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               /*
+                * Delete the old by-block entry for the right block.
+                */
+               if ((error = xfs_btree_delete(bno_cur, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               /*
+                * Move the by-block cursor back to the left neighbor.
+                */
+               if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+#ifdef DEBUG
+               /*
+                * Check that this is the right record: delete didn't
+                * mangle the cursor.
+                */
+               {
+                       xfs_agblock_t   xxbno;
+                       xfs_extlen_t    xxlen;
+
+                       if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
+                                       &i)))
+                               goto error0;
+                       XFS_WANT_CORRUPTED_GOTO(
+                               i == 1 && xxbno == ltbno && xxlen == ltlen,
+                               error0);
+               }
+#endif
+               /*
+                * Update remaining by-block entry to the new, joined block.
+                */
+               nbno = ltbno;
+               nlen = len + ltlen + gtlen;
+               if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+                       goto error0;
+       }
+       /*
+        * Have only a left contiguous neighbor.
+        * Merge it together with the new freespace.
+        */
+       else if (haveleft) {
+               /*
+                * Delete the old by-size entry on the left.
+                */
+               if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               if ((error = xfs_btree_delete(cnt_cur, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               /*
+                * Back up the by-block cursor to the left neighbor, and
+                * update its length.
+                */
+               if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               nbno = ltbno;
+               nlen = len + ltlen;
+               if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+                       goto error0;
+       }
+       /*
+        * Have only a right contiguous neighbor.
+        * Merge it together with the new freespace.
+        */
+       else if (haveright) {
+               /*
+                * Delete the old by-size entry on the right.
+                */
+               if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               if ((error = xfs_btree_delete(cnt_cur, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               /*
+                * Update the starting block and length of the right
+                * neighbor in the by-block tree.
+                */
+               nbno = bno;
+               nlen = len + gtlen;
+               if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+                       goto error0;
+       }
+       /*
+        * No contiguous neighbors.
+        * Insert the new freespace into the by-block tree.
+        */
+       else {
+               nbno = bno;
+               nlen = len;
+               if ((error = xfs_btree_insert(bno_cur, &i)))
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+       }
+       xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+       bno_cur = NULL;
+       /*
+        * In all cases we need to insert the new freespace in the by-size tree.
+        */
+       if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
+               goto error0;
+       XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
+       if ((error = xfs_btree_insert(cnt_cur, &i)))
+               goto error0;
+       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+       cnt_cur = NULL;
+
+       /*
+        * Update the freespace totals in the ag and superblock.
+        */
+       pag = xfs_perag_get(mp, agno);
+       error = xfs_alloc_update_counters(tp, pag, agbp, len);
+       xfs_perag_put(pag);
+       if (error)
+               goto error0;
+
+       if (!isfl)
+               xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
+       XFS_STATS_INC(xs_freex);
+       XFS_STATS_ADD(xs_freeb, len);
+
+       trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
+
+       return 0;
+
+ error0:
+       trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
+       if (bno_cur)
+               xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+       if (cnt_cur)
+               xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+       return error;
+}
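+
+/*
+ * Worked example (editorial note, not part of this patch): freeing
+ * bno = 100, len = 10 while the by-bno tree holds a left neighbor
+ * [90, 10] and a right neighbor [110, 5] takes the haveleft &&
+ * haveright branch above: both old by-size records and the right
+ * by-block record are deleted, the left by-block record is updated
+ * to [90, 25], and a single [90, 25] record is inserted into the
+ * by-size tree.
+ */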
+
+/*
+ * Visible (exported) allocation/free functions.
+ * Some of these are used just by xfs_alloc_btree.c and this file.
+ */
+
+/*
+ * Compute and fill in value of m_ag_maxlevels.
+ */
+void
+xfs_alloc_compute_maxlevels(
+       xfs_mount_t     *mp)    /* file system mount structure */
+{
+       int             level;
+       uint            maxblocks;
+       uint            maxleafents;
+       int             minleafrecs;
+       int             minnoderecs;
+
+       maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
+       minleafrecs = mp->m_alloc_mnr[0];
+       minnoderecs = mp->m_alloc_mnr[1];
+       maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+       for (level = 1; maxblocks > 1; level++)
+               maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+       mp->m_ag_maxlevels = level;
+}
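+
+/*
+ * Worked example (editorial note, with hypothetical record counts):
+ * for sb_agblocks = 1000000, maxleafents = 500000.  If
+ * m_alloc_mnr[0] = 468 and m_alloc_mnr[1] = 334, the leaf level
+ * needs ceil(500000 / 468) = 1069 blocks, the next level
+ * ceil(1069 / 334) = 4, the next 1, giving m_ag_maxlevels = 3.
+ */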
+
+/*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(
+       struct xfs_mount        *mp,
+       struct xfs_perag        *pag)
+{
+       xfs_extlen_t            need, delta = 0;
+
+       need = XFS_MIN_FREELIST_PAG(pag, mp);
+       if (need > pag->pagf_flcount)
+               delta = need - pag->pagf_flcount;
+
+       if (pag->pagf_longest > delta)
+               return pag->pagf_longest - delta;
+       return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
+}
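+
+/*
+ * Editorial note: the final return above folds two cases into one
+ * expression.  When the longest extent cannot cover the freelist
+ * shortfall (delta), the function conservatively reports a length of
+ * 1 if any free space exists at all (nonzero pagf_flcount or
+ * pagf_longest) and 0 otherwise.
+ */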
+
+/*
+ * Decide whether to use this allocation group for this allocation.
+ * If so, fix up the btree freelist's size.
+ */
+STATIC int                     /* error */
+xfs_alloc_fix_freelist(
+       xfs_alloc_arg_t *args,  /* allocation argument structure */
+       int             flags)  /* XFS_ALLOC_FLAG_... */
+{
+       xfs_buf_t       *agbp;  /* agf buffer pointer */
+       xfs_agf_t       *agf;   /* a.g. freespace structure pointer */
+       xfs_buf_t       *agflbp;/* agfl buffer pointer */
+       xfs_agblock_t   bno;    /* freelist block */
+       xfs_extlen_t    delta;  /* new blocks needed in freelist */
+       int             error;  /* error result code */
+       xfs_extlen_t    longest;/* longest extent in allocation group */
+       xfs_mount_t     *mp;    /* file system mount point structure */
+       xfs_extlen_t    need;   /* total blocks needed in freelist */
+       xfs_perag_t     *pag;   /* per-ag information structure */
+       xfs_alloc_arg_t targs;  /* local allocation arguments */
+       xfs_trans_t     *tp;    /* transaction pointer */
+
+       mp = args->mp;
+
+       pag = args->pag;
+       tp = args->tp;
+       if (!pag->pagf_init) {
+               if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
+                               &agbp)))
+                       return error;
+               if (!pag->pagf_init) {
+                       ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
+                       ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+                       args->agbp = NULL;
+                       return 0;
+               }
+       } else
+               agbp = NULL;
+
+       /*
+        * If this is a metadata-preferred pag and we are allocating user
+        * data, then try somewhere else, unless we are being asked to try
+        * harder at this point.
+        */
+       if (pag->pagf_metadata && args->userdata &&
+           (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
+               ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+               args->agbp = NULL;
+               return 0;
+       }
+
+       if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+               /*
+                * If it looks like there isn't a long enough extent, or enough
+                * total blocks, reject it.
+                */
+               need = XFS_MIN_FREELIST_PAG(pag, mp);
+               longest = xfs_alloc_longest_free_extent(mp, pag);
+               if ((args->minlen + args->alignment + args->minalignslop - 1) >
+                               longest ||
+                   ((int)(pag->pagf_freeblks + pag->pagf_flcount -
+                          need - args->total) < (int)args->minleft)) {
+                       if (agbp)
+                               xfs_trans_brelse(tp, agbp);
+                       args->agbp = NULL;
+                       return 0;
+               }
+       }
+
+       /*
+        * Get the a.g. freespace buffer.
+        * Can fail if we're not blocking on locks, and it's held.
+        */
+       if (agbp == NULL) {
+               if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
+                               &agbp)))
+                       return error;
+               if (agbp == NULL) {
+                       ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
+                       ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+                       args->agbp = NULL;
+                       return 0;
+               }
+       }
+       /*
+        * Figure out how many blocks we should have in the freelist.
+        */
+       agf = XFS_BUF_TO_AGF(agbp);
+       need = XFS_MIN_FREELIST(agf, mp);
+       /*
+        * If there isn't enough total space or a single extent that is
+        * long enough, reject it.
+        */
+       if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+               delta = need > be32_to_cpu(agf->agf_flcount) ?
+                       (need - be32_to_cpu(agf->agf_flcount)) : 0;
+               longest = be32_to_cpu(agf->agf_longest);
+               longest = (longest > delta) ? (longest - delta) :
+                       (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
+               if ((args->minlen + args->alignment + args->minalignslop - 1) >
+                               longest ||
+                   ((int)(be32_to_cpu(agf->agf_freeblks) +
+                    be32_to_cpu(agf->agf_flcount) - need - args->total) <
+                               (int)args->minleft)) {
+                       xfs_trans_brelse(tp, agbp);
+                       args->agbp = NULL;
+                       return 0;
+               }
+       }
+       /*
+        * Make the freelist shorter if it's too long.
+        */
+       while (be32_to_cpu(agf->agf_flcount) > need) {
+               xfs_buf_t       *bp;
+
+               error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
+               if (error)
+                       return error;
+               if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
+                       return error;
+               bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
+               xfs_trans_binval(tp, bp);
+       }
+       /*
+        * Initialize the args structure.
+        */
+       memset(&targs, 0, sizeof(targs));
+       targs.tp = tp;
+       targs.mp = mp;
+       targs.agbp = agbp;
+       targs.agno = args->agno;
+       targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
+       targs.type = XFS_ALLOCTYPE_THIS_AG;
+       targs.pag = pag;
+       if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
+               return error;
+       /*
+        * Make the freelist longer if it's too short.
+        */
+       while (be32_to_cpu(agf->agf_flcount) < need) {
+               targs.agbno = 0;
+               targs.maxlen = need - be32_to_cpu(agf->agf_flcount);
+               /*
+                * Allocate as many blocks as possible at once.
+                */
+               if ((error = xfs_alloc_ag_vextent(&targs))) {
+                       xfs_trans_brelse(tp, agflbp);
+                       return error;
+               }
+               /*
+                * Stop if we run out.  Won't happen if callers are obeying
+                * the restrictions correctly.  Can happen for free calls
+                * on a completely full ag.
+                */
+               if (targs.agbno == NULLAGBLOCK) {
+                       if (flags & XFS_ALLOC_FLAG_FREEING)
+                               break;
+                       xfs_trans_brelse(tp, agflbp);
+                       args->agbp = NULL;
+                       return 0;
+               }
+               /*
+                * Put each allocated block on the list.
+                */
+               for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
+                       error = xfs_alloc_put_freelist(tp, agbp,
+                                                       agflbp, bno, 0);
+                       if (error)
+                               return error;
+               }
+       }
+       xfs_trans_brelse(tp, agflbp);
+       args->agbp = agbp;
+       return 0;
+}
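+
+/*
+ * Worked example (editorial note, not part of this patch): if
+ * XFS_MIN_FREELIST() says need = 6 and agf_flcount is 4, the grow
+ * loop above allocates with targs.maxlen = 2 and puts each returned
+ * block on the freelist; had agf_flcount been 8 instead, the shrink
+ * loop would have pulled 2 blocks off the list, one at a time, and
+ * freed each back into the btrees with xfs_free_ag_extent().
+ */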
+
+/*
+ * Get a block from the freelist.
+ * Returns the block number in *bnop; NULLAGBLOCK if the freelist is empty.
+ */
+int                            /* error */
+xfs_alloc_get_freelist(
+       xfs_trans_t     *tp,    /* transaction pointer */
+       xfs_buf_t       *agbp,  /* buffer containing the agf structure */
+       xfs_agblock_t   *bnop,  /* block address retrieved from freelist */
+       int             btreeblk) /* destination is an AGF btree */
+{
+       xfs_agf_t       *agf;   /* a.g. freespace structure */
+       xfs_buf_t       *agflbp;/* buffer for a.g. freelist structure */
+       xfs_agblock_t   bno;    /* block number returned */
+       __be32          *agfl_bno;
+       int             error;
+       int             logflags;
+       xfs_mount_t     *mp = tp->t_mountp;
+       xfs_perag_t     *pag;   /* per allocation group data */
+
+       /*
+        * Freelist is empty, give up.
+        */
+       agf = XFS_BUF_TO_AGF(agbp);
+       if (!agf->agf_flcount) {
+               *bnop = NULLAGBLOCK;
+               return 0;
+       }
+       /*
+        * Read the array of free blocks.
+        */
+       error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
+                                   &agflbp);
+       if (error)
+               return error;
+
+       /*
+        * Get the block number and update the data structures.
+        */
+       agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+       bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+       be32_add_cpu(&agf->agf_flfirst, 1);
+       xfs_trans_brelse(tp, agflbp);
+       if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
+               agf->agf_flfirst = 0;
+
+       pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+       be32_add_cpu(&agf->agf_flcount, -1);
+       xfs_trans_agflist_delta(tp, -1);
+       pag->pagf_flcount--;
+
+       logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
+       if (btreeblk) {
+               be32_add_cpu(&agf->agf_btreeblks, 1);
+               pag->pagf_btreeblks++;
+               logflags |= XFS_AGF_BTREEBLKS;
+       }
+       /* drop the perag reference only after the last pagf_ update */
+       xfs_perag_put(pag);
+
+       xfs_alloc_log_agf(tp, agbp, logflags);
+       *bnop = bno;
+
+       return 0;
+}
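+
+/*
+ * Editorial note: the AGFL is a circular array of XFS_AGFL_SIZE(mp)
+ * block numbers.  agf_flfirst indexes its head: the code above reads
+ * agfl_bno[agf_flfirst], advances agf_flfirst, and wraps it back to
+ * 0 at XFS_AGFL_SIZE(mp).  agf_fllast, maintained the same way in
+ * xfs_alloc_put_freelist() below, is the matching tail index.
+ */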
+
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+       xfs_trans_t     *tp,    /* transaction pointer */
+       xfs_buf_t       *bp,    /* buffer for a.g. freelist header */
+       int             fields) /* mask of fields to be logged (XFS_AGF_...) */
+{
+       int     first;          /* first byte offset */
+       int     last;           /* last byte offset */
+       static const short      offsets[] = {
+               offsetof(xfs_agf_t, agf_magicnum),
+               offsetof(xfs_agf_t, agf_versionnum),
+               offsetof(xfs_agf_t, agf_seqno),
+               offsetof(xfs_agf_t, agf_length),
+               offsetof(xfs_agf_t, agf_roots[0]),
+               offsetof(xfs_agf_t, agf_levels[0]),
+               offsetof(xfs_agf_t, agf_flfirst),
+               offsetof(xfs_agf_t, agf_fllast),
+               offsetof(xfs_agf_t, agf_flcount),
+               offsetof(xfs_agf_t, agf_freeblks),
+               offsetof(xfs_agf_t, agf_longest),
+               offsetof(xfs_agf_t, agf_btreeblks),
+               offsetof(xfs_agf_t, agf_uuid),
+               sizeof(xfs_agf_t)
+       };
+
+       trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+
+       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
+
+       xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
+       xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
+}
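+
+/*
+ * Worked example (editorial note): logging fields =
+ * XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT makes xfs_btree_offsets() return
+ * first = offsetof(xfs_agf_t, agf_flfirst) and last = the byte just
+ * before agf_freeblks, so the single xfs_trans_log_buf() call dirties
+ * one contiguous range covering both fields (and agf_fllast, which
+ * lies between them).
+ */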
+
+/*
+ * Interface for inode allocation to force the pag data to be initialized.
+ */
+int                                    /* error */
+xfs_alloc_pagf_init(
+       xfs_mount_t             *mp,    /* file system mount structure */
+       xfs_trans_t             *tp,    /* transaction pointer */
+       xfs_agnumber_t          agno,   /* allocation group number */
+       int                     flags)  /* XFS_ALLOC_FLAGS_... */
+{
+       xfs_buf_t               *bp;
+       int                     error;
+
+       if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp)))
+               return error;
+       if (bp)
+               xfs_trans_brelse(tp, bp);
+       return 0;
+}
+
+/*
+ * Put the block on the freelist for the allocation group.
+ */
+int                                    /* error */
+xfs_alloc_put_freelist(
+       xfs_trans_t             *tp,    /* transaction pointer */
+       xfs_buf_t               *agbp,  /* buffer for a.g. freelist header */
+       xfs_buf_t               *agflbp,/* buffer for a.g. free block array */
+       xfs_agblock_t           bno,    /* block being freed */
+       int                     btreeblk) /* block came from an AGF btree */
+{
+       xfs_agf_t               *agf;   /* a.g. freespace structure */
+       __be32                  *blockp;/* pointer to array entry */
+       int                     error;
+       int                     logflags;
+       xfs_mount_t             *mp;    /* mount structure */
+       xfs_perag_t             *pag;   /* per allocation group data */
+       __be32                  *agfl_bno;
+       int                     startoff;
+
+       agf = XFS_BUF_TO_AGF(agbp);
+       mp = tp->t_mountp;
+
+       if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
+                       be32_to_cpu(agf->agf_seqno), &agflbp)))
+               return error;
+       be32_add_cpu(&agf->agf_fllast, 1);
+       if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
+               agf->agf_fllast = 0;
+
+       pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+       be32_add_cpu(&agf->agf_flcount, 1);
+       xfs_trans_agflist_delta(tp, 1);
+       pag->pagf_flcount++;
+
+       logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
+       if (btreeblk) {
+               be32_add_cpu(&agf->agf_btreeblks, -1);
+               pag->pagf_btreeblks--;
+               logflags |= XFS_AGF_BTREEBLKS;
+       }
+       xfs_perag_put(pag);
+
+       xfs_alloc_log_agf(tp, agbp, logflags);
+
+       ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
+
+       agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+       blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
+       *blockp = cpu_to_be32(bno);
+       startoff = (char *)blockp - (char *)agflbp->b_addr;
+
+       xfs_alloc_log_agf(tp, agbp, logflags);
+
+       xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
+       xfs_trans_log_buf(tp, agflbp, startoff,
+                         startoff + sizeof(xfs_agblock_t) - 1);
+       return 0;
+}
+
+static bool
+xfs_agf_verify(
+       struct xfs_mount *mp,
+       struct xfs_buf  *bp)
+{
+       struct xfs_agf  *agf = XFS_BUF_TO_AGF(bp);
+
+       if (xfs_sb_version_hascrc(&mp->m_sb) &&
+           !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
+               return false;
+
+       if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+             XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+             be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+             be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+             be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+             be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
+               return false;
+
+       /*
+        * during growfs operations, the perag is not fully initialised,
+        * so we can't use it for any useful checking. growfs ensures we can't
+        * use it by using uncached buffers that don't have the perag attached
+        * so we can detect and avoid this problem.
+        */
+       if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
+               return false;
+
+       if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
+           be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
+               return false;
+
+       return true;
+}
+
+static void
+xfs_agf_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb) &&
+           !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
+               xfs_buf_ioerror(bp, -EFSBADCRC);
+       else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
+                               XFS_ERRTAG_ALLOC_READ_AGF,
+                               XFS_RANDOM_ALLOC_READ_AGF))
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+       if (bp->b_error)
+               xfs_verifier_error(bp);
+}
+
+static void
+xfs_agf_write_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+       if (!xfs_agf_verify(mp, bp)) {
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               return;
+       }
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (bip)
+               XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+       xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agf_buf_ops = {
+       .verify_read = xfs_agf_read_verify,
+       .verify_write = xfs_agf_write_verify,
+};
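+
+/*
+ * Editorial note: this ops vector is how the verifiers get attached
+ * to AGF buffers: xfs_read_agf() below passes &xfs_agf_buf_ops to
+ * xfs_trans_read_buf(), so every read is checked by
+ * xfs_agf_read_verify() and every write by xfs_agf_write_verify()
+ * before the buffer goes to disk.
+ */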
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int                                    /* error */
+xfs_read_agf(
+       struct xfs_mount        *mp,    /* mount point structure */
+       struct xfs_trans        *tp,    /* transaction pointer */
+       xfs_agnumber_t          agno,   /* allocation group number */
+       int                     flags,  /* XFS_BUF_ */
+       struct xfs_buf          **bpp)  /* buffer for the ag freelist header */
+{
+       int             error;
+
+       trace_xfs_read_agf(mp, agno);
+
+       ASSERT(agno != NULLAGNUMBER);
+       error = xfs_trans_read_buf(
+                       mp, tp, mp->m_ddev_targp,
+                       XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+                       XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
+       if (error)
+               return error;
+       if (!*bpp)
+               return 0;
+
+       ASSERT(!(*bpp)->b_error);
+       xfs_buf_set_ref(*bpp, XFS_AGF_REF);
+       return 0;
+}
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int                                    /* error */
+xfs_alloc_read_agf(
+       struct xfs_mount        *mp,    /* mount point structure */
+       struct xfs_trans        *tp,    /* transaction pointer */
+       xfs_agnumber_t          agno,   /* allocation group number */
+       int                     flags,  /* XFS_ALLOC_FLAG_... */
+       struct xfs_buf          **bpp)  /* buffer for the ag freelist header */
+{
+       struct xfs_agf          *agf;           /* ag freelist header */
+       struct xfs_perag        *pag;           /* per allocation group data */
+       int                     error;
+
+       trace_xfs_alloc_read_agf(mp, agno);
+
+       ASSERT(agno != NULLAGNUMBER);
+       error = xfs_read_agf(mp, tp, agno,
+                       (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
+                       bpp);
+       if (error)
+               return error;
+       if (!*bpp)
+               return 0;
+       ASSERT(!(*bpp)->b_error);
+
+       agf = XFS_BUF_TO_AGF(*bpp);
+       pag = xfs_perag_get(mp, agno);
+       if (!pag->pagf_init) {
+               pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
+               pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
+               pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
+               pag->pagf_longest = be32_to_cpu(agf->agf_longest);
+               pag->pagf_levels[XFS_BTNUM_BNOi] =
+                       be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
+               pag->pagf_levels[XFS_BTNUM_CNTi] =
+                       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
+               spin_lock_init(&pag->pagb_lock);
+               pag->pagb_count = 0;
+               pag->pagb_tree = RB_ROOT;
+               pag->pagf_init = 1;
+       }
+#ifdef DEBUG
+       else if (!XFS_FORCED_SHUTDOWN(mp)) {
+               ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
+               ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
+               ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
+               ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
+               ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
+                      be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
+               ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
+                      be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
+       }
+#endif
+       xfs_perag_put(pag);
+       return 0;
+}
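+
+/*
+ * Editorial note: the first successful read through here seeds the
+ * in-core per-ag mirror (pagf_freeblks, pagf_flcount, pagf_longest
+ * and the btree levels) from the on-disk AGF; later reads skip that
+ * work, and DEBUG builds re-verify that the mirror still matches the
+ * buffer unless the filesystem has been shut down.
+ */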
+
+/*
+ * Allocate an extent (variable-size).
+ * Depending on the allocation type, we either look in a single allocation
+ * group or loop over the allocation groups to find the result.
+ */
+int                            /* error */
+xfs_alloc_vextent(
+       xfs_alloc_arg_t *args)  /* allocation argument structure */
+{
+       xfs_agblock_t   agsize; /* allocation group size */
+       int             error;
+       int             flags;  /* XFS_ALLOC_FLAG_... locking flags */
+       xfs_extlen_t    minleft;/* minimum left value, temp copy */
+       xfs_mount_t     *mp;    /* mount structure pointer */
+       xfs_agnumber_t  sagno;  /* starting allocation group number */
+       xfs_alloctype_t type;   /* input allocation type */
+       int             bump_rotor = 0;
+       int             no_min = 0;
+       xfs_agnumber_t  rotorstep = xfs_rotorstep; /* inode32 agf stepper */
+
+       mp = args->mp;
+       type = args->otype = args->type;
+       args->agbno = NULLAGBLOCK;
+       /*
+        * Just fix this up, for the case where the last a.g. is shorter
+        * (or there's only one a.g.) and the caller couldn't easily figure
+        * that out (xfs_bmap_alloc).
+        */
+       agsize = mp->m_sb.sb_agblocks;
+       if (args->maxlen > agsize)
+               args->maxlen = agsize;
+       if (args->alignment == 0)
+               args->alignment = 1;
+       ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
+       ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
+       ASSERT(args->minlen <= args->maxlen);
+       ASSERT(args->minlen <= agsize);
+       ASSERT(args->mod < args->prod);
+       if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
+           XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
+           args->minlen > args->maxlen || args->minlen > agsize ||
+           args->mod >= args->prod) {
+               args->fsbno = NULLFSBLOCK;
+               trace_xfs_alloc_vextent_badargs(args);
+               return 0;
+       }
+       minleft = args->minleft;
+
+       switch (type) {
+       case XFS_ALLOCTYPE_THIS_AG:
+       case XFS_ALLOCTYPE_NEAR_BNO:
+       case XFS_ALLOCTYPE_THIS_BNO:
+               /*
+                * These three force us into a single a.g.
+                */
+               args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+               args->pag = xfs_perag_get(mp, args->agno);
+               args->minleft = 0;
+               error = xfs_alloc_fix_freelist(args, 0);
+               args->minleft = minleft;
+               if (error) {
+                       trace_xfs_alloc_vextent_nofix(args);
+                       goto error0;
+               }
+               if (!args->agbp) {
+                       trace_xfs_alloc_vextent_noagbp(args);
+                       break;
+               }
+               args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+               if ((error = xfs_alloc_ag_vextent(args)))
+                       goto error0;
+               break;
+       case XFS_ALLOCTYPE_START_BNO:
+               /*
+                * Try near allocation first, then anywhere-in-ag after
+                * the first a.g. fails.
+                */
+               if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) &&
+                   (mp->m_flags & XFS_MOUNT_32BITINODES)) {
+                       args->fsbno = XFS_AGB_TO_FSB(mp,
+                                       ((mp->m_agfrotor / rotorstep) %
+                                       mp->m_sb.sb_agcount), 0);
+                       bump_rotor = 1;
+               }
+               args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+               args->type = XFS_ALLOCTYPE_NEAR_BNO;
+               /* FALLTHROUGH */
+       case XFS_ALLOCTYPE_ANY_AG:
+       case XFS_ALLOCTYPE_START_AG:
+       case XFS_ALLOCTYPE_FIRST_AG:
+               /*
+                * Rotate through the allocation groups looking for a winner.
+                */
+               if (type == XFS_ALLOCTYPE_ANY_AG) {
+                       /*
+                        * Start with the last place we left off.
+                        */
+                       args->agno = sagno = (mp->m_agfrotor / rotorstep) %
+                                       mp->m_sb.sb_agcount;
+                       args->type = XFS_ALLOCTYPE_THIS_AG;
+                       flags = XFS_ALLOC_FLAG_TRYLOCK;
+               } else if (type == XFS_ALLOCTYPE_FIRST_AG) {
+                       /*
+                        * Start with allocation group given by bno.
+                        */
+                       args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+                       args->type = XFS_ALLOCTYPE_THIS_AG;
+                       sagno = 0;
+                       flags = 0;
+               } else {
+                       if (type == XFS_ALLOCTYPE_START_AG)
+                               args->type = XFS_ALLOCTYPE_THIS_AG;
+                       /*
+                        * Start with the given allocation group.
+                        */
+                       args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+                       flags = XFS_ALLOC_FLAG_TRYLOCK;
+               }
+               /*
+                * Loop over allocation groups twice; first time with
+                * trylock set, second time without.
+                */
+               for (;;) {
+                       args->pag = xfs_perag_get(mp, args->agno);
+                       if (no_min)
+                               args->minleft = 0;
+                       error = xfs_alloc_fix_freelist(args, flags);
+                       args->minleft = minleft;
+                       if (error) {
+                               trace_xfs_alloc_vextent_nofix(args);
+                               goto error0;
+                       }
+                       /*
+                        * If we get a buffer back then the allocation will fly.
+                        */
+                       if (args->agbp) {
+                               if ((error = xfs_alloc_ag_vextent(args)))
+                                       goto error0;
+                               break;
+                       }
+
+                       trace_xfs_alloc_vextent_loopfailed(args);
+
+                       /*
+                        * Didn't work, figure out the next iteration.
+                        */
+                       if (args->agno == sagno &&
+                           type == XFS_ALLOCTYPE_START_BNO)
+                               args->type = XFS_ALLOCTYPE_THIS_AG;
+                       /*
+                        * For the first allocation, we can try any AG to get
+                        * space.  However, if we already have allocated a
+                        * block, we don't want to try AGs whose number is below
+                        * sagno. Otherwise, we may end up with out-of-order
+                        * locking of AGF, which might cause deadlock.
+                        */
+                       if (++(args->agno) == mp->m_sb.sb_agcount) {
+                               if (args->firstblock != NULLFSBLOCK)
+                                       args->agno = sagno;
+                               else
+                                       args->agno = 0;
+                       }
+                       /*
+                        * Reached the starting a.g., must either be done
+                        * or switch to non-trylock mode.
+                        */
+                       if (args->agno == sagno) {
+                               if (no_min == 1) {
+                                       args->agbno = NULLAGBLOCK;
+                                       trace_xfs_alloc_vextent_allfailed(args);
+                                       break;
+                               }
+                               if (flags == 0) {
+                                       no_min = 1;
+                               } else {
+                                       flags = 0;
+                                       if (type == XFS_ALLOCTYPE_START_BNO) {
+                                               args->agbno = XFS_FSB_TO_AGBNO(mp,
+                                                       args->fsbno);
+                                               args->type = XFS_ALLOCTYPE_NEAR_BNO;
+                                       }
+                               }
+                       }
+                       xfs_perag_put(args->pag);
+               }
+               if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
+                       if (args->agno == sagno)
+                               mp->m_agfrotor = (mp->m_agfrotor + 1) %
+                                       (mp->m_sb.sb_agcount * rotorstep);
+                       else
+                               mp->m_agfrotor = (args->agno * rotorstep + 1) %
+                                       (mp->m_sb.sb_agcount * rotorstep);
+               }
+               break;
+       default:
+               ASSERT(0);
+               /* NOTREACHED */
+       }
+       if (args->agbno == NULLAGBLOCK)
+               args->fsbno = NULLFSBLOCK;
+       else {
+               args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
+#ifdef DEBUG
+               ASSERT(args->len >= args->minlen);
+               ASSERT(args->len <= args->maxlen);
+               ASSERT(args->agbno % args->alignment == 0);
+               XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
+                       args->len);
+#endif
+       }
+       xfs_perag_put(args->pag);
+       return 0;
+error0:
+       xfs_perag_put(args->pag);
+       return error;
+}
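For illustration, a minimal standalone sketch (not part of this commit) of the AG wrap-around order the rotor loop above follows; the kernel version additionally makes two passes, first with trylock set and then blocking, and pins agno to sagno once firstblock is set to preserve AGF lock ordering. All names and values below are assumptions for the sketch only:

#include <stdio.h>

/* Visit AGs starting at sagno, wrapping at agcount, until back at sagno. */
static void walk_ags(unsigned int sagno, unsigned int agcount)
{
	unsigned int agno = sagno;

	for (;;) {
		printf("try AG %u\n", agno);
		if (++agno == agcount)
			agno = 0;		/* wrap past the last AG */
		if (agno == sagno)		/* back at the start: one full pass done */
			break;
	}
}

int main(void)
{
	walk_ags(2, 4);		/* prints AGs 2, 3, 0, 1 */
	return 0;
}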
+
+/*
+ * Free an extent.
+ * Just break up the extent address and hand off to xfs_free_ag_extent
+ * after fixing up the freelist.
+ */
+int                            /* error */
+xfs_free_extent(
+       xfs_trans_t     *tp,    /* transaction pointer */
+       xfs_fsblock_t   bno,    /* starting block number of extent */
+       xfs_extlen_t    len)    /* length of extent */
+{
+       xfs_alloc_arg_t args;
+       int             error;
+
+       ASSERT(len != 0);
+       memset(&args, 0, sizeof(xfs_alloc_arg_t));
+       args.tp = tp;
+       args.mp = tp->t_mountp;
+
+       /*
+        * validate that the block number is legal - this enables us to detect
+        * and handle a silent filesystem corruption rather than crashing.
+        */
+       args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
+       if (args.agno >= args.mp->m_sb.sb_agcount)
+               return -EFSCORRUPTED;
+
+       args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
+       if (args.agbno >= args.mp->m_sb.sb_agblocks)
+               return -EFSCORRUPTED;
+
+       args.pag = xfs_perag_get(args.mp, args.agno);
+       ASSERT(args.pag);
+
+       error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+       if (error)
+               goto error0;
+
+       /* validate the extent size is legal now that we have the AGF locked */
+       if (args.agbno + len >
+                       be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
+               error = -EFSCORRUPTED;
+               goto error0;
+       }
+
+       error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
+       if (!error)
+               xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
+error0:
+       xfs_perag_put(args.pag);
+       return error;
+}
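The agno/agbno split above comes from the high and low bits of the filesystem block number. A minimal standalone sketch of what XFS_FSB_TO_AGNO()/XFS_FSB_TO_AGBNO() compute, assuming a made-up geometry of 2^32 blocks per AG (sb_agblklog = 32):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t fsbno = 0x300000042ULL;	/* example filesystem block number */
	unsigned int agblklog = 32;		/* log2(blocks per AG), assumed */

	uint32_t agno  = (uint32_t)(fsbno >> agblklog);			/* which AG */
	uint32_t agbno = (uint32_t)(fsbno & ((1ULL << agblklog) - 1));	/* offset in AG */

	printf("agno=%u agbno=0x%x\n", agno, agbno);	/* agno=3 agbno=0x42 */
	return 0;
}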
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
new file mode 100644 (file)
index 0000000..feacb06
--- /dev/null
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ALLOC_H__
+#define        __XFS_ALLOC_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_trans;
+
+extern struct workqueue_struct *xfs_alloc_wq;
+
+/*
+ * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
+ */
+#define XFS_ALLOCTYPE_ANY_AG   0x01    /* allocate anywhere, use rotor */
+#define XFS_ALLOCTYPE_FIRST_AG 0x02    /* ... start at ag 0 */
+#define XFS_ALLOCTYPE_START_AG 0x04    /* anywhere, start in this a.g. */
+#define XFS_ALLOCTYPE_THIS_AG  0x08    /* anywhere in this a.g. */
+#define XFS_ALLOCTYPE_START_BNO        0x10    /* near this block else anywhere */
+#define XFS_ALLOCTYPE_NEAR_BNO 0x20    /* in this a.g. and near this block */
+#define XFS_ALLOCTYPE_THIS_BNO 0x40    /* at exactly this block */
+
+/* this should become an enum again when the tracing code is fixed */
+typedef unsigned int xfs_alloctype_t;
+
+#define XFS_ALLOC_TYPES \
+       { XFS_ALLOCTYPE_ANY_AG,         "ANY_AG" }, \
+       { XFS_ALLOCTYPE_FIRST_AG,       "FIRST_AG" }, \
+       { XFS_ALLOCTYPE_START_AG,       "START_AG" }, \
+       { XFS_ALLOCTYPE_THIS_AG,        "THIS_AG" }, \
+       { XFS_ALLOCTYPE_START_BNO,      "START_BNO" }, \
+       { XFS_ALLOCTYPE_NEAR_BNO,       "NEAR_BNO" }, \
+       { XFS_ALLOCTYPE_THIS_BNO,       "THIS_BNO" }
+
+/*
+ * Flags for xfs_alloc_fix_freelist.
+ */
+#define        XFS_ALLOC_FLAG_TRYLOCK  0x00000001  /* use trylock for buffer locking */
+#define        XFS_ALLOC_FLAG_FREEING  0x00000002  /* indicate caller is freeing extents */
+
+/*
+ * In order to avoid ENOSPC-related deadlock caused by
+ * out-of-order locking of AGF buffer (PV 947395), we place
+ * constraints on the relationship among actual allocations for
+ * data blocks, freelist blocks, and potential file data bmap
+ * btree blocks. However, these restrictions may result in no
+ * actual space allocated for a delayed extent, for example, a data
+ * block in a certain AG is allocated but there is no additional
+ * block for the additional bmap btree block due to a split of the
+ * bmap btree of the file. The result of this may lead to an
+ * infinite loop in xfssyncd when the file gets flushed to disk and
+ * all delayed extents need to be actually allocated. To get around
+ * this, we explicitly set aside a few blocks which will not be
+ * reserved in delayed allocation. Considering the minimum number of
+ * needed freelist blocks is 4 fsbs _per AG_, and a potential split of the
+ * file's bmap btree requires 1 fsb, so we set the number of set-aside blocks
+ * to 4 + 4*agcount.
+ */
+#define XFS_ALLOC_SET_ASIDE(mp)  (4 + ((mp)->m_sb.sb_agcount * 4))
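For example, with sb_agcount = 4 this evaluates to 4 + 4 * 4 = 20 filesystem blocks withheld from delayed-allocation reservations.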
+
+/*
+ * When deciding how much space to allocate out of an AG, we limit the
+ * allocation maximum size to the size of the AG. However, we cannot use all the
+ * blocks in the AG - some are permanently used by metadata. These
+ * blocks are generally:
+ *     - the AG superblock, AGF, AGI and AGFL
+ *     - the AGF (bno and cnt) and AGI btree root blocks
+ *     - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
+ *
+ * The AG headers are sector sized, so the amount of space they take up is
+ * dependent on filesystem geometry. The others are all single blocks.
+ */
+#define XFS_ALLOC_AG_MAX_USABLE(mp)    \
+       ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
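As a worked example, with 512-byte sectors and 4 KB blocks the four sector-sized headers round up to XFS_BB_TO_FSB(XFS_FSS_TO_BB(mp, 4)) = 1 block, so the usable maximum is sb_agblocks - 1 - 7, the trailing 7 covering the three btree root blocks plus the four set-aside AGFL blocks.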
+
+
+/*
+ * Argument structure for xfs_alloc routines.
+ * This is turned into a structure to avoid having 20 arguments passed
+ * down several levels of the stack.
+ */
+typedef struct xfs_alloc_arg {
+       struct xfs_trans *tp;           /* transaction pointer */
+       struct xfs_mount *mp;           /* file system mount point */
+       struct xfs_buf  *agbp;          /* buffer for a.g. freelist header */
+       struct xfs_perag *pag;          /* per-ag struct for this agno */
+       xfs_fsblock_t   fsbno;          /* file system block number */
+       xfs_agnumber_t  agno;           /* allocation group number */
+       xfs_agblock_t   agbno;          /* allocation group-relative block # */
+       xfs_extlen_t    minlen;         /* minimum size of extent */
+       xfs_extlen_t    maxlen;         /* maximum size of extent */
+       xfs_extlen_t    mod;            /* mod value for extent size */
+       xfs_extlen_t    prod;           /* prod value for extent size */
+       xfs_extlen_t    minleft;        /* min blocks must be left after us */
+       xfs_extlen_t    total;          /* total blocks needed in xaction */
+       xfs_extlen_t    alignment;      /* align answer to multiple of this */
+       xfs_extlen_t    minalignslop;   /* slop for minlen+alignment calcs */
+       xfs_extlen_t    len;            /* output: actual size of extent */
+       xfs_alloctype_t type;           /* allocation type XFS_ALLOCTYPE_... */
+       xfs_alloctype_t otype;          /* original allocation type */
+       char            wasdel;         /* set if allocation was prev delayed */
+       char            wasfromfl;      /* set if allocation is from freelist */
+       char            isfl;           /* set if is freelist blocks - !acctg */
+       char            userdata;       /* set if this is user data */
+       xfs_fsblock_t   firstblock;     /* in/out: first block allocated */
+} xfs_alloc_arg_t;
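A hedged sketch of how a caller typically fills this structure before xfs_alloc_vextent(); it mirrors the xfs_free_extent() pattern earlier in this series, but the hint variables are assumptions for illustration, not taken from any specific in-tree call site:

	struct xfs_alloc_arg	args;
	int			error;

	memset(&args, 0, sizeof(args));
	args.tp = tp;				/* current transaction (assumed) */
	args.mp = tp->t_mountp;
	args.fsbno = target_fsbno;		/* allocation hint (assumed variable) */
	args.type = XFS_ALLOCTYPE_START_BNO;	/* near the hint, else anywhere */
	args.minlen = 1;
	args.maxlen = want_blocks;		/* assumed variable, >= minlen */
	args.prod = 1;				/* no extent-size multiple; mod stays 0 */
	args.alignment = 1;
	error = xfs_alloc_vextent(&args);
	if (!error && args.fsbno != NULLFSBLOCK) {
		/* got args.len blocks starting at args.fsbno */
	}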
+
+/*
+ * Defines for userdata
+ */
+#define XFS_ALLOC_USERDATA             1       /* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA    2       /* special case start of file */
+
+/*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+               struct xfs_perag *pag);
+
+/*
+ * Compute and fill in value of m_ag_maxlevels.
+ */
+void
+xfs_alloc_compute_maxlevels(
+       struct xfs_mount        *mp);   /* file system mount structure */
+
+/*
+ * Get a block from the freelist.
+ * Returns with the buffer for the block gotten.
+ */
+int                            /* error */
+xfs_alloc_get_freelist(
+       struct xfs_trans *tp,   /* transaction pointer */
+       struct xfs_buf  *agbp,  /* buffer containing the agf structure */
+       xfs_agblock_t   *bnop,  /* block address retrieved from freelist */
+       int             btreeblk); /* destination is an AGF btree */
+
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+       struct xfs_trans *tp,   /* transaction pointer */
+       struct xfs_buf  *bp,    /* buffer for a.g. freelist header */
+       int             fields);/* mask of fields to be logged (XFS_AGF_...) */
+
+/*
+ * Interface for inode allocation to force the pag data to be initialized.
+ */
+int                            /* error */
+xfs_alloc_pagf_init(
+       struct xfs_mount *mp,   /* file system mount structure */
+       struct xfs_trans *tp,   /* transaction pointer */
+       xfs_agnumber_t  agno,   /* allocation group number */
+       int             flags); /* XFS_ALLOC_FLAGS_... */
+
+/*
+ * Put the block on the freelist for the allocation group.
+ */
+int                            /* error */
+xfs_alloc_put_freelist(
+       struct xfs_trans *tp,   /* transaction pointer */
+       struct xfs_buf  *agbp,  /* buffer for a.g. freelist header */
+       struct xfs_buf  *agflbp,/* buffer for a.g. free block array */
+       xfs_agblock_t   bno,    /* block being freed */
+       int             btreeblk); /* owner was an AGF btree */
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int                                    /* error  */
+xfs_alloc_read_agf(
+       struct xfs_mount *mp,           /* mount point structure */
+       struct xfs_trans *tp,           /* transaction pointer */
+       xfs_agnumber_t  agno,           /* allocation group number */
+       int             flags,          /* XFS_ALLOC_FLAG_... */
+       struct xfs_buf  **bpp);         /* buffer for the ag freelist header */
+
+/*
+ * Allocate an extent (variable-size).
+ */
+int                            /* error */
+xfs_alloc_vextent(
+       xfs_alloc_arg_t *args); /* allocation argument structure */
+
+/*
+ * Free an extent.
+ */
+int                            /* error */
+xfs_free_extent(
+       struct xfs_trans *tp,   /* transaction pointer */
+       xfs_fsblock_t   bno,    /* starting block number of extent */
+       xfs_extlen_t    len);   /* length of extent */
+
+int                                    /* error */
+xfs_alloc_lookup_le(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* starting block of extent */
+       xfs_extlen_t            len,    /* length of extent */
+       int                     *stat); /* success/failure */
+
+int                            /* error */
+xfs_alloc_lookup_ge(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* starting block of extent */
+       xfs_extlen_t            len,    /* length of extent */
+       int                     *stat); /* success/failure */
+
+int                                    /* error */
+xfs_alloc_get_rec(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           *bno,   /* output: starting block of extent */
+       xfs_extlen_t            *len,   /* output: length of extent */
+       int                     *stat); /* output: success/failure */
+
+#endif /* __XFS_ALLOC_H__ */
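A hedged sketch of walking the by-block free-space records with the cursor helpers declared above; it assumes cur is a bnobt cursor obtained from xfs_allocbt_init_cursor() and that xfs_btree_increment() from xfs_btree.h is available, with error handling trimmed for brevity:

	xfs_agblock_t	bno;
	xfs_extlen_t	len;
	int		stat, error;

	error = xfs_alloc_lookup_ge(cur, 0, 0, &stat);	/* position at first record */
	while (!error && stat) {
		error = xfs_alloc_get_rec(cur, &bno, &len, &stat);
		if (error || !stat)
			break;
		/* free extent [bno, bno + len) in this AG */
		error = xfs_btree_increment(cur, 0, &stat);
	}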
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
new file mode 100644 (file)
index 0000000..e0e83e2
--- /dev/null
@@ -0,0 +1,504 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+
+
+STATIC struct xfs_btree_cur *
+xfs_allocbt_dup_cursor(
+       struct xfs_btree_cur    *cur)
+{
+       return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
+                       cur->bc_private.a.agbp, cur->bc_private.a.agno,
+                       cur->bc_btnum);
+}
+
+STATIC void
+xfs_allocbt_set_root(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       int                     inc)
+{
+       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
+       xfs_agnumber_t          seqno = be32_to_cpu(agf->agf_seqno);
+       int                     btnum = cur->bc_btnum;
+       struct xfs_perag        *pag = xfs_perag_get(cur->bc_mp, seqno);
+
+       ASSERT(ptr->s != 0);
+
+       agf->agf_roots[btnum] = ptr->s;
+       be32_add_cpu(&agf->agf_levels[btnum], inc);
+       pag->pagf_levels[btnum] += inc;
+       xfs_perag_put(pag);
+
+       xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+}
+
+STATIC int
+xfs_allocbt_alloc_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *start,
+       union xfs_btree_ptr     *new,
+       int                     *stat)
+{
+       int                     error;
+       xfs_agblock_t           bno;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+       /* Allocate the new block from the freelist. If we can't, give up.  */
+       error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+                                      &bno, 1);
+       if (error) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+               return error;
+       }
+
+       if (bno == NULLAGBLOCK) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
+       }
+
+       xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
+
+       xfs_trans_agbtree_delta(cur->bc_tp, 1);
+       new->s = cpu_to_be32(bno);
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+}
+
+STATIC int
+xfs_allocbt_free_block(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp)
+{
+       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
+       xfs_agblock_t           bno;
+       int                     error;
+
+       bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
+       error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+       if (error)
+               return error;
+
+       xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+                             XFS_EXTENT_BUSY_SKIP_DISCARD);
+       xfs_trans_agbtree_delta(cur->bc_tp, -1);
+
+       xfs_trans_binval(cur->bc_tp, bp);
+       return 0;
+}
+
+/*
+ * Update the longest extent in the AGF
+ */
+STATIC void
+xfs_allocbt_update_lastrec(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       union xfs_btree_rec     *rec,
+       int                     ptr,
+       int                     reason)
+{
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+       xfs_agnumber_t          seqno = be32_to_cpu(agf->agf_seqno);
+       struct xfs_perag        *pag;
+       __be32                  len;
+       int                     numrecs;
+
+       ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+
+       switch (reason) {
+       case LASTREC_UPDATE:
+               /*
+                * If this is the last leaf block and it's the last record,
+                * then update the size of the longest extent in the AG.
+                */
+               if (ptr != xfs_btree_get_numrecs(block))
+                       return;
+               len = rec->alloc.ar_blockcount;
+               break;
+       case LASTREC_INSREC:
+               if (be32_to_cpu(rec->alloc.ar_blockcount) <=
+                   be32_to_cpu(agf->agf_longest))
+                       return;
+               len = rec->alloc.ar_blockcount;
+               break;
+       case LASTREC_DELREC:
+               numrecs = xfs_btree_get_numrecs(block);
+               if (ptr <= numrecs)
+                       return;
+               ASSERT(ptr == numrecs + 1);
+
+               if (numrecs) {
+                       xfs_alloc_rec_t *rrp;
+
+                       rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
+                       len = rrp->ar_blockcount;
+               } else {
+                       len = 0;
+               }
+
+               break;
+       default:
+               ASSERT(0);
+               return;
+       }
+
+       agf->agf_longest = len;
+       pag = xfs_perag_get(cur->bc_mp, seqno);
+       pag->pagf_longest = be32_to_cpu(len);
+       xfs_perag_put(pag);
+       xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
+}
+
+STATIC int
+xfs_allocbt_get_minrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       return cur->bc_mp->m_alloc_mnr[level != 0];
+}
+
+STATIC int
+xfs_allocbt_get_maxrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       return cur->bc_mp->m_alloc_mxr[level != 0];
+}
+
+STATIC void
+xfs_allocbt_init_key_from_rec(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
+{
+       ASSERT(rec->alloc.ar_startblock != 0);
+
+       key->alloc.ar_startblock = rec->alloc.ar_startblock;
+       key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
+}
+
+STATIC void
+xfs_allocbt_init_rec_from_key(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
+{
+       ASSERT(key->alloc.ar_startblock != 0);
+
+       rec->alloc.ar_startblock = key->alloc.ar_startblock;
+       rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
+}
+
+STATIC void
+xfs_allocbt_init_rec_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec)
+{
+       ASSERT(cur->bc_rec.a.ar_startblock != 0);
+
+       rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
+       rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
+}
+
+STATIC void
+xfs_allocbt_init_ptr_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+
+       ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+       ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
+
+       ptr->s = agf->agf_roots[cur->bc_btnum];
+}
+
+STATIC __int64_t
+xfs_allocbt_key_diff(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key)
+{
+       xfs_alloc_rec_incore_t  *rec = &cur->bc_rec.a;
+       xfs_alloc_key_t         *kp = &key->alloc;
+       __int64_t               diff;
+
+       if (cur->bc_btnum == XFS_BTNUM_BNO) {
+               return (__int64_t)be32_to_cpu(kp->ar_startblock) -
+                               rec->ar_startblock;
+       }
+
+       diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
+       if (diff)
+               return diff;
+
+       return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+}
+
+static bool
+xfs_allocbt_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+       struct xfs_perag        *pag = bp->b_pag;
+       unsigned int            level;
+
+       /*
+        * magic number and level verification
+        *
+        * During growfs operations, we can't verify the exact level or owner as
+        * the perag is not fully initialised and hence not attached to the
+        * buffer.  In this case, check against the maximum tree depth.
+        *
+        * Similarly, during log recovery we will have a perag structure
+        * attached, but the agf information will not yet have been initialised
+        * from the on disk AGF. Again, we can only check against maximum limits
+        * in this case.
+        */
+       level = be16_to_cpu(block->bb_level);
+       switch (block->bb_magic) {
+       case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
+               if (!xfs_sb_version_hascrc(&mp->m_sb))
+                       return false;
+               if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+                       return false;
+               if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+                       return false;
+               if (pag &&
+                   be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+                       return false;
+               /* fall through */
+       case cpu_to_be32(XFS_ABTB_MAGIC):
+               if (pag && pag->pagf_init) {
+                       if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
+                               return false;
+               } else if (level >= mp->m_ag_maxlevels)
+                       return false;
+               break;
+       case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
+               if (!xfs_sb_version_hascrc(&mp->m_sb))
+                       return false;
+               if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+                       return false;
+               if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+                       return false;
+               if (pag &&
+                   be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+                       return false;
+               /* fall through */
+       case cpu_to_be32(XFS_ABTC_MAGIC):
+               if (pag && pag->pagf_init) {
+                       if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
+                               return false;
+               } else if (level >= mp->m_ag_maxlevels)
+                       return false;
+               break;
+       default:
+               return false;
+       }
+
+       /* numrecs verification */
+       if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
+               return false;
+
+       /* sibling pointer verification */
+       if (!block->bb_u.s.bb_leftsib ||
+           (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+            block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+               return false;
+       if (!block->bb_u.s.bb_rightsib ||
+           (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+            block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+               return false;
+
+       return true;
+}
+
+static void
+xfs_allocbt_read_verify(
+       struct xfs_buf  *bp)
+{
+       if (!xfs_btree_sblock_verify_crc(bp))
+               xfs_buf_ioerror(bp, -EFSBADCRC);
+       else if (!xfs_allocbt_verify(bp))
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+       if (bp->b_error) {
+               trace_xfs_btree_corrupt(bp, _RET_IP_);
+               xfs_verifier_error(bp);
+       }
+}
+
+static void
+xfs_allocbt_write_verify(
+       struct xfs_buf  *bp)
+{
+       if (!xfs_allocbt_verify(bp)) {
+               trace_xfs_btree_corrupt(bp, _RET_IP_);
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               return;
+       }
+       xfs_btree_sblock_calc_crc(bp);
+}
+
+const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+       .verify_read = xfs_allocbt_read_verify,
+       .verify_write = xfs_allocbt_write_verify,
+};
+
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_allocbt_keys_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *k1,
+       union xfs_btree_key     *k2)
+{
+       if (cur->bc_btnum == XFS_BTNUM_BNO) {
+               return be32_to_cpu(k1->alloc.ar_startblock) <
+                      be32_to_cpu(k2->alloc.ar_startblock);
+       } else {
+               return be32_to_cpu(k1->alloc.ar_blockcount) <
+                       be32_to_cpu(k2->alloc.ar_blockcount) ||
+                       (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
+                        be32_to_cpu(k1->alloc.ar_startblock) <
+                        be32_to_cpu(k2->alloc.ar_startblock));
+       }
+}
+
+STATIC int
+xfs_allocbt_recs_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *r1,
+       union xfs_btree_rec     *r2)
+{
+       if (cur->bc_btnum == XFS_BTNUM_BNO) {
+               return be32_to_cpu(r1->alloc.ar_startblock) +
+                       be32_to_cpu(r1->alloc.ar_blockcount) <=
+                       be32_to_cpu(r2->alloc.ar_startblock);
+       } else {
+               return be32_to_cpu(r1->alloc.ar_blockcount) <
+                       be32_to_cpu(r2->alloc.ar_blockcount) ||
+                       (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
+                        be32_to_cpu(r1->alloc.ar_startblock) <
+                        be32_to_cpu(r2->alloc.ar_startblock));
+       }
+}
+#endif /* DEBUG || XFS_WARN */
+
+static const struct xfs_btree_ops xfs_allocbt_ops = {
+       .rec_len                = sizeof(xfs_alloc_rec_t),
+       .key_len                = sizeof(xfs_alloc_key_t),
+
+       .dup_cursor             = xfs_allocbt_dup_cursor,
+       .set_root               = xfs_allocbt_set_root,
+       .alloc_block            = xfs_allocbt_alloc_block,
+       .free_block             = xfs_allocbt_free_block,
+       .update_lastrec         = xfs_allocbt_update_lastrec,
+       .get_minrecs            = xfs_allocbt_get_minrecs,
+       .get_maxrecs            = xfs_allocbt_get_maxrecs,
+       .init_key_from_rec      = xfs_allocbt_init_key_from_rec,
+       .init_rec_from_key      = xfs_allocbt_init_rec_from_key,
+       .init_rec_from_cur      = xfs_allocbt_init_rec_from_cur,
+       .init_ptr_from_cur      = xfs_allocbt_init_ptr_from_cur,
+       .key_diff               = xfs_allocbt_key_diff,
+       .buf_ops                = &xfs_allocbt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+       .keys_inorder           = xfs_allocbt_keys_inorder,
+       .recs_inorder           = xfs_allocbt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new allocation btree cursor.
+ */
+struct xfs_btree_cur *                 /* new alloc btree cursor */
+xfs_allocbt_init_cursor(
+       struct xfs_mount        *mp,            /* file system mount point */
+       struct xfs_trans        *tp,            /* transaction pointer */
+       struct xfs_buf          *agbp,          /* buffer for agf structure */
+       xfs_agnumber_t          agno,           /* allocation group number */
+       xfs_btnum_t             btnum)          /* btree identifier */
+{
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
+       struct xfs_btree_cur    *cur;
+
+       ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
+
+       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+       cur->bc_tp = tp;
+       cur->bc_mp = mp;
+       cur->bc_btnum = btnum;
+       cur->bc_blocklog = mp->m_sb.sb_blocklog;
+       cur->bc_ops = &xfs_allocbt_ops;
+
+       if (btnum == XFS_BTNUM_CNT) {
+               cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+               cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
+       } else {
+               cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+       }
+
+       cur->bc_private.a.agbp = agbp;
+       cur->bc_private.a.agno = agno;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb))
+               cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+       return cur;
+}
+
+/*
+ * Calculate number of records in an alloc btree block.
+ */
+int
+xfs_allocbt_maxrecs(
+       struct xfs_mount        *mp,
+       int                     blocklen,
+       int                     leaf)
+{
+       blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
+
+       if (leaf)
+               return blocklen / sizeof(xfs_alloc_rec_t);
+       return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
+}
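A standalone arithmetic check of the formula above, assuming a 4096-byte block with the 16-byte non-CRC short header (XFS_BTREE_SBLOCK_LEN); records and keys are two __be32 fields (8 bytes) and pointers a single __be32 (4 bytes):

#include <stdio.h>

int main(void)
{
	int blocklen = 4096 - 16;	/* payload after the block header */

	printf("leaf maxrecs = %d\n", blocklen / 8);		/* 510 records */
	printf("node maxrecs = %d\n", blocklen / (8 + 4));	/* 340 key/ptr pairs */
	return 0;
}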
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
new file mode 100644 (file)
index 0000000..45e189e
--- /dev/null
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ALLOC_BTREE_H__
+#define        __XFS_ALLOC_BTREE_H__
+
+/*
+ * Freespace on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+#define XFS_ALLOC_BLOCK_LEN(mp) \
+       (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+               XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_ALLOC_REC_ADDR(mp, block, index) \
+       ((xfs_alloc_rec_t *) \
+               ((char *)(block) + \
+                XFS_ALLOC_BLOCK_LEN(mp) + \
+                (((index) - 1) * sizeof(xfs_alloc_rec_t))))
+
+#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
+       ((xfs_alloc_key_t *) \
+               ((char *)(block) + \
+                XFS_ALLOC_BLOCK_LEN(mp) + \
+                ((index) - 1) * sizeof(xfs_alloc_key_t)))
+
+#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
+       ((xfs_alloc_ptr_t *) \
+               ((char *)(block) + \
+                XFS_ALLOC_BLOCK_LEN(mp) + \
+                (maxrecs) * sizeof(xfs_alloc_key_t) + \
+                ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
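Concretely, with the 16-byte non-CRC header these 1-based macros place record (or key) i at byte offset 16 + 8 * (i - 1), and in an interior block pointer i at 16 + 8 * maxrecs + 4 * (i - 1), since xfs_alloc_rec_t and xfs_alloc_key_t are 8 bytes and xfs_alloc_ptr_t is 4.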
+
+extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
+               struct xfs_trans *, struct xfs_buf *,
+               xfs_agnumber_t, xfs_btnum_t);
+extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
+
+#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
new file mode 100644 (file)
index 0000000..353fb42
--- /dev/null
@@ -0,0 +1,1459 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_dinode.h"
+
+/*
+ * xfs_attr.c
+ *
+ * Provide the external interfaces to manage attribute lists.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when attribute list fits inside the inode.
+ */
+STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
+
+/*
+ * Internal routines when attribute list is one block.
+ */
+STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
+
+/*
+ * Internal routines when attribute list is more than one block.
+ */
+STATIC int xfs_attr_node_get(xfs_da_args_t *args);
+STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
+STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
+STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
+
+
+STATIC int
+xfs_attr_args_init(
+       struct xfs_da_args      *args,
+       struct xfs_inode        *dp,
+       const unsigned char     *name,
+       int                     flags)
+{
+       if (!name)
+               return -EINVAL;
+
+       memset(args, 0, sizeof(*args));
+       args->geo = dp->i_mount->m_attr_geo;
+       args->whichfork = XFS_ATTR_FORK;
+       args->dp = dp;
+       args->flags = flags;
+       args->name = name;
+       args->namelen = strlen((const char *)name);
+       if (args->namelen >= MAXNAMELEN)
+               return -EFAULT;         /* match IRIX behaviour */
+
+       args->hashval = xfs_da_hashname(args->name, args->namelen);
+       return 0;
+}
+
+int
+xfs_inode_hasattr(
+       struct xfs_inode        *ip)
+{
+       if (!XFS_IFORK_Q(ip) ||
+           (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+            ip->i_d.di_anextents == 0))
+               return 0;
+       return 1;
+}
+
+/*========================================================================
+ * Overall external interface routines.
+ *========================================================================*/
+
+int
+xfs_attr_get(
+       struct xfs_inode        *ip,
+       const unsigned char     *name,
+       unsigned char           *value,
+       int                     *valuelenp,
+       int                     flags)
+{
+       struct xfs_da_args      args;
+       uint                    lock_mode;
+       int                     error;
+
+       XFS_STATS_INC(xs_attr_get);
+
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+               return -EIO;
+
+       if (!xfs_inode_hasattr(ip))
+               return -ENOATTR;
+
+       error = xfs_attr_args_init(&args, ip, name, flags);
+       if (error)
+               return error;
+
+       args.value = value;
+       args.valuelen = *valuelenp;
+
+       lock_mode = xfs_ilock_attr_map_shared(ip);
+       if (!xfs_inode_hasattr(ip))
+               error = -ENOATTR;
+       else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+               error = xfs_attr_shortform_getvalue(&args);
+       else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
+               error = xfs_attr_leaf_get(&args);
+       else
+               error = xfs_attr_node_get(&args);
+       xfs_iunlock(ip, lock_mode);
+
+       *valuelenp = args.valuelen;
+       return error == -EEXIST ? 0 : error;
+}
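A hedged caller sketch for the lookup path above; the attribute name and buffer size are illustrative, and flags of 0 select the user attribute namespace. Note that the internal -EEXIST "found it" result has already been mapped to 0 for the caller:

	unsigned char	value[256];
	int		valuelen = sizeof(value);
	int		error;

	error = xfs_attr_get(ip, (const unsigned char *)"comment",
			     value, &valuelen, 0);
	if (!error) {
		/* valuelen now holds the attribute's actual length */
	} else if (error == -ENOATTR) {
		/* no such attribute on this inode */
	}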
+
+/*
+ * Calculate how many blocks we need for the new attribute.
+ */
+STATIC int
+xfs_attr_calc_size(
+       struct xfs_da_args      *args,
+       int                     *local)
+{
+       struct xfs_mount        *mp = args->dp->i_mount;
+       int                     size;
+       int                     nblks;
+
+       /*
+        * Determine the space the new attribute will use, and whether it will be
+        * "local" or "remote" (note: local != inline).
+        */
+       size = xfs_attr_leaf_newentsize(args, local);
+       nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+       if (*local) {
+               if (size > (args->geo->blksize / 2)) {
+                       /* Double split possible */
+                       nblks *= 2;
+               }
+       } else {
+               /*
+                * Out of line attribute, cannot double split, but
+                * make room for the attribute value itself.
+                */
+               uint    dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen);
+               nblks += dblocks;
+               nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
+       }
+
+       return nblks;
+}
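For example (assuming the attr geometry matches a 4 KB filesystem block): a local attribute whose entry exceeds blksize / 2 = 2048 bytes doubles the XFS_DAENTER_SPACE_RES reservation against a possible double split, while a remote value instead adds xfs_attr3_rmt_blocks() worth of data blocks plus bmap-btree room for mapping those new extents.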
+
+int
+xfs_attr_set(
+       struct xfs_inode        *dp,
+       const unsigned char     *name,
+       unsigned char           *value,
+       int                     valuelen,
+       int                     flags)
+{
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_da_args      args;
+       struct xfs_bmap_free    flist;
+       struct xfs_trans_res    tres;
+       xfs_fsblock_t           firstblock;
+       int                     rsvd = (flags & ATTR_ROOT) != 0;
+       int                     error, err2, committed, local;
+
+       XFS_STATS_INC(xs_attr_set);
+
+       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+               return -EIO;
+
+       error = xfs_attr_args_init(&args, dp, name, flags);
+       if (error)
+               return error;
+
+       args.value = value;
+       args.valuelen = valuelen;
+       args.firstblock = &firstblock;
+       args.flist = &flist;
+       args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+       args.total = xfs_attr_calc_size(&args, &local);
+
+       error = xfs_qm_dqattach(dp, 0);
+       if (error)
+               return error;
+
+       /*
+        * If the inode doesn't have an attribute fork, add one.
+        * (inode must not be locked when we call this routine)
+        */
+       if (XFS_IFORK_Q(dp) == 0) {
+               int sf_size = sizeof(xfs_attr_sf_hdr_t) +
+                       XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen);
+
+               error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+               if (error)
+                       return error;
+       }
+
+       /*
+        * Start our first transaction of the day.
+        *
+        * All future transactions during this code must be "chained" off
+        * this one via the trans_dup() call.  All transactions will contain
+        * the inode, and the inode will always be marked with trans_ihold().
+        * Since the inode will be locked in all transactions, we must log
+        * the inode in every transaction to let it float upward through
+        * the log.
+        */
+       args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
+
+       /*
+        * Root fork attributes can use reserved data blocks for this
+        * operation if necessary
+        */
+
+       if (rsvd)
+               args.trans->t_flags |= XFS_TRANS_RESERVE;
+
+       tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+                        M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
+       tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+       error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
+       if (error) {
+               xfs_trans_cancel(args.trans, 0);
+               return error;
+       }
+       xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+       error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
+                               rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+                                      XFS_QMOPT_RES_REGBLKS);
+       if (error) {
+               xfs_iunlock(dp, XFS_ILOCK_EXCL);
+               xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+               return error;
+       }
+
+       xfs_trans_ijoin(args.trans, dp, 0);
+
+       /*
+        * If the attribute list is non-existent or a shortform list,
+        * upgrade it to a single-leaf-block attribute list.
+        */
+       if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
+           (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+            dp->i_d.di_anextents == 0)) {
+
+               /*
+                * Build initial attribute list (if required).
+                */
+               if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
+                       xfs_attr_shortform_create(&args);
+
+               /*
+                * Try to add the attr to the attribute list in
+                * the inode.
+                */
+               error = xfs_attr_shortform_addname(&args);
+               if (error != -ENOSPC) {
+                       /*
+                        * Commit the shortform mods, and we're done.
+                        * NOTE: this is also the error path (EEXIST, etc).
+                        */
+                       ASSERT(args.trans != NULL);
+
+                       /*
+                        * If this is a synchronous mount, make sure that
+                        * the transaction goes to disk before returning
+                        * to the user.
+                        */
+                       if (mp->m_flags & XFS_MOUNT_WSYNC)
+                               xfs_trans_set_sync(args.trans);
+
+                       if (!error && (flags & ATTR_KERNOTIME) == 0) {
+                               xfs_trans_ichgtime(args.trans, dp,
+                                                       XFS_ICHGTIME_CHG);
+                       }
+                       err2 = xfs_trans_commit(args.trans,
+                                                XFS_TRANS_RELEASE_LOG_RES);
+                       xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+                       return error ? error : err2;
+               }
+
+               /*
+                * It won't fit in the shortform, transform to a leaf block.
+                * GROT: another possible req'mt for a double-split btree op.
+                */
+               xfs_bmap_init(args.flist, args.firstblock);
+               error = xfs_attr_shortform_to_leaf(&args);
+               if (!error) {
+                       error = xfs_bmap_finish(&args.trans, args.flist,
+                                               &committed);
+               }
+               if (error) {
+                       ASSERT(committed);
+                       args.trans = NULL;
+                       xfs_bmap_cancel(&flist);
+                       goto out;
+               }
+
+               /*
+                * bmap_finish() may have committed the last trans and started
+                * a new one.  We need the inode to be in all transactions.
+                */
+               if (committed)
+                       xfs_trans_ijoin(args.trans, dp, 0);
+
+               /*
+                * Commit the leaf transformation.  We'll need another (linked)
+                * transaction to add the new attribute to the leaf.
+                */
+
+               error = xfs_trans_roll(&args.trans, dp);
+               if (error)
+                       goto out;
+
+       }
+
+       if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
+               error = xfs_attr_leaf_addname(&args);
+       else
+               error = xfs_attr_node_addname(&args);
+       if (error)
+               goto out;
+
+       /*
+        * If this is a synchronous mount, make sure that the
+        * transaction goes to disk before returning to the user.
+        */
+       if (mp->m_flags & XFS_MOUNT_WSYNC)
+               xfs_trans_set_sync(args.trans);
+
+       if ((flags & ATTR_KERNOTIME) == 0)
+               xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+
+       /*
+        * Commit the last in the sequence of transactions.
+        */
+       xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
+       error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+       xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+       return error;
+
+out:
+       if (args.trans) {
+               xfs_trans_cancel(args.trans,
+                       XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+       }
+       xfs_iunlock(dp, XFS_ILOCK_EXCL);
+       return error;
+}
+
+/*
+ * Generic handler routine to remove a name from an attribute list.
+ * Transitions attribute list from Btree to shortform as necessary.
+ */
+int
+xfs_attr_remove(
+       struct xfs_inode        *dp,
+       const unsigned char     *name,
+       int                     flags)
+{
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_da_args      args;
+       struct xfs_bmap_free    flist;
+       xfs_fsblock_t           firstblock;
+       int                     error;
+
+       XFS_STATS_INC(xs_attr_remove);
+
+       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+               return -EIO;
+
+       if (!xfs_inode_hasattr(dp))
+               return -ENOATTR;
+
+       error = xfs_attr_args_init(&args, dp, name, flags);
+       if (error)
+               return error;
+
+       args.firstblock = &firstblock;
+       args.flist = &flist;
+
+       /*
+        * we have no control over the attribute names that userspace passes us
+        * to remove, so we have to allow the name lookup prior to attribute
+        * removal to fail.
+        */
+       args.op_flags = XFS_DA_OP_OKNOENT;
+
+       error = xfs_qm_dqattach(dp, 0);
+       if (error)
+               return error;
+
+       /*
+        * Start our first transaction of the day.
+        *
+        * All future transactions during this code must be "chained" off
+        * this one via the trans_dup() call.  All transactions will contain
+        * the inode, and the inode will always be marked with trans_ihold().
+        * Since the inode will be locked in all transactions, we must log
+        * the inode in every transaction to let it float upward through
+        * the log.
+        */
+       args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
+
+       /*
+        * Root fork attributes can use reserved data blocks for this
+        * operation if necessary
+        */
+
+       if (flags & ATTR_ROOT)
+               args.trans->t_flags |= XFS_TRANS_RESERVE;
+
+       error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
+                                 XFS_ATTRRM_SPACE_RES(mp), 0);
+       if (error) {
+               xfs_trans_cancel(args.trans, 0);
+               return error;
+       }
+
+       xfs_ilock(dp, XFS_ILOCK_EXCL);
+       /*
+        * No need to make quota reservations here. We expect to release
+        * blocks, not allocate them, in the common case.
+        */
+       xfs_trans_ijoin(args.trans, dp, 0);
+
+       if (!xfs_inode_hasattr(dp)) {
+               error = -ENOATTR;
+       } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+               ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+               error = xfs_attr_shortform_remove(&args);
+       } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+               error = xfs_attr_leaf_removename(&args);
+       } else {
+               error = xfs_attr_node_removename(&args);
+       }
+
+       if (error)
+               goto out;
+
+       /*
+        * If this is a synchronous mount, make sure that the
+        * transaction goes to disk before returning to the user.
+        */
+       if (mp->m_flags & XFS_MOUNT_WSYNC)
+               xfs_trans_set_sync(args.trans);
+
+       if ((flags & ATTR_KERNOTIME) == 0)
+               xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+
+       /*
+        * Commit the last in the sequence of transactions.
+        */
+       xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
+       error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+       xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+       return error;
+
+out:
+       if (args.trans) {
+               xfs_trans_cancel(args.trans,
+                       XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+       }
+       xfs_iunlock(dp, XFS_ILOCK_EXCL);
+       return error;
+}
+
+/*========================================================================
+ * External routines when attribute list is inside the inode
+ *========================================================================*/
+
+/*
+ * Add a name to the shortform attribute list structure
+ * This is the external routine.
+ */
+STATIC int
+xfs_attr_shortform_addname(xfs_da_args_t *args)
+{
+       int newsize, forkoff, retval;
+
+       trace_xfs_attr_sf_addname(args);
+
+       retval = xfs_attr_shortform_lookup(args);
+       if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
+               return retval;
+       } else if (retval == -EEXIST) {
+               if (args->flags & ATTR_CREATE)
+                       return retval;
+               retval = xfs_attr_shortform_remove(args);
+               ASSERT(retval == 0);
+       }
+
+       if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
+           args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+               return -ENOSPC;
+
+       newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
+       newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+
+       forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
+       if (!forkoff)
+               return -ENOSPC;
+
+       xfs_attr_shortform_add(args, forkoff);
+       return 0;
+}
+
+
+/*========================================================================
+ * External routines when attribute list is one block
+ *========================================================================*/
+
+/*
+ * Add a name to the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value; we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_addname(xfs_da_args_t *args)
+{
+       xfs_inode_t *dp;
+       struct xfs_buf *bp;
+       int retval, error, committed, forkoff;
+
+       trace_xfs_attr_leaf_addname(args);
+
+       /*
+        * Read the (only) block in the attribute list in.
+        */
+       dp = args->dp;
+       args->blkno = 0;
+       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+       if (error)
+               return error;
+
+       /*
+        * Look up the given attribute in the leaf block.  Figure out if
+        * the given flags produce an error or call for an atomic rename.
+        */
+       retval = xfs_attr3_leaf_lookup_int(bp, args);
+       if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
+               xfs_trans_brelse(args->trans, bp);
+               return retval;
+       } else if (retval == -EEXIST) {
+               if (args->flags & ATTR_CREATE) {        /* pure create op */
+                       xfs_trans_brelse(args->trans, bp);
+                       return retval;
+               }
+
+               trace_xfs_attr_leaf_replace(args);
+
+               /* save the attribute state for later removal */
+               args->op_flags |= XFS_DA_OP_RENAME;     /* an atomic rename */
+               args->blkno2 = args->blkno;             /* set 2nd entry info */
+               args->index2 = args->index;
+               args->rmtblkno2 = args->rmtblkno;
+               args->rmtblkcnt2 = args->rmtblkcnt;
+               args->rmtvaluelen2 = args->rmtvaluelen;
+
+               /*
+                * clear the remote attr state now that it is saved so that the
+                * values reflect the state of the attribute we are about to
+                * add, not the attribute we just found and will remove later.
+                */
+               args->rmtblkno = 0;
+               args->rmtblkcnt = 0;
+               args->rmtvaluelen = 0;
+       }
+
+       /*
+        * Add the attribute to the leaf block, transitioning to a Btree
+        * if required.
+        */
+       retval = xfs_attr3_leaf_add(bp, args);
+       if (retval == -ENOSPC) {
+               /*
+                * Promote the attribute list to the Btree format, then
+                * commit that transaction so that the node_addname() call
+                * can manage its own transactions.
+                */
+               xfs_bmap_init(args->flist, args->firstblock);
+               error = xfs_attr3_leaf_to_node(args);
+               if (!error) {
+                       error = xfs_bmap_finish(&args->trans, args->flist,
+                                               &committed);
+               }
+               if (error) {
+                       ASSERT(committed);
+                       args->trans = NULL;
+                       xfs_bmap_cancel(args->flist);
+                       return error;
+               }
+
+               /*
+                * bmap_finish() may have committed the last trans and started
+                * a new one.  We need the inode to be in all transactions.
+                */
+               if (committed)
+                       xfs_trans_ijoin(args->trans, dp, 0);
+
+               /*
+                * Commit the current trans (including the inode) and start
+                * a new one.
+                */
+               error = xfs_trans_roll(&args->trans, dp);
+               if (error)
+                       return error;
+
+               /*
+                * Fob the whole rest of the problem off on the Btree code.
+                */
+               error = xfs_attr_node_addname(args);
+               return error;
+       }
+
+       /*
+        * Commit the transaction that added the attr name so that
+        * later routines can manage their own transactions.
+        */
+       error = xfs_trans_roll(&args->trans, dp);
+       if (error)
+               return error;
+
+       /*
+        * If there was an out-of-line value, allocate the blocks we
+        * identified for its storage and copy the value.  This is done
+        * after we create the attribute so that we don't overflow the
+        * maximum size of a transaction and/or hit a deadlock.
+        */
+       if (args->rmtblkno > 0) {
+               error = xfs_attr_rmtval_set(args);
+               if (error)
+                       return error;
+       }
+
+       /*
+        * If this is an atomic rename operation, we must "flip" the
+        * incomplete flags on the "new" and "old" attribute/value pairs
+        * so that one disappears and one appears atomically.  Then we
+        * must remove the "old" attribute/value pair.
+        */
+       if (args->op_flags & XFS_DA_OP_RENAME) {
+               /*
+                * In a separate transaction, set the incomplete flag on the
+                * "old" attr and clear the incomplete flag on the "new" attr.
+                */
+               error = xfs_attr3_leaf_flipflags(args);
+               if (error)
+                       return error;
+
+               /*
+                * Dismantle the "old" attribute/value pair by removing
+                * a "remote" value (if it exists).
+                */
+               args->index = args->index2;
+               args->blkno = args->blkno2;
+               args->rmtblkno = args->rmtblkno2;
+               args->rmtblkcnt = args->rmtblkcnt2;
+               args->rmtvaluelen = args->rmtvaluelen2;
+               if (args->rmtblkno) {
+                       error = xfs_attr_rmtval_remove(args);
+                       if (error)
+                               return error;
+               }
+
+               /*
+                * Read in the block containing the "old" attr, then
+                * remove the "old" attr from that block (neat, huh!)
+                */
+               error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+                                          -1, &bp);
+               if (error)
+                       return error;
+
+               xfs_attr3_leaf_remove(bp, args);
+
+               /*
+                * If the result is small enough, shrink it all into the inode.
+                */
+               if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
+                       xfs_bmap_init(args->flist, args->firstblock);
+                       error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+                       /* bp is gone due to xfs_da_shrink_inode */
+                       if (!error) {
+                               error = xfs_bmap_finish(&args->trans,
+                                                       args->flist,
+                                                       &committed);
+                       }
+                       if (error) {
+                               ASSERT(committed);
+                               args->trans = NULL;
+                               xfs_bmap_cancel(args->flist);
+                               return error;
+                       }
+
+                       /*
+                        * bmap_finish() may have committed the last trans
+                        * and started a new one.  We need the inode to be
+                        * in all transactions.
+                        */
+                       if (committed)
+                               xfs_trans_ijoin(args->trans, dp, 0);
+               }
+
+               /*
+                * Commit the remove and start the next trans in series.
+                */
+               error = xfs_trans_roll(&args->trans, dp);
+
+       } else if (args->rmtblkno > 0) {
+               /*
+                * Added a "remote" value, just clear the incomplete flag.
+                */
+               error = xfs_attr3_leaf_clearflag(args);
+       }
+       return error;
+}
+
+/*
+ * Remove a name from the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value; we only call this routine
+ * if bmap_one_block() says there is only one block (i.e. no remote blocks).
+ */
+STATIC int
+xfs_attr_leaf_removename(xfs_da_args_t *args)
+{
+       xfs_inode_t *dp;
+       struct xfs_buf *bp;
+       int error, committed, forkoff;
+
+       trace_xfs_attr_leaf_removename(args);
+
+       /*
+        * Remove the attribute.
+        */
+       dp = args->dp;
+       args->blkno = 0;
+       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+       if (error)
+               return error;
+
+       error = xfs_attr3_leaf_lookup_int(bp, args);
+       if (error == -ENOATTR) {
+               xfs_trans_brelse(args->trans, bp);
+               return error;
+       }
+
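+       /* The lookup found the name; remove it from the leaf block. */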
+       xfs_attr3_leaf_remove(bp, args);
+
+       /*
+        * If the result is small enough, shrink it all into the inode.
+        */
+       if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
+               xfs_bmap_init(args->flist, args->firstblock);
+               error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+               /* bp is gone due to xfs_da_shrink_inode */
+               if (!error) {
+                       error = xfs_bmap_finish(&args->trans, args->flist,
+                                               &committed);
+               }
+               if (error) {
+                       ASSERT(committed);
+                       args->trans = NULL;
+                       xfs_bmap_cancel(args->flist);
+                       return error;
+               }
+
+               /*
+                * bmap_finish() may have committed the last trans and started
+                * a new one.  We need the inode to be in all transactions.
+                */
+               if (committed)
+                       xfs_trans_ijoin(args->trans, dp, 0);
+       }
+       return 0;
+}
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ *
+ * This leaf block cannot have a "remote" value; we only call this routine
+ * if bmap_one_block() says there is only one block (i.e. no remote blocks).
+ */
+STATIC int
+xfs_attr_leaf_get(xfs_da_args_t *args)
+{
+       struct xfs_buf *bp;
+       int error;
+
+       trace_xfs_attr_leaf_get(args);
+
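+       /* The attribute list is a single leaf at block 0 of the attr fork. */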
+       args->blkno = 0;
+       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+       if (error)
+               return error;
+
+       error = xfs_attr3_leaf_lookup_int(bp, args);
+       if (error != -EEXIST) {
+               xfs_trans_brelse(args->trans, bp);
+               return error;
+       }
+       error = xfs_attr3_leaf_getvalue(bp, args);
+       xfs_trans_brelse(args->trans, bp);
+       if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
+               error = xfs_attr_rmtval_get(args);
+       }
+       return error;
+}
+
+/*========================================================================
+ * External routines when attribute list size > geo->blksize
+ *========================================================================*/
+
+/*
+ * Add a name to a Btree-format attribute list.
+ *
+ * This will involve walking down the Btree, and may involve splitting
+ * leaf nodes and even splitting intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ *
+ * "Remote" attribute values confuse the issue and atomic rename operations
+ * add a whole extra layer of confusion on top of that.
+ */
+STATIC int
+xfs_attr_node_addname(xfs_da_args_t *args)
+{
+       xfs_da_state_t *state;
+       xfs_da_state_blk_t *blk;
+       xfs_inode_t *dp;
+       xfs_mount_t *mp;
+       int committed, retval, error;
+
+       trace_xfs_attr_node_addname(args);
+
+       /*
+        * Fill in bucket of arguments/results/context to carry around.
+        */
+       dp = args->dp;
+       mp = dp->i_mount;
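+
+       /*
+        * Restart point: after converting a single leaf block to node
+        * format below, we come back here and redo the lookup from the
+        * (new) root of the tree.
+        */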
+restart:
+       state = xfs_da_state_alloc();
+       state->args = args;
+       state->mp = mp;
+
+       /*
+        * Search to see if name already exists, and get back a pointer
+        * to where it should go.
+        */
+       error = xfs_da3_node_lookup_int(state, &retval);
+       if (error)
+               goto out;
+       blk = &state->path.blk[state->path.active - 1];
+       ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+       if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
+               goto out;
+       } else if (retval == -EEXIST) {
+               if (args->flags & ATTR_CREATE)
+                       goto out;
+
+               trace_xfs_attr_node_replace(args);
+
+               /* save the attribute state for later removal */
+               args->op_flags |= XFS_DA_OP_RENAME;     /* atomic rename op */
+               args->blkno2 = args->blkno;             /* set 2nd entry info */
+               args->index2 = args->index;
+               args->rmtblkno2 = args->rmtblkno;
+               args->rmtblkcnt2 = args->rmtblkcnt;
+               args->rmtvaluelen2 = args->rmtvaluelen;
+
+               /*
+                * clear the remote attr state now that it is saved so that the
+                * values reflect the state of the attribute we are about to
+                * add, not the attribute we just found and will remove later.
+                */
+               args->rmtblkno = 0;
+               args->rmtblkcnt = 0;
+               args->rmtvaluelen = 0;
+       }
+
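+       /* Try to add the entry to the leaf block the lookup landed in. */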
+       retval = xfs_attr3_leaf_add(blk->bp, state->args);
+       if (retval == -ENOSPC) {
+               if (state->path.active == 1) {
+                       /*
+                        * It's really a single leaf node, but it had
+                        * out-of-line values so it looked like it *might*
+                        * have been a b-tree.
+                        */
+                       xfs_da_state_free(state);
+                       state = NULL;
+                       xfs_bmap_init(args->flist, args->firstblock);
+                       error = xfs_attr3_leaf_to_node(args);
+                       if (!error) {
+                               error = xfs_bmap_finish(&args->trans,
+                                                       args->flist,
+                                                       &committed);
+                       }
+                       if (error) {
+                               ASSERT(committed);
+                               args->trans = NULL;
+                               xfs_bmap_cancel(args->flist);
+                               goto out;
+                       }
+
+                       /*
+                        * bmap_finish() may have committed the last trans
+                        * and started a new one.  We need the inode to be
+                        * in all transactions.
+                        */
+                       if (committed)
+                               xfs_trans_ijoin(args->trans, dp, 0);
+
+                       /*
+                        * Commit the node conversion and start the next
+                        * trans in the chain.
+                        */
+                       error = xfs_trans_roll(&args->trans, dp);
+                       if (error)
+                               goto out;
+
+                       goto restart;
+               }
+
+               /*
+                * Split as many Btree elements as required.
+                * This code tracks the new and old attr's location
+                * in the index/blkno/rmtblkno/rmtblkcnt fields and
+                * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
+                */
+               xfs_bmap_init(args->flist, args->firstblock);
+               error = xfs_da3_split(state);
+               if (!error) {
+                       error = xfs_bmap_finish(&args->trans, args->flist,
+                                               &committed);
+               }
+               if (error) {
+                       ASSERT(committed);
+                       args->trans = NULL;
+                       xfs_bmap_cancel(args->flist);
+                       goto out;
+               }
+
+               /*
+                * bmap_finish() may have committed the last trans and started
+                * a new one.  We need the inode to be in all transactions.
+                */
+               if (committed)
+                       xfs_trans_ijoin(args->trans, dp, 0);
+       } else {
+               /*
+                * Addition succeeded, update Btree hashvals.
+                */
+               xfs_da3_fixhashpath(state, &state->path);
+       }
+
+       /*
+        * Kill the state structure, we're done with it and need to
+        * allow the buffers to come back later.
+        */
+       xfs_da_state_free(state);
+       state = NULL;
+
+       /*
+        * Commit the leaf addition or btree split and start the next
+        * trans in the chain.
+        */
+       error = xfs_trans_roll(&args->trans, dp);
+       if (error)
+               goto out;
+
+       /*
+        * If there was an out-of-line value, allocate the blocks we
+        * identified for its storage and copy the value.  This is done
+        * after we create the attribute so that we don't overflow the
+        * maximum size of a transaction and/or hit a deadlock.
+        */
+       if (args->rmtblkno > 0) {
+               error = xfs_attr_rmtval_set(args);
+               if (error)
+                       return error;
+       }
+
+       /*
+        * If this is an atomic rename operation, we must "flip" the
+        * incomplete flags on the "new" and "old" attribute/value pairs
+        * so that one disappears and one appears atomically.  Then we
+        * must remove the "old" attribute/value pair.
+        */
+       if (args->op_flags & XFS_DA_OP_RENAME) {
+               /*
+                * In a separate transaction, set the incomplete flag on the
+                * "old" attr and clear the incomplete flag on the "new" attr.
+                */
+               error = xfs_attr3_leaf_flipflags(args);
+               if (error)
+                       goto out;
+
+               /*
+                * Dismantle the "old" attribute/value pair by removing
+                * a "remote" value (if it exists).
+                */
+               args->index = args->index2;
+               args->blkno = args->blkno2;
+               args->rmtblkno = args->rmtblkno2;
+               args->rmtblkcnt = args->rmtblkcnt2;
+               args->rmtvaluelen = args->rmtvaluelen2;
+               if (args->rmtblkno) {
+                       error = xfs_attr_rmtval_remove(args);
+                       if (error)
+                               return error;
+               }
+
+               /*
+                * Re-find the "old" attribute entry after any split ops.
+                * The INCOMPLETE flag means that we will find the "old"
+                * attr, not the "new" one.
+                */
+               args->flags |= XFS_ATTR_INCOMPLETE;
+               state = xfs_da_state_alloc();
+               state->args = args;
+               state->mp = mp;
+               state->inleaf = 0;
+               error = xfs_da3_node_lookup_int(state, &retval);
+               if (error)
+                       goto out;
+
+               /*
+                * Remove the name and update the hashvals in the tree.
+                */
+               blk = &state->path.blk[state->path.active - 1];
+               ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+               error = xfs_attr3_leaf_remove(blk->bp, args);
+               xfs_da3_fixhashpath(state, &state->path);
+
+               /*
+                * Check to see if the tree needs to be collapsed.
+                */
+               if (retval && (state->path.active > 1)) {
+                       xfs_bmap_init(args->flist, args->firstblock);
+                       error = xfs_da3_join(state);
+                       if (!error) {
+                               error = xfs_bmap_finish(&args->trans,
+                                                       args->flist,
+                                                       &committed);
+                       }
+                       if (error) {
+                               ASSERT(committed);
+                               args->trans = NULL;
+                               xfs_bmap_cancel(args->flist);
+                               goto out;
+                       }
+
+                       /*
+                        * bmap_finish() may have committed the last trans
+                        * and started a new one.  We need the inode to be
+                        * in all transactions.
+                        */
+                       if (committed)
+                               xfs_trans_ijoin(args->trans, dp, 0);
+               }
+
+               /*
+                * Commit and start the next trans in the chain.
+                */
+               error = xfs_trans_roll(&args->trans, dp);
+               if (error)
+                       goto out;
+
+       } else if (args->rmtblkno > 0) {
+               /*
+                * Added a "remote" value, just clear the incomplete flag.
+                */
+               error = xfs_attr3_leaf_clearflag(args);
+               if (error)
+                       goto out;
+       }
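+       /* All paths that fall through to here have succeeded. */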
+       retval = error = 0;
+
+out:
+       if (state)
+               xfs_da_state_free(state);
+       if (error)
+               return error;
+       return retval;
+}
+
+/*
+ * Remove a name from a Btree-format attribute list.
+ *
+ * This will involve walking down the Btree, and may involve joining
+ * leaf nodes and even joining intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ */
+STATIC int
+xfs_attr_node_removename(xfs_da_args_t *args)
+{
+       xfs_da_state_t *state;
+       xfs_da_state_blk_t *blk;
+       xfs_inode_t *dp;
+       struct xfs_buf *bp;
+       int retval, error, committed, forkoff;
+
+       trace_xfs_attr_node_removename(args);
+
+       /*
+        * Tie a string around our finger to remind us where we are.
+        */
+       dp = args->dp;
+       state = xfs_da_state_alloc();
+       state->args = args;
+       state->mp = dp->i_mount;
+
+       /*
+        * Search to see if name exists, and get back a pointer to it.
+        */
+       error = xfs_da3_node_lookup_int(state, &retval);
+       if (error || (retval != -EEXIST)) {
+               if (error == 0)
+                       error = retval;
+               goto out;
+       }
+
+       /*
+        * If there is an out-of-line value, de-allocate the blocks.
+        * This is done before we remove the attribute so that we don't
+        * overflow the maximum size of a transaction and/or hit a deadlock.
+        */
+       blk = &state->path.blk[state->path.active - 1];
+       ASSERT(blk->bp != NULL);
+       ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+       if (args->rmtblkno > 0) {
+               /*
+                * Fill in disk block numbers in the state structure
+                * so that we can get the buffers back after we commit
+                * several transactions in the following calls.
+                */
+               error = xfs_attr_fillstate(state);
+               if (error)
+                       goto out;
+
+               /*
+                * Mark the attribute as INCOMPLETE, then bunmapi() the
+                * remote value.
+                */
+               error = xfs_attr3_leaf_setflag(args);
+               if (error)
+                       goto out;
+               error = xfs_attr_rmtval_remove(args);
+               if (error)
+                       goto out;
+
+               /*
+                * Refill the state structure with buffers, the prior calls
+                * released our buffers.
+                */
+               error = xfs_attr_refillstate(state);
+               if (error)
+                       goto out;
+       }
+
+       /*
+        * Remove the name and update the hashvals in the tree.
+        */
+       blk = &state->path.blk[state->path.active - 1];
+       ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+       retval = xfs_attr3_leaf_remove(blk->bp, args);
+       xfs_da3_fixhashpath(state, &state->path);
+
+       /*
+        * Check to see if the tree needs to be collapsed.
+        */
+       if (retval && (state->path.active > 1)) {
+               xfs_bmap_init(args->flist, args->firstblock);
+               error = xfs_da3_join(state);
+               if (!error) {
+                       error = xfs_bmap_finish(&args->trans, args->flist,
+                                               &committed);
+               }
+               if (error) {
+                       ASSERT(committed);
+                       args->trans = NULL;
+                       xfs_bmap_cancel(args->flist);
+                       goto out;
+               }
+
+               /*
+                * bmap_finish() may have committed the last trans and started
+                * a new one.  We need the inode to be in all transactions.
+                */
+               if (committed)
+                       xfs_trans_ijoin(args->trans, dp, 0);
+
+               /*
+                * Commit the Btree join operation and start a new trans.
+                */
+               error = xfs_trans_roll(&args->trans, dp);
+               if (error)
+                       goto out;
+       }
+
+       /*
+        * If the result is small enough, push it all into the inode.
+        */
+       if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+               /*
+                * Have to get rid of the copy of this dabuf in the state.
+                */
+               ASSERT(state->path.active == 1);
+               ASSERT(state->path.blk[0].bp);
+               state->path.blk[0].bp = NULL;
+
+               error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp);
+               if (error)
+                       goto out;
+
+               if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
+                       xfs_bmap_init(args->flist, args->firstblock);
+                       error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+                       /* bp is gone due to xfs_da_shrink_inode */
+                       if (!error) {
+                               error = xfs_bmap_finish(&args->trans,
+                                                       args->flist,
+                                                       &committed);
+                       }
+                       if (error) {
+                               ASSERT(committed);
+                               args->trans = NULL;
+                               xfs_bmap_cancel(args->flist);
+                               goto out;
+                       }
+
+                       /*
+                        * bmap_finish() may have committed the last trans
+                        * and started a new one.  We need the inode to be
+                        * in all transactions.
+                        */
+                       if (committed)
+                               xfs_trans_ijoin(args->trans, dp, 0);
+               } else {
+                       xfs_trans_brelse(args->trans, bp);
+               }
+       }
+       error = 0;
+
+out:
+       xfs_da_state_free(state);
+       return error;
+}
+
+/*
+ * Fill in the disk block numbers in the state structure for the buffers
+ * that are attached to the state structure.
+ * This is done so that we can quickly reattach ourselves to those buffers
+ * after some set of transaction commits have released these buffers.
+ */
+STATIC int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+       xfs_da_state_path_t *path;
+       xfs_da_state_blk_t *blk;
+       int level;
+
+       trace_xfs_attr_fillstate(state->args);
+
+       /*
+        * Roll down the "path" in the state structure, storing the on-disk
+        * block number for those buffers in the "path".
+        */
+       path = &state->path;
+       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+               if (blk->bp) {
+                       blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
+                       blk->bp = NULL;
+               } else {
+                       blk->disk_blkno = 0;
+               }
+       }
+
+       /*
+        * Roll down the "altpath" in the state structure, storing the on-disk
+        * block number for those buffers in the "altpath".
+        */
+       path = &state->altpath;
+       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+               if (blk->bp) {
+                       blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
+                       blk->bp = NULL;
+               } else {
+                       blk->disk_blkno = 0;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Reattach the buffers to the state structure based on the disk block
+ * numbers stored in the state structure.
+ * This is done after some set of transaction commits have released those
+ * buffers from our grip.
+ */
+STATIC int
+xfs_attr_refillstate(xfs_da_state_t *state)
+{
+       xfs_da_state_path_t *path;
+       xfs_da_state_blk_t *blk;
+       int level, error;
+
+       trace_xfs_attr_refillstate(state->args);
+
+       /*
+        * Roll down the "path" in the state structure, reattaching a buffer
+        * for each stored on-disk block number in the "path".
+        */
+       path = &state->path;
+       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+               if (blk->disk_blkno) {
+                       error = xfs_da3_node_read(state->args->trans,
+                                               state->args->dp,
+                                               blk->blkno, blk->disk_blkno,
+                                               &blk->bp, XFS_ATTR_FORK);
+                       if (error)
+                               return error;
+               } else {
+                       blk->bp = NULL;
+               }
+       }
+
+       /*
+        * Roll down the "altpath" in the state structure, reattaching a
+        * buffer for each stored on-disk block number in the "altpath".
+        */
+       path = &state->altpath;
+       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+               if (blk->disk_blkno) {
+                       error = xfs_da3_node_read(state->args->trans,
+                                               state->args->dp,
+                                               blk->blkno, blk->disk_blkno,
+                                               &blk->bp, XFS_ATTR_FORK);
+                       if (error)
+                               return error;
+               } else {
+                       blk->bp = NULL;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Look up a filename in a node attribute list.
+ *
+ * This routine gets called for any attribute fork that has more than one
+ * block, i.e. both true Btree attr lists and single leaf blocks with
+ * "remote" values taking up more blocks.
+ */
+STATIC int
+xfs_attr_node_get(xfs_da_args_t *args)
+{
+       xfs_da_state_t *state;
+       xfs_da_state_blk_t *blk;
+       int error, retval;
+       int i;
+
+       trace_xfs_attr_node_get(args);
+
+       state = xfs_da_state_alloc();
+       state->args = args;
+       state->mp = args->dp->i_mount;
+
+       /*
+        * Search to see if name exists, and get back a pointer to it.
+        */
+       error = xfs_da3_node_lookup_int(state, &retval);
+       if (error) {
+               retval = error;
+       } else if (retval == -EEXIST) {
+               blk = &state->path.blk[state->path.active - 1];
+               ASSERT(blk->bp != NULL);
+               ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+               /*
+                * Get the value, local or "remote"
+                */
+               retval = xfs_attr3_leaf_getvalue(blk->bp, args);
+               if (!retval && (args->rmtblkno > 0) &&
+                   !(args->flags & ATTR_KERNOVAL)) {
+                       retval = xfs_attr_rmtval_get(args);
+               }
+       }
+
+       /*
+        * Release the buffers attached to the state; xfs_trans_brelse()
+        * copes with both the transaction and no-transaction cases.
+        */
+       for (i = 0; i < state->path.active; i++) {
+               xfs_trans_brelse(args->trans, state->path.blk[i].bp);
+               state->path.blk[i].bp = NULL;
+       }
+
+       xfs_da_state_free(state);
+       return retval;
+}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
new file mode 100644 (file)
index 0000000..b1f73db
--- /dev/null
@@ -0,0 +1,2697 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
+#include "xfs_dir2.h"
+
+
+/*
+ * xfs_attr_leaf.c
+ *
+ * Routines to implement leaf blocks of attributes as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args,
+                                xfs_dablk_t which_block, struct xfs_buf **bpp);
+STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer,
+                                  struct xfs_attr3_icleaf_hdr *ichdr,
+                                  struct xfs_da_args *args, int freemap_index);
+STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args,
+                                  struct xfs_attr3_icleaf_hdr *ichdr,
+                                  struct xfs_buf *leaf_buffer);
+STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state,
+                                                  xfs_da_state_blk_t *blk1,
+                                                  xfs_da_state_blk_t *blk2);
+STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
+                       xfs_da_state_blk_t *leaf_blk_1,
+                       struct xfs_attr3_icleaf_hdr *ichdr1,
+                       xfs_da_state_blk_t *leaf_blk_2,
+                       struct xfs_attr3_icleaf_hdr *ichdr2,
+                       int *number_entries_in_blk1,
+                       int *number_usedbytes_in_blk1);
+
+/*
+ * Utility routines.
+ */
+STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
+                       struct xfs_attr_leafblock *src_leaf,
+                       struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
+                       struct xfs_attr_leafblock *dst_leaf,
+                       struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
+                       int move_count);
+STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
+
+void
+xfs_attr3_leaf_hdr_from_disk(
+       struct xfs_attr3_icleaf_hdr     *to,
+       struct xfs_attr_leafblock       *from)
+{
+       int     i;
+
+       ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+              from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+       if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+               struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from;
+
+               to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+               to->back = be32_to_cpu(hdr3->info.hdr.back);
+               to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+               to->count = be16_to_cpu(hdr3->count);
+               to->usedbytes = be16_to_cpu(hdr3->usedbytes);
+               to->firstused = be16_to_cpu(hdr3->firstused);
+               to->holes = hdr3->holes;
+
+               for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+                       to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base);
+                       to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size);
+               }
+               return;
+       }
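+       /* Old-style (non-CRC) on-disk header. */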
+       to->forw = be32_to_cpu(from->hdr.info.forw);
+       to->back = be32_to_cpu(from->hdr.info.back);
+       to->magic = be16_to_cpu(from->hdr.info.magic);
+       to->count = be16_to_cpu(from->hdr.count);
+       to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
+       to->firstused = be16_to_cpu(from->hdr.firstused);
+       to->holes = from->hdr.holes;
+
+       for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+               to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base);
+               to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size);
+       }
+}
+
+void
+xfs_attr3_leaf_hdr_to_disk(
+       struct xfs_attr_leafblock       *to,
+       struct xfs_attr3_icleaf_hdr     *from)
+{
+       int     i;
+
+       ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
+              from->magic == XFS_ATTR3_LEAF_MAGIC);
+
+       if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
+               struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to;
+
+               hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+               hdr3->info.hdr.back = cpu_to_be32(from->back);
+               hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+               hdr3->count = cpu_to_be16(from->count);
+               hdr3->usedbytes = cpu_to_be16(from->usedbytes);
+               hdr3->firstused = cpu_to_be16(from->firstused);
+               hdr3->holes = from->holes;
+               hdr3->pad1 = 0;
+
+               for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+                       hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base);
+                       hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size);
+               }
+               return;
+       }
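+       /* Old-style (non-CRC) on-disk header. */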
+       to->hdr.info.forw = cpu_to_be32(from->forw);
+       to->hdr.info.back = cpu_to_be32(from->back);
+       to->hdr.info.magic = cpu_to_be16(from->magic);
+       to->hdr.count = cpu_to_be16(from->count);
+       to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
+       to->hdr.firstused = cpu_to_be16(from->firstused);
+       to->hdr.holes = from->holes;
+       to->hdr.pad1 = 0;
+
+       for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+               to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base);
+               to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size);
+       }
+}
+
+static bool
+xfs_attr3_leaf_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_attr_leafblock *leaf = bp->b_addr;
+       struct xfs_attr3_icleaf_hdr ichdr;
+
+       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+               if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
+                       return false;
+
+               if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+                       return false;
+               if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+                       return false;
+       } else {
+               if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
+                       return false;
+       }
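+       /* An attr leaf block with no entries is treated as corrupt. */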
+       if (ichdr.count == 0)
+               return false;
+
+       /* XXX: need to range check rest of attr header values */
+       /* XXX: hash order check? */
+
+       return true;
+}
+
+static void
+xfs_attr3_leaf_write_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+       struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
+
+       if (!xfs_attr3_leaf_verify(bp)) {
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               return;
+       }
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (bip)
+               hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+       xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
+}
+
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
+static void
+xfs_attr3_leaf_read_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb) &&
+            !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
+               xfs_buf_ioerror(bp, -EFSBADCRC);
+       else if (!xfs_attr3_leaf_verify(bp))
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+       if (bp->b_error)
+               xfs_verifier_error(bp);
+}
+
+const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+       .verify_read = xfs_attr3_leaf_read_verify,
+       .verify_write = xfs_attr3_leaf_write_verify,
+};
+
+int
+xfs_attr3_leaf_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             bno,
+       xfs_daddr_t             mappedbno,
+       struct xfs_buf          **bpp)
+{
+       int                     err;
+
+       err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+                               XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
+       if (!err && tp)
+               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
+       return err;
+}
+
+/*========================================================================
+ * Namespace helper routines
+ *========================================================================*/
+
+/*
+ * If namespace bits don't match return 0.
+ * If all match then return 1.
+ */
+STATIC int
+xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
+{
+       return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
+}
+
+
+/*========================================================================
+ * External routines when attribute fork size < XFS_LITINO(mp).
+ *========================================================================*/
+
+/*
+ * Query whether the requested number of additional bytes of extended
+ * attribute space will be able to fit inline.
+ *
+ * Returns zero if not, else the di_forkoff fork offset to be used in the
+ * literal area for attribute data once the new bytes have been added.
+ *
+ * di_forkoff must be 8-byte aligned and is hence stored as a >>3 value;
+ * dev/uuid inodes are special-cased as they have fixed-size data forks.
+ */
+int
+xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
+{
+       int offset;
+       int minforkoff; /* lower limit on valid forkoff locations */
+       int maxforkoff; /* upper limit on valid forkoff locations */
+       int dsize;
+       xfs_mount_t *mp = dp->i_mount;
+
+       /* rounded down */
+       offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
+
+       switch (dp->i_d.di_format) {
+       case XFS_DINODE_FMT_DEV:
+               minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+               return (offset >= minforkoff) ? minforkoff : 0;
+       case XFS_DINODE_FMT_UUID:
+               minforkoff = roundup(sizeof(uuid_t), 8) >> 3;
+               return (offset >= minforkoff) ? minforkoff : 0;
+       }
+
+       /*
+        * If the requested number of bytes is smaller than or equal to the
+        * current attribute fork size we can always proceed.
+        *
+        * Note that if_bytes in the data fork might actually be larger than
+        * the current data fork size due to delalloc extents. In that
+        * case either the extent count will go down when they are converted
+        * to real extents, or the delalloc conversion will take care of the
+        * literal area rebalancing.
+        */
+       if (bytes <= XFS_IFORK_ASIZE(dp))
+               return dp->i_d.di_forkoff;
+
+       /*
+        * For attr2 we can try to move the forkoff if there is space in the
+        * literal area, but for the old format we are done if there is no
+        * space in the fixed attribute fork.
+        */
+       if (!(mp->m_flags & XFS_MOUNT_ATTR2))
+               return 0;
+
+       dsize = dp->i_df.if_bytes;
+
+       switch (dp->i_d.di_format) {
+       case XFS_DINODE_FMT_EXTENTS:
+               /*
+                * If there is no attr fork and the data fork is extents,
+                * determine if creating the default attr fork will result
+                * in the extents form migrating to btree. If so, the
+                * minimum offset only needs to be the space required for
+                * the btree root.
+                */
+               if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
+                   xfs_default_attroffset(dp))
+                       dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+               break;
+       case XFS_DINODE_FMT_BTREE:
+               /*
+                * If the data fork is in btree format, keep the existing
+                * forkoff if there is one; otherwise we are adding a new
+                * attr, so set minforkoff to just past the btree root so
+                * there is plenty of room for attrs.
+                */
+               if (dp->i_d.di_forkoff) {
+                       if (offset < dp->i_d.di_forkoff)
+                               return 0;
+                       return dp->i_d.di_forkoff;
+               }
+               dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
+               break;
+       }
+
+       /*
+        * A data fork btree root must have space for at least
+        * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
+        */
+       minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
+       minforkoff = roundup(minforkoff, 8) >> 3;
+
+       /* attr fork btree root can have at least this many key/ptr pairs */
+       maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
+                       XFS_BMDR_SPACE_CALC(MINABTPTRS);
+       maxforkoff = maxforkoff >> 3;   /* rounded down */
+
+       if (offset >= maxforkoff)
+               return maxforkoff;
+       if (offset >= minforkoff)
+               return offset;
+       return 0;
+}
+
+/*
+ * Switch on the ATTR2 superblock bit (implies also FEATURES2)
+ */
+STATIC void
+xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
+{
+       if ((mp->m_flags & XFS_MOUNT_ATTR2) &&
+           !(xfs_sb_version_hasattr2(&mp->m_sb))) {
+               spin_lock(&mp->m_sb_lock);
+               if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
+                       xfs_sb_version_addattr2(&mp->m_sb);
+                       spin_unlock(&mp->m_sb_lock);
+                       xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+               } else {
+                       spin_unlock(&mp->m_sb_lock);
+               }
+       }
+}
+
+/*
+ * Create the initial contents of a shortform attribute list.
+ */
+void
+xfs_attr_shortform_create(xfs_da_args_t *args)
+{
+       xfs_attr_sf_hdr_t *hdr;
+       xfs_inode_t *dp;
+       xfs_ifork_t *ifp;
+
+       trace_xfs_attr_sf_create(args);
+
+       dp = args->dp;
+       ASSERT(dp != NULL);
+       ifp = dp->i_afp;
+       ASSERT(ifp != NULL);
+       ASSERT(ifp->if_bytes == 0);
+       if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) {
+               ifp->if_flags &= ~XFS_IFEXTENTS;        /* just in case */
+               dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL;
+               ifp->if_flags |= XFS_IFINLINE;
+       } else {
+               ASSERT(ifp->if_flags & XFS_IFINLINE);
+       }
+       xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
+       hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
+       hdr->count = 0;
+       hdr->totsize = cpu_to_be16(sizeof(*hdr));
+       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+}
+
+/*
+ * Add a name/value pair to the shortform attribute list.
+ * Overflow from the inode has already been checked for.
+ */
+void
+xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
+{
+       xfs_attr_shortform_t *sf;
+       xfs_attr_sf_entry_t *sfe;
+       int i, offset, size;
+       xfs_mount_t *mp;
+       xfs_inode_t *dp;
+       xfs_ifork_t *ifp;
+
+       trace_xfs_attr_sf_add(args);
+
+       dp = args->dp;
+       mp = dp->i_mount;
+       dp->i_d.di_forkoff = forkoff;
+
+       ifp = dp->i_afp;
+       ASSERT(ifp->if_flags & XFS_IFINLINE);
+       sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+       sfe = &sf->list[0];
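+       /*
+        * Walk past the existing entries to find the insertion point; in
+        * DEBUG builds, assert that the name is not already present.
+        */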
+       for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+#ifdef DEBUG
+               if (sfe->namelen != args->namelen)
+                       continue;
+               if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
+                       continue;
+               if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+                       continue;
+               ASSERT(0);
+#endif
+       }
+
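+       /*
+        * xfs_idata_realloc() may move if_data, so remember the insertion
+        * offset and recompute the pointers afterwards.
+        */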
+       offset = (char *)sfe - (char *)sf;
+       size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+       xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
+       sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+       sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
+
+       sfe->namelen = args->namelen;
+       sfe->valuelen = args->valuelen;
+       sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+       memcpy(sfe->nameval, args->name, args->namelen);
+       memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
+       sf->hdr.count++;
+       be16_add_cpu(&sf->hdr.totsize, size);
+       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+
+       xfs_sbversion_add_attr2(mp, args->trans);
+}
+
+/*
+ * After the last attribute is removed, revert to the original inode format,
+ * making the entire literal area available to the data fork once more.
+ */
+STATIC void
+xfs_attr_fork_reset(
+       struct xfs_inode        *ip,
+       struct xfs_trans        *tp)
+{
+       xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+       ip->i_d.di_forkoff = 0;
+       ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+
+       ASSERT(ip->i_d.di_anextents == 0);
+       ASSERT(ip->i_afp == NULL);
+
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
+ * Remove an attribute from the shortform attribute list structure.
+ */
+int
+xfs_attr_shortform_remove(xfs_da_args_t *args)
+{
+       xfs_attr_shortform_t *sf;
+       xfs_attr_sf_entry_t *sfe;
+       int base, size=0, end, totsize, i;
+       xfs_mount_t *mp;
+       xfs_inode_t *dp;
+
+       trace_xfs_attr_sf_remove(args);
+
+       dp = args->dp;
+       mp = dp->i_mount;
+       base = sizeof(xfs_attr_sf_hdr_t);
+       sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+       sfe = &sf->list[0];
+       end = sf->hdr.count;
+       for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
+                                       base += size, i++) {
+               size = XFS_ATTR_SF_ENTSIZE(sfe);
+               if (sfe->namelen != args->namelen)
+                       continue;
+               if (memcmp(sfe->nameval, args->name, args->namelen) != 0)
+                       continue;
+               if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+                       continue;
+               break;
+       }
+       if (i == end)
+               return -ENOATTR;
+
+       /*
+        * Fix up the attribute fork data, covering the hole
+        */
+       end = base + size;
+       totsize = be16_to_cpu(sf->hdr.totsize);
+       if (end != totsize)
+               memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end);
+       sf->hdr.count--;
+       be16_add_cpu(&sf->hdr.totsize, -size);
+
+       /*
+        * Fix up the start offset of the attribute fork
+        */
+       totsize -= size;
+       if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
+           (mp->m_flags & XFS_MOUNT_ATTR2) &&
+           (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+           !(args->op_flags & XFS_DA_OP_ADDNAME)) {
+               xfs_attr_fork_reset(dp, args->trans);
+       } else {
+               xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+               dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
+               ASSERT(dp->i_d.di_forkoff);
+               ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
+                               (args->op_flags & XFS_DA_OP_ADDNAME) ||
+                               !(mp->m_flags & XFS_MOUNT_ATTR2) ||
+                               dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
+               xfs_trans_log_inode(args->trans, dp,
+                                       XFS_ILOG_CORE | XFS_ILOG_ADATA);
+       }
+
+       xfs_sbversion_add_attr2(mp, args->trans);
+
+       return 0;
+}
+
+/*
+ * Look up a name in a shortform attribute list structure.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_lookup(xfs_da_args_t *args)
+{
+       xfs_attr_shortform_t *sf;
+       xfs_attr_sf_entry_t *sfe;
+       int i;
+       xfs_ifork_t *ifp;
+
+       trace_xfs_attr_sf_lookup(args);
+
+       ifp = args->dp->i_afp;
+       ASSERT(ifp->if_flags & XFS_IFINLINE);
+       sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+       sfe = &sf->list[0];
+       for (i = 0; i < sf->hdr.count;
+                               sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+               if (sfe->namelen != args->namelen)
+                       continue;
+               if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
+                       continue;
+               if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+                       continue;
+               return -EEXIST;
+       }
+       return -ENOATTR;
+}
+
+/*
+ * Retrieve the value of a name in a shortform attribute list structure.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_getvalue(xfs_da_args_t *args)
+{
+       xfs_attr_shortform_t *sf;
+       xfs_attr_sf_entry_t *sfe;
+       int i;
+
+       ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
+       sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
+       sfe = &sf->list[0];
+       for (i = 0; i < sf->hdr.count;
+                               sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+               if (sfe->namelen != args->namelen)
+                       continue;
+               if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
+                       continue;
+               if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+                       continue;
+               if (args->flags & ATTR_KERNOVAL) {
+                       args->valuelen = sfe->valuelen;
+                       return -EEXIST;
+               }
+               if (args->valuelen < sfe->valuelen) {
+                       args->valuelen = sfe->valuelen;
+                       return -ERANGE;
+               }
+               args->valuelen = sfe->valuelen;
+               memcpy(args->value, &sfe->nameval[args->namelen],
+                                                   args->valuelen);
+               return -EEXIST;
+       }
+       return -ENOATTR;
+}
+
+/*
+ * Convert from using the shortform to the leaf.
+ */
+int
+xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
+{
+       xfs_inode_t *dp;
+       xfs_attr_shortform_t *sf;
+       xfs_attr_sf_entry_t *sfe;
+       xfs_da_args_t nargs;
+       char *tmpbuffer;
+       int error, i, size;
+       xfs_dablk_t blkno;
+       struct xfs_buf *bp;
+       xfs_ifork_t *ifp;
+
+       trace_xfs_attr_sf_to_leaf(args);
+
+       dp = args->dp;
+       ifp = dp->i_afp;
+       sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+       size = be16_to_cpu(sf->hdr.totsize);
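+       /* Copy the shortform data aside before emptying the attr fork. */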
+       tmpbuffer = kmem_alloc(size, KM_SLEEP);
+       ASSERT(tmpbuffer != NULL);
+       memcpy(tmpbuffer, ifp->if_u1.if_data, size);
+       sf = (xfs_attr_shortform_t *)tmpbuffer;
+
+       xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+       xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
+
+       bp = NULL;
+       error = xfs_da_grow_inode(args, &blkno);
+       if (error) {
+               /*
+                * If we hit an IO error in the middle of the transaction
+                * inside grow_inode(), we may have inconsistent data. Bail out.
+                */
+               if (error == -EIO)
+                       goto out;
+               xfs_idata_realloc(dp, size, XFS_ATTR_FORK);     /* try to put */
+               memcpy(ifp->if_u1.if_data, tmpbuffer, size);    /* it back */
+               goto out;
+       }
+
+       ASSERT(blkno == 0);
+       error = xfs_attr3_leaf_create(args, blkno, &bp);
+       if (error) {
+               error = xfs_da_shrink_inode(args, 0, bp);
+               bp = NULL;
+               if (error)
+                       goto out;
+               xfs_idata_realloc(dp, size, XFS_ATTR_FORK);     /* try to put */
+               memcpy(ifp->if_u1.if_data, tmpbuffer, size);    /* it back */
+               goto out;
+       }
+
+       memset(&nargs, 0, sizeof(nargs));
+       nargs.dp = dp;
+       nargs.geo = args->geo;
+       nargs.firstblock = args->firstblock;
+       nargs.flist = args->flist;
+       nargs.total = args->total;
+       nargs.whichfork = XFS_ATTR_FORK;
+       nargs.trans = args->trans;
+       nargs.op_flags = XFS_DA_OP_OKNOENT;
+
+       sfe = &sf->list[0];
+       for (i = 0; i < sf->hdr.count; i++) {
+               nargs.name = sfe->nameval;
+               nargs.namelen = sfe->namelen;
+               nargs.value = &sfe->nameval[nargs.namelen];
+               nargs.valuelen = sfe->valuelen;
+               nargs.hashval = xfs_da_hashname(sfe->nameval,
+                                               sfe->namelen);
+               nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
+               error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* sets nargs.index */
+               ASSERT(error == -ENOATTR);
+               error = xfs_attr3_leaf_add(bp, &nargs);
+               ASSERT(error != -ENOSPC);
+               if (error)
+                       goto out;
+               sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+       }
+       error = 0;
+
+out:
+       kmem_free(tmpbuffer);
+       return error;
+}
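+
+/*
+ * Sketch of the conversion above (illustrative): the shortform fork data
+ * is snapshotted into tmpbuffer, the fork is emptied, a fresh leaf block
+ * is created at dablk 0, and every shortform entry is re-added through
+ * xfs_attr3_leaf_add().  If we fail before the leaf exists, the snapshot
+ * is copied back so the fork is left unchanged.
+ */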
+
+/*
+ * Check a leaf attribute block to see if all the entries would fit into
+ * a shortform attribute list.
+ */
+int
+xfs_attr_shortform_allfit(
+       struct xfs_buf          *bp,
+       struct xfs_inode        *dp)
+{
+       struct xfs_attr_leafblock *leaf;
+       struct xfs_attr_leaf_entry *entry;
+       xfs_attr_leaf_name_local_t *name_loc;
+       struct xfs_attr3_icleaf_hdr leafhdr;
+       int                     bytes;
+       int                     i;
+
+       leaf = bp->b_addr;
+       xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+       entry = xfs_attr3_leaf_entryp(leaf);
+
+       bytes = sizeof(struct xfs_attr_sf_hdr);
+       for (i = 0; i < leafhdr.count; entry++, i++) {
+               if (entry->flags & XFS_ATTR_INCOMPLETE)
+                       continue;               /* don't copy partial entries */
+               if (!(entry->flags & XFS_ATTR_LOCAL))
+                       return 0;
+               name_loc = xfs_attr3_leaf_name_local(leaf, i);
+               if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+                       return 0;
+               if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
+                       return 0;
+               bytes += sizeof(struct xfs_attr_sf_entry) - 1
+                               + name_loc->namelen
+                               + be16_to_cpu(name_loc->valuelen);
+       }
+       if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
+           (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+           (bytes == sizeof(struct xfs_attr_sf_hdr)))
+               return -1;
+       return xfs_attr_shortform_bytesfit(dp, bytes);
+}
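+
+/*
+ * Worked example for the byte count above (illustrative; assumes
+ * sizeof(struct xfs_attr_sf_entry) == 4, i.e. a 3-byte entry header plus
+ * the first nameval byte): a local attr with namelen == 5 and
+ * valuelen == 10 adds 4 - 1 + 5 + 10 == 18 bytes, and the running total
+ * is finally checked against the inode fork space by
+ * xfs_attr_shortform_bytesfit().
+ */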
+
+/*
+ * Convert a leaf attribute list to shortform attribute list
+ */
+int
+xfs_attr3_leaf_to_shortform(
+       struct xfs_buf          *bp,
+       struct xfs_da_args      *args,
+       int                     forkoff)
+{
+       struct xfs_attr_leafblock *leaf;
+       struct xfs_attr3_icleaf_hdr ichdr;
+       struct xfs_attr_leaf_entry *entry;
+       struct xfs_attr_leaf_name_local *name_loc;
+       struct xfs_da_args      nargs;
+       struct xfs_inode        *dp = args->dp;
+       char                    *tmpbuffer;
+       int                     error;
+       int                     i;
+
+       trace_xfs_attr_leaf_to_sf(args);
+
+       tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+       if (!tmpbuffer)
+               return -ENOMEM;
+
+       memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+
+       leaf = (xfs_attr_leafblock_t *)tmpbuffer;
+       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       entry = xfs_attr3_leaf_entryp(leaf);
+
+       /* XXX (dgc): buffer is about to be marked stale - why zero it? */
+       memset(bp->b_addr, 0, args->geo->blksize);
+
+       /*
+        * Clean out the prior contents of the attribute list.
+        */
+       error = xfs_da_shrink_inode(args, 0, bp);
+       if (error)
+               goto out;
+
+       if (forkoff == -1) {
+               ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
+               ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
+               xfs_attr_fork_reset(dp, args->trans);
+               goto out;
+       }
+
+       xfs_attr_shortform_create(args);
+
+       /*
+        * Copy the attributes
+        */
+       memset(&nargs, 0, sizeof(nargs));
+       nargs.geo = args->geo;
+       nargs.dp = dp;
+       nargs.firstblock = args->firstblock;
+       nargs.flist = args->flist;
+       nargs.total = args->total;
+       nargs.whichfork = XFS_ATTR_FORK;
+       nargs.trans = args->trans;
+       nargs.op_flags = XFS_DA_OP_OKNOENT;
+
+       for (i = 0; i < ichdr.count; entry++, i++) {
+               if (entry->flags & XFS_ATTR_INCOMPLETE)
+                       continue;       /* don't copy partial entries */
+               if (!entry->nameidx)
+                       continue;
+               ASSERT(entry->flags & XFS_ATTR_LOCAL);
+               name_loc = xfs_attr3_leaf_name_local(leaf, i);
+               nargs.name = name_loc->nameval;
+               nargs.namelen = name_loc->namelen;
+               nargs.value = &name_loc->nameval[nargs.namelen];
+               nargs.valuelen = be16_to_cpu(name_loc->valuelen);
+               nargs.hashval = be32_to_cpu(entry->hashval);
+               nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
+               xfs_attr_shortform_add(&nargs, forkoff);
+       }
+       error = 0;
+
+out:
+       kmem_free(tmpbuffer);
+       return error;
+}
+
+/*
+ * Convert from using a single leaf to a root node and a leaf.
+ */
+int
+xfs_attr3_leaf_to_node(
+       struct xfs_da_args      *args)
+{
+       struct xfs_attr_leafblock *leaf;
+       struct xfs_attr3_icleaf_hdr icleafhdr;
+       struct xfs_attr_leaf_entry *entries;
+       struct xfs_da_node_entry *btree;
+       struct xfs_da3_icnode_hdr icnodehdr;
+       struct xfs_da_intnode   *node;
+       struct xfs_inode        *dp = args->dp;
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_buf          *bp1 = NULL;
+       struct xfs_buf          *bp2 = NULL;
+       xfs_dablk_t             blkno;
+       int                     error;
+
+       trace_xfs_attr_leaf_to_node(args);
+
+       error = xfs_da_grow_inode(args, &blkno);
+       if (error)
+               goto out;
+       error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1);
+       if (error)
+               goto out;
+
+       error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK);
+       if (error)
+               goto out;
+
+       /* copy leaf to new buffer, update identifiers */
+       xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
+       bp2->b_ops = bp1->b_ops;
+       memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
+               hdr3->blkno = cpu_to_be64(bp2->b_bn);
+       }
+       xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
+
+       /*
+        * Set up the new root node.
+        */
+       error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
+       if (error)
+               goto out;
+       node = bp1->b_addr;
+       dp->d_ops->node_hdr_from_disk(&icnodehdr, node);
+       btree = dp->d_ops->node_tree_p(node);
+
+       leaf = bp2->b_addr;
+       xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf);
+       entries = xfs_attr3_leaf_entryp(leaf);
+
+       /* both on-disk, don't endian-flip twice */
+       btree[0].hashval = entries[icleafhdr.count - 1].hashval;
+       btree[0].before = cpu_to_be32(blkno);
+       icnodehdr.count = 1;
+       dp->d_ops->node_hdr_to_disk(node, &icnodehdr);
+       xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1);
+       error = 0;
+out:
+       return error;
+}
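+
+/*
+ * Shape of the conversion above (illustrative): dablk 0 holds the
+ * existing leaf; xfs_da_grow_inode() allocates "blkno", the leaf is
+ * copied there (bp1 -> bp2), and dablk 0 is rewritten as a one-entry
+ * node whose btree[0] carries the leaf's highest hashval and points at
+ * "blkno".  Lookups then descend root node -> leaf.
+ */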
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of a leaf attribute list
+ * or a leaf in a node attribute list.
+ */
+STATIC int
+xfs_attr3_leaf_create(
+       struct xfs_da_args      *args,
+       xfs_dablk_t             blkno,
+       struct xfs_buf          **bpp)
+{
+       struct xfs_attr_leafblock *leaf;
+       struct xfs_attr3_icleaf_hdr ichdr;
+       struct xfs_inode        *dp = args->dp;
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_buf          *bp;
+       int                     error;
+
+       trace_xfs_attr_leaf_create(args);
+
+       error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
+                                           XFS_ATTR_FORK);
+       if (error)
+               return error;
+       bp->b_ops = &xfs_attr3_leaf_buf_ops;
+       xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
+       leaf = bp->b_addr;
+       memset(leaf, 0, args->geo->blksize);
+
+       memset(&ichdr, 0, sizeof(ichdr));
+       ichdr.firstused = args->geo->blksize;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
+
+               ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
+
+               hdr3->blkno = cpu_to_be64(bp->b_bn);
+               hdr3->owner = cpu_to_be64(dp->i_ino);
+               uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+               ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
+       } else {
+               ichdr.magic = XFS_ATTR_LEAF_MAGIC;
+               ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr);
+       }
+       ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
+
+       xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+       xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
+
+       *bpp = bp;
+       return 0;
+}
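+
+/*
+ * Illustrative initial layout (assuming a 4096-byte block): on a v5
+ * (CRC-enabled) filesystem freemap[0].base == sizeof(struct
+ * xfs_attr3_leaf_hdr) and freemap[0].size == 4096 - base, so everything
+ * past the header is a single free region and firstused == 4096 until
+ * the first name is added.
+ */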
+
+/*
+ * Split the leaf node, rebalance, then add the new entry.
+ */
+int
+xfs_attr3_leaf_split(
+       struct xfs_da_state     *state,
+       struct xfs_da_state_blk *oldblk,
+       struct xfs_da_state_blk *newblk)
+{
+       xfs_dablk_t blkno;
+       int error;
+
+       trace_xfs_attr_leaf_split(state->args);
+
+       /*
+        * Allocate space for a new leaf node.
+        */
+       ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
+       error = xfs_da_grow_inode(state->args, &blkno);
+       if (error)
+               return error;
+       error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
+       if (error)
+               return error;
+       newblk->blkno = blkno;
+       newblk->magic = XFS_ATTR_LEAF_MAGIC;
+
+       /*
+        * Rebalance the entries across the two leaves.
+        * NOTE: rebalance() currently depends on the 2nd block being empty.
+        */
+       xfs_attr3_leaf_rebalance(state, oldblk, newblk);
+       error = xfs_da3_blk_link(state, oldblk, newblk);
+       if (error)
+               return error;
+
+       /*
+        * Save info on the "old" attribute for "atomic rename" ops: leaf_add()
+        * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
+        * "new" attr's info.  We will need the "old" info to remove it later.
+        *
+        * Insert the "new" entry in the correct block.
+        */
+       if (state->inleaf) {
+               trace_xfs_attr_leaf_add_old(state->args);
+               error = xfs_attr3_leaf_add(oldblk->bp, state->args);
+       } else {
+               trace_xfs_attr_leaf_add_new(state->args);
+               error = xfs_attr3_leaf_add(newblk->bp, state->args);
+       }
+
+       /*
+        * Update last hashval in each block since we added the name.
+        */
+       oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
+       newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
+       return error;
+}
+
+/*
+ * Add a name to the leaf attribute list structure.
+ */
+int
+xfs_attr3_leaf_add(
+       struct xfs_buf          *bp,
+       struct xfs_da_args      *args)
+{
+       struct xfs_attr_leafblock *leaf;
+       struct xfs_attr3_icleaf_hdr ichdr;
+       int                     tablesize;
+       int                     entsize;
+       int                     sum;
+       int                     tmp;
+       int                     i;
+
+       trace_xfs_attr_leaf_add(args);
+
+       leaf = bp->b_addr;
+       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       ASSERT(args->index >= 0 && args->index <= ichdr.count);
+       entsize = xfs_attr_leaf_newentsize(args, NULL);
+
+       /*
+        * Search through freemap for first-fit on new name length.
+        * (may need to figure in size of entry struct too)
+        */
+       tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t)
+                                       + xfs_attr3_leaf_hdr_size(leaf);
+       for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) {
+               if (tablesize > ichdr.firstused) {
+                       sum += ichdr.freemap[i].size;
+                       continue;
+               }
+               if (!ichdr.freemap[i].size)
+                       continue;       /* no space in this map */
+               tmp = entsize;
+               if (ichdr.freemap[i].base < ichdr.firstused)
+                       tmp += sizeof(xfs_attr_leaf_entry_t);
+               if (ichdr.freemap[i].size >= tmp) {
+                       tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i);
+                       goto out_log_hdr;
+               }
+               sum += ichdr.freemap[i].size;
+       }
+
+       /*
+        * If there are no holes in the address space of the block,
+        * and we don't have enough freespace, then compaction will do us
+        * no good and we should just give up.
+        */
+       if (!ichdr.holes && sum < entsize)
+               return -ENOSPC;
+
+       /*
+        * Compact the entries to coalesce free space.
+        * This may change the hdr->count via dropping INCOMPLETE entries.
+        */
+       xfs_attr3_leaf_compact(args, &ichdr, bp);
+
+       /*
+        * After compaction, the block is guaranteed to have only one
+        * free region, in freemap[0].  If it is not big enough, give up.
+        */
+       if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
+               tmp = -ENOSPC;
+               goto out_log_hdr;
+       }
+
+       tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
+
+out_log_hdr:
+       xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+       xfs_trans_log_buf(args->trans, bp,
+               XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+                               xfs_attr3_leaf_hdr_size(leaf)));
+       return tmp;
+}
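+
+/*
+ * Illustrative first-fit walk (hypothetical freemap contents): with
+ * entsize == 32, freemap == { {40, 24}, {200, 64}, {0, 0} } is walked
+ * from the last slot down: the empty slot is skipped, and {200, 64}
+ * fits (64 >= 32, plus one entry struct when the map's base is below
+ * firstused) and gets used.  Only if no map fits and the block has
+ * holes do we compact and retry against the single region left in
+ * freemap[0].
+ */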
+
+/*
+ * Add a name to a leaf attribute list structure.
+ */
+STATIC int
+xfs_attr3_leaf_add_work(
+       struct xfs_buf          *bp,
+       struct xfs_attr3_icleaf_hdr *ichdr,
+       struct xfs_da_args      *args,
+       int                     mapindex)
+{
+       struct xfs_attr_leafblock *leaf;
+       struct xfs_attr_leaf_entry *entry;
+       struct xfs_attr_leaf_name_local *name_loc;
+       struct xfs_attr_leaf_name_remote *name_rmt;
+       struct xfs_mount        *mp;
+       int                     tmp;
+       int                     i;
+
+       trace_xfs_attr_leaf_add_work(args);
+
+       leaf = bp->b_addr;
+       ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE);
+       ASSERT(args->index >= 0 && args->index <= ichdr->count);
+
+       /*
+        * Force open some space in the entry array and fill it in.
+        */
+       entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+       if (args->index < ichdr->count) {
+               tmp  = ichdr->count - args->index;
+               tmp *= sizeof(xfs_attr_leaf_entry_t);
+               memmove(entry + 1, entry, tmp);
+               xfs_trans_log_buf(args->trans, bp,
+                   XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
+       }
+       ichdr->count++;
+
+       /*
+        * Allocate space for the new string (at the end of the run).
+        */
+       mp = args->trans->t_mountp;
+       ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize);
+       ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
+       ASSERT(ichdr->freemap[mapindex].size >=
+               xfs_attr_leaf_newentsize(args, NULL));
+       ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize);
+       ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);
+
+       ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp);
+
+       entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
+                                    ichdr->freemap[mapindex].size);
+       entry->hashval = cpu_to_be32(args->hashval);
+       entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
+       entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+       if (args->op_flags & XFS_DA_OP_RENAME) {
+               entry->flags |= XFS_ATTR_INCOMPLETE;
+               if ((args->blkno2 == args->blkno) &&
+                   (args->index2 <= args->index)) {
+                       args->index2++;
+               }
+       }
+       xfs_trans_log_buf(args->trans, bp,
+                         XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+       ASSERT((args->index == 0) ||
+              (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
+       ASSERT((args->index == ichdr->count - 1) ||
+              (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
+
+       /*
+        * For "remote" attribute values, simply note that we need to
+        * allocate space for the "remote" value.  We can't actually
+        * allocate the extents in this transaction, and we can't decide
+        * which blocks they should be as we might allocate more blocks
+        * as part of this transaction (a split operation for example).
+        */
+       if (entry->flags & XFS_ATTR_LOCAL) {
+               name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+               name_loc->namelen = args->namelen;
+               name_loc->valuelen = cpu_to_be16(args->valuelen);
+               memcpy((char *)name_loc->nameval, args->name, args->namelen);
+               memcpy((char *)&name_loc->nameval[args->namelen], args->value,
+                                  be16_to_cpu(name_loc->valuelen));
+       } else {
+               name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+               name_rmt->namelen = args->namelen;
+               memcpy((char *)name_rmt->name, args->name, args->namelen);
+               entry->flags |= XFS_ATTR_INCOMPLETE;
+               /* just in case */
+               name_rmt->valuelen = 0;
+               name_rmt->valueblk = 0;
+               args->rmtblkno = 1;
+               args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+               args->rmtvaluelen = args->valuelen;
+       }
+       xfs_trans_log_buf(args->trans, bp,
+            XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
+                                  xfs_attr_leaf_entsize(leaf, args->index)));
+
+       /*
+        * Update the control info for this leaf node
+        */
+       if (be16_to_cpu(entry->nameidx) < ichdr->firstused)
+               ichdr->firstused = be16_to_cpu(entry->nameidx);
+
+       ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t)
+                                       + xfs_attr3_leaf_hdr_size(leaf));
+       tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t)
+                                       + xfs_attr3_leaf_hdr_size(leaf);
+
+       for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+               if (ichdr->freemap[i].base == tmp) {
+                       ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
+                       ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t);
+               }
+       }
+       ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
+       return 0;
+}
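+
+/*
+ * Remote-value bookkeeping above (illustrative): the entry is flagged
+ * XFS_ATTR_INCOMPLETE with valuelen/valueblk zeroed; args->rmtblkno == 1
+ * acts as a "remote blocks still needed" marker and rmtblkcnt is sized
+ * from the value length, so a later transaction can allocate and write
+ * the actual value extents.
+ */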
+
+/*
+ * Garbage collect a leaf attribute list block by copying it to a new buffer.
+ */
+STATIC void
+xfs_attr3_leaf_compact(
+       struct xfs_da_args      *args,
+       struct xfs_attr3_icleaf_hdr *ichdr_dst,
+       struct xfs_buf          *bp)
+{
+       struct xfs_attr_leafblock *leaf_src;
+       struct xfs_attr_leafblock *leaf_dst;
+       struct xfs_attr3_icleaf_hdr ichdr_src;
+       struct xfs_trans        *trans = args->trans;
+       char                    *tmpbuffer;
+
+       trace_xfs_attr_leaf_compact(args);
+
+       tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+       memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+       memset(bp->b_addr, 0, args->geo->blksize);
+       leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
+       leaf_dst = bp->b_addr;
+
+       /*
+        * Copy the on-disk header back into the destination buffer to ensure
+        * all the information in the header that is not part of the incore
+        * header structure is preserved.
+        */
+       memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
+
+       /* Initialise the incore headers */
+       ichdr_src = *ichdr_dst; /* struct copy */
+       ichdr_dst->firstused = args->geo->blksize;
+       ichdr_dst->usedbytes = 0;
+       ichdr_dst->count = 0;
+       ichdr_dst->holes = 0;
+       ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
+       ichdr_dst->freemap[0].size = ichdr_dst->firstused -
+                                               ichdr_dst->freemap[0].base;
+
+       /* write the header back to initialise the underlying buffer */
+       xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
+
+       /*
+        * Copy all entries in the same (sorted) order,
+        * but allocate name/value pairs packed and in sequence.
+        */
+       xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0,
+                               leaf_dst, ichdr_dst, 0, ichdr_src.count);
+       /*
+        * This logs the entire buffer, but the caller must write the header
+        * back to the buffer when it is finished modifying it.
+        */
+       xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
+
+       kmem_free(tmpbuffer);
+}
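+
+/*
+ * Net effect of compaction (illustrative): entries keep their sorted
+ * order, but name/value pairs are re-packed contiguously downward from
+ * the end of the block, so afterwards freemap[0] describes the single
+ * free region between the entry table and firstused.
+ */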
+
+/*
+ * Compare two leaf blocks "order".
+ * Return 0 unless leaf2 should go before leaf1.
+ */
+static int
+xfs_attr3_leaf_order(
+       struct xfs_buf  *leaf1_bp,
+       struct xfs_attr3_icleaf_hdr *leaf1hdr,
+       struct xfs_buf  *leaf2_bp,
+       struct xfs_attr3_icleaf_hdr *leaf2hdr)
+{
+       struct xfs_attr_leaf_entry *entries1;
+       struct xfs_attr_leaf_entry *entries2;
+
+       entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr);
+       entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr);
+       if (leaf1hdr->count > 0 && leaf2hdr->count > 0 &&
+           ((be32_to_cpu(entries2[0].hashval) <
+             be32_to_cpu(entries1[0].hashval)) ||
+            (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) <
+             be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) {
+               return 1;
+       }
+       return 0;
+}
+
+int
+xfs_attr_leaf_order(
+       struct xfs_buf  *leaf1_bp,
+       struct xfs_buf  *leaf2_bp)
+{
+       struct xfs_attr3_icleaf_hdr ichdr1;
+       struct xfs_attr3_icleaf_hdr ichdr2;
+
+       xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr);
+       xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr);
+       return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
+}
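+
+/*
+ * Illustrative ordering check (hypothetical hashvals): if leaf1 spans
+ * hashvals [100, 500] and leaf2 spans [40, 90], both comparisons above
+ * see leaf2 sorting below leaf1, so xfs_attr3_leaf_order() returns 1
+ * and the caller swaps the blocks before moving entries.
+ */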
+
+/*
+ * Redistribute the attribute list entries between two leaf nodes,
+ * taking into account the size of the new entry.
+ *
+ * NOTE: if new block is empty, then it will get the upper half of the
+ * old block.  At present, the only caller passes in an empty second block.
+ *
+ * This code adjusts the args->index/blkno and args->index2/blkno2 fields
+ * to match what it is doing in splitting the attribute leaf block.  Those
+ * values are used in "atomic rename" operations on attributes.  Note that
+ * the "new" and "old" values can end up in different blocks.
+ */
+STATIC void
+xfs_attr3_leaf_rebalance(
+       struct xfs_da_state     *state,
+       struct xfs_da_state_blk *blk1,
+       struct xfs_da_state_blk *blk2)
+{
+       struct xfs_da_args      *args;
+       struct xfs_attr_leafblock *leaf1;
+       struct xfs_attr_leafblock *leaf2;
+       struct xfs_attr3_icleaf_hdr ichdr1;
+       struct xfs_attr3_icleaf_hdr ichdr2;
+       struct xfs_attr_leaf_entry *entries1;
+       struct xfs_attr_leaf_entry *entries2;
+       int                     count;
+       int                     totallen;
+       int                     max;
+       int                     space;
+       int                     swap;
+
+       /*
+        * Set up environment.
+        */
+       ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
+       ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
+       leaf1 = blk1->bp->b_addr;
+       leaf2 = blk2->bp->b_addr;
+       xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+       xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+       ASSERT(ichdr2.count == 0);
+       args = state->args;
+
+       trace_xfs_attr_leaf_rebalance(args);
+
+       /*
+        * Check ordering of blocks, reverse if it makes things simpler.
+        *
+        * NOTE: Given that all (current) callers pass in an empty
+        * second block, this code should never set "swap".
+        */
+       swap = 0;
+       if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) {
+               struct xfs_da_state_blk *tmp_blk;
+               struct xfs_attr3_icleaf_hdr tmp_ichdr;
+
+               tmp_blk = blk1;
+               blk1 = blk2;
+               blk2 = tmp_blk;
+
+               /* struct copies to swap them rather than reconverting */
+               tmp_ichdr = ichdr1;
+               ichdr1 = ichdr2;
+               ichdr2 = tmp_ichdr;
+
+               leaf1 = blk1->bp->b_addr;
+               leaf2 = blk2->bp->b_addr;
+               swap = 1;
+       }
+
+       /*
+        * Examine entries until we reduce the absolute difference in
+        * byte usage between the two blocks to a minimum.  Then get
+        * the direction to copy and the number of elements to move.
+        *
+        * "inleaf" is true if the new entry should be inserted into blk1.
+        * If "swap" is also true, then reverse the sense of "inleaf".
+        */
+       state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1,
+                                                     blk2, &ichdr2,
+                                                     &count, &totallen);
+       if (swap)
+               state->inleaf = !state->inleaf;
+
+       /*
+        * Move any entries required from leaf to leaf:
+        */
+       if (count < ichdr1.count) {
+               /*
+                * Figure the total bytes to be added to the destination leaf.
+                */
+               /* number of entries being moved */
+               count = ichdr1.count - count;
+               space  = ichdr1.usedbytes - totallen;
+               space += count * sizeof(xfs_attr_leaf_entry_t);
+
+               /*
+                * leaf2 is the destination, compact it if it looks tight.
+                */
+               max  = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+               max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t);
+               if (space > max)
+                       xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp);
+
+               /*
+                * Move high entries from leaf1 to low end of leaf2.
+                */
+               xfs_attr3_leaf_moveents(args, leaf1, &ichdr1,
+                               ichdr1.count - count, leaf2, &ichdr2, 0, count);
+
+       } else if (count > ichdr1.count) {
+               /*
+                * I assert that since all callers pass in an empty
+                * second buffer, this code should never execute.
+                */
+               ASSERT(0);
+
+               /*
+                * Figure the total bytes to be added to the destination leaf.
+                */
+               /* number of entries being moved */
+               count -= ichdr1.count;
+               space  = totallen - ichdr1.usedbytes;
+               space += count * sizeof(xfs_attr_leaf_entry_t);
+
+               /*
+                * leaf1 is the destination, compact it if it looks tight.
+                */
+               max  = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+               max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t);
+               if (space > max)
+                       xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp);
+
+               /*
+                * Move low entries from leaf2 to high end of leaf1.
+                */
+               xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1,
+                                       ichdr1.count, count);
+       }
+
+       xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1);
+       xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2);
+       xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
+       xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
+
+       /*
+        * Copy out last hashval in each block for B-tree code.
+        */
+       entries1 = xfs_attr3_leaf_entryp(leaf1);
+       entries2 = xfs_attr3_leaf_entryp(leaf2);
+       blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval);
+       blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval);
+
+       /*
+        * Adjust the expected index for insertion.
+        * NOTE: this code depends on the (current) situation that the
+        * second block was originally empty.
+        *
+        * If the insertion point moved to the 2nd block, we must adjust
+        * the index.  We must also track the entry just following the
+        * new entry for use in an "atomic rename" operation, that entry
+        * is always the "old" entry and the "new" entry is what we are
+        * inserting.  The index/blkno fields refer to the "old" entry,
+        * while the index2/blkno2 fields refer to the "new" entry.
+        */
+       if (blk1->index > ichdr1.count) {
+               ASSERT(state->inleaf == 0);
+               blk2->index = blk1->index - ichdr1.count;
+               args->index = args->index2 = blk2->index;
+               args->blkno = args->blkno2 = blk2->blkno;
+       } else if (blk1->index == ichdr1.count) {
+               if (state->inleaf) {
+                       args->index = blk1->index;
+                       args->blkno = blk1->blkno;
+                       args->index2 = 0;
+                       args->blkno2 = blk2->blkno;
+               } else {
+                       /*
+                        * On a double leaf split, the original attr location
+                        * is already stored in blkno2/index2, so don't
+                        * overwrite it, otherwise we corrupt the tree.
+                        */
+                       blk2->index = blk1->index - ichdr1.count;
+                       args->index = blk2->index;
+                       args->blkno = blk2->blkno;
+                       if (!state->extravalid) {
+                               /*
+                                * set the new attr location to match the old
+                                * one and let the higher level split code
+                                * decide where in the leaf to place it.
+                                */
+                               args->index2 = blk2->index;
+                               args->blkno2 = blk2->blkno;
+                       }
+               }
+       } else {
+               ASSERT(state->inleaf == 1);
+               args->index = args->index2 = blk1->index;
+               args->blkno = args->blkno2 = blk1->blkno;
+       }
+}
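+
+/*
+ * Illustrative index fixup (hypothetical counts): if blk1 keeps 10
+ * entries after the move and the insertion point was blk1->index == 13,
+ * the new entry belongs in the second block at index 13 - 10 == 3, so
+ * args->index/blkno (and index2/blkno2 for an atomic rename) are
+ * redirected at blk2.
+ */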
+
+/*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum.
+ * GROT: Is this really necessary?  With other than a 512 byte blocksize,
+ * GROT: there will always be enough room in either block for a new entry.
+ * GROT: Do a double-split for this case?
+ */
+STATIC int
+xfs_attr3_leaf_figure_balance(
+       struct xfs_da_state             *state,
+       struct xfs_da_state_blk         *blk1,
+       struct xfs_attr3_icleaf_hdr     *ichdr1,
+       struct xfs_da_state_blk         *blk2,
+       struct xfs_attr3_icleaf_hdr     *ichdr2,
+       int                             *countarg,
+       int                             *usedbytesarg)
+{
+       struct xfs_attr_leafblock       *leaf1 = blk1->bp->b_addr;
+       struct xfs_attr_leafblock       *leaf2 = blk2->bp->b_addr;
+       struct xfs_attr_leaf_entry      *entry;
+       int                             count;
+       int                             max;
+       int                             index;
+       int                             totallen = 0;
+       int                             half;
+       int                             lastdelta;
+       int                             foundit = 0;
+       int                             tmp;
+
+       /*
+        * Examine entries until we reduce the absolute difference in
+        * byte usage between the two blocks to a minimum.
+        */
+       max = ichdr1->count + ichdr2->count;
+       half = (max + 1) * sizeof(*entry);
+       half += ichdr1->usedbytes + ichdr2->usedbytes +
+                       xfs_attr_leaf_newentsize(state->args, NULL);
+       half /= 2;
+       lastdelta = state->args->geo->blksize;
+       entry = xfs_attr3_leaf_entryp(leaf1);
+       for (count = index = 0; count < max; entry++, index++, count++) {
+
+#define XFS_ATTR_ABS(A)        (((A) < 0) ? -(A) : (A))
+               /*
+                * The new entry is in the first block, account for it.
+                */
+               if (count == blk1->index) {
+                       tmp = totallen + sizeof(*entry) +
+                               xfs_attr_leaf_newentsize(state->args, NULL);
+                       if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+                               break;
+                       lastdelta = XFS_ATTR_ABS(half - tmp);
+                       totallen = tmp;
+                       foundit = 1;
+               }
+
+               /*
+                * Wrap around into the second block if necessary.
+                */
+               if (count == ichdr1->count) {
+                       leaf1 = leaf2;
+                       entry = xfs_attr3_leaf_entryp(leaf1);
+                       index = 0;
+               }
+
+               /*
+                * Figure out if next leaf entry would be too much.
+                */
+               tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
+                                                                       index);
+               if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+                       break;
+               lastdelta = XFS_ATTR_ABS(half - tmp);
+               totallen = tmp;
+#undef XFS_ATTR_ABS
+       }
+
+       /*
+        * Calculate the number of usedbytes that will end up in lower block.
+        * If the new entry is not in the lower block, fix up the count.
+        */
+       totallen -= count * sizeof(*entry);
+       if (foundit) {
+               totallen -= sizeof(*entry) +
+                               xfs_attr_leaf_newentsize(state->args, NULL);
+       }
+
+       *countarg = count;
+       *usedbytesarg = totallen;
+       return foundit;
+}
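+
+/*
+ * Worked "half" example (hypothetical sizes): with 7 entries in blk1,
+ * none in blk2, 900 usedbytes and a 100-byte new entry,
+ * half == ((7 + 1) * sizeof(entry) + 900 + 100) / 2.  The loop then
+ * accumulates per-entry sizes into "tmp" and stops once |half - tmp|
+ * starts growing again; that point is the split boundary.
+ */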
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ *
+ * GROT: allow for INCOMPLETE entries in calculation.
+ */
+int
+xfs_attr3_leaf_toosmall(
+       struct xfs_da_state     *state,
+       int                     *action)
+{
+       struct xfs_attr_leafblock *leaf;
+       struct xfs_da_state_blk *blk;
+       struct xfs_attr3_icleaf_hdr ichdr;
+       struct xfs_buf          *bp;
+       xfs_dablk_t             blkno;
+       int                     bytes;
+       int                     forward;
+       int                     error;
+       int                     retval;
+       int                     i;
+
+       trace_xfs_attr_leaf_toosmall(state->args);
+
+       /*
+        * Check for the degenerate case of the block being over 50% full.
+        * If so, it's not worth even looking to see if we might be able
+        * to coalesce with a sibling.
+        */
+       blk = &state->path.blk[state->path.active - 1];
+       leaf = blk->bp->b_addr;
+       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       bytes = xfs_attr3_leaf_hdr_size(leaf) +
+               ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
+               ichdr.usedbytes;
+       if (bytes > (state->args->geo->blksize >> 1)) {
+               *action = 0;    /* blk over 50%, don't try to join */
+               return 0;
+       }
+
+       /*
+        * Check for the degenerate case of the block being empty.
+        * If the block is empty, we'll simply delete it, no need to
+        * coalesce it with a sibling block.  We choose (arbitrarily)
+        * to merge with the forward block unless it is NULL.
+        */
+       if (ichdr.count == 0) {
+               /*
+                * Make altpath point to the block we want to keep and
+                * path point to the block we want to drop (this one).
+                */
+               forward = (ichdr.forw != 0);
+               memcpy(&state->altpath, &state->path, sizeof(state->path));
+               error = xfs_da3_path_shift(state, &state->altpath, forward,
+                                                0, &retval);
+               if (error)
+                       return error;
+               if (retval) {
+                       *action = 0;
+               } else {
+                       *action = 2;
+               }
+               return 0;
+       }
+
+       /*
+        * Examine each sibling block to see if we can coalesce with
+        * at least 25% free space to spare.  We need to figure out
+        * whether to merge with the forward or the backward block.
+        * We prefer coalescing with the lower numbered sibling so as
+        * to shrink an attribute list over time.
+        */
+       /* start with smaller blk num */
+       forward = ichdr.forw < ichdr.back;
+       for (i = 0; i < 2; forward = !forward, i++) {
+               struct xfs_attr3_icleaf_hdr ichdr2;
+               if (forward)
+                       blkno = ichdr.forw;
+               else
+                       blkno = ichdr.back;
+               if (blkno == 0)
+                       continue;
+               error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
+                                       blkno, -1, &bp);
+               if (error)
+                       return error;
+
+               xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
+
+               bytes = state->args->geo->blksize -
+                       (state->args->geo->blksize >> 2) -
+                       ichdr.usedbytes - ichdr2.usedbytes -
+                       ((ichdr.count + ichdr2.count) *
+                                       sizeof(xfs_attr_leaf_entry_t)) -
+                       xfs_attr3_leaf_hdr_size(leaf);
+
+               xfs_trans_brelse(state->args->trans, bp);
+               if (bytes >= 0)
+                       break;  /* fits with at least 25% to spare */
+       }
+       if (i >= 2) {
+               *action = 0;
+               return 0;
+       }
+
+       /*
+        * Make altpath point to the block we want to keep (the lower
+        * numbered block) and path point to the block we want to drop.
+        */
+       memcpy(&state->altpath, &state->path, sizeof(state->path));
+       if (blkno < blk->blkno) {
+               error = xfs_da3_path_shift(state, &state->altpath, forward,
+                                                0, &retval);
+       } else {
+               error = xfs_da3_path_shift(state, &state->path, forward,
+                                                0, &retval);
+       }
+       if (error)
+               return error;
+       if (retval) {
+               *action = 0;
+       } else {
+               *action = 1;
+       }
+       return 0;
+}
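+
+/*
+ * Illustrative sibling check (assuming a 4096-byte block): a merge is
+ * allowed when 4096 - 1024 - used(this) - used(sibling) - combined entry
+ * table - header stays >= 0, i.e. the joined block would keep at least
+ * 25% (here 1024 bytes) free.
+ */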
+
+/*
+ * Remove a name from the leaf attribute list structure.
+ *
+ * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
+ * If two leaves are each 37% full, combining them will leave 25% free.
+ */
+int
+xfs_attr3_leaf_remove(
+       struct xfs_buf          *bp,
+       struct xfs_da_args      *args)
+{
+       struct xfs_attr_leafblock *leaf;
+       struct xfs_attr3_icleaf_hdr ichdr;
+       struct xfs_attr_leaf_entry *entry;
+       int                     before;
+       int                     after;
+       int                     smallest;
+       int                     entsize;
+       int                     tablesize;
+       int                     tmp;
+       int                     i;
+
+       trace_xfs_attr_leaf_remove(args);
+
+       leaf = bp->b_addr;
+       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+       ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
+       ASSERT(args->index >= 0 && args->index < ichdr.count);
+       ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
+                                       xfs_attr3_leaf_hdr_size(leaf));
+
+       entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+
+       ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+       ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
+
+       /*
+        * Scan through free region table:
+        *    check for adjacency of free'd entry with an existing one,
+        *    find smallest free region in case we need to replace it,
+        *    adjust any map that borders the entry table.
+        */
+       tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t)
+                                       + xfs_attr3_leaf_hdr_size(leaf);
+       tmp = ichdr.freemap[0].size;
+       before = after = -1;
+       smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
+       entsize = xfs_attr_leaf_entsize(leaf, args->index);
+       for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+               ASSERT(ichdr.freemap[i].base < args->geo->blksize);
+               ASSERT(ichdr.freemap[i].size < args->geo->blksize);
+               if (ichdr.freemap[i].base == tablesize) {
+                       ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
+                       ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
+               }
+
+               if (ichdr.freemap[i].base + ichdr.freemap[i].size ==
+                               be16_to_cpu(entry->nameidx)) {
+                       before = i;
+               } else if (ichdr.freemap[i].base ==
+                               (be16_to_cpu(entry->nameidx) + entsize)) {
+                       after = i;
+               } else if (ichdr.freemap[i].size < tmp) {
+                       tmp = ichdr.freemap[i].size;
+                       smallest = i;
+               }
+       }
+
+       /*
+        * Coalesce adjacent freemap regions,
+        * or replace the smallest region.
+        */
+       if ((before >= 0) || (after >= 0)) {
+               if ((before >= 0) && (after >= 0)) {
+                       ichdr.freemap[before].size += entsize;
+                       ichdr.freemap[before].size += ichdr.freemap[after].size;
+                       ichdr.freemap[after].base = 0;
+                       ichdr.freemap[after].size = 0;
+               } else if (before >= 0) {
+                       ichdr.freemap[before].size += entsize;
+               } else {
+                       ichdr.freemap[after].base = be16_to_cpu(entry->nameidx);
+                       ichdr.freemap[after].size += entsize;
+               }
+       } else {
+               /*
+                * Replace smallest region (if it is smaller than free'd entry)
+                */
+               if (ichdr.freemap[smallest].size < entsize) {
+                       ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx);
+                       ichdr.freemap[smallest].size = entsize;
+               }
+       }
+
+       /*
+        * Did we remove the first entry?
+        */
+       if (be16_to_cpu(entry->nameidx) == ichdr.firstused)
+               smallest = 1;
+       else
+               smallest = 0;
+
+       /*
+        * Compress the remaining entries and zero out the removed stuff.
+        */
+       memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize);
+       ichdr.usedbytes -= entsize;
+       xfs_trans_log_buf(args->trans, bp,
+            XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
+                                  entsize));
+
+       tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t);
+       memmove(entry, entry + 1, tmp);
+       ichdr.count--;
+       xfs_trans_log_buf(args->trans, bp,
+           XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t)));
+
+       entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count];
+       memset(entry, 0, sizeof(xfs_attr_leaf_entry_t));
+
+       /*
+        * If we removed the first entry, re-find the first used byte
+        * in the name area.  Note that if the entry was the "firstused",
+        * then we don't have a "hole" in our block resulting from
+        * removing the name.
+        */
+       if (smallest) {
+               tmp = args->geo->blksize;
+               entry = xfs_attr3_leaf_entryp(leaf);
+               for (i = ichdr.count - 1; i >= 0; entry++, i--) {
+                       ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+                       ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
+
+                       if (be16_to_cpu(entry->nameidx) < tmp)
+                               tmp = be16_to_cpu(entry->nameidx);
+               }
+               ichdr.firstused = tmp;
+               if (!ichdr.firstused)
+                       ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN;
+       } else {
+               ichdr.holes = 1;        /* mark as needing compaction */
+       }
+       xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+       xfs_trans_log_buf(args->trans, bp,
+                         XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+                                         xfs_attr3_leaf_hdr_size(leaf)));
+
+       /*
+        * Check if the leaf is less than 37% full; the caller may want to
+        * "join" the leaf with a sibling if so.
+        */
+       tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
+             ichdr.count * sizeof(xfs_attr_leaf_entry_t);
+
+       return tmp < args->geo->magicpct; /* leaf is < 37% full */
+}
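+
+/*
+ * Why 37% (illustrative): geo->magicpct is 37% of the block size, so two
+ * leaves each below that use under 74% combined; joining them into one
+ * block therefore keeps roughly the 25% free space that
+ * xfs_attr3_leaf_toosmall() demands.
+ */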
+
+/*
+ * Move all the attribute list entries from drop_leaf into save_leaf.
+ */
+void
+xfs_attr3_leaf_unbalance(
+       struct xfs_da_state     *state,
+       struct xfs_da_state_blk *drop_blk,
+       struct xfs_da_state_blk *save_blk)
+{
+       struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr;
+       struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr;
+       struct xfs_attr3_icleaf_hdr drophdr;
+       struct xfs_attr3_icleaf_hdr savehdr;
+       struct xfs_attr_leaf_entry *entry;
+
+       trace_xfs_attr_leaf_unbalance(state->args);
+
+       xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf);
+       xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf);
+       entry = xfs_attr3_leaf_entryp(drop_leaf);
+
+       /*
+        * Save last hashval from dying block for later Btree fixup.
+        */
+       drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval);
+
+       /*
+        * Check if we need a temp buffer, or can we do it in place.
+        * Note that we don't check "leaf" for holes because we will
+        * always be dropping it, toosmall() decided that for us already.
+        */
+       if (savehdr.holes == 0) {
+               /*
+                * dest leaf has no holes, so we add there.  May need
+                * to make some room in the entry array.
+                */
+               if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+                                        drop_blk->bp, &drophdr)) {
+                       xfs_attr3_leaf_moveents(state->args,
+                                               drop_leaf, &drophdr, 0,
+                                               save_leaf, &savehdr, 0,
+                                               drophdr.count);
+               } else {
+                       xfs_attr3_leaf_moveents(state->args,
+                                               drop_leaf, &drophdr, 0,
+                                               save_leaf, &savehdr,
+                                               savehdr.count, drophdr.count);
+               }
+       } else {
+               /*
+                * Destination has holes, so we make a temporary copy
+                * of the leaf and add them both to that.
+                */
+               struct xfs_attr_leafblock *tmp_leaf;
+               struct xfs_attr3_icleaf_hdr tmphdr;
+
+               tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
+
+               /*
+                * Copy the header into the temp leaf so that all the stuff
+                * not in the incore header is present and gets copied back in
+                * once we've moved all the entries.
+                */
+               memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
+
+               memset(&tmphdr, 0, sizeof(tmphdr));
+               tmphdr.magic = savehdr.magic;
+               tmphdr.forw = savehdr.forw;
+               tmphdr.back = savehdr.back;
+               tmphdr.firstused = state->args->geo->blksize;
+
+               /* write the header to the temp buffer to initialise it */
+               xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
+
+               if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+                                        drop_blk->bp, &drophdr)) {
+                       xfs_attr3_leaf_moveents(state->args,
+                                               drop_leaf, &drophdr, 0,
+                                               tmp_leaf, &tmphdr, 0,
+                                               drophdr.count);
+                       xfs_attr3_leaf_moveents(state->args,
+                                               save_leaf, &savehdr, 0,
+                                               tmp_leaf, &tmphdr, tmphdr.count,
+                                               savehdr.count);
+               } else {
+                       xfs_attr3_leaf_moveents(state->args,
+                                               save_leaf, &savehdr, 0,
+                                               tmp_leaf, &tmphdr, 0,
+                                               savehdr.count);
+                       xfs_attr3_leaf_moveents(state->args,
+                                               drop_leaf, &drophdr, 0,
+                                               tmp_leaf, &tmphdr, tmphdr.count,
+                                               drophdr.count);
+               }
+               memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
+               savehdr = tmphdr; /* struct copy */
+               kmem_free(tmp_leaf);
+       }
+
+       xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr);
+       xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
+                                          state->args->geo->blksize - 1);
+
+       /*
+        * Copy out last hashval in each block for B-tree code.
+        */
+       entry = xfs_attr3_leaf_entryp(save_leaf);
+       save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval);
+}
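+
+/*
+ * Illustrative path choice (summarising the checks above): when the
+ * surviving leaf has no holes, the dying leaf's entries are moved in
+ * directly, prepended or appended according to hashval order; when it
+ * does have holes, both leaves are repacked through a zeroed temporary
+ * block so the result comes out fully compacted.
+ */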
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ * This is the internal routine, it uses the caller's buffer.
+ *
+ * Note that duplicate keys are allowed, but only check within the
+ * current leaf node.  The Btree code must check in adjacent leaf nodes.
+ *
+ * Return in args->index the index into the entry[] array of either
+ * the found entry, or where the entry should have been (insert before
+ * that entry).
+ *
+ * Don't change the args->value unless we find the attribute.
+ */
+int
+xfs_attr3_leaf_lookup_int(
+       struct xfs_buf          *bp,
+       struct xfs_da_args      *args)
+{
+       struct xfs_attr_leafblock *leaf;
+       struct xfs_attr3_icleaf_hdr ichdr;
+       struct xfs_attr_leaf_entry *entry;
+       struct xfs_attr_leaf_entry *entries;
+       struct xfs_attr_leaf_name_local *name_loc;
+       struct xfs_attr_leaf_name_remote *name_rmt;
+       xfs_dahash_t            hashval;
+       int                     probe;
+       int                     span;
+
+       trace_xfs_attr_leaf_lookup(args);
+
+       leaf = bp->b_addr;
+       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       entries = xfs_attr3_leaf_entryp(leaf);
+       ASSERT(ichdr.count < args->geo->blksize / 8);
+
+       /*
+        * Binary search.  (note: small blocks will skip this loop)
+        */
+       hashval = args->hashval;
+       probe = span = ichdr.count / 2;
+       for (entry = &entries[probe]; span > 4; entry = &entries[probe]) {
+               span /= 2;
+               if (be32_to_cpu(entry->hashval) < hashval)
+                       probe += span;
+               else if (be32_to_cpu(entry->hashval) > hashval)
+                       probe -= span;
+               else
+                       break;
+       }
+       ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count));
+       ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval);
+
+       /*
+        * Since we may have duplicate hashval's, find the first matching
+        * hashval in the leaf.
+        */
+       while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) {
+               entry--;
+               probe--;
+       }
+       while (probe < ichdr.count &&
+              be32_to_cpu(entry->hashval) < hashval) {
+               entry++;
+               probe++;
+       }
+       if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
+               args->index = probe;
+               return -ENOATTR;
+       }
+
+       /*
+        * Duplicate keys may be present, so search all of them for a match.
+        */
+       for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval);
+                       entry++, probe++) {
+/*
+ * GROT: Add code to remove incomplete entries.
+ */
+               /*
+                * If we are looking for INCOMPLETE entries, show only those.
+                * If we are looking for complete entries, show only those.
+                */
+               if ((args->flags & XFS_ATTR_INCOMPLETE) !=
+                   (entry->flags & XFS_ATTR_INCOMPLETE)) {
+                       continue;
+               }
+               if (entry->flags & XFS_ATTR_LOCAL) {
+                       name_loc = xfs_attr3_leaf_name_local(leaf, probe);
+                       if (name_loc->namelen != args->namelen)
+                               continue;
+                       if (memcmp(args->name, name_loc->nameval,
+                                                       args->namelen) != 0)
+                               continue;
+                       if (!xfs_attr_namesp_match(args->flags, entry->flags))
+                               continue;
+                       args->index = probe;
+                       return -EEXIST;
+               } else {
+                       name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
+                       if (name_rmt->namelen != args->namelen)
+                               continue;
+                       if (memcmp(args->name, name_rmt->name,
+                                                       args->namelen) != 0)
+                               continue;
+                       if (!xfs_attr_namesp_match(args->flags, entry->flags))
+                               continue;
+                       args->index = probe;
+                       args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+                       args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+                       args->rmtblkcnt = xfs_attr3_rmt_blocks(
+                                                       args->dp->i_mount,
+                                                       args->rmtvaluelen);
+                       return -EEXIST;
+               }
+       }
+       args->index = probe;
+       return -ENOATTR;
+}
+
+/*
+ * Get the value associated with an attribute name from a leaf attribute
+ * list structure.
+ */
+int
+xfs_attr3_leaf_getvalue(
+       struct xfs_buf          *bp,
+       struct xfs_da_args      *args)
+{
+       struct xfs_attr_leafblock *leaf;
+       struct xfs_attr3_icleaf_hdr ichdr;
+       struct xfs_attr_leaf_entry *entry;
+       struct xfs_attr_leaf_name_local *name_loc;
+       struct xfs_attr_leaf_name_remote *name_rmt;
+       int                     valuelen;
+
+       leaf = bp->b_addr;
+       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       ASSERT(ichdr.count < args->geo->blksize / 8);
+       ASSERT(args->index < ichdr.count);
+
+       entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+       if (entry->flags & XFS_ATTR_LOCAL) {
+               name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+               ASSERT(name_loc->namelen == args->namelen);
+               ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
+               valuelen = be16_to_cpu(name_loc->valuelen);
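+               /* An ATTR_KERNOVAL caller only wants the value length, no copy. */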
+               if (args->flags & ATTR_KERNOVAL) {
+                       args->valuelen = valuelen;
+                       return 0;
+               }
+               if (args->valuelen < valuelen) {
+                       args->valuelen = valuelen;
+                       return -ERANGE;
+               }
+               args->valuelen = valuelen;
+               memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
+       } else {
+               name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+               ASSERT(name_rmt->namelen == args->namelen);
+               ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
+               args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+               args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+               args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+                                                      args->rmtvaluelen);
+               if (args->flags & ATTR_KERNOVAL) {
+                       args->valuelen = args->rmtvaluelen;
+                       return 0;
+               }
+               if (args->valuelen < args->rmtvaluelen) {
+                       args->valuelen = args->rmtvaluelen;
+                       return -ERANGE;
+               }
+               args->valuelen = args->rmtvaluelen;
+       }
+       return 0;
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Move the indicated entries from one leaf to another.
+ * NOTE: this routine modifies both source and destination leaves.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_attr3_leaf_moveents(
+       struct xfs_da_args              *args,
+       struct xfs_attr_leafblock       *leaf_s,
+       struct xfs_attr3_icleaf_hdr     *ichdr_s,
+       int                             start_s,
+       struct xfs_attr_leafblock       *leaf_d,
+       struct xfs_attr3_icleaf_hdr     *ichdr_d,
+       int                             start_d,
+       int                             count)
+{
+       struct xfs_attr_leaf_entry      *entry_s;
+       struct xfs_attr_leaf_entry      *entry_d;
+       int                             desti;
+       int                             tmp;
+       int                             i;
+
+       /*
+        * Check for nothing to do.
+        */
+       if (count == 0)
+               return;
+
+       /*
+        * Set up environment.
+        */
+       ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
+              ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
+       ASSERT(ichdr_s->magic == ichdr_d->magic);
+       ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8);
+       ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
+                                       + xfs_attr3_leaf_hdr_size(leaf_s));
+       ASSERT(ichdr_d->count < args->geo->blksize / 8);
+       ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
+                                       + xfs_attr3_leaf_hdr_size(leaf_d));
+
+       ASSERT(start_s < ichdr_s->count);
+       ASSERT(start_d <= ichdr_d->count);
+       ASSERT(count <= ichdr_s->count);
+
+       /*
+        * If inserting into the middle of the destination leaf, move the
+        * existing entries up to make a hole.
+        */
+       if (start_d < ichdr_d->count) {
+               tmp  = ichdr_d->count - start_d;
+               tmp *= sizeof(xfs_attr_leaf_entry_t);
+               entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+               entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count];
+               memmove(entry_d, entry_s, tmp);
+       }
+
+       /*
+        * Copy all entries in the same (sorted) order, but allocate the
+        * attribute name/value data packed and in sequence.
+        */
+       entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+       entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+       desti = start_d;
+       for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
+               ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused);
+               tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
+#ifdef GROT
+               /*
+                * Code to drop INCOMPLETE entries.  Difficult to use as we
+                * may also need to change the insertion index.  Code turned
+                * off for 6.2, should be revisited later.
+                */
+               if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
+                       memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+                       ichdr_s->usedbytes -= tmp;
+                       ichdr_s->count -= 1;
+                       entry_d--;      /* to compensate for ++ in loop hdr */
+                       desti--;
+                       if ((start_s + i) < offset)
+                               result++;       /* insertion index adjustment */
+               } else {
+#endif /* GROT */
+                       ichdr_d->firstused -= tmp;
+                       /* both on-disk, don't endian flip twice */
+                       entry_d->hashval = entry_s->hashval;
+                       entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
+                       entry_d->flags = entry_s->flags;
+                       ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
+                                                       <= args->geo->blksize);
+                       memmove(xfs_attr3_leaf_name(leaf_d, desti),
+                               xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
+                       ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
+                                                       <= args->geo->blksize);
+                       memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+                       ichdr_s->usedbytes -= tmp;
+                       ichdr_d->usedbytes += tmp;
+                       ichdr_s->count -= 1;
+                       ichdr_d->count += 1;
+                       tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t)
+                                       + xfs_attr3_leaf_hdr_size(leaf_d);
+                       ASSERT(ichdr_d->firstused >= tmp);
+#ifdef GROT
+               }
+#endif /* GROT */
+       }
+
+       /*
+        * Zero out the entries we just copied.
+        */
+       if (start_s == ichdr_s->count) {
+               tmp = count * sizeof(xfs_attr_leaf_entry_t);
+               entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+               ASSERT(((char *)entry_s + tmp) <=
+                      ((char *)leaf_s + args->geo->blksize));
+               memset(entry_s, 0, tmp);
+       } else {
+               /*
+                * Move the remaining entries down to fill the hole,
+                * then zero the entries at the top.
+                */
+               tmp  = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t);
+               entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count];
+               entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+               memmove(entry_d, entry_s, tmp);
+
+               tmp = count * sizeof(xfs_attr_leaf_entry_t);
+               entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
+               ASSERT(((char *)entry_s + tmp) <=
+                      ((char *)leaf_s + args->geo->blksize));
+               memset(entry_s, 0, tmp);
+       }
+
+       /*
+        * Fill in the freemap information
+        */
+       ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d);
+       ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t);
+       ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
+       ichdr_d->freemap[1].base = 0;
+       ichdr_d->freemap[2].base = 0;
+       ichdr_d->freemap[1].size = 0;
+       ichdr_d->freemap[2].size = 0;
+       ichdr_s->holes = 1;     /* leaf may not be compact */
+}
+
+/*
+ * Pick up the last hash value from a leaf block.
+ */
+xfs_dahash_t
+xfs_attr_leaf_lasthash(
+       struct xfs_buf  *bp,
+       int             *count)
+{
+       struct xfs_attr3_icleaf_hdr ichdr;
+       struct xfs_attr_leaf_entry *entries;
+
+       xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr);
+       entries = xfs_attr3_leaf_entryp(bp->b_addr);
+       if (count)
+               *count = ichdr.count;
+       if (!ichdr.count)
+               return 0;
+       return be32_to_cpu(entries[ichdr.count - 1].hashval);
+}
+
+/*
+ * Calculate the number of bytes used to store the indicated attribute;
+ * whether local or remote, only count the bytes stored in this block.
+ */
+STATIC int
+xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
+{
+       struct xfs_attr_leaf_entry *entries;
+       xfs_attr_leaf_name_local_t *name_loc;
+       xfs_attr_leaf_name_remote_t *name_rmt;
+       int size;
+
+       entries = xfs_attr3_leaf_entryp(leaf);
+       if (entries[index].flags & XFS_ATTR_LOCAL) {
+               name_loc = xfs_attr3_leaf_name_local(leaf, index);
+               size = xfs_attr_leaf_entsize_local(name_loc->namelen,
+                                                  be16_to_cpu(name_loc->valuelen));
+       } else {
+               name_rmt = xfs_attr3_leaf_name_remote(leaf, index);
+               size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
+       }
+       return size;
+}
+
+/*
+ * Calculate the number of bytes that would be required to store the new
+ * attribute; whether local or remote, only count the bytes used in this
+ * block.  As a side effect, this routine decides whether the attribute
+ * will be stored as a "local" or a "remote" attribute.
+ */
+int
+xfs_attr_leaf_newentsize(
+       struct xfs_da_args      *args,
+       int                     *local)
+{
+       int                     size;
+
+       size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen);
+       if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) {
+               if (local)
+                       *local = 1;
+               return size;
+       }
+       if (local)
+               *local = 0;
+       return xfs_attr_leaf_entsize_remote(args->namelen);
+}
+
+
+/*========================================================================
+ * Manage the INCOMPLETE flag in a leaf entry
+ *========================================================================*/
+
+/*
+ * Clear the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr3_leaf_clearflag(
+       struct xfs_da_args      *args)
+{
+       struct xfs_attr_leafblock *leaf;
+       struct xfs_attr_leaf_entry *entry;
+       struct xfs_attr_leaf_name_remote *name_rmt;
+       struct xfs_buf          *bp;
+       int                     error;
+#ifdef DEBUG
+       struct xfs_attr3_icleaf_hdr ichdr;
+       xfs_attr_leaf_name_local_t *name_loc;
+       int namelen;
+       char *name;
+#endif /* DEBUG */
+
+       trace_xfs_attr_leaf_clearflag(args);
+       /*
+        * Set up the operation.
+        */
+       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+       if (error)
+               return error;
+
+       leaf = bp->b_addr;
+       entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+       ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
+
+#ifdef DEBUG
+       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       ASSERT(args->index < ichdr.count);
+       ASSERT(args->index >= 0);
+
+       if (entry->flags & XFS_ATTR_LOCAL) {
+               name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+               namelen = name_loc->namelen;
+               name = (char *)name_loc->nameval;
+       } else {
+               name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+               namelen = name_rmt->namelen;
+               name = (char *)name_rmt->name;
+       }
+       ASSERT(be32_to_cpu(entry->hashval) == args->hashval);
+       ASSERT(namelen == args->namelen);
+       ASSERT(memcmp(name, args->name, namelen) == 0);
+#endif /* DEBUG */
+
+       entry->flags &= ~XFS_ATTR_INCOMPLETE;
+       xfs_trans_log_buf(args->trans, bp,
+                        XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+
+       if (args->rmtblkno) {
+               ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
+               name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+               name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+               name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
+               xfs_trans_log_buf(args->trans, bp,
+                        XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+       }
+
+       /*
+        * Commit the flag value change and start the next trans in series.
+        */
+       return xfs_trans_roll(&args->trans, args->dp);
+}
+
+/*
+ * Set the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr3_leaf_setflag(
+       struct xfs_da_args      *args)
+{
+       struct xfs_attr_leafblock *leaf;
+       struct xfs_attr_leaf_entry *entry;
+       struct xfs_attr_leaf_name_remote *name_rmt;
+       struct xfs_buf          *bp;
+       int error;
+#ifdef DEBUG
+       struct xfs_attr3_icleaf_hdr ichdr;
+#endif
+
+       trace_xfs_attr_leaf_setflag(args);
+
+       /*
+        * Set up the operation.
+        */
+       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+       if (error)
+               return error;
+
+       leaf = bp->b_addr;
+#ifdef DEBUG
+       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       ASSERT(args->index < ichdr.count);
+       ASSERT(args->index >= 0);
+#endif
+       entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+
+       ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
+       entry->flags |= XFS_ATTR_INCOMPLETE;
+       xfs_trans_log_buf(args->trans, bp,
+                       XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+       if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
+               name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+               name_rmt->valueblk = 0;
+               name_rmt->valuelen = 0;
+               xfs_trans_log_buf(args->trans, bp,
+                        XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+       }
+
+       /*
+        * Commit the flag value change and start the next trans in series.
+        */
+       return xfs_trans_roll(&args->trans, args->dp);
+}
+
+/*
+ * In a single transaction, clear the INCOMPLETE flag on the leaf entry
+ * given by args->blkno/index and set the INCOMPLETE flag on the leaf
+ * entry given by args->blkno2/index2.
+ *
+ * Note that they could be in different blocks, or in the same block.
+ */
+int
+xfs_attr3_leaf_flipflags(
+       struct xfs_da_args      *args)
+{
+       struct xfs_attr_leafblock *leaf1;
+       struct xfs_attr_leafblock *leaf2;
+       struct xfs_attr_leaf_entry *entry1;
+       struct xfs_attr_leaf_entry *entry2;
+       struct xfs_attr_leaf_name_remote *name_rmt;
+       struct xfs_buf          *bp1;
+       struct xfs_buf          *bp2;
+       int error;
+#ifdef DEBUG
+       struct xfs_attr3_icleaf_hdr ichdr1;
+       struct xfs_attr3_icleaf_hdr ichdr2;
+       xfs_attr_leaf_name_local_t *name_loc;
+       int namelen1, namelen2;
+       char *name1, *name2;
+#endif /* DEBUG */
+
+       trace_xfs_attr_leaf_flipflags(args);
+
+       /*
+        * Read the block containing the "old" attr
+        */
+       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+       if (error)
+               return error;
+
+       /*
+        * Read the block containing the "new" attr, if it is different
+        */
+       if (args->blkno2 != args->blkno) {
+               error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
+                                          -1, &bp2);
+               if (error)
+                       return error;
+       } else {
+               bp2 = bp1;
+       }
+
+       leaf1 = bp1->b_addr;
+       entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index];
+
+       leaf2 = bp2->b_addr;
+       entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
+
+#ifdef DEBUG
+       xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+       ASSERT(args->index < ichdr1.count);
+       ASSERT(args->index >= 0);
+
+       xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+       ASSERT(args->index2 < ichdr2.count);
+       ASSERT(args->index2 >= 0);
+
+       if (entry1->flags & XFS_ATTR_LOCAL) {
+               name_loc = xfs_attr3_leaf_name_local(leaf1, args->index);
+               namelen1 = name_loc->namelen;
+               name1 = (char *)name_loc->nameval;
+       } else {
+               name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
+               namelen1 = name_rmt->namelen;
+               name1 = (char *)name_rmt->name;
+       }
+       if (entry2->flags & XFS_ATTR_LOCAL) {
+               name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2);
+               namelen2 = name_loc->namelen;
+               name2 = (char *)name_loc->nameval;
+       } else {
+               name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
+               namelen2 = name_rmt->namelen;
+               name2 = (char *)name_rmt->name;
+       }
+       ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval));
+       ASSERT(namelen1 == namelen2);
+       ASSERT(memcmp(name1, name2, namelen1) == 0);
+#endif /* DEBUG */
+
+       ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
+       ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
+
+       entry1->flags &= ~XFS_ATTR_INCOMPLETE;
+       xfs_trans_log_buf(args->trans, bp1,
+                         XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
+       if (args->rmtblkno) {
+               ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
+               name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
+               name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+               name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
+               xfs_trans_log_buf(args->trans, bp1,
+                        XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
+       }
+
+       entry2->flags |= XFS_ATTR_INCOMPLETE;
+       xfs_trans_log_buf(args->trans, bp2,
+                         XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
+       if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
+               name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
+               name_rmt->valueblk = 0;
+               name_rmt->valuelen = 0;
+               xfs_trans_log_buf(args->trans, bp2,
+                        XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
+       }
+
+       /*
+        * Commit the flag value change and start the next trans in series.
+        */
+       error = xfs_trans_roll(&args->trans, args->dp);
+
+       return error;
+}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
new file mode 100644 (file)
index 0000000..e2929da
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ATTR_LEAF_H__
+#define        __XFS_ATTR_LEAF_H__
+
+struct attrlist;
+struct attrlist_cursor_kern;
+struct xfs_attr_list_context;
+struct xfs_da_args;
+struct xfs_da_state;
+struct xfs_da_state_blk;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Used to keep a list of "remote value" extents when unlinking an inode.
+ */
+typedef struct xfs_attr_inactive_list {
+       xfs_dablk_t     valueblk;       /* block number of value bytes */
+       int             valuelen;       /* number of bytes in value */
+} xfs_attr_inactive_list_t;
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when attribute fork size < XFS_LITINO(mp).
+ */
+void   xfs_attr_shortform_create(struct xfs_da_args *args);
+void   xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
+int    xfs_attr_shortform_lookup(struct xfs_da_args *args);
+int    xfs_attr_shortform_getvalue(struct xfs_da_args *args);
+int    xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
+int    xfs_attr_shortform_remove(struct xfs_da_args *args);
+int    xfs_attr_shortform_list(struct xfs_attr_list_context *context);
+int    xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
+int    xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
+
+
+/*
+ * Internal routines when attribute fork size == XFS_LBSIZE(mp).
+ */
+int    xfs_attr3_leaf_to_node(struct xfs_da_args *args);
+int    xfs_attr3_leaf_to_shortform(struct xfs_buf *bp,
+                                  struct xfs_da_args *args, int forkoff);
+int    xfs_attr3_leaf_clearflag(struct xfs_da_args *args);
+int    xfs_attr3_leaf_setflag(struct xfs_da_args *args);
+int    xfs_attr3_leaf_flipflags(struct xfs_da_args *args);
+
+/*
+ * Routines used for growing the Btree.
+ */
+int    xfs_attr3_leaf_split(struct xfs_da_state *state,
+                                  struct xfs_da_state_blk *oldblk,
+                                  struct xfs_da_state_blk *newblk);
+int    xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf,
+                                       struct xfs_da_args *args);
+int    xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args);
+int    xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer,
+                                struct xfs_da_args *args);
+int    xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
+                                   struct xfs_da_args *args);
+int    xfs_attr3_leaf_list_int(struct xfs_buf *bp,
+                                     struct xfs_attr_list_context *context);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int    xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
+void   xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
+                                      struct xfs_da_state_blk *drop_blk,
+                                      struct xfs_da_state_blk *save_blk);
+int    xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
+
+/*
+ * Utility routines.
+ */
+xfs_dahash_t   xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count);
+int    xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
+                                  struct xfs_buf *leaf2_bp);
+int    xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
+int    xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+                       xfs_dablk_t bno, xfs_daddr_t mappedbno,
+                       struct xfs_buf **bpp);
+void   xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
+                                    struct xfs_attr_leafblock *from);
+void   xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to,
+                                  struct xfs_attr3_icleaf_hdr *from);
+
+#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
new file mode 100644 (file)
index 0000000..7510ab8
--- /dev/null
@@ -0,0 +1,628 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+#include "xfs_error.h"
+
+#define ATTR_RMTVALUE_MAPSIZE  1       /* # of map entries at once */
+
+/*
+ * Each contiguous block has a header, so it is not just a simple attribute
+ * length to FSB conversion.
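+ *
+ * For example, assuming 4096-byte blocks and the 56-byte v5 remote
+ * attribute header, each block holds 4096 - 56 = 4040 value bytes, so a
+ * 65536-byte value needs (65536 + 4039) / 4040 = 17 blocks rather than 16.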
+ */
+int
+xfs_attr3_rmt_blocks(
+       struct xfs_mount *mp,
+       int             attrlen)
+{
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+               return (attrlen + buflen - 1) / buflen;
+       }
+       return XFS_B_TO_FSB(mp, attrlen);
+}
+
+/*
+ * Checking of the remote attribute header is split into two parts: the
+ * verifier does CRC, location and bounds checking, while the unpacking
+ * function checks the attribute parameters and owner.
+ */
+static bool
+xfs_attr3_rmt_hdr_ok(
+       void                    *ptr,
+       xfs_ino_t               ino,
+       uint32_t                offset,
+       uint32_t                size,
+       xfs_daddr_t             bno)
+{
+       struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+       if (bno != be64_to_cpu(rmt->rm_blkno))
+               return false;
+       if (offset != be32_to_cpu(rmt->rm_offset))
+               return false;
+       if (size != be32_to_cpu(rmt->rm_bytes))
+               return false;
+       if (ino != be64_to_cpu(rmt->rm_owner))
+               return false;
+
+       /* ok */
+       return true;
+}
+
+static bool
+xfs_attr3_rmt_verify(
+       struct xfs_mount        *mp,
+       void                    *ptr,
+       int                     fsbsize,
+       xfs_daddr_t             bno)
+{
+       struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return false;
+       if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+               return false;
+       if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
+               return false;
+       if (be64_to_cpu(rmt->rm_blkno) != bno)
+               return false;
+       if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
+               return false;
+       if (be32_to_cpu(rmt->rm_offset) +
+                               be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
+               return false;
+       if (rmt->rm_owner == 0)
+               return false;
+
+       return true;
+}
+
+static void
+xfs_attr3_rmt_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       char            *ptr;
+       int             len;
+       xfs_daddr_t     bno;
+       int             blksize = mp->m_attr_geo->blksize;
+
+       /* no verification of non-crc buffers */
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       ptr = bp->b_addr;
+       bno = bp->b_bn;
+       len = BBTOB(bp->b_length);
+       ASSERT(len >= blksize);
+
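+       /*
+        * A remote value can span several filesystem blocks in one buffer;
+        * each block carries its own header and CRC, so verify them one
+        * blksize chunk at a time.
+        */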
+       while (len > 0) {
+               if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
+                       xfs_buf_ioerror(bp, -EFSBADCRC);
+                       break;
+               }
+               if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
+                       xfs_buf_ioerror(bp, -EFSCORRUPTED);
+                       break;
+               }
+               len -= blksize;
+               ptr += blksize;
+               bno += BTOBB(blksize);
+       }
+
+       if (bp->b_error)
+               xfs_verifier_error(bp);
+       else
+               ASSERT(len == 0);
+}
+
+static void
+xfs_attr3_rmt_write_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+       char            *ptr;
+       int             len;
+       xfs_daddr_t     bno;
+       int             blksize = mp->m_attr_geo->blksize;
+
+       /* no verification of non-crc buffers */
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       ptr = bp->b_addr;
+       bno = bp->b_bn;
+       len = BBTOB(bp->b_length);
+       ASSERT(len >= blksize);
+
+       while (len > 0) {
+               if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
+                       xfs_buf_ioerror(bp, -EFSCORRUPTED);
+                       xfs_verifier_error(bp);
+                       return;
+               }
+               if (bip) {
+                       struct xfs_attr3_rmt_hdr *rmt;
+
+                       rmt = (struct xfs_attr3_rmt_hdr *)ptr;
+                       rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+               }
+               xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF);
+
+               len -= blksize;
+               ptr += blksize;
+               bno += BTOBB(blksize);
+       }
+       ASSERT(len == 0);
+}
+
+const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+       .verify_read = xfs_attr3_rmt_read_verify,
+       .verify_write = xfs_attr3_rmt_write_verify,
+};
+
+STATIC int
+xfs_attr3_rmt_hdr_set(
+       struct xfs_mount        *mp,
+       void                    *ptr,
+       xfs_ino_t               ino,
+       uint32_t                offset,
+       uint32_t                size,
+       xfs_daddr_t             bno)
+{
+       struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return 0;
+
+       rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
+       rmt->rm_offset = cpu_to_be32(offset);
+       rmt->rm_bytes = cpu_to_be32(size);
+       uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
+       rmt->rm_owner = cpu_to_be64(ino);
+       rmt->rm_blkno = cpu_to_be64(bno);
+
+       return sizeof(struct xfs_attr3_rmt_hdr);
+}
+
+/*
+ * Helper functions to copy attribute data in and out of the on-disk extents.
+ */
+STATIC int
+xfs_attr_rmtval_copyout(
+       struct xfs_mount *mp,
+       struct xfs_buf  *bp,
+       xfs_ino_t       ino,
+       int             *offset,
+       int             *valuelen,
+       __uint8_t       **dst)
+{
+       char            *src = bp->b_addr;
+       xfs_daddr_t     bno = bp->b_bn;
+       int             len = BBTOB(bp->b_length);
+       int             blksize = mp->m_attr_geo->blksize;
+
+       ASSERT(len >= blksize);
+
+       while (len > 0 && *valuelen > 0) {
+               int hdr_size = 0;
+               int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+
+               byte_cnt = min(*valuelen, byte_cnt);
+
+               if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                       if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset,
+                                                 byte_cnt, bno)) {
+                               xfs_alert(mp,
+"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
+                                       bno, *offset, byte_cnt, ino);
+                               return -EFSCORRUPTED;
+                       }
+                       hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
+               }
+
+               memcpy(*dst, src + hdr_size, byte_cnt);
+
+               /* roll buffer forwards */
+               len -= blksize;
+               src += blksize;
+               bno += BTOBB(blksize);
+
+               /* roll attribute data forwards */
+               *valuelen -= byte_cnt;
+               *dst += byte_cnt;
+               *offset += byte_cnt;
+       }
+       return 0;
+}
+
+STATIC void
+xfs_attr_rmtval_copyin(
+       struct xfs_mount *mp,
+       struct xfs_buf  *bp,
+       xfs_ino_t       ino,
+       int             *offset,
+       int             *valuelen,
+       __uint8_t       **src)
+{
+       char            *dst = bp->b_addr;
+       xfs_daddr_t     bno = bp->b_bn;
+       int             len = BBTOB(bp->b_length);
+       int             blksize = mp->m_attr_geo->blksize;
+
+       ASSERT(len >= blksize);
+
+       while (len > 0 && *valuelen > 0) {
+               int hdr_size;
+               int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+
+               byte_cnt = min(*valuelen, byte_cnt);
+               hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
+                                                byte_cnt, bno);
+
+               memcpy(dst + hdr_size, *src, byte_cnt);
+
+               /*
+                * If this is the last block, zero the remainder of it.
+                * Check that we are actually the last block, too.
+                */
+               if (byte_cnt + hdr_size < blksize) {
+                       ASSERT(*valuelen - byte_cnt == 0);
+                       ASSERT(len == blksize);
+                       memset(dst + hdr_size + byte_cnt, 0,
+                                       blksize - hdr_size - byte_cnt);
+               }
+
+               /* roll buffer forwards */
+               len -= blksize;
+               dst += blksize;
+               bno += BTOBB(blksize);
+
+               /* roll attribute data forwards */
+               *valuelen -= byte_cnt;
+               *src += byte_cnt;
+               *offset += byte_cnt;
+       }
+}
+
+/*
+ * Read the value associated with an attribute from the out-of-line buffer
+ * that we stored it in.
+ */
+int
+xfs_attr_rmtval_get(
+       struct xfs_da_args      *args)
+{
+       struct xfs_bmbt_irec    map[ATTR_RMTVALUE_MAPSIZE];
+       struct xfs_mount        *mp = args->dp->i_mount;
+       struct xfs_buf          *bp;
+       xfs_dablk_t             lblkno = args->rmtblkno;
+       __uint8_t               *dst = args->value;
+       int                     valuelen;
+       int                     nmap;
+       int                     error;
+       int                     blkcnt = args->rmtblkcnt;
+       int                     i;
+       int                     offset = 0;
+
+       trace_xfs_attr_rmtval_get(args);
+
+       ASSERT(!(args->flags & ATTR_KERNOVAL));
+       ASSERT(args->rmtvaluelen == args->valuelen);
+
+       valuelen = args->rmtvaluelen;
+       while (valuelen > 0) {
+               nmap = ATTR_RMTVALUE_MAPSIZE;
+               error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+                                      blkcnt, map, &nmap,
+                                      XFS_BMAPI_ATTRFORK);
+               if (error)
+                       return error;
+               ASSERT(nmap >= 1);
+
+               for (i = 0; (i < nmap) && (valuelen > 0); i++) {
+                       xfs_daddr_t     dblkno;
+                       int             dblkcnt;
+
+                       ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
+                              (map[i].br_startblock != HOLESTARTBLOCK));
+                       dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+                       dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+                       error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+                                                  dblkno, dblkcnt, 0, &bp,
+                                                  &xfs_attr3_rmt_buf_ops);
+                       if (error)
+                               return error;
+
+                       error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+                                                       &offset, &valuelen,
+                                                       &dst);
+                       xfs_buf_relse(bp);
+                       if (error)
+                               return error;
+
+                       /* roll attribute extent map forwards */
+                       lblkno += map[i].br_blockcount;
+                       blkcnt -= map[i].br_blockcount;
+               }
+       }
+       ASSERT(valuelen == 0);
+       return 0;
+}
+
+/*
+ * Write the value associated with an attribute into the out-of-line buffer
+ * that we have defined for it.
+ */
+int
+xfs_attr_rmtval_set(
+       struct xfs_da_args      *args)
+{
+       struct xfs_inode        *dp = args->dp;
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_bmbt_irec    map;
+       xfs_dablk_t             lblkno;
+       xfs_fileoff_t           lfileoff = 0;
+       __uint8_t               *src = args->value;
+       int                     blkcnt;
+       int                     valuelen;
+       int                     nmap;
+       int                     error;
+       int                     offset = 0;
+
+       trace_xfs_attr_rmtval_set(args);
+
+       /*
+        * Find a "hole" in the attribute address space large enough for
+        * us to drop the new attribute's value into. Because CRC-enabled
+        * attributes have headers, we can't just do a straight byte to FSB
+        * conversion and have to take the header space into account.
+        */
+       blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
+       error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
+                                                  XFS_ATTR_FORK);
+       if (error)
+               return error;
+
+       args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
+       args->rmtblkcnt = blkcnt;
+
+       /*
+        * Roll through the "value", allocating blocks on disk as required.
+        */
+       while (blkcnt > 0) {
+               int     committed;
+
+               /*
+                * Allocate a single extent, up to the size of the value.
+                */
+               xfs_bmap_init(args->flist, args->firstblock);
+               nmap = 1;
+               error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
+                                 blkcnt,
+                                 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+                                 args->firstblock, args->total, &map, &nmap,
+                                 args->flist);
+               if (!error) {
+                       error = xfs_bmap_finish(&args->trans, args->flist,
+                                               &committed);
+               }
+               if (error) {
+                       ASSERT(committed);
+                       args->trans = NULL;
+                       xfs_bmap_cancel(args->flist);
+                       return error;
+               }
+
+               /*
+                * bmap_finish() may have committed the last trans and started
+                * a new one.  We need the inode to be in all transactions.
+                */
+               if (committed)
+                       xfs_trans_ijoin(args->trans, dp, 0);
+
+               ASSERT(nmap == 1);
+               ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+                      (map.br_startblock != HOLESTARTBLOCK));
+               lblkno += map.br_blockcount;
+               blkcnt -= map.br_blockcount;
+
+               /*
+                * Start the next trans in the chain.
+                */
+               error = xfs_trans_roll(&args->trans, dp);
+               if (error)
+                       return error;
+       }
+
+       /*
+        * Roll through the "value", copying the attribute value to the
+        * already-allocated blocks.  Blocks are written synchronously
+        * so that we can know they are all on disk before we turn off
+        * the INCOMPLETE flag.
+        */
+       lblkno = args->rmtblkno;
+       blkcnt = args->rmtblkcnt;
+       valuelen = args->rmtvaluelen;
+       while (valuelen > 0) {
+               struct xfs_buf  *bp;
+               xfs_daddr_t     dblkno;
+               int             dblkcnt;
+
+               ASSERT(blkcnt > 0);
+
+               xfs_bmap_init(args->flist, args->firstblock);
+               nmap = 1;
+               error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
+                                      blkcnt, &map, &nmap,
+                                      XFS_BMAPI_ATTRFORK);
+               if (error)
+                       return error;
+               ASSERT(nmap == 1);
+               ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+                      (map.br_startblock != HOLESTARTBLOCK));
+
+               dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+               dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+               bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
+               if (!bp)
+                       return -ENOMEM;
+               bp->b_ops = &xfs_attr3_rmt_buf_ops;
+
+               xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
+                                      &valuelen, &src);
+
+               error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
+               xfs_buf_relse(bp);
+               if (error)
+                       return error;
+
+               /* roll attribute extent map forwards */
+               lblkno += map.br_blockcount;
+               blkcnt -= map.br_blockcount;
+       }
+       ASSERT(valuelen == 0);
+       return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ */
+int
+xfs_attr_rmtval_remove(
+       struct xfs_da_args      *args)
+{
+       struct xfs_mount        *mp = args->dp->i_mount;
+       xfs_dablk_t             lblkno;
+       int                     blkcnt;
+       int                     error;
+       int                     done;
+
+       trace_xfs_attr_rmtval_remove(args);
+
+       /*
+        * Roll through the "value", invalidating the attribute value's blocks.
+        */
+       lblkno = args->rmtblkno;
+       blkcnt = args->rmtblkcnt;
+       while (blkcnt > 0) {
+               struct xfs_bmbt_irec    map;
+               struct xfs_buf          *bp;
+               xfs_daddr_t             dblkno;
+               int                     dblkcnt;
+               int                     nmap;
+
+               /*
+                * Try to remember where we decided to put the value.
+                */
+               nmap = 1;
+               error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+                                      blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
+               if (error)
+                       return error;
+               ASSERT(nmap == 1);
+               ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+                      (map.br_startblock != HOLESTARTBLOCK));
+
+               dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+               dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+               /*
+                * If the "remote" value is in the cache, remove it.
+                */
+               bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
+               if (bp) {
+                       xfs_buf_stale(bp);
+                       xfs_buf_relse(bp);
+                       bp = NULL;
+               }
+
+               lblkno += map.br_blockcount;
+               blkcnt -= map.br_blockcount;
+       }
+
+       /*
+        * Keep de-allocating extents until the remote-value region is gone.
+        */
+       lblkno = args->rmtblkno;
+       blkcnt = args->rmtblkcnt;
+       done = 0;
+       while (!done) {
+               int committed;
+
+               xfs_bmap_init(args->flist, args->firstblock);
+               error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
+                                   XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+                                   1, args->firstblock, args->flist,
+                                   &done);
+               if (!error) {
+                       error = xfs_bmap_finish(&args->trans, args->flist,
+                                               &committed);
+               }
+               if (error) {
+                       ASSERT(committed);
+                       args->trans = NULL;
+                       xfs_bmap_cancel(args->flist);
+                       return error;
+               }
+
+               /*
+                * bmap_finish() may have committed the last trans and started
+                * a new one.  We need the inode to be in all transactions.
+                */
+               if (committed)
+                       xfs_trans_ijoin(args->trans, args->dp, 0);
+
+               /*
+                * Close out trans and start the next one in the chain.
+                */
+               error = xfs_trans_roll(&args->trans, args->dp);
+               if (error)
+                       return error;
+       }
+       return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
new file mode 100644 (file)
index 0000000..5a9acfa
--- /dev/null
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ATTR_REMOTE_H__
+#define        __XFS_ATTR_REMOTE_H__
+
+int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
+
+int xfs_attr_rmtval_get(struct xfs_da_args *args);
+int xfs_attr_rmtval_set(struct xfs_da_args *args);
+int xfs_attr_rmtval_remove(struct xfs_da_args *args);
+
+#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
new file mode 100644 (file)
index 0000000..919756e
--- /dev/null
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ATTR_SF_H__
+#define        __XFS_ATTR_SF_H__
+
+/*
+ * Attribute storage when stored inside the inode.
+ *
+ * Small attribute lists are packed as tightly as possible so as
+ * to fit into the literal area of the inode.
+ */
+
+/*
+ * Entries are packed toward the top as tightly as possible.
+ */
+typedef struct xfs_attr_shortform {
+       struct xfs_attr_sf_hdr {        /* constant-structure header block */
+               __be16  totsize;        /* total bytes in shortform list */
+               __u8    count;  /* count of active entries */
+       } hdr;
+       struct xfs_attr_sf_entry {
+               __uint8_t namelen;      /* actual length of name (no NULL) */
+               __uint8_t valuelen;     /* actual length of value (no NULL) */
+               __uint8_t flags;        /* flags bits (see xfs_attr_leaf.h) */
+               __uint8_t nameval[1];   /* name & value bytes concatenated */
+       } list[1];                      /* variable sized array */
+} xfs_attr_shortform_t;
+typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
+typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
+
+/*
+ * We generate this list and then sort it; attr_list() must return entries in hash order.
+ */
+typedef struct xfs_attr_sf_sort {
+       __uint8_t       entno;          /* entry number in original list */
+       __uint8_t       namelen;        /* length of name value (no null) */
+       __uint8_t       valuelen;       /* length of value */
+       __uint8_t       flags;          /* flags bits (see xfs_attr_leaf.h) */
+       xfs_dahash_t    hash;           /* this entry's hash value */
+       unsigned char   *name;          /* name value, pointer into buffer */
+} xfs_attr_sf_sort_t;
+
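+/*
+ * Note that sizeof(xfs_attr_sf_entry_t) already includes the one-byte
+ * nameval[1] placeholder, hence the "-1" in the size macros below.
+ */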
+#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen)  /* space name/value uses */ \
+       (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen)))
+#define XFS_ATTR_SF_ENTSIZE_MAX                        /* max space for name&value */ \
+       ((1 << (NBBY*(int)sizeof(__uint8_t))) - 1)
+#define XFS_ATTR_SF_ENTSIZE(sfep)              /* space an entry uses */ \
+       ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
+#define XFS_ATTR_SF_NEXTENTRY(sfep)            /* next entry in struct */ \
+       ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
+#define XFS_ATTR_SF_TOTSIZE(dp)                        /* total space in use */ \
+       (be16_to_cpu(((xfs_attr_shortform_t *)  \
+               ((dp)->i_afp->if_u1.if_data))->hdr.totsize))
+
+#endif /* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/libxfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h
new file mode 100644 (file)
index 0000000..e1649c0
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BIT_H__
+#define        __XFS_BIT_H__
+
+/*
+ * XFS bit manipulation routines.
+ */
+
+/*
+ * masks with n high/low bits set, 32- and 64-bit values
+ */
+static inline __uint64_t xfs_mask64hi(int n)
+{
+       return (__uint64_t)-1 << (64 - (n));
+}
+static inline __uint32_t xfs_mask32lo(int n)
+{
+       return ((__uint32_t)1 << (n)) - 1;
+}
+static inline __uint64_t xfs_mask64lo(int n)
+{
+       return ((__uint64_t)1 << (n)) - 1;
+}
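+
+/*
+ * For example, xfs_mask64hi(4) == 0xf000000000000000ULL and
+ * xfs_mask32lo(4) == 0xf.
+ */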
+
+/* Get high bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_highbit32(__uint32_t v)
+{
+       return fls(v) - 1;
+}
+
+/* Get high bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_highbit64(__uint64_t v)
+{
+       return fls64(v) - 1;
+}
+
+/* Get low bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_lowbit32(__uint32_t v)
+{
+       return ffs(v) - 1;
+}
+
+/* Get low bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_lowbit64(__uint64_t v)
+{
+       __uint32_t      w = (__uint32_t)v;
+       int             n = 0;
+
+       if (w) {        /* lower bits */
+               n = ffs(w);
+       } else {        /* upper bits */
+               w = (__uint32_t)(v >> 32);
+               if (w) {
+                       n = ffs(w);
+                       if (n)
+                               n += 32;
+               }
+       }
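+       /* ffs() returns 0 for a zero word, so v == 0 yields -1 here */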
+       return n - 1;
+}
+
+/* Return whether bitmap is empty (1 == empty) */
+extern int xfs_bitmap_empty(uint *map, uint size);
+
+/* Count continuous one bits in map starting with start_bit */
+extern int xfs_contig_bits(uint *map, uint size, uint start_bit);
+
+/* Find next set bit in map */
+extern int xfs_next_bit(uint *map, uint size, uint start_bit);
+
+#endif /* __XFS_BIT_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
new file mode 100644 (file)
index 0000000..de2d26d
--- /dev/null
@@ -0,0 +1,5602 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_buf_item.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_dinode.h"
+#include "xfs_filestream.h"
+
+
+kmem_zone_t            *xfs_bmap_free_item_zone;
+
+/*
+ * Miscellaneous helper functions
+ */
+
+/*
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem.  Done once, during mount.
+ */
+void
+xfs_bmap_compute_maxlevels(
+       xfs_mount_t     *mp,            /* file system mount structure */
+       int             whichfork)      /* data or attr fork */
+{
+       int             level;          /* btree level */
+       uint            maxblocks;      /* max blocks at this level */
+       uint            maxleafents;    /* max leaf entries possible */
+       int             maxrootrecs;    /* max records in root block */
+       int             minleafrecs;    /* min records in leaf block */
+       int             minnoderecs;    /* min records in node block */
+       int             sz;             /* root block size */
+
+       /*
+        * The maximum number of extents in a file, hence the maximum
+        * number of leaf entries, is controlled by the type of di_nextents
+        * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
+        * (a signed 16-bit number, xfs_aextnum_t).
+        *
+        * Note that we can no longer assume that if we are in ATTR1 that
+        * the fork offset of all the inodes will be
+        * (xfs_default_attroffset(ip) >> 3) because we could have mounted
+        * with ATTR2 and then mounted back with ATTR1, keeping the
+        * di_forkoff's fixed but probably at various positions. Therefore,
+        * for both ATTR1 and ATTR2 we have to assume the worst case scenario
+        * of a minimum size available.
+        */
+       if (whichfork == XFS_DATA_FORK) {
+               maxleafents = MAXEXTNUM;
+               sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+       } else {
+               maxleafents = MAXAEXTNUM;
+               sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
+       }
+       maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
+       minleafrecs = mp->m_bmap_dmnr[0];
+       minnoderecs = mp->m_bmap_dmnr[1];
+       maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+       for (level = 1; maxblocks > 1; level++) {
+               if (maxblocks <= maxrootrecs)
+                       maxblocks = 1;
+               else
+                       maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+       }
+       mp->m_bm_maxlevels[whichfork] = level;
+}
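The loop above repeatedly divides the worst-case block count by the minimum
per-node fan-out until everything fits in the inode root, counting one btree
level per pass. A standalone sketch of the same arithmetic, with assumed
record counts rather than values from a real superblock:

    #include <stdio.h>

    /* Mirror of the level-counting loop in xfs_bmap_compute_maxlevels() */
    static int bmap_maxlevels(unsigned int maxleafents, unsigned int minleafrecs,
                              unsigned int minnoderecs, unsigned int maxrootrecs)
    {
            unsigned int maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
            int level;

            for (level = 1; maxblocks > 1; level++) {
                    if (maxblocks <= maxrootrecs)
                            maxblocks = 1;  /* remainder fits in the inode root */
                    else
                            maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
            }
            return level;
    }

    int main(void)
    {
            /* e.g. 2^31 - 1 leaf entries, 125/250 records per leaf/node, 9 in root */
            printf("max levels: %d\n", bmap_maxlevels(0x7fffffffu, 125, 250, 9));
            return 0;       /* prints "max levels: 5" for these numbers */
    }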
+
+STATIC int                             /* error */
+xfs_bmbt_lookup_eq(
+       struct xfs_btree_cur    *cur,
+       xfs_fileoff_t           off,
+       xfs_fsblock_t           bno,
+       xfs_filblks_t           len,
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.b.br_startoff = off;
+       cur->bc_rec.b.br_startblock = bno;
+       cur->bc_rec.b.br_blockcount = len;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+STATIC int                             /* error */
+xfs_bmbt_lookup_ge(
+       struct xfs_btree_cur    *cur,
+       xfs_fileoff_t           off,
+       xfs_fsblock_t           bno,
+       xfs_filblks_t           len,
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.b.br_startoff = off;
+       cur->bc_rec.b.br_startblock = bno;
+       cur->bc_rec.b.br_blockcount = len;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Check if the inode needs to be converted to btree format.
+ */
+static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
+{
+       return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+               XFS_IFORK_NEXTENTS(ip, whichfork) >
+                       XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Check if the inode should be converted to extent format.
+ */
+static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
+{
+       return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+               XFS_IFORK_NEXTENTS(ip, whichfork) <=
+                       XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [off, bno, len, state].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_bmbt_update(
+       struct xfs_btree_cur    *cur,
+       xfs_fileoff_t           off,
+       xfs_fsblock_t           bno,
+       xfs_filblks_t           len,
+       xfs_exntst_t            state)
+{
+       union xfs_btree_rec     rec;
+
+       xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
+       return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+       xfs_inode_t     *ip,            /* incore inode pointer */
+       xfs_filblks_t   len)            /* delayed extent length */
+{
+       int             level;          /* btree level number */
+       int             maxrecs;        /* maximum record count at this level */
+       xfs_mount_t     *mp;            /* mount structure */
+       xfs_filblks_t   rval;           /* return value */
+
+       mp = ip->i_mount;
+       maxrecs = mp->m_bmap_dmxr[0];
+       for (level = 0, rval = 0;
+            level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
+            level++) {
+               len += maxrecs - 1;
+               do_div(len, maxrecs);
+               rval += len;
+               if (len == 1)
+                       return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+                               level - 1;
+               if (level == 0)
+                       maxrecs = mp->m_bmap_dmxr[1];
+       }
+       return rval;
+}
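For intuition, the same worst case restated in plain C: each pass adds
ceil(len / fan-out) blocks for one level, and once a single block suffices,
one more block per remaining level is added. The fan-out figures below are
assumptions for the example; do_div() becomes ordinary division in userspace:

    #include <stdio.h>

    static unsigned long long worst_indlen(unsigned long long len,
                                           unsigned int leafrecs,
                                           unsigned int noderecs, int maxlevels)
    {
            unsigned long long rval = 0;
            unsigned int maxrecs = leafrecs;
            int level;

            for (level = 0; level < maxlevels; level++) {
                    len = (len + maxrecs - 1) / maxrecs;    /* blocks at this level */
                    rval += len;
                    if (len == 1)   /* one block left: one more per level above */
                            return rval + maxlevels - level - 1;
                    if (level == 0)
                            maxrecs = noderecs;
            }
            return rval;
    }

    int main(void)
    {
            /* a 1000000-block delayed extent, 125 recs/leaf, 250 recs/node, 5 levels */
            printf("%llu indirect blocks\n", worst_indlen(1000000, 125, 250, 5));
            return 0;
    }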
+
+/*
+ * Calculate the default attribute fork offset for newly created inodes.
+ */
+uint
+xfs_default_attroffset(
+       struct xfs_inode        *ip)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       uint                    offset;
+
+       if (mp->m_sb.sb_inodesize == 256) {
+               offset = XFS_LITINO(mp, ip->i_d.di_version) -
+                               XFS_BMDR_SPACE_CALC(MINABTPTRS);
+       } else {
+               offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+       }
+
+       ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
+       return offset;
+}
+
+/*
+ * Helper routine to reset inode di_forkoff field when switching
+ * attribute fork from local to extent format - we reset it where
+ * possible to make space available for inline data fork extents.
+ */
+STATIC void
+xfs_bmap_forkoff_reset(
+       xfs_inode_t     *ip,
+       int             whichfork)
+{
+       if (whichfork == XFS_ATTR_FORK &&
+           ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
+           ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
+           ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
+               uint    dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+
+               if (dfl_forkoff > ip->i_d.di_forkoff)
+                       ip->i_d.di_forkoff = dfl_forkoff;
+       }
+}
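As the ">> 3" here shows, di_forkoff is stored in 8-byte units rather than
bytes (the same shift appears again in xfs_bmap_add_attrfork() below). A
two-line illustration of the unit conversion, with an assumed byte offset:

    #include <assert.h>

    int main(void)
    {
            unsigned int offset_bytes = 208;                /* assumed byte offset */
            unsigned char di_forkoff = offset_bytes >> 3;   /* kept in 8-byte units */

            assert(di_forkoff * 8 == offset_bytes);
            return 0;
    }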
+
+/*
+ * Debug/sanity checking code
+ */
+
+STATIC int
+xfs_bmap_sanity_check(
+       struct xfs_mount        *mp,
+       struct xfs_buf          *bp,
+       int                     level)
+{
+       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+
+       if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
+           block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
+               return 0;
+
+       if (be16_to_cpu(block->bb_level) != level ||
+           be16_to_cpu(block->bb_numrecs) == 0 ||
+           be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+               return 0;
+
+       return 1;
+}
+
+#ifdef DEBUG
+STATIC struct xfs_buf *
+xfs_bmap_get_bp(
+       struct xfs_btree_cur    *cur,
+       xfs_fsblock_t           bno)
+{
+       struct xfs_log_item_desc *lidp;
+       int                     i;
+
+       if (!cur)
+               return NULL;
+
+       for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
+               if (!cur->bc_bufs[i])
+                       break;
+               if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
+                       return cur->bc_bufs[i];
+       }
+
+       /* Chase down all the log items to see if the bp is there */
+       list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
+               struct xfs_buf_log_item *bip;
+               bip = (struct xfs_buf_log_item *)lidp->lid_item;
+               if (bip->bli_item.li_type == XFS_LI_BUF &&
+                   XFS_BUF_ADDR(bip->bli_buf) == bno)
+                       return bip->bli_buf;
+       }
+
+       return NULL;
+}
+
+STATIC void
+xfs_check_block(
+       struct xfs_btree_block  *block,
+       xfs_mount_t             *mp,
+       int                     root,
+       short                   sz)
+{
+       int                     i, j, dmxr;
+       __be64                  *pp, *thispa;   /* pointer to block address */
+       xfs_bmbt_key_t          *prevp, *keyp;
+
+       ASSERT(be16_to_cpu(block->bb_level) > 0);
+
+       prevp = NULL;
+       for (i = 1; i <= xfs_btree_get_numrecs(block); i++) {
+               dmxr = mp->m_bmap_dmxr[0];
+               keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
+
+               if (prevp) {
+                       ASSERT(be64_to_cpu(prevp->br_startoff) <
+                              be64_to_cpu(keyp->br_startoff));
+               }
+               prevp = keyp;
+
+               /*
+                * Compare the block numbers to see if there are dups.
+                */
+               if (root)
+                       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+               else
+                       pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
+
+               for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
+                       if (root)
+                               thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+                       else
+                               thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
+                       if (*thispa == *pp) {
+                               xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
+                                       __func__, j, i,
+                                       (unsigned long long)be64_to_cpu(*thispa));
+                               panic("%s: ptrs are equal in node\n",
+                                       __func__);
+                       }
+               }
+       }
+}
+
+/*
+ * Check that the extents for the inode ip are in the right order in all
+ * btree leaves.
+ */
+
+STATIC void
+xfs_bmap_check_leaf_extents(
+       xfs_btree_cur_t         *cur,   /* btree cursor or null */
+       xfs_inode_t             *ip,            /* incore inode pointer */
+       int                     whichfork)      /* data or attr fork */
+{
+       struct xfs_btree_block  *block; /* current btree block */
+       xfs_fsblock_t           bno;    /* block # of "block" */
+       xfs_buf_t               *bp;    /* buffer for "block" */
+       int                     error;  /* error return value */
+       xfs_extnum_t            i = 0, j;       /* index into the extents list */
+       xfs_ifork_t             *ifp;   /* fork structure */
+       int                     level;  /* btree level, for checking */
+       xfs_mount_t             *mp;    /* file system mount structure */
+       __be64                  *pp;    /* pointer to block address */
+       xfs_bmbt_rec_t          *ep;    /* pointer to current extent */
+       xfs_bmbt_rec_t          last = {0, 0}; /* last extent in prev block */
+       xfs_bmbt_rec_t          *nextp; /* pointer to next extent */
+       int                     bp_release = 0;
+
+       if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+               return;
+       }
+
+       bno = NULLFSBLOCK;
+       mp = ip->i_mount;
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       block = ifp->if_broot;
+       /*
+        * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+        */
+       level = be16_to_cpu(block->bb_level);
+       ASSERT(level > 0);
+       xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
+       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+       bno = be64_to_cpu(*pp);
+
+       ASSERT(bno != NULLFSBLOCK);
+       ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+       ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+       /*
+        * Go down the tree until leaf level is reached, following the first
+        * pointer (leftmost) at each level.
+        */
+       while (level-- > 0) {
+               /* See if buf is in cur first */
+               bp_release = 0;
+               bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+               if (!bp) {
+                       bp_release = 1;
+                       error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+                                               XFS_BMAP_BTREE_REF,
+                                               &xfs_bmbt_buf_ops);
+                       if (error)
+                               goto error_norelse;
+               }
+               block = XFS_BUF_TO_BLOCK(bp);
+               XFS_WANT_CORRUPTED_GOTO(
+                       xfs_bmap_sanity_check(mp, bp, level),
+                       error0);
+               if (level == 0)
+                       break;
+
+               /*
+                * Check this block for basic sanity (increasing keys and
+                * no duplicate blocks).
+                */
+
+               xfs_check_block(block, mp, 0, 0);
+               pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+               bno = be64_to_cpu(*pp);
+               XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+               if (bp_release) {
+                       bp_release = 0;
+                       xfs_trans_brelse(NULL, bp);
+               }
+       }
+
+       /*
+        * Here with bp and block set to the leftmost leaf node in the tree.
+        */
+       i = 0;
+
+       /*
+        * Loop over all leaf nodes, checking that all extents are in the
+        * right order.
+        */
+       for (;;) {
+               xfs_fsblock_t   nextbno;
+               xfs_extnum_t    num_recs;
+
+
+               num_recs = xfs_btree_get_numrecs(block);
+
+               /*
+                * Read-ahead the next leaf block, if any.
+                */
+
+               nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+               /*
+                * Check all the extents to make sure they are OK.
+                * If we had a previous block, the last entry should
+                * conform with the first entry in this one.
+                */
+
+               ep = XFS_BMBT_REC_ADDR(mp, block, 1);
+               if (i) {
+                       ASSERT(xfs_bmbt_disk_get_startoff(&last) +
+                              xfs_bmbt_disk_get_blockcount(&last) <=
+                              xfs_bmbt_disk_get_startoff(ep));
+               }
+               for (j = 1; j < num_recs; j++) {
+                       nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+                       ASSERT(xfs_bmbt_disk_get_startoff(ep) +
+                              xfs_bmbt_disk_get_blockcount(ep) <=
+                              xfs_bmbt_disk_get_startoff(nextp));
+                       ep = nextp;
+               }
+
+               last = *ep;
+               i += num_recs;
+               if (bp_release) {
+                       bp_release = 0;
+                       xfs_trans_brelse(NULL, bp);
+               }
+               bno = nextbno;
+               /*
+                * If we've reached the end, stop.
+                */
+               if (bno == NULLFSBLOCK)
+                       break;
+
+               bp_release = 0;
+               bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+               if (!bp) {
+                       bp_release = 1;
+                       error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+                                               XFS_BMAP_BTREE_REF,
+                                               &xfs_bmbt_buf_ops);
+                       if (error)
+                               goto error_norelse;
+               }
+               block = XFS_BUF_TO_BLOCK(bp);
+       }
+       if (bp_release) {
+               bp_release = 0;
+               xfs_trans_brelse(NULL, bp);
+       }
+       return;
+
+error0:
+       xfs_warn(mp, "%s: at error0", __func__);
+       if (bp_release)
+               xfs_trans_brelse(NULL, bp);
+error_norelse:
+       xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+               __func__, i);
+       panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
+       return;
+}
+
+/*
+ * Add bmap trace insert entries for all the contents of the extent records.
+ */
+void
+xfs_bmap_trace_exlist(
+       xfs_inode_t     *ip,            /* incore inode pointer */
+       xfs_extnum_t    cnt,            /* count of entries in the list */
+       int             whichfork,      /* data or attr fork */
+       unsigned long   caller_ip)
+{
+       xfs_extnum_t    idx;            /* extent record index */
+       xfs_ifork_t     *ifp;           /* inode fork pointer */
+       int             state = 0;
+
+       if (whichfork == XFS_ATTR_FORK)
+               state |= BMAP_ATTRFORK;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+       for (idx = 0; idx < cnt; idx++)
+               trace_xfs_extlist(ip, idx, whichfork, caller_ip);
+}
+
+/*
+ * Validate that the bmbt_irecs being returned from bmapi are valid
+ * given the caller's original parameters.  Specifically check the
+ * ranges of the returned irecs to ensure that they only extend beyond
+ * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
+ */
+STATIC void
+xfs_bmap_validate_ret(
+       xfs_fileoff_t           bno,
+       xfs_filblks_t           len,
+       int                     flags,
+       xfs_bmbt_irec_t         *mval,
+       int                     nmap,
+       int                     ret_nmap)
+{
+       int                     i;              /* index to map values */
+
+       ASSERT(ret_nmap <= nmap);
+
+       for (i = 0; i < ret_nmap; i++) {
+               ASSERT(mval[i].br_blockcount > 0);
+               if (!(flags & XFS_BMAPI_ENTIRE)) {
+                       ASSERT(mval[i].br_startoff >= bno);
+                       ASSERT(mval[i].br_blockcount <= len);
+                       ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
+                              bno + len);
+               } else {
+                       ASSERT(mval[i].br_startoff < bno + len);
+                       ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
+                              bno);
+               }
+               ASSERT(i == 0 ||
+                      mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
+                      mval[i].br_startoff);
+               ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
+                      mval[i].br_startblock != HOLESTARTBLOCK);
+               ASSERT(mval[i].br_state == XFS_EXT_NORM ||
+                      mval[i].br_state == XFS_EXT_UNWRITTEN);
+       }
+}
+
+#else
+#define xfs_bmap_check_leaf_extents(cur, ip, whichfork)                do { } while (0)
+#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
+#endif /* DEBUG */
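With DEBUG off, the checks compile away to the usual do { } while (0) stub,
which keeps the macro statement-like and still requires a trailing semicolon.
A minimal illustration of the pattern; DEBUG_CHECK is an invented name:

    #include <stdio.h>

    #ifdef DEBUG
    #define DEBUG_CHECK(x) \
            do { if (!(x)) fprintf(stderr, "bad: %s\n", #x); } while (0)
    #else
    #define DEBUG_CHECK(x)  do { } while (0)    /* compiles to nothing */
    #endif

    int main(void)
    {
            DEBUG_CHECK(1 + 1 == 2);    /* a no-op unless built with -DDEBUG */
            return 0;
    }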
+
+/*
+ * bmap free list manipulation functions
+ */
+
+/*
+ * Add the extent to the list of extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+xfs_bmap_add_free(
+       xfs_fsblock_t           bno,            /* fs block number of extent */
+       xfs_filblks_t           len,            /* length of extent */
+       xfs_bmap_free_t         *flist,         /* list of extents */
+       xfs_mount_t             *mp)            /* mount point structure */
+{
+       xfs_bmap_free_item_t    *cur;           /* current (next) element */
+       xfs_bmap_free_item_t    *new;           /* new element */
+       xfs_bmap_free_item_t    *prev;          /* previous element */
+#ifdef DEBUG
+       xfs_agnumber_t          agno;
+       xfs_agblock_t           agbno;
+
+       ASSERT(bno != NULLFSBLOCK);
+       ASSERT(len > 0);
+       ASSERT(len <= MAXEXTLEN);
+       ASSERT(!isnullstartblock(bno));
+       agno = XFS_FSB_TO_AGNO(mp, bno);
+       agbno = XFS_FSB_TO_AGBNO(mp, bno);
+       ASSERT(agno < mp->m_sb.sb_agcount);
+       ASSERT(agbno < mp->m_sb.sb_agblocks);
+       ASSERT(len < mp->m_sb.sb_agblocks);
+       ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+       ASSERT(xfs_bmap_free_item_zone != NULL);
+       new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+       new->xbfi_startblock = bno;
+       new->xbfi_blockcount = (xfs_extlen_t)len;
+       for (prev = NULL, cur = flist->xbf_first;
+            cur != NULL;
+            prev = cur, cur = cur->xbfi_next) {
+               if (cur->xbfi_startblock >= bno)
+                       break;
+       }
+       if (prev)
+               prev->xbfi_next = new;
+       else
+               flist->xbf_first = new;
+       new->xbfi_next = cur;
+       flist->xbf_count++;
+}
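Since flist is a singly linked list kept ordered by start block, the code
above is the classic scan-and-splice insertion. The same pattern in a
self-contained sketch; the struct and function names are invented:

    #include <stddef.h>

    struct freeitem {
            unsigned long long      startblock;
            struct freeitem         *next;
    };

    /* Splice "item" into *headp, keeping the list sorted by startblock */
    static void sorted_insert(struct freeitem **headp, struct freeitem *item)
    {
            struct freeitem *prev = NULL, *cur;

            for (cur = *headp; cur; prev = cur, cur = cur->next)
                    if (cur->startblock >= item->startblock)
                            break;
            if (prev)
                    prev->next = item;
            else
                    *headp = item;
            item->next = cur;
    }

    int main(void)
    {
            struct freeitem a = { 30, NULL }, b = { 10, NULL }, c = { 20, NULL };
            struct freeitem *head = NULL;

            sorted_insert(&head, &a);
            sorted_insert(&head, &b);
            sorted_insert(&head, &c);   /* head is now 10 -> 20 -> 30 */
            return 0;
    }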
+
+/*
+ * Remove the entry "free" from the free item list.  Prev points to the
+ * previous entry, unless "free" is the head of the list.
+ */
+void
+xfs_bmap_del_free(
+       xfs_bmap_free_t         *flist, /* free item list header */
+       xfs_bmap_free_item_t    *prev,  /* previous item on list, if any */
+       xfs_bmap_free_item_t    *free)  /* list item to be freed */
+{
+       if (prev)
+               prev->xbfi_next = free->xbfi_next;
+       else
+               flist->xbf_first = free->xbfi_next;
+       flist->xbf_count--;
+       kmem_zone_free(xfs_bmap_free_item_zone, free);
+}
+
+/*
+ * Free up any items left in the list.
+ */
+void
+xfs_bmap_cancel(
+       xfs_bmap_free_t         *flist) /* list of bmap_free_items */
+{
+       xfs_bmap_free_item_t    *free;  /* free list item */
+       xfs_bmap_free_item_t    *next;
+
+       if (flist->xbf_count == 0)
+               return;
+       ASSERT(flist->xbf_first != NULL);
+       for (free = flist->xbf_first; free; free = next) {
+               next = free->xbfi_next;
+               xfs_bmap_del_free(flist, NULL, free);
+       }
+       ASSERT(flist->xbf_count == 0);
+}
+
+/*
+ * Inode fork format manipulation functions
+ */
+
+/*
+ * Transform a btree format file with only one leaf node, where the
+ * extents list will fit in the inode, into an extents format file.
+ * Since the file extents are already in-core, all we have to do is
+ * give up the space for the btree root and pitch the leaf block.
+ */
+STATIC int                             /* error */
+xfs_bmap_btree_to_extents(
+       xfs_trans_t             *tp,    /* transaction pointer */
+       xfs_inode_t             *ip,    /* incore inode pointer */
+       xfs_btree_cur_t         *cur,   /* btree cursor */
+       int                     *logflagsp, /* inode logging flags */
+       int                     whichfork)  /* data or attr fork */
+{
+       /* REFERENCED */
+       struct xfs_btree_block  *cblock;/* child btree block */
+       xfs_fsblock_t           cbno;   /* child block number */
+       xfs_buf_t               *cbp;   /* child block's buffer */
+       int                     error;  /* error return value */
+       xfs_ifork_t             *ifp;   /* inode fork data */
+       xfs_mount_t             *mp;    /* mount point structure */
+       __be64                  *pp;    /* ptr to block address */
+       struct xfs_btree_block  *rblock;/* root btree block */
+
+       mp = ip->i_mount;
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+       ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+       rblock = ifp->if_broot;
+       ASSERT(be16_to_cpu(rblock->bb_level) == 1);
+       ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
+       ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
+       cbno = be64_to_cpu(*pp);
+       *logflagsp = 0;
+#ifdef DEBUG
+       if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
+               return error;
+#endif
+       error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+                               &xfs_bmbt_buf_ops);
+       if (error)
+               return error;
+       cblock = XFS_BUF_TO_BLOCK(cbp);
+       if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
+               return error;
+       xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
+       ip->i_d.di_nblocks--;
+       xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+       xfs_trans_binval(tp, cbp);
+       if (cur->bc_bufs[0] == cbp)
+               cur->bc_bufs[0] = NULL;
+       xfs_iroot_realloc(ip, -1, whichfork);
+       ASSERT(ifp->if_broot == NULL);
+       ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
+       XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+       *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+       return 0;
+}
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ */
+STATIC int                                     /* error */
+xfs_bmap_extents_to_btree(
+       xfs_trans_t             *tp,            /* transaction pointer */
+       xfs_inode_t             *ip,            /* incore inode pointer */
+       xfs_fsblock_t           *firstblock,    /* first-block-allocated */
+       xfs_bmap_free_t         *flist,         /* blocks freed in xaction */
+       xfs_btree_cur_t         **curp,         /* cursor returned to caller */
+       int                     wasdel,         /* converting a delayed alloc */
+       int                     *logflagsp,     /* inode logging flags */
+       int                     whichfork)      /* data or attr fork */
+{
+       struct xfs_btree_block  *ablock;        /* allocated (child) bt block */
+       xfs_buf_t               *abp;           /* buffer for ablock */
+       xfs_alloc_arg_t         args;           /* allocation arguments */
+       xfs_bmbt_rec_t          *arp;           /* child record pointer */
+       struct xfs_btree_block  *block;         /* btree root block */
+       xfs_btree_cur_t         *cur;           /* bmap btree cursor */
+       xfs_bmbt_rec_host_t     *ep;            /* extent record pointer */
+       int                     error;          /* error return value */
+       xfs_extnum_t            i, cnt;         /* extent record index */
+       xfs_ifork_t             *ifp;           /* inode fork pointer */
+       xfs_bmbt_key_t          *kp;            /* root block key pointer */
+       xfs_mount_t             *mp;            /* mount structure */
+       xfs_extnum_t            nextents;       /* number of file extents */
+       xfs_bmbt_ptr_t          *pp;            /* root block address pointer */
+
+       mp = ip->i_mount;
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
+
+       /*
+        * Make space in the inode incore.
+        */
+       xfs_iroot_realloc(ip, 1, whichfork);
+       ifp->if_flags |= XFS_IFBROOT;
+
+       /*
+        * Fill in the root.
+        */
+       block = ifp->if_broot;
+       if (xfs_sb_version_hascrc(&mp->m_sb))
+               xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+                                XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
+                                XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+       else
+               xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+                                XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
+                                XFS_BTREE_LONG_PTRS);
+
+       /*
+        * Need a cursor.  Can't allocate until bb_level is filled in.
+        */
+       cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+       cur->bc_private.b.firstblock = *firstblock;
+       cur->bc_private.b.flist = flist;
+       cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+       /*
+        * Convert to a btree with two levels, one record in root.
+        */
+       XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+       memset(&args, 0, sizeof(args));
+       args.tp = tp;
+       args.mp = mp;
+       args.firstblock = *firstblock;
+       if (*firstblock == NULLFSBLOCK) {
+               args.type = XFS_ALLOCTYPE_START_BNO;
+               args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+       } else if (flist->xbf_low) {
+               args.type = XFS_ALLOCTYPE_START_BNO;
+               args.fsbno = *firstblock;
+       } else {
+               args.type = XFS_ALLOCTYPE_NEAR_BNO;
+               args.fsbno = *firstblock;
+       }
+       args.minlen = args.maxlen = args.prod = 1;
+       args.wasdel = wasdel;
+       *logflagsp = 0;
+       if ((error = xfs_alloc_vextent(&args))) {
+               xfs_iroot_realloc(ip, -1, whichfork);
+               xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+               return error;
+       }
+       /*
+        * Allocation can't fail, the space was reserved.
+        */
+       ASSERT(args.fsbno != NULLFSBLOCK);
+       ASSERT(*firstblock == NULLFSBLOCK ||
+              args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+              (flist->xbf_low &&
+               args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+       *firstblock = cur->bc_private.b.firstblock = args.fsbno;
+       cur->bc_private.b.allocated++;
+       ip->i_d.di_nblocks++;
+       xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+       abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+       /*
+        * Fill in the child block.
+        */
+       abp->b_ops = &xfs_bmbt_buf_ops;
+       ablock = XFS_BUF_TO_BLOCK(abp);
+       if (xfs_sb_version_hascrc(&mp->m_sb))
+               xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+                               XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+                               XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+       else
+               xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+                               XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+                               XFS_BTREE_LONG_PTRS);
+
+       arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       for (cnt = i = 0; i < nextents; i++) {
+               ep = xfs_iext_get_ext(ifp, i);
+               if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
+                       arp->l0 = cpu_to_be64(ep->l0);
+                       arp->l1 = cpu_to_be64(ep->l1);
+                       arp++; cnt++;
+               }
+       }
+       ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
+       xfs_btree_set_numrecs(ablock, cnt);
+
+       /*
+        * Fill in the root key and pointer.
+        */
+       kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+       arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+       kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
+       pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+                                               be16_to_cpu(block->bb_level)));
+       *pp = cpu_to_be64(args.fsbno);
+
+       /*
+        * Do all this logging at the end so that
+        * the root is at the right level.
+        */
+       xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+       xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+       ASSERT(*curp == NULL);
+       *curp = cur;
+       *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
+       return 0;
+}
+
+/*
+ * Convert a local file to an extents file.
+ * This code is out of bounds for data forks of regular files,
+ * since the file data needs to get logged so things will stay consistent.
+ * (The bmap-level manipulations are ok, though).
+ */
+void
+xfs_bmap_local_to_extents_empty(
+       struct xfs_inode        *ip,
+       int                     whichfork)
+{
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+
+       ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+       ASSERT(ifp->if_bytes == 0);
+       ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+
+       xfs_bmap_forkoff_reset(ip, whichfork);
+       ifp->if_flags &= ~XFS_IFINLINE;
+       ifp->if_flags |= XFS_IFEXTENTS;
+       XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+}
+
+
+STATIC int                             /* error */
+xfs_bmap_local_to_extents(
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_inode_t     *ip,            /* incore inode pointer */
+       xfs_fsblock_t   *firstblock,    /* first block allocated in xaction */
+       xfs_extlen_t    total,          /* total blocks needed by transaction */
+       int             *logflagsp,     /* inode logging flags */
+       int             whichfork,
+       void            (*init_fn)(struct xfs_trans *tp,
+                                  struct xfs_buf *bp,
+                                  struct xfs_inode *ip,
+                                  struct xfs_ifork *ifp))
+{
+       int             error = 0;
+       int             flags;          /* logging flags returned */
+       xfs_ifork_t     *ifp;           /* inode fork pointer */
+       xfs_alloc_arg_t args;           /* allocation arguments */
+       xfs_buf_t       *bp;            /* buffer for extent block */
+       xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
+
+       /*
+        * We don't want to deal with the case of keeping inode data inline yet.
+        * So passing in the data fork of a regular inode is invalid.
+        */
+       ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+
+       if (!ifp->if_bytes) {
+               xfs_bmap_local_to_extents_empty(ip, whichfork);
+               flags = XFS_ILOG_CORE;
+               goto done;
+       }
+
+       flags = 0;
+       error = 0;
+       ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
+                                                               XFS_IFINLINE);
+       memset(&args, 0, sizeof(args));
+       args.tp = tp;
+       args.mp = ip->i_mount;
+       args.firstblock = *firstblock;
+       /*
+        * Allocate a block.  We know we need only one, since the
+        * file currently fits in an inode.
+        */
+       if (*firstblock == NULLFSBLOCK) {
+               args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+               args.type = XFS_ALLOCTYPE_START_BNO;
+       } else {
+               args.fsbno = *firstblock;
+               args.type = XFS_ALLOCTYPE_NEAR_BNO;
+       }
+       args.total = total;
+       args.minlen = args.maxlen = args.prod = 1;
+       error = xfs_alloc_vextent(&args);
+       if (error)
+               goto done;
+
+       /* Can't fail, the space was reserved. */
+       ASSERT(args.fsbno != NULLFSBLOCK);
+       ASSERT(args.len == 1);
+       *firstblock = args.fsbno;
+       bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+
+       /* initialise the block and copy the data */
+       init_fn(tp, bp, ip, ifp);
+
+       /* account for the change in fork size and log everything */
+       xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+       xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+       xfs_bmap_local_to_extents_empty(ip, whichfork);
+       flags |= XFS_ILOG_CORE;
+
+       xfs_iext_add(ifp, 0, 1);
+       ep = xfs_iext_get_ext(ifp, 0);
+       xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
+       trace_xfs_bmap_post_update(ip, 0,
+                       whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
+                       _THIS_IP_);
+       XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+       ip->i_d.di_nblocks = 1;
+       xfs_trans_mod_dquot_byino(tp, ip,
+               XFS_TRANS_DQ_BCOUNT, 1L);
+       flags |= xfs_ilog_fext(whichfork);
+
+done:
+       *logflagsp = flags;
+       return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle btree format files.
+ */
+STATIC int                                     /* error */
+xfs_bmap_add_attrfork_btree(
+       xfs_trans_t             *tp,            /* transaction pointer */
+       xfs_inode_t             *ip,            /* incore inode pointer */
+       xfs_fsblock_t           *firstblock,    /* first block allocated */
+       xfs_bmap_free_t         *flist,         /* blocks to free at commit */
+       int                     *flags)         /* inode logging flags */
+{
+       xfs_btree_cur_t         *cur;           /* btree cursor */
+       int                     error;          /* error return value */
+       xfs_mount_t             *mp;            /* file system mount struct */
+       int                     stat;           /* newroot status */
+
+       mp = ip->i_mount;
+       if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
+               *flags |= XFS_ILOG_DBROOT;
+       else {
+               cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
+               cur->bc_private.b.flist = flist;
+               cur->bc_private.b.firstblock = *firstblock;
+               if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
+                       goto error0;
+               /* must be at least one entry */
+               XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
+               if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
+                       goto error0;
+               if (stat == 0) {
+                       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+                       return -ENOSPC;
+               }
+               *firstblock = cur->bc_private.b.firstblock;
+               cur->bc_private.b.allocated = 0;
+               xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+       }
+       return 0;
+error0:
+       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+       return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle extents format files.
+ */
+STATIC int                                     /* error */
+xfs_bmap_add_attrfork_extents(
+       xfs_trans_t             *tp,            /* transaction pointer */
+       xfs_inode_t             *ip,            /* incore inode pointer */
+       xfs_fsblock_t           *firstblock,    /* first block allocated */
+       xfs_bmap_free_t         *flist,         /* blocks to free at commit */
+       int                     *flags)         /* inode logging flags */
+{
+       xfs_btree_cur_t         *cur;           /* bmap btree cursor */
+       int                     error;          /* error return value */
+
+       if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
+               return 0;
+       cur = NULL;
+       error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0,
+               flags, XFS_DATA_FORK);
+       if (cur) {
+               cur->bc_private.b.allocated = 0;
+               xfs_btree_del_cursor(cur,
+                       error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+       }
+       return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle local format files. Each
+ * different data fork content type needs a different callout to do the
+ * conversion. Some are basic and only require special block initialisation
+ * callouts for the data formatting; others (directories) are so specialised they
+ * handle everything themselves.
+ *
+ * XXX (dgc): investigate whether directory conversion can use the generic
+ * formatting callout. It should be possible - it's just a very complex
+ * formatter.
+ */
+STATIC int                                     /* error */
+xfs_bmap_add_attrfork_local(
+       xfs_trans_t             *tp,            /* transaction pointer */
+       xfs_inode_t             *ip,            /* incore inode pointer */
+       xfs_fsblock_t           *firstblock,    /* first block allocated */
+       xfs_bmap_free_t         *flist,         /* blocks to free at commit */
+       int                     *flags)         /* inode logging flags */
+{
+       xfs_da_args_t           dargs;          /* args for dir/attr code */
+
+       if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
+               return 0;
+
+       if (S_ISDIR(ip->i_d.di_mode)) {
+               memset(&dargs, 0, sizeof(dargs));
+               dargs.geo = ip->i_mount->m_dir_geo;
+               dargs.dp = ip;
+               dargs.firstblock = firstblock;
+               dargs.flist = flist;
+               dargs.total = dargs.geo->fsbcount;
+               dargs.whichfork = XFS_DATA_FORK;
+               dargs.trans = tp;
+               return xfs_dir2_sf_to_block(&dargs);
+       }
+
+       if (S_ISLNK(ip->i_d.di_mode))
+               return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
+                                                flags, XFS_DATA_FORK,
+                                                xfs_symlink_local_to_remote);
+
+       /* should only be called for types that support local format data */
+       ASSERT(0);
+       return -EFSCORRUPTED;
+}
+
+/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ */
+int                                            /* error code */
+xfs_bmap_add_attrfork(
+       xfs_inode_t             *ip,            /* incore inode pointer */
+       int                     size,           /* space new attribute needs */
+       int                     rsvd)           /* xact may use reserved blks */
+{
+       xfs_fsblock_t           firstblock;     /* 1st block/ag allocated */
+       xfs_bmap_free_t         flist;          /* freed extent records */
+       xfs_mount_t             *mp;            /* mount structure */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       int                     blks;           /* space reservation */
+       int                     version = 1;    /* superblock attr version */
+       int                     committed;      /* xaction was committed */
+       int                     logflags;       /* logging flags */
+       int                     error;          /* error return value */
+       int                     cancel_flags = 0;
+
+       ASSERT(XFS_IFORK_Q(ip) == 0);
+
+       mp = ip->i_mount;
+       ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+       tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+       blks = XFS_ADDAFORK_SPACE_RES(mp);
+       if (rsvd)
+               tp->t_flags |= XFS_TRANS_RESERVE;
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               return error;
+       }
+       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
+                       XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+                       XFS_QMOPT_RES_REGBLKS);
+       if (error)
+               goto trans_cancel;
+       cancel_flags |= XFS_TRANS_ABORT;
+       if (XFS_IFORK_Q(ip))
+               goto trans_cancel;
+       if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+               /*
+                * For inodes coming from pre-6.2 filesystems.
+                */
+               ASSERT(ip->i_d.di_aformat == 0);
+               ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+       }
+       ASSERT(ip->i_d.di_anextents == 0);
+
+       xfs_trans_ijoin(tp, ip, 0);
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       switch (ip->i_d.di_format) {
+       case XFS_DINODE_FMT_DEV:
+               ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+               break;
+       case XFS_DINODE_FMT_UUID:
+               ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
+               break;
+       case XFS_DINODE_FMT_LOCAL:
+       case XFS_DINODE_FMT_EXTENTS:
+       case XFS_DINODE_FMT_BTREE:
+               ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+               if (!ip->i_d.di_forkoff)
+                       ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+               else if (mp->m_flags & XFS_MOUNT_ATTR2)
+                       version = 2;
+               break;
+       default:
+               ASSERT(0);
+               error = -EINVAL;
+               goto trans_cancel;
+       }
+
+       ASSERT(ip->i_afp == NULL);
+       ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+       ip->i_afp->if_flags = XFS_IFEXTENTS;
+       logflags = 0;
+       xfs_bmap_init(&flist, &firstblock);
+       switch (ip->i_d.di_format) {
+       case XFS_DINODE_FMT_LOCAL:
+               error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+                       &logflags);
+               break;
+       case XFS_DINODE_FMT_EXTENTS:
+               error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
+                       &flist, &logflags);
+               break;
+       case XFS_DINODE_FMT_BTREE:
+               error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+                       &logflags);
+               break;
+       default:
+               error = 0;
+               break;
+       }
+       if (logflags)
+               xfs_trans_log_inode(tp, ip, logflags);
+       if (error)
+               goto bmap_cancel;
+       if (!xfs_sb_version_hasattr(&mp->m_sb) ||
+          (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
+               __int64_t sbfields = 0;
+
+               spin_lock(&mp->m_sb_lock);
+               if (!xfs_sb_version_hasattr(&mp->m_sb)) {
+                       xfs_sb_version_addattr(&mp->m_sb);
+                       sbfields |= XFS_SB_VERSIONNUM;
+               }
+               if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
+                       xfs_sb_version_addattr2(&mp->m_sb);
+                       sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+               }
+               if (sbfields) {
+                       spin_unlock(&mp->m_sb_lock);
+                       xfs_mod_sb(tp, sbfields);
+               } else
+                       spin_unlock(&mp->m_sb_lock);
+       }
+
+       error = xfs_bmap_finish(&tp, &flist, &committed);
+       if (error)
+               goto bmap_cancel;
+       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return error;
+
+bmap_cancel:
+       xfs_bmap_cancel(&flist);
+trans_cancel:
+       xfs_trans_cancel(tp, cancel_flags);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return error;
+}
+
+/*
+ * Internal and external extent tree search functions.
+ */
+
+/*
+ * Read in the extents to if_extents.
+ * All inode fields are set up by the caller; we just traverse the btree
+ * and copy the records in. If the file system cannot contain unwritten
+ * extents, the records are checked to ensure no "state" flags are set.
+ */
+int                                    /* error */
+xfs_bmap_read_extents(
+       xfs_trans_t             *tp,    /* transaction pointer */
+       xfs_inode_t             *ip,    /* incore inode */
+       int                     whichfork) /* data or attr fork */
+{
+       struct xfs_btree_block  *block; /* current btree block */
+       xfs_fsblock_t           bno;    /* block # of "block" */
+       xfs_buf_t               *bp;    /* buffer for "block" */
+       int                     error;  /* error return value */
+       xfs_exntfmt_t           exntf;  /* XFS_EXTFMT_NOSTATE, if checking */
+       xfs_extnum_t            i, j;   /* index into the extents list */
+       xfs_ifork_t             *ifp;   /* fork structure */
+       int                     level;  /* btree level, for checking */
+       xfs_mount_t             *mp;    /* file system mount structure */
+       __be64                  *pp;    /* pointer to block address */
+       /* REFERENCED */
+       xfs_extnum_t            room;   /* number of entries there's room for */
+
+       bno = NULLFSBLOCK;
+       mp = ip->i_mount;
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
+                                       XFS_EXTFMT_INODE(ip);
+       block = ifp->if_broot;
+       /*
+        * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+        */
+       level = be16_to_cpu(block->bb_level);
+       ASSERT(level > 0);
+       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+       bno = be64_to_cpu(*pp);
+       ASSERT(bno != NULLFSBLOCK);
+       ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+       ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+       /*
+        * Go down the tree until leaf level is reached, following the first
+        * pointer (leftmost) at each level.
+        */
+       while (level-- > 0) {
+               error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+                               XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+               if (error)
+                       return error;
+               block = XFS_BUF_TO_BLOCK(bp);
+               XFS_WANT_CORRUPTED_GOTO(
+                       xfs_bmap_sanity_check(mp, bp, level),
+                       error0);
+               if (level == 0)
+                       break;
+               pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+               bno = be64_to_cpu(*pp);
+               XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+               xfs_trans_brelse(tp, bp);
+       }
+       /*
+        * Here with bp and block set to the leftmost leaf node in the tree.
+        */
+       room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       i = 0;
+       /*
+        * Loop over all leaf nodes.  Copy information to the extent records.
+        */
+       for (;;) {
+               xfs_bmbt_rec_t  *frp;
+               xfs_fsblock_t   nextbno;
+               xfs_extnum_t    num_recs;
+               xfs_extnum_t    start;
+
+               num_recs = xfs_btree_get_numrecs(block);
+               if (unlikely(i + num_recs > room)) {
+                       ASSERT(i + num_recs <= room);
+                       xfs_warn(ip->i_mount,
+                               "corrupt dinode %Lu, (btree extents).",
+                               (unsigned long long) ip->i_ino);
+                       XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
+                               XFS_ERRLEVEL_LOW, ip->i_mount, block);
+                       goto error0;
+               }
+               XFS_WANT_CORRUPTED_GOTO(
+                       xfs_bmap_sanity_check(mp, bp, 0),
+                       error0);
+               /*
+                * Read-ahead the next leaf block, if any.
+                */
+               nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+               if (nextbno != NULLFSBLOCK)
+                       xfs_btree_reada_bufl(mp, nextbno, 1,
+                                            &xfs_bmbt_buf_ops);
+               /*
+                * Copy records into the extent records.
+                */
+               frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+               start = i;
+               for (j = 0; j < num_recs; j++, i++, frp++) {
+                       xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
+                       trp->l0 = be64_to_cpu(frp->l0);
+                       trp->l1 = be64_to_cpu(frp->l1);
+               }
+               if (exntf == XFS_EXTFMT_NOSTATE) {
+                       /*
+                        * Check all attribute bmap btree records and
+                        * any "older" data bmap btree records for a
+                        * set bit in the "extent flag" position.
+                        */
+                       if (unlikely(xfs_check_nostate_extents(ifp,
+                                       start, num_recs))) {
+                               XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
+                                                XFS_ERRLEVEL_LOW,
+                                                ip->i_mount);
+                               goto error0;
+                       }
+               }
+               xfs_trans_brelse(tp, bp);
+               bno = nextbno;
+               /*
+                * If we've reached the end, stop.
+                */
+               if (bno == NULLFSBLOCK)
+                       break;
+               error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+                               XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+               if (error)
+                       return error;
+               block = XFS_BUF_TO_BLOCK(bp);
+       }
+       ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+       ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+       XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
+       return 0;
+error0:
+       xfs_trans_brelse(tp, bp);
+       return -EFSCORRUPTED;
+}
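The traversal shape used here, and by xfs_bmap_check_leaf_extents() earlier,
is: descend the leftmost child pointers until level 0 is reached, then sweep
the leaf chain through the right-sibling links. A toy version over an
invented in-memory node layout:

    #include <stddef.h>

    struct node {
            int             level;      /* 0 == leaf */
            int             nrecs;
            struct node     *leftchild; /* interior nodes: leftmost child */
            struct node     *rightsib;  /* leaves: next leaf, NULL at the end */
    };

    static long count_leaf_records(struct node *root)
    {
            struct node *n = root;
            long total = 0;

            while (n->level > 0)        /* go down the leftmost spine */
                    n = n->leftchild;
            for (; n; n = n->rightsib)  /* then walk the leaf chain */
                    total += n->nrecs;
            return total;
    }

    int main(void)
    {
            struct node leaf2 = { 0, 3, NULL, NULL };
            struct node leaf1 = { 0, 5, NULL, &leaf2 };
            struct node root  = { 1, 2, &leaf1, NULL };

            return count_leaf_records(&root) == 8 ? 0 : 1;
    }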
+
+
+/*
+ * Search the extent records for the entry containing block bno.
+ * If bno lies in a hole, point to the next entry.  If bno lies
+ * past eof, *eofp will be set, and *prevp will contain the last
+ * entry (null if none).  Else, *lastxp will be set to the index
+ * of the found entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_host_t *           /* pointer to found extent entry */
+xfs_bmap_search_multi_extents(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_fileoff_t   bno,            /* block number searched for */
+       int             *eofp,          /* out: end of file found */
+       xfs_extnum_t    *lastxp,        /* out: last extent index */
+       xfs_bmbt_irec_t *gotp,          /* out: extent entry found */
+       xfs_bmbt_irec_t *prevp)         /* out: previous extent entry found */
+{
+       xfs_bmbt_rec_host_t *ep;                /* extent record pointer */
+       xfs_extnum_t    lastx;          /* last extent index */
+
+       /*
+        * Initialize the extent entry structure to catch access to
+        * uninitialized br_startblock field.
+        */
+       gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
+       gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
+       gotp->br_state = XFS_EXT_INVALID;
+       gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
+       prevp->br_startoff = NULLFILEOFF;
+
+       ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
+       if (lastx > 0) {
+               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
+       }
+       if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+               xfs_bmbt_get_all(ep, gotp);
+               *eofp = 0;
+       } else {
+               if (lastx > 0) {
+                       *gotp = *prevp;
+               }
+               *eofp = 1;
+               ep = NULL;
+       }
+       *lastxp = lastx;
+       return ep;
+}
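The same contract restated over a plain sorted array: return the extent
containing bno, the next extent if bno falls in a hole, or one past the end
if bno is beyond the last extent. The kernel does the lookup with a binary
search inside xfs_iext_bno_to_ext(); a linear scan keeps this sketch short,
and all names below are invented:

    struct irec {               /* illustrative (startoff, blockcount) record */
            unsigned long long      startoff;
            unsigned long long      blockcount;
    };

    static int search_extents(const struct irec *recs, int n,
                              unsigned long long bno)
    {
            int i;

            for (i = 0; i < n; i++)
                    if (bno < recs[i].startoff + recs[i].blockcount)
                            return i;
            return n;           /* past EOF */
    }

    int main(void)
    {
            struct irec map[] = { { 0, 10 }, { 50, 5 } };

            /* block 5 -> extent 0; block 20 (a hole) -> extent 1 */
            return search_extents(map, 2, 20) == 1 ? 0 : 1;
    }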
+
+/*
+ * Search the extents list for the inode, for the extent containing bno.
+ * If bno lies in a hole, point to the next entry.  If bno lies past eof,
+ * *eofp will be set, and *prevp will contain the last entry (null if none).
+ * Else, *lastxp will be set to the index of the found
+ * entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_host_t *                 /* pointer to found extent entry */
+xfs_bmap_search_extents(
+       xfs_inode_t     *ip,            /* incore inode pointer */
+       xfs_fileoff_t   bno,            /* block number searched for */
+       int             fork,           /* data or attr fork */
+       int             *eofp,          /* out: end of file found */
+       xfs_extnum_t    *lastxp,        /* out: last extent index */
+       xfs_bmbt_irec_t *gotp,          /* out: extent entry found */
+       xfs_bmbt_irec_t *prevp)         /* out: previous extent entry found */
+{
+       xfs_ifork_t     *ifp;           /* inode fork pointer */
+       xfs_bmbt_rec_host_t  *ep;            /* extent record pointer */
+
+       XFS_STATS_INC(xs_look_exlist);
+       ifp = XFS_IFORK_PTR(ip, fork);
+
+       ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
+
+       if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
+                    !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
+               xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+                               "Access to block zero in inode %llu "
+                               "start_block: %llx start_off: %llx "
+                               "blkcnt: %llx extent-state: %x lastx: %x",
+                       (unsigned long long)ip->i_ino,
+                       (unsigned long long)gotp->br_startblock,
+                       (unsigned long long)gotp->br_startoff,
+                       (unsigned long long)gotp->br_blockcount,
+                       gotp->br_state, *lastxp);
+               *lastxp = NULLEXTNUM;
+               *eofp = 1;
+               return NULL;
+       }
+       return ep;
+}
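+
+/*
+ * Note on the check above: on a non-realtime data device, filesystem
+ * block zero holds the superblock, so a mapping whose br_startblock is
+ * zero can only be the result of corruption.  Block zero of the
+ * realtime device is ordinary file data, hence the realtime exclusion.
+ */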
+
+/*
+ * Returns the file-relative block number of the first unused block in the
+ * file that begins a run of at least "len" logically contiguous free blocks.
+ * This is the lowest-address hole if the file has holes, else the first block
+ * past the end of file.
+ * *first_unused is set to 0 if the file is currently local (in-inode).
+ */
+int                                            /* error */
+xfs_bmap_first_unused(
+       xfs_trans_t     *tp,                    /* transaction pointer */
+       xfs_inode_t     *ip,                    /* incore inode */
+       xfs_extlen_t    len,                    /* size of hole to find */
+       xfs_fileoff_t   *first_unused,          /* unused block */
+       int             whichfork)              /* data or attr fork */
+{
+       int             error;                  /* error return value */
+       int             idx;                    /* extent record index */
+       xfs_ifork_t     *ifp;                   /* inode fork pointer */
+       xfs_fileoff_t   lastaddr;               /* last block number seen */
+       xfs_fileoff_t   lowest;                 /* lowest useful block */
+       xfs_fileoff_t   max;                    /* starting useful block */
+       xfs_fileoff_t   off;                    /* offset for this block */
+       xfs_extnum_t    nextents;               /* number of extent entries */
+
+       ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
+              XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
+              XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+               *first_unused = 0;
+               return 0;
+       }
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+           (error = xfs_iread_extents(tp, ip, whichfork)))
+               return error;
+       lowest = *first_unused;
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
+               xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
+               off = xfs_bmbt_get_startoff(ep);
+               /*
+                * See if the hole before this extent will work.
+                */
+               if (off >= lowest + len && off - max >= len) {
+                       *first_unused = max;
+                       return 0;
+               }
+               lastaddr = off + xfs_bmbt_get_blockcount(ep);
+               max = XFS_FILEOFF_MAX(lastaddr, lowest);
+       }
+       *first_unused = max;
+       return 0;
+}
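+
+/*
+ * Worked example (hypothetical extents, *first_unused passed in as 0):
+ * with mappings at file offsets [0, 5) and [8, 12) and len = 2, the
+ * hole before the second extent starts at max = 5 and off - max is
+ * 8 - 5 = 3 >= len, so *first_unused is set to 5.  With len = 4 the
+ * hole is too small, the loop completes, and *first_unused becomes 12,
+ * the first block past the last extent.
+ */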
+
+/*
+ * Sets *last_block (an input/output value) to the file-relative block
+ * number just past the last mapped block before its input value.
+ * This is not based on i_size, it is based on the extent records.
+ * *last_block is set to 0 for local files, as they do not have extent records.
+ */
+int                                            /* error */
+xfs_bmap_last_before(
+       xfs_trans_t     *tp,                    /* transaction pointer */
+       xfs_inode_t     *ip,                    /* incore inode */
+       xfs_fileoff_t   *last_block,            /* last block */
+       int             whichfork)              /* data or attr fork */
+{
+       xfs_fileoff_t   bno;                    /* input file offset */
+       int             eof;                    /* hit end of file */
+       xfs_bmbt_rec_host_t *ep;                /* pointer to last extent */
+       int             error;                  /* error return value */
+       xfs_bmbt_irec_t got;                    /* current extent value */
+       xfs_ifork_t     *ifp;                   /* inode fork pointer */
+       xfs_extnum_t    lastx;                  /* last extent used */
+       xfs_bmbt_irec_t prev;                   /* previous extent value */
+
+       if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+              return -EIO;
+       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+               *last_block = 0;
+               return 0;
+       }
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+           (error = xfs_iread_extents(tp, ip, whichfork)))
+               return error;
+       bno = *last_block - 1;
+       ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+               &prev);
+       if (eof || xfs_bmbt_get_startoff(ep) > bno) {
+               if (prev.br_startoff == NULLFILEOFF)
+                       *last_block = 0;
+               else
+                       *last_block = prev.br_startoff + prev.br_blockcount;
+       }
+       /*
+        * Otherwise *last_block is already the right answer.
+        */
+       return 0;
+}
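+
+/*
+ * Worked example (hypothetical extents): with a single mapping at
+ * [startoff 4, len 3] and *last_block = 10, the search for bno = 9
+ * hits EOF with prev = [4, 3], so *last_block becomes 4 + 3 = 7.
+ * If the fork were empty, prev.br_startoff would be NULLFILEOFF and
+ * *last_block would become 0.
+ */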
+
+int
+xfs_bmap_last_extent(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       int                     whichfork,
+       struct xfs_bmbt_irec    *rec,
+       int                     *is_empty)
+{
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+       int                     error;
+       int                     nextents;
+
+       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+               error = xfs_iread_extents(tp, ip, whichfork);
+               if (error)
+                       return error;
+       }
+
+       nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+       if (nextents == 0) {
+               *is_empty = 1;
+               return 0;
+       }
+
+       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
+       *is_empty = 0;
+       return 0;
+}
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not immediately follow the previous
+ * data blocks, we will try to align the new blocks at stripe unit boundaries.
+ *
+ * Returns 1 in bma->aeof if the file (fork) is empty, since any new write
+ * will then be at or past EOF.
+ */
+STATIC int
+xfs_bmap_isaeof(
+       struct xfs_bmalloca     *bma,
+       int                     whichfork)
+{
+       struct xfs_bmbt_irec    rec;
+       int                     is_empty;
+       int                     error;
+
+       bma->aeof = 0;
+       error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
+                                    &is_empty);
+       if (error)
+               return error;
+
+       if (is_empty) {
+               bma->aeof = 1;
+               return 0;
+       }
+
+       /*
+        * Check if we are allocating at or past the last extent, or at least into
+        * the last delayed allocated extent.
+        */
+       bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
+               (bma->offset >= rec.br_startoff &&
+                isnullstartblock(rec.br_startblock));
+       return 0;
+}
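+
+/*
+ * Worked example (hypothetical last extent): if the last record is
+ * [startoff 100, len 10] and real, an allocation at offset 110 or
+ * beyond sets bma->aeof.  If the last record is delayed (null
+ * startblock), any allocation at offset 100 or beyond sets bma->aeof
+ * as well, since a write into the last delalloc extent still extends
+ * towards EOF.
+ */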
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file.  This is not based on i_size, it is based on the extent records.
+ * *last_block is set to 0 for local files, as they do not have extent records.
+ */
+int
+xfs_bmap_last_offset(
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           *last_block,
+       int                     whichfork)
+{
+       struct xfs_bmbt_irec    rec;
+       int                     is_empty;
+       int                     error;
+
+       *last_block = 0;
+
+       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
+               return 0;
+
+       if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+              return -EIO;
+
+       error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
+       if (error || is_empty)
+               return error;
+
+       *last_block = rec.br_startoff + rec.br_blockcount;
+       return 0;
+}
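+
+/*
+ * Worked example: if the last extent record is [startoff 10, len 4],
+ * *last_block is set to 10 + 4 = 14, the first file-relative block
+ * past the last mapping.
+ */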
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not.  For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
+ */
+int                                    /* 1=>1 block, 0=>otherwise */
+xfs_bmap_one_block(
+       xfs_inode_t     *ip,            /* incore inode */
+       int             whichfork)      /* data or attr fork */
+{
+       xfs_bmbt_rec_host_t *ep;        /* ptr to fork's extent */
+       xfs_ifork_t     *ifp;           /* inode fork pointer */
+       int             rval;           /* return value */
+       xfs_bmbt_irec_t s;              /* internal version of extent */
+
+#ifndef DEBUG
+       if (whichfork == XFS_DATA_FORK)
+               return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
+#endif /* !DEBUG */
+       if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
+               return 0;
+       if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+               return 0;
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+       ep = xfs_iext_get_ext(ifp, 0);
+       xfs_bmbt_get_all(ep, &s);
+       rval = s.br_startoff == 0 && s.br_blockcount == 1;
+       if (rval && whichfork == XFS_DATA_FORK)
+               ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
+       return rval;
+}
+
+/*
+ * Extent tree manipulation functions used during allocation.
+ */
+
+/*
+ * Convert a delayed allocation to a real allocation.
+ */
+STATIC int                             /* error */
+xfs_bmap_add_extent_delay_real(
+       struct xfs_bmalloca     *bma)
+{
+       struct xfs_bmbt_irec    *new = &bma->got;
+       int                     diff;   /* temp value */
+       xfs_bmbt_rec_host_t     *ep;    /* extent entry for idx */
+       int                     error;  /* error return value */
+       int                     i;      /* temp state */
+       xfs_ifork_t             *ifp;   /* inode fork pointer */
+       xfs_fileoff_t           new_endoff;     /* end offset of new entry */
+       xfs_bmbt_irec_t         r[3];   /* neighbor extent entries */
+                                       /* left is 0, right is 1, prev is 2 */
+       int                     rval=0; /* return value (logging flags) */
+       int                     state = 0; /* state bits, accessed through macros */
+       xfs_filblks_t           da_new; /* new count del alloc blocks used */
+       xfs_filblks_t           da_old; /* old count del alloc blocks used */
+       xfs_filblks_t           temp=0; /* value for da_new calculations */
+       xfs_filblks_t           temp2=0;/* value for da_new calculations */
+       int                     tmp_rval;       /* partial logging flags */
+
+       ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
+
+       ASSERT(bma->idx >= 0);
+       ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+       ASSERT(!isnullstartblock(new->br_startblock));
+       ASSERT(!bma->cur ||
+              (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+
+       XFS_STATS_INC(xs_add_exlist);
+
+#define        LEFT            r[0]
+#define        RIGHT           r[1]
+#define        PREV            r[2]
+
+       /*
+        * Set up a bunch of variables to make the tests simpler.
+        */
+       ep = xfs_iext_get_ext(ifp, bma->idx);
+       xfs_bmbt_get_all(ep, &PREV);
+       new_endoff = new->br_startoff + new->br_blockcount;
+       ASSERT(PREV.br_startoff <= new->br_startoff);
+       ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
+       da_old = startblockval(PREV.br_startblock);
+       da_new = 0;
+
+       /*
+        * Set flags determining what part of the previous delayed allocation
+        * extent is being replaced by a real allocation.
+        */
+       if (PREV.br_startoff == new->br_startoff)
+               state |= BMAP_LEFT_FILLING;
+       if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+               state |= BMAP_RIGHT_FILLING;
+
+       /*
+        * Check and set flags if this segment has a left neighbor.
+        * Don't set contiguous if the combined extent would be too large.
+        */
+       if (bma->idx > 0) {
+               state |= BMAP_LEFT_VALID;
+               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT);
+
+               if (isnullstartblock(LEFT.br_startblock))
+                       state |= BMAP_LEFT_DELAY;
+       }
+
+       if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+           LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+           LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+           LEFT.br_state == new->br_state &&
+           LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+               state |= BMAP_LEFT_CONTIG;
+
+       /*
+        * Check and set flags if this segment has a right neighbor.
+        * Don't set contiguous if the combined extent would be too large.
+        * Also check for all-three-contiguous being too large.
+        */
+       if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+               state |= BMAP_RIGHT_VALID;
+               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
+
+               if (isnullstartblock(RIGHT.br_startblock))
+                       state |= BMAP_RIGHT_DELAY;
+       }
+
+       if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+           new_endoff == RIGHT.br_startoff &&
+           new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+           new->br_state == RIGHT.br_state &&
+           new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+           ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+                      BMAP_RIGHT_FILLING)) !=
+                     (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+                      BMAP_RIGHT_FILLING) ||
+            LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+                       <= MAXEXTLEN))
+               state |= BMAP_RIGHT_CONTIG;
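+
+       /*
+        * Example of the contiguity rules (hypothetical numbers):
+        * LEFT = [startoff 0, startblock 100, len 8] is contiguous with
+        * new = [startoff 8, startblock 108, len 4] because the file
+        * offsets and the disk blocks both abut and the states match.
+        * The all-three clause keeps LEFT + new + RIGHT from merging
+        * into a single extent longer than MAXEXTLEN.
+        */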
+
+       error = 0;
+       /*
+        * Switch out based on the FILLING and CONTIG state bits.
+        */
+       switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+                        BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+       case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+            BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+               /*
+                * Filling in all of a previously delayed allocation extent.
+                * The left and right neighbors are both contiguous with new.
+                */
+               bma->idx--;
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+                       LEFT.br_blockcount + PREV.br_blockcount +
+                       RIGHT.br_blockcount);
+               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+               xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
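+               /* PREV was delalloc, so the three-way merge nets -1 */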
+               bma->ip->i_d.di_nextents--;
+               if (bma->cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+                       rval = XFS_ILOG_CORE;
+                       error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
+                                       RIGHT.br_startblock,
+                                       RIGHT.br_blockcount, &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       error = xfs_btree_delete(bma->cur, &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       error = xfs_btree_decrement(bma->cur, 0, &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
+                                       LEFT.br_startblock,
+                                       LEFT.br_blockcount +
+                                       PREV.br_blockcount +
+                                       RIGHT.br_blockcount, LEFT.br_state);
+                       if (error)
+                               goto done;
+               }
+               break;
+
+       case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+               /*
+                * Filling in all of a previously delayed allocation extent.
+                * The left neighbor is contiguous, the right is not.
+                */
+               bma->idx--;
+
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+                       LEFT.br_blockcount + PREV.br_blockcount);
+               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+               xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+               if (bma->cur == NULL)
+                       rval = XFS_ILOG_DEXT;
+               else {
+                       rval = 0;
+                       error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
+                                       LEFT.br_startblock, LEFT.br_blockcount,
+                                       &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
+                                       LEFT.br_startblock,
+                                       LEFT.br_blockcount +
+                                       PREV.br_blockcount, LEFT.br_state);
+                       if (error)
+                               goto done;
+               }
+               break;
+
+       case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+               /*
+                * Filling in all of a previously delayed allocation extent.
+                * The right neighbor is contiguous, the left is not.
+                */
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+               xfs_bmbt_set_startblock(ep, new->br_startblock);
+               xfs_bmbt_set_blockcount(ep,
+                       PREV.br_blockcount + RIGHT.br_blockcount);
+               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+               xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+               if (bma->cur == NULL)
+                       rval = XFS_ILOG_DEXT;
+               else {
+                       rval = 0;
+                       error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
+                                       RIGHT.br_startblock,
+                                       RIGHT.br_blockcount, &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
+                                       new->br_startblock,
+                                       PREV.br_blockcount +
+                                       RIGHT.br_blockcount, PREV.br_state);
+                       if (error)
+                               goto done;
+               }
+               break;
+
+       case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+               /*
+                * Filling in all of a previously delayed allocation extent.
+                * Neither the left nor right neighbors are contiguous with
+                * the new one.
+                */
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+               xfs_bmbt_set_startblock(ep, new->br_startblock);
+               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+               bma->ip->i_d.di_nextents++;
+               if (bma->cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+                       rval = XFS_ILOG_CORE;
+                       error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+                                       new->br_startblock, new->br_blockcount,
+                                       &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                       bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+                       error = xfs_btree_insert(bma->cur, &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+               }
+               break;
+
+       case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
+               /*
+                * Filling in the first part of a previous delayed allocation.
+                * The left neighbor is contiguous.
+                */
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1),
+                       LEFT.br_blockcount + new->br_blockcount);
+               xfs_bmbt_set_startoff(ep,
+                       PREV.br_startoff + new->br_blockcount);
+               trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
+
+               temp = PREV.br_blockcount - new->br_blockcount;
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(ep, temp);
+               if (bma->cur == NULL)
+                       rval = XFS_ILOG_DEXT;
+               else {
+                       rval = 0;
+                       error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
+                                       LEFT.br_startblock, LEFT.br_blockcount,
+                                       &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
+                                       LEFT.br_startblock,
+                                       LEFT.br_blockcount +
+                                       new->br_blockcount,
+                                       LEFT.br_state);
+                       if (error)
+                               goto done;
+               }
+               da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+                       startblockval(PREV.br_startblock));
+               xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+               bma->idx--;
+               break;
+
+       case BMAP_LEFT_FILLING:
+               /*
+                * Filling in the first part of a previous delayed allocation.
+                * The left neighbor is not contiguous.
+                */
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+               xfs_bmbt_set_startoff(ep, new_endoff);
+               temp = PREV.br_blockcount - new->br_blockcount;
+               xfs_bmbt_set_blockcount(ep, temp);
+               xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+               bma->ip->i_d.di_nextents++;
+               if (bma->cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+                       rval = XFS_ILOG_CORE;
+                       error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+                                       new->br_startblock, new->br_blockcount,
+                                       &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                       bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+                       error = xfs_btree_insert(bma->cur, &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+               }
+
+               if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+                       error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+                                       bma->firstblock, bma->flist,
+                                       &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
+                       rval |= tmp_rval;
+                       if (error)
+                               goto done;
+               }
+               da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+                       startblockval(PREV.br_startblock) -
+                       (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+               ep = xfs_iext_get_ext(ifp, bma->idx + 1);
+               xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+               trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+               break;
+
+       case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+               /*
+                * Filling in the last part of a previous delayed allocation.
+                * The right neighbor is contiguous with the new allocation.
+                */
+               temp = PREV.br_blockcount - new->br_blockcount;
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(ep, temp);
+               xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1),
+                       new->br_startoff, new->br_startblock,
+                       new->br_blockcount + RIGHT.br_blockcount,
+                       RIGHT.br_state);
+               trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+               if (bma->cur == NULL)
+                       rval = XFS_ILOG_DEXT;
+               else {
+                       rval = 0;
+                       error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
+                                       RIGHT.br_startblock,
+                                       RIGHT.br_blockcount, &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       error = xfs_bmbt_update(bma->cur, new->br_startoff,
+                                       new->br_startblock,
+                                       new->br_blockcount +
+                                       RIGHT.br_blockcount,
+                                       RIGHT.br_state);
+                       if (error)
+                               goto done;
+               }
+
+               da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+                       startblockval(PREV.br_startblock));
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+               xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+               bma->idx++;
+               break;
+
+       case BMAP_RIGHT_FILLING:
+               /*
+                * Filling in the last part of a previous delayed allocation.
+                * The right neighbor is not contiguous.
+                */
+               temp = PREV.br_blockcount - new->br_blockcount;
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(ep, temp);
+               xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
+               bma->ip->i_d.di_nextents++;
+               if (bma->cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+                       rval = XFS_ILOG_CORE;
+                       error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+                                       new->br_startblock, new->br_blockcount,
+                                       &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                       bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+                       error = xfs_btree_insert(bma->cur, &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+               }
+
+               if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+                       error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+                               bma->firstblock, bma->flist, &bma->cur, 1,
+                               &tmp_rval, XFS_DATA_FORK);
+                       rval |= tmp_rval;
+                       if (error)
+                               goto done;
+               }
+               da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+                       startblockval(PREV.br_startblock) -
+                       (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+               ep = xfs_iext_get_ext(ifp, bma->idx);
+               xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+               bma->idx++;
+               break;
+
+       case 0:
+               /*
+                * Filling in the middle part of a previous delayed allocation.
+                * Contiguity is impossible here.
+                * This case is avoided almost all the time.
+                *
+                * We start with a delayed allocation:
+                *
+                * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
+                *  PREV @ idx
+                *
+                * and we are allocating:
+                *                     +rrrrrrrrrrrrrrrrr+
+                *                            new
+                *
+                * and we set it up for insertion as:
+                * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
+                *                            new
+                *  PREV @ idx          LEFT              RIGHT
+                *                      inserted at idx + 1
+                */
+               temp = new->br_startoff - PREV.br_startoff;
+               temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_);
+               xfs_bmbt_set_blockcount(ep, temp);      /* truncate PREV */
+               LEFT = *new;
+               RIGHT.br_state = PREV.br_state;
+               RIGHT.br_startblock = nullstartblock(
+                               (int)xfs_bmap_worst_indlen(bma->ip, temp2));
+               RIGHT.br_startoff = new_endoff;
+               RIGHT.br_blockcount = temp2;
+               /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
+               xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
+               bma->ip->i_d.di_nextents++;
+               if (bma->cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+                       rval = XFS_ILOG_CORE;
+                       error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+                                       new->br_startblock, new->br_blockcount,
+                                       &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                       bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+                       error = xfs_btree_insert(bma->cur, &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+               }
+
+               if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+                       error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+                                       bma->firstblock, bma->flist, &bma->cur,
+                                       1, &tmp_rval, XFS_DATA_FORK);
+                       rval |= tmp_rval;
+                       if (error)
+                               goto done;
+               }
+               temp = xfs_bmap_worst_indlen(bma->ip, temp);
+               temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
+               diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
+                       (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+               if (diff > 0) {
+                       error = xfs_icsb_modify_counters(bma->ip->i_mount,
+                                       XFS_SBS_FDBLOCKS,
+                                       -((int64_t)diff), 0);
+                       ASSERT(!error);
+                       if (error)
+                               goto done;
+               }
+
+               ep = xfs_iext_get_ext(ifp, bma->idx);
+               xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
+               xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2),
+                       nullstartblock((int)temp2));
+               trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
+
+               bma->idx++;
+               da_new = temp + temp2;
+               break;
+
+       case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+       case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+       case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+       case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+       case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+       case BMAP_LEFT_CONTIG:
+       case BMAP_RIGHT_CONTIG:
+               /*
+                * These cases are all impossible.
+                */
+               ASSERT(0);
+       }
+
+       /* convert to a btree if necessary */
+       if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+               int     tmp_logflags;   /* partial log flag return val */
+
+               ASSERT(bma->cur == NULL);
+               error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+                               bma->firstblock, bma->flist, &bma->cur,
+                               da_old > 0, &tmp_logflags, XFS_DATA_FORK);
+               bma->logflags |= tmp_logflags;
+               if (error)
+                       goto done;
+       }
+
+       /* adjust for changes in reserved delayed indirect blocks */
+       if (da_old || da_new) {
+               temp = da_new;
+               if (bma->cur)
+                       temp += bma->cur->bc_private.b.allocated;
+               ASSERT(temp <= da_old);
+               if (temp < da_old)
+                       xfs_icsb_modify_counters(bma->ip->i_mount,
+                                       XFS_SBS_FDBLOCKS,
+                                       (int64_t)(da_old - temp), 0);
+       }
+
+       /* clear out the allocated field, done with it now in any case. */
+       if (bma->cur)
+               bma->cur->bc_private.b.allocated = 0;
+
+       xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
+done:
+       bma->logflags |= rval;
+       return error;
+#undef LEFT
+#undef RIGHT
+#undef PREV
+}
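+
+/*
+ * The switch above covers the nine reachable combinations of the
+ * FILLING/CONTIG state bits: the new allocation fills all, the first
+ * part, the last part, or the middle of the delayed extent; the
+ * all-filling case may merge with either or both neighbors, while the
+ * first- and last-part cases can merge only on the side they fill.  A
+ * CONTIG bit without the matching FILLING bit is impossible, because
+ * an allocation that does not reach that edge of PREV cannot abut the
+ * neighbor on that side; hence the ASSERT(0) arm.
+ */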
+
+/*
+ * Convert an unwritten allocation to a real allocation or vice versa.
+ */
+STATIC int                             /* error */
+xfs_bmap_add_extent_unwritten_real(
+       struct xfs_trans        *tp,
+       xfs_inode_t             *ip,    /* incore inode pointer */
+       xfs_extnum_t            *idx,   /* extent number to update/insert */
+       xfs_btree_cur_t         **curp, /* if *curp is null, not a btree */
+       xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
+       xfs_fsblock_t           *first, /* pointer to firstblock variable */
+       xfs_bmap_free_t         *flist, /* list of extents to be freed */
+       int                     *logflagsp) /* inode logging flags */
+{
+       xfs_btree_cur_t         *cur;   /* btree cursor */
+       xfs_bmbt_rec_host_t     *ep;    /* extent entry for idx */
+       int                     error;  /* error return value */
+       int                     i;      /* temp state */
+       xfs_ifork_t             *ifp;   /* inode fork pointer */
+       xfs_fileoff_t           new_endoff;     /* end offset of new entry */
+       xfs_exntst_t            newext; /* new extent state */
+       xfs_exntst_t            oldext; /* old extent state */
+       xfs_bmbt_irec_t         r[3];   /* neighbor extent entries */
+                                       /* left is 0, right is 1, prev is 2 */
+       int                     rval=0; /* return value (logging flags) */
+       int                     state = 0; /* state bits, accessed through macros */
+
+       *logflagsp = 0;
+
+       cur = *curp;
+       ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+
+       ASSERT(*idx >= 0);
+       ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+       ASSERT(!isnullstartblock(new->br_startblock));
+
+       XFS_STATS_INC(xs_add_exlist);
+
+#define        LEFT            r[0]
+#define        RIGHT           r[1]
+#define        PREV            r[2]
+
+       /*
+        * Set up a bunch of variables to make the tests simpler.
+        */
+       error = 0;
+       ep = xfs_iext_get_ext(ifp, *idx);
+       xfs_bmbt_get_all(ep, &PREV);
+       newext = new->br_state;
+       oldext = (newext == XFS_EXT_UNWRITTEN) ?
+               XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+       ASSERT(PREV.br_state == oldext);
+       new_endoff = new->br_startoff + new->br_blockcount;
+       ASSERT(PREV.br_startoff <= new->br_startoff);
+       ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
+       /*
+        * Set flags determining what part of the previous oldext allocation
+        * extent is being replaced by a newext allocation.
+        */
+       if (PREV.br_startoff == new->br_startoff)
+               state |= BMAP_LEFT_FILLING;
+       if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+               state |= BMAP_RIGHT_FILLING;
+
+       /*
+        * Check and set flags if this segment has a left neighbor.
+        * Don't set contiguous if the combined extent would be too large.
+        */
+       if (*idx > 0) {
+               state |= BMAP_LEFT_VALID;
+               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
+
+               if (isnullstartblock(LEFT.br_startblock))
+                       state |= BMAP_LEFT_DELAY;
+       }
+
+       if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+           LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+           LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+           LEFT.br_state == newext &&
+           LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+               state |= BMAP_LEFT_CONTIG;
+
+       /*
+        * Check and set flags if this segment has a right neighbor.
+        * Don't set contiguous if the combined extent would be too large.
+        * Also check for all-three-contiguous being too large.
+        */
+       if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+               state |= BMAP_RIGHT_VALID;
+               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
+               if (isnullstartblock(RIGHT.br_startblock))
+                       state |= BMAP_RIGHT_DELAY;
+       }
+
+       if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+           new_endoff == RIGHT.br_startoff &&
+           new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+           newext == RIGHT.br_state &&
+           new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+           ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+                      BMAP_RIGHT_FILLING)) !=
+                     (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+                      BMAP_RIGHT_FILLING) ||
+            LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+                       <= MAXEXTLEN))
+               state |= BMAP_RIGHT_CONTIG;
+
+       /*
+        * Switch out based on the FILLING and CONTIG state bits.
+        */
+       switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+                        BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+       case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+            BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+               /*
+                * Setting all of a previous oldext extent to newext.
+                * The left and right neighbors are both contiguous with new.
+                */
+               --*idx;
+
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+                       LEFT.br_blockcount + PREV.br_blockcount +
+                       RIGHT.br_blockcount);
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+               xfs_iext_remove(ip, *idx + 1, 2, state);
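+               /* all three records were real, so the merge nets -2 */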
+               ip->i_d.di_nextents -= 2;
+               if (cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+                       rval = XFS_ILOG_CORE;
+                       if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+                                       RIGHT.br_startblock,
+                                       RIGHT.br_blockcount, &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_btree_delete(cur, &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_btree_delete(cur, &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+                               LEFT.br_startblock,
+                               LEFT.br_blockcount + PREV.br_blockcount +
+                               RIGHT.br_blockcount, LEFT.br_state)))
+                               goto done;
+               }
+               break;
+
+       case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+               /*
+                * Setting all of a previous oldext extent to newext.
+                * The left neighbor is contiguous, the right is not.
+                */
+               --*idx;
+
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+                       LEFT.br_blockcount + PREV.br_blockcount);
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+               xfs_iext_remove(ip, *idx + 1, 1, state);
+               ip->i_d.di_nextents--;
+               if (cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+                       rval = XFS_ILOG_CORE;
+                       if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+                                       PREV.br_startblock, PREV.br_blockcount,
+                                       &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_btree_delete(cur, &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+                               LEFT.br_startblock,
+                               LEFT.br_blockcount + PREV.br_blockcount,
+                               LEFT.br_state)))
+                               goto done;
+               }
+               break;
+
+       case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+               /*
+                * Setting all of a previous oldext extent to newext.
+                * The right neighbor is contiguous, the left is not.
+                */
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(ep,
+                       PREV.br_blockcount + RIGHT.br_blockcount);
+               xfs_bmbt_set_state(ep, newext);
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+               xfs_iext_remove(ip, *idx + 1, 1, state);
+               ip->i_d.di_nextents--;
+               if (cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+                       rval = XFS_ILOG_CORE;
+                       if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+                                       RIGHT.br_startblock,
+                                       RIGHT.br_blockcount, &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_btree_delete(cur, &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_bmbt_update(cur, new->br_startoff,
+                               new->br_startblock,
+                               new->br_blockcount + RIGHT.br_blockcount,
+                               newext)))
+                               goto done;
+               }
+               break;
+
+       case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+               /*
+                * Setting all of a previous oldext extent to newext.
+                * Neither the left nor right neighbors are contiguous with
+                * the new one.
+                */
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               xfs_bmbt_set_state(ep, newext);
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+               if (cur == NULL)
+                       rval = XFS_ILOG_DEXT;
+               else {
+                       rval = 0;
+                       if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+                                       new->br_startblock, new->br_blockcount,
+                                       &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_bmbt_update(cur, new->br_startoff,
+                               new->br_startblock, new->br_blockcount,
+                               newext)))
+                               goto done;
+               }
+               break;
+
+       case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
+               /*
+                * Setting the first part of a previous oldext extent to newext.
+                * The left neighbor is contiguous.
+                */
+               trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
+                       LEFT.br_blockcount + new->br_blockcount);
+               xfs_bmbt_set_startoff(ep,
+                       PREV.br_startoff + new->br_blockcount);
+               trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
+
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               xfs_bmbt_set_startblock(ep,
+                       new->br_startblock + new->br_blockcount);
+               xfs_bmbt_set_blockcount(ep,
+                       PREV.br_blockcount - new->br_blockcount);
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+               --*idx;
+
+               if (cur == NULL)
+                       rval = XFS_ILOG_DEXT;
+               else {
+                       rval = 0;
+                       if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+                                       PREV.br_startblock, PREV.br_blockcount,
+                                       &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_bmbt_update(cur,
+                               PREV.br_startoff + new->br_blockcount,
+                               PREV.br_startblock + new->br_blockcount,
+                               PREV.br_blockcount - new->br_blockcount,
+                               oldext)))
+                               goto done;
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
+                               goto done;
+                       error = xfs_bmbt_update(cur, LEFT.br_startoff,
+                               LEFT.br_startblock,
+                               LEFT.br_blockcount + new->br_blockcount,
+                               LEFT.br_state);
+                       if (error)
+                               goto done;
+               }
+               break;
+
+       case BMAP_LEFT_FILLING:
+               /*
+                * Setting the first part of a previous oldext extent to newext.
+                * The left neighbor is not contiguous.
+                */
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
+               xfs_bmbt_set_startoff(ep, new_endoff);
+               xfs_bmbt_set_blockcount(ep,
+                       PREV.br_blockcount - new->br_blockcount);
+               xfs_bmbt_set_startblock(ep,
+                       new->br_startblock + new->br_blockcount);
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+               xfs_iext_insert(ip, *idx, 1, new, state);
+               ip->i_d.di_nextents++;
+               if (cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+                       rval = XFS_ILOG_CORE;
+                       if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+                                       PREV.br_startblock, PREV.br_blockcount,
+                                       &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_bmbt_update(cur,
+                               PREV.br_startoff + new->br_blockcount,
+                               PREV.br_startblock + new->br_blockcount,
+                               PREV.br_blockcount - new->br_blockcount,
+                               oldext)))
+                               goto done;
+                       cur->bc_rec.b = *new;
+                       if ((error = xfs_btree_insert(cur, &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+               }
+               break;
+
+       case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+               /*
+                * Setting the last part of a previous oldext extent to newext.
+                * The right neighbor is contiguous with the new allocation.
+                */
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(ep,
+                       PREV.br_blockcount - new->br_blockcount);
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+               ++*idx;
+
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+                       new->br_startoff, new->br_startblock,
+                       new->br_blockcount + RIGHT.br_blockcount, newext);
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+               if (cur == NULL)
+                       rval = XFS_ILOG_DEXT;
+               else {
+                       rval = 0;
+                       if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+                                       PREV.br_startblock,
+                                       PREV.br_blockcount, &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+                               PREV.br_startblock,
+                               PREV.br_blockcount - new->br_blockcount,
+                               oldext)))
+                               goto done;
+                       if ((error = xfs_btree_increment(cur, 0, &i)))
+                               goto done;
+                       if ((error = xfs_bmbt_update(cur, new->br_startoff,
+                               new->br_startblock,
+                               new->br_blockcount + RIGHT.br_blockcount,
+                               newext)))
+                               goto done;
+               }
+               break;
+
+       case BMAP_RIGHT_FILLING:
+               /*
+                * Setting the last part of a previous oldext extent to newext.
+                * The right neighbor is not contiguous.
+                */
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(ep,
+                       PREV.br_blockcount - new->br_blockcount);
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+               ++*idx;
+               xfs_iext_insert(ip, *idx, 1, new, state);
+
+               ip->i_d.di_nextents++;
+               if (cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+                       rval = XFS_ILOG_CORE;
+                       if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+                                       PREV.br_startblock, PREV.br_blockcount,
+                                       &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+                               PREV.br_startblock,
+                               PREV.br_blockcount - new->br_blockcount,
+                               oldext)))
+                               goto done;
+                       if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+                                       new->br_startblock, new->br_blockcount,
+                                       &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                       cur->bc_rec.b.br_state = XFS_EXT_NORM;
+                       if ((error = xfs_btree_insert(cur, &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+               }
+               break;
+
+       case 0:
+               /*
+                * Setting the middle part of a previous oldext extent to
+                * newext.  Contiguity is impossible here.
+                * One extent becomes three extents.
+                */
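+               /*
+                * For example (hypothetical numbers): converting the
+                * middle of PREV = [startoff 0, len 12, oldext] with
+                * new = [startoff 4, len 4] leaves three records:
+                * [0, 4] oldext, [4, 4] newext (r[0]) and
+                * [8, 4] oldext (r[1]).
+                */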
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(ep,
+                       new->br_startoff - PREV.br_startoff);
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+               r[0] = *new;
+               r[1].br_startoff = new_endoff;
+               r[1].br_blockcount =
+                       PREV.br_startoff + PREV.br_blockcount - new_endoff;
+               r[1].br_startblock = new->br_startblock + new->br_blockcount;
+               r[1].br_state = oldext;
+
+               ++*idx;
+               xfs_iext_insert(ip, *idx, 2, &r[0], state);
+
+               ip->i_d.di_nextents += 2;
+               if (cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+                       rval = XFS_ILOG_CORE;
+                       if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+                                       PREV.br_startblock, PREV.br_blockcount,
+                                       &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       /* new right extent - oldext */
+                       if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
+                               r[1].br_startblock, r[1].br_blockcount,
+                               r[1].br_state)))
+                               goto done;
+                       /* new left extent - oldext */
+                       cur->bc_rec.b = PREV;
+                       cur->bc_rec.b.br_blockcount =
+                               new->br_startoff - PREV.br_startoff;
+                       if ((error = xfs_btree_insert(cur, &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       /*
+                        * Reset the cursor to the position of the new extent
+                        * we are about to insert as we can't trust it after
+                        * the previous insert.
+                        */
+                       if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+                                       new->br_startblock, new->br_blockcount,
+                                       &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                       /* new middle extent - newext */
+                       cur->bc_rec.b.br_state = new->br_state;
+                       if ((error = xfs_btree_insert(cur, &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+               }
+               break;
+
+       case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+       case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+       case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+       case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+       case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+       case BMAP_LEFT_CONTIG:
+       case BMAP_RIGHT_CONTIG:
+               /*
+                * These cases are all impossible.
+                */
+               ASSERT(0);
+       }
+
+       /* convert to a btree if necessary */
+       if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
+               int     tmp_logflags;   /* partial log flag return val */
+
+               ASSERT(cur == NULL);
+               error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur,
+                               0, &tmp_logflags, XFS_DATA_FORK);
+               *logflagsp |= tmp_logflags;
+               if (error)
+                       goto done;
+       }
+
+       /* clear out the allocated field, done with it now in any case. */
+       if (cur) {
+               cur->bc_private.b.allocated = 0;
+               *curp = cur;
+       }
+
+       xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
+done:
+       *logflagsp |= rval;
+       return error;
+#undef LEFT
+#undef RIGHT
+#undef PREV
+}
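+
+/*
+ * A worked example of the state bits driving the switch above: converting
+ * the last block of an unwritten extent whose right neighbor is written,
+ * adjacent and under MAXEXTLEN sets BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG,
+ * so the converted range is merged into the right neighbor rather than
+ * inserted as a new record.  Combinations such as BMAP_LEFT_CONTIG on its
+ * own can never be built (a non-filling conversion leaves oldext blocks
+ * between the new range and the neighbor), which is why they ASSERT(0).
+ */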
+
+/*
+ * Convert a hole to a delayed allocation.
+ */
+STATIC void
+xfs_bmap_add_extent_hole_delay(
+       xfs_inode_t             *ip,    /* incore inode pointer */
+       xfs_extnum_t            *idx,   /* extent number to update/insert */
+       xfs_bmbt_irec_t         *new)   /* new data to add to file extents */
+{
+       xfs_ifork_t             *ifp;   /* inode fork pointer */
+       xfs_bmbt_irec_t         left;   /* left neighbor extent entry */
+       xfs_filblks_t           newlen = 0;     /* new indirect size */
+       xfs_filblks_t           oldlen = 0;     /* old indirect size */
+       xfs_bmbt_irec_t         right;  /* right neighbor extent entry */
+       int                     state;  /* state bits, accessed thru macros */
+       xfs_filblks_t           temp = 0;       /* temp for indirect calculations */
+
+       ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+       state = 0;
+       ASSERT(isnullstartblock(new->br_startblock));
+
+       /*
+        * Check and set flags if this segment has a left neighbor.
+        */
+       if (*idx > 0) {
+               state |= BMAP_LEFT_VALID;
+               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
+
+               if (isnullstartblock(left.br_startblock))
+                       state |= BMAP_LEFT_DELAY;
+       }
+
+       /*
+        * Check and set flags if the current (right) segment exists.
+        * If it doesn't exist, we're converting the hole at end-of-file.
+        */
+       if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+               state |= BMAP_RIGHT_VALID;
+               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
+
+               if (isnullstartblock(right.br_startblock))
+                       state |= BMAP_RIGHT_DELAY;
+       }
+
+       /*
+        * Set contiguity flags on the left and right neighbors.
+        * Don't let extents get too large, even if the pieces are contiguous.
+        */
+       if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
+           left.br_startoff + left.br_blockcount == new->br_startoff &&
+           left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+               state |= BMAP_LEFT_CONTIG;
+
+       if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
+           new->br_startoff + new->br_blockcount == right.br_startoff &&
+           new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+           (!(state & BMAP_LEFT_CONTIG) ||
+            (left.br_blockcount + new->br_blockcount +
+             right.br_blockcount <= MAXEXTLEN)))
+               state |= BMAP_RIGHT_CONTIG;
+
+       /*
+        * Switch out based on the contiguity flags.
+        */
+       switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+       case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+               /*
+                * New allocation is contiguous with delayed allocations
+                * on the left and on the right.
+                * Merge all three into a single extent record.
+                */
+               --*idx;
+               temp = left.br_blockcount + new->br_blockcount +
+                       right.br_blockcount;
+
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
+               oldlen = startblockval(left.br_startblock) +
+                       startblockval(new->br_startblock) +
+                       startblockval(right.br_startblock);
+               newlen = xfs_bmap_worst_indlen(ip, temp);
+               xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+                       nullstartblock((int)newlen));
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+               xfs_iext_remove(ip, *idx + 1, 1, state);
+               break;
+
+       case BMAP_LEFT_CONTIG:
+               /*
+                * New allocation is contiguous with a delayed allocation
+                * on the left.
+                * Merge the new allocation with the left neighbor.
+                */
+               --*idx;
+               temp = left.br_blockcount + new->br_blockcount;
+
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
+               oldlen = startblockval(left.br_startblock) +
+                       startblockval(new->br_startblock);
+               newlen = xfs_bmap_worst_indlen(ip, temp);
+               xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+                       nullstartblock((int)newlen));
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+               break;
+
+       case BMAP_RIGHT_CONTIG:
+               /*
+                * New allocation is contiguous with a delayed allocation
+                * on the right.
+                * Merge the new allocation with the right neighbor.
+                */
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               temp = new->br_blockcount + right.br_blockcount;
+               oldlen = startblockval(new->br_startblock) +
+                       startblockval(right.br_startblock);
+               newlen = xfs_bmap_worst_indlen(ip, temp);
+               xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+                       new->br_startoff,
+                       nullstartblock((int)newlen), temp, right.br_state);
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+               break;
+
+       case 0:
+               /*
+                * New allocation is not contiguous with another
+                * delayed allocation.
+                * Insert a new entry.
+                */
+               oldlen = newlen = 0;
+               xfs_iext_insert(ip, *idx, 1, new, state);
+               break;
+       }
+       if (oldlen != newlen) {
+               ASSERT(oldlen > newlen);
+               xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
+                       (int64_t)(oldlen - newlen), 0);
+               /*
+                * Nothing to do for disk quota accounting here.
+                */
+       }
+}
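+
+/*
+ * Worked example of the indirect-block rebalancing above: merging a left
+ * delalloc extent of 8 blocks with a new 4 block reservation yields one
+ * 12 block extent.  oldlen is the sum of the two existing worst-case
+ * indirect reservations, newlen is the worst case for the merged extent;
+ * since one larger extent never needs more bmap btree blocks than the
+ * pieces did separately, oldlen >= newlen and the surplus is handed back
+ * to the free block counter via xfs_icsb_modify_counters().
+ */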
+
+/*
+ * Convert a hole to a real allocation.
+ */
+STATIC int                             /* error */
+xfs_bmap_add_extent_hole_real(
+       struct xfs_bmalloca     *bma,
+       int                     whichfork)
+{
+       struct xfs_bmbt_irec    *new = &bma->got;
+       int                     error;  /* error return value */
+       int                     i;      /* temp state */
+       xfs_ifork_t             *ifp;   /* inode fork pointer */
+       xfs_bmbt_irec_t         left;   /* left neighbor extent entry */
+       xfs_bmbt_irec_t         right;  /* right neighbor extent entry */
+       int                     rval = 0; /* return value (logging flags) */
+       int                     state;  /* state bits, accessed thru macros */
+
+       ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+
+       ASSERT(bma->idx >= 0);
+       ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+       ASSERT(!isnullstartblock(new->br_startblock));
+       ASSERT(!bma->cur ||
+              !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+
+       XFS_STATS_INC(xs_add_exlist);
+
+       state = 0;
+       if (whichfork == XFS_ATTR_FORK)
+               state |= BMAP_ATTRFORK;
+
+       /*
+        * Check and set flags if this segment has a left neighbor.
+        */
+       if (bma->idx > 0) {
+               state |= BMAP_LEFT_VALID;
+               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
+               if (isnullstartblock(left.br_startblock))
+                       state |= BMAP_LEFT_DELAY;
+       }
+
+       /*
+        * Check and set flags if this segment has a current value.
+        * Not true if we're inserting into the "hole" at eof.
+        */
+       if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+               state |= BMAP_RIGHT_VALID;
+               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
+               if (isnullstartblock(right.br_startblock))
+                       state |= BMAP_RIGHT_DELAY;
+       }
+
+       /*
+        * We're inserting a real allocation between "left" and "right".
+        * Set the contiguity flags.  Don't let extents get too large.
+        */
+       if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+           left.br_startoff + left.br_blockcount == new->br_startoff &&
+           left.br_startblock + left.br_blockcount == new->br_startblock &&
+           left.br_state == new->br_state &&
+           left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+               state |= BMAP_LEFT_CONTIG;
+
+       if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+           new->br_startoff + new->br_blockcount == right.br_startoff &&
+           new->br_startblock + new->br_blockcount == right.br_startblock &&
+           new->br_state == right.br_state &&
+           new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+           (!(state & BMAP_LEFT_CONTIG) ||
+            left.br_blockcount + new->br_blockcount +
+            right.br_blockcount <= MAXEXTLEN))
+               state |= BMAP_RIGHT_CONTIG;
+
+       error = 0;
+       /*
+        * Select which case we're in here, and implement it.
+        */
+       switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+       case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+               /*
+                * New allocation is contiguous with real allocations on the
+                * left and on the right.
+                * Merge all three into a single extent record.
+                */
+               --bma->idx;
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+                       left.br_blockcount + new->br_blockcount +
+                       right.br_blockcount);
+               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+               xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+
+               XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+                       XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
+               if (bma->cur == NULL) {
+                       rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+               } else {
+                       rval = XFS_ILOG_CORE;
+                       error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
+                                       right.br_startblock, right.br_blockcount,
+                                       &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       error = xfs_btree_delete(bma->cur, &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       error = xfs_btree_decrement(bma->cur, 0, &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       error = xfs_bmbt_update(bma->cur, left.br_startoff,
+                                       left.br_startblock,
+                                       left.br_blockcount +
+                                               new->br_blockcount +
+                                               right.br_blockcount,
+                                       left.br_state);
+                       if (error)
+                               goto done;
+               }
+               break;
+
+       case BMAP_LEFT_CONTIG:
+               /*
+                * New allocation is contiguous with a real allocation
+                * on the left.
+                * Merge the new allocation with the left neighbor.
+                */
+               --bma->idx;
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+                       left.br_blockcount + new->br_blockcount);
+               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+               if (bma->cur == NULL) {
+                       rval = xfs_ilog_fext(whichfork);
+               } else {
+                       rval = 0;
+                       error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
+                                       left.br_startblock, left.br_blockcount,
+                                       &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       error = xfs_bmbt_update(bma->cur, left.br_startoff,
+                                       left.br_startblock,
+                                       left.br_blockcount +
+                                               new->br_blockcount,
+                                       left.br_state);
+                       if (error)
+                               goto done;
+               }
+               break;
+
+       case BMAP_RIGHT_CONTIG:
+               /*
+                * New allocation is contiguous with a real allocation
+                * on the right.
+                * Merge the new allocation with the right neighbor.
+                */
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+               xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
+                       new->br_startoff, new->br_startblock,
+                       new->br_blockcount + right.br_blockcount,
+                       right.br_state);
+               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+               if (bma->cur == NULL) {
+                       rval = xfs_ilog_fext(whichfork);
+               } else {
+                       rval = 0;
+                       error = xfs_bmbt_lookup_eq(bma->cur,
+                                       right.br_startoff,
+                                       right.br_startblock,
+                                       right.br_blockcount, &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       error = xfs_bmbt_update(bma->cur, new->br_startoff,
+                                       new->br_startblock,
+                                       new->br_blockcount +
+                                               right.br_blockcount,
+                                       right.br_state);
+                       if (error)
+                               goto done;
+               }
+               break;
+
+       case 0:
+               /*
+                * New allocation is not contiguous with another
+                * real allocation.
+                * Insert a new entry.
+                */
+               xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+               XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+                       XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
+               if (bma->cur == NULL) {
+                       rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+               } else {
+                       rval = XFS_ILOG_CORE;
+                       error = xfs_bmbt_lookup_eq(bma->cur,
+                                       new->br_startoff,
+                                       new->br_startblock,
+                                       new->br_blockcount, &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                       bma->cur->bc_rec.b.br_state = new->br_state;
+                       error = xfs_btree_insert(bma->cur, &i);
+                       if (error)
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+               }
+               break;
+       }
+
+       /* convert to a btree if necessary */
+       if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+               int     tmp_logflags;   /* partial log flag return val */
+
+               ASSERT(bma->cur == NULL);
+               error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+                               bma->firstblock, bma->flist, &bma->cur,
+                               0, &tmp_logflags, whichfork);
+               bma->logflags |= tmp_logflags;
+               if (error)
+                       goto done;
+       }
+
+       /* clear out the allocated field, done with it now in any case. */
+       if (bma->cur)
+               bma->cur->bc_private.b.allocated = 0;
+
+       xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
+done:
+       bma->logflags |= rval;
+       return error;
+}
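+
+/*
+ * Example of the contiguity rules above: a left neighbor covering file
+ * blocks [0, 10) at fsb 100 and a new allocation covering [10, 14) at
+ * fsb 110 with matching state sets BMAP_LEFT_CONTIG, so the existing
+ * record is extended in place.  With no btree cursor only the extent
+ * list needs logging (xfs_ilog_fext()); the inode core is untouched
+ * because di_nextents does not change.
+ */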
+
+/*
+ * Functions used in the extent read, allocate and remove paths
+ */
+
+/*
+ * Adjust the size of the new extent based on di_extsize and rt extsize.
+ */
+int
+xfs_bmap_extsize_align(
+       xfs_mount_t     *mp,
+       xfs_bmbt_irec_t *gotp,          /* next extent pointer */
+       xfs_bmbt_irec_t *prevp,         /* previous extent pointer */
+       xfs_extlen_t    extsz,          /* align to this extent size */
+       int             rt,             /* is this a realtime inode? */
+       int             eof,            /* is extent at end-of-file? */
+       int             delay,          /* creating delalloc extent? */
+       int             convert,        /* overwriting unwritten extent? */
+       xfs_fileoff_t   *offp,          /* in/out: aligned offset */
+       xfs_extlen_t    *lenp)          /* in/out: aligned length */
+{
+       xfs_fileoff_t   orig_off;       /* original offset */
+       xfs_extlen_t    orig_alen;      /* original length */
+       xfs_fileoff_t   orig_end;       /* original off+len */
+       xfs_fileoff_t   nexto;          /* next file offset */
+       xfs_fileoff_t   prevo;          /* previous file offset */
+       xfs_fileoff_t   align_off;      /* temp for offset */
+       xfs_extlen_t    align_alen;     /* temp for length */
+       xfs_extlen_t    temp;           /* temp for calculations */
+
+       if (convert)
+               return 0;
+
+       orig_off = align_off = *offp;
+       orig_alen = align_alen = *lenp;
+       orig_end = orig_off + orig_alen;
+
+       /*
+        * If this request overlaps an existing extent, then don't
+        * attempt to perform any additional alignment.
+        */
+       if (!delay && !eof &&
+           (orig_off >= gotp->br_startoff) &&
+           (orig_end <= gotp->br_startoff + gotp->br_blockcount)) {
+               return 0;
+       }
+
+       /*
+        * If the file offset is unaligned vs. the extent size
+        * we need to align it.  This will be possible unless
+        * the file was previously written with a kernel that didn't
+        * perform this alignment, or if a truncate shot us in the
+        * foot.
+        */
+       temp = do_mod(orig_off, extsz);
+       if (temp) {
+               align_alen += temp;
+               align_off -= temp;
+       }
+       /*
+        * Same adjustment for the end of the requested area.
+        */
+       if ((temp = (align_alen % extsz))) {
+               align_alen += extsz - temp;
+       }
+       /*
+        * If the previous block overlaps with this proposed allocation
+        * then move the start forward without adjusting the length.
+        */
+       if (prevp->br_startoff != NULLFILEOFF) {
+               if (prevp->br_startblock == HOLESTARTBLOCK)
+                       prevo = prevp->br_startoff;
+               else
+                       prevo = prevp->br_startoff + prevp->br_blockcount;
+       } else
+               prevo = 0;
+       if (align_off != orig_off && align_off < prevo)
+               align_off = prevo;
+       /*
+        * If the next block overlaps with this proposed allocation
+        * then move the start back without adjusting the length,
+        * but not before offset 0.
+        * This may of course make the start overlap the previous block,
+        * and if we hit the offset 0 limit then the next block
+        * can still overlap too.
+        */
+       if (!eof && gotp->br_startoff != NULLFILEOFF) {
+               if ((delay && gotp->br_startblock == HOLESTARTBLOCK) ||
+                   (!delay && gotp->br_startblock == DELAYSTARTBLOCK))
+                       nexto = gotp->br_startoff + gotp->br_blockcount;
+               else
+                       nexto = gotp->br_startoff;
+       } else
+               nexto = NULLFILEOFF;
+       if (!eof &&
+           align_off + align_alen != orig_end &&
+           align_off + align_alen > nexto)
+               align_off = nexto > align_alen ? nexto - align_alen : 0;
+       /*
+        * If we're now overlapping the next or previous extent that
+        * means we can't fit an extsz piece in this hole.  Just move
+        * the start forward to the first valid spot and set
+        * the length so we hit the end.
+        */
+       if (align_off != orig_off && align_off < prevo)
+               align_off = prevo;
+       if (align_off + align_alen != orig_end &&
+           align_off + align_alen > nexto &&
+           nexto != NULLFILEOFF) {
+               ASSERT(nexto > prevo);
+               align_alen = nexto - align_off;
+       }
+
+       /*
+        * If realtime, and the result isn't a multiple of the realtime
+        * extent size, we need to remove blocks until it is.
+        */
+       if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) {
+               /*
+                * We're not covering the original request, or
+                * we won't be able to once we fix the length.
+                */
+               if (orig_off < align_off ||
+                   orig_end > align_off + align_alen ||
+                   align_alen - temp < orig_alen)
+                       return -EINVAL;
+               /*
+                * Try to fix it by moving the start up.
+                */
+               if (align_off + temp <= orig_off) {
+                       align_alen -= temp;
+                       align_off += temp;
+               }
+               /*
+                * Try to fix it by moving the end in.
+                */
+               else if (align_off + align_alen - temp >= orig_end)
+                       align_alen -= temp;
+               /*
+                * Set the start to the minimum then trim the length.
+                */
+               else {
+                       align_alen -= orig_off - align_off;
+                       align_off = orig_off;
+                       align_alen -= align_alen % mp->m_sb.sb_rextsize;
+               }
+               /*
+                * Result doesn't cover the request, fail it.
+                */
+               if (orig_off < align_off || orig_end > align_off + align_alen)
+                       return -EINVAL;
+       } else {
+               ASSERT(orig_off >= align_off);
+               ASSERT(orig_end <= align_off + align_alen);
+       }
+
+#ifdef DEBUG
+       if (!eof && gotp->br_startoff != NULLFILEOFF)
+               ASSERT(align_off + align_alen <= gotp->br_startoff);
+       if (prevp->br_startoff != NULLFILEOFF)
+               ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount);
+#endif
+
+       *lenp = align_alen;
+       *offp = align_off;
+       return 0;
+}
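+
+/*
+ * Alignment example: with extsz = 16, a request for offset 21, length 10
+ * first has its start rounded down (temp = 21 % 16 = 5, so align_off = 16
+ * and align_alen = 15) and then its end rounded up (15 % 16 = 15, so
+ * align_alen becomes 16).  The aligned request [16, 32) fully covers the
+ * original [21, 31), which the DEBUG asserts above then check against the
+ * neighboring extents.
+ */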
+
+#define XFS_ALLOC_GAP_UNITS    4
+
+void
+xfs_bmap_adjacent(
+       struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
+{
+       xfs_fsblock_t   adjust;         /* adjustment to block numbers */
+       xfs_agnumber_t  fb_agno;        /* ag number of ap->firstblock */
+       xfs_mount_t     *mp;            /* mount point structure */
+       int             nullfb;         /* true if ap->firstblock isn't set */
+       int             rt;             /* true if inode is realtime */
+
+#define        ISVALID(x,y)    \
+       (rt ? \
+               (x) < mp->m_sb.sb_rblocks : \
+               XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \
+               XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
+               XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
+
+       mp = ap->ip->i_mount;
+       nullfb = *ap->firstblock == NULLFSBLOCK;
+       rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
+       fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
+       /*
+        * If allocating at eof, and there's a previous real block,
+        * try to use its last block as our starting point.
+        */
+       if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
+           !isnullstartblock(ap->prev.br_startblock) &&
+           ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
+                   ap->prev.br_startblock)) {
+               ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
+               /*
+                * Adjust for the gap between prevp and us.
+                */
+               adjust = ap->offset -
+                       (ap->prev.br_startoff + ap->prev.br_blockcount);
+               if (adjust &&
+                   ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
+                       ap->blkno += adjust;
+       }
+       /*
+        * If not at eof, then compare the two neighbor blocks.
+        * Figure out whether either one gives us a good starting point,
+        * and pick the better one.
+        */
+       else if (!ap->eof) {
+               xfs_fsblock_t   gotbno;         /* right side block number */
+               xfs_fsblock_t   gotdiff = 0;    /* right side difference */
+               xfs_fsblock_t   prevbno;        /* left side block number */
+               xfs_fsblock_t   prevdiff = 0;   /* left side difference */
+
+               /*
+                * If there's a previous (left) block, select a requested
+                * start block based on it.
+                */
+               if (ap->prev.br_startoff != NULLFILEOFF &&
+                   !isnullstartblock(ap->prev.br_startblock) &&
+                   (prevbno = ap->prev.br_startblock +
+                              ap->prev.br_blockcount) &&
+                   ISVALID(prevbno, ap->prev.br_startblock)) {
+                       /*
+                        * Calculate gap to end of previous block.
+                        */
+                       adjust = prevdiff = ap->offset -
+                               (ap->prev.br_startoff +
+                                ap->prev.br_blockcount);
+                       /*
+                        * Figure the startblock based on the previous block's
+                        * end and the gap size.
+                        * Heuristic!
+                        * If the gap is large relative to the piece we're
+                        * allocating, or using it gives us an invalid block
+                        * number, then just use the end of the previous block.
+                        */
+                       if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
+                           ISVALID(prevbno + prevdiff,
+                                   ap->prev.br_startblock))
+                               prevbno += adjust;
+                       else
+                               prevdiff += adjust;
+                       /*
+                        * If the firstblock forbids it, can't use it,
+                        * must use default.
+                        */
+                       if (!rt && !nullfb &&
+                           XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno)
+                               prevbno = NULLFSBLOCK;
+               }
+               /*
+                * No previous block or can't follow it, just default.
+                */
+               else
+                       prevbno = NULLFSBLOCK;
+               /*
+                * If there's a following (right) block, select a requested
+                * start block based on it.
+                */
+               if (!isnullstartblock(ap->got.br_startblock)) {
+                       /*
+                        * Calculate gap to start of next block.
+                        */
+                       adjust = gotdiff = ap->got.br_startoff - ap->offset;
+                       /*
+                        * Figure the startblock based on the next block's
+                        * start and the gap size.
+                        */
+                       gotbno = ap->got.br_startblock;
+                       /*
+                        * Heuristic!
+                        * If the gap is large relative to the piece we're
+                        * allocating, or using it gives us an invalid block
+                        * number, then just use the start of the next block
+                        * offset by our length.
+                        */
+                       if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
+                           ISVALID(gotbno - gotdiff, gotbno))
+                               gotbno -= adjust;
+                       else if (ISVALID(gotbno - ap->length, gotbno)) {
+                               gotbno -= ap->length;
+                               gotdiff += adjust - ap->length;
+                       } else
+                               gotdiff += adjust;
+                       /*
+                        * If the firstblock forbids it, can't use it,
+                        * must use default.
+                        */
+                       if (!rt && !nullfb &&
+                           XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno)
+                               gotbno = NULLFSBLOCK;
+               }
+               /*
+                * No next block, just default.
+                */
+               else
+                       gotbno = NULLFSBLOCK;
+               /*
+                * If both valid, pick the better one, else the only good
+                * one, else ap->blkno is already set (to 0 or the inode block).
+                */
+               if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
+                       ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
+               else if (prevbno != NULLFSBLOCK)
+                       ap->blkno = prevbno;
+               else if (gotbno != NULLFSBLOCK)
+                       ap->blkno = gotbno;
+       }
+#undef ISVALID
+}
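+
+/*
+ * Example of the XFS_ALLOC_GAP_UNITS heuristic above: if the previous
+ * extent ends at fsb 1000 and there is a 3 block hole in the file before
+ * the 8 block range being allocated, the gap (3) is well under
+ * XFS_ALLOC_GAP_UNITS * 8, so the requested start becomes fsb 1003.
+ * Leaving an equal-sized gap on disk keeps the file physically
+ * contiguous if the hole is filled later.
+ */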
+
+static int
+xfs_bmap_longest_free_extent(
+       struct xfs_trans        *tp,
+       xfs_agnumber_t          ag,
+       xfs_extlen_t            *blen,
+       int                     *notinit)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       struct xfs_perag        *pag;
+       xfs_extlen_t            longest;
+       int                     error = 0;
+
+       pag = xfs_perag_get(mp, ag);
+       if (!pag->pagf_init) {
+               error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
+               if (error)
+                       goto out;
+
+               if (!pag->pagf_init) {
+                       *notinit = 1;
+                       goto out;
+               }
+       }
+
+       longest = xfs_alloc_longest_free_extent(mp, pag);
+       if (*blen < longest)
+               *blen = longest;
+
+out:
+       xfs_perag_put(pag);
+       return error;
+}
+
+static void
+xfs_bmap_select_minlen(
+       struct xfs_bmalloca     *ap,
+       struct xfs_alloc_arg    *args,
+       xfs_extlen_t            *blen,
+       int                     notinit)
+{
+       if (notinit || *blen < ap->minlen) {
+               /*
+                * Since the AGF was read with XFS_ALLOC_FLAG_TRYLOCK in
+                * xfs_bmap_longest_free_extent(), it is still possible that
+                * there is space for this request.
+                */
+               args->minlen = ap->minlen;
+       } else if (*blen < args->maxlen) {
+               /*
+                * If the best seen length is less than the request length,
+                * use the best as the minimum.
+                */
+               args->minlen = *blen;
+       } else {
+               /*
+                * Otherwise we've seen an extent as big as maxlen, use that
+                * as the minimum.
+                */
+               args->minlen = args->maxlen;
+       }
+}
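+
+/*
+ * The minlen selection above reduces to:
+ *
+ *     notinit or *blen < ap->minlen  -> args->minlen = ap->minlen
+ *     ap->minlen <= *blen < maxlen   -> args->minlen = *blen
+ *     *blen >= maxlen                -> args->minlen = args->maxlen
+ */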
+
+STATIC int
+xfs_bmap_btalloc_nullfb(
+       struct xfs_bmalloca     *ap,
+       struct xfs_alloc_arg    *args,
+       xfs_extlen_t            *blen)
+{
+       struct xfs_mount        *mp = ap->ip->i_mount;
+       xfs_agnumber_t          ag, startag;
+       int                     notinit = 0;
+       int                     error;
+
+       args->type = XFS_ALLOCTYPE_START_BNO;
+       args->total = ap->total;
+
+       startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+       if (startag == NULLAGNUMBER)
+               startag = ag = 0;
+
+       while (*blen < args->maxlen) {
+               error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+                                                    &notinit);
+               if (error)
+                       return error;
+
+               if (++ag == mp->m_sb.sb_agcount)
+                       ag = 0;
+               if (ag == startag)
+                       break;
+       }
+
+       xfs_bmap_select_minlen(ap, args, blen, notinit);
+       return 0;
+}
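+
+/*
+ * Note on the scan above: it starts at the AG of the allocation hint and
+ * walks every AG, wrapping at sb_agcount, until either an extent at least
+ * args->maxlen long has been seen or the search returns to the starting
+ * AG, so *blen ends up as the longest free extent seen across the scan.
+ */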
+
+STATIC int
+xfs_bmap_btalloc_filestreams(
+       struct xfs_bmalloca     *ap,
+       struct xfs_alloc_arg    *args,
+       xfs_extlen_t            *blen)
+{
+       struct xfs_mount        *mp = ap->ip->i_mount;
+       xfs_agnumber_t          ag;
+       int                     notinit = 0;
+       int                     error;
+
+       args->type = XFS_ALLOCTYPE_NEAR_BNO;
+       args->total = ap->total;
+
+       ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+       if (ag == NULLAGNUMBER)
+               ag = 0;
+
+       error = xfs_bmap_longest_free_extent(args->tp, ag, blen, &notinit);
+       if (error)
+               return error;
+
+       if (*blen < args->maxlen) {
+               error = xfs_filestream_new_ag(ap, &ag);
+               if (error)
+                       return error;
+
+               error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+                                                    &notinit);
+               if (error)
+                       return error;
+
+       }
+
+       xfs_bmap_select_minlen(ap, args, blen, notinit);
+
+       /*
+        * Set the failure fallback case to look in the selected AG as the
+        * stream may have moved.
+        */
+       ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
+       return 0;
+}
+
+STATIC int
+xfs_bmap_btalloc(
+       struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
+{
+       xfs_mount_t     *mp;            /* mount point structure */
+       xfs_alloctype_t atype = 0;      /* type for allocation routines */
+       xfs_extlen_t    align;          /* minimum allocation alignment */
+       xfs_agnumber_t  fb_agno;        /* ag number of ap->firstblock */
+       xfs_agnumber_t  ag;
+       xfs_alloc_arg_t args;
+       xfs_extlen_t    blen;
+       xfs_extlen_t    nextminlen = 0;
+       int             nullfb;         /* true if ap->firstblock isn't set */
+       int             isaligned;
+       int             tryagain;
+       int             error;
+       int             stripe_align;
+
+       ASSERT(ap->length);
+
+       mp = ap->ip->i_mount;
+
+       /* stripe alignment for allocation is determined by mount parameters */
+       stripe_align = 0;
+       if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+               stripe_align = mp->m_swidth;
+       else if (mp->m_dalign)
+               stripe_align = mp->m_dalign;
+
+       align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
+       if (unlikely(align)) {
+               error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+                                               align, 0, ap->eof, 0, ap->conv,
+                                               &ap->offset, &ap->length);
+               ASSERT(!error);
+               ASSERT(ap->length);
+       }
+
+       nullfb = *ap->firstblock == NULLFSBLOCK;
+       fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
+       if (nullfb) {
+               if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
+                       ag = xfs_filestream_lookup_ag(ap->ip);
+                       ag = (ag != NULLAGNUMBER) ? ag : 0;
+                       ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
+               } else {
+                       ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
+               }
+       } else
+               ap->blkno = *ap->firstblock;
+
+       xfs_bmap_adjacent(ap);
+
+       /*
+        * If allowed, use ap->blkno; otherwise must use firstblock since
+        * it's in the right allocation group.
+        */
+       if (!nullfb && XFS_FSB_TO_AGNO(mp, ap->blkno) != fb_agno)
+               ap->blkno = *ap->firstblock;
+       /*
+        * Normal allocation, done through xfs_alloc_vextent.
+        */
+       tryagain = isaligned = 0;
+       memset(&args, 0, sizeof(args));
+       args.tp = ap->tp;
+       args.mp = mp;
+       args.fsbno = ap->blkno;
+
+       /* Trim the allocation back to the maximum an AG can fit. */
+       args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
+       args.firstblock = *ap->firstblock;
+       blen = 0;
+       if (nullfb) {
+               /*
+                * Search for an allocation group with a single extent large
+                * enough for the request.  If one isn't found, then adjust
+                * the minimum allocation size to the largest space found.
+                */
+               if (ap->userdata && xfs_inode_is_filestream(ap->ip))
+                       error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
+               else
+                       error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
+               if (error)
+                       return error;
+       } else if (ap->flist->xbf_low) {
+               if (xfs_inode_is_filestream(ap->ip))
+                       args.type = XFS_ALLOCTYPE_FIRST_AG;
+               else
+                       args.type = XFS_ALLOCTYPE_START_BNO;
+               args.total = args.minlen = ap->minlen;
+       } else {
+               args.type = XFS_ALLOCTYPE_NEAR_BNO;
+               args.total = ap->total;
+               args.minlen = ap->minlen;
+       }
+       /* apply extent size hints if obtained earlier */
+       if (unlikely(align)) {
+               args.prod = align;
+               if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
+                       args.mod = (xfs_extlen_t)(args.prod - args.mod);
+       } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
+               args.prod = 1;
+               args.mod = 0;
+       } else {
+               args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
+               if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod))))
+                       args.mod = (xfs_extlen_t)(args.prod - args.mod);
+       }
+       /*
+        * If we are not low on available data blocks, and the
+        * underlying logical volume manager is a stripe, and
+        * the file offset is zero, then try to allocate data
+        * blocks on a stripe unit boundary.
+        * NOTE: ap->aeof is only set if the allocation length
+        * is >= the stripe unit and the allocation offset is
+        * at the end of file.
+        */
+       if (!ap->flist->xbf_low && ap->aeof) {
+               if (!ap->offset) {
+                       args.alignment = stripe_align;
+                       atype = args.type;
+                       isaligned = 1;
+                       /*
+                        * Adjust for alignment
+                        */
+                       if (blen > args.alignment && blen <= args.maxlen)
+                               args.minlen = blen - args.alignment;
+                       args.minalignslop = 0;
+               } else {
+                       /*
+                        * First try an exact bno allocation.
+                        * If it fails then do a near or start bno
+                        * allocation with alignment turned on.
+                        */
+                       atype = args.type;
+                       tryagain = 1;
+                       args.type = XFS_ALLOCTYPE_THIS_BNO;
+                       args.alignment = 1;
+                       /*
+                        * Compute the minlen+alignment for the
+                        * next case.  Set slop so that the value
+                        * of minlen+alignment+slop doesn't go up
+                        * between the calls.
+                        */
+                       if (blen > stripe_align && blen <= args.maxlen)
+                               nextminlen = blen - stripe_align;
+                       else
+                               nextminlen = args.minlen;
+                       if (nextminlen + stripe_align > args.minlen + 1)
+                               args.minalignslop =
+                                       nextminlen + stripe_align -
+                                       args.minlen - 1;
+                       else
+                               args.minalignslop = 0;
+               }
+       } else {
+               args.alignment = 1;
+               args.minalignslop = 0;
+       }
+       args.minleft = ap->minleft;
+       args.wasdel = ap->wasdel;
+       args.isfl = 0;
+       args.userdata = ap->userdata;
+       if ((error = xfs_alloc_vextent(&args)))
+               return error;
+       if (tryagain && args.fsbno == NULLFSBLOCK) {
+               /*
+                * Exact allocation failed. Now try with alignment
+                * turned on.
+                */
+               args.type = atype;
+               args.fsbno = ap->blkno;
+               args.alignment = stripe_align;
+               args.minlen = nextminlen;
+               args.minalignslop = 0;
+               isaligned = 1;
+               if ((error = xfs_alloc_vextent(&args)))
+                       return error;
+       }
+       if (isaligned && args.fsbno == NULLFSBLOCK) {
+               /*
+                * The aligned allocation failed, so turn off alignment and
+                * try again.
+                */
+               args.type = atype;
+               args.fsbno = ap->blkno;
+               args.alignment = 0;
+               if ((error = xfs_alloc_vextent(&args)))
+                       return error;
+       }
+       if (args.fsbno == NULLFSBLOCK && nullfb &&
+           args.minlen > ap->minlen) {
+               args.minlen = ap->minlen;
+               args.type = XFS_ALLOCTYPE_START_BNO;
+               args.fsbno = ap->blkno;
+               if ((error = xfs_alloc_vextent(&args)))
+                       return error;
+       }
+       if (args.fsbno == NULLFSBLOCK && nullfb) {
+               args.fsbno = 0;
+               args.type = XFS_ALLOCTYPE_FIRST_AG;
+               args.total = ap->minlen;
+               args.minleft = 0;
+               if ((error = xfs_alloc_vextent(&args)))
+                       return error;
+               ap->flist->xbf_low = 1;
+       }
+       if (args.fsbno != NULLFSBLOCK) {
+               /*
+                * Check that the allocation happened in the same or a higher
+                * AG than the first block that was allocated.
+                */
+               ASSERT(*ap->firstblock == NULLFSBLOCK ||
+                      XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
+                      XFS_FSB_TO_AGNO(mp, args.fsbno) ||
+                      (ap->flist->xbf_low &&
+                       XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
+                       XFS_FSB_TO_AGNO(mp, args.fsbno)));
+
+               ap->blkno = args.fsbno;
+               if (*ap->firstblock == NULLFSBLOCK)
+                       *ap->firstblock = args.fsbno;
+               ASSERT(nullfb || fb_agno == args.agno ||
+                      (ap->flist->xbf_low && fb_agno < args.agno));
+               ap->length = args.len;
+               ap->ip->i_d.di_nblocks += args.len;
+               xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+               if (ap->wasdel)
+                       ap->ip->i_delayed_blks -= args.len;
+               /*
+                * Adjust the disk quota also. This was reserved
+                * earlier.
+                */
+               xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+                       ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
+                                       XFS_TRANS_DQ_BCOUNT,
+                       (long) args.len);
+       } else {
+               ap->blkno = NULLFSBLOCK;
+               ap->length = 0;
+       }
+       return 0;
+}
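+
+/*
+ * The fallback ladder in xfs_bmap_btalloc() above, in order:
+ *
+ *  1. an exact-block (XFS_ALLOCTYPE_THIS_BNO) attempt when extending eof
+ *     at a nonzero offset;
+ *  2. retry with stripe alignment and the precomputed nextminlen;
+ *  3. retry with alignment turned off;
+ *  4. retry with minlen dropped back to ap->minlen;
+ *  5. last resort: XFS_ALLOCTYPE_FIRST_AG with total = minlen, setting
+ *     xbf_low so later allocations in this transaction stay in the same
+ *     or a higher AG.
+ */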
+
+/*
+ * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
+ * It figures out where to ask the underlying allocator to put the new extent.
+ */
+STATIC int
+xfs_bmap_alloc(
+       struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
+{
+       if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
+               return xfs_bmap_rtalloc(ap);
+       return xfs_bmap_btalloc(ap);
+}
+
+/*
+ * Trim the returned map to the required bounds
+ */
+STATIC void
+xfs_bmapi_trim_map(
+       struct xfs_bmbt_irec    *mval,
+       struct xfs_bmbt_irec    *got,
+       xfs_fileoff_t           *bno,
+       xfs_filblks_t           len,
+       xfs_fileoff_t           obno,
+       xfs_fileoff_t           end,
+       int                     n,
+       int                     flags)
+{
+       if ((flags & XFS_BMAPI_ENTIRE) ||
+           got->br_startoff + got->br_blockcount <= obno) {
+               *mval = *got;
+               if (isnullstartblock(got->br_startblock))
+                       mval->br_startblock = DELAYSTARTBLOCK;
+               return;
+       }
+
+       if (obno > *bno)
+               *bno = obno;
+       ASSERT((*bno >= obno) || (n == 0));
+       ASSERT(*bno < end);
+       mval->br_startoff = *bno;
+       if (isnullstartblock(got->br_startblock))
+               mval->br_startblock = DELAYSTARTBLOCK;
+       else
+               mval->br_startblock = got->br_startblock +
+                                       (*bno - got->br_startoff);
+       /*
+        * Return the minimum of what we got and what we asked for as
+        * the length.  We can use the len variable here because it is
+        * modified below and we could have been here before if the
+        * first part of the allocation didn't overlap what was asked
+        * for.
+        */
+       mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno,
+                       got->br_blockcount - (*bno - got->br_startoff));
+       mval->br_state = got->br_state;
+       ASSERT(mval->br_blockcount <= len);
+       return;
+}
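+
+/*
+ * Trim example: with got covering file blocks [10, 20) at fsb 500 and a
+ * request for bno 12, len 5 (so obno = 12, end = 17), the returned map is
+ * startoff 12, startblock 502 (500 + (12 - 10)) and blockcount
+ * min(17 - 12, 20 - 12) = 5.
+ */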
+
+/*
+ * Update and validate the extent map to return
+ */
+STATIC void
+xfs_bmapi_update_map(
+       struct xfs_bmbt_irec    **map,
+       xfs_fileoff_t           *bno,
+       xfs_filblks_t           *len,
+       xfs_fileoff_t           obno,
+       xfs_fileoff_t           end,
+       int                     *n,
+       int                     flags)
+{
+       xfs_bmbt_irec_t *mval = *map;
+
+       ASSERT((flags & XFS_BMAPI_ENTIRE) ||
+              ((mval->br_startoff + mval->br_blockcount) <= end));
+       ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) ||
+              (mval->br_startoff < obno));
+
+       *bno = mval->br_startoff + mval->br_blockcount;
+       *len = end - *bno;
+       if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) {
+               /* update previous map with new information */
+               ASSERT(mval->br_startblock == mval[-1].br_startblock);
+               ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
+               ASSERT(mval->br_state == mval[-1].br_state);
+               mval[-1].br_blockcount = mval->br_blockcount;
+               mval[-1].br_state = mval->br_state;
+       } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
+                  mval[-1].br_startblock != DELAYSTARTBLOCK &&
+                  mval[-1].br_startblock != HOLESTARTBLOCK &&
+                  mval->br_startblock == mval[-1].br_startblock +
+                                         mval[-1].br_blockcount &&
+                  ((flags & XFS_BMAPI_IGSTATE) ||
+                       mval[-1].br_state == mval->br_state)) {
+               ASSERT(mval->br_startoff ==
+                      mval[-1].br_startoff + mval[-1].br_blockcount);
+               mval[-1].br_blockcount += mval->br_blockcount;
+       } else if (*n > 0 &&
+                  mval->br_startblock == DELAYSTARTBLOCK &&
+                  mval[-1].br_startblock == DELAYSTARTBLOCK &&
+                  mval->br_startoff ==
+                  mval[-1].br_startoff + mval[-1].br_blockcount) {
+               mval[-1].br_blockcount += mval->br_blockcount;
+               mval[-1].br_state = mval->br_state;
+       } else if (!((*n == 0) &&
+                    ((mval->br_startoff + mval->br_blockcount) <=
+                     obno))) {
+               mval++;
+               (*n)++;
+       }
+       *map = mval;
+}
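+
+/*
+ * Merge example for the code above: if the previous returned mapping is
+ * {startoff 10, fsb 500, len 4} and the new one is {startoff 14, fsb 504,
+ * len 3} with the same state, the two are physically adjacent, so the
+ * previous entry grows to len 7 and *n is not incremented.
+ */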
+
+/*
+ * Map file blocks to filesystem blocks without allocation.
+ */
+int
+xfs_bmapi_read(
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           bno,
+       xfs_filblks_t           len,
+       struct xfs_bmbt_irec    *mval,
+       int                     *nmap,
+       int                     flags)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_ifork        *ifp;
+       struct xfs_bmbt_irec    got;
+       struct xfs_bmbt_irec    prev;
+       xfs_fileoff_t           obno;
+       xfs_fileoff_t           end;
+       xfs_extnum_t            lastx;
+       int                     error;
+       int                     eof;
+       int                     n = 0;
+       int                     whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+                                               XFS_ATTR_FORK : XFS_DATA_FORK;
+
+       ASSERT(*nmap >= 1);
+       ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
+                          XFS_BMAPI_IGSTATE)));
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
+
+       if (unlikely(XFS_TEST_ERROR(
+           (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+            XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+            mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+               XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
+               return -EFSCORRUPTED;
+       }
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+
+       XFS_STATS_INC(xs_blk_mapr);
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+
+       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+               error = xfs_iread_extents(NULL, ip, whichfork);
+               if (error)
+                       return error;
+       }
+
+       xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev);
+       end = bno + len;
+       obno = bno;
+
+       while (bno < end && n < *nmap) {
+               /* Reading past eof, act as though there's a hole up to end. */
+               if (eof)
+                       got.br_startoff = end;
+               if (got.br_startoff > bno) {
+                       /* Reading in a hole.  */
+                       mval->br_startoff = bno;
+                       mval->br_startblock = HOLESTARTBLOCK;
+                       mval->br_blockcount =
+                               XFS_FILBLKS_MIN(len, got.br_startoff - bno);
+                       mval->br_state = XFS_EXT_NORM;
+                       bno += mval->br_blockcount;
+                       len -= mval->br_blockcount;
+                       mval++;
+                       n++;
+                       continue;
+               }
+
+               /* Set up the extent map to return. */
+               xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+               xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+               /* If we're done, stop now. */
+               if (bno >= end || n >= *nmap)
+                       break;
+
+               /* Else go on to the next record. */
+               if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+                       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
+               else
+                       eof = 1;
+       }
+       *nmap = n;
+       return 0;
+}
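+
+/*
+ * Minimal caller sketch for xfs_bmapi_read(); the variable names and the
+ * bare locking are illustrative assumptions, not taken from a real caller:
+ *
+ *     struct xfs_bmbt_irec    map;
+ *     int                     nmap = 1;
+ *     int                     error;
+ *
+ *     xfs_ilock(ip, XFS_ILOCK_SHARED);
+ *     error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &map, &nmap, 0);
+ *     xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ *
+ * On return nmap holds the number of mappings filled in; holes come back
+ * with br_startblock == HOLESTARTBLOCK and delalloc ranges as
+ * DELAYSTARTBLOCK.
+ */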
+
+STATIC int
+xfs_bmapi_reserve_delalloc(
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           aoff,
+       xfs_filblks_t           len,
+       struct xfs_bmbt_irec    *got,
+       struct xfs_bmbt_irec    *prev,
+       xfs_extnum_t            *lastx,
+       int                     eof)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+       xfs_extlen_t            alen;
+       xfs_extlen_t            indlen;
+       char                    rt = XFS_IS_REALTIME_INODE(ip);
+       xfs_extlen_t            extsz;
+       int                     error;
+
+       alen = XFS_FILBLKS_MIN(len, MAXEXTLEN);
+       if (!eof)
+               alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
+
+       /* Figure out the extent size, adjust alen */
+       extsz = xfs_get_extsz_hint(ip);
+       if (extsz) {
+               /*
+                * Make sure we don't exceed a single extent length when we
+                * align the extent, by reducing the length we are going to
+                * allocate by the maximum amount that extent size alignment
+                * may require.
+                */
+               alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1));
+               error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
+                                              1, 0, &aoff, &alen);
+               ASSERT(!error);
+       }
+
+       if (rt)
+               extsz = alen / mp->m_sb.sb_rextsize;
+
+       /*
+        * Make a transaction-less quota reservation for delayed allocation
+        * blocks.  This number gets adjusted later.  If the reservation
+        * fails we can simply return, since no blocks have been allocated
+        * inside this loop yet.
+        */
+       error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
+                       rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+       if (error)
+               return error;
+
+       /*
+        * Split the superblock counter updates for alen and indlen, since
+        * they may be accounted against different counters: realtime
+        * extents for alen on realtime inodes, free data blocks otherwise.
+        */
+       indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
+       ASSERT(indlen > 0);
+
+       if (rt) {
+               error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
+                                         -((int64_t)extsz), 0);
+       } else {
+               error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+                                                -((int64_t)alen), 0);
+       }
+
+       if (error)
+               goto out_unreserve_quota;
+
+       error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+                                        -((int64_t)indlen), 0);
+       if (error)
+               goto out_unreserve_blocks;
+
+       ip->i_delayed_blks += alen;
+
+       got->br_startoff = aoff;
+       got->br_startblock = nullstartblock(indlen);
+       got->br_blockcount = alen;
+       got->br_state = XFS_EXT_NORM;
+       xfs_bmap_add_extent_hole_delay(ip, lastx, got);
+
+       /*
+        * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
+        * might have merged it into one of the neighbouring ones.
+        */
+       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
+
+       ASSERT(got->br_startoff <= aoff);
+       ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
+       ASSERT(isnullstartblock(got->br_startblock));
+       ASSERT(got->br_state == XFS_EXT_NORM);
+       return 0;
+
+out_unreserve_blocks:
+       if (rt)
+               xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0);
+       else
+               xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
+out_unreserve_quota:
+       if (XFS_IS_QUOTA_ON(mp))
+               xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
+                               XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+       return error;
+}
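+
+/*
+ * A worked example of the MAXEXTLEN clamp in xfs_bmapi_reserve_delalloc()
+ * above: with an extent size hint of extsz blocks, alignment can move the
+ * start of the request down by up to extsz - 1 blocks and round the end up
+ * by up to extsz - 1 blocks, growing it by at most 2 * extsz - 2 blocks in
+ * total.  Clamping the pre-alignment length to MAXEXTLEN - (2 * extsz - 1)
+ * is thus a (slightly conservative) bound that keeps the aligned result
+ * within a single MAXEXTLEN-sized extent.
+ */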
+
+/*
+ * Map file blocks to filesystem blocks, adding delayed allocations as needed.
+ */
+int
+xfs_bmapi_delay(
+       struct xfs_inode        *ip,    /* incore inode */
+       xfs_fileoff_t           bno,    /* starting file offs. mapped */
+       xfs_filblks_t           len,    /* length to map in file */
+       struct xfs_bmbt_irec    *mval,  /* output: map values */
+       int                     *nmap,  /* i/o: mval size/count */
+       int                     flags)  /* XFS_BMAPI_... */
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+       struct xfs_bmbt_irec    got;    /* current file extent record */
+       struct xfs_bmbt_irec    prev;   /* previous file extent record */
+       xfs_fileoff_t           obno;   /* old block number (offset) */
+       xfs_fileoff_t           end;    /* end of mapped file region */
+       xfs_extnum_t            lastx;  /* last useful extent number */
+       int                     eof;    /* we've hit the end of extents */
+       int                     n = 0;  /* current extent index */
+       int                     error = 0;
+
+       ASSERT(*nmap >= 1);
+       ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+       ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+       if (unlikely(XFS_TEST_ERROR(
+           (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
+            XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+            mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+               XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
+               return -EFSCORRUPTED;
+       }
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+
+       XFS_STATS_INC(xs_blk_mapw);
+
+       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+               error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+               if (error)
+                       return error;
+       }
+
+       xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
+       end = bno + len;
+       obno = bno;
+
+       while (bno < end && n < *nmap) {
+               if (eof || got.br_startoff > bno) {
+                       error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
+                                                          &prev, &lastx, eof);
+                       if (error) {
+                               if (n == 0) {
+                                       *nmap = 0;
+                                       return error;
+                               }
+                               break;
+                       }
+               }
+
+               /* set up the extent map to return. */
+               xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+               xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+               /* If we're done, stop now. */
+               if (bno >= end || n >= *nmap)
+                       break;
+
+               /* Else go on to the next record. */
+               prev = got;
+               if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+                       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
+               else
+                       eof = 1;
+       }
+
+       *nmap = n;
+       return 0;
+}
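+
+/*
+ * A minimal caller sketch for xfs_bmapi_delay(), with a hypothetical helper
+ * name and compiled out with #if 0: delayed allocation requires the inode
+ * locked exclusively, and a freshly reserved mapping comes back with a
+ * nullstartblock() in br_startblock rather than a real block number.
+ */
+#if 0
+static int
+xfs_example_delay_map(
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           bno,
+       xfs_filblks_t           len)
+{
+       struct xfs_bmbt_irec    map;
+       int                     nmap = 1;
+       int                     error;
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+       error = xfs_bmapi_delay(ip, bno, len, &map, &nmap, 0);
+       if (!error && nmap && isnullstartblock(map.br_startblock)) {
+               /* map describes a delayed allocation reservation */
+       }
+       return error;
+}
+#endif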
+
+static int
+xfs_bmapi_allocate(
+       struct xfs_bmalloca     *bma)
+{
+       struct xfs_mount        *mp = bma->ip->i_mount;
+       int                     whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
+                                               XFS_ATTR_FORK : XFS_DATA_FORK;
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+       int                     tmp_logflags = 0;
+       int                     error;
+
+       ASSERT(bma->length > 0);
+
+       /*
+        * For the wasdelay case, we could also just allocate the blocks asked
+        * for in this bmap call, but that wouldn't be as good: allocating the
+        * entire delayed extent instead keeps it in one piece.
+        */
+       if (bma->wasdel) {
+               bma->length = (xfs_extlen_t)bma->got.br_blockcount;
+               bma->offset = bma->got.br_startoff;
+               if (bma->idx != NULLEXTNUM && bma->idx) {
+                       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
+                                        &bma->prev);
+               }
+       } else {
+               bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
+               if (!bma->eof)
+                       bma->length = XFS_FILBLKS_MIN(bma->length,
+                                       bma->got.br_startoff - bma->offset);
+       }
+
+       /*
+        * Indicate if this is the first user data in the file, or just any
+        * user data.
+        */
+       if (!(bma->flags & XFS_BMAPI_METADATA)) {
+               bma->userdata = (bma->offset == 0) ?
+                       XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+       }
+
+       bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
+
+       /*
+       /*
+        * We only want to do the alignment at EOF if it is user data and the
+        * allocation length is larger than a stripe unit.
+        */
+       if (mp->m_dalign && bma->length >= mp->m_dalign &&
+           !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
+               error = xfs_bmap_isaeof(bma, whichfork);
+               if (error)
+                       return error;
+       }
+
+       error = xfs_bmap_alloc(bma);
+       if (error)
+               return error;
+
+       if (bma->flist->xbf_low)
+               bma->minleft = 0;
+       if (bma->cur)
+               bma->cur->bc_private.b.firstblock = *bma->firstblock;
+       if (bma->blkno == NULLFSBLOCK)
+               return 0;
+       if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
+               bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
+               bma->cur->bc_private.b.firstblock = *bma->firstblock;
+               bma->cur->bc_private.b.flist = bma->flist;
+       }
+       /*
+        * Bump the number of extents we've allocated
+        * in this call.
+        */
+       bma->nallocs++;
+
+       if (bma->cur)
+               bma->cur->bc_private.b.flags =
+                       bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+
+       bma->got.br_startoff = bma->offset;
+       bma->got.br_startblock = bma->blkno;
+       bma->got.br_blockcount = bma->length;
+       bma->got.br_state = XFS_EXT_NORM;
+
+       /*
+        * A wasdelay extent has been initialized, so shouldn't be flagged
+        * as unwritten.
+        */
+       if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
+           xfs_sb_version_hasextflgbit(&mp->m_sb))
+               bma->got.br_state = XFS_EXT_UNWRITTEN;
+
+       if (bma->wasdel)
+               error = xfs_bmap_add_extent_delay_real(bma);
+       else
+               error = xfs_bmap_add_extent_hole_real(bma, whichfork);
+
+       bma->logflags |= tmp_logflags;
+       if (error)
+               return error;
+
+       /*
+        * Update our extent pointer, given that xfs_bmap_add_extent_delay_real
+        * or xfs_bmap_add_extent_hole_real might have merged it into one of
+        * the neighbouring ones.
+        */
+       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+
+       ASSERT(bma->got.br_startoff <= bma->offset);
+       ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
+              bma->offset + bma->length);
+       ASSERT(bma->got.br_state == XFS_EXT_NORM ||
+              bma->got.br_state == XFS_EXT_UNWRITTEN);
+       return 0;
+}
+
+STATIC int
+xfs_bmapi_convert_unwritten(
+       struct xfs_bmalloca     *bma,
+       struct xfs_bmbt_irec    *mval,
+       xfs_filblks_t           len,
+       int                     flags)
+{
+       int                     whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+                                               XFS_ATTR_FORK : XFS_DATA_FORK;
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+       int                     tmp_logflags = 0;
+       int                     error;
+
+       /* check if we need to do unwritten->real conversion */
+       if (mval->br_state == XFS_EXT_UNWRITTEN &&
+           (flags & XFS_BMAPI_PREALLOC))
+               return 0;
+
+       /* check if we need to do real->unwritten conversion */
+       if (mval->br_state == XFS_EXT_NORM &&
+           (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) !=
+                       (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
+               return 0;
+
+       /*
+        * Flip the extent state: unwritten becomes normal and vice versa.
+        */
+       ASSERT(mval->br_blockcount <= len);
+       if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
+               bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
+                                       bma->ip, whichfork);
+               bma->cur->bc_private.b.firstblock = *bma->firstblock;
+               bma->cur->bc_private.b.flist = bma->flist;
+       }
+       mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
+                               ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+
+       error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
+                       &bma->cur, mval, bma->firstblock, bma->flist,
+                       &tmp_logflags);
+       bma->logflags |= tmp_logflags;
+       if (error)
+               return error;
+
+       /*
+        * Update our extent pointer, given that
+        * xfs_bmap_add_extent_unwritten_real might have merged it into one
+        * of the neighbouring ones.
+        */
+       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+
+       /*
+        * We may have combined previously unwritten space with written space,
+        * so generate another request.
+        */
+       if (mval->br_blockcount < len)
+               return -EAGAIN;
+       return 0;
+}
+
+/*
+ * Map file blocks to filesystem blocks, and allocate blocks or convert the
+ * extent state if necessary.  Detailed behaviour is controlled by the flags
+ * parameter.  Only allocates blocks from a single allocation group, to avoid
+ * locking problems.
+ *
+ * The returned value in "firstblock" from the first call in a transaction
+ * must be remembered and presented to subsequent calls in "firstblock".
+ * An upper bound for the number of blocks to be allocated is supplied to
+ * the first call in "total"; if no allocation group has that many free
+ * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
+ */
+int
+xfs_bmapi_write(
+       struct xfs_trans        *tp,            /* transaction pointer */
+       struct xfs_inode        *ip,            /* incore inode */
+       xfs_fileoff_t           bno,            /* starting file offs. mapped */
+       xfs_filblks_t           len,            /* length to map in file */
+       int                     flags,          /* XFS_BMAPI_... */
+       xfs_fsblock_t           *firstblock,    /* first allocated block
+                                                  controls a.g. for allocs */
+       xfs_extlen_t            total,          /* total blocks needed */
+       struct xfs_bmbt_irec    *mval,          /* output: map values */
+       int                     *nmap,          /* i/o: mval size/count */
+       struct xfs_bmap_free    *flist)         /* i/o: list extents to free */
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_ifork        *ifp;
+       struct xfs_bmalloca     bma = { NULL }; /* args for xfs_bmap_alloc */
+       xfs_fileoff_t           end;            /* end of mapped file region */
+       int                     eof;            /* after the end of extents */
+       int                     error;          /* error return */
+       int                     n;              /* current extent index */
+       xfs_fileoff_t           obno;           /* old block number (offset) */
+       int                     whichfork;      /* data or attr fork */
+       char                    inhole;         /* current location is hole in file */
+       char                    wasdelay;       /* old extent was delayed */
+
+#ifdef DEBUG
+       xfs_fileoff_t           orig_bno;       /* original block number value */
+       int                     orig_flags;     /* original flags arg value */
+       xfs_filblks_t           orig_len;       /* original value of len arg */
+       struct xfs_bmbt_irec    *orig_mval;     /* original value of mval */
+       int                     orig_nmap;      /* original value of *nmap */
+
+       orig_bno = bno;
+       orig_len = len;
+       orig_flags = flags;
+       orig_mval = mval;
+       orig_nmap = *nmap;
+#endif
+       whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+               XFS_ATTR_FORK : XFS_DATA_FORK;
+
+       ASSERT(*nmap >= 1);
+       ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+       ASSERT(!(flags & XFS_BMAPI_IGSTATE));
+       ASSERT(tp != NULL);
+       ASSERT(len > 0);
+       ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+       if (unlikely(XFS_TEST_ERROR(
+           (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+            XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+            mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+               XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
+               return -EFSCORRUPTED;
+       }
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+
+       XFS_STATS_INC(xs_blk_mapw);
+
+       if (*firstblock == NULLFSBLOCK) {
+               if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
+                       bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+               else
+                       bma.minleft = 1;
+       } else {
+               bma.minleft = 0;
+       }
+
+       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+               error = xfs_iread_extents(tp, ip, whichfork);
+               if (error)
+                       goto error0;
+       }
+
+       xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got,
+                               &bma.prev);
+       n = 0;
+       end = bno + len;
+       obno = bno;
+
+       bma.tp = tp;
+       bma.ip = ip;
+       bma.total = total;
+       bma.userdata = 0;
+       bma.flist = flist;
+       bma.firstblock = firstblock;
+
+       while (bno < end && n < *nmap) {
+               inhole = eof || bma.got.br_startoff > bno;
+               wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
+
+               /*
+                * First, deal with the hole before the allocated space
+                * that we found, if any.
+                */
+               if (inhole || wasdelay) {
+                       bma.eof = eof;
+                       bma.conv = !!(flags & XFS_BMAPI_CONVERT);
+                       bma.wasdel = wasdelay;
+                       bma.offset = bno;
+                       bma.flags = flags;
+
+                       /*
+                        * There's a 32/64 bit type mismatch between the
+                        * allocation length request (which can be 64 bits in
+                        * length) and the bma length request, which is
+                        * xfs_extlen_t and therefore 32 bits. Hence we have to
+                        * check for 32-bit overflows and handle them here.
+                        */
+                       if (len > (xfs_filblks_t)MAXEXTLEN)
+                               bma.length = MAXEXTLEN;
+                       else
+                               bma.length = len;
+
+                       ASSERT(len > 0);
+                       ASSERT(bma.length > 0);
+                       error = xfs_bmapi_allocate(&bma);
+                       if (error)
+                               goto error0;
+                       if (bma.blkno == NULLFSBLOCK)
+                               break;
+               }
+
+               /* Deal with the allocated space we found.  */
+               xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno,
+                                                       end, n, flags);
+
+               /* Execute unwritten extent conversion if necessary */
+               error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
+               if (error == -EAGAIN)
+                       continue;
+               if (error)
+                       goto error0;
+
+               /* update the extent map to return */
+               xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+               /*
+                * If we're done, stop now.  Stop when we've allocated
+                * XFS_BMAP_MAX_NMAP extents no matter what.  Otherwise
+                * the transaction may get too big.
+                */
+               if (bno >= end || n >= *nmap || bma.nallocs >= *nmap)
+                       break;
+
+               /* Else go on to the next record. */
+               bma.prev = bma.got;
+               if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) {
+                       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx),
+                                        &bma.got);
+               } else
+                       eof = 1;
+       }
+       *nmap = n;
+
+       /*
+        * Transform from btree to extents, give it cur.
+        */
+       if (xfs_bmap_wants_extents(ip, whichfork)) {
+               int             tmp_logflags = 0;
+
+               ASSERT(bma.cur);
+               error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
+                       &tmp_logflags, whichfork);
+               bma.logflags |= tmp_logflags;
+               if (error)
+                       goto error0;
+       }
+
+       ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
+              XFS_IFORK_NEXTENTS(ip, whichfork) >
+               XFS_IFORK_MAXEXT(ip, whichfork));
+       error = 0;
+error0:
+       /*
+        * Log everything.  Do this after conversion, there's no point in
+        * logging the extent records if we've converted to btree format.
+        */
+       if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
+           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+               bma.logflags &= ~xfs_ilog_fext(whichfork);
+       else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
+                XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+               bma.logflags &= ~xfs_ilog_fbroot(whichfork);
+       /*
+        * Log whatever the flags say, even on error.  Otherwise we might miss
+        * a case where the data was changed, an error occurred, and nothing
+        * was logged, so we would fail to shut down when we should.
+        */
+       if (bma.logflags)
+               xfs_trans_log_inode(tp, ip, bma.logflags);
+
+       if (bma.cur) {
+               if (!error) {
+                       ASSERT(*firstblock == NULLFSBLOCK ||
+                              XFS_FSB_TO_AGNO(mp, *firstblock) ==
+                              XFS_FSB_TO_AGNO(mp,
+                                      bma.cur->bc_private.b.firstblock) ||
+                              (flist->xbf_low &&
+                               XFS_FSB_TO_AGNO(mp, *firstblock) <
+                               XFS_FSB_TO_AGNO(mp,
+                                       bma.cur->bc_private.b.firstblock)));
+                       *firstblock = bma.cur->bc_private.b.firstblock;
+               }
+               xfs_btree_del_cursor(bma.cur,
+                       error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+       }
+       if (!error)
+               xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+                       orig_nmap, *nmap);
+       return error;
+}
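+
+/*
+ * A minimal caller sketch for xfs_bmapi_write(), with a hypothetical helper
+ * name and compiled out with #if 0.  It illustrates the contract from the
+ * comment above: firstblock starts as NULLFSBLOCK and is carried unchanged
+ * between calls in one transaction, and the freed-extent list must be
+ * initialized up front and cancelled on error.
+ */
+#if 0
+static int
+xfs_example_alloc_blocks(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           bno,
+       xfs_filblks_t           len,
+       xfs_extlen_t            total)
+{
+       struct xfs_bmbt_irec    map;
+       struct xfs_bmap_free    flist;
+       xfs_fsblock_t           firstblock = NULLFSBLOCK;
+       int                     nmap = 1;
+       int                     error;
+
+       xfs_bmap_init(&flist, &firstblock);
+       error = xfs_bmapi_write(tp, ip, bno, len, 0, &firstblock, total,
+                               &map, &nmap, &flist);
+       if (error)
+               xfs_bmap_cancel(&flist);
+       /* on success the caller would xfs_bmap_finish() and commit tp */
+       return error;
+}
+#endif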
+
+/*
+ * Called by xfs_bmapi to update file extent records and the btree
+ * after removing space (or undoing a delayed allocation).
+ */
+STATIC int                             /* error */
+xfs_bmap_del_extent(
+       xfs_inode_t             *ip,    /* incore inode pointer */
+       xfs_trans_t             *tp,    /* current transaction pointer */
+       xfs_extnum_t            *idx,   /* extent number to update/delete */
+       xfs_bmap_free_t         *flist, /* list of extents to be freed */
+       xfs_btree_cur_t         *cur,   /* if null, not a btree */
+       xfs_bmbt_irec_t         *del,   /* data to remove from extents */
+       int                     *logflagsp, /* inode logging flags */
+       int                     whichfork) /* data or attr fork */
+{
+       xfs_filblks_t           da_new; /* new delay-alloc indirect blocks */
+       xfs_filblks_t           da_old; /* old delay-alloc indirect blocks */
+       xfs_fsblock_t           del_endblock = 0; /* first block past del */
+       xfs_fileoff_t           del_endoff;     /* first offset past del */
+       int                     delay;  /* current block is delayed allocated */
+       int                     do_fx;  /* free extent at end of routine */
+       xfs_bmbt_rec_host_t     *ep;    /* current extent entry pointer */
+       int                     error;  /* error return value */
+       int                     flags;  /* inode logging flags */
+       xfs_bmbt_irec_t         got;    /* current extent entry */
+       xfs_fileoff_t           got_endoff;     /* first offset past got */
+       int                     i;      /* temp state */
+       xfs_ifork_t             *ifp;   /* inode fork pointer */
+       xfs_mount_t             *mp;    /* mount structure */
+       xfs_filblks_t           nblks;  /* quota/sb block count */
+       xfs_bmbt_irec_t         new;    /* new record to be inserted */
+       /* REFERENCED */
+       uint                    qfield; /* quota field to update */
+       xfs_filblks_t           temp;   /* for indirect length calculations */
+       xfs_filblks_t           temp2;  /* for indirect length calculations */
+       int                     state = 0;
+
+       XFS_STATS_INC(xs_del_exlist);
+
+       if (whichfork == XFS_ATTR_FORK)
+               state |= BMAP_ATTRFORK;
+
+       mp = ip->i_mount;
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
+               (uint)sizeof(xfs_bmbt_rec_t)));
+       ASSERT(del->br_blockcount > 0);
+       ep = xfs_iext_get_ext(ifp, *idx);
+       xfs_bmbt_get_all(ep, &got);
+       ASSERT(got.br_startoff <= del->br_startoff);
+       del_endoff = del->br_startoff + del->br_blockcount;
+       got_endoff = got.br_startoff + got.br_blockcount;
+       ASSERT(got_endoff >= del_endoff);
+       delay = isnullstartblock(got.br_startblock);
+       ASSERT(isnullstartblock(del->br_startblock) == delay);
+       flags = 0;
+       qfield = 0;
+       error = 0;
+       /*
+        * If deleting a real allocation, must free up the disk space.
+        */
+       if (!delay) {
+               flags = XFS_ILOG_CORE;
+               /*
+                * Realtime allocation.  Free it and record di_nblocks update.
+                */
+               if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
+                       xfs_fsblock_t   bno;
+                       xfs_filblks_t   len;
+
+                       ASSERT(do_mod(del->br_blockcount,
+                                     mp->m_sb.sb_rextsize) == 0);
+                       ASSERT(do_mod(del->br_startblock,
+                                     mp->m_sb.sb_rextsize) == 0);
+                       bno = del->br_startblock;
+                       len = del->br_blockcount;
+                       do_div(bno, mp->m_sb.sb_rextsize);
+                       do_div(len, mp->m_sb.sb_rextsize);
+                       error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+                       if (error)
+                               goto done;
+                       do_fx = 0;
+                       nblks = len * mp->m_sb.sb_rextsize;
+                       qfield = XFS_TRANS_DQ_RTBCOUNT;
+               }
+               /*
+                * Ordinary allocation.
+                */
+               else {
+                       do_fx = 1;
+                       nblks = del->br_blockcount;
+                       qfield = XFS_TRANS_DQ_BCOUNT;
+               }
+               /*
+                * Set up del_endblock and cur for later.
+                */
+               del_endblock = del->br_startblock + del->br_blockcount;
+               if (cur) {
+                       if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+                                       got.br_startblock, got.br_blockcount,
+                                       &i)))
+                               goto done;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+               }
+               da_old = da_new = 0;
+       } else {
+               da_old = startblockval(got.br_startblock);
+               da_new = 0;
+               nblks = 0;
+               do_fx = 0;
+       }
+       /*
+        * Set the flag value to use in the switch statement below.  Bit 1 is
+        * set when the deletion starts at the start of the extent
+        * (left-contiguous), bit 0 when it ends at the end of the extent
+        * (right-contiguous).  For example, the value 3 means the deletion
+        * covers the whole extent.
+        */
+       switch (((got.br_startoff == del->br_startoff) << 1) |
+               (got_endoff == del_endoff)) {
+       case 3:
+               /*
+                * Matches the whole extent.  Delete the entry.
+                */
+               xfs_iext_remove(ip, *idx, 1,
+                               whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
+               --*idx;
+               if (delay)
+                       break;
+
+               XFS_IFORK_NEXT_SET(ip, whichfork,
+                       XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+               flags |= XFS_ILOG_CORE;
+               if (!cur) {
+                       flags |= xfs_ilog_fext(whichfork);
+                       break;
+               }
+               if ((error = xfs_btree_delete(cur, &i)))
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+               break;
+
+       case 2:
+               /*
+                * Deleting the first part of the extent.
+                */
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               xfs_bmbt_set_startoff(ep, del_endoff);
+               temp = got.br_blockcount - del->br_blockcount;
+               xfs_bmbt_set_blockcount(ep, temp);
+               if (delay) {
+                       temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+                               da_old);
+                       xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+                       trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                       da_new = temp;
+                       break;
+               }
+               xfs_bmbt_set_startblock(ep, del_endblock);
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+               if (!cur) {
+                       flags |= xfs_ilog_fext(whichfork);
+                       break;
+               }
+               if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
+                               got.br_blockcount - del->br_blockcount,
+                               got.br_state)))
+                       goto done;
+               break;
+
+       case 1:
+               /*
+                * Deleting the last part of the extent.
+                */
+               temp = got.br_blockcount - del->br_blockcount;
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(ep, temp);
+               if (delay) {
+                       temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+                               da_old);
+                       xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+                       trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                       da_new = temp;
+                       break;
+               }
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+               if (!cur) {
+                       flags |= xfs_ilog_fext(whichfork);
+                       break;
+               }
+               if ((error = xfs_bmbt_update(cur, got.br_startoff,
+                               got.br_startblock,
+                               got.br_blockcount - del->br_blockcount,
+                               got.br_state)))
+                       goto done;
+               break;
+
+       case 0:
+               /*
+                * Deleting the middle of the extent.
+                */
+               temp = del->br_startoff - got.br_startoff;
+               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+               xfs_bmbt_set_blockcount(ep, temp);
+               new.br_startoff = del_endoff;
+               temp2 = got_endoff - del_endoff;
+               new.br_blockcount = temp2;
+               new.br_state = got.br_state;
+               if (!delay) {
+                       new.br_startblock = del_endblock;
+                       flags |= XFS_ILOG_CORE;
+                       if (cur) {
+                               if ((error = xfs_bmbt_update(cur,
+                                               got.br_startoff,
+                                               got.br_startblock, temp,
+                                               got.br_state)))
+                                       goto done;
+                               if ((error = xfs_btree_increment(cur, 0, &i)))
+                                       goto done;
+                               cur->bc_rec.b = new;
+                               error = xfs_btree_insert(cur, &i);
+                               if (error && error != -ENOSPC)
+                                       goto done;
+                               /*
+                                * If we get no-space back from the btree
+                                * insert, it tried a split and we have a
+                                * zero block reservation.  Fix up our state
+                                * and return the error.
+                                */
+                               if (error == -ENOSPC) {
+                                       /*
+                                        * Reset the cursor, don't trust
+                                        * it after any insert operation.
+                                        */
+                                       if ((error = xfs_bmbt_lookup_eq(cur,
+                                                       got.br_startoff,
+                                                       got.br_startblock,
+                                                       temp, &i)))
+                                               goto done;
+                                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                                       /*
+                                        * Update the btree record back
+                                        * to the original value.
+                                        */
+                                       if ((error = xfs_bmbt_update(cur,
+                                                       got.br_startoff,
+                                                       got.br_startblock,
+                                                       got.br_blockcount,
+                                                       got.br_state)))
+                                               goto done;
+                                       /*
+                                        * Reset the extent record back
+                                        * to the original value.
+                                        */
+                                       xfs_bmbt_set_blockcount(ep,
+                                               got.br_blockcount);
+                                       flags = 0;
+                                       error = -ENOSPC;
+                                       goto done;
+                               }
+                               XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       } else
+                               flags |= xfs_ilog_fext(whichfork);
+                       XFS_IFORK_NEXT_SET(ip, whichfork,
+                               XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+               } else {
+                       ASSERT(whichfork == XFS_DATA_FORK);
+                       temp = xfs_bmap_worst_indlen(ip, temp);
+                       xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+                       temp2 = xfs_bmap_worst_indlen(ip, temp2);
+                       new.br_startblock = nullstartblock((int)temp2);
+                       da_new = temp + temp2;
+                       while (da_new > da_old) {
+                               if (temp) {
+                                       temp--;
+                                       da_new--;
+                                       xfs_bmbt_set_startblock(ep,
+                                               nullstartblock((int)temp));
+                               }
+                               if (da_new == da_old)
+                                       break;
+                               if (temp2) {
+                                       temp2--;
+                                       da_new--;
+                                       new.br_startblock =
+                                               nullstartblock((int)temp2);
+                               }
+                       }
+               }
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+               xfs_iext_insert(ip, *idx + 1, 1, &new, state);
+               ++*idx;
+               break;
+       }
+       /*
+        * If we need to, add to list of extents to delete.
+        */
+       if (do_fx)
+               xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
+                       mp);
+       /*
+        * Adjust inode # blocks in the file.
+        */
+       if (nblks)
+               ip->i_d.di_nblocks -= nblks;
+       /*
+        * Adjust quota data.
+        */
+       if (qfield)
+               xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
+
+       /*
+        * Account for change in delayed indirect blocks.
+        * Nothing to do for disk quota accounting here.
+        */
+       ASSERT(da_old >= da_new);
+       if (da_old > da_new) {
+               xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+                       (int64_t)(da_old - da_new), 0);
+       }
+done:
+       *logflagsp = flags;
+       return error;
+}
+
+/*
+ * Unmap (remove) blocks from a file.
+ * If nexts is nonzero then the number of extents to remove is limited to
+ * that value.  *done is set once all extents in the block range have been
+ * removed, and left clear if the call stops early.
+ */
+int                                            /* error */
+xfs_bunmapi(
+       xfs_trans_t             *tp,            /* transaction pointer */
+       struct xfs_inode        *ip,            /* incore inode */
+       xfs_fileoff_t           bno,            /* starting offset to unmap */
+       xfs_filblks_t           len,            /* length to unmap in file */
+       int                     flags,          /* misc flags */
+       xfs_extnum_t            nexts,          /* number of extents max */
+       xfs_fsblock_t           *firstblock,    /* first allocated block
+                                                  controls a.g. for allocs */
+       xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
+       int                     *done)          /* set when unmap is done */
+{
+       xfs_btree_cur_t         *cur;           /* bmap btree cursor */
+       xfs_bmbt_irec_t         del;            /* extent being deleted */
+       int                     eof;            /* is deleting at eof */
+       xfs_bmbt_rec_host_t     *ep;            /* extent record pointer */
+       int                     error;          /* error return value */
+       xfs_extnum_t            extno;          /* extent number in list */
+       xfs_bmbt_irec_t         got;            /* current extent record */
+       xfs_ifork_t             *ifp;           /* inode fork pointer */
+       int                     isrt;           /* freeing in rt area */
+       xfs_extnum_t            lastx;          /* last extent index used */
+       int                     logflags;       /* transaction logging flags */
+       xfs_extlen_t            mod;            /* rt extent offset */
+       xfs_mount_t             *mp;            /* mount structure */
+       xfs_extnum_t            nextents;       /* number of file extents */
+       xfs_bmbt_irec_t         prev;           /* previous extent record */
+       xfs_fileoff_t           start;          /* first file offset deleted */
+       int                     tmp_logflags;   /* partial logging flags */
+       int                     wasdel;         /* was a delayed alloc extent */
+       int                     whichfork;      /* data or attribute fork */
+       xfs_fsblock_t           sum;
+
+       trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
+
+       whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+               XFS_ATTR_FORK : XFS_DATA_FORK;
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       if (unlikely(
+           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+               XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
+                                ip->i_mount);
+               return -EFSCORRUPTED;
+       }
+       mp = ip->i_mount;
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+       ASSERT(len > 0);
+       ASSERT(nexts >= 0);
+
+       if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+           (error = xfs_iread_extents(tp, ip, whichfork)))
+               return error;
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       if (nextents == 0) {
+               *done = 1;
+               return 0;
+       }
+       XFS_STATS_INC(xs_blk_unmap);
+       isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+       start = bno;
+       bno = start + len - 1;
+       ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+               &prev);
+
+       /*
+        * Check to see if the given block number is past the end of the
+        * file; back up to the last block if so.
+        */
+       if (eof) {
+               ep = xfs_iext_get_ext(ifp, --lastx);
+               xfs_bmbt_get_all(ep, &got);
+               bno = got.br_startoff + got.br_blockcount - 1;
+       }
+       logflags = 0;
+       if (ifp->if_flags & XFS_IFBROOT) {
+               ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+               cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+               cur->bc_private.b.firstblock = *firstblock;
+               cur->bc_private.b.flist = flist;
+               cur->bc_private.b.flags = 0;
+       } else
+               cur = NULL;
+
+       if (isrt) {
+               /*
+                * Synchronize by locking the bitmap inode.
+                */
+               xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+               xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+       }
+
+       extno = 0;
+       while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
+              (nexts == 0 || extno < nexts)) {
+               /*
+                * Is the found extent after a hole in which bno lives?
+                * Just back up to the previous extent, if so.
+                */
+               if (got.br_startoff > bno) {
+                       if (--lastx < 0)
+                               break;
+                       ep = xfs_iext_get_ext(ifp, lastx);
+                       xfs_bmbt_get_all(ep, &got);
+               }
+               /*
+                * Is the last block of this extent before the range
+                * we're supposed to delete?  If so, we're done.
+                */
+               bno = XFS_FILEOFF_MIN(bno,
+                       got.br_startoff + got.br_blockcount - 1);
+               if (bno < start)
+                       break;
+               /*
+                * Then deal with the (possibly delayed) allocated space
+                * we found.
+                */
+               ASSERT(ep != NULL);
+               del = got;
+               wasdel = isnullstartblock(del.br_startblock);
+               if (got.br_startoff < start) {
+                       del.br_startoff = start;
+                       del.br_blockcount -= start - got.br_startoff;
+                       if (!wasdel)
+                               del.br_startblock += start - got.br_startoff;
+               }
+               if (del.br_startoff + del.br_blockcount > bno + 1)
+                       del.br_blockcount = bno + 1 - del.br_startoff;
+               sum = del.br_startblock + del.br_blockcount;
+               if (isrt &&
+                   (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
+                       /*
+                        * Realtime extent not lined up at the end.
+                        * The extent could have been split into written
+                        * and unwritten pieces, or we could just be
+                        * unmapping part of it.  But we can't really
+                        * get rid of part of a realtime extent.
+                        */
+                       if (del.br_state == XFS_EXT_UNWRITTEN ||
+                           !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+                               /*
+                                * This piece is unwritten, or we're not
+                                * using unwritten extents.  Skip over it.
+                                */
+                               ASSERT(bno >= mod);
+                               bno -= mod > del.br_blockcount ?
+                                       del.br_blockcount : mod;
+                               if (bno < got.br_startoff) {
+                                       if (--lastx >= 0)
+                                               xfs_bmbt_get_all(xfs_iext_get_ext(
+                                                       ifp, lastx), &got);
+                               }
+                               continue;
+                       }
+                       /*
+                        * It's written, turn it unwritten.
+                        * This is better than zeroing it.
+                        */
+                       ASSERT(del.br_state == XFS_EXT_NORM);
+                       ASSERT(xfs_trans_get_block_res(tp) > 0);
+                       /*
+                        * If this spans a realtime extent boundary,
+                        * chop it back to the start of the one we end at.
+                        */
+                       if (del.br_blockcount > mod) {
+                               del.br_startoff += del.br_blockcount - mod;
+                               del.br_startblock += del.br_blockcount - mod;
+                               del.br_blockcount = mod;
+                       }
+                       del.br_state = XFS_EXT_UNWRITTEN;
+                       error = xfs_bmap_add_extent_unwritten_real(tp, ip,
+                                       &lastx, &cur, &del, firstblock, flist,
+                                       &logflags);
+                       if (error)
+                               goto error0;
+                       goto nodelete;
+               }
+               if (isrt &&
+                   (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) {
+                       /*
+                        * Realtime extent is lined up at the end but not
+                        * at the front.  We'll get rid of full extents if
+                        * we can.
+                        */
+                       mod = mp->m_sb.sb_rextsize - mod;
+                       if (del.br_blockcount > mod) {
+                               del.br_blockcount -= mod;
+                               del.br_startoff += mod;
+                               del.br_startblock += mod;
+                       } else if ((del.br_startoff == start &&
+                                   (del.br_state == XFS_EXT_UNWRITTEN ||
+                                    xfs_trans_get_block_res(tp) == 0)) ||
+                                  !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+                               /*
+                                * Can't make it unwritten.  There isn't
+                                * a full extent here so just skip it.
+                                */
+                               ASSERT(bno >= del.br_blockcount);
+                               bno -= del.br_blockcount;
+                               if (got.br_startoff > bno) {
+                                       if (--lastx >= 0) {
+                                               ep = xfs_iext_get_ext(ifp,
+                                                                     lastx);
+                                               xfs_bmbt_get_all(ep, &got);
+                                       }
+                               }
+                               continue;
+                       } else if (del.br_state == XFS_EXT_UNWRITTEN) {
+                               /*
+                                * This one is already unwritten.
+                                * It must have a written left neighbor.
+                                * Unwrite the killed part of that one and
+                                * try again.
+                                */
+                               ASSERT(lastx > 0);
+                               xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+                                               lastx - 1), &prev);
+                               ASSERT(prev.br_state == XFS_EXT_NORM);
+                               ASSERT(!isnullstartblock(prev.br_startblock));
+                               ASSERT(del.br_startblock ==
+                                      prev.br_startblock + prev.br_blockcount);
+                               if (prev.br_startoff < start) {
+                                       mod = start - prev.br_startoff;
+                                       prev.br_blockcount -= mod;
+                                       prev.br_startblock += mod;
+                                       prev.br_startoff = start;
+                               }
+                               prev.br_state = XFS_EXT_UNWRITTEN;
+                               lastx--;
+                               error = xfs_bmap_add_extent_unwritten_real(tp,
+                                               ip, &lastx, &cur, &prev,
+                                               firstblock, flist, &logflags);
+                               if (error)
+                                       goto error0;
+                               goto nodelete;
+                       } else {
+                               ASSERT(del.br_state == XFS_EXT_NORM);
+                               del.br_state = XFS_EXT_UNWRITTEN;
+                               error = xfs_bmap_add_extent_unwritten_real(tp,
+                                               ip, &lastx, &cur, &del,
+                                               firstblock, flist, &logflags);
+                               if (error)
+                                       goto error0;
+                               goto nodelete;
+                       }
+               }
+               if (wasdel) {
+                       ASSERT(startblockval(del.br_startblock) > 0);
+                       /* Update realtime/data freespace, unreserve quota */
+                       if (isrt) {
+                               xfs_filblks_t rtexts;
+
+                               rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
+                               do_div(rtexts, mp->m_sb.sb_rextsize);
+                               xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
+                                               (int64_t)rtexts, 0);
+                               (void)xfs_trans_reserve_quota_nblks(NULL,
+                                       ip, -((long)del.br_blockcount), 0,
+                                       XFS_QMOPT_RES_RTBLKS);
+                       } else {
+                               xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+                                               (int64_t)del.br_blockcount, 0);
+                               (void)xfs_trans_reserve_quota_nblks(NULL,
+                                       ip, -((long)del.br_blockcount), 0,
+                                       XFS_QMOPT_RES_REGBLKS);
+                       }
+                       ip->i_delayed_blks -= del.br_blockcount;
+                       if (cur)
+                               cur->bc_private.b.flags |=
+                                       XFS_BTCUR_BPRV_WASDEL;
+               } else if (cur)
+                       cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
+               /*
+                * If it's the case where the directory code is running
+                * with no block reservation, and the deleted block is in
+                * the middle of its extent, and the resulting insert
+                * of an extent would cause transformation to btree format,
+                * then reject it.  The calling code will then swap
+                * blocks around instead.
+                * We have to do this now, rather than waiting for the
+                * conversion to btree format, since the transaction
+                * will be dirty.
+                */
+               if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
+                   XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+                   XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
+                       XFS_IFORK_MAXEXT(ip, whichfork) &&
+                   del.br_startoff > got.br_startoff &&
+                   del.br_startoff + del.br_blockcount <
+                   got.br_startoff + got.br_blockcount) {
+                       error = -ENOSPC;
+                       goto error0;
+               }
+               error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
+                               &tmp_logflags, whichfork);
+               logflags |= tmp_logflags;
+               if (error)
+                       goto error0;
+               bno = del.br_startoff - 1;
+nodelete:
+               /*
+                * If not done go on to the next (previous) record.
+                */
+               if (bno != (xfs_fileoff_t)-1 && bno >= start) {
+                       if (lastx >= 0) {
+                               ep = xfs_iext_get_ext(ifp, lastx);
+                               if (xfs_bmbt_get_startoff(ep) > bno) {
+                                       if (--lastx >= 0)
+                                               ep = xfs_iext_get_ext(ifp,
+                                                                     lastx);
+                               }
+                               xfs_bmbt_get_all(ep, &got);
+                       }
+                       extno++;
+               }
+       }
+       *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
+
+       /*
+        * Convert to a btree if necessary.
+        */
+       if (xfs_bmap_needs_btree(ip, whichfork)) {
+               ASSERT(cur == NULL);
+               error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
+                       &cur, 0, &tmp_logflags, whichfork);
+               logflags |= tmp_logflags;
+               if (error)
+                       goto error0;
+       }
+       /*
+        * transform from btree to extents, give it cur
+        */
+       else if (xfs_bmap_wants_extents(ip, whichfork)) {
+               ASSERT(cur != NULL);
+               error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+                       whichfork);
+               logflags |= tmp_logflags;
+               if (error)
+                       goto error0;
+       }
+       /*
+        * transform from extents to local?
+        */
+       error = 0;
+error0:
+       /*
+        * Log everything.  Do this after conversion, there's no point in
+        * logging the extent records if we've converted to btree format.
+        */
+       if ((logflags & xfs_ilog_fext(whichfork)) &&
+           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+               logflags &= ~xfs_ilog_fext(whichfork);
+       else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
+                XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+               logflags &= ~xfs_ilog_fbroot(whichfork);
+       /*
+        * Log inode even in the error case, if the transaction
+        * is dirty we'll need to shut down the filesystem.
+        */
+       if (logflags)
+               xfs_trans_log_inode(tp, ip, logflags);
+       if (cur) {
+               if (!error) {
+                       *firstblock = cur->bc_private.b.firstblock;
+                       cur->bc_private.b.allocated = 0;
+               }
+               xfs_btree_del_cursor(cur,
+                       error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+       }
+       return error;
+}
+
+/*
+ * Shift extent records to the left to cover a hole.
+ *
+ * The maximum number of extents to be shifted in a single operation
+ * is @num_exts, and @current_ext keeps track of the current extent
+ * index we have shifted. @offset_shift_fsb is the length by which each
+ * extent is shifted. If there is no hole to shift the extents
+ * into, this is considered an invalid operation and we abort immediately.
+ */
+int
+xfs_bmap_shift_extents(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       int                     *done,
+       xfs_fileoff_t           start_fsb,
+       xfs_fileoff_t           offset_shift_fsb,
+       xfs_extnum_t            *current_ext,
+       xfs_fsblock_t           *firstblock,
+       struct xfs_bmap_free    *flist,
+       int                     num_exts)
+{
+       struct xfs_btree_cur            *cur;
+       struct xfs_bmbt_rec_host        *gotp;
+       struct xfs_bmbt_irec            got;
+       struct xfs_bmbt_irec            left;
+       struct xfs_mount                *mp = ip->i_mount;
+       struct xfs_ifork                *ifp;
+       xfs_extnum_t                    nexts = 0;
+       xfs_fileoff_t                   startoff;
+       int                             error = 0;
+       int                             i;
+       int                             whichfork = XFS_DATA_FORK;
+       int                             logflags;
+       xfs_filblks_t                   blockcount = 0;
+       int                             total_extents;
+
+       if (unlikely(XFS_TEST_ERROR(
+           (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+            XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+            mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+               XFS_ERROR_REPORT("xfs_bmap_shift_extents",
+                                XFS_ERRLEVEL_LOW, mp);
+               return -EFSCORRUPTED;
+       }
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+
+       ASSERT(current_ext != NULL);
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+               /* Read in all the extents */
+               error = xfs_iread_extents(tp, ip, whichfork);
+               if (error)
+                       return error;
+       }
+
+       /*
+        * If *current_ext is 0, we need to look up the extent from
+        * which we start shifting and store it in gotp.
+        */
+       if (!*current_ext) {
+               gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
+               /*
+                * gotp can be NULL in 2 cases: 1) if there are no extents
+                * or 2) start_fsb lies in a hole beyond which there are
+                * no extents. Either way, we are done.
+                */
+               if (!gotp) {
+                       *done = 1;
+                       return 0;
+               }
+       }
+
+       /* We are going to change the core inode */
+       logflags = XFS_ILOG_CORE;
+       if (ifp->if_flags & XFS_IFBROOT) {
+               cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+               cur->bc_private.b.firstblock = *firstblock;
+               cur->bc_private.b.flist = flist;
+               cur->bc_private.b.flags = 0;
+       } else {
+               cur = NULL;
+               logflags |= XFS_ILOG_DEXT;
+       }
+
+       /*
+        * There may be delalloc extents in the data fork before the range
+        * we are collapsing out, so we cannot use the count of real extents
+        * here.  Instead we have to calculate it from the incore fork.
+        */
+       total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+       while (nexts++ < num_exts && *current_ext < total_extents) {
+
+               gotp = xfs_iext_get_ext(ifp, *current_ext);
+               xfs_bmbt_get_all(gotp, &got);
+               startoff = got.br_startoff - offset_shift_fsb;
+
+               /*
+                * Before shifting an extent into the hole, make sure that
+                * the hole is large enough to accommodate the shift.
+                */
+               if (*current_ext) {
+                       xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+                                               *current_ext - 1), &left);
+
+                       if (startoff < left.br_startoff + left.br_blockcount)
+                               error = -EINVAL;
+               } else if (offset_shift_fsb > got.br_startoff) {
+                       /*
+                        * When the first extent is shifted, offset_shift_fsb
+                        * must not be greater than the starting offset of
+                        * the first extent.
+                        */
+                       error = -EINVAL;
+               }
+
+               if (error)
+                       goto del_cursor;
+
+               if (cur) {
+                       error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+                                                  got.br_startblock,
+                                                  got.br_blockcount,
+                                                  &i);
+                       if (error)
+                               goto del_cursor;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+               }
+
+               /* Check if we can merge 2 adjacent extents */
+               if (*current_ext &&
+                   left.br_startoff + left.br_blockcount == startoff &&
+                   left.br_startblock + left.br_blockcount ==
+                               got.br_startblock &&
+                   left.br_state == got.br_state &&
+                   left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
+                       blockcount = left.br_blockcount +
+                               got.br_blockcount;
+                       xfs_iext_remove(ip, *current_ext, 1, 0);
+                       if (cur) {
+                               error = xfs_btree_delete(cur, &i);
+                               if (error)
+                                       goto del_cursor;
+                               XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+                       }
+                       XFS_IFORK_NEXT_SET(ip, whichfork,
+                               XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+                       gotp = xfs_iext_get_ext(ifp, --*current_ext);
+                       xfs_bmbt_get_all(gotp, &got);
+
+                       /* Make cursor point to the extent we will update */
+                       if (cur) {
+                               error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+                                                          got.br_startblock,
+                                                          got.br_blockcount,
+                                                          &i);
+                               if (error)
+                                       goto del_cursor;
+                               XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+                       }
+
+                       xfs_bmbt_set_blockcount(gotp, blockcount);
+                       got.br_blockcount = blockcount;
+               } else {
+                       /* We have to update the startoff */
+                       xfs_bmbt_set_startoff(gotp, startoff);
+                       got.br_startoff = startoff;
+               }
+
+               if (cur) {
+                       error = xfs_bmbt_update(cur, got.br_startoff,
+                                               got.br_startblock,
+                                               got.br_blockcount,
+                                               got.br_state);
+                       if (error)
+                               goto del_cursor;
+               }
+
+               (*current_ext)++;
+               total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+       }
+
+       /* Check if we are done */
+       if (*current_ext == total_extents)
+               *done = 1;
+
+del_cursor:
+       if (cur)
+               xfs_btree_del_cursor(cur,
+                       error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+       xfs_trans_log_inode(tp, ip, logflags);
+       return error;
+}
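+
+/*
+ * A minimal usage sketch (illustrative only; the transaction setup and
+ * commit steps are assumed, not shown in this patch): a caller collapsing
+ * a range would drive the shift one small transaction at a time until
+ * *done is set:
+ *
+ *     xfs_extnum_t    current_ext = 0;
+ *     int             done = 0;
+ *
+ *     while (!done) {
+ *             ... allocate/reserve a transaction tp, init free_list ...
+ *             error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
+ *                             offset_shift_fsb, &current_ext, &firstblock,
+ *                             &free_list, XFS_BMAP_MAX_SHIFT_EXTENTS);
+ *             ... finish free_list and commit tp ...
+ *     }
+ */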
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
new file mode 100644 (file)
index 0000000..b879ca5
--- /dev/null
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BMAP_H__
+#define        __XFS_BMAP_H__
+
+struct getbmap;
+struct xfs_bmbt_irec;
+struct xfs_ifork;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+extern kmem_zone_t     *xfs_bmap_free_item_zone;
+
+/*
+ * List of extents to be freed "later".
+ * The list is kept sorted on xbf_startblock.
+ */
+typedef struct xfs_bmap_free_item
+{
+       xfs_fsblock_t           xbfi_startblock;/* starting fs block number */
+       xfs_extlen_t            xbfi_blockcount;/* number of blocks in extent */
+       struct xfs_bmap_free_item *xbfi_next;   /* link to next entry */
+} xfs_bmap_free_item_t;
+
+/*
+ * Header for free extent list.
+ *
+ * xbf_low is used by the allocator to activate the lowspace algorithm -
+ * when free space is running low the extent allocator may choose to
+ * allocate an extent from an AG without leaving sufficient space for
+ * a btree split when inserting the new extent.  In this case the allocator
+ * will enable the lowspace algorithm which is supposed to allow further
+ * allocations (such as btree splits and newroots) to allocate from
+ * sequential AGs.  In order to avoid locking AGs out of order the lowspace
+ * algorithm will start searching for free space from AG 0.  If the correct
+ * transaction reservations have been made then this algorithm will eventually
+ * find all the space it needs.
+ */
+typedef        struct xfs_bmap_free
+{
+       xfs_bmap_free_item_t    *xbf_first;     /* list of to-be-free extents */
+       int                     xbf_count;      /* count of items on list */
+       int                     xbf_low;        /* alloc in low mode */
+} xfs_bmap_free_t;
+
+#define        XFS_BMAP_MAX_NMAP       4
+
+/*
+ * Flags for xfs_bmapi_*
+ */
+#define XFS_BMAPI_ENTIRE       0x001   /* return entire extent, not trimmed */
+#define XFS_BMAPI_METADATA     0x002   /* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK     0x004   /* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC     0x008   /* preallocation op: unwritten space */
+#define XFS_BMAPI_IGSTATE      0x010   /* Ignore state - */
+                                       /* combine contig. space */
+#define XFS_BMAPI_CONTIG       0x020   /* must allocate only one extent */
+/*
+ * Unwritten extent conversion - this needs write cache flushing and no
+ * additional allocation alignments.  When specified with XFS_BMAPI_PREALLOC
+ * it converts from written to unwritten, otherwise it converts from
+ * unwritten to written.
+ */
+#define XFS_BMAPI_CONVERT      0x040
+
+#define XFS_BMAPI_FLAGS \
+       { XFS_BMAPI_ENTIRE,     "ENTIRE" }, \
+       { XFS_BMAPI_METADATA,   "METADATA" }, \
+       { XFS_BMAPI_ATTRFORK,   "ATTRFORK" }, \
+       { XFS_BMAPI_PREALLOC,   "PREALLOC" }, \
+       { XFS_BMAPI_IGSTATE,    "IGSTATE" }, \
+       { XFS_BMAPI_CONTIG,     "CONTIG" }, \
+       { XFS_BMAPI_CONVERT,    "CONVERT" }
+
+static inline int xfs_bmapi_aflag(int w)
+{
+       return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
+}
+
+/*
+ * Special values for xfs_bmbt_irec_t br_startblock field.
+ */
+#define        DELAYSTARTBLOCK         ((xfs_fsblock_t)-1LL)
+#define        HOLESTARTBLOCK          ((xfs_fsblock_t)-2LL)
+
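+/*
+ * Illustrative check (hypothetical caller): mappings returned by
+ * xfs_bmapi_read() carry these sentinels in br_startblock, so a reader
+ * can tell holes and delayed allocations apart from real extents:
+ *
+ *     if (mval.br_startblock == HOLESTARTBLOCK)
+ *             ... no blocks are allocated over this range ...
+ *     else if (mval.br_startblock == DELAYSTARTBLOCK)
+ *             ... delalloc reservation, no real blocks assigned yet ...
+ */
+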
+static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
+{
+       flp->xbf_first = NULL;
+       flp->xbf_count = 0;
+       flp->xbf_low = 0;
+       *fbp = NULLFSBLOCK;
+}
+
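+/*
+ * A minimal sketch of the free-list protocol, assuming a caller that
+ * already holds a transaction (tp) and a locked inode (ip); the commit
+ * sequence is the caller's business and is not shown:
+ *
+ *     xfs_bmap_free_t         free_list;
+ *     xfs_fsblock_t           firstblock;
+ *
+ *     xfs_bmap_init(&free_list, &firstblock);
+ *     error = xfs_bmapi_write(tp, ip, bno, len, flags, &firstblock,
+ *                     total, mval, &nmap, &free_list);
+ *     if (error)
+ *             xfs_bmap_cancel(&free_list);
+ */
+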
+/*
+ * Flags for xfs_bmap_add_extent*.
+ */
+#define BMAP_LEFT_CONTIG       (1 << 0)
+#define BMAP_RIGHT_CONTIG      (1 << 1)
+#define BMAP_LEFT_FILLING      (1 << 2)
+#define BMAP_RIGHT_FILLING     (1 << 3)
+#define BMAP_LEFT_DELAY                (1 << 4)
+#define BMAP_RIGHT_DELAY       (1 << 5)
+#define BMAP_LEFT_VALID                (1 << 6)
+#define BMAP_RIGHT_VALID       (1 << 7)
+#define BMAP_ATTRFORK          (1 << 8)
+
+#define XFS_BMAP_EXT_FLAGS \
+       { BMAP_LEFT_CONTIG,     "LC" }, \
+       { BMAP_RIGHT_CONTIG,    "RC" }, \
+       { BMAP_LEFT_FILLING,    "LF" }, \
+       { BMAP_RIGHT_FILLING,   "RF" }, \
+       { BMAP_ATTRFORK,        "ATTR" }
+
+/*
+ * This macro determines how many extents will be shifted in one write
+ * transaction.  We could require two splits, one for the extent move
+ * and one for the extent merge, so it is proper that only one extent
+ * is shifted inside a write transaction at a time.
+ */
+#define XFS_BMAP_MAX_SHIFT_EXTENTS     1
+
+#ifdef DEBUG
+void   xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
+               int whichfork, unsigned long caller_ip);
+#define        XFS_BMAP_TRACE_EXLIST(ip,c,w)   \
+       xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
+#else
+#define        XFS_BMAP_TRACE_EXLIST(ip,c,w)
+#endif
+
+int    xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+void   xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
+void   xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
+               struct xfs_bmap_free *flist, struct xfs_mount *mp);
+void   xfs_bmap_cancel(struct xfs_bmap_free *flist);
+void   xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
+int    xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
+               xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
+int    xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
+               xfs_fileoff_t *last_block, int whichfork);
+int    xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
+               int whichfork);
+int    xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
+int    xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+               int whichfork);
+int    xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
+               xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+               int *nmap, int flags);
+int    xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
+               xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+               int *nmap, int flags);
+int    xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
+               xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+               xfs_fsblock_t *firstblock, xfs_extlen_t total,
+               struct xfs_bmbt_irec *mval, int *nmap,
+               struct xfs_bmap_free *flist);
+int    xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+               xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+               xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
+               struct xfs_bmap_free *flist, int *done);
+int    xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
+               xfs_extnum_t num);
+uint   xfs_default_attroffset(struct xfs_inode *ip);
+int    xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+               int *done, xfs_fileoff_t start_fsb,
+               xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
+               xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
+               int num_exts);
+
+#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
new file mode 100644 (file)
index 0000000..fba7533
--- /dev/null
@@ -0,0 +1,886 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
+
+/*
+ * Determine the extent state.
+ */
+/* ARGSUSED */
+STATIC xfs_exntst_t
+xfs_extent_state(
+       xfs_filblks_t           blks,
+       int                     extent_flag)
+{
+       if (extent_flag) {
+               ASSERT(blks != 0);      /* saved for DMIG */
+               return XFS_EXT_UNWRITTEN;
+       }
+       return XFS_EXT_NORM;
+}
+
+/*
+ * Convert on-disk form of btree root to in-memory form.
+ */
+void
+xfs_bmdr_to_bmbt(
+       struct xfs_inode        *ip,
+       xfs_bmdr_block_t        *dblock,
+       int                     dblocklen,
+       struct xfs_btree_block  *rblock,
+       int                     rblocklen)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     dmxr;
+       xfs_bmbt_key_t          *fkp;
+       __be64                  *fpp;
+       xfs_bmbt_key_t          *tkp;
+       __be64                  *tpp;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb))
+               xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+                                XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+                                XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+       else
+               xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+                                XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+                                XFS_BTREE_LONG_PTRS);
+
+       rblock->bb_level = dblock->bb_level;
+       ASSERT(be16_to_cpu(rblock->bb_level) > 0);
+       rblock->bb_numrecs = dblock->bb_numrecs;
+       dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
+       fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+       tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+       fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+       tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+       dmxr = be16_to_cpu(dblock->bb_numrecs);
+       memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
+       memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
+}
+
+/*
+ * Convert a compressed bmap extent record to an uncompressed form.
+ * This code must be in sync with the routines xfs_bmbt_get_startoff,
+ * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
+ */
+STATIC void
+__xfs_bmbt_get_all(
+               __uint64_t l0,
+               __uint64_t l1,
+               xfs_bmbt_irec_t *s)
+{
+       int     ext_flag;
+       xfs_exntst_t st;
+
+       ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
+       s->br_startoff = ((xfs_fileoff_t)l0 &
+                          xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+       s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
+                          (((xfs_fsblock_t)l1) >> 21);
+       s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
+       /* This is xfs_extent_state() in-line */
+       if (ext_flag) {
+               ASSERT(s->br_blockcount != 0);  /* saved for DMIG */
+               st = XFS_EXT_UNWRITTEN;
+       } else
+               st = XFS_EXT_NORM;
+       s->br_state = st;
+}
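+
+/*
+ * For reference, the packed record layout implied by the shifts and
+ * masks above (bit 63 is the MSB of each 64-bit word):
+ *
+ *     l0: bit  63         extent flag (unwritten state)
+ *         bits 62..9      startoff, 54 bits
+ *         bits  8..0      startblock, high 9 bits
+ *     l1: bits 63..21     startblock, low 43 bits
+ *         bits 20..0      blockcount, 21 bits
+ */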
+
+void
+xfs_bmbt_get_all(
+       xfs_bmbt_rec_host_t *r,
+       xfs_bmbt_irec_t *s)
+{
+       __xfs_bmbt_get_all(r->l0, r->l1, s);
+}
+
+/*
+ * Extract the blockcount field from an in memory bmap extent record.
+ */
+xfs_filblks_t
+xfs_bmbt_get_blockcount(
+       xfs_bmbt_rec_host_t     *r)
+{
+       return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
+}
+
+/*
+ * Extract the startblock field from an in memory bmap extent record.
+ */
+xfs_fsblock_t
+xfs_bmbt_get_startblock(
+       xfs_bmbt_rec_host_t     *r)
+{
+       return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
+              (((xfs_fsblock_t)r->l1) >> 21);
+}
+
+/*
+ * Extract the startoff field from an in memory bmap extent record.
+ */
+xfs_fileoff_t
+xfs_bmbt_get_startoff(
+       xfs_bmbt_rec_host_t     *r)
+{
+       return ((xfs_fileoff_t)r->l0 &
+                xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+}
+
+xfs_exntst_t
+xfs_bmbt_get_state(
+       xfs_bmbt_rec_host_t     *r)
+{
+       int     ext_flag;
+
+       ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN));
+       return xfs_extent_state(xfs_bmbt_get_blockcount(r),
+                               ext_flag);
+}
+
+/*
+ * Extract the blockcount field from an on disk bmap extent record.
+ */
+xfs_filblks_t
+xfs_bmbt_disk_get_blockcount(
+       xfs_bmbt_rec_t  *r)
+{
+       return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
+}
+
+/*
+ * Extract the startoff field from a disk format bmap extent record.
+ */
+xfs_fileoff_t
+xfs_bmbt_disk_get_startoff(
+       xfs_bmbt_rec_t  *r)
+{
+       return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
+                xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+}
+
+/*
+ * Set all the fields in a bmap extent record from the arguments.
+ */
+void
+xfs_bmbt_set_allf(
+       xfs_bmbt_rec_host_t     *r,
+       xfs_fileoff_t           startoff,
+       xfs_fsblock_t           startblock,
+       xfs_filblks_t           blockcount,
+       xfs_exntst_t            state)
+{
+       int             extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
+
+       ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
+       ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+       ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+
+       ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+
+       r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+               ((xfs_bmbt_rec_base_t)startoff << 9) |
+               ((xfs_bmbt_rec_base_t)startblock >> 43);
+       r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
+               ((xfs_bmbt_rec_base_t)blockcount &
+               (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+}
+
+/*
+ * Set all the fields in a bmap extent record from the uncompressed form.
+ */
+void
+xfs_bmbt_set_all(
+       xfs_bmbt_rec_host_t *r,
+       xfs_bmbt_irec_t *s)
+{
+       xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock,
+                            s->br_blockcount, s->br_state);
+}
+
+/*
+ * Set all the fields in a disk format bmap extent record from the arguments.
+ */
+void
+xfs_bmbt_disk_set_allf(
+       xfs_bmbt_rec_t          *r,
+       xfs_fileoff_t           startoff,
+       xfs_fsblock_t           startblock,
+       xfs_filblks_t           blockcount,
+       xfs_exntst_t            state)
+{
+       int                     extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
+
+       ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
+       ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+       ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+       ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+
+       r->l0 = cpu_to_be64(
+               ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+                ((xfs_bmbt_rec_base_t)startoff << 9) |
+                ((xfs_bmbt_rec_base_t)startblock >> 43));
+       r->l1 = cpu_to_be64(
+               ((xfs_bmbt_rec_base_t)startblock << 21) |
+                ((xfs_bmbt_rec_base_t)blockcount &
+                 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
+}
+
+/*
+ * Set all the fields in a bmap extent record from the uncompressed form.
+ */
+STATIC void
+xfs_bmbt_disk_set_all(
+       xfs_bmbt_rec_t  *r,
+       xfs_bmbt_irec_t *s)
+{
+       xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock,
+                                 s->br_blockcount, s->br_state);
+}
+
+/*
+ * Set the blockcount field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_blockcount(
+       xfs_bmbt_rec_host_t *r,
+       xfs_filblks_t   v)
+{
+       ASSERT((v & xfs_mask64hi(43)) == 0);
+       r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) |
+                 (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21));
+}
+
+/*
+ * Set the startblock field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_startblock(
+       xfs_bmbt_rec_host_t *r,
+       xfs_fsblock_t   v)
+{
+       ASSERT((v & xfs_mask64hi(12)) == 0);
+       r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
+                 (xfs_bmbt_rec_base_t)(v >> 43);
+       r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
+                 (xfs_bmbt_rec_base_t)(v << 21);
+}
+
+/*
+ * Set the startoff field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_startoff(
+       xfs_bmbt_rec_host_t *r,
+       xfs_fileoff_t   v)
+{
+       ASSERT((v & xfs_mask64hi(9)) == 0);
+       r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) |
+               ((xfs_bmbt_rec_base_t)v << 9) |
+                 (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
+}
+
+/*
+ * Set the extent state field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_state(
+       xfs_bmbt_rec_host_t *r,
+       xfs_exntst_t    v)
+{
+       ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
+       if (v == XFS_EXT_NORM)
+               r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN);
+       else
+               r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN);
+}
+
+/*
+ * Convert in-memory form of btree root to on-disk form.
+ */
+void
+xfs_bmbt_to_bmdr(
+       struct xfs_mount        *mp,
+       struct xfs_btree_block  *rblock,
+       int                     rblocklen,
+       xfs_bmdr_block_t        *dblock,
+       int                     dblocklen)
+{
+       int                     dmxr;
+       xfs_bmbt_key_t          *fkp;
+       __be64                  *fpp;
+       xfs_bmbt_key_t          *tkp;
+       __be64                  *tpp;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
+               ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
+               ASSERT(rblock->bb_u.l.bb_blkno ==
+                      cpu_to_be64(XFS_BUF_DADDR_NULL));
+       } else
+               ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
+       ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
+       ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
+       ASSERT(rblock->bb_level != 0);
+       dblock->bb_level = rblock->bb_level;
+       dblock->bb_numrecs = rblock->bb_numrecs;
+       dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
+       fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+       tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+       fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+       tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+       dmxr = be16_to_cpu(dblock->bb_numrecs);
+       memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
+       memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
+}
+
+/*
+ * Check extent records, which have just been read, for any set bit in
+ * the extent flag field.  ASSERT on debug kernels, as this condition
+ * should not occur.  Return an error condition (1) if any flags are
+ * found, otherwise return 0.
+ */
+int
+xfs_check_nostate_extents(
+       xfs_ifork_t             *ifp,
+       xfs_extnum_t            idx,
+       xfs_extnum_t            num)
+{
+       for (; num > 0; num--, idx++) {
+               xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
+               if ((ep->l0 >>
+                    (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
+                       ASSERT(0);
+                       return 1;
+               }
+       }
+       return 0;
+}
+
+STATIC struct xfs_btree_cur *
+xfs_bmbt_dup_cursor(
+       struct xfs_btree_cur    *cur)
+{
+       struct xfs_btree_cur    *new;
+
+       new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
+                       cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+
+       /*
+        * Copy the firstblock, flist, and flags values,
+        * since init cursor doesn't get them.
+        */
+       new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
+       new->bc_private.b.flist = cur->bc_private.b.flist;
+       new->bc_private.b.flags = cur->bc_private.b.flags;
+
+       return new;
+}
+
+STATIC void
+xfs_bmbt_update_cursor(
+       struct xfs_btree_cur    *src,
+       struct xfs_btree_cur    *dst)
+{
+       ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
+              (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
+       ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
+
+       dst->bc_private.b.allocated += src->bc_private.b.allocated;
+       dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
+
+       src->bc_private.b.allocated = 0;
+}
+
+STATIC int
+xfs_bmbt_alloc_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *start,
+       union xfs_btree_ptr     *new,
+       int                     *stat)
+{
+       xfs_alloc_arg_t         args;           /* block allocation args */
+       int                     error;          /* error return value */
+
+       memset(&args, 0, sizeof(args));
+       args.tp = cur->bc_tp;
+       args.mp = cur->bc_mp;
+       args.fsbno = cur->bc_private.b.firstblock;
+       args.firstblock = args.fsbno;
+
+       if (args.fsbno == NULLFSBLOCK) {
+               args.fsbno = be64_to_cpu(start->l);
+               args.type = XFS_ALLOCTYPE_START_BNO;
+               /*
+                * Make sure there is sufficient room left in the AG to
+                * complete a full tree split for an extent insert.  If
+                * we are converting the middle part of an extent then
+                * we may need space for two tree splits.
+                *
+                * We are relying on the caller to make the correct block
+                * reservation for this operation to succeed.  If the
+                * reservation amount is insufficient then we may fail a
+                * block allocation here and corrupt the filesystem.
+                */
+               args.minleft = xfs_trans_get_block_res(args.tp);
+       } else if (cur->bc_private.b.flist->xbf_low) {
+               args.type = XFS_ALLOCTYPE_START_BNO;
+       } else {
+               args.type = XFS_ALLOCTYPE_NEAR_BNO;
+       }
+
+       args.minlen = args.maxlen = args.prod = 1;
+       args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+       if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
+               error = -ENOSPC;
+               goto error0;
+       }
+       error = xfs_alloc_vextent(&args);
+       if (error)
+               goto error0;
+
+       if (args.fsbno == NULLFSBLOCK && args.minleft) {
+               /*
+                * Could not find an AG with enough free space to satisfy
+                * a full btree split.  Try again without minleft and if
+                * successful activate the lowspace algorithm.
+                */
+               args.fsbno = 0;
+               args.type = XFS_ALLOCTYPE_FIRST_AG;
+               args.minleft = 0;
+               error = xfs_alloc_vextent(&args);
+               if (error)
+                       goto error0;
+               cur->bc_private.b.flist->xbf_low = 1;
+       }
+       if (args.fsbno == NULLFSBLOCK) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
+       }
+       ASSERT(args.len == 1);
+       cur->bc_private.b.firstblock = args.fsbno;
+       cur->bc_private.b.allocated++;
+       cur->bc_private.b.ip->i_d.di_nblocks++;
+       xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
+       xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
+                       XFS_TRANS_DQ_BCOUNT, 1L);
+
+       new->l = cpu_to_be64(args.fsbno);
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+ error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+STATIC int
+xfs_bmbt_free_block(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = cur->bc_mp;
+       struct xfs_inode        *ip = cur->bc_private.b.ip;
+       struct xfs_trans        *tp = cur->bc_tp;
+       xfs_fsblock_t           fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+
+       xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
+       ip->i_d.di_nblocks--;
+
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+       xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+       xfs_trans_binval(tp, bp);
+       return 0;
+}
+
+STATIC int
+xfs_bmbt_get_minrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       if (level == cur->bc_nlevels - 1) {
+               struct xfs_ifork        *ifp;
+
+               ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+                                   cur->bc_private.b.whichfork);
+
+               return xfs_bmbt_maxrecs(cur->bc_mp,
+                                       ifp->if_broot_bytes, level == 0) / 2;
+       }
+
+       return cur->bc_mp->m_bmap_dmnr[level != 0];
+}
+
+int
+xfs_bmbt_get_maxrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       if (level == cur->bc_nlevels - 1) {
+               struct xfs_ifork        *ifp;
+
+               ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+                                   cur->bc_private.b.whichfork);
+
+               return xfs_bmbt_maxrecs(cur->bc_mp,
+                                       ifp->if_broot_bytes, level == 0);
+       }
+
+       return cur->bc_mp->m_bmap_dmxr[level != 0];
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork
+ * so that we can resize the in-memory buffer to match it.  After a
+ * resize to the maximum size this function returns the same value
+ * as xfs_bmbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_bmbt_get_dmaxrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       if (level != cur->bc_nlevels - 1)
+               return cur->bc_mp->m_bmap_dmxr[level != 0];
+       return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0);
+}
+
+STATIC void
+xfs_bmbt_init_key_from_rec(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
+{
+       key->bmbt.br_startoff =
+               cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_key(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
+{
+       ASSERT(key->bmbt.br_startoff != 0);
+
+       xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
+                              0, 0, XFS_EXT_NORM);
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec)
+{
+       xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
+}
+
+STATIC void
+xfs_bmbt_init_ptr_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       ptr->l = 0;
+}
+
+STATIC __int64_t
+xfs_bmbt_key_diff(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key)
+{
+       return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
+                                     cur->bc_rec.b.br_startoff;
+}
+
+static bool
+xfs_bmbt_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+       unsigned int            level;
+
+       switch (block->bb_magic) {
+       case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+               if (!xfs_sb_version_hascrc(&mp->m_sb))
+                       return false;
+               if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid))
+                       return false;
+               if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn)
+                       return false;
+               /*
+                * XXX: need a better way of verifying the owner here. Right now
+                * just make sure there has been one set.
+                */
+               if (be64_to_cpu(block->bb_u.l.bb_owner) == 0)
+                       return false;
+               /* fall through */
+       case cpu_to_be32(XFS_BMAP_MAGIC):
+               break;
+       default:
+               return false;
+       }
+
+       /*
+        * numrecs and level verification.
+        *
+        * We don't know what fork we belong to, so just verify that the level
+        * is no greater than the maximum of the two.  Later checks will be
+        * more precise.
+        */
+       level = be16_to_cpu(block->bb_level);
+       if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
+               return false;
+       if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+               return false;
+
+       /* sibling pointer verification */
+       if (!block->bb_u.l.bb_leftsib ||
+           (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
+            !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))))
+               return false;
+       if (!block->bb_u.l.bb_rightsib ||
+           (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
+            !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))))
+               return false;
+
+       return true;
+}
+
+static void
+xfs_bmbt_read_verify(
+       struct xfs_buf  *bp)
+{
+       if (!xfs_btree_lblock_verify_crc(bp))
+               xfs_buf_ioerror(bp, -EFSBADCRC);
+       else if (!xfs_bmbt_verify(bp))
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+       if (bp->b_error) {
+               trace_xfs_btree_corrupt(bp, _RET_IP_);
+               xfs_verifier_error(bp);
+       }
+}
+
+static void
+xfs_bmbt_write_verify(
+       struct xfs_buf  *bp)
+{
+       if (!xfs_bmbt_verify(bp)) {
+               trace_xfs_btree_corrupt(bp, _RET_IP_);
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               return;
+       }
+       xfs_btree_lblock_calc_crc(bp);
+}
+
+const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+       .verify_read = xfs_bmbt_read_verify,
+       .verify_write = xfs_bmbt_write_verify,
+};
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_bmbt_keys_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *k1,
+       union xfs_btree_key     *k2)
+{
+       return be64_to_cpu(k1->bmbt.br_startoff) <
+               be64_to_cpu(k2->bmbt.br_startoff);
+}
+
+STATIC int
+xfs_bmbt_recs_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *r1,
+       union xfs_btree_rec     *r2)
+{
+       return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
+               xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
+               xfs_bmbt_disk_get_startoff(&r2->bmbt);
+}
+#endif /* DEBUG || XFS_WARN */
+
+static const struct xfs_btree_ops xfs_bmbt_ops = {
+       .rec_len                = sizeof(xfs_bmbt_rec_t),
+       .key_len                = sizeof(xfs_bmbt_key_t),
+
+       .dup_cursor             = xfs_bmbt_dup_cursor,
+       .update_cursor          = xfs_bmbt_update_cursor,
+       .alloc_block            = xfs_bmbt_alloc_block,
+       .free_block             = xfs_bmbt_free_block,
+       .get_maxrecs            = xfs_bmbt_get_maxrecs,
+       .get_minrecs            = xfs_bmbt_get_minrecs,
+       .get_dmaxrecs           = xfs_bmbt_get_dmaxrecs,
+       .init_key_from_rec      = xfs_bmbt_init_key_from_rec,
+       .init_rec_from_key      = xfs_bmbt_init_rec_from_key,
+       .init_rec_from_cur      = xfs_bmbt_init_rec_from_cur,
+       .init_ptr_from_cur      = xfs_bmbt_init_ptr_from_cur,
+       .key_diff               = xfs_bmbt_key_diff,
+       .buf_ops                = &xfs_bmbt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+       .keys_inorder           = xfs_bmbt_keys_inorder,
+       .recs_inorder           = xfs_bmbt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new bmap btree cursor.
+ */
+struct xfs_btree_cur *                         /* new bmap btree cursor */
+xfs_bmbt_init_cursor(
+       struct xfs_mount        *mp,            /* file system mount point */
+       struct xfs_trans        *tp,            /* transaction pointer */
+       struct xfs_inode        *ip,            /* inode owning the btree */
+       int                     whichfork)      /* data or attr fork */
+{
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+       struct xfs_btree_cur    *cur;
+
+       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+       cur->bc_tp = tp;
+       cur->bc_mp = mp;
+       cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+       cur->bc_btnum = XFS_BTNUM_BMAP;
+       cur->bc_blocklog = mp->m_sb.sb_blocklog;
+
+       cur->bc_ops = &xfs_bmbt_ops;
+       cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
+       if (xfs_sb_version_hascrc(&mp->m_sb))
+               cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+       cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
+       cur->bc_private.b.ip = ip;
+       cur->bc_private.b.firstblock = NULLFSBLOCK;
+       cur->bc_private.b.flist = NULL;
+       cur->bc_private.b.allocated = 0;
+       cur->bc_private.b.flags = 0;
+       cur->bc_private.b.whichfork = whichfork;
+
+       return cur;
+}
+
+/*
+ * Calculate number of records in a bmap btree block.
+ */
+int
+xfs_bmbt_maxrecs(
+       struct xfs_mount        *mp,
+       int                     blocklen,
+       int                     leaf)
+{
+       blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+
+       if (leaf)
+               return blocklen / sizeof(xfs_bmbt_rec_t);
+       return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+}
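+
+/*
+ * Worked example (illustrative): on a filesystem without CRCs, where the
+ * long-form block header is 24 bytes and records, keys and pointers are
+ * 16, 8 and 8 bytes, a 4096-byte block holds:
+ *
+ *     leaf:     (4096 - 24) / 16      = 254 records
+ *     non-leaf: (4096 - 24) / (8 + 8) = 254 key/pointer pairs
+ */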
+
+/*
+ * Calculate number of records in a bmap btree inode root.
+ */
+int
+xfs_bmdr_maxrecs(
+       int                     blocklen,
+       int                     leaf)
+{
+       blocklen -= sizeof(xfs_bmdr_block_t);
+
+       if (leaf)
+               return blocklen / sizeof(xfs_bmdr_rec_t);
+       return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
+}
+
+/*
+ * Change the owner of a btree format fork of the inode passed in.  Change it
+ * to the owner that is passed in so that we can change owners before or after
+ * we switch forks between inodes.  The operation that the caller is doing will
+ * determine whether it needs to change the owner before or after the switch.
+ *
+ * For demand paged transactional modification, the fork switch should be done
+ * after reading in all the blocks, modifying them and pinning them in the
+ * transaction. For modification when the buffers are already pinned in memory,
+ * the fork switch can be done before changing the owner as we won't need to
+ * validate the owner until the btree buffers are unpinned and writes can occur
+ * again.
+ *
+ * For recovery based ownership change, there is no transactional context and
+ * so a buffer list must be supplied so that we can record the buffers that we
+ * modified for the caller to issue IO on.
+ */
+int
+xfs_bmbt_change_owner(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       int                     whichfork,
+       xfs_ino_t               new_owner,
+       struct list_head        *buffer_list)
+{
+       struct xfs_btree_cur    *cur;
+       int                     error;
+
+       ASSERT(tp || buffer_list);
+       ASSERT(!(tp && buffer_list));
+       if (whichfork == XFS_DATA_FORK)
+               ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
+       else
+               ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
+
+       cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
+       if (!cur)
+               return -ENOMEM;
+
+       error = xfs_btree_change_owner(cur, new_owner, buffer_list);
+       xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+       return error;
+}
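+
+/*
+ * Illustrative call patterns for the two modes described above (the
+ * surrounding setup is assumed and not part of this file):
+ *
+ *     transactional:  xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
+ *                             new_owner, NULL);
+ *     log recovery:   xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
+ *                             new_owner, &buffer_list);
+ */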
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
new file mode 100644 (file)
index 0000000..819a8a4
--- /dev/null
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BMAP_BTREE_H__
+#define __XFS_BMAP_BTREE_H__
+
+struct xfs_btree_cur;
+struct xfs_btree_block;
+struct xfs_mount;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Extent state and extent format macros.
+ */
+#define XFS_EXTFMT_INODE(x)    \
+       (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \
+               XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
+#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN)
+
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+#define XFS_BMBT_BLOCK_LEN(mp) \
+       (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+               XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
+
+#define XFS_BMBT_REC_ADDR(mp, block, index) \
+       ((xfs_bmbt_rec_t *) \
+               ((char *)(block) + \
+                XFS_BMBT_BLOCK_LEN(mp) + \
+                ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
+
+#define XFS_BMBT_KEY_ADDR(mp, block, index) \
+       ((xfs_bmbt_key_t *) \
+               ((char *)(block) + \
+                XFS_BMBT_BLOCK_LEN(mp) + \
+                ((index) - 1) * sizeof(xfs_bmbt_key_t)))
+
+#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
+       ((xfs_bmbt_ptr_t *) \
+               ((char *)(block) + \
+                XFS_BMBT_BLOCK_LEN(mp) + \
+                (maxrecs) * sizeof(xfs_bmbt_key_t) + \
+                ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
+
+#define XFS_BMDR_REC_ADDR(block, index) \
+       ((xfs_bmdr_rec_t *) \
+               ((char *)(block) + \
+                sizeof(struct xfs_bmdr_block) + \
+                ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
+
+#define XFS_BMDR_KEY_ADDR(block, index) \
+       ((xfs_bmdr_key_t *) \
+               ((char *)(block) + \
+                sizeof(struct xfs_bmdr_block) + \
+                ((index) - 1) * sizeof(xfs_bmdr_key_t)))
+
+#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
+       ((xfs_bmdr_ptr_t *) \
+               ((char *)(block) + \
+                sizeof(struct xfs_bmdr_block) + \
+                (maxrecs) * sizeof(xfs_bmdr_key_t) + \
+                ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
+
+/*
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
+       XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
+
+#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
+       (int)(XFS_BMBT_BLOCK_LEN(mp) + \
+              ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+
+#define XFS_BMAP_BROOT_SPACE(mp, bb) \
+       (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))
+#define XFS_BMDR_SPACE_CALC(nrecs) \
+       (int)(sizeof(xfs_bmdr_block_t) + \
+              ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+#define XFS_BMAP_BMDR_SPACE(bb) \
+       (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
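+
+/*
+ * Worked example (illustrative): with sizeof(xfs_bmdr_block_t) == 4 and
+ * 16 bytes per key/pointer pair, a 3-record dinode root needs
+ * XFS_BMDR_SPACE_CALC(3) == 4 + 3 * 16 == 52 bytes.
+ */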
+
+/*
+ * Maximum number of bmap btree levels.
+ */
+#define XFS_BM_MAXLEVELS(mp,w)         ((mp)->m_bm_maxlevels[(w)])
+
+/*
+ * Prototypes for xfs_bmap.c to call.
+ */
+extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
+                       struct xfs_btree_block *, int);
+extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
+extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
+extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
+extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
+extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
+
+extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
+extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
+
+extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
+extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
+                       xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
+extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v);
+extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
+extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
+extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
+
+extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
+                       xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
+
+extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
+                       xfs_bmdr_block_t *, int);
+
+extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
+extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
+extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+
+extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
+                                int whichfork, xfs_ino_t new_owner,
+                                struct list_head *buffer_list);
+
+extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
+               struct xfs_trans *, struct xfs_inode *, int);
+
+#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
new file mode 100644 (file)
index 0000000..8fe6a93
--- /dev/null
@@ -0,0 +1,4069 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_buf_item.h"
+#include "xfs_btree.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_alloc.h"
+
+/*
+ * Cursor allocation zone.
+ */
+kmem_zone_t    *xfs_btree_cur_zone;
+
+/*
+ * Btree magic numbers.
+ */
+static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
+       { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
+         XFS_FIBT_MAGIC },
+       { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
+         XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
+};
+#define xfs_btree_magic(cur) \
+       xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
+
+STATIC int                             /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lblock(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_btree_block  *block, /* btree long form block pointer */
+       int                     level,  /* level of the btree block */
+       struct xfs_buf          *bp)    /* buffer for block, if any */
+{
+       int                     lblock_ok = 1; /* block passes checks */
+       struct xfs_mount        *mp;    /* file system mount point */
+
+       mp = cur->bc_mp;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               lblock_ok = lblock_ok &&
+                       uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
+                       block->bb_u.l.bb_blkno == cpu_to_be64(
+                               bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+       }
+
+       lblock_ok = lblock_ok &&
+               be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+               be16_to_cpu(block->bb_level) == level &&
+               be16_to_cpu(block->bb_numrecs) <=
+                       cur->bc_ops->get_maxrecs(cur, level) &&
+               block->bb_u.l.bb_leftsib &&
+               (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
+                XFS_FSB_SANITY_CHECK(mp,
+                       be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+               block->bb_u.l.bb_rightsib &&
+               (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
+                XFS_FSB_SANITY_CHECK(mp,
+                       be64_to_cpu(block->bb_u.l.bb_rightsib)));
+
+       if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
+                       XFS_ERRTAG_BTREE_CHECK_LBLOCK,
+                       XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
+               if (bp)
+                       trace_xfs_btree_corrupt(bp, _RET_IP_);
+               XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+               return -EFSCORRUPTED;
+       }
+       return 0;
+}
+
+STATIC int                             /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sblock(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_btree_block  *block, /* btree short form block pointer */
+       int                     level,  /* level of the btree block */
+       struct xfs_buf          *bp)    /* buffer containing block */
+{
+       struct xfs_mount        *mp;    /* file system mount point */
+       struct xfs_buf          *agbp;  /* buffer for ag. freespace struct */
+       struct xfs_agf          *agf;   /* ag. freespace structure */
+       xfs_agblock_t           agflen; /* native ag. freespace length */
+       int                     sblock_ok = 1; /* block passes checks */
+
+       mp = cur->bc_mp;
+       agbp = cur->bc_private.a.agbp;
+       agf = XFS_BUF_TO_AGF(agbp);
+       agflen = be32_to_cpu(agf->agf_length);
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               sblock_ok = sblock_ok &&
+                       uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
+                       block->bb_u.s.bb_blkno == cpu_to_be64(
+                               bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+       }
+
+       sblock_ok = sblock_ok &&
+               be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+               be16_to_cpu(block->bb_level) == level &&
+               be16_to_cpu(block->bb_numrecs) <=
+                       cur->bc_ops->get_maxrecs(cur, level) &&
+               (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+                be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
+               block->bb_u.s.bb_leftsib &&
+               (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+                be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
+               block->bb_u.s.bb_rightsib;
+
+       if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
+                       XFS_ERRTAG_BTREE_CHECK_SBLOCK,
+                       XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
+               if (bp)
+                       trace_xfs_btree_corrupt(bp, _RET_IP_);
+               XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+               return -EFSCORRUPTED;
+       }
+       return 0;
+}
+
+/*
+ * Debug routine: check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_btree_block  *block, /* generic btree block pointer */
+       int                     level,  /* level of the btree block */
+       struct xfs_buf          *bp)    /* buffer containing block, if any */
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               return xfs_btree_check_lblock(cur, block, level, bp);
+       else
+               return xfs_btree_check_sblock(cur, block, level, bp);
+}
+
+/*
+ * Check that (long) pointer is ok.
+ */
+int                                    /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_fsblock_t           bno,    /* btree block disk address */
+       int                     level)  /* btree block level */
+{
+       XFS_WANT_CORRUPTED_RETURN(
+               level > 0 &&
+               bno != NULLFSBLOCK &&
+               XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
+       return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that (short) pointer is ok.
+ */
+STATIC int                             /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sptr(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* btree block disk address */
+       int                     level)  /* btree block level */
+{
+       xfs_agblock_t           agblocks = cur->bc_mp->m_sb.sb_agblocks;
+
+       XFS_WANT_CORRUPTED_RETURN(
+               level > 0 &&
+               bno != NULLAGBLOCK &&
+               bno != 0 &&
+               bno < agblocks);
+       return 0;
+}
+
+/*
+ * Check that block ptr is ok.
+ */
+STATIC int                             /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_ptr(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       union xfs_btree_ptr     *ptr,   /* btree block disk address */
+       int                     index,  /* offset from ptr to check */
+       int                     level)  /* btree block level */
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               return xfs_btree_check_lptr(cur,
+                               be64_to_cpu((&ptr->l)[index]), level);
+       } else {
+               return xfs_btree_check_sptr(cur,
+                               be32_to_cpu((&ptr->s)[index]), level);
+       }
+}
+#endif /* DEBUG */
+
+/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * long-form btree header.
+ *
+ * Prior to calculating the CRC, pull the LSN out of the buffer log item and
+ * put it into the buffer so recovery knows what the last modification was
+ * that made it to disk.
+ */
+void
+xfs_btree_lblock_calc_crc(
+       struct xfs_buf          *bp)
+{
+       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+       if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+               return;
+       if (bip)
+               block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+       xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_lblock_verify_crc(
+       struct xfs_buf          *bp)
+{
+       if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+               return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+
+       return true;
+}
+
+/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * short-form btree header.
+ *
+ * Prior to calculating the CRC, pull the LSN out of the buffer log item and
+ * put it into the buffer so recovery knows what the last modification was
+ * that made it to disk.
+ */
+void
+xfs_btree_sblock_calc_crc(
+       struct xfs_buf          *bp)
+{
+       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+       if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+               return;
+       if (bip)
+               block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+       xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_sblock_verify_crc(
+       struct xfs_buf          *bp)
+{
+       if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+               return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+
+       return true;
+}
+
+/*
+ * Delete the btree cursor.
+ */
+void
+xfs_btree_del_cursor(
+       xfs_btree_cur_t *cur,           /* btree cursor */
+       int             error)          /* del because of error */
+{
+       int             i;              /* btree level */
+
+       /*
+        * Clear the buffer pointers, and release the buffers.
+        * If we're doing this in the face of an error, we
+        * need to make sure to inspect all of the entries
+        * in the bc_bufs array for buffers to be unlocked.
+        * This is because some of the btree code works from
+        * level n down to 0, and if we get an error along
+        * the way we won't have initialized all the entries
+        * down to 0.
+        */
+       for (i = 0; i < cur->bc_nlevels; i++) {
+               if (cur->bc_bufs[i])
+                       xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
+               else if (!error)
+                       break;
+       }
+       /*
+        * Can't free a bmap cursor without having dealt with the
+        * allocated indirect blocks' accounting.
+        */
+       ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
+              cur->bc_private.b.allocated == 0);
+       /*
+        * Free the cursor.
+        */
+       kmem_zone_free(xfs_btree_cur_zone, cur);
+}
+
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int                                    /* error */
+xfs_btree_dup_cursor(
+       xfs_btree_cur_t *cur,           /* input cursor */
+       xfs_btree_cur_t **ncur)         /* output cursor */
+{
+       xfs_buf_t       *bp;            /* btree block's buffer pointer */
+       int             error;          /* error return value */
+       int             i;              /* level number of btree block */
+       xfs_mount_t     *mp;            /* mount structure for filesystem */
+       xfs_btree_cur_t *new;           /* new cursor value */
+       xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
+
+       tp = cur->bc_tp;
+       mp = cur->bc_mp;
+
+       /*
+        * Allocate a new cursor like the old one.
+        */
+       new = cur->bc_ops->dup_cursor(cur);
+
+       /*
+        * Copy the record currently in the cursor.
+        */
+       new->bc_rec = cur->bc_rec;
+
+       /*
+        * For each level of the current cursor, re-get the buffer and
+        * copy the ptr value.
+        */
+       for (i = 0; i < new->bc_nlevels; i++) {
+               new->bc_ptrs[i] = cur->bc_ptrs[i];
+               new->bc_ra[i] = cur->bc_ra[i];
+               bp = cur->bc_bufs[i];
+               if (bp) {
+                       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+                                                  XFS_BUF_ADDR(bp), mp->m_bsize,
+                                                  0, &bp,
+                                                  cur->bc_ops->buf_ops);
+                       if (error) {
+                               xfs_btree_del_cursor(new, error);
+                               *ncur = NULL;
+                               return error;
+                       }
+               }
+               new->bc_bufs[i] = bp;
+       }
+       *ncur = new;
+       return 0;
+}
+
+/*
+ * XFS btree block layout and addressing:
+ *
+ * There are two types of blocks in the btree: leaf and non-leaf blocks.
+ *
+ * A leaf block starts with a header, followed by records containing
+ * the values.  A non-leaf block also starts with the same header, and
+ * then first contains lookup keys followed by an equal number of pointers
+ * to the btree blocks at the previous level.
+ *
+ *             +--------+-------+-------+-------+-------+-------+-------+
+ * Leaf:       | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
+ *             +--------+-------+-------+-------+-------+-------+-------+
+ *
+ *             +--------+-------+-------+-------+-------+-------+-------+
+ * Non-Leaf:   | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
+ *             +--------+-------+-------+-------+-------+-------+-------+
+ *
+ * The header is called struct xfs_btree_block for reasons better left unknown
+ * and comes in different versions for short (32bit) and long (64bit) block
+ * pointers.  The record and key structures are defined by the btree instances
+ * and opaque to the btree core.  The block pointers are simple disk endian
+ * integers, available in a short (32bit) and long (64bit) variant.
+ *
+ * The helpers below calculate the offset of a given record, key or pointer
+ * into a btree block (xfs_btree_*_offset) or return a pointer to the given
+ * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
+ * inside the btree block is done using indices starting at one, not zero!
+ */
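+
+/*
+ * For example, in a long-form block with CRCs enabled the first record
+ * starts at byte offset XFS_BTREE_LBLOCK_CRC_LEN, and record n (1-based)
+ * at XFS_BTREE_LBLOCK_CRC_LEN + (n - 1) * rec_len, which is exactly what
+ * xfs_btree_rec_offset() below computes.
+ */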
+
+/*
+ * Return size of the btree block header for this btree instance.
+ */
+static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+                       return XFS_BTREE_LBLOCK_CRC_LEN;
+               return XFS_BTREE_LBLOCK_LEN;
+       }
+       if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+               return XFS_BTREE_SBLOCK_CRC_LEN;
+       return XFS_BTREE_SBLOCK_LEN;
+}
+
+/*
+ * Return size of btree block pointers for this btree instance.
+ */
+static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
+{
+       return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+               sizeof(__be64) : sizeof(__be32);
+}
+
+/*
+ * Calculate offset of the n-th record in a btree block.
+ */
+STATIC size_t
+xfs_btree_rec_offset(
+       struct xfs_btree_cur    *cur,
+       int                     n)
+{
+       return xfs_btree_block_len(cur) +
+               (n - 1) * cur->bc_ops->rec_len;
+}
+
+/*
+ * Calculate offset of the n-th key in a btree block.
+ */
+STATIC size_t
+xfs_btree_key_offset(
+       struct xfs_btree_cur    *cur,
+       int                     n)
+{
+       return xfs_btree_block_len(cur) +
+               (n - 1) * cur->bc_ops->key_len;
+}
+
+/*
+ * Calculate offset of the n-th block pointer in a btree block.
+ */
+STATIC size_t
+xfs_btree_ptr_offset(
+       struct xfs_btree_cur    *cur,
+       int                     n,
+       int                     level)
+{
+       return xfs_btree_block_len(cur) +
+               cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
+               (n - 1) * xfs_btree_ptr_len(cur);
+}
+
+/*
+ * Return a pointer to the n-th record in the btree block.
+ */
+STATIC union xfs_btree_rec *
+xfs_btree_rec_addr(
+       struct xfs_btree_cur    *cur,
+       int                     n,
+       struct xfs_btree_block  *block)
+{
+       return (union xfs_btree_rec *)
+               ((char *)block + xfs_btree_rec_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th key in the btree block.
+ */
+STATIC union xfs_btree_key *
+xfs_btree_key_addr(
+       struct xfs_btree_cur    *cur,
+       int                     n,
+       struct xfs_btree_block  *block)
+{
+       return (union xfs_btree_key *)
+               ((char *)block + xfs_btree_key_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th block pointer in the btree block.
+ */
+STATIC union xfs_btree_ptr *
+xfs_btree_ptr_addr(
+       struct xfs_btree_cur    *cur,
+       int                     n,
+       struct xfs_btree_block  *block)
+{
+       int                     level = xfs_btree_get_level(block);
+
+       ASSERT(block->bb_level != 0);
+
+       return (union xfs_btree_ptr *)
+               ((char *)block + xfs_btree_ptr_offset(cur, n, level));
+}
+
+/*
+ * Get the root block which is stored in the inode.
+ *
+ * For now this btree implementation assumes the btree root is always
+ * stored in the if_broot field of an inode fork.
+ */
+STATIC struct xfs_btree_block *
+xfs_btree_get_iroot(
+       struct xfs_btree_cur    *cur)
+{
+       struct xfs_ifork        *ifp;
+
+       ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+       return (struct xfs_btree_block *)ifp->if_broot;
+}
+
+/*
+ * Retrieve the block pointer from the cursor at the given level.
+ * This may be an inode btree root or from a buffer.
+ */
+STATIC struct xfs_btree_block *                /* generic btree block pointer */
+xfs_btree_get_block(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       int                     level,  /* level in btree */
+       struct xfs_buf          **bpp)  /* buffer containing the block */
+{
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           (level == cur->bc_nlevels - 1)) {
+               *bpp = NULL;
+               return xfs_btree_get_iroot(cur);
+       }
+
+       *bpp = cur->bc_bufs[level];
+       return XFS_BUF_TO_BLOCK(*bpp);
+}
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Long-form addressing.
+ */
+xfs_buf_t *                            /* buffer for fsbno */
+xfs_btree_get_bufl(
+       xfs_mount_t     *mp,            /* file system mount point */
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_fsblock_t   fsbno,          /* file system block number */
+       uint            lock)           /* lock flags for get_buf */
+{
+       xfs_daddr_t             d;              /* real disk block address */
+
+       ASSERT(fsbno != NULLFSBLOCK);
+       d = XFS_FSB_TO_DADDR(mp, fsbno);
+       return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+}
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Short-form addressing.
+ */
+xfs_buf_t *                            /* buffer for agno/agbno */
+xfs_btree_get_bufs(
+       xfs_mount_t     *mp,            /* file system mount point */
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_agnumber_t  agno,           /* allocation group number */
+       xfs_agblock_t   agbno,          /* allocation group block number */
+       uint            lock)           /* lock flags for get_buf */
+{
+       xfs_daddr_t             d;              /* real disk block address */
+
+       ASSERT(agno != NULLAGNUMBER);
+       ASSERT(agbno != NULLAGBLOCK);
+       d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+       return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+}
+
+/*
+ * Check for the cursor referring to the last block at the given level.
+ */
+int                                    /* 1=is last block, 0=not last block */
+xfs_btree_islastblock(
+       xfs_btree_cur_t         *cur,   /* btree cursor */
+       int                     level)  /* level to check */
+{
+       struct xfs_btree_block  *block; /* generic btree block pointer */
+       xfs_buf_t               *bp;    /* buffer containing block */
+
+       block = xfs_btree_get_block(cur, level, &bp);
+       xfs_btree_check_block(cur, block, level, bp);
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
+       else
+               return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
+}
+
+/*
+ * Change the cursor to point to the first record at the given level.
+ * Other levels are unaffected.
+ */
+STATIC int                             /* success=1, failure=0 */
+xfs_btree_firstrec(
+       xfs_btree_cur_t         *cur,   /* btree cursor */
+       int                     level)  /* level to change */
+{
+       struct xfs_btree_block  *block; /* generic btree block pointer */
+       xfs_buf_t               *bp;    /* buffer containing block */
+
+       /*
+        * Get the block pointer for this level.
+        */
+       block = xfs_btree_get_block(cur, level, &bp);
+       xfs_btree_check_block(cur, block, level, bp);
+       /*
+        * It's empty; there is no such record.
+        */
+       if (!block->bb_numrecs)
+               return 0;
+       /*
+        * Set the ptr value to 1, which is the first record/key.
+        */
+       cur->bc_ptrs[level] = 1;
+       return 1;
+}
+
+/*
+ * Change the cursor to point to the last record in the current block
+ * at the given level.  Other levels are unaffected.
+ */
+STATIC int                             /* success=1, failure=0 */
+xfs_btree_lastrec(
+       xfs_btree_cur_t         *cur,   /* btree cursor */
+       int                     level)  /* level to change */
+{
+       struct xfs_btree_block  *block; /* generic btree block pointer */
+       xfs_buf_t               *bp;    /* buffer containing block */
+
+       /*
+        * Get the block pointer for this level.
+        */
+       block = xfs_btree_get_block(cur, level, &bp);
+       xfs_btree_check_block(cur, block, level, bp);
+       /*
+        * It's empty; there is no such record.
+        */
+       if (!block->bb_numrecs)
+               return 0;
+       /*
+        * Set the ptr value to numrecs, which is the last record/key.
+        */
+       cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
+       return 1;
+}
+
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
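+ * The table must have nbits + 1 entries; the extra final entry holds
+ * the total structure length so that offsets[i + 1] - 1 yields the
+ * last byte of the highest field present.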
+ */
+void
+xfs_btree_offsets(
+       __int64_t       fields,         /* bitmask of fields */
+       const short     *offsets,       /* table of field offsets */
+       int             nbits,          /* number of bits to inspect */
+       int             *first,         /* output: first byte offset */
+       int             *last)          /* output: last byte offset */
+{
+       int             i;              /* current bit number */
+       __int64_t       imask;          /* mask for current bit number */
+
+       ASSERT(fields != 0);
+       /*
+        * Find the lowest bit, so the first byte offset.
+        */
+       for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
+               if (imask & fields) {
+                       *first = offsets[i];
+                       break;
+               }
+       }
+       /*
+        * Find the highest bit, so the last byte offset.
+        */
+       for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
+               if (imask & fields) {
+                       *last = offsets[i + 1] - 1;
+                       break;
+               }
+       }
+}
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int
+xfs_btree_read_bufl(
+       struct xfs_mount        *mp,            /* file system mount point */
+       struct xfs_trans        *tp,            /* transaction pointer */
+       xfs_fsblock_t           fsbno,          /* file system block number */
+       uint                    lock,           /* lock flags for read_buf */
+       struct xfs_buf          **bpp,          /* buffer for fsbno */
+       int                     refval,         /* ref count value for buffer */
+       const struct xfs_buf_ops *ops)
+{
+       struct xfs_buf          *bp;            /* return value */
+       xfs_daddr_t             d;              /* real disk block address */
+       int                     error;
+
+       ASSERT(fsbno != NULLFSBLOCK);
+       d = XFS_FSB_TO_DADDR(mp, fsbno);
+       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+                                  mp->m_bsize, lock, &bp, ops);
+       if (error)
+               return error;
+       if (bp)
+               xfs_buf_set_ref(bp, refval);
+       *bpp = bp;
+       return 0;
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufl(
+       struct xfs_mount        *mp,            /* file system mount point */
+       xfs_fsblock_t           fsbno,          /* file system block number */
+       xfs_extlen_t            count,          /* count of filesystem blocks */
+       const struct xfs_buf_ops *ops)
+{
+       xfs_daddr_t             d;
+
+       ASSERT(fsbno != NULLFSBLOCK);
+       d = XFS_FSB_TO_DADDR(mp, fsbno);
+       xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufs(
+       struct xfs_mount        *mp,            /* file system mount point */
+       xfs_agnumber_t          agno,           /* allocation group number */
+       xfs_agblock_t           agbno,          /* allocation group block number */
+       xfs_extlen_t            count,          /* count of filesystem blocks */
+       const struct xfs_buf_ops *ops)
+{
+       xfs_daddr_t             d;
+
+       ASSERT(agno != NULLAGNUMBER);
+       ASSERT(agbno != NULLAGBLOCK);
+       d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+       xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
+}
+
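+/*
+ * Issue readahead for the left and right siblings of a long-form btree
+ * block, as directed by the lr flags.  Returns the number of readaheads
+ * issued.
+ */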
+STATIC int
+xfs_btree_readahead_lblock(
+       struct xfs_btree_cur    *cur,
+       int                     lr,
+       struct xfs_btree_block  *block)
+{
+       int                     rval = 0;
+       xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+       xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+       if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
+               xfs_btree_reada_bufl(cur->bc_mp, left, 1,
+                                    cur->bc_ops->buf_ops);
+               rval++;
+       }
+
+       if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
+               xfs_btree_reada_bufl(cur->bc_mp, right, 1,
+                                    cur->bc_ops->buf_ops);
+               rval++;
+       }
+
+       return rval;
+}
+
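+/*
+ * Issue readahead for the left and right siblings of a short-form btree
+ * block within the cursor's allocation group, as directed by the lr flags.
+ */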
+STATIC int
+xfs_btree_readahead_sblock(
+       struct xfs_btree_cur    *cur,
+       int                     lr,
+       struct xfs_btree_block *block)
+{
+       int                     rval = 0;
+       xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
+       xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
+
+       if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
+               xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+                                    left, 1, cur->bc_ops->buf_ops);
+               rval++;
+       }
+
+       if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
+               xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+                                    right, 1, cur->bc_ops->buf_ops);
+               rval++;
+       }
+
+       return rval;
+}
+
+/*
+ * Read-ahead btree blocks, at the given level.
+ * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ */
+STATIC int
+xfs_btree_readahead(
+       struct xfs_btree_cur    *cur,           /* btree cursor */
+       int                     lev,            /* level in btree */
+       int                     lr)             /* left/right bits */
+{
+       struct xfs_btree_block  *block;
+
+       /*
+        * No readahead needed if we are at the root level and the
+        * btree root is stored in the inode.
+        */
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           (lev == cur->bc_nlevels - 1))
+               return 0;
+
+       if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+               return 0;
+
+       cur->bc_ra[lev] |= lr;
+       block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
+
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               return xfs_btree_readahead_lblock(cur, lr, block);
+       return xfs_btree_readahead_sblock(cur, lr, block);
+}
+
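+/*
+ * Convert a long- or short-form btree block pointer to the corresponding
+ * disk address.
+ */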
+STATIC xfs_daddr_t
+xfs_btree_ptr_to_daddr(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
+
+               return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+       } else {
+               ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
+               ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
+
+               return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+                                       be32_to_cpu(ptr->s));
+       }
+}
+
+/*
+ * Readahead @count btree blocks at the given @ptr location.
+ *
+ * We don't need to care about long or short form btrees here as we have a
+ * method of converting the ptr directly to a daddr available to us.
+ */
+STATIC void
+xfs_btree_readahead_ptr(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       xfs_extlen_t            count)
+{
+       xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
+                         xfs_btree_ptr_to_daddr(cur, ptr),
+                         cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+}
+
+/*
+ * Set the buffer for level "lev" in the cursor to bp, releasing
+ * any previous buffer.
+ */
+STATIC void
+xfs_btree_setbuf(
+       xfs_btree_cur_t         *cur,   /* btree cursor */
+       int                     lev,    /* level in btree */
+       xfs_buf_t               *bp)    /* new buffer to set */
+{
+       struct xfs_btree_block  *b;     /* btree block */
+
+       if (cur->bc_bufs[lev])
+               xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
+       cur->bc_bufs[lev] = bp;
+       cur->bc_ra[lev] = 0;
+
+       b = XFS_BUF_TO_BLOCK(bp);
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
+                       cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+               if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
+                       cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+       } else {
+               if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
+                       cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+               if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
+                       cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+       }
+}
+
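+/*
+ * Check whether a block pointer is the null sentinel value
+ * (NULLFSBLOCK or NULLAGBLOCK, depending on the pointer size).
+ */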
+STATIC int
+xfs_btree_ptr_is_null(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               return ptr->l == cpu_to_be64(NULLFSBLOCK);
+       else
+               return ptr->s == cpu_to_be32(NULLAGBLOCK);
+}
+
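+/*
+ * Set a block pointer to the null sentinel value.
+ */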
+STATIC void
+xfs_btree_set_ptr_null(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               ptr->l = cpu_to_be64(NULLFSBLOCK);
+       else
+               ptr->s = cpu_to_be32(NULLAGBLOCK);
+}
+
+/*
+ * Get/set/init sibling pointers
+ */
+STATIC void
+xfs_btree_get_sibling(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       union xfs_btree_ptr     *ptr,
+       int                     lr)
+{
+       ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               if (lr == XFS_BB_RIGHTSIB)
+                       ptr->l = block->bb_u.l.bb_rightsib;
+               else
+                       ptr->l = block->bb_u.l.bb_leftsib;
+       } else {
+               if (lr == XFS_BB_RIGHTSIB)
+                       ptr->s = block->bb_u.s.bb_rightsib;
+               else
+                       ptr->s = block->bb_u.s.bb_leftsib;
+       }
+}
+
+STATIC void
+xfs_btree_set_sibling(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       union xfs_btree_ptr     *ptr,
+       int                     lr)
+{
+       ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               if (lr == XFS_BB_RIGHTSIB)
+                       block->bb_u.l.bb_rightsib = ptr->l;
+               else
+                       block->bb_u.l.bb_leftsib = ptr->l;
+       } else {
+               if (lr == XFS_BB_RIGHTSIB)
+                       block->bb_u.s.bb_rightsib = ptr->s;
+               else
+                       block->bb_u.s.bb_leftsib = ptr->s;
+       }
+}
+
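+/*
+ * Initialise a new btree block header: set the magic number, level and
+ * record count, null out the sibling pointers, and fill in the extra
+ * metadata fields (blkno, owner, uuid, lsn) when CRCs are enabled.
+ */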
+void
+xfs_btree_init_block_int(
+       struct xfs_mount        *mp,
+       struct xfs_btree_block  *buf,
+       xfs_daddr_t             blkno,
+       __u32                   magic,
+       __u16                   level,
+       __u16                   numrecs,
+       __u64                   owner,
+       unsigned int            flags)
+{
+       buf->bb_magic = cpu_to_be32(magic);
+       buf->bb_level = cpu_to_be16(level);
+       buf->bb_numrecs = cpu_to_be16(numrecs);
+
+       if (flags & XFS_BTREE_LONG_PTRS) {
+               buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
+               buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
+               if (flags & XFS_BTREE_CRC_BLOCKS) {
+                       buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
+                       buf->bb_u.l.bb_owner = cpu_to_be64(owner);
+                       uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
+                       buf->bb_u.l.bb_pad = 0;
+                       buf->bb_u.l.bb_lsn = 0;
+               }
+       } else {
+               /* owner is a 32 bit value on short blocks */
+               __u32 __owner = (__u32)owner;
+
+               buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+               buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+               if (flags & XFS_BTREE_CRC_BLOCKS) {
+                       buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
+                       buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
+                       uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
+                       buf->bb_u.s.bb_lsn = 0;
+               }
+       }
+}
+
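+/*
+ * Initialise the btree block header in the given buffer, deriving the
+ * disk address from the buffer itself.
+ */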
+void
+xfs_btree_init_block(
+       struct xfs_mount *mp,
+       struct xfs_buf  *bp,
+       __u32           magic,
+       __u16           level,
+       __u16           numrecs,
+       __u64           owner,
+       unsigned int    flags)
+{
+       xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+                                magic, level, numrecs, owner, flags);
+}
+
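+/*
+ * Initialise a btree block for this cursor, taking the magic number and
+ * owner from the cursor itself.
+ */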
+STATIC void
+xfs_btree_init_block_cur(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp,
+       int                     level,
+       int                     numrecs)
+{
+       __u64 owner;
+
+       /*
+        * We can pull the owner from the cursor right now as the different
+        * owners align directly with the pointer size of the btree.  This may
+        * change in the future, but is safe for current users of the generic
+        * btree code.
+        */
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               owner = cur->bc_private.b.ip->i_ino;
+       else
+               owner = cur->bc_private.a.agno;
+
+       xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+                                xfs_btree_magic(cur), level, numrecs,
+                                owner, cur->bc_flags);
+}
+
+/*
+ * Return true if ptr is the last record in the btree and
+ * we need to track updates to this record.  The decision
+ * will be further refined in the update_lastrec method.
+ */
+STATIC int
+xfs_btree_is_lastrec(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       int                     level)
+{
+       union xfs_btree_ptr     ptr;
+
+       if (level > 0)
+               return 0;
+       if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
+               return 0;
+
+       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+       if (!xfs_btree_ptr_is_null(cur, &ptr))
+               return 0;
+       return 1;
+}
+
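+/*
+ * Convert a buffer's disk address into a btree block pointer of the
+ * appropriate form for this cursor.
+ */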
+STATIC void
+xfs_btree_buf_to_ptr(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp,
+       union xfs_btree_ptr     *ptr)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+                                       XFS_BUF_ADDR(bp)));
+       else {
+               ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
+                                       XFS_BUF_ADDR(bp)));
+       }
+}
+
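+/*
+ * Set the buffer's LRU reference count according to the btree type, so
+ * that more heavily referenced btree blocks stay cached longer.
+ */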
+STATIC void
+xfs_btree_set_refs(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp)
+{
+       switch (cur->bc_btnum) {
+       case XFS_BTNUM_BNO:
+       case XFS_BTNUM_CNT:
+               xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
+               break;
+       case XFS_BTNUM_INO:
+       case XFS_BTNUM_FINO:
+               xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
+               break;
+       case XFS_BTNUM_BMAP:
+               xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
+               break;
+       default:
+               ASSERT(0);
+       }
+}
+
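+/*
+ * Get a buffer for the block at the given ptr without reading it in from
+ * disk, and return the block pointer within the buffer.
+ */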
+STATIC int
+xfs_btree_get_buf_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       int                     flags,
+       struct xfs_btree_block  **block,
+       struct xfs_buf          **bpp)
+{
+       struct xfs_mount        *mp = cur->bc_mp;
+       xfs_daddr_t             d;
+
+       /* need to sort out how callers deal with failures first */
+       ASSERT(!(flags & XBF_TRYLOCK));
+
+       d = xfs_btree_ptr_to_daddr(cur, ptr);
+       *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
+                                mp->m_bsize, flags);
+
+       if (!*bpp)
+               return -ENOMEM;
+
+       (*bpp)->b_ops = cur->bc_ops->buf_ops;
+       *block = XFS_BUF_TO_BLOCK(*bpp);
+       return 0;
+}
+
+/*
+ * Read in the buffer at the given ptr and return the buffer and
+ * the block pointer within the buffer.
+ */
+STATIC int
+xfs_btree_read_buf_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       int                     flags,
+       struct xfs_btree_block  **block,
+       struct xfs_buf          **bpp)
+{
+       struct xfs_mount        *mp = cur->bc_mp;
+       xfs_daddr_t             d;
+       int                     error;
+
+       /* need to sort out how callers deal with failures first */
+       ASSERT(!(flags & XBF_TRYLOCK));
+
+       d = xfs_btree_ptr_to_daddr(cur, ptr);
+       error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
+                                  mp->m_bsize, flags, bpp,
+                                  cur->bc_ops->buf_ops);
+       if (error)
+               return error;
+
+       xfs_btree_set_refs(cur, *bpp);
+       *block = XFS_BUF_TO_BLOCK(*bpp);
+       return 0;
+}
+
+/*
+ * Copy keys from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_keys(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *dst_key,
+       union xfs_btree_key     *src_key,
+       int                     numkeys)
+{
+       ASSERT(numkeys >= 0);
+       memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Copy records from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_recs(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *dst_rec,
+       union xfs_btree_rec     *src_rec,
+       int                     numrecs)
+{
+       ASSERT(numrecs >= 0);
+       memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Copy block pointers from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_ptrs(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *dst_ptr,
+       union xfs_btree_ptr     *src_ptr,
+       int                     numptrs)
+{
+       ASSERT(numptrs >= 0);
+       memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Shift keys one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_keys(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key,
+       int                     dir,
+       int                     numkeys)
+{
+       char                    *dst_key;
+
+       ASSERT(numkeys >= 0);
+       ASSERT(dir == 1 || dir == -1);
+
+       dst_key = (char *)key + (dir * cur->bc_ops->key_len);
+       memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Shift records one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_recs(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec,
+       int                     dir,
+       int                     numrecs)
+{
+       char                    *dst_rec;
+
+       ASSERT(numrecs >= 0);
+       ASSERT(dir == 1 || dir == -1);
+
+       dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
+       memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Shift block pointers one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_ptrs(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       int                     dir,
+       int                     numptrs)
+{
+       char                    *dst_ptr;
+
+       ASSERT(numptrs >= 0);
+       ASSERT(dir == 1 || dir == -1);
+
+       dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
+       memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Log key values from the btree block.
+ */
+STATIC void
+xfs_btree_log_keys(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp,
+       int                     first,
+       int                     last)
+{
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+       if (bp) {
+               xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+               xfs_trans_log_buf(cur->bc_tp, bp,
+                                 xfs_btree_key_offset(cur, first),
+                                 xfs_btree_key_offset(cur, last + 1) - 1);
+       } else {
+               xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+                               xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log record values from the btree block.
+ */
+void
+xfs_btree_log_recs(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp,
+       int                     first,
+       int                     last)
+{
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+       xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+       xfs_trans_log_buf(cur->bc_tp, bp,
+                         xfs_btree_rec_offset(cur, first),
+                         xfs_btree_rec_offset(cur, last + 1) - 1);
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log block pointer fields from a btree block (nonleaf).
+ */
+STATIC void
+xfs_btree_log_ptrs(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_buf          *bp,    /* buffer containing btree block */
+       int                     first,  /* index of first pointer to log */
+       int                     last)   /* index of last pointer to log */
+{
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+       if (bp) {
+               struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+               int                     level = xfs_btree_get_level(block);
+
+               xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+               xfs_trans_log_buf(cur->bc_tp, bp,
+                               xfs_btree_ptr_offset(cur, first, level),
+                               xfs_btree_ptr_offset(cur, last + 1, level) - 1);
+       } else {
+               xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+                       xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log fields from a btree block header.
+ */
+void
+xfs_btree_log_block(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_buf          *bp,    /* buffer containing btree block */
+       int                     fields) /* mask of fields: XFS_BB_... */
+{
+       int                     first;  /* first byte offset logged */
+       int                     last;   /* last byte offset logged */
+       static const short      soffsets[] = {  /* table of offsets (short) */
+               offsetof(struct xfs_btree_block, bb_magic),
+               offsetof(struct xfs_btree_block, bb_level),
+               offsetof(struct xfs_btree_block, bb_numrecs),
+               offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
+               offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
+               offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
+               offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
+               offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
+               offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
+               offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
+               XFS_BTREE_SBLOCK_CRC_LEN
+       };
+       static const short      loffsets[] = {  /* table of offsets (long) */
+               offsetof(struct xfs_btree_block, bb_magic),
+               offsetof(struct xfs_btree_block, bb_level),
+               offsetof(struct xfs_btree_block, bb_numrecs),
+               offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
+               offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
+               offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
+               offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
+               offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
+               offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
+               offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
+               offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
+               XFS_BTREE_LBLOCK_CRC_LEN
+       };
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
+
+       if (bp) {
+               int nbits;
+
+               if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+                       /*
+                        * We don't log the CRC when updating a btree
+                        * block but instead recreate it during log
+                        * recovery.  As the log buffers have checksums
+                        * of their own this is safe and avoids logging a CRC
+                        * update in a lot of places.
+                        */
+                       if (fields == XFS_BB_ALL_BITS)
+                               fields = XFS_BB_ALL_BITS_CRC;
+                       nbits = XFS_BB_NUM_BITS_CRC;
+               } else {
+                       nbits = XFS_BB_NUM_BITS;
+               }
+               xfs_btree_offsets(fields,
+                                 (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+                                       loffsets : soffsets,
+                                 nbits, &first, &last);
+               xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+               xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+       } else {
+               xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+                       xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int                                            /* error */
+xfs_btree_increment(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     *stat)          /* success/failure */
+{
+       struct xfs_btree_block  *block;
+       union xfs_btree_ptr     ptr;
+       struct xfs_buf          *bp;
+       int                     error;          /* error return value */
+       int                     lev;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, level);
+
+       ASSERT(level < cur->bc_nlevels);
+
+       /* Read-ahead to the right at this level. */
+       xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+       /* Get a pointer to the btree block. */
+       block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               goto error0;
+#endif
+
+       /* We're done if we remain in the block after the increment. */
+       if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
+               goto out1;
+
+       /* Fail if we just went off the right edge of the tree. */
+       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+       if (xfs_btree_ptr_is_null(cur, &ptr))
+               goto out0;
+
+       XFS_BTREE_STATS_INC(cur, increment);
+
+       /*
+        * March up the tree incrementing pointers.
+        * Stop when we don't go off the right edge of a block.
+        */
+       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+               block = xfs_btree_get_block(cur, lev, &bp);
+
+#ifdef DEBUG
+               error = xfs_btree_check_block(cur, block, lev, bp);
+               if (error)
+                       goto error0;
+#endif
+
+               if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
+                       break;
+
+               /* Read-ahead the right block for the next loop. */
+               xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+       }
+
+       /*
+        * If we went off the root then we are either seriously
+        * confused or have the tree root in an inode.
+        */
+       if (lev == cur->bc_nlevels) {
+               if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+                       goto out0;
+               ASSERT(0);
+               error = -EFSCORRUPTED;
+               goto error0;
+       }
+       ASSERT(lev < cur->bc_nlevels);
+
+       /*
+        * Now walk back down the tree, fixing up the cursor's buffer
+        * pointers and key numbers.
+        */
+       for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+               union xfs_btree_ptr     *ptrp;
+
+               ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+               --lev;
+               error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+               if (error)
+                       goto error0;
+
+               xfs_btree_setbuf(cur, lev, bp);
+               cur->bc_ptrs[lev] = 1;
+       }
+out1:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int                                            /* error */
+xfs_btree_decrement(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     *stat)          /* success/failure */
+{
+       struct xfs_btree_block  *block;
+       xfs_buf_t               *bp;
+       int                     error;          /* error return value */
+       int                     lev;
+       union xfs_btree_ptr     ptr;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, level);
+
+       ASSERT(level < cur->bc_nlevels);
+
+       /* Read-ahead to the left at this level. */
+       xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+
+       /* We're done if we remain in the block after the decrement. */
+       if (--cur->bc_ptrs[level] > 0)
+               goto out1;
+
+       /* Get a pointer to the btree block. */
+       block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               goto error0;
+#endif
+
+       /* Fail if we just went off the left edge of the tree. */
+       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+       if (xfs_btree_ptr_is_null(cur, &ptr))
+               goto out0;
+
+       XFS_BTREE_STATS_INC(cur, decrement);
+
+       /*
+        * March up the tree decrementing pointers.
+        * Stop when we don't go off the left edge of a block.
+        */
+       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+               if (--cur->bc_ptrs[lev] > 0)
+                       break;
+               /* Read-ahead the left block for the next loop. */
+               xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+       }
+
+       /*
+        * If we went off the root then we are either seriously
+        * confused or have the tree root in an inode.
+        */
+       if (lev == cur->bc_nlevels) {
+               if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+                       goto out0;
+               ASSERT(0);
+               error = -EFSCORRUPTED;
+               goto error0;
+       }
+       ASSERT(lev < cur->bc_nlevels);
+
+       /*
+        * Now walk back down the tree, fixing up the cursor's buffer
+        * pointers and key numbers.
+        */
+       for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+               union xfs_btree_ptr     *ptrp;
+
+               ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+               --lev;
+               error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+               if (error)
+                       goto error0;
+               xfs_btree_setbuf(cur, lev, bp);
+               cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
+       }
+out1:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
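+/*
+ * Get the btree block for a lookup at the given level, special-casing a
+ * root block stored in the inode and re-using the buffer already held in
+ * the cursor when it matches the disk address we want.
+ */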
+STATIC int
+xfs_btree_lookup_get_block(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       int                     level,  /* level in the btree */
+       union xfs_btree_ptr     *pp,    /* ptr to btree block */
+       struct xfs_btree_block  **blkp) /* return btree block */
+{
+       struct xfs_buf          *bp;    /* buffer pointer for btree block */
+       int                     error = 0;
+
+       /* special case the root block if in an inode */
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           (level == cur->bc_nlevels - 1)) {
+               *blkp = xfs_btree_get_iroot(cur);
+               return 0;
+       }
+
+       /*
+        * If the old buffer at this level is for the disk address we are
+        * looking for, re-use it.
+        *
+        * Otherwise throw it away and get a new one.
+        */
+       bp = cur->bc_bufs[level];
+       if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
+               *blkp = XFS_BUF_TO_BLOCK(bp);
+               return 0;
+       }
+
+       error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
+       if (error)
+               return error;
+
+       xfs_btree_setbuf(cur, level, bp);
+       return 0;
+}
+
+/*
+ * Get current search key.  For level 0 we don't actually have a key
+ * structure so we make one up from the record.  For all other levels
+ * we just return the right key.
+ */
+STATIC union xfs_btree_key *
+xfs_lookup_get_search_key(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     keyno,
+       struct xfs_btree_block  *block,
+       union xfs_btree_key     *kp)
+{
+       if (level == 0) {
+               cur->bc_ops->init_key_from_rec(kp,
+                               xfs_btree_rec_addr(cur, keyno, block));
+               return kp;
+       }
+
+       return xfs_btree_key_addr(cur, keyno, block);
+}
+
+/*
+ * Lookup the record.  The cursor is made to point to it, based on dir.
+ * stat is set to 0 if we can't find any such record, 1 for success.
+ */
+int                                    /* error */
+xfs_btree_lookup(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_lookup_t            dir,    /* <=, ==, or >= */
+       int                     *stat)  /* success/failure */
+{
+       struct xfs_btree_block  *block; /* current btree block */
+       __int64_t               diff;   /* difference for the current key */
+       int                     error;  /* error return value */
+       int                     keyno;  /* current key number */
+       int                     level;  /* level in the btree */
+       union xfs_btree_ptr     *pp;    /* ptr to btree block */
+       union xfs_btree_ptr     ptr;    /* ptr to btree block */
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, dir);
+
+       XFS_BTREE_STATS_INC(cur, lookup);
+
+       block = NULL;
+       keyno = 0;
+
+       /* initialise start pointer from cursor */
+       cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+       pp = &ptr;
+
+       /*
+        * Iterate over each level in the btree, starting at the root.
+        * For each level above the leaves, find the key we need, based
+        * on the lookup record, then follow the corresponding block
+        * pointer down to the next level.
+        */
+       for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+               /* Get the block we need to do the lookup on. */
+               error = xfs_btree_lookup_get_block(cur, level, pp, &block);
+               if (error)
+                       goto error0;
+
+               if (diff == 0) {
+                       /*
+                        * If we already had a key match at a higher level, we
+                        * know we need to use the first entry in this block.
+                        */
+                       keyno = 1;
+               } else {
+                       /* Otherwise search this block. Do a binary search. */
+
+                       int     high;   /* high entry number */
+                       int     low;    /* low entry number */
+
+                       /* Set low and high entry numbers, 1-based. */
+                       low = 1;
+                       high = xfs_btree_get_numrecs(block);
+                       if (!high) {
+                               /* Block is empty, must be an empty leaf. */
+                               ASSERT(level == 0 && cur->bc_nlevels == 1);
+
+                               cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+                               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                               *stat = 0;
+                               return 0;
+                       }
+
+                       /* Binary search the block. */
+                       while (low <= high) {
+                               union xfs_btree_key     key;
+                               union xfs_btree_key     *kp;
+
+                               XFS_BTREE_STATS_INC(cur, compare);
+
+                               /* keyno is average of low and high. */
+                               keyno = (low + high) >> 1;
+
+                               /* Get current search key */
+                               kp = xfs_lookup_get_search_key(cur, level,
+                                               keyno, block, &key);
+
+                               /*
+                                * Compute difference to get next direction:
+                                *  - less than, move right
+                                *  - greater than, move left
+                                *  - equal, we're done
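+                                *
+                                * E.g. with block keys 10,20,30,40 and
+                                * a search key of 25: keyno 2 (key 20)
+                                * gives diff < 0, so low becomes 3;
+                                * keyno 3 (key 30) gives diff > 0, so
+                                * high becomes 2 and the loop ends
+                                * without an exact match.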
+                                */
+                               diff = cur->bc_ops->key_diff(cur, kp);
+                               if (diff < 0)
+                                       low = keyno + 1;
+                               else if (diff > 0)
+                                       high = keyno - 1;
+                               else
+                                       break;
+                       }
+               }
+
+               /*
+                * If there are more levels, set up for the next level
+                * by getting the block number and filling in the cursor.
+                */
+               if (level > 0) {
+                       /*
+                        * If we moved left, need the previous key number,
+                        * unless there isn't one.
+                        */
+                       if (diff > 0 && --keyno < 1)
+                               keyno = 1;
+                       pp = xfs_btree_ptr_addr(cur, keyno, block);
+
+#ifdef DEBUG
+                       error = xfs_btree_check_ptr(cur, pp, 0, level);
+                       if (error)
+                               goto error0;
+#endif
+                       cur->bc_ptrs[level] = keyno;
+               }
+       }
+
+       /* Done with the search. See if we need to adjust the results. */
+       if (dir != XFS_LOOKUP_LE && diff < 0) {
+               keyno++;
+               /*
+                * If a GE search went off the end of the block, but it's
+                * not the last block, we're in the wrong block.
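+                * Stepping the cursor forward moves it to the first record
+                * of the right sibling, which is the one a GE lookup wants.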
+                */
+               xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+               if (dir == XFS_LOOKUP_GE &&
+                   keyno > xfs_btree_get_numrecs(block) &&
+                   !xfs_btree_ptr_is_null(cur, &ptr)) {
+                       int     i;
+
+                       cur->bc_ptrs[0] = keyno;
+                       error = xfs_btree_increment(cur, 0, &i);
+                       if (error)
+                               goto error0;
+                       XFS_WANT_CORRUPTED_RETURN(i == 1);
+                       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                       *stat = 1;
+                       return 0;
+               }
+       } else if (dir == XFS_LOOKUP_LE && diff > 0)
+               keyno--;
+       cur->bc_ptrs[0] = keyno;
+
+       /* Return if we succeeded or not. */
+       if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
+               *stat = 0;
+       else if (dir != XFS_LOOKUP_EQ || diff == 0)
+               *stat = 1;
+       else
+               *stat = 0;
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Update keys at all levels from here to the root along the cursor's path.
+ */
+STATIC int
+xfs_btree_updkey(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *keyp,
+       int                     level)
+{
+       struct xfs_btree_block  *block;
+       struct xfs_buf          *bp;
+       union xfs_btree_key     *kp;
+       int                     ptr;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
+
+       ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
+
+       /*
+        * Go up the tree from this level toward the root.
+        * At each level, update the key value to the value input.
+        * Stop when we reach a level where the cursor isn't pointing
+        * at the first entry in the block.
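+        * Only the first key of a block is duplicated in its parent, so
+        * no higher level can be affected once we leave entry 1.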
+        */
+       for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+#ifdef DEBUG
+               int             error;
+#endif
+               block = xfs_btree_get_block(cur, level, &bp);
+#ifdef DEBUG
+               error = xfs_btree_check_block(cur, block, level, bp);
+               if (error) {
+                       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+                       return error;
+               }
+#endif
+               ptr = cur->bc_ptrs[level];
+               kp = xfs_btree_key_addr(cur, ptr, block);
+               xfs_btree_copy_keys(cur, kp, keyp, 1);
+               xfs_btree_log_keys(cur, bp, ptr, ptr);
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+}
+
+/*
+ * Update the record referred to by cur to the value in the
+ * given record. This either works (return 0) or gets an
+ * EFSCORRUPTED error.
+ */
+int
+xfs_btree_update(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec)
+{
+       struct xfs_btree_block  *block;
+       struct xfs_buf          *bp;
+       int                     error;
+       int                     ptr;
+       union xfs_btree_rec     *rp;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGR(cur, rec);
+
+       /* Pick up the current block. */
+       block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, 0, bp);
+       if (error)
+               goto error0;
+#endif
+       /* Get the address of the rec to be updated. */
+       ptr = cur->bc_ptrs[0];
+       rp = xfs_btree_rec_addr(cur, ptr, block);
+
+       /* Fill in the new contents and log them. */
+       xfs_btree_copy_recs(cur, rp, rec, 1);
+       xfs_btree_log_recs(cur, bp, ptr, ptr);
+
+       /*
+        * If we are tracking the last record in the tree and
+        * we are at the far right edge of the tree, update it.
+        */
+       if (xfs_btree_is_lastrec(cur, block, 0)) {
+               cur->bc_ops->update_lastrec(cur, block, rec,
+                                           ptr, LASTREC_UPDATE);
+       }
+
+       /* Updating first rec in leaf. Pass new key value up to our parent. */
+       if (ptr == 1) {
+               union xfs_btree_key     key;
+
+               cur->bc_ops->init_key_from_rec(&key, rec);
+               error = xfs_btree_updkey(cur, &key, 1);
+               if (error)
+                       goto error0;
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int                                     /* error */
+xfs_btree_lshift(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     *stat)          /* success/failure */
+{
+       union xfs_btree_key     key;            /* btree key */
+       struct xfs_buf          *lbp;           /* left buffer pointer */
+       struct xfs_btree_block  *left;          /* left btree block */
+       int                     lrecs;          /* left record count */
+       struct xfs_buf          *rbp;           /* right buffer pointer */
+       struct xfs_btree_block  *right;         /* right btree block */
+       int                     rrecs;          /* right record count */
+       union xfs_btree_ptr     lptr;           /* left btree pointer */
+       union xfs_btree_key     *rkp = NULL;    /* right btree key */
+       union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
+       union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
+       int                     error;          /* error return value */
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, level);
+
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           level == cur->bc_nlevels - 1)
+               goto out0;
+
+       /* Set up variables for this block as "right". */
+       right = xfs_btree_get_block(cur, level, &rbp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, right, level, rbp);
+       if (error)
+               goto error0;
+#endif
+
+       /* If we've got no left sibling then we can't shift an entry left. */
+       xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+       if (xfs_btree_ptr_is_null(cur, &lptr))
+               goto out0;
+
+       /*
+        * If the cursor entry is the one that would be moved, don't
+        * do it... it's too complicated.
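+        * (A cursor index of 1 points at the very entry that would move;
+        * 0 means we're off the left edge of the block.)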
+        */
+       if (cur->bc_ptrs[level] <= 1)
+               goto out0;
+
+       /* Set up the left neighbor as "left". */
+       error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+       if (error)
+               goto error0;
+
+       /* If it's full, it can't take another entry. */
+       lrecs = xfs_btree_get_numrecs(left);
+       if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
+               goto out0;
+
+       rrecs = xfs_btree_get_numrecs(right);
+
+       /*
+        * We add one entry to the left side and remove one from the right side.
+        * Account for it here, the changes will be updated on disk and logged
+        * later.
+        */
+       lrecs++;
+       rrecs--;
+
+       XFS_BTREE_STATS_INC(cur, lshift);
+       XFS_BTREE_STATS_ADD(cur, moves, 1);
+
+       /*
+        * If non-leaf, copy a key and a ptr to the left block.
+        * Log the changes to the left block.
+        */
+       if (level > 0) {
+               /* It's a non-leaf.  Move keys and pointers. */
+               union xfs_btree_key     *lkp;   /* left btree key */
+               union xfs_btree_ptr     *lpp;   /* left address pointer */
+
+               lkp = xfs_btree_key_addr(cur, lrecs, left);
+               rkp = xfs_btree_key_addr(cur, 1, right);
+
+               lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+               rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+               error = xfs_btree_check_ptr(cur, rpp, 0, level);
+               if (error)
+                       goto error0;
+#endif
+               xfs_btree_copy_keys(cur, lkp, rkp, 1);
+               xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
+
+               xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
+               xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
+
+               ASSERT(cur->bc_ops->keys_inorder(cur,
+                       xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
+       } else {
+               /* It's a leaf.  Move records.  */
+               union xfs_btree_rec     *lrp;   /* left record pointer */
+
+               lrp = xfs_btree_rec_addr(cur, lrecs, left);
+               rrp = xfs_btree_rec_addr(cur, 1, right);
+
+               xfs_btree_copy_recs(cur, lrp, rrp, 1);
+               xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
+
+               ASSERT(cur->bc_ops->recs_inorder(cur,
+                       xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
+       }
+
+       xfs_btree_set_numrecs(left, lrecs);
+       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+       xfs_btree_set_numrecs(right, rrecs);
+       xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+       /*
+        * Slide the contents of right down one entry.
+        */
+       XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
+       if (level > 0) {
+               /* It's a non-leaf.  Operate on keys and ptrs. */
+#ifdef DEBUG
+               int                     i;              /* loop index */
+
+               for (i = 0; i < rrecs; i++) {
+                       error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
+                       if (error)
+                               goto error0;
+               }
+#endif
+               xfs_btree_shift_keys(cur,
+                               xfs_btree_key_addr(cur, 2, right),
+                               -1, rrecs);
+               xfs_btree_shift_ptrs(cur,
+                               xfs_btree_ptr_addr(cur, 2, right),
+                               -1, rrecs);
+
+               xfs_btree_log_keys(cur, rbp, 1, rrecs);
+               xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+       } else {
+               /* It's a leaf.  Operate on records. */
+               xfs_btree_shift_recs(cur,
+                       xfs_btree_rec_addr(cur, 2, right),
+                       -1, rrecs);
+               xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+               /*
+                * If it's the first record in the block, we'll need a key
+                * structure to pass up to the next level (updkey).
+                */
+               cur->bc_ops->init_key_from_rec(&key,
+                       xfs_btree_rec_addr(cur, 1, right));
+               rkp = &key;
+       }
+
+       /* Update the parent key values of right. */
+       error = xfs_btree_updkey(cur, rkp, level + 1);
+       if (error)
+               goto error0;
+
+       /* Slide the cursor value left one. */
+       cur->bc_ptrs[level]--;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int                                     /* error */
+xfs_btree_rshift(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     *stat)          /* success/failure */
+{
+       union xfs_btree_key     key;            /* btree key */
+       struct xfs_buf          *lbp;           /* left buffer pointer */
+       struct xfs_btree_block  *left;          /* left btree block */
+       struct xfs_buf          *rbp;           /* right buffer pointer */
+       struct xfs_btree_block  *right;         /* right btree block */
+       struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
+       union xfs_btree_ptr     rptr;           /* right block pointer */
+       union xfs_btree_key     *rkp;           /* right btree key */
+       int                     rrecs;          /* right record count */
+       int                     lrecs;          /* left record count */
+       int                     error;          /* error return value */
+       int                     i;              /* loop counter */
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, level);
+
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           (level == cur->bc_nlevels - 1))
+               goto out0;
+
+       /* Set up variables for this block as "left". */
+       left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, left, level, lbp);
+       if (error)
+               goto error0;
+#endif
+
+       /* If we've got no right sibling then we can't shift an entry right. */
+       xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+       if (xfs_btree_ptr_is_null(cur, &rptr))
+               goto out0;
+
+       /*
+        * If the cursor entry is the one that would be moved, don't
+        * do it... it's too complicated.
+        */
+       lrecs = xfs_btree_get_numrecs(left);
+       if (cur->bc_ptrs[level] >= lrecs)
+               goto out0;
+
+       /* Set up the right neighbor as "right". */
+       error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+       if (error)
+               goto error0;
+
+       /* If it's full, it can't take another entry. */
+       rrecs = xfs_btree_get_numrecs(right);
+       if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
+               goto out0;
+
+       XFS_BTREE_STATS_INC(cur, rshift);
+       XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+       /*
+        * Make a hole at the start of the right neighbor block, then
+        * copy the last left block entry to the hole.
+        */
+       if (level > 0) {
+               /* It's a non-leaf.  Make a hole in the keys and ptrs. */
+               union xfs_btree_key     *lkp;
+               union xfs_btree_ptr     *lpp;
+               union xfs_btree_ptr     *rpp;
+
+               lkp = xfs_btree_key_addr(cur, lrecs, left);
+               lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+               rkp = xfs_btree_key_addr(cur, 1, right);
+               rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+               for (i = rrecs - 1; i >= 0; i--) {
+                       error = xfs_btree_check_ptr(cur, rpp, i, level);
+                       if (error)
+                               goto error0;
+               }
+#endif
+
+               xfs_btree_shift_keys(cur, rkp, 1, rrecs);
+               xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
+
+#ifdef DEBUG
+               error = xfs_btree_check_ptr(cur, lpp, 0, level);
+               if (error)
+                       goto error0;
+#endif
+
+               /* Now put the new data in, and log it. */
+               xfs_btree_copy_keys(cur, rkp, lkp, 1);
+               xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
+
+               xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
+               xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
+
+               ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
+                       xfs_btree_key_addr(cur, 2, right)));
+       } else {
+               /* It's a leaf.  Make a hole in the records. */
+               union xfs_btree_rec     *lrp;
+               union xfs_btree_rec     *rrp;
+
+               lrp = xfs_btree_rec_addr(cur, lrecs, left);
+               rrp = xfs_btree_rec_addr(cur, 1, right);
+
+               xfs_btree_shift_recs(cur, rrp, 1, rrecs);
+
+               /* Now put the new data in, and log it. */
+               xfs_btree_copy_recs(cur, rrp, lrp, 1);
+               xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
+
+               cur->bc_ops->init_key_from_rec(&key, rrp);
+               rkp = &key;
+
+               ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
+                       xfs_btree_rec_addr(cur, 2, right)));
+       }
+
+       /*
+        * Decrement and log left's numrecs, bump and log right's numrecs.
+        */
+       xfs_btree_set_numrecs(left, --lrecs);
+       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+       xfs_btree_set_numrecs(right, ++rrecs);
+       xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+       /*
+        * Using a temporary cursor, update the parent key values of the
+        * block on the right.
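+        * The primary cursor still points into the left block, so
+        * duplicate it, step it across into the right sibling, and do
+        * the key update along that cursor's path.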
+        */
+       error = xfs_btree_dup_cursor(cur, &tcur);
+       if (error)
+               goto error0;
+       i = xfs_btree_lastrec(tcur, level);
+       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+       error = xfs_btree_increment(tcur, level, &i);
+       if (error)
+               goto error1;
+
+       error = xfs_btree_updkey(tcur, rkp, level + 1);
+       if (error)
+               goto error1;
+
+       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+
+error1:
+       XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
+       xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+       return error;
+}
+
+/*
+ * Split cur/level block in half.
+ * Return new block number and the key to its first
+ * record (to be inserted into parent).
+ */
+STATIC int                                     /* error */
+__xfs_btree_split(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       union xfs_btree_ptr     *ptrp,
+       union xfs_btree_key     *key,
+       struct xfs_btree_cur    **curp,
+       int                     *stat)          /* success/failure */
+{
+       union xfs_btree_ptr     lptr;           /* left sibling block ptr */
+       struct xfs_buf          *lbp;           /* left buffer pointer */
+       struct xfs_btree_block  *left;          /* left btree block */
+       union xfs_btree_ptr     rptr;           /* right sibling block ptr */
+       struct xfs_buf          *rbp;           /* right buffer pointer */
+       struct xfs_btree_block  *right;         /* right btree block */
+       union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
+       struct xfs_buf          *rrbp;          /* right-right buffer pointer */
+       struct xfs_btree_block  *rrblock;       /* right-right btree block */
+       int                     lrecs;
+       int                     rrecs;
+       int                     src_index;
+       int                     error;          /* error return value */
+#ifdef DEBUG
+       int                     i;
+#endif
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
+
+       XFS_BTREE_STATS_INC(cur, split);
+
+       /* Set up left block (current one). */
+       left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, left, level, lbp);
+       if (error)
+               goto error0;
+#endif
+
+       xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+
+       /* Allocate the new block. If we can't do it, we're toast. Give up. */
+       error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
+       if (error)
+               goto error0;
+       if (*stat == 0)
+               goto out0;
+       XFS_BTREE_STATS_INC(cur, alloc);
+
+       /* Set up the new block as "right". */
+       error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
+       if (error)
+               goto error0;
+
+       /* Fill in the btree header for the new right block. */
+       xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
+
+       /*
+        * Split the entries between the old and the new block evenly.
+        * Make sure that if there's an odd number of entries now, each
+        * new block will have the same number of entries once the
+        * pending insert is accounted for.
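+        * E.g. splitting 9 entries with the insertion point at entry 3
+        * sends 5 entries right (rrecs = 9 / 2 + 1) and keeps 4 on the
+        * left, so both blocks hold 5 once the new entry goes in.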
+        */
+       lrecs = xfs_btree_get_numrecs(left);
+       rrecs = lrecs / 2;
+       if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
+               rrecs++;
+       src_index = (lrecs - rrecs + 1);
+
+       XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+       /*
+        * Copy btree block entries from the left block over to the
+        * new block, the right. Update the right block and log the
+        * changes.
+        */
+       if (level > 0) {
+               /* It's a non-leaf.  Move keys and pointers. */
+               union xfs_btree_key     *lkp;   /* left btree key */
+               union xfs_btree_ptr     *lpp;   /* left address pointer */
+               union xfs_btree_key     *rkp;   /* right btree key */
+               union xfs_btree_ptr     *rpp;   /* right address pointer */
+
+               lkp = xfs_btree_key_addr(cur, src_index, left);
+               lpp = xfs_btree_ptr_addr(cur, src_index, left);
+               rkp = xfs_btree_key_addr(cur, 1, right);
+               rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+               for (i = src_index; i < rrecs; i++) {
+                       error = xfs_btree_check_ptr(cur, lpp, i, level);
+                       if (error)
+                               goto error0;
+               }
+#endif
+
+               xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
+               xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
+
+               xfs_btree_log_keys(cur, rbp, 1, rrecs);
+               xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+
+               /* Grab the keys to the entries moved to the right block */
+               xfs_btree_copy_keys(cur, key, rkp, 1);
+       } else {
+               /* It's a leaf.  Move records.  */
+               union xfs_btree_rec     *lrp;   /* left record pointer */
+               union xfs_btree_rec     *rrp;   /* right record pointer */
+
+               lrp = xfs_btree_rec_addr(cur, src_index, left);
+               rrp = xfs_btree_rec_addr(cur, 1, right);
+
+               xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
+               xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+               cur->bc_ops->init_key_from_rec(key,
+                       xfs_btree_rec_addr(cur, 1, right));
+       }
+
+       /*
+        * Find the left block number by looking in the buffer.
+        * Adjust numrecs, sibling pointers.
+        */
+       xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
+       xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
+       xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+       xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+
+       lrecs -= rrecs;
+       xfs_btree_set_numrecs(left, lrecs);
+       xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
+
+       xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
+       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+       /*
+        * If there's a block to the new block's right, make that block
+        * point back to right instead of to left.
+        */
+       if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
+               error = xfs_btree_read_buf_block(cur, &rrptr,
+                                                       0, &rrblock, &rrbp);
+               if (error)
+                       goto error0;
+               xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
+               xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+       }
+       /*
+        * If the cursor is really in the right block, move it there.
+        * If it's just pointing past the last entry in left, then we'll
+        * insert there, so don't change anything in that case.
+        */
+       if (cur->bc_ptrs[level] > lrecs + 1) {
+               xfs_btree_setbuf(cur, level, rbp);
+               cur->bc_ptrs[level] -= lrecs;
+       }
+       /*
+        * If there are more levels, we'll need another cursor which refers to
+        * the right block, no matter where this cursor was.
+        */
+       if (level + 1 < cur->bc_nlevels) {
+               error = xfs_btree_dup_cursor(cur, curp);
+               if (error)
+                       goto error0;
+               (*curp)->bc_ptrs[level + 1]++;
+       }
+       *ptrp = rptr;
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+struct xfs_btree_split_args {
+       struct xfs_btree_cur    *cur;
+       int                     level;
+       union xfs_btree_ptr     *ptrp;
+       union xfs_btree_key     *key;
+       struct xfs_btree_cur    **curp;
+       int                     *stat;          /* success/failure */
+       int                     result;
+       bool                    kswapd; /* allocation in kswapd context */
+       struct completion       *done;
+       struct work_struct      work;
+};
+
+/*
+ * Stack switching interfaces for allocation
+ */
+static void
+xfs_btree_split_worker(
+       struct work_struct      *work)
+{
+       struct xfs_btree_split_args     *args = container_of(work,
+                                               struct xfs_btree_split_args, work);
+       unsigned long           pflags;
+       unsigned long           new_pflags = PF_FSTRANS;
+
+       /*
+        * we are in a transaction context here, but may also be doing work
+        * in kswapd context, and hence we may need to inherit that state
+        * temporarily to ensure that we don't block waiting for memory reclaim
+        * in any way.
+        */
+       if (args->kswapd)
+               new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+
+       current_set_flags_nested(&pflags, new_pflags);
+
+       args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
+                                        args->key, args->curp, args->stat);
+       complete(args->done);
+
+       current_restore_flags_nested(&pflags, new_pflags);
+}
+
+/*
+ * BMBT split requests often come in with little stack to work on. Push
+ * them off to a worker thread so there is lots of stack to use. For the other
+ * btree types, just call directly to avoid the context switch overhead here.
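+ * The calling task simply blocks on a completion while the worker runs
+ * __xfs_btree_split() on the workqueue thread's fresh stack.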
+ */
+STATIC int                                     /* error */
+xfs_btree_split(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       union xfs_btree_ptr     *ptrp,
+       union xfs_btree_key     *key,
+       struct xfs_btree_cur    **curp,
+       int                     *stat)          /* success/failure */
+{
+       struct xfs_btree_split_args     args;
+       DECLARE_COMPLETION_ONSTACK(done);
+
+       if (cur->bc_btnum != XFS_BTNUM_BMAP)
+               return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
+
+       args.cur = cur;
+       args.level = level;
+       args.ptrp = ptrp;
+       args.key = key;
+       args.curp = curp;
+       args.stat = stat;
+       args.done = &done;
+       args.kswapd = current_is_kswapd();
+       INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
+       queue_work(xfs_alloc_wq, &args.work);
+       wait_for_completion(&done);
+       destroy_work_on_stack(&args.work);
+       return args.result;
+}
+
+/*
+ * Copy the old inode root contents into a real block and make the
+ * broot point to it.
+ */
+int                                            /* error */
+xfs_btree_new_iroot(
+       struct xfs_btree_cur    *cur,           /* btree cursor */
+       int                     *logflags,      /* logging flags for inode */
+       int                     *stat)          /* return status - 0 fail */
+{
+       struct xfs_buf          *cbp;           /* buffer for cblock */
+       struct xfs_btree_block  *block;         /* btree block */
+       struct xfs_btree_block  *cblock;        /* child btree block */
+       union xfs_btree_key     *ckp;           /* child key pointer */
+       union xfs_btree_ptr     *cpp;           /* child ptr pointer */
+       union xfs_btree_key     *kp;            /* pointer to btree key */
+       union xfs_btree_ptr     *pp;            /* pointer to block addr */
+       union xfs_btree_ptr     nptr;           /* new block addr */
+       int                     level;          /* btree level */
+       int                     error;          /* error return code */
+#ifdef DEBUG
+       int                     i;              /* loop counter */
+#endif
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_STATS_INC(cur, newroot);
+
+       ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+       level = cur->bc_nlevels - 1;
+
+       block = xfs_btree_get_iroot(cur);
+       pp = xfs_btree_ptr_addr(cur, 1, block);
+
+       /* Allocate the new block. If we can't do it, we're toast. Give up. */
+       error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
+       if (error)
+               goto error0;
+       if (*stat == 0) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               return 0;
+       }
+       XFS_BTREE_STATS_INC(cur, alloc);
+
+       /* Copy the root into a real block. */
+       error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
+       if (error)
+               goto error0;
+
+       /*
+        * We can't just memcpy() the root in for CRC enabled btree blocks.
+        * In that case we also have to ensure the blkno remains correct.
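+        * (CRC-enabled blocks are self describing: the header records the
+        * block's own disk address, which the read verifiers check.)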
+        */
+       memcpy(cblock, block, xfs_btree_block_len(cur));
+       if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+               if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+                       cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
+               else
+                       cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
+       }
+
+       be16_add_cpu(&block->bb_level, 1);
+       xfs_btree_set_numrecs(block, 1);
+       cur->bc_nlevels++;
+       cur->bc_ptrs[level + 1] = 1;
+
+       kp = xfs_btree_key_addr(cur, 1, block);
+       ckp = xfs_btree_key_addr(cur, 1, cblock);
+       xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
+
+       cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+       for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
+               error = xfs_btree_check_ptr(cur, pp, i, level);
+               if (error)
+                       goto error0;
+       }
+#endif
+       xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
+
+#ifdef DEBUG
+       error = xfs_btree_check_ptr(cur, &nptr, 0, level);
+       if (error)
+               goto error0;
+#endif
+       xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
+
+       xfs_iroot_realloc(cur->bc_private.b.ip,
+                         1 - xfs_btree_get_numrecs(cblock),
+                         cur->bc_private.b.whichfork);
+
+       xfs_btree_setbuf(cur, level, cbp);
+
+       /*
+        * Do all this logging at the end so that
+        * the root is at the right level.
+        */
+       xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+       xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+       xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+
+       *logflags |=
+               XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
+       *stat = 1;
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Allocate a new root block, fill it in.
+ */
+STATIC int                             /* error */
+xfs_btree_new_root(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       int                     *stat)  /* success/failure */
+{
+       struct xfs_btree_block  *block; /* one half of the old root block */
+       struct xfs_buf          *bp;    /* buffer containing block */
+       int                     error;  /* error return value */
+       struct xfs_buf          *lbp;   /* left buffer pointer */
+       struct xfs_btree_block  *left;  /* left btree block */
+       struct xfs_buf          *nbp;   /* new (root) buffer */
+       struct xfs_btree_block  *new;   /* new (root) btree block */
+       int                     nptr;   /* new value for key index, 1 or 2 */
+       struct xfs_buf          *rbp;   /* right buffer pointer */
+       struct xfs_btree_block  *right; /* right btree block */
+       union xfs_btree_ptr     rptr;
+       union xfs_btree_ptr     lptr;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_STATS_INC(cur, newroot);
+
+       /* initialise our start point from the cursor */
+       cur->bc_ops->init_ptr_from_cur(cur, &rptr);
+
+       /* Allocate the new block. If we can't do it, we're toast. Give up. */
+       error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
+       if (error)
+               goto error0;
+       if (*stat == 0)
+               goto out0;
+       XFS_BTREE_STATS_INC(cur, alloc);
+
+       /* Set up the new block. */
+       error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
+       if (error)
+               goto error0;
+
+       /* Set the root in the holding structure, increasing the level by 1. */
+       cur->bc_ops->set_root(cur, &lptr, 1);
+
+       /*
+        * At the previous root level there are now two blocks: the old root,
+        * and the new block generated when it was split.  We don't know which
+        * one the cursor is pointing at, so we set up variables "left" and
+        * "right" for each case.
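+        * If the block still has a right sibling it must be the left half
+        * of the split; otherwise it is the right half and we read in its
+        * left sibling instead.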
+        */
+       block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
+       if (error)
+               goto error0;
+#endif
+
+       xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+       if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+               /* Our block is left, pick up the right block. */
+               lbp = bp;
+               xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+               left = block;
+               error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+               if (error)
+                       goto error0;
+               bp = rbp;
+               nptr = 1;
+       } else {
+               /* Our block is right, pick up the left block. */
+               rbp = bp;
+               xfs_btree_buf_to_ptr(cur, rbp, &rptr);
+               right = block;
+               xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+               error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+               if (error)
+                       goto error0;
+               bp = lbp;
+               nptr = 2;
+       }
+       /* Fill in the new block's btree header and log it. */
+       xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
+       xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
+       ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
+                       !xfs_btree_ptr_is_null(cur, &rptr));
+
+       /* Fill in the key data in the new root. */
+       if (xfs_btree_get_level(left) > 0) {
+               xfs_btree_copy_keys(cur,
+                               xfs_btree_key_addr(cur, 1, new),
+                               xfs_btree_key_addr(cur, 1, left), 1);
+               xfs_btree_copy_keys(cur,
+                               xfs_btree_key_addr(cur, 2, new),
+                               xfs_btree_key_addr(cur, 1, right), 1);
+       } else {
+               cur->bc_ops->init_key_from_rec(
+                               xfs_btree_key_addr(cur, 1, new),
+                               xfs_btree_rec_addr(cur, 1, left));
+               cur->bc_ops->init_key_from_rec(
+                               xfs_btree_key_addr(cur, 2, new),
+                               xfs_btree_rec_addr(cur, 1, right));
+       }
+       xfs_btree_log_keys(cur, nbp, 1, 2);
+
+       /* Fill in the pointer data in the new root. */
+       xfs_btree_copy_ptrs(cur,
+               xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
+       xfs_btree_copy_ptrs(cur,
+               xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
+       xfs_btree_log_ptrs(cur, nbp, 1, 2);
+
+       /* Fix up the cursor. */
+       xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
+       cur->bc_ptrs[cur->bc_nlevels] = nptr;
+       cur->bc_nlevels++;
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+}
+
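+/*
+ * Make room in a full block: if the root lives in the inode, grow it in
+ * place or replace it with a real block; otherwise try shifting an entry
+ * to the right sibling, then to the left sibling, and finally split the
+ * block in half.
+ */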
+STATIC int
+xfs_btree_make_block_unfull(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       int                     level,  /* btree level */
+       int                     numrecs,/* # of recs in block */
+       int                     *oindex,/* old tree index */
+       int                     *index, /* new tree index */
+       union xfs_btree_ptr     *nptr,  /* new btree ptr */
+       struct xfs_btree_cur    **ncur, /* new btree cursor */
+       union xfs_btree_rec     *nrec,  /* new record */
+       int                     *stat)
+{
+       union xfs_btree_key     key;    /* new btree key value */
+       int                     error = 0;
+
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           level == cur->bc_nlevels - 1) {
+               struct xfs_inode *ip = cur->bc_private.b.ip;
+
+               if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
+                       /* A root block that can be made bigger. */
+                       xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
+               } else {
+                       /* A root block that needs replacing */
+                       int     logflags = 0;
+
+                       error = xfs_btree_new_iroot(cur, &logflags, stat);
+                       if (error || *stat == 0)
+                               return error;
+
+                       xfs_trans_log_inode(cur->bc_tp, ip, logflags);
+               }
+
+               return 0;
+       }
+
+       /* First, try shifting an entry to the right neighbor. */
+       error = xfs_btree_rshift(cur, level, stat);
+       if (error || *stat)
+               return error;
+
+       /* Next, try shifting an entry to the left neighbor. */
+       error = xfs_btree_lshift(cur, level, stat);
+       if (error)
+               return error;
+
+       if (*stat) {
+               *oindex = *index = cur->bc_ptrs[level];
+               return 0;
+       }
+
+       /*
+        * Next, try splitting the current block in half.
+        *
+        * If this works we have to re-set our variables because we
+        * could be in a different block now.
+        */
+       error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
+       if (error || *stat == 0)
+               return error;
+
+       *index = cur->bc_ptrs[level];
+       cur->bc_ops->init_rec_from_key(&key, nrec);
+       return 0;
+}
+
+/*
+ * Insert one record/level.  Return information to the caller
+ * allowing the next level up to proceed if necessary.
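+ *
+ * If the block had to be split, *ptrp and *recp return the new block and
+ * the key/record to insert at the next level up, and *curp may return a
+ * new cursor for the caller to continue the multi-level insert with.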
+ */
+STATIC int
+xfs_btree_insrec(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       int                     level,  /* level to insert record at */
+       union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
+       union xfs_btree_rec     *recp,  /* i/o: record data inserted */
+       struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
+       int                     *stat)  /* success/failure */
+{
+       struct xfs_btree_block  *block; /* btree block */
+       struct xfs_buf          *bp;    /* buffer for block */
+       union xfs_btree_key     key;    /* btree key */
+       union xfs_btree_ptr     nptr;   /* new block ptr */
+       struct xfs_btree_cur    *ncur;  /* new btree cursor */
+       union xfs_btree_rec     nrec;   /* new record */
+       int                     optr;   /* old key/record index */
+       int                     ptr;    /* key/record index */
+       int                     numrecs;/* number of records */
+       int                     error;  /* error return value */
+#ifdef DEBUG
+       int                     i;
+#endif
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
+
+       ncur = NULL;
+
+       /*
+        * If we have an external root pointer, and we've made it to the
+        * root level, allocate a new root block and we're done.
+        */
+       if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           (level >= cur->bc_nlevels)) {
+               error = xfs_btree_new_root(cur, stat);
+               xfs_btree_set_ptr_null(cur, ptrp);
+
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               return error;
+       }
+
+       /* If we're off the left edge, return failure. */
+       ptr = cur->bc_ptrs[level];
+       if (ptr == 0) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
+       }
+
+       /* Make a key out of the record data to be inserted, and save it. */
+       cur->bc_ops->init_key_from_rec(&key, recp);
+
+       optr = ptr;
+
+       XFS_BTREE_STATS_INC(cur, insrec);
+
+       /* Get pointers to the btree buffer and block. */
+       block = xfs_btree_get_block(cur, level, &bp);
+       numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               goto error0;
+
+       /* Check that the new entry is being inserted in the right place. */
+       if (ptr <= numrecs) {
+               if (level == 0) {
+                       ASSERT(cur->bc_ops->recs_inorder(cur, recp,
+                               xfs_btree_rec_addr(cur, ptr, block)));
+               } else {
+                       ASSERT(cur->bc_ops->keys_inorder(cur, &key,
+                               xfs_btree_key_addr(cur, ptr, block)));
+               }
+       }
+#endif
+
+       /*
+        * If the block is full, we can't insert the new entry until we
+        * make the block un-full.
+        */
+       xfs_btree_set_ptr_null(cur, &nptr);
+       if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
+               error = xfs_btree_make_block_unfull(cur, level, numrecs,
+                                       &optr, &ptr, &nptr, &ncur, &nrec, stat);
+               if (error || *stat == 0)
+                       goto error0;
+       }
+
+       /*
+        * The current block may have changed if the block was
+        * previously full and we have just made space in it.
+        */
+       block = xfs_btree_get_block(cur, level, &bp);
+       numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               return error;
+#endif
+
+       /*
+        * At this point we know there's room for our new entry in the block
+        * we're pointing at.
+        */
+       XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
+
+       if (level > 0) {
+               /* It's a non-leaf.  Make a hole in the keys and ptrs. */
+               union xfs_btree_key     *kp;
+               union xfs_btree_ptr     *pp;
+
+               kp = xfs_btree_key_addr(cur, ptr, block);
+               pp = xfs_btree_ptr_addr(cur, ptr, block);
+
+#ifdef DEBUG
+               for (i = numrecs - ptr; i >= 0; i--) {
+                       error = xfs_btree_check_ptr(cur, pp, i, level);
+                       if (error)
+                               return error;
+               }
+#endif
+
+               xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
+               xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
+
+#ifdef DEBUG
+               error = xfs_btree_check_ptr(cur, ptrp, 0, level);
+               if (error)
+                       goto error0;
+#endif
+
+               /* Now put the new data in, bump numrecs and log it. */
+               xfs_btree_copy_keys(cur, kp, &key, 1);
+               xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
+               numrecs++;
+               xfs_btree_set_numrecs(block, numrecs);
+               xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
+               xfs_btree_log_keys(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+               if (ptr < numrecs) {
+                       ASSERT(cur->bc_ops->keys_inorder(cur, kp,
+                               xfs_btree_key_addr(cur, ptr + 1, block)));
+               }
+#endif
+       } else {
+               /* It's a leaf.  Make a hole in the records. */
+               union xfs_btree_rec             *rp;
+
+               rp = xfs_btree_rec_addr(cur, ptr, block);
+
+               xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
+
+               /* Now put the new data in, bump numrecs and log it. */
+               xfs_btree_copy_recs(cur, rp, recp, 1);
+               xfs_btree_set_numrecs(block, ++numrecs);
+               xfs_btree_log_recs(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+               if (ptr < numrecs) {
+                       ASSERT(cur->bc_ops->recs_inorder(cur, rp,
+                               xfs_btree_rec_addr(cur, ptr + 1, block)));
+               }
+#endif
+       }
+
+       /* Log the new number of records in the btree header. */
+       xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+       /* If we inserted at the start of a block, update the parents' keys. */
+       if (optr == 1) {
+               error = xfs_btree_updkey(cur, &key, level + 1);
+               if (error)
+                       goto error0;
+       }
+
+       /*
+        * If we are tracking the last record in the tree and
+        * we are at the far right edge of the tree, update it.
+        */
+       if (xfs_btree_is_lastrec(cur, block, level)) {
+               cur->bc_ops->update_lastrec(cur, block, recp,
+                                           ptr, LASTREC_INSREC);
+       }
+
+       /*
+        * Return the new block number, if any.
+        * If there is one, give back a record value and a cursor too.
+        */
+       *ptrp = nptr;
+       if (!xfs_btree_ptr_is_null(cur, &nptr)) {
+               *recp = nrec;
+               *curp = ncur;
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Insert the record at the point referenced by cur.
+ *
+ * A multi-level split of the tree on insert will invalidate the original
+ * cursor.  All callers of this function should assume that the cursor is
+ * no longer valid and revalidate it.
+ */
+int
+xfs_btree_insert(
+       struct xfs_btree_cur    *cur,
+       int                     *stat)
+{
+       int                     error;  /* error return value */
+       int                     i;      /* result value, 0 for failure */
+       int                     level;  /* current level number in btree */
+       union xfs_btree_ptr     nptr;   /* new block number (split result) */
+       struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
+       struct xfs_btree_cur    *pcur;  /* previous level's cursor */
+       union xfs_btree_rec     rec;    /* record to insert */
+
+       level = 0;
+       ncur = NULL;
+       pcur = cur;
+
+       xfs_btree_set_ptr_null(cur, &nptr);
+       cur->bc_ops->init_rec_from_cur(cur, &rec);
+
+       /*
+        * Loop going up the tree, starting at the leaf level.
+        * Stop when we don't get a split block; that must mean the
+        * insert is finished with this level.
+        */
+       do {
+               /*
+                * Insert nrec/nptr into this level of the tree.
+                * Note if we fail, nptr will be null.
+                */
+               error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
+               if (error) {
+                       if (pcur != cur)
+                               xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+                       goto error0;
+               }
+
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               level++;
+
+               /*
+                * See if the cursor we just used is trash.
+        * We can't trash the caller's cursor, but otherwise we should
+        * trash it if ncur is a new cursor or we're about to be done.
+                */
+               if (pcur != cur &&
+                   (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
+                       /* Save the state from the cursor before we trash it */
+                       if (cur->bc_ops->update_cursor)
+                               cur->bc_ops->update_cursor(pcur, cur);
+                       cur->bc_nlevels = pcur->bc_nlevels;
+                       xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+               }
+               /* If we got a new cursor, switch to it. */
+               if (ncur) {
+                       pcur = ncur;
+                       ncur = NULL;
+               }
+       } while (!xfs_btree_ptr_is_null(cur, &nptr));
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = i;
+       return 0;
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Try to merge a non-leaf block back into the inode root.
+ *
+ * Note: the killroot name comes from the fact that we're effectively
+ * killing the old root block.  But because the root lives in the inode
+ * and can't simply be freed, we have to copy the single block it was
+ * pointing to into the inode instead.
+ */
+STATIC int
+xfs_btree_kill_iroot(
+       struct xfs_btree_cur    *cur)
+{
+       int                     whichfork = cur->bc_private.b.whichfork;
+       struct xfs_inode        *ip = cur->bc_private.b.ip;
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+       struct xfs_btree_block  *block;
+       struct xfs_btree_block  *cblock;
+       union xfs_btree_key     *kp;
+       union xfs_btree_key     *ckp;
+       union xfs_btree_ptr     *pp;
+       union xfs_btree_ptr     *cpp;
+       struct xfs_buf          *cbp;
+       int                     level;
+       int                     index;
+       int                     numrecs;
+#ifdef DEBUG
+       union xfs_btree_ptr     ptr;
+       int                     i;
+#endif
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+       ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+       ASSERT(cur->bc_nlevels > 1);
+
+       /*
+        * Don't deal with the case where the root block needs to be a leaf.
+        * We're just going to turn the thing back into extents anyway.
+        */
+       level = cur->bc_nlevels - 1;
+       if (level == 1)
+               goto out0;
+
+       /*
+        * Give up if the root has multiple children.
+        */
+       block = xfs_btree_get_iroot(cur);
+       if (xfs_btree_get_numrecs(block) != 1)
+               goto out0;
+
+       cblock = xfs_btree_get_block(cur, level - 1, &cbp);
+       numrecs = xfs_btree_get_numrecs(cblock);
+
+       /*
+        * Only do this if the next level's data will fit in the inode
+        * root: it gets copied up into the inode, so instead of freeing
+        * the root block we free its only child.
+        */
+       if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
+               goto out0;
+
+       XFS_BTREE_STATS_INC(cur, killroot);
+
+#ifdef DEBUG
+       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+       ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+       ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+#endif
+
+       index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
+       if (index) {
+               xfs_iroot_realloc(cur->bc_private.b.ip, index,
+                                 cur->bc_private.b.whichfork);
+               block = ifp->if_broot;
+       }
+
+       be16_add_cpu(&block->bb_numrecs, index);
+       ASSERT(block->bb_numrecs == cblock->bb_numrecs);
+
+       kp = xfs_btree_key_addr(cur, 1, block);
+       ckp = xfs_btree_key_addr(cur, 1, cblock);
+       xfs_btree_copy_keys(cur, kp, ckp, numrecs);
+
+       pp = xfs_btree_ptr_addr(cur, 1, block);
+       cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+       for (i = 0; i < numrecs; i++) {
+               int             error;
+
+               error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
+               if (error) {
+                       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+                       return error;
+               }
+       }
+#endif
+       xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+
+       cur->bc_ops->free_block(cur, cbp);
+       XFS_BTREE_STATS_INC(cur, free);
+
+       cur->bc_bufs[level - 1] = NULL;
+       be16_add_cpu(&block->bb_level, -1);
+       xfs_trans_log_inode(cur->bc_tp, ip,
+               XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+       cur->bc_nlevels--;
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+}
+
+/*
+ * Kill the current root node, and replace it with its only child node.
+ */
+STATIC int
+xfs_btree_kill_root(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp,
+       int                     level,
+       union xfs_btree_ptr     *newroot)
+{
+       int                     error;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_STATS_INC(cur, killroot);
+
+       /*
+        * Update the root pointer, decreasing the level by 1 and then
+        * free the old root.
+        */
+       cur->bc_ops->set_root(cur, newroot, -1);
+
+       error = cur->bc_ops->free_block(cur, bp);
+       if (error) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+               return error;
+       }
+
+       XFS_BTREE_STATS_INC(cur, free);
+
+       cur->bc_bufs[level] = NULL;
+       cur->bc_ra[level] = 0;
+       cur->bc_nlevels--;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+}
+
+STATIC int
+xfs_btree_dec_cursor(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     *stat)
+{
+       int                     error;
+       int                     i;
+
+       if (level > 0) {
+               error = xfs_btree_decrement(cur, level, &i);
+               if (error)
+                       return error;
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+}
+
+/*
+ * Single level of the btree record deletion routine.
+ * Delete record pointed to by cur/level.
+ * Remove the record from its block then rebalance the tree.
+ * Set *stat to 0 for failure, 1 for done, 2 to go on to the next level.
+ */
+STATIC int                                     /* error */
+xfs_btree_delrec(
+       struct xfs_btree_cur    *cur,           /* btree cursor */
+       int                     level,          /* level removing record from */
+       int                     *stat)          /* fail/done/go-on */
+{
+       struct xfs_btree_block  *block;         /* btree block */
+       union xfs_btree_ptr     cptr;           /* current block ptr */
+       struct xfs_buf          *bp;            /* buffer for block */
+       int                     error;          /* error return value */
+       int                     i;              /* loop counter */
+       union xfs_btree_key     key;            /* storage for keyp */
+       union xfs_btree_key     *keyp = &key;   /* passed to the next level */
+       union xfs_btree_ptr     lptr;           /* left sibling block ptr */
+       struct xfs_buf          *lbp;           /* left buffer pointer */
+       struct xfs_btree_block  *left;          /* left btree block */
+       int                     lrecs = 0;      /* left record count */
+       int                     ptr;            /* key/record index */
+       union xfs_btree_ptr     rptr;           /* right sibling block ptr */
+       struct xfs_buf          *rbp;           /* right buffer pointer */
+       struct xfs_btree_block  *right;         /* right btree block */
+       struct xfs_btree_block  *rrblock;       /* right-right btree block */
+       struct xfs_buf          *rrbp;          /* right-right buffer pointer */
+       int                     rrecs = 0;      /* right record count */
+       struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
+       int                     numrecs;        /* temporary numrec count */
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, level);
+
+       tcur = NULL;
+
+       /* Get the index of the entry being deleted, check for nothing there. */
+       ptr = cur->bc_ptrs[level];
+       if (ptr == 0) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
+       }
+
+       /* Get the buffer & block containing the record or key/ptr. */
+       block = xfs_btree_get_block(cur, level, &bp);
+       numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               goto error0;
+#endif
+
+       /* Fail if we're off the end of the block. */
+       if (ptr > numrecs) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
+       }
+
+       XFS_BTREE_STATS_INC(cur, delrec);
+       XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
+
+       /* Excise the entries being deleted. */
+       if (level > 0) {
+               /* It's a non-leaf: operate on keys and ptrs. */
+               union xfs_btree_key     *lkp;
+               union xfs_btree_ptr     *lpp;
+
+               lkp = xfs_btree_key_addr(cur, ptr + 1, block);
+               lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
+
+#ifdef DEBUG
+               for (i = 0; i < numrecs - ptr; i++) {
+                       error = xfs_btree_check_ptr(cur, lpp, i, level);
+                       if (error)
+                               goto error0;
+               }
+#endif
+
+               if (ptr < numrecs) {
+                       xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
+                       xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
+                       xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
+                       xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
+               }
+
+               /*
+                * If it's the first record in the block, we'll need to pass a
+                * key up to the next level (updkey).
+                */
+               if (ptr == 1)
+                       keyp = xfs_btree_key_addr(cur, 1, block);
+       } else {
+               /* It's a leaf: operate on records. */
+               if (ptr < numrecs) {
+                       xfs_btree_shift_recs(cur,
+                               xfs_btree_rec_addr(cur, ptr + 1, block),
+                               -1, numrecs - ptr);
+                       xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
+               }
+
+               /*
+                * If it's the first record in the block, we'll need a key
+                * structure to pass up to the next level (updkey).
+                */
+               if (ptr == 1) {
+                       cur->bc_ops->init_key_from_rec(&key,
+                                       xfs_btree_rec_addr(cur, 1, block));
+                       keyp = &key;
+               }
+       }
+
+       /*
+        * Decrement and log the number of entries in the block.
+        */
+       xfs_btree_set_numrecs(block, --numrecs);
+       xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+       /*
+        * If we are tracking the last record in the tree and
+        * we are at the far right edge of the tree, update it.
+        */
+       if (xfs_btree_is_lastrec(cur, block, level)) {
+               cur->bc_ops->update_lastrec(cur, block, NULL,
+                                           ptr, LASTREC_DELREC);
+       }
+
+       /*
+        * We're at the root level.  First, shrink the root block in memory.
+        * Then try to get rid of the next level down.  If we can't, there's
+        * nothing left to do.
+        */
+       if (level == cur->bc_nlevels - 1) {
+               if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+                       xfs_iroot_realloc(cur->bc_private.b.ip, -1,
+                                         cur->bc_private.b.whichfork);
+
+                       error = xfs_btree_kill_iroot(cur);
+                       if (error)
+                               goto error0;
+
+                       error = xfs_btree_dec_cursor(cur, level, stat);
+                       if (error)
+                               goto error0;
+                       *stat = 1;
+                       return 0;
+               }
+
+               /*
+                * If this is the root level, and there's only one entry left,
+                * and it's NOT the leaf level, then we can get rid of this
+                * level.
+                */
+               if (numrecs == 1 && level > 0) {
+                       union xfs_btree_ptr     *pp;
+                       /*
+                        * Take the first (and only remaining) pointer in
+                        * the block and make it the new root of the btree.
+                        */
+                       pp = xfs_btree_ptr_addr(cur, 1, block);
+                       error = xfs_btree_kill_root(cur, bp, level, pp);
+                       if (error)
+                               goto error0;
+               } else if (level > 0) {
+                       error = xfs_btree_dec_cursor(cur, level, stat);
+                       if (error)
+                               goto error0;
+               }
+               *stat = 1;
+               return 0;
+       }
+
+       /*
+        * If we deleted the leftmost entry in the block, update the
+        * key values above us in the tree.
+        */
+       if (ptr == 1) {
+               error = xfs_btree_updkey(cur, keyp, level + 1);
+               if (error)
+                       goto error0;
+       }
+
+       /*
+        * If the number of records remaining in the block is at least
+        * the minimum, we're done.
+        */
+       if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
+               error = xfs_btree_dec_cursor(cur, level, stat);
+               if (error)
+                       goto error0;
+               return 0;
+       }
+
+       /*
+        * Otherwise, we have to move some records around to keep the
+        * tree balanced.  Look at the left and right sibling blocks to
+        * see if we can re-balance by moving only one record.
+        */
+       xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+       xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
+
+       if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+               /*
+                * The root has only one child: we need a chance to copy its
+                * contents into the root and delete it.  We can't go up to
+                * the next level - there's nothing to delete there.
+                */
+               if (xfs_btree_ptr_is_null(cur, &rptr) &&
+                   xfs_btree_ptr_is_null(cur, &lptr) &&
+                   level == cur->bc_nlevels - 2) {
+                       error = xfs_btree_kill_iroot(cur);
+                       if (!error)
+                               error = xfs_btree_dec_cursor(cur, level, stat);
+                       if (error)
+                               goto error0;
+                       return 0;
+               }
+       }
+
+       ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
+              !xfs_btree_ptr_is_null(cur, &lptr));
+
+       /*
+        * Duplicate the cursor so our btree manipulations here won't
+        * disrupt the next level up.
+        */
+       error = xfs_btree_dup_cursor(cur, &tcur);
+       if (error)
+               goto error0;
+
+       /*
+        * If there's a right sibling, see if it's ok to shift an entry
+        * out of it.
+        */
+       if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+               /*
+                * Move the temp cursor to the last entry in the next block.
+                * Actually any entry but the first would suffice.
+                */
+               i = xfs_btree_lastrec(tcur, level);
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               error = xfs_btree_increment(tcur, level, &i);
+               if (error)
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               i = xfs_btree_lastrec(tcur, level);
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               /* Grab a pointer to the block. */
+               right = xfs_btree_get_block(tcur, level, &rbp);
+#ifdef DEBUG
+               error = xfs_btree_check_block(tcur, right, level, rbp);
+               if (error)
+                       goto error0;
+#endif
+               /* Grab the current block number, for future use. */
+               xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
+
+               /*
+                * If right block is full enough so that removing one entry
+                * won't make it too empty, and left-shifting an entry out
+                * of right to us works, we're done.
+                */
+               if (xfs_btree_get_numrecs(right) - 1 >=
+                   cur->bc_ops->get_minrecs(tcur, level)) {
+                       error = xfs_btree_lshift(tcur, level, &i);
+                       if (error)
+                               goto error0;
+                       if (i) {
+                               ASSERT(xfs_btree_get_numrecs(block) >=
+                                      cur->bc_ops->get_minrecs(tcur, level));
+
+                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+                               tcur = NULL;
+
+                               error = xfs_btree_dec_cursor(cur, level, stat);
+                               if (error)
+                                       goto error0;
+                               return 0;
+                       }
+               }
+
+               /*
+                * Otherwise, grab the number of records in right for
+                * future reference, and fix up the temp cursor to point
+                * to our block again (last record).
+                */
+               rrecs = xfs_btree_get_numrecs(right);
+               if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+                       i = xfs_btree_firstrec(tcur, level);
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+                       error = xfs_btree_decrement(tcur, level, &i);
+                       if (error)
+                               goto error0;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               }
+       }
+
+       /*
+        * If there's a left sibling, see if it's ok to shift an entry
+        * out of it.
+        */
+       if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+               /*
+                * Move the temp cursor to the first entry in the
+                * previous block.
+                */
+               i = xfs_btree_firstrec(tcur, level);
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               error = xfs_btree_decrement(tcur, level, &i);
+               if (error)
+                       goto error0;
+               i = xfs_btree_firstrec(tcur, level);
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               /* Grab a pointer to the block. */
+               left = xfs_btree_get_block(tcur, level, &lbp);
+#ifdef DEBUG
+               error = xfs_btree_check_block(cur, left, level, lbp);
+               if (error)
+                       goto error0;
+#endif
+               /* Grab the current block number, for future use. */
+               xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
+
+               /*
+                * If left block is full enough so that removing one entry
+                * won't make it too empty, and right-shifting an entry out
+                * of left to us works, we're done.
+                */
+               if (xfs_btree_get_numrecs(left) - 1 >=
+                   cur->bc_ops->get_minrecs(tcur, level)) {
+                       error = xfs_btree_rshift(tcur, level, &i);
+                       if (error)
+                               goto error0;
+                       if (i) {
+                               ASSERT(xfs_btree_get_numrecs(block) >=
+                                      cur->bc_ops->get_minrecs(tcur, level));
+                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+                               tcur = NULL;
+                               if (level == 0)
+                                       cur->bc_ptrs[0]++;
+                               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                               *stat = 1;
+                               return 0;
+                       }
+               }
+
+               /*
+                * Otherwise, grab the number of records in the left block
+                * for future reference.
+                */
+               lrecs = xfs_btree_get_numrecs(left);
+       }
+
+       /* Delete the temp cursor, we're done with it. */
+       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+       tcur = NULL;
+
+       /* If here, we need to do a join to keep the tree balanced. */
+       ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
+
+       if (!xfs_btree_ptr_is_null(cur, &lptr) &&
+           lrecs + xfs_btree_get_numrecs(block) <=
+                       cur->bc_ops->get_maxrecs(cur, level)) {
+               /*
+                * Set "right" to be the starting block,
+                * "left" to be the left neighbor.
+                */
+               rptr = cptr;
+               right = block;
+               rbp = bp;
+               error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+               if (error)
+                       goto error0;
+
+       /*
+        * If that won't work, see if we can join with the right neighbor block.
+        */
+       } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
+                  rrecs + xfs_btree_get_numrecs(block) <=
+                       cur->bc_ops->get_maxrecs(cur, level)) {
+               /*
+                * Set "left" to be the starting block,
+                * "right" to be the right neighbor.
+                */
+               lptr = cptr;
+               left = block;
+               lbp = bp;
+               error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+               if (error)
+                       goto error0;
+
+       /*
+        * Otherwise, we can't fix the imbalance.
+        * Just return.  This is probably a logic error, but it's not fatal.
+        */
+       } else {
+               error = xfs_btree_dec_cursor(cur, level, stat);
+               if (error)
+                       goto error0;
+               return 0;
+       }
+
+       rrecs = xfs_btree_get_numrecs(right);
+       lrecs = xfs_btree_get_numrecs(left);
+
+       /*
+        * We're now going to join "left" and "right" by moving all the stuff
+        * in "right" to "left" and deleting "right".
+        */
+       XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+       if (level > 0) {
+               /* It's a non-leaf.  Move keys and pointers. */
+               union xfs_btree_key     *lkp;   /* left btree key */
+               union xfs_btree_ptr     *lpp;   /* left address pointer */
+               union xfs_btree_key     *rkp;   /* right btree key */
+               union xfs_btree_ptr     *rpp;   /* right address pointer */
+
+               lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
+               lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
+               rkp = xfs_btree_key_addr(cur, 1, right);
+               rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+               for (i = 1; i < rrecs; i++) {
+                       error = xfs_btree_check_ptr(cur, rpp, i, level);
+                       if (error)
+                               goto error0;
+               }
+#endif
+               xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
+               xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
+
+               xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
+               xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
+       } else {
+               /* It's a leaf.  Move records.  */
+               union xfs_btree_rec     *lrp;   /* left record pointer */
+               union xfs_btree_rec     *rrp;   /* right record pointer */
+
+               lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
+               rrp = xfs_btree_rec_addr(cur, 1, right);
+
+               xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
+               xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
+       }
+
+       XFS_BTREE_STATS_INC(cur, join);
+
+       /*
+        * Fix up the number of records and right block pointer in the
+        * surviving block, and log it.
+        */
+       xfs_btree_set_numrecs(left, lrecs + rrecs);
+       xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
+       xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+       /* If there is a right sibling, point it to the remaining block. */
+       xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+       if (!xfs_btree_ptr_is_null(cur, &cptr)) {
+               error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
+               if (error)
+                       goto error0;
+               xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
+               xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+       }
+
+       /* Free the deleted block. */
+       error = cur->bc_ops->free_block(cur, rbp);
+       if (error)
+               goto error0;
+       XFS_BTREE_STATS_INC(cur, free);
+
+       /*
+        * If we joined with the left neighbor, set the buffer in the
+        * cursor to the left block, and fix up the index.
+        */
+       if (bp != lbp) {
+               cur->bc_bufs[level] = lbp;
+               cur->bc_ptrs[level] += lrecs;
+               cur->bc_ra[level] = 0;
+       }
+       /*
+        * If we joined with the right neighbor and there's a level above
+        * us, increment the cursor at that level.
+        */
+       else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
+                  (level + 1 < cur->bc_nlevels)) {
+               error = xfs_btree_increment(cur, level + 1, &i);
+               if (error)
+                       goto error0;
+       }
+
+       /*
+        * Readjust the ptr at this level if it's not a leaf, since it's
+        * still pointing at the deletion point, which makes the cursor
+        * inconsistent.  If this makes the ptr 0, the caller fixes it up.
+        * We can't use decrement because it would change the next level up.
+        */
+       if (level > 0)
+               cur->bc_ptrs[level]--;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       /* Return value means the next level up has something to do. */
+       *stat = 2;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       if (tcur)
+               xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+       return error;
+}
+
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ */
+int                                    /* error */
+xfs_btree_delete(
+       struct xfs_btree_cur    *cur,
+       int                     *stat)  /* success/failure */
+{
+       int                     error;  /* error return value */
+       int                     level;
+       int                     i;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+       /*
+        * Go up the tree, starting at leaf level.
+        *
+        * If 2 is returned then a join was done; go to the next level.
+        * Otherwise we are done.
+        */
+       for (level = 0, i = 2; i == 2; level++) {
+               error = xfs_btree_delrec(cur, level, &i);
+               if (error)
+                       goto error0;
+       }
+
+       if (i == 0) {
+               for (level = 1; level < cur->bc_nlevels; level++) {
+                       if (cur->bc_ptrs[level] == 0) {
+                               error = xfs_btree_decrement(cur, level, &i);
+                               if (error)
+                                       goto error0;
+                               break;
+                       }
+               }
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = i;
+       return 0;
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
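+
+/*
+ * A minimal usage sketch (illustrative only, compiled out): a caller
+ * normally positions the cursor with a lookup before deleting.  This
+ * assumes btree-specific code has already filled in the search key in
+ * cur->bc_rec.
+ */
+#if 0
+static int
+example_delete_found_rec(
+       struct xfs_btree_cur    *cur)
+{
+       int                     stat;   /* 1 = found/done, 0 = not found */
+       int                     error;
+
+       error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);
+       if (error || stat == 0)
+               return error;
+
+       /* the cursor now points at the matching record; delete it */
+       return xfs_btree_delete(cur, &stat);
+}
+#endif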
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int                                    /* error */
+xfs_btree_get_rec(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       union xfs_btree_rec     **recp, /* output: btree record */
+       int                     *stat)  /* output: success/failure */
+{
+       struct xfs_btree_block  *block; /* btree block */
+       struct xfs_buf          *bp;    /* buffer pointer */
+       int                     ptr;    /* record number */
+#ifdef DEBUG
+       int                     error;  /* error return value */
+#endif
+
+       ptr = cur->bc_ptrs[0];
+       block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, 0, bp);
+       if (error)
+               return error;
+#endif
+
+       /*
+        * Off the right end or left end, return failure.
+        */
+       if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
+               *stat = 0;
+               return 0;
+       }
+
+       /*
+        * Point to the record and extract its data.
+        */
+       *recp = xfs_btree_rec_addr(cur, ptr, block);
+       *stat = 1;
+       return 0;
+}
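+
+/*
+ * A minimal usage sketch (illustrative only, compiled out): walking leaf
+ * records from the current cursor position with xfs_btree_get_rec() and
+ * xfs_btree_increment().  The per-record callback "fn" is hypothetical.
+ */
+#if 0
+static int
+example_walk_recs(
+       struct xfs_btree_cur    *cur,
+       int                     (*fn)(union xfs_btree_rec *rec))
+{
+       union xfs_btree_rec     *rec;
+       int                     stat;
+       int                     error;
+
+       for (;;) {
+               error = xfs_btree_get_rec(cur, &rec, &stat);
+               if (error || stat == 0)
+                       return error;   /* error, or off either end */
+               error = fn(rec);
+               if (error)
+                       return error;
+               error = xfs_btree_increment(cur, 0, &stat);
+               if (error || stat == 0)
+                       return error;   /* error, or no more records */
+       }
+}
+#endif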
+
+/*
+ * Change the owner of a btree.
+ *
+ * The mechanism we use here is ordered buffer logging. Because we don't know
+ * how many buffers we are going to need to modify, we don't really want to
+ * have to make transaction reservations for the worst case of every buffer in
+ * a full-size btree, as that may be more space than we can fit in the log.
+ *
+ * We do the btree walk in the optimal manner - we have sibling
+ * pointers so we can just walk all the blocks on each level from left to right
+ * in a single pass, and then move to the next level and do the same. We can
+ * also do readahead on the sibling pointers to get IO moving more quickly,
+ * though for slow disks this is unlikely to make much difference to performance
+ * as the amount of CPU work we have to do before moving to the next block is
+ * relatively small.
+ *
+ * For each btree block that we load, modify the owner appropriately, set the
+ * buffer as an ordered buffer and log it appropriately. We need to ensure that
+ * we mark the region we change dirty so that if the buffer is relogged in
+ * a subsequent transaction the changes we make here as an ordered buffer are
+ * correctly relogged in that transaction.  If we are in recovery context, then
+ * just queue the modified buffer as delayed write buffer so the transaction
+ * recovery completion writes the changes to disk.
+ */
+static int
+xfs_btree_block_change_owner(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       __uint64_t              new_owner,
+       struct list_head        *buffer_list)
+{
+       struct xfs_btree_block  *block;
+       struct xfs_buf          *bp;
+       union xfs_btree_ptr     rptr;
+
+       /* do right sibling readahead */
+       xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+       /* modify the owner */
+       block = xfs_btree_get_block(cur, level, &bp);
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
+       else
+               block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
+
+       /*
+        * If the block is a root block hosted in an inode, we might not have a
+        * buffer pointer here and we shouldn't attempt to log the change as the
+        * information is already held in the inode and discarded when the root
+        * block is formatted into the on-disk inode fork. We still change it,
+        * though, so everything is consistent in memory.
+        */
+       if (bp) {
+               if (cur->bc_tp) {
+                       xfs_trans_ordered_buf(cur->bc_tp, bp);
+                       xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+               } else {
+                       xfs_buf_delwri_queue(bp, buffer_list);
+               }
+       } else {
+               ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+               ASSERT(level == cur->bc_nlevels - 1);
+       }
+
+       /* now read the right sibling block for the next iteration */
+       xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+       if (xfs_btree_ptr_is_null(cur, &rptr))
+               return -ENOENT;
+
+       return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+}
+
+int
+xfs_btree_change_owner(
+       struct xfs_btree_cur    *cur,
+       __uint64_t              new_owner,
+       struct list_head        *buffer_list)
+{
+       union xfs_btree_ptr     lptr;
+       int                     level;
+       struct xfs_btree_block  *block = NULL;
+       int                     error = 0;
+
+       cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+
+       /* for each level */
+       for (level = cur->bc_nlevels - 1; level >= 0; level--) {
+               /* grab the left hand block */
+               error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
+               if (error)
+                       return error;
+
+               /* readahead the leftmost block for the next level down */
+               if (level > 0) {
+                       union xfs_btree_ptr     *ptr;
+
+                       ptr = xfs_btree_ptr_addr(cur, 1, block);
+                       xfs_btree_readahead_ptr(cur, ptr, 1);
+
+                       /* save for the next iteration of the loop */
+                       lptr = *ptr;
+               }
+
+               /* for each buffer in the level */
+               do {
+                       error = xfs_btree_block_change_owner(cur, level,
+                                                            new_owner,
+                                                            buffer_list);
+               } while (!error);
+
+               if (error != -ENOENT)
+                       return error;
+       }
+
+       return 0;
+}
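+
+/*
+ * A minimal usage sketch (illustrative only, compiled out): in recovery
+ * context there is no transaction, so the modified buffers are queued on a
+ * delwri list and submitted afterwards.  Cursor setup is btree-specific
+ * and omitted here.
+ */
+#if 0
+static int
+example_change_owner_in_recovery(
+       struct xfs_btree_cur    *cur,
+       __uint64_t              new_owner)
+{
+       LIST_HEAD(buffer_list);
+       int                     error;
+
+       error = xfs_btree_change_owner(cur, new_owner, &buffer_list);
+       if (error)
+               return error;
+
+       /* write the queued buffers back to disk */
+       return xfs_buf_delwri_submit(&buffer_list);
+}
+#endif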
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
new file mode 100644 (file)
index 0000000..8f18bab
--- /dev/null
@@ -0,0 +1,468 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BTREE_H__
+#define        __XFS_BTREE_H__
+
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+extern kmem_zone_t     *xfs_btree_cur_zone;
+
+/*
+ * Generic key, ptr and record wrapper structures.
+ *
+ * These are disk format structures, and are converted where necessary
+ * by the btree specific code that needs to interpret them.
+ */
+union xfs_btree_ptr {
+       __be32                  s;      /* short form ptr */
+       __be64                  l;      /* long form ptr */
+};
+
+union xfs_btree_key {
+       xfs_bmbt_key_t          bmbt;
+       xfs_bmdr_key_t          bmbr;   /* bmbt root block */
+       xfs_alloc_key_t         alloc;
+       xfs_inobt_key_t         inobt;
+};
+
+union xfs_btree_rec {
+       xfs_bmbt_rec_t          bmbt;
+       xfs_bmdr_rec_t          bmbr;   /* bmbt root block */
+       xfs_alloc_rec_t         alloc;
+       xfs_inobt_rec_t         inobt;
+};
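+
+/*
+ * A sketch of how these unions get interpreted (illustrative only,
+ * compiled out): the cursor's XFS_BTREE_LONG_PTRS flag, defined below,
+ * selects the union member and the endian conversion.  This mirrors the
+ * internal ptr-to-daddr conversion rather than defining new API.
+ */
+#if 0
+static xfs_daddr_t
+example_ptr_to_daddr(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+       return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+                               be32_to_cpu(ptr->s));
+}
+#endif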
+
+/*
+ * This nonsense is to make -wlint happy.
+ */
+#define        XFS_LOOKUP_EQ   ((xfs_lookup_t)XFS_LOOKUP_EQi)
+#define        XFS_LOOKUP_LE   ((xfs_lookup_t)XFS_LOOKUP_LEi)
+#define        XFS_LOOKUP_GE   ((xfs_lookup_t)XFS_LOOKUP_GEi)
+
+#define        XFS_BTNUM_BNO   ((xfs_btnum_t)XFS_BTNUM_BNOi)
+#define        XFS_BTNUM_CNT   ((xfs_btnum_t)XFS_BTNUM_CNTi)
+#define        XFS_BTNUM_BMAP  ((xfs_btnum_t)XFS_BTNUM_BMAPi)
+#define        XFS_BTNUM_INO   ((xfs_btnum_t)XFS_BTNUM_INOi)
+#define        XFS_BTNUM_FINO  ((xfs_btnum_t)XFS_BTNUM_FINOi)
+
+/*
+ * For logging record fields.
+ */
+#define        XFS_BB_MAGIC            (1 << 0)
+#define        XFS_BB_LEVEL            (1 << 1)
+#define        XFS_BB_NUMRECS          (1 << 2)
+#define        XFS_BB_LEFTSIB          (1 << 3)
+#define        XFS_BB_RIGHTSIB         (1 << 4)
+#define        XFS_BB_BLKNO            (1 << 5)
+#define        XFS_BB_LSN              (1 << 6)
+#define        XFS_BB_UUID             (1 << 7)
+#define        XFS_BB_OWNER            (1 << 8)
+#define        XFS_BB_NUM_BITS         5
+#define        XFS_BB_ALL_BITS         ((1 << XFS_BB_NUM_BITS) - 1)
+#define        XFS_BB_NUM_BITS_CRC     9
+#define        XFS_BB_ALL_BITS_CRC     ((1 << XFS_BB_NUM_BITS_CRC) - 1)
+
+/*
+ * Generic stats interface
+ */
+#define __XFS_BTREE_STATS_INC(type, stat) \
+       XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
+#define XFS_BTREE_STATS_INC(cur, stat)  \
+do {    \
+       switch (cur->bc_btnum) {  \
+       case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break;   \
+       case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break;   \
+       case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break;  \
+       case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break;    \
+       case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break;  \
+       case XFS_BTNUM_MAX: ASSERT(0); /* keep gcc quiet */ ; break;    \
+       }       \
+} while (0)
+
+#define __XFS_BTREE_STATS_ADD(type, stat, val) \
+       XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
+#define XFS_BTREE_STATS_ADD(cur, stat, val)  \
+do {    \
+       switch (cur->bc_btnum) {  \
+       case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
+       case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
+       case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
+       case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
+       case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \
+       case XFS_BTNUM_MAX: ASSERT(0); /* keep gcc quiet */ ; break;    \
+       }       \
+} while (0)
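+
+/*
+ * For example, on a by-block-number allocation btree cursor
+ * (XFS_BTNUM_BNO), XFS_BTREE_STATS_INC(cur, insrec) expands to
+ * XFS_STATS_INC(xs_abtb_2_insrec).
+ */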
+
+#define        XFS_BTREE_MAXLEVELS     8       /* max of all btrees */
+
+struct xfs_btree_ops {
+       /* size of the key and record structures */
+       size_t  key_len;
+       size_t  rec_len;
+
+       /* cursor operations */
+       struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
+       void    (*update_cursor)(struct xfs_btree_cur *src,
+                                struct xfs_btree_cur *dst);
+
+       /* update btree root pointer */
+       void    (*set_root)(struct xfs_btree_cur *cur,
+                           union xfs_btree_ptr *nptr, int level_change);
+
+       /* block allocation / freeing */
+       int     (*alloc_block)(struct xfs_btree_cur *cur,
+                              union xfs_btree_ptr *start_bno,
+                              union xfs_btree_ptr *new_bno,
+                              int *stat);
+       int     (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+
+       /* update last record information */
+       void    (*update_lastrec)(struct xfs_btree_cur *cur,
+                                 struct xfs_btree_block *block,
+                                 union xfs_btree_rec *rec,
+                                 int ptr, int reason);
+
+       /* records in block/level */
+       int     (*get_minrecs)(struct xfs_btree_cur *cur, int level);
+       int     (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
+
+       /* records on disk.  Matters only for the root-in-inode case. */
+       int     (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
+
+       /* init values of btree structures */
+       void    (*init_key_from_rec)(union xfs_btree_key *key,
+                                    union xfs_btree_rec *rec);
+       void    (*init_rec_from_key)(union xfs_btree_key *key,
+                                    union xfs_btree_rec *rec);
+       void    (*init_rec_from_cur)(struct xfs_btree_cur *cur,
+                                    union xfs_btree_rec *rec);
+       void    (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
+                                    union xfs_btree_ptr *ptr);
+
+       /* difference between key value and cursor value */
+       __int64_t (*key_diff)(struct xfs_btree_cur *cur,
+                             union xfs_btree_key *key);
+
+       const struct xfs_buf_ops        *buf_ops;
+
+#if defined(DEBUG) || defined(XFS_WARN)
+       /* check that k1 is lower than k2 */
+       int     (*keys_inorder)(struct xfs_btree_cur *cur,
+                               union xfs_btree_key *k1,
+                               union xfs_btree_key *k2);
+
+       /* check that r1 is lower than r2 */
+       int     (*recs_inorder)(struct xfs_btree_cur *cur,
+                               union xfs_btree_rec *r1,
+                               union xfs_btree_rec *r2);
+#endif
+};
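+
+/*
+ * A sketch of how a btree type might implement the record-count methods
+ * (illustrative only, compiled out).  The example_maxrecs() helper is
+ * hypothetical; real implementations live in the per-btree files.
+ */
+#if 0
+STATIC int
+example_get_maxrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       /* records that fit in a block, minus the block header */
+       return example_maxrecs(cur->bc_mp, level != 0);
+}
+
+STATIC int
+example_get_minrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       /* keep blocks at least half full */
+       return example_get_maxrecs(cur, level) / 2;
+}
+#endif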
+
+/*
+ * Reasons for the update_lastrec method to be called.
+ */
+#define LASTREC_UPDATE 0
+#define LASTREC_INSREC 1
+#define LASTREC_DELREC 2
+
+
+/*
+ * Btree cursor structure.
+ * This collects all information needed by the btree code in one place.
+ */
+typedef struct xfs_btree_cur
+{
+       struct xfs_trans        *bc_tp; /* transaction we're in, if any */
+       struct xfs_mount        *bc_mp; /* file system mount struct */
+       const struct xfs_btree_ops *bc_ops;
+       uint                    bc_flags; /* btree features - below */
+       union {
+               xfs_alloc_rec_incore_t  a;
+               xfs_bmbt_irec_t         b;
+               xfs_inobt_rec_incore_t  i;
+       }               bc_rec;         /* current insert/search record value */
+       struct xfs_buf  *bc_bufs[XFS_BTREE_MAXLEVELS];  /* buf ptr per level */
+       int             bc_ptrs[XFS_BTREE_MAXLEVELS];   /* key/record # */
+       __uint8_t       bc_ra[XFS_BTREE_MAXLEVELS];     /* readahead bits */
+#define        XFS_BTCUR_LEFTRA        1       /* left sibling has been read-ahead */
+#define        XFS_BTCUR_RIGHTRA       2       /* right sibling has been read-ahead */
+       __uint8_t       bc_nlevels;     /* number of levels in the tree */
+       __uint8_t       bc_blocklog;    /* log2(blocksize) of btree blocks */
+       xfs_btnum_t     bc_btnum;       /* identifies which btree type */
+       union {
+               struct {                        /* needed for BNO, CNT, INO */
+                       struct xfs_buf  *agbp;  /* agf/agi buffer pointer */
+                       xfs_agnumber_t  agno;   /* ag number */
+               } a;
+               struct {                        /* needed for BMAP */
+                       struct xfs_inode *ip;   /* pointer to our inode */
+                       struct xfs_bmap_free *flist;    /* list to free after */
+                       xfs_fsblock_t   firstblock;     /* 1st blk allocated */
+                       int             allocated;      /* count of alloced */
+                       short           forksize;       /* fork's inode space */
+                       char            whichfork;      /* data or attr fork */
+                       char            flags;          /* flags */
+#define        XFS_BTCUR_BPRV_WASDEL   1                       /* was delayed */
+               } b;
+       }               bc_private;     /* per-btree type data */
+} xfs_btree_cur_t;
+
+/* cursor flags */
+#define XFS_BTREE_LONG_PTRS            (1<<0)  /* pointers are 64bits long */
+#define XFS_BTREE_ROOT_IN_INODE                (1<<1)  /* root may be variable size */
+#define XFS_BTREE_LASTREC_UPDATE       (1<<2)  /* track last rec externally */
+#define XFS_BTREE_CRC_BLOCKS           (1<<3)  /* uses extended btree blocks */
+
+
+#define        XFS_BTREE_NOERROR       0
+#define        XFS_BTREE_ERROR         1
+
+/*
+ * Convert from buffer to btree block header.
+ */
+#define        XFS_BUF_TO_BLOCK(bp)    ((struct xfs_btree_block *)((bp)->b_addr))
+
+
+/*
+ * Check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_btree_block  *block, /* generic btree block pointer */
+       int                     level,  /* level of the btree block */
+       struct xfs_buf          *bp);   /* buffer containing block, if any */
+
+/*
+ * Check that (long) pointer is ok.
+ */
+int                                    /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_fsblock_t           ptr,    /* btree block disk address */
+       int                     level); /* btree block level */
+
+/*
+ * Delete the btree cursor.
+ */
+void
+xfs_btree_del_cursor(
+       xfs_btree_cur_t         *cur,   /* btree cursor */
+       int                     error); /* del because of error */
+
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int                                    /* error */
+xfs_btree_dup_cursor(
+       xfs_btree_cur_t         *cur,   /* input cursor */
+       xfs_btree_cur_t         **ncur);/* output cursor */
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Long-form addressing.
+ */
+struct xfs_buf *                               /* buffer for fsbno */
+xfs_btree_get_bufl(
+       struct xfs_mount        *mp,    /* file system mount point */
+       struct xfs_trans        *tp,    /* transaction pointer */
+       xfs_fsblock_t           fsbno,  /* file system block number */
+       uint                    lock);  /* lock flags for get_buf */
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Short-form addressing.
+ */
+struct xfs_buf *                               /* buffer for agno/agbno */
+xfs_btree_get_bufs(
+       struct xfs_mount        *mp,    /* file system mount point */
+       struct xfs_trans        *tp,    /* transaction pointer */
+       xfs_agnumber_t          agno,   /* allocation group number */
+       xfs_agblock_t           agbno,  /* allocation group block number */
+       uint                    lock);  /* lock flags for get_buf */
+
+/*
+ * Check for the cursor referring to the last block at the given level.
+ */
+int                                    /* 1=is last block, 0=not last block */
+xfs_btree_islastblock(
+       xfs_btree_cur_t         *cur,   /* btree cursor */
+       int                     level); /* level to check */
+
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
+ */
+void
+xfs_btree_offsets(
+       __int64_t               fields, /* bitmask of fields */
+       const short             *offsets,/* table of field offsets */
+       int                     nbits,  /* number of bits to inspect */
+       int                     *first, /* output: first byte offset */
+       int                     *last); /* output: last byte offset */
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int                                    /* error */
+xfs_btree_read_bufl(
+       struct xfs_mount        *mp,    /* file system mount point */
+       struct xfs_trans        *tp,    /* transaction pointer */
+       xfs_fsblock_t           fsbno,  /* file system block number */
+       uint                    lock,   /* lock flags for read_buf */
+       struct xfs_buf          **bpp,  /* buffer for fsbno */
+       int                     refval, /* ref count value for buffer */
+       const struct xfs_buf_ops *ops);
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+void
+xfs_btree_reada_bufl(
+       struct xfs_mount        *mp,    /* file system mount point */
+       xfs_fsblock_t           fsbno,  /* file system block number */
+       xfs_extlen_t            count,  /* count of filesystem blocks */
+       const struct xfs_buf_ops *ops);
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+void
+xfs_btree_reada_bufs(
+       struct xfs_mount        *mp,    /* file system mount point */
+       xfs_agnumber_t          agno,   /* allocation group number */
+       xfs_agblock_t           agbno,  /* allocation group block number */
+       xfs_extlen_t            count,  /* count of filesystem blocks */
+       const struct xfs_buf_ops *ops);
+
+/*
+ * Initialise a new btree block header
+ */
+void
+xfs_btree_init_block(
+       struct xfs_mount *mp,
+       struct xfs_buf  *bp,
+       __u32           magic,
+       __u16           level,
+       __u16           numrecs,
+       __u64           owner,
+       unsigned int    flags);
+
+void
+xfs_btree_init_block_int(
+       struct xfs_mount        *mp,
+       struct xfs_btree_block  *buf,
+       xfs_daddr_t             blkno,
+       __u32                   magic,
+       __u16                   level,
+       __u16                   numrecs,
+       __u64                   owner,
+       unsigned int            flags);
+
+/*
+ * Common btree core entry points.
+ */
+int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
+int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
+int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
+int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
+int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
+int xfs_btree_insert(struct xfs_btree_cur *, int *);
+int xfs_btree_delete(struct xfs_btree_cur *, int *);
+int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
+int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner,
+                          struct list_head *buffer_list);
+
+/*
+ * btree block CRC helpers
+ */
+void xfs_btree_lblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
+void xfs_btree_sblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
+
+/*
+ * Internal btree helpers also used by xfs_bmap.c.
+ */
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
+void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
+
+/*
+ * Helpers.
+ */
+static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
+{
+       return be16_to_cpu(block->bb_numrecs);
+}
+
+static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
+               __uint16_t numrecs)
+{
+       block->bb_numrecs = cpu_to_be16(numrecs);
+}
+
+static inline int xfs_btree_get_level(struct xfs_btree_block *block)
+{
+       return be16_to_cpu(block->bb_level);
+}
+
+
+/*
+ * Min and max functions for extlen, agblock, fileoff, and filblks types.
+ */
+#define        XFS_EXTLEN_MIN(a,b)     min_t(xfs_extlen_t, (a), (b))
+#define        XFS_EXTLEN_MAX(a,b)     max_t(xfs_extlen_t, (a), (b))
+#define        XFS_AGBLOCK_MIN(a,b)    min_t(xfs_agblock_t, (a), (b))
+#define        XFS_AGBLOCK_MAX(a,b)    max_t(xfs_agblock_t, (a), (b))
+#define        XFS_FILEOFF_MIN(a,b)    min_t(xfs_fileoff_t, (a), (b))
+#define        XFS_FILEOFF_MAX(a,b)    max_t(xfs_fileoff_t, (a), (b))
+#define        XFS_FILBLKS_MIN(a,b)    min_t(xfs_filblks_t, (a), (b))
+#define        XFS_FILBLKS_MAX(a,b)    max_t(xfs_filblks_t, (a), (b))
+
+#define        XFS_FSB_SANITY_CHECK(mp,fsb)    \
+       (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
+               XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks)
+
+/*
+ * Trace hooks.  Currently not implemented as they need to be ported
+ * over to the generic tracing functionality, which is some effort.
+ *
+ * i,j = integer (32 bit)
+ * b = btree block buffer (xfs_buf_t)
+ * p = btree ptr
+ * r = btree record
+ * k = btree key
+ */
+#define        XFS_BTREE_TRACE_ARGBI(c, b, i)
+#define        XFS_BTREE_TRACE_ARGBII(c, b, i, j)
+#define        XFS_BTREE_TRACE_ARGI(c, i)
+#define        XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
+#define        XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
+#define        XFS_BTREE_TRACE_ARGIK(c, i, k)
+#define XFS_BTREE_TRACE_ARGR(c, r)
+#define        XFS_BTREE_TRACE_CURSOR(c, t)
+
+#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
new file mode 100644 (file)
index 0000000..fad1676
--- /dev/null
@@ -0,0 +1,63 @@
+#ifndef _XFS_CKSUM_H
+#define _XFS_CKSUM_H 1
+
+#define XFS_CRC_SEED   (~(__uint32_t)0)
+
+/*
+ * Calculate the intermediate checksum for a buffer that has the CRC field
+ * inside it.  The offset of the 32-bit CRC field is passed in as the
+ * cksum_offset parameter.
+ */
+static inline __uint32_t
+xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+       __uint32_t zero = 0;
+       __uint32_t crc;
+
+       /* Calculate CRC up to the checksum. */
+       crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
+
+       /* Skip checksum field */
+       crc = crc32c(crc, &zero, sizeof(__u32));
+
+       /* Calculate the rest of the CRC. */
+       return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
+                     length - (cksum_offset + sizeof(__be32)));
+}
+
+/*
+ * Convert the intermediate checksum to the final ondisk format.
+ *
+ * The CRC32c calculation uses LE format even on BE machines, but returns the
+ * result in host endian format. Hence we need to byte swap it back to LE format
+ * so that it is consistent on disk.
+ */
+static inline __le32
+xfs_end_cksum(__uint32_t crc)
+{
+       return ~cpu_to_le32(crc);
+}
+
+/*
+ * Helper to generate the checksum for a buffer.
+ */
+static inline void
+xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+       __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+
+       *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
+}
+
+/*
+ * Helper to verify the checksum for a buffer.
+ */
+static inline int
+xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+       __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+
+       return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
+}
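+
+/*
+ * A usage sketch (illustrative only, compiled out): a metadata block with
+ * an on-disk CRC is stamped on write and checked on read.  "struct
+ * example_hdr" and its crc field are hypothetical.
+ */
+#if 0
+struct example_hdr {
+       __be32          magic;
+       __le32          crc;
+       /* ... payload ... */
+};
+
+static inline void
+example_stamp_cksum(char *buf, size_t len)
+{
+       xfs_update_cksum(buf, len, offsetof(struct example_hdr, crc));
+}
+
+static inline int
+example_cksum_ok(char *buf, size_t len)
+{
+       return xfs_verify_cksum(buf, len, offsetof(struct example_hdr, crc));
+}
+#endif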
+
+#endif /* _XFS_CKSUM_H */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
new file mode 100644 (file)
index 0000000..2c42ae2
--- /dev/null
@@ -0,0 +1,2665 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+
+/*
+ * xfs_da_btree.c
+ *
+ * Routines to implement directories as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_da3_root_split(xfs_da_state_t *state,
+                                           xfs_da_state_blk_t *existing_root,
+                                           xfs_da_state_blk_t *new_child);
+STATIC int xfs_da3_node_split(xfs_da_state_t *state,
+                                           xfs_da_state_blk_t *existing_blk,
+                                           xfs_da_state_blk_t *split_blk,
+                                           xfs_da_state_blk_t *blk_to_add,
+                                           int treelevel,
+                                           int *result);
+STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
+                                        xfs_da_state_blk_t *node_blk_1,
+                                        xfs_da_state_blk_t *node_blk_2);
+STATIC void xfs_da3_node_add(xfs_da_state_t *state,
+                                  xfs_da_state_blk_t *old_node_blk,
+                                  xfs_da_state_blk_t *new_node_blk);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+STATIC int xfs_da3_root_join(xfs_da_state_t *state,
+                                          xfs_da_state_blk_t *root_blk);
+STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
+STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
+                                             xfs_da_state_blk_t *drop_blk);
+STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
+                                        xfs_da_state_blk_t *src_node_blk,
+                                        xfs_da_state_blk_t *dst_node_blk);
+
+/*
+ * Utility routines.
+ */
+STATIC int     xfs_da3_blk_unlink(xfs_da_state_t *state,
+                                 xfs_da_state_blk_t *drop_blk,
+                                 xfs_da_state_blk_t *save_blk);
+
+
+kmem_zone_t *xfs_da_state_zone;        /* anchor for state struct zone */
+
+/*
+ * Allocate a dir-state structure.
+ * We don't put them on the stack since they're large.
+ */
+xfs_da_state_t *
+xfs_da_state_alloc(void)
+{
+       return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
+}
+
+/*
+ * Kill the altpath contents of a da-state structure.
+ */
+STATIC void
+xfs_da_state_kill_altpath(xfs_da_state_t *state)
+{
+       int     i;
+
+       for (i = 0; i < state->altpath.active; i++)
+               state->altpath.blk[i].bp = NULL;
+       state->altpath.active = 0;
+}
+
+/*
+ * Free a da-state structure.
+ */
+void
+xfs_da_state_free(xfs_da_state_t *state)
+{
+       xfs_da_state_kill_altpath(state);
+#ifdef DEBUG
+       memset((char *)state, 0, sizeof(*state));
+#endif /* DEBUG */
+       kmem_zone_free(xfs_da_state_zone, state);
+}
+
+static bool
+xfs_da3_node_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_da_intnode   *hdr = bp->b_addr;
+       struct xfs_da3_icnode_hdr ichdr;
+       const struct xfs_dir_ops *ops;
+
+       ops = xfs_dir_get_ops(mp, NULL);
+
+       ops->node_hdr_from_disk(&ichdr, hdr);
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+               if (ichdr.magic != XFS_DA3_NODE_MAGIC)
+                       return false;
+
+               if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+                       return false;
+               if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+                       return false;
+       } else {
+               if (ichdr.magic != XFS_DA_NODE_MAGIC)
+                       return false;
+       }
+       if (ichdr.level == 0)
+               return false;
+       if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
+               return false;
+       if (ichdr.count == 0)
+               return false;
+
+       /*
+        * We don't know if the node is for an attribute or directory tree,
+        * so only fail if the count is outside both bounds.
+        */
+       if (ichdr.count > mp->m_dir_geo->node_ents &&
+           ichdr.count > mp->m_attr_geo->node_ents)
+               return false;
+
+       /* XXX: hash order check? */
+
+       return true;
+}
+
+static void
+xfs_da3_node_write_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+       struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+       if (!xfs_da3_node_verify(bp)) {
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               return;
+       }
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (bip)
+               hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+       xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
+}
+
+/*
+ * Leaf/node format detection on trees is sketchy, so a node read can be done
+ * on leaf-level blocks when detection incorrectly identifies the tree as a
+ * node format tree. In this case, we need to swap the verifier to match the
+ * correct format of the block being read.
+ */
+static void
+xfs_da3_node_read_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_da_blkinfo   *info = bp->b_addr;
+
+       switch (be16_to_cpu(info->magic)) {
+       case XFS_DA3_NODE_MAGIC:
+               if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
+                       xfs_buf_ioerror(bp, -EFSBADCRC);
+                       break;
+               }
+               /* fall through */
+       case XFS_DA_NODE_MAGIC:
+               if (!xfs_da3_node_verify(bp)) {
+                       xfs_buf_ioerror(bp, -EFSCORRUPTED);
+                       break;
+               }
+               return;
+       case XFS_ATTR_LEAF_MAGIC:
+       case XFS_ATTR3_LEAF_MAGIC:
+               bp->b_ops = &xfs_attr3_leaf_buf_ops;
+               bp->b_ops->verify_read(bp);
+               return;
+       case XFS_DIR2_LEAFN_MAGIC:
+       case XFS_DIR3_LEAFN_MAGIC:
+               bp->b_ops = &xfs_dir3_leafn_buf_ops;
+               bp->b_ops->verify_read(bp);
+               return;
+       default:
+               break;
+       }
+
+       /* corrupt block */
+       xfs_verifier_error(bp);
+}
+
+const struct xfs_buf_ops xfs_da3_node_buf_ops = {
+       .verify_read = xfs_da3_node_read_verify,
+       .verify_write = xfs_da3_node_write_verify,
+};
+
+int
+xfs_da3_node_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             bno,
+       xfs_daddr_t             mappedbno,
+       struct xfs_buf          **bpp,
+       int                     which_fork)
+{
+       int                     err;
+
+       err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+                                       which_fork, &xfs_da3_node_buf_ops);
+       if (!err && tp) {
+               struct xfs_da_blkinfo   *info = (*bpp)->b_addr;
+               int                     type;
+
+               switch (be16_to_cpu(info->magic)) {
+               case XFS_DA_NODE_MAGIC:
+               case XFS_DA3_NODE_MAGIC:
+                       type = XFS_BLFT_DA_NODE_BUF;
+                       break;
+               case XFS_ATTR_LEAF_MAGIC:
+               case XFS_ATTR3_LEAF_MAGIC:
+                       type = XFS_BLFT_ATTR_LEAF_BUF;
+                       break;
+               case XFS_DIR2_LEAFN_MAGIC:
+               case XFS_DIR3_LEAFN_MAGIC:
+                       type = XFS_BLFT_DIR_LEAFN_BUF;
+                       break;
+               default:
+                       type = 0;
+                       ASSERT(0);
+                       break;
+               }
+               xfs_trans_buf_set_type(tp, *bpp, type);
+       }
+       return err;
+}
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of an intermediate node.
+ */
+int
+xfs_da3_node_create(
+       struct xfs_da_args      *args,
+       xfs_dablk_t             blkno,
+       int                     level,
+       struct xfs_buf          **bpp,
+       int                     whichfork)
+{
+       struct xfs_da_intnode   *node;
+       struct xfs_trans        *tp = args->trans;
+       struct xfs_mount        *mp = tp->t_mountp;
+       struct xfs_da3_icnode_hdr ichdr = {0};
+       struct xfs_buf          *bp;
+       int                     error;
+       struct xfs_inode        *dp = args->dp;
+
+       trace_xfs_da_node_create(args);
+       ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
+
+       error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
+       if (error)
+               return error;
+       bp->b_ops = &xfs_da3_node_buf_ops;
+       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+       node = bp->b_addr;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+               ichdr.magic = XFS_DA3_NODE_MAGIC;
+               hdr3->info.blkno = cpu_to_be64(bp->b_bn);
+               hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
+               uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
+       } else {
+               ichdr.magic = XFS_DA_NODE_MAGIC;
+       }
+       ichdr.level = level;
+
+       dp->d_ops->node_hdr_to_disk(node, &ichdr);
+       xfs_trans_log_buf(tp, bp,
+               XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+
+       *bpp = bp;
+       return 0;
+}
+
+/*
+ * Split a leaf node, rebalance, then possibly split
+ * intermediate nodes, rebalance, etc.
+ */
+int                                                    /* error */
+xfs_da3_split(
+       struct xfs_da_state     *state)
+{
+       struct xfs_da_state_blk *oldblk;
+       struct xfs_da_state_blk *newblk;
+       struct xfs_da_state_blk *addblk;
+       struct xfs_da_intnode   *node;
+       struct xfs_buf          *bp;
+       int                     max;
+       int                     action = 0;
+       int                     error;
+       int                     i;
+
+       trace_xfs_da_split(state->args);
+
+       /*
+        * Walk back up the tree splitting/inserting/adjusting as necessary.
+        * If we need to insert and there isn't room, split the node, then
+        * decide which fragment to insert the new block from below into.
+        * Note that we may split the root this way, but we need more fixup.
+        */
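+       /*
+        * For example, if the leaf split leaves its parent full, the parent
+        * splits as well; if a new block is still left over at the top of
+        * the path, we fall through to xfs_da3_root_split() below.
+        */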
+       max = state->path.active - 1;
+       ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
+       ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
+              state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
+
+       addblk = &state->path.blk[max];         /* initial dummy value */
+       for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
+               oldblk = &state->path.blk[i];
+               newblk = &state->altpath.blk[i];
+
+               /*
+                * If this is a leaf node, allocate a new leaf and rebalance
+                * the entries across the pair.  If it is an intermediate
+                * node, the level below has already split; decide whether
+                * this node must split as well.
+                */
+               switch (oldblk->magic) {
+               case XFS_ATTR_LEAF_MAGIC:
+                       error = xfs_attr3_leaf_split(state, oldblk, newblk);
+                       if ((error != 0) && (error != -ENOSPC)) {
+                               return error;   /* GROT: attr is inconsistent */
+                       }
+                       if (!error) {
+                               addblk = newblk;
+                               break;
+                       }
+                       /*
+                        * Entry wouldn't fit, split the leaf again.
+                        */
+                       state->extravalid = 1;
+                       if (state->inleaf) {
+                               state->extraafter = 0;  /* before newblk */
+                               trace_xfs_attr_leaf_split_before(state->args);
+                               error = xfs_attr3_leaf_split(state, oldblk,
+                                                           &state->extrablk);
+                       } else {
+                               state->extraafter = 1;  /* after newblk */
+                               trace_xfs_attr_leaf_split_after(state->args);
+                               error = xfs_attr3_leaf_split(state, newblk,
+                                                           &state->extrablk);
+                       }
+                       if (error)
+                               return error;   /* GROT: attr inconsistent */
+                       addblk = newblk;
+                       break;
+               case XFS_DIR2_LEAFN_MAGIC:
+                       error = xfs_dir2_leafn_split(state, oldblk, newblk);
+                       if (error)
+                               return error;
+                       addblk = newblk;
+                       break;
+               case XFS_DA_NODE_MAGIC:
+                       error = xfs_da3_node_split(state, oldblk, newblk, addblk,
+                                                        max - i, &action);
+                       addblk->bp = NULL;
+                       if (error)
+                               return error;   /* GROT: dir is inconsistent */
+                       /*
+                        * Record the newly split block to be added to the
+                        * parent on the next pass up the tree.
+                        */
+                       if (action)
+                               addblk = newblk;
+                       else
+                               addblk = NULL;
+                       break;
+               }
+
+               /*
+                * Update the btree to show the new hashval for this child.
+                */
+               xfs_da3_fixhashpath(state, &state->path);
+       }
+       if (!addblk)
+               return 0;
+
+       /*
+        * Split the root node.
+        */
+       ASSERT(state->path.active == 0);
+       oldblk = &state->path.blk[0];
+       error = xfs_da3_root_split(state, oldblk, addblk);
+       if (error) {
+               addblk->bp = NULL;
+               return error;   /* GROT: dir is inconsistent */
+       }
+
+       /*
+        * Update pointers to the node which used to be block 0 and
+        * just got bumped because of the addition of a new root node.
+        * There might be three blocks involved if a double split occurred,
+        * and the original block 0 could be at any position in the list.
+        *
+        * Note: the magic numbers and sibling pointers are in the same
+        * physical place for both v2 and v3 headers (by design). Hence it
+        * doesn't matter which version of the xfs_da_intnode structure we use
+        * here as the result will be the same using either structure.
+        */
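+       /*
+        * For example, after a double split the sibling list may look like
+        *
+        *      [extrablk] <-> [old block 0, now at oldblk->blkno] <-> [addblk]
+        *
+        * and both neighbours' sibling pointers must be repointed at the
+        * relocated block.
+        */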
+       node = oldblk->bp->b_addr;
+       if (node->hdr.info.forw) {
+               if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
+                       bp = addblk->bp;
+               } else {
+                       ASSERT(state->extravalid);
+                       bp = state->extrablk.bp;
+               }
+               node = bp->b_addr;
+               node->hdr.info.back = cpu_to_be32(oldblk->blkno);
+               xfs_trans_log_buf(state->args->trans, bp,
+                   XFS_DA_LOGRANGE(node, &node->hdr.info,
+                   sizeof(node->hdr.info)));
+       }
+       node = oldblk->bp->b_addr;
+       if (node->hdr.info.back) {
+               if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
+                       bp = addblk->bp;
+               } else {
+                       ASSERT(state->extravalid);
+                       bp = state->extrablk.bp;
+               }
+               node = bp->b_addr;
+               node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
+               xfs_trans_log_buf(state->args->trans, bp,
+                   XFS_DA_LOGRANGE(node, &node->hdr.info,
+                   sizeof(node->hdr.info)));
+       }
+       addblk->bp = NULL;
+       return 0;
+}
+
+/*
+ * Split the root.  We have to create a new root and point to the two
+ * parts (the split old root) that we just created.  Copy block zero to
+ * the EOF, extending the inode in the process.
+ */
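+/*
+ * Schematically: the old root's contents move to a freshly allocated block
+ * (blk1), and a new one-level-higher root written at the fixed root
+ * location points at blk1 and its split sibling blk2.
+ */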
+STATIC int                                             /* error */
+xfs_da3_root_split(
+       struct xfs_da_state     *state,
+       struct xfs_da_state_blk *blk1,
+       struct xfs_da_state_blk *blk2)
+{
+       struct xfs_da_intnode   *node;
+       struct xfs_da_intnode   *oldroot;
+       struct xfs_da_node_entry *btree;
+       struct xfs_da3_icnode_hdr nodehdr;
+       struct xfs_da_args      *args;
+       struct xfs_buf          *bp;
+       struct xfs_inode        *dp;
+       struct xfs_trans        *tp;
+       struct xfs_mount        *mp;
+       struct xfs_dir2_leaf    *leaf;
+       xfs_dablk_t             blkno;
+       int                     level;
+       int                     error;
+       int                     size;
+
+       trace_xfs_da_root_split(state->args);
+
+       /*
+        * Copy the existing (incorrect) block from the root node position
+        * to a free space somewhere.
+        */
+       args = state->args;
+       error = xfs_da_grow_inode(args, &blkno);
+       if (error)
+               return error;
+
+       dp = args->dp;
+       tp = args->trans;
+       mp = state->mp;
+       error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
+       if (error)
+               return error;
+       node = bp->b_addr;
+       oldroot = blk1->bp->b_addr;
+       if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+           oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
+               struct xfs_da3_icnode_hdr nodehdr;
+
+               dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot);
+               btree = dp->d_ops->node_tree_p(oldroot);
+               size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot);
+               level = nodehdr.level;
+
+               /*
+                * we are about to copy oldroot to bp, so set up the type
+                * of bp while we know exactly what it will be.
+                */
+               xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+       } else {
+               struct xfs_dir3_icleaf_hdr leafhdr;
+               struct xfs_dir2_leaf_entry *ents;
+
+               leaf = (xfs_dir2_leaf_t *)oldroot;
+               dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+               ents = dp->d_ops->leaf_ents_p(leaf);
+
+               ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+                      leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+               size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
+               level = 0;
+
+               /*
+                * we are about to copy oldroot to bp, so set up the type
+                * of bp while we know exactly what it will be.
+                */
+               xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+       }
+
+       /*
+        * we can copy most of the information in the node from one block to
+        * another, but for CRC enabled headers we have to make sure that the
+        * block specific identifiers are kept intact. We update the buffer
+        * directly for this.
+        */
+       memcpy(node, oldroot, size);
+       if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+           oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+               struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
+
+               node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
+       }
+       xfs_trans_log_buf(tp, bp, 0, size - 1);
+
+       bp->b_ops = blk1->bp->b_ops;
+       xfs_trans_buf_copy_type(bp, blk1->bp);
+       blk1->bp = bp;
+       blk1->blkno = blkno;
+
+       /*
+        * Set up the new root node.
+        */
+       error = xfs_da3_node_create(args,
+               (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0,
+               level + 1, &bp, args->whichfork);
+       if (error)
+               return error;
+
+       node = bp->b_addr;
+       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+       btree = dp->d_ops->node_tree_p(node);
+       btree[0].hashval = cpu_to_be32(blk1->hashval);
+       btree[0].before = cpu_to_be32(blk1->blkno);
+       btree[1].hashval = cpu_to_be32(blk2->hashval);
+       btree[1].before = cpu_to_be32(blk2->blkno);
+       nodehdr.count = 2;
+       dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+
+#ifdef DEBUG
+       if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+           oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+               ASSERT(blk1->blkno >= args->geo->leafblk &&
+                      blk1->blkno < args->geo->freeblk);
+               ASSERT(blk2->blkno >= args->geo->leafblk &&
+                      blk2->blkno < args->geo->freeblk);
+       }
+#endif
+
+       /* Header is already logged by xfs_da_node_create */
+       xfs_trans_log_buf(tp, bp,
+               XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
+
+       return 0;
+}
+
+/*
+ * Split the node, rebalance, then add the new entry.
+ */
+STATIC int                                             /* error */
+xfs_da3_node_split(
+       struct xfs_da_state     *state,
+       struct xfs_da_state_blk *oldblk,
+       struct xfs_da_state_blk *newblk,
+       struct xfs_da_state_blk *addblk,
+       int                     treelevel,
+       int                     *result)
+{
+       struct xfs_da_intnode   *node;
+       struct xfs_da3_icnode_hdr nodehdr;
+       xfs_dablk_t             blkno;
+       int                     newcount;
+       int                     error;
+       int                     useextra;
+       struct xfs_inode        *dp = state->args->dp;
+
+       trace_xfs_da_node_split(state->args);
+
+       node = oldblk->bp->b_addr;
+       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+
+       /*
+        * With v2 dirs the extra block is data or freespace, not a leaf,
+        * so only attr fork splits consume the extra block here.
+        */
+       useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
+       newcount = 1 + useextra;
+       /*
+        * Do we have to split the node?
+        */
+       if (nodehdr.count + newcount > state->args->geo->node_ents) {
+               /*
+                * Allocate a new node, add to the doubly linked chain of
+                * nodes, then move some of our excess entries into it.
+                */
+               error = xfs_da_grow_inode(state->args, &blkno);
+               if (error)
+                       return error;   /* GROT: dir is inconsistent */
+
+               error = xfs_da3_node_create(state->args, blkno, treelevel,
+                                          &newblk->bp, state->args->whichfork);
+               if (error)
+                       return error;   /* GROT: dir is inconsistent */
+               newblk->blkno = blkno;
+               newblk->magic = XFS_DA_NODE_MAGIC;
+               xfs_da3_node_rebalance(state, oldblk, newblk);
+               error = xfs_da3_blk_link(state, oldblk, newblk);
+               if (error)
+                       return error;
+               *result = 1;
+       } else {
+               *result = 0;
+       }
+
+       /*
+        * Insert the new entry(s) into the correct block
+        * (updating last hashval in the process).
+        *
+        * xfs_da3_node_add() inserts BEFORE the given index,
+        * and as a result of using node_lookup_int() we always
+        * point to a valid entry (not after one), but a split
+        * operation always results in a new block whose hashvals
+        * FOLLOW the current block.
+        *
+        * If we had a double-split op below us, then add the extra block too.
+        */
+       node = oldblk->bp->b_addr;
+       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+       if (oldblk->index <= nodehdr.count) {
+               oldblk->index++;
+               xfs_da3_node_add(state, oldblk, addblk);
+               if (useextra) {
+                       if (state->extraafter)
+                               oldblk->index++;
+                       xfs_da3_node_add(state, oldblk, &state->extrablk);
+                       state->extravalid = 0;
+               }
+       } else {
+               newblk->index++;
+               xfs_da3_node_add(state, newblk, addblk);
+               if (useextra) {
+                       if (state->extraafter)
+                               newblk->index++;
+                       xfs_da3_node_add(state, newblk, &state->extrablk);
+                       state->extravalid = 0;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Balance the btree elements between two intermediate nodes,
+ * usually one full and one empty.
+ *
+ * NOTE: if blk2 is empty, then it will get the upper half of blk1.
+ */
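+/*
+ * For example, if blk1 holds 10 entries and blk2 is empty, count is
+ * (10 - 0) / 2 = 5, so the five highest-hashval entries of blk1 move to
+ * the front of blk2.
+ */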
+STATIC void
+xfs_da3_node_rebalance(
+       struct xfs_da_state     *state,
+       struct xfs_da_state_blk *blk1,
+       struct xfs_da_state_blk *blk2)
+{
+       struct xfs_da_intnode   *node1;
+       struct xfs_da_intnode   *node2;
+       struct xfs_da_intnode   *tmpnode;
+       struct xfs_da_node_entry *btree1;
+       struct xfs_da_node_entry *btree2;
+       struct xfs_da_node_entry *btree_s;
+       struct xfs_da_node_entry *btree_d;
+       struct xfs_da3_icnode_hdr nodehdr1;
+       struct xfs_da3_icnode_hdr nodehdr2;
+       struct xfs_trans        *tp;
+       int                     count;
+       int                     tmp;
+       int                     swap = 0;
+       struct xfs_inode        *dp = state->args->dp;
+
+       trace_xfs_da_node_rebalance(state->args);
+
+       node1 = blk1->bp->b_addr;
+       node2 = blk2->bp->b_addr;
+       dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+       dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+       btree1 = dp->d_ops->node_tree_p(node1);
+       btree2 = dp->d_ops->node_tree_p(node2);
+
+       /*
+        * Figure out how many entries need to move, and in which direction.
+        * Swap the nodes around if that makes it simpler.
+        */
+       if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
+           ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+            (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
+                       be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
+               tmpnode = node1;
+               node1 = node2;
+               node2 = tmpnode;
+               dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+               dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+               btree1 = dp->d_ops->node_tree_p(node1);
+               btree2 = dp->d_ops->node_tree_p(node2);
+               swap = 1;
+       }
+
+       count = (nodehdr1.count - nodehdr2.count) / 2;
+       if (count == 0)
+               return;
+       tp = state->args->trans;
+       /*
+        * Two cases: high-to-low and low-to-high.
+        */
+       if (count > 0) {
+               /*
+                * Move elements in node2 up to make a hole.
+                */
+               tmp = nodehdr2.count;
+               if (tmp > 0) {
+                       tmp *= (uint)sizeof(xfs_da_node_entry_t);
+                       btree_s = &btree2[0];
+                       btree_d = &btree2[count];
+                       memmove(btree_d, btree_s, tmp);
+               }
+
+               /*
+                * Move the required B-tree elements from high in node1 to
+                * low in node2.
+                */
+               nodehdr2.count += count;
+               tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+               btree_s = &btree1[nodehdr1.count - count];
+               btree_d = &btree2[0];
+               memcpy(btree_d, btree_s, tmp);
+               nodehdr1.count -= count;
+       } else {
+               /*
+                * Move the required B-tree elements from low in node2 to
+                * high in node1.
+                */
+               count = -count;
+               tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+               btree_s = &btree2[0];
+               btree_d = &btree1[nodehdr1.count];
+               memcpy(btree_d, btree_s, tmp);
+               nodehdr1.count += count;
+
+               xfs_trans_log_buf(tp, blk1->bp,
+                       XFS_DA_LOGRANGE(node1, btree_d, tmp));
+
+               /*
+                * Move elements in node2 down to fill the hole.
+                */
+               tmp  = nodehdr2.count - count;
+               tmp *= (uint)sizeof(xfs_da_node_entry_t);
+               btree_s = &btree2[count];
+               btree_d = &btree2[0];
+               memmove(btree_d, btree_s, tmp);
+               nodehdr2.count -= count;
+       }
+
+       /*
+        * Log header of node 1 and all current bits of node 2.
+        */
+       dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
+       xfs_trans_log_buf(tp, blk1->bp,
+               XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size));
+
+       dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
+       xfs_trans_log_buf(tp, blk2->bp,
+               XFS_DA_LOGRANGE(node2, &node2->hdr,
+                               dp->d_ops->node_hdr_size +
+                               (sizeof(btree2[0]) * nodehdr2.count)));
+
+       /*
+        * Record the last hashval from each block for upward propagation.
+        * (note: don't use the swapped node pointers)
+        */
+       if (swap) {
+               node1 = blk1->bp->b_addr;
+               node2 = blk2->bp->b_addr;
+               dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+               dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+               btree1 = dp->d_ops->node_tree_p(node1);
+               btree2 = dp->d_ops->node_tree_p(node2);
+       }
+       blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
+       blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
+
+       /*
+        * Adjust the expected index for insertion.
+        */
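+       /*
+        * For example, if blk1 was left with 5 entries and the insertion
+        * index was 7, the new entry belongs in blk2 at index 7 - 5 = 2,
+        * and blk1->index is pushed past its count so it cannot be used.
+        */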
+       if (blk1->index >= nodehdr1.count) {
+               blk2->index = blk1->index - nodehdr1.count;
+               blk1->index = nodehdr1.count + 1;       /* make it invalid */
+       }
+}
+
+/*
+ * Add a new entry to an intermediate node.
+ */
+STATIC void
+xfs_da3_node_add(
+       struct xfs_da_state     *state,
+       struct xfs_da_state_blk *oldblk,
+       struct xfs_da_state_blk *newblk)
+{
+       struct xfs_da_intnode   *node;
+       struct xfs_da3_icnode_hdr nodehdr;
+       struct xfs_da_node_entry *btree;
+       int                     tmp;
+       struct xfs_inode        *dp = state->args->dp;
+
+       trace_xfs_da_node_add(state->args);
+
+       node = oldblk->bp->b_addr;
+       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+       btree = dp->d_ops->node_tree_p(node);
+
+       ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
+       ASSERT(newblk->blkno != 0);
+       if (state->args->whichfork == XFS_DATA_FORK)
+               ASSERT(newblk->blkno >= state->args->geo->leafblk &&
+                      newblk->blkno < state->args->geo->freeblk);
+
+       /*
+        * We may need to make some room before we insert the new node.
+        */
+       tmp = 0;
+       if (oldblk->index < nodehdr.count) {
+               tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree);
+               memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp);
+       }
+       btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval);
+       btree[oldblk->index].before = cpu_to_be32(newblk->blkno);
+       xfs_trans_log_buf(state->args->trans, oldblk->bp,
+               XFS_DA_LOGRANGE(node, &btree[oldblk->index],
+                               tmp + sizeof(*btree)));
+
+       nodehdr.count += 1;
+       dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+       xfs_trans_log_buf(state->args->trans, oldblk->bp,
+               XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+
+       /*
+        * Copy the last hash value from the oldblk to propagate upwards.
+        */
+       oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Deallocate an empty leaf node, remove it from its parent,
+ * possibly deallocating that block, etc...
+ */
+int
+xfs_da3_join(
+       struct xfs_da_state     *state)
+{
+       struct xfs_da_state_blk *drop_blk;
+       struct xfs_da_state_blk *save_blk;
+       int                     action = 0;
+       int                     error;
+
+       trace_xfs_da_join(state->args);
+
+       drop_blk = &state->path.blk[state->path.active - 1];
+       save_blk = &state->altpath.blk[state->path.active - 1];
+       ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
+       ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
+              drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+
+       /*
+        * Walk back up the tree joining/deallocating as necessary.
+        * When we stop dropping blocks, break out.
+        */
+       for (  ; state->path.active >= 2; drop_blk--, save_blk--,
+                state->path.active--) {
+               /*
+                * See if we can combine the block with a neighbor.
+                *   (action == 0) => no options, just leave
+                *   (action == 1) => coalesce, then unlink
+                *   (action == 2) => block empty, unlink it
+                */
+               switch (drop_blk->magic) {
+               case XFS_ATTR_LEAF_MAGIC:
+                       error = xfs_attr3_leaf_toosmall(state, &action);
+                       if (error)
+                               return error;
+                       if (action == 0)
+                               return 0;
+                       xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
+                       break;
+               case XFS_DIR2_LEAFN_MAGIC:
+                       error = xfs_dir2_leafn_toosmall(state, &action);
+                       if (error)
+                               return error;
+                       if (action == 0)
+                               return 0;
+                       xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
+                       break;
+               case XFS_DA_NODE_MAGIC:
+                       /*
+                        * Remove the offending node, fixup hashvals,
+                        * check for a toosmall neighbor.
+                        */
+                       xfs_da3_node_remove(state, drop_blk);
+                       xfs_da3_fixhashpath(state, &state->path);
+                       error = xfs_da3_node_toosmall(state, &action);
+                       if (error)
+                               return error;
+                       if (action == 0)
+                               return 0;
+                       xfs_da3_node_unbalance(state, drop_blk, save_blk);
+                       break;
+               }
+               xfs_da3_fixhashpath(state, &state->altpath);
+               error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
+               xfs_da_state_kill_altpath(state);
+               if (error)
+                       return error;
+               error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
+                                                        drop_blk->bp);
+               drop_blk->bp = NULL;
+               if (error)
+                       return error;
+       }
+       /*
+        * We joined all the way to the top.  If it turns out that
+        * we only have one entry in the root, make the child block
+        * the new root.
+        */
+       xfs_da3_node_remove(state, drop_blk);
+       xfs_da3_fixhashpath(state, &state->path);
+       error = xfs_da3_root_join(state, &state->path.blk[0]);
+       return error;
+}
+
+#ifdef DEBUG
+static void
+xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
+{
+       __be16  magic = blkinfo->magic;
+
+       if (level == 1) {
+               ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+                      magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+                      magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+                      magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+       } else {
+               ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+                      magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+       }
+       ASSERT(!blkinfo->forw);
+       ASSERT(!blkinfo->back);
+}
+#else  /* !DEBUG */
+#define        xfs_da_blkinfo_onlychild_validate(blkinfo, level)
+#endif /* !DEBUG */
+
+/*
+ * We have only one entry in the root.  Copy the only remaining child of
+ * the old root to block 0 as the new root node.
+ */
+STATIC int
+xfs_da3_root_join(
+       struct xfs_da_state     *state,
+       struct xfs_da_state_blk *root_blk)
+{
+       struct xfs_da_intnode   *oldroot;
+       struct xfs_da_args      *args;
+       xfs_dablk_t             child;
+       struct xfs_buf          *bp;
+       struct xfs_da3_icnode_hdr oldroothdr;
+       struct xfs_da_node_entry *btree;
+       int                     error;
+       struct xfs_inode        *dp = state->args->dp;
+
+       trace_xfs_da_root_join(state->args);
+
+       ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
+
+       args = state->args;
+       oldroot = root_blk->bp->b_addr;
+       dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot);
+       ASSERT(oldroothdr.forw == 0);
+       ASSERT(oldroothdr.back == 0);
+
+       /*
+        * If the root has more than one child, then don't do anything.
+        */
+       if (oldroothdr.count > 1)
+               return 0;
+
+       /*
+        * Read in the (only) child block, then copy those bytes into
+        * the root block's buffer and free the original child block.
+        */
+       btree = dp->d_ops->node_tree_p(oldroot);
+       child = be32_to_cpu(btree[0].before);
+       ASSERT(child != 0);
+       error = xfs_da3_node_read(args->trans, dp, child, -1, &bp,
+                                            args->whichfork);
+       if (error)
+               return error;
+       xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
+
+       /*
+        * This could be copying a leaf back into the root block in the case of
+        * there only being a single leaf block left in the tree. Hence we have
+        * to update the b_ops pointer as well to match the buffer type change
+        * that could occur. For dir3 blocks we also need to update the block
+        * number in the buffer header.
+        */
+       memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
+       root_blk->bp->b_ops = bp->b_ops;
+       xfs_trans_buf_copy_type(root_blk->bp, bp);
+       if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
+               struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
+               da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
+       }
+       xfs_trans_log_buf(args->trans, root_blk->bp, 0,
+                         args->geo->blksize - 1);
+       error = xfs_da_shrink_inode(args, child, bp);
+       return error;
+}
+
+/*
+ * Check a node block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+STATIC int
+xfs_da3_node_toosmall(
+       struct xfs_da_state     *state,
+       int                     *action)
+{
+       struct xfs_da_intnode   *node;
+       struct xfs_da_state_blk *blk;
+       struct xfs_da_blkinfo   *info;
+       xfs_dablk_t             blkno;
+       struct xfs_buf          *bp;
+       struct xfs_da3_icnode_hdr nodehdr;
+       int                     count;
+       int                     forward;
+       int                     error;
+       int                     retval;
+       int                     i;
+       struct xfs_inode        *dp = state->args->dp;
+
+       trace_xfs_da_node_toosmall(state->args);
+
+       /*
+        * Check for the degenerate case of the block being over 50% full.
+        * If so, it's not worth even looking to see if we might be able
+        * to coalesce with a sibling.
+        */
+       blk = &state->path.blk[ state->path.active-1 ];
+       info = blk->bp->b_addr;
+       node = (xfs_da_intnode_t *)info;
+       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+       if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
+               *action = 0;    /* blk over 50%, don't try to join */
+               return 0;
+       }
+
+       /*
+        * Check for the degenerate case of the block being empty.
+        * If the block is empty, we'll simply delete it, no need to
+        * coalesce it with a sibling block.  We choose (arbitrarily)
+        * to merge with the forward block unless it is NULL.
+        */
+       if (nodehdr.count == 0) {
+               /*
+                * Make altpath point to the block we want to keep and
+                * path point to the block we want to drop (this one).
+                */
+               forward = (info->forw != 0);
+               memcpy(&state->altpath, &state->path, sizeof(state->path));
+               error = xfs_da3_path_shift(state, &state->altpath, forward,
+                                                0, &retval);
+               if (error)
+                       return error;
+               if (retval) {
+                       *action = 0;
+               } else {
+                       *action = 2;
+               }
+               return 0;
+       }
+
+       /*
+        * Examine each sibling block to see if we can coalesce with
+        * at least 25% free space to spare.  We need to figure out
+        * whether to merge with the forward or the backward block.
+        * We prefer coalescing with the lower numbered sibling so as
+        * to shrink a directory over time.
+        */
+       count  = state->args->geo->node_ents;
+       count -= state->args->geo->node_ents >> 2;
+       count -= nodehdr.count;
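+       /*
+        * For example, with node_ents = 64: count = 64 - 16 - nodehdr.count,
+        * so a sibling merges only if its entries plus ours total at most
+        * 48, i.e. the merged node keeps at least 25% free space.
+        */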
+
+       /* start with smaller blk num */
+       forward = nodehdr.forw < nodehdr.back;
+       for (i = 0; i < 2; forward = !forward, i++) {
+               struct xfs_da3_icnode_hdr thdr;
+               if (forward)
+                       blkno = nodehdr.forw;
+               else
+                       blkno = nodehdr.back;
+               if (blkno == 0)
+                       continue;
+               error = xfs_da3_node_read(state->args->trans, dp,
+                                       blkno, -1, &bp, state->args->whichfork);
+               if (error)
+                       return error;
+
+               node = bp->b_addr;
+               dp->d_ops->node_hdr_from_disk(&thdr, node);
+               xfs_trans_brelse(state->args->trans, bp);
+
+               if (count - thdr.count >= 0)
+                       break;  /* fits with at least 25% to spare */
+       }
+       if (i >= 2) {
+               *action = 0;
+               return 0;
+       }
+
+       /*
+        * Make altpath point to the block we want to keep (the lower
+        * numbered block) and path point to the block we want to drop.
+        */
+       memcpy(&state->altpath, &state->path, sizeof(state->path));
+       if (blkno < blk->blkno) {
+               error = xfs_da3_path_shift(state, &state->altpath, forward,
+                                                0, &retval);
+       } else {
+               error = xfs_da3_path_shift(state, &state->path, forward,
+                                                0, &retval);
+       }
+       if (error)
+               return error;
+       if (retval) {
+               *action = 0;
+               return 0;
+       }
+       *action = 1;
+       return 0;
+}
+
+/*
+ * Pick up the last hash value from an intermediate node.
+ */
+STATIC uint
+xfs_da3_node_lasthash(
+       struct xfs_inode        *dp,
+       struct xfs_buf          *bp,
+       int                     *count)
+{
+       struct xfs_da_intnode    *node;
+       struct xfs_da_node_entry *btree;
+       struct xfs_da3_icnode_hdr nodehdr;
+
+       node = bp->b_addr;
+       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+       if (count)
+               *count = nodehdr.count;
+       if (!nodehdr.count)
+               return 0;
+       btree = dp->d_ops->node_tree_p(node);
+       return be32_to_cpu(btree[nodehdr.count - 1].hashval);
+}
+
+/*
+ * Walk back up the tree adjusting hash values as necessary;
+ * when we stop making changes, return.
+ */
+void
+xfs_da3_fixhashpath(
+       struct xfs_da_state     *state,
+       struct xfs_da_state_path *path)
+{
+       struct xfs_da_state_blk *blk;
+       struct xfs_da_intnode   *node;
+       struct xfs_da_node_entry *btree;
+       xfs_dahash_t            lasthash = 0;
+       int                     level;
+       int                     count;
+       struct xfs_inode        *dp = state->args->dp;
+
+       trace_xfs_da_fixhashpath(state->args);
+
+       level = path->active - 1;
+       blk = &path->blk[level];
+       switch (blk->magic) {
+       case XFS_ATTR_LEAF_MAGIC:
+               lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
+               if (count == 0)
+                       return;
+               break;
+       case XFS_DIR2_LEAFN_MAGIC:
+               lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count);
+               if (count == 0)
+                       return;
+               break;
+       case XFS_DA_NODE_MAGIC:
+               lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count);
+               if (count == 0)
+                       return;
+               break;
+       }
+       for (blk--, level--; level >= 0; blk--, level--) {
+               struct xfs_da3_icnode_hdr nodehdr;
+
+               node = blk->bp->b_addr;
+               dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+               btree = dp->d_ops->node_tree_p(node);
+               if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
+                       break;
+               blk->hashval = lasthash;
+               btree[blk->index].hashval = cpu_to_be32(lasthash);
+               xfs_trans_log_buf(state->args->trans, blk->bp,
+                                 XFS_DA_LOGRANGE(node, &btree[blk->index],
+                                                 sizeof(*btree)));
+
+               lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+       }
+}
+
+/*
+ * Remove an entry from an intermediate node.
+ */
+STATIC void
+xfs_da3_node_remove(
+       struct xfs_da_state     *state,
+       struct xfs_da_state_blk *drop_blk)
+{
+       struct xfs_da_intnode   *node;
+       struct xfs_da3_icnode_hdr nodehdr;
+       struct xfs_da_node_entry *btree;
+       int                     index;
+       int                     tmp;
+       struct xfs_inode        *dp = state->args->dp;
+
+       trace_xfs_da_node_remove(state->args);
+
+       node = drop_blk->bp->b_addr;
+       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+       ASSERT(drop_blk->index < nodehdr.count);
+       ASSERT(drop_blk->index >= 0);
+
+       /*
+        * Copy over the offending entry, or just zero it out.
+        */
+       index = drop_blk->index;
+       btree = dp->d_ops->node_tree_p(node);
+       if (index < nodehdr.count - 1) {
+               tmp  = nodehdr.count - index - 1;
+               tmp *= (uint)sizeof(xfs_da_node_entry_t);
+               memmove(&btree[index], &btree[index + 1], tmp);
+               xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+                   XFS_DA_LOGRANGE(node, &btree[index], tmp));
+               index = nodehdr.count - 1;
+       }
+       memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
+       xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+           XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
+       nodehdr.count -= 1;
+       dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+       xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+           XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+
+       /*
+        * Copy the last hash value from the block to propagate upwards.
+        */
+       drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
+}
+
+/*
+ * Unbalance the elements between two intermediate nodes,
+ * moving all Btree elements from one node into the other.
+ */
+STATIC void
+xfs_da3_node_unbalance(
+       struct xfs_da_state     *state,
+       struct xfs_da_state_blk *drop_blk,
+       struct xfs_da_state_blk *save_blk)
+{
+       struct xfs_da_intnode   *drop_node;
+       struct xfs_da_intnode   *save_node;
+       struct xfs_da_node_entry *drop_btree;
+       struct xfs_da_node_entry *save_btree;
+       struct xfs_da3_icnode_hdr drop_hdr;
+       struct xfs_da3_icnode_hdr save_hdr;
+       struct xfs_trans        *tp;
+       int                     sindex;
+       int                     tmp;
+       struct xfs_inode        *dp = state->args->dp;
+
+       trace_xfs_da_node_unbalance(state->args);
+
+       drop_node = drop_blk->bp->b_addr;
+       save_node = save_blk->bp->b_addr;
+       dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node);
+       dp->d_ops->node_hdr_from_disk(&save_hdr, save_node);
+       drop_btree = dp->d_ops->node_tree_p(drop_node);
+       save_btree = dp->d_ops->node_tree_p(save_node);
+       tp = state->args->trans;
+
+       /*
+        * If the dying block has lower hashvals, then move all the
+        * elements in the remaining block up to make a hole.
+        */
+       if ((be32_to_cpu(drop_btree[0].hashval) <
+                       be32_to_cpu(save_btree[0].hashval)) ||
+           (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
+                       be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) {
+               /* XXX: check this - is memmove dst correct? */
+               tmp = save_hdr.count * sizeof(xfs_da_node_entry_t);
+               memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp);
+
+               sindex = 0;
+               xfs_trans_log_buf(tp, save_blk->bp,
+                       XFS_DA_LOGRANGE(save_node, &save_btree[0],
+                               (save_hdr.count + drop_hdr.count) *
+                                               sizeof(xfs_da_node_entry_t)));
+       } else {
+               sindex = save_hdr.count;
+               xfs_trans_log_buf(tp, save_blk->bp,
+                       XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
+                               drop_hdr.count * sizeof(xfs_da_node_entry_t)));
+       }
+
+       /*
+        * Move all the B-tree elements from drop_blk to save_blk.
+        */
+       tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
+       memcpy(&save_btree[sindex], &drop_btree[0], tmp);
+       save_hdr.count += drop_hdr.count;
+
+       dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
+       xfs_trans_log_buf(tp, save_blk->bp,
+               XFS_DA_LOGRANGE(save_node, &save_node->hdr,
+                               dp->d_ops->node_hdr_size));
+
+       /*
+        * Save the last hashval in the remaining block for upward propagation.
+        */
+       save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Walk down the Btree looking for a particular filename, filling
+ * in the state structure as we go.
+ *
+ * We will set the state structure to point to each of the elements
+ * in each of the nodes where either the hashval is or should be.
+ *
+ * We support duplicate hashvals, so for each entry in the current
+ * node that could contain the desired hashval, descend.  This is a
+ * pruned depth-first tree search.
+ */
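+/*
+ * For example, several names hashing to the same value may span two
+ * sibling leaves; the trailing loop below shifts to the next leaf and
+ * retries when a lookup misses but the leaf ends at the search hashval.
+ */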
+int                                                    /* error */
+xfs_da3_node_lookup_int(
+       struct xfs_da_state     *state,
+       int                     *result)
+{
+       struct xfs_da_state_blk *blk;
+       struct xfs_da_blkinfo   *curr;
+       struct xfs_da_intnode   *node;
+       struct xfs_da_node_entry *btree;
+       struct xfs_da3_icnode_hdr nodehdr;
+       struct xfs_da_args      *args;
+       xfs_dablk_t             blkno;
+       xfs_dahash_t            hashval;
+       xfs_dahash_t            btreehashval;
+       int                     probe;
+       int                     span;
+       int                     max;
+       int                     error;
+       int                     retval;
+       struct xfs_inode        *dp = state->args->dp;
+
+       args = state->args;
+
+       /*
+        * Descend through the B-tree searching each level for the right
+        * node to use, until the right hashval is found.
+        */
+       blkno = (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0;
+       for (blk = &state->path.blk[0], state->path.active = 1;
+                        state->path.active <= XFS_DA_NODE_MAXDEPTH;
+                        blk++, state->path.active++) {
+               /*
+                * Read the next node down in the tree.
+                */
+               blk->blkno = blkno;
+               error = xfs_da3_node_read(args->trans, args->dp, blkno,
+                                       -1, &blk->bp, args->whichfork);
+               if (error) {
+                       blk->blkno = 0;
+                       state->path.active--;
+                       return error;
+               }
+               curr = blk->bp->b_addr;
+               blk->magic = be16_to_cpu(curr->magic);
+
+               if (blk->magic == XFS_ATTR_LEAF_MAGIC ||
+                   blk->magic == XFS_ATTR3_LEAF_MAGIC) {
+                       blk->magic = XFS_ATTR_LEAF_MAGIC;
+                       blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+                       break;
+               }
+
+               if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+                   blk->magic == XFS_DIR3_LEAFN_MAGIC) {
+                       blk->magic = XFS_DIR2_LEAFN_MAGIC;
+                       blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+                                                              blk->bp, NULL);
+                       break;
+               }
+
+               blk->magic = XFS_DA_NODE_MAGIC;
+
+               /*
+                * Search an intermediate node for a match.
+                */
+               node = blk->bp->b_addr;
+               dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+               btree = dp->d_ops->node_tree_p(node);
+
+               max = nodehdr.count;
+               blk->hashval = be32_to_cpu(btree[max - 1].hashval);
+
+               /*
+                * Binary search.  (note: small blocks will skip loop)
+                */
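+               /*
+                * For example, with max = 16 the search starts at probe = 8,
+                * takes one +/-4 step as span halves to 4, then exits and
+                * lets the linear scans below finish the positioning.
+                */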
+               probe = span = max / 2;
+               hashval = args->hashval;
+               while (span > 4) {
+                       span /= 2;
+                       btreehashval = be32_to_cpu(btree[probe].hashval);
+                       if (btreehashval < hashval)
+                               probe += span;
+                       else if (btreehashval > hashval)
+                               probe -= span;
+                       else
+                               break;
+               }
+               ASSERT((probe >= 0) && (probe < max));
+               ASSERT((span <= 4) ||
+                       (be32_to_cpu(btree[probe].hashval) == hashval));
+
+               /*
+                * Since we may have duplicate hashvals, find the first
+                * matching hashval in the node.
+                */
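+               /*
+                * E.g. for hashvals ... 40 40 40 50 ... with probe on the
+                * middle 40, step back to the first 40; the forward scan
+                * then skips entries still below the search value.
+                */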
+               while (probe > 0 &&
+                      be32_to_cpu(btree[probe].hashval) >= hashval) {
+                       probe--;
+               }
+               while (probe < max &&
+                      be32_to_cpu(btree[probe].hashval) < hashval) {
+                       probe++;
+               }
+
+               /*
+                * Pick the right block to descend on.
+                */
+               if (probe == max) {
+                       blk->index = max - 1;
+                       blkno = be32_to_cpu(btree[max - 1].before);
+               } else {
+                       blk->index = probe;
+                       blkno = be32_to_cpu(btree[probe].before);
+               }
+       }
+
+       /*
+        * A leaf block that ends in the hashval that we are interested in
+        * (final hashval == search hashval) means that the next block may
+        * contain more entries with the same hashval, shift upward to the
+        * next leaf and keep searching.
+        */
+       for (;;) {
+               if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
+                       retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
+                                                       &blk->index, state);
+               } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+                       retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
+                       blk->index = args->index;
+                       args->blkno = blk->blkno;
+               } else {
+                       ASSERT(0);
+                       return -EFSCORRUPTED;
+               }
+               if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
+                   (blk->hashval == args->hashval)) {
+                       error = xfs_da3_path_shift(state, &state->path, 1, 1,
+                                                        &retval);
+                       if (error)
+                               return error;
+                       if (retval == 0) {
+                               continue;
+                       } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+                               /* path_shift() gives ENOENT */
+                               retval = -ENOATTR;
+                       }
+               }
+               break;
+       }
+       *result = retval;
+       return 0;
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Compare two intermediate nodes for "order".
+ */
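+/*
+ * Returns 1 when node2's hashvals sort below node1's, i.e. node2 belongs
+ * before node1 in sibling order; returns 0 otherwise.
+ */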
+STATIC int
+xfs_da3_node_order(
+       struct xfs_inode *dp,
+       struct xfs_buf  *node1_bp,
+       struct xfs_buf  *node2_bp)
+{
+       struct xfs_da_intnode   *node1;
+       struct xfs_da_intnode   *node2;
+       struct xfs_da_node_entry *btree1;
+       struct xfs_da_node_entry *btree2;
+       struct xfs_da3_icnode_hdr node1hdr;
+       struct xfs_da3_icnode_hdr node2hdr;
+
+       node1 = node1_bp->b_addr;
+       node2 = node2_bp->b_addr;
+       dp->d_ops->node_hdr_from_disk(&node1hdr, node1);
+       dp->d_ops->node_hdr_from_disk(&node2hdr, node2);
+       btree1 = dp->d_ops->node_tree_p(node1);
+       btree2 = dp->d_ops->node_tree_p(node2);
+
+       if (node1hdr.count > 0 && node2hdr.count > 0 &&
+           ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+            (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
+             be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) {
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * Link a new block into a doubly linked list of blocks (of whatever type).
+ */
+int                                                    /* error */
+xfs_da3_blk_link(
+       struct xfs_da_state     *state,
+       struct xfs_da_state_blk *old_blk,
+       struct xfs_da_state_blk *new_blk)
+{
+       struct xfs_da_blkinfo   *old_info;
+       struct xfs_da_blkinfo   *new_info;
+       struct xfs_da_blkinfo   *tmp_info;
+       struct xfs_da_args      *args;
+       struct xfs_buf          *bp;
+       int                     before = 0;
+       int                     error;
+       struct xfs_inode        *dp = state->args->dp;
+
+       /*
+        * Set up environment.
+        */
+       args = state->args;
+       ASSERT(args != NULL);
+       old_info = old_blk->bp->b_addr;
+       new_info = new_blk->bp->b_addr;
+       ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
+              old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+              old_blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+       switch (old_blk->magic) {
+       case XFS_ATTR_LEAF_MAGIC:
+               before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
+               break;
+       case XFS_DIR2_LEAFN_MAGIC:
+               before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp);
+               break;
+       case XFS_DA_NODE_MAGIC:
+               before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp);
+               break;
+       }
+
+       /*
+        * Link blocks in appropriate order.
+        */
+       if (before) {
+               /*
+                * Link new block in before existing block.
+                */
+               trace_xfs_da_link_before(args);
+               new_info->forw = cpu_to_be32(old_blk->blkno);
+               new_info->back = old_info->back;
+               if (old_info->back) {
+                       error = xfs_da3_node_read(args->trans, dp,
+                                               be32_to_cpu(old_info->back),
+                                               -1, &bp, args->whichfork);
+                       if (error)
+                               return error;
+                       ASSERT(bp != NULL);
+                       tmp_info = bp->b_addr;
+                       ASSERT(tmp_info->magic == old_info->magic);
+                       ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
+                       tmp_info->forw = cpu_to_be32(new_blk->blkno);
+                       xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+               }
+               old_info->back = cpu_to_be32(new_blk->blkno);
+       } else {
+               /*
+                * Link new block in after existing block.
+                */
+               trace_xfs_da_link_after(args);
+               new_info->forw = old_info->forw;
+               new_info->back = cpu_to_be32(old_blk->blkno);
+               if (old_info->forw) {
+                       error = xfs_da3_node_read(args->trans, dp,
+                                               be32_to_cpu(old_info->forw),
+                                               -1, &bp, args->whichfork);
+                       if (error)
+                               return error;
+                       ASSERT(bp != NULL);
+                       tmp_info = bp->b_addr;
+                       ASSERT(tmp_info->magic == old_info->magic);
+                       ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
+                       tmp_info->back = cpu_to_be32(new_blk->blkno);
+                       xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+               }
+               old_info->forw = cpu_to_be32(new_blk->blkno);
+       }
+
+       xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
+       xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
+       return 0;
+}
+
+/*
+ * Unlink a block from a doubly linked list of blocks.
+ */
+STATIC int                                             /* error */
+xfs_da3_blk_unlink(
+       struct xfs_da_state     *state,
+       struct xfs_da_state_blk *drop_blk,
+       struct xfs_da_state_blk *save_blk)
+{
+       struct xfs_da_blkinfo   *drop_info;
+       struct xfs_da_blkinfo   *save_info;
+       struct xfs_da_blkinfo   *tmp_info;
+       struct xfs_da_args      *args;
+       struct xfs_buf          *bp;
+       int                     error;
+
+       /*
+        * Set up environment.
+        */
+       args = state->args;
+       ASSERT(args != NULL);
+       save_info = save_blk->bp->b_addr;
+       drop_info = drop_blk->bp->b_addr;
+       ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
+              save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+              save_blk->magic == XFS_ATTR_LEAF_MAGIC);
+       ASSERT(save_blk->magic == drop_blk->magic);
+       ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
+              (be32_to_cpu(save_info->back) == drop_blk->blkno));
+       ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) ||
+              (be32_to_cpu(drop_info->back) == save_blk->blkno));
+
+       /*
+        * Unlink the leaf block from the doubly linked chain of leaves.
+        */
+       if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
+               trace_xfs_da_unlink_back(args);
+               save_info->back = drop_info->back;
+               if (drop_info->back) {
+                       error = xfs_da3_node_read(args->trans, args->dp,
+                                               be32_to_cpu(drop_info->back),
+                                               -1, &bp, args->whichfork);
+                       if (error)
+                               return error;
+                       ASSERT(bp != NULL);
+                       tmp_info = bp->b_addr;
+                       ASSERT(tmp_info->magic == save_info->magic);
+                       ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
+                       tmp_info->forw = cpu_to_be32(save_blk->blkno);
+                       xfs_trans_log_buf(args->trans, bp, 0,
+                                                   sizeof(*tmp_info) - 1);
+               }
+       } else {
+               trace_xfs_da_unlink_forward(args);
+               save_info->forw = drop_info->forw;
+               if (drop_info->forw) {
+                       error = xfs_da3_node_read(args->trans, args->dp,
+                                               be32_to_cpu(drop_info->forw),
+                                               -1, &bp, args->whichfork);
+                       if (error)
+                               return error;
+                       ASSERT(bp != NULL);
+                       tmp_info = bp->b_addr;
+                       ASSERT(tmp_info->magic == save_info->magic);
+                       ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
+                       tmp_info->back = cpu_to_be32(save_blk->blkno);
+                       xfs_trans_log_buf(args->trans, bp, 0,
+                                                   sizeof(*tmp_info) - 1);
+               }
+       }
+
+       xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
+       return 0;
+}
+
+/*
+ * Move a path "forward" or "!forward" one block at the current level.
+ *
+ * This routine will adjust a "path" to point to the next block
+ * "forward" (higher hash values) or "!forward" (lower hash values) in
+ * the Btree, including updating pointers to the intermediate nodes
+ * between the new bottom and the root.
+ */
+int                                                    /* error */
+xfs_da3_path_shift(
+       struct xfs_da_state     *state,
+       struct xfs_da_state_path *path,
+       int                     forward,
+       int                     release,
+       int                     *result)
+{
+       struct xfs_da_state_blk *blk;
+       struct xfs_da_blkinfo   *info;
+       struct xfs_da_intnode   *node;
+       struct xfs_da_args      *args;
+       struct xfs_da_node_entry *btree;
+       struct xfs_da3_icnode_hdr nodehdr;
+       xfs_dablk_t             blkno = 0;
+       int                     level;
+       int                     error;
+       struct xfs_inode        *dp = state->args->dp;
+
+       trace_xfs_da_path_shift(state->args);
+
+       /*
+        * Roll up the Btree looking for the first block where our
+        * current index is not at the edge of the block.  Note that
+        * we skip the bottom layer because we want the sibling block.
+        */
+       args = state->args;
+       ASSERT(args != NULL);
+       ASSERT(path != NULL);
+       ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+       level = (path->active-1) - 1;   /* skip bottom layer in path */
+       for (blk = &path->blk[level]; level >= 0; blk--, level--) {
+               node = blk->bp->b_addr;
+               dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+               btree = dp->d_ops->node_tree_p(node);
+
+               if (forward && (blk->index < nodehdr.count - 1)) {
+                       blk->index++;
+                       blkno = be32_to_cpu(btree[blk->index].before);
+                       break;
+               } else if (!forward && (blk->index > 0)) {
+                       blk->index--;
+                       blkno = be32_to_cpu(btree[blk->index].before);
+                       break;
+               }
+       }
+       if (level < 0) {
+               *result = -ENOENT;      /* we're out of our tree */
+               ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+               return 0;
+       }
+
+       /*
+        * Roll down the edge of the subtree until we reach the
+        * same depth we were at originally.
+        */
+       for (blk++, level++; level < path->active; blk++, level++) {
+               /*
+                * Release the old block.
+                * (if it's dirty, trans won't actually let go)
+                */
+               if (release)
+                       xfs_trans_brelse(args->trans, blk->bp);
+
+               /*
+                * Read the next child block.
+                */
+               blk->blkno = blkno;
+               error = xfs_da3_node_read(args->trans, dp, blkno, -1,
+                                       &blk->bp, args->whichfork);
+               if (error)
+                       return error;
+               info = blk->bp->b_addr;
+               ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+                      info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+                      info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+                      info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+                      info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+                      info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+               /*
+                * Note: we flatten the magic number to a single type so we
+                * don't have to compare against crc/non-crc types elsewhere.
+                */
+               switch (be16_to_cpu(info->magic)) {
+               case XFS_DA_NODE_MAGIC:
+               case XFS_DA3_NODE_MAGIC:
+                       blk->magic = XFS_DA_NODE_MAGIC;
+                       node = (xfs_da_intnode_t *)info;
+                       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+                       btree = dp->d_ops->node_tree_p(node);
+                       blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+                       if (forward)
+                               blk->index = 0;
+                       else
+                               blk->index = nodehdr.count - 1;
+                       blkno = be32_to_cpu(btree[blk->index].before);
+                       break;
+               case XFS_ATTR_LEAF_MAGIC:
+               case XFS_ATTR3_LEAF_MAGIC:
+                       blk->magic = XFS_ATTR_LEAF_MAGIC;
+                       ASSERT(level == path->active-1);
+                       blk->index = 0;
+                       blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+                       break;
+               case XFS_DIR2_LEAFN_MAGIC:
+               case XFS_DIR3_LEAFN_MAGIC:
+                       blk->magic = XFS_DIR2_LEAFN_MAGIC;
+                       ASSERT(level == path->active-1);
+                       blk->index = 0;
+                       blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+                                                              blk->bp, NULL);
+                       break;
+               default:
+                       ASSERT(0);
+                       break;
+               }
+       }
+       *result = 0;
+       return 0;
+}
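+
+/*
+ * Usage sketch (illustrative only, not introduced by this change):
+ * iterators walk sibling leaves by repeatedly shifting the path one
+ * block forward until *result becomes non-zero, along the lines of
+ *
+ *      error = xfs_da3_path_shift(state, &state->path, 1, 1, &rval);
+ *      if (error)
+ *              return error;
+ *      if (rval)
+ *              break;  (we ran off the edge of the tree)
+ */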
+
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Implement a simple hash on a character string.
+ * Rotate the hash value by 7 bits, then XOR each character in.
+ * This is implemented with some source-level loop unrolling.
+ */
+xfs_dahash_t
+xfs_da_hashname(const __uint8_t *name, int namelen)
+{
+       xfs_dahash_t hash;
+
+       /*
+        * Do four characters at a time as long as we can.
+        */
+       for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
+               hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
+                      (name[3] << 0) ^ rol32(hash, 7 * 4);
+
+       /*
+        * Now do the rest of the characters.
+        */
+       switch (namelen) {
+       case 3:
+               return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
+                      rol32(hash, 7 * 3);
+       case 2:
+               return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
+       case 1:
+               return (name[0] << 0) ^ rol32(hash, 7 * 1);
+       default: /* case 0: */
+               return hash;
+       }
+}
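+
+/*
+ * For reference, the unrolled loop above is equivalent to this simple
+ * (but slower) form, which shows the rotate-and-XOR structure directly.
+ * Illustrative sketch only, not part of the on-disk format definition:
+ *
+ *      xfs_dahash_t hash = 0;
+ *
+ *      while (namelen--)
+ *              hash = *name++ ^ rol32(hash, 7);
+ *      return hash;
+ */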
+
+enum xfs_dacmp
+xfs_da_compname(
+       struct xfs_da_args *args,
+       const unsigned char *name,
+       int             len)
+{
+       return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
+                                       XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
+}
+
+static xfs_dahash_t
+xfs_default_hashname(
+       struct xfs_name *name)
+{
+       return xfs_da_hashname(name->name, name->len);
+}
+
+const struct xfs_nameops xfs_default_nameops = {
+       .hashname       = xfs_default_hashname,
+       .compname       = xfs_da_compname
+};
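+
+/*
+ * Usage sketch (hedged - the mount wiring is outside this hunk):
+ * callers reach these ops through the mount, hashing a name once and
+ * then comparing each candidate entry, along the lines of
+ *
+ *      args->hashval = mp->m_dirnameops->hashname(&name);
+ *      cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ */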
+
+int
+xfs_da_grow_inode_int(
+       struct xfs_da_args      *args,
+       xfs_fileoff_t           *bno,
+       int                     count)
+{
+       struct xfs_trans        *tp = args->trans;
+       struct xfs_inode        *dp = args->dp;
+       int                     w = args->whichfork;
+       xfs_rfsblock_t          nblks = dp->i_d.di_nblocks;
+       struct xfs_bmbt_irec    map, *mapp;
+       int                     nmap, error, got, i, mapi;
+
+       /*
+        * Find a spot in the file space to put the new block.
+        */
+       error = xfs_bmap_first_unused(tp, dp, count, bno, w);
+       if (error)
+               return error;
+
+       /*
+        * Try mapping it in one filesystem block.
+        */
+       nmap = 1;
+       ASSERT(args->firstblock != NULL);
+       error = xfs_bmapi_write(tp, dp, *bno, count,
+                       xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
+                       args->firstblock, args->total, &map, &nmap,
+                       args->flist);
+       if (error)
+               return error;
+
+       ASSERT(nmap <= 1);
+       if (nmap == 1) {
+               mapp = &map;
+               mapi = 1;
+       } else if (nmap == 0 && count > 1) {
+               xfs_fileoff_t           b;
+               int                     c;
+
+               /*
+                * If we didn't get it and the block might work if fragmented,
+                * try without the CONTIG flag.  Loop until we get it all.
+                */
+               mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+               for (b = *bno, mapi = 0; b < *bno + count; ) {
+                       nmap = MIN(XFS_BMAP_MAX_NMAP, count);
+                       c = (int)(*bno + count - b);
+                       error = xfs_bmapi_write(tp, dp, b, c,
+                                       xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+                                       args->firstblock, args->total,
+                                       &mapp[mapi], &nmap, args->flist);
+                       if (error)
+                               goto out_free_map;
+                       if (nmap < 1)
+                               break;
+                       mapi += nmap;
+                       b = mapp[mapi - 1].br_startoff +
+                           mapp[mapi - 1].br_blockcount;
+               }
+       } else {
+               mapi = 0;
+               mapp = NULL;
+       }
+
+       /*
+        * Count the blocks we got, make sure it matches the total.
+        */
+       for (i = 0, got = 0; i < mapi; i++)
+               got += mapp[i].br_blockcount;
+       if (got != count || mapp[0].br_startoff != *bno ||
+           mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
+           *bno + count) {
+               error = -ENOSPC;
+               goto out_free_map;
+       }
+
+       /* account for newly allocated blocks in reserved blocks total */
+       args->total -= dp->i_d.di_nblocks - nblks;
+
+out_free_map:
+       if (mapp != &map)
+               kmem_free(mapp);
+       return error;
+}
+
+/*
+ * Add a block to the btree ahead of the file.
+ * Return the new block number to the caller.
+ */
+int
+xfs_da_grow_inode(
+       struct xfs_da_args      *args,
+       xfs_dablk_t             *new_blkno)
+{
+       xfs_fileoff_t           bno;
+       int                     error;
+
+       trace_xfs_da_grow_inode(args);
+
+       bno = args->geo->leafblk;
+       error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount);
+       if (!error)
+               *new_blkno = (xfs_dablk_t)bno;
+       return error;
+}
+
+/*
+ * Ick.  We need to always be able to remove a btree block, even
+ * if there's no space reservation because the filesystem is full.
+ * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
+ * It swaps the target block with the last block in the file.  The
+ * last block in the file can always be removed, since removing it
+ * cannot cause a bmap btree split.
+ */
+STATIC int
+xfs_da3_swap_lastblock(
+       struct xfs_da_args      *args,
+       xfs_dablk_t             *dead_blknop,
+       struct xfs_buf          **dead_bufp)
+{
+       struct xfs_da_blkinfo   *dead_info;
+       struct xfs_da_blkinfo   *sib_info;
+       struct xfs_da_intnode   *par_node;
+       struct xfs_da_intnode   *dead_node;
+       struct xfs_dir2_leaf    *dead_leaf2;
+       struct xfs_da_node_entry *btree;
+       struct xfs_da3_icnode_hdr par_hdr;
+       struct xfs_inode        *dp;
+       struct xfs_trans        *tp;
+       struct xfs_mount        *mp;
+       struct xfs_buf          *dead_buf;
+       struct xfs_buf          *last_buf;
+       struct xfs_buf          *sib_buf;
+       struct xfs_buf          *par_buf;
+       xfs_dahash_t            dead_hash;
+       xfs_fileoff_t           lastoff;
+       xfs_dablk_t             dead_blkno;
+       xfs_dablk_t             last_blkno;
+       xfs_dablk_t             sib_blkno;
+       xfs_dablk_t             par_blkno;
+       int                     error;
+       int                     w;
+       int                     entno;
+       int                     level;
+       int                     dead_level;
+
+       trace_xfs_da_swap_lastblock(args);
+
+       dead_buf = *dead_bufp;
+       dead_blkno = *dead_blknop;
+       tp = args->trans;
+       dp = args->dp;
+       w = args->whichfork;
+       ASSERT(w == XFS_DATA_FORK);
+       mp = dp->i_mount;
+       lastoff = args->geo->freeblk;
+       error = xfs_bmap_last_before(tp, dp, &lastoff, w);
+       if (error)
+               return error;
+       if (unlikely(lastoff == 0)) {
+               XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
+                                mp);
+               return -EFSCORRUPTED;
+       }
+       /*
+        * Read the last block in the btree space.
+        */
+       last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
+       error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
+       if (error)
+               return error;
+       /*
+        * Copy the last block into the dead buffer and log it.
+        */
+       memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
+       xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
+       dead_info = dead_buf->b_addr;
+       /*
+        * Get values from the moved block.
+        */
+       if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+           dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+               struct xfs_dir3_icleaf_hdr leafhdr;
+               struct xfs_dir2_leaf_entry *ents;
+
+               dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
+               dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
+               ents = dp->d_ops->leaf_ents_p(dead_leaf2);
+               dead_level = 0;
+               dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
+       } else {
+               struct xfs_da3_icnode_hdr deadhdr;
+
+               dead_node = (xfs_da_intnode_t *)dead_info;
+               dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
+               btree = dp->d_ops->node_tree_p(dead_node);
+               dead_level = deadhdr.level;
+               dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
+       }
+       sib_buf = par_buf = NULL;
+       /*
+        * If the moved block has a left sibling, fix up the pointers.
+        */
+       if ((sib_blkno = be32_to_cpu(dead_info->back))) {
+               error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+               if (error)
+                       goto done;
+               sib_info = sib_buf->b_addr;
+               if (unlikely(
+                   be32_to_cpu(sib_info->forw) != last_blkno ||
+                   sib_info->magic != dead_info->magic)) {
+                       XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
+                                        XFS_ERRLEVEL_LOW, mp);
+                       error = -EFSCORRUPTED;
+                       goto done;
+               }
+               sib_info->forw = cpu_to_be32(dead_blkno);
+               xfs_trans_log_buf(tp, sib_buf,
+                       XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
+                                       sizeof(sib_info->forw)));
+               sib_buf = NULL;
+       }
+       /*
+        * If the moved block has a right sibling, fix up the pointers.
+        */
+       if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
+               error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+               if (error)
+                       goto done;
+               sib_info = sib_buf->b_addr;
+               if (unlikely(
+                      be32_to_cpu(sib_info->back) != last_blkno ||
+                      sib_info->magic != dead_info->magic)) {
+                       XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
+                                        XFS_ERRLEVEL_LOW, mp);
+                       error = -EFSCORRUPTED;
+                       goto done;
+               }
+               sib_info->back = cpu_to_be32(dead_blkno);
+               xfs_trans_log_buf(tp, sib_buf,
+                       XFS_DA_LOGRANGE(sib_info, &sib_info->back,
+                                       sizeof(sib_info->back)));
+               sib_buf = NULL;
+       }
+       par_blkno = args->geo->leafblk;
+       level = -1;
+       /*
+        * Walk down the tree looking for the parent of the moved block.
+        */
+       for (;;) {
+               error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+               if (error)
+                       goto done;
+               par_node = par_buf->b_addr;
+               dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
+               if (level >= 0 && level != par_hdr.level + 1) {
+                       XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
+                                        XFS_ERRLEVEL_LOW, mp);
+                       error = -EFSCORRUPTED;
+                       goto done;
+               }
+               level = par_hdr.level;
+               btree = dp->d_ops->node_tree_p(par_node);
+               for (entno = 0;
+                    entno < par_hdr.count &&
+                    be32_to_cpu(btree[entno].hashval) < dead_hash;
+                    entno++)
+                       continue;
+               if (entno == par_hdr.count) {
+                       XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
+                                        XFS_ERRLEVEL_LOW, mp);
+                       error = -EFSCORRUPTED;
+                       goto done;
+               }
+               par_blkno = be32_to_cpu(btree[entno].before);
+               if (level == dead_level + 1)
+                       break;
+               xfs_trans_brelse(tp, par_buf);
+               par_buf = NULL;
+       }
+       /*
+        * We're in the right parent block.
+        * Look for the right entry.
+        */
+       for (;;) {
+               for (;
+                    entno < par_hdr.count &&
+                    be32_to_cpu(btree[entno].before) != last_blkno;
+                    entno++)
+                       continue;
+               if (entno < par_hdr.count)
+                       break;
+               par_blkno = par_hdr.forw;
+               xfs_trans_brelse(tp, par_buf);
+               par_buf = NULL;
+               if (unlikely(par_blkno == 0)) {
+                       XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
+                                        XFS_ERRLEVEL_LOW, mp);
+                       error = -EFSCORRUPTED;
+                       goto done;
+               }
+               error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+               if (error)
+                       goto done;
+               par_node = par_buf->b_addr;
+               dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
+               if (par_hdr.level != level) {
+                       XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
+                                        XFS_ERRLEVEL_LOW, mp);
+                       error = -EFSCORRUPTED;
+                       goto done;
+               }
+               btree = dp->d_ops->node_tree_p(par_node);
+               entno = 0;
+       }
+       /*
+        * Update the parent entry pointing to the moved block.
+        */
+       btree[entno].before = cpu_to_be32(dead_blkno);
+       xfs_trans_log_buf(tp, par_buf,
+               XFS_DA_LOGRANGE(par_node, &btree[entno].before,
+                               sizeof(btree[entno].before)));
+       *dead_blknop = last_blkno;
+       *dead_bufp = last_buf;
+       return 0;
+done:
+       if (par_buf)
+               xfs_trans_brelse(tp, par_buf);
+       if (sib_buf)
+               xfs_trans_brelse(tp, sib_buf);
+       xfs_trans_brelse(tp, last_buf);
+       return error;
+}
+
+/*
+ * Remove a btree block from a directory or attribute.
+ */
+int
+xfs_da_shrink_inode(
+       xfs_da_args_t   *args,
+       xfs_dablk_t     dead_blkno,
+       struct xfs_buf  *dead_buf)
+{
+       xfs_inode_t *dp;
+       int done, error, w, count;
+       xfs_trans_t *tp;
+       xfs_mount_t *mp;
+
+       trace_xfs_da_shrink_inode(args);
+
+       dp = args->dp;
+       w = args->whichfork;
+       tp = args->trans;
+       mp = dp->i_mount;
+       count = args->geo->fsbcount;
+       for (;;) {
+               /*
+                * Remove extents.  If we get ENOSPC for a dir we have to move
+                * the last block to the place we want to kill.
+                */
+               error = xfs_bunmapi(tp, dp, dead_blkno, count,
+                                   xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+                                   0, args->firstblock, args->flist, &done);
+               if (error == -ENOSPC) {
+                       if (w != XFS_DATA_FORK)
+                               break;
+                       error = xfs_da3_swap_lastblock(args, &dead_blkno,
+                                                     &dead_buf);
+                       if (error)
+                               break;
+               } else {
+                       break;
+               }
+       }
+       xfs_trans_binval(tp, dead_buf);
+       return error;
+}
+
+/*
+ * See if the mapping(s) for this btree block are valid, i.e.
+ * don't contain holes, are logically contiguous, and cover the whole range.
+ */
+STATIC int
+xfs_da_map_covers_blocks(
+       int             nmap,
+       xfs_bmbt_irec_t *mapp,
+       xfs_dablk_t     bno,
+       int             count)
+{
+       int             i;
+       xfs_fileoff_t   off;
+
+       for (i = 0, off = bno; i < nmap; i++) {
+               if (mapp[i].br_startblock == HOLESTARTBLOCK ||
+                   mapp[i].br_startblock == DELAYSTARTBLOCK) {
+                       return 0;
+               }
+               if (off != mapp[i].br_startoff) {
+                       return 0;
+               }
+               off += mapp[i].br_blockcount;
+       }
+       return off == bno + count;
+}
+
+/*
+ * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map.
+ *
+ * For the single map case, it is assumed that the caller has provided a pointer
+ * to a valid xfs_buf_map.  For the multiple map case, this function will
+ * allocate the xfs_buf_map to hold all the maps and replace the caller's single
+ * map pointer with the allocated map.
+ */
+static int
+xfs_buf_map_from_irec(
+       struct xfs_mount        *mp,
+       struct xfs_buf_map      **mapp,
+       int                     *nmaps,
+       struct xfs_bmbt_irec    *irecs,
+       int                     nirecs)
+{
+       struct xfs_buf_map      *map;
+       int                     i;
+
+       ASSERT(*nmaps == 1);
+       ASSERT(nirecs >= 1);
+
+       if (nirecs > 1) {
+               map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
+                                 KM_SLEEP | KM_NOFS);
+               if (!map)
+                       return -ENOMEM;
+               *mapp = map;
+       }
+
+       *nmaps = nirecs;
+       map = *mapp;
+       for (i = 0; i < *nmaps; i++) {
+               ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK &&
+                      irecs[i].br_startblock != HOLESTARTBLOCK);
+               map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
+               map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
+       }
+       return 0;
+}
+
+/*
+ * Map the block we are given, ready for reading. There are three possible
+ * return values:
+ *     -1 - will be returned if we land in a hole and mappedbno == -2 so the
+ *          caller knows not to execute a subsequent read.
+ *      0 - if we mapped the block successfully
+ *     <0 - negative errno if there was an error.
+ */
+static int
+xfs_dabuf_map(
+       struct xfs_inode        *dp,
+       xfs_dablk_t             bno,
+       xfs_daddr_t             mappedbno,
+       int                     whichfork,
+       struct xfs_buf_map      **map,
+       int                     *nmaps)
+{
+       struct xfs_mount        *mp = dp->i_mount;
+       int                     nfsb;
+       int                     error = 0;
+       struct xfs_bmbt_irec    irec;
+       struct xfs_bmbt_irec    *irecs = &irec;
+       int                     nirecs;
+
+       ASSERT(map && *map);
+       ASSERT(*nmaps == 1);
+
+       if (whichfork == XFS_DATA_FORK)
+               nfsb = mp->m_dir_geo->fsbcount;
+       else
+               nfsb = mp->m_attr_geo->fsbcount;
+
+       /*
+        * Caller doesn't have a mapping.  -1 means map it for us; -2
+        * additionally means don't complain if we land in a hole.
+        */
+       if (mappedbno == -1 || mappedbno == -2) {
+               /*
+                * Optimize the one-block case.
+                */
+               if (nfsb != 1)
+                       irecs = kmem_zalloc(sizeof(irec) * nfsb,
+                                           KM_SLEEP | KM_NOFS);
+
+               nirecs = nfsb;
+               error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
+                                      &nirecs, xfs_bmapi_aflag(whichfork));
+               if (error)
+                       goto out;
+       } else {
+               irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
+               irecs->br_startoff = (xfs_fileoff_t)bno;
+               irecs->br_blockcount = nfsb;
+               irecs->br_state = 0;
+               nirecs = 1;
+       }
+
+       if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
+               error = mappedbno == -2 ? -1 : -EFSCORRUPTED;
+               if (unlikely(error == -EFSCORRUPTED)) {
+                       if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+                               int i;
+                               xfs_alert(mp, "%s: bno %lld dir: inode %lld",
+                                       __func__, (long long)bno,
+                                       (long long)dp->i_ino);
+                               for (i = 0; i < nirecs; i++) {
+                                       xfs_alert(mp,
+"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
+                                               i,
+                                               (long long)irecs[i].br_startoff,
+                                               (long long)irecs[i].br_startblock,
+                                               (long long)irecs[i].br_blockcount,
+                                               irecs[i].br_state);
+                               }
+                       }
+                       XFS_ERROR_REPORT("xfs_da_do_buf(1)",
+                                        XFS_ERRLEVEL_LOW, mp);
+               }
+               goto out;
+       }
+       error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs);
+out:
+       if (irecs != &irec)
+               kmem_free(irecs);
+       return error;
+}
+
+/*
+ * Get a buffer for the dir/attr block.
+ */
+int
+xfs_da_get_buf(
+       struct xfs_trans        *trans,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             bno,
+       xfs_daddr_t             mappedbno,
+       struct xfs_buf          **bpp,
+       int                     whichfork)
+{
+       struct xfs_buf          *bp;
+       struct xfs_buf_map      map;
+       struct xfs_buf_map      *mapp;
+       int                     nmap;
+       int                     error;
+
+       *bpp = NULL;
+       mapp = &map;
+       nmap = 1;
+       error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
+                               &mapp, &nmap);
+       if (error) {
+               /* mapping a hole is not an error, but we don't continue */
+               if (error == -1)
+                       error = 0;
+               goto out_free;
+       }
+
+       bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
+                                   mapp, nmap, 0);
+       error = bp ? bp->b_error : -EIO;
+       if (error) {
+               if (bp)
+                       xfs_trans_brelse(trans, bp);
+               goto out_free;
+       }
+
+       *bpp = bp;
+
+out_free:
+       if (mapp != &map)
+               kmem_free(mapp);
+
+       return error;
+}
+
+/*
+ * Get a buffer for the dir/attr block, fill in the contents.
+ */
+int
+xfs_da_read_buf(
+       struct xfs_trans        *trans,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             bno,
+       xfs_daddr_t             mappedbno,
+       struct xfs_buf          **bpp,
+       int                     whichfork,
+       const struct xfs_buf_ops *ops)
+{
+       struct xfs_buf          *bp;
+       struct xfs_buf_map      map;
+       struct xfs_buf_map      *mapp;
+       int                     nmap;
+       int                     error;
+
+       *bpp = NULL;
+       mapp = &map;
+       nmap = 1;
+       error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
+                               &mapp, &nmap);
+       if (error) {
+               /* mapping a hole is not an error, but we don't continue */
+               if (error == -1)
+                       error = 0;
+               goto out_free;
+       }
+
+       error = xfs_trans_read_buf_map(dp->i_mount, trans,
+                                       dp->i_mount->m_ddev_targp,
+                                       mapp, nmap, 0, &bp, ops);
+       if (error)
+               goto out_free;
+
+       if (whichfork == XFS_ATTR_FORK)
+               xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
+       else
+               xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
+       *bpp = bp;
+out_free:
+       if (mapp != &map)
+               kmem_free(mapp);
+
+       return error;
+}
+
+/*
+ * Readahead the dir/attr block.
+ */
+xfs_daddr_t
+xfs_da_reada_buf(
+       struct xfs_inode        *dp,
+       xfs_dablk_t             bno,
+       xfs_daddr_t             mappedbno,
+       int                     whichfork,
+       const struct xfs_buf_ops *ops)
+{
+       struct xfs_buf_map      map;
+       struct xfs_buf_map      *mapp;
+       int                     nmap;
+       int                     error;
+
+       mapp = &map;
+       nmap = 1;
+       error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
+                               &mapp, &nmap);
+       if (error) {
+               /* mapping a hole is not an error, but we don't continue */
+               if (error == -1)
+                       error = 0;
+               goto out_free;
+       }
+
+       mappedbno = mapp[0].bm_bn;
+       xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
+
+out_free:
+       if (mapp != &map)
+               kmem_free(mapp);
+
+       if (error)
+               return -1;
+       return mappedbno;
+}
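+
+/*
+ * Caller sketch for the mappedbno convention shared by the three buffer
+ * routines above (the verifier named here, &xfs_da3_node_buf_ops, is an
+ * assumption - use whichever ops match the block being read):
+ *
+ *      -1   map the dablk through the bmap btree, then read it
+ *      -2   as -1, but landing in a hole is not an error
+ *      >=0  caller already knows the disk address; skip the mapping
+ *
+ *      error = xfs_da_read_buf(tp, dp, bno, -1, &bp, XFS_DATA_FORK,
+ *                              &xfs_da3_node_buf_ops);
+ */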
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
new file mode 100644 (file)
index 0000000..6e153e3
--- /dev/null
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DA_BTREE_H__
+#define        __XFS_DA_BTREE_H__
+
+struct xfs_bmap_free;
+struct xfs_inode;
+struct xfs_trans;
+struct zone;
+struct xfs_dir_ops;
+
+/*
+ * Directory/attribute geometry information. There will be one of these for each
+ * data fork type, and it will be passed around via the xfs_da_args. Global
+ * structures will be attached to the xfs_mount.
+ */
+struct xfs_da_geometry {
+       int             blksize;        /* da block size in bytes */
+       int             fsbcount;       /* da block size in filesystem blocks */
+       uint8_t         fsblog;         /* log2 of _filesystem_ block size */
+       uint8_t         blklog;         /* log2 of da block size */
+       uint            node_ents;      /* # of entries in a danode */
+       int             magicpct;       /* 37% of block size in bytes */
+       xfs_dablk_t     datablk;        /* blockno of dir data v2 */
+       xfs_dablk_t     leafblk;        /* blockno of leaf data v2 */
+       xfs_dablk_t     freeblk;        /* blockno of free data v2 */
+};
+
+/*========================================================================
+ * Btree searching and modification structure definitions.
+ *========================================================================*/
+
+/*
+ * Search comparison results
+ */
+enum xfs_dacmp {
+       XFS_CMP_DIFFERENT,      /* names are completely different */
+       XFS_CMP_EXACT,          /* names are exactly the same */
+       XFS_CMP_CASE            /* names are same but differ in case */
+};
+
+/*
+ * Structure to ease passing around component names.
+ */
+typedef struct xfs_da_args {
+       struct xfs_da_geometry *geo;    /* da block geometry */
+       const __uint8_t *name;          /* string (maybe not NULL terminated) */
+       int             namelen;        /* length of string (maybe no NULL) */
+       __uint8_t       filetype;       /* filetype of inode for directories */
+       __uint8_t       *value;         /* set of bytes (maybe contain NULLs) */
+       int             valuelen;       /* length of value */
+       int             flags;          /* argument flags (eg: ATTR_NOCREATE) */
+       xfs_dahash_t    hashval;        /* hash value of name */
+       xfs_ino_t       inumber;        /* input/output inode number */
+       struct xfs_inode *dp;           /* directory inode to manipulate */
+       xfs_fsblock_t   *firstblock;    /* ptr to firstblock for bmap calls */
+       struct xfs_bmap_free *flist;    /* ptr to freelist for bmap_finish */
+       struct xfs_trans *trans;        /* current trans (changes over time) */
+       xfs_extlen_t    total;          /* total blocks needed, for 1st bmap */
+       int             whichfork;      /* data or attribute fork */
+       xfs_dablk_t     blkno;          /* blkno of attr leaf of interest */
+       int             index;          /* index of attr of interest in blk */
+       xfs_dablk_t     rmtblkno;       /* remote attr value starting blkno */
+       int             rmtblkcnt;      /* remote attr value block count */
+       int             rmtvaluelen;    /* remote attr value length in bytes */
+       xfs_dablk_t     blkno2;         /* blkno of 2nd attr leaf of interest */
+       int             index2;         /* index of 2nd attr in blk */
+       xfs_dablk_t     rmtblkno2;      /* remote attr value starting blkno */
+       int             rmtblkcnt2;     /* remote attr value block count */
+       int             rmtvaluelen2;   /* remote attr value length in bytes */
+       int             op_flags;       /* operation flags */
+       enum xfs_dacmp  cmpresult;      /* name compare result for lookups */
+} xfs_da_args_t;
+
+/*
+ * Operation flags:
+ */
+#define XFS_DA_OP_JUSTCHECK    0x0001  /* check for ok with no space */
+#define XFS_DA_OP_RENAME       0x0002  /* this is an atomic rename op */
+#define XFS_DA_OP_ADDNAME      0x0004  /* this is an add operation */
+#define XFS_DA_OP_OKNOENT      0x0008  /* lookup/add op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP     0x0010  /* lookup to return CI name if found */
+
+#define XFS_DA_OP_FLAGS \
+       { XFS_DA_OP_JUSTCHECK,  "JUSTCHECK" }, \
+       { XFS_DA_OP_RENAME,     "RENAME" }, \
+       { XFS_DA_OP_ADDNAME,    "ADDNAME" }, \
+       { XFS_DA_OP_OKNOENT,    "OKNOENT" }, \
+       { XFS_DA_OP_CILOOKUP,   "CILOOKUP" }
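+
+/*
+ * The flag/name pairs above feed the tracepoint pretty-printer; a trace
+ * event would typically render op_flags as, for example (sketch):
+ *
+ *      __print_flags(args->op_flags, "|", XFS_DA_OP_FLAGS)
+ */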
+
+/*
+ * Storage for holding state during Btree searches and split/join ops.
+ *
+ * Only need space for 5 intermediate nodes.  With a minimum of 62-way
+ * fanout to the Btree, we can support over 900 million directory blocks,
+ * which is slightly more than enough.
+ */
+typedef struct xfs_da_state_blk {
+       struct xfs_buf  *bp;            /* buffer containing block */
+       xfs_dablk_t     blkno;          /* filesystem blkno of buffer */
+       xfs_daddr_t     disk_blkno;     /* on-disk blkno (in BBs) of buffer */
+       int             index;          /* relevant index into block */
+       xfs_dahash_t    hashval;        /* last hash value in block */
+       int             magic;          /* blk's magic number, ie: blk type */
+} xfs_da_state_blk_t;
+
+typedef struct xfs_da_state_path {
+       int                     active;         /* number of active levels */
+       xfs_da_state_blk_t      blk[XFS_DA_NODE_MAXDEPTH];
+} xfs_da_state_path_t;
+
+typedef struct xfs_da_state {
+       xfs_da_args_t           *args;          /* filename arguments */
+       struct xfs_mount        *mp;            /* filesystem mount point */
+       xfs_da_state_path_t     path;           /* search/split paths */
+       xfs_da_state_path_t     altpath;        /* alternate path for join */
+       unsigned char           inleaf;         /* insert into 1->lf, 0->splf */
+       unsigned char           extravalid;     /* T/F: extrablk is in use */
+       unsigned char           extraafter;     /* T/F: extrablk is after new */
+       xfs_da_state_blk_t      extrablk;       /* for double-splits on leaves */
+                                               /* for dirv2 extrablk is data */
+} xfs_da_state_t;
+
+/*
+ * Utility macros to aid in logging changed structure fields.
+ */
+#define XFS_DA_LOGOFF(BASE, ADDR)      ((char *)(ADDR) - (char *)(BASE))
+#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE)      \
+               (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
+               (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
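+
+/*
+ * Example from xfs_da3_swap_lastblock() above: to log just the forw
+ * pointer of a sibling block header, pass the containing structure, the
+ * field's address and the field's size:
+ *
+ *      xfs_trans_log_buf(tp, sib_buf,
+ *              XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
+ *                              sizeof(sib_info->forw)));
+ *
+ * which expands to the (first, last) byte offsets of that field within
+ * the buffer.
+ */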
+
+/*
+ * Name ops for directory and/or attr name operations
+ */
+struct xfs_nameops {
+       xfs_dahash_t    (*hashname)(struct xfs_name *);
+       enum xfs_dacmp  (*compname)(struct xfs_da_args *,
+                                       const unsigned char *, int);
+};
+
+
+/*========================================================================
+ * Function prototypes.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+int    xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno,
+                           int level, struct xfs_buf **bpp, int whichfork);
+int    xfs_da3_split(xfs_da_state_t *state);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int    xfs_da3_join(xfs_da_state_t *state);
+void   xfs_da3_fixhashpath(struct xfs_da_state *state,
+                           struct xfs_da_state_path *path_to_fix);
+
+/*
+ * Routines used for finding things in the Btree.
+ */
+int    xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result);
+int    xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+                                        int forward, int release, int *result);
+/*
+ * Utility routines.
+ */
+int    xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+                                      xfs_da_state_blk_t *new_blk);
+int    xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
+                        xfs_dablk_t bno, xfs_daddr_t mappedbno,
+                        struct xfs_buf **bpp, int which_fork);
+
+/*
+ * Utility routines.
+ */
+int    xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
+int    xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
+                             int count);
+int    xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+                             xfs_dablk_t bno, xfs_daddr_t mappedbno,
+                             struct xfs_buf **bp, int whichfork);
+int    xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+                              xfs_dablk_t bno, xfs_daddr_t mappedbno,
+                              struct xfs_buf **bpp, int whichfork,
+                              const struct xfs_buf_ops *ops);
+xfs_daddr_t    xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
+                               xfs_daddr_t mapped_bno, int whichfork,
+                               const struct xfs_buf_ops *ops);
+int    xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+                                         struct xfs_buf *dead_buf);
+
+xfs_dahash_t xfs_da_hashname(const __uint8_t *name_string, int name_length);
+enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
+                               const unsigned char *name, int len);
+
+
+xfs_da_state_t *xfs_da_state_alloc(void);
+void xfs_da_state_free(xfs_da_state_t *state);
+
+extern struct kmem_zone *xfs_da_state_zone;
+extern const struct xfs_nameops xfs_default_nameops;
+
+#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
new file mode 100644 (file)
index 0000000..c9aee52
--- /dev/null
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+
+/*
+ * Shortform directory ops
+ */
+static int
+xfs_dir2_sf_entsize(
+       struct xfs_dir2_sf_hdr  *hdr,
+       int                     len)
+{
+       int count = sizeof(struct xfs_dir2_sf_entry);   /* namelen + offset */
+
+       count += len;                                   /* name */
+       count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
+                               sizeof(xfs_dir2_ino4_t); /* ino # */
+       return count;
+}
+
+static int
+xfs_dir3_sf_entsize(
+       struct xfs_dir2_sf_hdr  *hdr,
+       int                     len)
+{
+       return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t);
+}
+
+static struct xfs_dir2_sf_entry *
+xfs_dir2_sf_nextentry(
+       struct xfs_dir2_sf_hdr  *hdr,
+       struct xfs_dir2_sf_entry *sfep)
+{
+       return (struct xfs_dir2_sf_entry *)
+               ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen));
+}
+
+static struct xfs_dir2_sf_entry *
+xfs_dir3_sf_nextentry(
+       struct xfs_dir2_sf_hdr  *hdr,
+       struct xfs_dir2_sf_entry *sfep)
+{
+       return (struct xfs_dir2_sf_entry *)
+               ((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen));
+}
+
+
+/*
+ * For filetype enabled shortform directories, the file type field is stored at
+ * the end of the name.  Because it's only a single byte, endian conversion is
+ * not necessary. For non-filetype enabled directories, the type is always
+ * unknown and we never store the value.
+ */
+static __uint8_t
+xfs_dir2_sfe_get_ftype(
+       struct xfs_dir2_sf_entry *sfep)
+{
+       return XFS_DIR3_FT_UNKNOWN;
+}
+
+static void
+xfs_dir2_sfe_put_ftype(
+       struct xfs_dir2_sf_entry *sfep,
+       __uint8_t               ftype)
+{
+       ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+
+static __uint8_t
+xfs_dir3_sfe_get_ftype(
+       struct xfs_dir2_sf_entry *sfep)
+{
+       __uint8_t       ftype;
+
+       ftype = sfep->name[sfep->namelen];
+       if (ftype >= XFS_DIR3_FT_MAX)
+               return XFS_DIR3_FT_UNKNOWN;
+       return ftype;
+}
+
+static void
+xfs_dir3_sfe_put_ftype(
+       struct xfs_dir2_sf_entry *sfep,
+       __uint8_t               ftype)
+{
+       ASSERT(ftype < XFS_DIR3_FT_MAX);
+
+       sfep->name[sfep->namelen] = ftype;
+}
+
+/*
+ * Inode numbers in short-form directories can come in two versions,
+ * either 4 bytes or 8 bytes wide.  These helpers deal with the
+ * two forms transparently by looking at the headers i8count field.
+ *
+ * For 64-bit inode number the most significant byte must be zero.
+ */
+static xfs_ino_t
+xfs_dir2_sf_get_ino(
+       struct xfs_dir2_sf_hdr  *hdr,
+       xfs_dir2_inou_t         *from)
+{
+       if (hdr->i8count)
+               return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
+       else
+               return get_unaligned_be32(&from->i4.i);
+}
+
+static void
+xfs_dir2_sf_put_ino(
+       struct xfs_dir2_sf_hdr  *hdr,
+       xfs_dir2_inou_t         *to,
+       xfs_ino_t               ino)
+{
+       ASSERT((ino & 0xff00000000000000ULL) == 0);
+
+       if (hdr->i8count)
+               put_unaligned_be64(ino, &to->i8.i);
+       else
+               put_unaligned_be32(ino, &to->i4.i);
+}
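+
+/*
+ * Worked example (descriptive note, not new behaviour): while i8count
+ * is zero every inode number in the shortform directory is stored in 4
+ * bytes; the first entry whose inode number needs more than 32 bits
+ * bumps i8count and the directory is rewritten with 8-byte inode
+ * fields throughout.
+ */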
+
+static xfs_ino_t
+xfs_dir2_sf_get_parent_ino(
+       struct xfs_dir2_sf_hdr  *hdr)
+{
+       return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
+}
+
+static void
+xfs_dir2_sf_put_parent_ino(
+       struct xfs_dir2_sf_hdr  *hdr,
+       xfs_ino_t               ino)
+{
+       xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
+}
+
+/*
+ * In short-form directory entries the inode numbers are stored at variable
+ * offset behind the entry name. If the entry stores a filetype value, then it
+ * sits between the name and the inode number. Hence the inode numbers may only
+ * be accessed through the helpers below.
+ */
+static xfs_ino_t
+xfs_dir2_sfe_get_ino(
+       struct xfs_dir2_sf_hdr  *hdr,
+       struct xfs_dir2_sf_entry *sfep)
+{
+       return xfs_dir2_sf_get_ino(hdr,
+                               (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
+}
+
+static void
+xfs_dir2_sfe_put_ino(
+       struct xfs_dir2_sf_hdr  *hdr,
+       struct xfs_dir2_sf_entry *sfep,
+       xfs_ino_t               ino)
+{
+       xfs_dir2_sf_put_ino(hdr,
+                           (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
+}
+
+static xfs_ino_t
+xfs_dir3_sfe_get_ino(
+       struct xfs_dir2_sf_hdr  *hdr,
+       struct xfs_dir2_sf_entry *sfep)
+{
+       return xfs_dir2_sf_get_ino(hdr,
+                       (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
+}
+
+static void
+xfs_dir3_sfe_put_ino(
+       struct xfs_dir2_sf_hdr  *hdr,
+       struct xfs_dir2_sf_entry *sfep,
+       xfs_ino_t               ino)
+{
+       xfs_dir2_sf_put_ino(hdr,
+                       (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
+}
+
+
+/*
+ * Directory data block operations
+ */
+
+/*
+ * In special situations the dirent size ends up fixed because we always know
+ * what the size of the entry is. That's true for the "." and ".." entries, so
+ * we know they are a fixed size, and hence their offsets are constant, as is
+ * the offset of the first entry.
+ *
+ * Hence this calculation is written as a macro so it can be evaluated at
+ * compile time, and so certain offsets can be calculated directly in the
+ * structure initialiser via the macro. There are two macros - one for dirents
+ * with ftype and one without - so there are no unresolvable conditionals in
+ * the calculations. We also use round_up(), as XFS_DIR2_DATA_ALIGN is always a
+ * power of 2 and the compiler accepts it here (unlike roundup()).
+ */
+#define XFS_DIR2_DATA_ENTSIZE(n)                                       \
+       round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
+                sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN)
+
+#define XFS_DIR3_DATA_ENTSIZE(n)                                       \
+       round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
+                sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)),      \
+               XFS_DIR2_DATA_ALIGN)
+
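+/*
+ * Worked example (assuming the usual 8-byte XFS_DIR2_DATA_ALIGN and the
+ * 9-byte fixed part of struct xfs_dir2_data_entry): for ".", n = 1, so
+ * XFS_DIR2_DATA_ENTSIZE(1) = round_up(9 + 1 + 2, 8) = 16 bytes, and the
+ * ftype variant XFS_DIR3_DATA_ENTSIZE(1) = round_up(9 + 1 + 2 + 1, 8)
+ * is also 16 bytes.
+ */
+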
+static int
+xfs_dir2_data_entsize(
+       int                     n)
+{
+       return XFS_DIR2_DATA_ENTSIZE(n);
+}
+
+static int
+xfs_dir3_data_entsize(
+       int                     n)
+{
+       return XFS_DIR3_DATA_ENTSIZE(n);
+}
+
+static __uint8_t
+xfs_dir2_data_get_ftype(
+       struct xfs_dir2_data_entry *dep)
+{
+       return XFS_DIR3_FT_UNKNOWN;
+}
+
+static void
+xfs_dir2_data_put_ftype(
+       struct xfs_dir2_data_entry *dep,
+       __uint8_t               ftype)
+{
+       ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+
+static __uint8_t
+xfs_dir3_data_get_ftype(
+       struct xfs_dir2_data_entry *dep)
+{
+       __uint8_t       ftype = dep->name[dep->namelen];
+
+       ASSERT(ftype < XFS_DIR3_FT_MAX);
+       if (ftype >= XFS_DIR3_FT_MAX)
+               return XFS_DIR3_FT_UNKNOWN;
+       return ftype;
+}
+
+static void
+xfs_dir3_data_put_ftype(
+       struct xfs_dir2_data_entry *dep,
+       __uint8_t               type)
+{
+       ASSERT(type < XFS_DIR3_FT_MAX);
+       ASSERT(dep->namelen != 0);
+
+       dep->name[dep->namelen] = type;
+}
+
+/*
+ * Pointer to an entry's tag word.
+ */
+static __be16 *
+xfs_dir2_data_entry_tag_p(
+       struct xfs_dir2_data_entry *dep)
+{
+       return (__be16 *)((char *)dep +
+               xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
+}
+
+static __be16 *
+xfs_dir3_data_entry_tag_p(
+       struct xfs_dir2_data_entry *dep)
+{
+       return (__be16 *)((char *)dep +
+               xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16));
+}
+
+/*
+ * location of . and .. in data space (always block 0)
+ */
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dot_entry_p(
+       struct xfs_dir2_data_hdr *hdr)
+{
+       return (struct xfs_dir2_data_entry *)
+               ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dotdot_entry_p(
+       struct xfs_dir2_data_hdr *hdr)
+{
+       return (struct xfs_dir2_data_entry *)
+               ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+                               XFS_DIR2_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_first_entry_p(
+       struct xfs_dir2_data_hdr *hdr)
+{
+       return (struct xfs_dir2_data_entry *)
+               ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+                               XFS_DIR2_DATA_ENTSIZE(1) +
+                               XFS_DIR2_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_dotdot_entry_p(
+       struct xfs_dir2_data_hdr *hdr)
+{
+       return (struct xfs_dir2_data_entry *)
+               ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+                               XFS_DIR3_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_first_entry_p(
+       struct xfs_dir2_data_hdr *hdr)
+{
+       return (struct xfs_dir2_data_entry *)
+               ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+                               XFS_DIR3_DATA_ENTSIZE(1) +
+                               XFS_DIR3_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dot_entry_p(
+       struct xfs_dir2_data_hdr *hdr)
+{
+       return (struct xfs_dir2_data_entry *)
+               ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dotdot_entry_p(
+       struct xfs_dir2_data_hdr *hdr)
+{
+       return (struct xfs_dir2_data_entry *)
+               ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
+                               XFS_DIR3_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_first_entry_p(
+       struct xfs_dir2_data_hdr *hdr)
+{
+       return (struct xfs_dir2_data_entry *)
+               ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
+                               XFS_DIR3_DATA_ENTSIZE(1) +
+                               XFS_DIR3_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_free *
+xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+       return hdr->bestfree;
+}
+
+static struct xfs_dir2_data_free *
+xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+       return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+       return (struct xfs_dir2_data_entry *)
+               ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_unused *
+xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+       return (struct xfs_dir2_data_unused *)
+               ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+       return (struct xfs_dir2_data_entry *)
+               ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+static struct xfs_dir2_data_unused *
+xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+       return (struct xfs_dir2_data_unused *)
+               ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+
+/*
+ * Directory Leaf block operations
+ */
+static int
+xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo)
+{
+       return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) /
+               (uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+
+static struct xfs_dir2_leaf_entry *
+xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+       return lp->__ents;
+}
+
+static int
+xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo)
+{
+       return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) /
+               (uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+
+static struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+       return ((struct xfs_dir3_leaf *)lp)->__ents;
+}
+
+static void
+xfs_dir2_leaf_hdr_from_disk(
+       struct xfs_dir3_icleaf_hdr      *to,
+       struct xfs_dir2_leaf            *from)
+{
+       to->forw = be32_to_cpu(from->hdr.info.forw);
+       to->back = be32_to_cpu(from->hdr.info.back);
+       to->magic = be16_to_cpu(from->hdr.info.magic);
+       to->count = be16_to_cpu(from->hdr.count);
+       to->stale = be16_to_cpu(from->hdr.stale);
+
+       ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
+              to->magic == XFS_DIR2_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir2_leaf_hdr_to_disk(
+       struct xfs_dir2_leaf            *to,
+       struct xfs_dir3_icleaf_hdr      *from)
+{
+       ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
+              from->magic == XFS_DIR2_LEAFN_MAGIC);
+
+       to->hdr.info.forw = cpu_to_be32(from->forw);
+       to->hdr.info.back = cpu_to_be32(from->back);
+       to->hdr.info.magic = cpu_to_be16(from->magic);
+       to->hdr.count = cpu_to_be16(from->count);
+       to->hdr.stale = cpu_to_be16(from->stale);
+}
+
+static void
+xfs_dir3_leaf_hdr_from_disk(
+       struct xfs_dir3_icleaf_hdr      *to,
+       struct xfs_dir2_leaf            *from)
+{
+       struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
+
+       to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+       to->back = be32_to_cpu(hdr3->info.hdr.back);
+       to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+       to->count = be16_to_cpu(hdr3->count);
+       to->stale = be16_to_cpu(hdr3->stale);
+
+       ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
+              to->magic == XFS_DIR3_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir3_leaf_hdr_to_disk(
+       struct xfs_dir2_leaf            *to,
+       struct xfs_dir3_icleaf_hdr      *from)
+{
+       struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
+
+       ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
+              from->magic == XFS_DIR3_LEAFN_MAGIC);
+
+       hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+       hdr3->info.hdr.back = cpu_to_be32(from->back);
+       hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+       hdr3->count = cpu_to_be16(from->count);
+       hdr3->stale = cpu_to_be16(from->stale);
+}
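A hedged sketch of how these converters are meant to be used, as a round trip through the in-core header (the helper below is illustrative only, not part of the patch):

/* Illustrative only: decode, modify in native endianness, re-encode. */
static void
example_bump_leaf_count(struct xfs_dir2_leaf *leaf)
{
	struct xfs_dir3_icleaf_hdr ichdr;

	xfs_dir2_leaf_hdr_from_disk(&ichdr, leaf);	/* big-endian -> cpu */
	ichdr.count++;					/* work in native form */
	xfs_dir2_leaf_hdr_to_disk(leaf, &ichdr);	/* cpu -> big-endian */
}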
+
+
+/*
+ * Directory/Attribute Node block operations
+ */
+static struct xfs_da_node_entry *
+xfs_da2_node_tree_p(struct xfs_da_intnode *dap)
+{
+       return dap->__btree;
+}
+
+static struct xfs_da_node_entry *
+xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
+{
+       return ((struct xfs_da3_intnode *)dap)->__btree;
+}
+
+static void
+xfs_da2_node_hdr_from_disk(
+       struct xfs_da3_icnode_hdr       *to,
+       struct xfs_da_intnode           *from)
+{
+       ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+       to->forw = be32_to_cpu(from->hdr.info.forw);
+       to->back = be32_to_cpu(from->hdr.info.back);
+       to->magic = be16_to_cpu(from->hdr.info.magic);
+       to->count = be16_to_cpu(from->hdr.__count);
+       to->level = be16_to_cpu(from->hdr.__level);
+}
+
+static void
+xfs_da2_node_hdr_to_disk(
+       struct xfs_da_intnode           *to,
+       struct xfs_da3_icnode_hdr       *from)
+{
+       ASSERT(from->magic == XFS_DA_NODE_MAGIC);
+       to->hdr.info.forw = cpu_to_be32(from->forw);
+       to->hdr.info.back = cpu_to_be32(from->back);
+       to->hdr.info.magic = cpu_to_be16(from->magic);
+       to->hdr.__count = cpu_to_be16(from->count);
+       to->hdr.__level = cpu_to_be16(from->level);
+}
+
+static void
+xfs_da3_node_hdr_from_disk(
+       struct xfs_da3_icnode_hdr       *to,
+       struct xfs_da_intnode           *from)
+{
+       struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
+
+       ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+       to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+       to->back = be32_to_cpu(hdr3->info.hdr.back);
+       to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+       to->count = be16_to_cpu(hdr3->__count);
+       to->level = be16_to_cpu(hdr3->__level);
+}
+
+static void
+xfs_da3_node_hdr_to_disk(
+       struct xfs_da_intnode           *to,
+       struct xfs_da3_icnode_hdr       *from)
+{
+       struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
+
+       ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
+       hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+       hdr3->info.hdr.back = cpu_to_be32(from->back);
+       hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+       hdr3->__count = cpu_to_be16(from->count);
+       hdr3->__level = cpu_to_be16(from->level);
+}
+
+
+/*
+ * Directory free space block operations
+ */
+static int
+xfs_dir2_free_max_bests(struct xfs_da_geometry *geo)
+{
+       return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) /
+               sizeof(xfs_dir2_data_off_t);
+}
+
+static __be16 *
+xfs_dir2_free_bests_p(struct xfs_dir2_free *free)
+{
+       return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr));
+}
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+       return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+                       (db / xfs_dir2_free_max_bests(geo));
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+       return db % xfs_dir2_free_max_bests(geo);
+}
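To make the mapping concrete, a hedged worked example: with a 4096-byte directory block, a 16-byte xfs_dir2_free_hdr and 2-byte best entries (sizes implied by the definitions later in this patch), xfs_dir2_free_max_bests() is (4096 - 16) / 2 = 2040, so data block 5000 lands in the third free block (5000 / 2040 = 2) at index 5000 % 2040 = 920 within it.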
+
+static int
+xfs_dir3_free_max_bests(struct xfs_da_geometry *geo)
+{
+       return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) /
+               sizeof(xfs_dir2_data_off_t);
+}
+
+static __be16 *
+xfs_dir3_free_bests_p(struct xfs_dir2_free *free)
+{
+       return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr));
+}
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+       return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+                       (db / xfs_dir3_free_max_bests(geo));
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+       return db % xfs_dir3_free_max_bests(geo);
+}
+
+static void
+xfs_dir2_free_hdr_from_disk(
+       struct xfs_dir3_icfree_hdr      *to,
+       struct xfs_dir2_free            *from)
+{
+       to->magic = be32_to_cpu(from->hdr.magic);
+       to->firstdb = be32_to_cpu(from->hdr.firstdb);
+       to->nvalid = be32_to_cpu(from->hdr.nvalid);
+       to->nused = be32_to_cpu(from->hdr.nused);
+       ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
+}
+
+static void
+xfs_dir2_free_hdr_to_disk(
+       struct xfs_dir2_free            *to,
+       struct xfs_dir3_icfree_hdr      *from)
+{
+       ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
+
+       to->hdr.magic = cpu_to_be32(from->magic);
+       to->hdr.firstdb = cpu_to_be32(from->firstdb);
+       to->hdr.nvalid = cpu_to_be32(from->nvalid);
+       to->hdr.nused = cpu_to_be32(from->nused);
+}
+
+static void
+xfs_dir3_free_hdr_from_disk(
+       struct xfs_dir3_icfree_hdr      *to,
+       struct xfs_dir2_free            *from)
+{
+       struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
+
+       to->magic = be32_to_cpu(hdr3->hdr.magic);
+       to->firstdb = be32_to_cpu(hdr3->firstdb);
+       to->nvalid = be32_to_cpu(hdr3->nvalid);
+       to->nused = be32_to_cpu(hdr3->nused);
+
+       ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
+}
+
+static void
+xfs_dir3_free_hdr_to_disk(
+       struct xfs_dir2_free            *to,
+       struct xfs_dir3_icfree_hdr      *from)
+{
+       struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
+
+       ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
+
+       hdr3->hdr.magic = cpu_to_be32(from->magic);
+       hdr3->firstdb = cpu_to_be32(from->firstdb);
+       hdr3->nvalid = cpu_to_be32(from->nvalid);
+       hdr3->nused = cpu_to_be32(from->nused);
+}
+
+static const struct xfs_dir_ops xfs_dir2_ops = {
+       .sf_entsize = xfs_dir2_sf_entsize,
+       .sf_nextentry = xfs_dir2_sf_nextentry,
+       .sf_get_ftype = xfs_dir2_sfe_get_ftype,
+       .sf_put_ftype = xfs_dir2_sfe_put_ftype,
+       .sf_get_ino = xfs_dir2_sfe_get_ino,
+       .sf_put_ino = xfs_dir2_sfe_put_ino,
+       .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+       .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+       .data_entsize = xfs_dir2_data_entsize,
+       .data_get_ftype = xfs_dir2_data_get_ftype,
+       .data_put_ftype = xfs_dir2_data_put_ftype,
+       .data_entry_tag_p = xfs_dir2_data_entry_tag_p,
+       .data_bestfree_p = xfs_dir2_data_bestfree_p,
+
+       .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
+       .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+                               XFS_DIR2_DATA_ENTSIZE(1),
+       .data_first_offset =  sizeof(struct xfs_dir2_data_hdr) +
+                               XFS_DIR2_DATA_ENTSIZE(1) +
+                               XFS_DIR2_DATA_ENTSIZE(2),
+       .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
+
+       .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+       .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
+       .data_first_entry_p = xfs_dir2_data_first_entry_p,
+       .data_entry_p = xfs_dir2_data_entry_p,
+       .data_unused_p = xfs_dir2_data_unused_p,
+
+       .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
+       .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+       .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
+       .leaf_max_ents = xfs_dir2_max_leaf_ents,
+       .leaf_ents_p = xfs_dir2_leaf_ents_p,
+
+       .node_hdr_size = sizeof(struct xfs_da_node_hdr),
+       .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+       .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+       .node_tree_p = xfs_da2_node_tree_p,
+
+       .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
+       .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+       .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+       .free_max_bests = xfs_dir2_free_max_bests,
+       .free_bests_p = xfs_dir2_free_bests_p,
+       .db_to_fdb = xfs_dir2_db_to_fdb,
+       .db_to_fdindex = xfs_dir2_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir2_ftype_ops = {
+       .sf_entsize = xfs_dir3_sf_entsize,
+       .sf_nextentry = xfs_dir3_sf_nextentry,
+       .sf_get_ftype = xfs_dir3_sfe_get_ftype,
+       .sf_put_ftype = xfs_dir3_sfe_put_ftype,
+       .sf_get_ino = xfs_dir3_sfe_get_ino,
+       .sf_put_ino = xfs_dir3_sfe_put_ino,
+       .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+       .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+       .data_entsize = xfs_dir3_data_entsize,
+       .data_get_ftype = xfs_dir3_data_get_ftype,
+       .data_put_ftype = xfs_dir3_data_put_ftype,
+       .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+       .data_bestfree_p = xfs_dir2_data_bestfree_p,
+
+       .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
+       .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+                               XFS_DIR3_DATA_ENTSIZE(1),
+       .data_first_offset =  sizeof(struct xfs_dir2_data_hdr) +
+                               XFS_DIR3_DATA_ENTSIZE(1) +
+                               XFS_DIR3_DATA_ENTSIZE(2),
+       .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
+
+       .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+       .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p,
+       .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p,
+       .data_entry_p = xfs_dir2_data_entry_p,
+       .data_unused_p = xfs_dir2_data_unused_p,
+
+       .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
+       .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+       .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
+       .leaf_max_ents = xfs_dir2_max_leaf_ents,
+       .leaf_ents_p = xfs_dir2_leaf_ents_p,
+
+       .node_hdr_size = sizeof(struct xfs_da_node_hdr),
+       .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+       .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+       .node_tree_p = xfs_da2_node_tree_p,
+
+       .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
+       .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+       .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+       .free_max_bests = xfs_dir2_free_max_bests,
+       .free_bests_p = xfs_dir2_free_bests_p,
+       .db_to_fdb = xfs_dir2_db_to_fdb,
+       .db_to_fdindex = xfs_dir2_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir3_ops = {
+       .sf_entsize = xfs_dir3_sf_entsize,
+       .sf_nextentry = xfs_dir3_sf_nextentry,
+       .sf_get_ftype = xfs_dir3_sfe_get_ftype,
+       .sf_put_ftype = xfs_dir3_sfe_put_ftype,
+       .sf_get_ino = xfs_dir3_sfe_get_ino,
+       .sf_put_ino = xfs_dir3_sfe_put_ino,
+       .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+       .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+       .data_entsize = xfs_dir3_data_entsize,
+       .data_get_ftype = xfs_dir3_data_get_ftype,
+       .data_put_ftype = xfs_dir3_data_put_ftype,
+       .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+       .data_bestfree_p = xfs_dir3_data_bestfree_p,
+
+       .data_dot_offset = sizeof(struct xfs_dir3_data_hdr),
+       .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) +
+                               XFS_DIR3_DATA_ENTSIZE(1),
+       .data_first_offset =  sizeof(struct xfs_dir3_data_hdr) +
+                               XFS_DIR3_DATA_ENTSIZE(1) +
+                               XFS_DIR3_DATA_ENTSIZE(2),
+       .data_entry_offset = sizeof(struct xfs_dir3_data_hdr),
+
+       .data_dot_entry_p = xfs_dir3_data_dot_entry_p,
+       .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p,
+       .data_first_entry_p = xfs_dir3_data_first_entry_p,
+       .data_entry_p = xfs_dir3_data_entry_p,
+       .data_unused_p = xfs_dir3_data_unused_p,
+
+       .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr),
+       .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk,
+       .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk,
+       .leaf_max_ents = xfs_dir3_max_leaf_ents,
+       .leaf_ents_p = xfs_dir3_leaf_ents_p,
+
+       .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
+       .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+       .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
+       .node_tree_p = xfs_da3_node_tree_p,
+
+       .free_hdr_size = sizeof(struct xfs_dir3_free_hdr),
+       .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk,
+       .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk,
+       .free_max_bests = xfs_dir3_free_max_bests,
+       .free_bests_p = xfs_dir3_free_bests_p,
+       .db_to_fdb = xfs_dir3_db_to_fdb,
+       .db_to_fdindex = xfs_dir3_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir2_nondir_ops = {
+       .node_hdr_size = sizeof(struct xfs_da_node_hdr),
+       .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+       .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+       .node_tree_p = xfs_da2_node_tree_p,
+};
+
+static const struct xfs_dir_ops xfs_dir3_nondir_ops = {
+       .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
+       .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+       .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
+       .node_tree_p = xfs_da3_node_tree_p,
+};
+
+/*
+ * Return the ops structure according to the current config.  If we are passed
+ * an inode, then that overrides the default config, which is based on
+ * feature bits.
+ */
+const struct xfs_dir_ops *
+xfs_dir_get_ops(
+       struct xfs_mount        *mp,
+       struct xfs_inode        *dp)
+{
+       if (dp)
+               return dp->d_ops;
+       if (mp->m_dir_inode_ops)
+               return mp->m_dir_inode_ops;
+       if (xfs_sb_version_hascrc(&mp->m_sb))
+               return &xfs_dir3_ops;
+       if (xfs_sb_version_hasftype(&mp->m_sb))
+               return &xfs_dir2_ftype_ops;
+       return &xfs_dir2_ops;
+}
+
+const struct xfs_dir_ops *
+xfs_nondir_get_ops(
+       struct xfs_mount        *mp,
+       struct xfs_inode        *dp)
+{
+       if (dp)
+               return dp->d_ops;
+       if (mp->m_nondir_inode_ops)
+               return mp->m_nondir_inode_ops;
+       if (xfs_sb_version_hascrc(&mp->m_sb))
+               return &xfs_dir3_nondir_ops;
+       return &xfs_dir2_nondir_ops;
+}
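A minimal hypothetical caller, sketched to show the intended use of the vtable (resolve the ops once, then dispatch through the function pointers instead of re-testing feature bits at each call site):

/* Hypothetical example, not part of the patch. */
static int
example_first_entry_offset(struct xfs_mount *mp, struct xfs_inode *dp)
{
	const struct xfs_dir_ops *ops = xfs_dir_get_ops(mp, dp);

	/* v2, v2+ftype and v3 directories each report their own layout */
	return ops->data_first_offset;
}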
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
new file mode 100644 (file)
index 0000000..0a49b02
--- /dev/null
@@ -0,0 +1,861 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DA_FORMAT_H__
+#define __XFS_DA_FORMAT_H__
+
+/*
+ * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
+ *
+ * It is used to manage a doubly linked list of all blocks at the same
+ * level in the Btree, and to identify which type of block this is.
+ */
+#define XFS_DA_NODE_MAGIC      0xfebe  /* magic number: non-leaf blocks */
+#define XFS_ATTR_LEAF_MAGIC    0xfbee  /* magic number: attribute leaf blks */
+#define        XFS_DIR2_LEAF1_MAGIC    0xd2f1  /* magic number: v2 dirlf single blks */
+#define        XFS_DIR2_LEAFN_MAGIC    0xd2ff  /* magic number: v2 dirlf multi blks */
+
+typedef struct xfs_da_blkinfo {
+       __be32          forw;                   /* next block in list */
+       __be32          back;                   /* previous block in list */
+       __be16          magic;                  /* validity check on block */
+       __be16          pad;                    /* unused */
+} xfs_da_blkinfo_t;
+
+/*
+ * CRC enabled directory structure types
+ *
+ * The headers change size for the additional verification information, but
+ * otherwise the tree layouts and contents are unchanged. Hence the da btree
+ * code can use the struct xfs_da_blkinfo for manipulating the tree links and
+ * magic numbers without modification for both v2 and v3 nodes.
+ */
+#define XFS_DA3_NODE_MAGIC     0x3ebe  /* magic number: non-leaf blocks */
+#define XFS_ATTR3_LEAF_MAGIC   0x3bee  /* magic number: attribute leaf blks */
+#define        XFS_DIR3_LEAF1_MAGIC    0x3df1  /* magic number: v2 dirlf single blks */
+#define        XFS_DIR3_LEAFN_MAGIC    0x3dff  /* magic number: v2 dirlf multi blks */
+
+struct xfs_da3_blkinfo {
+       /*
+        * the node link manipulation code relies on the fact that the first
+        * element of this structure is the struct xfs_da_blkinfo so it can
+        * ignore the differences in the rest of the structures.
+        */
+       struct xfs_da_blkinfo   hdr;
+       __be32                  crc;    /* CRC of block */
+       __be64                  blkno;  /* first block of the buffer */
+       __be64                  lsn;    /* sequence number of last write */
+       uuid_t                  uuid;   /* filesystem we belong to */
+       __be64                  owner;  /* inode that owns the block */
+};
+
+/*
+ * This is the structure of the root and intermediate nodes in the Btree.
+ * The leaf nodes are defined above.
+ *
+ * Entries are not packed.
+ *
+ * Since we have duplicate keys, use a binary search but always follow
+ * all matches in the block, not just the first match found.
+ */
+#define        XFS_DA_NODE_MAXDEPTH    5       /* max depth of Btree */
+
+typedef struct xfs_da_node_hdr {
+       struct xfs_da_blkinfo   info;   /* block type, links, etc. */
+       __be16                  __count; /* count of active entries */
+       __be16                  __level; /* level above leaves (leaf == 0) */
+} xfs_da_node_hdr_t;
+
+struct xfs_da3_node_hdr {
+       struct xfs_da3_blkinfo  info;   /* block type, links, etc. */
+       __be16                  __count; /* count of active entries */
+       __be16                  __level; /* level above leaves (leaf == 0) */
+       __be32                  __pad32;
+};
+
+#define XFS_DA3_NODE_CRC_OFF   (offsetof(struct xfs_da3_node_hdr, info.crc))
+
+typedef struct xfs_da_node_entry {
+       __be32  hashval;        /* hash value for this descendant */
+       __be32  before;         /* Btree block before this key */
+} xfs_da_node_entry_t;
+
+typedef struct xfs_da_intnode {
+       struct xfs_da_node_hdr  hdr;
+       struct xfs_da_node_entry __btree[];
+} xfs_da_intnode_t;
+
+struct xfs_da3_intnode {
+       struct xfs_da3_node_hdr hdr;
+       struct xfs_da_node_entry __btree[];
+};
+
+/*
+ * In-core version of the node header to abstract the differences in the v2 and
+ * v3 disk format of the headers. Callers need to convert to/from disk format as
+ * appropriate.
+ */
+struct xfs_da3_icnode_hdr {
+       __uint32_t      forw;
+       __uint32_t      back;
+       __uint16_t      magic;
+       __uint16_t      count;
+       __uint16_t      level;
+};
+
+/*
+ * Directory version 2.
+ *
+ * There are 4 possible formats:
+ *  - shortform - embedded into the inode
+ *  - single block - data with embedded leaf at the end
+ *  - multiple data blocks, single leaf+freeindex block
+ *  - data blocks, node and leaf blocks (btree), freeindex blocks
+ *
+ * Note: many node blocks structures and constants are shared with the attr
+ * code and defined in xfs_da_btree.h.
+ */
+
+#define        XFS_DIR2_BLOCK_MAGIC    0x58443242      /* XD2B: single block dirs */
+#define        XFS_DIR2_DATA_MAGIC     0x58443244      /* XD2D: multiblock dirs */
+#define        XFS_DIR2_FREE_MAGIC     0x58443246      /* XD2F: free index blocks */
+
+/*
+ * Directory Version 3 With CRCs.
+ *
+ * The tree formats are the same as for version 2 directories.  The difference
+ * is in the block header and dirent formats. In many cases the v3 structures
+ * use v2 definitions as they are no different and this makes code sharing much
+ * easier.
+ *
+ * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the
+ * format is v2 they call into the existing v2 code, and if the format is v3
+ * they implement the v3 functionality. This means the existing dir2 code is
+ * a mix of xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are
+ * called where there is a difference in the formats; otherwise the code is
+ * unchanged.
+ *
+ * Where possible, the code decides what to do based on the magic numbers in
+ * the blocks rather than feature bits in the superblock. This keeps the code
+ * as independent of the external XFS code as possible and doesn't require
+ * passing struct xfs_mount pointers into places where they aren't really
+ * necessary.
+ *
+ * Version 3 includes:
+ *
+ *     - a larger block header for CRC and identification purposes and so the
+ *     offsets of all the structures inside the blocks are different.
+ *
+ *     - new magic numbers to be able to detect the v2/v3 types on the fly.
+ */
+
+#define        XFS_DIR3_BLOCK_MAGIC    0x58444233      /* XDB3: single block dirs */
+#define        XFS_DIR3_DATA_MAGIC     0x58444433      /* XDD3: multiblock dirs */
+#define        XFS_DIR3_FREE_MAGIC     0x58444633      /* XDF3: free index blocks */
+
+/*
+ * Dirents in version 3 directories have a file type field. Additions to this
+ * list are an on-disk format change, requiring feature bits. Valid values
+ * are as follows:
+ */
+#define XFS_DIR3_FT_UNKNOWN            0
+#define XFS_DIR3_FT_REG_FILE           1
+#define XFS_DIR3_FT_DIR                        2
+#define XFS_DIR3_FT_CHRDEV             3
+#define XFS_DIR3_FT_BLKDEV             4
+#define XFS_DIR3_FT_FIFO               5
+#define XFS_DIR3_FT_SOCK               6
+#define XFS_DIR3_FT_SYMLINK            7
+#define XFS_DIR3_FT_WHT                        8
+
+#define XFS_DIR3_FT_MAX                        9
+
+/*
+ * Byte offset in data block and shortform entry.
+ */
+typedef        __uint16_t      xfs_dir2_data_off_t;
+#define        NULLDATAOFF     0xffffU
+typedef uint           xfs_dir2_data_aoff_t;   /* argument form */
+
+/*
+ * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
+ * Only 16 bits are needed; this is the byte offset into the single block form.
+ */
+typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
+
+/*
+ * Offset in data space of a data entry.
+ */
+typedef        __uint32_t      xfs_dir2_dataptr_t;
+#define        XFS_DIR2_MAX_DATAPTR    ((xfs_dir2_dataptr_t)0xffffffff)
+#define        XFS_DIR2_NULL_DATAPTR   ((xfs_dir2_dataptr_t)0)
+
+/*
+ * Byte offset in a directory.
+ */
+typedef        xfs_off_t       xfs_dir2_off_t;
+
+/*
+ * Directory block number (logical dirblk in file)
+ */
+typedef        __uint32_t      xfs_dir2_db_t;
+
+/*
+ * Inode number stored as 8 8-bit values.
+ */
+typedef        struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
+
+/*
+ * Inode number stored as 4 8-bit values.
+ * Works a lot of the time, when all the inode numbers in a directory
+ * fit in 32 bits.
+ */
+typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
+
+typedef union {
+       xfs_dir2_ino8_t i8;
+       xfs_dir2_ino4_t i4;
+} xfs_dir2_inou_t;
+#define        XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
+
+/*
+ * Directory layout when stored internal to an inode.
+ *
+ * Small directories are packed as tightly as possible so as to fit into the
+ * literal area of the inode.  These "shortform" directories consist of a
+ * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry
+ * structures.  Due to the different inode number storage sizes and the
+ * variable length name field in the xfs_dir2_sf_entry, all these structures
+ * are variable length, and the accessors in this file should be used to
+ * iterate over them.
+ */
+typedef struct xfs_dir2_sf_hdr {
+       __uint8_t               count;          /* count of entries */
+       __uint8_t               i8count;        /* count of 8-byte inode #s */
+       xfs_dir2_inou_t         parent;         /* parent dir inode number */
+} __arch_pack xfs_dir2_sf_hdr_t;
+
+typedef struct xfs_dir2_sf_entry {
+       __u8                    namelen;        /* actual name length */
+       xfs_dir2_sf_off_t       offset;         /* saved offset */
+       __u8                    name[];         /* name, variable size */
+       /*
+        * For version 3 directory entries, a single byte containing the
+        * file type field immediately follows the name.
+        *
+        * An xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
+        * variable offset after the name.
+        */
+} __arch_pack xfs_dir2_sf_entry_t;
+
+static inline int xfs_dir2_sf_hdr_size(int i8count)
+{
+       return sizeof(struct xfs_dir2_sf_hdr) -
+               (i8count == 0) *
+               (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
+}
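As a worked example of this expression (using the packed 1 + 1 + 8 = 10 byte header above): xfs_dir2_sf_hdr_size(0) = 10 - (8 - 4) = 6 bytes with a 4-byte parent inode number, and any non-zero i8count yields the full 10 bytes.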
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
+{
+       return get_unaligned_be16(&sfep->offset.i);
+}
+
+static inline void
+xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
+{
+       put_unaligned_be16(off, &sfep->offset.i);
+}
+
+static inline struct xfs_dir2_sf_entry *
+xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
+{
+       return (struct xfs_dir2_sf_entry *)
+               ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
+}
+
+/*
+ * Data block structures.
+ *
+ * A pure data block looks like the following drawing on disk:
+ *
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_hdr_t                             |
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | ...                                             |
+ *    +-------------------------------------------------+
+ *    | unused space                                    |
+ *    +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ *
+ * In addition to the pure data blocks for the data and node formats,
+ * most structures are also used for the combined data/freespace "block"
+ * format below.
+ */
+
+#define        XFS_DIR2_DATA_ALIGN_LOG 3               /* i.e., 8 bytes */
+#define        XFS_DIR2_DATA_ALIGN     (1 << XFS_DIR2_DATA_ALIGN_LOG)
+#define        XFS_DIR2_DATA_FREE_TAG  0xffff
+#define        XFS_DIR2_DATA_FD_COUNT  3
+
+/*
+ * Directory address space divided into sections,
+ * spaces separated by 32GB.
+ */
+#define        XFS_DIR2_SPACE_SIZE     (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
+#define        XFS_DIR2_DATA_SPACE     0
+#define        XFS_DIR2_DATA_OFFSET    (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
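Plugging in the constants (XFS_DIR2_DATA_ALIGN_LOG is 3 above, so each space is 1ULL << 35 = 32GB): the data space starts at byte 0 of the directory's logical address space, and the leaf and free spaces defined further down (XFS_DIR2_LEAF_SPACE = 1, XFS_DIR2_FREE_SPACE = 2) start at 32GB and 64GB respectively.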
+
+/*
+ * Describe a free area in the data block.
+ *
+ * The freespace will be formatted as a xfs_dir2_data_unused_t.
+ */
+typedef struct xfs_dir2_data_free {
+       __be16                  offset;         /* start of freespace */
+       __be16                  length;         /* length of freespace */
+} xfs_dir2_data_free_t;
+
+/*
+ * Header for the data blocks.
+ *
+ * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
+ */
+typedef struct xfs_dir2_data_hdr {
+       __be32                  magic;          /* XFS_DIR2_DATA_MAGIC or */
+                                               /* XFS_DIR2_BLOCK_MAGIC */
+       xfs_dir2_data_free_t    bestfree[XFS_DIR2_DATA_FD_COUNT];
+} xfs_dir2_data_hdr_t;
+
+/*
+ * Define a structure for all the verification fields we are adding to the
+ * directory block structures. This will be used in several structures.
+ * The magic number must be the first entry, aligned with all the dir2
+ * structures, so we can determine how to decode a block just by its magic
+ * number.
+ */
+struct xfs_dir3_blk_hdr {
+       __be32                  magic;  /* magic number */
+       __be32                  crc;    /* CRC of block */
+       __be64                  blkno;  /* first block of the buffer */
+       __be64                  lsn;    /* sequence number of last write */
+       uuid_t                  uuid;   /* filesystem we belong to */
+       __be64                  owner;  /* inode that owns the block */
+};
+
+struct xfs_dir3_data_hdr {
+       struct xfs_dir3_blk_hdr hdr;
+       xfs_dir2_data_free_t    best_free[XFS_DIR2_DATA_FD_COUNT];
+       __be32                  pad;    /* 64 bit alignment */
+};
+
+#define XFS_DIR3_DATA_CRC_OFF  offsetof(struct xfs_dir3_data_hdr, hdr.crc)
+
+/*
+ * Active entry in a data block.
+ *
+ * Aligned to 8 bytes.  After the variable length name field there is a
+ * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p.
+ *
+ * For dir3 structures, there is a file type field between the name and the tag.
+ * This can only be manipulated by helper functions. It is packed hard against
+ * the end of the name so any padding for rounding is between the file type and
+ * the tag.
+ */
+typedef struct xfs_dir2_data_entry {
+       __be64                  inumber;        /* inode number */
+       __u8                    namelen;        /* name length */
+       __u8                    name[];         /* name bytes, no null */
+     /* __u8                   filetype; */    /* type of inode we point to */
+     /* __be16                 tag; */         /* starting offset of us */
+} xfs_dir2_data_entry_t;
+
+/*
+ * Unused entry in a data block.
+ *
+ * Aligned to 8 bytes.  Tag appears as the last 2 bytes and must be accessed
+ * using xfs_dir2_data_unused_tag_p.
+ */
+typedef struct xfs_dir2_data_unused {
+       __be16                  freetag;        /* XFS_DIR2_DATA_FREE_TAG */
+       __be16                  length;         /* total free length */
+                                               /* variable offset */
+       __be16                  tag;            /* starting offset of us */
+} xfs_dir2_data_unused_t;
+
+/*
+ * Pointer to a freespace's tag word.
+ */
+static inline __be16 *
+xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
+{
+       return (__be16 *)((char *)dup +
+                       be16_to_cpu(dup->length) - sizeof(__be16));
+}
+
+/*
+ * Leaf block structures.
+ *
+ * A pure leaf block looks like the following drawing on disk:
+ *
+ *    +---------------------------+
+ *    | xfs_dir2_leaf_hdr_t       |
+ *    +---------------------------+
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | ...                       |
+ *    +---------------------------+
+ *    | xfs_dir2_data_off_t       |
+ *    | xfs_dir2_data_off_t       |
+ *    | xfs_dir2_data_off_t       |
+ *    | ...                       |
+ *    +---------------------------+
+ *    | xfs_dir2_leaf_tail_t      |
+ *    +---------------------------+
+ *
+ * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block
+ * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present
+ * for directories with separate leaf nodes and free space blocks
+ * (magic = XFS_DIR2_LEAFN_MAGIC).
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+
+/*
+ * Offset of the leaf/node space.  First block in this space
+ * is the btree root.
+ */
+#define        XFS_DIR2_LEAF_SPACE     1
+#define        XFS_DIR2_LEAF_OFFSET    (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
+
+/*
+ * Leaf block header.
+ */
+typedef struct xfs_dir2_leaf_hdr {
+       xfs_da_blkinfo_t        info;           /* header for da routines */
+       __be16                  count;          /* count of entries */
+       __be16                  stale;          /* count of stale entries */
+} xfs_dir2_leaf_hdr_t;
+
+struct xfs_dir3_leaf_hdr {
+       struct xfs_da3_blkinfo  info;           /* header for da routines */
+       __be16                  count;          /* count of entries */
+       __be16                  stale;          /* count of stale entries */
+       __be32                  pad;            /* 64 bit alignment */
+};
+
+struct xfs_dir3_icleaf_hdr {
+       __uint32_t              forw;
+       __uint32_t              back;
+       __uint16_t              magic;
+       __uint16_t              count;
+       __uint16_t              stale;
+};
+
+/*
+ * Leaf block entry.
+ */
+typedef struct xfs_dir2_leaf_entry {
+       __be32                  hashval;        /* hash value of name */
+       __be32                  address;        /* address of data entry */
+} xfs_dir2_leaf_entry_t;
+
+/*
+ * Leaf block tail.
+ */
+typedef struct xfs_dir2_leaf_tail {
+       __be32                  bestcount;
+} xfs_dir2_leaf_tail_t;
+
+/*
+ * Leaf block.
+ */
+typedef struct xfs_dir2_leaf {
+       xfs_dir2_leaf_hdr_t     hdr;                    /* leaf header */
+       xfs_dir2_leaf_entry_t   __ents[];               /* entries */
+} xfs_dir2_leaf_t;
+
+struct xfs_dir3_leaf {
+       struct xfs_dir3_leaf_hdr        hdr;            /* leaf header */
+       struct xfs_dir2_leaf_entry      __ents[];       /* entries */
+};
+
+#define XFS_DIR3_LEAF_CRC_OFF  offsetof(struct xfs_dir3_leaf_hdr, info.crc)
+
+/*
+ * Get address of the bests array in the single-leaf block.
+ */
+static inline __be16 *
+xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
+{
+       return (__be16 *)ltp - be32_to_cpu(ltp->bestcount);
+}
+
+/*
+ * Free space block definitions for the node format.
+ */
+
+/*
+ * Offset of the freespace index.
+ */
+#define        XFS_DIR2_FREE_SPACE     2
+#define        XFS_DIR2_FREE_OFFSET    (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
+
+typedef        struct xfs_dir2_free_hdr {
+       __be32                  magic;          /* XFS_DIR2_FREE_MAGIC */
+       __be32                  firstdb;        /* db of first entry */
+       __be32                  nvalid;         /* count of valid entries */
+       __be32                  nused;          /* count of used entries */
+} xfs_dir2_free_hdr_t;
+
+typedef struct xfs_dir2_free {
+       xfs_dir2_free_hdr_t     hdr;            /* block header */
+       __be16                  bests[];        /* best free counts */
+                                               /* unused entries are -1 */
+} xfs_dir2_free_t;
+
+struct xfs_dir3_free_hdr {
+       struct xfs_dir3_blk_hdr hdr;
+       __be32                  firstdb;        /* db of first entry */
+       __be32                  nvalid;         /* count of valid entries */
+       __be32                  nused;          /* count of used entries */
+       __be32                  pad;            /* 64 bit alignment */
+};
+
+struct xfs_dir3_free {
+       struct xfs_dir3_free_hdr hdr;
+       __be16                  bests[];        /* best free counts */
+                                               /* unused entries are -1 */
+};
+
+#define XFS_DIR3_FREE_CRC_OFF  offsetof(struct xfs_dir3_free, hdr.hdr.crc)
+
+/*
+ * In-core version of the free block header, abstracted away from on-disk format
+ * differences. Use this in the code, and convert to/from the disk version using
+ * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
+ */
+struct xfs_dir3_icfree_hdr {
+       __uint32_t      magic;
+       __uint32_t      firstdb;
+       __uint32_t      nvalid;
+       __uint32_t      nused;
+};
+
+/*
+ * Single block format.
+ *
+ * The single block format looks like the following drawing on disk:
+ *
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_hdr_t                             |
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | ...                                             |
+ *    +-------------------------------------------------+
+ *    | unused space                                    |
+ *    +-------------------------------------------------+
+ *    | ...                                             |
+ *    | xfs_dir2_leaf_entry_t                           |
+ *    | xfs_dir2_leaf_entry_t                           |
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_block_tail_t                           |
+ *    +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+
+typedef struct xfs_dir2_block_tail {
+       __be32          count;                  /* count of leaf entries */
+       __be32          stale;                  /* count of stale lf entries */
+} xfs_dir2_block_tail_t;
+
+/*
+ * Pointer to the leaf entries embedded in a data block (1-block format)
+ */
+static inline struct xfs_dir2_leaf_entry *
+xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
+{
+       return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count);
+}
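A hedged sketch of how the back-pointer arithmetic composes for a single-block directory (illustrative helper only; in real code the block size would come from the da geometry):

/* Illustrative only: find the embedded leaf entries of a 1-block dir. */
static struct xfs_dir2_leaf_entry *
example_block_leaf(struct xfs_dir2_data_hdr *hdr, unsigned int blksize)
{
	struct xfs_dir2_block_tail *btp;

	/* the tail occupies the last bytes of the block ... */
	btp = (struct xfs_dir2_block_tail *)((char *)hdr + blksize) - 1;
	/* ... and the leaf entries sit immediately before it */
	return xfs_dir2_block_leaf_p(btp);
}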
+
+
+/*
+ * Attribute storage layout
+ *
+ * Attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes.  Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree.  Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys.  The
+ * internal links in the Btree are logical block offsets into the file.
+ *
+ * Struct leaf_entry's are packed from the top.  Name/values grow from the
+ * bottom but are not packed.  The freemap contains run-length-encoded entries
+ * for the free bytes after the leaf_entry's, but only the N largest such
+ * regions; smaller runs are dropped.  When the freemap doesn't show enough space
+ * for an allocation, we compact the name/value area and try again.  If we
+ * still don't have enough space, then we have to split the block.  The
+ * name/value structs (both local and remote versions) must be 32bit aligned.
+ *
+ * Since we have duplicate hash keys, for each key that matches, compare
+ * the actual name string.  The root and intermediate node search always
+ * takes the first-in-the-block key match found, so we should only have
+ * to work "forw"ard.  If none matches, continue with the "forw"ard leaf
+ * nodes until the hash key changes or the attribute name is found.
+ *
+ * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
+ * the leaf_entry.  The namespaces are independent only because we also look
+ * at the namespace bit when we are looking for a matching attribute name.
+ *
+ * We also store an "incomplete" bit in the leaf_entry.  It shows that an
+ * attribute is in the middle of being created and should not be shown to
+ * the user if we crash during the time that the bit is set.  We clear the
+ * bit when we have finished setting up the attribute.  We do this because
+ * we cannot create some large attributes inside a single transaction, and we
+ * need some indication that we weren't finished if we crash in the middle.
+ */
+#define XFS_ATTR_LEAF_MAPSIZE  3       /* how many freespace slots */
+
+typedef struct xfs_attr_leaf_map {     /* RLE map of free bytes */
+       __be16  base;                     /* base of free region */
+       __be16  size;                     /* length of free region */
+} xfs_attr_leaf_map_t;
+
+typedef struct xfs_attr_leaf_hdr {     /* constant-structure header block */
+       xfs_da_blkinfo_t info;          /* block type, links, etc. */
+       __be16  count;                  /* count of active leaf_entry's */
+       __be16  usedbytes;              /* num bytes of names/values stored */
+       __be16  firstused;              /* first used byte in name area */
+       __u8    holes;                  /* != 0 if blk needs compaction */
+       __u8    pad1;
+       xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
+                                       /* N largest free regions */
+} xfs_attr_leaf_hdr_t;
+
+typedef struct xfs_attr_leaf_entry {   /* sorted on key, not name */
+       __be32  hashval;                /* hash value of name */
+       __be16  nameidx;                /* index into buffer of name/value */
+       __u8    flags;                  /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
+       __u8    pad2;                   /* unused pad byte */
+} xfs_attr_leaf_entry_t;
+
+typedef struct xfs_attr_leaf_name_local {
+       __be16  valuelen;               /* number of bytes in value */
+       __u8    namelen;                /* length of name bytes */
+       __u8    nameval[1];             /* name/value bytes */
+} xfs_attr_leaf_name_local_t;
+
+typedef struct xfs_attr_leaf_name_remote {
+       __be32  valueblk;               /* block number of value bytes */
+       __be32  valuelen;               /* number of bytes in value */
+       __u8    namelen;                /* length of name bytes */
+       __u8    name[1];                /* name bytes */
+} xfs_attr_leaf_name_remote_t;
+
+typedef struct xfs_attr_leafblock {
+       xfs_attr_leaf_hdr_t     hdr;    /* constant-structure header block */
+       xfs_attr_leaf_entry_t   entries[1];     /* sorted on key, not name */
+       xfs_attr_leaf_name_local_t namelist;    /* grows from bottom of buf */
+       xfs_attr_leaf_name_remote_t valuelist;  /* grows from bottom of buf */
+} xfs_attr_leafblock_t;
+
+/*
+ * CRC enabled leaf structures. Called "version 3" structures to match the
+ * version number of the directory and dablk structures for this feature;
+ * "attr2" is already taken by the variable inode attribute fork size feature.
+ */
+struct xfs_attr3_leaf_hdr {
+       struct xfs_da3_blkinfo  info;
+       __be16                  count;
+       __be16                  usedbytes;
+       __be16                  firstused;
+       __u8                    holes;
+       __u8                    pad1;
+       struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
+       __be32                  pad2;           /* 64 bit alignment */
+};
+
+#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc))
+
+struct xfs_attr3_leafblock {
+       struct xfs_attr3_leaf_hdr       hdr;
+       struct xfs_attr_leaf_entry      entries[1];
+
+       /*
+        * The rest of the block contains the following structures after the
+        * leaf entries, growing from the bottom up. The variables are never
+        * referenced directly; the locations are accessed purely from helper
+        * functions.
+        *
+        * struct xfs_attr_leaf_name_local
+        * struct xfs_attr_leaf_name_remote
+        */
+};
+
+/*
+ * incore, neutral version of the attribute leaf header
+ */
+struct xfs_attr3_icleaf_hdr {
+       __uint32_t      forw;
+       __uint32_t      back;
+       __uint16_t      magic;
+       __uint16_t      count;
+       __uint16_t      usedbytes;
+       __uint16_t      firstused;
+       __u8            holes;
+       struct {
+               __uint16_t      base;
+               __uint16_t      size;
+       } freemap[XFS_ATTR_LEAF_MAPSIZE];
+};
+
+/*
+ * Flags used in the leaf_entry[i].flags field.
+ * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
+ * on the system call; they are "or"ed together for various operations.
+ */
+#define        XFS_ATTR_LOCAL_BIT      0       /* attr is stored locally */
+#define        XFS_ATTR_ROOT_BIT       1       /* limit access to trusted attrs */
+#define        XFS_ATTR_SECURE_BIT     2       /* limit access to secure attrs */
+#define        XFS_ATTR_INCOMPLETE_BIT 7       /* attr in middle of create/delete */
+#define XFS_ATTR_LOCAL         (1 << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT          (1 << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE                (1 << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE    (1 << XFS_ATTR_INCOMPLETE_BIT)
+
+/*
+ * Conversion macros for converting namespace bits from argument flags
+ * to ondisk flags.
+ */
+#define XFS_ATTR_NSP_ARGS_MASK         (ATTR_ROOT | ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK_MASK       (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK(flags)     ((flags) & XFS_ATTR_NSP_ONDISK_MASK)
+#define XFS_ATTR_NSP_ARGS(flags)       ((flags) & XFS_ATTR_NSP_ARGS_MASK)
+#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
+                                        ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
+#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
+                                        ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
+
+/*
+ * Alignment for namelist and valuelist entries (since they are mixed
+ * there can be only one alignment value)
+ */
+#define        XFS_ATTR_LEAF_NAME_ALIGN        ((uint)sizeof(xfs_dablk_t))
+
+static inline int
+xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp)
+{
+       if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+               return sizeof(struct xfs_attr3_leaf_hdr);
+       return sizeof(struct xfs_attr_leaf_hdr);
+}
+
+static inline struct xfs_attr_leaf_entry *
+xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp)
+{
+       if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+               return &((struct xfs_attr3_leafblock *)leafp)->entries[0];
+       return &leafp->entries[0];
+}
+
+/*
+ * Cast typed pointers for "local" and "remote" name/value structs.
+ */
+static inline char *
+xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
+{
+       struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
+
+       return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
+}
+
+static inline xfs_attr_leaf_name_remote_t *
+xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
+{
+       return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+static inline xfs_attr_leaf_name_local_t *
+xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
+{
+       return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+/*
+ * Calculate total bytes used (including trailing pad for alignment) for
+ * a "local" name/value structure, a "remote" name/value structure, and
+ * a pointer which might be either.
+ */
+static inline int xfs_attr_leaf_entsize_remote(int nlen)
+{
+       return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) +
+               XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+}
+
+static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
+{
+       return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
+               XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+}
+
+static inline int xfs_attr_leaf_entsize_local_max(int bsize)
+{
+       return (((bsize) >> 1) + ((bsize) >> 2));
+}
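Worked example (assuming XFS_ATTR_LEAF_NAME_ALIGN is 4, i.e. sizeof(xfs_dablk_t), and the 4- and 10-byte header sizes implied by the local/remote structs above): a local attribute with a 4-byte name and a 12-byte value takes (4 - 1 + 4 + 12) rounded up to a multiple of 4 = 20 bytes; a remote entry for the same name takes (10 - 1 + 4) rounded up = 16 bytes; and xfs_attr_leaf_entsize_local_max(4096) = 2048 + 1024 = 3072 bytes.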
+
+
+
+/*
+ * Remote attribute block format definition
+ *
+ * There is one of these headers per filesystem block in a remote attribute.
+ * This is done to ensure there is a 1:1 mapping between the attribute value
+ * length and the number of blocks needed to store the attribute. This makes the
+ * verification of a buffer a little more complex, but greatly simplifies the
+ * allocation, reading and writing of these attributes as we don't have to guess
+ * the number of blocks needed to store the attribute data.
+ */
+#define XFS_ATTR3_RMT_MAGIC    0x5841524d      /* XARM */
+
+struct xfs_attr3_rmt_hdr {
+       __be32  rm_magic;
+       __be32  rm_offset;
+       __be32  rm_bytes;
+       __be32  rm_crc;
+       uuid_t  rm_uuid;
+       __be64  rm_owner;
+       __be64  rm_blkno;
+       __be64  rm_lsn;
+};
+
+#define XFS_ATTR3_RMT_CRC_OFF  offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
+
+#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize)   \
+       ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+                       sizeof(struct xfs_attr3_rmt_hdr) : 0))
+
+#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_dinode.h b/fs/xfs/libxfs/xfs_dinode.h
new file mode 100644 (file)
index 0000000..623bbe8
--- /dev/null
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DINODE_H__
+#define        __XFS_DINODE_H__
+
+#define        XFS_DINODE_MAGIC                0x494e  /* 'IN' */
+#define XFS_DINODE_GOOD_VERSION(v)     ((v) >= 1 && (v) <= 3)
+
+typedef struct xfs_timestamp {
+       __be32          t_sec;          /* timestamp seconds */
+       __be32          t_nsec;         /* timestamp nanoseconds */
+} xfs_timestamp_t;
+
+/*
+ * On-disk inode structure.
+ *
+ * This is just the header or "dinode core"; the inode is expanded to fill a
+ * variable size, with the leftover area split into a data and an attribute
+ * fork.
+ * The format of the data and attribute fork depends on the format of the
+ * inode as indicated by di_format and di_aformat.  To access the data and
+ * attribute forks, use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR
+ * macros below.
+ *
+ * There is a very similar struct icdinode in xfs_inode which matches the
+ * layout of the first 96 bytes of this structure, but is kept in native
+ * format instead of big endian.
+ *
+ * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
+ * padding field for v3 inodes.
+ */
+typedef struct xfs_dinode {
+       __be16          di_magic;       /* inode magic # = XFS_DINODE_MAGIC */
+       __be16          di_mode;        /* mode and type of file */
+       __u8            di_version;     /* inode version */
+       __u8            di_format;      /* format of di_c data */
+       __be16          di_onlink;      /* old number of links to file */
+       __be32          di_uid;         /* owner's user id */
+       __be32          di_gid;         /* owner's group id */
+       __be32          di_nlink;       /* number of links to file */
+       __be16          di_projid_lo;   /* lower part of owner's project id */
+       __be16          di_projid_hi;   /* higher part of owner's project id */
+       __u8            di_pad[6];      /* unused, zeroed space */
+       __be16          di_flushiter;   /* incremented on flush */
+       xfs_timestamp_t di_atime;       /* time last accessed */
+       xfs_timestamp_t di_mtime;       /* time last modified */
+       xfs_timestamp_t di_ctime;       /* time created/inode modified */
+       __be64          di_size;        /* number of bytes in file */
+       __be64          di_nblocks;     /* # of direct & btree blocks used */
+       __be32          di_extsize;     /* basic/minimum extent size for file */
+       __be32          di_nextents;    /* number of extents in data fork */
+       __be16          di_anextents;   /* number of extents in attribute fork*/
+       __u8            di_forkoff;     /* attr fork offs, <<3 for 64b align */
+       __s8            di_aformat;     /* format of attr fork's data */
+       __be32          di_dmevmask;    /* DMIG event mask */
+       __be16          di_dmstate;     /* DMIG state info */
+       __be16          di_flags;       /* random flags, XFS_DIFLAG_... */
+       __be32          di_gen;         /* generation number */
+
+       /* di_next_unlinked is the only non-core field in the old dinode */
+       __be32          di_next_unlinked;/* agi unlinked list ptr */
+
+       /* start of the extended dinode, writable fields */
+       __le32          di_crc;         /* CRC of the inode */
+       __be64          di_changecount; /* number of attribute changes */
+       __be64          di_lsn;         /* flush sequence */
+       __be64          di_flags2;      /* more random flags */
+       __u8            di_pad2[16];    /* more padding for future expansion */
+
+       /* fields only written to during inode creation */
+       xfs_timestamp_t di_crtime;      /* time created */
+       __be64          di_ino;         /* inode number */
+       uuid_t          di_uuid;        /* UUID of the filesystem */
+
+       /* structure must be padded to 64 bit alignment */
+} xfs_dinode_t;
+
+#define XFS_DINODE_CRC_OFF     offsetof(struct xfs_dinode, di_crc)
+
+#define DI_MAX_FLUSH 0xffff
+
+/*
+ * Size of the core inode on disk.  Version 1 and 2 inodes have
+ * the same size, but version 3 has grown a few additional fields.
+ */
+static inline uint xfs_dinode_size(int version)
+{
+       if (version == 3)
+               return sizeof(struct xfs_dinode);
+       return offsetof(struct xfs_dinode, di_crc);
+}
+
+/*
+ * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
+ * Since the pathconf interface is signed, we use 2^31 - 1 instead.
+ * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
+ */
+#define        XFS_MAXLINK             ((1U << 31) - 1U)
+#define        XFS_MAXLINK_1           65535U
+
+/*
+ * Values for di_format
+ */
+typedef enum xfs_dinode_fmt {
+       XFS_DINODE_FMT_DEV,             /* xfs_dev_t */
+       XFS_DINODE_FMT_LOCAL,           /* bulk data */
+       XFS_DINODE_FMT_EXTENTS,         /* struct xfs_bmbt_rec */
+       XFS_DINODE_FMT_BTREE,           /* struct xfs_bmdr_block */
+       XFS_DINODE_FMT_UUID             /* uuid_t */
+} xfs_dinode_fmt_t;
+
+/*
+ * Inode minimum and maximum sizes.
+ */
+#define        XFS_DINODE_MIN_LOG      8
+#define        XFS_DINODE_MAX_LOG      11
+#define        XFS_DINODE_MIN_SIZE     (1 << XFS_DINODE_MIN_LOG)
+#define        XFS_DINODE_MAX_SIZE     (1 << XFS_DINODE_MAX_LOG)
+
+/*
+ * Inode size for given fs.
+ */
+#define XFS_LITINO(mp, version) \
+       ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
+
+/*
+ * Inode data & attribute fork sizes, per inode.
+ */
+#define XFS_DFORK_Q(dip)               ((dip)->di_forkoff != 0)
+#define XFS_DFORK_BOFF(dip)            ((int)((dip)->di_forkoff << 3))
+
+#define XFS_DFORK_DSIZE(dip,mp) \
+       (XFS_DFORK_Q(dip) ? \
+               XFS_DFORK_BOFF(dip) : \
+               XFS_LITINO(mp, (dip)->di_version))
+#define XFS_DFORK_ASIZE(dip,mp) \
+       (XFS_DFORK_Q(dip) ? \
+               XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
+               0)
+#define XFS_DFORK_SIZE(dip,mp,w) \
+       ((w) == XFS_DATA_FORK ? \
+               XFS_DFORK_DSIZE(dip, mp) : \
+               XFS_DFORK_ASIZE(dip, mp))
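
To make the fork sizing above concrete, here is a minimal userspace sketch
(not kernel code) of the same arithmetic, assuming a hypothetical 512-byte
v3 inode with di_forkoff == 15; the 176- and 100-byte core sizes follow from
the struct layout and xfs_dinode_size() above.

    #include <stdio.h>

    /* Core sizes implied by the layout above: full v3 dinode vs v1/v2 core. */
    static unsigned dinode_size(int version)
    {
            return version == 3 ? 176 : 100;
    }

    int main(void)
    {
            unsigned inodesize = 512;  /* hypothetical sb_inodesize */
            unsigned forkoff = 15;     /* hypothetical di_forkoff (8-byte units) */
            unsigned litino = inodesize - dinode_size(3);  /* XFS_LITINO */
            unsigned boff = forkoff << 3;                  /* XFS_DFORK_BOFF */

            /* With an attr fork present, the literal area splits at boff. */
            printf("data fork: %u bytes, attr fork: %u bytes\n",
                   forkoff ? boff : litino, forkoff ? litino - boff : 0);
            return 0;
    }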
+
+/*
+ * Return pointers to the data or attribute forks.
+ */
+#define XFS_DFORK_DPTR(dip) \
+       ((char *)dip + xfs_dinode_size(dip->di_version))
+#define XFS_DFORK_APTR(dip)    \
+       (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
+#define XFS_DFORK_PTR(dip,w)   \
+       ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
+
+#define XFS_DFORK_FORMAT(dip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               (dip)->di_format : \
+               (dip)->di_aformat)
+#define XFS_DFORK_NEXTENTS(dip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               be32_to_cpu((dip)->di_nextents) : \
+               be16_to_cpu((dip)->di_anextents))
+
+#define        XFS_BUF_TO_DINODE(bp)   ((xfs_dinode_t *)((bp)->b_addr))
+
+/*
+ * For block and character special files the 32bit dev_t is stored at the
+ * beginning of the data fork.
+ */
+static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
+{
+       return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
+}
+
+static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
+{
+       *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
+}
+
+/*
+ * Values for di_flags
+ * There should be a one-to-one correspondence between these flags and the
+ * XFS_XFLAG_s.
+ */
+#define XFS_DIFLAG_REALTIME_BIT  0     /* file's blocks come from rt area */
+#define XFS_DIFLAG_PREALLOC_BIT  1     /* file space has been preallocated */
+#define XFS_DIFLAG_NEWRTBM_BIT   2     /* for rtbitmap inode, new format */
+#define XFS_DIFLAG_IMMUTABLE_BIT 3     /* inode is immutable */
+#define XFS_DIFLAG_APPEND_BIT    4     /* inode is append-only */
+#define XFS_DIFLAG_SYNC_BIT      5     /* inode is written synchronously */
+#define XFS_DIFLAG_NOATIME_BIT   6     /* do not update atime */
+#define XFS_DIFLAG_NODUMP_BIT    7     /* do not dump */
+#define XFS_DIFLAG_RTINHERIT_BIT 8     /* create with realtime bit set */
+#define XFS_DIFLAG_PROJINHERIT_BIT   9 /* create with parents projid */
+#define XFS_DIFLAG_NOSYMLINKS_BIT   10 /* disallow symlink creation */
+#define XFS_DIFLAG_EXTSIZE_BIT      11 /* inode extent size allocator hint */
+#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
+#define XFS_DIFLAG_NODEFRAG_BIT     13 /* do not reorganize/defragment */
+#define XFS_DIFLAG_FILESTREAM_BIT   14  /* use filestream allocator */
+#define XFS_DIFLAG_REALTIME      (1 << XFS_DIFLAG_REALTIME_BIT)
+#define XFS_DIFLAG_PREALLOC      (1 << XFS_DIFLAG_PREALLOC_BIT)
+#define XFS_DIFLAG_NEWRTBM       (1 << XFS_DIFLAG_NEWRTBM_BIT)
+#define XFS_DIFLAG_IMMUTABLE     (1 << XFS_DIFLAG_IMMUTABLE_BIT)
+#define XFS_DIFLAG_APPEND        (1 << XFS_DIFLAG_APPEND_BIT)
+#define XFS_DIFLAG_SYNC          (1 << XFS_DIFLAG_SYNC_BIT)
+#define XFS_DIFLAG_NOATIME       (1 << XFS_DIFLAG_NOATIME_BIT)
+#define XFS_DIFLAG_NODUMP        (1 << XFS_DIFLAG_NODUMP_BIT)
+#define XFS_DIFLAG_RTINHERIT     (1 << XFS_DIFLAG_RTINHERIT_BIT)
+#define XFS_DIFLAG_PROJINHERIT   (1 << XFS_DIFLAG_PROJINHERIT_BIT)
+#define XFS_DIFLAG_NOSYMLINKS    (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
+#define XFS_DIFLAG_EXTSIZE       (1 << XFS_DIFLAG_EXTSIZE_BIT)
+#define XFS_DIFLAG_EXTSZINHERIT  (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
+#define XFS_DIFLAG_NODEFRAG      (1 << XFS_DIFLAG_NODEFRAG_BIT)
+#define XFS_DIFLAG_FILESTREAM    (1 << XFS_DIFLAG_FILESTREAM_BIT)
+
+#ifdef CONFIG_XFS_RT
+#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
+#else
+#define XFS_IS_REALTIME_INODE(ip) (0)
+#endif
+
+#define XFS_DIFLAG_ANY \
+       (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
+        XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
+        XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
+        XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
+        XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
+
+#endif /* __XFS_DINODE_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
new file mode 100644 (file)
index 0000000..6cef221
--- /dev/null
@@ -0,0 +1,762 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_dinode.h"
+
+struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
+
+
+/*
+ * ASCII case-insensitive (i.e. A-Z) support for directories, as used
+ * in IRIX.
+ */
+STATIC xfs_dahash_t
+xfs_ascii_ci_hashname(
+       struct xfs_name *name)
+{
+       xfs_dahash_t    hash;
+       int             i;
+
+       for (i = 0, hash = 0; i < name->len; i++)
+               hash = tolower(name->name[i]) ^ rol32(hash, 7);
+
+       return hash;
+}
+
+STATIC enum xfs_dacmp
+xfs_ascii_ci_compname(
+       struct xfs_da_args *args,
+       const unsigned char *name,
+       int             len)
+{
+       enum xfs_dacmp  result;
+       int             i;
+
+       if (args->namelen != len)
+               return XFS_CMP_DIFFERENT;
+
+       result = XFS_CMP_EXACT;
+       for (i = 0; i < len; i++) {
+               if (args->name[i] == name[i])
+                       continue;
+               if (tolower(args->name[i]) != tolower(name[i]))
+                       return XFS_CMP_DIFFERENT;
+               result = XFS_CMP_CASE;
+       }
+
+       return result;
+}
+
+static struct xfs_nameops xfs_ascii_ci_nameops = {
+       .hashname       = xfs_ascii_ci_hashname,
+       .compname       = xfs_ascii_ci_compname,
+};
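
A self-contained userspace sketch of the hash above (helper names are
hypothetical; rol32() is the usual 32-bit rotate-left the kernel provides),
showing that case-folded names collide by construction:

    #include <ctype.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t rol32(uint32_t w, unsigned s)
    {
            return (w << s) | (w >> (32 - s));
    }

    static uint32_t ascii_ci_hash(const unsigned char *name, int len)
    {
            uint32_t hash = 0;
            int i;

            for (i = 0; i < len; i++)
                    hash = tolower(name[i]) ^ rol32(hash, 7);
            return hash;
    }

    int main(void)
    {
            /* "FOO" and "foo" hash identically, so a CI lookup finds both. */
            printf("%#x %#x\n",
                   (unsigned)ascii_ci_hash((const unsigned char *)"FOO", 3),
                   (unsigned)ascii_ci_hash((const unsigned char *)"foo", 3));
            return 0;
    }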
+
+int
+xfs_da_mount(
+       struct xfs_mount        *mp)
+{
+       struct xfs_da_geometry  *dageo;
+       int                     nodehdr_size;
+
+
+       ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
+       ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
+              XFS_MAX_BLOCKSIZE);
+
+       mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
+       mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL);
+
+       nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
+       mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+                                   KM_SLEEP | KM_MAYFAIL);
+       mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+                                    KM_SLEEP | KM_MAYFAIL);
+       if (!mp->m_dir_geo || !mp->m_attr_geo) {
+               kmem_free(mp->m_dir_geo);
+               kmem_free(mp->m_attr_geo);
+               return -ENOMEM;
+       }
+
+       /* set up directory geometry */
+       dageo = mp->m_dir_geo;
+       dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog;
+       dageo->fsblog = mp->m_sb.sb_blocklog;
+       dageo->blksize = 1 << dageo->blklog;
+       dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
+
+       /*
+        * Now we've set up the block conversion variables, we can calculate the
+        * segment block constants using the geometry structure.
+        */
+       dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET);
+       dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET);
+       dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
+       dageo->node_ents = (dageo->blksize - nodehdr_size) /
+                               (uint)sizeof(xfs_da_node_entry_t);
+       dageo->magicpct = (dageo->blksize * 37) / 100;
+
+       /* set up attribute geometry - single fsb only */
+       dageo = mp->m_attr_geo;
+       dageo->blklog = mp->m_sb.sb_blocklog;
+       dageo->fsblog = mp->m_sb.sb_blocklog;
+       dageo->blksize = 1 << dageo->blklog;
+       dageo->fsbcount = 1;
+       dageo->node_ents = (dageo->blksize - nodehdr_size) /
+                               (uint)sizeof(xfs_da_node_entry_t);
+       dageo->magicpct = (dageo->blksize * 37) / 100;
+
+       if (xfs_sb_version_hasasciici(&mp->m_sb))
+               mp->m_dirnameops = &xfs_ascii_ci_nameops;
+       else
+               mp->m_dirnameops = &xfs_default_nameops;
+
+       return 0;
+}
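
A hedged worked example of the geometry set up above: assume a 4k-block
filesystem (sb_blocklog == 12) with sb_dirblklog == 2, and the 32GB-per-space
directory address layout that the XFS_DIR2_*_OFFSET constants conventionally
encode (an assumption here, since those constants are defined elsewhere):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned sb_blocklog = 12, sb_dirblklog = 2;
            unsigned blklog = sb_blocklog + sb_dirblklog; /* dageo->blklog == 14 */
            uint64_t space = 1ULL << 35;  /* assumed XFS_DIR2_SPACE_SIZE, 32GB */

            printf("blksize=%u fsbcount=%u magicpct=%u\n",
                   1u << blklog, 1u << sb_dirblklog,
                   ((1u << blklog) * 37) / 100);
            printf("datablk=%llu leafblk=%llu freeblk=%llu\n",
                   0ULL,
                   (unsigned long long)(space >> blklog),
                   (unsigned long long)((2 * space) >> blklog));
            return 0;
    }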
+
+void
+xfs_da_unmount(
+       struct xfs_mount        *mp)
+{
+       kmem_free(mp->m_dir_geo);
+       kmem_free(mp->m_attr_geo);
+}
+
+/*
+ * Return 1 if directory contains only "." and "..".
+ */
+int
+xfs_dir_isempty(
+       xfs_inode_t     *dp)
+{
+       xfs_dir2_sf_hdr_t       *sfp;
+
+       ASSERT(S_ISDIR(dp->i_d.di_mode));
+       if (dp->i_d.di_size == 0)       /* might happen during shutdown. */
+               return 1;
+       if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
+               return 0;
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       return !sfp->count;
+}
+
+/*
+ * Validate a given inode number.
+ */
+int
+xfs_dir_ino_validate(
+       xfs_mount_t     *mp,
+       xfs_ino_t       ino)
+{
+       xfs_agblock_t   agblkno;
+       xfs_agino_t     agino;
+       xfs_agnumber_t  agno;
+       int             ino_ok;
+       int             ioff;
+
+       agno = XFS_INO_TO_AGNO(mp, ino);
+       agblkno = XFS_INO_TO_AGBNO(mp, ino);
+       ioff = XFS_INO_TO_OFFSET(mp, ino);
+       agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
+       ino_ok =
+               agno < mp->m_sb.sb_agcount &&
+               agblkno < mp->m_sb.sb_agblocks &&
+               agblkno != 0 &&
+               ioff < (1 << mp->m_sb.sb_inopblog) &&
+               XFS_AGINO_TO_INO(mp, agno, agino) == ino;
+       if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
+                       XFS_RANDOM_DIR_INO_VALIDATE))) {
+               xfs_warn(mp, "Invalid inode number 0x%Lx",
+                               (unsigned long long) ino);
+               XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
+               return -EFSCORRUPTED;
+       }
+       return 0;
+}
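
The checks above rely on the standard XFS inode-number packing: the low
sb_inopblog bits give the offset within a block, the next bits select the AG
block, and the remainder is the AG number. A minimal sketch with hypothetical
geometry, purely to show the bit-slicing the XFS_INO_TO_* macros perform:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned inopblog = 5, agblklog = 20;  /* hypothetical geometry */
            uint64_t ino = 0x300004a3ULL;          /* hypothetical inode number */

            uint64_t ioff    = ino & ((1ULL << inopblog) - 1);
            uint64_t agblkno = (ino >> inopblog) & ((1ULL << agblklog) - 1);
            uint64_t agno    = ino >> (inopblog + agblklog);

            printf("agno=%llu agblkno=%llu offset=%llu\n",
                   (unsigned long long)agno,
                   (unsigned long long)agblkno,
                   (unsigned long long)ioff);
            return 0;
    }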
+
+/*
+ * Initialize a directory with its "." and ".." entries.
+ */
+int
+xfs_dir_init(
+       xfs_trans_t     *tp,
+       xfs_inode_t     *dp,
+       xfs_inode_t     *pdp)
+{
+       struct xfs_da_args *args;
+       int             error;
+
+       ASSERT(S_ISDIR(dp->i_d.di_mode));
+       error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
+       if (error)
+               return error;
+
+       args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+       if (!args)
+               return -ENOMEM;
+
+       args->geo = dp->i_mount->m_dir_geo;
+       args->dp = dp;
+       args->trans = tp;
+       error = xfs_dir2_sf_create(args, pdp->i_ino);
+       kmem_free(args);
+       return error;
+}
+
+/*
+ * Enter a name in a directory.
+ */
+int
+xfs_dir_createname(
+       xfs_trans_t             *tp,
+       xfs_inode_t             *dp,
+       struct xfs_name         *name,
+       xfs_ino_t               inum,           /* new entry inode number */
+       xfs_fsblock_t           *first,         /* bmap's firstblock */
+       xfs_bmap_free_t         *flist,         /* bmap's freeblock list */
+       xfs_extlen_t            total)          /* bmap's total block count */
+{
+       struct xfs_da_args      *args;
+       int                     rval;
+       int                     v;              /* type-checking value */
+
+       ASSERT(S_ISDIR(dp->i_d.di_mode));
+       rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+       if (rval)
+               return rval;
+       XFS_STATS_INC(xs_dir_create);
+
+       args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+       if (!args)
+               return -ENOMEM;
+
+       args->geo = dp->i_mount->m_dir_geo;
+       args->name = name->name;
+       args->namelen = name->len;
+       args->filetype = name->type;
+       args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+       args->inumber = inum;
+       args->dp = dp;
+       args->firstblock = first;
+       args->flist = flist;
+       args->total = total;
+       args->whichfork = XFS_DATA_FORK;
+       args->trans = tp;
+       args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+
+       if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+               rval = xfs_dir2_sf_addname(args);
+               goto out_free;
+       }
+
+       rval = xfs_dir2_isblock(args, &v);
+       if (rval)
+               goto out_free;
+       if (v) {
+               rval = xfs_dir2_block_addname(args);
+               goto out_free;
+       }
+
+       rval = xfs_dir2_isleaf(args, &v);
+       if (rval)
+               goto out_free;
+       if (v)
+               rval = xfs_dir2_leaf_addname(args);
+       else
+               rval = xfs_dir2_node_addname(args);
+
+out_free:
+       kmem_free(args);
+       return rval;
+}
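+/*
+ * Note the dispatch shape above, which every directory entry point in this
+ * file repeats: shortform (XFS_DINODE_FMT_LOCAL) first, then single-block,
+ * then single-leaf, and finally the node (btree) form as the general case.
+ */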
+
+/*
+ * If doing a CI lookup and case-insensitive match, dup actual name into
+ * args.value. Return EEXIST for success (ie. name found) or an error.
+ */
+int
+xfs_dir_cilookup_result(
+       struct xfs_da_args *args,
+       const unsigned char *name,
+       int             len)
+{
+       if (args->cmpresult == XFS_CMP_DIFFERENT)
+               return -ENOENT;
+       if (args->cmpresult != XFS_CMP_CASE ||
+                                       !(args->op_flags & XFS_DA_OP_CILOOKUP))
+               return -EEXIST;
+
+       args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
+       if (!args->value)
+               return -ENOMEM;
+
+       memcpy(args->value, name, len);
+       args->valuelen = len;
+       return -EEXIST;
+}
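+/*
+ * Note that -EEXIST doubles as the "name found" sentinel here;
+ * xfs_dir_lookup() below translates it back to success once the match
+ * has been recorded.
+ */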
+
+/*
+ * Lookup a name in a directory, give back the inode number.
+ * If ci_name is not NULL, returns the actual name in ci_name if it differs
+ * from name, or ci_name->name is set to NULL for an exact match.
+ */
+int
+xfs_dir_lookup(
+       xfs_trans_t     *tp,
+       xfs_inode_t     *dp,
+       struct xfs_name *name,
+       xfs_ino_t       *inum,          /* out: inode number */
+       struct xfs_name *ci_name)       /* out: actual name if CI match */
+{
+       struct xfs_da_args *args;
+       int             rval;
+       int             v;              /* type-checking value */
+
+       ASSERT(S_ISDIR(dp->i_d.di_mode));
+       XFS_STATS_INC(xs_dir_lookup);
+
+       /*
+        * We need to use KM_NOFS here so that lockdep will not throw false
+        * positive deadlock warnings on a non-transactional lookup path. It is
+        * safe to recurse into inode reclaim in that case, but lockdep can't
+        * easily be taught about it. Hence KM_NOFS avoids having to add a
+        * bunch of lockdep class annotations into the reclaim path for the
+        * ilock.
+        */
+       args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+       args->geo = dp->i_mount->m_dir_geo;
+       args->name = name->name;
+       args->namelen = name->len;
+       args->filetype = name->type;
+       args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+       args->dp = dp;
+       args->whichfork = XFS_DATA_FORK;
+       args->trans = tp;
+       args->op_flags = XFS_DA_OP_OKNOENT;
+       if (ci_name)
+               args->op_flags |= XFS_DA_OP_CILOOKUP;
+
+       if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+               rval = xfs_dir2_sf_lookup(args);
+               goto out_check_rval;
+       }
+
+       rval = xfs_dir2_isblock(args, &v);
+       if (rval)
+               goto out_free;
+       if (v) {
+               rval = xfs_dir2_block_lookup(args);
+               goto out_check_rval;
+       }
+
+       rval = xfs_dir2_isleaf(args, &v);
+       if (rval)
+               goto out_free;
+       if (v)
+               rval = xfs_dir2_leaf_lookup(args);
+       else
+               rval = xfs_dir2_node_lookup(args);
+
+out_check_rval:
+       if (rval == -EEXIST)
+               rval = 0;
+       if (!rval) {
+               *inum = args->inumber;
+               if (ci_name) {
+                       ci_name->name = args->value;
+                       ci_name->len = args->valuelen;
+               }
+       }
+out_free:
+       kmem_free(args);
+       return rval;
+}
+
+/*
+ * Remove an entry from a directory.
+ */
+int
+xfs_dir_removename(
+       xfs_trans_t     *tp,
+       xfs_inode_t     *dp,
+       struct xfs_name *name,
+       xfs_ino_t       ino,
+       xfs_fsblock_t   *first,         /* bmap's firstblock */
+       xfs_bmap_free_t *flist,         /* bmap's freeblock list */
+       xfs_extlen_t    total)          /* bmap's total block count */
+{
+       struct xfs_da_args *args;
+       int             rval;
+       int             v;              /* type-checking value */
+
+       ASSERT(S_ISDIR(dp->i_d.di_mode));
+       XFS_STATS_INC(xs_dir_remove);
+
+       args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+       if (!args)
+               return -ENOMEM;
+
+       args->geo = dp->i_mount->m_dir_geo;
+       args->name = name->name;
+       args->namelen = name->len;
+       args->filetype = name->type;
+       args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+       args->inumber = ino;
+       args->dp = dp;
+       args->firstblock = first;
+       args->flist = flist;
+       args->total = total;
+       args->whichfork = XFS_DATA_FORK;
+       args->trans = tp;
+
+       if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+               rval = xfs_dir2_sf_removename(args);
+               goto out_free;
+       }
+
+       rval = xfs_dir2_isblock(args, &v);
+       if (rval)
+               goto out_free;
+       if (v) {
+               rval = xfs_dir2_block_removename(args);
+               goto out_free;
+       }
+
+       rval = xfs_dir2_isleaf(args, &v);
+       if (rval)
+               goto out_free;
+       if (v)
+               rval = xfs_dir2_leaf_removename(args);
+       else
+               rval = xfs_dir2_node_removename(args);
+out_free:
+       kmem_free(args);
+       return rval;
+}
+
+/*
+ * Replace the inode number of a directory entry.
+ */
+int
+xfs_dir_replace(
+       xfs_trans_t     *tp,
+       xfs_inode_t     *dp,
+       struct xfs_name *name,          /* name of entry to replace */
+       xfs_ino_t       inum,           /* new inode number */
+       xfs_fsblock_t   *first,         /* bmap's firstblock */
+       xfs_bmap_free_t *flist,         /* bmap's freeblock list */
+       xfs_extlen_t    total)          /* bmap's total block count */
+{
+       struct xfs_da_args *args;
+       int             rval;
+       int             v;              /* type-checking value */
+
+       ASSERT(S_ISDIR(dp->i_d.di_mode));
+
+       rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+       if (rval)
+               return rval;
+
+       args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+       if (!args)
+               return -ENOMEM;
+
+       args->geo = dp->i_mount->m_dir_geo;
+       args->name = name->name;
+       args->namelen = name->len;
+       args->filetype = name->type;
+       args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+       args->inumber = inum;
+       args->dp = dp;
+       args->firstblock = first;
+       args->flist = flist;
+       args->total = total;
+       args->whichfork = XFS_DATA_FORK;
+       args->trans = tp;
+
+       if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+               rval = xfs_dir2_sf_replace(args);
+               goto out_free;
+       }
+
+       rval = xfs_dir2_isblock(args, &v);
+       if (rval)
+               goto out_free;
+       if (v) {
+               rval = xfs_dir2_block_replace(args);
+               goto out_free;
+       }
+
+       rval = xfs_dir2_isleaf(args, &v);
+       if (rval)
+               goto out_free;
+       if (v)
+               rval = xfs_dir2_leaf_replace(args);
+       else
+               rval = xfs_dir2_node_replace(args);
+out_free:
+       kmem_free(args);
+       return rval;
+}
+
+/*
+ * See if this entry can be added to the directory without allocating space.
+ * The check is only made when the caller holds no block reservation
+ * (resblks == 0); with a reservation the entry is simply assumed to fit.
+ */
+int
+xfs_dir_canenter(
+       xfs_trans_t     *tp,
+       xfs_inode_t     *dp,
+       struct xfs_name *name,          /* name of entry to add */
+       uint            resblks)
+{
+       struct xfs_da_args *args;
+       int             rval;
+       int             v;              /* type-checking value */
+
+       if (resblks)
+               return 0;
+
+       ASSERT(S_ISDIR(dp->i_d.di_mode));
+
+       args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+       if (!args)
+               return -ENOMEM;
+
+       args->geo = dp->i_mount->m_dir_geo;
+       args->name = name->name;
+       args->namelen = name->len;
+       args->filetype = name->type;
+       args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+       args->dp = dp;
+       args->whichfork = XFS_DATA_FORK;
+       args->trans = tp;
+       args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
+                                                       XFS_DA_OP_OKNOENT;
+
+       if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+               rval = xfs_dir2_sf_addname(args);
+               goto out_free;
+       }
+
+       rval = xfs_dir2_isblock(args, &v);
+       if (rval)
+               goto out_free;
+       if (v) {
+               rval = xfs_dir2_block_addname(args);
+               goto out_free;
+       }
+
+       rval = xfs_dir2_isleaf(args, &v);
+       if (rval)
+               goto out_free;
+       if (v)
+               rval = xfs_dir2_leaf_addname(args);
+       else
+               rval = xfs_dir2_node_addname(args);
+out_free:
+       kmem_free(args);
+       return rval;
+}
+
+/*
+ * Utility routines.
+ */
+
+/*
+ * Add a block to the directory.
+ *
+ * This routine is for data and free blocks, not leaf/node blocks which are
+ * handled by xfs_da_grow_inode.
+ */
+int
+xfs_dir2_grow_inode(
+       struct xfs_da_args      *args,
+       int                     space,  /* v2 dir's space XFS_DIR2_xxx_SPACE */
+       xfs_dir2_db_t           *dbp)   /* out: block number added */
+{
+       struct xfs_inode        *dp = args->dp;
+       struct xfs_mount        *mp = dp->i_mount;
+       xfs_fileoff_t           bno;    /* directory offset of new block */
+       int                     count;  /* count of filesystem blocks */
+       int                     error;
+
+       trace_xfs_dir2_grow_inode(args, space);
+
+       /*
+        * Set lowest possible block in the space requested.
+        */
+       bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
+       count = args->geo->fsbcount;
+
+       error = xfs_da_grow_inode_int(args, &bno, count);
+       if (error)
+               return error;
+
+       *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno);
+
+       /*
+        * Update file's size if this is the data space and it grew.
+        */
+       if (space == XFS_DIR2_DATA_SPACE) {
+               xfs_fsize_t     size;           /* directory file (data) size */
+
+               size = XFS_FSB_TO_B(mp, bno + count);
+               if (size > dp->i_d.di_size) {
+                       dp->i_d.di_size = size;
+                       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+               }
+       }
+       return 0;
+}
+
+/*
+ * See if the directory is a single-block form directory.
+ */
+int
+xfs_dir2_isblock(
+       struct xfs_da_args      *args,
+       int                     *vp)    /* out: 1 is block, 0 is not block */
+{
+       xfs_fileoff_t           last;   /* last file offset */
+       int                     rval;
+
+       if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
+               return rval;
+       rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
+       ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize);
+       *vp = rval;
+       return 0;
+}
+
+/*
+ * See if the directory is a single-leaf form directory.
+ */
+int
+xfs_dir2_isleaf(
+       struct xfs_da_args      *args,
+       int                     *vp)    /* out: 1 is leaf, 0 is not leaf */
+{
+       xfs_fileoff_t           last;   /* last file offset */
+       int                     rval;
+
+       if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
+               return rval;
+       *vp = last == args->geo->leafblk + args->geo->fsbcount;
+       return 0;
+}
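+/*
+ * Worked example for the two form checks above, assuming a hypothetical
+ * 16k directory block built from four 4k filesystem blocks: a block form
+ * directory maps exactly bytes [0, 16k), so its last mapped offset converts
+ * back to one directory block; a leaf form directory additionally maps a
+ * single block at leafblk, so its last offset is leafblk + fsbcount.
+ */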
+
+/*
+ * Remove the given block from the directory.
+ * This routine is used for data and free blocks, leaf/node are done
+ * by xfs_da_shrink_inode.
+ */
+int
+xfs_dir2_shrink_inode(
+       xfs_da_args_t   *args,
+       xfs_dir2_db_t   db,
+       struct xfs_buf  *bp)
+{
+       xfs_fileoff_t   bno;            /* directory file offset */
+       xfs_dablk_t     da;             /* directory file offset */
+       int             done;           /* bunmap is finished */
+       xfs_inode_t     *dp;
+       int             error;
+       xfs_mount_t     *mp;
+       xfs_trans_t     *tp;
+
+       trace_xfs_dir2_shrink_inode(args, db);
+
+       dp = args->dp;
+       mp = dp->i_mount;
+       tp = args->trans;
+       da = xfs_dir2_db_to_da(args->geo, db);
+       /*
+        * Unmap the fsblock(s).
+        */
+       if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
+                       XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
+                       &done))) {
+               /*
+                * ENOSPC actually can happen if we're in a removename with
+                * no space reservation, and the resulting block removal
+                * would cause a bmap btree split or conversion from extents
+                * to btree.  This can only happen for un-fragmented
+                * directory blocks, since you need to be punching out
+                * the middle of an extent.
+                * In this case we need to leave the block in the file,
+                * and not binval it.
+                * So the block has to be in a consistent empty state
+                * and appropriately logged.
+                * We don't free up the buffer, the caller can tell it
+                * hasn't happened since it got an error back.
+                */
+               return error;
+       }
+       ASSERT(done);
+       /*
+        * Invalidate the buffer from the transaction.
+        */
+       xfs_trans_binval(tp, bp);
+       /*
+        * If it's not a data block, we're done.
+        */
+       if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET))
+               return 0;
+       /*
+        * If the block isn't the last one in the directory, we're done.
+        */
+       if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
+               return 0;
+       bno = da;
+       if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
+               /*
+                * This can't really happen unless there's kernel corruption.
+                */
+               return error;
+       }
+       if (db == args->geo->datablk)
+               ASSERT(bno == 0);
+       else
+               ASSERT(bno > 0);
+       /*
+        * Set the size to the new last block.
+        */
+       dp->i_d.di_size = XFS_FSB_TO_B(mp, bno);
+       xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+       return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
new file mode 100644 (file)
index 0000000..c8e86b0
--- /dev/null
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DIR2_H__
+#define __XFS_DIR2_H__
+
+struct xfs_bmap_free;
+struct xfs_da_args;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dir2_sf_hdr;
+struct xfs_dir2_sf_entry;
+struct xfs_dir2_data_hdr;
+struct xfs_dir2_data_entry;
+struct xfs_dir2_data_unused;
+
+extern struct xfs_name xfs_name_dotdot;
+
+/*
+ * directory operations vector for encode/decode routines
+ */
+struct xfs_dir_ops {
+       int     (*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len);
+       struct xfs_dir2_sf_entry *
+               (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr,
+                               struct xfs_dir2_sf_entry *sfep);
+       __uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep);
+       void    (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep,
+                               __uint8_t ftype);
+       xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr,
+                               struct xfs_dir2_sf_entry *sfep);
+       void    (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr,
+                             struct xfs_dir2_sf_entry *sfep,
+                             xfs_ino_t ino);
+       xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr);
+       void    (*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr,
+                                    xfs_ino_t ino);
+
+       int     (*data_entsize)(int len);
+       __uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep);
+       void    (*data_put_ftype)(struct xfs_dir2_data_entry *dep,
+                               __uint8_t ftype);
+       __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep);
+       struct xfs_dir2_data_free *
+               (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr);
+
+       xfs_dir2_data_aoff_t data_dot_offset;
+       xfs_dir2_data_aoff_t data_dotdot_offset;
+       xfs_dir2_data_aoff_t data_first_offset;
+       size_t  data_entry_offset;
+
+       struct xfs_dir2_data_entry *
+               (*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+       struct xfs_dir2_data_entry *
+               (*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+       struct xfs_dir2_data_entry *
+               (*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr);
+       struct xfs_dir2_data_entry *
+               (*data_entry_p)(struct xfs_dir2_data_hdr *hdr);
+       struct xfs_dir2_data_unused *
+               (*data_unused_p)(struct xfs_dir2_data_hdr *hdr);
+
+       int     leaf_hdr_size;
+       void    (*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to,
+                                   struct xfs_dir3_icleaf_hdr *from);
+       void    (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to,
+                                     struct xfs_dir2_leaf *from);
+       int     (*leaf_max_ents)(struct xfs_da_geometry *geo);
+       struct xfs_dir2_leaf_entry *
+               (*leaf_ents_p)(struct xfs_dir2_leaf *lp);
+
+       int     node_hdr_size;
+       void    (*node_hdr_to_disk)(struct xfs_da_intnode *to,
+                                   struct xfs_da3_icnode_hdr *from);
+       void    (*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to,
+                                     struct xfs_da_intnode *from);
+       struct xfs_da_node_entry *
+               (*node_tree_p)(struct xfs_da_intnode *dap);
+
+       int     free_hdr_size;
+       void    (*free_hdr_to_disk)(struct xfs_dir2_free *to,
+                                   struct xfs_dir3_icfree_hdr *from);
+       void    (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to,
+                                     struct xfs_dir2_free *from);
+       int     (*free_max_bests)(struct xfs_da_geometry *geo);
+       __be16 * (*free_bests_p)(struct xfs_dir2_free *free);
+       xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo,
+                                  xfs_dir2_db_t db);
+       int     (*db_to_fdindex)(struct xfs_da_geometry *geo,
+                                xfs_dir2_db_t db);
+};
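
A toy userspace analogue of why this vector exists: callers bind the ops once
(per mount or per inode) and never test the on-disk version again. The
entry-size rules below are simplified stand-ins for the real v2/v3 encodings
(an 11-byte fixed portion plus the name, rounded to 8-byte alignment, with v3
adding a one-byte file type):

    #include <stdio.h>

    struct dir_ops {
            int (*data_entsize)(int len);
    };

    /* v2-style: fixed header + name, rounded up to 8 bytes */
    static int v2_entsize(int len) { return (11 + len + 7) & ~7; }
    /* v3-style: one extra byte for the file type */
    static int v3_entsize(int len) { return (11 + len + 1 + 7) & ~7; }

    static const struct dir_ops v2_ops = { .data_entsize = v2_entsize };
    static const struct dir_ops v3_ops = { .data_entsize = v3_entsize };

    int main(void)
    {
            const struct dir_ops *ops = &v3_ops;  /* chosen once, at "mount" */

            printf("entry for a 4-char name: %d bytes\n",
                   ops->data_entsize(4));
            return 0;
    }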
+
+extern const struct xfs_dir_ops *
+       xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
+extern const struct xfs_dir_ops *
+       xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
+
+/*
+ * Generic directory interface routines
+ */
+extern void xfs_dir_startup(void);
+extern int xfs_da_mount(struct xfs_mount *mp);
+extern void xfs_da_unmount(struct xfs_mount *mp);
+
+extern int xfs_dir_isempty(struct xfs_inode *dp);
+extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
+                               struct xfs_inode *pdp);
+extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
+                               struct xfs_name *name, xfs_ino_t inum,
+                               xfs_fsblock_t *first,
+                               struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
+                               struct xfs_name *name, xfs_ino_t *inum,
+                               struct xfs_name *ci_name);
+extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
+                               struct xfs_name *name, xfs_ino_t ino,
+                               xfs_fsblock_t *first,
+                               struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
+                               struct xfs_name *name, xfs_ino_t inum,
+                               xfs_fsblock_t *first,
+                               struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
+                               struct xfs_name *name, uint resblks);
+
+/*
+ * Direct call from the bmap code, bypassing the generic directory layer.
+ */
+extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
+
+/*
+ * Interface routines used by userspace utilities
+ */
+extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
+                               struct xfs_buf *bp);
+
+extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
+               struct xfs_dir2_data_hdr *hdr, int *loghead);
+extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
+               struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
+extern void xfs_dir2_data_log_header(struct xfs_da_args *args,
+               struct xfs_buf *bp);
+extern void xfs_dir2_data_log_unused(struct xfs_da_args *args,
+               struct xfs_buf *bp, struct xfs_dir2_data_unused *dup);
+extern void xfs_dir2_data_make_free(struct xfs_da_args *args,
+               struct xfs_buf *bp, xfs_dir2_data_aoff_t offset,
+               xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
+extern void xfs_dir2_data_use_free(struct xfs_da_args *args,
+               struct xfs_buf *bp, struct xfs_dir2_data_unused *dup,
+               xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
+               int *needlogp, int *needscanp);
+
+extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
+               struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
+               struct xfs_dir2_data_unused *dup);
+
+extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
+
+#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
new file mode 100644 (file)
index 0000000..9628cec
--- /dev/null
@@ -0,0 +1,1265 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_buf_item.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
+
+/*
+ * Local function prototypes.
+ */
+static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp,
+                                   int first, int last);
+static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp);
+static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp,
+                                    int *entno);
+static int xfs_dir2_block_sort(const void *a, const void *b);
+
+static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
+
+/*
+ * One-time startup routine called from xfs_init().
+ */
+void
+xfs_dir_startup(void)
+{
+       xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
+       xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
+}
+
+static bool
+xfs_dir3_block_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
+                       return false;
+               if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+                       return false;
+               if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+                       return false;
+       } else {
+               if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+                       return false;
+       }
+       if (__xfs_dir3_data_check(NULL, bp))
+               return false;
+       return true;
+}
+
+static void
+xfs_dir3_block_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb) &&
+            !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+               xfs_buf_ioerror(bp, -EFSBADCRC);
+       else if (!xfs_dir3_block_verify(bp))
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+       if (bp->b_error)
+               xfs_verifier_error(bp);
+}
+
+static void
+xfs_dir3_block_write_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+       struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+       if (!xfs_dir3_block_verify(bp)) {
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               return;
+       }
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (bip)
+               hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+       xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+       .verify_read = xfs_dir3_block_read_verify,
+       .verify_write = xfs_dir3_block_write_verify,
+};
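+/*
+ * The verifier pairing above follows the usual XFS pattern: reads check the
+ * CRC before the structure (bad CRC -> -EFSBADCRC, bad structure ->
+ * -EFSCORRUPTED), while writes re-verify the structure, stamp the buffer's
+ * LSN, and only then recompute the CRC so it covers the stamped LSN.
+ */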
+
+int
+xfs_dir3_block_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       struct xfs_buf          **bpp)
+{
+       struct xfs_mount        *mp = dp->i_mount;
+       int                     err;
+
+       err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
+                               XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
+       if (!err && tp)
+               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+       return err;
+}
+
+static void
+xfs_dir3_block_init(
+       struct xfs_mount        *mp,
+       struct xfs_trans        *tp,
+       struct xfs_buf          *bp,
+       struct xfs_inode        *dp)
+{
+       struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+       bp->b_ops = &xfs_dir3_block_buf_ops;
+       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               memset(hdr3, 0, sizeof(*hdr3));
+               hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+               hdr3->blkno = cpu_to_be64(bp->b_bn);
+               hdr3->owner = cpu_to_be64(dp->i_ino);
+               uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+               return;
+
+       }
+       hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+}
+
+static void
+xfs_dir2_block_need_space(
+       struct xfs_inode                *dp,
+       struct xfs_dir2_data_hdr        *hdr,
+       struct xfs_dir2_block_tail      *btp,
+       struct xfs_dir2_leaf_entry      *blp,
+       __be16                          **tagpp,
+       struct xfs_dir2_data_unused     **dupp,
+       struct xfs_dir2_data_unused     **enddupp,
+       int                             *compact,
+       int                             len)
+{
+       struct xfs_dir2_data_free       *bf;
+       __be16                          *tagp = NULL;
+       struct xfs_dir2_data_unused     *dup = NULL;
+       struct xfs_dir2_data_unused     *enddup = NULL;
+
+       *compact = 0;
+       bf = dp->d_ops->data_bestfree_p(hdr);
+
+       /*
+        * If there are stale entries we'll use one for the leaf.
+        */
+       if (btp->stale) {
+               if (be16_to_cpu(bf[0].length) >= len) {
+                       /*
+                        * The biggest free entry is big enough to avoid compaction.
+                        */
+                       dup = (xfs_dir2_data_unused_t *)
+                             ((char *)hdr + be16_to_cpu(bf[0].offset));
+                       goto out;
+               }
+
+               /*
+                * Will need to compact to make this work.
+                * Tag just before the first leaf entry.
+                */
+               *compact = 1;
+               tagp = (__be16 *)blp - 1;
+
+               /* Data object just before the first leaf entry.  */
+               dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+               /*
+                * If it's not free then the data will go where the
+                * leaf data starts now, if it works at all.
+                */
+               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+                       if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
+                           (uint)sizeof(*blp) < len)
+                               dup = NULL;
+               } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
+                       dup = NULL;
+               else
+                       dup = (xfs_dir2_data_unused_t *)blp;
+               goto out;
+       }
+
+       /*
+        * No stale entries, so just use free space.
+        * Tag just before the first leaf entry.
+        */
+       tagp = (__be16 *)blp - 1;
+
+       /* Data object just before the first leaf entry.  */
+       enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+       /*
+        * If it's not free then can't do this add without cleaning up:
+        * the space before the first leaf entry needs to be free so it
+        * can be expanded to hold the pointer to the new entry.
+        */
+       if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+               /*
+                * Check out the biggest freespace and see if it's the same one.
+                */
+               dup = (xfs_dir2_data_unused_t *)
+                     ((char *)hdr + be16_to_cpu(bf[0].offset));
+               if (dup != enddup) {
+                       /*
+                        * Not the same free entry, just check its length.
+                        */
+                       if (be16_to_cpu(dup->length) < len)
+                               dup = NULL;
+                       goto out;
+               }
+
+               /*
+                * It is the biggest freespace, but is it too small to
+                * hold both the new leaf entry and the data entry?
+                */
+               if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
+                       /*
+                        * Yes, use the second-largest entry instead if it works.
+                        */
+                       if (be16_to_cpu(bf[1].length) >= len)
+                               dup = (xfs_dir2_data_unused_t *)
+                                     ((char *)hdr + be16_to_cpu(bf[1].offset));
+                       else
+                               dup = NULL;
+               }
+       }
+out:
+       *tagpp = tagp;
+       *dupp = dup;
+       *enddupp = enddup;
+}
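+/*
+ * Decision summary for the space search above:
+ *  - stale leaf entries and bf[0] large enough: reuse best free space as is;
+ *  - stale leaf entries but room only after squeezing them out: *compact is
+ *    set and the space just before the leaf table is the candidate;
+ *  - no stale entries: the new leaf slot must come out of enddup, the free
+ *    space immediately before the leaf table.
+ */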
+
+/*
+ * Compact the leaf entries.
+ * Leave the highest-numbered stale entry stale.
+ * XXX should be the one closest to mid but mid is not yet computed.
+ */
+static void
+xfs_dir2_block_compact(
+       struct xfs_da_args              *args,
+       struct xfs_buf                  *bp,
+       struct xfs_dir2_data_hdr        *hdr,
+       struct xfs_dir2_block_tail      *btp,
+       struct xfs_dir2_leaf_entry      *blp,
+       int                             *needlog,
+       int                             *lfloghigh,
+       int                             *lfloglow)
+{
+       int                     fromidx;        /* source leaf index */
+       int                     toidx;          /* target leaf index */
+       int                     needscan = 0;
+       int                     highstale;      /* high stale index */
+
+       fromidx = toidx = be32_to_cpu(btp->count) - 1;
+       highstale = *lfloghigh = -1;
+       for (; fromidx >= 0; fromidx--) {
+               if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+                       if (highstale == -1)
+                               highstale = toidx;
+                       else {
+                               if (*lfloghigh == -1)
+                                       *lfloghigh = toidx;
+                               continue;
+                       }
+               }
+               if (fromidx < toidx)
+                       blp[toidx] = blp[fromidx];
+               toidx--;
+       }
+       *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
+       *lfloghigh -= be32_to_cpu(btp->stale) - 1;
+       be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
+       xfs_dir2_data_make_free(args, bp,
+               (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+               (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
+               needlog, &needscan);
+       btp->stale = cpu_to_be32(1);
+       /*
+        * If we now need to rebuild the bestfree map, do so.
+        * This needs to happen before the next call to use_free.
+        */
+       if (needscan)
+               xfs_dir2_data_freescan(args->dp, hdr, needlog);
+}
+
+/*
+ * Add an entry to a block directory.
+ */
+int                                            /* error */
+xfs_dir2_block_addname(
+       xfs_da_args_t           *args)          /* directory op arguments */
+{
+       xfs_dir2_data_hdr_t     *hdr;           /* block header */
+       xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
+       struct xfs_buf          *bp;            /* buffer for block */
+       xfs_dir2_block_tail_t   *btp;           /* block tail */
+       int                     compact;        /* need to compact leaf ents */
+       xfs_dir2_data_entry_t   *dep;           /* block data entry */
+       xfs_inode_t             *dp;            /* directory inode */
+       xfs_dir2_data_unused_t  *dup;           /* block unused entry */
+       int                     error;          /* error return value */
+       xfs_dir2_data_unused_t  *enddup=NULL;   /* unused at end of data */
+       xfs_dahash_t            hash;           /* hash value of found entry */
+       int                     high;           /* high index for binary srch */
+       int                     highstale;      /* high stale index */
+       int                     lfloghigh=0;    /* last final leaf to log */
+       int                     lfloglow=0;     /* first final leaf to log */
+       int                     len;            /* length of the new entry */
+       int                     low;            /* low index for binary srch */
+       int                     lowstale;       /* low stale index */
+       int                     mid=0;          /* midpoint for binary srch */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       int                     needlog;        /* need to log header */
+       int                     needscan;       /* need to rescan freespace */
+       __be16                  *tagp;          /* pointer to tag value */
+       xfs_trans_t             *tp;            /* transaction structure */
+
+       trace_xfs_dir2_block_addname(args);
+
+       dp = args->dp;
+       tp = args->trans;
+       mp = dp->i_mount;
+
+       /* Read the (one and only) directory block into bp. */
+       error = xfs_dir3_block_read(tp, dp, &bp);
+       if (error)
+               return error;
+
+       len = dp->d_ops->data_entsize(args->namelen);
+
+       /*
+        * Set up pointers to parts of the block.
+        */
+       hdr = bp->b_addr;
+       btp = xfs_dir2_block_tail_p(args->geo, hdr);
+       blp = xfs_dir2_block_leaf_p(btp);
+
+       /*
+        * Find out if we can reuse stale entries or whether we need extra
+        * space for entry and new leaf.
+        */
+       xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup,
+                                 &enddup, &compact, len);
+
+       /*
+        * Done everything we need for a space check now.
+        */
+       if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+               xfs_trans_brelse(tp, bp);
+               if (!dup)
+                       return -ENOSPC;
+               return 0;
+       }
+
+       /*
+        * If we don't have space for the new entry & leaf ...
+        */
+       if (!dup) {
+               /* Don't have a space reservation: return no-space.  */
+               if (args->total == 0)
+                       return -ENOSPC;
+               /*
+                * Convert to the next larger format.
+                * Then add the new entry in that format.
+                */
+               error = xfs_dir2_block_to_leaf(args, bp);
+               if (error)
+                       return error;
+               return xfs_dir2_leaf_addname(args);
+       }
+
+       needlog = needscan = 0;
+
+       /*
+        * If need to compact the leaf entries, do it now.
+        */
+       if (compact) {
+               xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog,
+                                     &lfloghigh, &lfloglow);
+               /* recalculate blp post-compaction */
+               blp = xfs_dir2_block_leaf_p(btp);
+       } else if (btp->stale) {
+               /*
+                * Set leaf logging boundaries to impossible state.
+                * For the no-stale case they're set explicitly.
+                */
+               lfloglow = be32_to_cpu(btp->count);
+               lfloghigh = -1;
+       }
+
+       /*
+        * Find the slot that's first lower than our hash value, -1 if none.
+        */
+       for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) {
+               mid = (low + high) >> 1;
+               if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+                       break;
+               if (hash < args->hashval)
+                       low = mid + 1;
+               else
+                       high = mid - 1;
+       }
+       while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) {
+               mid--;
+       }
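+       /*
+        * mid now indexes the last entry whose hash is below args->hashval,
+        * or -1 if there is none, so the new leaf entry belongs at mid + 1;
+        * keeping equal hash values adjacent is what keeps the binary
+        * search lookups correct.
+        */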
+       /*
+        * No stale entries, will use enddup space to hold new leaf.
+        */
+       if (!btp->stale) {
+               /*
+                * Mark the space needed for the new leaf entry as in use,
+                * carved from the end of the free region adjoining the
+                * leaf table.
+                */
+               xfs_dir2_data_use_free(args, bp, enddup,
+                       (xfs_dir2_data_aoff_t)
+                       ((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) -
+                        sizeof(*blp)),
+                       (xfs_dir2_data_aoff_t)sizeof(*blp),
+                       &needlog, &needscan);
+               /*
+                * Update the tail (entry count).
+                */
+               be32_add_cpu(&btp->count, 1);
+               /*
+                * If we now need to rebuild the bestfree map, do so.
+                * This needs to happen before the next call to use_free.
+                */
+               if (needscan) {
+                       xfs_dir2_data_freescan(dp, hdr, &needlog);
+                       needscan = 0;
+               }
+               /*
+                * Adjust pointer to the first leaf entry, we're about to move
+                * the table up one to open up space for the new leaf entry.
+                * Then adjust our index to match.
+                */
+               blp--;
+               mid++;
+               if (mid)
+                       memmove(blp, &blp[1], mid * sizeof(*blp));
+               lfloglow = 0;
+               lfloghigh = mid;
+       }
+       /*
+        * Use a stale leaf for our new entry.
+        */
+       else {
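+               /*
+                * Scan outward from the insertion point for the nearest
+                * stale leaf entry on each side; the entries between mid
+                * and the closer stale slot are shifted to reclaim it.
+                */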
+               for (lowstale = mid;
+                    lowstale >= 0 &&
+                       blp[lowstale].address !=
+                       cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+                    lowstale--)
+                       continue;
+               for (highstale = mid + 1;
+                    highstale < be32_to_cpu(btp->count) &&
+                       blp[highstale].address !=
+                       cpu_to_be32(XFS_DIR2_NULL_DATAPTR) &&
+                       (lowstale < 0 || mid - lowstale > highstale - mid);
+                    highstale++)
+                       continue;
+               /*
+                * Move entries toward the low-numbered stale entry.
+                */
+               if (lowstale >= 0 &&
+                   (highstale == be32_to_cpu(btp->count) ||
+                    mid - lowstale <= highstale - mid)) {
+                       if (mid - lowstale)
+                               memmove(&blp[lowstale], &blp[lowstale + 1],
+                                       (mid - lowstale) * sizeof(*blp));
+                       lfloglow = MIN(lowstale, lfloglow);
+                       lfloghigh = MAX(mid, lfloghigh);
+               }
+               /*
+                * Move entries toward the high-numbered stale entry.
+                */
+               else {
+                       ASSERT(highstale < be32_to_cpu(btp->count));
+                       mid++;
+                       if (highstale - mid)
+                               memmove(&blp[mid + 1], &blp[mid],
+                                       (highstale - mid) * sizeof(*blp));
+                       lfloglow = MIN(mid, lfloglow);
+                       lfloghigh = MAX(highstale, lfloghigh);
+               }
+               be32_add_cpu(&btp->stale, -1);
+       }
+       /*
+        * Point to the new data entry.
+        */
+       dep = (xfs_dir2_data_entry_t *)dup;
+       /*
+        * Fill in the leaf entry.
+        */
+       blp[mid].hashval = cpu_to_be32(args->hashval);
+       blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+                               (char *)dep - (char *)hdr));
+       xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
+       /*
+        * Mark space for the data entry used.
+        */
+       xfs_dir2_data_use_free(args, bp, dup,
+               (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+               (xfs_dir2_data_aoff_t)len, &needlog, &needscan);
+       /*
+        * Create the new data entry.
+        */
+       dep->inumber = cpu_to_be64(args->inumber);
+       dep->namelen = args->namelen;
+       memcpy(dep->name, args->name, args->namelen);
+       dp->d_ops->data_put_ftype(dep, args->filetype);
+       tagp = dp->d_ops->data_entry_tag_p(dep);
+       *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+       /*
+        * Clean up the bestfree array and log the header, tail, and entry.
+        */
+       if (needscan)
+               xfs_dir2_data_freescan(dp, hdr, &needlog);
+       if (needlog)
+               xfs_dir2_data_log_header(args, bp);
+       xfs_dir2_block_log_tail(tp, bp);
+       xfs_dir2_data_log_entry(args, bp, dep);
+       xfs_dir3_data_check(dp, bp);
+       return 0;
+}
+
+/*
+ * Log leaf entries from the block.
+ */
+static void
+xfs_dir2_block_log_leaf(
+       xfs_trans_t             *tp,            /* transaction structure */
+       struct xfs_buf          *bp,            /* block buffer */
+       int                     first,          /* index of first logged leaf */
+       int                     last)           /* index of last logged leaf */
+{
+       xfs_dir2_data_hdr_t     *hdr = bp->b_addr;
+       xfs_dir2_leaf_entry_t   *blp;
+       xfs_dir2_block_tail_t   *btp;
+
+       btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
+       blp = xfs_dir2_block_leaf_p(btp);
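+       /* Log the byte range covering leaf entries first through last. */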
+       xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr),
+               (uint)((char *)&blp[last + 1] - (char *)hdr - 1));
+}
+
+/*
+ * Log the block tail.
+ */
+static void
+xfs_dir2_block_log_tail(
+       xfs_trans_t             *tp,            /* transaction structure */
+       struct xfs_buf          *bp)            /* block buffer */
+{
+       xfs_dir2_data_hdr_t     *hdr = bp->b_addr;
+       xfs_dir2_block_tail_t   *btp;
+
+       btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
+       xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr),
+               (uint)((char *)(btp + 1) - (char *)hdr - 1));
+}
+
+/*
+ * Look up an entry in the block.  This is the external routine,
+ * xfs_dir2_block_lookup_int does the real work.
+ */
+int                                            /* error */
+xfs_dir2_block_lookup(
+       xfs_da_args_t           *args)          /* dir lookup arguments */
+{
+       xfs_dir2_data_hdr_t     *hdr;           /* block header */
+       xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
+       struct xfs_buf          *bp;            /* block buffer */
+       xfs_dir2_block_tail_t   *btp;           /* block tail */
+       xfs_dir2_data_entry_t   *dep;           /* block data entry */
+       xfs_inode_t             *dp;            /* incore inode */
+       int                     ent;            /* entry index */
+       int                     error;          /* error return value */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+
+       trace_xfs_dir2_block_lookup(args);
+
+       /*
+        * Get the buffer, look up the entry.
+        * If not found (ENOENT) then return, have no buffer.
+        */
+       if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
+               return error;
+       dp = args->dp;
+       mp = dp->i_mount;
+       hdr = bp->b_addr;
+       xfs_dir3_data_check(dp, bp);
+       btp = xfs_dir2_block_tail_p(args->geo, hdr);
+       blp = xfs_dir2_block_leaf_p(btp);
+       /*
+        * Get the offset from the leaf entry, to point to the data.
+        */
+       dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+                       xfs_dir2_dataptr_to_off(args->geo,
+                                               be32_to_cpu(blp[ent].address)));
+       /*
+        * Fill in inode number, CI name if appropriate, release the block.
+        */
+       args->inumber = be64_to_cpu(dep->inumber);
+       args->filetype = dp->d_ops->data_get_ftype(dep);
+       error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+       xfs_trans_brelse(args->trans, bp);
+       return error;
+}
+
+/*
+ * Internal block lookup routine.
+ */
+static int                                     /* error */
+xfs_dir2_block_lookup_int(
+       xfs_da_args_t           *args,          /* dir lookup arguments */
+       struct xfs_buf          **bpp,          /* returned block buffer */
+       int                     *entno)         /* returned entry number */
+{
+       xfs_dir2_dataptr_t      addr;           /* data entry address */
+       xfs_dir2_data_hdr_t     *hdr;           /* block header */
+       xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
+       struct xfs_buf          *bp;            /* block buffer */
+       xfs_dir2_block_tail_t   *btp;           /* block tail */
+       xfs_dir2_data_entry_t   *dep;           /* block data entry */
+       xfs_inode_t             *dp;            /* incore inode */
+       int                     error;          /* error return value */
+       xfs_dahash_t            hash;           /* found hash value */
+       int                     high;           /* binary search high index */
+       int                     low;            /* binary search low index */
+       int                     mid;            /* binary search current idx */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       enum xfs_dacmp          cmp;            /* comparison result */
+
+       dp = args->dp;
+       tp = args->trans;
+       mp = dp->i_mount;
+
+       error = xfs_dir3_block_read(tp, dp, &bp);
+       if (error)
+               return error;
+
+       hdr = bp->b_addr;
+       xfs_dir3_data_check(dp, bp);
+       btp = xfs_dir2_block_tail_p(args->geo, hdr);
+       blp = xfs_dir2_block_leaf_p(btp);
+       /*
+        * Loop doing a binary search for our hash value.
+        * Find our entry, or return ENOENT if it's not there.
+        */
+       for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) {
+               ASSERT(low <= high);
+               mid = (low + high) >> 1;
+               if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+                       break;
+               if (hash < args->hashval)
+                       low = mid + 1;
+               else
+                       high = mid - 1;
+               if (low > high) {
+                       ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+                       xfs_trans_brelse(tp, bp);
+                       return -ENOENT;
+               }
+       }
+       /*
+        * Back up to the first one with the right hash value.
+        */
+       while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) {
+               mid--;
+       }
+       /*
+        * Now loop forward through all the entries with the
+        * right hash value looking for our name.
+        */
+       do {
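+               /* Skip stale leaf entries (null data pointers). */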
+               if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR)
+                       continue;
+               /*
+                * Get pointer to the entry from the leaf.
+                */
+               dep = (xfs_dir2_data_entry_t *)
+                       ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr));
+               /*
+                * Compare name and if it's an exact match, return the index
+                * and buffer. If it's the first case-insensitive match, store
+                * the index and buffer and continue looking for an exact match.
+                */
+               cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+               if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+                       args->cmpresult = cmp;
+                       *bpp = bp;
+                       *entno = mid;
+                       if (cmp == XFS_CMP_EXACT)
+                               return 0;
+               }
+       } while (++mid < be32_to_cpu(btp->count) &&
+                       be32_to_cpu(blp[mid].hashval) == hash);
+
+       ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+       /*
+        * Here, we can only be doing a lookup (not a rename or replace).
+        * If a case-insensitive match was found earlier, return success.
+        */
+       if (args->cmpresult == XFS_CMP_CASE)
+               return 0;
+       /*
+        * No match, release the buffer and return ENOENT.
+        */
+       xfs_trans_brelse(tp, bp);
+       return -ENOENT;
+}
+
+/*
+ * Remove an entry from a block format directory.
+ * If that makes the block small enough to fit in shortform, transform it.
+ */
+int                                            /* error */
+xfs_dir2_block_removename(
+       xfs_da_args_t           *args)          /* directory operation args */
+{
+       xfs_dir2_data_hdr_t     *hdr;           /* block header */
+       xfs_dir2_leaf_entry_t   *blp;           /* block leaf pointer */
+       struct xfs_buf          *bp;            /* block buffer */
+       xfs_dir2_block_tail_t   *btp;           /* block tail */
+       xfs_dir2_data_entry_t   *dep;           /* block data entry */
+       xfs_inode_t             *dp;            /* incore inode */
+       int                     ent;            /* block leaf entry index */
+       int                     error;          /* error return value */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       int                     needlog;        /* need to log block header */
+       int                     needscan;       /* need to fixup bestfree */
+       xfs_dir2_sf_hdr_t       sfh;            /* shortform header */
+       int                     size;           /* shortform size */
+       xfs_trans_t             *tp;            /* transaction pointer */
+
+       trace_xfs_dir2_block_removename(args);
+
+       /*
+        * Look up the entry in the block.  Gets the buffer and entry index.
+        * It will always be there; the vnodeops level does a lookup first.
+        */
+       if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+               return error;
+       }
+       dp = args->dp;
+       tp = args->trans;
+       mp = dp->i_mount;
+       hdr = bp->b_addr;
+       btp = xfs_dir2_block_tail_p(args->geo, hdr);
+       blp = xfs_dir2_block_leaf_p(btp);
+       /*
+        * Point to the data entry using the leaf entry.
+        */
+       dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+                       xfs_dir2_dataptr_to_off(args->geo,
+                                               be32_to_cpu(blp[ent].address)));
+       /*
+        * Mark the data entry's space free.
+        */
+       needlog = needscan = 0;
+       xfs_dir2_data_make_free(args, bp,
+               (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
+               dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+       /*
+        * Fix up the block tail.
+        */
+       be32_add_cpu(&btp->stale, 1);
+       xfs_dir2_block_log_tail(tp, bp);
+       /*
+        * Remove the leaf entry by marking it stale.
+        */
+       blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+       xfs_dir2_block_log_leaf(tp, bp, ent, ent);
+       /*
+        * Fix up bestfree, log the header if necessary.
+        */
+       if (needscan)
+               xfs_dir2_data_freescan(dp, hdr, &needlog);
+       if (needlog)
+               xfs_dir2_data_log_header(args, bp);
+       xfs_dir3_data_check(dp, bp);
+       /*
+        * See if the size as a shortform is good enough.
+        */
+       size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+       if (size > XFS_IFORK_DSIZE(dp))
+               return 0;
+
+       /*
+        * If it works, do the conversion.
+        */
+       return xfs_dir2_block_to_sf(args, bp, size, &sfh);
+}
+
+/*
+ * Replace an entry in a V2 block directory.
+ * Change the inode number to the new value.
+ */
+int                                            /* error */
+xfs_dir2_block_replace(
+       xfs_da_args_t           *args)          /* directory operation args */
+{
+       xfs_dir2_data_hdr_t     *hdr;           /* block header */
+       xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
+       struct xfs_buf          *bp;            /* block buffer */
+       xfs_dir2_block_tail_t   *btp;           /* block tail */
+       xfs_dir2_data_entry_t   *dep;           /* block data entry */
+       xfs_inode_t             *dp;            /* incore inode */
+       int                     ent;            /* leaf entry index */
+       int                     error;          /* error return value */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+
+       trace_xfs_dir2_block_replace(args);
+
+       /*
+        * Look up the entry in the directory.  Get the buffer and entry index.
+        * This will always succeed since the caller has already done a lookup.
+        */
+       if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+               return error;
+       }
+       dp = args->dp;
+       mp = dp->i_mount;
+       hdr = bp->b_addr;
+       btp = xfs_dir2_block_tail_p(args->geo, hdr);
+       blp = xfs_dir2_block_leaf_p(btp);
+       /*
+        * Point to the data entry we need to change.
+        */
+       dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+                       xfs_dir2_dataptr_to_off(args->geo,
+                                               be32_to_cpu(blp[ent].address)));
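+       /* A replace must actually change the inode number. */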
+       ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
+       /*
+        * Change the inode number to the new value.
+        */
+       dep->inumber = cpu_to_be64(args->inumber);
+       dp->d_ops->data_put_ftype(dep, args->filetype);
+       xfs_dir2_data_log_entry(args, bp, dep);
+       xfs_dir3_data_check(dp, bp);
+       return 0;
+}
+
+/*
+ * Qsort comparison routine for the block leaf entries.
+ */
+static int                                     /* sort order */
+xfs_dir2_block_sort(
+       const void                      *a,     /* first leaf entry */
+       const void                      *b)     /* second leaf entry */
+{
+       const xfs_dir2_leaf_entry_t     *la;    /* first leaf entry */
+       const xfs_dir2_leaf_entry_t     *lb;    /* second leaf entry */
+
+       la = a;
+       lb = b;
+       return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 :
+               (be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0);
+}
+
+/*
+ * Convert a V2 leaf directory to a V2 block directory if possible.
+ */
+int                                            /* error */
+xfs_dir2_leaf_to_block(
+       xfs_da_args_t           *args,          /* operation arguments */
+       struct xfs_buf          *lbp,           /* leaf buffer */
+       struct xfs_buf          *dbp)           /* data buffer */
+{
+       __be16                  *bestsp;        /* leaf bests table */
+       xfs_dir2_data_hdr_t     *hdr;           /* block header */
+       xfs_dir2_block_tail_t   *btp;           /* block tail */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       xfs_dir2_data_unused_t  *dup;           /* unused data entry */
+       int                     error;          /* error return value */
+       int                     from;           /* leaf from index */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+       xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
+       xfs_mount_t             *mp;            /* file system mount point */
+       int                     needlog;        /* need to log data header */
+       int                     needscan;       /* need to scan for bestfree */
+       xfs_dir2_sf_hdr_t       sfh;            /* shortform header */
+       int                     size;           /* bytes used */
+       __be16                  *tagp;          /* end of entry (tag) */
+       int                     to;             /* block/leaf to index */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       struct xfs_dir2_leaf_entry *ents;
+       struct xfs_dir3_icleaf_hdr leafhdr;
+
+       trace_xfs_dir2_leaf_to_block(args);
+
+       dp = args->dp;
+       tp = args->trans;
+       mp = dp->i_mount;
+       leaf = lbp->b_addr;
+       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+       ents = dp->d_ops->leaf_ents_p(leaf);
+       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+
+       ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
+              leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
+       /*
+        * If there are data blocks other than the first one, take this
+        * opportunity to remove trailing empty data blocks that may have
+        * been left behind during no-space-reservation operations.
+        * These will show up in the leaf bests table.
+        */
+       while (dp->i_d.di_size > args->geo->blksize) {
+               int hdrsz;
+
+               hdrsz = dp->d_ops->data_entry_offset;
+               bestsp = xfs_dir2_leaf_bests_p(ltp);
+               if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
+                                           args->geo->blksize - hdrsz) {
+                       if ((error =
+                           xfs_dir2_leaf_trim_data(args, lbp,
+                                   (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
+                               return error;
+               } else
+                       return 0;
+       }
+       /*
+        * Read the data block if we don't already have it, give up if it fails.
+        */
+       if (!dbp) {
+               error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp);
+               if (error)
+                       return error;
+       }
+       hdr = dbp->b_addr;
+       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+
+       /*
+        * Size of the "leaf" area in the block: the block tail plus one
+        * leaf entry for each non-stale leaf entry.
+        */
+       size = (uint)sizeof(xfs_dir2_block_tail_t) +
+              (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale);
+       /*
+        * Look at the last data entry.
+        */
+       tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1;
+       dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+       /*
+        * If it's not free or is too short we can't do it.
+        */
+       if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG ||
+           be16_to_cpu(dup->length) < size)
+               return 0;
+
+       /*
+        * Start converting it to block form.
+        */
+       xfs_dir3_block_init(mp, tp, dbp, dp);
+
+       needlog = 1;
+       needscan = 0;
+       /*
+        * Use up the space at the end of the block (blp/btp).
+        */
+       xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size,
+               &needlog, &needscan);
+       /*
+        * Initialize the block tail.
+        */
+       btp = xfs_dir2_block_tail_p(args->geo, hdr);
+       btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
+       btp->stale = 0;
+       xfs_dir2_block_log_tail(tp, dbp);
+       /*
+        * Initialize the block leaf area.  We compact out stale entries.
+        */
+       lep = xfs_dir2_block_leaf_p(btp);
+       for (from = to = 0; from < leafhdr.count; from++) {
+               if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+                       continue;
+               lep[to++] = ents[from];
+       }
+       ASSERT(to == be32_to_cpu(btp->count));
+       xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
+       /*
+        * Scan the bestfree if we need it and log the data block header.
+        */
+       if (needscan)
+               xfs_dir2_data_freescan(dp, hdr, &needlog);
+       if (needlog)
+               xfs_dir2_data_log_header(args, dbp);
+       /*
+        * Pitch the old leaf block.
+        */
+       error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp);
+       if (error)
+               return error;
+
+       /*
+        * Now see if the resulting block can be shrunken to shortform.
+        */
+       size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+       if (size > XFS_IFORK_DSIZE(dp))
+               return 0;
+
+       return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
+}
+
+/*
+ * Convert the shortform directory to block form.
+ */
+int                                            /* error */
+xfs_dir2_sf_to_block(
+       xfs_da_args_t           *args)          /* operation arguments */
+{
+       xfs_dir2_db_t           blkno;          /* dir-relative block # (0) */
+       xfs_dir2_data_hdr_t     *hdr;           /* block header */
+       xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
+       struct xfs_buf          *bp;            /* block buffer */
+       xfs_dir2_block_tail_t   *btp;           /* block tail pointer */
+       xfs_dir2_data_entry_t   *dep;           /* data entry pointer */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     dummy;          /* trash */
+       xfs_dir2_data_unused_t  *dup;           /* unused entry pointer */
+       int                     endoffset;      /* end of data objects */
+       int                     error;          /* error return value */
+       int                     i;              /* index */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       int                     needlog;        /* need to log block header */
+       int                     needscan;       /* need to scan block freespc */
+       int                     newoffset;      /* offset from current entry */
+       int                     offset;         /* target block offset */
+       xfs_dir2_sf_entry_t     *sfep;          /* sf entry pointer */
+       xfs_dir2_sf_hdr_t       *oldsfp;        /* old shortform header  */
+       xfs_dir2_sf_hdr_t       *sfp;           /* shortform header  */
+       __be16                  *tagp;          /* end of data entry */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       struct xfs_name         name;
+       struct xfs_ifork        *ifp;
+
+       trace_xfs_dir2_sf_to_block(args);
+
+       dp = args->dp;
+       tp = args->trans;
+       mp = dp->i_mount;
+       ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+       ASSERT(ifp->if_flags & XFS_IFINLINE);
+       /*
+        * Bomb out if the shortform directory is way too short.
+        */
+       if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+               ASSERT(XFS_FORCED_SHUTDOWN(mp));
+               return -EIO;
+       }
+
+       oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
+
+       ASSERT(ifp->if_bytes == dp->i_d.di_size);
+       ASSERT(ifp->if_u1.if_data != NULL);
+       ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
+       ASSERT(dp->i_d.di_nextents == 0);
+
+       /*
+        * Copy the directory into a temporary buffer.
+        * Then pitch the incore inode data so we can make extents.
+        */
+       sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
+       memcpy(sfp, oldsfp, ifp->if_bytes);
+
+       xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
+       xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
+       dp->i_d.di_size = 0;
+
+       /*
+        * Add block 0 to the inode.
+        */
+       error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
+       if (error) {
+               kmem_free(sfp);
+               return error;
+       }
+       /*
+        * Initialize the data block, then convert it to block format.
+        */
+       error = xfs_dir3_data_init(args, blkno, &bp);
+       if (error) {
+               kmem_free(sfp);
+               return error;
+       }
+       xfs_dir3_block_init(mp, tp, bp, dp);
+       hdr = bp->b_addr;
+
+       /*
+        * Compute size of block "tail" area.
+        */
+       i = (uint)sizeof(*btp) +
+           (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
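+       /* The "+ 2" leaves room for the "." and ".." leaf entries. */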
+       /*
+        * The whole thing is initialized to free by the init routine.
+        * Say we're using the leaf and tail area.
+        */
+       dup = dp->d_ops->data_unused_p(hdr);
+       needlog = needscan = 0;
+       xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
+                              i, &needlog, &needscan);
+       ASSERT(needscan == 0);
+       /*
+        * Fill in the tail.
+        */
+       btp = xfs_dir2_block_tail_p(args->geo, hdr);
+       btp->count = cpu_to_be32(sfp->count + 2);       /* ., .. */
+       btp->stale = 0;
+       blp = xfs_dir2_block_leaf_p(btp);
+       endoffset = (uint)((char *)blp - (char *)hdr);
+       /*
+        * Remove the freespace; we'll manage it ourselves.
+        */
+       xfs_dir2_data_use_free(args, bp, dup,
+               (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+               be16_to_cpu(dup->length), &needlog, &needscan);
+       /*
+        * Create entry for .
+        */
+       dep = dp->d_ops->data_dot_entry_p(hdr);
+       dep->inumber = cpu_to_be64(dp->i_ino);
+       dep->namelen = 1;
+       dep->name[0] = '.';
+       dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+       tagp = dp->d_ops->data_entry_tag_p(dep);
+       *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+       xfs_dir2_data_log_entry(args, bp, dep);
+       blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
+       blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+                               (char *)dep - (char *)hdr));
+       /*
+        * Create entry for ..
+        */
+       dep = dp->d_ops->data_dotdot_entry_p(hdr);
+       dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp));
+       dep->namelen = 2;
+       dep->name[0] = dep->name[1] = '.';
+       dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+       tagp = dp->d_ops->data_entry_tag_p(dep);
+       *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+       xfs_dir2_data_log_entry(args, bp, dep);
+       blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
+       blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+                               (char *)dep - (char *)hdr));
+       offset = dp->d_ops->data_first_offset;
+       /*
+        * Loop over existing entries, stuff them in.
+        */
+       i = 0;
+       if (!sfp->count)
+               sfep = NULL;
+       else
+               sfep = xfs_dir2_sf_firstentry(sfp);
+       /*
+        * Need to preserve the existing offset values in the sf directory.
+        * Insert holes (unused entries) where necessary.
+        */
+       while (offset < endoffset) {
+               /*
+                * sfep is null when we reach the end of the list.
+                */
+               if (sfep == NULL)
+                       newoffset = endoffset;
+               else
+                       newoffset = xfs_dir2_sf_get_offset(sfep);
+               /*
+                * There should be a hole here, make one.
+                */
+               if (offset < newoffset) {
+                       dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+                       dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+                       dup->length = cpu_to_be16(newoffset - offset);
+                       *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
+                               ((char *)dup - (char *)hdr));
+                       xfs_dir2_data_log_unused(args, bp, dup);
+                       xfs_dir2_data_freeinsert(hdr,
+                                                dp->d_ops->data_bestfree_p(hdr),
+                                                dup, &dummy);
+                       offset += be16_to_cpu(dup->length);
+                       continue;
+               }
+               /*
+                * Copy a real entry.
+                */
+               dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
+               dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep));
+               dep->namelen = sfep->namelen;
+               dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep));
+               memcpy(dep->name, sfep->name, dep->namelen);
+               tagp = dp->d_ops->data_entry_tag_p(dep);
+               *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+               xfs_dir2_data_log_entry(args, bp, dep);
+               name.name = sfep->name;
+               name.len = sfep->namelen;
+               blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
+                                                       hashname(&name));
+               blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+                                                (char *)dep - (char *)hdr));
+               offset = (int)((char *)(tagp + 1) - (char *)hdr);
+               if (++i == sfp->count)
+                       sfep = NULL;
+               else
+                       sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+       }
+       /* Done with the temporary buffer */
+       kmem_free(sfp);
+       /*
+        * Sort the leaf entries by hash value.
+        */
+       xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort);
+       /*
+        * Log the leaf entry area and tail.
+        * Already logged the header in data_init, ignore needlog.
+        */
+       ASSERT(needscan == 0);
+       xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
+       xfs_dir2_block_log_tail(tp, bp);
+       xfs_dir3_data_check(dp, bp);
+       return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
new file mode 100644 (file)
index 0000000..fdd803f
--- /dev/null
@@ -0,0 +1,1050 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+
+/*
+ * Check the consistency of the data block.
+ * The input can also be a block-format directory.
+ * Return 0 if the buffer is good, otherwise an error.
+ */
+int
+__xfs_dir3_data_check(
+       struct xfs_inode        *dp,            /* incore inode pointer */
+       struct xfs_buf          *bp)            /* data block's buffer */
+{
+       xfs_dir2_dataptr_t      addr;           /* addr for leaf lookup */
+       xfs_dir2_data_free_t    *bf;            /* bestfree table */
+       xfs_dir2_block_tail_t   *btp=NULL;      /* block tail */
+       int                     count;          /* count of entries found */
+       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+       xfs_dir2_data_entry_t   *dep;           /* data entry */
+       xfs_dir2_data_free_t    *dfp;           /* bestfree entry */
+       xfs_dir2_data_unused_t  *dup;           /* unused entry */
+       char                    *endp;          /* end of useful data */
+       int                     freeseen;       /* mask of bestfrees seen */
+       xfs_dahash_t            hash;           /* hash of current name */
+       int                     i;              /* leaf index */
+       int                     lastfree;       /* last entry was unused */
+       xfs_dir2_leaf_entry_t   *lep=NULL;      /* block leaf entries */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       char                    *p;             /* current data position */
+       int                     stale;          /* count of stale leaves */
+       struct xfs_name         name;
+       const struct xfs_dir_ops *ops;
+       struct xfs_da_geometry  *geo;
+
+       mp = bp->b_target->bt_mount;
+       geo = mp->m_dir_geo;
+
+       /*
+        * We can be passed a null dp here from a verifier, so we have to
+        * go the hard way to get the dir ops.
+        */
+       ops = xfs_dir_get_ops(mp, dp);
+
+       hdr = bp->b_addr;
+       p = (char *)ops->data_entry_p(hdr);
+
+       switch (hdr->magic) {
+       case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+       case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+               btp = xfs_dir2_block_tail_p(geo, hdr);
+               lep = xfs_dir2_block_leaf_p(btp);
+               endp = (char *)lep;
+
+               /*
+                * The number of leaf entries is limited by the size of the
+                * block and the amount of space used by the data entries.
+                * We don't know how much space is used by the data entries yet,
+                * so just ensure that the count falls somewhere inside the
+                * block right now.
+                */
+               XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) <
+                       ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
+               break;
+       case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+       case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+               endp = (char *)hdr + geo->blksize;
+               break;
+       default:
+               XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
+               return -EFSCORRUPTED;
+       }
+
+       /*
+        * Account for zero bestfree entries.
+        */
+       bf = ops->data_bestfree_p(hdr);
+       count = lastfree = freeseen = 0;
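+       /* Bit i of freeseen is set once bestfree slot i is accounted for. */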
+       if (!bf[0].length) {
+               XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
+               freeseen |= 1 << 0;
+       }
+       if (!bf[1].length) {
+               XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
+               freeseen |= 1 << 1;
+       }
+       if (!bf[2].length) {
+               XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
+               freeseen |= 1 << 2;
+       }
+
+       XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
+                                               be16_to_cpu(bf[1].length));
+       XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
+                                               be16_to_cpu(bf[2].length));
+       /*
+        * Loop over the data/unused entries.
+        */
+       while (p < endp) {
+               dup = (xfs_dir2_data_unused_t *)p;
+               /*
+                * If it's unused, look for the space in the bestfree table.
+                * If we find it, account for that, else make sure it
+                * doesn't need to be there.
+                */
+               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+                       XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
+                       XFS_WANT_CORRUPTED_RETURN(
+                               be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
+                                              (char *)dup - (char *)hdr);
+                       dfp = xfs_dir2_data_freefind(hdr, bf, dup);
+                       if (dfp) {
+                               i = (int)(dfp - bf);
+                               XFS_WANT_CORRUPTED_RETURN(
+                                       (freeseen & (1 << i)) == 0);
+                               freeseen |= 1 << i;
+                       } else {
+                               XFS_WANT_CORRUPTED_RETURN(
+                                       be16_to_cpu(dup->length) <=
+                                               be16_to_cpu(bf[2].length));
+                       }
+                       p += be16_to_cpu(dup->length);
+                       lastfree = 1;
+                       continue;
+               }
+               /*
+                * It's a real entry.  Validate the fields.
+                * If this is a block directory then make sure it's
+                * in the leaf section of the block.
+                * The linear search is crude but this is DEBUG code.
+                */
+               dep = (xfs_dir2_data_entry_t *)p;
+               XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
+               XFS_WANT_CORRUPTED_RETURN(
+                       !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
+               XFS_WANT_CORRUPTED_RETURN(
+                       be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
+                                              (char *)dep - (char *)hdr);
+               XFS_WANT_CORRUPTED_RETURN(
+                               ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
+               count++;
+               lastfree = 0;
+               if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+                   hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+                       addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+                                               (xfs_dir2_data_aoff_t)
+                                               ((char *)dep - (char *)hdr));
+                       name.name = dep->name;
+                       name.len = dep->namelen;
+                       hash = mp->m_dirnameops->hashname(&name);
+                       for (i = 0; i < be32_to_cpu(btp->count); i++) {
+                               if (be32_to_cpu(lep[i].address) == addr &&
+                                   be32_to_cpu(lep[i].hashval) == hash)
+                                       break;
+                       }
+                       XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
+               }
+               p += ops->data_entsize(dep->namelen);
+       }
+       /*
+        * Need to have seen all the entries and all the bestfree slots.
+        */
+       XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
+       if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+           hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+               for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
+                       if (lep[i].address ==
+                           cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+                               stale++;
+                       if (i > 0)
+                               XFS_WANT_CORRUPTED_RETURN(
+                                       be32_to_cpu(lep[i].hashval) >=
+                                               be32_to_cpu(lep[i - 1].hashval));
+               }
+               XFS_WANT_CORRUPTED_RETURN(count ==
+                       be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
+               XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
+       }
+       return 0;
+}
+
+static bool
+xfs_dir3_data_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+                       return false;
+               if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+                       return false;
+               if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+                       return false;
+       } else {
+               if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
+                       return false;
+       }
+       if (__xfs_dir3_data_check(NULL, bp))
+               return false;
+       return true;
+}
+
+/*
+ * Readahead of the first block of the directory when it is opened is completely
+ * oblivious to the format of the directory. Hence we can either get a block
+ * format buffer or a data format buffer on readahead.
+ */
+static void
+xfs_dir3_data_reada_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+       switch (hdr->magic) {
+       case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+       case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
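+               /* Block format: switch verifiers and re-verify. */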
+               bp->b_ops = &xfs_dir3_block_buf_ops;
+               bp->b_ops->verify_read(bp);
+               return;
+       case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+       case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+               xfs_dir3_data_verify(bp);
+               return;
+       default:
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               break;
+       }
+}
+
+static void
+xfs_dir3_data_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb) &&
+            !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+                xfs_buf_ioerror(bp, -EFSBADCRC);
+       else if (!xfs_dir3_data_verify(bp))
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+       if (bp->b_error)
+               xfs_verifier_error(bp);
+}
+
+static void
+xfs_dir3_data_write_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+       struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+       if (!xfs_dir3_data_verify(bp)) {
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               return;
+       }
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
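+       /* Stamp the buffer's last-modification LSN before the CRC update. */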
+       if (bip)
+               hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+       xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
+       .verify_read = xfs_dir3_data_read_verify,
+       .verify_write = xfs_dir3_data_write_verify,
+};
+
+static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
+       .verify_read = xfs_dir3_data_reada_verify,
+       .verify_write = xfs_dir3_data_write_verify,
+};
+
+
+int
+xfs_dir3_data_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             bno,
+       xfs_daddr_t             mapped_bno,
+       struct xfs_buf          **bpp)
+{
+       int                     err;
+
+       err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
+                               XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
+       if (!err && tp)
+               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
+       return err;
+}
+
+int
+xfs_dir3_data_readahead(
+       struct xfs_inode        *dp,
+       xfs_dablk_t             bno,
+       xfs_daddr_t             mapped_bno)
+{
+       return xfs_da_reada_buf(dp, bno, mapped_bno,
+                               XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops);
+}
+
+/*
+ * Given a data block and an unused entry from that block,
+ * return the bestfree entry if any that corresponds to it.
+ */
+xfs_dir2_data_free_t *
+xfs_dir2_data_freefind(
+       struct xfs_dir2_data_hdr *hdr,          /* data block header */
+       struct xfs_dir2_data_free *bf,          /* bestfree table pointer */
+       struct xfs_dir2_data_unused *dup)       /* unused space */
+{
+       xfs_dir2_data_free_t    *dfp;           /* bestfree entry */
+       xfs_dir2_data_aoff_t    off;            /* offset value needed */
+#ifdef DEBUG
+       int                     matched;        /* matched the value */
+       int                     seenzero;       /* saw a 0 bestfree entry */
+#endif
+
+       off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+
+#ifdef DEBUG
+       /*
+        * Validate some consistency in the bestfree table.
+        * Check order, non-overlapping entries, and if we find the
+        * one we're looking for it has to be exact.
+        */
+       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+       for (dfp = &bf[0], seenzero = matched = 0;
+            dfp < &bf[XFS_DIR2_DATA_FD_COUNT];
+            dfp++) {
+               if (!dfp->offset) {
+                       ASSERT(!dfp->length);
+                       seenzero = 1;
+                       continue;
+               }
+               ASSERT(seenzero == 0);
+               if (be16_to_cpu(dfp->offset) == off) {
+                       matched = 1;
+                       ASSERT(dfp->length == dup->length);
+               } else if (off < be16_to_cpu(dfp->offset))
+                       ASSERT(off + be16_to_cpu(dup->length) <= be16_to_cpu(dfp->offset));
+               else
+                       ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off);
+               ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length));
+               if (dfp > &bf[0])
+                       ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length));
+       }
+#endif
+       /*
+        * If this is smaller than the smallest bestfree entry,
+        * it can't be there since they're sorted.
+        */
+       if (be16_to_cpu(dup->length) <
+           be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
+               return NULL;
+       /*
+        * Look at the three bestfree entries for our guy.
+        */
+       for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
+               if (!dfp->offset)
+                       return NULL;
+               if (be16_to_cpu(dfp->offset) == off)
+                       return dfp;
+       }
+       /*
+        * Didn't find it.  This only happens if there are duplicate lengths.
+        */
+       return NULL;
+}
+
+/*
+ * Insert an unused-space entry into the bestfree table.
+ */
+xfs_dir2_data_free_t *                         /* entry inserted */
+xfs_dir2_data_freeinsert(
+       struct xfs_dir2_data_hdr *hdr,          /* data block pointer */
+       struct xfs_dir2_data_free *dfp,         /* bestfree table pointer */
+       struct xfs_dir2_data_unused *dup,       /* unused space */
+       int                     *loghead)       /* log the data header (out) */
+{
+       xfs_dir2_data_free_t    new;            /* new bestfree entry */
+
+       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+       new.length = dup->length;
+       new.offset = cpu_to_be16((char *)dup - (char *)hdr);
+
+       /*
+        * Insert at position 0, 1, or 2; or not at all.
+        */
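+       /* The table is kept sorted by length, largest entry first. */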
+       if (be16_to_cpu(new.length) > be16_to_cpu(dfp[0].length)) {
+               dfp[2] = dfp[1];
+               dfp[1] = dfp[0];
+               dfp[0] = new;
+               *loghead = 1;
+               return &dfp[0];
+       }
+       if (be16_to_cpu(new.length) > be16_to_cpu(dfp[1].length)) {
+               dfp[2] = dfp[1];
+               dfp[1] = new;
+               *loghead = 1;
+               return &dfp[1];
+       }
+       if (be16_to_cpu(new.length) > be16_to_cpu(dfp[2].length)) {
+               dfp[2] = new;
+               *loghead = 1;
+               return &dfp[2];
+       }
+       return NULL;
+}
+
+/*
+ * Remove a bestfree entry from the table.
+ */
+STATIC void
+xfs_dir2_data_freeremove(
+       struct xfs_dir2_data_hdr *hdr,          /* data block header */
+       struct xfs_dir2_data_free *bf,          /* bestfree table pointer */
+       struct xfs_dir2_data_free *dfp,         /* bestfree entry pointer */
+       int                     *loghead)       /* out: log data header */
+{
+
+       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+       /*
+        * It's the first entry, slide the next 2 up.
+        */
+       if (dfp == &bf[0]) {
+               bf[0] = bf[1];
+               bf[1] = bf[2];
+       }
+       /*
+        * It's the second entry, slide the 3rd entry up.
+        */
+       else if (dfp == &bf[1])
+               bf[1] = bf[2];
+       /*
+        * Must be the last entry.
+        */
+       else
+               ASSERT(dfp == &bf[2]);
+       /*
+        * Clear the 3rd entry; it must be zero now.
+        */
+       bf[2].length = 0;
+       bf[2].offset = 0;
+       *loghead = 1;
+}
+
+/*
+ * Given a data block, reconstruct its bestfree map.
+ */
+void
+xfs_dir2_data_freescan(
+       struct xfs_inode        *dp,
+       struct xfs_dir2_data_hdr *hdr,
+       int                     *loghead)
+{
+       xfs_dir2_block_tail_t   *btp;           /* block tail */
+       xfs_dir2_data_entry_t   *dep;           /* active data entry */
+       xfs_dir2_data_unused_t  *dup;           /* unused data entry */
+       struct xfs_dir2_data_free *bf;
+       char                    *endp;          /* end of block's data */
+       char                    *p;             /* current entry pointer */
+       struct xfs_da_geometry  *geo = dp->i_mount->m_dir_geo;
+
+       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+       /*
+        * Start by clearing the table.
+        */
+       bf = dp->d_ops->data_bestfree_p(hdr);
+       memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
+       *loghead = 1;
+       /*
+        * Set up pointers.
+        */
+       p = (char *)dp->d_ops->data_entry_p(hdr);
+       if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+           hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+               btp = xfs_dir2_block_tail_p(geo, hdr);
+               endp = (char *)xfs_dir2_block_leaf_p(btp);
+       } else
+               endp = (char *)hdr + geo->blksize;
+       /*
+        * Loop over the block's entries.
+        */
+       while (p < endp) {
+               dup = (xfs_dir2_data_unused_t *)p;
+               /*
+                * If it's a free entry, insert it.
+                */
+               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+                       ASSERT((char *)dup - (char *)hdr ==
+                              be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+                       xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
+                       p += be16_to_cpu(dup->length);
+               }
+               /*
+                * For active entries, check their tags and skip them.
+                */
+               else {
+                       dep = (xfs_dir2_data_entry_t *)p;
+                       ASSERT((char *)dep - (char *)hdr ==
+                              be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep)));
+                       p += dp->d_ops->data_entsize(dep->namelen);
+               }
+       }
+}
+
+/*
+ * Initialize a data block at the given block number in the directory.
+ * Give back the buffer for the created block.
+ */
+int                                            /* error */
+xfs_dir3_data_init(
+       xfs_da_args_t           *args,          /* directory operation args */
+       xfs_dir2_db_t           blkno,          /* logical dir block number */
+       struct xfs_buf          **bpp)          /* output block buffer */
+{
+       struct xfs_buf          *bp;            /* block buffer */
+       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       xfs_dir2_data_unused_t  *dup;           /* unused entry pointer */
+       struct xfs_dir2_data_free *bf;
+       int                     error;          /* error return value */
+       int                     i;              /* bestfree index */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       int                     t;              /* temp */
+
+       dp = args->dp;
+       mp = dp->i_mount;
+       tp = args->trans;
+       /*
+        * Get the buffer set up for the block.
+        */
+       error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
+                              -1, &bp, XFS_DATA_FORK);
+       if (error)
+               return error;
+       bp->b_ops = &xfs_dir3_data_buf_ops;
+       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF);
+
+       /*
+        * Initialize the header.
+        */
+       hdr = bp->b_addr;
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+               memset(hdr3, 0, sizeof(*hdr3));
+               hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+               hdr3->blkno = cpu_to_be64(bp->b_bn);
+               hdr3->owner = cpu_to_be64(dp->i_ino);
+               uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+       } else
+               hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+
+       bf = dp->d_ops->data_bestfree_p(hdr);
+       bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset);
+       for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
+               bf[i].length = 0;
+               bf[i].offset = 0;
+       }
+
+       /*
+        * Set up an unused entry for the block's body.
+        */
+       dup = dp->d_ops->data_unused_p(hdr);
+       dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+
+       t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset;
+       bf[0].length = cpu_to_be16(t);
+       dup->length = cpu_to_be16(t);
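+       /*
+        * The tag at the end of the unused entry points back at the
+        * entry's start, so the block can also be walked backwards.
+        */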
+       *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
+       /*
+        * Log it and return it.
+        */
+       xfs_dir2_data_log_header(args, bp);
+       xfs_dir2_data_log_unused(args, bp, dup);
+       *bpp = bp;
+       return 0;
+}
+
+/*
+ * Log an active data entry from the block.
+ */
+void
+xfs_dir2_data_log_entry(
+       struct xfs_da_args      *args,
+       struct xfs_buf          *bp,
+       xfs_dir2_data_entry_t   *dep)           /* data entry pointer */
+{
+       struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
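+       /*
+        * Log from the start of the entry through the last byte of its
+        * tag, i.e. the entry's full on-disk footprint.
+        */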
+       xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr),
+               (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) -
+                      (char *)hdr - 1));
+}
+
+/*
+ * Log a data block header.
+ */
+void
+xfs_dir2_data_log_header(
+       struct xfs_da_args      *args,
+       struct xfs_buf          *bp)
+{
+#ifdef DEBUG
+       struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+#endif
+
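+       /*
+        * The header, including the bestfree table, occupies the first
+        * data_entry_offset bytes of the block; log exactly that range.
+        */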
+       xfs_trans_log_buf(args->trans, bp, 0,
+                         args->dp->d_ops->data_entry_offset - 1);
+}
+
+/*
+ * Log a data unused entry.
+ */
+void
+xfs_dir2_data_log_unused(
+       struct xfs_da_args      *args,
+       struct xfs_buf          *bp,
+       xfs_dir2_data_unused_t  *dup)           /* data unused pointer */
+{
+       xfs_dir2_data_hdr_t     *hdr = bp->b_addr;
+
+       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+       /*
+        * Log the first part of the unused entry.
+        */
+       xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr),
+               (uint)((char *)&dup->length + sizeof(dup->length) -
+                      1 - (char *)hdr));
+       /*
+        * Log the end (tag) of the unused entry.
+        */
+       xfs_trans_log_buf(args->trans, bp,
+               (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr),
+               (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr +
+                      sizeof(xfs_dir2_data_off_t) - 1));
+}
+
+/*
+ * Make a byte range in the data block unused.
+ * Its current contents are unimportant.
+ */
+void
+xfs_dir2_data_make_free(
+       struct xfs_da_args      *args,
+       struct xfs_buf          *bp,
+       xfs_dir2_data_aoff_t    offset,         /* starting byte offset */
+       xfs_dir2_data_aoff_t    len,            /* length in bytes */
+       int                     *needlogp,      /* out: log header */
+       int                     *needscanp)     /* out: regen bestfree */
+{
+       xfs_dir2_data_hdr_t     *hdr;           /* data block pointer */
+       xfs_dir2_data_free_t    *dfp;           /* bestfree pointer */
+       char                    *endptr;        /* end of data area */
+       int                     needscan;       /* need to regen bestfree */
+       xfs_dir2_data_unused_t  *newdup;        /* new unused entry */
+       xfs_dir2_data_unused_t  *postdup;       /* unused entry after us */
+       xfs_dir2_data_unused_t  *prevdup;       /* unused entry before us */
+       struct xfs_dir2_data_free *bf;
+
+       hdr = bp->b_addr;
+
+       /*
+        * Figure out where the end of the data area is.
+        */
+       if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+           hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+               endptr = (char *)hdr + args->geo->blksize;
+       else {
+               xfs_dir2_block_tail_t   *btp;   /* block tail */
+
+               ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+                       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+               btp = xfs_dir2_block_tail_p(args->geo, hdr);
+               endptr = (char *)xfs_dir2_block_leaf_p(btp);
+       }
+       /*
+        * If this isn't the start of the block, then back up to
+        * the previous entry and see if it's free.
+        */
+       if (offset > args->dp->d_ops->data_entry_offset) {
+               __be16                  *tagp;  /* tag just before us */
+
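+               /*
+                * The previous entry's tag sits in the last two bytes
+                * before our offset and holds that entry's starting
+                * offset; the freetag there tells us whether it's free.
+                */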
+               tagp = (__be16 *)((char *)hdr + offset) - 1;
+               prevdup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+               if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+                       prevdup = NULL;
+       } else
+               prevdup = NULL;
+       /*
+        * If this isn't the end of the block, see if the entry after
+        * us is free.
+        */
+       if ((char *)hdr + offset + len < endptr) {
+               postdup =
+                       (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+               if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+                       postdup = NULL;
+       } else
+               postdup = NULL;
+       ASSERT(*needscanp == 0);
+       needscan = 0;
+       /*
+        * Previous and following entries are both free,
+        * merge everything into a single free entry.
+        */
+       bf = args->dp->d_ops->data_bestfree_p(hdr);
+       if (prevdup && postdup) {
+               xfs_dir2_data_free_t    *dfp2;  /* another bestfree pointer */
+
+               /*
+                * See if prevdup and/or postdup are in bestfree table.
+                */
+               dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+               dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup);
+               /*
+                * We need a rescan unless the table holds exactly two free
+                * entries, namely our two.  Then we know exactly what is in
+                * the table; otherwise the third bestfree slot is occupied,
+                * so there may be other free regions we don't know about.
+                */
+               needscan = (bf[2].length != 0);
+               /*
+                * Fix up the new big freespace.
+                */
+               be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
+               *xfs_dir2_data_unused_tag_p(prevdup) =
+                       cpu_to_be16((char *)prevdup - (char *)hdr);
+               xfs_dir2_data_log_unused(args, bp, prevdup);
+               if (!needscan) {
+                       /*
+                        * Has to be the case that entries 0 and 1 are
+                        * dfp and dfp2 (don't know which is which), and
+                        * entry 2 is empty.
+                        * Remove entry 1 first then entry 0.
+                        */
+                       ASSERT(dfp && dfp2);
+                       if (dfp == &bf[1]) {
+                               dfp = &bf[0];
+                               ASSERT(dfp2 == dfp);
+                               dfp2 = &bf[1];
+                       }
+                       xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp);
+                       xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+                       /*
+                        * Now insert the new entry.
+                        */
+                       dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup,
+                                                      needlogp);
+                       ASSERT(dfp == &bf[0]);
+                       ASSERT(dfp->length == prevdup->length);
+                       ASSERT(!dfp[1].length);
+                       ASSERT(!dfp[2].length);
+               }
+       }
+       /*
+        * The entry before us is free, merge with it.
+        */
+       else if (prevdup) {
+               dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+               be16_add_cpu(&prevdup->length, len);
+               *xfs_dir2_data_unused_tag_p(prevdup) =
+                       cpu_to_be16((char *)prevdup - (char *)hdr);
+               xfs_dir2_data_log_unused(args, bp, prevdup);
+               /*
+                * If the previous entry was in the table, the new entry
+                * is longer, so it will be in the table too.  Remove
+                * the old one and add the new one.
+                */
+               if (dfp) {
+                       xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+                       xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp);
+               }
+               /*
+                * Otherwise we need a scan if the new entry is big enough.
+                */
+               else {
+                       needscan = be16_to_cpu(prevdup->length) >
+                                  be16_to_cpu(bf[2].length);
+               }
+       }
+       /*
+        * The following entry is free, merge with it.
+        */
+       else if (postdup) {
+               dfp = xfs_dir2_data_freefind(hdr, bf, postdup);
+               newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+               newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+               newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
+               *xfs_dir2_data_unused_tag_p(newdup) =
+                       cpu_to_be16((char *)newdup - (char *)hdr);
+               xfs_dir2_data_log_unused(args, bp, newdup);
+               /*
+                * If the following entry was in the table, the new entry
+                * is longer, so it will be in the table too.  Remove
+                * the old one and add the new one.
+                */
+               if (dfp) {
+                       xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+                       xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
+               }
+               /*
+                * Otherwise we need a scan if the new entry is big enough.
+                */
+               else {
+                       needscan = be16_to_cpu(newdup->length) >
+                                  be16_to_cpu(bf[2].length);
+               }
+       }
+       /*
+        * Neither neighbor is free.  Make a new entry.
+        */
+       else {
+               newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+               newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+               newdup->length = cpu_to_be16(len);
+               *xfs_dir2_data_unused_tag_p(newdup) =
+                       cpu_to_be16((char *)newdup - (char *)hdr);
+               xfs_dir2_data_log_unused(args, bp, newdup);
+               xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
+       }
+       *needscanp = needscan;
+}
+
+/*
+ * Take a byte range out of an existing unused space and make it un-free.
+ */
+void
+xfs_dir2_data_use_free(
+       struct xfs_da_args      *args,
+       struct xfs_buf          *bp,
+       xfs_dir2_data_unused_t  *dup,           /* unused entry */
+       xfs_dir2_data_aoff_t    offset,         /* starting offset to use */
+       xfs_dir2_data_aoff_t    len,            /* length to use */
+       int                     *needlogp,      /* out: need to log header */
+       int                     *needscanp)     /* out: need regen bestfree */
+{
+       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+       xfs_dir2_data_free_t    *dfp;           /* bestfree pointer */
+       int                     matchback;      /* matches end of freespace */
+       int                     matchfront;     /* matches start of freespace */
+       int                     needscan;       /* need to regen bestfree */
+       xfs_dir2_data_unused_t  *newdup;        /* new unused entry */
+       xfs_dir2_data_unused_t  *newdup2;       /* another new unused entry */
+       int                     oldlen;         /* old unused entry's length */
+       struct xfs_dir2_data_free *bf;
+
+       hdr = bp->b_addr;
+       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+       ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
+       ASSERT(offset >= (char *)dup - (char *)hdr);
+       ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr);
+       ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+       /*
+        * Look up the entry in the bestfree table.
+        */
+       oldlen = be16_to_cpu(dup->length);
+       bf = args->dp->d_ops->data_bestfree_p(hdr);
+       dfp = xfs_dir2_data_freefind(hdr, bf, dup);
+       ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
+       /*
+        * Check for alignment with front and back of the entry.
+        */
+       matchfront = (char *)dup - (char *)hdr == offset;
+       matchback = (char *)dup + oldlen - (char *)hdr == offset + len;
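+       /*
+        * E.g. for a 128-byte free region at offset 64: using offset 64,
+        * len 32 matches the front; offset 160, len 32 matches the back;
+        * offset 96, len 32 matches neither and splits the region.
+        */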
+       ASSERT(*needscanp == 0);
+       needscan = 0;
+       /*
+        * If we matched it exactly we just need to get rid of it from
+        * the bestfree table.
+        */
+       if (matchfront && matchback) {
+               if (dfp) {
+                       needscan = (bf[2].offset != 0);
+                       if (!needscan)
+                               xfs_dir2_data_freeremove(hdr, bf, dfp,
+                                                        needlogp);
+               }
+       }
+       /*
+        * We match the first part of the entry.
+        * Make a new entry with the remaining freespace.
+        */
+       else if (matchfront) {
+               newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+               newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+               newdup->length = cpu_to_be16(oldlen - len);
+               *xfs_dir2_data_unused_tag_p(newdup) =
+                       cpu_to_be16((char *)newdup - (char *)hdr);
+               xfs_dir2_data_log_unused(args, bp, newdup);
+               /*
+                * If it was in the table, remove it and add the new one.
+                */
+               if (dfp) {
+                       xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+                       dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+                                                      needlogp);
+                       ASSERT(dfp != NULL);
+                       ASSERT(dfp->length == newdup->length);
+                       ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
+                       /*
+                        * If the new entry was inserted at the last slot,
+                        * we don't know whether some untracked free region
+                        * would have been a better fit for it.  Rescan.
+                        */
+                       needscan = dfp == &bf[2];
+               }
+       }
+       /*
+        * We match the last part of the entry.
+        * Trim the allocated space off the tail of the entry.
+        */
+       else if (matchback) {
+               newdup = dup;
+               newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+               *xfs_dir2_data_unused_tag_p(newdup) =
+                       cpu_to_be16((char *)newdup - (char *)hdr);
+               xfs_dir2_data_log_unused(args, bp, newdup);
+               /*
+                * If it was in the table, remove it and add the new one.
+                */
+               if (dfp) {
+                       xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+                       dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+                                                      needlogp);
+                       ASSERT(dfp != NULL);
+                       ASSERT(dfp->length == newdup->length);
+                       ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
+                       /*
+                        * If the new entry was inserted at the last slot,
+                        * we don't know whether some untracked free region
+                        * would have been a better fit for it.  Rescan.
+                        */
+                       needscan = dfp == &bf[2];
+               }
+       }
+       /*
+        * Poking out the middle of an entry.
+        * Make two new entries.
+        */
+       else {
+               newdup = dup;
+               newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+               *xfs_dir2_data_unused_tag_p(newdup) =
+                       cpu_to_be16((char *)newdup - (char *)hdr);
+               xfs_dir2_data_log_unused(args, bp, newdup);
+               newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+               newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+               newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
+               *xfs_dir2_data_unused_tag_p(newdup2) =
+                       cpu_to_be16((char *)newdup2 - (char *)hdr);
+               xfs_dir2_data_log_unused(args, bp, newdup2);
+               /*
+                * If the old entry was in the table, we need to rescan
+                * if the third slot was in use, since both new entries
+                * are smaller than the old one.
+                * If no rescan is needed there were at most two entries
+                * in the table, so removing the old one and inserting
+                * the two new ones will work.
+                */
+               if (dfp) {
+                       needscan = (bf[2].length != 0);
+                       if (!needscan) {
+                               xfs_dir2_data_freeremove(hdr, bf, dfp,
+                                                        needlogp);
+                               xfs_dir2_data_freeinsert(hdr, bf, newdup,
+                                                        needlogp);
+                               xfs_dir2_data_freeinsert(hdr, bf, newdup2,
+                                                        needlogp);
+                       }
+               }
+       }
+       *needscanp = needscan;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
new file mode 100644 (file)
index 0000000..a19174e
--- /dev/null
@@ -0,0 +1,1831 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+
+/*
+ * Local function declarations.
+ */
+static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
+                                   int *indexp, struct xfs_buf **dbpp);
+static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args,
+                                   struct xfs_buf *bp, int first, int last);
+static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args,
+                                  struct xfs_buf *bp);
+
+/*
+ * Check the internal consistency of a leaf1 block.
+ * Pop an assert if something is wrong.
+ */
+#ifdef DEBUG
+#define        xfs_dir3_leaf_check(dp, bp) \
+do { \
+       if (!xfs_dir3_leaf1_check((dp), (bp))) \
+               ASSERT(0); \
+} while (0)
+
+STATIC bool
+xfs_dir3_leaf1_check(
+       struct xfs_inode        *dp,
+       struct xfs_buf          *bp)
+{
+       struct xfs_dir2_leaf    *leaf = bp->b_addr;
+       struct xfs_dir3_icleaf_hdr leafhdr;
+
+       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+       if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
+               struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+               if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+                       return false;
+       } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
+               return false;
+
+       return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+}
+#else
+#define        xfs_dir3_leaf_check(dp, bp)
+#endif
+
+bool
+xfs_dir3_leaf_check_int(
+       struct xfs_mount        *mp,
+       struct xfs_inode        *dp,
+       struct xfs_dir3_icleaf_hdr *hdr,
+       struct xfs_dir2_leaf    *leaf)
+{
+       struct xfs_dir2_leaf_entry *ents;
+       xfs_dir2_leaf_tail_t    *ltp;
+       int                     stale;
+       int                     i;
+       const struct xfs_dir_ops *ops;
+       struct xfs_dir3_icleaf_hdr leafhdr;
+       struct xfs_da_geometry  *geo = mp->m_dir_geo;
+
+       /*
+        * We can be passed a NULL dp here from a verifier, so we need to go
+        * the hard way to get the ops.
+        */
+       ops = xfs_dir_get_ops(mp, dp);
+
+       if (!hdr) {
+               ops->leaf_hdr_from_disk(&leafhdr, leaf);
+               hdr = &leafhdr;
+       }
+
+       ents = ops->leaf_ents_p(leaf);
+       ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+
+       /*
+        * XXX (dgc): This value is not restrictive enough.
+        * Should factor in the size of the bests table as well.
+        * We can deduce a value for that from di_size.
+        */
+       if (hdr->count > ops->leaf_max_ents(geo))
+               return false;
+
+       /* Leaves and bests don't overlap in leaf format. */
+       if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+            hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
+           (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
+               return false;
+
+       /* Check hash value order, count stale entries.  */
+       for (i = stale = 0; i < hdr->count; i++) {
+               if (i + 1 < hdr->count) {
+                       if (be32_to_cpu(ents[i].hashval) >
+                                       be32_to_cpu(ents[i + 1].hashval))
+                               return false;
+               }
+               if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+                       stale++;
+       }
+       if (hdr->stale != stale)
+               return false;
+       return true;
+}
+
+/*
+ * We verify the magic numbers before decoding the leaf header so that on debug
+ * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due
+ * to incorrect magic numbers.
+ */
+static bool
+xfs_dir3_leaf_verify(
+       struct xfs_buf          *bp,
+       __uint16_t              magic)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_dir2_leaf    *leaf = bp->b_addr;
+
+       ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+               __uint16_t              magic3;
+
+               magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
+                                                        : XFS_DIR3_LEAFN_MAGIC;
+
+               if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
+                       return false;
+               if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
+                       return false;
+               if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+                       return false;
+       } else {
+               if (leaf->hdr.info.magic != cpu_to_be16(magic))
+                       return false;
+       }
+
+       return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
+}
+
+static void
+__read_verify(
+       struct xfs_buf  *bp,
+       __uint16_t      magic)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb) &&
+            !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
+               xfs_buf_ioerror(bp, -EFSBADCRC);
+       else if (!xfs_dir3_leaf_verify(bp, magic))
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+       if (bp->b_error)
+               xfs_verifier_error(bp);
+}
+
+static void
+__write_verify(
+       struct xfs_buf  *bp,
+       __uint16_t      magic)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+       struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
+
+       if (!xfs_dir3_leaf_verify(bp, magic)) {
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               return;
+       }
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (bip)
+               hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
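+       /* Compute the CRC last so it covers the freshly stamped LSN. */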
+       xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
+}
+
+static void
+xfs_dir3_leaf1_read_verify(
+       struct xfs_buf  *bp)
+{
+       __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
+}
+
+static void
+xfs_dir3_leaf1_write_verify(
+       struct xfs_buf  *bp)
+{
+       __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
+}
+
+static void
+xfs_dir3_leafn_read_verify(
+       struct xfs_buf  *bp)
+{
+       __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir3_leafn_write_verify(
+       struct xfs_buf  *bp)
+{
+       __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
+}
+
+const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
+       .verify_read = xfs_dir3_leaf1_read_verify,
+       .verify_write = xfs_dir3_leaf1_write_verify,
+};
+
+const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
+       .verify_read = xfs_dir3_leafn_read_verify,
+       .verify_write = xfs_dir3_leafn_write_verify,
+};
+
+static int
+xfs_dir3_leaf_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             fbno,
+       xfs_daddr_t             mappedbno,
+       struct xfs_buf          **bpp)
+{
+       int                     err;
+
+       err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+                               XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
+       if (!err && tp)
+               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
+       return err;
+}
+
+int
+xfs_dir3_leafn_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             fbno,
+       xfs_daddr_t             mappedbno,
+       struct xfs_buf          **bpp)
+{
+       int                     err;
+
+       err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+                               XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
+       if (!err && tp)
+               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
+       return err;
+}
+
+/*
+ * Initialize a new leaf block, leaf1 or leafn magic accepted.
+ */
+static void
+xfs_dir3_leaf_init(
+       struct xfs_mount        *mp,
+       struct xfs_trans        *tp,
+       struct xfs_buf          *bp,
+       xfs_ino_t               owner,
+       __uint16_t              type)
+{
+       struct xfs_dir2_leaf    *leaf = bp->b_addr;
+
+       ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+
+               memset(leaf3, 0, sizeof(*leaf3));
+
+               leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC)
+                                        ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
+                                        : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+               leaf3->info.blkno = cpu_to_be64(bp->b_bn);
+               leaf3->info.owner = cpu_to_be64(owner);
+               uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
+       } else {
+               memset(leaf, 0, sizeof(*leaf));
+               leaf->hdr.info.magic = cpu_to_be16(type);
+       }
+
+       /*
+        * If it's a leaf-format directory, initialize the tail.
+        * The caller is responsible for initializing the bests table.
+        */
+       if (type == XFS_DIR2_LEAF1_MAGIC) {
+               struct xfs_dir2_leaf_tail *ltp;
+
+               ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf);
+               ltp->bestcount = 0;
+               bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+               xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF);
+       } else {
+               bp->b_ops = &xfs_dir3_leafn_buf_ops;
+               xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+       }
+}
+
+int
+xfs_dir3_leaf_get_buf(
+       xfs_da_args_t           *args,
+       xfs_dir2_db_t           bno,
+       struct xfs_buf          **bpp,
+       __uint16_t              magic)
+{
+       struct xfs_inode        *dp = args->dp;
+       struct xfs_trans        *tp = args->trans;
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_buf          *bp;
+       int                     error;
+
+       ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+       ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) &&
+              bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
+
+       error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno),
+                              -1, &bp, XFS_DATA_FORK);
+       if (error)
+               return error;
+
+       xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
+       xfs_dir3_leaf_log_header(args, bp);
+       if (magic == XFS_DIR2_LEAF1_MAGIC)
+               xfs_dir3_leaf_log_tail(args, bp);
+       *bpp = bp;
+       return 0;
+}
+
+/*
+ * Convert a block form directory to a leaf form directory.
+ */
+int                                            /* error */
+xfs_dir2_block_to_leaf(
+       xfs_da_args_t           *args,          /* operation arguments */
+       struct xfs_buf          *dbp)           /* input block's buffer */
+{
+       __be16                  *bestsp;        /* leaf's bestsp entries */
+       xfs_dablk_t             blkno;          /* leaf block's bno */
+       xfs_dir2_data_hdr_t     *hdr;           /* block header */
+       xfs_dir2_leaf_entry_t   *blp;           /* block's leaf entries */
+       xfs_dir2_block_tail_t   *btp;           /* block's tail */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     error;          /* error return code */
+       struct xfs_buf          *lbp;           /* leaf block's buffer */
+       xfs_dir2_db_t           ldb;            /* leaf block's bno */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       xfs_dir2_leaf_tail_t    *ltp;           /* leaf's tail */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       int                     needlog;        /* need to log block header */
+       int                     needscan;       /* need to rescan bestfree */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       struct xfs_dir2_data_free *bf;
+       struct xfs_dir2_leaf_entry *ents;
+       struct xfs_dir3_icleaf_hdr leafhdr;
+
+       trace_xfs_dir2_block_to_leaf(args);
+
+       dp = args->dp;
+       mp = dp->i_mount;
+       tp = args->trans;
+       /*
+        * Add the leaf block to the inode.
+        * This interface will only put blocks in the leaf/node range.
+        * Since that's empty now, we'll get the root (block 0 in range).
+        */
+       error = xfs_da_grow_inode(args, &blkno);
+       if (error)
+               return error;
+       ldb = xfs_dir2_da_to_db(args->geo, blkno);
+       ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET));
+       /*
+        * Initialize the leaf block, get a buffer for it.
+        */
+       error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC);
+       if (error)
+               return error;
+
+       leaf = lbp->b_addr;
+       hdr = dbp->b_addr;
+       xfs_dir3_data_check(dp, dbp);
+       btp = xfs_dir2_block_tail_p(args->geo, hdr);
+       blp = xfs_dir2_block_leaf_p(btp);
+       bf = dp->d_ops->data_bestfree_p(hdr);
+       ents = dp->d_ops->leaf_ents_p(leaf);
+
+       /*
+        * Set the counts in the leaf header.
+        */
+       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+       leafhdr.count = be32_to_cpu(btp->count);
+       leafhdr.stale = be32_to_cpu(btp->stale);
+       dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+       xfs_dir3_leaf_log_header(args, lbp);
+
+       /*
+        * Could compact these but I think we always do the conversion
+        * after squeezing out stale entries.
+        */
+       memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
+       xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1);
+       needscan = 0;
+       needlog = 1;
+       /*
+        * Make the space formerly occupied by the leaf entries and block
+        * tail be free.
+        */
+       xfs_dir2_data_make_free(args, dbp,
+               (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+               (xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize -
+                                      (char *)blp),
+               &needlog, &needscan);
+       /*
+        * Fix up the block header, make it a data block.
+        */
+       dbp->b_ops = &xfs_dir3_data_buf_ops;
+       xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF);
+       if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+               hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+       else
+               hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+
+       if (needscan)
+               xfs_dir2_data_freescan(dp, hdr, &needlog);
+       /*
+        * Set up leaf tail and bests table.
+        */
+       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+       ltp->bestcount = cpu_to_be32(1);
+       bestsp = xfs_dir2_leaf_bests_p(ltp);
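+       /*
+        * bests[] holds, per data block, the length of that block's
+        * longest free region; there is only one data block so far.
+        */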
+       bestsp[0] = bf[0].length;
+       /*
+        * Log the data header and leaf bests table.
+        */
+       if (needlog)
+               xfs_dir2_data_log_header(args, dbp);
+       xfs_dir3_leaf_check(dp, lbp);
+       xfs_dir3_data_check(dp, dbp);
+       xfs_dir3_leaf_log_bests(args, lbp, 0, 0);
+       return 0;
+}
+
+STATIC void
+xfs_dir3_leaf_find_stale(
+       struct xfs_dir3_icleaf_hdr *leafhdr,
+       struct xfs_dir2_leaf_entry *ents,
+       int                     index,
+       int                     *lowstale,
+       int                     *highstale)
+{
+       /*
+        * Find the first stale entry before our index, if any.
+        */
+       for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) {
+               if (ents[*lowstale].address ==
+                   cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+                       break;
+       }
+
+       /*
+        * Find the first stale entry at or after our index, if any.
+        * Stop if the result would require moving more entries than using
+        * lowstale.
+        */
+       for (*highstale = index; *highstale < leafhdr->count; ++*highstale) {
+               if (ents[*highstale].address ==
+                   cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+                       break;
+               if (*lowstale >= 0 && index - *lowstale <= *highstale - index)
+                       break;
+       }
+}
+
+struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_find_entry(
+       struct xfs_dir3_icleaf_hdr *leafhdr,
+       struct xfs_dir2_leaf_entry *ents,
+       int                     index,          /* leaf table position */
+       int                     compact,        /* need to compact leaves */
+       int                     lowstale,       /* index of prev stale leaf */
+       int                     highstale,      /* index of next stale leaf */
+       int                     *lfloglow,      /* low leaf logging index */
+       int                     *lfloghigh)     /* high leaf logging index */
+{
+       if (!leafhdr->stale) {
+               xfs_dir2_leaf_entry_t   *lep;   /* leaf entry table pointer */
+
+               /*
+                * Now we need to make room to insert the leaf entry.
+                *
+                * If there are no stale entries, just insert a hole at index.
+                */
+               lep = &ents[index];
+               if (index < leafhdr->count)
+                       memmove(lep + 1, lep,
+                               (leafhdr->count - index) * sizeof(*lep));
+
+               /*
+                * Record low and high logging indices for the leaf.
+                */
+               *lfloglow = index;
+               *lfloghigh = leafhdr->count++;
+               return lep;
+       }
+
+       /*
+        * There are stale entries.
+        *
+        * We will use one of them for the new entry.  It's probably not at
+        * the right location, so we'll have to shift some up or down first.
+        *
+        * If we didn't compact before, we need to find the nearest stale
+        * entries before and after our insertion point.
+        */
+       if (compact == 0)
+               xfs_dir3_leaf_find_stale(leafhdr, ents, index,
+                                        &lowstale, &highstale);
+
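+       /*
+        * For example, with insertion index 5 and stale entries at 3 and
+        * 6, either choice shifts exactly one live entry; the tie goes to
+        * the high side, since the low side must be strictly cheaper.
+        */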
+       /*
+        * If the low one is better, use it.
+        */
+       if (lowstale >= 0 &&
+           (highstale == leafhdr->count ||
+            index - lowstale - 1 < highstale - index)) {
+               ASSERT(index - lowstale - 1 >= 0);
+               ASSERT(ents[lowstale].address ==
+                      cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+
+               /*
+                * Copy entries up to cover the stale entry and make room
+                * for the new entry.
+                */
+               if (index - lowstale - 1 > 0) {
+                       memmove(&ents[lowstale], &ents[lowstale + 1],
+                               (index - lowstale - 1) *
+                                       sizeof(xfs_dir2_leaf_entry_t));
+               }
+               *lfloglow = MIN(lowstale, *lfloglow);
+               *lfloghigh = MAX(index - 1, *lfloghigh);
+               leafhdr->stale--;
+               return &ents[index - 1];
+       }
+
+       /*
+        * The high one is better, so use that one.
+        */
+       ASSERT(highstale - index >= 0);
+       ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+
+       /*
+        * Copy entries down to cover the stale entry and make room for the
+        * new entry.
+        */
+       if (highstale - index > 0) {
+               memmove(&ents[index + 1], &ents[index],
+                       (highstale - index) * sizeof(xfs_dir2_leaf_entry_t));
+       }
+       *lfloglow = MIN(index, *lfloglow);
+       *lfloghigh = MAX(highstale, *lfloghigh);
+       leafhdr->stale--;
+       return &ents[index];
+}
+
+/*
+ * Add an entry to a leaf form directory.
+ */
+int                                            /* error */
+xfs_dir2_leaf_addname(
+       xfs_da_args_t           *args)          /* operation arguments */
+{
+       __be16                  *bestsp;        /* freespace table in leaf */
+       int                     compact;        /* need to compact leaves */
+       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+       struct xfs_buf          *dbp;           /* data block buffer */
+       xfs_dir2_data_entry_t   *dep;           /* data block entry */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       xfs_dir2_data_unused_t  *dup;           /* data unused entry */
+       int                     error;          /* error return value */
+       int                     grown;          /* allocated new data block */
+       int                     highstale;      /* index of next stale leaf */
+       int                     i;              /* temporary, index */
+       int                     index;          /* leaf table position */
+       struct xfs_buf          *lbp;           /* leaf's buffer */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       int                     length;         /* length of new entry */
+       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry table pointer */
+       int                     lfloglow;       /* low leaf logging index */
+       int                     lfloghigh;      /* high leaf logging index */
+       int                     lowstale;       /* index of prev stale leaf */
+       xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail pointer */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       int                     needbytes;      /* leaf block bytes needed */
+       int                     needlog;        /* need to log data header */
+       int                     needscan;       /* need to rescan data free */
+       __be16                  *tagp;          /* end of data entry */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       xfs_dir2_db_t           use_block;      /* data block number */
+       struct xfs_dir2_data_free *bf;          /* bestfree table */
+       struct xfs_dir2_leaf_entry *ents;
+       struct xfs_dir3_icleaf_hdr leafhdr;
+
+       trace_xfs_dir2_leaf_addname(args);
+
+       dp = args->dp;
+       tp = args->trans;
+       mp = dp->i_mount;
+
+       error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
+       if (error)
+               return error;
+
+       /*
+        * Look up the entry by hash value and name.
+        * We know it's not there, our caller has already done a lookup.
+        * So the index is of the entry to insert in front of.
+        * But if there are duplicate hash values the index is of the first of those.
+        */
+       index = xfs_dir2_leaf_search_hash(args, lbp);
+       leaf = lbp->b_addr;
+       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+       ents = dp->d_ops->leaf_ents_p(leaf);
+       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+       bestsp = xfs_dir2_leaf_bests_p(ltp);
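+       /*
+        * data_entsize() gives the full on-disk size of an entry with
+        * this name length, including the tag and any alignment padding.
+        */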
+       length = dp->d_ops->data_entsize(args->namelen);
+
+       /*
+        * See if there are any entries with the same hash value
+        * and space in their block for the new entry.
+        * This is good because it puts multiple same-hash value entries
+        * in a data block, improving the lookup of those entries.
+        */
+       for (use_block = -1, lep = &ents[index];
+            index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+            index++, lep++) {
+               if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+                       continue;
+               i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+               ASSERT(i < be32_to_cpu(ltp->bestcount));
+               ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF));
+               if (be16_to_cpu(bestsp[i]) >= length) {
+                       use_block = i;
+                       break;
+               }
+       }
+       /*
+        * Didn't find a block yet, linear search all the data blocks.
+        */
+       if (use_block == -1) {
+               for (i = 0; i < be32_to_cpu(ltp->bestcount); i++) {
+                       /*
+                        * Remember a block we see that's missing.
+                        */
+                       if (bestsp[i] == cpu_to_be16(NULLDATAOFF) &&
+                           use_block == -1)
+                               use_block = i;
+                       else if (be16_to_cpu(bestsp[i]) >= length) {
+                               use_block = i;
+                               break;
+                       }
+               }
+       }
+       /*
+        * How many bytes do we need in the leaf block?
+        */
+       needbytes = 0;
+       if (!leafhdr.stale)
+               needbytes += sizeof(xfs_dir2_leaf_entry_t);
+       if (use_block == -1)
+               needbytes += sizeof(xfs_dir2_data_off_t);
+
+       /*
+        * Now kill use_block if it refers to a missing block, so we
+        * can use it as an indication that allocation is needed.
+        */
+       if (use_block != -1 && bestsp[use_block] == cpu_to_be16(NULLDATAOFF))
+               use_block = -1;
+       /*
+        * If we don't have enough free bytes but we can make enough
+        * by compacting out stale entries, we'll do that.
+        */
+       if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes &&
+           leafhdr.stale > 1)
+               compact = 1;
+
+       /*
+        * Otherwise if we don't have enough free bytes we need to
+        * convert to node form.
+        */
+       else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) {
+               /*
+                * Just checking or no space reservation, give up.
+                */
+               if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
+                                                       args->total == 0) {
+                       xfs_trans_brelse(tp, lbp);
+                       return -ENOSPC;
+               }
+               /*
+                * Convert to node form.
+                */
+               error = xfs_dir2_leaf_to_node(args, lbp);
+               if (error)
+                       return error;
+               /*
+                * Then add the new entry.
+                */
+               return xfs_dir2_node_addname(args);
+       }
+       /*
+        * Otherwise it will fit without compaction.
+        */
+       else
+               compact = 0;
+       /*
+        * If just checking, then it will fit unless we needed to allocate
+        * a new data block.
+        */
+       if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+               xfs_trans_brelse(tp, lbp);
+               return use_block == -1 ? -ENOSPC : 0;
+       }
+       /*
+        * If no allocations are allowed, return now before we've
+        * changed anything.
+        */
+       if (args->total == 0 && use_block == -1) {
+               xfs_trans_brelse(tp, lbp);
+               return -ENOSPC;
+       }
+       /*
+        * Need to compact the leaf entries, removing stale ones.
+        * Leave one stale entry behind - the one closest to our
+        * insertion index - and we'll shift that one to our insertion
+        * point later.
+        */
+       if (compact) {
+               xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+                       &highstale, &lfloglow, &lfloghigh);
+       }
+       /*
+        * There are stale entries; seed the leaf logging indices with
+        * impossibly bad values so they get adjusted as entries move.
+        */
+       else if (leafhdr.stale) {
+               lfloglow = leafhdr.count;
+               lfloghigh = -1;
+       }
+       /*
+        * If there was no data block space found, we need to allocate
+        * a new one.
+        */
+       if (use_block == -1) {
+               /*
+                * Add the new data block.
+                */
+               error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
+                                           &use_block);
+               if (error) {
+                       xfs_trans_brelse(tp, lbp);
+                       return error;
+               }
+               /*
+                * Initialize the block.
+                */
+               error = xfs_dir3_data_init(args, use_block, &dbp);
+               if (error) {
+                       xfs_trans_brelse(tp, lbp);
+                       return error;
+               }
+               /*
+                * If we're adding a new data block on the end we need to
+                * extend the bests table.  Copy it up one entry.
+                */
+               if (use_block >= be32_to_cpu(ltp->bestcount)) {
+                       bestsp--;
+                       memmove(&bestsp[0], &bestsp[1],
+                               be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
+                       be32_add_cpu(&ltp->bestcount, 1);
+                       xfs_dir3_leaf_log_tail(args, lbp);
+                       xfs_dir3_leaf_log_bests(args, lbp, 0,
+                                               be32_to_cpu(ltp->bestcount) - 1);
+               }
+               /*
+                * If we're filling in a previously empty block just log it.
+                */
+               else
+                       xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
+               hdr = dbp->b_addr;
+               bf = dp->d_ops->data_bestfree_p(hdr);
+               bestsp[use_block] = bf[0].length;
+               grown = 1;
+       } else {
+               /*
+                * Already had space in some data block.
+                * Just read that one in.
+                */
+               error = xfs_dir3_data_read(tp, dp,
+                                  xfs_dir2_db_to_da(args->geo, use_block),
+                                  -1, &dbp);
+               if (error) {
+                       xfs_trans_brelse(tp, lbp);
+                       return error;
+               }
+               hdr = dbp->b_addr;
+               bf = dp->d_ops->data_bestfree_p(hdr);
+               grown = 0;
+       }
+       /*
+        * Point to the biggest freespace in our data block.
+        */
+       dup = (xfs_dir2_data_unused_t *)
+             ((char *)hdr + be16_to_cpu(bf[0].offset));
+       ASSERT(be16_to_cpu(dup->length) >= length);
+       needscan = needlog = 0;
+       /*
+        * Mark the initial part of our freespace in use for the new entry.
+        */
+       xfs_dir2_data_use_free(args, dbp, dup,
+               (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
+               &needlog, &needscan);
+       /*
+        * Initialize our new entry (at last).
+        */
+       dep = (xfs_dir2_data_entry_t *)dup;
+       dep->inumber = cpu_to_be64(args->inumber);
+       dep->namelen = args->namelen;
+       memcpy(dep->name, args->name, dep->namelen);
+       dp->d_ops->data_put_ftype(dep, args->filetype);
+       tagp = dp->d_ops->data_entry_tag_p(dep);
+       *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+       /*
+        * Rescan to fix up the bestfree table if needed.
+        */
+       if (needscan)
+               xfs_dir2_data_freescan(dp, hdr, &needlog);
+       /*
+        * Need to log the data block's header.
+        */
+       if (needlog)
+               xfs_dir2_data_log_header(args, dbp);
+       xfs_dir2_data_log_entry(args, dbp, dep);
+       /*
+        * If the bests table needs to be changed, do it.
+        * Log the change unless we've already done that.
+        */
+       if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) {
+               bestsp[use_block] = bf[0].length;
+               if (!grown)
+                       xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
+       }
+
+       lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
+                                      highstale, &lfloglow, &lfloghigh);
+
+       /*
+        * Fill in the new leaf entry.
+        */
+       lep->hashval = cpu_to_be32(args->hashval);
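+       /*
+        * The address packs the data block number and the entry's byte
+        * offset within that block into a single dataptr value.
+        */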
+       lep->address = cpu_to_be32(
+                               xfs_dir2_db_off_to_dataptr(args->geo, use_block,
+                               be16_to_cpu(*tagp)));
+       /*
+        * Log the leaf fields and give up the buffers.
+        */
+       dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+       xfs_dir3_leaf_log_header(args, lbp);
+       xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh);
+       xfs_dir3_leaf_check(dp, lbp);
+       xfs_dir3_data_check(dp, dbp);
+       return 0;
+}
+
+/*
+ * Compact out any stale entries in the leaf.
+ * Log the header and changed leaf entries, if any.
+ */
+void
+xfs_dir3_leaf_compact(
+       xfs_da_args_t   *args,          /* operation arguments */
+       struct xfs_dir3_icleaf_hdr *leafhdr,
+       struct xfs_buf  *bp)            /* leaf buffer */
+{
+       int             from;           /* source leaf index */
+       xfs_dir2_leaf_t *leaf;          /* leaf structure */
+       int             loglow;         /* first leaf entry to log */
+       int             to;             /* target leaf index */
+       struct xfs_dir2_leaf_entry *ents;
+       struct xfs_inode *dp = args->dp;
+
+       leaf = bp->b_addr;
+       if (!leafhdr->stale)
+               return;
+
+       /*
+        * Compress out the stale entries in place.
+        */
+       ents = dp->d_ops->leaf_ents_p(leaf);
+       for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
+               if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+                       continue;
+               /*
+                * Only actually copy the entries that are different.
+                */
+               if (from > to) {
+                       if (loglow == -1)
+                               loglow = to;
+                       ents[to] = ents[from];
+               }
+               to++;
+       }
+       /*
+        * Update and log the header, log the leaf entries.
+        */
+       ASSERT(leafhdr->stale == from - to);
+       leafhdr->count -= leafhdr->stale;
+       leafhdr->stale = 0;
+
+       dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr);
+       xfs_dir3_leaf_log_header(args, bp);
+       if (loglow != -1)
+               xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1);
+}
+
+/*
+ * Compact the leaf entries, removing stale ones.
+ * Leave one stale entry behind - the one closest to our
+ * insertion index - and the caller will shift that one to our insertion
+ * point later.
+ * Return new insertion index, where the remaining stale entry is,
+ * and leaf logging indices.
+ */
+void
+xfs_dir3_leaf_compact_x1(
+       struct xfs_dir3_icleaf_hdr *leafhdr,
+       struct xfs_dir2_leaf_entry *ents,
+       int             *indexp,        /* insertion index */
+       int             *lowstalep,     /* out: stale entry before us */
+       int             *highstalep,    /* out: stale entry after us */
+       int             *lowlogp,       /* out: low log index */
+       int             *highlogp)      /* out: high log index */
+{
+       int             from;           /* source copy index */
+       int             highstale;      /* stale entry at/after index */
+       int             index;          /* insertion index */
+       int             keepstale;      /* source index of kept stale */
+       int             lowstale;       /* stale entry before index */
+       int             newindex = 0;   /* new insertion index */
+       int             to;             /* destination copy index */
+
+       ASSERT(leafhdr->stale > 1);
+       index = *indexp;
+
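+       /*
+        * Find the stale entries nearest the insertion index on either
+        * side; one of the two will be kept as the insertion slot.
+        */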
+       xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale);
+
+       /*
+        * Pick the better of lowstale and highstale.
+        */
+       if (lowstale >= 0 &&
+           (highstale == leafhdr->count ||
+            index - lowstale <= highstale - index))
+               keepstale = lowstale;
+       else
+               keepstale = highstale;
+       /*
+        * Copy the entries in place, removing all the stale entries
+        * except keepstale.
+        */
+       for (from = to = 0; from < leafhdr->count; from++) {
+               /*
+                * Notice the new value of index.
+                */
+               if (index == from)
+                       newindex = to;
+               if (from != keepstale &&
+                   ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+                       if (from == to)
+                               *lowlogp = to;
+                       continue;
+               }
+               /*
+                * Record the new keepstale value for the insertion.
+                */
+               if (from == keepstale)
+                       lowstale = highstale = to;
+               /*
+                * Copy only the entries that have moved.
+                */
+               if (from > to)
+                       ents[to] = ents[from];
+               to++;
+       }
+       ASSERT(from > to);
+       /*
+        * If the insertion point was past the last entry,
+        * set the new insertion point accordingly.
+        */
+       if (index == from)
+               newindex = to;
+       *indexp = newindex;
+       /*
+        * Adjust the leaf header values.
+        */
+       leafhdr->count -= from - to;
+       leafhdr->stale = 1;
+       /*
+        * Remember the low/high stale value only in the "right"
+        * direction.
+        */
+       if (lowstale >= newindex)
+               lowstale = -1;
+       else
+               highstale = leafhdr->count;
+       *highlogp = leafhdr->count - 1;
+       *lowstalep = lowstale;
+       *highstalep = highstale;
+}
+
+/*
+ * Log the bests entries indicated from a leaf1 block.
+ */
+static void
+xfs_dir3_leaf_log_bests(
+       struct xfs_da_args      *args,
+       struct xfs_buf          *bp,            /* leaf buffer */
+       int                     first,          /* first entry to log */
+       int                     last)           /* last entry to log */
+{
+       __be16                  *firstb;        /* pointer to first entry */
+       __be16                  *lastb;         /* pointer to last entry */
+       struct xfs_dir2_leaf    *leaf = bp->b_addr;
+       xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
+
+       ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC));
+
+       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+       firstb = xfs_dir2_leaf_bests_p(ltp) + first;
+       lastb = xfs_dir2_leaf_bests_p(ltp) + last;
+       xfs_trans_log_buf(args->trans, bp,
+               (uint)((char *)firstb - (char *)leaf),
+               (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
+}
+
+/*
+ * Log the leaf entries indicated from a leaf1 or leafn block.
+ */
+void
+xfs_dir3_leaf_log_ents(
+       struct xfs_da_args      *args,
+       struct xfs_buf          *bp,
+       int                     first,
+       int                     last)
+{
+       xfs_dir2_leaf_entry_t   *firstlep;      /* pointer to first entry */
+       xfs_dir2_leaf_entry_t   *lastlep;       /* pointer to last entry */
+       struct xfs_dir2_leaf    *leaf = bp->b_addr;
+       struct xfs_dir2_leaf_entry *ents;
+
+       ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+       ents = args->dp->d_ops->leaf_ents_p(leaf);
+       firstlep = &ents[first];
+       lastlep = &ents[last];
+       xfs_trans_log_buf(args->trans, bp,
+               (uint)((char *)firstlep - (char *)leaf),
+               (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
+}
+
+/*
+ * Log the header of the leaf1 or leafn block.
+ */
+void
+xfs_dir3_leaf_log_header(
+       struct xfs_da_args      *args,
+       struct xfs_buf          *bp)
+{
+       struct xfs_dir2_leaf    *leaf = bp->b_addr;
+
+       ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+       xfs_trans_log_buf(args->trans, bp,
+                         (uint)((char *)&leaf->hdr - (char *)leaf),
+                         args->dp->d_ops->leaf_hdr_size - 1);
+}
+
+/*
+ * Log the tail of the leaf1 block.
+ */
+STATIC void
+xfs_dir3_leaf_log_tail(
+       struct xfs_da_args      *args,
+       struct xfs_buf          *bp)
+{
+       struct xfs_dir2_leaf    *leaf = bp->b_addr;
+       xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
+
+       ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+       xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf),
+               (uint)(args->geo->blksize - 1));
+}
+
+/*
+ * Look up the entry referred to by args in the leaf format directory.
+ * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which
+ * is also used by the node-format code.
+ */
+int
+xfs_dir2_leaf_lookup(
+       xfs_da_args_t           *args)          /* operation arguments */
+{
+       struct xfs_buf          *dbp;           /* data block buffer */
+       xfs_dir2_data_entry_t   *dep;           /* data block entry */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     error;          /* error return code */
+       int                     index;          /* found entry index */
+       struct xfs_buf          *lbp;           /* leaf buffer */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       struct xfs_dir2_leaf_entry *ents;
+
+       trace_xfs_dir2_leaf_lookup(args);
+
+       /*
+        * Look up name in the leaf block, returning both buffers and index.
+        */
+       if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+               return error;
+       }
+       tp = args->trans;
+       dp = args->dp;
+       xfs_dir3_leaf_check(dp, lbp);
+       leaf = lbp->b_addr;
+       ents = dp->d_ops->leaf_ents_p(leaf);
+       /*
+        * Get to the leaf entry and contained data entry address.
+        */
+       lep = &ents[index];
+
+       /*
+        * Point to the data entry.
+        */
+       dep = (xfs_dir2_data_entry_t *)
+             ((char *)dbp->b_addr +
+              xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+       /*
+        * Return the found inode number & CI name if appropriate
+        */
+       args->inumber = be64_to_cpu(dep->inumber);
+       args->filetype = dp->d_ops->data_get_ftype(dep);
+       error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+       xfs_trans_brelse(tp, dbp);
+       xfs_trans_brelse(tp, lbp);
+       return error;
+}
+
+/*
+ * Look up name/hash in the leaf block.
+ * Fill in indexp with the found index, and dbpp with the data buffer.
+ * If not found dbpp will be NULL, and -ENOENT comes back.
+ * lbpp will always be filled in with the leaf buffer unless there's an error.
+ */
+static int                                     /* error */
+xfs_dir2_leaf_lookup_int(
+       xfs_da_args_t           *args,          /* operation arguments */
+       struct xfs_buf          **lbpp,         /* out: leaf buffer */
+       int                     *indexp,        /* out: index in leaf block */
+       struct xfs_buf          **dbpp)         /* out: data buffer */
+{
+       xfs_dir2_db_t           curdb = -1;     /* current data block number */
+       struct xfs_buf          *dbp = NULL;    /* data buffer */
+       xfs_dir2_data_entry_t   *dep;           /* data entry */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     error;          /* error return code */
+       int                     index;          /* index in leaf block */
+       struct xfs_buf          *lbp;           /* leaf buffer */
+       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       xfs_dir2_db_t           newdb;          /* new data block number */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       xfs_dir2_db_t           cidb = -1;      /* case match data block no. */
+       enum xfs_dacmp          cmp;            /* name compare result */
+       struct xfs_dir2_leaf_entry *ents;
+       struct xfs_dir3_icleaf_hdr leafhdr;
+
+       dp = args->dp;
+       tp = args->trans;
+       mp = dp->i_mount;
+
+       error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
+       if (error)
+               return error;
+
+       *lbpp = lbp;
+       leaf = lbp->b_addr;
+       xfs_dir3_leaf_check(dp, lbp);
+       ents = dp->d_ops->leaf_ents_p(leaf);
+       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+       /*
+        * Look for the first leaf entry with our hash value.
+        */
+       index = xfs_dir2_leaf_search_hash(args, lbp);
+       /*
+        * Loop over all the entries with the right hash value
+        * looking to match the name.
+        */
+       for (lep = &ents[index];
+            index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+            lep++, index++) {
+               /*
+                * Skip over stale leaf entries.
+                */
+               if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+                       continue;
+               /*
+                * Get the new data block number.
+                */
+               newdb = xfs_dir2_dataptr_to_db(args->geo,
+                                              be32_to_cpu(lep->address));
+               /*
+                * If it's not the same as the old data block number,
+                * need to pitch the old one and read the new one.
+                */
+               if (newdb != curdb) {
+                       if (dbp)
+                               xfs_trans_brelse(tp, dbp);
+                       error = xfs_dir3_data_read(tp, dp,
+                                          xfs_dir2_db_to_da(args->geo, newdb),
+                                          -1, &dbp);
+                       if (error) {
+                               xfs_trans_brelse(tp, lbp);
+                               return error;
+                       }
+                       curdb = newdb;
+               }
+               /*
+                * Point to the data entry.
+                */
+               dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr +
+                       xfs_dir2_dataptr_to_off(args->geo,
+                                               be32_to_cpu(lep->address)));
+               /*
+                * Compare name and if it's an exact match, return the index
+                * and buffer. If it's the first case-insensitive match, store
+                * the index and buffer and continue looking for an exact match.
+                */
+               cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+               if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+                       args->cmpresult = cmp;
+                       *indexp = index;
+                       /* case exact match: return the current buffer. */
+                       if (cmp == XFS_CMP_EXACT) {
+                               *dbpp = dbp;
+                               return 0;
+                       }
+                       cidb = curdb;
+               }
+       }
+       ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+       /*
+        * Here, we can only be doing a lookup (not a rename or remove).
+        * If a case-insensitive match was found earlier, re-read the
+        * appropriate data block if required and return it.
+        */
+       if (args->cmpresult == XFS_CMP_CASE) {
+               ASSERT(cidb != -1);
+               if (cidb != curdb) {
+                       xfs_trans_brelse(tp, dbp);
+                       error = xfs_dir3_data_read(tp, dp,
+                                          xfs_dir2_db_to_da(args->geo, cidb),
+                                          -1, &dbp);
+                       if (error) {
+                               xfs_trans_brelse(tp, lbp);
+                               return error;
+                       }
+               }
+               *dbpp = dbp;
+               return 0;
+       }
+       /*
+        * No match found, return -ENOENT.
+        */
+       ASSERT(cidb == -1);
+       if (dbp)
+               xfs_trans_brelse(tp, dbp);
+       xfs_trans_brelse(tp, lbp);
+       return -ENOENT;
+}
+
+/*
+ * Remove an entry from a leaf format directory.
+ */
+int                                            /* error */
+xfs_dir2_leaf_removename(
+       xfs_da_args_t           *args)          /* operation arguments */
+{
+       __be16                  *bestsp;        /* leaf block best freespace */
+       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+       xfs_dir2_db_t           db;             /* data block number */
+       struct xfs_buf          *dbp;           /* data block buffer */
+       xfs_dir2_data_entry_t   *dep;           /* data entry structure */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     error;          /* error return code */
+       xfs_dir2_db_t           i;              /* temporary data block # */
+       int                     index;          /* index into leaf entries */
+       struct xfs_buf          *lbp;           /* leaf buffer */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+       xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       int                     needlog;        /* need to log data header */
+       int                     needscan;       /* need to rescan data frees */
+       xfs_dir2_data_off_t     oldbest;        /* old value of best free */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       struct xfs_dir2_data_free *bf;          /* bestfree table */
+       struct xfs_dir2_leaf_entry *ents;
+       struct xfs_dir3_icleaf_hdr leafhdr;
+
+       trace_xfs_dir2_leaf_removename(args);
+
+       /*
+        * Lookup the leaf entry, get the leaf and data blocks read in.
+        */
+       if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+               return error;
+       }
+       dp = args->dp;
+       tp = args->trans;
+       mp = dp->i_mount;
+       leaf = lbp->b_addr;
+       hdr = dbp->b_addr;
+       xfs_dir3_data_check(dp, dbp);
+       bf = dp->d_ops->data_bestfree_p(hdr);
+       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+       ents = dp->d_ops->leaf_ents_p(leaf);
+       /*
+        * Point to the leaf entry, use that to point to the data entry.
+        */
+       lep = &ents[index];
+       db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+       dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+               xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+       needscan = needlog = 0;
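+       /*
+        * Remember the longest freespace in the data block before the
+        * entry is freed, so we can tell afterwards whether the bests
+        * table needs updating.
+        */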
+       oldbest = be16_to_cpu(bf[0].length);
+       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+       bestsp = xfs_dir2_leaf_bests_p(ltp);
+       ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
+       /*
+        * Mark the former data entry unused.
+        */
+       xfs_dir2_data_make_free(args, dbp,
+               (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
+               dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+       /*
+        * We just mark the leaf entry stale by putting a null in it.
+        */
+       leafhdr.stale++;
+       dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+       xfs_dir3_leaf_log_header(args, lbp);
+
+       lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+       xfs_dir3_leaf_log_ents(args, lbp, index, index);
+
+       /*
+        * Scan the freespace in the data block again if necessary,
+        * log the data block header if necessary.
+        */
+       if (needscan)
+               xfs_dir2_data_freescan(dp, hdr, &needlog);
+       if (needlog)
+               xfs_dir2_data_log_header(args, dbp);
+       /*
+        * If the longest freespace in the data block has changed,
+        * put the new value in the bests table and log that.
+        */
+       if (be16_to_cpu(bf[0].length) != oldbest) {
+               bestsp[db] = bf[0].length;
+               xfs_dir3_leaf_log_bests(args, lbp, db, db);
+       }
+       xfs_dir3_data_check(dp, dbp);
+       /*
+        * If the data block is now empty (its longest free span covers
+        * the whole data area after the header) then get rid of it.
+        */
+       if (be16_to_cpu(bf[0].length) ==
+                       args->geo->blksize - dp->d_ops->data_entry_offset) {
+               ASSERT(db != args->geo->datablk);
+               if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+                       /*
+                        * We can't get rid of it: shrinking the inode
+                        * failed because doing so would have required
+                        * allocating a bmap btree block.  Just go on,
+                        * returning success and leaving the empty
+                        * block in place.
+                        */
+                       if (error == -ENOSPC && args->total == 0)
+                               error = 0;
+                       xfs_dir3_leaf_check(dp, lbp);
+                       return error;
+               }
+               dbp = NULL;
+               /*
+                * If this is the last data block then compact the
+                * bests table by getting rid of entries.
+                */
+               if (db == be32_to_cpu(ltp->bestcount) - 1) {
+                       /*
+                        * Look for the last active entry (i).
+                        */
+                       for (i = db - 1; i > 0; i--) {
+                               if (bestsp[i] != cpu_to_be16(NULLDATAOFF))
+                                       break;
+                       }
+                       /*
+                        * Copy the table down so the entries that are
+                        * now inactive at its end are removed.  The
+                        * bests table grows backwards from the leaf
+                        * tail, so the survivors move up in memory as
+                        * the count shrinks.
+                        */
+                       memmove(&bestsp[db - i], bestsp,
+                               (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
+                       be32_add_cpu(&ltp->bestcount, -(db - i));
+                       xfs_dir3_leaf_log_tail(args, lbp);
+                       xfs_dir3_leaf_log_bests(args, lbp, 0,
+                                               be32_to_cpu(ltp->bestcount) - 1);
+               } else
+                       bestsp[db] = cpu_to_be16(NULLDATAOFF);
+       }
+       /*
+        * If the data block was not the first one, drop it.
+        */
+       else if (db != args->geo->datablk)
+               dbp = NULL;
+
+       xfs_dir3_leaf_check(dp, lbp);
+       /*
+        * See if we can convert to block form.
+        */
+       return xfs_dir2_leaf_to_block(args, lbp, dbp);
+}
+
+/*
+ * Replace the inode number in a leaf format directory entry.
+ */
+int                                            /* error */
+xfs_dir2_leaf_replace(
+       xfs_da_args_t           *args)          /* operation arguments */
+{
+       struct xfs_buf          *dbp;           /* data block buffer */
+       xfs_dir2_data_entry_t   *dep;           /* data block entry */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     error;          /* error return code */
+       int                     index;          /* index of leaf entry */
+       struct xfs_buf          *lbp;           /* leaf buffer */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       struct xfs_dir2_leaf_entry *ents;
+
+       trace_xfs_dir2_leaf_replace(args);
+
+       /*
+        * Look up the entry.
+        */
+       if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+               return error;
+       }
+       dp = args->dp;
+       leaf = lbp->b_addr;
+       ents = dp->d_ops->leaf_ents_p(leaf);
+       /*
+        * Point to the leaf entry, get data address from it.
+        */
+       lep = &ents[index];
+       /*
+        * Point to the data entry.
+        */
+       dep = (xfs_dir2_data_entry_t *)
+             ((char *)dbp->b_addr +
+              xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
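+       /*
+        * A replace must change something: the caller should never
+        * hand us the inode number the entry already carries.
+        */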
+       ASSERT(args->inumber != be64_to_cpu(dep->inumber));
+       /*
+        * Put the new inode number in, log it.
+        */
+       dep->inumber = cpu_to_be64(args->inumber);
+       dp->d_ops->data_put_ftype(dep, args->filetype);
+       tp = args->trans;
+       xfs_dir2_data_log_entry(args, dbp, dep);
+       xfs_dir3_leaf_check(dp, lbp);
+       xfs_trans_brelse(tp, lbp);
+       return 0;
+}
+
+/*
+ * Return index in the leaf block (lbp) which is either the first
+ * one with this hash value, or if there are none, the insert point
+ * for that hash value.
+ */
+int                                            /* index value */
+xfs_dir2_leaf_search_hash(
+       xfs_da_args_t           *args,          /* operation arguments */
+       struct xfs_buf          *lbp)           /* leaf buffer */
+{
+       xfs_dahash_t            hash = 0;       /* hash from this entry */
+       xfs_dahash_t            hashwant;       /* hash value looking for */
+       int                     high;           /* high leaf index */
+       int                     low;            /* low leaf index */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+       int                     mid = 0;        /* current leaf index */
+       struct xfs_dir2_leaf_entry *ents;
+       struct xfs_dir3_icleaf_hdr leafhdr;
+
+       leaf = lbp->b_addr;
+       ents = args->dp->d_ops->leaf_ents_p(leaf);
+       args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+       /*
+        * Binary search the leaf entries looking for our hash value.
+        * Note, the table cannot be empty, so the loop body always
+        * runs at least once and hash and mid are set before use.
+        */
+       for (lep = ents, low = 0, high = leafhdr.count - 1,
+               hashwant = args->hashval;
+            low <= high; ) {
+               mid = (low + high) >> 1;
+               if ((hash = be32_to_cpu(lep[mid].hashval)) == hashwant)
+                       break;
+               if (hash < hashwant)
+                       low = mid + 1;
+               else
+                       high = mid - 1;
+       }
+       /*
+        * Found one, back up through all the equal hash values.
+        */
+       if (hash == hashwant) {
+               while (mid > 0 && be32_to_cpu(lep[mid - 1].hashval) == hashwant) {
+                       mid--;
+               }
+       }
+       /*
+        * Need to point to an entry higher than ours.
+        */
+       else if (hash < hashwant)
+               mid++;
+       return mid;
+}
+
+/*
+ * Trim off a trailing data block.  We know it's empty since the leaf
+ * freespace table says so.
+ */
+int                                            /* error */
+xfs_dir2_leaf_trim_data(
+       xfs_da_args_t           *args,          /* operation arguments */
+       struct xfs_buf          *lbp,           /* leaf buffer */
+       xfs_dir2_db_t           db)             /* data block number */
+{
+       __be16                  *bestsp;        /* leaf bests table */
+       struct xfs_buf          *dbp;           /* data block buffer */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     error;          /* error return value */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       xfs_trans_t             *tp;            /* transaction pointer */
+
+       dp = args->dp;
+       mp = dp->i_mount;
+       tp = args->trans;
+       /*
+        * Read the offending data block.  We need its buffer.
+        */
+       error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db),
+                                  -1, &dbp);
+       if (error)
+               return error;
+
+       leaf = lbp->b_addr;
+       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+
+#ifdef DEBUG
+{
+       struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
+       struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr);
+
+       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+       ASSERT(be16_to_cpu(bf[0].length) ==
+              args->geo->blksize - dp->d_ops->data_entry_offset);
+       ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
+}
+#endif
+
+       /*
+        * Get rid of the data block.
+        */
+       if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+               ASSERT(error != -ENOSPC);
+               xfs_trans_brelse(tp, dbp);
+               return error;
+       }
+       /*
+        * Eliminate the last bests entry from the table.  The table
+        * grows backwards from the leaf tail, so dropping the last
+        * entry means sliding the surviving entries up by one slot.
+        */
+       bestsp = xfs_dir2_leaf_bests_p(ltp);
+       be32_add_cpu(&ltp->bestcount, -1);
+       memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
+       xfs_dir3_leaf_log_tail(args, lbp);
+       xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+       return 0;
+}
+
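+/*
+ * Compute the bytes a leaf1 block needs to hold the live (non-stale)
+ * leaf entries, a bests table with 'counts' entries and the leaf tail,
+ * on top of the version-dependent leaf header.
+ */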
+static inline size_t
+xfs_dir3_leaf_size(
+       struct xfs_dir3_icleaf_hdr      *hdr,
+       int                             counts)
+{
+       int     entries;
+       int     hdrsize;
+
+       entries = hdr->count - hdr->stale;
+       if (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+           hdr->magic == XFS_DIR2_LEAFN_MAGIC)
+               hdrsize = sizeof(struct xfs_dir2_leaf_hdr);
+       else
+               hdrsize = sizeof(struct xfs_dir3_leaf_hdr);
+
+       return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t)
+                      + counts * sizeof(xfs_dir2_data_off_t)
+                      + sizeof(xfs_dir2_leaf_tail_t);
+}
+
+/*
+ * Convert node form directory to leaf form directory.
+ * The root of the node form dir needs to already be a LEAFN block.
+ * Just return if we can't do anything.
+ */
+int                                            /* error */
+xfs_dir2_node_to_leaf(
+       xfs_da_state_t          *state)         /* directory operation state */
+{
+       xfs_da_args_t           *args;          /* operation arguments */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     error;          /* error return code */
+       struct xfs_buf          *fbp;           /* buffer for freespace block */
+       xfs_fileoff_t           fo;             /* freespace file offset */
+       xfs_dir2_free_t         *free;          /* freespace structure */
+       struct xfs_buf          *lbp;           /* buffer for leaf block */
+       xfs_dir2_leaf_tail_t    *ltp;           /* tail of leaf structure */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       int                     rval;           /* successful free trim? */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       struct xfs_dir3_icleaf_hdr leafhdr;
+       struct xfs_dir3_icfree_hdr freehdr;
+
+       /*
+        * There's more than a leaf level in the btree, so there must
+        * be multiple leafn blocks.  Give up.
+        */
+       if (state->path.active > 1)
+               return 0;
+       args = state->args;
+
+       trace_xfs_dir2_node_to_leaf(args);
+
+       mp = state->mp;
+       dp = args->dp;
+       tp = args->trans;
+       /*
+        * Get the last offset in the file.
+        */
+       if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) {
+               return error;
+       }
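+       /*
+        * xfs_bmap_last_offset returns the offset just past the last
+        * allocated block, so step back one directory block to land on
+        * the last one.
+        */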
+       fo -= args->geo->fsbcount;
+       /*
+        * If there are freespace blocks other than the first one,
+        * take this opportunity to remove trailing empty freespace blocks
+        * that may have been left behind during no-space-reservation
+        * operations.
+        */
+       while (fo > args->geo->freeblk) {
+               if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
+                       return error;
+               }
+               if (rval)
+                       fo -= args->geo->fsbcount;
+               else
+                       return 0;
+       }
+       /*
+        * Now find the block just before the freespace block.
+        */
+       if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
+               return error;
+       }
+       /*
+        * If it's not the single leaf block, give up.
+        */
+       if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize)
+               return 0;
+       lbp = state->path.blk[0].bp;
+       leaf = lbp->b_addr;
+       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+       ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+              leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+
+       /*
+        * Read the freespace block.
+        */
+       error = xfs_dir2_free_read(tp, dp,  args->geo->freeblk, &fbp);
+       if (error)
+               return error;
+       free = fbp->b_addr;
+       dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+       ASSERT(!freehdr.firstdb);
+
+       /*
+        * Now see if the leafn and free data will fit in a leaf1.
+        * If not, release the buffer and give up.
+        */
+       if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) {
+               xfs_trans_brelse(tp, fbp);
+               return 0;
+       }
+
+       /*
+        * If the leaf has any stale entries in it, compress them out.
+        */
+       if (leafhdr.stale)
+               xfs_dir3_leaf_compact(args, &leafhdr, lbp);
+
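+       /*
+        * Switch the buffer over to leaf1 form: swap in the leaf1
+        * verifier and convert the leafn magic to the matching leaf1
+        * magic below.
+        */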
+       lbp->b_ops = &xfs_dir3_leaf1_buf_ops;
+       xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF);
+       leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC)
+                                       ? XFS_DIR2_LEAF1_MAGIC
+                                       : XFS_DIR3_LEAF1_MAGIC;
+
+       /*
+        * Set up the leaf tail from the freespace block.
+        */
+       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+       ltp->bestcount = cpu_to_be32(freehdr.nvalid);
+
+       /*
+        * Set up the leaf bests table.
+        */
+       memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free),
+               freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
+
+       dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+       xfs_dir3_leaf_log_header(args, lbp);
+       xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+       xfs_dir3_leaf_log_tail(args, lbp);
+       xfs_dir3_leaf_check(dp, lbp);
+
+       /*
+        * Get rid of the freespace block.
+        */
+       error = xfs_dir2_shrink_inode(args,
+                       xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET),
+                       fbp);
+       if (error) {
+               /*
+                * ENOSPC can't happen here: it is only raised when
+                * punching out the middle of an extent, and this is an
+                * isolated block.
+                */
+               ASSERT(error != -ENOSPC);
+               return error;
+       }
+       fbp = NULL;
+       /*
+        * Now see if we can convert the single-leaf directory
+        * down to a block form directory.
+        * This routine always kills the dabuf for the leaf, so
+        * eliminate it from the path.
+        */
+       error = xfs_dir2_leaf_to_block(args, lbp, NULL);
+       state->path.blk[0].bp = NULL;
+       return error;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
new file mode 100644
index 0000000..2ae6ac2
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -0,0 +1,2284 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+
+/*
+ * Function declarations.
+ */
+static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args,
+                             int index);
+static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
+                                    xfs_da_state_blk_t *blk1,
+                                    xfs_da_state_blk_t *blk2);
+static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
+                                int index, xfs_da_state_blk_t *dblk,
+                                int *rval);
+static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
+                                    xfs_da_state_blk_t *fblk);
+
+/*
+ * Check internal consistency of a leafn block.
+ */
+#ifdef DEBUG
+#define        xfs_dir3_leaf_check(dp, bp) \
+do { \
+       if (!xfs_dir3_leafn_check((dp), (bp))) \
+               ASSERT(0); \
+} while (0)
+
+static bool
+xfs_dir3_leafn_check(
+       struct xfs_inode        *dp,
+       struct xfs_buf          *bp)
+{
+       struct xfs_dir2_leaf    *leaf = bp->b_addr;
+       struct xfs_dir3_icleaf_hdr leafhdr;
+
+       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+       if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
+               struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+               if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+                       return false;
+       } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
+               return false;
+
+       return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+}
+#else
+#define        xfs_dir3_leaf_check(dp, bp)
+#endif
+
+static bool
+xfs_dir3_free_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+               if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
+                       return false;
+               if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+                       return false;
+               if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+                       return false;
+       } else {
+               if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
+                       return false;
+       }
+
+       /* XXX: should bounds check the xfs_dir3_icfree_hdr here */
+
+       return true;
+}
+
+static void
+xfs_dir3_free_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb) &&
+           !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
+               xfs_buf_ioerror(bp, -EFSBADCRC);
+       else if (!xfs_dir3_free_verify(bp))
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+       if (bp->b_error)
+               xfs_verifier_error(bp);
+}
+
+static void
+xfs_dir3_free_write_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+       struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+       if (!xfs_dir3_free_verify(bp)) {
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               return;
+       }
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
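+       /*
+        * Stamp the LSN of the last modification into the header so
+        * log recovery can tell whether the block is stale, then
+        * recompute the CRC.
+        */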
+       if (bip)
+               hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+       xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
+       .verify_read = xfs_dir3_free_read_verify,
+       .verify_write = xfs_dir3_free_write_verify,
+};
+
+
+static int
+__xfs_dir3_free_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             fbno,
+       xfs_daddr_t             mappedbno,
+       struct xfs_buf          **bpp)
+{
+       int                     err;
+
+       err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+                               XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
+
+       /* try read returns without an error or *bpp if it lands in a hole */
+       if (!err && tp && *bpp)
+               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
+       return err;
+}
+
+int
+xfs_dir2_free_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             fbno,
+       struct xfs_buf          **bpp)
+{
+       return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp);
+}
+
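+/*
+ * As above, but pass a mappedbno of -2 so a read that lands in a hole
+ * returns success with a NULL *bpp instead of an error.
+ */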
+static int
+xfs_dir2_free_try_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             fbno,
+       struct xfs_buf          **bpp)
+{
+       return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp);
+}
+
+static int
+xfs_dir3_free_get_buf(
+       xfs_da_args_t           *args,
+       xfs_dir2_db_t           fbno,
+       struct xfs_buf          **bpp)
+{
+       struct xfs_trans        *tp = args->trans;
+       struct xfs_inode        *dp = args->dp;
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_buf          *bp;
+       int                     error;
+       struct xfs_dir3_icfree_hdr hdr;
+
+       error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno),
+                                  -1, &bp, XFS_DATA_FORK);
+       if (error)
+               return error;
+
+       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF);
+       bp->b_ops = &xfs_dir3_free_buf_ops;
+
+       /*
+        * Initialize the new free block header to be empty.
+        */
+       memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
+       memset(&hdr, 0, sizeof(hdr));
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+               hdr.magic = XFS_DIR3_FREE_MAGIC;
+
+               hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
+               hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
+               uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
+       } else
+               hdr.magic = XFS_DIR2_FREE_MAGIC;
+       dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
+       *bpp = bp;
+       return 0;
+}
+
+/*
+ * Log entries from a freespace block.
+ */
+STATIC void
+xfs_dir2_free_log_bests(
+       struct xfs_da_args      *args,
+       struct xfs_buf          *bp,
+       int                     first,          /* first entry to log */
+       int                     last)           /* last entry to log */
+{
+       xfs_dir2_free_t         *free;          /* freespace structure */
+       __be16                  *bests;
+
+       free = bp->b_addr;
+       bests = args->dp->d_ops->free_bests_p(free);
+       ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+              free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+       xfs_trans_log_buf(args->trans, bp,
+               (uint)((char *)&bests[first] - (char *)free),
+               (uint)((char *)&bests[last] - (char *)free +
+                      sizeof(bests[0]) - 1));
+}
+
+/*
+ * Log header from a freespace block.
+ */
+static void
+xfs_dir2_free_log_header(
+       struct xfs_da_args      *args,
+       struct xfs_buf          *bp)
+{
+#ifdef DEBUG
+       xfs_dir2_free_t         *free;          /* freespace structure */
+
+       free = bp->b_addr;
+       ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+              free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+#endif
+       xfs_trans_log_buf(args->trans, bp, 0,
+                         args->dp->d_ops->free_hdr_size - 1);
+}
+
+/*
+ * Convert a leaf-format directory to a node-format directory.
+ * We need to change the magic number of the leaf block, and copy
+ * the freespace table out of the leaf block into its own block.
+ */
+int                                            /* error */
+xfs_dir2_leaf_to_node(
+       xfs_da_args_t           *args,          /* operation arguments */
+       struct xfs_buf          *lbp)           /* leaf buffer */
+{
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     error;          /* error return value */
+       struct xfs_buf          *fbp;           /* freespace buffer */
+       xfs_dir2_db_t           fdb;            /* freespace block number */
+       xfs_dir2_free_t         *free;          /* freespace structure */
+       __be16                  *from;          /* pointer to freespace entry */
+       int                     i;              /* leaf freespace index */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       int                     n;              /* count of live freespc ents */
+       xfs_dir2_data_off_t     off;            /* freespace entry value */
+       __be16                  *to;            /* pointer to freespace entry */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       struct xfs_dir3_icfree_hdr freehdr;
+
+       trace_xfs_dir2_leaf_to_node(args);
+
+       dp = args->dp;
+       mp = dp->i_mount;
+       tp = args->trans;
+       /*
+        * Add a freespace block to the directory.
+        */
+       if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
+               return error;
+       }
+       ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
+       /*
+        * Get the buffer for the new freespace block.
+        */
+       error = xfs_dir3_free_get_buf(args, fdb, &fbp);
+       if (error)
+               return error;
+
+       free = fbp->b_addr;
+       dp->d_ops->free_hdr_from_disk(&freehdr, free);
+       leaf = lbp->b_addr;
+       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+       ASSERT(be32_to_cpu(ltp->bestcount) <=
+                               (uint)dp->i_d.di_size / args->geo->blksize);
+
+       /*
+        * Copy freespace entries from the leaf block to the new block.
+        * Count active entries.
+        */
+       from = xfs_dir2_leaf_bests_p(ltp);
+       to = dp->d_ops->free_bests_p(free);
+       for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
+               if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
+                       n++;
+               *to = cpu_to_be16(off);
+       }
+
+       /*
+        * Now initialize the freespace block header.
+        */
+       freehdr.nused = n;
+       freehdr.nvalid = be32_to_cpu(ltp->bestcount);
+
+       dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+       xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1);
+       xfs_dir2_free_log_header(args, fbp);
+
+       /*
+        * Converting the leaf to a leafnode is just a matter of changing the
+        * magic number and the ops. Do the change directly to the buffer as
+        * it's less work (and less code) than decoding the header to host
+        * format and back again.
+        */
+       if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC))
+               leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+       else
+               leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+       lbp->b_ops = &xfs_dir3_leafn_buf_ops;
+       xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
+       xfs_dir3_leaf_log_header(args, lbp);
+       xfs_dir3_leaf_check(dp, lbp);
+       return 0;
+}
+
+/*
+ * Add a leaf entry to a leaf block in a node-form directory.
+ * The other work necessary is done from the caller.
+ */
+static int                                     /* error */
+xfs_dir2_leafn_add(
+       struct xfs_buf          *bp,            /* leaf buffer */
+       xfs_da_args_t           *args,          /* operation arguments */
+       int                     index)          /* insertion pt for new entry */
+{
+       int                     compact;        /* compacting stale leaves */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     highstale;      /* next stale entry */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+       int                     lfloghigh;      /* high leaf entry logging */
+       int                     lfloglow;       /* low leaf entry logging */
+       int                     lowstale;       /* previous stale entry */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       struct xfs_dir3_icleaf_hdr leafhdr;
+       struct xfs_dir2_leaf_entry *ents;
+
+       trace_xfs_dir2_leafn_add(args, index);
+
+       dp = args->dp;
+       mp = dp->i_mount;
+       tp = args->trans;
+       leaf = bp->b_addr;
+       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+       ents = dp->d_ops->leaf_ents_p(leaf);
+
+       /*
+        * Quick check just to make sure we are not going to index
+        * into other people's memory.
+        */
+       if (index < 0)
+               return -EFSCORRUPTED;
+
+       /*
+        * If the block already holds the maximum number of leaf
+        * entries and none of them are stale, the new entry won't fit
+        * and the caller will do a split.  If there are stale entries
+        * we'll do a compact.
+        */
+       if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
+               if (!leafhdr.stale)
+                       return -ENOSPC;
+               compact = leafhdr.stale > 1;
+       } else
+               compact = 0;
+       ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval);
+       ASSERT(index == leafhdr.count ||
+              be32_to_cpu(ents[index].hashval) >= args->hashval);
+
+       if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+               return 0;
+
+       /*
+        * Compact out all but one stale leaf entry.  Leaves behind
+        * the entry closest to index.
+        */
+       if (compact)
+               xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+                                        &highstale, &lfloglow, &lfloghigh);
+       else if (leafhdr.stale) {
+               /*
+                * Set impossible logging indices for this case.
+                */
+               lfloglow = leafhdr.count;
+               lfloghigh = -1;
+       }
+
+       /*
+        * Insert the new entry, log everything.
+        */
+       lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
+                                      highstale, &lfloglow, &lfloghigh);
+
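+       /*
+        * A leaf entry address is a packed dataptr combining the data
+        * block number and the offset of the entry within that block
+        * (see xfs_dir2_db_off_to_dataptr).
+        */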
+       lep->hashval = cpu_to_be32(args->hashval);
+       lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo,
+                               args->blkno, args->index));
+
+       dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+       xfs_dir3_leaf_log_header(args, bp);
+       xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh);
+       xfs_dir3_leaf_check(dp, bp);
+       return 0;
+}
+
+#ifdef DEBUG
+static void
+xfs_dir2_free_hdr_check(
+       struct xfs_inode *dp,
+       struct xfs_buf  *bp,
+       xfs_dir2_db_t   db)
+{
+       struct xfs_dir3_icfree_hdr hdr;
+
+       dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr);
+
+       ASSERT((hdr.firstdb %
+               dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0);
+       ASSERT(hdr.firstdb <= db);
+       ASSERT(db < hdr.firstdb + hdr.nvalid);
+}
+#else
+#define xfs_dir2_free_hdr_check(dp, bp, db)
+#endif /* DEBUG */
+
+/*
+ * Return the last hash value in the leaf.
+ * Stale entries are ok.
+ */
+xfs_dahash_t                                   /* hash value */
+xfs_dir2_leafn_lasthash(
+       struct xfs_inode *dp,
+       struct xfs_buf  *bp,                    /* leaf buffer */
+       int             *count)                 /* count of entries in leaf */
+{
+       struct xfs_dir2_leaf    *leaf = bp->b_addr;
+       struct xfs_dir2_leaf_entry *ents;
+       struct xfs_dir3_icleaf_hdr leafhdr;
+
+       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+       ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+              leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+
+       if (count)
+               *count = leafhdr.count;
+       if (!leafhdr.count)
+               return 0;
+
+       ents = dp->d_ops->leaf_ents_p(leaf);
+       return be32_to_cpu(ents[leafhdr.count - 1].hashval);
+}
+
+/*
+ * Look up a leaf entry for space to add a name in a node-format leaf block.
+ * The extrablk in state is a freespace block.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_addname(
+       struct xfs_buf          *bp,            /* leaf buffer */
+       xfs_da_args_t           *args,          /* operation arguments */
+       int                     *indexp,        /* out: leaf entry index */
+       xfs_da_state_t          *state)         /* state to fill in */
+{
+       struct xfs_buf          *curbp = NULL;  /* current data/free buffer */
+       xfs_dir2_db_t           curdb = -1;     /* current data block number */
+       xfs_dir2_db_t           curfdb = -1;    /* current free block number */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     error;          /* error return value */
+       int                     fi;             /* free entry index */
+       xfs_dir2_free_t         *free = NULL;   /* free block structure */
+       int                     index;          /* leaf entry index */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       int                     length;         /* length of new data entry */
+       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       xfs_dir2_db_t           newdb;          /* new data block number */
+       xfs_dir2_db_t           newfdb;         /* new free block number */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       struct xfs_dir2_leaf_entry *ents;
+       struct xfs_dir3_icleaf_hdr leafhdr;
+
+       dp = args->dp;
+       tp = args->trans;
+       mp = dp->i_mount;
+       leaf = bp->b_addr;
+       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+       ents = dp->d_ops->leaf_ents_p(leaf);
+
+       xfs_dir3_leaf_check(dp, bp);
+       ASSERT(leafhdr.count > 0);
+
+       /*
+        * Look up the hash value in the leaf entries.
+        */
+       index = xfs_dir2_leaf_search_hash(args, bp);
+       /*
+        * Do we have a buffer coming in?
+        */
+       if (state->extravalid) {
+               /* If so, it's a free block buffer, get the block number. */
+               curbp = state->extrablk.bp;
+               curfdb = state->extrablk.blkno;
+               free = curbp->b_addr;
+               ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+                      free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+       }
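+       /* Compute the space the new name's data entry will need. */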
+       length = dp->d_ops->data_entsize(args->namelen);
+       /*
+        * Loop over leaf entries with the right hash value.
+        */
+       for (lep = &ents[index];
+            index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+            lep++, index++) {
+               /*
+                * Skip stale leaf entries.
+                */
+               if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+                       continue;
+               /*
+                * Pull the data block number from the entry.
+                */
+               newdb = xfs_dir2_dataptr_to_db(args->geo,
+                                              be32_to_cpu(lep->address));
+               /*
+                * For addname, we're looking for a place to put the new entry.
+                * We want to use a data block with an entry of equal
+                * hash value to ours if there is one with room.
+                *
+                * If this block isn't the data block we already have
+                * in hand, take a look at it.
+                */
+               if (newdb != curdb) {
+                       __be16 *bests;
+
+                       curdb = newdb;
+                       /*
+                        * Convert the data block to the free block
+                        * holding its freespace information.
+                        */
+                       newfdb = dp->d_ops->db_to_fdb(args->geo, newdb);
+                       /*
+                        * If it's not the one we have in hand, read it in.
+                        */
+                       if (newfdb != curfdb) {
+                               /*
+                                * If we had one before, drop it.
+                                */
+                               if (curbp)
+                                       xfs_trans_brelse(tp, curbp);
+
+                               error = xfs_dir2_free_read(tp, dp,
+                                               xfs_dir2_db_to_da(args->geo,
+                                                                 newfdb),
+                                               &curbp);
+                               if (error)
+                                       return error;
+                               free = curbp->b_addr;
+
+                               xfs_dir2_free_hdr_check(dp, curbp, curdb);
+                       }
+                       /*
+                        * Get the index for our entry.
+                        */
+                       fi = dp->d_ops->db_to_fdindex(args->geo, curdb);
+                       /*
+                        * If it has room, return it.
+                        */
+                       bests = dp->d_ops->free_bests_p(free);
+                       if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
+                               XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
+                                                       XFS_ERRLEVEL_LOW, mp);
+                               if (curfdb != newfdb)
+                                       xfs_trans_brelse(tp, curbp);
+                               return -EFSCORRUPTED;
+                       }
+                       curfdb = newfdb;
+                       if (be16_to_cpu(bests[fi]) >= length)
+                               goto out;
+               }
+       }
+       /* Didn't find any space */
+       fi = -1;
+out:
+       ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+       if (curbp) {
+               /* Giving back a free block. */
+               state->extravalid = 1;
+               state->extrablk.bp = curbp;
+               state->extrablk.index = fi;
+               state->extrablk.blkno = curfdb;
+
+               /*
+                * Important: this magic number is not in the buffer - it's for
+                * buffer type information and therefore only the free/data type
+                * matters here, not whether CRCs are enabled or not.
+                */
+               state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
+       } else {
+               state->extravalid = 0;
+       }
+       /*
+        * Return the index; it will be the insertion point.
+        */
+       *indexp = index;
+       return -ENOENT;
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * The extrablk in state is a data block.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_entry(
+       struct xfs_buf          *bp,            /* leaf buffer */
+       xfs_da_args_t           *args,          /* operation arguments */
+       int                     *indexp,        /* out: leaf entry index */
+       xfs_da_state_t          *state)         /* state to fill in */
+{
+       struct xfs_buf          *curbp = NULL;  /* current data/free buffer */
+       xfs_dir2_db_t           curdb = -1;     /* current data block number */
+       xfs_dir2_data_entry_t   *dep;           /* data block entry */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     error;          /* error return value */
+       int                     index;          /* leaf entry index */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       xfs_dir2_db_t           newdb;          /* new data block number */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       enum xfs_dacmp          cmp;            /* comparison result */
+       struct xfs_dir2_leaf_entry *ents;
+       struct xfs_dir3_icleaf_hdr leafhdr;
+
+       dp = args->dp;
+       tp = args->trans;
+       mp = dp->i_mount;
+       leaf = bp->b_addr;
+       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+       ents = dp->d_ops->leaf_ents_p(leaf);
+
+       xfs_dir3_leaf_check(dp, bp);
+       ASSERT(leafhdr.count > 0);
+
+       /*
+        * Look up the hash value in the leaf entries.
+        */
+       index = xfs_dir2_leaf_search_hash(args, bp);
+       /*
+        * Do we have a buffer coming in?
+        */
+       if (state->extravalid) {
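+               /* If so, it's a data block buffer saved earlier in the lookup. */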
+               curbp = state->extrablk.bp;
+               curdb = state->extrablk.blkno;
+       }
+       /*
+        * Loop over leaf entries with the right hash value.
+        */
+       for (lep = &ents[index];
+            index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+            lep++, index++) {
+               /*
+                * Skip stale leaf entries.
+                */
+               if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+                       continue;
+               /*
+                * Pull the data block number from the entry.
+                */
+               newdb = xfs_dir2_dataptr_to_db(args->geo,
+                                              be32_to_cpu(lep->address));
+               /*
+                * Not adding a new entry, so we really want to find
+                * the name given to us.
+                *
+                * If it's a different data block, go get it.
+                */
+               if (newdb != curdb) {
+                       /*
+                        * If we had a block before that we aren't saving
+                        * for a CI name, drop it.
+                        */
+                       if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT ||
+                                               curdb != state->extrablk.blkno))
+                               xfs_trans_brelse(tp, curbp);
+                       /*
+                        * If we need the block that is saved with a CI match,
+                        * use it; otherwise read in the new data block.
+                        */
+                       if (args->cmpresult != XFS_CMP_DIFFERENT &&
+                                       newdb == state->extrablk.blkno) {
+                               ASSERT(state->extravalid);
+                               curbp = state->extrablk.bp;
+                       } else {
+                               error = xfs_dir3_data_read(tp, dp,
+                                               xfs_dir2_db_to_da(args->geo,
+                                                                 newdb),
+                                               -1, &curbp);
+                               if (error)
+                                       return error;
+                       }
+                       xfs_dir3_data_check(dp, curbp);
+                       curdb = newdb;
+               }
+               /*
+                * Point to the data entry.
+                */
+               dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr +
+                       xfs_dir2_dataptr_to_off(args->geo,
+                                               be32_to_cpu(lep->address)));
+               /*
+                * Compare the entry and if it's an exact match, return
+                * EEXIST immediately. If it's the first case-insensitive
+                * match, store the block & inode number and continue looking.
+                */
+               cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+               if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+                       /* If there is a CI match block, drop it */
+                       if (args->cmpresult != XFS_CMP_DIFFERENT &&
+                                               curdb != state->extrablk.blkno)
+                               xfs_trans_brelse(tp, state->extrablk.bp);
+                       args->cmpresult = cmp;
+                       args->inumber = be64_to_cpu(dep->inumber);
+                       args->filetype = dp->d_ops->data_get_ftype(dep);
+                       *indexp = index;
+                       state->extravalid = 1;
+                       state->extrablk.bp = curbp;
+                       state->extrablk.blkno = curdb;
+                       state->extrablk.index = (int)((char *)dep -
+                                                       (char *)curbp->b_addr);
+                       state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+                       curbp->b_ops = &xfs_dir3_data_buf_ops;
+                       xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
+                       if (cmp == XFS_CMP_EXACT)
+                               return -EEXIST;
+               }
+       }
+       ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
+       if (curbp) {
+               if (args->cmpresult == XFS_CMP_DIFFERENT) {
+                       /* Giving back last used data block. */
+                       state->extravalid = 1;
+                       state->extrablk.bp = curbp;
+                       state->extrablk.index = -1;
+                       state->extrablk.blkno = curdb;
+                       state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+                       curbp->b_ops = &xfs_dir3_data_buf_ops;
+                       xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
+               } else {
+                       /* If the curbp is not the CI match block, drop it */
+                       if (state->extrablk.bp != curbp)
+                               xfs_trans_brelse(tp, curbp);
+               }
+       } else {
+               state->extravalid = 0;
+       }
+       *indexp = index;
+       return -ENOENT;
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * If this is an addname then the extrablk in state is a freespace block,
+ * otherwise it's a data block.
+ */
+int
+xfs_dir2_leafn_lookup_int(
+       struct xfs_buf          *bp,            /* leaf buffer */
+       xfs_da_args_t           *args,          /* operation arguments */
+       int                     *indexp,        /* out: leaf entry index */
+       xfs_da_state_t          *state)         /* state to fill in */
+{
+       if (args->op_flags & XFS_DA_OP_ADDNAME)
+               return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp,
+                                                       state);
+       return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state);
+}
+
+/*
+ * Move count leaf entries from source to destination leaf.
+ * Log entries and headers.  Stale entries are preserved.
+ */
+static void
+xfs_dir3_leafn_moveents(
+       xfs_da_args_t                   *args,  /* operation arguments */
+       struct xfs_buf                  *bp_s,  /* source */
+       struct xfs_dir3_icleaf_hdr      *shdr,
+       struct xfs_dir2_leaf_entry      *sents,
+       int                             start_s,/* source leaf index */
+       struct xfs_buf                  *bp_d,  /* destination */
+       struct xfs_dir3_icleaf_hdr      *dhdr,
+       struct xfs_dir2_leaf_entry      *dents,
+       int                             start_d,/* destination leaf index */
+       int                             count)  /* count of leaves to copy */
+{
+       int                             stale;  /* count stale leaves copied */
+
+       trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
+
+       /*
+        * Silently return if nothing to do.
+        */
+       if (count == 0)
+               return;
+
+       /*
+        * If the destination index is not the end of the current
+        * destination leaf entries, open up a hole in the destination
+        * to hold the new entries.
+        */
+       if (start_d < dhdr->count) {
+               memmove(&dents[start_d + count], &dents[start_d],
+                       (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
+               xfs_dir3_leaf_log_ents(args, bp_d, start_d + count,
+                                      count + dhdr->count - 1);
+       }
+       /*
+        * If the source has stale leaves, count the ones in the copy range
+        * so we can update the header correctly.
+        */
+       if (shdr->stale) {
+               int     i;                      /* temp leaf index */
+
+               for (i = start_s, stale = 0; i < start_s + count; i++) {
+                       if (sents[i].address ==
+                                       cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+                               stale++;
+               }
+       } else
+               stale = 0;
+       /*
+        * Copy the leaf entries from source to destination.
+        */
+       memcpy(&dents[start_d], &sents[start_s],
+               count * sizeof(xfs_dir2_leaf_entry_t));
+       xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1);
+
+       /*
+        * If there are source entries after the ones we copied,
+        * delete the ones we copied by sliding the next ones down.
+        */
+       if (start_s + count < shdr->count) {
+               memmove(&sents[start_s], &sents[start_s + count],
+                       count * sizeof(xfs_dir2_leaf_entry_t));
+               xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1);
+       }
+
+       /*
+        * Update the headers and log them.
+        */
+       shdr->count -= count;
+       shdr->stale -= stale;
+       dhdr->count += count;
+       dhdr->stale += stale;
+}
+
+/*
+ * Determine the sort order of two leaf blocks.
+ * Returns 1 if both are valid and leaf2 should be before leaf1, else 0.
+ */
+int                                            /* sort order */
+xfs_dir2_leafn_order(
+       struct xfs_inode        *dp,
+       struct xfs_buf          *leaf1_bp,              /* leaf1 buffer */
+       struct xfs_buf          *leaf2_bp)              /* leaf2 buffer */
+{
+       struct xfs_dir2_leaf    *leaf1 = leaf1_bp->b_addr;
+       struct xfs_dir2_leaf    *leaf2 = leaf2_bp->b_addr;
+       struct xfs_dir2_leaf_entry *ents1;
+       struct xfs_dir2_leaf_entry *ents2;
+       struct xfs_dir3_icleaf_hdr hdr1;
+       struct xfs_dir3_icleaf_hdr hdr2;
+
+       dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
+       dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
+       ents1 = dp->d_ops->leaf_ents_p(leaf1);
+       ents2 = dp->d_ops->leaf_ents_p(leaf2);
+
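+       /*
+        * leaf2 sorts first if either its lowest or its highest hash value
+        * is below the corresponding value in leaf1.
+        */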
+       if (hdr1.count > 0 && hdr2.count > 0 &&
+           (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
+            be32_to_cpu(ents2[hdr2.count - 1].hashval) <
+                               be32_to_cpu(ents1[hdr1.count - 1].hashval)))
+               return 1;
+       return 0;
+}
+
+/*
+ * Rebalance leaf entries between two leaf blocks.
+ * This is actually only called when the second block is new,
+ * though the code deals with the general case.
+ * A new entry will be inserted in one of the blocks, and that
+ * entry is taken into account when balancing.
+ */
+static void
+xfs_dir2_leafn_rebalance(
+       xfs_da_state_t          *state,         /* btree cursor */
+       xfs_da_state_blk_t      *blk1,          /* first btree block */
+       xfs_da_state_blk_t      *blk2)          /* second btree block */
+{
+       xfs_da_args_t           *args;          /* operation arguments */
+       int                     count;          /* count (& direction) of leaves moved */
+       int                     isleft;         /* new goes in left leaf */
+       xfs_dir2_leaf_t         *leaf1;         /* first leaf structure */
+       xfs_dir2_leaf_t         *leaf2;         /* second leaf structure */
+       int                     mid;            /* midpoint leaf index */
+#if defined(DEBUG) || defined(XFS_WARN)
+       int                     oldstale;       /* old count of stale leaves */
+#endif
+       int                     oldsum;         /* old total leaf count */
+       int                     swap;           /* swapped leaf blocks */
+       struct xfs_dir2_leaf_entry *ents1;
+       struct xfs_dir2_leaf_entry *ents2;
+       struct xfs_dir3_icleaf_hdr hdr1;
+       struct xfs_dir3_icleaf_hdr hdr2;
+       struct xfs_inode        *dp = state->args->dp;
+
+       args = state->args;
+       /*
+        * If the block order is wrong, swap the arguments.
+        */
+       swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp);
+       if (swap) {
+               xfs_da_state_blk_t      *tmp;   /* temp for block swap */
+
+               tmp = blk1;
+               blk1 = blk2;
+               blk2 = tmp;
+       }
+       leaf1 = blk1->bp->b_addr;
+       leaf2 = blk2->bp->b_addr;
+       dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
+       dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
+       ents1 = dp->d_ops->leaf_ents_p(leaf1);
+       ents2 = dp->d_ops->leaf_ents_p(leaf2);
+
+       oldsum = hdr1.count + hdr2.count;
+#if defined(DEBUG) || defined(XFS_WARN)
+       oldstale = hdr1.stale + hdr2.stale;
+#endif
+       mid = oldsum >> 1;
+
+       /*
+        * If the old leaf count was odd then the new one will be even,
+        * so we need to divide the new count evenly.
+        */
+       if (oldsum & 1) {
+               xfs_dahash_t    midhash;        /* middle entry hash value */
+
+               if (mid >= hdr1.count)
+                       midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval);
+               else
+                       midhash = be32_to_cpu(ents1[mid].hashval);
+               isleft = args->hashval <= midhash;
+       }
+       /*
+        * If the old count is even then the new count is odd, so there's
+        * no preferred side for the new entry.
+        * Pick the left one.
+        */
+       else
+               isleft = 1;
+       /*
+        * Calculate moved entry count.  Positive means left-to-right,
+        * negative means right-to-left; moveents takes a positive count,
+        * so it is negated for the right-to-left case.  Then move the
+        * entries.
+        */
+       count = hdr1.count - mid + (isleft == 0);
+       if (count > 0)
+               xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1,
+                                       hdr1.count - count, blk2->bp,
+                                       &hdr2, ents2, 0, count);
+       else if (count < 0)
+               xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0,
+                                       blk1->bp, &hdr1, ents1,
+                                       hdr1.count, -count);
+
+       ASSERT(hdr1.count + hdr2.count == oldsum);
+       ASSERT(hdr1.stale + hdr2.stale == oldstale);
+
+       /* log the changes made when moving the entries */
+       dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1);
+       dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2);
+       xfs_dir3_leaf_log_header(args, blk1->bp);
+       xfs_dir3_leaf_log_header(args, blk2->bp);
+
+       xfs_dir3_leaf_check(dp, blk1->bp);
+       xfs_dir3_leaf_check(dp, blk2->bp);
+
+       /*
+        * Mark whether we're inserting into the old or new leaf.
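+        * With equal counts, the new entry goes wherever its insertion
+        * index falls; the XOR with "swap" undoes the blk1/blk2 exchange
+        * made above when the blocks were out of order.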
+        */
+       if (hdr1.count < hdr2.count)
+               state->inleaf = swap;
+       else if (hdr1.count > hdr2.count)
+               state->inleaf = !swap;
+       else
+               state->inleaf = swap ^ (blk1->index <= hdr1.count);
+       /*
+        * Adjust the expected index for insertion.
+        */
+       if (!state->inleaf)
+               blk2->index = blk1->index - hdr1.count;
+
+       /*
+        * Finally sanity check just to make sure we are not returning a
+        * negative index
+        */
+       if (blk2->index < 0) {
+               state->inleaf = 1;
+               blk2->index = 0;
+               xfs_alert(dp->i_mount,
+       "%s: picked the wrong leaf? reverting original leaf: blk1->index %d",
+                       __func__, blk1->index);
+       }
+}
+
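+/*
+ * Update the free index block after a data block's free space has changed.
+ * If the data block still exists (hdr != NULL), just record its new longest
+ * free space; otherwise clear its bests entry, and trim or free the free
+ * index block itself once it becomes empty.
+ */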
+static int
+xfs_dir3_data_block_free(
+       xfs_da_args_t           *args,
+       struct xfs_dir2_data_hdr *hdr,
+       struct xfs_dir2_free    *free,
+       xfs_dir2_db_t           fdb,
+       int                     findex,
+       struct xfs_buf          *fbp,
+       int                     longest)
+{
+       int                     logfree = 0;
+       __be16                  *bests;
+       struct xfs_dir3_icfree_hdr freehdr;
+       struct xfs_inode        *dp = args->dp;
+
+       dp->d_ops->free_hdr_from_disk(&freehdr, free);
+       bests = dp->d_ops->free_bests_p(free);
+       if (hdr) {
+               /*
+                * Data block is not empty, just set the free entry to the new
+                * value.
+                */
+               bests[findex] = cpu_to_be16(longest);
+               xfs_dir2_free_log_bests(args, fbp, findex, findex);
+               return 0;
+       }
+
+       /* One less used entry in the free table. */
+       freehdr.nused--;
+
+       /*
+        * If this was the last entry in the table, we can trim the table size
+        * back.  There might be other entries at the end referring to
+        * non-existent data blocks, get those too.
+        */
+       if (findex == freehdr.nvalid - 1) {
+               int     i;              /* free entry index */
+
+               for (i = findex - 1; i >= 0; i--) {
+                       if (bests[i] != cpu_to_be16(NULLDATAOFF))
+                               break;
+               }
+               freehdr.nvalid = i + 1;
+               logfree = 0;
+       } else {
+               /* Not the last entry, just punch it out.  */
+               bests[findex] = cpu_to_be16(NULLDATAOFF);
+               logfree = 1;
+       }
+
+       dp->d_ops->free_hdr_to_disk(free, &freehdr);
+       xfs_dir2_free_log_header(args, fbp);
+
+       /*
+        * If there are no useful entries left in the block, get rid of the
+        * block if we can.
+        */
+       if (!freehdr.nused) {
+               int error;
+
+               error = xfs_dir2_shrink_inode(args, fdb, fbp);
+               if (error == 0) {
+                       fbp = NULL;
+                       logfree = 0;
+               } else if (error != -ENOSPC || args->total != 0)
+                       return error;
+               /*
+                * It's possible to get ENOSPC if there is no
+                * space reservation.  In this case someone
+                * else will eventually get rid of this block.
+                */
+       }
+
+       /* Log the free entry that changed, unless we got rid of it.  */
+       if (logfree)
+               xfs_dir2_free_log_bests(args, fbp, findex, findex);
+       return 0;
+}
+
+/*
+ * Remove an entry from a node directory.
+ * This removes the leaf entry and the data entry,
+ * and updates the free block if necessary.
+ */
+static int                                     /* error */
+xfs_dir2_leafn_remove(
+       xfs_da_args_t           *args,          /* operation arguments */
+       struct xfs_buf          *bp,            /* leaf buffer */
+       int                     index,          /* leaf entry index */
+       xfs_da_state_blk_t      *dblk,          /* data block */
+       int                     *rval)          /* resulting block needs join */
+{
+       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+       xfs_dir2_db_t           db;             /* data block number */
+       struct xfs_buf          *dbp;           /* data block buffer */
+       xfs_dir2_data_entry_t   *dep;           /* data block entry */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+       int                     longest;        /* longest data free entry */
+       int                     off;            /* data block entry offset */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       int                     needlog;        /* need to log data header */
+       int                     needscan;       /* need to rescan data frees */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       struct xfs_dir2_data_free *bf;          /* bestfree table */
+       struct xfs_dir3_icleaf_hdr leafhdr;
+       struct xfs_dir2_leaf_entry *ents;
+
+       trace_xfs_dir2_leafn_remove(args, index);
+
+       dp = args->dp;
+       tp = args->trans;
+       mp = dp->i_mount;
+       leaf = bp->b_addr;
+       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+       ents = dp->d_ops->leaf_ents_p(leaf);
+
+       /*
+        * Point to the entry we're removing.
+        */
+       lep = &ents[index];
+
+       /*
+        * Extract the data block and offset from the entry.
+        */
+       db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+       ASSERT(dblk->blkno == db);
+       off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address));
+       ASSERT(dblk->index == off);
+
+       /*
+        * Kill the leaf entry by marking it stale.
+        * Log the leaf block changes.
+        */
+       leafhdr.stale++;
+       dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+       xfs_dir3_leaf_log_header(args, bp);
+
+       lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+       xfs_dir3_leaf_log_ents(args, bp, index, index);
+
+       /*
+        * Make the data entry free.  Keep track of the longest freespace
+        * in the data block in case it changes.
+        */
+       dbp = dblk->bp;
+       hdr = dbp->b_addr;
+       dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
+       bf = dp->d_ops->data_bestfree_p(hdr);
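+       /* bf[0] always holds the longest free region in the data block. */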
+       longest = be16_to_cpu(bf[0].length);
+       needlog = needscan = 0;
+       xfs_dir2_data_make_free(args, dbp, off,
+               dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+       /*
+        * Rescan the data block freespaces for bestfree.
+        * Log the data block header if needed.
+        */
+       if (needscan)
+               xfs_dir2_data_freescan(dp, hdr, &needlog);
+       if (needlog)
+               xfs_dir2_data_log_header(args, dbp);
+       xfs_dir3_data_check(dp, dbp);
+       /*
+        * If the longest data block freespace changes, need to update
+        * the corresponding freeblock entry.
+        */
+       if (longest < be16_to_cpu(bf[0].length)) {
+               int             error;          /* error return value */
+               struct xfs_buf  *fbp;           /* freeblock buffer */
+               xfs_dir2_db_t   fdb;            /* freeblock block number */
+               int             findex;         /* index in freeblock entries */
+               xfs_dir2_free_t *free;          /* freeblock structure */
+
+               /*
+                * Convert the data block number to a free block,
+                * read in the free block.
+                */
+               fdb = dp->d_ops->db_to_fdb(args->geo, db);
+               error = xfs_dir2_free_read(tp, dp,
+                                          xfs_dir2_db_to_da(args->geo, fdb),
+                                          &fbp);
+               if (error)
+                       return error;
+               free = fbp->b_addr;
+#ifdef DEBUG
+       {
+               struct xfs_dir3_icfree_hdr freehdr;
+               dp->d_ops->free_hdr_from_disk(&freehdr, free);
+               ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) *
+                       (fdb - xfs_dir2_byte_to_db(args->geo,
+                                                  XFS_DIR2_FREE_OFFSET)));
+       }
+#endif
+               /*
+                * Calculate which entry we need to fix.
+                */
+               findex = dp->d_ops->db_to_fdindex(args->geo, db);
+               longest = be16_to_cpu(bf[0].length);
+               /*
+                * If the data block is now empty we can get rid of it
+                * (usually).
+                */
+               if (longest == args->geo->blksize -
+                              dp->d_ops->data_entry_offset) {
+                       /*
+                        * Try to punch out the data block.
+                        */
+                       error = xfs_dir2_shrink_inode(args, db, dbp);
+                       if (error == 0) {
+                               dblk->bp = NULL;
+                               hdr = NULL;
+                       }
+                       /*
+                        * We can get ENOSPC if there's no space reservation.
+                        * In this case just drop the buffer and someone else
+                        * will eventually get rid of the empty block.
+                        */
+                       else if (!(error == -ENOSPC && args->total == 0))
+                               return error;
+               }
+               /*
+                * If we got rid of the data block, we can eliminate that entry
+                * in the free block.
+                */
+               error = xfs_dir3_data_block_free(args, hdr, free,
+                                                fdb, findex, fbp, longest);
+               if (error)
+                       return error;
+       }
+
+       xfs_dir3_leaf_check(dp, bp);
+       /*
+        * Return indication of whether this leaf block is empty enough
+        * to justify trying to join it with a neighbor.
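+        * (magicpct is roughly 37% of the directory block size.)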
+        */
+       *rval = (dp->d_ops->leaf_hdr_size +
+                (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
+               args->geo->magicpct;
+       return 0;
+}
+
+/*
+ * Split the leaf entries in the old block into old and new blocks.
+ */
+int                                            /* error */
+xfs_dir2_leafn_split(
+       xfs_da_state_t          *state,         /* btree cursor */
+       xfs_da_state_blk_t      *oldblk,        /* original block */
+       xfs_da_state_blk_t      *newblk)        /* newly created block */
+{
+       xfs_da_args_t           *args;          /* operation arguments */
+       xfs_dablk_t             blkno;          /* new leaf block number */
+       int                     error;          /* error return value */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       struct xfs_inode        *dp;
+
+       /*
+        * Allocate space for a new leaf node.
+        */
+       args = state->args;
+       dp = args->dp;
+       mp = dp->i_mount;
+       ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
+       error = xfs_da_grow_inode(args, &blkno);
+       if (error)
+               return error;
+       /*
+        * Initialize the new leaf block.
+        */
+       error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno),
+                                     &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
+       if (error)
+               return error;
+
+       newblk->blkno = blkno;
+       newblk->magic = XFS_DIR2_LEAFN_MAGIC;
+       /*
+        * Rebalance the entries across the two leaves, link the new
+        * block into the leaves.
+        */
+       xfs_dir2_leafn_rebalance(state, oldblk, newblk);
+       error = xfs_da3_blk_link(state, oldblk, newblk);
+       if (error)
+               return error;
+       /*
+        * Insert the new entry in the correct block.
+        */
+       if (state->inleaf)
+               error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index);
+       else
+               error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index);
+       /*
+        * Update last hashval in each block since we added the name.
+        */
+       oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL);
+       newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL);
+       xfs_dir3_leaf_check(dp, oldblk->bp);
+       xfs_dir3_leaf_check(dp, newblk->bp);
+       return error;
+}
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+int                                            /* error */
+xfs_dir2_leafn_toosmall(
+       xfs_da_state_t          *state,         /* btree cursor */
+       int                     *action)        /* resulting action to take */
+{
+       xfs_da_state_blk_t      *blk;           /* leaf block */
+       xfs_dablk_t             blkno;          /* leaf block number */
+       struct xfs_buf          *bp;            /* leaf buffer */
+       int                     bytes;          /* bytes in use */
+       int                     count;          /* leaf live entry count */
+       int                     error;          /* error return value */
+       int                     forward;        /* sibling block direction */
+       int                     i;              /* sibling counter */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       int                     rval;           /* result from path_shift */
+       struct xfs_dir3_icleaf_hdr leafhdr;
+       struct xfs_dir2_leaf_entry *ents;
+       struct xfs_inode        *dp = state->args->dp;
+
+       /*
+        * Check for the degenerate case of the block being over 50% full.
+        * If so, it's not worth even looking to see if we might be able
+        * to coalesce with a sibling.
+        */
+       blk = &state->path.blk[state->path.active - 1];
+       leaf = blk->bp->b_addr;
+       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+       ents = dp->d_ops->leaf_ents_p(leaf);
+       xfs_dir3_leaf_check(dp, blk->bp);
+
+       count = leafhdr.count - leafhdr.stale;
+       bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]);
+       if (bytes > (state->args->geo->blksize >> 1)) {
+               /*
+                * Blk over 50%, don't try to join.
+                */
+               *action = 0;
+               return 0;
+       }
+       /*
+        * Check for the degenerate case of the block being empty.
+        * If the block is empty, we'll simply delete it, no need to
+        * coalesce it with a sibling block.  We choose (arbitrarily)
+        * to merge with the forward block unless it is NULL.
+        */
+       if (count == 0) {
+               /*
+                * Make altpath point to the block we want to keep and
+                * path point to the block we want to drop (this one).
+                */
+               forward = (leafhdr.forw != 0);
+               memcpy(&state->altpath, &state->path, sizeof(state->path));
+               error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
+                       &rval);
+               if (error)
+                       return error;
+               *action = rval ? 2 : 0;
+               return 0;
+       }
+       /*
+        * Examine each sibling block to see if we can coalesce with
+        * at least 25% free space to spare.  We need to figure out
+        * whether to merge with the forward or the backward block.
+        * We prefer coalescing with the lower numbered sibling so as
+        * to shrink a directory over time.
+        */
+       forward = leafhdr.forw < leafhdr.back;
+       for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
+               struct xfs_dir3_icleaf_hdr hdr2;
+
+               blkno = forward ? leafhdr.forw : leafhdr.back;
+               if (blkno == 0)
+                       continue;
+               /*
+                * Read the sibling leaf block.
+                */
+               error = xfs_dir3_leafn_read(state->args->trans, dp,
+                                           blkno, -1, &bp);
+               if (error)
+                       return error;
+
+               /*
+                * Count the live entries in both blocks against a budget of
+                * 75% of one block, so a merge leaves at least 25% free.
+                */
+               count = leafhdr.count - leafhdr.stale;
+               bytes = state->args->geo->blksize -
+                       (state->args->geo->blksize >> 2);
+
+               leaf = bp->b_addr;
+               dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf);
+               ents = dp->d_ops->leaf_ents_p(leaf);
+               count += hdr2.count - hdr2.stale;
+               bytes -= count * sizeof(ents[0]);
+
+               /*
+                * Fits with at least 25% to spare.
+                */
+               if (bytes >= 0)
+                       break;
+               xfs_trans_brelse(state->args->trans, bp);
+       }
+       /*
+        * Didn't like either block, give up.
+        */
+       if (i >= 2) {
+               *action = 0;
+               return 0;
+       }
+
+       /*
+        * Make altpath point to the block we want to keep (the lower
+        * numbered block) and path point to the block we want to drop.
+        */
+       memcpy(&state->altpath, &state->path, sizeof(state->path));
+       if (blkno < blk->blkno)
+               error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
+                       &rval);
+       else
+               error = xfs_da3_path_shift(state, &state->path, forward, 0,
+                       &rval);
+       if (error)
+               return error;
+       *action = rval ? 0 : 1;
+       return 0;
+}
+
+/*
+ * Move all the leaf entries from drop_blk to save_blk.
+ * This is done as part of a join operation.
+ */
+void
+xfs_dir2_leafn_unbalance(
+       xfs_da_state_t          *state,         /* cursor */
+       xfs_da_state_blk_t      *drop_blk,      /* dead block */
+       xfs_da_state_blk_t      *save_blk)      /* surviving block */
+{
+       xfs_da_args_t           *args;          /* operation arguments */
+       xfs_dir2_leaf_t         *drop_leaf;     /* dead leaf structure */
+       xfs_dir2_leaf_t         *save_leaf;     /* surviving leaf structure */
+       struct xfs_dir3_icleaf_hdr savehdr;
+       struct xfs_dir3_icleaf_hdr drophdr;
+       struct xfs_dir2_leaf_entry *sents;
+       struct xfs_dir2_leaf_entry *dents;
+       struct xfs_inode        *dp = state->args->dp;
+
+       args = state->args;
+       ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+       ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+       drop_leaf = drop_blk->bp->b_addr;
+       save_leaf = save_blk->bp->b_addr;
+
+       dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf);
+       dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf);
+       sents = dp->d_ops->leaf_ents_p(save_leaf);
+       dents = dp->d_ops->leaf_ents_p(drop_leaf);
+
+       /*
+        * If there are any stale leaf entries, take this opportunity
+        * to purge them.
+        */
+       if (drophdr.stale)
+               xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp);
+       if (savehdr.stale)
+               xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp);
+
+       /*
+        * Move the entries from drop to the appropriate end of save.
+        */
+       drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval);
+       if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp))
+               xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+                                       save_blk->bp, &savehdr, sents, 0,
+                                       drophdr.count);
+       else
+               xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+                                       save_blk->bp, &savehdr, sents,
+                                       savehdr.count, drophdr.count);
+       save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
+
+       /* log the changes made when moving the entries */
+       dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr);
+       dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr);
+       xfs_dir3_leaf_log_header(args, save_blk->bp);
+       xfs_dir3_leaf_log_header(args, drop_blk->bp);
+
+       xfs_dir3_leaf_check(dp, save_blk->bp);
+       xfs_dir3_leaf_check(dp, drop_blk->bp);
+}
+
+/*
+ * Top-level node form directory addname routine.
+ */
+int                                            /* error */
+xfs_dir2_node_addname(
+       xfs_da_args_t           *args)          /* operation arguments */
+{
+       xfs_da_state_blk_t      *blk;           /* leaf block for insert */
+       int                     error;          /* error return value */
+       int                     rval;           /* sub-return value */
+       xfs_da_state_t          *state;         /* btree cursor */
+
+       trace_xfs_dir2_node_addname(args);
+
+       /*
+        * Allocate and initialize the state (btree cursor).
+        */
+       state = xfs_da_state_alloc();
+       state->args = args;
+       state->mp = args->dp->i_mount;
+       /*
+        * Look up the name.  We're not supposed to find it, but
+        * this gives us the insertion point.
+        */
+       error = xfs_da3_node_lookup_int(state, &rval);
+       if (error)
+               rval = error;
+       if (rval != -ENOENT)
+               goto done;
+       /*
+        * Add the data entry to a data block.
+        * Extravalid is set to a freeblock found by lookup.
+        */
+       rval = xfs_dir2_node_addname_int(args,
+               state->extravalid ? &state->extrablk : NULL);
+       if (rval)
+               goto done;
+       blk = &state->path.blk[state->path.active - 1];
+       ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+       /*
+        * Add the new leaf entry.
+        */
+       rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
+       if (rval == 0) {
+               /*
+                * It worked, fix the hash values up the btree.
+                */
+               if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
+                       xfs_da3_fixhashpath(state, &state->path);
+       } else {
+               /*
+                * It didn't work, we need to split the leaf block.
+                */
+               if (args->total == 0) {
+                       ASSERT(rval == -ENOSPC);
+                       goto done;
+               }
+               /*
+                * Split the leaf block and insert the new entry.
+                */
+               rval = xfs_da3_split(state);
+       }
+done:
+       xfs_da_state_free(state);
+       return rval;
+}
+
+/*
+ * Add the data entry for a node-format directory name addition.
+ * The leaf entry is added in xfs_dir2_leafn_add.
+ * We may enter with a freespace block that the lookup found.
+ */
+static int                                     /* error */
+xfs_dir2_node_addname_int(
+       xfs_da_args_t           *args,          /* operation arguments */
+       xfs_da_state_blk_t      *fblk)          /* optional freespace block */
+{
+       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+       xfs_dir2_db_t           dbno;           /* data block number */
+       struct xfs_buf          *dbp;           /* data block buffer */
+       xfs_dir2_data_entry_t   *dep;           /* data entry pointer */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       xfs_dir2_data_unused_t  *dup;           /* data unused entry pointer */
+       int                     error;          /* error return value */
+       xfs_dir2_db_t           fbno;           /* freespace block number */
+       struct xfs_buf          *fbp;           /* freespace buffer */
+       int                     findex;         /* freespace entry index */
+       xfs_dir2_free_t         *free = NULL;   /* freespace block structure */
+       xfs_dir2_db_t           ifbno;          /* initial freespace block no */
+       xfs_dir2_db_t           lastfbno = 0;   /* highest freespace block no */
+       int                     length;         /* length of the new entry */
+       int                     logfree;        /* need to log free entry */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       int                     needlog;        /* need to log data header */
+       int                     needscan;       /* need to rescan data frees */
+       __be16                  *tagp;          /* data entry tag pointer */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       __be16                  *bests;
+       struct xfs_dir3_icfree_hdr freehdr;
+       struct xfs_dir2_data_free *bf;
+
+       dp = args->dp;
+       mp = dp->i_mount;
+       tp = args->trans;
+       length = dp->d_ops->data_entsize(args->namelen);
+       /*
+        * If we came in with a freespace block that means that lookup
+        * found an entry with our hash value.  This is the freespace
+        * block for that data entry.
+        */
+       if (fblk) {
+               fbp = fblk->bp;
+               /*
+                * Remember initial freespace block number.
+                */
+               ifbno = fblk->blkno;
+               free = fbp->b_addr;
+               findex = fblk->index;
+               bests = dp->d_ops->free_bests_p(free);
+               dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+               /*
+                * This means the free entry showed that the data block had
+                * space for our entry, so we remembered it.
+                * Use that data block.
+                */
+               if (findex >= 0) {
+                       ASSERT(findex < freehdr.nvalid);
+                       ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
+                       ASSERT(be16_to_cpu(bests[findex]) >= length);
+                       dbno = freehdr.firstdb + findex;
+               } else {
+                       /*
+                        * The data block we looked at didn't have enough room.
+                        * Start the scan at the beginning of the freespace
+                        * entries.
+                        */
+                       dbno = -1;
+                       findex = 0;
+               }
+       } else {
+               /*
+                * Didn't come in with a freespace block, so no data block.
+                */
+               ifbno = dbno = -1;
+               fbp = NULL;
+               findex = 0;
+       }
+
+       /*
+        * If we don't have a data block yet, we're going to scan the
+        * freespace blocks looking for one.  Figure out what the
+        * highest freespace block number is.
+        */
+       if (dbno == -1) {
+               xfs_fileoff_t   fo;             /* freespace block number */
+
+               error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK);
+               if (error)
+                       return error;
+               lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
+               fbno = ifbno;
+       }
+       /*
+        * While we haven't identified a data block, search the freeblock
+        * data for a good data block.  If we find a null freeblock entry,
+        * indicating a hole in the data blocks, remember that.
+        */
+       while (dbno == -1) {
+               /*
+                * If we don't have a freeblock in hand, get the next one.
+                */
+               if (fbp == NULL) {
+                       /*
+                        * Happens the first time through unless lookup gave
+                        * us a freespace block to start with.
+                        */
+                       if (++fbno == 0)
+                               fbno = xfs_dir2_byte_to_db(args->geo,
+                                                       XFS_DIR2_FREE_OFFSET);
+                       /*
+                        * If it's ifbno we already looked at it.
+                        */
+                       if (fbno == ifbno)
+                               fbno++;
+                       /*
+                        * If it's off the end we're done.
+                        */
+                       if (fbno >= lastfbno)
+                               break;
+                       /*
+                        * Read the block.  There can be holes in the
+                        * freespace blocks, so this might not succeed.
+                        * Holes should be really rare, so there's no reason
+                        * to avoid the speculative read.
+                        */
+                       error = xfs_dir2_free_try_read(tp, dp,
+                                       xfs_dir2_db_to_da(args->geo, fbno),
+                                       &fbp);
+                       if (error)
+                               return error;
+                       if (!fbp)
+                               continue;
+                       free = fbp->b_addr;
+                       findex = 0;
+               }
+               /*
+                * Look at the current free entry.  Is it good enough?
+                *
+                * The bests initialisation should be where the buffer is read in
+                * the above branch. But gcc is too stupid to realise that bests
+                * and the freehdr are actually initialised if they are placed
+                * there, so we have to do it here to avoid warnings. Blech.
+                */
+               bests = dp->d_ops->free_bests_p(free);
+               dp->d_ops->free_hdr_from_disk(&freehdr, free);
+               if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
+                   be16_to_cpu(bests[findex]) >= length)
+                       dbno = freehdr.firstdb + findex;
+               else {
+                       /*
+                        * Are we done with the freeblock?
+                        */
+                       if (++findex == freehdr.nvalid) {
+                               /*
+                                * Drop the block.
+                                */
+                               xfs_trans_brelse(tp, fbp);
+                               fbp = NULL;
+                               if (fblk && fblk->bp)
+                                       fblk->bp = NULL;
+                       }
+               }
+       }
+       /*
+        * If we don't have a data block, we need to allocate one and make
+        * the freespace entries refer to it.
+        */
+       if (unlikely(dbno == -1)) {
+               /*
+                * Not allowed to allocate, return failure.
+                */
+               if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+                       return -ENOSPC;
+
+               /*
+                * Allocate and initialize the new data block.
+                */
+               error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &dbno);
+               if (error)
+                       return error;
+               error = xfs_dir3_data_init(args, dbno, &dbp);
+               if (error)
+                       return error;
+
+               /*
+                * If (somehow) we have a freespace block, get rid of it.
+                */
+               if (fbp)
+                       xfs_trans_brelse(tp, fbp);
+               if (fblk && fblk->bp)
+                       fblk->bp = NULL;
+
+               /*
+                * Get the freespace block corresponding to the data block
+                * that was just allocated.
+                */
+               fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
+               error = xfs_dir2_free_try_read(tp, dp,
+                                      xfs_dir2_db_to_da(args->geo, fbno),
+                                      &fbp);
+               if (error)
+                       return error;
+
+               /*
+                * If there wasn't a freespace block, the read will
+                * return a NULL fbp.  Allocate and initialize a new one.
+                */
+               if (!fbp) {
+                       error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
+                                                   &fbno);
+                       if (error)
+                               return error;
+
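+                       /*
+                        * Sanity check: the free block we grew must be the
+                        * one that maps the new data block; if not, the
+                        * directory is corrupt.
+                        */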
+                       if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
+                               xfs_alert(mp,
+                       "%s: dir ino %llu needed freesp block %lld for\n"
+                       "  data block %lld, got %lld ifbno %llu lastfbno %d",
+                                       __func__, (unsigned long long)dp->i_ino,
+                                       (long long)dp->d_ops->db_to_fdb(
+                                                               args->geo, dbno),
+                                       (long long)dbno, (long long)fbno,
+                                       (unsigned long long)ifbno, lastfbno);
+                               if (fblk) {
+                                       xfs_alert(mp,
+                               " fblk 0x%p blkno %llu index %d magic 0x%x",
+                                               fblk,
+                                               (unsigned long long)fblk->blkno,
+                                               fblk->index,
+                                               fblk->magic);
+                               } else {
+                                       xfs_alert(mp, " ... fblk is NULL");
+                               }
+                               XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
+                                                XFS_ERRLEVEL_LOW, mp);
+                               return -EFSCORRUPTED;
+                       }
+
+                       /*
+                        * Get a buffer for the new block.
+                        */
+                       error = xfs_dir3_free_get_buf(args, fbno, &fbp);
+                       if (error)
+                               return error;
+                       free = fbp->b_addr;
+                       bests = dp->d_ops->free_bests_p(free);
+                       dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+                       /*
+                        * Remember the first slot as our empty slot.
+                        */
+                       freehdr.firstdb =
+                               (fbno - xfs_dir2_byte_to_db(args->geo,
+                                                       XFS_DIR2_FREE_OFFSET)) *
+                                       dp->d_ops->free_max_bests(args->geo);
+               } else {
+                       free = fbp->b_addr;
+                       bests = dp->d_ops->free_bests_p(free);
+                       dp->d_ops->free_hdr_from_disk(&freehdr, free);
+               }
+
+               /*
+                * Set the freespace block index from the data block number.
+                */
+               findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
+               /*
+                * If it's after the end of the current entries in the
+                * freespace block, extend that table.
+                */
+               if (findex >= freehdr.nvalid) {
+                       ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
+                       freehdr.nvalid = findex + 1;
+                       /*
+                        * Tag new entry so nused will go up.
+                        */
+                       bests[findex] = cpu_to_be16(NULLDATAOFF);
+               }
+               /*
+                * If this entry was for an empty data block
+                * (this should always be true) then update the header.
+                */
+               if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
+                       freehdr.nused++;
+                       dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+                       xfs_dir2_free_log_header(args, fbp);
+               }
+               /*
+                * Update the real value in the table.
+                * We haven't allocated the data entry yet so this will
+                * change again.
+                */
+               hdr = dbp->b_addr;
+               bf = dp->d_ops->data_bestfree_p(hdr);
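+               /* Both sides are big-endian, so copy without byte swapping. */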
+               bests[findex] = bf[0].length;
+               logfree = 1;
+       }
+       /*
+        * We had a data block so we don't have to make a new one.
+        */
+       else {
+               /*
+                * If just checking, we succeeded.
+                */
+               if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+                       return 0;
+
+               /*
+                * Read the data block in.
+                */
+               error = xfs_dir3_data_read(tp, dp,
+                                          xfs_dir2_db_to_da(args->geo, dbno),
+                                          -1, &dbp);
+               if (error)
+                       return error;
+               hdr = dbp->b_addr;
+               bf = dp->d_ops->data_bestfree_p(hdr);
+               logfree = 0;
+       }
+       ASSERT(be16_to_cpu(bf[0].length) >= length);
+       /*
+        * Point to the existing unused space.
+        */
+       dup = (xfs_dir2_data_unused_t *)
+             ((char *)hdr + be16_to_cpu(bf[0].offset));
+       needscan = needlog = 0;
+       /*
+        * Mark the first part of the unused space, inuse for us.
+        */
+       xfs_dir2_data_use_free(args, dbp, dup,
+               (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
+               &needlog, &needscan);
+       /*
+        * Fill in the new entry and log it.
+        */
+       dep = (xfs_dir2_data_entry_t *)dup;
+       dep->inumber = cpu_to_be64(args->inumber);
+       dep->namelen = args->namelen;
+       memcpy(dep->name, args->name, dep->namelen);
+       dp->d_ops->data_put_ftype(dep, args->filetype);
+       tagp = dp->d_ops->data_entry_tag_p(dep);
+       *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+       xfs_dir2_data_log_entry(args, dbp, dep);
+       /*
+        * Rescan the block for bestfree if needed.
+        */
+       if (needscan)
+               xfs_dir2_data_freescan(dp, hdr, &needlog);
+       /*
+        * Log the data block header if needed.
+        */
+       if (needlog)
+               xfs_dir2_data_log_header(args, dbp);
+       /*
+        * If the freespace entry is now wrong, update it.
+        */
+       bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */
+       if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
+               bests[findex] = bf[0].length;
+               logfree = 1;
+       }
+       /*
+        * Log the freespace entry if needed.
+        */
+       if (logfree)
+               xfs_dir2_free_log_bests(args, fbp, findex, findex);
+       /*
+        * Return the data block and offset in args, then drop the data block.
+        */
+       args->blkno = (xfs_dablk_t)dbno;
+       args->index = be16_to_cpu(*tagp);
+       return 0;
+}
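+
+/*
+ * Editor's note -- not part of the patch: the freespace index mapping used
+ * above is, roughly, simple modular arithmetic.  Assuming free_max_bests()
+ * entries fit in one freespace block, a sketch of db_to_fdindex is:
+ *
+ *	int maxbests = dp->d_ops->free_max_bests(args->geo);
+ *	int findex = dbno % maxbests;	/* slot within its freespace block */
+ *	/* ... while dbno / maxbests selects which freespace block */
+ */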
+
+/*
+ * Lookup an entry in a node-format directory.
+ * All the real work happens in xfs_da3_node_lookup_int.
+ * The only real output is the inode number of the entry.
+ */
+int                                            /* error */
+xfs_dir2_node_lookup(
+       xfs_da_args_t   *args)                  /* operation arguments */
+{
+       int             error;                  /* error return value */
+       int             i;                      /* btree level */
+       int             rval;                   /* operation return value */
+       xfs_da_state_t  *state;                 /* btree cursor */
+
+       trace_xfs_dir2_node_lookup(args);
+
+       /*
+        * Allocate and initialize the btree cursor.
+        */
+       state = xfs_da_state_alloc();
+       state->args = args;
+       state->mp = args->dp->i_mount;
+       /*
+        * Fill in the path to the entry in the cursor.
+        */
+       error = xfs_da3_node_lookup_int(state, &rval);
+       if (error)
+               rval = error;
+       else if (rval == -ENOENT && args->cmpresult == XFS_CMP_CASE) {
+               /* If a CI match, dup the actual name and return -EEXIST */
+               xfs_dir2_data_entry_t   *dep;
+
+               dep = (xfs_dir2_data_entry_t *)
+                       ((char *)state->extrablk.bp->b_addr +
+                                                state->extrablk.index);
+               rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+       }
+       /*
+        * Release the btree blocks and leaf block.
+        */
+       for (i = 0; i < state->path.active; i++) {
+               xfs_trans_brelse(args->trans, state->path.blk[i].bp);
+               state->path.blk[i].bp = NULL;
+       }
+       /*
+        * Release the data block if we have it.
+        */
+       if (state->extravalid && state->extrablk.bp) {
+               xfs_trans_brelse(args->trans, state->extrablk.bp);
+               state->extrablk.bp = NULL;
+       }
+       xfs_da_state_free(state);
+       return rval;
+}
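+
+/*
+ * Editor's note -- not part of the patch: a minimal sketch of how a caller
+ * consumes the return convention above (mirroring xfs_dir_lookup()):
+ *
+ *	error = xfs_dir2_node_lookup(args);
+ *	if (error == -EEXIST)
+ *		error = 0;		/* found; args->inumber is valid */
+ *	else if (error == -ENOENT)
+ *		/* name not present */;
+ */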
+
+/*
+ * Remove an entry from a node-format directory.
+ */
+int                                            /* error */
+xfs_dir2_node_removename(
+       struct xfs_da_args      *args)          /* operation arguments */
+{
+       struct xfs_da_state_blk *blk;           /* leaf block */
+       int                     error;          /* error return value */
+       int                     rval;           /* operation return value */
+       struct xfs_da_state     *state;         /* btree cursor */
+
+       trace_xfs_dir2_node_removename(args);
+
+       /*
+        * Allocate and initialize the btree cursor.
+        */
+       state = xfs_da_state_alloc();
+       state->args = args;
+       state->mp = args->dp->i_mount;
+
+       /* Look up the entry we're deleting, set up the cursor. */
+       error = xfs_da3_node_lookup_int(state, &rval);
+       if (error)
+               goto out_free;
+
+       /* Didn't find it, upper layer screwed up. */
+       if (rval != -EEXIST) {
+               error = rval;
+               goto out_free;
+       }
+
+       blk = &state->path.blk[state->path.active - 1];
+       ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+       ASSERT(state->extravalid);
+       /*
+        * Remove the leaf and data entries.
+        * Extrablk refers to the data block.
+        */
+       error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
+               &state->extrablk, &rval);
+       if (error)
+               goto out_free;
+       /*
+        * Fix the hash values up the btree.
+        */
+       xfs_da3_fixhashpath(state, &state->path);
+       /*
+        * If we need to join leaf blocks, do it.
+        */
+       if (rval && state->path.active > 1)
+               error = xfs_da3_join(state);
+       /*
+        * If no errors so far, try conversion to leaf format.
+        */
+       if (!error)
+               error = xfs_dir2_node_to_leaf(state);
+out_free:
+       xfs_da_state_free(state);
+       return error;
+}
+
+/*
+ * Replace an entry's inode number in a node-format directory.
+ */
+int                                            /* error */
+xfs_dir2_node_replace(
+       xfs_da_args_t           *args)          /* operation arguments */
+{
+       xfs_da_state_blk_t      *blk;           /* leaf block */
+       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+       xfs_dir2_data_entry_t   *dep;           /* data entry changed */
+       int                     error;          /* error return value */
+       int                     i;              /* btree level */
+       xfs_ino_t               inum;           /* new inode number */
+       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry being changed */
+       int                     rval;           /* internal return value */
+       xfs_da_state_t          *state;         /* btree cursor */
+
+       trace_xfs_dir2_node_replace(args);
+
+       /*
+        * Allocate and initialize the btree cursor.
+        */
+       state = xfs_da_state_alloc();
+       state->args = args;
+       state->mp = args->dp->i_mount;
+       inum = args->inumber;
+       /*
+        * Lookup the entry to change in the btree.
+        */
+       error = xfs_da3_node_lookup_int(state, &rval);
+       if (error)
+               rval = error;
+       /*
+        * It should be found, since the vnodeops layer has looked it up
+        * and locked it.  But paranoia is good.
+        */
+       if (rval == -EEXIST) {
+               struct xfs_dir2_leaf_entry *ents;
+               /*
+                * Find the leaf entry.
+                */
+               blk = &state->path.blk[state->path.active - 1];
+               ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+               leaf = blk->bp->b_addr;
+               ents = args->dp->d_ops->leaf_ents_p(leaf);
+               lep = &ents[blk->index];
+               ASSERT(state->extravalid);
+               /*
+                * Point to the data entry.
+                */
+               hdr = state->extrablk.bp->b_addr;
+               ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+                      hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+               dep = (xfs_dir2_data_entry_t *)
+                     ((char *)hdr +
+                      xfs_dir2_dataptr_to_off(args->geo,
+                                              be32_to_cpu(lep->address)));
+               ASSERT(inum != be64_to_cpu(dep->inumber));
+               /*
+                * Fill in the new inode number and log the entry.
+                */
+               dep->inumber = cpu_to_be64(inum);
+               args->dp->d_ops->data_put_ftype(dep, args->filetype);
+               xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
+               rval = 0;
+       }
+       /*
+        * Didn't find it, and we're holding a data block.  Drop it.
+        */
+       else if (state->extravalid) {
+               xfs_trans_brelse(args->trans, state->extrablk.bp);
+               state->extrablk.bp = NULL;
+       }
+       /*
+        * Release all the buffers in the cursor.
+        */
+       for (i = 0; i < state->path.active; i++) {
+               xfs_trans_brelse(args->trans, state->path.blk[i].bp);
+               state->path.blk[i].bp = NULL;
+       }
+       xfs_da_state_free(state);
+       return rval;
+}
+
+/*
+ * Trim off a trailing empty freespace block.
+ * Return (in rvalp) 1 if we did it, 0 if not.
+ */
+int                                            /* error */
+xfs_dir2_node_trim_free(
+       xfs_da_args_t           *args,          /* operation arguments */
+       xfs_fileoff_t           fo,             /* free block number */
+       int                     *rvalp)         /* out: did something */
+{
+       struct xfs_buf          *bp;            /* freespace buffer */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     error;          /* error return code */
+       xfs_dir2_free_t         *free;          /* freespace structure */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       xfs_trans_t             *tp;            /* transaction pointer */
+       struct xfs_dir3_icfree_hdr freehdr;
+
+       dp = args->dp;
+       mp = dp->i_mount;
+       tp = args->trans;
+       /*
+        * Read the freespace block.
+        */
+       error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
+       if (error)
+               return error;
+       /*
+        * There can be holes in freespace.  If fo is a hole, there's
+        * nothing to do.
+        */
+       if (!bp)
+               return 0;
+       free = bp->b_addr;
+       dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+       /*
+        * If there are used entries, there's nothing to do.
+        */
+       if (freehdr.nused > 0) {
+               xfs_trans_brelse(tp, bp);
+               *rvalp = 0;
+               return 0;
+       }
+       /*
+        * Blow the block away.
+        */
+       error = xfs_dir2_shrink_inode(args,
+                       xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp);
+       if (error) {
+               /*
+                * Can't fail with ENOSPC since that only happens with no
+                * space reservation, when breaking up an extent into two
+                * pieces.  This is the last block of an extent.
+                */
+               ASSERT(error != -ENOSPC);
+               xfs_trans_brelse(tp, bp);
+               return error;
+       }
+       /*
+        * Return that we succeeded.
+        */
+       *rvalp = 1;
+       return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
new file mode 100644 (file)
index 0000000..27ce079
--- /dev/null
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DIR2_PRIV_H__
+#define __XFS_DIR2_PRIV_H__
+
+struct dir_context;
+
+/*
+ * Directory offset/block conversion functions.
+ *
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */
+
+/*
+ * Convert dataptr to byte in file space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
+{
+       return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
+}
+
+/*
+ * Convert byte in file space to dataptr.  It had better be aligned.
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
+{
+       return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
+}
+
+/*
+ * Convert byte in space to (DB) block
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+       return (xfs_dir2_db_t)(by >> geo->blklog);
+}
+
+/*
+ * Convert dataptr to a block number
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+       return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert byte in space to offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+       return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
+}
+
+/*
+ * Convert dataptr to a byte offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+       return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert block and offset to byte in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+                       xfs_dir2_data_aoff_t o)
+{
+       return ((xfs_dir2_off_t)db << geo->blklog) + o;
+}
+
+/*
+ * Convert block (DB) to block (dablk)
+ */
+static inline xfs_dablk_t
+xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+       return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert byte in space to (DA) block
+ */
+static inline xfs_dablk_t
+xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+       return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
+}
+
+/*
+ * Convert block and offset to dataptr
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+                          xfs_dir2_data_aoff_t o)
+{
+       return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
+}
+
+/*
+ * Convert block (dablk) to block (DB)
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+       return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert block (dablk) to byte offset in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+       return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
+}
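+
+/*
+ * Editor's note -- not part of the patch: a worked example of the
+ * conversions above, assuming 4096-byte directory and filesystem blocks
+ * (geo->blklog == geo->fsblog == 12) and XFS_DIR2_DATA_ALIGN_LOG == 3:
+ *
+ *	xfs_dir2_dataptr_t dptr = 0x208;
+ *	xfs_dir2_off_t by = (xfs_dir2_off_t)dptr << 3;		/* 0x1040 */
+ *	xfs_dir2_db_t db = xfs_dir2_byte_to_db(geo, by);	/* 1 */
+ *	xfs_dir2_data_aoff_t off = xfs_dir2_byte_to_off(geo, by); /* 0x40 */
+ *
+ * With blklog == fsblog, the DB <-> dablk conversions are the identity.
+ */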
+
+/*
+ * Directory tail pointer accessor functions. Based on block geometry.
+ */
+static inline struct xfs_dir2_block_tail *
+xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
+{
+       return ((struct xfs_dir2_block_tail *)
+               ((char *)hdr + geo->blksize)) - 1;
+}
+
+static inline struct xfs_dir2_leaf_tail *
+xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
+{
+       return (struct xfs_dir2_leaf_tail *)
+               ((char *)lp + geo->blksize -
+                 sizeof(struct xfs_dir2_leaf_tail));
+}
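+
+/*
+ * Editor's note -- not part of the patch: both accessors work backwards
+ * from the end of the buffer.  A block-form directory block is laid out as
+ *
+ *	[ data header | data entries ... | leaf entries | block tail ]
+ *	                                                  ^ last 8 bytes
+ *
+ * and a leaf block likewise keeps its (4-byte) xfs_dir2_leaf_tail at the
+ * very end, which is why both pointers are computed from base + blksize.
+ */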
+
+/* xfs_dir2.c */
+extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
+                               xfs_dir2_db_t *dbp);
+extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
+                               const unsigned char *name, int len);
+
+#define S_SHIFT 12
+extern const unsigned char xfs_mode_to_ftype[];
+
+extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp,
+                                       __uint8_t filetype);
+
+
+/* xfs_dir2_block.c */
+extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
+                              struct xfs_buf **bpp);
+extern int xfs_dir2_block_addname(struct xfs_da_args *args);
+extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_block_removename(struct xfs_da_args *args);
+extern int xfs_dir2_block_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
+               struct xfs_buf *lbp, struct xfs_buf *dbp);
+
+/* xfs_dir2_data.c */
+#ifdef DEBUG
+#define        xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp)
+#else
+#define        xfs_dir3_data_check(dp,bp)
+#endif
+
+extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+               xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
+extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
+               xfs_daddr_t mapped_bno);
+
+extern struct xfs_dir2_data_free *
+xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
+               struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup,
+               int *loghead);
+extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
+               struct xfs_buf **bpp);
+
+/* xfs_dir2_leaf.c */
+extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+               xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
+extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
+               struct xfs_buf *dbp);
+extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
+extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
+               struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp);
+extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
+               struct xfs_dir2_leaf_entry *ents, int *indexp,
+               int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
+extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
+               struct xfs_buf **bpp, __uint16_t magic);
+extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
+               struct xfs_buf *bp, int first, int last);
+extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
+               struct xfs_buf *bp);
+extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
+               struct xfs_buf *lbp);
+extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
+               struct xfs_buf *lbp, xfs_dir2_db_t db);
+extern struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
+               struct xfs_dir2_leaf_entry *ents, int index, int compact,
+               int lowstale, int highstale, int *lfloglow, int *lfloghigh);
+extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
+
+extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp,
+               struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
+
+/* xfs_dir2_node.c */
+extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
+               struct xfs_buf *lbp);
+extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp,
+               struct xfs_buf *bp, int *count);
+extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp,
+               struct xfs_da_args *args, int *indexp,
+               struct xfs_da_state *state);
+extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp,
+               struct xfs_buf *leaf2_bp);
+extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
+       struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk);
+extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
+extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
+               struct xfs_da_state_blk *drop_blk,
+               struct xfs_da_state_blk *save_blk);
+extern int xfs_dir2_node_addname(struct xfs_da_args *args);
+extern int xfs_dir2_node_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_node_removename(struct xfs_da_args *args);
+extern int xfs_dir2_node_replace(struct xfs_da_args *args);
+extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
+               int *rvalp);
+extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
+               xfs_dablk_t fbno, struct xfs_buf **bpp);
+
+/* xfs_dir2_sf.c */
+extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
+               struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
+extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
+               int size, xfs_dir2_sf_hdr_t *sfhp);
+extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
+extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
+extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
+
+/* xfs_dir2_readdir.c */
+extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
+                      size_t bufsize);
+
+#endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
new file mode 100644 (file)
index 0000000..5079e05
--- /dev/null
@@ -0,0 +1,1155 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_error.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trace.h"
+#include "xfs_dinode.h"
+
+/*
+ * Prototypes for internal functions.
+ */
+static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
+                                    xfs_dir2_sf_entry_t *sfep,
+                                    xfs_dir2_data_aoff_t offset,
+                                    int new_isize);
+static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
+                                    int new_isize);
+static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
+                                   xfs_dir2_sf_entry_t **sfepp,
+                                   xfs_dir2_data_aoff_t *offsetp);
+#ifdef DEBUG
+static void xfs_dir2_sf_check(xfs_da_args_t *args);
+#else
+#define        xfs_dir2_sf_check(args)
+#endif /* DEBUG */
+
+static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
+static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
+
+/*
+ * Given a block directory (dp/block), calculate its size as a shortform (sf)
+ * directory and a header for the sf directory, if it will fit in the
+ * space currently present in the inode.  If it won't fit, the output
+ * size is too big (but not accurate).
+ */
+int                                            /* size for sf form */
+xfs_dir2_block_sfsize(
+       xfs_inode_t             *dp,            /* incore inode pointer */
+       xfs_dir2_data_hdr_t     *hdr,           /* block directory data */
+       xfs_dir2_sf_hdr_t       *sfhp)          /* output: header for sf form */
+{
+       xfs_dir2_dataptr_t      addr;           /* data entry address */
+       xfs_dir2_leaf_entry_t   *blp;           /* leaf area of the block */
+       xfs_dir2_block_tail_t   *btp;           /* tail area of the block */
+       int                     count;          /* shortform entry count */
+       xfs_dir2_data_entry_t   *dep;           /* data entry in the block */
+       int                     i;              /* block entry index */
+       int                     i8count;        /* count of big-inode entries */
+       int                     isdot;          /* entry is "." */
+       int                     isdotdot;       /* entry is ".." */
+       xfs_mount_t             *mp;            /* mount structure pointer */
+       int                     namelen;        /* total name bytes */
+       xfs_ino_t               parent = 0;     /* parent inode number */
+       int                     size = 0;       /* total computed size */
+       int                     has_ftype;
+       struct xfs_da_geometry  *geo;
+
+       mp = dp->i_mount;
+       geo = mp->m_dir_geo;
+
+       /*
+        * if there is a filetype field, add the extra byte to the namelen
+        * for each entry that we see.
+        */
+       has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
+
+       count = i8count = namelen = 0;
+       btp = xfs_dir2_block_tail_p(geo, hdr);
+       blp = xfs_dir2_block_leaf_p(btp);
+
+       /*
+        * Iterate over the block's data entries by using the leaf pointers.
+        */
+       for (i = 0; i < be32_to_cpu(btp->count); i++) {
+               if ((addr = be32_to_cpu(blp[i].address)) == XFS_DIR2_NULL_DATAPTR)
+                       continue;
+               /*
+                * Calculate the pointer to the entry at hand.
+                */
+               dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+                               xfs_dir2_dataptr_to_off(geo, addr));
+               /*
+                * Detect . and .., so we can special-case them.
+                * . is not included in sf directories.
+                * .. is included by just the parent inode number.
+                */
+               isdot = dep->namelen == 1 && dep->name[0] == '.';
+               isdotdot =
+                       dep->namelen == 2 &&
+                       dep->name[0] == '.' && dep->name[1] == '.';
+
+               if (!isdot)
+                       i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
+
+               /* take into account the file type field */
+               if (!isdot && !isdotdot) {
+                       count++;
+                       namelen += dep->namelen + has_ftype;
+               } else if (isdotdot)
+                       parent = be64_to_cpu(dep->inumber);
+               /*
+                * Calculate the new size, see if we should give up yet.
+                */
+               size = xfs_dir2_sf_hdr_size(i8count) +          /* header */
+                      count +                                  /* namelen */
+                      count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
+                      namelen +                                /* name */
+                      (i8count ?                               /* inumber */
+                               (uint)sizeof(xfs_dir2_ino8_t) * count :
+                               (uint)sizeof(xfs_dir2_ino4_t) * count);
+               if (size > XFS_IFORK_DSIZE(dp))
+                       return size;            /* size value is a failure */
+       }
+       /*
+        * Create the output header, if it worked.
+        */
+       sfhp->count = count;
+       sfhp->i8count = i8count;
+       dp->d_ops->sf_put_parent_ino(sfhp, parent);
+       return size;
+}
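+
+/*
+ * Editor's note -- not part of the patch: a worked instance of the size
+ * formula above, with illustrative numbers (no filetype field, all inode
+ * numbers fitting in 4 bytes).  A block directory holding "foo" and "bar2"
+ * besides "." and ".." gives count = 2, i8count = 0, namelen = 3 + 4 = 7:
+ *
+ *	size = xfs_dir2_sf_hdr_size(0)	/* 6: count, i8count, 4-byte parent */
+ *	     + 2			/* namelen bytes, one per entry */
+ *	     + 2 * 2			/* 2-byte offsets */
+ *	     + 7			/* the name bytes themselves */
+ *	     + 2 * 4			/* 4-byte inode numbers */
+ *	     = 27 bytes
+ *
+ * which must fit in XFS_IFORK_DSIZE(dp) for the conversion to go ahead.
+ */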
+
+/*
+ * Convert a block format directory to shortform.
+ * Caller has already checked that it will fit, and built us a header.
+ */
+int                                            /* error */
+xfs_dir2_block_to_sf(
+       xfs_da_args_t           *args,          /* operation arguments */
+       struct xfs_buf          *bp,
+       int                     size,           /* shortform directory size */
+       xfs_dir2_sf_hdr_t       *sfhp)          /* shortform directory hdr */
+{
+       xfs_dir2_data_hdr_t     *hdr;           /* block header */
+       xfs_dir2_block_tail_t   *btp;           /* block tail pointer */
+       xfs_dir2_data_entry_t   *dep;           /* data entry pointer */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       xfs_dir2_data_unused_t  *dup;           /* unused data pointer */
+       char                    *endptr;        /* end of data entries */
+       int                     error;          /* error return value */
+       int                     logflags;       /* inode logging flags */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       char                    *ptr;           /* current data pointer */
+       xfs_dir2_sf_entry_t     *sfep;          /* shortform entry */
+       xfs_dir2_sf_hdr_t       *sfp;           /* shortform directory header */
+       xfs_dir2_sf_hdr_t       *dst;           /* temporary data buffer */
+
+       trace_xfs_dir2_block_to_sf(args);
+
+       dp = args->dp;
+       mp = dp->i_mount;
+
+       /*
+        * allocate a temporary destination buffer the size of the inode
+        * to format the data into. Once we have formatted the data, we
+        * can free the block and copy the formatted data into the inode literal
+        * area.
+        */
+       dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
+       hdr = bp->b_addr;
+
+       /*
+        * Copy the header into the newly allocated local space.
+        */
+       sfp = (xfs_dir2_sf_hdr_t *)dst;
+       memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
+
+       /*
+        * Set up to loop over the block's entries.
+        */
+       btp = xfs_dir2_block_tail_p(args->geo, hdr);
+       ptr = (char *)dp->d_ops->data_entry_p(hdr);
+       endptr = (char *)xfs_dir2_block_leaf_p(btp);
+       sfep = xfs_dir2_sf_firstentry(sfp);
+       /*
+        * Loop over the active and unused entries.
+        * Stop when we reach the leaf/tail portion of the block.
+        */
+       while (ptr < endptr) {
+               /*
+                * If it's unused, just skip over it.
+                */
+               dup = (xfs_dir2_data_unused_t *)ptr;
+               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+                       ptr += be16_to_cpu(dup->length);
+                       continue;
+               }
+               dep = (xfs_dir2_data_entry_t *)ptr;
+               /*
+                * Skip .
+                */
+               if (dep->namelen == 1 && dep->name[0] == '.')
+                       ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino);
+               /*
+                * Skip .., but make sure the inode number is right.
+                */
+               else if (dep->namelen == 2 &&
+                        dep->name[0] == '.' && dep->name[1] == '.')
+                       ASSERT(be64_to_cpu(dep->inumber) ==
+                              dp->d_ops->sf_get_parent_ino(sfp));
+               /*
+                * Normal entry, copy it into shortform.
+                */
+               else {
+                       sfep->namelen = dep->namelen;
+                       xfs_dir2_sf_put_offset(sfep,
+                               (xfs_dir2_data_aoff_t)
+                               ((char *)dep - (char *)hdr));
+                       memcpy(sfep->name, dep->name, dep->namelen);
+                       dp->d_ops->sf_put_ino(sfp, sfep,
+                                             be64_to_cpu(dep->inumber));
+                       dp->d_ops->sf_put_ftype(sfep,
+                                       dp->d_ops->data_get_ftype(dep));
+
+                       sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+               }
+               ptr += dp->d_ops->data_entsize(dep->namelen);
+       }
+       ASSERT((char *)sfep - (char *)sfp == size);
+
+       /* Now that we are done with the block, we can shrink the inode. */
+       logflags = XFS_ILOG_CORE;
+       error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
+       if (error) {
+               ASSERT(error != -ENOSPC);
+               goto out;
+       }
+
+       /*
+        * The buffer is now unconditionally gone, whether
+        * xfs_dir2_shrink_inode worked or not.
+        *
+        * Convert the inode to local format and copy the data in.
+        */
+       dp->i_df.if_flags &= ~XFS_IFEXTENTS;
+       dp->i_df.if_flags |= XFS_IFINLINE;
+       dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+       ASSERT(dp->i_df.if_bytes == 0);
+       xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+
+       logflags |= XFS_ILOG_DDATA;
+       memcpy(dp->i_df.if_u1.if_data, dst, size);
+       dp->i_d.di_size = size;
+       xfs_dir2_sf_check(args);
+out:
+       xfs_trans_log_inode(args->trans, dp, logflags);
+       kmem_free(dst);
+       return error;
+}
+
+/*
+ * Add a name to a shortform directory.
+ * There are two algorithms, "easy" and "hard", which we decide on
+ * before changing anything.
+ * Convert to block form if the new entry won't fit.
+ */
+int                                            /* error */
+xfs_dir2_sf_addname(
+       xfs_da_args_t           *args)          /* operation arguments */
+{
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     error;          /* error return value */
+       int                     incr_isize;     /* total change in size */
+       int                     new_isize;      /* di_size after adding name */
+       int                     objchange;      /* changing to 8-byte inodes */
+       xfs_dir2_data_aoff_t    offset = 0;     /* offset for new entry */
+       int                     pick;           /* which algorithm to use */
+       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+       xfs_dir2_sf_entry_t     *sfep = NULL;   /* shortform entry */
+
+       trace_xfs_dir2_sf_addname(args);
+
+       ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
+       dp = args->dp;
+       ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+       /*
+        * Make sure the shortform value has some of its header.
+        */
+       if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+               ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+               return -EIO;
+       }
+       ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+       ASSERT(dp->i_df.if_u1.if_data != NULL);
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+       /*
+        * Compute entry (and change in) size.
+        */
+       incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
+       objchange = 0;
+
+       /*
+        * Do we have to change to 8 byte inodes?
+        */
+       if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
+               /*
+                * Yes, adjust the inode size: old count + (parent + new),
+                * each growing by the 8-byte/4-byte inode number difference.
+                */
+               incr_isize +=
+                       (sfp->count + 2) *
+                       ((uint)sizeof(xfs_dir2_ino8_t) -
+                        (uint)sizeof(xfs_dir2_ino4_t));
+               objchange = 1;
+       }
+
+       new_isize = (int)dp->i_d.di_size + incr_isize;
+       /*
+        * Won't fit as shortform any more (due to size),
+        * or the pick routine says it won't (due to offset values).
+        */
+       if (new_isize > XFS_IFORK_DSIZE(dp) ||
+           (pick =
+            xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
+               /*
+                * Just checking or no space reservation, it doesn't fit.
+                */
+               if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+                       return -ENOSPC;
+               /*
+                * Convert to block form then add the name.
+                */
+               error = xfs_dir2_sf_to_block(args);
+               if (error)
+                       return error;
+               return xfs_dir2_block_addname(args);
+       }
+       /*
+        * Just checking, it fits.
+        */
+       if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+               return 0;
+       /*
+        * Do it the easy way - just add it at the end.
+        */
+       if (pick == 1)
+               xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize);
+       /*
+        * Do it the hard way - look for a place to insert the new entry.
+        * Convert to 8 byte inode numbers first if necessary.
+        */
+       else {
+               ASSERT(pick == 2);
+               if (objchange)
+                       xfs_dir2_sf_toino8(args);
+               xfs_dir2_sf_addname_hard(args, objchange, new_isize);
+       }
+       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+       return 0;
+}
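+
+/*
+ * Editor's note -- not part of the patch: the pick routine drives a
+ * three-way decision in the function above:
+ *
+ *	pick == 0: offsets won't fit in one directory block; convert with
+ *		   xfs_dir2_sf_to_block() and retry via
+ *		   xfs_dir2_block_addname()
+ *	pick == 1: append at the end (xfs_dir2_sf_addname_easy)
+ *	pick == 2: insert into a hole, converting to 8-byte inode numbers
+ *		   first if needed (xfs_dir2_sf_addname_hard)
+ */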
+
+/*
+ * Add the new entry the "easy" way.
+ * This is copying the old directory and adding the new entry at the end.
+ * Since it's sorted by "offset" we need room after the last offset
+ * that's already there, and then room to convert to a block directory.
+ * This is already checked by the pick routine.
+ */
+static void
+xfs_dir2_sf_addname_easy(
+       xfs_da_args_t           *args,          /* operation arguments */
+       xfs_dir2_sf_entry_t     *sfep,          /* pointer to new entry */
+       xfs_dir2_data_aoff_t    offset,         /* offset to use for new ent */
+       int                     new_isize)      /* new directory size */
+{
+       int                     byteoff;        /* byte offset in sf dir */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+
+       dp = args->dp;
+
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       byteoff = (int)((char *)sfep - (char *)sfp);
+       /*
+        * Grow the in-inode space.
+        */
+       xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen),
+                         XFS_DATA_FORK);
+       /*
+        * Need to set up again due to realloc of the inode data.
+        */
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
+       /*
+        * Fill in the new entry.
+        */
+       sfep->namelen = args->namelen;
+       xfs_dir2_sf_put_offset(sfep, offset);
+       memcpy(sfep->name, args->name, sfep->namelen);
+       dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+       dp->d_ops->sf_put_ftype(sfep, args->filetype);
+
+       /*
+        * Update the header and inode.
+        */
+       sfp->count++;
+       if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
+               sfp->i8count++;
+       dp->i_d.di_size = new_isize;
+       xfs_dir2_sf_check(args);
+}
+
+/*
+ * Add the new entry the "hard" way.
+ * The caller has already converted to 8 byte inode numbers if necessary,
+ * in which case we need to leave the i8count at 1.
+ * Find a hole that the new entry will fit into, and copy
+ * the first part of the entries, the new entry, and the last part of
+ * the entries.
+ */
+/* ARGSUSED */
+static void
+xfs_dir2_sf_addname_hard(
+       xfs_da_args_t           *args,          /* operation arguments */
+       int                     objchange,      /* changing inode number size */
+       int                     new_isize)      /* new directory size */
+{
+       int                     add_datasize;   /* data size need for new ent */
+       char                    *buf;           /* buffer for old */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     eof;            /* reached end of old dir */
+       int                     nbytes;         /* temp for byte copies */
+       xfs_dir2_data_aoff_t    new_offset;     /* next offset value */
+       xfs_dir2_data_aoff_t    offset;         /* current offset value */
+       int                     old_isize;      /* previous di_size */
+       xfs_dir2_sf_entry_t     *oldsfep;       /* entry in original dir */
+       xfs_dir2_sf_hdr_t       *oldsfp;        /* original shortform dir */
+       xfs_dir2_sf_entry_t     *sfep;          /* entry in new dir */
+       xfs_dir2_sf_hdr_t       *sfp;           /* new shortform dir */
+       struct xfs_mount        *mp;
+
+       /*
+        * Copy the old directory to the stack buffer.
+        */
+       dp = args->dp;
+       mp = dp->i_mount;
+
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       old_isize = (int)dp->i_d.di_size;
+       buf = kmem_alloc(old_isize, KM_SLEEP);
+       oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+       memcpy(oldsfp, sfp, old_isize);
+       /*
+        * Loop over the old directory finding the place we're going
+        * to insert the new entry.
+        * If it's going to end up at the end then oldsfep will point there.
+        */
+       for (offset = dp->d_ops->data_first_offset,
+             oldsfep = xfs_dir2_sf_firstentry(oldsfp),
+             add_datasize = dp->d_ops->data_entsize(args->namelen),
+             eof = (char *)oldsfep == &buf[old_isize];
+            !eof;
+            offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen),
+             oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep),
+             eof = (char *)oldsfep == &buf[old_isize]) {
+               new_offset = xfs_dir2_sf_get_offset(oldsfep);
+               if (offset + add_datasize <= new_offset)
+                       break;
+       }
+       /*
+        * Get rid of the old directory, then allocate space for
+        * the new one.  We do this so xfs_idata_realloc won't copy
+        * the data.
+        */
+       xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
+       xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
+       /*
+        * Reset the pointer since the buffer was reallocated.
+        */
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       /*
+        * Copy the first part of the directory, including the header.
+        */
+       nbytes = (int)((char *)oldsfep - (char *)oldsfp);
+       memcpy(sfp, oldsfp, nbytes);
+       sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes);
+       /*
+        * Fill in the new entry, and update the header counts.
+        */
+       sfep->namelen = args->namelen;
+       xfs_dir2_sf_put_offset(sfep, offset);
+       memcpy(sfep->name, args->name, sfep->namelen);
+       dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+       dp->d_ops->sf_put_ftype(sfep, args->filetype);
+       sfp->count++;
+       if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
+               sfp->i8count++;
+       /*
+        * If there's more left to copy, do that.
+        */
+       if (!eof) {
+               sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+               memcpy(sfep, oldsfep, old_isize - nbytes);
+       }
+       kmem_free(buf);
+       dp->i_d.di_size = new_isize;
+       xfs_dir2_sf_check(args);
+}
+
+/*
+ * Decide if the new entry will fit at all.
+ * If it will fit, pick between adding the new entry to the end (easy)
+ * or somewhere else (hard).
+ * Return 0 (won't fit), 1 (easy), 2 (hard).
+ */
+/* ARGSUSED */
+static int                                     /* pick result */
+xfs_dir2_sf_addname_pick(
+       xfs_da_args_t           *args,          /* operation arguments */
+       int                     objchange,      /* inode # size changes */
+       xfs_dir2_sf_entry_t     **sfepp,        /* out(1): new entry ptr */
+       xfs_dir2_data_aoff_t    *offsetp)       /* out(1): new offset */
+{
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     holefit;        /* found hole it will fit in */
+       int                     i;              /* entry number */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       xfs_dir2_data_aoff_t    offset;         /* data block offset */
+       xfs_dir2_sf_entry_t     *sfep;          /* shortform entry */
+       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+       int                     size;           /* entry's data size */
+       int                     used;           /* data bytes used */
+
+       dp = args->dp;
+       mp = dp->i_mount;
+
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       size = dp->d_ops->data_entsize(args->namelen);
+       offset = dp->d_ops->data_first_offset;
+       sfep = xfs_dir2_sf_firstentry(sfp);
+       holefit = 0;
+       /*
+        * Loop over sf entries.
+        * Keep track of data offset and whether we've seen a place
+        * to insert the new entry.
+        */
+       for (i = 0; i < sfp->count; i++) {
+               if (!holefit)
+                       holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
+               offset = xfs_dir2_sf_get_offset(sfep) +
+                        dp->d_ops->data_entsize(sfep->namelen);
+               sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+       }
+       /*
+        * Calculate the data bytes that would be used, excluding the new
+        * entry, if this directory were converted to block form.
+        */
+       used = offset +
+              (sfp->count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+              (uint)sizeof(xfs_dir2_block_tail_t);
+       /*
+        * If it won't fit even in block form then we can't insert it here;
+        * returning 0 makes the caller convert to block format and retry
+        * the insert there (which may in turn convert to leaf form).
+        */
+       if (used + (holefit ? 0 : size) > args->geo->blksize)
+               return 0;
+       /*
+        * If changing the inode number size, do it the hard way.
+        */
+       if (objchange)
+               return 2;
+       /*
+        * If it won't fit at the end then do it the hard way (use the hole).
+        */
+       if (used + size > args->geo->blksize)
+               return 2;
+       /*
+        * Do it the easy way.
+        */
+       *sfepp = sfep;
+       *offsetp = offset;
+       return 1;
+}
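+
+/*
+ * Editor's note -- not part of the patch: unpacking the "used" computation
+ * above.  sizeof(xfs_dir2_leaf_entry_t) is 8 (hash + address) and
+ * sizeof(xfs_dir2_block_tail_t) is 8 (count + stale); the "+ 3" counts the
+ * leaf entries that "." and ".." and the new name would need in block form.
+ * E.g. with sfp->count == 2 and a final offset of 0x70:
+ *
+ *	used = 0x70 + (2 + 3) * 8 + 8 = 0xa0 bytes
+ *
+ * which is then compared against args->geo->blksize.
+ */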
+
+#ifdef DEBUG
+/*
+ * Check consistency of shortform directory, assert if bad.
+ */
+static void
+xfs_dir2_sf_check(
+       xfs_da_args_t           *args)          /* operation arguments */
+{
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     i;              /* entry number */
+       int                     i8count;        /* number of big inode#s */
+       xfs_ino_t               ino;            /* entry inode number */
+       int                     offset;         /* data offset */
+       xfs_dir2_sf_entry_t     *sfep;          /* shortform dir entry */
+       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+       struct xfs_mount        *mp;
+
+       dp = args->dp;
+       mp = dp->i_mount;
+
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       offset = dp->d_ops->data_first_offset;
+       ino = dp->d_ops->sf_get_parent_ino(sfp);
+       i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
+
+       for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
+            i < sfp->count;
+            i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+               ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
+               ino = dp->d_ops->sf_get_ino(sfp, sfep);
+               i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
+               offset =
+                       xfs_dir2_sf_get_offset(sfep) +
+                       dp->d_ops->data_entsize(sfep->namelen);
+               ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
+       }
+       ASSERT(i8count == sfp->i8count);
+       ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
+       ASSERT(offset +
+              (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+              (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize);
+}
+#endif /* DEBUG */
+
+/*
+ * Create a new (shortform) directory.
+ */
+int                                    /* error, always 0 */
+xfs_dir2_sf_create(
+       xfs_da_args_t   *args,          /* operation arguments */
+       xfs_ino_t       pino)           /* parent inode number */
+{
+       xfs_inode_t     *dp;            /* incore directory inode */
+       int             i8count;        /* parent inode is an 8-byte number */
+       xfs_dir2_sf_hdr_t *sfp;         /* shortform structure */
+       int             size;           /* directory size */
+
+       trace_xfs_dir2_sf_create(args);
+
+       dp = args->dp;
+
+       ASSERT(dp != NULL);
+       ASSERT(dp->i_d.di_size == 0);
+       /*
+        * If it's currently a zero-length extent file,
+        * convert it to local format.
+        */
+       if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
+               dp->i_df.if_flags &= ~XFS_IFEXTENTS;    /* just in case */
+               dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+               xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+               dp->i_df.if_flags |= XFS_IFINLINE;
+       }
+       ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+       ASSERT(dp->i_df.if_bytes == 0);
+       i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
+       size = xfs_dir2_sf_hdr_size(i8count);
+       /*
+        * Make a buffer for the data.
+        */
+       xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+       /*
+        * Fill in the header.
+        */
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       sfp->i8count = i8count;
+       /*
+        * Now we can put in the parent inode number, since i8count is set.
+        */
+       dp->d_ops->sf_put_parent_ino(sfp, pino);
+       sfp->count = 0;
+       dp->i_d.di_size = size;
+       xfs_dir2_sf_check(args);
+       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+       return 0;
+}
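+
+/*
+ * Editor's note -- not part of the patch: a freshly created shortform
+ * directory is just the header.  With a parent inode number that fits in
+ * 4 bytes, i8count == 0 and
+ *
+ *	di_size = xfs_dir2_sf_hdr_size(0) = 6 bytes
+ *
+ * (1-byte count, 1-byte i8count, 4-byte parent); "." and ".." are implied
+ * rather than stored.
+ */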
+
+/*
+ * Lookup an entry in a shortform directory.
+ * Returns -EEXIST if found, -ENOENT if not found.
+ */
+int                                            /* error */
+xfs_dir2_sf_lookup(
+       xfs_da_args_t           *args)          /* operation arguments */
+{
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     i;              /* entry index */
+       int                     error;
+       xfs_dir2_sf_entry_t     *sfep;          /* shortform directory entry */
+       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+       enum xfs_dacmp          cmp;            /* comparison result */
+       xfs_dir2_sf_entry_t     *ci_sfep;       /* case-insens. entry */
+
+       trace_xfs_dir2_sf_lookup(args);
+
+       xfs_dir2_sf_check(args);
+       dp = args->dp;
+
+       ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+       /*
+        * Bail out if the directory is way too short.
+        */
+       if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+               ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+               return -EIO;
+       }
+       ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+       ASSERT(dp->i_df.if_u1.if_data != NULL);
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+       /*
+        * Special case for .
+        */
+       if (args->namelen == 1 && args->name[0] == '.') {
+               args->inumber = dp->i_ino;
+               args->cmpresult = XFS_CMP_EXACT;
+               args->filetype = XFS_DIR3_FT_DIR;
+               return -EEXIST;
+       }
+       /*
+        * Special case for ..
+        */
+       if (args->namelen == 2 &&
+           args->name[0] == '.' && args->name[1] == '.') {
+               args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
+               args->cmpresult = XFS_CMP_EXACT;
+               args->filetype = XFS_DIR3_FT_DIR;
+               return -EEXIST;
+       }
+       /*
+        * Loop over all the entries trying to match ours.
+        */
+       ci_sfep = NULL;
+       for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+            i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+               /*
+                * Compare name and if it's an exact match, return the inode
+                * number. If it's the first case-insensitive match, store the
+                * inode number and continue looking for an exact match.
+                */
+               cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name,
+                                                               sfep->namelen);
+               if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+                       args->cmpresult = cmp;
+                       args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
+                       args->filetype = dp->d_ops->sf_get_ftype(sfep);
+                       if (cmp == XFS_CMP_EXACT)
+                               return -EEXIST;
+                       ci_sfep = sfep;
+               }
+       }
+       ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+       /*
+        * Here, we can only be doing a lookup (not a rename or replace).
+        * If a case-insensitive match was not found, return -ENOENT.
+        */
+       if (!ci_sfep)
+               return -ENOENT;
+       /* otherwise process the CI match as required by the caller */
+       error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
+       return error;
+}
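+
+/*
+ * Editor's note -- not part of the patch: the comparison result drives the
+ * scan above as
+ *
+ *	XFS_CMP_EXACT	  - return -EEXIST immediately
+ *	XFS_CMP_CASE	  - remember the entry in ci_sfep, keep scanning in
+ *			    case an exact match follows
+ *	XFS_CMP_DIFFERENT - ignore
+ *
+ * so a case-insensitive-only hit is reported through
+ * xfs_dir_cilookup_result() after the whole directory has been scanned.
+ */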
+
+/*
+ * Remove an entry from a shortform directory.
+ */
+int                                            /* error */
+xfs_dir2_sf_removename(
+       xfs_da_args_t           *args)
+{
+       int                     byteoff;        /* offset of removed entry */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     entsize;        /* this entry's size */
+       int                     i;              /* shortform entry index */
+       int                     newsize;        /* new inode size */
+       int                     oldsize;        /* old inode size */
+       xfs_dir2_sf_entry_t     *sfep;          /* shortform directory entry */
+       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+
+       trace_xfs_dir2_sf_removename(args);
+
+       dp = args->dp;
+
+       ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+       oldsize = (int)dp->i_d.di_size;
+       /*
+        * Bail out if the directory is way too short.
+        */
+       if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+               ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+               return -EIO;
+       }
+       ASSERT(dp->i_df.if_bytes == oldsize);
+       ASSERT(dp->i_df.if_u1.if_data != NULL);
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count));
+       /*
+        * Loop over the old directory entries.
+        * Find the one we're deleting.
+        */
+       for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+            i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+               if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+                                                               XFS_CMP_EXACT) {
+                       ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) ==
+                              args->inumber);
+                       break;
+               }
+       }
+       /*
+        * Didn't find it.
+        */
+       if (i == sfp->count)
+               return -ENOENT;
+       /*
+        * Calculate sizes.
+        */
+       byteoff = (int)((char *)sfep - (char *)sfp);
+       entsize = dp->d_ops->sf_entsize(sfp, args->namelen);
+       newsize = oldsize - entsize;
+       /*
+        * Copy the part, if any, after the removed entry, sliding it down.
+        */
+       if (byteoff + entsize < oldsize)
+               memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize,
+                       oldsize - (byteoff + entsize));
+       /*
+        * Fix up the header and file size.
+        */
+       sfp->count--;
+       dp->i_d.di_size = newsize;
+       /*
+        * Reallocate, making it smaller.
+        */
+       xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       /*
+        * Are we changing inode number size?
+        */
+       if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+               if (sfp->i8count == 1)
+                       xfs_dir2_sf_toino4(args);
+               else
+                       sfp->i8count--;
+       }
+       xfs_dir2_sf_check(args);
+       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+       return 0;
+}
+
+/*
+ * Replace the inode number of an entry in a shortform directory.
+ */
+int                                            /* error */
+xfs_dir2_sf_replace(
+       xfs_da_args_t           *args)          /* operation arguments */
+{
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     i;              /* entry index */
+       xfs_ino_t               ino=0;          /* entry old inode number */
+       int                     i8elevated;     /* sf_toino8 set i8count=1 */
+       xfs_dir2_sf_entry_t     *sfep;          /* shortform directory entry */
+       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+
+       trace_xfs_dir2_sf_replace(args);
+
+       dp = args->dp;
+
+       ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+       /*
+        * Bail out if the shortform directory is way too small.
+        */
+       if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+               ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+               return -EIO;
+       }
+       ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+       ASSERT(dp->i_df.if_u1.if_data != NULL);
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+
+       /*
+        * The new inode number is large, so we need to convert to 8-byte
+        * inodes.
+        */
+       if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
+               int     error;                  /* error return value */
+               int     newsize;                /* new inode size */
+
+               newsize =
+                       dp->i_df.if_bytes +
+                       (sfp->count + 1) *
+                       ((uint)sizeof(xfs_dir2_ino8_t) -
+                        (uint)sizeof(xfs_dir2_ino4_t));
+               /*
+                * Won't fit as shortform, convert to block then do replace.
+                */
+               if (newsize > XFS_IFORK_DSIZE(dp)) {
+                       error = xfs_dir2_sf_to_block(args);
+                       if (error) {
+                               return error;
+                       }
+                       return xfs_dir2_block_replace(args);
+               }
+               /*
+                * Still fits, convert to 8-byte now.
+                */
+               xfs_dir2_sf_toino8(args);
+               i8elevated = 1;
+               sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       } else
+               i8elevated = 0;
+
+       ASSERT(args->namelen != 1 || args->name[0] != '.');
+       /*
+        * Replace ..'s entry.
+        */
+       if (args->namelen == 2 &&
+           args->name[0] == '.' && args->name[1] == '.') {
+               ino = dp->d_ops->sf_get_parent_ino(sfp);
+               ASSERT(args->inumber != ino);
+               dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
+       }
+       /*
+        * Normal entry, look for the name.
+        */
+       else {
+               for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+                    i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+                       if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+                                                               XFS_CMP_EXACT) {
+                               ino = dp->d_ops->sf_get_ino(sfp, sfep);
+                               ASSERT(args->inumber != ino);
+                               dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+                               dp->d_ops->sf_put_ftype(sfep, args->filetype);
+                               break;
+                       }
+               }
+               /*
+                * Didn't find it.
+                */
+               if (i == sfp->count) {
+                       ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+                       if (i8elevated)
+                               xfs_dir2_sf_toino4(args);
+                       return -ENOENT;
+               }
+       }
+       /*
+        * See if the old number was large, the new number is small.
+        */
+       if (ino > XFS_DIR2_MAX_SHORT_INUM &&
+           args->inumber <= XFS_DIR2_MAX_SHORT_INUM) {
+               /*
+                * And the old count was one, so we need to convert back to
+                * 4-byte inodes.
+                */
+               if (sfp->i8count == 1)
+                       xfs_dir2_sf_toino4(args);
+               else
+                       sfp->i8count--;
+       }
+       /*
+        * See if the old number was small, the new number is large.
+        */
+       if (ino <= XFS_DIR2_MAX_SHORT_INUM &&
+           args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+               /*
+                * Add to the i8count unless we just converted to 8-byte
+                * inodes (which implicitly sets i8count = 1).
+                */
+               ASSERT(sfp->i8count != 0);
+               if (!i8elevated)
+                       sfp->i8count++;
+       }
+       xfs_dir2_sf_check(args);
+       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
+       return 0;
+}
+
+/*
+ * Convert from 8-byte inode numbers to 4-byte inode numbers.
+ * The last 8-byte inode number is gone, but the count is still 1.
+ */
+static void
+xfs_dir2_sf_toino4(
+       xfs_da_args_t           *args)          /* operation arguments */
+{
+       char                    *buf;           /* old dir's buffer */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     i;              /* entry index */
+       int                     newsize;        /* new inode size */
+       xfs_dir2_sf_entry_t     *oldsfep;       /* old sf entry */
+       xfs_dir2_sf_hdr_t       *oldsfp;        /* old sf directory */
+       int                     oldsize;        /* old inode size */
+       xfs_dir2_sf_entry_t     *sfep;          /* new sf entry */
+       xfs_dir2_sf_hdr_t       *sfp;           /* new sf directory */
+       struct xfs_mount        *mp;
+
+       trace_xfs_dir2_sf_toino4(args);
+
+       dp = args->dp;
+       mp = dp->i_mount;
+
+       /*
+        * Copy the old directory to the buffer.
+        * Then nuke it from the inode, and add the new buffer to the inode.
+        * Don't want xfs_idata_realloc copying the data here.
+        */
+       oldsize = dp->i_df.if_bytes;
+       buf = kmem_alloc(oldsize, KM_SLEEP);
+       oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       ASSERT(oldsfp->i8count == 1);
+       memcpy(buf, oldsfp, oldsize);
+       /*
+        * Compute the new inode size.
+        */
+       newsize =
+               oldsize -
+               (oldsfp->count + 1) *
+               ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
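+       /*
+        * (Illustrative note, not part of this patch: each entry, plus the
+        * parent pointer, shrinks by 8 - 4 = 4 bytes here, so a directory
+        * with two entries shrinks by (2 + 1) * 4 = 12 bytes.)
+        */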
+       xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+       xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+       /*
+        * Reset our pointers, the data has moved.
+        */
+       oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       /*
+        * Fill in the new header.
+        */
+       sfp->count = oldsfp->count;
+       sfp->i8count = 0;
+       dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
+       /*
+        * Copy the entries field by field.
+        */
+       for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+                   oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+            i < sfp->count;
+            i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+                 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
+               sfep->namelen = oldsfep->namelen;
+               sfep->offset = oldsfep->offset;
+               memcpy(sfep->name, oldsfep->name, sfep->namelen);
+               dp->d_ops->sf_put_ino(sfp, sfep,
+                                     dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+               dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
+       }
+       /*
+        * Clean up the inode.
+        */
+       kmem_free(buf);
+       dp->i_d.di_size = newsize;
+       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
+
+/*
+ * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers.
+ * The new entry w/ an 8-byte inode number is not there yet; we leave with
+ * i8count set to 1, but no corresponding 8-byte entry.
+ */
+static void
+xfs_dir2_sf_toino8(
+       xfs_da_args_t           *args)          /* operation arguments */
+{
+       char                    *buf;           /* old dir's buffer */
+       xfs_inode_t             *dp;            /* incore directory inode */
+       int                     i;              /* entry index */
+       int                     newsize;        /* new inode size */
+       xfs_dir2_sf_entry_t     *oldsfep;       /* old sf entry */
+       xfs_dir2_sf_hdr_t       *oldsfp;        /* old sf directory */
+       int                     oldsize;        /* old inode size */
+       xfs_dir2_sf_entry_t     *sfep;          /* new sf entry */
+       xfs_dir2_sf_hdr_t       *sfp;           /* new sf directory */
+       struct xfs_mount        *mp;
+
+       trace_xfs_dir2_sf_toino8(args);
+
+       dp = args->dp;
+       mp = dp->i_mount;
+
+       /*
+        * Copy the old directory to the buffer.
+        * Then nuke it from the inode, and add the new buffer to the inode.
+        * Don't want xfs_idata_realloc copying the data here.
+        */
+       oldsize = dp->i_df.if_bytes;
+       buf = kmem_alloc(oldsize, KM_SLEEP);
+       oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       ASSERT(oldsfp->i8count == 0);
+       memcpy(buf, oldsfp, oldsize);
+       /*
+        * Compute the new inode size (nb: entry count + 1 for parent)
+        */
+       newsize =
+               oldsize +
+               (oldsfp->count + 1) *
+               ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+       xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+       xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+       /*
+        * Reset our pointers, the data has moved.
+        */
+       oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+       /*
+        * Fill in the new header.
+        */
+       sfp->count = oldsfp->count;
+       sfp->i8count = 1;
+       dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
+       /*
+        * Copy the entries field by field.
+        */
+       for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+                   oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+            i < sfp->count;
+            i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+                 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
+               sfep->namelen = oldsfep->namelen;
+               sfep->offset = oldsfep->offset;
+               memcpy(sfep->name, oldsfep->name, sfep->namelen);
+               dp->d_ops->sf_put_ino(sfp, sfep,
+                                     dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+               dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
+       }
+       /*
+        * Clean up the inode.
+        */
+       kmem_free(buf);
+       dp->i_d.di_size = newsize;
+       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
new file mode 100644
index 0000000..bb96933
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+
+int
+xfs_calc_dquots_per_chunk(
+       unsigned int            nbblks) /* basic block units */
+{
+       unsigned int    ndquots;
+
+       ASSERT(nbblks > 0);
+       ndquots = BBTOB(nbblks);
+       do_div(ndquots, sizeof(xfs_dqblk_t));
+
+       return ndquots;
+}
+
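+/*
+ * Worked example (illustrative, not part of this patch): for one 4096-byte
+ * chunk, nbblks = 8 basic blocks, so BBTOB(8) = 4096; with the 136-byte
+ * xfs_dqblk_t laid out in xfs_format.h in this patch, that gives
+ * 4096 / 136 = 30 dquots per chunk.
+ */
+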
+/*
+ * Do some primitive error checking on ondisk dquot data structures.
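+ * Returns the number of problems found (0 if clean); if XFS_QMOPT_DQREPAIR
+ * is set, a damaged dquot is also reinitialised in place.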
+ */
+int
+xfs_dqcheck(
+       struct xfs_mount *mp,
+       xfs_disk_dquot_t *ddq,
+       xfs_dqid_t       id,
+       uint             type,    /* used only when IO_dorepair is true */
+       uint             flags,
+       char             *str)
+{
+       xfs_dqblk_t      *d = (xfs_dqblk_t *)ddq;
+       int             errs = 0;
+
+       /*
+        * We can encounter an uninitialized dquot buffer for 2 reasons:
+        * 1. If we crash while deleting the quotainode(s), and those blks got
+        *    used for user data. This is because we take the path of regular
+        *    file deletion; however, the size field of quotainodes is never
+        *    updated, so all the tricks that we play in itruncate_finish
+        *    don't quite matter.
+        *
+        * 2. We don't replay the quota buffers when there's a quotaoff logitem.
+        *    But the allocation will be replayed so we'll end up with an
+        *    uninitialized quota block.
+        *
+        * This is all fine; things are still consistent, and we haven't lost
+        * any quota information. Just don't complain about bad dquot blks.
+        */
+       if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
+               if (flags & XFS_QMOPT_DOWARN)
+                       xfs_alert(mp,
+                       "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
+                       str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
+               errs++;
+       }
+       if (ddq->d_version != XFS_DQUOT_VERSION) {
+               if (flags & XFS_QMOPT_DOWARN)
+                       xfs_alert(mp,
+                       "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
+                       str, id, ddq->d_version, XFS_DQUOT_VERSION);
+               errs++;
+       }
+
+       if (ddq->d_flags != XFS_DQ_USER &&
+           ddq->d_flags != XFS_DQ_PROJ &&
+           ddq->d_flags != XFS_DQ_GROUP) {
+               if (flags & XFS_QMOPT_DOWARN)
+                       xfs_alert(mp,
+                       "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
+                       str, id, ddq->d_flags);
+               errs++;
+       }
+
+       if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
+               if (flags & XFS_QMOPT_DOWARN)
+                       xfs_alert(mp,
+                       "%s : ondisk-dquot 0x%p, ID mismatch: "
+                       "0x%x expected, found id 0x%x",
+                       str, ddq, id, be32_to_cpu(ddq->d_id));
+               errs++;
+       }
+
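+       /*
+        * For each quota type below (blocks, inodes, realtime blocks): if
+        * usage exceeds the soft limit, the corresponding timer must have
+        * been started, so a zero timer here counts as an error.
+        */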
+       if (!errs && ddq->d_id) {
+               if (ddq->d_blk_softlimit &&
+                   be64_to_cpu(ddq->d_bcount) >
+                               be64_to_cpu(ddq->d_blk_softlimit)) {
+                       if (!ddq->d_btimer) {
+                               if (flags & XFS_QMOPT_DOWARN)
+                                       xfs_alert(mp,
+                       "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
+                                       str, (int)be32_to_cpu(ddq->d_id), ddq);
+                               errs++;
+                       }
+               }
+               if (ddq->d_ino_softlimit &&
+                   be64_to_cpu(ddq->d_icount) >
+                               be64_to_cpu(ddq->d_ino_softlimit)) {
+                       if (!ddq->d_itimer) {
+                               if (flags & XFS_QMOPT_DOWARN)
+                                       xfs_alert(mp,
+                       "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
+                                       str, (int)be32_to_cpu(ddq->d_id), ddq);
+                               errs++;
+                       }
+               }
+               if (ddq->d_rtb_softlimit &&
+                   be64_to_cpu(ddq->d_rtbcount) >
+                               be64_to_cpu(ddq->d_rtb_softlimit)) {
+                       if (!ddq->d_rtbtimer) {
+                               if (flags & XFS_QMOPT_DOWARN)
+                                       xfs_alert(mp,
+                       "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
+                                       str, (int)be32_to_cpu(ddq->d_id), ddq);
+                               errs++;
+                       }
+               }
+       }
+
+       if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
+               return errs;
+
+       if (flags & XFS_QMOPT_DOWARN)
+               xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
+
+       /*
+        * Typically, a repair is only requested by quotacheck.
+        */
+       ASSERT(id != -1);
+       ASSERT(flags & XFS_QMOPT_DQREPAIR);
+       memset(d, 0, sizeof(xfs_dqblk_t));
+
+       d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+       d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+       d->dd_diskdq.d_flags = type;
+       d->dd_diskdq.d_id = cpu_to_be32(id);
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+               xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+                                XFS_DQUOT_CRC_OFF);
+       }
+
+       return errs;
+}
+
+STATIC bool
+xfs_dquot_buf_verify_crc(
+       struct xfs_mount        *mp,
+       struct xfs_buf          *bp)
+{
+       struct xfs_dqblk        *d = (struct xfs_dqblk *)bp->b_addr;
+       int                     ndquots;
+       int                     i;
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return true;
+
+       /*
+        * if we are in log recovery, the quota subsystem has not been
+        * initialised so we have no quotainfo structure. In that case, we need
+        * to manually calculate the number of dquots in the buffer.
+        */
+       if (mp->m_quotainfo)
+               ndquots = mp->m_quotainfo->qi_dqperchunk;
+       else
+               ndquots = xfs_calc_dquots_per_chunk(
+                                       XFS_BB_TO_FSB(mp, bp->b_length));
+
+       for (i = 0; i < ndquots; i++, d++) {
+               if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
+                                XFS_DQUOT_CRC_OFF))
+                       return false;
+               if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
+                       return false;
+       }
+       return true;
+}
+
+STATIC bool
+xfs_dquot_buf_verify(
+       struct xfs_mount        *mp,
+       struct xfs_buf          *bp)
+{
+       struct xfs_dqblk        *d = (struct xfs_dqblk *)bp->b_addr;
+       xfs_dqid_t              id = 0;
+       int                     ndquots;
+       int                     i;
+
+       /*
+        * if we are in log recovery, the quota subsystem has not been
+        * initialised so we have no quotainfo structure. In that case, we need
+        * to manually calculate the number of dquots in the buffer.
+        */
+       if (mp->m_quotainfo)
+               ndquots = mp->m_quotainfo->qi_dqperchunk;
+       else
+               ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
+
+       /*
+        * On the first read of the buffer, verify that each dquot is valid.
+        * We don't know what the id of the dquot is supposed to be, just that
+        * they should be increasing monotonically within the buffer. If the
+        * first id is corrupt, then it will fail on the second dquot in the
+        * buffer, so corruption reports could point to the wrong dquot in
+        * this case.
+        */
+       for (i = 0; i < ndquots; i++) {
+               struct xfs_disk_dquot   *ddq;
+               int                     error;
+
+               ddq = &d[i].dd_diskdq;
+
+               if (i == 0)
+                       id = be32_to_cpu(ddq->d_id);
+
+               error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
+                                      "xfs_dquot_buf_verify");
+               if (error)
+                       return false;
+       }
+       return true;
+}
+
+static void
+xfs_dquot_buf_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+
+       if (!xfs_dquot_buf_verify_crc(mp, bp))
+               xfs_buf_ioerror(bp, -EFSBADCRC);
+       else if (!xfs_dquot_buf_verify(mp, bp))
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+       if (bp->b_error)
+               xfs_verifier_error(bp);
+}
+
+/*
+ * we don't calculate the CRC here as that is done when the dquot is flushed to
+ * the buffer after the update is done. This ensures that the dquot in the
+ * buffer always has an up-to-date CRC value.
+ */
+static void
+xfs_dquot_buf_write_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+
+       if (!xfs_dquot_buf_verify(mp, bp)) {
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               return;
+       }
+}
+
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+       .verify_read = xfs_dquot_buf_read_verify,
+       .verify_write = xfs_dquot_buf_write_verify,
+};
+
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
new file mode 100644
index 0000000..7e42bba
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_FORMAT_H__
+#define __XFS_FORMAT_H__
+
+/*
+ * XFS On Disk Format Definitions
+ *
+ * This header file defines all the on-disk format definitions for
+ * general XFS objects. Directory and attribute related objects are defined in
+ * xfs_da_format.h, while log and log item formats are defined in
+ * xfs_log_format.h. Everything else goes here.
+ */
+
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_buf;
+struct xfs_ifork;
+
+/*
+ * RealTime Device format definitions
+ */
+
+/* Min and max rt extent sizes, specified in bytes */
+#define        XFS_MAX_RTEXTSIZE       (1024 * 1024 * 1024)    /* 1GB */
+#define        XFS_DFL_RTEXTSIZE       (64 * 1024)             /* 64kB */
+#define        XFS_MIN_RTEXTSIZE       (4 * 1024)              /* 4kB */
+
+#define        XFS_BLOCKSIZE(mp)       ((mp)->m_sb.sb_blocksize)
+#define        XFS_BLOCKMASK(mp)       ((mp)->m_blockmask)
+#define        XFS_BLOCKWSIZE(mp)      ((mp)->m_blockwsize)
+#define        XFS_BLOCKWMASK(mp)      ((mp)->m_blockwmask)
+
+/*
+ * RT Summary and bit manipulation macros.
+ */
+#define        XFS_SUMOFFS(mp,ls,bb)   ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
+#define        XFS_SUMOFFSTOBLOCK(mp,s)        \
+       (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
+#define        XFS_SUMPTR(mp,bp,so)    \
+       ((xfs_suminfo_t *)((bp)->b_addr + \
+               (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
+
+#define        XFS_BITTOBLOCK(mp,bi)   ((bi) >> (mp)->m_blkbit_log)
+#define        XFS_BLOCKTOBIT(mp,bb)   ((bb) << (mp)->m_blkbit_log)
+#define        XFS_BITTOWORD(mp,bi)    \
+       ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
+
+#define        XFS_RTMIN(a,b)  ((a) < (b) ? (a) : (b))
+#define        XFS_RTMAX(a,b)  ((a) > (b) ? (a) : (b))
+
+#define        XFS_RTLOBIT(w)  xfs_lowbit32(w)
+#define        XFS_RTHIBIT(w)  xfs_highbit32(w)
+
+#define        XFS_RTBLOCKLOG(b)       xfs_highbit64(b)
+
+/*
+ * Dquot and dquot block format definitions
+ */
+#define XFS_DQUOT_MAGIC                0x4451          /* 'DQ' */
+#define XFS_DQUOT_VERSION      (u_int8_t)0x01  /* latest version number */
+
+/*
+ * This is the main portion of the on-disk representation of quota
+ * information for a user. This is the q_core of the xfs_dquot_t that
+ * is kept in kernel memory. We pad this with some more expansion room
+ * to construct the on disk structure.
+ */
+typedef struct xfs_disk_dquot {
+       __be16          d_magic;        /* dquot magic = XFS_DQUOT_MAGIC */
+       __u8            d_version;      /* dquot version */
+       __u8            d_flags;        /* XFS_DQ_USER/PROJ/GROUP */
+       __be32          d_id;           /* user,project,group id */
+       __be64          d_blk_hardlimit;/* absolute limit on disk blks */
+       __be64          d_blk_softlimit;/* preferred limit on disk blks */
+       __be64          d_ino_hardlimit;/* maximum # allocated inodes */
+       __be64          d_ino_softlimit;/* preferred inode limit */
+       __be64          d_bcount;       /* disk blocks owned by the user */
+       __be64          d_icount;       /* inodes owned by the user */
+       __be32          d_itimer;       /* zero if within inode limits; if
+                                          not, this is when we refuse service */
+       __be32          d_btimer;       /* similar to above; for disk blocks */
+       __be16          d_iwarns;       /* warnings issued wrt num inodes */
+       __be16          d_bwarns;       /* warnings issued wrt disk blocks */
+       __be32          d_pad0;         /* 64 bit align */
+       __be64          d_rtb_hardlimit;/* absolute limit on realtime blks */
+       __be64          d_rtb_softlimit;/* preferred limit on RT disk blks */
+       __be64          d_rtbcount;     /* realtime blocks owned */
+       __be32          d_rtbtimer;     /* similar to above; for RT disk blocks */
+       __be16          d_rtbwarns;     /* warnings issued wrt RT disk blocks */
+       __be16          d_pad;
+} xfs_disk_dquot_t;
+
+/*
+ * This is what goes on disk. This is separated from the xfs_disk_dquot because
+ * carrying the unnecessary padding would be a waste of memory.
+ */
+typedef struct xfs_dqblk {
+       xfs_disk_dquot_t  dd_diskdq;    /* portion that lives incore as well */
+       char              dd_fill[4];   /* filling for posterity */
+
+       /*
+        * These two are only present on filesystems with the CRC bits set.
+        */
+       __be32            dd_crc;       /* checksum */
+       __be64            dd_lsn;       /* last modification in log */
+       uuid_t            dd_uuid;      /* location information */
+} xfs_dqblk_t;
+
+#define XFS_DQUOT_CRC_OFF      offsetof(struct xfs_dqblk, dd_crc)
+
+/*
+ * Remote symlink format and access functions.
+ */
+#define XFS_SYMLINK_MAGIC      0x58534c4d      /* XSLM */
+
+struct xfs_dsymlink_hdr {
+       __be32  sl_magic;
+       __be32  sl_offset;
+       __be32  sl_bytes;
+       __be32  sl_crc;
+       uuid_t  sl_uuid;
+       __be64  sl_owner;
+       __be64  sl_blkno;
+       __be64  sl_lsn;
+};
+
+#define XFS_SYMLINK_CRC_OFF    offsetof(struct xfs_dsymlink_hdr, sl_crc)
+
+/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 3 extents back from
+ * bmapi when crc headers are taken into account.
+ */
+#define XFS_SYMLINK_MAPS 3
+
+#define XFS_SYMLINK_BUF_SPACE(mp, bufsize)     \
+       ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+                       sizeof(struct xfs_dsymlink_hdr) : 0))
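+
+/*
+ * (Illustrative arithmetic, not part of this patch: struct xfs_dsymlink_hdr
+ * above is 4 + 4 + 4 + 4 + 16 + 8 + 8 + 8 = 56 bytes, so a minimal 512-byte
+ * block holds 512 - 56 = 456 bytes of path data on a CRC filesystem, and a
+ * 1024-byte path thus needs at most 3 extents.)
+ */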
+
+
+/*
+ * Allocation Btree format definitions
+ *
+ * There are two on-disk btrees, one sorted by blockno and one sorted
+ * by blockcount and blockno.  All blocks look the same to make the code
+ * simpler; if we have time later, we'll make the optimizations.
+ */
+#define        XFS_ABTB_MAGIC          0x41425442      /* 'ABTB' for bno tree */
+#define        XFS_ABTB_CRC_MAGIC      0x41423342      /* 'AB3B' */
+#define        XFS_ABTC_MAGIC          0x41425443      /* 'ABTC' for cnt tree */
+#define        XFS_ABTC_CRC_MAGIC      0x41423343      /* 'AB3C' */
+
+/*
+ * Data record/key structure
+ */
+typedef struct xfs_alloc_rec {
+       __be32          ar_startblock;  /* starting block number */
+       __be32          ar_blockcount;  /* count of free blocks */
+} xfs_alloc_rec_t, xfs_alloc_key_t;
+
+typedef struct xfs_alloc_rec_incore {
+       xfs_agblock_t   ar_startblock;  /* starting block number */
+       xfs_extlen_t    ar_blockcount;  /* count of free blocks */
+} xfs_alloc_rec_incore_t;
+
+/* btree pointer type */
+typedef __be32 xfs_alloc_ptr_t;
+
+/*
+ * Block numbers in the AG:
+ * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
+ */
+#define        XFS_BNO_BLOCK(mp)       ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
+#define        XFS_CNT_BLOCK(mp)       ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
+
+
+/*
+ * Inode Allocation Btree format definitions
+ *
+ * There is a btree for the inode map per allocation group.
+ */
+#define        XFS_IBT_MAGIC           0x49414254      /* 'IABT' */
+#define        XFS_IBT_CRC_MAGIC       0x49414233      /* 'IAB3' */
+#define        XFS_FIBT_MAGIC          0x46494254      /* 'FIBT' */
+#define        XFS_FIBT_CRC_MAGIC      0x46494233      /* 'FIB3' */
+
+typedef        __uint64_t      xfs_inofree_t;
+#define        XFS_INODES_PER_CHUNK            (NBBY * sizeof(xfs_inofree_t))
+#define        XFS_INODES_PER_CHUNK_LOG        (XFS_NBBYLOG + 3)
+#define        XFS_INOBT_ALL_FREE              ((xfs_inofree_t)-1)
+#define        XFS_INOBT_MASK(i)               ((xfs_inofree_t)1 << (i))
+
+static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
+{
+       return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
+}
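+
+/*
+ * Example (illustrative, not part of this patch): xfs_inobt_maskn(2, 3)
+ * is ((1 << 3) - 1) << 2 == 0x1c, a mask covering inodes 2..4 of a
+ * 64-inode chunk; for n >= XFS_INODES_PER_CHUNK the result is all ones
+ * from bit i upwards.
+ */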
+
+/*
+ * Data record structure
+ */
+typedef struct xfs_inobt_rec {
+       __be32          ir_startino;    /* starting inode number */
+       __be32          ir_freecount;   /* count of free inodes (set bits) */
+       __be64          ir_free;        /* free inode mask */
+} xfs_inobt_rec_t;
+
+typedef struct xfs_inobt_rec_incore {
+       xfs_agino_t     ir_startino;    /* starting inode number */
+       __int32_t       ir_freecount;   /* count of free inodes (set bits) */
+       xfs_inofree_t   ir_free;        /* free inode mask */
+} xfs_inobt_rec_incore_t;
+
+
+/*
+ * Key structure
+ */
+typedef struct xfs_inobt_key {
+       __be32          ir_startino;    /* starting inode number */
+} xfs_inobt_key_t;
+
+/* btree pointer type */
+typedef __be32 xfs_inobt_ptr_t;
+
+/*
+ * block numbers in the AG.
+ */
+#define        XFS_IBT_BLOCK(mp)               ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
+#define        XFS_FIBT_BLOCK(mp)              ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
+
+/*
+ * The first data block of an AG depends on whether the filesystem was formatted
+ * with the finobt feature. If so, account for the finobt reserved root btree
+ * block.
+ */
+#define XFS_PREALLOC_BLOCKS(mp) \
+       (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \
+        XFS_FIBT_BLOCK(mp) + 1 : \
+        XFS_IBT_BLOCK(mp) + 1)
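+
+/*
+ * (Illustrative note, not part of this patch: with the fixed AG layout
+ * above, SB 0, AGF 1, AGI 2, AGFL 3, BNO 4, CNT 5, IBT 6 and FIBT 7, the
+ * first block available for data is thus 7 without finobt and 8 with it.)
+ */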
+
+
+
+/*
+ * BMAP Btree format definitions
+ *
+ * This includes both the root block definition that sits inside an inode fork
+ * and the record/pointer formats for the leaf/node in the blocks.
+ */
+#define XFS_BMAP_MAGIC         0x424d4150      /* 'BMAP' */
+#define XFS_BMAP_CRC_MAGIC     0x424d4133      /* 'BMA3' */
+
+/*
+ * Bmap root header, on-disk form only.
+ */
+typedef struct xfs_bmdr_block {
+       __be16          bb_level;       /* 0 is a leaf */
+       __be16          bb_numrecs;     /* current # of data records */
+} xfs_bmdr_block_t;
+
+/*
+ * Bmap btree record and extent descriptor.
+ *  l0:63 is an extent flag (value 1 indicates non-normal).
+ *  l0:9-62 are startoff.
+ *  l0:0-8 and l1:21-63 are startblock.
+ *  l1:0-20 are blockcount.
+ */
+#define BMBT_EXNTFLAG_BITLEN   1
+#define BMBT_STARTOFF_BITLEN   54
+#define BMBT_STARTBLOCK_BITLEN 52
+#define BMBT_BLOCKCOUNT_BITLEN 21
+
+typedef struct xfs_bmbt_rec {
+       __be64                  l0, l1;
+} xfs_bmbt_rec_t;
+
+typedef __uint64_t     xfs_bmbt_rec_base_t;    /* use this for casts */
+typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
+
+typedef struct xfs_bmbt_rec_host {
+       __uint64_t              l0, l1;
+} xfs_bmbt_rec_host_t;
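+
+/*
+ * Illustrative sketch, not part of this patch: unpacking an incore
+ * xfs_bmbt_rec_host_t according to the bit layout documented above.  The
+ * real accessors live elsewhere in XFS; this only spells out the shifts
+ * and masks implied by the BMBT_*_BITLEN values.
+ */
+static inline int example_bmbt_exntflag(struct xfs_bmbt_rec_host *r)
+{
+       return r->l0 >> 63;                     /* l0:63 */
+}
+
+static inline __uint64_t example_bmbt_startoff(struct xfs_bmbt_rec_host *r)
+{
+       /* l0:9-62, i.e. 54 bits */
+       return (r->l0 >> 9) & ((1ULL << BMBT_STARTOFF_BITLEN) - 1);
+}
+
+static inline __uint64_t example_bmbt_startblock(struct xfs_bmbt_rec_host *r)
+{
+       /* high 9 bits from l0:0-8, low 43 bits from l1:21-63 */
+       return ((r->l0 & 0x1ffULL) << 43) | (r->l1 >> 21);
+}
+
+static inline __uint64_t example_bmbt_blockcount(struct xfs_bmbt_rec_host *r)
+{
+       /* l1:0-20, i.e. 21 bits */
+       return r->l1 & ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1);
+}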
+
+/*
+ * Values and macros for delayed-allocation startblock fields.
+ */
+#define STARTBLOCKVALBITS      17
+#define STARTBLOCKMASKBITS     (15 + 20)
+#define STARTBLOCKMASK         \
+       (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+
+static inline int isnullstartblock(xfs_fsblock_t x)
+{
+       return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
+}
+
+static inline xfs_fsblock_t nullstartblock(int k)
+{
+       ASSERT(k < (1 << STARTBLOCKVALBITS));
+       return STARTBLOCKMASK | (k);
+}
+
+static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
+{
+       return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
+}
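+
+/*
+ * Example (illustrative, not part of this patch): nullstartblock(5) yields
+ * STARTBLOCKMASK | 5; isnullstartblock() then returns true for it, and
+ * startblockval() recovers the 5, the worst-case indirect block count
+ * reserved for the delayed allocation.
+ */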
+
+/*
+ * Possible extent formats.
+ */
+typedef enum {
+       XFS_EXTFMT_NOSTATE = 0,
+       XFS_EXTFMT_HASSTATE
+} xfs_exntfmt_t;
+
+/*
+ * Possible extent states.
+ */
+typedef enum {
+       XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
+       XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
+} xfs_exntst_t;
+
+/*
+ * Incore version of above.
+ */
+typedef struct xfs_bmbt_irec
+{
+       xfs_fileoff_t   br_startoff;    /* starting file offset */
+       xfs_fsblock_t   br_startblock;  /* starting block number */
+       xfs_filblks_t   br_blockcount;  /* number of blocks */
+       xfs_exntst_t    br_state;       /* extent state */
+} xfs_bmbt_irec_t;
+
+/*
+ * Key structure for non-leaf levels of the tree.
+ */
+typedef struct xfs_bmbt_key {
+       __be64          br_startoff;    /* starting file offset */
+} xfs_bmbt_key_t, xfs_bmdr_key_t;
+
+/* btree pointer type */
+typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
+
+
+/*
+ * Generic Btree block format definitions
+ *
+ * This is a combination of the actual format used on disk for short and long
+ * format btrees.  The first three fields are shared by both formats, but the
+ * pointers are different and should be used with care.
+ *
+ * To get the size of the actual short or long form headers please use the size
+ * macros below.  Never use sizeof(xfs_btree_block).
+ *
+ * The blkno, crc, lsn, owner and uuid fields are only available in filesystems
+ * with the crc feature bit, and all accesses to them must be conditional on
+ * that flag.
+ */
+struct xfs_btree_block {
+       __be32          bb_magic;       /* magic number for block type */
+       __be16          bb_level;       /* 0 is a leaf */
+       __be16          bb_numrecs;     /* current # of data records */
+       union {
+               struct {
+                       __be32          bb_leftsib;
+                       __be32          bb_rightsib;
+
+                       __be64          bb_blkno;
+                       __be64          bb_lsn;
+                       uuid_t          bb_uuid;
+                       __be32          bb_owner;
+                       __le32          bb_crc;
+               } s;                    /* short form pointers */
+               struct  {
+                       __be64          bb_leftsib;
+                       __be64          bb_rightsib;
+
+                       __be64          bb_blkno;
+                       __be64          bb_lsn;
+                       uuid_t          bb_uuid;
+                       __be64          bb_owner;
+                       __le32          bb_crc;
+                       __be32          bb_pad; /* padding for alignment */
+               } l;                    /* long form pointers */
+       } bb_u;                         /* rest */
+};
+
+#define XFS_BTREE_SBLOCK_LEN   16      /* size of a short form block */
+#define XFS_BTREE_LBLOCK_LEN   24      /* size of a long form block */
+
+/* sizes of CRC enabled btree blocks */
+#define XFS_BTREE_SBLOCK_CRC_LEN       (XFS_BTREE_SBLOCK_LEN + 40)
+#define XFS_BTREE_LBLOCK_CRC_LEN       (XFS_BTREE_LBLOCK_LEN + 48)
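+
+/*
+ * (Illustrative arithmetic, not part of this patch: the CRC variants add
+ * bb_blkno + bb_lsn + bb_uuid + bb_owner + bb_crc = 8 + 8 + 16 + 4 + 4 = 40
+ * bytes to the short form, and with the 8-byte bb_owner plus bb_pad,
+ * 8 + 8 + 16 + 8 + 4 + 4 = 48 bytes to the long form.)
+ */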
+
+#define XFS_BTREE_SBLOCK_CRC_OFF \
+       offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
+#define XFS_BTREE_LBLOCK_CRC_OFF \
+       offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
+
+#endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
new file mode 100644
index 0000000..b62771f
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -0,0 +1,2189 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_icreate_item.h"
+#include "xfs_icache.h"
+#include "xfs_dinode.h"
+#include "xfs_trace.h"
+
+
+/*
+ * Allocation group level functions.
+ */
+static inline int
+xfs_ialloc_cluster_alignment(
+       xfs_alloc_arg_t *args)
+{
+       if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
+           args->mp->m_sb.sb_inoalignmt >=
+            XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))
+               return args->mp->m_sb.sb_inoalignmt;
+       return 1;
+}
+
+/*
+ * Lookup a record by ino in the btree given by cur.
+ */
+int                                    /* error */
+xfs_inobt_lookup(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agino_t             ino,    /* starting inode of chunk */
+       xfs_lookup_t            dir,    /* <=, >=, == */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.i.ir_startino = ino;
+       cur->bc_rec.i.ir_freecount = 0;
+       cur->bc_rec.i.ir_free = 0;
+       return xfs_btree_lookup(cur, dir, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given.
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int                             /* error */
+xfs_inobt_update(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_inobt_rec_incore_t  *irec)  /* btree record */
+{
+       union xfs_btree_rec     rec;
+
+       rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
+       rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+       rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
+       return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int                                    /* error */
+xfs_inobt_get_rec(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_inobt_rec_incore_t  *irec,  /* btree record */
+       int                     *stat)  /* output: success/failure */
+{
+       union xfs_btree_rec     *rec;
+       int                     error;
+
+       error = xfs_btree_get_rec(cur, &rec, stat);
+       if (!error && *stat == 1) {
+               irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+               irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
+               irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+       }
+       return error;
+}
+
+/*
+ * Insert a single inobt record. Cursor must already point to desired location.
+ */
+STATIC int
+xfs_inobt_insert_rec(
+       struct xfs_btree_cur    *cur,
+       __int32_t               freecount,
+       xfs_inofree_t           free,
+       int                     *stat)
+{
+       cur->bc_rec.i.ir_freecount = freecount;
+       cur->bc_rec.i.ir_free = free;
+       return xfs_btree_insert(cur, stat);
+}
+
+/*
+ * Insert records describing a newly allocated inode chunk into the inobt.
+ */
+STATIC int
+xfs_inobt_insert(
+       struct xfs_mount        *mp,
+       struct xfs_trans        *tp,
+       struct xfs_buf          *agbp,
+       xfs_agino_t             newino,
+       xfs_agino_t             newlen,
+       xfs_btnum_t             btnum)
+{
+       struct xfs_btree_cur    *cur;
+       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
+       xfs_agnumber_t          agno = be32_to_cpu(agi->agi_seqno);
+       xfs_agino_t             thisino;
+       int                     i;
+       int                     error;
+
+       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+       for (thisino = newino;
+            thisino < newino + newlen;
+            thisino += XFS_INODES_PER_CHUNK) {
+               error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
+               if (error) {
+                       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+                       return error;
+               }
+               ASSERT(i == 0);
+
+               error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+                                            XFS_INOBT_ALL_FREE, &i);
+               if (error) {
+                       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+                       return error;
+               }
+               ASSERT(i == 1);
+       }
+
+       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+       return 0;
+}
+
+/*
+ * Verify that the number of free inodes in the AGI is correct.
+ */
+#ifdef DEBUG
+STATIC int
+xfs_check_agi_freecount(
+       struct xfs_btree_cur    *cur,
+       struct xfs_agi          *agi)
+{
+       if (cur->bc_nlevels == 1) {
+               xfs_inobt_rec_incore_t rec;
+               int             freecount = 0;
+               int             error;
+               int             i;
+
+               error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+               if (error)
+                       return error;
+
+               do {
+                       error = xfs_inobt_get_rec(cur, &rec, &i);
+                       if (error)
+                               return error;
+
+                       if (i) {
+                               freecount += rec.ir_freecount;
+                               error = xfs_btree_increment(cur, 0, &i);
+                               if (error)
+                                       return error;
+                       }
+               } while (i == 1);
+
+               if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
+                       ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
+       }
+       return 0;
+}
+#else
+#define xfs_check_agi_freecount(cur, agi)      0
+#endif
+
+/*
+ * Initialise a new set of inodes. When called without a transaction context
+ * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
+ * than logging them (which in a transaction context puts them into the AIL
+ * for writeback rather than the xfsbufd queue).
+ */
+int
+xfs_ialloc_inode_init(
+       struct xfs_mount        *mp,
+       struct xfs_trans        *tp,
+       struct list_head        *buffer_list,
+       xfs_agnumber_t          agno,
+       xfs_agblock_t           agbno,
+       xfs_agblock_t           length,
+       unsigned int            gen)
+{
+       struct xfs_buf          *fbuf;
+       struct xfs_dinode       *free;
+       int                     nbufs, blks_per_cluster, inodes_per_cluster;
+       int                     version;
+       int                     i, j;
+       xfs_daddr_t             d;
+       xfs_ino_t               ino = 0;
+
+       /*
+        * Loop over the new block(s), filling in the inodes.  For small block
+        * sizes, manipulate the inodes in buffers which are multiples of the
+        * block size.
+        */
+       blks_per_cluster = xfs_icluster_size_fsb(mp);
+       inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+       nbufs = length / blks_per_cluster;
+
+       /*
+        * Figure out what version number to use in the inodes we create.  If
+        * the superblock version has caught up to the one that supports the new
+        * inode format, then use the new inode version.  Otherwise use the old
+        * version so that old kernels will continue to be able to use the file
+        * system.
+        *
+        * For v3 inodes, we also need to write the inode number into the inode,
+        * so calculate the first inode number of the chunk here as
+        * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
+        * across multiple filesystem blocks (such as a cluster) and so cannot
+        * be used in the cluster buffer loop below.
+        *
+        * Further, because we are writing the inode directly into the buffer
+        * and calculating a CRC on the entire inode, we have to log the entire
+        * inode so that the entire range the CRC covers is present in the log.
+        * That means for v3 inodes we log the entire buffer rather than just
+        * the inode cores.
+        */
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               version = 3;
+               ino = XFS_AGINO_TO_INO(mp, agno,
+                                      XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
+
+               /*
+                * Log the initialisation that is about to take place as a
+                * logical operation. This means the transaction does not
+                * need to log the physical changes to the inode buffers as log
+                * recovery will know what initialisation is actually needed.
+                * Hence we only need to log the buffers as "ordered" buffers so
+                * they track in the AIL as if they were physically logged.
+                */
+               if (tp)
+                       xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
+                                       mp->m_sb.sb_inodesize, length, gen);
+       } else
+               version = 2;
+
+       for (j = 0; j < nbufs; j++) {
+               /*
+                * Get the block.
+                */
+               d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+               fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+                                        mp->m_bsize * blks_per_cluster,
+                                        XBF_UNMAPPED);
+               if (!fbuf)
+                       return -ENOMEM;
+
+               /* Initialize the inode buffers and log them appropriately. */
+               fbuf->b_ops = &xfs_inode_buf_ops;
+               xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
+               for (i = 0; i < inodes_per_cluster; i++) {
+                       int     ioffset = i << mp->m_sb.sb_inodelog;
+                       uint    isize = xfs_dinode_size(version);
+
+                       free = xfs_make_iptr(mp, fbuf, i);
+                       free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+                       free->di_version = version;
+                       free->di_gen = cpu_to_be32(gen);
+                       free->di_next_unlinked = cpu_to_be32(NULLAGINO);
+
+                       if (version == 3) {
+                               free->di_ino = cpu_to_be64(ino);
+                               ino++;
+                               uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
+                               xfs_dinode_calc_crc(mp, free);
+                       } else if (tp) {
+                               /* just log the inode core */
+                               xfs_trans_log_buf(tp, fbuf, ioffset,
+                                                 ioffset + isize - 1);
+                       }
+               }
+
+               if (tp) {
+                       /*
+                        * Mark the buffer as an inode allocation buffer so it
+                        * sticks in the AIL at the point of this allocation
+                        * transaction. This ensures that they are on disk before
+                        * the tail of the log can be moved past this
+                        * transaction (i.e. by preventing relogging from moving
+                        * it forward in the log).
+                        */
+                       xfs_trans_inode_alloc_buf(tp, fbuf);
+                       if (version == 3) {
+                               /*
+                                * Mark the buffer as ordered so that it is
+                                * not physically logged in the transaction but
+                                * is still tracked in the AIL as part of the
+                                * transaction, pinning the log appropriately.
+                                */
+                               xfs_trans_ordered_buf(tp, fbuf);
+                               xfs_trans_log_buf(tp, fbuf, 0,
+                                                 BBTOB(fbuf->b_length) - 1);
+                       }
+               } else {
+                       fbuf->b_flags |= XBF_DONE;
+                       xfs_buf_delwri_queue(fbuf, buffer_list);
+                       xfs_buf_relse(fbuf);
+               }
+       }
+       return 0;
+}
+
+/*
+ * Allocate new inodes in the allocation group specified by agbp.
+ * Return 0 for success, else error code.
+ */
+STATIC int                             /* error code or 0 */
+xfs_ialloc_ag_alloc(
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_buf_t       *agbp,          /* alloc group buffer */
+       int             *alloc)
+{
+       xfs_agi_t       *agi;           /* allocation group header */
+       xfs_alloc_arg_t args;           /* allocation argument structure */
+       xfs_agnumber_t  agno;
+       int             error;
+       xfs_agino_t     newino;         /* new first inode's number */
+       xfs_agino_t     newlen;         /* new number of inodes */
+       int             isaligned = 0;  /* inode allocation at stripe unit */
+                                       /* boundary */
+       struct xfs_perag *pag;
+
+       memset(&args, 0, sizeof(args));
+       args.tp = tp;
+       args.mp = tp->t_mountp;
+
+       /*
+        * Locking will ensure that we don't have two callers in here
+        * at one time.
+        */
+       newlen = args.mp->m_ialloc_inos;
+       if (args.mp->m_maxicount &&
+           args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
+               return -ENOSPC;
+       args.minlen = args.maxlen = args.mp->m_ialloc_blks;
+       /*
+        * First try to allocate inodes contiguous with the last-allocated
+        * chunk of inodes.  If the filesystem is striped, this will fill
+        * an entire stripe unit with inodes.
+        */
+       agi = XFS_BUF_TO_AGI(agbp);
+       newino = be32_to_cpu(agi->agi_newino);
+       agno = be32_to_cpu(agi->agi_seqno);
+       args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
+                    args.mp->m_ialloc_blks;
+       if (likely(newino != NULLAGINO &&
+                 (args.agbno < be32_to_cpu(agi->agi_length)))) {
+               args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+               args.type = XFS_ALLOCTYPE_THIS_BNO;
+               args.prod = 1;
+
+               /*
+                * We need to take into account alignment here to ensure that
+                * we don't modify the free list if we fail to have an exact
+                * block. If we don't have an exact match, and every other
+                * allocation attempt fails, we'll end up cancelling
+                * a dirty transaction and shutting down.
+                *
+                * For an exact allocation, alignment must be 1; however,
+                * we need to take cluster alignment into account when
+                * fixing up the freelist. Use the minalignslop field to
+                * indicate that extra blocks might be required for alignment,
+                * but not to use them in the actual exact allocation.
+                */
+               args.alignment = 1;
+               args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
+
+               /* Allow space for the inode btree to split. */
+               args.minleft = args.mp->m_in_maxlevels - 1;
+               if ((error = xfs_alloc_vextent(&args)))
+                       return error;
+
+               /*
+                * This request might have dirtied the transaction if the AG can
+                * satisfy the request, but the exact block was not available.
+                * If the allocation did fail, subsequent requests will relax
+                * the exact agbno requirement and increase the alignment
+                * instead. It is critical that the total size of the request
+                * (len + alignment + slop) does not increase from this point
+                * on, so reset minalignslop to ensure it is not included in
+                * subsequent requests.
+                */
+               args.minalignslop = 0;
+       } else
+               args.fsbno = NULLFSBLOCK;
+
+       if (unlikely(args.fsbno == NULLFSBLOCK)) {
+               /*
+                * Set the alignment for the allocation.
+                * If stripe alignment is turned on then align at stripe unit
+                * boundary.
+                * If the cluster size is smaller than a filesystem block
+                * then we're doing I/O for inodes in filesystem block size
+                * pieces, so don't need alignment anyway.
+                */
+               isaligned = 0;
+               if (args.mp->m_sinoalign) {
+                       ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
+                       args.alignment = args.mp->m_dalign;
+                       isaligned = 1;
+               } else
+                       args.alignment = xfs_ialloc_cluster_alignment(&args);
+               /*
+                * Need to figure out where to allocate the inode blocks.
+                * Ideally they should be spaced out through the a.g.
+                * For now, just allocate blocks up front.
+                */
+               args.agbno = be32_to_cpu(agi->agi_root);
+               args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+               /*
+                * Allocate a fixed-size extent of inodes.
+                */
+               args.type = XFS_ALLOCTYPE_NEAR_BNO;
+               args.prod = 1;
+               /*
+                * Allow space for the inode btree to split.
+                */
+               args.minleft = args.mp->m_in_maxlevels - 1;
+               if ((error = xfs_alloc_vextent(&args)))
+                       return error;
+       }
+
+       /*
+        * If stripe alignment is turned on, then try again with cluster
+        * alignment.
+        */
+       if (isaligned && args.fsbno == NULLFSBLOCK) {
+               args.type = XFS_ALLOCTYPE_NEAR_BNO;
+               args.agbno = be32_to_cpu(agi->agi_root);
+               args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+               args.alignment = xfs_ialloc_cluster_alignment(&args);
+               if ((error = xfs_alloc_vextent(&args)))
+                       return error;
+       }
+
+       if (args.fsbno == NULLFSBLOCK) {
+               *alloc = 0;
+               return 0;
+       }
+       ASSERT(args.len == args.minlen);
+
+       /*
+        * Stamp and write the inode buffers.
+        *
+        * Seed the new inode cluster with a random generation number. This
+        * prevents short-term reuse of generation numbers if a chunk is
+        * freed and then immediately reallocated. We use random numbers
+        * rather than a linear progression to prevent the next generation
+        * number from being easily guessable.
+        */
+       error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
+                       args.len, prandom_u32());
+
+       if (error)
+               return error;
+       /*
+        * Convert the results.
+        */
+       newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+       be32_add_cpu(&agi->agi_count, newlen);
+       be32_add_cpu(&agi->agi_freecount, newlen);
+       pag = xfs_perag_get(args.mp, agno);
+       pag->pagi_freecount += newlen;
+       xfs_perag_put(pag);
+       agi->agi_newino = cpu_to_be32(newino);
+
+       /*
+        * Insert records describing the new inode chunk into the btrees.
+        */
+       error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+                                XFS_BTNUM_INO);
+       if (error)
+               return error;
+
+       if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+               error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+                                        XFS_BTNUM_FINO);
+               if (error)
+                       return error;
+       }
+       /*
+        * Log allocation group header fields
+        */
+       xfs_ialloc_log_agi(tp, agbp,
+               XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
+       /*
+        * Modify/log superblock values for inode count and inode free count.
+        */
+       xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
+       xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
+       *alloc = 1;
+       return 0;
+}
+
+STATIC xfs_agnumber_t
+xfs_ialloc_next_ag(
+       xfs_mount_t     *mp)
+{
+       xfs_agnumber_t  agno;
+
+       spin_lock(&mp->m_agirotor_lock);
+       agno = mp->m_agirotor;
+       if (++mp->m_agirotor >= mp->m_maxagi)
+               mp->m_agirotor = 0;
+       spin_unlock(&mp->m_agirotor_lock);
+
+       return agno;
+}
+
+/*
+ * Select an allocation group to look for a free inode in, based on the parent
+ * inode and the mode.  Return the allocation group number.
+ */
+STATIC xfs_agnumber_t
+xfs_ialloc_ag_select(
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_ino_t       parent,         /* parent directory inode number */
+       umode_t         mode,           /* bits set to indicate file type */
+       int             okalloc)        /* ok to allocate more space */
+{
+       xfs_agnumber_t  agcount;        /* number of ag's in the filesystem */
+       xfs_agnumber_t  agno;           /* current ag number */
+       int             flags;          /* alloc buffer locking flags */
+       xfs_extlen_t    ineed;          /* blocks needed for inode allocation */
+       xfs_extlen_t    longest = 0;    /* longest extent available */
+       xfs_mount_t     *mp;            /* mount point structure */
+       int             needspace;      /* file mode implies space allocated */
+       xfs_perag_t     *pag;           /* per allocation group data */
+       xfs_agnumber_t  pagno;          /* parent (starting) ag number */
+       int             error;
+
+       /*
+        * Files of these types need at least one block if length > 0
+        * (unless the data fits inline in the inode, but that is hard
+        * to figure out here).
+        */
+       needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
+       mp = tp->t_mountp;
+       agcount = mp->m_maxagi;
+       if (S_ISDIR(mode))
+               pagno = xfs_ialloc_next_ag(mp);
+       else {
+               pagno = XFS_INO_TO_AGNO(mp, parent);
+               if (pagno >= agcount)
+                       pagno = 0;
+       }
+
+       ASSERT(pagno < agcount);
+
+       /*
+        * Loop through the allocation groups, looking for one with a
+        * little free space in it.  Note that we don't look for free
+        * inodes exactly; an AG is also acceptable if we could allocate
+        * blocks in it for a new inode chunk, should none of its inodes
+        * currently be free.
+        */
+       agno = pagno;
+       flags = XFS_ALLOC_FLAG_TRYLOCK;
+       for (;;) {
+               pag = xfs_perag_get(mp, agno);
+               if (!pag->pagi_inodeok) {
+                       xfs_ialloc_next_ag(mp);
+                       goto nextag;
+               }
+
+               if (!pag->pagi_init) {
+                       error = xfs_ialloc_pagi_init(mp, tp, agno);
+                       if (error)
+                               goto nextag;
+               }
+
+               if (pag->pagi_freecount) {
+                       xfs_perag_put(pag);
+                       return agno;
+               }
+
+               if (!okalloc)
+                       goto nextag;
+
+               if (!pag->pagf_init) {
+                       error = xfs_alloc_pagf_init(mp, tp, agno, flags);
+                       if (error)
+                               goto nextag;
+               }
+
+               /*
+                * Is there enough free space for the file, plus a chunk
+                * of inodes if we need to allocate some?
+                */
+               ineed = mp->m_ialloc_blks;
+               longest = pag->pagf_longest;
+               if (!longest)
+                       longest = pag->pagf_flcount > 0;
+
+               if (pag->pagf_freeblks >= needspace + ineed &&
+                   longest >= ineed) {
+                       xfs_perag_put(pag);
+                       return agno;
+               }
+nextag:
+               xfs_perag_put(pag);
+               /*
+                * No point in iterating over the rest if we're shutting
+                * down.
+                */
+               if (XFS_FORCED_SHUTDOWN(mp))
+                       return NULLAGNUMBER;
+               agno++;
+               if (agno >= agcount)
+                       agno = 0;
+               if (agno == pagno) {
+                       if (flags == 0)
+                               return NULLAGNUMBER;
+                       flags = 0;
+               }
+       }
+}
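+
+/*
+ * Illustrative sketch, not kernel code: the loop above makes up to two
+ * passes over the AGs.  The first pass uses XFS_ALLOC_FLAG_TRYLOCK so a
+ * busy AGF is skipped rather than waited on; only when a whole pass finds
+ * nothing do we clear the flags and retry, blocking this time.  A minimal
+ * userspace model of that wrap-around, two-pass scan (all names here are
+ * made up for illustration):
+ */
+#if 0
+#define NULLAG	(-1)
+
+static int select_ag(int (*usable)(int agno, int trylock),
+		     int start_agno, int agcount)
+{
+	int trylock = 1;		/* first pass: skip busy AGs */
+	int agno = start_agno;
+
+	for (;;) {
+		if (usable(agno, trylock))
+			return agno;
+		if (++agno == agcount)
+			agno = 0;	/* wrap at the last AG */
+		if (agno == start_agno) {
+			if (!trylock)
+				return NULLAG;	/* both passes failed */
+			trylock = 0;		/* second pass: block */
+		}
+	}
+}
+#endif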
+
+/*
+ * Try to retrieve the next record to the left/right from the current one.
+ */
+STATIC int
+xfs_ialloc_next_rec(
+       struct xfs_btree_cur    *cur,
+       xfs_inobt_rec_incore_t  *rec,
+       int                     *done,
+       int                     left)
+{
+       int                     error;
+       int                     i;
+
+       if (left)
+               error = xfs_btree_decrement(cur, 0, &i);
+       else
+               error = xfs_btree_increment(cur, 0, &i);
+
+       if (error)
+               return error;
+       *done = !i;
+       if (i) {
+               error = xfs_inobt_get_rec(cur, rec, &i);
+               if (error)
+                       return error;
+               XFS_WANT_CORRUPTED_RETURN(i == 1);
+       }
+
+       return 0;
+}
+
+STATIC int
+xfs_ialloc_get_rec(
+       struct xfs_btree_cur    *cur,
+       xfs_agino_t             agino,
+       xfs_inobt_rec_incore_t  *rec,
+       int                     *done)
+{
+       int                     error;
+       int                     i;
+
+       error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
+       if (error)
+               return error;
+       *done = !i;
+       if (i) {
+               error = xfs_inobt_get_rec(cur, rec, &i);
+               if (error)
+                       return error;
+               XFS_WANT_CORRUPTED_RETURN(i == 1);
+       }
+
+       return 0;
+}
+
+/*
+ * Allocate an inode using the inobt-only algorithm.
+ */
+STATIC int
+xfs_dialloc_ag_inobt(
+       struct xfs_trans        *tp,
+       struct xfs_buf          *agbp,
+       xfs_ino_t               parent,
+       xfs_ino_t               *inop)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
+       xfs_agnumber_t          agno = be32_to_cpu(agi->agi_seqno);
+       xfs_agnumber_t          pagno = XFS_INO_TO_AGNO(mp, parent);
+       xfs_agino_t             pagino = XFS_INO_TO_AGINO(mp, parent);
+       struct xfs_perag        *pag;
+       struct xfs_btree_cur    *cur, *tcur;
+       struct xfs_inobt_rec_incore rec, trec;
+       xfs_ino_t               ino;
+       int                     error;
+       int                     offset;
+       int                     i, j;
+
+       pag = xfs_perag_get(mp, agno);
+
+       ASSERT(pag->pagi_init);
+       ASSERT(pag->pagi_inodeok);
+       ASSERT(pag->pagi_freecount > 0);
+
+ restart_pagno:
+       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+       /*
+        * If pagino is 0 (this is the root inode allocation) use newino.
+        * This must work because we've just allocated some.
+        */
+       if (!pagino)
+               pagino = be32_to_cpu(agi->agi_newino);
+
+       error = xfs_check_agi_freecount(cur, agi);
+       if (error)
+               goto error0;
+
+       /*
+        * If in the same AG as the parent, try to get near the parent.
+        */
+       if (pagno == agno) {
+               int             doneleft;       /* done, to the left */
+               int             doneright;      /* done, to the right */
+               int             searchdistance = 10;
+
+               error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+               if (error)
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               error = xfs_inobt_get_rec(cur, &rec, &j);
+               if (error)
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
+
+               if (rec.ir_freecount > 0) {
+                       /*
+                        * Found a free inode in the same chunk
+                        * as the parent, done.
+                        */
+                       goto alloc_inode;
+               }
+
+               /*
+                * In the same AG as parent, but parent's chunk is full.
+                */
+
+               /* duplicate the cursor, search left & right simultaneously */
+               error = xfs_btree_dup_cursor(cur, &tcur);
+               if (error)
+                       goto error0;
+
+               /*
+                * Skip to the blocks we looked up last time, if this is
+                * the same parent inode.
+                */
+               if (pagino != NULLAGINO &&
+                   pag->pagl_pagino == pagino &&
+                   pag->pagl_leftrec != NULLAGINO &&
+                   pag->pagl_rightrec != NULLAGINO) {
+                       error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
+                                                  &trec, &doneleft);
+                       if (error)
+                               goto error1;
+
+                       error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
+                                                  &rec, &doneright);
+                       if (error)
+                               goto error1;
+               } else {
+                       /* search left with tcur, back up 1 record */
+                       error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+                       if (error)
+                               goto error1;
+
+                       /* search right with cur, go forward 1 record. */
+                       error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+                       if (error)
+                               goto error1;
+               }
+
+               /*
+                * Loop until we find an inode chunk with a free inode.
+                */
+               while (!doneleft || !doneright) {
+                       int     useleft;  /* using left inode chunk this time */
+
+                       if (!--searchdistance) {
+                               /*
+                                * Not in range - save the last search
+                                * location and fall back to the newino
+                                * hint
+                                */
+                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+                               pag->pagl_leftrec = trec.ir_startino;
+                               pag->pagl_rightrec = rec.ir_startino;
+                               pag->pagl_pagino = pagino;
+                               goto newino;
+                       }
+
+                       /* figure out the closer block if both are valid. */
+                       if (!doneleft && !doneright) {
+                               useleft = pagino -
+                                (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
+                                 rec.ir_startino - pagino;
+                       } else {
+                               useleft = !doneleft;
+                       }
+
+                       /* free inodes to the left? */
+                       if (useleft && trec.ir_freecount) {
+                               rec = trec;
+                               xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+                               cur = tcur;
+
+                               pag->pagl_leftrec = trec.ir_startino;
+                               pag->pagl_rightrec = rec.ir_startino;
+                               pag->pagl_pagino = pagino;
+                               goto alloc_inode;
+                       }
+
+                       /* free inodes to the right? */
+                       if (!useleft && rec.ir_freecount) {
+                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+                               pag->pagl_leftrec = trec.ir_startino;
+                               pag->pagl_rightrec = rec.ir_startino;
+                               pag->pagl_pagino = pagino;
+                               goto alloc_inode;
+                       }
+
+                       /* get next record to check */
+                       if (useleft) {
+                               error = xfs_ialloc_next_rec(tcur, &trec,
+                                                                &doneleft, 1);
+                       } else {
+                               error = xfs_ialloc_next_rec(cur, &rec,
+                                                                &doneright, 0);
+                       }
+                       if (error)
+                               goto error1;
+               }
+
+               /*
+                * We've reached the end of the btree.  Because we only
+                * search a small chunk of the btree on each pass, there
+                * are obviously free inodes closer to the parent inode
+                * than where we are now.  Restart the search.
+                */
+               pag->pagl_pagino = NULLAGINO;
+               pag->pagl_leftrec = NULLAGINO;
+               pag->pagl_rightrec = NULLAGINO;
+               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+               xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+               goto restart_pagno;
+       }
+
+       /*
+        * In a different AG from the parent.
+        * See if the most recently allocated chunk has any free inodes.
+        */
+newino:
+       if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+               error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+                                        XFS_LOOKUP_EQ, &i);
+               if (error)
+                       goto error0;
+
+               if (i == 1) {
+                       error = xfs_inobt_get_rec(cur, &rec, &j);
+                       if (error)
+                               goto error0;
+
+                       if (j == 1 && rec.ir_freecount > 0) {
+                               /*
+                                * The last chunk allocated in the group
+                                * still has a free inode.
+                                */
+                               goto alloc_inode;
+                       }
+               }
+       }
+
+       /*
+        * None left in the last chunk allocated; search the whole AG.
+        */
+       error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+       if (error)
+               goto error0;
+       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+       for (;;) {
+               error = xfs_inobt_get_rec(cur, &rec, &i);
+               if (error)
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               if (rec.ir_freecount > 0)
+                       break;
+               error = xfs_btree_increment(cur, 0, &i);
+               if (error)
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+       }
+
+alloc_inode:
+       offset = xfs_lowbit64(rec.ir_free);
+       ASSERT(offset >= 0);
+       ASSERT(offset < XFS_INODES_PER_CHUNK);
+       ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+                                  XFS_INODES_PER_CHUNK) == 0);
+       ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+       rec.ir_free &= ~XFS_INOBT_MASK(offset);
+       rec.ir_freecount--;
+       error = xfs_inobt_update(cur, &rec);
+       if (error)
+               goto error0;
+       be32_add_cpu(&agi->agi_freecount, -1);
+       xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+       pag->pagi_freecount--;
+
+       error = xfs_check_agi_freecount(cur, agi);
+       if (error)
+               goto error0;
+
+       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+       xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+       xfs_perag_put(pag);
+       *inop = ino;
+       return 0;
+error1:
+       xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+error0:
+       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+       xfs_perag_put(pag);
+       return error;
+}
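+
+/*
+ * Illustrative sketch, not kernel code: the parent-relative search above
+ * walks two btree cursors outward from the parent's chunk and gives up
+ * after a fixed budget (searchdistance = 10) before falling back to the
+ * newino hint.  A simplified userspace model over a sorted array of chunk
+ * records; unlike the kernel code, which compares inode distances to pick
+ * the closer side, this sketch steps both sides in lockstep (all names
+ * are made up for illustration):
+ */
+#if 0
+struct chunk { unsigned int startino; unsigned int freecount; };
+
+/* Return the index of a chunk with free inodes near @start, or -1. */
+static int search_near(const struct chunk *rec, int nrec, int start)
+{
+	int left = start;		/* walks toward index 0 */
+	int right = start + 1;		/* walks toward nrec - 1 */
+	int budget = 10;		/* bounded search distance */
+
+	while (left >= 0 || right < nrec) {
+		if (budget-- == 0)
+			return -1;	/* out of range: use newino hint */
+		if (left >= 0 && rec[left].freecount)
+			return left;
+		if (right < nrec && rec[right].freecount)
+			return right;
+		left--;
+		right++;
+	}
+	return -1;
+}
+#endif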
+
+/*
+ * Use the free inode btree to allocate an inode based on distance from the
+ * parent. Note that the provided cursor may be deleted and replaced.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_near(
+       xfs_agino_t                     pagino,
+       struct xfs_btree_cur            **ocur,
+       struct xfs_inobt_rec_incore     *rec)
+{
+       struct xfs_btree_cur            *lcur = *ocur;  /* left search cursor */
+       struct xfs_btree_cur            *rcur;  /* right search cursor */
+       struct xfs_inobt_rec_incore     rrec;
+       int                             error;
+       int                             i, j;
+
+       error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
+       if (error)
+               return error;
+
+       if (i == 1) {
+               error = xfs_inobt_get_rec(lcur, rec, &i);
+               if (error)
+                       return error;
+               XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+               /*
+                * See if we've landed in the parent inode record. The finobt
+                * only tracks chunks with at least one free inode, so record
+                * existence is enough.
+                */
+               if (pagino >= rec->ir_startino &&
+                   pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
+                       return 0;
+       }
+
+       error = xfs_btree_dup_cursor(lcur, &rcur);
+       if (error)
+               return error;
+
+       error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
+       if (error)
+               goto error_rcur;
+       if (j == 1) {
+               error = xfs_inobt_get_rec(rcur, &rrec, &j);
+               if (error)
+                       goto error_rcur;
+               XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur);
+       }
+
+       XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur);
+       if (i == 1 && j == 1) {
+               /*
+                * Both the left and right records are valid. Choose the closer
+                * inode chunk to the target.
+                */
+               if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
+                   (rrec.ir_startino - pagino)) {
+                       *rec = rrec;
+                       xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+                       *ocur = rcur;
+               } else {
+                       xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+               }
+       } else if (j == 1) {
+               /* only the right record is valid */
+               *rec = rrec;
+               xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+               *ocur = rcur;
+       } else if (i == 1) {
+               /* only the left record is valid */
+               xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+       }
+
+       return 0;
+
+error_rcur:
+       xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
+       return error;
+}
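+
+/*
+ * Worked example for the comparison above, with XFS_INODES_PER_CHUNK == 64:
+ * for pagino 200, a left record starting at 128 and a right record starting
+ * at 320, the left-hand metric is 200 - 128 + 63 = 135 and the right-hand
+ * metric is 320 - 200 = 120, so the right record wins.  The extra
+ * XFS_INODES_PER_CHUNK - 1 term biases the choice against the left chunk.
+ */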
+
+/*
+ * Use the free inode btree to find a free inode based on a newino hint. If
+ * the hint is NULLAGINO, find the first free inode in the AG.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_newino(
+       struct xfs_agi                  *agi,
+       struct xfs_btree_cur            *cur,
+       struct xfs_inobt_rec_incore     *rec)
+{
+       int error;
+       int i;
+
+       if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+               error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+                                        XFS_LOOKUP_EQ, &i);
+               if (error)
+                       return error;
+               if (i == 1) {
+                       error = xfs_inobt_get_rec(cur, rec, &i);
+                       if (error)
+                               return error;
+                       XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+                       return 0;
+               }
+       }
+
+       /*
+        * Find the first inode available in the AG.
+        */
+       error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+       if (error)
+               return error;
+       XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+       error = xfs_inobt_get_rec(cur, rec, &i);
+       if (error)
+               return error;
+       XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+       return 0;
+}
+
+/*
+ * Update the inobt based on a modification made to the finobt. Also ensure that
+ * the records from both trees are equivalent post-modification.
+ */
+STATIC int
+xfs_dialloc_ag_update_inobt(
+       struct xfs_btree_cur            *cur,   /* inobt cursor */
+       struct xfs_inobt_rec_incore     *frec,  /* finobt record */
+       int                             offset) /* inode offset */
+{
+       struct xfs_inobt_rec_incore     rec;
+       int                             error;
+       int                             i;
+
+       error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
+       if (error)
+               return error;
+       XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+       error = xfs_inobt_get_rec(cur, &rec, &i);
+       if (error)
+               return error;
+       XFS_WANT_CORRUPTED_RETURN(i == 1);
+       ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
+                                  XFS_INODES_PER_CHUNK) == 0);
+
+       rec.ir_free &= ~XFS_INOBT_MASK(offset);
+       rec.ir_freecount--;
+
+       XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
+                                 (rec.ir_freecount == frec->ir_freecount));
+
+       error = xfs_inobt_update(cur, &rec);
+       if (error)
+               return error;
+
+       return 0;
+}
+
+/*
+ * Allocate an inode using the free inode btree, if available. Otherwise, fall
+ * back to the inobt search algorithm.
+ *
+ * The caller selected an AG for us, and made sure that free inodes are
+ * available.
+ */
+STATIC int
+xfs_dialloc_ag(
+       struct xfs_trans        *tp,
+       struct xfs_buf          *agbp,
+       xfs_ino_t               parent,
+       xfs_ino_t               *inop)
+{
+       struct xfs_mount                *mp = tp->t_mountp;
+       struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
+       xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
+       xfs_agnumber_t                  pagno = XFS_INO_TO_AGNO(mp, parent);
+       xfs_agino_t                     pagino = XFS_INO_TO_AGINO(mp, parent);
+       struct xfs_perag                *pag;
+       struct xfs_btree_cur            *cur;   /* finobt cursor */
+       struct xfs_btree_cur            *icur;  /* inobt cursor */
+       struct xfs_inobt_rec_incore     rec;
+       xfs_ino_t                       ino;
+       int                             error;
+       int                             offset;
+       int                             i;
+
+       if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+               return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
+
+       pag = xfs_perag_get(mp, agno);
+
+       /*
+        * If pagino is 0 (this is the root inode allocation) use newino.
+        * This must work because we've just allocated some.
+        */
+       if (!pagino)
+               pagino = be32_to_cpu(agi->agi_newino);
+
+       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+       error = xfs_check_agi_freecount(cur, agi);
+       if (error)
+               goto error_cur;
+
+       /*
+        * The search algorithm depends on whether we're in the same AG as the
+        * parent. If so, find the closest available inode to the parent. If
+        * not, consider the agi hint or find the first free inode in the AG.
+        */
+       if (agno == pagno)
+               error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
+       else
+               error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
+       if (error)
+               goto error_cur;
+
+       offset = xfs_lowbit64(rec.ir_free);
+       ASSERT(offset >= 0);
+       ASSERT(offset < XFS_INODES_PER_CHUNK);
+       ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+                                  XFS_INODES_PER_CHUNK) == 0);
+       ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+
+       /*
+        * Modify or remove the finobt record.
+        */
+       rec.ir_free &= ~XFS_INOBT_MASK(offset);
+       rec.ir_freecount--;
+       if (rec.ir_freecount)
+               error = xfs_inobt_update(cur, &rec);
+       else
+               error = xfs_btree_delete(cur, &i);
+       if (error)
+               goto error_cur;
+
+       /*
+        * The finobt has now been updated appropriately. We haven't updated the
+        * agi and superblock yet, so we can create an inobt cursor and validate
+        * the original freecount. If all is well, make the equivalent update to
+        * the inobt using the finobt record and offset information.
+        */
+       icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+
+       error = xfs_check_agi_freecount(icur, agi);
+       if (error)
+               goto error_icur;
+
+       error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
+       if (error)
+               goto error_icur;
+
+       /*
+        * Both trees have now been updated. We must update the perag and
+        * superblock before we can check the freecount for each btree.
+        */
+       be32_add_cpu(&agi->agi_freecount, -1);
+       xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+       pag->pagi_freecount--;
+
+       xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+
+       error = xfs_check_agi_freecount(icur, agi);
+       if (error)
+               goto error_icur;
+       error = xfs_check_agi_freecount(cur, agi);
+       if (error)
+               goto error_icur;
+
+       xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
+       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+       xfs_perag_put(pag);
+       *inop = ino;
+       return 0;
+
+error_icur:
+       xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
+error_cur:
+       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+       xfs_perag_put(pag);
+       return error;
+}
+
+/*
+ * Allocate an inode on disk.
+ *
+ * Mode is used to tell whether the new inode will need space, and whether it
+ * is a directory.
+ *
+ * This function is designed to be called twice if it has to do an allocation
+ * to make more free inodes.  On the first call, *IO_agbp should be set to NULL.
+ * If an inode is available without having to perform an allocation, an inode
+ * number is returned.  In this case, *IO_agbp is set to NULL.  If an allocation
+ * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
+ * The caller should then commit the current transaction, allocate a
+ * new transaction, and call xfs_dialloc() again, passing in the previous value
+ * of *IO_agbp.  IO_agbp should be held across the transactions. Since the AGI
+ * buffer is locked across the two calls, the second call is guaranteed to have
+ * a free inode available.
+ *
+ * Once we successfully pick an inode, its number is returned and the on-disk
+ * data structures are updated.  The inode itself is not read in, since doing so
+ * would break ordering constraints with xfs_reclaim.
+ */
+int
+xfs_dialloc(
+       struct xfs_trans        *tp,
+       xfs_ino_t               parent,
+       umode_t                 mode,
+       int                     okalloc,
+       struct xfs_buf          **IO_agbp,
+       xfs_ino_t               *inop)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       struct xfs_buf          *agbp;
+       xfs_agnumber_t          agno;
+       int                     error;
+       int                     ialloced;
+       int                     noroom = 0;
+       xfs_agnumber_t          start_agno;
+       struct xfs_perag        *pag;
+
+       if (*IO_agbp) {
+               /*
+                * If the caller passes in a pointer to the AGI buffer,
+                * continue where we left off before.  In this case, we
+                * know that the allocation group has free inodes.
+                */
+               agbp = *IO_agbp;
+               goto out_alloc;
+       }
+
+       /*
+        * We do not have an agbp, so select an initial allocation
+        * group for inode allocation.
+        */
+       start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
+       if (start_agno == NULLAGNUMBER) {
+               *inop = NULLFSINO;
+               return 0;
+       }
+
+       /*
+        * If allocating another inode chunk would push us over the
+        * maximum inode count, clear okalloc so we scan all available
+        * agi structures for a free inode.
+        */
+       if (mp->m_maxicount &&
+           mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
+               noroom = 1;
+               okalloc = 0;
+       }
+
+       /*
+        * Loop until we find an allocation group that either has free inodes
+        * or in which we can allocate some inodes.  Iterate through the
+        * allocation groups upward, wrapping at the end.
+        */
+       agno = start_agno;
+       for (;;) {
+               pag = xfs_perag_get(mp, agno);
+               if (!pag->pagi_inodeok) {
+                       xfs_ialloc_next_ag(mp);
+                       goto nextag;
+               }
+
+               if (!pag->pagi_init) {
+                       error = xfs_ialloc_pagi_init(mp, tp, agno);
+                       if (error)
+                               goto out_error;
+               }
+
+               /*
+                * First do a racy fast path check to see if this AG is usable.
+                */
+               if (!pag->pagi_freecount && !okalloc)
+                       goto nextag;
+
+               /*
+                * Then read in the AGI buffer and recheck with the AGI buffer
+                * lock held.
+                */
+               error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+               if (error)
+                       goto out_error;
+
+               if (pag->pagi_freecount) {
+                       xfs_perag_put(pag);
+                       goto out_alloc;
+               }
+
+               if (!okalloc)
+                       goto nextag_relse_buffer;
+
+               error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
+               if (error) {
+                       xfs_trans_brelse(tp, agbp);
+
+                       if (error != -ENOSPC)
+                               goto out_error;
+
+                       xfs_perag_put(pag);
+                       *inop = NULLFSINO;
+                       return 0;
+               }
+
+               if (ialloced) {
+                       /*
+                        * We successfully allocated some inodes, return
+                        * the current context to the caller so that it
+                        * can commit the current transaction and call
+                        * us again where we left off.
+                        */
+                       ASSERT(pag->pagi_freecount > 0);
+                       xfs_perag_put(pag);
+
+                       *IO_agbp = agbp;
+                       *inop = NULLFSINO;
+                       return 0;
+               }
+
+nextag_relse_buffer:
+               xfs_trans_brelse(tp, agbp);
+nextag:
+               xfs_perag_put(pag);
+               if (++agno == mp->m_sb.sb_agcount)
+                       agno = 0;
+               if (agno == start_agno) {
+                       *inop = NULLFSINO;
+                       return noroom ? -ENOSPC : 0;
+               }
+       }
+
+out_alloc:
+       *IO_agbp = NULL;
+       return xfs_dialloc_ag(tp, agbp, parent, inop);
+out_error:
+       xfs_perag_put(pag);
+       return error;
+}
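+
+/*
+ * Illustrative sketch, not kernel code: the two-call protocol described in
+ * the comment above xfs_dialloc() looks roughly like this from the caller's
+ * side.  The transaction helper below is a made-up stand-in; the real
+ * caller commits the dirty transaction and starts a new one between the
+ * two calls while the AGI buffer stays locked:
+ */
+#if 0
+/* Hypothetical caller, for illustration only. */
+static int alloc_one_inode(struct xfs_trans *tp, xfs_ino_t parent,
+			   umode_t mode, int okalloc, xfs_ino_t *inop)
+{
+	struct xfs_buf	*agibp = NULL;
+	int		error;
+
+	/* First call: may hand back the AGI buffer instead of an inode. */
+	error = xfs_dialloc(tp, parent, mode, okalloc, &agibp, inop);
+	if (error)
+		return error;
+
+	if (agibp) {
+		/*
+		 * A chunk was allocated: commit the dirty transaction and
+		 * start a fresh one (hypothetical helper) while the AGI
+		 * buffer stays locked, then retry.
+		 */
+		error = commit_and_renew_trans(&tp);
+		if (error)
+			return error;
+		error = xfs_dialloc(tp, parent, mode, okalloc, &agibp, inop);
+	}
+	return error;
+}
+#endif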
+
+STATIC int
+xfs_difree_inobt(
+       struct xfs_mount                *mp,
+       struct xfs_trans                *tp,
+       struct xfs_buf                  *agbp,
+       xfs_agino_t                     agino,
+       struct xfs_bmap_free            *flist,
+       int                             *deleted,
+       xfs_ino_t                       *first_ino,
+       struct xfs_inobt_rec_incore     *orec)
+{
+       struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
+       xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
+       struct xfs_perag                *pag;
+       struct xfs_btree_cur            *cur;
+       struct xfs_inobt_rec_incore     rec;
+       int                             ilen;
+       int                             error;
+       int                             i;
+       int                             off;
+
+       ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+       ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
+
+       /*
+        * Initialize the cursor.
+        */
+       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+
+       error = xfs_check_agi_freecount(cur, agi);
+       if (error)
+               goto error0;
+
+       /*
+        * Look for the entry describing this inode.
+        */
+       if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
+               xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
+                       __func__, error);
+               goto error0;
+       }
+       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+       error = xfs_inobt_get_rec(cur, &rec, &i);
+       if (error) {
+               xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
+                       __func__, error);
+               goto error0;
+       }
+       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+       /*
+        * Get the offset in the inode chunk.
+        */
+       off = agino - rec.ir_startino;
+       ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
+       ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
+       /*
+        * Mark the inode free & increment the count.
+        */
+       rec.ir_free |= XFS_INOBT_MASK(off);
+       rec.ir_freecount++;
+
+       /*
+        * When an inode chunk is completely free it becomes eligible
+        * for removal.
+        */
+       if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
+           (rec.ir_freecount == mp->m_ialloc_inos)) {
+
+               *deleted = 1;
+               *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+
+               /*
+                * Remove the inode chunk from the AGI B+Tree, adjust the
+                * AGI and Superblock inode counts, and mark the disk space
+                * to be freed when the transaction is committed.
+                */
+               ilen = mp->m_ialloc_inos;
+               be32_add_cpu(&agi->agi_count, -ilen);
+               be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
+               xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
+               pag = xfs_perag_get(mp, agno);
+               pag->pagi_freecount -= ilen - 1;
+               xfs_perag_put(pag);
+               xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
+               xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
+
+               if ((error = xfs_btree_delete(cur, &i))) {
+                       xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
+                               __func__, error);
+                       goto error0;
+               }
+
+               xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+                                 XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
+                                 mp->m_ialloc_blks, flist, mp);
+       } else {
+               *deleted = 0;
+
+               error = xfs_inobt_update(cur, &rec);
+               if (error) {
+                       xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
+                               __func__, error);
+                       goto error0;
+               }
+
+               /*
+                * Change the inode free counts and log the ag/sb changes.
+                */
+               be32_add_cpu(&agi->agi_freecount, 1);
+               xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+               pag = xfs_perag_get(mp, agno);
+               pag->pagi_freecount++;
+               xfs_perag_put(pag);
+               xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
+       }
+
+       error = xfs_check_agi_freecount(cur, agi);
+       if (error)
+               goto error0;
+
+       *orec = rec;
+       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+       return 0;
+
+error0:
+       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+       return error;
+}
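+
+/*
+ * Worked example for the accounting above, assuming 64 inodes per chunk
+ * (mp->m_ialloc_inos == 64): freeing the last used inode of a chunk first
+ * bumps rec.ir_freecount to 64, which triggers chunk removal.  The AGI
+ * counts then change by agi_count -= 64 and agi_freecount -= 63: the 63
+ * inodes that were already free leave the books, and the one being freed
+ * now is never counted as free at all.  The superblock deltas match:
+ * icount -= 64, ifree -= 63.
+ */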
+
+/*
+ * Free an inode in the free inode btree.
+ */
+STATIC int
+xfs_difree_finobt(
+       struct xfs_mount                *mp,
+       struct xfs_trans                *tp,
+       struct xfs_buf                  *agbp,
+       xfs_agino_t                     agino,
+       struct xfs_inobt_rec_incore     *ibtrec) /* inobt record */
+{
+       struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
+       xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
+       struct xfs_btree_cur            *cur;
+       struct xfs_inobt_rec_incore     rec;
+       int                             offset = agino - ibtrec->ir_startino;
+       int                             error;
+       int                             i;
+
+       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+       error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
+       if (error)
+               goto error;
+       if (i == 0) {
+               /*
+                * If the record does not exist in the finobt, we must have just
+                * freed an inode in a previously fully allocated chunk. If not,
+                * something is out of sync.
+                */
+               XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error);
+
+               error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+                                            ibtrec->ir_free, &i);
+               if (error)
+                       goto error;
+               ASSERT(i == 1);
+
+               goto out;
+       }
+
+       /*
+        * Read and update the existing record. We could just copy the ibtrec
+        * across here, but that would defeat the purpose of having redundant
+        * metadata. By making the modifications independently, we can catch
+        * corruptions that we wouldn't see if we just copied from one record
+        * to another.
+        */
+       error = xfs_inobt_get_rec(cur, &rec, &i);
+       if (error)
+               goto error;
+       XFS_WANT_CORRUPTED_GOTO(i == 1, error);
+
+       rec.ir_free |= XFS_INOBT_MASK(offset);
+       rec.ir_freecount++;
+
+       XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) &&
+                               (rec.ir_freecount == ibtrec->ir_freecount),
+                               error);
+
+       /*
+        * The content of inobt records should always match between the inobt
+        * and finobt. The lifecycle of records in the finobt is different from
+        * the inobt in that the finobt only tracks records with at least one
+        * free inode. Hence, if all of the inodes are free and we aren't
+        * keeping inode chunks permanently on disk, remove the record.
+        * Otherwise, update the record with the new information.
+        */
+       if (rec.ir_freecount == mp->m_ialloc_inos &&
+           !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+               error = xfs_btree_delete(cur, &i);
+               if (error)
+                       goto error;
+               ASSERT(i == 1);
+       } else {
+               error = xfs_inobt_update(cur, &rec);
+               if (error)
+                       goto error;
+       }
+
+out:
+       error = xfs_check_agi_freecount(cur, agi);
+       if (error)
+               goto error;
+
+       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+       return 0;
+
+error:
+       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+       return error;
+}
+
+/*
+ * Free disk inode.  Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ */
+int
+xfs_difree(
+       struct xfs_trans        *tp,            /* transaction pointer */
+       xfs_ino_t               inode,          /* inode to be freed */
+       struct xfs_bmap_free    *flist,         /* extents to free */
+       int                     *deleted,   /* set if inode cluster was deleted */
+       xfs_ino_t               *first_ino) /* first inode in deleted cluster */
+{
+       /* REFERENCED */
+       xfs_agblock_t           agbno;  /* block number containing inode */
+       struct xfs_buf          *agbp;  /* buffer for allocation group header */
+       xfs_agino_t             agino;  /* allocation group inode number */
+       xfs_agnumber_t          agno;   /* allocation group number */
+       int                     error;  /* error return value */
+       struct xfs_mount        *mp;    /* mount structure for filesystem */
+       struct xfs_inobt_rec_incore rec;/* btree record */
+
+       mp = tp->t_mountp;
+
+       /*
+        * Break up inode number into its components.
+        */
+       agno = XFS_INO_TO_AGNO(mp, inode);
+       if (agno >= mp->m_sb.sb_agcount)  {
+               xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
+                       __func__, agno, mp->m_sb.sb_agcount);
+               ASSERT(0);
+               return -EINVAL;
+       }
+       agino = XFS_INO_TO_AGINO(mp, inode);
+       if (inode != XFS_AGINO_TO_INO(mp, agno, agino))  {
+               xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+                       __func__, (unsigned long long)inode,
+                       (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
+               ASSERT(0);
+               return -EINVAL;
+       }
+       agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+       if (agbno >= mp->m_sb.sb_agblocks)  {
+               xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
+                       __func__, agbno, mp->m_sb.sb_agblocks);
+               ASSERT(0);
+               return -EINVAL;
+       }
+       /*
+        * Get the allocation group header.
+        */
+       error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+       if (error) {
+               xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
+                       __func__, error);
+               return error;
+       }
+
+       /*
+        * Fix up the inode allocation btree.
+        */
+       error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
+                                &rec);
+       if (error)
+               goto error0;
+
+       /*
+        * Fix up the free inode btree.
+        */
+       if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+               error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
+               if (error)
+                       goto error0;
+       }
+
+       return 0;
+
+error0:
+       return error;
+}
+
+STATIC int
+xfs_imap_lookup(
+       struct xfs_mount        *mp,
+       struct xfs_trans        *tp,
+       xfs_agnumber_t          agno,
+       xfs_agino_t             agino,
+       xfs_agblock_t           agbno,
+       xfs_agblock_t           *chunk_agbno,
+       xfs_agblock_t           *offset_agbno,
+       int                     flags)
+{
+       struct xfs_inobt_rec_incore rec;
+       struct xfs_btree_cur    *cur;
+       struct xfs_buf          *agbp;
+       int                     error;
+       int                     i;
+
+       error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+       if (error) {
+               xfs_alert(mp,
+                       "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
+                       __func__, error, agno);
+               return error;
+       }
+
+       /*
+        * Look up the inode record for the given agino. If the record cannot be
+        * found, then it's an invalid inode number and we should abort. Once
+        * we have a record, we need to ensure it contains the inode number
+        * we are looking up.
+        */
+       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+       error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
+       if (!error) {
+               if (i)
+                       error = xfs_inobt_get_rec(cur, &rec, &i);
+               if (!error && i == 0)
+                       error = -EINVAL;
+       }
+
+       xfs_trans_brelse(tp, agbp);
+       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+       if (error)
+               return error;
+
+       /* check that the returned record contains the required inode */
+       if (rec.ir_startino > agino ||
+           rec.ir_startino + mp->m_ialloc_inos <= agino)
+               return -EINVAL;
+
+       /* for untrusted inodes check it is allocated first */
+       if ((flags & XFS_IGET_UNTRUSTED) &&
+           (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
+               return -EINVAL;
+
+       *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
+       *offset_agbno = agbno - *chunk_agbno;
+       return 0;
+}
+
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ */
+int
+xfs_imap(
+       xfs_mount_t      *mp,   /* file system mount structure */
+       xfs_trans_t      *tp,   /* transaction pointer */
+       xfs_ino_t       ino,    /* inode to locate */
+       struct xfs_imap *imap,  /* location map structure */
+       uint            flags)  /* flags for inode btree lookup */
+{
+       xfs_agblock_t   agbno;  /* block number of inode in the alloc group */
+       xfs_agino_t     agino;  /* inode number within alloc group */
+       xfs_agnumber_t  agno;   /* allocation group number */
+       int             blks_per_cluster; /* num blocks per inode cluster */
+       xfs_agblock_t   chunk_agbno;    /* first block in inode chunk */
+       xfs_agblock_t   cluster_agbno;  /* first block in inode cluster */
+       int             error;  /* error code */
+       int             offset; /* index of inode in its buffer */
+       xfs_agblock_t   offset_agbno;   /* blks from chunk start to inode */
+
+       ASSERT(ino != NULLFSINO);
+
+       /*
+        * Split up the inode number into its parts.
+        */
+       agno = XFS_INO_TO_AGNO(mp, ino);
+       agino = XFS_INO_TO_AGINO(mp, ino);
+       agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+       if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
+           ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+#ifdef DEBUG
+               /*
+                * Don't output diagnostic information for untrusted inodes
+                * as they can be invalid without implying corruption.
+                */
+               if (flags & XFS_IGET_UNTRUSTED)
+                       return -EINVAL;
+               if (agno >= mp->m_sb.sb_agcount) {
+                       xfs_alert(mp,
+                               "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
+                               __func__, agno, mp->m_sb.sb_agcount);
+               }
+               if (agbno >= mp->m_sb.sb_agblocks) {
+                       xfs_alert(mp,
+               "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
+                               __func__, (unsigned long long)agbno,
+                               (unsigned long)mp->m_sb.sb_agblocks);
+               }
+               if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+                       xfs_alert(mp,
+               "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
+                               __func__, ino,
+                               XFS_AGINO_TO_INO(mp, agno, agino));
+               }
+               xfs_stack_trace();
+#endif /* DEBUG */
+               return -EINVAL;
+       }
+
+       blks_per_cluster = xfs_icluster_size_fsb(mp);
+
+       /*
+        * For bulkstat and handle lookups, we have an untrusted inode number
+        * that we have to verify is valid. We cannot do this just by reading
+        * the inode buffer as it may have been unlinked and removed leaving
+        * inodes in stale state on disk. Hence we have to do a btree lookup
+        * in all cases where an untrusted inode number is passed.
+        */
+       if (flags & XFS_IGET_UNTRUSTED) {
+               error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+                                       &chunk_agbno, &offset_agbno, flags);
+               if (error)
+                       return error;
+               goto out_map;
+       }
+
+       /*
+        * If the inode cluster size is the same as the blocksize or
+        * smaller, we can get to the buffer by simple arithmetic.
+        */
+       if (blks_per_cluster == 1) {
+               offset = XFS_INO_TO_OFFSET(mp, ino);
+               ASSERT(offset < mp->m_sb.sb_inopblock);
+
+               imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+               imap->im_len = XFS_FSB_TO_BB(mp, 1);
+               imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+               return 0;
+       }
+
+       /*
+        * If the inode chunks are aligned then use simple arithmetic
+        * to find the location.  Otherwise we have to do a btree
+        * lookup.
+        */
+       if (mp->m_inoalign_mask) {
+               offset_agbno = agbno & mp->m_inoalign_mask;
+               chunk_agbno = agbno - offset_agbno;
+       } else {
+               error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+                                       &chunk_agbno, &offset_agbno, flags);
+               if (error)
+                       return error;
+       }
+
+out_map:
+       ASSERT(agbno >= chunk_agbno);
+       cluster_agbno = chunk_agbno +
+               ((offset_agbno / blks_per_cluster) * blks_per_cluster);
+       offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
+               XFS_INO_TO_OFFSET(mp, ino);
+
+       imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
+       imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+       imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+
+       /*
+        * If the inode number maps to a block outside the bounds
+        * of the file system then return an error rather than calling
+        * read_buf and panicking when we get an error from the
+        * driver.
+        */
+       if ((imap->im_blkno + imap->im_len) >
+           XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+               xfs_alert(mp,
+       "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
+                       __func__, (unsigned long long) imap->im_blkno,
+                       (unsigned long long) imap->im_len,
+                       XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+               return -EINVAL;
+       }
+       return 0;
+}
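+
+/*
+ * Worked example for the mapping arithmetic above, with assumed geometry:
+ * blks_per_cluster = 4, sb_inopblock = 16, chunk_agbno = 100, and an inode
+ * whose agbno is 106 with XFS_INO_TO_OFFSET() == 5.  Then offset_agbno =
+ * 6, cluster_agbno = 100 + (6 / 4) * 4 = 104, and the buffer offset is
+ * (106 - 104) * 16 + 5 = inode 37 within the 4-block cluster buffer,
+ * which im_boffset converts to a byte offset.
+ */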
+
+/*
+ * Compute and fill in the value of m_in_maxlevels.
+ */
+void
+xfs_ialloc_compute_maxlevels(
+       xfs_mount_t     *mp)            /* file system mount structure */
+{
+       int             level;
+       uint            maxblocks;
+       uint            maxleafents;
+       int             minleafrecs;
+       int             minnoderecs;
+
+       maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
+               XFS_INODES_PER_CHUNK_LOG;
+       minleafrecs = mp->m_alloc_mnr[0];
+       minnoderecs = mp->m_alloc_mnr[1];
+       maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+       for (level = 1; maxblocks > 1; level++)
+               maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+       mp->m_in_maxlevels = level;
+}
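+
+/*
+ * Worked example for the loop above, with assumed (not real) geometry:
+ * XFS_INO_AGINO_BITS == 32 and XFS_INODES_PER_CHUNK_LOG == 6 give
+ * maxleafents = 2^26 records; with minleafrecs = minnoderecs = 16 that is
+ * maxblocks = 2^22 leaf blocks, and each pass divides by 16:
+ * 2^22 -> 2^18 -> 2^14 -> 2^10 -> 2^6 -> 4 -> 1.  Six passes, so the
+ * loop leaves m_in_maxlevels = 7.
+ */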
+
+/*
+ * Log specified fields for the ag hdr (inode section). The growth of the agi
+ * structure over time requires that we interpret the buffer as two logical
+ * regions delineated by the end of the unlinked list. This is due to the size
+ * of the hash table and its location in the middle of the agi.
+ *
+ * For example, a request to log a field before agi_unlinked and a field after
+ * agi_unlinked could cause us to log the entire hash table and use an excessive
+ * amount of log space. To avoid this behavior, log the region up through
+ * agi_unlinked in one call and the region after agi_unlinked through the end of
+ * the structure in another.
+ */
+void
+xfs_ialloc_log_agi(
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_buf_t       *bp,            /* allocation group header buffer */
+       int             fields)         /* bitmask of fields to log */
+{
+       int                     first;          /* first byte number */
+       int                     last;           /* last byte number */
+       static const short      offsets[] = {   /* field starting offsets */
+                                       /* keep in sync with bit definitions */
+               offsetof(xfs_agi_t, agi_magicnum),
+               offsetof(xfs_agi_t, agi_versionnum),
+               offsetof(xfs_agi_t, agi_seqno),
+               offsetof(xfs_agi_t, agi_length),
+               offsetof(xfs_agi_t, agi_count),
+               offsetof(xfs_agi_t, agi_root),
+               offsetof(xfs_agi_t, agi_level),
+               offsetof(xfs_agi_t, agi_freecount),
+               offsetof(xfs_agi_t, agi_newino),
+               offsetof(xfs_agi_t, agi_dirino),
+               offsetof(xfs_agi_t, agi_unlinked),
+               offsetof(xfs_agi_t, agi_free_root),
+               offsetof(xfs_agi_t, agi_free_level),
+               sizeof(xfs_agi_t)
+       };
+#ifdef DEBUG
+       xfs_agi_t               *agi;   /* allocation group header */
+
+       agi = XFS_BUF_TO_AGI(bp);
+       ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+#endif
+
+       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
+
+       /*
+        * Compute byte offsets for the first and last fields in the first
+        * region and log the agi buffer. This only logs up through
+        * agi_unlinked.
+        */
+       if (fields & XFS_AGI_ALL_BITS_R1) {
+               xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
+                                 &first, &last);
+               xfs_trans_log_buf(tp, bp, first, last);
+       }
+
+       /*
+        * Mask off the bits in the first region and calculate the first and
+        * last field offsets for any bits in the second region.
+        */
+       fields &= ~XFS_AGI_ALL_BITS_R1;
+       if (fields) {
+               xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
+                                 &first, &last);
+               xfs_trans_log_buf(tp, bp, first, last);
+       }
+}
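+
+/*
+ * For example (illustrative): a call with fields == XFS_AGI_COUNT |
+ * XFS_AGI_FREE_ROOT results in two xfs_trans_log_buf() calls, one
+ * covering agi_count (the only region-1 bit set) and one covering
+ * agi_free_root, instead of a single range that would drag the whole
+ * unlinked hash table in between into the log.
+ */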
+
+#ifdef DEBUG
+STATIC void
+xfs_check_agi_unlinked(
+       struct xfs_agi          *agi)
+{
+       int                     i;
+
+       for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
+               ASSERT(agi->agi_unlinked[i]);
+}
+#else
+#define xfs_check_agi_unlinked(agi)
+#endif
+
+static bool
+xfs_agi_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_agi  *agi = XFS_BUF_TO_AGI(bp);
+
+       if (xfs_sb_version_hascrc(&mp->m_sb) &&
+           !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
+               return false;
+       /*
+        * Validate the magic number of the agi block.
+        */
+       if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+               return false;
+       if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
+               return false;
+
+       /*
+        * During growfs operations, the perag is not fully initialised and
+        * so can't be used for any useful checking. growfs ensures this by
+        * using uncached buffers that don't have the perag attached, so we
+        * can detect and avoid the problem.
+        */
+       if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
+               return false;
+
+       xfs_check_agi_unlinked(agi);
+       return true;
+}
+
+static void
+xfs_agi_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb) &&
+           !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
+               xfs_buf_ioerror(bp, -EFSBADCRC);
+       else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
+                               XFS_ERRTAG_IALLOC_READ_AGI,
+                               XFS_RANDOM_IALLOC_READ_AGI))
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+       if (bp->b_error)
+               xfs_verifier_error(bp);
+}
+
+static void
+xfs_agi_write_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+       if (!xfs_agi_verify(bp)) {
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               return;
+       }
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (bip)
+               XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+       xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agi_buf_ops = {
+       .verify_read = xfs_agi_read_verify,
+       .verify_write = xfs_agi_write_verify,
+};
+
+/*
+ * Read in the allocation group header (inode allocation section)
+ */
+int
+xfs_read_agi(
+       struct xfs_mount        *mp,    /* file system mount structure */
+       struct xfs_trans        *tp,    /* transaction pointer */
+       xfs_agnumber_t          agno,   /* allocation group number */
+       struct xfs_buf          **bpp)  /* allocation group hdr buf */
+{
+       int                     error;
+
+       trace_xfs_read_agi(mp, agno);
+
+       ASSERT(agno != NULLAGNUMBER);
+       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+                       XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+                       XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
+       if (error)
+               return error;
+
+       xfs_buf_set_ref(*bpp, XFS_AGI_REF);
+       return 0;
+}
+
+int
+xfs_ialloc_read_agi(
+       struct xfs_mount        *mp,    /* file system mount structure */
+       struct xfs_trans        *tp,    /* transaction pointer */
+       xfs_agnumber_t          agno,   /* allocation group number */
+       struct xfs_buf          **bpp)  /* allocation group hdr buf */
+{
+       struct xfs_agi          *agi;   /* allocation group header */
+       struct xfs_perag        *pag;   /* per allocation group data */
+       int                     error;
+
+       trace_xfs_ialloc_read_agi(mp, agno);
+
+       error = xfs_read_agi(mp, tp, agno, bpp);
+       if (error)
+               return error;
+
+       agi = XFS_BUF_TO_AGI(*bpp);
+       pag = xfs_perag_get(mp, agno);
+       if (!pag->pagi_init) {
+               pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
+               pag->pagi_count = be32_to_cpu(agi->agi_count);
+               pag->pagi_init = 1;
+       }
+
+       /*
+        * It's possible for these to be out of sync if
+        * we are in the middle of a forced shutdown.
+        */
+       ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
+               XFS_FORCED_SHUTDOWN(mp));
+       xfs_perag_put(pag);
+       return 0;
+}
+
+/*
+ * Read in the agi to initialise the per-ag data in the mount structure
+ */
+int
+xfs_ialloc_pagi_init(
+       xfs_mount_t     *mp,            /* file system mount structure */
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_agnumber_t  agno)           /* allocation group number */
+{
+       xfs_buf_t       *bp = NULL;
+       int             error;
+
+       error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
+       if (error)
+               return error;
+       if (bp)
+               xfs_trans_brelse(tp, bp);
+       return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
new file mode 100644 (file)
index 0000000..95ad1c0
--- /dev/null
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_IALLOC_H__
+#define        __XFS_IALLOC_H__
+
+struct xfs_buf;
+struct xfs_dinode;
+struct xfs_imap;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_btree_cur;
+
+/* Move inodes in clusters of this size */
+#define        XFS_INODE_BIG_CLUSTER_SIZE      8192
+
+/* Calculate and return the number of filesystem blocks per inode cluster */
+static inline int
+xfs_icluster_size_fsb(
+       struct xfs_mount        *mp)
+{
+       if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size)
+               return 1;
+       return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
+}
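+
+/*
+ * For example (values assumed for illustration): with an 8192-byte inode
+ * cluster and 4096-byte blocks (sb_blocklog = 12), the cluster spans
+ * 8192 >> 12 = 2 filesystem blocks; with 8192-byte blocks the cluster
+ * fits inside one block and the function returns 1.
+ */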
+
+/*
+ * Make an inode pointer out of the buffer/offset.
+ */
+static inline struct xfs_dinode *
+xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
+{
+       return (struct xfs_dinode *)
+               xfs_buf_offset(b, o << mp->m_sb.sb_inodelog);
+}
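+
+/*
+ * For example (illustrative): with 256-byte inodes (sb_inodelog = 8),
+ * inode index 3 within the buffer maps to byte offset 3 << 8 = 768.
+ */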
+
+/*
+ * Allocate an inode on disk.
+ * Mode is used to tell whether the new inode will need space, and whether
+ * it is a directory.
+ *
+ * To work within the constraint of one allocation per transaction,
+ * xfs_dialloc() is designed to be called twice if it has to do an
+ * allocation to make more free inodes.  If an inode is available
+ * without an allocation, *agbp is set to NULL and *inop to the newly
+ * allocated inode number.  If an allocation was needed, *agbp is set
+ * to the buffer of the allocation group's inode header and *inop to
+ * NULLFSINO.  The caller should then commit the current transaction,
+ * allocate a new transaction, and call xfs_dialloc() again, passing
+ * in the agbp value returned from the previous call.
+ *
+ * Once we successfully pick an inode its number is returned and the
+ * on-disk data structures are updated.  The inode itself is not read
+ * in, since doing so would break ordering constraints with xfs_reclaim.
+ *
+ * *agbp should be set to NULL on the first call.
+ */
+int                                    /* error */
+xfs_dialloc(
+       struct xfs_trans *tp,           /* transaction pointer */
+       xfs_ino_t       parent,         /* parent inode (directory) */
+       umode_t         mode,           /* mode bits for new inode */
+       int             okalloc,        /* ok to allocate more space */
+       struct xfs_buf  **agbp,         /* buf for a.g. inode header */
+       xfs_ino_t       *inop);         /* inode number allocated */
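+
+/*
+ * A minimal caller sketch (hypothetical names; error handling and the
+ * details of rolling the transaction are elided):
+ *
+ *	struct xfs_buf	*agbp = NULL;
+ *	xfs_ino_t	ino;
+ *
+ *	error = xfs_dialloc(tp, dp->i_ino, mode, okalloc, &agbp, &ino);
+ *	if (!error && agbp) {
+ *		(commit tp, allocate a new transaction, then call
+ *		 xfs_dialloc() again with the returned agbp)
+ *	}
+ */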
+
+/*
+ * Free disk inode.  Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ */
+int                                    /* error */
+xfs_difree(
+       struct xfs_trans *tp,           /* transaction pointer */
+       xfs_ino_t       inode,          /* inode to be freed */
+       struct xfs_bmap_free *flist,    /* extents to free */
+       int             *deleted,       /* set if inode cluster was deleted */
+       xfs_ino_t       *first_ino);    /* first inode in deleted cluster */
+
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ */
+int
+xfs_imap(
+       struct xfs_mount *mp,           /* file system mount structure */
+       struct xfs_trans *tp,           /* transaction pointer */
+       xfs_ino_t       ino,            /* inode to locate */
+       struct xfs_imap *imap,          /* location map structure */
+       uint            flags);         /* flags for inode btree lookup */
+
+/*
+ * Compute and fill in value of m_in_maxlevels.
+ */
+void
+xfs_ialloc_compute_maxlevels(
+       struct xfs_mount *mp);          /* file system mount structure */
+
+/*
+ * Log specified fields for the ag hdr (inode section)
+ */
+void
+xfs_ialloc_log_agi(
+       struct xfs_trans *tp,           /* transaction pointer */
+       struct xfs_buf  *bp,            /* allocation group header buffer */
+       int             fields);        /* bitmask of fields to log */
+
+/*
+ * Read in the allocation group header (inode allocation section)
+ */
+int                                    /* error */
+xfs_ialloc_read_agi(
+       struct xfs_mount *mp,           /* file system mount structure */
+       struct xfs_trans *tp,           /* transaction pointer */
+       xfs_agnumber_t  agno,           /* allocation group number */
+       struct xfs_buf  **bpp);         /* allocation group hdr buf */
+
+/*
+ * Read in the allocation group header to initialise the per-ag data
+ * in the mount structure
+ */
+int
+xfs_ialloc_pagi_init(
+       struct xfs_mount *mp,           /* file system mount structure */
+       struct xfs_trans *tp,           /* transaction pointer */
+       xfs_agnumber_t  agno);          /* allocation group number */
+
+/*
+ * Lookup a record by ino in the btree given by cur.
+ */
+int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
+               xfs_lookup_t dir, int *stat);
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+               xfs_inobt_rec_incore_t *rec, int *stat);
+
+/*
+ * Inode chunk initialisation routine
+ */
+int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
+                         struct list_head *buffer_list,
+                         xfs_agnumber_t agno, xfs_agblock_t agbno,
+                         xfs_agblock_t length, unsigned int gen);
+
+#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
new file mode 100644 (file)
index 0000000..c9b06f3
--- /dev/null
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+
+STATIC int
+xfs_inobt_get_minrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       return cur->bc_mp->m_inobt_mnr[level != 0];
+}
+
+STATIC struct xfs_btree_cur *
+xfs_inobt_dup_cursor(
+       struct xfs_btree_cur    *cur)
+{
+       return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
+                       cur->bc_private.a.agbp, cur->bc_private.a.agno,
+                       cur->bc_btnum);
+}
+
+STATIC void
+xfs_inobt_set_root(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *nptr,
+       int                     inc)    /* level change */
+{
+       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
+
+       agi->agi_root = nptr->s;
+       be32_add_cpu(&agi->agi_level, inc);
+       xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
+}
+
+STATIC void
+xfs_finobt_set_root(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *nptr,
+       int                     inc)    /* level change */
+{
+       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
+
+       agi->agi_free_root = nptr->s;
+       be32_add_cpu(&agi->agi_free_level, inc);
+       xfs_ialloc_log_agi(cur->bc_tp, agbp,
+                          XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
+}
+
+STATIC int
+xfs_inobt_alloc_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *start,
+       union xfs_btree_ptr     *new,
+       int                     *stat)
+{
+       xfs_alloc_arg_t         args;           /* block allocation args */
+       int                     error;          /* error return value */
+       xfs_agblock_t           sbno = be32_to_cpu(start->s);
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+       memset(&args, 0, sizeof(args));
+       args.tp = cur->bc_tp;
+       args.mp = cur->bc_mp;
+       args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
+       args.minlen = 1;
+       args.maxlen = 1;
+       args.prod = 1;
+       args.type = XFS_ALLOCTYPE_NEAR_BNO;
+
+       error = xfs_alloc_vextent(&args);
+       if (error) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+               return error;
+       }
+       if (args.fsbno == NULLFSBLOCK) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
+       }
+       ASSERT(args.len == 1);
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+
+       new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
+       *stat = 1;
+       return 0;
+}
+
+STATIC int
+xfs_inobt_free_block(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp)
+{
+       xfs_fsblock_t           fsbno;
+       int                     error;
+
+       fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
+       error = xfs_free_extent(cur->bc_tp, fsbno, 1);
+       if (error)
+               return error;
+
+       xfs_trans_binval(cur->bc_tp, bp);
+       return error;
+}
+
+STATIC int
+xfs_inobt_get_maxrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       return cur->bc_mp->m_inobt_mxr[level != 0];
+}
+
+STATIC void
+xfs_inobt_init_key_from_rec(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
+{
+       key->inobt.ir_startino = rec->inobt.ir_startino;
+}
+
+STATIC void
+xfs_inobt_init_rec_from_key(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
+{
+       rec->inobt.ir_startino = key->inobt.ir_startino;
+}
+
+STATIC void
+xfs_inobt_init_rec_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec)
+{
+       rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
+       rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+       rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
+}
+
+/*
+ * initial value of ptr for lookup
+ */
+STATIC void
+xfs_inobt_init_ptr_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       struct xfs_agi          *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+
+       ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+
+       ptr->s = agi->agi_root;
+}
+
+STATIC void
+xfs_finobt_init_ptr_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       struct xfs_agi          *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+
+       ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+       ptr->s = agi->agi_free_root;
+}
+
+STATIC __int64_t
+xfs_inobt_key_diff(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key)
+{
+       return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
+                         cur->bc_rec.i.ir_startino;
+}
+
+static int
+xfs_inobt_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+       struct xfs_perag        *pag = bp->b_pag;
+       unsigned int            level;
+
+       /*
+        * During growfs operations, we can't verify the exact owner as the
+        * perag is not fully initialised and hence not attached to the buffer.
+        *
+        * Similarly, during log recovery we will have a perag structure
+        * attached, but the agi information will not yet have been initialised
+        * from the on disk AGI. We don't currently use any of this information,
+        * but beware of the landmine (i.e. need to check pag->pagi_init) if we
+        * ever do.
+        */
+       switch (block->bb_magic) {
+       case cpu_to_be32(XFS_IBT_CRC_MAGIC):
+       case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
+               if (!xfs_sb_version_hascrc(&mp->m_sb))
+                       return false;
+               if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+                       return false;
+               if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+                       return false;
+               if (pag &&
+                   be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+                       return false;
+               /* fall through */
+       case cpu_to_be32(XFS_IBT_MAGIC):
+       case cpu_to_be32(XFS_FIBT_MAGIC):
+               break;
+       default:
+               return false;
+       }
+
+       /* numrecs and level verification */
+       level = be16_to_cpu(block->bb_level);
+       if (level >= mp->m_in_maxlevels)
+               return false;
+       if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0])
+               return false;
+
+       /* sibling pointer verification */
+       if (!block->bb_u.s.bb_leftsib ||
+           (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+            block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+               return false;
+       if (!block->bb_u.s.bb_rightsib ||
+           (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+            block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+               return false;
+
+       return true;
+}
+
+static void
+xfs_inobt_read_verify(
+       struct xfs_buf  *bp)
+{
+       if (!xfs_btree_sblock_verify_crc(bp))
+               xfs_buf_ioerror(bp, -EFSBADCRC);
+       else if (!xfs_inobt_verify(bp))
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+       if (bp->b_error) {
+               trace_xfs_btree_corrupt(bp, _RET_IP_);
+               xfs_verifier_error(bp);
+       }
+}
+
+static void
+xfs_inobt_write_verify(
+       struct xfs_buf  *bp)
+{
+       if (!xfs_inobt_verify(bp)) {
+               trace_xfs_btree_corrupt(bp, _RET_IP_);
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               return;
+       }
+       xfs_btree_sblock_calc_crc(bp);
+}
+
+const struct xfs_buf_ops xfs_inobt_buf_ops = {
+       .verify_read = xfs_inobt_read_verify,
+       .verify_write = xfs_inobt_write_verify,
+};
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_inobt_keys_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *k1,
+       union xfs_btree_key     *k2)
+{
+       return be32_to_cpu(k1->inobt.ir_startino) <
+               be32_to_cpu(k2->inobt.ir_startino);
+}
+
+STATIC int
+xfs_inobt_recs_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *r1,
+       union xfs_btree_rec     *r2)
+{
+       return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
+               be32_to_cpu(r2->inobt.ir_startino);
+}
+#endif /* DEBUG || XFS_WARN */
+
+static const struct xfs_btree_ops xfs_inobt_ops = {
+       .rec_len                = sizeof(xfs_inobt_rec_t),
+       .key_len                = sizeof(xfs_inobt_key_t),
+
+       .dup_cursor             = xfs_inobt_dup_cursor,
+       .set_root               = xfs_inobt_set_root,
+       .alloc_block            = xfs_inobt_alloc_block,
+       .free_block             = xfs_inobt_free_block,
+       .get_minrecs            = xfs_inobt_get_minrecs,
+       .get_maxrecs            = xfs_inobt_get_maxrecs,
+       .init_key_from_rec      = xfs_inobt_init_key_from_rec,
+       .init_rec_from_key      = xfs_inobt_init_rec_from_key,
+       .init_rec_from_cur      = xfs_inobt_init_rec_from_cur,
+       .init_ptr_from_cur      = xfs_inobt_init_ptr_from_cur,
+       .key_diff               = xfs_inobt_key_diff,
+       .buf_ops                = &xfs_inobt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+       .keys_inorder           = xfs_inobt_keys_inorder,
+       .recs_inorder           = xfs_inobt_recs_inorder,
+#endif
+};
+
+static const struct xfs_btree_ops xfs_finobt_ops = {
+       .rec_len                = sizeof(xfs_inobt_rec_t),
+       .key_len                = sizeof(xfs_inobt_key_t),
+
+       .dup_cursor             = xfs_inobt_dup_cursor,
+       .set_root               = xfs_finobt_set_root,
+       .alloc_block            = xfs_inobt_alloc_block,
+       .free_block             = xfs_inobt_free_block,
+       .get_minrecs            = xfs_inobt_get_minrecs,
+       .get_maxrecs            = xfs_inobt_get_maxrecs,
+       .init_key_from_rec      = xfs_inobt_init_key_from_rec,
+       .init_rec_from_key      = xfs_inobt_init_rec_from_key,
+       .init_rec_from_cur      = xfs_inobt_init_rec_from_cur,
+       .init_ptr_from_cur      = xfs_finobt_init_ptr_from_cur,
+       .key_diff               = xfs_inobt_key_diff,
+       .buf_ops                = &xfs_inobt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+       .keys_inorder           = xfs_inobt_keys_inorder,
+       .recs_inorder           = xfs_inobt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new inode btree cursor.
+ */
+struct xfs_btree_cur *                         /* new inode btree cursor */
+xfs_inobt_init_cursor(
+       struct xfs_mount        *mp,            /* file system mount point */
+       struct xfs_trans        *tp,            /* transaction pointer */
+       struct xfs_buf          *agbp,          /* buffer for agi structure */
+       xfs_agnumber_t          agno,           /* allocation group number */
+       xfs_btnum_t             btnum)          /* ialloc or free ino btree */
+{
+       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
+       struct xfs_btree_cur    *cur;
+
+       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+       cur->bc_tp = tp;
+       cur->bc_mp = mp;
+       cur->bc_btnum = btnum;
+       if (btnum == XFS_BTNUM_INO) {
+               cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+               cur->bc_ops = &xfs_inobt_ops;
+       } else {
+               cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+               cur->bc_ops = &xfs_finobt_ops;
+       }
+
+       cur->bc_blocklog = mp->m_sb.sb_blocklog;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb))
+               cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+       cur->bc_private.a.agbp = agbp;
+       cur->bc_private.a.agno = agno;
+
+       return cur;
+}
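+
+/*
+ * Typical use (a sketch, assuming the AGI buffer was already read via
+ * xfs_ialloc_read_agi()):
+ *
+ *	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+ *	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, &stat);
+ *	...
+ *	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ */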
+
+/*
+ * Calculate number of records in an inobt btree block.
+ */
+int
+xfs_inobt_maxrecs(
+       struct xfs_mount        *mp,
+       int                     blocklen,
+       int                     leaf)
+{
+       blocklen -= XFS_INOBT_BLOCK_LEN(mp);
+
+       if (leaf)
+               return blocklen / sizeof(xfs_inobt_rec_t);
+       return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
+}
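+
+/*
+ * Worked example (assuming 4096-byte blocks): without CRCs the short-form
+ * header takes XFS_BTREE_SBLOCK_LEN (16) bytes, leaving 4080: 255 leaf
+ * records of 16 bytes each, or 510 node entries of a 4-byte key plus a
+ * 4-byte pointer.  With CRCs the 56-byte header leaves 4040 bytes, giving
+ * 252 leaf records or 505 node entries.
+ */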
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
new file mode 100644 (file)
index 0000000..d7ebea7
--- /dev/null
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_IALLOC_BTREE_H__
+#define        __XFS_IALLOC_BTREE_H__
+
+/*
+ * Inode map on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+#define XFS_INOBT_BLOCK_LEN(mp) \
+       (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+               XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_INOBT_REC_ADDR(mp, block, index) \
+       ((xfs_inobt_rec_t *) \
+               ((char *)(block) + \
+                XFS_INOBT_BLOCK_LEN(mp) + \
+                (((index) - 1) * sizeof(xfs_inobt_rec_t))))
+
+#define XFS_INOBT_KEY_ADDR(mp, block, index) \
+       ((xfs_inobt_key_t *) \
+               ((char *)(block) + \
+                XFS_INOBT_BLOCK_LEN(mp) + \
+                ((index) - 1) * sizeof(xfs_inobt_key_t)))
+
+#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
+       ((xfs_inobt_ptr_t *) \
+               ((char *)(block) + \
+                XFS_INOBT_BLOCK_LEN(mp) + \
+                (maxrecs) * sizeof(xfs_inobt_key_t) + \
+                ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
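+
+/*
+ * For example (illustrative): in a non-CRC node block (16-byte header)
+ * with maxrecs = 510, XFS_INOBT_KEY_ADDR(mp, block, 1) points at byte
+ * offset 16, while XFS_INOBT_PTR_ADDR(mp, block, 1, 510) points at
+ * 16 + 510 * 4 = 2056, i.e. the pointers start after all the keys.
+ */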
+
+extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
+               struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t,
+               xfs_btnum_t);
+extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
+
+#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
new file mode 100644 (file)
index 0000000..f18fd2d
--- /dev/null
@@ -0,0 +1,479 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_icache.h"
+#include "xfs_trans.h"
+#include "xfs_ialloc.h"
+#include "xfs_dinode.h"
+
+/*
+ * Check that none of the inodes in the buffer have a next
+ * unlinked field of 0.
+ */
+#if defined(DEBUG)
+void
+xfs_inobp_check(
+       xfs_mount_t     *mp,
+       xfs_buf_t       *bp)
+{
+       int             i;
+       int             j;
+       xfs_dinode_t    *dip;
+
+       j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+
+       for (i = 0; i < j; i++) {
+               dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+                                       i * mp->m_sb.sb_inodesize);
+               if (!dip->di_next_unlinked) {
+                       xfs_alert(mp,
+       "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
+                               i, (long long)bp->b_bn);
+               }
+       }
+}
+#endif
+
+/*
+ * If we are doing readahead on an inode buffer, we might be in log recovery
+ * reading an inode allocation buffer that hasn't yet been replayed, and hence
+ * has not had the inode cores stamped into it. Hence for readahead, the buffer
+ * may be potentially invalid.
+ *
+ * If the readahead buffer is invalid, we don't want to mark it with an error,
+ * but we do want to clear the DONE status of the buffer so that a followup read
+ * will re-read it from disk. This ensures that we don't get unnecessary
+ * warnings during log recovery and we don't get unnecessary panics on
+ * debug kernels.
+ */
+static void
+xfs_inode_buf_verify(
+       struct xfs_buf  *bp,
+       bool            readahead)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       int             i;
+       int             ni;
+
+       /*
+        * Validate the magic number and version of every inode in the buffer
+        */
+       ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+       for (i = 0; i < ni; i++) {
+               int             di_ok;
+               xfs_dinode_t    *dip;
+
+               dip = (struct xfs_dinode *)xfs_buf_offset(bp,
+                                       (i << mp->m_sb.sb_inodelog));
+               di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+                           XFS_DINODE_GOOD_VERSION(dip->di_version);
+               if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+                                               XFS_ERRTAG_ITOBP_INOTOBP,
+                                               XFS_RANDOM_ITOBP_INOTOBP))) {
+                       if (readahead) {
+                               bp->b_flags &= ~XBF_DONE;
+                               return;
+                       }
+
+                       xfs_buf_ioerror(bp, -EFSCORRUPTED);
+                       xfs_verifier_error(bp);
+#ifdef DEBUG
+                       xfs_alert(mp,
+                               "bad inode magic/vsn daddr %lld #%d (magic=%x)",
+                               (unsigned long long)bp->b_bn, i,
+                               be16_to_cpu(dip->di_magic));
+#endif
+               }
+       }
+       xfs_inobp_check(mp, bp);
+}
+
+static void
+xfs_inode_buf_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_inode_buf_verify(bp, false);
+}
+
+static void
+xfs_inode_buf_readahead_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_inode_buf_verify(bp, true);
+}
+
+static void
+xfs_inode_buf_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_inode_buf_verify(bp, false);
+}
+
+const struct xfs_buf_ops xfs_inode_buf_ops = {
+       .verify_read = xfs_inode_buf_read_verify,
+       .verify_write = xfs_inode_buf_write_verify,
+};
+
+const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
+       .verify_read = xfs_inode_buf_readahead_verify,
+       .verify_write = xfs_inode_buf_write_verify,
+};
+
+/*
+ * This routine is called to map an inode to the buffer containing the on-disk
+ * version of the inode.  It returns a pointer to the buffer containing the
+ * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
+ * pointer to the on-disk inode within that buffer.
+ *
+ * If a non-zero error is returned, then the contents of bpp and dipp are
+ * undefined.
+ */
+int
+xfs_imap_to_bp(
+       struct xfs_mount        *mp,
+       struct xfs_trans        *tp,
+       struct xfs_imap         *imap,
+       struct xfs_dinode       **dipp,
+       struct xfs_buf          **bpp,
+       uint                    buf_flags,
+       uint                    iget_flags)
+{
+       struct xfs_buf          *bp;
+       int                     error;
+
+       buf_flags |= XBF_UNMAPPED;
+       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+                                  (int)imap->im_len, buf_flags, &bp,
+                                  &xfs_inode_buf_ops);
+       if (error) {
+               if (error == -EAGAIN) {
+                       ASSERT(buf_flags & XBF_TRYLOCK);
+                       return error;
+               }
+
+               if (error == -EFSCORRUPTED &&
+                   (iget_flags & XFS_IGET_UNTRUSTED))
+                       return -EINVAL;
+
+               xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
+                       __func__, error);
+               return error;
+       }
+
+       *bpp = bp;
+       *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
+       return 0;
+}
+
+void
+xfs_dinode_from_disk(
+       xfs_icdinode_t          *to,
+       xfs_dinode_t            *from)
+{
+       to->di_magic = be16_to_cpu(from->di_magic);
+       to->di_mode = be16_to_cpu(from->di_mode);
+       to->di_version = from->di_version;
+       to->di_format = from->di_format;
+       to->di_onlink = be16_to_cpu(from->di_onlink);
+       to->di_uid = be32_to_cpu(from->di_uid);
+       to->di_gid = be32_to_cpu(from->di_gid);
+       to->di_nlink = be32_to_cpu(from->di_nlink);
+       to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
+       to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
+       memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+       to->di_flushiter = be16_to_cpu(from->di_flushiter);
+       to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
+       to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
+       to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
+       to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
+       to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
+       to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
+       to->di_size = be64_to_cpu(from->di_size);
+       to->di_nblocks = be64_to_cpu(from->di_nblocks);
+       to->di_extsize = be32_to_cpu(from->di_extsize);
+       to->di_nextents = be32_to_cpu(from->di_nextents);
+       to->di_anextents = be16_to_cpu(from->di_anextents);
+       to->di_forkoff = from->di_forkoff;
+       to->di_aformat  = from->di_aformat;
+       to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
+       to->di_dmstate  = be16_to_cpu(from->di_dmstate);
+       to->di_flags    = be16_to_cpu(from->di_flags);
+       to->di_gen      = be32_to_cpu(from->di_gen);
+
+       if (to->di_version == 3) {
+               to->di_changecount = be64_to_cpu(from->di_changecount);
+               to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
+               to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
+               to->di_flags2 = be64_to_cpu(from->di_flags2);
+               to->di_ino = be64_to_cpu(from->di_ino);
+               to->di_lsn = be64_to_cpu(from->di_lsn);
+               memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+               uuid_copy(&to->di_uuid, &from->di_uuid);
+       }
+}
+
+void
+xfs_dinode_to_disk(
+       xfs_dinode_t            *to,
+       xfs_icdinode_t          *from)
+{
+       to->di_magic = cpu_to_be16(from->di_magic);
+       to->di_mode = cpu_to_be16(from->di_mode);
+       to->di_version = from->di_version;
+       to->di_format = from->di_format;
+       to->di_onlink = cpu_to_be16(from->di_onlink);
+       to->di_uid = cpu_to_be32(from->di_uid);
+       to->di_gid = cpu_to_be32(from->di_gid);
+       to->di_nlink = cpu_to_be32(from->di_nlink);
+       to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+       to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+       memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+       to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
+       to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
+       to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
+       to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
+       to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
+       to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
+       to->di_size = cpu_to_be64(from->di_size);
+       to->di_nblocks = cpu_to_be64(from->di_nblocks);
+       to->di_extsize = cpu_to_be32(from->di_extsize);
+       to->di_nextents = cpu_to_be32(from->di_nextents);
+       to->di_anextents = cpu_to_be16(from->di_anextents);
+       to->di_forkoff = from->di_forkoff;
+       to->di_aformat = from->di_aformat;
+       to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+       to->di_dmstate = cpu_to_be16(from->di_dmstate);
+       to->di_flags = cpu_to_be16(from->di_flags);
+       to->di_gen = cpu_to_be32(from->di_gen);
+
+       if (from->di_version == 3) {
+               to->di_changecount = cpu_to_be64(from->di_changecount);
+               to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
+               to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+               to->di_flags2 = cpu_to_be64(from->di_flags2);
+               to->di_ino = cpu_to_be64(from->di_ino);
+               to->di_lsn = cpu_to_be64(from->di_lsn);
+               memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+               uuid_copy(&to->di_uuid, &from->di_uuid);
+               to->di_flushiter = 0;
+       } else {
+               to->di_flushiter = cpu_to_be16(from->di_flushiter);
+       }
+}
+
+static bool
+xfs_dinode_verify(
+       struct xfs_mount        *mp,
+       struct xfs_inode        *ip,
+       struct xfs_dinode       *dip)
+{
+       if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
+               return false;
+
+       /* only version 3 or greater inodes are extensively verified here */
+       if (dip->di_version < 3)
+               return true;
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return false;
+       if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+                             XFS_DINODE_CRC_OFF))
+               return false;
+       if (be64_to_cpu(dip->di_ino) != ip->i_ino)
+               return false;
+       if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
+               return false;
+       return true;
+}
+
+void
+xfs_dinode_calc_crc(
+       struct xfs_mount        *mp,
+       struct xfs_dinode       *dip)
+{
+       __uint32_t              crc;
+
+       if (dip->di_version < 3)
+               return;
+
+       ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
+       crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
+                             XFS_DINODE_CRC_OFF);
+       dip->di_crc = xfs_end_cksum(crc);
+}
+
+/*
+ * Read the disk inode attributes into the in-core inode structure.
+ *
+ * For version 5 superblocks, if we are initialising a new inode and we are not
+ * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simply build the new
+ * inode core with a random generation number. If we are keeping inodes around,
+ * we need to read the inode cluster to get the existing generation number off
+ * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
+ * format) then log recovery is dependent on the di_flushiter field being
+ * initialised from the current on-disk value and hence we must also read the
+ * inode off disk.
+ */
+int
+xfs_iread(
+       xfs_mount_t     *mp,
+       xfs_trans_t     *tp,
+       xfs_inode_t     *ip,
+       uint            iget_flags)
+{
+       xfs_buf_t       *bp;
+       xfs_dinode_t    *dip;
+       int             error;
+
+       /*
+        * Fill in the location information in the in-core inode.
+        */
+       error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
+       if (error)
+               return error;
+
+       /* shortcut IO on inode allocation if possible */
+       if ((iget_flags & XFS_IGET_CREATE) &&
+           xfs_sb_version_hascrc(&mp->m_sb) &&
+           !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+               /* initialise the on-disk inode core */
+               memset(&ip->i_d, 0, sizeof(ip->i_d));
+               ip->i_d.di_magic = XFS_DINODE_MAGIC;
+               ip->i_d.di_gen = prandom_u32();
+               if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                       ip->i_d.di_version = 3;
+                       ip->i_d.di_ino = ip->i_ino;
+                       uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+               } else
+                       ip->i_d.di_version = 2;
+               return 0;
+       }
+
+       /*
+        * Get pointers to the on-disk inode and the buffer containing it.
+        */
+       error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
+       if (error)
+               return error;
+
+       /* even unallocated inodes are verified */
+       if (!xfs_dinode_verify(mp, ip, dip)) {
+               xfs_alert(mp, "%s: validation failed for inode %lld",
+                               __func__, ip->i_ino);
+
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
+               error = -EFSCORRUPTED;
+               goto out_brelse;
+       }
+
+       /*
+        * If the on-disk inode is already linked to a directory
+        * entry, copy all of the inode into the in-core inode.
+        * xfs_iformat_fork() handles copying in the inode format
+        * specific information.
+        * Otherwise, just get the truly permanent information.
+        */
+       if (dip->di_mode) {
+               xfs_dinode_from_disk(&ip->i_d, dip);
+               error = xfs_iformat_fork(ip, dip);
+               if (error) {
+#ifdef DEBUG
+                       xfs_alert(mp, "%s: xfs_iformat() returned error %d",
+                               __func__, error);
+#endif /* DEBUG */
+                       goto out_brelse;
+               }
+       } else {
+               /*
+                * Partial initialisation of the in-core inode. Just the bits
+                * that xfs_ialloc won't overwrite or that it relies on being correct.
+                */
+               ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
+               ip->i_d.di_version = dip->di_version;
+               ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
+               ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
+
+               if (dip->di_version == 3) {
+                       ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
+                       uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
+               }
+
+               /*
+                * Make sure to pull in the mode here as well in
+                * case the inode is released without being used.
+                * This ensures that xfs_inactive() will see that
+                * the inode is already free and not try to mess
+                * with the uninitialized part of it.
+                */
+               ip->i_d.di_mode = 0;
+       }
+
+       /*
+        * Automatically convert version 1 inode formats in memory to version 2
+        * inode format. If the inode is modified, it will get logged and
+        * rewritten as a version 2 inode. We can do this because we set the
+        * superblock feature bit for v2 inodes unconditionally during mount
+        * and it means the rest of the code can assume the inode version is 2
+        * or higher.
+        */
+       if (ip->i_d.di_version == 1) {
+               ip->i_d.di_version = 2;
+               memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
+               ip->i_d.di_nlink = ip->i_d.di_onlink;
+               ip->i_d.di_onlink = 0;
+               xfs_set_projid(ip, 0);
+       }
+
+       ip->i_delayed_blks = 0;
+
+       /*
+        * Mark the buffer containing the inode as something to keep
+        * around for a while.  This helps to keep recently accessed
+        * meta-data in-core longer.
+        */
+       xfs_buf_set_ref(bp, XFS_INO_REF);
+
+       /*
+        * Use xfs_trans_brelse() to release the buffer containing the on-disk
+        * inode, because it was acquired with xfs_trans_read_buf() in
+        * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
+        * brelse().  If we're within a transaction, then xfs_trans_brelse()
+        * will only release the buffer if it is not dirty within the
+        * transaction.  It will be OK to release the buffer in this case,
+        * because inodes on disk are never destroyed and we will be locking the
+        * new in-core inode before putting it in the cache where other
+        * processes can find it.  Thus we don't have to worry about the inode
+        * being changed just because we released the buffer.
+        */
+ out_brelse:
+       xfs_trans_brelse(tp, bp);
+       return error;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
new file mode 100644 (file)
index 0000000..9308c47
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef        __XFS_INODE_BUF_H__
+#define        __XFS_INODE_BUF_H__
+
+struct xfs_inode;
+struct xfs_dinode;
+struct xfs_icdinode;
+
+/*
+ * Inode location information.  Stored in the inode and passed to
+ * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
+ */
+struct xfs_imap {
+       xfs_daddr_t     im_blkno;       /* starting BB of inode chunk */
+       ushort          im_len;         /* length in BBs of inode chunk */
+       ushort          im_boffset;     /* inode offset in block in bytes */
+};
+
+int    xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
+                      struct xfs_imap *, struct xfs_dinode **,
+                      struct xfs_buf **, uint, uint);
+int    xfs_iread(struct xfs_mount *, struct xfs_trans *,
+                 struct xfs_inode *, uint);
+void   xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
+void   xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from);
+void   xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from);
+
+#if defined(DEBUG)
+void   xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
+#else
+#define        xfs_inobp_check(mp, bp)
+#endif /* DEBUG */
+
+#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
new file mode 100644 (file)
index 0000000..6a00f7f
--- /dev/null
@@ -0,0 +1,1906 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <linux/log2.h>
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+
+kmem_zone_t *xfs_ifork_zone;
+
+STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
+STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
+STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+
+#ifdef DEBUG
+/*
+ * Make sure that the extents in the given memory buffer
+ * are valid.
+ */
+void
+xfs_validate_extents(
+       xfs_ifork_t             *ifp,
+       int                     nrecs,
+       xfs_exntfmt_t           fmt)
+{
+       xfs_bmbt_irec_t         irec;
+       xfs_bmbt_rec_host_t     rec;
+       int                     i;
+
+       for (i = 0; i < nrecs; i++) {
+               xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+               rec.l0 = get_unaligned(&ep->l0);
+               rec.l1 = get_unaligned(&ep->l1);
+               xfs_bmbt_get_all(&rec, &irec);
+               if (fmt == XFS_EXTFMT_NOSTATE)
+                       ASSERT(irec.br_state == XFS_EXT_NORM);
+       }
+}
+#else /* DEBUG */
+#define xfs_validate_extents(ifp, nrecs, fmt)
+#endif /* DEBUG */
+
+
+/*
+ * Move inode type and inode format specific information from the
+ * on-disk inode to the in-core inode.  For fifos, devs, and sockets
+ * this means set if_rdev to the proper value.  For files, directories,
+ * and symlinks this means to bring in the in-line data or extent
+ * pointers.  For a file in B-tree format, only the root is immediately
+ * brought in-core.  The rest will be in-lined in if_extents when it
+ * is first referenced (see xfs_iread_extents()).
+ */
+int
+xfs_iformat_fork(
+       xfs_inode_t             *ip,
+       xfs_dinode_t            *dip)
+{
+       xfs_attr_shortform_t    *atp;
+       int                     size;
+       int                     error = 0;
+       xfs_fsize_t             di_size;
+
+       if (unlikely(be32_to_cpu(dip->di_nextents) +
+                    be16_to_cpu(dip->di_anextents) >
+                    be64_to_cpu(dip->di_nblocks))) {
+               xfs_warn(ip->i_mount,
+                       "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
+                       (unsigned long long)ip->i_ino,
+                       (int)(be32_to_cpu(dip->di_nextents) +
+                             be16_to_cpu(dip->di_anextents)),
+                       (unsigned long long)
+                               be64_to_cpu(dip->di_nblocks));
+               XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
+                                    ip->i_mount, dip);
+               return -EFSCORRUPTED;
+       }
+
+       if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
+               xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
+                       (unsigned long long)ip->i_ino,
+                       dip->di_forkoff);
+               XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
+                                    ip->i_mount, dip);
+               return -EFSCORRUPTED;
+       }
+
+       if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+                    !ip->i_mount->m_rtdev_targp)) {
+               xfs_warn(ip->i_mount,
+                       "corrupt dinode %Lu, has realtime flag set.",
+                       ip->i_ino);
+               XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
+                                    XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+               return -EFSCORRUPTED;
+       }
+
+       switch (ip->i_d.di_mode & S_IFMT) {
+       case S_IFIFO:
+       case S_IFCHR:
+       case S_IFBLK:
+       case S_IFSOCK:
+               if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
+                       XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
+                                             ip->i_mount, dip);
+                       return -EFSCORRUPTED;
+               }
+               ip->i_d.di_size = 0;
+               ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
+               break;
+
+       case S_IFREG:
+       case S_IFLNK:
+       case S_IFDIR:
+               switch (dip->di_format) {
+               case XFS_DINODE_FMT_LOCAL:
+                       /*
+                        * no local regular files yet
+                        */
+                       if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
+                               xfs_warn(ip->i_mount,
+                       "corrupt inode %Lu (local format for regular file).",
+                                       (unsigned long long) ip->i_ino);
+                               XFS_CORRUPTION_ERROR("xfs_iformat(4)",
+                                                    XFS_ERRLEVEL_LOW,
+                                                    ip->i_mount, dip);
+                               return -EFSCORRUPTED;
+                       }
+
+                       di_size = be64_to_cpu(dip->di_size);
+                       if (unlikely(di_size < 0 ||
+                                    di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
+                               xfs_warn(ip->i_mount,
+                       "corrupt inode %Lu (bad size %Ld for local inode).",
+                                       (unsigned long long) ip->i_ino,
+                                       (long long) di_size);
+                               XFS_CORRUPTION_ERROR("xfs_iformat(5)",
+                                                    XFS_ERRLEVEL_LOW,
+                                                    ip->i_mount, dip);
+                               return -EFSCORRUPTED;
+                       }
+
+                       size = (int)di_size;
+                       error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
+                       break;
+               case XFS_DINODE_FMT_EXTENTS:
+                       error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
+                       break;
+               case XFS_DINODE_FMT_BTREE:
+                       error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+                       break;
+               default:
+                       XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
+                                        ip->i_mount);
+                       return -EFSCORRUPTED;
+               }
+               break;
+
+       default:
+               XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
+               return -EFSCORRUPTED;
+       }
+       if (error) {
+               return error;
+       }
+       if (!XFS_DFORK_Q(dip))
+               return 0;
+
+       ASSERT(ip->i_afp == NULL);
+       ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
+
+       switch (dip->di_aformat) {
+       case XFS_DINODE_FMT_LOCAL:
+               atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
+               size = be16_to_cpu(atp->hdr.totsize);
+
+               if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
+                       xfs_warn(ip->i_mount,
+                               "corrupt inode %Lu (bad attr fork size %Ld).",
+                               (unsigned long long) ip->i_ino,
+                               (long long) size);
+                       XFS_CORRUPTION_ERROR("xfs_iformat(8)",
+                                            XFS_ERRLEVEL_LOW,
+                                            ip->i_mount, dip);
+                       return -EFSCORRUPTED;
+               }
+
+               error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
+               break;
+       case XFS_DINODE_FMT_EXTENTS:
+               error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
+               break;
+       case XFS_DINODE_FMT_BTREE:
+               error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
+               break;
+       default:
+               error = -EFSCORRUPTED;
+               break;
+       }
+       if (error) {
+               kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+               ip->i_afp = NULL;
+               xfs_idestroy_fork(ip, XFS_DATA_FORK);
+       }
+       return error;
+}
+
+/*
+ * The file is in-lined in the on-disk inode.
+ * If it fits into if_inline_data, then copy
+ * it there, otherwise allocate a buffer for it
+ * and copy the data there.  Either way, set
+ * if_data to point at the data.
+ * If we allocate a buffer for the data, make
+ * sure that its size is a multiple of 4 and
+ * record the real size in i_real_bytes.
+ */
+STATIC int
+xfs_iformat_local(
+       xfs_inode_t     *ip,
+       xfs_dinode_t    *dip,
+       int             whichfork,
+       int             size)
+{
+       xfs_ifork_t     *ifp;
+       int             real_size;
+
+       /*
+        * If the size is unreasonable, then something
+        * is wrong and we just bail out rather than crash in
+        * kmem_alloc() or memcpy() below.
+        */
+       if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+               xfs_warn(ip->i_mount,
+       "corrupt inode %Lu (bad size %d for local fork, size = %d).",
+                       (unsigned long long) ip->i_ino, size,
+                       XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
+               XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
+                                    ip->i_mount, dip);
+               return -EFSCORRUPTED;
+       }
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       real_size = 0;
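+       /*
+        * Pick storage for the fork data: nothing for an empty fork, the
+        * inline buffer when the data fits, otherwise a heap buffer rounded
+        * up to a multiple of 4 bytes so the region can be logged.
+        */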
+       if (size == 0)
+               ifp->if_u1.if_data = NULL;
+       else if (size <= sizeof(ifp->if_u2.if_inline_data))
+               ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+       else {
+               real_size = roundup(size, 4);
+               ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+       }
+       ifp->if_bytes = size;
+       ifp->if_real_bytes = real_size;
+       if (size)
+               memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
+       ifp->if_flags &= ~XFS_IFEXTENTS;
+       ifp->if_flags |= XFS_IFINLINE;
+       return 0;
+}
+
+/*
+ * The file consists of a set of extents all
+ * of which fit into the on-disk inode.
+ * If there are few enough extents to fit into
+ * the if_inline_ext, then copy them there.
+ * Otherwise allocate a buffer for them and copy
+ * them into it.  Either way, set if_extents
+ * to point at the extents.
+ */
+STATIC int
+xfs_iformat_extents(
+       xfs_inode_t     *ip,
+       xfs_dinode_t    *dip,
+       int             whichfork)
+{
+       xfs_bmbt_rec_t  *dp;
+       xfs_ifork_t     *ifp;
+       int             nex;
+       int             size;
+       int             i;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+       size = nex * (uint)sizeof(xfs_bmbt_rec_t);
+
+       /*
+        * If the number of extents is unreasonable, then something
+        * is wrong and we just bail out rather than crash in
+        * kmem_alloc() or memcpy() below.
+        */
+       if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+               xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
+                       (unsigned long long) ip->i_ino, nex);
+               XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
+                                    ip->i_mount, dip);
+               return -EFSCORRUPTED;
+       }
+
+       ifp->if_real_bytes = 0;
+       if (nex == 0)
+               ifp->if_u1.if_extents = NULL;
+       else if (nex <= XFS_INLINE_EXTS)
+               ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+       else
+               xfs_iext_add(ifp, 0, nex);
+
+       ifp->if_bytes = size;
+       if (size) {
+               dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
+               xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
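+               /* Copy each on-disk extent, converting from big-endian. */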
+               for (i = 0; i < nex; i++, dp++) {
+                       xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+                       ep->l0 = get_unaligned_be64(&dp->l0);
+                       ep->l1 = get_unaligned_be64(&dp->l1);
+               }
+               XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
+               if (whichfork != XFS_DATA_FORK ||
+                   XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) {
+                       if (unlikely(xfs_check_nostate_extents(ifp, 0, nex))) {
+                               XFS_ERROR_REPORT("xfs_iformat_extents(2)",
+                                                XFS_ERRLEVEL_LOW,
+                                                ip->i_mount);
+                               return -EFSCORRUPTED;
+                       }
+               }
+       }
+       ifp->if_flags |= XFS_IFEXTENTS;
+       return 0;
+}
+
+/*
+ * The file has too many extents to fit into
+ * the inode, so they are in B-tree format.
+ * Allocate a buffer for the root of the B-tree
+ * and copy the root into it.  The i_extents
+ * field will remain NULL until all of the
+ * extents are read in (when they are needed).
+ */
+STATIC int
+xfs_iformat_btree(
+       xfs_inode_t             *ip,
+       xfs_dinode_t            *dip,
+       int                     whichfork)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_bmdr_block_t        *dfp;
+       xfs_ifork_t             *ifp;
+       /* REFERENCED */
+       int                     nrecs;
+       int                     size;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
+       size = XFS_BMAP_BROOT_SPACE(mp, dfp);
+       nrecs = be16_to_cpu(dfp->bb_numrecs);
+
+       /*
+        * blow out if -- fork has fewer extents than can fit in
+        * fork (fork shouldn't be a btree format), root btree
+        * block has more records than can fit into the fork,
+        * or the number of extents is greater than the number of
+        * blocks.
+        */
+       if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
+                                       XFS_IFORK_MAXEXT(ip, whichfork) ||
+                    XFS_BMDR_SPACE_CALC(nrecs) >
+                                       XFS_DFORK_SIZE(dip, mp, whichfork) ||
+                    XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+               xfs_warn(mp, "corrupt inode %Lu (btree).",
+                                       (unsigned long long) ip->i_ino);
+               XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
+                                        mp, dip);
+               return -EFSCORRUPTED;
+       }
+
+       ifp->if_broot_bytes = size;
+       ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
+       ASSERT(ifp->if_broot != NULL);
+       /*
+        * Copy and convert from the on-disk structure
+        * to the in-memory structure.
+        */
+       xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+                        ifp->if_broot, size);
+       ifp->if_flags &= ~XFS_IFEXTENTS;
+       ifp->if_flags |= XFS_IFBROOT;
+
+       return 0;
+}
+
+/*
+ * Read in extents from a btree-format inode.
+ * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
+ */
+int
+xfs_iread_extents(
+       xfs_trans_t     *tp,
+       xfs_inode_t     *ip,
+       int             whichfork)
+{
+       int             error;
+       xfs_ifork_t     *ifp;
+       xfs_extnum_t    nextents;
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+       if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+               XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
+                                ip->i_mount);
+               return -EFSCORRUPTED;
+       }
+       nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+
+       /*
+        * We know that the size is valid (it's checked in iformat_btree)
+        */
+       ifp->if_bytes = ifp->if_real_bytes = 0;
+       ifp->if_flags |= XFS_IFEXTENTS;
+       xfs_iext_add(ifp, 0, nextents);
+       error = xfs_bmap_read_extents(tp, ip, whichfork);
+       if (error) {
+               xfs_iext_destroy(ifp);
+               ifp->if_flags &= ~XFS_IFEXTENTS;
+               return error;
+       }
+       xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
+       return 0;
+}
+
+/*
+ * Reallocate the space for if_broot based on the number of records
+ * being added or deleted as indicated in rec_diff.  Move the records
+ * and pointers in if_broot to fit the new size.  When shrinking this
+ * will eliminate holes between the records and pointers created by
+ * the caller.  When growing this will create holes to be filled in
+ * by the caller.
+ *
+ * The caller must not request to add more records than would fit in
+ * the on-disk inode root.  If the if_broot is currently NULL, then
+ * if we are adding records, one will be allocated.  The caller must also
+ * not request that the number of records go below zero, although
+ * it can go to zero.
+ *
+ * ip -- the inode whose if_broot area is changing
+ * ext_diff -- the change in the number of records, positive or negative,
+ *      requested for the if_broot array.
+ */
+void
+xfs_iroot_realloc(
+       xfs_inode_t             *ip,
+       int                     rec_diff,
+       int                     whichfork)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     cur_max;
+       xfs_ifork_t             *ifp;
+       struct xfs_btree_block  *new_broot;
+       int                     new_max;
+       size_t                  new_size;
+       char                    *np;
+       char                    *op;
+
+       /*
+        * Handle the degenerate case quietly.
+        */
+       if (rec_diff == 0)
+               return;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       if (rec_diff > 0) {
+               /*
+                * If there wasn't any memory allocated before, just
+                * allocate it now and get out.
+                */
+               if (ifp->if_broot_bytes == 0) {
+                       new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
+                       ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+                       ifp->if_broot_bytes = (int)new_size;
+                       return;
+               }
+
+               /*
+                * If there is already an existing if_broot, then we need
+                * to realloc() it and shift the pointers to their new
+                * location.  The records don't change location because
+                * they are kept butted up against the btree block header.
+                */
+               cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+               new_max = cur_max + rec_diff;
+               new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+               ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
+                               XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
+                               KM_SLEEP | KM_NOFS);
+               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                    ifp->if_broot_bytes);
+               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                    (int)new_size);
+               ifp->if_broot_bytes = (int)new_size;
+               ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+                       XFS_IFORK_SIZE(ip, whichfork));
+               memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
+               return;
+       }
+
+       /*
+        * rec_diff is less than 0.  In this case, we are shrinking the
+        * if_broot buffer.  It must already exist.  If we go to zero
+        * records, just get rid of the root and clear the status bit.
+        */
+       ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
+       cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+       new_max = cur_max + rec_diff;
+       ASSERT(new_max >= 0);
+       if (new_max > 0)
+               new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+       else
+               new_size = 0;
+       if (new_size > 0) {
+               new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+               /*
+                * First copy over the btree block header.
+                */
+               memcpy(new_broot, ifp->if_broot,
+                       XFS_BMBT_BLOCK_LEN(ip->i_mount));
+       } else {
+               new_broot = NULL;
+               ifp->if_flags &= ~XFS_IFBROOT;
+       }
+
+       /*
+        * Only copy the records and pointers if there are any.
+        */
+       if (new_max > 0) {
+               /*
+                * First copy the records.
+                */
+               op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
+               np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
+               memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+
+               /*
+                * Then copy the pointers.
+                */
+               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                    ifp->if_broot_bytes);
+               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
+                                                    (int)new_size);
+               memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
+       }
+       kmem_free(ifp->if_broot);
+       ifp->if_broot = new_broot;
+       ifp->if_broot_bytes = (int)new_size;
+       if (ifp->if_broot)
+               ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+                       XFS_IFORK_SIZE(ip, whichfork));
+}
+
+/*
+ * This is called when the amount of space needed for if_data
+ * is increased or decreased.  The change in size is indicated by
+ * the number of bytes that need to be added or deleted in the
+ * byte_diff parameter.
+ *
+ * If the amount of space needed has decreased below the size of the
+ * inline buffer, then switch to using the inline buffer.  Otherwise,
+ * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * to what is needed.
+ *
+ * ip -- the inode whose if_data area is changing
+ * byte_diff -- the change in the number of bytes, positive or negative,
+ *      requested for the if_data array.
+ */
+void
+xfs_idata_realloc(
+       xfs_inode_t     *ip,
+       int             byte_diff,
+       int             whichfork)
+{
+       xfs_ifork_t     *ifp;
+       int             new_size;
+       int             real_size;
+
+       if (byte_diff == 0)
+               return;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       new_size = (int)ifp->if_bytes + byte_diff;
+       ASSERT(new_size >= 0);
+
+       if (new_size == 0) {
+               if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data)
+                       kmem_free(ifp->if_u1.if_data);
+               ifp->if_u1.if_data = NULL;
+               real_size = 0;
+       } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
+               /*
+                * If the valid extents/data can fit in if_inline_ext/data,
+                * copy them from the malloc'd vector and free it.
+                */
+               if (ifp->if_u1.if_data == NULL) {
+                       ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+               } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+                       ASSERT(ifp->if_real_bytes != 0);
+                       memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
+                             new_size);
+                       kmem_free(ifp->if_u1.if_data);
+                       ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+               }
+               real_size = 0;
+       } else {
+               /*
+                * Stuck with malloc/realloc.
+                * For inline data, the underlying buffer must be
+                * a multiple of 4 bytes in size so that it can be
+                * logged and stay on word boundaries.  We enforce
+                * that here.
+                */
+               real_size = roundup(new_size, 4);
+               if (ifp->if_u1.if_data == NULL) {
+                       ASSERT(ifp->if_real_bytes == 0);
+                       ifp->if_u1.if_data = kmem_alloc(real_size,
+                                                       KM_SLEEP | KM_NOFS);
+               } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+                       /*
+                        * Only do the realloc if the underlying size
+                        * is really changing.
+                        */
+                       if (ifp->if_real_bytes != real_size) {
+                               ifp->if_u1.if_data =
+                                       kmem_realloc(ifp->if_u1.if_data,
+                                                       real_size,
+                                                       ifp->if_real_bytes,
+                                                       KM_SLEEP | KM_NOFS);
+                       }
+               } else {
+                       ASSERT(ifp->if_real_bytes == 0);
+                       ifp->if_u1.if_data = kmem_alloc(real_size,
+                                                       KM_SLEEP | KM_NOFS);
+                       memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
+                               ifp->if_bytes);
+               }
+       }
+       ifp->if_real_bytes = real_size;
+       ifp->if_bytes = new_size;
+       ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+}
+
+void
+xfs_idestroy_fork(
+       xfs_inode_t     *ip,
+       int             whichfork)
+{
+       xfs_ifork_t     *ifp;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       if (ifp->if_broot != NULL) {
+               kmem_free(ifp->if_broot);
+               ifp->if_broot = NULL;
+       }
+
+       /*
+        * If the format is local, then we can't have an extents
+        * array so just look for an inline data array.  If we're
+        * not local then we may or may not have an extents list,
+        * so check and free it up if we do.
+        */
+       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+               if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
+                   (ifp->if_u1.if_data != NULL)) {
+                       ASSERT(ifp->if_real_bytes != 0);
+                       kmem_free(ifp->if_u1.if_data);
+                       ifp->if_u1.if_data = NULL;
+                       ifp->if_real_bytes = 0;
+               }
+       } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
+                  ((ifp->if_flags & XFS_IFEXTIREC) ||
+                   ((ifp->if_u1.if_extents != NULL) &&
+                    (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
+               ASSERT(ifp->if_real_bytes != 0);
+               xfs_iext_destroy(ifp);
+       }
+       ASSERT(ifp->if_u1.if_extents == NULL ||
+              ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
+       ASSERT(ifp->if_real_bytes == 0);
+       if (whichfork == XFS_ATTR_FORK) {
+               kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+               ip->i_afp = NULL;
+       }
+}
+
+/*
+ * Convert in-core extents to on-disk form
+ *
+ * For either the data or attr fork in extent format, we need to endian convert
+ * the in-core extent as we place them into the on-disk inode.
+ *
+ * In the case of the data fork, the in-core and on-disk fork sizes can be
+ * different due to delayed allocation extents. We only copy on-disk extents
+ * here, so callers must always use the physical fork size to determine the
+ * size of the buffer passed to this routine.  We will return the size actually
+ * used.
+ */
+int
+xfs_iextents_copy(
+       xfs_inode_t             *ip,
+       xfs_bmbt_rec_t          *dp,
+       int                     whichfork)
+{
+       int                     copied;
+       int                     i;
+       xfs_ifork_t             *ifp;
+       int                     nrecs;
+       xfs_fsblock_t           start_block;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+       ASSERT(ifp->if_bytes > 0);
+
+       nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
+       ASSERT(nrecs > 0);
+
+       /*
+        * There are some delayed allocation extents in the
+        * inode, so copy the extents one at a time and skip
+        * the delayed ones.  There must be at least one
+        * non-delayed extent.
+        */
+       copied = 0;
+       for (i = 0; i < nrecs; i++) {
+               xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+               start_block = xfs_bmbt_get_startblock(ep);
+               if (isnullstartblock(start_block)) {
+                       /*
+                        * It's a delayed allocation extent, so skip it.
+                        */
+                       continue;
+               }
+
+               /* Translate to on disk format */
+               put_unaligned_be64(ep->l0, &dp->l0);
+               put_unaligned_be64(ep->l1, &dp->l1);
+               dp++;
+               copied++;
+       }
+       ASSERT(copied != 0);
+       xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
+
+       return (copied * (uint)sizeof(xfs_bmbt_rec_t));
+}
+
+/*
+ * Each of the following cases stores data into the same region
+ * of the on-disk inode, so only one of them can be valid at
+ * any given time. While it is possible to have conflicting formats
+ * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
+ * in EXTENTS format, this can only happen when the fork has
+ * changed formats after being modified but before being flushed.
+ * In these cases, the format always takes precedence, because the
+ * format indicates the current state of the fork.
+ */
+void
+xfs_iflush_fork(
+       xfs_inode_t             *ip,
+       xfs_dinode_t            *dip,
+       xfs_inode_log_item_t    *iip,
+       int                     whichfork)
+{
+       char                    *cp;
+       xfs_ifork_t             *ifp;
+       xfs_mount_t             *mp;
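+       /* Per-fork log flags, indexed by XFS_DATA_FORK/XFS_ATTR_FORK. */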
+       static const short      brootflag[2] =
+               { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
+       static const short      dataflag[2] =
+               { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
+       static const short      extflag[2] =
+               { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
+
+       if (!iip)
+               return;
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       /*
+        * This can happen if we gave up in iformat in an error path,
+        * for the attribute fork.
+        */
+       if (!ifp) {
+               ASSERT(whichfork == XFS_ATTR_FORK);
+               return;
+       }
+       cp = XFS_DFORK_PTR(dip, whichfork);
+       mp = ip->i_mount;
+       switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+       case XFS_DINODE_FMT_LOCAL:
+               if ((iip->ili_fields & dataflag[whichfork]) &&
+                   (ifp->if_bytes > 0)) {
+                       ASSERT(ifp->if_u1.if_data != NULL);
+                       ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+                       memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+               }
+               break;
+
+       case XFS_DINODE_FMT_EXTENTS:
+               ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
+                      !(iip->ili_fields & extflag[whichfork]));
+               if ((iip->ili_fields & extflag[whichfork]) &&
+                   (ifp->if_bytes > 0)) {
+                       ASSERT(xfs_iext_get_ext(ifp, 0));
+                       ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
+                       (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
+                               whichfork);
+               }
+               break;
+
+       case XFS_DINODE_FMT_BTREE:
+               if ((iip->ili_fields & brootflag[whichfork]) &&
+                   (ifp->if_broot_bytes > 0)) {
+                       ASSERT(ifp->if_broot != NULL);
+                       ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+                               XFS_IFORK_SIZE(ip, whichfork));
+                       xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
+                               (xfs_bmdr_block_t *)cp,
+                               XFS_DFORK_SIZE(dip, mp, whichfork));
+               }
+               break;
+
+       case XFS_DINODE_FMT_DEV:
+               if (iip->ili_fields & XFS_ILOG_DEV) {
+                       ASSERT(whichfork == XFS_DATA_FORK);
+                       xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
+               }
+               break;
+
+       case XFS_DINODE_FMT_UUID:
+               if (iip->ili_fields & XFS_ILOG_UUID) {
+                       ASSERT(whichfork == XFS_DATA_FORK);
+                       memcpy(XFS_DFORK_DPTR(dip),
+                              &ip->i_df.if_u2.if_uuid,
+                              sizeof(uuid_t));
+               }
+               break;
+
+       default:
+               ASSERT(0);
+               break;
+       }
+}
+
+/*
+ * Return a pointer to the extent record at file index idx.
+ */
+xfs_bmbt_rec_host_t *
+xfs_iext_get_ext(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    idx)            /* index of target extent */
+{
+       ASSERT(idx >= 0);
+       ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+
+       if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
+               return ifp->if_u1.if_ext_irec->er_extbuf;
+       } else if (ifp->if_flags & XFS_IFEXTIREC) {
+               xfs_ext_irec_t  *erp;           /* irec pointer */
+               int             erp_idx = 0;    /* irec index */
+               xfs_extnum_t    page_idx = idx; /* ext index in target list */
+
+               erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
+               return &erp->er_extbuf[page_idx];
+       } else if (ifp->if_bytes) {
+               return &ifp->if_u1.if_extents[idx];
+       } else {
+               return NULL;
+       }
+}
+
+/*
+ * Insert new item(s) into the extent records for incore inode
+ * fork 'ifp'.  'count' new items are inserted at index 'idx'.
+ */
+void
+xfs_iext_insert(
+       xfs_inode_t     *ip,            /* incore inode pointer */
+       xfs_extnum_t    idx,            /* starting index of new items */
+       xfs_extnum_t    count,          /* number of inserted items */
+       xfs_bmbt_irec_t *new,           /* items to insert */
+       int             state)          /* type of extent conversion */
+{
+       xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+       xfs_extnum_t    i;              /* extent record index */
+
+       trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
+
+       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+       xfs_iext_add(ifp, idx, count);
+       for (i = idx; i < idx + count; i++, new++)
+               xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
+}
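+
+/*
+ * Rough usage sketch (illustrative only, not part of this change): to
+ * insert a single record at index zero of the data fork, a caller would
+ * fill in a struct xfs_bmbt_irec and do:
+ *
+ *     xfs_iext_insert(ip, 0, 1, &new, 0);
+ *
+ * Passing BMAP_ATTRFORK in the state argument targets the attribute fork.
+ */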
+
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be increased. The ext_diff parameter stores the
+ * number of new extents being added and the idx parameter contains
+ * the extent index where the new extents will be added. If the new
+ * extents are being appended, then we just need to (re)allocate and
+ * initialize the space. Otherwise, if the new extents are being
+ * inserted into the middle of the existing entries, a bit more work
+ * is required to make room for the new extents to be inserted. The
+ * caller is responsible for filling in the new extent entries upon
+ * return.
+ */
+void
+xfs_iext_add(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    idx,            /* index to begin adding exts */
+       int             ext_diff)       /* number of extents to add */
+{
+       int             byte_diff;      /* new bytes being added */
+       int             new_size;       /* size of extents after adding */
+       xfs_extnum_t    nextents;       /* number of extents in file */
+
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       ASSERT((idx >= 0) && (idx <= nextents));
+       byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
+       new_size = ifp->if_bytes + byte_diff;
+       /*
+        * If the new number of extents (nextents + ext_diff)
+        * fits inside the inode, then continue to use the inline
+        * extent buffer.
+        */
+       if (nextents + ext_diff <= XFS_INLINE_EXTS) {
+               if (idx < nextents) {
+                       memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
+                               &ifp->if_u2.if_inline_ext[idx],
+                               (nextents - idx) * sizeof(xfs_bmbt_rec_t));
+                       memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
+               }
+               ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+               ifp->if_real_bytes = 0;
+       }
+       /*
+        * Otherwise use a linear (direct) extent list.
+        * If the extents are currently inside the inode,
+        * xfs_iext_realloc_direct will switch us from
+        * inline to direct extent allocation mode.
+        */
+       else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
+               xfs_iext_realloc_direct(ifp, new_size);
+               if (idx < nextents) {
+                       memmove(&ifp->if_u1.if_extents[idx + ext_diff],
+                               &ifp->if_u1.if_extents[idx],
+                               (nextents - idx) * sizeof(xfs_bmbt_rec_t));
+                       memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
+               }
+       }
+       /* Indirection array */
+       else {
+               xfs_ext_irec_t  *erp;
+               int             erp_idx = 0;
+               int             page_idx = idx;
+
+               ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
+               if (ifp->if_flags & XFS_IFEXTIREC) {
+                       erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
+               } else {
+                       xfs_iext_irec_init(ifp);
+                       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+                       erp = ifp->if_u1.if_ext_irec;
+               }
+               /* Extents fit in target extent page */
+               if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
+                       if (page_idx < erp->er_extcount) {
+                               memmove(&erp->er_extbuf[page_idx + ext_diff],
+                                       &erp->er_extbuf[page_idx],
+                                       (erp->er_extcount - page_idx) *
+                                       sizeof(xfs_bmbt_rec_t));
+                               memset(&erp->er_extbuf[page_idx], 0, byte_diff);
+                       }
+                       erp->er_extcount += ext_diff;
+                       xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+               }
+               /* Insert a new extent page */
+               else if (erp) {
+                       xfs_iext_add_indirect_multi(ifp,
+                               erp_idx, page_idx, ext_diff);
+               }
+               /*
+                * If extent(s) are being appended to the last page in
+                * the indirection array and the new extent(s) don't fit
+                * in the page, then erp is NULL and erp_idx is set to
+                * the next index needed in the indirection array.
+                */
+               else {
+                       uint    count = ext_diff;
+
+                       while (count) {
+                               erp = xfs_iext_irec_new(ifp, erp_idx);
+                               erp->er_extcount = min(count, XFS_LINEAR_EXTS);
+                               count -= erp->er_extcount;
+                               if (count)
+                                       erp_idx++;
+                       }
+               }
+       }
+       ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being added to the indirection
+ * array and the new extents do not fit in the target extent list. The
+ * erp_idx parameter contains the irec index for the target extent list
+ * in the indirection array, and the idx parameter contains the extent
+ * index within the list. The number of extents being added is stored
+ * in the count parameter.
+ *
+ *    |-------|   |-------|
+ *    |       |   |       |    idx - number of extents before idx
+ *    |  idx  |   | count |
+ *    |       |   |       |    count - number of extents being inserted at idx
+ *    |-------|   |-------|
+ *    | count |   | nex2  |    nex2 - number of extents after idx + count
+ *    |-------|   |-------|
+ */
+void
+xfs_iext_add_indirect_multi(
+       xfs_ifork_t     *ifp,                   /* inode fork pointer */
+       int             erp_idx,                /* target extent irec index */
+       xfs_extnum_t    idx,                    /* index within target list */
+       int             count)                  /* new extents being added */
+{
+       int             byte_diff;              /* new bytes being added */
+       xfs_ext_irec_t  *erp;                   /* pointer to irec entry */
+       xfs_extnum_t    ext_diff;               /* number of extents to add */
+       xfs_extnum_t    ext_cnt;                /* new extents still needed */
+       xfs_extnum_t    nex2;                   /* extents after idx + count */
+       xfs_bmbt_rec_t  *nex2_ep = NULL;        /* temp list for nex2 extents */
+       int             nlists;                 /* number of irec's (lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       erp = &ifp->if_u1.if_ext_irec[erp_idx];
+       nex2 = erp->er_extcount - idx;
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+       /*
+        * Save second part of target extent list
+        * (all extents past the insertion index).
+        */
+       if (nex2) {
+               byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+               nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
+               memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
+               erp->er_extcount -= nex2;
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
+               memset(&erp->er_extbuf[idx], 0, byte_diff);
+       }
+
+       /*
+        * Add the new extents to the end of the target
+        * list, then allocate new irec record(s) and
+        * extent buffer(s) as needed to store the rest
+        * of the new extents.
+        */
+       ext_cnt = count;
+       ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
+       if (ext_diff) {
+               erp->er_extcount += ext_diff;
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+               ext_cnt -= ext_diff;
+       }
+       while (ext_cnt) {
+               erp_idx++;
+               erp = xfs_iext_irec_new(ifp, erp_idx);
+               ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
+               erp->er_extcount = ext_diff;
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+               ext_cnt -= ext_diff;
+       }
+
+       /* Add nex2 extents back to indirection array */
+       if (nex2) {
+               xfs_extnum_t    ext_avail;
+               int             i;
+
+               byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+               ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
+               i = 0;
+               /*
+                * If nex2 extents fit in the current page, append
+                * nex2_ep after the new extents.
+                */
+               if (nex2 <= ext_avail) {
+                       i = erp->er_extcount;
+               }
+               /*
+                * Otherwise, check if space is available in the
+                * next page.
+                */
+               else if ((erp_idx < nlists - 1) &&
+                        (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
+                         ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
+                       erp_idx++;
+                       erp++;
+                       /* Create a hole for nex2 extents */
+                       memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
+                               erp->er_extcount * sizeof(xfs_bmbt_rec_t));
+               }
+               /*
+                * Final choice, create a new extent page for
+                * nex2 extents.
+                */
+               else {
+                       erp_idx++;
+                       erp = xfs_iext_irec_new(ifp, erp_idx);
+               }
+               memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
+               kmem_free(nex2_ep);
+               erp->er_extcount += nex2;
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
+       }
+}
+
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be decreased. The ext_diff parameter stores the
+ * number of extents to be removed and the idx parameter contains
+ * the extent index where the extents will be removed from.
+ *
+ * If the amount of space needed has decreased below the linear
+ * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
+ * extent array.  Otherwise, use kmem_realloc() to adjust the
+ * size to what is needed.
+ */
+void
+xfs_iext_remove(
+       xfs_inode_t     *ip,            /* incore inode pointer */
+       xfs_extnum_t    idx,            /* index to begin removing exts */
+       int             ext_diff,       /* number of extents to remove */
+       int             state)          /* type of extent conversion */
+{
+       xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+       xfs_extnum_t    nextents;       /* number of extents in file */
+       int             new_size;       /* size of extents after removal */
+
+       trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
+
+       ASSERT(ext_diff > 0);
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
+
+       if (new_size == 0)
+               xfs_iext_destroy(ifp);
+       else if (ifp->if_flags & XFS_IFEXTIREC)
+               xfs_iext_remove_indirect(ifp, idx, ext_diff);
+       else if (ifp->if_real_bytes)
+               xfs_iext_remove_direct(ifp, idx, ext_diff);
+       else
+               xfs_iext_remove_inline(ifp, idx, ext_diff);
+       ifp->if_bytes = new_size;
+}
+
+/*
+ * This removes ext_diff extents from the inline buffer, beginning
+ * at extent index idx.
+ */
+void
+xfs_iext_remove_inline(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    idx,            /* index to begin removing exts */
+       int             ext_diff)       /* number of extents to remove */
+{
+       int             nextents;       /* number of extents in file */
+
+       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+       ASSERT(idx < XFS_INLINE_EXTS);
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       ASSERT(((nextents - ext_diff) > 0) &&
+               (nextents - ext_diff) < XFS_INLINE_EXTS);
+
+       if (idx + ext_diff < nextents) {
+               memmove(&ifp->if_u2.if_inline_ext[idx],
+                       &ifp->if_u2.if_inline_ext[idx + ext_diff],
+                       (nextents - (idx + ext_diff)) *
+                        sizeof(xfs_bmbt_rec_t));
+               memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
+                       0, ext_diff * sizeof(xfs_bmbt_rec_t));
+       } else {
+               memset(&ifp->if_u2.if_inline_ext[idx], 0,
+                       ext_diff * sizeof(xfs_bmbt_rec_t));
+       }
+}
+
+/*
+ * This removes ext_diff extents from a linear (direct) extent list,
+ * beginning at extent index idx. If the extents are being removed
+ * from the end of the list (ie. truncate) then we just need to re-
+ * allocate the list to remove the extra space. Otherwise, if the
+ * extents are being removed from the middle of the existing extent
+ * entries, then we first need to move the extent records beginning
+ * at idx + ext_diff up in the list to overwrite the records being
+ * removed, then remove the extra space via kmem_realloc.
+ */
+void
+xfs_iext_remove_direct(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    idx,            /* index to begin removing exts */
+       int             ext_diff)       /* number of extents to remove */
+{
+       xfs_extnum_t    nextents;       /* number of extents in file */
+       int             new_size;       /* size of extents after removal */
+
+       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+       new_size = ifp->if_bytes -
+               (ext_diff * sizeof(xfs_bmbt_rec_t));
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+
+       if (new_size == 0) {
+               xfs_iext_destroy(ifp);
+               return;
+       }
+       /* Move extents up in the list (if needed) */
+       if (idx + ext_diff < nextents) {
+               memmove(&ifp->if_u1.if_extents[idx],
+                       &ifp->if_u1.if_extents[idx + ext_diff],
+                       (nextents - (idx + ext_diff)) *
+                        sizeof(xfs_bmbt_rec_t));
+       }
+       memset(&ifp->if_u1.if_extents[nextents - ext_diff],
+               0, ext_diff * sizeof(xfs_bmbt_rec_t));
+       /*
+        * Reallocate the direct extent list. If the extents
+        * will fit inside the inode then xfs_iext_realloc_direct
+        * will switch from direct to inline extent allocation
+        * mode for us.
+        */
+       xfs_iext_realloc_direct(ifp, new_size);
+       ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being removed from the
+ * indirection array and the extents being removed span multiple extent
+ * buffers. The idx parameter contains the file extent index where we
+ * want to begin removing extents, and the count parameter contains
+ * how many extents need to be removed.
+ *
+ *    |-------|   |-------|
+ *    | nex1  |   |       |    nex1 - number of extents before idx
+ *    |-------|   | count |
+ *    |       |   |       |    count - number of extents being removed at idx
+ *    | count |   |-------|
+ *    |       |   | nex2  |    nex2 - number of extents after idx + count
+ *    |-------|   |-------|
+ */
+void
+xfs_iext_remove_indirect(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    idx,            /* index to begin removing extents */
+       int             count)          /* number of extents to remove */
+{
+       xfs_ext_irec_t  *erp;           /* indirection array pointer */
+       int             erp_idx = 0;    /* indirection array index */
+       xfs_extnum_t    ext_cnt;        /* extents left to remove */
+       xfs_extnum_t    ext_diff;       /* extents to remove in current list */
+       xfs_extnum_t    nex1;           /* number of extents before idx */
+       xfs_extnum_t    nex2;           /* extents after idx + count */
+       int             page_idx = idx; /* index in target extent list */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
+       ASSERT(erp != NULL);
+       nex1 = page_idx;
+       ext_cnt = count;
+       while (ext_cnt) {
+               nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
+               ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
+               /*
+                * Check for deletion of entire list;
+                * xfs_iext_irec_remove() updates extent offsets.
+                */
+               if (ext_diff == erp->er_extcount) {
+                       xfs_iext_irec_remove(ifp, erp_idx);
+                       ext_cnt -= ext_diff;
+                       nex1 = 0;
+                       if (ext_cnt) {
+                               ASSERT(erp_idx < ifp->if_real_bytes /
+                                       XFS_IEXT_BUFSZ);
+                               erp = &ifp->if_u1.if_ext_irec[erp_idx];
+                               nex1 = 0;
+                               continue;
+                       } else {
+                               break;
+                       }
+               }
+               /* Move extents up (if needed) */
+               if (nex2) {
+                       memmove(&erp->er_extbuf[nex1],
+                               &erp->er_extbuf[nex1 + ext_diff],
+                               nex2 * sizeof(xfs_bmbt_rec_t));
+               }
+               /* Zero out rest of page */
+               memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
+                       ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
+               /* Update remaining counters */
+               erp->er_extcount -= ext_diff;
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
+               ext_cnt -= ext_diff;
+               nex1 = 0;
+               erp_idx++;
+               erp++;
+       }
+       ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
+       xfs_iext_irec_compact(ifp);
+}
+
+/*
+ * Create, destroy, or resize a linear (direct) block of extents.
+ */
+void
+xfs_iext_realloc_direct(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             new_size)       /* new size of extents after adding */
+{
+       int             rnew_size;      /* real new size of extents */
+
+       rnew_size = new_size;
+
+       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
+               ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
+                (new_size != ifp->if_real_bytes)));
+
+       /* Free extent records */
+       if (new_size == 0) {
+               xfs_iext_destroy(ifp);
+       }
+       /* Resize direct extent list and zero any new bytes */
+       else if (ifp->if_real_bytes) {
+               /* Check if extents will fit inside the inode */
+               if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
+                       xfs_iext_direct_to_inline(ifp, new_size /
+                               (uint)sizeof(xfs_bmbt_rec_t));
+                       ifp->if_bytes = new_size;
+                       return;
+               }
+               if (!is_power_of_2(new_size))
+                       rnew_size = roundup_pow_of_two(new_size);
+               if (rnew_size != ifp->if_real_bytes) {
+                       ifp->if_u1.if_extents =
+                               kmem_realloc(ifp->if_u1.if_extents,
+                                               rnew_size,
+                                               ifp->if_real_bytes, KM_NOFS);
+               }
+               if (rnew_size > ifp->if_real_bytes) {
+                       memset(&ifp->if_u1.if_extents[ifp->if_bytes /
+                               (uint)sizeof(xfs_bmbt_rec_t)], 0,
+                               rnew_size - ifp->if_real_bytes);
+               }
+       }
+       /* Switch from the inline extent buffer to a direct extent list */
+       else {
+               if (!is_power_of_2(new_size))
+                       rnew_size = roundup_pow_of_two(new_size);
+               xfs_iext_inline_to_direct(ifp, rnew_size);
+       }
+       ifp->if_real_bytes = rnew_size;
+       ifp->if_bytes = new_size;
+}
+
+/*
+ * Switch from linear (direct) extent records to inline buffer.
+ */
+void
+xfs_iext_direct_to_inline(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    nextents)       /* number of extents in file */
+{
+       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+       ASSERT(nextents <= XFS_INLINE_EXTS);
+       /*
+        * The inline buffer was zeroed when we switched
+        * from inline to direct extent allocation mode,
+        * so we don't need to clear it here.
+        */
+       memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
+               nextents * sizeof(xfs_bmbt_rec_t));
+       kmem_free(ifp->if_u1.if_extents);
+       ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+       ifp->if_real_bytes = 0;
+}
+
+/*
+ * Switch from inline buffer to linear (direct) extent records.
+ * new_size should already be rounded up to the next power of 2
+ * by the caller (when appropriate), so use new_size as it is.
+ * However, since new_size may be rounded up, we can't update
+ * if_bytes here. It is the caller's responsibility to update
+ * if_bytes upon return.
+ */
+void
+xfs_iext_inline_to_direct(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             new_size)       /* number of extents in file */
+{
+       ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
+       memset(ifp->if_u1.if_extents, 0, new_size);
+       if (ifp->if_bytes) {
+               memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
+                       ifp->if_bytes);
+               memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+                       sizeof(xfs_bmbt_rec_t));
+       }
+       ifp->if_real_bytes = new_size;
+}
+
+/*
+ * Resize an extent indirection array to new_size bytes.
+ */
+STATIC void
+xfs_iext_realloc_indirect(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             new_size)       /* new indirection array size */
+{
+       int             nlists;         /* number of irec's (ex lists) */
+       int             size;           /* current indirection array size */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       size = nlists * sizeof(xfs_ext_irec_t);
+       ASSERT(ifp->if_real_bytes);
+       ASSERT((new_size >= 0) && (new_size != size));
+       if (new_size == 0) {
+               xfs_iext_destroy(ifp);
+       } else {
+               ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
+                       kmem_realloc(ifp->if_u1.if_ext_irec,
+                               new_size, size, KM_NOFS);
+       }
+}
+
+/*
+ * Switch from indirection array to linear (direct) extent allocations.
+ */
+STATIC void
+xfs_iext_indirect_to_direct(
+        xfs_ifork_t    *ifp)           /* inode fork pointer */
+{
+       xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
+       xfs_extnum_t    nextents;       /* number of extents in file */
+       int             size;           /* size of file extents */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       ASSERT(nextents <= XFS_LINEAR_EXTS);
+       size = nextents * sizeof(xfs_bmbt_rec_t);
+
+       xfs_iext_irec_compact_pages(ifp);
+       ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
+
+       ep = ifp->if_u1.if_ext_irec->er_extbuf;
+       kmem_free(ifp->if_u1.if_ext_irec);
+       ifp->if_flags &= ~XFS_IFEXTIREC;
+       ifp->if_u1.if_extents = ep;
+       ifp->if_bytes = size;
+       if (nextents < XFS_LINEAR_EXTS)
+               xfs_iext_realloc_direct(ifp, size);
+}
+
+/*
+ * Free incore file extents.
+ */
+void
+xfs_iext_destroy(
+       xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+       if (ifp->if_flags & XFS_IFEXTIREC) {
+               int     erp_idx;
+               int     nlists;
+
+               nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+               for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
+                       xfs_iext_irec_remove(ifp, erp_idx);
+               }
+               ifp->if_flags &= ~XFS_IFEXTIREC;
+       } else if (ifp->if_real_bytes) {
+               kmem_free(ifp->if_u1.if_extents);
+       } else if (ifp->if_bytes) {
+               memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+                       sizeof(xfs_bmbt_rec_t));
+       }
+       ifp->if_u1.if_extents = NULL;
+       ifp->if_real_bytes = 0;
+       ifp->if_bytes = 0;
+}
+
+/*
+ * Return a pointer to the extent record for file system block bno.
+ */
+xfs_bmbt_rec_host_t *                  /* pointer to found extent record */
+xfs_iext_bno_to_ext(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_fileoff_t   bno,            /* block number to search for */
+       xfs_extnum_t    *idxp)          /* index of target extent */
+{
+       xfs_bmbt_rec_host_t *base;      /* pointer to first extent */
+       xfs_filblks_t   blockcount = 0; /* number of blocks in extent */
+       xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
+       xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
+       int             high;           /* upper boundary in search */
+       xfs_extnum_t    idx = 0;        /* index of target extent */
+       int             low;            /* lower boundary in search */
+       xfs_extnum_t    nextents;       /* number of file extents */
+       xfs_fileoff_t   startoff = 0;   /* start offset of extent */
+
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       if (nextents == 0) {
+               *idxp = 0;
+               return NULL;
+       }
+       low = 0;
+       if (ifp->if_flags & XFS_IFEXTIREC) {
+               /* Find target extent list */
+               int     erp_idx = 0;
+               erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
+               base = erp->er_extbuf;
+               high = erp->er_extcount - 1;
+       } else {
+               base = ifp->if_u1.if_extents;
+               high = nextents - 1;
+       }
+       /* Binary search extent records */
+       while (low <= high) {
+               idx = (low + high) >> 1;
+               ep = base + idx;
+               startoff = xfs_bmbt_get_startoff(ep);
+               blockcount = xfs_bmbt_get_blockcount(ep);
+               if (bno < startoff) {
+                       high = idx - 1;
+               } else if (bno >= startoff + blockcount) {
+                       low = idx + 1;
+               } else {
+                       /* Convert back to file-based extent index */
+                       if (ifp->if_flags & XFS_IFEXTIREC) {
+                               idx += erp->er_extoff;
+                       }
+                       *idxp = idx;
+                       return ep;
+               }
+       }
+       /* Convert back to file-based extent index */
+       if (ifp->if_flags & XFS_IFEXTIREC) {
+               idx += erp->er_extoff;
+       }
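+       /* bno is not inside any extent; return the first extent past it. */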
+       if (bno >= startoff + blockcount) {
+               if (++idx == nextents) {
+                       ep = NULL;
+               } else {
+                       ep = xfs_iext_get_ext(ifp, idx);
+               }
+       }
+       *idxp = idx;
+       return ep;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record for filesystem block bno. Store the index of the
+ * target irec in *erp_idxp.
+ */
+xfs_ext_irec_t *                       /* pointer to found extent record */
+xfs_iext_bno_to_irec(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_fileoff_t   bno,            /* block number to search for */
+       int             *erp_idxp)      /* irec index of target ext list */
+{
+       xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
+       xfs_ext_irec_t  *erp_next;      /* next indirection array entry */
+       int             erp_idx;        /* indirection array index */
+       int             nlists;         /* number of extent irec's (lists) */
+       int             high;           /* binary search upper limit */
+       int             low;            /* binary search lower limit */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       erp_idx = 0;
+       low = 0;
+       high = nlists - 1;
+       while (low <= high) {
+               erp_idx = (low + high) >> 1;
+               erp = &ifp->if_u1.if_ext_irec[erp_idx];
+               erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
+               if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
+                       high = erp_idx - 1;
+               } else if (erp_next && bno >=
+                          xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
+                       low = erp_idx + 1;
+               } else {
+                       break;
+               }
+       }
+       *erp_idxp = erp_idx;
+       return erp;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record at file extent index *idxp. Store the index of the
+ * target irec in *erp_idxp and store the page index of the target
+ * extent record in *idxp.
+ */
+xfs_ext_irec_t *
+xfs_iext_idx_to_irec(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    *idxp,          /* extent index (file -> page) */
+       int             *erp_idxp,      /* pointer to target irec */
+       int             realloc)        /* new bytes were just added */
+{
+       xfs_ext_irec_t  *prev;          /* pointer to previous irec */
+       xfs_ext_irec_t  *erp = NULL;    /* pointer to current irec */
+       int             erp_idx;        /* indirection array index */
+       int             nlists;         /* number of irec's (ex lists) */
+       int             high;           /* binary search upper limit */
+       int             low;            /* binary search lower limit */
+       xfs_extnum_t    page_idx = *idxp; /* extent index in target list */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       ASSERT(page_idx >= 0);
+       ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+       ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
+
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       erp_idx = 0;
+       low = 0;
+       high = nlists - 1;
+
+       /* Binary search extent irec's */
+       while (low <= high) {
+               erp_idx = (low + high) >> 1;
+               erp = &ifp->if_u1.if_ext_irec[erp_idx];
+               prev = erp_idx > 0 ? erp - 1 : NULL;
+               if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
+                    realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
+                       high = erp_idx - 1;
+               } else if (page_idx > erp->er_extoff + erp->er_extcount ||
+                          (page_idx == erp->er_extoff + erp->er_extcount &&
+                           !realloc)) {
+                       low = erp_idx + 1;
+               } else if (page_idx == erp->er_extoff + erp->er_extcount &&
+                          erp->er_extcount == XFS_LINEAR_EXTS) {
+                       ASSERT(realloc);
+                       page_idx = 0;
+                       erp_idx++;
+                       erp = erp_idx < nlists ? erp + 1 : NULL;
+                       break;
+               } else {
+                       page_idx -= erp->er_extoff;
+                       break;
+               }
+       }
+       *idxp = page_idx;
+       *erp_idxp = erp_idx;
+       return erp;
+}
+
+/*
+ * Allocate and initialize an indirection array once the space needed
+ * for incore extents increases above XFS_IEXT_BUFSZ.
+ */
+void
+xfs_iext_irec_init(
+       xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+       xfs_ext_irec_t  *erp;           /* indirection array pointer */
+       xfs_extnum_t    nextents;       /* number of extents in file */
+
+       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       ASSERT(nextents <= XFS_LINEAR_EXTS);
+
+       erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
+
+       if (nextents == 0) {
+               ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+       } else if (!ifp->if_real_bytes) {
+               xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
+       } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
+               xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
+       }
+       erp->er_extbuf = ifp->if_u1.if_extents;
+       erp->er_extcount = nextents;
+       erp->er_extoff = 0;
+
+       ifp->if_flags |= XFS_IFEXTIREC;
+       ifp->if_real_bytes = XFS_IEXT_BUFSZ;
+       ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
+       ifp->if_u1.if_ext_irec = erp;
+}
+
+/*
+ * Allocate and initialize a new entry in the indirection array.
+ */
+xfs_ext_irec_t *
+xfs_iext_irec_new(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             erp_idx)        /* index for new irec */
+{
+       xfs_ext_irec_t  *erp;           /* indirection array pointer */
+       int             i;              /* loop counter */
+       int             nlists;         /* number of irec's (ex lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+       /* Resize indirection array */
+       xfs_iext_realloc_indirect(ifp, ++nlists *
+                                 sizeof(xfs_ext_irec_t));
+       /*
+        * Move records down in the array so the
+        * new page can use erp_idx.
+        */
+       erp = ifp->if_u1.if_ext_irec;
+       for (i = nlists - 1; i > erp_idx; i--) {
+               memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
+       }
+       ASSERT(i == erp_idx);
+
+       /* Initialize new extent record */
+       erp = ifp->if_u1.if_ext_irec;
+       erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+       ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+       memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
+       erp[erp_idx].er_extcount = 0;
+       erp[erp_idx].er_extoff = erp_idx > 0 ?
+               erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
+       return &erp[erp_idx];
+}
+
+/*
+ * Remove a record from the indirection array.
+ */
+void
+xfs_iext_irec_remove(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             erp_idx)        /* irec index to remove */
+{
+       xfs_ext_irec_t  *erp;           /* indirection array pointer */
+       int             i;              /* loop counter */
+       int             nlists;         /* number of irec's (ex lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       erp = &ifp->if_u1.if_ext_irec[erp_idx];
+       if (erp->er_extbuf) {
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
+                       -erp->er_extcount);
+               kmem_free(erp->er_extbuf);
+       }
+       /* Compact extent records */
+       erp = ifp->if_u1.if_ext_irec;
+       for (i = erp_idx; i < nlists - 1; i++) {
+               memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
+       }
+       /*
+        * Manually free the last extent record from the indirection
+        * array.  A call to xfs_iext_realloc_indirect() with a size
+        * of zero would result in a call to xfs_iext_destroy() which
+        * would in turn call this function again, creating a nasty
+        * infinite loop.
+        */
+       if (--nlists) {
+               xfs_iext_realloc_indirect(ifp,
+                       nlists * sizeof(xfs_ext_irec_t));
+       } else {
+               kmem_free(ifp->if_u1.if_ext_irec);
+       }
+       ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+}
+
+/*
+ * This is called to clean up large amounts of unused memory allocated
+ * by the indirection array.  Before compacting anything though, verify
+ * that the indirection array is still needed and switch back to the
+ * linear extent list (or even the inline buffer) if possible.  The
+ * compaction policy is as follows:
+ *
+ *    Full Compaction: Extents fit into a single page (or inline buffer)
+ * Partial Compaction: Extents occupy less than 50% of allocated space
+ *      No Compaction: Extents occupy at least 50% of allocated space
+ */
+void
+xfs_iext_irec_compact(
+       xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+       xfs_extnum_t    nextents;       /* number of extents in file */
+       int             nlists;         /* number of irec's (ex lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+
+       if (nextents == 0) {
+               xfs_iext_destroy(ifp);
+       } else if (nextents <= XFS_INLINE_EXTS) {
+               xfs_iext_indirect_to_direct(ifp);
+               xfs_iext_direct_to_inline(ifp, nextents);
+       } else if (nextents <= XFS_LINEAR_EXTS) {
+               xfs_iext_indirect_to_direct(ifp);
+       } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
+               xfs_iext_irec_compact_pages(ifp);
+       }
+}
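
The policy above reduces to a few integer comparisons. A hedged sketch of just the decision logic, with stand-ins for XFS_INLINE_EXTS and XFS_LINEAR_EXTS (both defined in the xfs_inode_fork.h hunk below):

	#include <stdio.h>

	#define INLINE_EXTS	2	/* mirrors XFS_INLINE_EXTS */
	#define LINEAR_EXTS	256	/* stand-in for XFS_LINEAR_EXTS (4096 / 16) */

	/* Decide which compaction step applies; mirrors xfs_iext_irec_compact(). */
	static const char *compact_policy(int nextents, int nlists)
	{
		if (nextents == 0)
			return "destroy";			/* nothing left */
		if (nextents <= INLINE_EXTS)
			return "indirect -> direct -> inline";	/* full compaction */
		if (nextents <= LINEAR_EXTS)
			return "indirect -> direct";		/* full compaction */
		if (nextents < (nlists * LINEAR_EXTS) >> 1)
			return "compact pages";			/* partial: < 50% used */
		return "leave alone";				/* >= 50% used */
	}

	int main(void)
	{
		printf("%s\n", compact_policy(100, 4));	/* indirect -> direct */
		printf("%s\n", compact_policy(300, 4));	/* compact pages */
		printf("%s\n", compact_policy(600, 4));	/* leave alone */
		return 0;
	}
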
+
+/*
+ * Combine extents from neighboring extent pages.
+ */
+void
+xfs_iext_irec_compact_pages(
+       xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+       xfs_ext_irec_t  *erp, *erp_next;/* pointers to irec entries */
+       int             erp_idx = 0;    /* indirection array index */
+       int             nlists;         /* number of irec's (ex lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       while (erp_idx < nlists - 1) {
+               erp = &ifp->if_u1.if_ext_irec[erp_idx];
+               erp_next = erp + 1;
+               if (erp_next->er_extcount <=
+                   (XFS_LINEAR_EXTS - erp->er_extcount)) {
+                       memcpy(&erp->er_extbuf[erp->er_extcount],
+                               erp_next->er_extbuf, erp_next->er_extcount *
+                               sizeof(xfs_bmbt_rec_t));
+                       erp->er_extcount += erp_next->er_extcount;
+                       /*
+                        * Free page before removing extent record
+                        * so er_extoffs don't get modified in
+                        * xfs_iext_irec_remove.
+                        */
+                       kmem_free(erp_next->er_extbuf);
+                       erp_next->er_extbuf = NULL;
+                       xfs_iext_irec_remove(ifp, erp_idx + 1);
+                       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+               } else {
+                       erp_idx++;
+               }
+       }
+}
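
The merge step above is a straight memcpy of the next page's records onto the tail of the current page, attempted only when they fit. A hedged sketch of that single step, with a tiny hypothetical page type:

	#include <stdio.h>
	#include <string.h>

	#define PAGE_EXTS	4	/* tiny stand-in for XFS_LINEAR_EXTS */

	struct page {
		int recs[PAGE_EXTS];
		int count;
	};

	/* Merge src into dst if dst has room; returns 1 on success. */
	static int merge_pages(struct page *dst, struct page *src)
	{
		if (src->count > PAGE_EXTS - dst->count)
			return 0;			/* would overflow dst */
		memcpy(&dst->recs[dst->count], src->recs,
		       src->count * sizeof(src->recs[0]));
		dst->count += src->count;
		src->count = 0;				/* src page is now free */
		return 1;
	}

	int main(void)
	{
		struct page a = { {1, 2, 0, 0}, 2 };
		struct page b = { {3, 0, 0, 0}, 1 };

		printf("merged=%d count=%d last=%d\n",
		       merge_pages(&a, &b), a.count, a.recs[2]);	/* 1 3 3 */
		return 0;
	}
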
+
+/*
+ * This is called to update the er_extoff field in the indirection
+ * array when extents have been added or removed from one of the
+ * extent lists. erp_idx contains the irec index to begin updating
+ * at and ext_diff contains the number of extents that were added
+ * or removed.
+ */
+void
+xfs_iext_irec_update_extoffs(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             erp_idx,        /* irec index to update */
+       int             ext_diff)       /* number of new extents */
+{
+       int             i;              /* loop counter */
+       int             nlists;         /* number of irec's (ex lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       for (i = erp_idx; i < nlists; i++) {
+               ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
+       }
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
new file mode 100644 (file)
index 0000000..7d3b1ed
--- /dev/null
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef        __XFS_INODE_FORK_H__
+#define        __XFS_INODE_FORK_H__
+
+struct xfs_inode_log_item;
+struct xfs_dinode;
+
+/*
+ * The following xfs_ext_irec_t struct introduces a second (top) level
+ * to the in-core extent allocation scheme. These structs are allocated
+ * in a contiguous block, creating an indirection array where each entry
+ * (irec) contains a pointer to a buffer of in-core extent records which
+ * it manages. Each extent buffer is 4k in size, since 4k is the system
+ * page size on Linux i386, and systems with larger page sizes don't seem
+ * to gain much, if anything, by using their native page size as the
+ * extent buffer size. Also, using 4k extent buffers everywhere provides
+ * a consistent interface for CXFS across different platforms.
+ *
+ * There is currently no limit on the number of irec's (extent lists)
+ * allowed, so heavily fragmented files may require an indirection array
+ * which spans multiple system pages of memory. The number of extents
+ * which would require this amount of contiguous memory is very large
+ * and should not cause problems in the foreseeable future. However,
+ * if the memory needed for the contiguous array ever becomes a problem,
+ * a third level of indirection may be required.
+ */
+typedef struct xfs_ext_irec {
+       xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */
+       xfs_extnum_t    er_extoff;      /* extent offset in file */
+       xfs_extnum_t    er_extcount;    /* number of extents in page/block */
+} xfs_ext_irec_t;
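
Conceptually, each irec maps a contiguous run of file extent indexes [er_extoff, er_extoff + er_extcount) onto one 4k buffer. A small sketch of translating a file-global extent index into a (page, slot) pair, using a linear scan for clarity where the kernel's xfs_iext_idx_to_irec() binary-searches:

	#include <stdio.h>

	struct irec {
		int extoff;	/* first file extent index in this page */
		int extcount;	/* extents stored in this page */
	};

	/* Translate a file extent index into (page, index-within-page). */
	static int idx_to_page(const struct irec *irecs, int nlists, int idx,
			       int *page_idx)
	{
		int i;

		for (i = 0; i < nlists; i++) {
			if (idx < irecs[i].extoff + irecs[i].extcount) {
				*page_idx = idx - irecs[i].extoff;
				return i;
			}
		}
		return -1;	/* index past the last extent */
	}

	int main(void)
	{
		/* Three pages holding 256, 256 and 10 extents. */
		struct irec irecs[] = { {0, 256}, {256, 256}, {512, 10} };
		int off;
		int page = idx_to_page(irecs, 3, 300, &off);

		printf("extent 300 -> page %d, slot %d\n", page, off); /* page 1, slot 44 */
		return 0;
	}
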
+
+/*
+ * File incore extent information, present for each of data & attr forks.
+ */
+#define        XFS_IEXT_BUFSZ          4096
+#define        XFS_LINEAR_EXTS         (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
+#define        XFS_INLINE_EXTS         2
+#define        XFS_INLINE_DATA         32
+typedef struct xfs_ifork {
+       int                     if_bytes;       /* bytes in if_u1 */
+       int                     if_real_bytes;  /* bytes allocated in if_u1 */
+       struct xfs_btree_block  *if_broot;      /* file's incore btree root */
+       short                   if_broot_bytes; /* bytes allocated for root */
+       unsigned char           if_flags;       /* per-fork flags */
+       union {
+               xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
+               xfs_ext_irec_t  *if_ext_irec;   /* irec map file exts */
+               char            *if_data;       /* inline file data */
+       } if_u1;
+       union {
+               xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
+                                               /* very small file extents */
+               char            if_inline_data[XFS_INLINE_DATA];
+                                               /* very small file data */
+               xfs_dev_t       if_rdev;        /* dev number if special */
+               uuid_t          if_uuid;        /* mount point value */
+       } if_u2;
+} xfs_ifork_t;
+
+/*
+ * Per-fork incore inode flags.
+ */
+#define        XFS_IFINLINE    0x01    /* Inline data is read in */
+#define        XFS_IFEXTENTS   0x02    /* All extent pointers are read in */
+#define        XFS_IFBROOT     0x04    /* i_broot points to the bmap b-tree root */
+#define        XFS_IFEXTIREC   0x08    /* Indirection array of extent blocks */
+
+/*
+ * Fork handling.
+ */
+
+#define XFS_IFORK_Q(ip)                        ((ip)->i_d.di_forkoff != 0)
+#define XFS_IFORK_BOFF(ip)             ((int)((ip)->i_d.di_forkoff << 3))
+
+#define XFS_IFORK_PTR(ip,w)            \
+       ((w) == XFS_DATA_FORK ? \
+               &(ip)->i_df : \
+               (ip)->i_afp)
+#define XFS_IFORK_DSIZE(ip) \
+       (XFS_IFORK_Q(ip) ? \
+               XFS_IFORK_BOFF(ip) : \
+               XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
+#define XFS_IFORK_ASIZE(ip) \
+       (XFS_IFORK_Q(ip) ? \
+               XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
+                       XFS_IFORK_BOFF(ip) : \
+               0)
+#define XFS_IFORK_SIZE(ip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               XFS_IFORK_DSIZE(ip) : \
+               XFS_IFORK_ASIZE(ip))
+#define XFS_IFORK_FORMAT(ip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               (ip)->i_d.di_format : \
+               (ip)->i_d.di_aformat)
+#define XFS_IFORK_FMT_SET(ip,w,n) \
+       ((w) == XFS_DATA_FORK ? \
+               ((ip)->i_d.di_format = (n)) : \
+               ((ip)->i_d.di_aformat = (n)))
+#define XFS_IFORK_NEXTENTS(ip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               (ip)->i_d.di_nextents : \
+               (ip)->i_d.di_anextents)
+#define XFS_IFORK_NEXT_SET(ip,w,n) \
+       ((w) == XFS_DATA_FORK ? \
+               ((ip)->i_d.di_nextents = (n)) : \
+               ((ip)->i_d.di_anextents = (n)))
+#define XFS_IFORK_MAXEXT(ip, w) \
+       (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
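
The XFS_IFORK_DSIZE/ASIZE macros above split the inode's literal area at di_forkoff, which is stored in units of 8 bytes. A hedged arithmetic sketch; the literal-area size here is a made-up stand-in for XFS_LITINO():

	#include <stdio.h>

	#define LITINO	336	/* hypothetical literal area size in bytes */

	/* forkoff is in 8 byte units, exactly as in the macros above. */
	static void fork_sizes(unsigned char forkoff, int *dsize, int *asize)
	{
		if (forkoff == 0) {		/* no attr fork: data gets it all */
			*dsize = LITINO;
			*asize = 0;
		} else {
			*dsize = forkoff << 3;
			*asize = LITINO - (forkoff << 3);
		}
	}

	int main(void)
	{
		int d, a;

		fork_sizes(0, &d, &a);
		printf("no attr fork: data=%d attr=%d\n", d, a);	/* 336 / 0 */
		fork_sizes(15, &d, &a);
		printf("forkoff=15:   data=%d attr=%d\n", d, a);	/* 120 / 216 */
		return 0;
	}
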
+
+int            xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
+void           xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
+                               struct xfs_inode_log_item *, int);
+void           xfs_idestroy_fork(struct xfs_inode *, int);
+void           xfs_idata_realloc(struct xfs_inode *, int, int);
+void           xfs_iroot_realloc(struct xfs_inode *, int, int);
+int            xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
+int            xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
+                                 int);
+
+struct xfs_bmbt_rec_host *
+               xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
+void           xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t,
+                               struct xfs_bmbt_irec *, int);
+void           xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
+void           xfs_iext_add_indirect_multi(struct xfs_ifork *, int,
+                                           xfs_extnum_t, int);
+void           xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int);
+void           xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int);
+void           xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int);
+void           xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int);
+void           xfs_iext_realloc_direct(struct xfs_ifork *, int);
+void           xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t);
+void           xfs_iext_inline_to_direct(struct xfs_ifork *, int);
+void           xfs_iext_destroy(struct xfs_ifork *);
+struct xfs_bmbt_rec_host *
+               xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *);
+struct xfs_ext_irec *
+               xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *);
+struct xfs_ext_irec *
+               xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *,
+                                    int);
+void           xfs_iext_irec_init(struct xfs_ifork *);
+struct xfs_ext_irec *
+               xfs_iext_irec_new(struct xfs_ifork *, int);
+void           xfs_iext_irec_remove(struct xfs_ifork *, int);
+void           xfs_iext_irec_compact(struct xfs_ifork *);
+void           xfs_iext_irec_compact_pages(struct xfs_ifork *);
+void           xfs_iext_irec_compact_full(struct xfs_ifork *);
+void           xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
+
+extern struct kmem_zone        *xfs_ifork_zone;
+
+#endif /* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/libxfs/xfs_inum.h b/fs/xfs/libxfs/xfs_inum.h
new file mode 100644 (file)
index 0000000..4ff2278
--- /dev/null
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_INUM_H__
+#define        __XFS_INUM_H__
+
+/*
+ * Inode number format:
+ * low inopblog bits - offset in block
+ * next agblklog bits - block number in ag
+ * next agno_log bits - ag number
+ * high agno_log-agblklog-inopblog bits - 0
+ */
+
+struct xfs_mount;
+
+#define        XFS_INO_MASK(k)                 (__uint32_t)((1ULL << (k)) - 1)
+#define        XFS_INO_OFFSET_BITS(mp)         (mp)->m_sb.sb_inopblog
+#define        XFS_INO_AGBNO_BITS(mp)          (mp)->m_sb.sb_agblklog
+#define        XFS_INO_AGINO_BITS(mp)          (mp)->m_agino_log
+#define        XFS_INO_AGNO_BITS(mp)           (mp)->m_agno_log
+#define        XFS_INO_BITS(mp)                \
+       XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)
+#define        XFS_INO_TO_AGNO(mp,i)           \
+       ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
+#define        XFS_INO_TO_AGINO(mp,i)          \
+       ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
+#define        XFS_INO_TO_AGBNO(mp,i)          \
+       (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
+               XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
+#define        XFS_INO_TO_OFFSET(mp,i)         \
+       ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#define        XFS_INO_TO_FSB(mp,i)            \
+       XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
+#define        XFS_AGINO_TO_INO(mp,a,i)        \
+       (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
+#define        XFS_AGINO_TO_AGBNO(mp,i)        ((i) >> XFS_INO_OFFSET_BITS(mp))
+#define        XFS_AGINO_TO_OFFSET(mp,i)       \
+       ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#define        XFS_OFFBNO_TO_AGINO(mp,b,o)     \
+       ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
+
+#define        XFS_MAXINUMBER          ((xfs_ino_t)((1ULL << 56) - 1ULL))
+#define        XFS_MAXINUMBER_32       ((xfs_ino_t)((1ULL << 32) - 1ULL))
+
+#endif /* __XFS_INUM_H__ */
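
An inode number is just bit-packed fields, and every macro above is a shift-and-mask over them. A standalone sketch with made-up geometry (4 inodes per block, so the offset takes 2 bits, and 16 bits of block number per AG):

	#include <stdio.h>
	#include <stdint.h>

	/* Made-up geometry; stands in for sb_inopblog / sb_agblklog / m_agino_log. */
	#define INO_OFFSET_BITS	2	/* 4 inodes per block */
	#define INO_AGBNO_BITS	16	/* 64k blocks per AG */
	#define INO_AGINO_BITS	(INO_OFFSET_BITS + INO_AGBNO_BITS)

	#define INO_MASK(k)	((uint32_t)((1ULL << (k)) - 1))

	static uint64_t make_ino(uint32_t agno, uint32_t agbno, uint32_t offset)
	{
		uint32_t agino = (agbno << INO_OFFSET_BITS) | offset;

		return ((uint64_t)agno << INO_AGINO_BITS) | agino;
	}

	int main(void)
	{
		uint64_t ino = make_ino(3, 100, 2);

		/* Unpack it again, mirroring XFS_INO_TO_AGNO and friends. */
		printf("agno=%llu agbno=%llu offset=%llu\n",
		       (unsigned long long)(ino >> INO_AGINO_BITS),
		       (unsigned long long)((ino >> INO_OFFSET_BITS) &
					    INO_MASK(INO_AGBNO_BITS)),
		       (unsigned long long)(ino & INO_MASK(INO_OFFSET_BITS)));
		return 0;	/* prints agno=3 agbno=100 offset=2 */
	}
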
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
new file mode 100644 (file)
index 0000000..aff12f2
--- /dev/null
@@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef        __XFS_LOG_FORMAT_H__
+#define __XFS_LOG_FORMAT_H__
+
+struct xfs_mount;
+struct xfs_trans_res;
+
+/*
+ * On-disk Log Format definitions.
+ *
+ * This file contains all the on-disk format definitions used within the log. It
+ * includes the physical log structure itself, as well as all the log item
+ * format structures that are written into the log and interpreted by log
+ * recovery. We start with the physical log format definitions, and then work
+ * through all the log item definitions and everything they encode into the
+ * log.
+ */
+typedef __uint32_t xlog_tid_t;
+
+#define XLOG_MIN_ICLOGS                2
+#define XLOG_MAX_ICLOGS                8
+#define XLOG_HEADER_MAGIC_NUM  0xFEEDbabe      /* Invalid cycle number */
+#define XLOG_VERSION_1         1
+#define XLOG_VERSION_2         2               /* Large IClogs, Log sunit */
+#define XLOG_VERSION_OKBITS    (XLOG_VERSION_1 | XLOG_VERSION_2)
+#define XLOG_MIN_RECORD_BSIZE  (16*1024)       /* eventually 32k */
+#define XLOG_BIG_RECORD_BSIZE  (32*1024)       /* 32k buffers */
+#define XLOG_MAX_RECORD_BSIZE  (256*1024)
+#define XLOG_HEADER_CYCLE_SIZE (32*1024)       /* cycle data in header */
+#define XLOG_MIN_RECORD_BSHIFT 14              /* 16384 == 1 << 14 */
+#define XLOG_BIG_RECORD_BSHIFT 15              /* 32k == 1 << 15 */
+#define XLOG_MAX_RECORD_BSHIFT 18              /* 256k == 1 << 18 */
+#define XLOG_BTOLSUNIT(log, b)  (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
+                                 (log)->l_mp->m_sb.sb_logsunit)
+#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
+
+#define XLOG_HEADER_SIZE       512
+
+/* Minimum number of transactions that must fit in the log (defined by mkfs) */
+#define XFS_MIN_LOG_FACTOR     3
+
+#define XLOG_REC_SHIFT(log) \
+       BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+        XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+#define XLOG_TOTAL_REC_SHIFT(log) \
+       BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+        XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+
+/* get lsn fields */
+#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
+#define BLOCK_LSN(lsn) ((uint)(lsn))
+
+/* this is used in a spot where we might otherwise double-endian-flip */
+#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
+
+static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
+{
+       return ((xfs_lsn_t)cycle << 32) | block;
+}
+
+static inline uint xlog_get_cycle(char *ptr)
+{
+       if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
+               return be32_to_cpu(*((__be32 *)ptr + 1));
+       else
+               return be32_to_cpu(*(__be32 *)ptr);
+}
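
An LSN is simply a cycle number in the high 32 bits and a block number in the low 32. A quick sketch of xlog_assign_lsn() together with the CYCLE_LSN()/BLOCK_LSN() extraction:

	#include <stdio.h>
	#include <stdint.h>

	typedef int64_t lsn_t;	/* stands in for xfs_lsn_t */

	static lsn_t assign_lsn(uint32_t cycle, uint32_t block)
	{
		return ((lsn_t)cycle << 32) | block;
	}

	int main(void)
	{
		lsn_t lsn = assign_lsn(7, 1234);

		/* Split it back apart, mirroring CYCLE_LSN() and BLOCK_LSN(). */
		printf("cycle=%u block=%u\n",
		       (uint32_t)(lsn >> 32), (uint32_t)lsn);	/* 7 and 1234 */
		return 0;
	}
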
+
+/* Log Clients */
+#define XFS_TRANSACTION                0x69
+#define XFS_VOLUME             0x2
+#define XFS_LOG                        0xaa
+
+#define XLOG_UNMOUNT_TYPE      0x556e  /* Un for Unmount */
+
+/* Region types for iovec's i_type */
+#define XLOG_REG_TYPE_BFORMAT          1
+#define XLOG_REG_TYPE_BCHUNK           2
+#define XLOG_REG_TYPE_EFI_FORMAT       3
+#define XLOG_REG_TYPE_EFD_FORMAT       4
+#define XLOG_REG_TYPE_IFORMAT          5
+#define XLOG_REG_TYPE_ICORE            6
+#define XLOG_REG_TYPE_IEXT             7
+#define XLOG_REG_TYPE_IBROOT           8
+#define XLOG_REG_TYPE_ILOCAL           9
+#define XLOG_REG_TYPE_IATTR_EXT                10
+#define XLOG_REG_TYPE_IATTR_BROOT      11
+#define XLOG_REG_TYPE_IATTR_LOCAL      12
+#define XLOG_REG_TYPE_QFORMAT          13
+#define XLOG_REG_TYPE_DQUOT            14
+#define XLOG_REG_TYPE_QUOTAOFF         15
+#define XLOG_REG_TYPE_LRHEADER         16
+#define XLOG_REG_TYPE_UNMOUNT          17
+#define XLOG_REG_TYPE_COMMIT           18
+#define XLOG_REG_TYPE_TRANSHDR         19
+#define XLOG_REG_TYPE_ICREATE          20
+#define XLOG_REG_TYPE_MAX              20
+
+/*
+ * Flags to log operation header
+ *
+ * The first write of a new transaction will be preceded by a start
+ * record, XLOG_START_TRANS.  Once a transaction is committed, a commit
+ * record is written, XLOG_COMMIT_TRANS.  If a single region cannot fit into
+ * the remainder of the current active in-core log, it is split up into
+ * multiple regions.  Each partial region will be marked with
+ * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
+ *
+ */
+#define XLOG_START_TRANS       0x01    /* Start a new transaction */
+#define XLOG_COMMIT_TRANS      0x02    /* Commit this transaction */
+#define XLOG_CONTINUE_TRANS    0x04    /* Cont this trans into new region */
+#define XLOG_WAS_CONT_TRANS    0x08    /* Cont this trans into new region */
+#define XLOG_END_TRANS         0x10    /* End a continued transaction */
+#define XLOG_UNMOUNT_TRANS     0x20    /* Unmount a filesystem transaction */
+
+
+typedef struct xlog_op_header {
+       __be32     oh_tid;      /* transaction id of operation  :  4 b */
+       __be32     oh_len;      /* bytes in data region         :  4 b */
+       __u8       oh_clientid; /* who sent me this             :  1 b */
+       __u8       oh_flags;    /*                              :  1 b */
+       __u16      oh_res2;     /* 32 bit align                 :  2 b */
+} xlog_op_header_t;
+
+/* valid values for h_fmt */
+#define XLOG_FMT_UNKNOWN  0
+#define XLOG_FMT_LINUX_LE 1
+#define XLOG_FMT_LINUX_BE 2
+#define XLOG_FMT_IRIX_BE  3
+
+/* our fmt */
+#ifdef XFS_NATIVE_HOST
+#define XLOG_FMT XLOG_FMT_LINUX_BE
+#else
+#define XLOG_FMT XLOG_FMT_LINUX_LE
+#endif
+
+typedef struct xlog_rec_header {
+       __be32    h_magicno;    /* log record (LR) identifier           :  4 */
+       __be32    h_cycle;      /* write cycle of log                   :  4 */
+       __be32    h_version;    /* LR version                           :  4 */
+       __be32    h_len;        /* len in bytes; should be 64-bit aligned: 4 */
+       __be64    h_lsn;        /* lsn of this LR                       :  8 */
+       __be64    h_tail_lsn;   /* lsn of 1st LR w/ buffers not committed: 8 */
+       __le32    h_crc;        /* crc of log record                    :  4 */
+       __be32    h_prev_block; /* block number to previous LR          :  4 */
+       __be32    h_num_logops; /* number of log operations in this LR  :  4 */
+       __be32    h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
+       /* new fields */
+       __be32    h_fmt;        /* format of log record                 :  4 */
+       uuid_t    h_fs_uuid;    /* uuid of FS                           : 16 */
+       __be32    h_size;       /* iclog size                           :  4 */
+} xlog_rec_header_t;
+
+typedef struct xlog_rec_ext_header {
+       __be32    xh_cycle;     /* write cycle of log                   : 4 */
+       __be32    xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /*    : 256 */
+} xlog_rec_ext_header_t;
+
+/*
+ * Quite misnamed, because this union lays out the actual on-disk log buffer.
+ */
+typedef union xlog_in_core2 {
+       xlog_rec_header_t       hic_header;
+       xlog_rec_ext_header_t   hic_xheader;
+       char                    hic_sector[XLOG_HEADER_SIZE];
+} xlog_in_core_2_t;
+
+/* not an on-disk structure, but needed by log recovery in userspace */
+typedef struct xfs_log_iovec {
+       void            *i_addr;        /* beginning address of region */
+       int             i_len;          /* length in bytes of region */
+       uint            i_type;         /* type of region */
+} xfs_log_iovec_t;
+
+
+/*
+ * Transaction Header definitions.
+ *
+ * This is the structure written in the log at the head of every transaction. It
+ * identifies the type and id of the transaction, and contains the number of
+ * items logged by the transaction so we know how many to expect during
+ * recovery.
+ *
+ * Do not change the below structure without redoing the code in
+ * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
+ */
+typedef struct xfs_trans_header {
+       uint            th_magic;               /* magic number */
+       uint            th_type;                /* transaction type */
+       __int32_t       th_tid;                 /* transaction id (unused) */
+       uint            th_num_items;           /* num items logged by trans */
+} xfs_trans_header_t;
+
+#define        XFS_TRANS_HEADER_MAGIC  0x5452414e      /* TRAN */
+
+/*
+ * Log item types.
+ */
+#define        XFS_LI_EFI              0x1236
+#define        XFS_LI_EFD              0x1237
+#define        XFS_LI_IUNLINK          0x1238
+#define        XFS_LI_INODE            0x123b  /* aligned ino chunks, var-size ibufs */
+#define        XFS_LI_BUF              0x123c  /* v2 bufs, variable sized inode bufs */
+#define        XFS_LI_DQUOT            0x123d
+#define        XFS_LI_QUOTAOFF         0x123e
+#define        XFS_LI_ICREATE          0x123f
+
+#define XFS_LI_TYPE_DESC \
+       { XFS_LI_EFI,           "XFS_LI_EFI" }, \
+       { XFS_LI_EFD,           "XFS_LI_EFD" }, \
+       { XFS_LI_IUNLINK,       "XFS_LI_IUNLINK" }, \
+       { XFS_LI_INODE,         "XFS_LI_INODE" }, \
+       { XFS_LI_BUF,           "XFS_LI_BUF" }, \
+       { XFS_LI_DQUOT,         "XFS_LI_DQUOT" }, \
+       { XFS_LI_QUOTAOFF,      "XFS_LI_QUOTAOFF" }, \
+       { XFS_LI_ICREATE,       "XFS_LI_ICREATE" }
+
+/*
+ * Inode Log Item Format definitions.
+ *
+ * This is the structure used to lay out an inode log item in the
+ * log.  The size of the inline data/extents/b-tree root to be logged
+ * (if any) is indicated in the ilf_dsize field.  Changes to this structure
+ * must be added on to the end.
+ */
+typedef struct xfs_inode_log_format {
+       __uint16_t              ilf_type;       /* inode log item type */
+       __uint16_t              ilf_size;       /* size of this item */
+       __uint32_t              ilf_fields;     /* flags for fields logged */
+       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
+       __uint16_t              ilf_dsize;      /* size of data/ext/root */
+       __uint64_t              ilf_ino;        /* inode number */
+       union {
+               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
+               uuid_t          ilfu_uuid;      /* mount point value */
+       } ilf_u;
+       __int64_t               ilf_blkno;      /* blkno of inode buffer */
+       __int32_t               ilf_len;        /* len of inode buffer */
+       __int32_t               ilf_boffset;    /* off of inode in buffer */
+} xfs_inode_log_format_t;
+
+typedef struct xfs_inode_log_format_32 {
+       __uint16_t              ilf_type;       /* inode log item type */
+       __uint16_t              ilf_size;       /* size of this item */
+       __uint32_t              ilf_fields;     /* flags for fields logged */
+       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
+       __uint16_t              ilf_dsize;      /* size of data/ext/root */
+       __uint64_t              ilf_ino;        /* inode number */
+       union {
+               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
+               uuid_t          ilfu_uuid;      /* mount point value */
+       } ilf_u;
+       __int64_t               ilf_blkno;      /* blkno of inode buffer */
+       __int32_t               ilf_len;        /* len of inode buffer */
+       __int32_t               ilf_boffset;    /* off of inode in buffer */
+} __attribute__((packed)) xfs_inode_log_format_32_t;
+
+typedef struct xfs_inode_log_format_64 {
+       __uint16_t              ilf_type;       /* inode log item type */
+       __uint16_t              ilf_size;       /* size of this item */
+       __uint32_t              ilf_fields;     /* flags for fields logged */
+       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
+       __uint16_t              ilf_dsize;      /* size of data/ext/root */
+       __uint32_t              ilf_pad;        /* pad for 64 bit boundary */
+       __uint64_t              ilf_ino;        /* inode number */
+       union {
+               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
+               uuid_t          ilfu_uuid;      /* mount point value */
+       } ilf_u;
+       __int64_t               ilf_blkno;      /* blkno of inode buffer */
+       __int32_t               ilf_len;        /* len of inode buffer */
+       __int32_t               ilf_boffset;    /* off of inode in buffer */
+} xfs_inode_log_format_64_t;
+
+/*
+ * Flags for xfs_trans_log_inode flags field.
+ */
+#define        XFS_ILOG_CORE   0x001   /* log standard inode fields */
+#define        XFS_ILOG_DDATA  0x002   /* log i_df.if_data */
+#define        XFS_ILOG_DEXT   0x004   /* log i_df.if_extents */
+#define        XFS_ILOG_DBROOT 0x008   /* log i_df.i_broot */
+#define        XFS_ILOG_DEV    0x010   /* log the dev field */
+#define        XFS_ILOG_UUID   0x020   /* log the uuid field */
+#define        XFS_ILOG_ADATA  0x040   /* log i_af.if_data */
+#define        XFS_ILOG_AEXT   0x080   /* log i_af.if_extents */
+#define        XFS_ILOG_ABROOT 0x100   /* log i_af.i_broot */
+#define XFS_ILOG_DOWNER        0x200   /* change the data fork owner on replay */
+#define XFS_ILOG_AOWNER        0x400   /* change the attr fork owner on replay */
+
+
+/*
+ * The timestamps are dirty, but not necessarily anything else in the inode
+ * core.  Unlike the other fields above, this one must never make it to disk
+ * in the ilf_fields of the inode_log_format, but is purely stored in-memory in
+ * ili_fields in the inode_log_item.
+ */
+#define XFS_ILOG_TIMESTAMP     0x4000
+
+#define        XFS_ILOG_NONCORE        (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+                                XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
+                                XFS_ILOG_UUID | XFS_ILOG_ADATA | \
+                                XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
+                                XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
+
+#define        XFS_ILOG_DFORK          (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+                                XFS_ILOG_DBROOT)
+
+#define        XFS_ILOG_AFORK          (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+                                XFS_ILOG_ABROOT)
+
+#define        XFS_ILOG_ALL            (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
+                                XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
+                                XFS_ILOG_DEV | XFS_ILOG_UUID | \
+                                XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+                                XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \
+                                XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
+
+static inline int xfs_ilog_fbroot(int w)
+{
+       return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
+}
+
+static inline int xfs_ilog_fext(int w)
+{
+       return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
+}
+
+static inline int xfs_ilog_fdata(int w)
+{
+       return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
+}
+
+/*
+ * Incore version of the on-disk inode core structures. We log this directly
+ * into the journal in host CPU format (for better or worse), and as such it
+ * directly mirrors the xfs_dinode structure, as it must contain all the same
+ * information.
+ */
+typedef struct xfs_ictimestamp {
+       __int32_t       t_sec;          /* timestamp seconds */
+       __int32_t       t_nsec;         /* timestamp nanoseconds */
+} xfs_ictimestamp_t;
+
+/*
+ * NOTE:  This structure must be kept identical to struct xfs_dinode
+ *       in xfs_dinode.h except for the endianness annotations.
+ */
+typedef struct xfs_icdinode {
+       __uint16_t      di_magic;       /* inode magic # = XFS_DINODE_MAGIC */
+       __uint16_t      di_mode;        /* mode and type of file */
+       __int8_t        di_version;     /* inode version */
+       __int8_t        di_format;      /* format of di_c data */
+       __uint16_t      di_onlink;      /* old number of links to file */
+       __uint32_t      di_uid;         /* owner's user id */
+       __uint32_t      di_gid;         /* owner's group id */
+       __uint32_t      di_nlink;       /* number of links to file */
+       __uint16_t      di_projid_lo;   /* lower part of owner's project id */
+       __uint16_t      di_projid_hi;   /* higher part of owner's project id */
+       __uint8_t       di_pad[6];      /* unused, zeroed space */
+       __uint16_t      di_flushiter;   /* incremented on flush */
+       xfs_ictimestamp_t di_atime;     /* time last accessed */
+       xfs_ictimestamp_t di_mtime;     /* time last modified */
+       xfs_ictimestamp_t di_ctime;     /* time created/inode modified */
+       xfs_fsize_t     di_size;        /* number of bytes in file */
+       xfs_rfsblock_t  di_nblocks;     /* # of direct & btree blocks used */
+       xfs_extlen_t    di_extsize;     /* basic/minimum extent size for file */
+       xfs_extnum_t    di_nextents;    /* number of extents in data fork */
+       xfs_aextnum_t   di_anextents;   /* number of extents in attribute fork*/
+       __uint8_t       di_forkoff;     /* attr fork offs, <<3 for 64b align */
+       __int8_t        di_aformat;     /* format of attr fork's data */
+       __uint32_t      di_dmevmask;    /* DMIG event mask */
+       __uint16_t      di_dmstate;     /* DMIG state info */
+       __uint16_t      di_flags;       /* random flags, XFS_DIFLAG_... */
+       __uint32_t      di_gen;         /* generation number */
+
+       /* di_next_unlinked is the only non-core field in the old dinode */
+       xfs_agino_t     di_next_unlinked;/* agi unlinked list ptr */
+
+       /* start of the extended dinode, writable fields */
+       __uint32_t      di_crc;         /* CRC of the inode */
+       __uint64_t      di_changecount; /* number of attribute changes */
+       xfs_lsn_t       di_lsn;         /* flush sequence */
+       __uint64_t      di_flags2;      /* more random flags */
+       __uint8_t       di_pad2[16];    /* more padding for future expansion */
+
+       /* fields only written to during inode creation */
+       xfs_ictimestamp_t di_crtime;    /* time created */
+       xfs_ino_t       di_ino;         /* inode number */
+       uuid_t          di_uuid;        /* UUID of the filesystem */
+
+       /* structure must be padded to 64 bit alignment */
+} xfs_icdinode_t;
+
+static inline uint xfs_icdinode_size(int version)
+{
+       if (version == 3)
+               return sizeof(struct xfs_icdinode);
+       return offsetof(struct xfs_icdinode, di_next_unlinked);
+}
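
The size helper above is a common pattern for versioned structures: older versions end at a known member, so offsetof() yields their size without a second struct definition. A tiny hedged illustration with a hypothetical record type:

	#include <stdio.h>
	#include <stddef.h>
	#include <stdint.h>

	/* Hypothetical versioned record: v1 ends before the crc field. */
	struct rec {
		uint32_t a;
		uint32_t b;
		/* --- end of v1 --- */
		uint32_t crc;	/* v2 only */
	};

	static size_t rec_size(int version)
	{
		if (version == 2)
			return sizeof(struct rec);
		return offsetof(struct rec, crc);	/* v1: everything before crc */
	}

	int main(void)
	{
		printf("v1=%zu v2=%zu\n", rec_size(1), rec_size(2));	/* 8 and 12 */
		return 0;
	}
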
+
+/*
+ * Buffer Log Format definitions
+ *
+ * These are the physical dirty bitmap definitions for the log format structure.
+ */
+#define        XFS_BLF_CHUNK           128
+#define        XFS_BLF_SHIFT           7
+#define        BIT_TO_WORD_SHIFT       5
+#define        NBWORD                  (NBBY * sizeof(unsigned int))
+
+/*
+ * This flag indicates that the buffer contains on disk inodes
+ * and requires special recovery handling.
+ */
+#define        XFS_BLF_INODE_BUF       (1<<0)
+
+/*
+ * This flag indicates that the buffer should not be replayed
+ * during recovery because its blocks are being freed.
+ */
+#define        XFS_BLF_CANCEL          (1<<1)
+
+/*
+ * This flag indicates that the buffer contains on disk
+ * user or group dquots and may require special recovery handling.
+ */
+#define        XFS_BLF_UDQUOT_BUF      (1<<2)
+#define XFS_BLF_PDQUOT_BUF     (1<<3)
+#define        XFS_BLF_GDQUOT_BUF      (1<<4)
+
+/*
+ * This is the structure used to lay out a buf log item in the
+ * log.  The data map describes which 128 byte chunks of the buffer
+ * have been logged.
+ */
+#define XFS_BLF_DATAMAP_SIZE   ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
+
+typedef struct xfs_buf_log_format {
+       unsigned short  blf_type;       /* buf log item type indicator */
+       unsigned short  blf_size;       /* size of this item */
+       ushort          blf_flags;      /* misc state */
+       ushort          blf_len;        /* number of blocks in this buf */
+       __int64_t       blf_blkno;      /* starting blkno of this buf */
+       unsigned int    blf_map_size;   /* used size of data bitmap in words */
+       unsigned int    blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
+} xfs_buf_log_format_t;
+
+/*
+ * All buffers now need to tell recovery where the magic number
+ * is so that it can verify and calculate the CRCs on the buffer correctly
+ * once the changes have been replayed into the buffer.
+ *
+ * The type value is held in the upper 5 bits of the blf_flags field, which is
+ * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
+ */
+#define XFS_BLFT_BITS  5
+#define XFS_BLFT_SHIFT 11
+#define XFS_BLFT_MASK  (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
+
+enum xfs_blft {
+       XFS_BLFT_UNKNOWN_BUF = 0,
+       XFS_BLFT_UDQUOT_BUF,
+       XFS_BLFT_PDQUOT_BUF,
+       XFS_BLFT_GDQUOT_BUF,
+       XFS_BLFT_BTREE_BUF,
+       XFS_BLFT_AGF_BUF,
+       XFS_BLFT_AGFL_BUF,
+       XFS_BLFT_AGI_BUF,
+       XFS_BLFT_DINO_BUF,
+       XFS_BLFT_SYMLINK_BUF,
+       XFS_BLFT_DIR_BLOCK_BUF,
+       XFS_BLFT_DIR_DATA_BUF,
+       XFS_BLFT_DIR_FREE_BUF,
+       XFS_BLFT_DIR_LEAF1_BUF,
+       XFS_BLFT_DIR_LEAFN_BUF,
+       XFS_BLFT_DA_NODE_BUF,
+       XFS_BLFT_ATTR_LEAF_BUF,
+       XFS_BLFT_ATTR_RMT_BUF,
+       XFS_BLFT_SB_BUF,
+       XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
+};
+
+static inline void
+xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
+{
+       ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
+       blf->blf_flags &= ~XFS_BLFT_MASK;
+       blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
+}
+
+static inline __uint16_t
+xfs_blft_from_flags(struct xfs_buf_log_format *blf)
+{
+       return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
+}
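
Packing the buffer type into the top five bits of blf_flags is ordinary mask-and-shift work. A standalone sketch of the two helpers above, using the same bit layout:

	#include <stdio.h>
	#include <stdint.h>

	#define BLFT_BITS	5
	#define BLFT_SHIFT	11
	#define BLFT_MASK	(((1 << BLFT_BITS) - 1) << BLFT_SHIFT)

	static uint16_t set_type(uint16_t flags, unsigned int type)
	{
		flags &= ~BLFT_MASK;			/* clear the old type */
		flags |= (type << BLFT_SHIFT) & BLFT_MASK;
		return flags;
	}

	static unsigned int get_type(uint16_t flags)
	{
		return (flags & BLFT_MASK) >> BLFT_SHIFT;
	}

	int main(void)
	{
		uint16_t flags = 0x0005;	/* some unrelated low bits set */

		flags = set_type(flags, 8);	/* 8 == XFS_BLFT_DINO_BUF above */
		printf("flags=0x%04x type=%u\n", flags, get_type(flags));
		return 0;	/* prints flags=0x4005 type=8 */
	}
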
+
+/*
+ * EFI/EFD log format definitions
+ */
+typedef struct xfs_extent {
+       xfs_fsblock_t   ext_start;
+       xfs_extlen_t    ext_len;
+} xfs_extent_t;
+
+/*
+ * Since an xfs_extent_t has a 64 bit start and a 32 bit length,
+ * there are different alignments on 32 bit and 64 bit kernels.
+ * So we provide the different variants for use by a
+ * conversion routine.
+ */
+typedef struct xfs_extent_32 {
+       __uint64_t      ext_start;
+       __uint32_t      ext_len;
+} __attribute__((packed)) xfs_extent_32_t;
+
+typedef struct xfs_extent_64 {
+       __uint64_t      ext_start;
+       __uint32_t      ext_len;
+       __uint32_t      ext_pad;
+} xfs_extent_64_t;
+
+/*
+ * This is the structure used to lay out an efi log item in the
+ * log.  The efi_extents field is a variable size array whose
+ * size is given by efi_nextents.
+ */
+typedef struct xfs_efi_log_format {
+       __uint16_t              efi_type;       /* efi log item type */
+       __uint16_t              efi_size;       /* size of this item */
+       __uint32_t              efi_nextents;   /* # extents to free */
+       __uint64_t              efi_id;         /* efi identifier */
+       xfs_extent_t            efi_extents[1]; /* array of extents to free */
+} xfs_efi_log_format_t;
+
+typedef struct xfs_efi_log_format_32 {
+       __uint16_t              efi_type;       /* efi log item type */
+       __uint16_t              efi_size;       /* size of this item */
+       __uint32_t              efi_nextents;   /* # extents to free */
+       __uint64_t              efi_id;         /* efi identifier */
+       xfs_extent_32_t         efi_extents[1]; /* array of extents to free */
+} __attribute__((packed)) xfs_efi_log_format_32_t;
+
+typedef struct xfs_efi_log_format_64 {
+       __uint16_t              efi_type;       /* efi log item type */
+       __uint16_t              efi_size;       /* size of this item */
+       __uint32_t              efi_nextents;   /* # extents to free */
+       __uint64_t              efi_id;         /* efi identifier */
+       xfs_extent_64_t         efi_extents[1]; /* array of extents to free */
+} xfs_efi_log_format_64_t;
+
+/*
+ * This is the structure used to lay out an efd log item in the
+ * log.  The efd_extents array is a variable size array whose
+ * size is given by efd_nextents.
+ */
+typedef struct xfs_efd_log_format {
+       __uint16_t              efd_type;       /* efd log item type */
+       __uint16_t              efd_size;       /* size of this item */
+       __uint32_t              efd_nextents;   /* # of extents freed */
+       __uint64_t              efd_efi_id;     /* id of corresponding efi */
+       xfs_extent_t            efd_extents[1]; /* array of extents freed */
+} xfs_efd_log_format_t;
+
+typedef struct xfs_efd_log_format_32 {
+       __uint16_t              efd_type;       /* efd log item type */
+       __uint16_t              efd_size;       /* size of this item */
+       __uint32_t              efd_nextents;   /* # of extents freed */
+       __uint64_t              efd_efi_id;     /* id of corresponding efi */
+       xfs_extent_32_t         efd_extents[1]; /* array of extents freed */
+} __attribute__((packed)) xfs_efd_log_format_32_t;
+
+typedef struct xfs_efd_log_format_64 {
+       __uint16_t              efd_type;       /* efd log item type */
+       __uint16_t              efd_size;       /* size of this item */
+       __uint32_t              efd_nextents;   /* # of extents freed */
+       __uint64_t              efd_efi_id;     /* id of corresponding efi */
+       xfs_extent_64_t         efd_extents[1]; /* array of extents freed */
+} xfs_efd_log_format_64_t;
+
+/*
+ * Dquot Log format definitions.
+ *
+ * The first two fields must be the type and size, fitting into
+ * 32 bits: the log recovery code assumes that.
+ */
+typedef struct xfs_dq_logformat {
+       __uint16_t              qlf_type;      /* dquot log item type */
+       __uint16_t              qlf_size;      /* size of this item */
+       xfs_dqid_t              qlf_id;        /* usr/grp/proj id : 32 bits */
+       __int64_t               qlf_blkno;     /* blkno of dquot buffer */
+       __int32_t               qlf_len;       /* len of dquot buffer */
+       __uint32_t              qlf_boffset;   /* off of dquot in buffer */
+} xfs_dq_logformat_t;
+
+/*
+ * Log format struct for QUOTAOFF records.
+ * The first two fields must be the type and size, fitting into
+ * 32 bits: the log recovery code assumes that.
+ * We write two LI_QUOTAOFF logitems per quotaoff; the last one keeps a pointer
+ * to the first and ensures that the first logitem is taken out of the AIL
+ * only when the last one is securely committed.
+ */
+typedef struct xfs_qoff_logformat {
+       unsigned short          qf_type;        /* quotaoff log item type */
+       unsigned short          qf_size;        /* size of this item */
+       unsigned int            qf_flags;       /* USR and/or GRP */
+       char                    qf_pad[12];     /* padding for future */
+} xfs_qoff_logformat_t;
+
+/*
+ * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
+ */
+#define XFS_UQUOTA_ACCT        0x0001  /* user quota accounting ON */
+#define XFS_UQUOTA_ENFD        0x0002  /* user quota limits enforced */
+#define XFS_UQUOTA_CHKD        0x0004  /* quotacheck run on usr quotas */
+#define XFS_PQUOTA_ACCT        0x0008  /* project quota accounting ON */
+#define XFS_OQUOTA_ENFD        0x0010  /* other (grp/prj) quota limits enforced */
+#define XFS_OQUOTA_CHKD        0x0020  /* quotacheck run on other (grp/prj) quotas */
+#define XFS_GQUOTA_ACCT        0x0040  /* group quota accounting ON */
+
+/*
+ * Conversion to and from the combined OQUOTA flag (if necessary)
+ * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
+ */
+#define XFS_GQUOTA_ENFD        0x0080  /* group quota limits enforced */
+#define XFS_GQUOTA_CHKD        0x0100  /* quotacheck run on group quotas */
+#define XFS_PQUOTA_ENFD        0x0200  /* project quota limits enforced */
+#define XFS_PQUOTA_CHKD        0x0400  /* quotacheck run on project quotas */
+
+#define XFS_ALL_QUOTA_ACCT     \
+               (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
+#define XFS_ALL_QUOTA_ENFD     \
+               (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD     \
+               (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
+
+#define XFS_MOUNT_QUOTA_ALL    (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
+                                XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+                                XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
+                                XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
+                                XFS_PQUOTA_CHKD)
+
+/*
+ * Inode create log item structure
+ *
+ * Log recovery assumes the first two entries are the type and size and that
+ * they fit in 32 bits. They are also in host order (ugh), so they have to be
+ * 32 bit aligned for decoding to be done correctly.
+ */
+struct xfs_icreate_log {
+       __uint16_t      icl_type;       /* type of log format structure */
+       __uint16_t      icl_size;       /* size of log format structure */
+       __be32          icl_ag;         /* ag being allocated in */
+       __be32          icl_agbno;      /* start block of inode range */
+       __be32          icl_count;      /* number of inodes to initialise */
+       __be32          icl_isize;      /* size of inodes */
+       __be32          icl_length;     /* length of extent to initialise */
+       __be32          icl_gen;        /* inode generation number to use */
+};
+
+#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
new file mode 100644 (file)
index 0000000..1c55ccb
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef        __XFS_LOG_RECOVER_H__
+#define __XFS_LOG_RECOVER_H__
+
+/*
+ * Macros, structures, prototypes for internal log manager use.
+ */
+
+#define XLOG_RHASH_BITS  4
+#define XLOG_RHASH_SIZE        16
+#define XLOG_RHASH_SHIFT 2
+#define XLOG_RHASH(tid)        \
+       ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
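
The recovery hash is a simple shift-and-mask bucket function over the transaction id. A quick sketch with simplified types:

	#include <stdio.h>
	#include <stdint.h>

	#define RHASH_SIZE	16
	#define RHASH_SHIFT	2

	/* Bucket a transaction id, mirroring the XLOG_RHASH() macro above. */
	static unsigned int rhash(uint32_t tid)
	{
		return (tid >> RHASH_SHIFT) & (RHASH_SIZE - 1);
	}

	int main(void)
	{
		printf("%u %u %u\n", rhash(0x10), rhash(0x14), rhash(0x50));
		/* 4, 5, 4: ids 0x10 and 0x50 collide in bucket 4 */
		return 0;
	}
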
+
+#define XLOG_MAX_REGIONS_IN_ITEM   (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
+
+
+/*
+ * Item headers are in ri_buf[0].  Additional buffers follow.
+ */
+typedef struct xlog_recover_item {
+       struct list_head        ri_list;
+       int                     ri_type;
+       int                     ri_cnt; /* count of regions found */
+       int                     ri_total;       /* total regions */
+       xfs_log_iovec_t         *ri_buf;        /* ptr to regions buffer */
+} xlog_recover_item_t;
+
+struct xlog_tid;
+typedef struct xlog_recover {
+       struct hlist_node       r_list;
+       xlog_tid_t              r_log_tid;      /* log's transaction id */
+       xfs_trans_header_t      r_theader;      /* trans header for partial */
+       int                     r_state;        /* not needed */
+       xfs_lsn_t               r_lsn;          /* xact lsn */
+       struct list_head        r_itemq;        /* q for items */
+} xlog_recover_t;
+
+#define ITEM_TYPE(i)   (*(ushort *)(i)->ri_buf[0].i_addr)
+
+/*
+ * This is the number of entries in the l_buf_cancel_table used during
+ * recovery.
+ */
+#define        XLOG_BC_TABLE_SIZE      64
+
+#define        XLOG_RECOVER_PASS1      1
+#define        XLOG_RECOVER_PASS2      2
+
+#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
new file mode 100644 (file)
index 0000000..ee7e0e8
--- /dev/null
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2013 Jie Liu.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_trans_space.h"
+#include "xfs_inode.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_bmap_btree.h"
+
+/*
+ * Calculate the maximum length in bytes that would be required for a local
+ * attribute value, since large out-of-line attributes are not logged.
+ */
+STATIC int
+xfs_log_calc_max_attrsetm_res(
+       struct xfs_mount        *mp)
+{
+       int                     size;
+       int                     nblks;
+
+       size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) -
+              MAXNAMELEN - 1;
+       nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+       nblks += XFS_B_TO_FSB(mp, size);
+       nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK);
+
+       return  M_RES(mp)->tr_attrsetm.tr_logres +
+               M_RES(mp)->tr_attrsetrt.tr_logres * nblks;
+}
+
+/*
+ * Iterate over the log space reservation table and return the largest
+ * reservation, based on the values pre-calculated at mount time.
+ */
+STATIC void
+xfs_log_get_max_trans_res(
+       struct xfs_mount        *mp,
+       struct xfs_trans_res    *max_resp)
+{
+       struct xfs_trans_res    *resp;
+       struct xfs_trans_res    *end_resp;
+       int                     log_space = 0;
+       int                     attr_space;
+
+       attr_space = xfs_log_calc_max_attrsetm_res(mp);
+
+       resp = (struct xfs_trans_res *)M_RES(mp);
+       end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
+       for (; resp < end_resp; resp++) {
+               int             tmp = resp->tr_logcount > 1 ?
+                                     resp->tr_logres * resp->tr_logcount :
+                                     resp->tr_logres;
+               if (log_space < tmp) {
+                       log_space = tmp;
+                       *max_resp = *resp;              /* struct copy */
+               }
+       }
+
+       if (attr_space > log_space) {
+               *max_resp = M_RES(mp)->tr_attrsetm;     /* struct copy */
+               max_resp->tr_logres = attr_space;
+       }
+}
+
+/*
+ * Calculate the minimum valid log size for the given superblock configuration.
+ * Used to calculate the minimum log size at mkfs time, and to determine if
+ * the log is large enough or not at mount time. Returns the minimum size in
+ * filesystem block size units.
+ */
+int
+xfs_log_calc_minimum_size(
+       struct xfs_mount        *mp)
+{
+       struct xfs_trans_res    tres = {0};
+       int                     max_logres;
+       int                     min_logblks = 0;
+       int                     lsunit = 0;
+
+       xfs_log_get_max_trans_res(mp, &tres);
+
+       max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres);
+       if (tres.tr_logcount > 1)
+               max_logres *= tres.tr_logcount;
+
+       if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
+               lsunit = BTOBB(mp->m_sb.sb_logsunit);
+
+       /*
+        * Two factors should be taken into account for calculating the minimum
+        * log space.
+        * 1) The fundamental limitation is that no single transaction can be
+        *    larger than half size of the log.
+        *
+        *    From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR
+        *    define, which is set to 3. That means we can definitely fit
+        *    two maximally sized transactions in the log. We'll use this
+        *    same value here.
+        *
+        * 2) If the lsunit option is specified, a transaction requires 2 LSU
+        *    for the reservation because there are two log writes that can
+        *    require padding - the transaction data and the commit record which
+        *    are written separately and both can require padding to the LSU.
+        *    Consider that we can have an active CIL reservation holding 2*LSU,
+        *    but the CIL is not over a push threshold. In this case, if we
+        *    don't have enough log space for at least one new transaction,
+        *    which includes another 2*LSU in its reservation, we will loop
+        *    forever in the log space grant procedure, i.e.
+        *    xlog_grant_head_wait().
+        *
+        *    Hence the log size needs to be able to contain two maximally sized
+        *    and padded transactions, which is (2 * (2 * LSU + maxlres)).
+        *
+        * Also, the log size should be a multiple of the log stripe unit;
+        * round it up to an lsunit boundary if lsunit is specified.
+        */
+       if (lsunit) {
+               min_logblks = roundup_64(BTOBB(max_logres), lsunit) +
+                             2 * lsunit;
+       } else {
+               min_logblks = BTOBB(max_logres) + 2 * BBSIZE;
+       }
+       min_logblks *= XFS_MIN_LOG_FACTOR;
+
+       return XFS_BB_TO_FSB(mp, min_logblks);
+}
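
As a rough worked example of the sizing rule in xfs_log_calc_minimum_size(),
the sketch below (not part of the patch) redoes the lsunit branch with made-up
numbers; BTOBB and roundup64 here are simplified stand-ins for the kernel
helpers:

#include <stdio.h>
#include <stdint.h>

#define XFS_MIN_LOG_FACTOR      3
#define BBSHIFT                 9       /* 512-byte basic blocks */
#define BTOBB(bytes)            (((bytes) + (1 << BBSHIFT) - 1) >> BBSHIFT)

/* Simplified stand-in for the kernel's roundup_64(). */
static uint64_t roundup64(uint64_t x, uint64_t y)
{
        return ((x + y - 1) / y) * y;
}

int main(void)
{
        uint64_t max_logres = 512 * 1024; /* hypothetical worst-case reservation, bytes */
        uint64_t lsunit = BTOBB(32 * 1024); /* hypothetical 32k stripe unit, basic blocks */
        uint64_t min_logblks;

        /*
         * One padded transaction: the reservation rounded up to the stripe
         * unit, plus 2 LSU for the two log writes that may need padding.
         */
        min_logblks = roundup64(BTOBB(max_logres), lsunit) + 2 * lsunit;

        /* The log must hold XFS_MIN_LOG_FACTOR of those. */
        min_logblks *= XFS_MIN_LOG_FACTOR;

        printf("minimum log size: %llu basic blocks\n",
               (unsigned long long)min_logblks);
        return 0;
}
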
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
new file mode 100644 (file)
index 0000000..1b0a083
--- /dev/null
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_QUOTA_DEFS_H__
+#define __XFS_QUOTA_DEFS_H__
+
+/*
+ * Quota definitions shared between user and kernel source trees.
+ */
+
+/*
+ * Even though users may not have quota limits occupying all 64-bits,
+ * they may need 64-bit accounting. Hence, 64-bit quota-counters,
+ * and quota-limits. This is a waste in the common case, but hey ...
+ */
+typedef __uint64_t     xfs_qcnt_t;
+typedef __uint16_t     xfs_qwarncnt_t;
+
+/*
+ * flags for q_flags field in the dquot.
+ */
+#define XFS_DQ_USER            0x0001          /* a user quota */
+#define XFS_DQ_PROJ            0x0002          /* project quota */
+#define XFS_DQ_GROUP           0x0004          /* a group quota */
+#define XFS_DQ_DIRTY           0x0008          /* dquot is dirty */
+#define XFS_DQ_FREEING         0x0010          /* dquot is being torn down */
+
+#define XFS_DQ_ALLTYPES                (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
+
+#define XFS_DQ_FLAGS \
+       { XFS_DQ_USER,          "USER" }, \
+       { XFS_DQ_PROJ,          "PROJ" }, \
+       { XFS_DQ_GROUP,         "GROUP" }, \
+       { XFS_DQ_DIRTY,         "DIRTY" }, \
+       { XFS_DQ_FREEING,       "FREEING" }
+
+/*
+ * We have the possibility of all three quota types being active at once, and
+ * hence free space modification requires modification of all three current
+ * dquots in a single transaction. For this case we need to have a reservation
+ * of at least 3 dquots.
+ *
+ * However, a chmod operation can change both UID and GID in a single
+ * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
+ * modified. Hence for this case we need to reserve space for at least 4 dquots.
+ *
+ * And in the worst case, there's a rename operation that can be modifying up to
+ * 4 inodes with dquots attached to them. In reality, the only inodes that can
+ * have their dquots modified are the source and destination directory inodes
+ * due to directory name creation and removal. That can require space allocation
+ * and/or freeing on both directory inodes, and hence all three dquots on each
+ * inode can be modified. And if the directories are world writeable, all the
+ * dquots can be unique and so 6 dquots can be modified....
+ *
+ * And, of course, we also need to take into account the dquot log format item
+ * used to describe each dquot.
+ */
+#define XFS_DQUOT_LOGRES(mp)   \
+       ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
+
+#define XFS_IS_QUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
+#define XFS_IS_UQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_UQUOTA_ACCT)
+#define XFS_IS_PQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_PQUOTA_ACCT)
+#define XFS_IS_GQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_GQUOTA_ACCT)
+#define XFS_IS_UQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_UQUOTA_ENFD)
+#define XFS_IS_GQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_GQUOTA_ENFD)
+#define XFS_IS_PQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_PQUOTA_ENFD)
+
+/*
+ * Incore only flags for quotaoff - these bits get cleared when quota(s)
+ * are in the process of getting turned off. These flags are in m_qflags but
+ * never in sb_qflags.
+ */
+#define XFS_UQUOTA_ACTIVE      0x1000  /* uquotas are being turned off */
+#define XFS_GQUOTA_ACTIVE      0x2000  /* gquotas are being turned off */
+#define XFS_PQUOTA_ACTIVE      0x4000  /* pquotas are being turned off */
+#define XFS_ALL_QUOTA_ACTIVE   \
+       (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
+
+/*
+ * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
+ * quota will not be switched off as long as that inode lock is held.
+ */
+#define XFS_IS_QUOTA_ON(mp)    ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
+                                                  XFS_GQUOTA_ACTIVE | \
+                                                  XFS_PQUOTA_ACTIVE))
+#define XFS_IS_UQUOTA_ON(mp)   ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
+#define XFS_IS_GQUOTA_ON(mp)   ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
+#define XFS_IS_PQUOTA_ON(mp)   ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
+
+/*
+ * Flags to tell various functions what to do. Not all of these are meaningful
+ * to a single function. None of these XFS_QMOPT_* flags are meant to have
+ * persistent values (ie. their values can and will change between versions)
+ */
+#define XFS_QMOPT_DQALLOC      0x0000002 /* alloc dquot ondisk if needed */
+#define XFS_QMOPT_UQUOTA       0x0000004 /* user dquot requested */
+#define XFS_QMOPT_PQUOTA       0x0000008 /* project dquot requested */
+#define XFS_QMOPT_FORCE_RES    0x0000010 /* ignore quota limits */
+#define XFS_QMOPT_SBVERSION    0x0000040 /* change superblock version num */
+#define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
+#define XFS_QMOPT_DQREPAIR     0x0001000 /* repair dquot if damaged */
+#define XFS_QMOPT_GQUOTA       0x0002000 /* group dquot requested */
+#define XFS_QMOPT_ENOSPC       0x0004000 /* enospc instead of edquot (prj) */
+
+/*
+ * flags to xfs_trans_mod_dquot to indicate which field needs to be
+ * modified.
+ */
+#define XFS_QMOPT_RES_REGBLKS  0x0010000
+#define XFS_QMOPT_RES_RTBLKS   0x0020000
+#define XFS_QMOPT_BCOUNT       0x0040000
+#define XFS_QMOPT_ICOUNT       0x0080000
+#define XFS_QMOPT_RTBCOUNT     0x0100000
+#define XFS_QMOPT_DELBCOUNT    0x0200000
+#define XFS_QMOPT_DELRTBCOUNT  0x0400000
+#define XFS_QMOPT_RES_INOS     0x0800000
+
+/*
+ * flags for dqalloc.
+ */
+#define XFS_QMOPT_INHERIT      0x1000000
+
+/*
+ * flags to xfs_trans_mod_dquot.
+ */
+#define XFS_TRANS_DQ_RES_BLKS  XFS_QMOPT_RES_REGBLKS
+#define XFS_TRANS_DQ_RES_RTBLKS        XFS_QMOPT_RES_RTBLKS
+#define XFS_TRANS_DQ_RES_INOS  XFS_QMOPT_RES_INOS
+#define XFS_TRANS_DQ_BCOUNT    XFS_QMOPT_BCOUNT
+#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
+#define XFS_TRANS_DQ_ICOUNT    XFS_QMOPT_ICOUNT
+#define XFS_TRANS_DQ_RTBCOUNT  XFS_QMOPT_RTBCOUNT
+#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
+
+
+#define XFS_QMOPT_QUOTALL      \
+               (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
+#define XFS_QMOPT_RESBLK_MASK  (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+
+extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
+                      xfs_dqid_t id, uint type, uint flags, char *str);
+extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
+
+#endif /* __XFS_QUOTA_DEFS_H__ */
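
A small userspace sketch (not part of the patch) of how these bitmask macros
compose; the flag values copy the header above, but the macros here take a
bare flag word rather than the mount structure, and the m_qflags value is
hypothetical:

#include <stdio.h>

#define XFS_UQUOTA_ACTIVE       0x1000
#define XFS_GQUOTA_ACTIVE       0x2000
#define XFS_PQUOTA_ACTIVE       0x4000

/* The kernel macros take the mount structure; here a plain flag word
 * stands in for mp->m_qflags. */
#define XFS_IS_QUOTA_ON(qflags)         ((qflags) & (XFS_UQUOTA_ACTIVE | \
                                                     XFS_GQUOTA_ACTIVE | \
                                                     XFS_PQUOTA_ACTIVE))
#define XFS_IS_UQUOTA_ON(qflags)        ((qflags) & XFS_UQUOTA_ACTIVE)
#define XFS_IS_GQUOTA_ON(qflags)        ((qflags) & XFS_GQUOTA_ACTIVE)

int main(void)
{
        unsigned int m_qflags = XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE;

        printf("any quota on:   %s\n", XFS_IS_QUOTA_ON(m_qflags) ? "yes" : "no");
        printf("user quota on:  %s\n", XFS_IS_UQUOTA_ON(m_qflags) ? "yes" : "no");
        printf("group quota on: %s\n", XFS_IS_GQUOTA_ON(m_qflags) ? "yes" : "no");
        return 0;
}
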
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
new file mode 100644 (file)
index 0000000..f4dd697
--- /dev/null
@@ -0,0 +1,973 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_buf.h"
+#include "xfs_icache.h"
+#include "xfs_dinode.h"
+#include "xfs_rtalloc.h"
+
+
+/*
+ * Realtime allocator bitmap functions shared with userspace.
+ */
+
+/*
+ * Get a buffer for the bitmap or summary file block specified.
+ * The buffer is returned read and locked.
+ */
+int
+xfs_rtbuf_get(
+       xfs_mount_t     *mp,            /* file system mount structure */
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_rtblock_t   block,          /* block number in bitmap or summary */
+       int             issum,          /* is summary not bitmap */
+       xfs_buf_t       **bpp)          /* output: buffer for the block */
+{
+       xfs_buf_t       *bp;            /* block buffer, result */
+       xfs_inode_t     *ip;            /* bitmap or summary inode */
+       xfs_bmbt_irec_t map;
+       int             nmap = 1;
+       int             error;          /* error value */
+
+       ip = issum ? mp->m_rsumip : mp->m_rbmip;
+
+       error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
+       if (error)
+               return error;
+
+       ASSERT(map.br_startblock != NULLFSBLOCK);
+       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+                                  XFS_FSB_TO_DADDR(mp, map.br_startblock),
+                                  mp->m_bsize, 0, &bp, NULL);
+       if (error)
+               return error;
+       *bpp = bp;
+       return 0;
+}
+
+/*
+ * Searching backward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_back(
+       xfs_mount_t     *mp,            /* file system mount point */
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_rtblock_t   start,          /* starting block to look at */
+       xfs_rtblock_t   limit,          /* last block to look at */
+       xfs_rtblock_t   *rtblock)       /* out: start block found */
+{
+       xfs_rtword_t    *b;             /* current word in buffer */
+       int             bit;            /* bit number in the word */
+       xfs_rtblock_t   block;          /* bitmap block number */
+       xfs_buf_t       *bp;            /* buf for the block */
+       xfs_rtword_t    *bufp;          /* starting word in buffer */
+       int             error;          /* error value */
+       xfs_rtblock_t   firstbit;       /* first useful bit in the word */
+       xfs_rtblock_t   i;              /* current bit number rel. to start */
+       xfs_rtblock_t   len;            /* length of inspected area */
+       xfs_rtword_t    mask;           /* mask of relevant bits for value */
+       xfs_rtword_t    want;           /* mask for "good" values */
+       xfs_rtword_t    wdiff;          /* difference from wanted value */
+       int             word;           /* word number in the buffer */
+
+       /*
+        * Compute and read in starting bitmap block for starting block.
+        */
+       block = XFS_BITTOBLOCK(mp, start);
+       error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+       if (error) {
+               return error;
+       }
+       bufp = bp->b_addr;
+       /*
+        * Get the first word's index & point to it.
+        */
+       word = XFS_BITTOWORD(mp, start);
+       b = &bufp[word];
+       bit = (int)(start & (XFS_NBWORD - 1));
+       len = start - limit + 1;
+       /*
+        * Compute match value, based on the bit at start: if 1 (free)
+        * then all-ones, else all-zeroes.
+        */
+       want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+       /*
+        * If the starting position is not word-aligned, deal with the
+        * partial word.
+        */
+       if (bit < XFS_NBWORD - 1) {
+               /*
+                * Calculate first (leftmost) bit number to look at,
+                * and mask for all the relevant bits in this word.
+                */
+               firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
+               mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
+                       firstbit;
+               /*
+                * Calculate the difference between the value there
+                * and what we're looking for.
+                */
+               if ((wdiff = (*b ^ want) & mask)) {
+                       /*
+                        * Different.  Mark where we are and return.
+                        */
+                       xfs_trans_brelse(tp, bp);
+                       i = bit - XFS_RTHIBIT(wdiff);
+                       *rtblock = start - i + 1;
+                       return 0;
+               }
+               i = bit - firstbit + 1;
+               /*
+                * Go on to previous block if that's where the previous word is
+                * and we need the previous word.
+                */
+               if (--word == -1 && i < len) {
+                       /*
+                        * If done with this block, get the previous one.
+                        */
+                       xfs_trans_brelse(tp, bp);
+                       error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+                       if (error) {
+                               return error;
+                       }
+                       bufp = bp->b_addr;
+                       word = XFS_BLOCKWMASK(mp);
+                       b = &bufp[word];
+               } else {
+                       /*
+                        * Go on to the previous word in the buffer.
+                        */
+                       b--;
+               }
+       } else {
+               /*
+                * Starting on a word boundary, no partial word.
+                */
+               i = 0;
+       }
+       /*
+        * Loop over whole words in buffers.  When we use up one buffer
+        * we move on to the previous one.
+        */
+       while (len - i >= XFS_NBWORD) {
+               /*
+                * Compute difference between actual and desired value.
+                */
+               if ((wdiff = *b ^ want)) {
+                       /*
+                        * Different, mark where we are and return.
+                        */
+                       xfs_trans_brelse(tp, bp);
+                       i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+                       *rtblock = start - i + 1;
+                       return 0;
+               }
+               i += XFS_NBWORD;
+               /*
+                * Go on to previous block if that's where the previous word is
+                * and we need the previous word.
+                */
+               if (--word == -1 && i < len) {
+                       /*
+                        * If done with this block, get the previous one.
+                        */
+                       xfs_trans_brelse(tp, bp);
+                       error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+                       if (error) {
+                               return error;
+                       }
+                       bufp = bp->b_addr;
+                       word = XFS_BLOCKWMASK(mp);
+                       b = &bufp[word];
+               } else {
+                       /*
+                        * Go on to the previous word in the buffer.
+                        */
+                       b--;
+               }
+       }
+       /*
+        * If not ending on a word boundary, deal with the last
+        * (partial) word.
+        */
+       if (len - i) {
+               /*
+                * Calculate first (leftmost) bit number to look at,
+                * and mask for all the relevant bits in this word.
+                */
+               firstbit = XFS_NBWORD - (len - i);
+               mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
+               /*
+                * Compute difference between actual and desired value.
+                */
+               if ((wdiff = (*b ^ want) & mask)) {
+                       /*
+                        * Different, mark where we are and return.
+                        */
+                       xfs_trans_brelse(tp, bp);
+                       i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+                       *rtblock = start - i + 1;
+                       return 0;
+               } else
+                       i = len;
+       }
+       /*
+        * No match, return that we scanned the whole area.
+        */
+       xfs_trans_brelse(tp, bp);
+       *rtblock = start - i + 1;
+       return 0;
+}
+
+/*
+ * Searching forward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_forw(
+       xfs_mount_t     *mp,            /* file system mount point */
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_rtblock_t   start,          /* starting block to look at */
+       xfs_rtblock_t   limit,          /* last block to look at */
+       xfs_rtblock_t   *rtblock)       /* out: start block found */
+{
+       xfs_rtword_t    *b;             /* current word in buffer */
+       int             bit;            /* bit number in the word */
+       xfs_rtblock_t   block;          /* bitmap block number */
+       xfs_buf_t       *bp;            /* buf for the block */
+       xfs_rtword_t    *bufp;          /* starting word in buffer */
+       int             error;          /* error value */
+       xfs_rtblock_t   i;              /* current bit number rel. to start */
+       xfs_rtblock_t   lastbit;        /* last useful bit in the word */
+       xfs_rtblock_t   len;            /* length of inspected area */
+       xfs_rtword_t    mask;           /* mask of relevant bits for value */
+       xfs_rtword_t    want;           /* mask for "good" values */
+       xfs_rtword_t    wdiff;          /* difference from wanted value */
+       int             word;           /* word number in the buffer */
+
+       /*
+        * Compute and read in starting bitmap block for starting block.
+        */
+       block = XFS_BITTOBLOCK(mp, start);
+       error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+       if (error) {
+               return error;
+       }
+       bufp = bp->b_addr;
+       /*
+        * Get the first word's index & point to it.
+        */
+       word = XFS_BITTOWORD(mp, start);
+       b = &bufp[word];
+       bit = (int)(start & (XFS_NBWORD - 1));
+       len = limit - start + 1;
+       /*
+        * Compute match value, based on the bit at start: if 1 (free)
+        * then all-ones, else all-zeroes.
+        */
+       want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+       /*
+        * If the starting position is not word-aligned, deal with the
+        * partial word.
+        */
+       if (bit) {
+               /*
+                * Calculate last (rightmost) bit number to look at,
+                * and mask for all the relevant bits in this word.
+                */
+               lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+               mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+               /*
+                * Calculate the difference between the value there
+                * and what we're looking for.
+                */
+               if ((wdiff = (*b ^ want) & mask)) {
+                       /*
+                        * Different.  Mark where we are and return.
+                        */
+                       xfs_trans_brelse(tp, bp);
+                       i = XFS_RTLOBIT(wdiff) - bit;
+                       *rtblock = start + i - 1;
+                       return 0;
+               }
+               i = lastbit - bit;
+               /*
+                * Go on to next block if that's where the next word is
+                * and we need the next word.
+                */
+               if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+                       /*
+                        * If done with this block, get the next one.
+                        */
+                       xfs_trans_brelse(tp, bp);
+                       error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+                       if (error) {
+                               return error;
+                       }
+                       b = bufp = bp->b_addr;
+                       word = 0;
+               } else {
+                       /*
+                        * Go on to the next word in the buffer.
+                        */
+                       b++;
+               }
+       } else {
+               /*
+                * Starting on a word boundary, no partial word.
+                */
+               i = 0;
+       }
+       /*
+        * Loop over whole words in buffers.  When we use up one buffer
+        * we move on to the next one.
+        */
+       while (len - i >= XFS_NBWORD) {
+               /*
+                * Compute difference between actual and desired value.
+                */
+               if ((wdiff = *b ^ want)) {
+                       /*
+                        * Different, mark where we are and return.
+                        */
+                       xfs_trans_brelse(tp, bp);
+                       i += XFS_RTLOBIT(wdiff);
+                       *rtblock = start + i - 1;
+                       return 0;
+               }
+               i += XFS_NBWORD;
+               /*
+                * Go on to next block if that's where the next word is
+                * and we need the next word.
+                */
+               if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+                       /*
+                        * If done with this block, get the next one.
+                        */
+                       xfs_trans_brelse(tp, bp);
+                       error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+                       if (error) {
+                               return error;
+                       }
+                       b = bufp = bp->b_addr;
+                       word = 0;
+               } else {
+                       /*
+                        * Go on to the next word in the buffer.
+                        */
+                       b++;
+               }
+       }
+       /*
+        * If not ending on a word boundary, deal with the last
+        * (partial) word.
+        */
+       if ((lastbit = len - i)) {
+               /*
+                * Calculate mask for all the relevant bits in this word.
+                */
+               mask = ((xfs_rtword_t)1 << lastbit) - 1;
+               /*
+                * Compute difference between actual and desired value.
+                */
+               if ((wdiff = (*b ^ want) & mask)) {
+                       /*
+                        * Different, mark where we are and return.
+                        */
+                       xfs_trans_brelse(tp, bp);
+                       i += XFS_RTLOBIT(wdiff);
+                       *rtblock = start + i - 1;
+                       return 0;
+               } else
+                       i = len;
+       }
+       /*
+        * No match, return that we scanned the whole area.
+        */
+       xfs_trans_brelse(tp, bp);
+       *rtblock = start + i - 1;
+       return 0;
+}
+
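The core trick shared by xfs_rtfind_back() and xfs_rtfind_forw() is the XOR
scan: a whole word matches the wanted state iff word ^ want is zero, and the
index of the first mismatching bit falls out of the lowest (or, for the
backward scan, highest) set bit of the difference. A standalone sketch (not
part of the patch) of the forward case, with GCC/Clang's __builtin_ctz
standing in for XFS_RTLOBIT:

#include <stdio.h>
#include <stdint.h>

/*
 * Return the index of the first bit in 'word' whose state differs from
 * the state of the starting bit, or -1 if the whole word matches.
 */
static int first_state_change(uint32_t word, int start_is_free)
{
        uint32_t want = start_is_free ? ~0u : 0u; /* all ones or all zeroes */
        uint32_t wdiff = word ^ want;

        if (!wdiff)
                return -1;              /* whole word matches; keep scanning */
        return __builtin_ctz(wdiff);    /* lowest differing bit (forward scan) */
}

int main(void)
{
        /* Bits 0..4 free (1), bit 5 allocated (0): state changes at bit 5. */
        printf("%d\n", first_state_change(0x0000001f, 1));      /* -> 5 */
        /* All 32 bits free: no state change within this word. */
        printf("%d\n", first_state_change(0xffffffff, 1));      /* -> -1 */
        return 0;
}
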
+/*
+ * Read and modify the summary information for a given extent size and
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ */
+int
+xfs_rtmodify_summary(
+       xfs_mount_t     *mp,            /* file system mount point */
+       xfs_trans_t     *tp,            /* transaction pointer */
+       int             log,            /* log2 of extent size */
+       xfs_rtblock_t   bbno,           /* bitmap block number */
+       int             delta,          /* change to make to summary info */
+       xfs_buf_t       **rbpp,         /* in/out: summary block buffer */
+       xfs_fsblock_t   *rsb)           /* in/out: summary block number */
+{
+       xfs_buf_t       *bp;            /* buffer for the summary block */
+       int             error;          /* error value */
+       xfs_fsblock_t   sb;             /* summary fsblock */
+       int             so;             /* index into the summary file */
+       xfs_suminfo_t   *sp;            /* pointer to returned data */
+
+       /*
+        * Compute entry number in the summary file.
+        */
+       so = XFS_SUMOFFS(mp, log, bbno);
+       /*
+        * Compute the block number in the summary file.
+        */
+       sb = XFS_SUMOFFSTOBLOCK(mp, so);
+       /*
+        * If we have an old buffer, and the block number matches, use that.
+        */
+       if (rbpp && *rbpp && *rsb == sb)
+               bp = *rbpp;
+       /*
+        * Otherwise we have to get the buffer.
+        */
+       else {
+               /*
+                * If there was an old one, get rid of it first.
+                */
+               if (rbpp && *rbpp)
+                       xfs_trans_brelse(tp, *rbpp);
+               error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+               if (error) {
+                       return error;
+               }
+               /*
+                * Remember this buffer and block for the next call.
+                */
+               if (rbpp) {
+                       *rbpp = bp;
+                       *rsb = sb;
+               }
+       }
+       /*
+        * Point to the summary information, modify and log it.
+        */
+       sp = XFS_SUMPTR(mp, bp, so);
+       *sp += delta;
+       xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
+               (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
+       return 0;
+}
+
+/*
+ * Set the given range of bitmap bits to the given value.
+ * Do whatever I/O and logging is required.
+ */
+int
+xfs_rtmodify_range(
+       xfs_mount_t     *mp,            /* file system mount point */
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_rtblock_t   start,          /* starting block to modify */
+       xfs_extlen_t    len,            /* length of extent to modify */
+       int             val)            /* 1 for free, 0 for allocated */
+{
+       xfs_rtword_t    *b;             /* current word in buffer */
+       int             bit;            /* bit number in the word */
+       xfs_rtblock_t   block;          /* bitmap block number */
+       xfs_buf_t       *bp;            /* buf for the block */
+       xfs_rtword_t    *bufp;          /* starting word in buffer */
+       int             error;          /* error value */
+       xfs_rtword_t    *first;         /* first used word in the buffer */
+       int             i;              /* current bit number rel. to start */
+       int             lastbit;        /* last useful bit in word */
+       xfs_rtword_t    mask;           /* mask of relevant bits for value */
+       int             word;           /* word number in the buffer */
+
+       /*
+        * Compute starting bitmap block number.
+        */
+       block = XFS_BITTOBLOCK(mp, start);
+       /*
+        * Read the bitmap block, and point to its data.
+        */
+       error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+       if (error) {
+               return error;
+       }
+       bufp = bp->b_addr;
+       /*
+        * Compute the starting word's address, and starting bit.
+        */
+       word = XFS_BITTOWORD(mp, start);
+       first = b = &bufp[word];
+       bit = (int)(start & (XFS_NBWORD - 1));
+       /*
+        * 0 (allocated) => all zeroes; 1 (free) => all ones.
+        */
+       val = -val;
+       /*
+        * If not starting on a word boundary, deal with the first
+        * (partial) word.
+        */
+       if (bit) {
+               /*
+                * Compute first bit not changed and mask of relevant bits.
+                */
+               lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+               mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+               /*
+                * Set/clear the active bits.
+                */
+               if (val)
+                       *b |= mask;
+               else
+                       *b &= ~mask;
+               i = lastbit - bit;
+               /*
+                * Go on to the next block if that's where the next word is
+                * and we need the next word.
+                */
+               if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+                       /*
+                        * Log the changed part of this block.
+                        * Get the next one.
+                        */
+                       xfs_trans_log_buf(tp, bp,
+                               (uint)((char *)first - (char *)bufp),
+                               (uint)((char *)b - (char *)bufp));
+                       error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+                       if (error) {
+                               return error;
+                       }
+                       first = b = bufp = bp->b_addr;
+                       word = 0;
+               } else {
+                       /*
+                        * Go on to the next word in the buffer
+                        */
+                       b++;
+               }
+       } else {
+               /*
+                * Starting on a word boundary, no partial word.
+                */
+               i = 0;
+       }
+       /*
+        * Loop over whole words in buffers.  When we use up one buffer
+        * we move on to the next one.
+        */
+       while (len - i >= XFS_NBWORD) {
+               /*
+                * Set the word value correctly.
+                */
+               *b = val;
+               i += XFS_NBWORD;
+               /*
+                * Go on to the next block if that's where the next word is
+                * and we need the next word.
+                */
+               if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+                       /*
+                        * Log the changed part of this block.
+                        * Get the next one.
+                        */
+                       xfs_trans_log_buf(tp, bp,
+                               (uint)((char *)first - (char *)bufp),
+                               (uint)((char *)b - (char *)bufp));
+                       error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+                       if (error) {
+                               return error;
+                       }
+                       first = b = bufp = bp->b_addr;
+                       word = 0;
+               } else {
+                       /*
+                        * Go on to the next word in the buffer
+                        */
+                       b++;
+               }
+       }
+       /*
+        * If not ending on a word boundary, deal with the last
+        * (partial) word.
+        */
+       if ((lastbit = len - i)) {
+               /*
+                * Compute a mask of relevant bits.
+                */
+               bit = 0;
+               mask = ((xfs_rtword_t)1 << lastbit) - 1;
+               /*
+                * Set/clear the active bits.
+                */
+               if (val)
+                       *b |= mask;
+               else
+                       *b &= ~mask;
+               b++;
+       }
+       /*
+        * Log any remaining changed bytes.
+        */
+       if (b > first)
+               xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
+                       (uint)((char *)b - (char *)bufp - 1));
+       return 0;
+}
+
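The partial-word masks used throughout these routines follow a single
pattern: ((1 << nbits) - 1) << lowbit selects nbits bits starting at lowbit.
A tiny sketch (not part of the patch) with made-up bit positions:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        int bit = 5, lastbit = 12;      /* hypothetical: touch bits 5..11 */
        uint32_t mask = (((uint32_t)1 << (lastbit - bit)) - 1) << bit;
        uint32_t word = 0;

        printf("mask 0x%08x\n", mask);  /* 0x00000fe0 */

        word |= mask;                   /* val != 0: mark the range free */
        printf("set  0x%08x\n", word);  /* 0x00000fe0 */

        word &= ~mask;                  /* val == 0: mark it allocated again */
        printf("clr  0x%08x\n", word);  /* 0x00000000 */
        return 0;
}
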
+/*
+ * Mark an extent specified by start and len freed.
+ * Updates all the summary information as well as the bitmap.
+ */
+int
+xfs_rtfree_range(
+       xfs_mount_t     *mp,            /* file system mount point */
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_rtblock_t   start,          /* starting block to free */
+       xfs_extlen_t    len,            /* length to free */
+       xfs_buf_t       **rbpp,         /* in/out: summary block buffer */
+       xfs_fsblock_t   *rsb)           /* in/out: summary block number */
+{
+       xfs_rtblock_t   end;            /* end of the freed extent */
+       int             error;          /* error value */
+       xfs_rtblock_t   postblock;      /* first block freed > end */
+       xfs_rtblock_t   preblock;       /* first block freed < start */
+
+       end = start + len - 1;
+       /*
+        * Modify the bitmap to mark this extent freed.
+        */
+       error = xfs_rtmodify_range(mp, tp, start, len, 1);
+       if (error) {
+               return error;
+       }
+       /*
+        * Assume we're freeing out of the middle of an allocated extent.
+        * We need to find the beginning and end of the extent so we can
+        * properly update the summary.
+        */
+       error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+       if (error) {
+               return error;
+       }
+       /*
+        * Find the next allocated block (end of allocated extent).
+        */
+       error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+               &postblock);
+       if (error)
+               return error;
+       /*
+        * If there are blocks not being freed at the front of the
+        * old extent, add summary data for them to be allocated.
+        */
+       if (preblock < start) {
+               error = xfs_rtmodify_summary(mp, tp,
+                       XFS_RTBLOCKLOG(start - preblock),
+                       XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+               if (error) {
+                       return error;
+               }
+       }
+       /*
+        * If there are blocks not being freed at the end of the
+        * old extent, add summary data for them to be allocated.
+        */
+       if (postblock > end) {
+               error = xfs_rtmodify_summary(mp, tp,
+                       XFS_RTBLOCKLOG(postblock - end),
+                       XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+               if (error) {
+                       return error;
+               }
+       }
+       /*
+        * Increment the summary information corresponding to the entire
+        * (new) free extent.
+        */
+       error = xfs_rtmodify_summary(mp, tp,
+               XFS_RTBLOCKLOG(postblock + 1 - preblock),
+               XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+       return error;
+}
+
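To make the three summary updates in xfs_rtfree_range() concrete, here is a
worked example (not part of the patch) with made-up block numbers;
rtblocklog() below is a stand-in for XFS_RTBLOCKLOG():

#include <stdio.h>

/* floor(log2(len)); a stand-in for XFS_RTBLOCKLOG/xfs_highbit64. */
static int rtblocklog(unsigned long len)
{
        int l = -1;

        while (len) {
                len >>= 1;
                l++;
        }
        return l;
}

int main(void)
{
        /*
         * Hypothetical layout: blocks 10..14 and 20..24 were already free,
         * blocks 9 and 25 are allocated, and we just freed 15..19, merging
         * the three runs into one free extent 10..24.
         */
        long preblock = 10, start = 15, end = 19, postblock = 24;

        printf("-1 in size class %d (front run, len %ld)\n",
               rtblocklog(start - preblock), start - preblock);
        printf("-1 in size class %d (back run, len %ld)\n",
               rtblocklog(postblock - end), postblock - end);
        printf("+1 in size class %d (merged run, len %ld)\n",
               rtblocklog(postblock + 1 - preblock), postblock + 1 - preblock);
        return 0;
}
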
+/*
+ * Check that the given range is either all allocated (val = 0) or
+ * all free (val = 1).
+ */
+int
+xfs_rtcheck_range(
+       xfs_mount_t     *mp,            /* file system mount point */
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_rtblock_t   start,          /* starting block number of extent */
+       xfs_extlen_t    len,            /* length of extent */
+       int             val,            /* 1 for free, 0 for allocated */
+       xfs_rtblock_t   *new,           /* out: first block not matching */
+       int             *stat)          /* out: 1 for matches, 0 for not */
+{
+       xfs_rtword_t    *b;             /* current word in buffer */
+       int             bit;            /* bit number in the word */
+       xfs_rtblock_t   block;          /* bitmap block number */
+       xfs_buf_t       *bp;            /* buf for the block */
+       xfs_rtword_t    *bufp;          /* starting word in buffer */
+       int             error;          /* error value */
+       xfs_rtblock_t   i;              /* current bit number rel. to start */
+       xfs_rtblock_t   lastbit;        /* last useful bit in word */
+       xfs_rtword_t    mask;           /* mask of relevant bits for value */
+       xfs_rtword_t    wdiff;          /* difference from wanted value */
+       int             word;           /* word number in the buffer */
+
+       /*
+        * Compute starting bitmap block number
+        */
+       block = XFS_BITTOBLOCK(mp, start);
+       /*
+        * Read the bitmap block.
+        */
+       error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+       if (error) {
+               return error;
+       }
+       bufp = bp->b_addr;
+       /*
+        * Compute the starting word's address, and starting bit.
+        */
+       word = XFS_BITTOWORD(mp, start);
+       b = &bufp[word];
+       bit = (int)(start & (XFS_NBWORD - 1));
+       /*
+        * 0 (allocated) => all zeroes; 1 (free) => all ones.
+        */
+       val = -val;
+       /*
+        * If not starting on a word boundary, deal with the first
+        * (partial) word.
+        */
+       if (bit) {
+               /*
+                * Compute first bit not examined.
+                */
+               lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+               /*
+                * Mask of relevant bits.
+                */
+               mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+               /*
+                * Compute difference between actual and desired value.
+                */
+               if ((wdiff = (*b ^ val) & mask)) {
+                       /*
+                        * Different, compute first wrong bit and return.
+                        */
+                       xfs_trans_brelse(tp, bp);
+                       i = XFS_RTLOBIT(wdiff) - bit;
+                       *new = start + i;
+                       *stat = 0;
+                       return 0;
+               }
+               i = lastbit - bit;
+               /*
+                * Go on to next block if that's where the next word is
+                * and we need the next word.
+                */
+               if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+                       /*
+                        * If done with this block, get the next one.
+                        */
+                       xfs_trans_brelse(tp, bp);
+                       error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+                       if (error) {
+                               return error;
+                       }
+                       b = bufp = bp->b_addr;
+                       word = 0;
+               } else {
+                       /*
+                        * Go on to the next word in the buffer.
+                        */
+                       b++;
+               }
+       } else {
+               /*
+                * Starting on a word boundary, no partial word.
+                */
+               i = 0;
+       }
+       /*
+        * Loop over whole words in buffers.  When we use up one buffer
+        * we move on to the next one.
+        */
+       while (len - i >= XFS_NBWORD) {
+               /*
+                * Compute difference between actual and desired value.
+                */
+               if ((wdiff = *b ^ val)) {
+                       /*
+                        * Different, compute first wrong bit and return.
+                        */
+                       xfs_trans_brelse(tp, bp);
+                       i += XFS_RTLOBIT(wdiff);
+                       *new = start + i;
+                       *stat = 0;
+                       return 0;
+               }
+               i += XFS_NBWORD;
+               /*
+                * Go on to next block if that's where the next word is
+                * and we need the next word.
+                */
+               if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+                       /*
+                        * If done with this block, get the next one.
+                        */
+                       xfs_trans_brelse(tp, bp);
+                       error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+                       if (error) {
+                               return error;
+                       }
+                       b = bufp = bp->b_addr;
+                       word = 0;
+               } else {
+                       /*
+                        * Go on to the next word in the buffer.
+                        */
+                       b++;
+               }
+       }
+       /*
+        * If not ending on a word boundary, deal with the last
+        * (partial) word.
+        */
+       if ((lastbit = len - i)) {
+               /*
+                * Mask of relevant bits.
+                */
+               mask = ((xfs_rtword_t)1 << lastbit) - 1;
+               /*
+                * Compute difference between actual and desired value.
+                */
+               if ((wdiff = (*b ^ val) & mask)) {
+                       /*
+                        * Different, compute first wrong bit and return.
+                        */
+                       xfs_trans_brelse(tp, bp);
+                       i += XFS_RTLOBIT(wdiff);
+                       *new = start + i;
+                       *stat = 0;
+                       return 0;
+               } else
+                       i = len;
+       }
+       /*
+        * Successful, return.
+        */
+       xfs_trans_brelse(tp, bp);
+       *new = start + i;
+       *stat = 1;
+       return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that the given extent (block range) is allocated already.
+ */
+STATIC int                             /* error */
+xfs_rtcheck_alloc_range(
+       xfs_mount_t     *mp,            /* file system mount point */
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_rtblock_t   bno,            /* starting block number of extent */
+       xfs_extlen_t    len)            /* length of extent */
+{
+       xfs_rtblock_t   new;            /* dummy for xfs_rtcheck_range */
+       int             stat;
+       int             error;
+
+       error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat);
+       if (error)
+               return error;
+       ASSERT(stat);
+       return 0;
+}
+#else
+#define xfs_rtcheck_alloc_range(m,t,b,l)       (0)
+#endif
+
+/*
+ * Free an extent in the realtime subvolume.  Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int                                    /* error */
+xfs_rtfree_extent(
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_rtblock_t   bno,            /* starting block number to free */
+       xfs_extlen_t    len)            /* length of extent freed */
+{
+       int             error;          /* error value */
+       xfs_mount_t     *mp;            /* file system mount structure */
+       xfs_fsblock_t   sb;             /* summary file block number */
+       xfs_buf_t       *sumbp = NULL;  /* summary file block buffer */
+
+       mp = tp->t_mountp;
+
+       ASSERT(mp->m_rbmip->i_itemp != NULL);
+       ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+
+       error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
+       if (error)
+               return error;
+
+       /*
+        * Free the range of realtime blocks.
+        */
+       error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
+       if (error) {
+               return error;
+       }
+       /*
+        * Mark more blocks free in the superblock.
+        */
+       xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
+       /*
+        * If we've now freed all the blocks, reset the file sequence
+        * number to 0.
+        */
+       if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
+           mp->m_sb.sb_rextents) {
+               if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
+                       mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+               *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
+               xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+       }
+       return 0;
+}
+
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
new file mode 100644 (file)
index 0000000..ad525a5
--- /dev/null
@@ -0,0 +1,852 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_dinode.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+
+/*
+ * Physical superblock buffer manipulations. Shared with libxfs in userspace.
+ */
+
+static const struct {
+       short offset;
+       short type;     /* 0 = integer
+                        * 1 = binary / string (no translation)
+                        */
+} xfs_sb_info[] = {
+       { offsetof(xfs_sb_t, sb_magicnum),      0 },
+       { offsetof(xfs_sb_t, sb_blocksize),     0 },
+       { offsetof(xfs_sb_t, sb_dblocks),       0 },
+       { offsetof(xfs_sb_t, sb_rblocks),       0 },
+       { offsetof(xfs_sb_t, sb_rextents),      0 },
+       { offsetof(xfs_sb_t, sb_uuid),          1 },
+       { offsetof(xfs_sb_t, sb_logstart),      0 },
+       { offsetof(xfs_sb_t, sb_rootino),       0 },
+       { offsetof(xfs_sb_t, sb_rbmino),        0 },
+       { offsetof(xfs_sb_t, sb_rsumino),       0 },
+       { offsetof(xfs_sb_t, sb_rextsize),      0 },
+       { offsetof(xfs_sb_t, sb_agblocks),      0 },
+       { offsetof(xfs_sb_t, sb_agcount),       0 },
+       { offsetof(xfs_sb_t, sb_rbmblocks),     0 },
+       { offsetof(xfs_sb_t, sb_logblocks),     0 },
+       { offsetof(xfs_sb_t, sb_versionnum),    0 },
+       { offsetof(xfs_sb_t, sb_sectsize),      0 },
+       { offsetof(xfs_sb_t, sb_inodesize),     0 },
+       { offsetof(xfs_sb_t, sb_inopblock),     0 },
+       { offsetof(xfs_sb_t, sb_fname[0]),      1 },
+       { offsetof(xfs_sb_t, sb_blocklog),      0 },
+       { offsetof(xfs_sb_t, sb_sectlog),       0 },
+       { offsetof(xfs_sb_t, sb_inodelog),      0 },
+       { offsetof(xfs_sb_t, sb_inopblog),      0 },
+       { offsetof(xfs_sb_t, sb_agblklog),      0 },
+       { offsetof(xfs_sb_t, sb_rextslog),      0 },
+       { offsetof(xfs_sb_t, sb_inprogress),    0 },
+       { offsetof(xfs_sb_t, sb_imax_pct),      0 },
+       { offsetof(xfs_sb_t, sb_icount),        0 },
+       { offsetof(xfs_sb_t, sb_ifree),         0 },
+       { offsetof(xfs_sb_t, sb_fdblocks),      0 },
+       { offsetof(xfs_sb_t, sb_frextents),     0 },
+       { offsetof(xfs_sb_t, sb_uquotino),      0 },
+       { offsetof(xfs_sb_t, sb_gquotino),      0 },
+       { offsetof(xfs_sb_t, sb_qflags),        0 },
+       { offsetof(xfs_sb_t, sb_flags),         0 },
+       { offsetof(xfs_sb_t, sb_shared_vn),     0 },
+       { offsetof(xfs_sb_t, sb_inoalignmt),    0 },
+       { offsetof(xfs_sb_t, sb_unit),          0 },
+       { offsetof(xfs_sb_t, sb_width),         0 },
+       { offsetof(xfs_sb_t, sb_dirblklog),     0 },
+       { offsetof(xfs_sb_t, sb_logsectlog),    0 },
+       { offsetof(xfs_sb_t, sb_logsectsize),   0 },
+       { offsetof(xfs_sb_t, sb_logsunit),      0 },
+       { offsetof(xfs_sb_t, sb_features2),     0 },
+       { offsetof(xfs_sb_t, sb_bad_features2), 0 },
+       { offsetof(xfs_sb_t, sb_features_compat),       0 },
+       { offsetof(xfs_sb_t, sb_features_ro_compat),    0 },
+       { offsetof(xfs_sb_t, sb_features_incompat),     0 },
+       { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
+       { offsetof(xfs_sb_t, sb_crc),           0 },
+       { offsetof(xfs_sb_t, sb_pad),           0 },
+       { offsetof(xfs_sb_t, sb_pquotino),      0 },
+       { offsetof(xfs_sb_t, sb_lsn),           0 },
+       { sizeof(xfs_sb_t),                     0 }
+};
+
+/*
+ * Reference counting access wrappers to the perag structures.
+ * Because we never free per-ag structures, the only thing we
+ * have to protect against changes is the tree structure itself.
+ */
+struct xfs_perag *
+xfs_perag_get(
+       struct xfs_mount        *mp,
+       xfs_agnumber_t          agno)
+{
+       struct xfs_perag        *pag;
+       int                     ref = 0;
+
+       rcu_read_lock();
+       pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+       if (pag) {
+               ASSERT(atomic_read(&pag->pag_ref) >= 0);
+               ref = atomic_inc_return(&pag->pag_ref);
+       }
+       rcu_read_unlock();
+       trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
+       return pag;
+}
+
+/*
+ * search from @first to find the next perag with the given tag set.
+ */
+struct xfs_perag *
+xfs_perag_get_tag(
+       struct xfs_mount        *mp,
+       xfs_agnumber_t          first,
+       int                     tag)
+{
+       struct xfs_perag        *pag;
+       int                     found;
+       int                     ref;
+
+       rcu_read_lock();
+       found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+                                       (void **)&pag, first, 1, tag);
+       if (found <= 0) {
+               rcu_read_unlock();
+               return NULL;
+       }
+       ref = atomic_inc_return(&pag->pag_ref);
+       rcu_read_unlock();
+       trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
+       return pag;
+}
+
+void
+xfs_perag_put(
+       struct xfs_perag        *pag)
+{
+       int     ref;
+
+       ASSERT(atomic_read(&pag->pag_ref) > 0);
+       ref = atomic_dec_return(&pag->pag_ref);
+       trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+}
+
+/*
+ * Check the validity of the SB found.
+ */
+STATIC int
+xfs_mount_validate_sb(
+       xfs_mount_t     *mp,
+       xfs_sb_t        *sbp,
+       bool            check_inprogress,
+       bool            check_version)
+{
+
+       /*
+        * If the log device and data device have the same device number,
+        * the log is internal. Consequently, sb_logstart should be non-zero.
+        * If we have a zero sb_logstart in this case, we may be trying to
+        * mount a volume filesystem in a non-volume manner.
+        */
+       if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+               xfs_warn(mp, "bad magic number");
+               return -EWRONGFS;
+       }
+
+
+       if (!xfs_sb_good_version(sbp)) {
+               xfs_warn(mp, "bad version");
+               return -EWRONGFS;
+       }
+
+       /*
+        * Version 5 superblock feature mask validation. Reject combinations the
+        * kernel cannot support up front before checking anything else. For
+        * write validation, we don't need to check feature masks.
+        */
+       if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
+               if (xfs_sb_has_compat_feature(sbp,
+                                       XFS_SB_FEAT_COMPAT_UNKNOWN)) {
+                       xfs_warn(mp,
+"Superblock has unknown compatible features (0x%x) enabled.\n"
+"Using a more recent kernel is recommended.",
+                               (sbp->sb_features_compat &
+                                               XFS_SB_FEAT_COMPAT_UNKNOWN));
+               }
+
+               if (xfs_sb_has_ro_compat_feature(sbp,
+                                       XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+                       xfs_alert(mp,
+"Superblock has unknown read-only compatible features (0x%x) enabled.",
+                               (sbp->sb_features_ro_compat &
+                                               XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+                       if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+                               xfs_warn(mp,
+"Attempted to mount read-only compatible filesystem read-write.\n"
+"Filesystem can only be safely mounted read only.");
+                               return -EINVAL;
+                       }
+               }
+               if (xfs_sb_has_incompat_feature(sbp,
+                                       XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
+                       xfs_warn(mp,
+"Superblock has unknown incompatible features (0x%x) enabled.\n"
+"Filesystem can not be safely mounted by this kernel.",
+                               (sbp->sb_features_incompat &
+                                               XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+                       return -EINVAL;
+               }
+       }
+
+       if (xfs_sb_version_has_pquotino(sbp)) {
+               if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
+                       xfs_notice(mp,
+                          "Version 5 superblock has XFS_OQUOTA bits.");
+                       return -EFSCORRUPTED;
+               }
+       } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
+                               XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
+                       xfs_notice(mp,
+"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.");
+                       return -EFSCORRUPTED;
+       }
+
+       if (unlikely(
+           sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
+               xfs_warn(mp,
+               "filesystem is marked as having an external log; "
+               "specify logdev on the mount command line.");
+               return -EINVAL;
+       }
+
+       if (unlikely(
+           sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
+               xfs_warn(mp,
+               "filesystem is marked as having an internal log; "
+               "do not specify logdev on the mount command line.");
+               return -EINVAL;
+       }
+
+       /*
+        * More sanity checking.  Most of these were stolen directly from
+        * xfs_repair.
+        */
+       if (unlikely(
+           sbp->sb_agcount <= 0                                        ||
+           sbp->sb_sectsize < XFS_MIN_SECTORSIZE                       ||
+           sbp->sb_sectsize > XFS_MAX_SECTORSIZE                       ||
+           sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG                    ||
+           sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG                    ||
+           sbp->sb_sectsize != (1 << sbp->sb_sectlog)                  ||
+           sbp->sb_blocksize < XFS_MIN_BLOCKSIZE                       ||
+           sbp->sb_blocksize > XFS_MAX_BLOCKSIZE                       ||
+           sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG                    ||
+           sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG                    ||
+           sbp->sb_blocksize != (1 << sbp->sb_blocklog)                ||
+           sbp->sb_inodesize < XFS_DINODE_MIN_SIZE                     ||
+           sbp->sb_inodesize > XFS_DINODE_MAX_SIZE                     ||
+           sbp->sb_inodelog < XFS_DINODE_MIN_LOG                       ||
+           sbp->sb_inodelog > XFS_DINODE_MAX_LOG                       ||
+           sbp->sb_inodesize != (1 << sbp->sb_inodelog)                ||
+           sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
+           (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)   ||
+           (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)  ||
+           (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)  ||
+           (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */)    ||
+           sbp->sb_dblocks == 0                                        ||
+           sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp)                      ||
+           sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp)                      ||
+           sbp->sb_shared_vn != 0)) {
+               xfs_notice(mp, "SB sanity check failed");
+               return -EFSCORRUPTED;
+       }
+
+       /*
+        * Until this is fixed, only page-sized or smaller data blocks work.
+        */
+       if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
+               xfs_warn(mp,
+               "File system with blocksize %d bytes. "
+               "Only pagesize (%ld) or less will currently work.",
+                               sbp->sb_blocksize, PAGE_SIZE);
+               return -ENOSYS;
+       }
+
+       /*
+        * Currently only a few inode sizes are supported.
+        */
+       switch (sbp->sb_inodesize) {
+       case 256:
+       case 512:
+       case 1024:
+       case 2048:
+               break;
+       default:
+               xfs_warn(mp, "inode size of %d bytes not supported",
+                               sbp->sb_inodesize);
+               return -ENOSYS;
+       }
+
+       if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
+           xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
+               xfs_warn(mp,
+               "file system too large to be mounted on this system.");
+               return -EFBIG;
+       }
+
+       if (check_inprogress && sbp->sb_inprogress) {
+               xfs_warn(mp, "Offline file system operation in progress!");
+               return -EFSCORRUPTED;
+       }
+       return 0;
+}
+
+void
+xfs_sb_quota_from_disk(struct xfs_sb *sbp)
+{
+       /*
+        * Older mkfs variants do not initialize quota inodes to NULLFSINO.
+        * This leaves two distinct in-core values meaning "invalid quota
+        * inode": 0 and NULLFSINO. Normalize them to the single value
+        * NULLFSINO.
+        *
+        * Note that this change affects only the in-core values. These
+        * values are not written back to disk unless some quota information
+        * is written to disk, and even then the sb_pquotino field is only
+        * written if the superblock supports pquotino.
+        */
+       if (sbp->sb_uquotino == 0)
+               sbp->sb_uquotino = NULLFSINO;
+       if (sbp->sb_gquotino == 0)
+               sbp->sb_gquotino = NULLFSINO;
+       if (sbp->sb_pquotino == 0)
+               sbp->sb_pquotino = NULLFSINO;
+
+       /*
+        * We need to do these manipulations only if we are working
+        * with an older version of the on-disk superblock.
+        */
+       if (xfs_sb_version_has_pquotino(sbp))
+               return;
+
+       if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
+               sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+                                       XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
+       if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
+               sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+                                       XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
+       sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
+
+       if (sbp->sb_qflags & XFS_PQUOTA_ACCT)  {
+               /*
+                * In older superblock versions the on-disk superblock has
+                * only sb_gquotino, while the in-core superblock has both
+                * sb_gquotino and sb_pquotino. Only one of them can be in
+                * use at any point in time, so if PQUOTA is set in the disk
+                * superblock, copy sb_gquotino over to sb_pquotino.
+                */
+               sbp->sb_pquotino = sbp->sb_gquotino;
+               sbp->sb_gquotino = NULLFSINO;
+       }
+}
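The rewrite above folds the legacy OQUOTA enforcement/check bits into whichever of project or group quota is actually accounted. A small userspace sketch of the same flag transformation follows; the DEMO_* bit values are arbitrary stand-ins chosen for the example, not the real on-disk encoding.

```c
/* Sketch of the OQUOTA -> P/GQUOTA flag rewrite above.
 * Bit values are illustrative, not the real on-disk encoding. */
#include <stdint.h>
#include <stdio.h>

#define DEMO_PQUOTA_ACCT 0x01   /* project quota accounting on */
#define DEMO_OQUOTA_ENFD 0x02   /* legacy "other" quota enforced */
#define DEMO_OQUOTA_CHKD 0x04   /* legacy "other" quota checked */
#define DEMO_PQUOTA_ENFD 0x08
#define DEMO_GQUOTA_ENFD 0x10
#define DEMO_PQUOTA_CHKD 0x20
#define DEMO_GQUOTA_CHKD 0x40

static uint16_t demo_qflags_from_disk(uint16_t qflags)
{
        /* OQUOTA applies to project quota if accounted, else group */
        if (qflags & DEMO_OQUOTA_ENFD)
                qflags |= (qflags & DEMO_PQUOTA_ACCT) ?
                                DEMO_PQUOTA_ENFD : DEMO_GQUOTA_ENFD;
        if (qflags & DEMO_OQUOTA_CHKD)
                qflags |= (qflags & DEMO_PQUOTA_ACCT) ?
                                DEMO_PQUOTA_CHKD : DEMO_GQUOTA_CHKD;
        /* the legacy bits never survive into the in-core flags */
        return qflags & ~(DEMO_OQUOTA_ENFD | DEMO_OQUOTA_CHKD);
}

int main(void)
{
        /* project accounting + legacy enforcement -> PQUOTA_ENFD (0x09) */
        printf("0x%02x\n",
               demo_qflags_from_disk(DEMO_PQUOTA_ACCT | DEMO_OQUOTA_ENFD));
        /* no project accounting -> GQUOTA_ENFD (0x10) */
        printf("0x%02x\n", demo_qflags_from_disk(DEMO_OQUOTA_ENFD));
        return 0;
}
```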
+
+static void
+__xfs_sb_from_disk(
+       struct xfs_sb   *to,
+       xfs_dsb_t       *from,
+       bool            convert_xquota)
+{
+       to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
+       to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
+       to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
+       to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
+       to->sb_rextents = be64_to_cpu(from->sb_rextents);
+       memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
+       to->sb_logstart = be64_to_cpu(from->sb_logstart);
+       to->sb_rootino = be64_to_cpu(from->sb_rootino);
+       to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
+       to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
+       to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
+       to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
+       to->sb_agcount = be32_to_cpu(from->sb_agcount);
+       to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
+       to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
+       to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
+       to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
+       to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
+       to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
+       memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
+       to->sb_blocklog = from->sb_blocklog;
+       to->sb_sectlog = from->sb_sectlog;
+       to->sb_inodelog = from->sb_inodelog;
+       to->sb_inopblog = from->sb_inopblog;
+       to->sb_agblklog = from->sb_agblklog;
+       to->sb_rextslog = from->sb_rextslog;
+       to->sb_inprogress = from->sb_inprogress;
+       to->sb_imax_pct = from->sb_imax_pct;
+       to->sb_icount = be64_to_cpu(from->sb_icount);
+       to->sb_ifree = be64_to_cpu(from->sb_ifree);
+       to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
+       to->sb_frextents = be64_to_cpu(from->sb_frextents);
+       to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
+       to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
+       to->sb_qflags = be16_to_cpu(from->sb_qflags);
+       to->sb_flags = from->sb_flags;
+       to->sb_shared_vn = from->sb_shared_vn;
+       to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
+       to->sb_unit = be32_to_cpu(from->sb_unit);
+       to->sb_width = be32_to_cpu(from->sb_width);
+       to->sb_dirblklog = from->sb_dirblklog;
+       to->sb_logsectlog = from->sb_logsectlog;
+       to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
+       to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
+       to->sb_features2 = be32_to_cpu(from->sb_features2);
+       to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
+       to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
+       to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
+       to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
+       to->sb_features_log_incompat =
+                               be32_to_cpu(from->sb_features_log_incompat);
+       to->sb_pad = 0;
+       to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
+       to->sb_lsn = be64_to_cpu(from->sb_lsn);
+       /* Convert on-disk flags to in-memory flags? */
+       if (convert_xquota)
+               xfs_sb_quota_from_disk(to);
+}
+
+void
+xfs_sb_from_disk(
+       struct xfs_sb   *to,
+       xfs_dsb_t       *from)
+{
+       __xfs_sb_from_disk(to, from, true);
+}
+
+static inline void
+xfs_sb_quota_to_disk(
+       xfs_dsb_t       *to,
+       xfs_sb_t        *from,
+       __int64_t       *fields)
+{
+       __uint16_t      qflags = from->sb_qflags;
+
+       /*
+        * We need to do these manipulations only if we are working
+        * with an older version of the on-disk superblock.
+        */
+       if (xfs_sb_version_has_pquotino(from))
+               return;
+
+       if (*fields & XFS_SB_QFLAGS) {
+               /*
+                * The in-core version of sb_qflags does not have the
+                * XFS_OQUOTA_* flags, whereas the on-disk version does.
+                * Convert the in-core XFS_{PG}QUOTA_* flags to the
+                * on-disk XFS_OQUOTA_* flags.
+                */
+               qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
+                               XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
+
+               if (from->sb_qflags &
+                               (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
+                       qflags |= XFS_OQUOTA_ENFD;
+               if (from->sb_qflags &
+                               (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
+                       qflags |= XFS_OQUOTA_CHKD;
+               to->sb_qflags = cpu_to_be16(qflags);
+               *fields &= ~XFS_SB_QFLAGS;
+       }
+
+       /*
+        * GQUOTINO and PQUOTINO cannot be used together in versions of
+        * superblock that do not have pquotino. from->sb_qflags tells us
+        * which quota is active and should be copied to disk. If neither
+        * is active, make sure we write NULLFSINO to the sb_gquotino field,
+        * as a quota inode value of "0" is invalid when the
+        * XFS_SB_VERSION_QUOTA feature bit is set.
+        *
+        * Note that we don't need to handle the sb_uquotino or sb_pquotino
+        * here as they do not require any translation. Hence the main sb
+        * field loop will write them appropriately from the in-core
+        * superblock.
+        */
+       if ((*fields & XFS_SB_GQUOTINO) &&
+                               (from->sb_qflags & XFS_GQUOTA_ACCT))
+               to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
+       else if ((*fields & XFS_SB_PQUOTINO) &&
+                               (from->sb_qflags & XFS_PQUOTA_ACCT))
+               to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
+       else {
+               /*
+                * We can't rely on just the fields being logged to tell us
+                * that it is safe to write NULLFSINO - we should only do that
+                * if quotas are not actually enabled. Hence only write
+                * NULLFSINO if both in-core quota inodes are NULL.
+                */
+               if (from->sb_gquotino == NULLFSINO &&
+                   from->sb_pquotino == NULLFSINO)
+                       to->sb_gquotino = cpu_to_be64(NULLFSINO);
+       }
+
+       *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO);
+}
+
+/*
+ * Copy in core superblock to ondisk one.
+ *
+ * The fields argument is mask of superblock fields to copy.
+ */
+void
+xfs_sb_to_disk(
+       xfs_dsb_t       *to,
+       xfs_sb_t        *from,
+       __int64_t       fields)
+{
+       xfs_caddr_t     to_ptr = (xfs_caddr_t)to;
+       xfs_caddr_t     from_ptr = (xfs_caddr_t)from;
+       xfs_sb_field_t  f;
+       int             first;
+       int             size;
+
+       ASSERT(fields);
+       if (!fields)
+               return;
+
+       xfs_sb_quota_to_disk(to, from, &fields);
+       while (fields) {
+               f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+               first = xfs_sb_info[f].offset;
+               size = xfs_sb_info[f + 1].offset - first;
+
+               ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
+
+               if (size == 1 || xfs_sb_info[f].type == 1) {
+                       memcpy(to_ptr + first, from_ptr + first, size);
+               } else {
+                       switch (size) {
+                       case 2:
+                               *(__be16 *)(to_ptr + first) =
+                                     cpu_to_be16(*(__u16 *)(from_ptr + first));
+                               break;
+                       case 4:
+                               *(__be32 *)(to_ptr + first) =
+                                     cpu_to_be32(*(__u32 *)(from_ptr + first));
+                               break;
+                       case 8:
+                               *(__be64 *)(to_ptr + first) =
+                                     cpu_to_be64(*(__u64 *)(from_ptr + first));
+                               break;
+                       default:
+                               ASSERT(0);
+                       }
+               }
+
+               fields &= ~(1LL << f);
+       }
+}
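The loop above is driven entirely by the offset table at the top of this file: each field's size is the distance to the next table entry's offset, and that size selects the byte-swap width. Below is a compressed, self-contained sketch of the same table-driven technique over a toy three-field struct; names are hypothetical, only the 2- and 4-byte swap paths are shown, and __builtin_ctz is a GCC/Clang builtin assumed to be available.

```c
/* Sketch of the offset-table-driven copy used by xfs_sb_to_disk()
 * above, reduced to a toy struct with no padding between fields. */
#include <arpa/inet.h>   /* htons/htonl for big-endian output */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy {
        uint16_t a;
        uint16_t b;
        uint32_t c;
};

/* Offset table in field order with a sizeof() sentinel, mirroring the
 * xfs_sb_info[] idea: field size = next offset - this offset. */
static const size_t toy_info[] = {
        offsetof(struct toy, a),
        offsetof(struct toy, b),
        offsetof(struct toy, c),
        sizeof(struct toy),
};

static void toy_to_disk(struct toy *to, const struct toy *from,
                        unsigned fields)
{
        char *to_ptr = (char *)to;
        const char *from_ptr = (const char *)from;

        while (fields) {
                int f = __builtin_ctz(fields);       /* lowest dirty field */
                size_t first = toy_info[f];
                size_t size = toy_info[f + 1] - first;

                if (size == 2) {
                        uint16_t v;
                        memcpy(&v, from_ptr + first, 2);
                        v = htons(v);
                        memcpy(to_ptr + first, &v, 2);
                } else {                             /* size == 4 here */
                        uint32_t v;
                        memcpy(&v, from_ptr + first, 4);
                        v = htonl(v);
                        memcpy(to_ptr + first, &v, 4);
                }
                fields &= ~(1u << f);
        }
}

int main(void)
{
        struct toy in = { 0x1234, 0, 0xdeadbeef };
        struct toy out = { 0, 0, 0 };

        toy_to_disk(&out, &in, 0x5);   /* fields a and c dirty, b skipped */
        printf("a=%04x b=%04x c=%08x\n", out.a, out.b, out.c);
        return 0;
}
```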
+
+static int
+xfs_sb_verify(
+       struct xfs_buf  *bp,
+       bool            check_version)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_sb   sb;
+
+       /*
+        * Use the call variant which doesn't convert quota flags from disk
+        * format, because xfs_mount_validate_sb checks the on-disk flags.
+        */
+       __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false);
+
+       /*
+        * Only check the in progress field for the primary superblock as
+        * mkfs.xfs doesn't clear it from secondary superblocks.
+        */
+       return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
+                                    check_version);
+}
+
+/*
+ * If the superblock has the CRC feature bit set or the CRC field is non-zero,
+ * check that the CRC is valid.  We check the CRC field is non-zero because a
+ * single bit error could clear the feature bit and unused parts of the
+ * superblock are supposed to be zero. Hence a non-zero crc field indicates
+ * that we've potentially lost a feature bit and we should check it anyway.
+ *
+ * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the
+ * last field in V4 secondary superblocks.  So for secondary superblocks,
+ * we are more forgiving, and ignore CRC failures if the primary doesn't
+ * indicate that the fs version is V5.
+ */
+static void
+xfs_sb_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_dsb  *dsb = XFS_BUF_TO_SBP(bp);
+       int             error;
+
+       /*
+        * open code the version check to avoid needing to convert the entire
+        * superblock from disk order just to check the version number
+        */
+       if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
+           (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
+                                               XFS_SB_VERSION_5) ||
+            dsb->sb_crc != 0)) {
+
+               if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) {
+                       /* Only fail bad secondaries on a known V5 filesystem */
+                       if (bp->b_bn == XFS_SB_DADDR ||
+                           xfs_sb_version_hascrc(&mp->m_sb)) {
+                               error = -EFSBADCRC;
+                               goto out_error;
+                       }
+               }
+       }
+       error = xfs_sb_verify(bp, true);
+
+out_error:
+       if (error) {
+               xfs_buf_ioerror(bp, error);
+               if (error == -EFSCORRUPTED || error == -EFSBADCRC)
+                       xfs_verifier_error(bp);
+       }
+}
+
+/*
+ * We may be probed for a filesystem match, so we may not want to emit
+ * messages when the superblock buffer is not actually an XFS superblock.
+ * If we find an XFS superblock, then run a normal, noisy mount because we are
+ * really going to mount it and want to know about errors.
+ */
+static void
+xfs_sb_quiet_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_dsb  *dsb = XFS_BUF_TO_SBP(bp);
+
+       if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
+               /* XFS filesystem, verify noisily! */
+               xfs_sb_read_verify(bp);
+               return;
+       }
+       /* quietly fail */
+       xfs_buf_ioerror(bp, -EWRONGFS);
+}
+
+static void
+xfs_sb_write_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+       int                     error;
+
+       error = xfs_sb_verify(bp, false);
+       if (error) {
+               xfs_buf_ioerror(bp, error);
+               xfs_verifier_error(bp);
+               return;
+       }
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (bip)
+               XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+       xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_sb_buf_ops = {
+       .verify_read = xfs_sb_read_verify,
+       .verify_write = xfs_sb_write_verify,
+};
+
+const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+       .verify_read = xfs_sb_quiet_read_verify,
+       .verify_write = xfs_sb_write_verify,
+};
+
+/*
+ * xfs_sb_mount_common
+ *
+ * Mount initialization code establishing various mount
+ * fields from the superblock associated with the given
+ * mount structure.
+ */
+void
+xfs_sb_mount_common(
+       struct xfs_mount *mp,
+       struct xfs_sb   *sbp)
+{
+       mp->m_agfrotor = mp->m_agirotor = 0;
+       spin_lock_init(&mp->m_agirotor_lock);
+       mp->m_maxagi = mp->m_sb.sb_agcount;
+       mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
+       mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
+       mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
+       mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
+       mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
+       mp->m_blockmask = sbp->sb_blocksize - 1;
+       mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
+       mp->m_blockwmask = mp->m_blockwsize - 1;
+
+       mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
+       mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
+
+       mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
+       mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
+
+       mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
+       mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
+
+       mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
+       mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
+                                       sbp->sb_inopblock);
+       mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+}
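To make the shift arithmetic above concrete, here is a worked example under assumed geometry: 4096-byte filesystem blocks, 512-byte basic blocks, and 8 bits per byte (the constants sb_blocklog, BBSHIFT, and XFS_NBBYLOG are represented by plain local values).

```c
/* Worked example of the log2-based geometry fields computed above,
 * assuming 4096-byte blocks and 512-byte basic blocks. */
#include <stdio.h>

int main(void)
{
        int sb_blocklog = 12;            /* log2(4096) */
        int bbshift     = 9;             /* log2(512-byte basic block) */
        int nbbylog     = 3;             /* log2(8 bits per byte) */

        int blkbb_log  = sb_blocklog - bbshift;  /* 3: 8 BBs per block  */
        int blkbit_log = sb_blocklog + nbbylog;  /* 15: bits per block  */

        printf("1 fsblock = %d basic blocks, %d bits\n",
               1 << blkbb_log, 1 << blkbit_log);
        return 0;
}
```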
+
+/*
+ * xfs_initialize_perag_data
+ *
+ * Read in each per-ag structure so we can count up the number of
+ * allocated inodes, free inodes and used filesystem blocks as this
+ * information is no longer persistent in the superblock. Once we have
+ * this information, write it into the in-core superblock structure.
+ */
+int
+xfs_initialize_perag_data(
+       struct xfs_mount *mp,
+       xfs_agnumber_t  agcount)
+{
+       xfs_agnumber_t  index;
+       xfs_perag_t     *pag;
+       xfs_sb_t        *sbp = &mp->m_sb;
+       uint64_t        ifree = 0;
+       uint64_t        ialloc = 0;
+       uint64_t        bfree = 0;
+       uint64_t        bfreelst = 0;
+       uint64_t        btree = 0;
+       int             error;
+
+       for (index = 0; index < agcount; index++) {
+               /*
+                * read the agf, then the agi. This gets us
+                * all the information we need and populates the
+                * per-ag structures for us.
+                */
+               error = xfs_alloc_pagf_init(mp, NULL, index, 0);
+               if (error)
+                       return error;
+
+               error = xfs_ialloc_pagi_init(mp, NULL, index);
+               if (error)
+                       return error;
+               pag = xfs_perag_get(mp, index);
+               ifree += pag->pagi_freecount;
+               ialloc += pag->pagi_count;
+               bfree += pag->pagf_freeblks;
+               bfreelst += pag->pagf_flcount;
+               btree += pag->pagf_btreeblks;
+               xfs_perag_put(pag);
+       }
+       /*
+        * Overwrite incore superblock counters with just-read data
+        */
+       spin_lock(&mp->m_sb_lock);
+       sbp->sb_ifree = ifree;
+       sbp->sb_icount = ialloc;
+       sbp->sb_fdblocks = bfree + bfreelst + btree;
+       spin_unlock(&mp->m_sb_lock);
+
+       /* Fixup the per-cpu counters as well. */
+       xfs_icsb_reinit_counters(mp);
+
+       return 0;
+}
+
+/*
+ * xfs_mod_sb() can be used to copy arbitrary changes to the
+ * in-core superblock into the superblock buffer to be logged.
+ * It does not provide the higher level of locking that is
+ * needed to protect the in-core superblock from concurrent
+ * access.
+ */
+void
+xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
+{
+       xfs_buf_t       *bp;
+       int             first;
+       int             last;
+       xfs_mount_t     *mp;
+       xfs_sb_field_t  f;
+
+       ASSERT(fields);
+       if (!fields)
+               return;
+       mp = tp->t_mountp;
+       bp = xfs_trans_getsb(tp, mp, 0);
+       first = sizeof(xfs_sb_t);
+       last = 0;
+
+       /* translate/copy */
+
+       xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
+
+       /* find modified range */
+       f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
+       ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+       last = xfs_sb_info[f + 1].offset - 1;
+
+       f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+       ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+       first = xfs_sb_info[f].offset;
+
+       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+       xfs_trans_log_buf(tp, bp, first, last);
+}
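The first/last computation above logs only the contiguous byte range spanning the lowest through highest modified superblock field, rather than the whole buffer. A standalone sketch of that range reduction follows, using a toy offset table in place of xfs_sb_info[]; __builtin_ctz/__builtin_clz are GCC/Clang builtins assumed to be available.

```c
/* Sketch of the modified-range computation in xfs_mod_sb() above:
 * given a dirty-field bitmask, derive the byte range [first, last]
 * covering the lowest through highest dirty field. */
#include <stdio.h>

/* Toy field offset table with a sentinel, standing in for xfs_sb_info[] */
static const int info[] = { 0, 2, 4, 8, 16 };

int main(void)
{
        unsigned fields = (1u << 1) | (1u << 3);     /* fields 1 and 3 dirty */

        int lo = __builtin_ctz(fields);              /* lowest dirty field  */
        int hi = 31 - __builtin_clz(fields);         /* highest dirty field */

        int first = info[lo];                        /* start of field lo */
        int last  = info[hi + 1] - 1;                /* end of field hi   */

        printf("log bytes %d..%d\n", first, last);   /* prints 2..15 */
        return 0;
}
```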
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
new file mode 100644 (file)
index 0000000..2e73970
--- /dev/null
@@ -0,0 +1,621 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SB_H__
+#define        __XFS_SB_H__
+
+/*
+ * Super block
+ * Fits into a sector-sized buffer at address 0 of each allocation group.
+ * Only the first of these is ever updated except during growfs.
+ */
+
+struct xfs_buf;
+struct xfs_mount;
+struct xfs_trans;
+
+#define        XFS_SB_MAGIC            0x58465342      /* 'XFSB' */
+#define        XFS_SB_VERSION_1        1               /* 5.3, 6.0.1, 6.1 */
+#define        XFS_SB_VERSION_2        2               /* 6.2 - attributes */
+#define        XFS_SB_VERSION_3        3               /* 6.2 - new inode version */
+#define        XFS_SB_VERSION_4        4               /* 6.2+ - bitmask version */
+#define        XFS_SB_VERSION_5        5               /* CRC enabled filesystem */
+#define        XFS_SB_VERSION_NUMBITS          0x000f
+#define        XFS_SB_VERSION_ALLFBITS         0xfff0
+#define        XFS_SB_VERSION_ATTRBIT          0x0010
+#define        XFS_SB_VERSION_NLINKBIT         0x0020
+#define        XFS_SB_VERSION_QUOTABIT         0x0040
+#define        XFS_SB_VERSION_ALIGNBIT         0x0080
+#define        XFS_SB_VERSION_DALIGNBIT        0x0100
+#define        XFS_SB_VERSION_SHAREDBIT        0x0200
+#define XFS_SB_VERSION_LOGV2BIT                0x0400
+#define XFS_SB_VERSION_SECTORBIT       0x0800
+#define        XFS_SB_VERSION_EXTFLGBIT        0x1000
+#define        XFS_SB_VERSION_DIRV2BIT         0x2000
+#define        XFS_SB_VERSION_BORGBIT          0x4000  /* ASCII only case-insens. */
+#define        XFS_SB_VERSION_MOREBITSBIT      0x8000
+
+/*
+ * Supported feature bit list is just all bits in the versionnum field because
+ * we've used them all up and understand them all. Except, of course, for the
+ * shared superblock bit, whose function nobody knows, and which is therefore
+ * unsupported.
+ */
+#define        XFS_SB_VERSION_OKBITS           \
+       ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
+               ~XFS_SB_VERSION_SHAREDBIT)
+
+/*
+ * There are two words to hold XFS "feature" bits: the original
+ * word, sb_versionnum, and sb_features2.  Whenever a bit is set in
+ * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set.
+ *
+ * These defines represent bits in sb_features2.
+ */
+#define XFS_SB_VERSION2_RESERVED1BIT   0x00000001
+#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002      /* Superblk counters */
+#define XFS_SB_VERSION2_RESERVED4BIT   0x00000004
+#define XFS_SB_VERSION2_ATTR2BIT       0x00000008      /* Inline attr rework */
+#define XFS_SB_VERSION2_PARENTBIT      0x00000010      /* parent pointers */
+#define XFS_SB_VERSION2_PROJID32BIT    0x00000080      /* 32 bit project id */
+#define XFS_SB_VERSION2_CRCBIT         0x00000100      /* metadata CRCs */
+#define XFS_SB_VERSION2_FTYPE          0x00000200      /* inode type in dir */
+
+#define        XFS_SB_VERSION2_OKBITS          \
+       (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
+        XFS_SB_VERSION2_ATTR2BIT       | \
+        XFS_SB_VERSION2_PROJID32BIT    | \
+        XFS_SB_VERSION2_FTYPE)
+
+/*
+ * Superblock - in core version.  Must match the ondisk version below.
+ * Must be padded to 64 bit alignment.
+ */
+typedef struct xfs_sb {
+       __uint32_t      sb_magicnum;    /* magic number == XFS_SB_MAGIC */
+       __uint32_t      sb_blocksize;   /* logical block size, bytes */
+       xfs_rfsblock_t  sb_dblocks;     /* number of data blocks */
+       xfs_rfsblock_t  sb_rblocks;     /* number of realtime blocks */
+       xfs_rtblock_t   sb_rextents;    /* number of realtime extents */
+       uuid_t          sb_uuid;        /* file system unique id */
+       xfs_fsblock_t   sb_logstart;    /* starting block of log if internal */
+       xfs_ino_t       sb_rootino;     /* root inode number */
+       xfs_ino_t       sb_rbmino;      /* bitmap inode for realtime extents */
+       xfs_ino_t       sb_rsumino;     /* summary inode for rt bitmap */
+       xfs_agblock_t   sb_rextsize;    /* realtime extent size, blocks */
+       xfs_agblock_t   sb_agblocks;    /* size of an allocation group */
+       xfs_agnumber_t  sb_agcount;     /* number of allocation groups */
+       xfs_extlen_t    sb_rbmblocks;   /* number of rt bitmap blocks */
+       xfs_extlen_t    sb_logblocks;   /* number of log blocks */
+       __uint16_t      sb_versionnum;  /* header version == XFS_SB_VERSION */
+       __uint16_t      sb_sectsize;    /* volume sector size, bytes */
+       __uint16_t      sb_inodesize;   /* inode size, bytes */
+       __uint16_t      sb_inopblock;   /* inodes per block */
+       char            sb_fname[12];   /* file system name */
+       __uint8_t       sb_blocklog;    /* log2 of sb_blocksize */
+       __uint8_t       sb_sectlog;     /* log2 of sb_sectsize */
+       __uint8_t       sb_inodelog;    /* log2 of sb_inodesize */
+       __uint8_t       sb_inopblog;    /* log2 of sb_inopblock */
+       __uint8_t       sb_agblklog;    /* log2 of sb_agblocks (rounded up) */
+       __uint8_t       sb_rextslog;    /* log2 of sb_rextents */
+       __uint8_t       sb_inprogress;  /* mkfs is in progress, don't mount */
+       __uint8_t       sb_imax_pct;    /* max % of fs for inode space */
+                                       /* statistics */
+       /*
+        * These fields must remain contiguous.  If you really
+        * want to change their layout, make sure you fix the
+        * code in xfs_trans_apply_sb_deltas().
+        */
+       __uint64_t      sb_icount;      /* allocated inodes */
+       __uint64_t      sb_ifree;       /* free inodes */
+       __uint64_t      sb_fdblocks;    /* free data blocks */
+       __uint64_t      sb_frextents;   /* free realtime extents */
+       /*
+        * End contiguous fields.
+        */
+       xfs_ino_t       sb_uquotino;    /* user quota inode */
+       xfs_ino_t       sb_gquotino;    /* group quota inode */
+       __uint16_t      sb_qflags;      /* quota flags */
+       __uint8_t       sb_flags;       /* misc. flags */
+       __uint8_t       sb_shared_vn;   /* shared version number */
+       xfs_extlen_t    sb_inoalignmt;  /* inode chunk alignment, fsblocks */
+       __uint32_t      sb_unit;        /* stripe or raid unit */
+       __uint32_t      sb_width;       /* stripe or raid width */
+       __uint8_t       sb_dirblklog;   /* log2 of dir block size (fsbs) */
+       __uint8_t       sb_logsectlog;  /* log2 of the log sector size */
+       __uint16_t      sb_logsectsize; /* sector size for the log, bytes */
+       __uint32_t      sb_logsunit;    /* stripe unit size for the log */
+       __uint32_t      sb_features2;   /* additional feature bits */
+
+       /*
+        * bad features2 field as a result of failing to pad the sb
+        * structure to 64 bits. Some machines will be using this field
+        * for features2 bits. Easiest just to mark it bad and not use
+        * it for anything else.
+        */
+       __uint32_t      sb_bad_features2;
+
+       /* version 5 superblock fields start here */
+
+       /* feature masks */
+       __uint32_t      sb_features_compat;
+       __uint32_t      sb_features_ro_compat;
+       __uint32_t      sb_features_incompat;
+       __uint32_t      sb_features_log_incompat;
+
+       __uint32_t      sb_crc;         /* superblock crc */
+       __uint32_t      sb_pad;
+
+       xfs_ino_t       sb_pquotino;    /* project quota inode */
+       xfs_lsn_t       sb_lsn;         /* last write sequence */
+
+       /* must be padded to 64 bit alignment */
+} xfs_sb_t;
+
+#define XFS_SB_CRC_OFF         offsetof(struct xfs_sb, sb_crc)
+
+/*
+ * Superblock - on disk version.  Must match the in core version above.
+ * Must be padded to 64 bit alignment.
+ */
+typedef struct xfs_dsb {
+       __be32          sb_magicnum;    /* magic number == XFS_SB_MAGIC */
+       __be32          sb_blocksize;   /* logical block size, bytes */
+       __be64          sb_dblocks;     /* number of data blocks */
+       __be64          sb_rblocks;     /* number of realtime blocks */
+       __be64          sb_rextents;    /* number of realtime extents */
+       uuid_t          sb_uuid;        /* file system unique id */
+       __be64          sb_logstart;    /* starting block of log if internal */
+       __be64          sb_rootino;     /* root inode number */
+       __be64          sb_rbmino;      /* bitmap inode for realtime extents */
+       __be64          sb_rsumino;     /* summary inode for rt bitmap */
+       __be32          sb_rextsize;    /* realtime extent size, blocks */
+       __be32          sb_agblocks;    /* size of an allocation group */
+       __be32          sb_agcount;     /* number of allocation groups */
+       __be32          sb_rbmblocks;   /* number of rt bitmap blocks */
+       __be32          sb_logblocks;   /* number of log blocks */
+       __be16          sb_versionnum;  /* header version == XFS_SB_VERSION */
+       __be16          sb_sectsize;    /* volume sector size, bytes */
+       __be16          sb_inodesize;   /* inode size, bytes */
+       __be16          sb_inopblock;   /* inodes per block */
+       char            sb_fname[12];   /* file system name */
+       __u8            sb_blocklog;    /* log2 of sb_blocksize */
+       __u8            sb_sectlog;     /* log2 of sb_sectsize */
+       __u8            sb_inodelog;    /* log2 of sb_inodesize */
+       __u8            sb_inopblog;    /* log2 of sb_inopblock */
+       __u8            sb_agblklog;    /* log2 of sb_agblocks (rounded up) */
+       __u8            sb_rextslog;    /* log2 of sb_rextents */
+       __u8            sb_inprogress;  /* mkfs is in progress, don't mount */
+       __u8            sb_imax_pct;    /* max % of fs for inode space */
+                                       /* statistics */
+       /*
+        * These fields must remain contiguous.  If you really
+        * want to change their layout, make sure you fix the
+        * code in xfs_trans_apply_sb_deltas().
+        */
+       __be64          sb_icount;      /* allocated inodes */
+       __be64          sb_ifree;       /* free inodes */
+       __be64          sb_fdblocks;    /* free data blocks */
+       __be64          sb_frextents;   /* free realtime extents */
+       /*
+        * End contiguous fields.
+        */
+       __be64          sb_uquotino;    /* user quota inode */
+       __be64          sb_gquotino;    /* group quota inode */
+       __be16          sb_qflags;      /* quota flags */
+       __u8            sb_flags;       /* misc. flags */
+       __u8            sb_shared_vn;   /* shared version number */
+       __be32          sb_inoalignmt;  /* inode chunk alignment, fsblocks */
+       __be32          sb_unit;        /* stripe or raid unit */
+       __be32          sb_width;       /* stripe or raid width */
+       __u8            sb_dirblklog;   /* log2 of dir block size (fsbs) */
+       __u8            sb_logsectlog;  /* log2 of the log sector size */
+       __be16          sb_logsectsize; /* sector size for the log, bytes */
+       __be32          sb_logsunit;    /* stripe unit size for the log */
+       __be32          sb_features2;   /* additional feature bits */
+       /*
+        * bad features2 field as a result of failing to pad the sb
+        * structure to 64 bits. Some machines will be using this field
+        * for features2 bits. Easiest just to mark it bad and not use
+        * it for anything else.
+        */
+       __be32          sb_bad_features2;
+
+       /* version 5 superblock fields start here */
+
+       /* feature masks */
+       __be32          sb_features_compat;
+       __be32          sb_features_ro_compat;
+       __be32          sb_features_incompat;
+       __be32          sb_features_log_incompat;
+
+       __le32          sb_crc;         /* superblock crc */
+       __be32          sb_pad;
+
+       __be64          sb_pquotino;    /* project quota inode */
+       __be64          sb_lsn;         /* last write sequence */
+
+       /* must be padded to 64 bit alignment */
+} xfs_dsb_t;
+
+/*
+ * Sequence number values for the fields.
+ */
+typedef enum {
+       XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
+       XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
+       XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
+       XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
+       XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
+       XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
+       XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
+       XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
+       XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
+       XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
+       XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
+       XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
+       XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
+       XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
+       XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
+       XFS_SBS_PQUOTINO, XFS_SBS_LSN,
+       XFS_SBS_FIELDCOUNT
+} xfs_sb_field_t;
+
+/*
+ * Mask values, defined based on the xfs_sb_field_t values.
+ * Only define the ones we're using.
+ */
+#define        XFS_SB_MVAL(x)          (1LL << XFS_SBS_ ## x)
+#define        XFS_SB_UUID             XFS_SB_MVAL(UUID)
+#define        XFS_SB_FNAME            XFS_SB_MVAL(FNAME)
+#define        XFS_SB_ROOTINO          XFS_SB_MVAL(ROOTINO)
+#define        XFS_SB_RBMINO           XFS_SB_MVAL(RBMINO)
+#define        XFS_SB_RSUMINO          XFS_SB_MVAL(RSUMINO)
+#define        XFS_SB_VERSIONNUM       XFS_SB_MVAL(VERSIONNUM)
+#define XFS_SB_UQUOTINO                XFS_SB_MVAL(UQUOTINO)
+#define XFS_SB_GQUOTINO                XFS_SB_MVAL(GQUOTINO)
+#define XFS_SB_QFLAGS          XFS_SB_MVAL(QFLAGS)
+#define XFS_SB_SHARED_VN       XFS_SB_MVAL(SHARED_VN)
+#define XFS_SB_UNIT            XFS_SB_MVAL(UNIT)
+#define XFS_SB_WIDTH           XFS_SB_MVAL(WIDTH)
+#define XFS_SB_ICOUNT          XFS_SB_MVAL(ICOUNT)
+#define XFS_SB_IFREE           XFS_SB_MVAL(IFREE)
+#define XFS_SB_FDBLOCKS                XFS_SB_MVAL(FDBLOCKS)
+#define XFS_SB_FEATURES2       XFS_SB_MVAL(FEATURES2)
+#define XFS_SB_BAD_FEATURES2   XFS_SB_MVAL(BAD_FEATURES2)
+#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
+#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
+#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
+#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
+#define XFS_SB_CRC             XFS_SB_MVAL(CRC)
+#define XFS_SB_PQUOTINO                XFS_SB_MVAL(PQUOTINO)
+#define        XFS_SB_NUM_BITS         ((int)XFS_SBS_FIELDCOUNT)
+#define        XFS_SB_ALL_BITS         ((1LL << XFS_SB_NUM_BITS) - 1)
+#define        XFS_SB_MOD_BITS         \
+       (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
+        XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
+        XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
+        XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
+        XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \
+        XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \
+        XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO)
+
+
+/*
+ * Misc. Flags - warning - these will be cleared by xfs_repair unless
+ * a feature bit is set when the flag is used.
+ */
+#define XFS_SBF_NOFLAGS                0x00    /* no flags set */
+#define XFS_SBF_READONLY       0x01    /* only read-only mounts allowed */
+
+/*
+ * define max. shared version we can interoperate with
+ */
+#define XFS_SB_MAX_SHARED_VN   0
+
+#define        XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
+
+/*
+ * The first XFS version we support is a v4 superblock with V2 directories.
+ */
+static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
+{
+       if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
+               return false;
+
+       /* check for unknown features in the fs */
+       if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
+           ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+            (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
+               return false;
+
+       return true;
+}
+
+static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
+{
+       if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
+               return true;
+       if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+               return xfs_sb_good_v4_features(sbp);
+       return false;
+}
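The cascade above accepts v5 unconditionally, screens v4 through the feature-bit check, and rejects everything older. A minimal standalone illustration of that flow, with the version-number masking reduced to its essentials and DEMO_* names standing in for the real macros:

```c
/* Illustration of the version cascade in xfs_sb_good_version() above.
 * Constants are simplified stand-ins for the XFS_SB_VERSION_* macros. */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_VERSION_NUMBITS 0x000f
#define DEMO_VERSION_4       4
#define DEMO_VERSION_5       5

static bool demo_good_version(unsigned versionnum, bool v4_features_ok)
{
        unsigned num = versionnum & DEMO_VERSION_NUMBITS;

        if (num == DEMO_VERSION_5)
                return true;              /* v5: always recognised */
        if (num == DEMO_VERSION_4)
                return v4_features_ok;    /* v4: screen the feature bits */
        return false;                     /* v1-v3: unsupported */
}

int main(void)
{
        printf("%d %d %d\n",
               demo_good_version(DEMO_VERSION_5, false),  /* 1 */
               demo_good_version(DEMO_VERSION_4, true),   /* 1 */
               demo_good_version(3, true));               /* 0 */
        return 0;
}
```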
+
+/*
+ * Detect a mismatched features2 field.  Older kernels read/wrote
+ * this into the wrong slot, so to be safe we keep them in sync.
+ */
+static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
+{
+       return sbp->sb_bad_features2 != sbp->sb_features2;
+}
+
+static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
+{
+       return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT);
+}
+
+static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
+{
+       sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
+}
+
+static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
+{
+       return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
+}
+
+static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
+{
+       sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
+}
+
+static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
+{
+       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+               (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
+}
+
+static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
+{
+       return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
+}
+
+static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
+{
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+              (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
+}
+
+static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
+{
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+              (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
+}
+
+static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
+{
+       return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
+}
+
+static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
+{
+       return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
+}
+
+static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
+{
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+              (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
+}
+
+/*
+ * sb_features2 bit version macros.
+ */
+static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
+{
+       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+              (xfs_sb_version_hasmorebits(sbp) &&
+               (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
+}
+
+static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
+{
+       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+              (xfs_sb_version_hasmorebits(sbp) &&
+               (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
+}
+
+static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
+{
+       sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+       sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
+       sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
+}
+
+static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
+{
+       sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
+       sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
+       if (!sbp->sb_features2)
+               sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
+}
+
+static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
+{
+       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+              (xfs_sb_version_hasmorebits(sbp) &&
+               (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
+}
+
+static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
+{
+       sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+       sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
+       sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
+}
+
+/*
+ * Extended v5 superblock feature masks. These are to be used for new v5
+ * superblock features only.
+ *
+ * Compat features are new features that old kernels will not notice or affect
+ * and so can mount read-write without issues.
+ *
+ * RO-Compat (read only) are features that old kernels can read but will break
+ * if they write. Hence only read-only mounts of such filesystems are allowed on
+ * kernels that don't support the feature bit.
+ *
+ * InCompat features are features which old kernels will not understand and
+ * so must not mount the filesystem.
+ *
+ * Log-InCompat features are for changes to log formats or new transactions that
+ * can't be replayed on older kernels. The fields are set when the filesystem is
+ * mounted, and a clean unmount clears the fields.
+ */
+#define XFS_SB_FEAT_COMPAT_ALL 0
+#define XFS_SB_FEAT_COMPAT_UNKNOWN     ~XFS_SB_FEAT_COMPAT_ALL
+static inline bool
+xfs_sb_has_compat_feature(
+       struct xfs_sb   *sbp,
+       __uint32_t      feature)
+{
+       return (sbp->sb_features_compat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_RO_COMPAT_FINOBT   (1 << 0)                /* free inode btree */
+#define XFS_SB_FEAT_RO_COMPAT_ALL \
+               (XFS_SB_FEAT_RO_COMPAT_FINOBT)
+#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN  ~XFS_SB_FEAT_RO_COMPAT_ALL
+static inline bool
+xfs_sb_has_ro_compat_feature(
+       struct xfs_sb   *sbp,
+       __uint32_t      feature)
+{
+       return (sbp->sb_features_ro_compat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_INCOMPAT_FTYPE     (1 << 0)        /* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_ALL \
+               (XFS_SB_FEAT_INCOMPAT_FTYPE)
+
+#define XFS_SB_FEAT_INCOMPAT_UNKNOWN   ~XFS_SB_FEAT_INCOMPAT_ALL
+static inline bool
+xfs_sb_has_incompat_feature(
+       struct xfs_sb   *sbp,
+       __uint32_t      feature)
+{
+       return (sbp->sb_features_incompat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN       ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
+static inline bool
+xfs_sb_has_incompat_log_feature(
+       struct xfs_sb   *sbp,
+       __uint32_t      feature)
+{
+       return (sbp->sb_features_log_incompat & feature) != 0;
+}
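Because each *_UNKNOWN mask above is simply the complement of the bits this kernel understands, "has an unknown feature" reduces to a single AND against the on-disk mask. A tiny sketch of that screen, with illustrative DEMO_* masks rather than the real feature bits:

```c
/* Sketch of the unknown-feature screen built on the masks above:
 * any bit outside the known set trips the check. Values illustrative. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_FEAT_ALL     ((1u << 0) | (1u << 1))  /* bits we understand */
#define DEMO_FEAT_UNKNOWN (~DEMO_FEAT_ALL)

static bool demo_has_unknown(uint32_t features)
{
        return (features & DEMO_FEAT_UNKNOWN) != 0;
}

int main(void)
{
        printf("%d\n", demo_has_unknown(1u << 0));  /* 0: known bit   */
        printf("%d\n", demo_has_unknown(1u << 5));  /* 1: unknown bit */
        return 0;
}
```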
+
+/*
+ * V5 superblock specific feature checks
+ */
+static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
+{
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
+{
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
+{
+       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+               xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
+              (xfs_sb_version_hasmorebits(sbp) &&
+                (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
+}
+
+static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
+{
+       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
+               (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
+}
+
+/*
+ * end of superblock version macros
+ */
+
+static inline bool
+xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+{
+       return (ino == sbp->sb_uquotino ||
+               ino == sbp->sb_gquotino ||
+               ino == sbp->sb_pquotino);
+}
+
+#define XFS_SB_DADDR           ((xfs_daddr_t)0) /* daddr in filesystem/ag */
+#define        XFS_SB_BLOCK(mp)        XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
+#define XFS_BUF_TO_SBP(bp)     ((xfs_dsb_t *)((bp)->b_addr))
+
+#define        XFS_HDR_BLOCK(mp,d)     ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
+#define        XFS_DADDR_TO_FSB(mp,d)  XFS_AGB_TO_FSB(mp, \
+                       xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
+#define        XFS_FSB_TO_DADDR(mp,fsbno)      XFS_AGB_TO_DADDR(mp, \
+                       XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
+
+/*
+ * File system sector to basic block conversions.
+ */
+#define XFS_FSS_TO_BB(mp,sec)  ((sec) << (mp)->m_sectbb_log)
+
+/*
+ * File system block to basic block conversions.
+ */
+#define        XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
+#define        XFS_BB_TO_FSB(mp,bb)    \
+       (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
+#define        XFS_BB_TO_FSBT(mp,bb)   ((bb) >> (mp)->m_blkbb_log)
+
+/*
+ * File system block to byte conversions.
+ */
+#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSB(mp,b)     \
+       ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSBT(mp,b)    (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
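A worked example of the conversion macros above, assuming 4096-byte filesystem blocks and 512-byte basic blocks (the mount-struct fields are represented by plain local values):

```c
/* Worked example of the FSB/BB/byte conversions above under an
 * assumed geometry of 4096-byte blocks and 512-byte basic blocks. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const int      blkbb_log = 3;        /* log2(4096 / 512) */
        const int      blocklog  = 12;       /* log2(4096) */
        const uint64_t blockmask = 4096 - 1;

        uint64_t fsbno = 10;
        uint64_t bytes = 5000;

        /* XFS_FSB_TO_BB analogue: fsblocks to 512-byte basic blocks */
        printf("bb  = %llu\n", (unsigned long long)(fsbno << blkbb_log));

        /* XFS_B_TO_FSB analogue: bytes to fsblocks, rounding up */
        printf("fsb = %llu\n",
               (unsigned long long)((bytes + blockmask) >> blocklog));

        /* XFS_B_FSB_OFFSET analogue: byte offset within its block */
        printf("off = %llu\n", (unsigned long long)(bytes & blockmask));
        return 0;
}
```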
+
+/*
+ * perag get/put wrappers for ref counting
+ */
+extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t);
+extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
+                                          int tag);
+extern void    xfs_perag_put(struct xfs_perag *pag);
+extern int     xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
+
+extern void    xfs_sb_calc_crc(struct xfs_buf  *);
+extern void    xfs_mod_sb(struct xfs_trans *, __int64_t);
+extern void    xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *);
+extern void    xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
+extern void    xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
+extern void    xfs_sb_quota_from_disk(struct xfs_sb *sbp);
+
+#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
new file mode 100644 (file)
index 0000000..82404da
--- /dev/null
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SHARED_H__
+#define __XFS_SHARED_H__
+
+/*
+ * Definitions shared between kernel and userspace that don't fit into any other
+ * header file that is shared with userspace.
+ */
+struct xfs_ifork;
+struct xfs_buf;
+struct xfs_buf_ops;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+
+/*
+ * Buffer verifier operations are widely used, including by userspace tools.
+ */
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+extern const struct xfs_buf_ops xfs_agfl_buf_ops;
+extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
+extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
+extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
+extern const struct xfs_buf_ops xfs_sb_buf_ops;
+extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+
+/*
+ * Transaction types.  Used to distinguish types of buffers. These never reach
+ * the log.
+ */
+#define XFS_TRANS_SETATTR_NOT_SIZE     1
+#define XFS_TRANS_SETATTR_SIZE         2
+#define XFS_TRANS_INACTIVE             3
+#define XFS_TRANS_CREATE               4
+#define XFS_TRANS_CREATE_TRUNC         5
+#define XFS_TRANS_TRUNCATE_FILE                6
+#define XFS_TRANS_REMOVE               7
+#define XFS_TRANS_LINK                 8
+#define XFS_TRANS_RENAME               9
+#define XFS_TRANS_MKDIR                        10
+#define XFS_TRANS_RMDIR                        11
+#define XFS_TRANS_SYMLINK              12
+#define XFS_TRANS_SET_DMATTRS          13
+#define XFS_TRANS_GROWFS               14
+#define XFS_TRANS_STRAT_WRITE          15
+#define XFS_TRANS_DIOSTRAT             16
+/* 17 was XFS_TRANS_WRITE_SYNC */
+#define        XFS_TRANS_WRITEID               18
+#define        XFS_TRANS_ADDAFORK              19
+#define        XFS_TRANS_ATTRINVAL             20
+#define        XFS_TRANS_ATRUNCATE             21
+#define        XFS_TRANS_ATTR_SET              22
+#define        XFS_TRANS_ATTR_RM               23
+#define        XFS_TRANS_ATTR_FLAG             24
+#define        XFS_TRANS_CLEAR_AGI_BUCKET      25
+#define XFS_TRANS_QM_SBCHANGE          26
+/*
+ * Dummy entries since we use the transaction type to index into the
+ * trans_type[] in xlog_recover_print_trans_head()
+ */
+#define XFS_TRANS_DUMMY1               27
+#define XFS_TRANS_DUMMY2               28
+#define XFS_TRANS_QM_QUOTAOFF          29
+#define XFS_TRANS_QM_DQALLOC           30
+#define XFS_TRANS_QM_SETQLIM           31
+#define XFS_TRANS_QM_DQCLUSTER         32
+#define XFS_TRANS_QM_QINOCREATE                33
+#define XFS_TRANS_QM_QUOTAOFF_END      34
+#define XFS_TRANS_SB_UNIT              35
+#define XFS_TRANS_FSYNC_TS             36
+#define        XFS_TRANS_GROWFSRT_ALLOC        37
+#define        XFS_TRANS_GROWFSRT_ZERO         38
+#define        XFS_TRANS_GROWFSRT_FREE         39
+#define        XFS_TRANS_SWAPEXT               40
+#define        XFS_TRANS_SB_COUNT              41
+#define        XFS_TRANS_CHECKPOINT            42
+#define        XFS_TRANS_ICREATE               43
+#define        XFS_TRANS_CREATE_TMPFILE        44
+#define        XFS_TRANS_TYPE_MAX              44
+/* new transaction types need to be reflected in xfs_logprint(8) */
+
+#define XFS_TRANS_TYPES \
+       { XFS_TRANS_SETATTR_NOT_SIZE,   "SETATTR_NOT_SIZE" }, \
+       { XFS_TRANS_SETATTR_SIZE,       "SETATTR_SIZE" }, \
+       { XFS_TRANS_INACTIVE,           "INACTIVE" }, \
+       { XFS_TRANS_CREATE,             "CREATE" }, \
+       { XFS_TRANS_CREATE_TMPFILE,     "CREATE_TMPFILE" }, \
+       { XFS_TRANS_CREATE_TRUNC,       "CREATE_TRUNC" }, \
+       { XFS_TRANS_TRUNCATE_FILE,      "TRUNCATE_FILE" }, \
+       { XFS_TRANS_REMOVE,             "REMOVE" }, \
+       { XFS_TRANS_LINK,               "LINK" }, \
+       { XFS_TRANS_RENAME,             "RENAME" }, \
+       { XFS_TRANS_MKDIR,              "MKDIR" }, \
+       { XFS_TRANS_RMDIR,              "RMDIR" }, \
+       { XFS_TRANS_SYMLINK,            "SYMLINK" }, \
+       { XFS_TRANS_SET_DMATTRS,        "SET_DMATTRS" }, \
+       { XFS_TRANS_GROWFS,             "GROWFS" }, \
+       { XFS_TRANS_STRAT_WRITE,        "STRAT_WRITE" }, \
+       { XFS_TRANS_DIOSTRAT,           "DIOSTRAT" }, \
+       { XFS_TRANS_WRITEID,            "WRITEID" }, \
+       { XFS_TRANS_ADDAFORK,           "ADDAFORK" }, \
+       { XFS_TRANS_ATTRINVAL,          "ATTRINVAL" }, \
+       { XFS_TRANS_ATRUNCATE,          "ATRUNCATE" }, \
+       { XFS_TRANS_ATTR_SET,           "ATTR_SET" }, \
+       { XFS_TRANS_ATTR_RM,            "ATTR_RM" }, \
+       { XFS_TRANS_ATTR_FLAG,          "ATTR_FLAG" }, \
+       { XFS_TRANS_CLEAR_AGI_BUCKET,   "CLEAR_AGI_BUCKET" }, \
+       { XFS_TRANS_QM_SBCHANGE,        "QM_SBCHANGE" }, \
+       { XFS_TRANS_QM_QUOTAOFF,        "QM_QUOTAOFF" }, \
+       { XFS_TRANS_QM_DQALLOC,         "QM_DQALLOC" }, \
+       { XFS_TRANS_QM_SETQLIM,         "QM_SETQLIM" }, \
+       { XFS_TRANS_QM_DQCLUSTER,       "QM_DQCLUSTER" }, \
+       { XFS_TRANS_QM_QINOCREATE,      "QM_QINOCREATE" }, \
+       { XFS_TRANS_QM_QUOTAOFF_END,    "QM_QOFF_END" }, \
+       { XFS_TRANS_SB_UNIT,            "SB_UNIT" }, \
+       { XFS_TRANS_FSYNC_TS,           "FSYNC_TS" }, \
+       { XFS_TRANS_GROWFSRT_ALLOC,     "GROWFSRT_ALLOC" }, \
+       { XFS_TRANS_GROWFSRT_ZERO,      "GROWFSRT_ZERO" }, \
+       { XFS_TRANS_GROWFSRT_FREE,      "GROWFSRT_FREE" }, \
+       { XFS_TRANS_SWAPEXT,            "SWAPEXT" }, \
+       { XFS_TRANS_SB_COUNT,           "SB_COUNT" }, \
+       { XFS_TRANS_CHECKPOINT,         "CHECKPOINT" }, \
+       { XFS_TRANS_DUMMY1,             "DUMMY1" }, \
+       { XFS_TRANS_DUMMY2,             "DUMMY2" }, \
+       { XLOG_UNMOUNT_REC_TYPE,        "UNMOUNT" }
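The { value, "name" } pairs above are shaped for the kernel's tracepoint helper __print_symbolic(), which decodes a numeric type into its name in trace output. A minimal, hypothetical use (illustrative only, not taken from this patch):

	TP_printk("type %s",
		  __print_symbolic(__entry->tr_type, XFS_TRANS_TYPES))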
+
+/*
+ * This structure is used to track log items associated with
+ * a transaction.  It points to the log item and keeps some
+ * flags to track the state of the log item.  It also tracks
+ * the amount of space needed to log the item it describes
+ * once we get to commit processing (see xfs_trans_commit()).
+ */
+struct xfs_log_item_desc {
+       struct xfs_log_item     *lid_item;
+       struct list_head        lid_trans;
+       unsigned char           lid_flags;
+};
+
+#define XFS_LID_DIRTY          0x1
+
+/* log size calculation functions */
+int    xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
+int    xfs_log_calc_minimum_size(struct xfs_mount *);
+
+
+/*
+ * Values for t_flags.
+ */
+#define        XFS_TRANS_DIRTY         0x01    /* something needs to be logged */
+#define        XFS_TRANS_SB_DIRTY      0x02    /* superblock is modified */
+#define        XFS_TRANS_PERM_LOG_RES  0x04    /* xact took a permanent log res */
+#define        XFS_TRANS_SYNC          0x08    /* make commit synchronous */
+#define XFS_TRANS_DQ_DIRTY     0x10    /* at least one dquot in trx dirty */
+#define XFS_TRANS_RESERVE      0x20    /* OK to use reserved data blocks */
+#define XFS_TRANS_FREEZE_PROT  0x40    /* Transaction has elevated writer
+                                          count in superblock */
+/*
+ * Values for call flags parameter.
+ */
+#define        XFS_TRANS_RELEASE_LOG_RES       0x4
+#define        XFS_TRANS_ABORT                 0x8
+
+/*
+ * Field values for xfs_trans_mod_sb.
+ */
+#define        XFS_TRANS_SB_ICOUNT             0x00000001
+#define        XFS_TRANS_SB_IFREE              0x00000002
+#define        XFS_TRANS_SB_FDBLOCKS           0x00000004
+#define        XFS_TRANS_SB_RES_FDBLOCKS       0x00000008
+#define        XFS_TRANS_SB_FREXTENTS          0x00000010
+#define        XFS_TRANS_SB_RES_FREXTENTS      0x00000020
+#define        XFS_TRANS_SB_DBLOCKS            0x00000040
+#define        XFS_TRANS_SB_AGCOUNT            0x00000080
+#define        XFS_TRANS_SB_IMAXPCT            0x00000100
+#define        XFS_TRANS_SB_REXTSIZE           0x00000200
+#define        XFS_TRANS_SB_RBMBLOCKS          0x00000400
+#define        XFS_TRANS_SB_RBLOCKS            0x00000800
+#define        XFS_TRANS_SB_REXTENTS           0x00001000
+#define        XFS_TRANS_SB_REXTSLOG           0x00002000
+
+/*
+ * Here we centralize the specification of XFS meta-data buffer reference count
+ * values.  This determines how hard the buffer cache tries to hold onto the
+ * buffer.
+ */
+#define        XFS_AGF_REF             4
+#define        XFS_AGI_REF             4
+#define        XFS_AGFL_REF            3
+#define        XFS_INO_BTREE_REF       3
+#define        XFS_ALLOC_BTREE_REF     2
+#define        XFS_BMAP_BTREE_REF      2
+#define        XFS_DIR_BTREE_REF       2
+#define        XFS_INO_REF             2
+#define        XFS_ATTR_BTREE_REF      1
+#define        XFS_DQUOT_REF           1
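These values bias the buffer LRU. A hedged usage sketch, assuming the preexisting xfs_buf_set_ref() helper:

	/* after reading an inobt block: make the cache hold it harder */
	xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);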
+
+/*
+ * Flags for xfs_trans_ichgtime().
+ */
+#define        XFS_ICHGTIME_MOD        0x1     /* data fork modification timestamp */
+#define        XFS_ICHGTIME_CHG        0x2     /* inode field change timestamp */
+#define        XFS_ICHGTIME_CREATE     0x4     /* inode create timestamp */
+
+
+/*
+ * Symlink decoding/encoding functions
+ */
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+                       uint32_t size, struct xfs_buf *bp);
+bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
+                       uint32_t size, struct xfs_buf *bp);
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+                                struct xfs_inode *ip, struct xfs_ifork *ifp);
+
+#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
new file mode 100644 (file)
index 0000000..5782f03
--- /dev/null
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012-2013 Red Hat, Inc.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+
+
+/*
+ * Each contiguous block has a header, so it is not just a simple pathlen
+ * to FSB conversion.
+ */
+int
+xfs_symlink_blocks(
+       struct xfs_mount *mp,
+       int             pathlen)
+{
+       int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+
+       return (pathlen + buflen - 1) / buflen;
+}
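As a worked example (illustrative numbers, assuming a 4096-byte block and the 56-byte v5 struct xfs_dsymlink_hdr): buflen = 4096 - 56 = 4040, so even a maximal MAXPATHLEN (1024-byte) target needs (1024 + 4040 - 1) / 4040 = 1 block.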
+
+int
+xfs_symlink_hdr_set(
+       struct xfs_mount        *mp,
+       xfs_ino_t               ino,
+       uint32_t                offset,
+       uint32_t                size,
+       struct xfs_buf          *bp)
+{
+       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return 0;
+
+       dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
+       dsl->sl_offset = cpu_to_be32(offset);
+       dsl->sl_bytes = cpu_to_be32(size);
+       uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
+       dsl->sl_owner = cpu_to_be64(ino);
+       dsl->sl_blkno = cpu_to_be64(bp->b_bn);
+       bp->b_ops = &xfs_symlink_buf_ops;
+
+       return sizeof(struct xfs_dsymlink_hdr);
+}
+
+/*
+ * Checking of the symlink header is split into two parts: the verifier does
+ * CRC, location and bounds checking, while the unpacking function checks the
+ * path parameters and owner.
+ */
+bool
+xfs_symlink_hdr_ok(
+       xfs_ino_t               ino,
+       uint32_t                offset,
+       uint32_t                size,
+       struct xfs_buf          *bp)
+{
+       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+       if (offset != be32_to_cpu(dsl->sl_offset))
+               return false;
+       if (size != be32_to_cpu(dsl->sl_bytes))
+               return false;
+       if (ino != be64_to_cpu(dsl->sl_owner))
+               return false;
+
+       /* ok */
+       return true;
+}
+
+static bool
+xfs_symlink_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return false;
+       if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+               return false;
+       if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
+               return false;
+       if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
+               return false;
+       if (be32_to_cpu(dsl->sl_offset) +
+                               be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
+               return false;
+       if (dsl->sl_owner == 0)
+               return false;
+
+       return true;
+}
+
+static void
+xfs_symlink_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+
+       /* no verification of non-crc buffers */
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
+               xfs_buf_ioerror(bp, -EFSBADCRC);
+       else if (!xfs_symlink_verify(bp))
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+       if (bp->b_error)
+               xfs_verifier_error(bp);
+}
+
+static void
+xfs_symlink_write_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+       /* no verification of non-crc buffers */
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (!xfs_symlink_verify(bp)) {
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               return;
+       }
+
+       if (bip) {
+               struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+               dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+       }
+       xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_symlink_buf_ops = {
+       .verify_read = xfs_symlink_read_verify,
+       .verify_write = xfs_symlink_write_verify,
+};
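Callers don't invoke these verifiers directly: attaching the ops to a buffer makes the buffer layer run verify_read when the read completes and verify_write before the buffer goes to disk. A hedged sketch of the read side, with placeholder blkno/len values and the assumed xfs_trans_read_buf() signature of this era:

	/* sketch only: blkno and len are placeholders, not from this patch */
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, blkno, len,
				   0, &bp, &xfs_symlink_buf_ops);
	if (error)
		return error;	/* verify_read set -EFSBADCRC or -EFSCORRUPTED */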
+
+void
+xfs_symlink_local_to_remote(
+       struct xfs_trans        *tp,
+       struct xfs_buf          *bp,
+       struct xfs_inode        *ip,
+       struct xfs_ifork        *ifp)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       char                    *buf;
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+               bp->b_ops = NULL;
+               memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+               return;
+       }
+
+       /*
+        * As this symlink fits in an inode literal area, it must also fit in
+        * the smallest buffer the filesystem supports.
+        */
+       ASSERT(BBTOB(bp->b_length) >=
+                       ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
+
+       bp->b_ops = &xfs_symlink_buf_ops;
+
+       buf = bp->b_addr;
+       buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
+       memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
new file mode 100644 (file)
index 0000000..f2bda7c
--- /dev/null
@@ -0,0 +1,894 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+
+/*
+ * A buffer has a format structure overhead in the log in addition
+ * to the data, so we need to take this into account when reserving
+ * space in a transaction for a buffer.  Round the space required up
+ * to a multiple of 128 bytes so that we don't change the historical
+ * reservation that has been used for this overhead.
+ */
+STATIC uint
+xfs_buf_log_overhead(void)
+{
+       return round_up(sizeof(struct xlog_op_header) +
+                       sizeof(struct xfs_buf_log_format), 128);
+}
+
+/*
+ * Calculate the transaction log reservation per item in bytes.
+ *
+ * The nbufs argument indicates the number of items that will be changed in
+ * a transaction; size tells how many bytes should be reserved per item.
+ */
+STATIC uint
+xfs_calc_buf_res(
+       uint            nbufs,
+       uint            size)
+{
+       return nbufs * (size + xfs_buf_log_overhead());
+}
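On common configurations the op header plus buffer log format sum to less than 128 bytes, so the overhead rounds to exactly 128 and reservations are easy to check by hand; for instance, three sector-sized (512-byte) buffers cost 3 * (512 + 128) = 1920 bytes.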
+
+/*
+ * Logging inodes is really tricksy. They are logged in memory format,
+ * which means that what we write into the log doesn't directly translate into
+ * the amount of space they use on disk.
+ *
+ * Case in point - btree format forks in memory format use more space than the
+ * on-disk format. In memory, the buffer contains a normal btree block header so
+ * the btree code can treat it as though it is just another generic buffer.
+ * However, when we write it to the inode fork, we don't write all of this
+ * header as it isn't needed. e.g. the root is only ever in the inode, so
+ * there's no need for sibling pointers which would waste 16 bytes of space.
+ *
+ * Hence when we have an inode with a maximally sized btree format fork, the
+ * amount of information we actually log is greater than the size of the inode
+ * on disk. Hence we need an inode reservation function that calculates all this
+ * correctly. So, we log:
+ *
+ * - 4 log op headers for object
+ *     - for the ilf, the inode core and 2 forks
+ * - inode log format object
+ * - the inode core
+ * - two inode forks containing bmap btree root blocks.
+ *     - the btree data contained by both forks will fit into the inode size,
+ *       hence when combined with the inode core above, we have a total of the
+ *       actual inode size.
+ *     - the BMBT headers need to be accounted separately, as they are
+ *       additional to the records and pointers that fit inside the inode
+ *       forks.
+ */
+STATIC uint
+xfs_calc_inode_res(
+       struct xfs_mount        *mp,
+       uint                    ninodes)
+{
+       return ninodes *
+               (4 * sizeof(struct xlog_op_header) +
+                sizeof(struct xfs_inode_log_format) +
+                mp->m_sb.sb_inodesize +
+                2 * XFS_BMBT_BLOCK_LEN(mp));
+}
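Plugging in illustrative (not authoritative) sizes shows the shape of the formula: with a 12-byte op header, a 56-byte inode log format, 512-byte inodes and 72-byte CRC-format bmbt block headers, one inode reserves 4*12 + 56 + 512 + 2*72 = 760 bytes.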
+
+/*
+ * The free inode btree is a conditional feature and the log reservation
+ * requirements differ slightly from that of the traditional inode allocation
+ * btree. The finobt tracks records for inode chunks with at least one free
+ * inode. A record can be removed from the tree for an inode allocation
+ * or free and thus the finobt reservation is unconditional across:
+ *
+ *     - inode allocation
+ *     - inode free
+ *     - inode chunk allocation
+ *
+ * The 'modify' param indicates whether to include the record modification
+ * scenario.  The 'alloc' param indicates whether to include the reservation
+ * for free space btree modifications made on behalf of the finobt; this is
+ * required only for transactions that do not already account for free space
+ * btree modifications.
+ *
+ * the free inode btree: max depth * block size
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ * the free inode btree entry: block size
+ */
+STATIC uint
+xfs_calc_finobt_res(
+       struct xfs_mount        *mp,
+       int                     alloc,
+       int                     modify)
+{
+       uint res;
+
+       if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+               return 0;
+
+       res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1));
+       if (alloc)
+               res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), 
+                                       XFS_FSB_TO_B(mp, 1));
+       if (modify)
+               res += (uint)XFS_FSB_TO_B(mp, 1);
+
+       return res;
+}
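The three call sites later in this file exercise the flags differently, which is worth keeping in mind when reading the reservations below:

	xfs_calc_finobt_res(mp, 1, 1);	/* create: record modify + alloc */
	xfs_calc_finobt_res(mp, 0, 0);	/* icreate: insertion only */
	xfs_calc_finobt_res(mp, 0, 1);	/* ifree: record modification */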
+
+/*
+ * Various log reservation values.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate.  Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note:  Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual.  In order to fix this we need to change xfs_bmap_finish() to free
+ * extents in only a single AG at a time.  This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction.  See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
+ */
+
+
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents.  This gives:
+ *    the inode getting the new extents: inode size
+ *    the inode's bmap btree: max depth * block size
+ *    the agfs of the ags from which the extents are allocated: 2 * sector
+ *    the superblock free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ *    the agfs of the ags containing the blocks: 2 * sector size
+ *    the agfls of the ags containing the blocks: 2 * sector size
+ *    the super block free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_write_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 1) +
+                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+                                     XFS_FSB_TO_B(mp, 1)) +
+                    xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                                     XFS_FSB_TO_B(mp, 1))),
+                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                                     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * In truncating a file we free up to two extents at once.  We can modify:
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *             4 exts * 2 trees * (2 * max depth - 1) * block size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_itruncate_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 1) +
+                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
+                                     XFS_FSB_TO_B(mp, 1))),
+                   (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+                                     XFS_FSB_TO_B(mp, 1)) +
+                   xfs_calc_buf_res(5, 0) +
+                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                    XFS_FSB_TO_B(mp, 1)) +
+                   xfs_calc_buf_res(2 + mp->m_ialloc_blks +
+                                    mp->m_in_maxlevels, 0)));
+}
+
+/*
+ * In renaming files we can modify:
+ *    the four inodes involved: 4 * inode size
+ *    the two directory btrees: 2 * (max depth + v2) * dir block size
+ *    the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ *     of bmap blocks) giving:
+ *    the agf for the ags in which the blocks live: 3 * sector size
+ *    the agfl for the ags in which the blocks live: 3 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_rename_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 4) +
+                    xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
+                                     XFS_FSB_TO_B(mp, 1))),
+                   (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
+                                     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For removing an inode from the unlinked list we can modify:
+ *    the agi hash list and counters: sector size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ */
+STATIC uint
+xfs_calc_iunlink_remove_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+              max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
+}
+
+/*
+ * For creating a link to an inode:
+ *    the parent directory inode: inode size
+ *    the linked inode: inode size
+ *    the directory btree could split: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ *    the agf for the ag in which the blocks live: sector size
+ *    the agfl for the ag in which the blocks live: sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_link_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               xfs_calc_iunlink_remove_reservation(mp) +
+               MAX((xfs_calc_inode_res(mp, 2) +
+                    xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+                                     XFS_FSB_TO_B(mp, 1))),
+                   (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For adding an inode to unlinked list we can modify:
+ *    the agi hash list: sector size
+ *    the unlinked inode: inode size
+ */
+STATIC uint
+xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+               xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * For removing a directory entry we can modify:
+ *    the parent directory inode: inode size
+ *    the removed inode: inode size
+ *    the directory btree could join: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_remove_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               xfs_calc_iunlink_add_reservation(mp) +
+               MAX((xfs_calc_inode_res(mp, 1) +
+                    xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+                                     XFS_FSB_TO_B(mp, 1))),
+                   (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                                     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For create, break it into the two cases that the transaction covers.  We
+ * start with the modify case - allocation done by modification of the state
+ * of existing inodes - and then the allocation case.
+ */
+
+/*
+ * For create we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: block size
+ *    the superblock for the nlink flag: sector size
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ *    the finobt (record modification and allocation btrees)
+ */
+STATIC uint
+xfs_calc_create_resv_modify(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_inode_res(mp, 2) +
+               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+               (uint)XFS_FSB_TO_B(mp, 1) +
+               xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_finobt_res(mp, 1, 1);
+}
+
+/*
+ * For create we can allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode blocks allocated: mp->m_ialloc_blks * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_create_resv_alloc(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               mp->m_sb.sb_sectsize +
+               xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+__xfs_calc_create_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX(xfs_calc_create_resv_alloc(mp),
+                   xfs_calc_create_resv_modify(mp));
+}
+
+/*
+ * For icreate we can allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ *    the finobt (record insertion)
+ */
+STATIC uint
+xfs_calc_icreate_resv_alloc(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               mp->m_sb.sb_sectsize +
+               xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_finobt_res(mp, 0, 0);
+}
+
+STATIC uint
+xfs_calc_icreate_reservation(xfs_mount_t *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX(xfs_calc_icreate_resv_alloc(mp),
+                   xfs_calc_create_resv_modify(mp));
+}
+
+STATIC uint
+xfs_calc_create_reservation(
+       struct xfs_mount        *mp)
+{
+       if (xfs_sb_version_hascrc(&mp->m_sb))
+               return xfs_calc_icreate_reservation(mp);
+       return __xfs_calc_create_reservation(mp);
+
+}
+
+STATIC uint
+xfs_calc_create_tmpfile_reservation(
+       struct xfs_mount        *mp)
+{
+       uint    res = XFS_DQUOT_LOGRES(mp);
+
+       if (xfs_sb_version_hascrc(&mp->m_sb))
+               res += xfs_calc_icreate_resv_alloc(mp);
+       else
+               res += xfs_calc_create_resv_alloc(mp);
+
+       return res + xfs_calc_iunlink_add_reservation(mp);
+}
+
+/*
+ * Making a new directory is the same as creating a new file.
+ */
+STATIC uint
+xfs_calc_mkdir_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_create_reservation(mp);
+}
+
+
+/*
+ * Making a new symlink is the same as creating a new file, but
+ * with the added blocks for remote symlink data, which can be up to 1kB in
+ * length (MAXPATHLEN).
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_create_reservation(mp) +
+              xfs_calc_buf_res(1, MAXPATHLEN);
+}
+
+/*
+ * In freeing an inode we can modify:
+ *    the inode being freed: inode size
+ *    the super block free inode counter: sector size
+ *    the agi hash list and counters: sector size
+ *    the inode btree entry: block size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ *    the finobt (record insertion, removal or modification)
+ */
+STATIC uint
+xfs_calc_ifree_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               xfs_calc_inode_res(mp, 1) +
+               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_iunlink_remove_reservation(mp) +
+               xfs_calc_buf_res(1, 0) +
+               xfs_calc_buf_res(2 + mp->m_ialloc_blks +
+                                mp->m_in_maxlevels, 0) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_finobt_res(mp, 0, 1);
+}
+
+/*
+ * When only changing the inode we log the inode and possibly the superblock.
+ * We also add a bit of slop for the transaction stuff.
+ */
+STATIC uint
+xfs_calc_ichange_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               xfs_calc_inode_res(mp, 1) +
+               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+
+}
+
+/*
+ * Growing the data section of the filesystem.
+ *     superblock
+ *     agi and agf
+ *     allocation btrees
+ */
+STATIC uint
+xfs_calc_growdata_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ *     superblock: sector size
+ *     agf of the ag from which the extent is allocated: sector size
+ *     bmap btree for bitmap/summary inode: max depth * blocksize
+ *     bitmap/summary inode: inode size
+ *     allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
+STATIC uint
+xfs_calc_growrtalloc_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+                                XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_inode_res(mp, 1) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ *     one bitmap/summary block: blocksize
+ */
+STATIC uint
+xfs_calc_growrtzero_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ *     superblock: sector size
+ *     bitmap inode: inode size
+ *     summary inode: inode size
+ *     one bitmap block: blocksize
+ *     summary blocks: new summary size
+ */
+STATIC uint
+xfs_calc_growrtfree_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+               xfs_calc_inode_res(mp, 2) +
+               xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
+               xfs_calc_buf_res(1, mp->m_rsumsize);
+}
+
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ *     inode
+ */
+STATIC uint
+xfs_calc_swrite_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ *     inode
+ */
+STATIC uint
+xfs_calc_writeid_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Converting the inode from non-attributed to attributed.
+ *     the inode being converted: inode size
+ *     agf block and superblock (for block allocation)
+ *     the new block (directory sized)
+ *     bmap blocks for the new directory block
+ *     allocation btrees
+ */
+STATIC uint
+xfs_calc_addafork_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               xfs_calc_inode_res(mp, 1) +
+               xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
+               xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
+                                XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing the attribute fork of a file
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *             4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrinval_reservation(
+       struct xfs_mount        *mp)
+{
+       return MAX((xfs_calc_inode_res(mp, 1) +
+                   xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+                                    XFS_FSB_TO_B(mp, 1))),
+                  (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+                                    XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Setting an attribute at mount time.
+ *     the inode getting the attribute
+ *     the superblock for allocations
+ *     the agfs extents are allocated from
+ *     the attribute btree * max depth
+ *     the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime
+ * (see below).
+ */
+STATIC uint
+xfs_calc_attrsetm_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               xfs_calc_inode_res(mp, 1) +
+               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Setting an attribute at runtime, transaction space unit per block.
+ *     the superblock for allocations: sector size
+ *     the inode bmap btree could join or split: max depth * block size
+ * Since the runtime attribute transaction space is dependent on the total
+ * blocks needed for the 1st bmap, here we calculate the space unit for
+ * one block so that the caller can figure out the total space according
+ * to the attribute extent length in blocks by:
+ *     ext * M_RES(mp)->tr_attrsetrt.tr_logres
+ */
+STATIC uint
+xfs_calc_attrsetrt_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+                                XFS_FSB_TO_B(mp, 1));
+}
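A hedged caller-side sketch of the per-block scaling described above (variable names are illustrative, not from this patch):

	/* total runtime attr-set log space for a value spanning 'blks' blocks */
	blks = XFS_B_TO_FSB(mp, valuelen);
	logres = M_RES(mp)->tr_attrsetrt.tr_logres * blks;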
+
+/*
+ * Removing an attribute.
+ *    the inode: inode size
+ *    the attribute btree could join: max depth * block size
+ *    the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks freed giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrrm_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 1) +
+                    xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
+                                     XFS_FSB_TO_B(mp, 1)) +
+                    (uint)XFS_FSB_TO_B(mp,
+                                       XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
+                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                                     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
+STATIC uint
+xfs_calc_clear_agi_bucket_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Clearing the quotaflags in the superblock.
+ *     the super block for changing quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_sbchange_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Adjusting quota limits.
+ *    the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
+ */
+STATIC uint
+xfs_calc_qm_setqlim_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
+}
+
+/*
+ * Allocating quota on disk if needed.
+ *     the write transaction log space for quota file extent allocation
+ *     the unit of quota allocation: one system block size
+ */
+STATIC uint
+xfs_calc_qm_dqalloc_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_write_reservation(mp) +
+               xfs_calc_buf_res(1,
+                       XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
+}
+
+/*
+ * Turning off quotas.
+ *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ *    the superblock for the quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_reservation(
+       struct xfs_mount        *mp)
+{
+       return sizeof(struct xfs_qoff_logitem) * 2 +
+               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * End of turning off quotas.
+ *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_end_reservation(
+       struct xfs_mount        *mp)
+{
+       return sizeof(struct xfs_qoff_logitem) * 2;
+}
+
+/*
+ * Syncing the incore super block changes to disk.
+ *     the super block to reflect the changes: sector size
+ */
+STATIC uint
+xfs_calc_sb_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+void
+xfs_trans_resv_calc(
+       struct xfs_mount        *mp,
+       struct xfs_trans_resv   *resp)
+{
+       /*
+        * The following transactions are logged in physical format and
+        * require a permanent reservation on space.
+        */
+       resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
+       resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+       resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
+       resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+       resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
+       resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT;
+       resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_link.tr_logres = xfs_calc_link_reservation(mp);
+       resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT;
+       resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp);
+       resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT;
+       resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp);
+       resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT;
+       resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_create.tr_logres = xfs_calc_create_reservation(mp);
+       resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
+       resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_create_tmpfile.tr_logres =
+                       xfs_calc_create_tmpfile_reservation(mp);
+       resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
+       resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
+       resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
+       resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp);
+       resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT;
+       resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp);
+       resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT;
+       resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp);
+       resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT;
+       resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp);
+       resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+       resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp);
+       resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT;
+       resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp);
+       resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
+       resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
+       resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+       resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       /*
+        * The following transactions are logged in logical format with
+        * a default log count.
+        */
+       resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
+       resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+       resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
+       resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+       resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp);
+       resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+       resp->tr_qm_equotaoff.tr_logres =
+               xfs_calc_qm_quotaoff_end_reservation(mp);
+       resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+       resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
+       resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+       /* The following transactions are logged in logical format */
+       resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
+       resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
+       resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
+       resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
+       resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
+       resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
+       resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
+       resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
+}
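Once populated, these reservations are consumed at transaction setup time. A minimal sketch, assuming this era's xfs_trans_alloc()/xfs_trans_reserve() signatures:

	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
				  XFS_CREATE_SPACE_RES(mp, namelen), 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}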
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
new file mode 100644 (file)
index 0000000..1097d14
--- /dev/null
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef        __XFS_TRANS_RESV_H__
+#define        __XFS_TRANS_RESV_H__
+
+struct xfs_mount;
+
+/*
+ * Structure for maintaining pre-calculated transaction reservations.
+ */
+struct xfs_trans_res {
+       uint    tr_logres;      /* log space unit in bytes per log ticket */
+       int     tr_logcount;    /* number of log operations per log ticket */
+       int     tr_logflags;    /* log flags, currently only used for indicating
+                                * a reservation request is permanent or not */
+};
+
+struct xfs_trans_resv {
+       struct xfs_trans_res    tr_write;       /* extent alloc trans */
+       struct xfs_trans_res    tr_itruncate;   /* truncate trans */
+       struct xfs_trans_res    tr_rename;      /* rename trans */
+       struct xfs_trans_res    tr_link;        /* link trans */
+       struct xfs_trans_res    tr_remove;      /* unlink trans */
+       struct xfs_trans_res    tr_symlink;     /* symlink trans */
+       struct xfs_trans_res    tr_create;      /* create trans */
+       struct xfs_trans_res    tr_create_tmpfile; /* create O_TMPFILE trans */
+       struct xfs_trans_res    tr_mkdir;       /* mkdir trans */
+       struct xfs_trans_res    tr_ifree;       /* inode free trans */
+       struct xfs_trans_res    tr_ichange;     /* inode update trans */
+       struct xfs_trans_res    tr_growdata;    /* fs data section grow trans */
+       struct xfs_trans_res    tr_addafork;    /* add inode attr fork trans */
+       struct xfs_trans_res    tr_writeid;     /* write setuid/setgid file */
+       struct xfs_trans_res    tr_attrinval;   /* attr fork buffer
+                                                * invalidation */
+       struct xfs_trans_res    tr_attrsetm;    /* set/create an attribute at
+                                                * mount time */
+       struct xfs_trans_res    tr_attrsetrt;   /* set/create an attribute at
+                                                * runtime */
+       struct xfs_trans_res    tr_attrrm;      /* remove an attribute */
+       struct xfs_trans_res    tr_clearagi;    /* clear agi unlinked bucket */
+       struct xfs_trans_res    tr_growrtalloc; /* grow realtime allocations */
+       struct xfs_trans_res    tr_growrtzero;  /* grow realtime zeroing */
+       struct xfs_trans_res    tr_growrtfree;  /* grow realtime freeing */
+       struct xfs_trans_res    tr_qm_sbchange; /* change quota flags */
+       struct xfs_trans_res    tr_qm_setqlim;  /* adjust quota limits */
+       struct xfs_trans_res    tr_qm_dqalloc;  /* allocate quota on disk */
+       struct xfs_trans_res    tr_qm_quotaoff; /* turn quota off */
+       struct xfs_trans_res    tr_qm_equotaoff;/* end of turn quota off */
+       struct xfs_trans_res    tr_sb;          /* modify superblock */
+       struct xfs_trans_res    tr_fsyncts;     /* update timestamps on fsync */
+};
+
+/* shorthand way of accessing reservation structure */
+#define M_RES(mp)      (&(mp)->m_resv)
+
+/*
+ * Per-extent log reservation for the allocation btree changes
+ * involved in freeing or allocating an extent.
+ * 2 trees * (2 blocks/level * max depth - 1) * block size
+ */
+#define        XFS_ALLOCFREE_LOG_RES(mp,nx) \
+       ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
+#define        XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
+       ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
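As a worked example: with a maximum AG btree depth of 3 levels, one extent reserves 2 * (2*3 - 1) = 10 blocks, i.e. XFS_ALLOCFREE_LOG_COUNT(mp, 1) evaluates to 10 and the byte reservation is ten filesystem blocks.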
+
+/*
+ * Per-directory log reservation for any directory change.
+ * dir blocks: (1 btree block per level + data block + free block) * dblock size
+ * bmap btree: (levels + 2) * max depth * block size
+ * v2 directory blocks can be fragmented below the dirblksize down to the fsb
+ * size, so account for that in the DAENTER macros.
+ */
+#define        XFS_DIROP_LOG_RES(mp)   \
+       (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
+        (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
+#define        XFS_DIROP_LOG_COUNT(mp) \
+       (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
+        XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
+
+/*
+ * Various log count values.
+ */
+#define        XFS_DEFAULT_LOG_COUNT           1
+#define        XFS_DEFAULT_PERM_LOG_COUNT      2
+#define        XFS_ITRUNCATE_LOG_COUNT         2
+#define XFS_INACTIVE_LOG_COUNT         2
+#define        XFS_CREATE_LOG_COUNT            2
+#define        XFS_CREATE_TMPFILE_LOG_COUNT    2
+#define        XFS_MKDIR_LOG_COUNT             3
+#define        XFS_SYMLINK_LOG_COUNT           3
+#define        XFS_REMOVE_LOG_COUNT            2
+#define        XFS_LINK_LOG_COUNT              2
+#define        XFS_RENAME_LOG_COUNT            2
+#define        XFS_WRITE_LOG_COUNT             2
+#define        XFS_ADDAFORK_LOG_COUNT          2
+#define        XFS_ATTRINVAL_LOG_COUNT         1
+#define        XFS_ATTRSET_LOG_COUNT           3
+#define        XFS_ATTRRM_LOG_COUNT            3
+
+void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
+
+#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
new file mode 100644 (file)
index 0000000..bf9c457
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_TRANS_SPACE_H__
+#define __XFS_TRANS_SPACE_H__
+
+/*
+ * Components of space reservations.
+ */
+#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)    \
+               (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
+#define        XFS_EXTENTADD_SPACE_RES(mp,w)   (XFS_BM_MAXLEVELS(mp,w) - 1)
+#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\
+       (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
+         XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
+         XFS_EXTENTADD_SPACE_RES(mp,w))
+#define        XFS_DAENTER_1B(mp,w)    \
+       ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
+#define        XFS_DAENTER_DBS(mp,w)   \
+       (XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0))
+#define        XFS_DAENTER_BLOCKS(mp,w)        \
+       (XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w))
+#define        XFS_DAENTER_BMAP1B(mp,w)        \
+       XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w)
+#define        XFS_DAENTER_BMAPS(mp,w)         \
+       (XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w))
+#define        XFS_DAENTER_SPACE_RES(mp,w)     \
+       (XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w))
+#define        XFS_DAREMOVE_SPACE_RES(mp,w)    XFS_DAENTER_BMAPS(mp,w)
+#define        XFS_DIRENTER_MAX_SPLIT(mp,nl)   1
+#define        XFS_DIRENTER_SPACE_RES(mp,nl)   \
+       (XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \
+        XFS_DIRENTER_MAX_SPLIT(mp,nl))
+#define        XFS_DIRREMOVE_SPACE_RES(mp)     \
+       XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
+#define        XFS_IALLOC_SPACE_RES(mp)        \
+       ((mp)->m_ialloc_blks + \
+        ((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \
+         ((mp)->m_in_maxlevels - 1)))
+
+/*
+ * Space reservation values for various transactions.
+ */
+#define        XFS_ADDAFORK_SPACE_RES(mp)      \
+       ((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
+#define        XFS_ATTRRM_SPACE_RES(mp)        \
+       XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
+/* This macro is not used - see inline code in xfs_attr_set */
+#define        XFS_ATTRSET_SPACE_RES(mp, v)    \
+       (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v))
+#define        XFS_CREATE_SPACE_RES(mp,nl)     \
+       (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define        XFS_DIOSTRAT_SPACE_RES(mp, v)   \
+       (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
+#define        XFS_GROWFS_SPACE_RES(mp)        \
+       (2 * XFS_AG_MAXLEVELS(mp))
+#define        XFS_GROWFSRT_SPACE_RES(mp,b)    \
+       ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
+#define        XFS_LINK_SPACE_RES(mp,nl)       \
+       XFS_DIRENTER_SPACE_RES(mp,nl)
+#define        XFS_MKDIR_SPACE_RES(mp,nl)      \
+       (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define        XFS_QM_DQALLOC_SPACE_RES(mp)    \
+       (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \
+        XFS_DQUOT_CLUSTER_SIZE_FSB)
+#define        XFS_QM_QINOCREATE_SPACE_RES(mp) \
+       XFS_IALLOC_SPACE_RES(mp)
+#define        XFS_REMOVE_SPACE_RES(mp)        \
+       XFS_DIRREMOVE_SPACE_RES(mp)
+#define        XFS_RENAME_SPACE_RES(mp,nl)     \
+       (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define        XFS_SYMLINK_SPACE_RES(mp,nl,b)  \
+       (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
+#define XFS_IFREE_SPACE_RES(mp)                \
+       (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0)
+
+#endif /* __XFS_TRANS_SPACE_H__ */
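These reservation macros compose bottom-up: a transaction's worst-case block
count is assembled from the per-structure component macros above it. As a
rough sketch of how a caller consumes them -- illustrative only, not part of
this patch; modelled loosely on the create path, with ip/tp/name and the
tr_create resource taken as assumed context, and transaction setup elided:

	struct xfs_mount	*mp = ip->i_mount;
	uint			resblks;
	int			error;

	/* worst case: a new inode chunk plus a full directory split */
	resblks = XFS_CREATE_SPACE_RES(mp, name->len);

	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, resblks, 0);
	if (error == -ENOSPC) {
		/* retry with no block reservation; errors are negative
		 * errnos under the convention this series adopts */
		resblks = 0;
		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, 0, 0);
	}

The zero-block fallback trades the up-front reservation for more work in the
allocator, which is why the worst-case macros err on the large side.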
index 6888ad886ff6205cc0baf0aa36352fa92f9d93ea..a65fa5dde6e9c514b0e5fbfeecd6d14a83e202bc 100644 (file)
@@ -152,7 +152,7 @@ xfs_get_acl(struct inode *inode, int type)
        if (!xfs_acl)
                return ERR_PTR(-ENOMEM);
 
-       error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
+       error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
                                                        &len, ATTR_ROOT);
        if (error) {
                /*
@@ -210,7 +210,7 @@ __xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
                len -= sizeof(struct xfs_acl_entry) *
                         (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
 
-               error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
+               error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
                                len, ATTR_ROOT);
 
                kmem_free(xfs_acl);
@@ -218,7 +218,7 @@ __xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
                /*
                 * A NULL ACL argument means we want to remove the ACL.
                 */
-               error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT);
+               error = xfs_attr_remove(ip, ea_name, ATTR_ROOT);
 
                /*
                 * If the attribute didn't exist to start with that's fine.
@@ -244,7 +244,7 @@ xfs_set_mode(struct inode *inode, umode_t mode)
                iattr.ia_mode = mode;
                iattr.ia_ctime = current_fs_time(inode->i_sb);
 
-               error = -xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
+               error = xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
        }
 
        return error;
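The four xfs_acl.c hunks above are mechanical fallout from the same series:
xfs_attr_get(), xfs_attr_set(), xfs_attr_remove() and xfs_setattr_nonsize()
now return negative errnos directly, so their callers stop negating the
result. Schematically (illustrative sketch, not taken from the patch):

	/* before: the core returned positive errnos, shims flipped the sign */
	error = -xfs_attr_get(ip, ea_name, buf, &len, ATTR_ROOT);

	/* after: negative errnos end to end, no sign flip at the boundary */
	error = xfs_attr_get(ip, ea_name, buf, &len, ATTR_ROOT);
	if (error)
		return error;	/* already -ENOATTR, -ENOMEM, ... */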
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
deleted file mode 100644 (file)
index 6e247a9..0000000
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_AG_H__
-#define        __XFS_AG_H__
-
-/*
- * Allocation group header
- * This is divided into three structures, placed in sequential 512-byte
- * buffers after a copy of the superblock (also in a 512-byte buffer).
- */
-
-struct xfs_buf;
-struct xfs_mount;
-struct xfs_trans;
-
-#define        XFS_AGF_MAGIC   0x58414746      /* 'XAGF' */
-#define        XFS_AGI_MAGIC   0x58414749      /* 'XAGI' */
-#define        XFS_AGFL_MAGIC  0x5841464c      /* 'XAFL' */
-#define        XFS_AGF_VERSION 1
-#define        XFS_AGI_VERSION 1
-
-#define        XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION)
-#define        XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
-
-/*
- * Btree number 0 is bno, 1 is cnt.  This value gives the size of the
- * arrays below.
- */
-#define        XFS_BTNUM_AGF   ((int)XFS_BTNUM_CNTi + 1)
-
-/*
- * The second word of agf_levels in the first a.g. overlaps the EFS
- * superblock's magic number.  Since the magic numbers valid for EFS
- * are > 64k, our value cannot be confused for an EFS superblock's.
- */
-
-typedef struct xfs_agf {
-       /*
-        * Common allocation group header information
-        */
-       __be32          agf_magicnum;   /* magic number == XFS_AGF_MAGIC */
-       __be32          agf_versionnum; /* header version == XFS_AGF_VERSION */
-       __be32          agf_seqno;      /* sequence # starting from 0 */
-       __be32          agf_length;     /* size in blocks of a.g. */
-       /*
-        * Freespace information
-        */
-       __be32          agf_roots[XFS_BTNUM_AGF];       /* root blocks */
-       __be32          agf_spare0;     /* spare field */
-       __be32          agf_levels[XFS_BTNUM_AGF];      /* btree levels */
-       __be32          agf_spare1;     /* spare field */
-
-       __be32          agf_flfirst;    /* first freelist block's index */
-       __be32          agf_fllast;     /* last freelist block's index */
-       __be32          agf_flcount;    /* count of blocks in freelist */
-       __be32          agf_freeblks;   /* total free blocks */
-
-       __be32          agf_longest;    /* longest free space */
-       __be32          agf_btreeblks;  /* # of blocks held in AGF btrees */
-       uuid_t          agf_uuid;       /* uuid of filesystem */
-
-       /*
-        * reserve some contiguous space for future logged fields before we add
-        * the unlogged fields. This makes the range logging via flags and
-        * structure offsets much simpler.
-        */
-       __be64          agf_spare64[16];
-
-       /* unlogged fields, written during buffer writeback. */
-       __be64          agf_lsn;        /* last write sequence */
-       __be32          agf_crc;        /* crc of agf sector */
-       __be32          agf_spare2;
-
-       /* structure must be padded to 64 bit alignment */
-} xfs_agf_t;
-
-#define XFS_AGF_CRC_OFF                offsetof(struct xfs_agf, agf_crc)
-
-#define        XFS_AGF_MAGICNUM        0x00000001
-#define        XFS_AGF_VERSIONNUM      0x00000002
-#define        XFS_AGF_SEQNO           0x00000004
-#define        XFS_AGF_LENGTH          0x00000008
-#define        XFS_AGF_ROOTS           0x00000010
-#define        XFS_AGF_LEVELS          0x00000020
-#define        XFS_AGF_FLFIRST         0x00000040
-#define        XFS_AGF_FLLAST          0x00000080
-#define        XFS_AGF_FLCOUNT         0x00000100
-#define        XFS_AGF_FREEBLKS        0x00000200
-#define        XFS_AGF_LONGEST         0x00000400
-#define        XFS_AGF_BTREEBLKS       0x00000800
-#define        XFS_AGF_UUID            0x00001000
-#define        XFS_AGF_NUM_BITS        13
-#define        XFS_AGF_ALL_BITS        ((1 << XFS_AGF_NUM_BITS) - 1)
-
-#define XFS_AGF_FLAGS \
-       { XFS_AGF_MAGICNUM,     "MAGICNUM" }, \
-       { XFS_AGF_VERSIONNUM,   "VERSIONNUM" }, \
-       { XFS_AGF_SEQNO,        "SEQNO" }, \
-       { XFS_AGF_LENGTH,       "LENGTH" }, \
-       { XFS_AGF_ROOTS,        "ROOTS" }, \
-       { XFS_AGF_LEVELS,       "LEVELS" }, \
-       { XFS_AGF_FLFIRST,      "FLFIRST" }, \
-       { XFS_AGF_FLLAST,       "FLLAST" }, \
-       { XFS_AGF_FLCOUNT,      "FLCOUNT" }, \
-       { XFS_AGF_FREEBLKS,     "FREEBLKS" }, \
-       { XFS_AGF_LONGEST,      "LONGEST" }, \
-       { XFS_AGF_BTREEBLKS,    "BTREEBLKS" }, \
-       { XFS_AGF_UUID,         "UUID" }
-
-/* disk block (xfs_daddr_t) in the AG */
-#define XFS_AGF_DADDR(mp)      ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
-#define        XFS_AGF_BLOCK(mp)       XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
-#define        XFS_BUF_TO_AGF(bp)      ((xfs_agf_t *)((bp)->b_addr))
-
-extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
-                       xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
-
-/*
- * Size of the unlinked inode hash table in the agi.
- */
-#define        XFS_AGI_UNLINKED_BUCKETS        64
-
-typedef struct xfs_agi {
-       /*
-        * Common allocation group header information
-        */
-       __be32          agi_magicnum;   /* magic number == XFS_AGI_MAGIC */
-       __be32          agi_versionnum; /* header version == XFS_AGI_VERSION */
-       __be32          agi_seqno;      /* sequence # starting from 0 */
-       __be32          agi_length;     /* size in blocks of a.g. */
-       /*
-        * Inode information
-        * Inodes are mapped by interpreting the inode number, so no
-        * mapping data is needed here.
-        */
-       __be32          agi_count;      /* count of allocated inodes */
-       __be32          agi_root;       /* root of inode btree */
-       __be32          agi_level;      /* levels in inode btree */
-       __be32          agi_freecount;  /* number of free inodes */
-
-       __be32          agi_newino;     /* new inode just allocated */
-       __be32          agi_dirino;     /* last directory inode chunk */
-       /*
-        * Hash table of inodes which have been unlinked but are
-        * still being referenced.
-        */
-       __be32          agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
-       /*
-        * This marks the end of logging region 1 and start of logging region 2.
-        */
-       uuid_t          agi_uuid;       /* uuid of filesystem */
-       __be32          agi_crc;        /* crc of agi sector */
-       __be32          agi_pad32;
-       __be64          agi_lsn;        /* last write sequence */
-
-       __be32          agi_free_root; /* root of the free inode btree */
-       __be32          agi_free_level;/* levels in free inode btree */
-
-       /* structure must be padded to 64 bit alignment */
-} xfs_agi_t;
-
-#define XFS_AGI_CRC_OFF                offsetof(struct xfs_agi, agi_crc)
-
-#define        XFS_AGI_MAGICNUM        (1 << 0)
-#define        XFS_AGI_VERSIONNUM      (1 << 1)
-#define        XFS_AGI_SEQNO           (1 << 2)
-#define        XFS_AGI_LENGTH          (1 << 3)
-#define        XFS_AGI_COUNT           (1 << 4)
-#define        XFS_AGI_ROOT            (1 << 5)
-#define        XFS_AGI_LEVEL           (1 << 6)
-#define        XFS_AGI_FREECOUNT       (1 << 7)
-#define        XFS_AGI_NEWINO          (1 << 8)
-#define        XFS_AGI_DIRINO          (1 << 9)
-#define        XFS_AGI_UNLINKED        (1 << 10)
-#define        XFS_AGI_NUM_BITS_R1     11      /* end of the 1st agi logging region */
-#define        XFS_AGI_ALL_BITS_R1     ((1 << XFS_AGI_NUM_BITS_R1) - 1)
-#define        XFS_AGI_FREE_ROOT       (1 << 11)
-#define        XFS_AGI_FREE_LEVEL      (1 << 12)
-#define        XFS_AGI_NUM_BITS_R2     13
-
-/* disk block (xfs_daddr_t) in the AG */
-#define XFS_AGI_DADDR(mp)      ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
-#define        XFS_AGI_BLOCK(mp)       XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
-#define        XFS_BUF_TO_AGI(bp)      ((xfs_agi_t *)((bp)->b_addr))
-
-extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
-                               xfs_agnumber_t agno, struct xfs_buf **bpp);
-
-/*
- * The third a.g. block contains the a.g. freelist, an array
- * of block pointers to blocks owned by the allocation btree code.
- */
-#define XFS_AGFL_DADDR(mp)     ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
-#define        XFS_AGFL_BLOCK(mp)      XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
-#define        XFS_BUF_TO_AGFL(bp)     ((xfs_agfl_t *)((bp)->b_addr))
-
-#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
-       (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
-               &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
-               (__be32 *)(bp)->b_addr)
-
-/*
- * Size of the AGFL.  For CRC-enabled filesystems we steal a couple of
- * slots in the beginning of the block for a proper header with the
- * location information and CRC.
- */
-#define XFS_AGFL_SIZE(mp) \
-       (((mp)->m_sb.sb_sectsize - \
-        (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
-               sizeof(struct xfs_agfl) : 0)) / \
-         sizeof(xfs_agblock_t))
-
-typedef struct xfs_agfl {
-       __be32          agfl_magicnum;
-       __be32          agfl_seqno;
-       uuid_t          agfl_uuid;
-       __be64          agfl_lsn;
-       __be32          agfl_crc;
-       __be32          agfl_bno[];     /* actually XFS_AGFL_SIZE(mp) */
-} xfs_agfl_t;
-
-#define XFS_AGFL_CRC_OFF       offsetof(struct xfs_agfl, agfl_crc)
-
-/*
- * tags for inode radix tree
- */
-#define XFS_ICI_NO_TAG         (-1)    /* special flag for an untagged lookup
-                                          in xfs_inode_ag_iterator */
-#define XFS_ICI_RECLAIM_TAG    0       /* inode is to be reclaimed */
-#define XFS_ICI_EOFBLOCKS_TAG  1       /* inode has blocks beyond EOF */
-
-#define        XFS_AG_MAXLEVELS(mp)            ((mp)->m_ag_maxlevels)
-#define        XFS_MIN_FREELIST_RAW(bl,cl,mp)  \
-       (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
-#define        XFS_MIN_FREELIST(a,mp)          \
-       (XFS_MIN_FREELIST_RAW(          \
-               be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
-               be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
-#define        XFS_MIN_FREELIST_PAG(pag,mp)    \
-       (XFS_MIN_FREELIST_RAW(          \
-               (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
-               (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
-
-#define XFS_AGB_TO_FSB(mp,agno,agbno)  \
-       (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
-#define        XFS_FSB_TO_AGNO(mp,fsbno)       \
-       ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
-#define        XFS_FSB_TO_AGBNO(mp,fsbno)      \
-       ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
-#define        XFS_AGB_TO_DADDR(mp,agno,agbno) \
-       ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
-               (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
-#define        XFS_AG_DADDR(mp,agno,d)         (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
-
-/*
- * For checking for bad ranges of xfs_daddr_t's, covering multiple
- * allocation groups or a single xfs_daddr_t that's a superblock copy.
- */
-#define        XFS_AG_CHECK_DADDR(mp,d,len)    \
-       ((len) == 1 ? \
-           ASSERT((d) == XFS_SB_DADDR || \
-                  xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
-           ASSERT(xfs_daddr_to_agno(mp, d) == \
-                  xfs_daddr_to_agno(mp, (d) + (len) - 1)))
-
-#endif /* __XFS_AG_H__ */
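Worth noting before the removed header scrolls past: the XFS_AGB_TO_FSB() /
XFS_FSB_TO_AGNO() / XFS_FSB_TO_AGBNO() trio near its end encodes a filesystem
block number as the AG number shifted above sb_agblklog bits of AG-relative
block number, so both conversions are pure shifts and masks. A quick
round-trip under that encoding (illustrative only, not part of this patch;
mp is an assumed mount context):

	xfs_agnumber_t	agno = 3;
	xfs_agblock_t	agbno = 1000;
	xfs_fsblock_t	fsbno = XFS_AGB_TO_FSB(mp, agno, agbno);

	ASSERT(XFS_FSB_TO_AGNO(mp, fsbno) == agno);	/* high bits */
	ASSERT(XFS_FSB_TO_AGBNO(mp, fsbno) == agbno);	/* low sb_agblklog bits */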
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
deleted file mode 100644 (file)
index d438132..0000000
+++ /dev/null
@@ -1,2630 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_shared.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_extent_busy.h"
-#include "xfs_error.h"
-#include "xfs_cksum.h"
-#include "xfs_trace.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
-#include "xfs_log.h"
-
-struct workqueue_struct *xfs_alloc_wq;
-
-#define XFS_ABSDIFF(a,b)       (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
-
-#define        XFSA_FIXUP_BNO_OK       1
-#define        XFSA_FIXUP_CNT_OK       2
-
-STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
-STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
-STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
-STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
-               xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
-
-/*
- * Lookup the record equal to [bno, len] in the btree given by cur.
- */
-STATIC int                             /* error */
-xfs_alloc_lookup_eq(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_agblock_t           bno,    /* starting block of extent */
-       xfs_extlen_t            len,    /* length of extent */
-       int                     *stat)  /* success/failure */
-{
-       cur->bc_rec.a.ar_startblock = bno;
-       cur->bc_rec.a.ar_blockcount = len;
-       return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
-
-/*
- * Lookup the first record greater than or equal to [bno, len]
- * in the btree given by cur.
- */
-int                            /* error */
-xfs_alloc_lookup_ge(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_agblock_t           bno,    /* starting block of extent */
-       xfs_extlen_t            len,    /* length of extent */
-       int                     *stat)  /* success/failure */
-{
-       cur->bc_rec.a.ar_startblock = bno;
-       cur->bc_rec.a.ar_blockcount = len;
-       return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
-}
-
-/*
- * Lookup the first record less than or equal to [bno, len]
- * in the btree given by cur.
- */
-int                                    /* error */
-xfs_alloc_lookup_le(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_agblock_t           bno,    /* starting block of extent */
-       xfs_extlen_t            len,    /* length of extent */
-       int                     *stat)  /* success/failure */
-{
-       cur->bc_rec.a.ar_startblock = bno;
-       cur->bc_rec.a.ar_blockcount = len;
-       return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
-}
-
-/*
- * Update the record referred to by cur to the value given
- * by [bno, len].
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-STATIC int                             /* error */
-xfs_alloc_update(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_agblock_t           bno,    /* starting block of extent */
-       xfs_extlen_t            len)    /* length of extent */
-{
-       union xfs_btree_rec     rec;
-
-       rec.alloc.ar_startblock = cpu_to_be32(bno);
-       rec.alloc.ar_blockcount = cpu_to_be32(len);
-       return xfs_btree_update(cur, &rec);
-}
-
-/*
- * Get the data from the pointed-to record.
- */
-int                                    /* error */
-xfs_alloc_get_rec(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_agblock_t           *bno,   /* output: starting block of extent */
-       xfs_extlen_t            *len,   /* output: length of extent */
-       int                     *stat)  /* output: success/failure */
-{
-       union xfs_btree_rec     *rec;
-       int                     error;
-
-       error = xfs_btree_get_rec(cur, &rec, stat);
-       if (!error && *stat == 1) {
-               *bno = be32_to_cpu(rec->alloc.ar_startblock);
-               *len = be32_to_cpu(rec->alloc.ar_blockcount);
-       }
-       return error;
-}
-
-/*
- * Compute aligned version of the found extent.
- * Takes alignment and min length into account.
- */
-STATIC void
-xfs_alloc_compute_aligned(
-       xfs_alloc_arg_t *args,          /* allocation argument structure */
-       xfs_agblock_t   foundbno,       /* starting block in found extent */
-       xfs_extlen_t    foundlen,       /* length in found extent */
-       xfs_agblock_t   *resbno,        /* result block number */
-       xfs_extlen_t    *reslen)        /* result length */
-{
-       xfs_agblock_t   bno;
-       xfs_extlen_t    len;
-
-       /* Trim busy sections out of found extent */
-       xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
-
-       if (args->alignment > 1 && len >= args->minlen) {
-               xfs_agblock_t   aligned_bno = roundup(bno, args->alignment);
-               xfs_extlen_t    diff = aligned_bno - bno;
-
-               *resbno = aligned_bno;
-               *reslen = diff >= len ? 0 : len - diff;
-       } else {
-               *resbno = bno;
-               *reslen = len;
-       }
-}
-
-/*
- * Compute best start block and diff for "near" allocations.
- * freelen >= wantlen already checked by caller.
- */
-STATIC xfs_extlen_t                    /* difference value (absolute) */
-xfs_alloc_compute_diff(
-       xfs_agblock_t   wantbno,        /* target starting block */
-       xfs_extlen_t    wantlen,        /* target length */
-       xfs_extlen_t    alignment,      /* target alignment */
-       char            userdata,       /* are we allocating data? */
-       xfs_agblock_t   freebno,        /* freespace's starting block */
-       xfs_extlen_t    freelen,        /* freespace's length */
-       xfs_agblock_t   *newbnop)       /* result: best start block from free */
-{
-       xfs_agblock_t   freeend;        /* end of freespace extent */
-       xfs_agblock_t   newbno1;        /* return block number */
-       xfs_agblock_t   newbno2;        /* other new block number */
-       xfs_extlen_t    newlen1=0;      /* length with newbno1 */
-       xfs_extlen_t    newlen2=0;      /* length with newbno2 */
-       xfs_agblock_t   wantend;        /* end of target extent */
-
-       ASSERT(freelen >= wantlen);
-       freeend = freebno + freelen;
-       wantend = wantbno + wantlen;
-       /*
-        * We want to allocate from the start of a free extent if it is past
-        * the desired block or if we are allocating user data and the free
-        * extent is before desired block. The second case is there to allow
-        * for contiguous allocation from the remaining free space if the file
-        * grows in the short term.
-        */
-       if (freebno >= wantbno || (userdata && freeend < wantend)) {
-               if ((newbno1 = roundup(freebno, alignment)) >= freeend)
-                       newbno1 = NULLAGBLOCK;
-       } else if (freeend >= wantend && alignment > 1) {
-               newbno1 = roundup(wantbno, alignment);
-               newbno2 = newbno1 - alignment;
-               if (newbno1 >= freeend)
-                       newbno1 = NULLAGBLOCK;
-               else
-                       newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
-               if (newbno2 < freebno)
-                       newbno2 = NULLAGBLOCK;
-               else
-                       newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
-               if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
-                       if (newlen1 < newlen2 ||
-                           (newlen1 == newlen2 &&
-                            XFS_ABSDIFF(newbno1, wantbno) >
-                            XFS_ABSDIFF(newbno2, wantbno)))
-                               newbno1 = newbno2;
-               } else if (newbno2 != NULLAGBLOCK)
-                       newbno1 = newbno2;
-       } else if (freeend >= wantend) {
-               newbno1 = wantbno;
-       } else if (alignment > 1) {
-               newbno1 = roundup(freeend - wantlen, alignment);
-               if (newbno1 > freeend - wantlen &&
-                   newbno1 - alignment >= freebno)
-                       newbno1 -= alignment;
-               else if (newbno1 >= freeend)
-                       newbno1 = NULLAGBLOCK;
-       } else
-               newbno1 = freeend - wantlen;
-       *newbnop = newbno1;
-       return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
-}
-
-/*
- * Fix up the length, based on mod and prod.
- * len should be k * prod + mod for some k.
- * If len is too small it is returned unchanged.
- * If len hits maxlen it is left alone.
- */
-STATIC void
-xfs_alloc_fix_len(
-       xfs_alloc_arg_t *args)          /* allocation argument structure */
-{
-       xfs_extlen_t    k;
-       xfs_extlen_t    rlen;
-
-       ASSERT(args->mod < args->prod);
-       rlen = args->len;
-       ASSERT(rlen >= args->minlen);
-       ASSERT(rlen <= args->maxlen);
-       if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
-           (args->mod == 0 && rlen < args->prod))
-               return;
-       k = rlen % args->prod;
-       if (k == args->mod)
-               return;
-       if (k > args->mod)
-               rlen = rlen - (k - args->mod);
-       else
-               rlen = rlen - args->prod + (args->mod - k);
-       if ((int)rlen < (int)args->minlen)
-               return;
-       ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
-       ASSERT(rlen % args->prod == args->mod);
-       args->len = rlen;
-}
-
-/*
- * Fix up length if there is too little space left in the a.g.
- * Return 1 if ok, 0 if too little, should give up.
- */
-STATIC int
-xfs_alloc_fix_minleft(
-       xfs_alloc_arg_t *args)          /* allocation argument structure */
-{
-       xfs_agf_t       *agf;           /* a.g. freelist header */
-       int             diff;           /* free space difference */
-
-       if (args->minleft == 0)
-               return 1;
-       agf = XFS_BUF_TO_AGF(args->agbp);
-       diff = be32_to_cpu(agf->agf_freeblks)
-               - args->len - args->minleft;
-       if (diff >= 0)
-               return 1;
-       args->len += diff;              /* shrink the allocated space */
-       if (args->len >= args->minlen)
-               return 1;
-       args->agbno = NULLAGBLOCK;
-       return 0;
-}
-
-/*
- * Update the two btrees, logically removing from freespace the extent
- * starting at rbno, rlen blocks.  The extent is contained within the
- * actual (current) free extent fbno for flen blocks.
- * Flags are passed in indicating whether the cursors are set to the
- * relevant records.
- */
-STATIC int                             /* error code */
-xfs_alloc_fixup_trees(
-       xfs_btree_cur_t *cnt_cur,       /* cursor for by-size btree */
-       xfs_btree_cur_t *bno_cur,       /* cursor for by-block btree */
-       xfs_agblock_t   fbno,           /* starting block of free extent */
-       xfs_extlen_t    flen,           /* length of free extent */
-       xfs_agblock_t   rbno,           /* starting block of returned extent */
-       xfs_extlen_t    rlen,           /* length of returned extent */
-       int             flags)          /* flags, XFSA_FIXUP_... */
-{
-       int             error;          /* error code */
-       int             i;              /* operation results */
-       xfs_agblock_t   nfbno1;         /* first new free startblock */
-       xfs_agblock_t   nfbno2;         /* second new free startblock */
-       xfs_extlen_t    nflen1=0;       /* first new free length */
-       xfs_extlen_t    nflen2=0;       /* second new free length */
-
-       /*
-        * Look up the record in the by-size tree if necessary.
-        */
-       if (flags & XFSA_FIXUP_CNT_OK) {
-#ifdef DEBUG
-               if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
-                       return error;
-               XFS_WANT_CORRUPTED_RETURN(
-                       i == 1 && nfbno1 == fbno && nflen1 == flen);
-#endif
-       } else {
-               if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
-                       return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
-       }
-       /*
-        * Look up the record in the by-block tree if necessary.
-        */
-       if (flags & XFSA_FIXUP_BNO_OK) {
-#ifdef DEBUG
-               if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
-                       return error;
-               XFS_WANT_CORRUPTED_RETURN(
-                       i == 1 && nfbno1 == fbno && nflen1 == flen);
-#endif
-       } else {
-               if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
-                       return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
-       }
-
-#ifdef DEBUG
-       if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
-               struct xfs_btree_block  *bnoblock;
-               struct xfs_btree_block  *cntblock;
-
-               bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
-               cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
-
-               XFS_WANT_CORRUPTED_RETURN(
-                       bnoblock->bb_numrecs == cntblock->bb_numrecs);
-       }
-#endif
-
-       /*
-        * Deal with all four cases: the allocated record is contained
-        * within the freespace record, so we can have new freespace
-        * at either (or both) end, or no freespace remaining.
-        */
-       if (rbno == fbno && rlen == flen)
-               nfbno1 = nfbno2 = NULLAGBLOCK;
-       else if (rbno == fbno) {
-               nfbno1 = rbno + rlen;
-               nflen1 = flen - rlen;
-               nfbno2 = NULLAGBLOCK;
-       } else if (rbno + rlen == fbno + flen) {
-               nfbno1 = fbno;
-               nflen1 = flen - rlen;
-               nfbno2 = NULLAGBLOCK;
-       } else {
-               nfbno1 = fbno;
-               nflen1 = rbno - fbno;
-               nfbno2 = rbno + rlen;
-               nflen2 = (fbno + flen) - nfbno2;
-       }
-       /*
-        * Delete the entry from the by-size btree.
-        */
-       if ((error = xfs_btree_delete(cnt_cur, &i)))
-               return error;
-       XFS_WANT_CORRUPTED_RETURN(i == 1);
-       /*
-        * Add new by-size btree entry(s).
-        */
-       if (nfbno1 != NULLAGBLOCK) {
-               if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
-                       return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 0);
-               if ((error = xfs_btree_insert(cnt_cur, &i)))
-                       return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
-       }
-       if (nfbno2 != NULLAGBLOCK) {
-               if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
-                       return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 0);
-               if ((error = xfs_btree_insert(cnt_cur, &i)))
-                       return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
-       }
-       /*
-        * Fix up the by-block btree entry(s).
-        */
-       if (nfbno1 == NULLAGBLOCK) {
-               /*
-                * No remaining freespace, just delete the by-block tree entry.
-                */
-               if ((error = xfs_btree_delete(bno_cur, &i)))
-                       return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
-       } else {
-               /*
-                * Update the by-block entry to start later|be shorter.
-                */
-               if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
-                       return error;
-       }
-       if (nfbno2 != NULLAGBLOCK) {
-               /*
-                * 2 resulting free entries, need to add one.
-                */
-               if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
-                       return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 0);
-               if ((error = xfs_btree_insert(bno_cur, &i)))
-                       return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
-       }
-       return 0;
-}
-
-static bool
-xfs_agfl_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
-       int             i;
-
-       if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
-               return false;
-       if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
-               return false;
-       /*
-        * during growfs operations, the perag is not fully initialised,
-        * so we can't use it for any useful checking. growfs ensures we can't
-        * use it by using uncached buffers that don't have the perag attached
-        * so we can detect and avoid this problem.
-        */
-       if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
-               return false;
-
-       for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
-               if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
-                   be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
-                       return false;
-       }
-       return true;
-}
-
-static void
-xfs_agfl_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-
-       /*
-        * There is no verification of non-crc AGFLs because mkfs does not
-        * initialise the AGFL to zero or NULL. Hence the only valid part of the
-        * AGFL is what the AGF says is active. We can't get to the AGF, so we
-        * can't verify just those entries are valid.
-        */
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
-               xfs_buf_ioerror(bp, EFSBADCRC);
-       else if (!xfs_agfl_verify(bp))
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-       if (bp->b_error)
-               xfs_verifier_error(bp);
-}
-
-static void
-xfs_agfl_write_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-
-       /* no verification of non-crc AGFLs */
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (!xfs_agfl_verify(bp)) {
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               xfs_verifier_error(bp);
-               return;
-       }
-
-       if (bip)
-               XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-       xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
-}
-
-const struct xfs_buf_ops xfs_agfl_buf_ops = {
-       .verify_read = xfs_agfl_read_verify,
-       .verify_write = xfs_agfl_write_verify,
-};
-
-/*
- * Read in the allocation group free block array.
- */
-STATIC int                             /* error */
-xfs_alloc_read_agfl(
-       xfs_mount_t     *mp,            /* mount point structure */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_agnumber_t  agno,           /* allocation group number */
-       xfs_buf_t       **bpp)          /* buffer for the ag free block array */
-{
-       xfs_buf_t       *bp;            /* return value */
-       int             error;
-
-       ASSERT(agno != NULLAGNUMBER);
-       error = xfs_trans_read_buf(
-                       mp, tp, mp->m_ddev_targp,
-                       XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
-                       XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
-       if (error)
-               return error;
-       xfs_buf_set_ref(bp, XFS_AGFL_REF);
-       *bpp = bp;
-       return 0;
-}
-
-STATIC int
-xfs_alloc_update_counters(
-       struct xfs_trans        *tp,
-       struct xfs_perag        *pag,
-       struct xfs_buf          *agbp,
-       long                    len)
-{
-       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
-
-       pag->pagf_freeblks += len;
-       be32_add_cpu(&agf->agf_freeblks, len);
-
-       xfs_trans_agblocks_delta(tp, len);
-       if (unlikely(be32_to_cpu(agf->agf_freeblks) >
-                    be32_to_cpu(agf->agf_length)))
-               return EFSCORRUPTED;
-
-       xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
-       return 0;
-}
-
-/*
- * Allocation group level functions.
- */
-
-/*
- * Allocate a variable extent in the allocation group agno.
- * Type and bno are used to determine where in the allocation group the
- * extent will start.
- * Extent's length (returned in *len) will be between minlen and maxlen,
- * and of the form k * prod + mod unless there's nothing that large.
- * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
- */
-STATIC int                     /* error */
-xfs_alloc_ag_vextent(
-       xfs_alloc_arg_t *args)  /* argument structure for allocation */
-{
-       int             error=0;
-
-       ASSERT(args->minlen > 0);
-       ASSERT(args->maxlen > 0);
-       ASSERT(args->minlen <= args->maxlen);
-       ASSERT(args->mod < args->prod);
-       ASSERT(args->alignment > 0);
-       /*
-        * Branch to correct routine based on the type.
-        */
-       args->wasfromfl = 0;
-       switch (args->type) {
-       case XFS_ALLOCTYPE_THIS_AG:
-               error = xfs_alloc_ag_vextent_size(args);
-               break;
-       case XFS_ALLOCTYPE_NEAR_BNO:
-               error = xfs_alloc_ag_vextent_near(args);
-               break;
-       case XFS_ALLOCTYPE_THIS_BNO:
-               error = xfs_alloc_ag_vextent_exact(args);
-               break;
-       default:
-               ASSERT(0);
-               /* NOTREACHED */
-       }
-
-       if (error || args->agbno == NULLAGBLOCK)
-               return error;
-
-       ASSERT(args->len >= args->minlen);
-       ASSERT(args->len <= args->maxlen);
-       ASSERT(!args->wasfromfl || !args->isfl);
-       ASSERT(args->agbno % args->alignment == 0);
-
-       if (!args->wasfromfl) {
-               error = xfs_alloc_update_counters(args->tp, args->pag,
-                                                 args->agbp,
-                                                 -((long)(args->len)));
-               if (error)
-                       return error;
-
-               ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
-                                             args->agbno, args->len));
-       }
-
-       if (!args->isfl) {
-               xfs_trans_mod_sb(args->tp, args->wasdel ?
-                                XFS_TRANS_SB_RES_FDBLOCKS :
-                                XFS_TRANS_SB_FDBLOCKS,
-                                -((long)(args->len)));
-       }
-
-       XFS_STATS_INC(xs_allocx);
-       XFS_STATS_ADD(xs_allocb, args->len);
-       return error;
-}
-
-/*
- * Allocate a variable extent at exactly agno/bno.
- * Extent's length (returned in *len) will be between minlen and maxlen,
- * and of the form k * prod + mod unless there's nothing that large.
- * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
- */
-STATIC int                     /* error */
-xfs_alloc_ag_vextent_exact(
-       xfs_alloc_arg_t *args)  /* allocation argument structure */
-{
-       xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
-       xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
-       int             error;
-       xfs_agblock_t   fbno;   /* start block of found extent */
-       xfs_extlen_t    flen;   /* length of found extent */
-       xfs_agblock_t   tbno;   /* start block of trimmed extent */
-       xfs_extlen_t    tlen;   /* length of trimmed extent */
-       xfs_agblock_t   tend;   /* end block of trimmed extent */
-       int             i;      /* success/failure of operation */
-
-       ASSERT(args->alignment == 1);
-
-       /*
-        * Allocate/initialize a cursor for the by-number freespace btree.
-        */
-       bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-                                         args->agno, XFS_BTNUM_BNO);
-
-       /*
-        * Lookup bno and minlen in the btree (minlen is irrelevant, really).
-        * Look for the closest free block <= bno, it must contain bno
-        * if any free block does.
-        */
-       error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
-       if (error)
-               goto error0;
-       if (!i)
-               goto not_found;
-
-       /*
-        * Grab the freespace record.
-        */
-       error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
-       if (error)
-               goto error0;
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-       ASSERT(fbno <= args->agbno);
-
-       /*
-        * Check for overlapping busy extents.
-        */
-       xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
-
-       /*
-        * Give up if the start of the extent is busy, or the freespace isn't
-        * long enough for the minimum request.
-        */
-       if (tbno > args->agbno)
-               goto not_found;
-       if (tlen < args->minlen)
-               goto not_found;
-       tend = tbno + tlen;
-       if (tend < args->agbno + args->minlen)
-               goto not_found;
-
-       /*
-        * End of extent will be smaller of the freespace end and the
-        * maximal requested end.
-        *
-        * Fix the length according to mod and prod if given.
-        */
-       args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
-                                               - args->agbno;
-       xfs_alloc_fix_len(args);
-       if (!xfs_alloc_fix_minleft(args))
-               goto not_found;
-
-       ASSERT(args->agbno + args->len <= tend);
-
-       /*
-        * We are allocating agbno for args->len
-        * Allocate/initialize a cursor for the by-size btree.
-        */
-       cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_CNT);
-       ASSERT(args->agbno + args->len <=
-               be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
-       error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
-                                     args->len, XFSA_FIXUP_BNO_OK);
-       if (error) {
-               xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
-               goto error0;
-       }
-
-       xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-
-       args->wasfromfl = 0;
-       trace_xfs_alloc_exact_done(args);
-       return 0;
-
-not_found:
-       /* Didn't find it, return null. */
-       xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-       args->agbno = NULLAGBLOCK;
-       trace_xfs_alloc_exact_notfound(args);
-       return 0;
-
-error0:
-       xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
-       trace_xfs_alloc_exact_error(args);
-       return error;
-}
-
-/*
- * Search the btree in a given direction via the search cursor and compare
- * the records found against the good extent we've already found.
- */
-STATIC int
-xfs_alloc_find_best_extent(
-       struct xfs_alloc_arg    *args,  /* allocation argument structure */
-       struct xfs_btree_cur    **gcur, /* good cursor */
-       struct xfs_btree_cur    **scur, /* searching cursor */
-       xfs_agblock_t           gdiff,  /* difference for search comparison */
-       xfs_agblock_t           *sbno,  /* extent found by search */
-       xfs_extlen_t            *slen,  /* extent length */
-       xfs_agblock_t           *sbnoa, /* aligned extent found by search */
-       xfs_extlen_t            *slena, /* aligned extent length */
-       int                     dir)    /* 0 = search right, 1 = search left */
-{
-       xfs_agblock_t           new;
-       xfs_agblock_t           sdiff;
-       int                     error;
-       int                     i;
-
-       /* The good extent is perfect, no need to search. */
-       if (!gdiff)
-               goto out_use_good;
-
-       /*
-        * Look until we find a better one, run out of space or run off the end.
-        */
-       do {
-               error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
-               if (error)
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
-
-               /*
-                * The good extent is closer than this one.
-                */
-               if (!dir) {
-                       if (*sbnoa >= args->agbno + gdiff)
-                               goto out_use_good;
-               } else {
-                       if (*sbnoa <= args->agbno - gdiff)
-                               goto out_use_good;
-               }
-
-               /*
-                * Same distance, compare length and pick the best.
-                */
-               if (*slena >= args->minlen) {
-                       args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
-                       xfs_alloc_fix_len(args);
-
-                       sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                                                      args->alignment,
-                                                      args->userdata, *sbnoa,
-                                                      *slena, &new);
-
-                       /*
-                        * Choose closer size and invalidate other cursor.
-                        */
-                       if (sdiff < gdiff)
-                               goto out_use_search;
-                       goto out_use_good;
-               }
-
-               if (!dir)
-                       error = xfs_btree_increment(*scur, 0, &i);
-               else
-                       error = xfs_btree_decrement(*scur, 0, &i);
-               if (error)
-                       goto error0;
-       } while (i);
-
-out_use_good:
-       xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
-       *scur = NULL;
-       return 0;
-
-out_use_search:
-       xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
-       *gcur = NULL;
-       return 0;
-
-error0:
-       /* caller invalidates cursors */
-       return error;
-}
-
-/*
- * Allocate a variable extent near bno in the allocation group agno.
- * Extent's length (returned in len) will be between minlen and maxlen,
- * and of the form k * prod + mod unless there's nothing that large.
- * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
- */
-STATIC int                             /* error */
-xfs_alloc_ag_vextent_near(
-       xfs_alloc_arg_t *args)          /* allocation argument structure */
-{
-       xfs_btree_cur_t *bno_cur_gt;    /* cursor for bno btree, right side */
-       xfs_btree_cur_t *bno_cur_lt;    /* cursor for bno btree, left side */
-       xfs_btree_cur_t *cnt_cur;       /* cursor for count btree */
-       xfs_agblock_t   gtbno;          /* start bno of right side entry */
-       xfs_agblock_t   gtbnoa;         /* aligned ... */
-       xfs_extlen_t    gtdiff;         /* difference to right side entry */
-       xfs_extlen_t    gtlen;          /* length of right side entry */
-       xfs_extlen_t    gtlena;         /* aligned ... */
-       xfs_agblock_t   gtnew;          /* useful start bno of right side */
-       int             error;          /* error code */
-       int             i;              /* result code, temporary */
-       int             j;              /* result code, temporary */
-       xfs_agblock_t   ltbno;          /* start bno of left side entry */
-       xfs_agblock_t   ltbnoa;         /* aligned ... */
-       xfs_extlen_t    ltdiff;         /* difference to left side entry */
-       xfs_extlen_t    ltlen;          /* length of left side entry */
-       xfs_extlen_t    ltlena;         /* aligned ... */
-       xfs_agblock_t   ltnew;          /* useful start bno of left side */
-       xfs_extlen_t    rlen;           /* length of returned extent */
-       int             forced = 0;
-#ifdef DEBUG
-       /*
-        * Randomly don't execute the first algorithm.
-        */
-       int             dofirst;        /* set to do first algorithm */
-
-       dofirst = prandom_u32() & 1;
-#endif
-
-restart:
-       bno_cur_lt = NULL;
-       bno_cur_gt = NULL;
-       ltlen = 0;
-       gtlena = 0;
-       ltlena = 0;
-
-       /*
-        * Get a cursor for the by-size btree.
-        */
-       cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_CNT);
-
-       /*
-        * See if there are any free extents as big as maxlen.
-        */
-       if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
-               goto error0;
-       /*
-        * If none, then pick up the last entry in the tree unless the
-        * tree is empty.
-        */
-       if (!i) {
-               if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno,
-                               &ltlen, &i)))
-                       goto error0;
-               if (i == 0 || ltlen == 0) {
-                       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-                       trace_xfs_alloc_near_noentry(args);
-                       return 0;
-               }
-               ASSERT(i == 1);
-       }
-       args->wasfromfl = 0;
-
-       /*
-        * First algorithm.
-        * If the requested extent is large wrt the freespaces available
-        * in this a.g., then the cursor will be pointing to a btree entry
-        * near the right edge of the tree.  If it's in the last btree leaf
-        * block, then we just examine all the entries in that block
-        * that are big enough, and pick the best one.
-        * This is written as a while loop so we can break out of it,
-        * but we never loop back to the top.
-        */
-       while (xfs_btree_islastblock(cnt_cur, 0)) {
-               xfs_extlen_t    bdiff;
-               int             besti=0;
-               xfs_extlen_t    blen=0;
-               xfs_agblock_t   bnew=0;
-
-#ifdef DEBUG
-               if (dofirst)
-                       break;
-#endif
-               /*
-                * Start from the entry that lookup found, sequence through
-                * all larger free blocks.  If we're actually pointing at a
-                * record smaller than maxlen, go to the start of this block,
-                * and skip all those smaller than minlen.
-                */
-               if (ltlen || args->alignment > 1) {
-                       cnt_cur->bc_ptrs[0] = 1;
-                       do {
-                               if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
-                                               &ltlen, &i)))
-                                       goto error0;
-                               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                               if (ltlen >= args->minlen)
-                                       break;
-                               if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
-                                       goto error0;
-                       } while (i);
-                       ASSERT(ltlen >= args->minlen);
-                       if (!i)
-                               break;
-               }
-               i = cnt_cur->bc_ptrs[0];
-               for (j = 1, blen = 0, bdiff = 0;
-                    !error && j && (blen < args->maxlen || bdiff > 0);
-                    error = xfs_btree_increment(cnt_cur, 0, &j)) {
-                       /*
-                        * For each entry, decide if it's better than
-                        * the previous best entry.
-                        */
-                       if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
-                               goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                       xfs_alloc_compute_aligned(args, ltbno, ltlen,
-                                                 &ltbnoa, &ltlena);
-                       if (ltlena < args->minlen)
-                               continue;
-                       args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
-                       xfs_alloc_fix_len(args);
-                       ASSERT(args->len >= args->minlen);
-                       if (args->len < blen)
-                               continue;
-                       ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                               args->alignment, args->userdata, ltbnoa,
-                               ltlena, &ltnew);
-                       if (ltnew != NULLAGBLOCK &&
-                           (args->len > blen || ltdiff < bdiff)) {
-                               bdiff = ltdiff;
-                               bnew = ltnew;
-                               blen = args->len;
-                               besti = cnt_cur->bc_ptrs[0];
-                       }
-               }
-               /*
-                * It didn't work.  We COULD be in a case where
-                * there's a good record somewhere, so try again.
-                */
-               if (blen == 0)
-                       break;
-               /*
-                * Point at the best entry, and retrieve it again.
-                */
-               cnt_cur->bc_ptrs[0] = besti;
-               if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
-               args->len = blen;
-               if (!xfs_alloc_fix_minleft(args)) {
-                       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-                       trace_xfs_alloc_near_nominleft(args);
-                       return 0;
-               }
-               blen = args->len;
-               /*
-                * We are allocating starting at bnew for blen blocks.
-                */
-               args->agbno = bnew;
-               ASSERT(bnew >= ltbno);
-               ASSERT(bnew + blen <= ltbno + ltlen);
-               /*
-                * Set up a cursor for the by-bno tree.
-                */
-               bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
-                       args->agbp, args->agno, XFS_BTNUM_BNO);
-               /*
-                * Fix up the btree entries.
-                */
-               if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
-                               ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
-                       goto error0;
-               xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-               xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-
-               trace_xfs_alloc_near_first(args);
-               return 0;
-       }
-       /*
-        * Second algorithm.
-        * Search in the by-bno tree to the left and to the right
-        * simultaneously, until in each case we find a space big enough,
-        * or run into the edge of the tree.  When we run into the edge,
-        * we deallocate that cursor.
-        * If both searches succeed, we compare the two spaces and pick
-        * the better one.
-        * With alignment, it's possible for both to fail; the upper
-        * level algorithm that picks allocation groups for allocations
-        * is not supposed to do this.
-        */
-       /*
-        * Allocate and initialize the cursor for the leftward search.
-        */
-       bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_BNO);
-       /*
-        * Lookup <= bno to find the leftward search's starting point.
-        */
-       if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
-               goto error0;
-       if (!i) {
-               /*
-                * Didn't find anything; use this cursor for the rightward
-                * search.
-                */
-               bno_cur_gt = bno_cur_lt;
-               bno_cur_lt = NULL;
-       }
-       /*
-        * Found something.  Duplicate the cursor for the rightward search.
-        */
-       else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
-               goto error0;
-       /*
-        * Increment the cursor, so we will point at the entry just right
-        * of the leftward entry if any, or to the leftmost entry.
-        */
-       if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
-               goto error0;
-       if (!i) {
-               /*
-                * It failed, there are no rightward entries.
-                */
-               xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
-               bno_cur_gt = NULL;
-       }
-       /*
-        * Loop going left with the leftward cursor, right with the
-        * rightward cursor, until either both directions give up or
-        * we find an entry at least as big as minlen.
-        */
-       do {
-               if (bno_cur_lt) {
-                       if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
-                               goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                       xfs_alloc_compute_aligned(args, ltbno, ltlen,
-                                                 &ltbnoa, &ltlena);
-                       if (ltlena >= args->minlen)
-                               break;
-                       if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
-                               goto error0;
-                       if (!i) {
-                               xfs_btree_del_cursor(bno_cur_lt,
-                                                    XFS_BTREE_NOERROR);
-                               bno_cur_lt = NULL;
-                       }
-               }
-               if (bno_cur_gt) {
-                       if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
-                               goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                       xfs_alloc_compute_aligned(args, gtbno, gtlen,
-                                                 &gtbnoa, &gtlena);
-                       if (gtlena >= args->minlen)
-                               break;
-                       if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
-                               goto error0;
-                       if (!i) {
-                               xfs_btree_del_cursor(bno_cur_gt,
-                                                    XFS_BTREE_NOERROR);
-                               bno_cur_gt = NULL;
-                       }
-               }
-       } while (bno_cur_lt || bno_cur_gt);
-
-       /*
-        * Both cursors are still active, so find the better entry.
-        */
-       if (bno_cur_lt && bno_cur_gt) {
-               if (ltlena >= args->minlen) {
-                       /*
-                        * Left side is good, look for a right side entry.
-                        */
-                       args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
-                       xfs_alloc_fix_len(args);
-                       ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                               args->alignment, args->userdata, ltbnoa,
-                               ltlena, &ltnew);
-
-                       error = xfs_alloc_find_best_extent(args,
-                                               &bno_cur_lt, &bno_cur_gt,
-                                               ltdiff, &gtbno, &gtlen,
-                                               &gtbnoa, &gtlena,
-                                               0 /* search right */);
-               } else {
-                       ASSERT(gtlena >= args->minlen);
-
-                       /*
-                        * Right side is good, look for a left side entry.
-                        */
-                       args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
-                       xfs_alloc_fix_len(args);
-                       gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                               args->alignment, args->userdata, gtbnoa,
-                               gtlena, &gtnew);
-
-                       error = xfs_alloc_find_best_extent(args,
-                                               &bno_cur_gt, &bno_cur_lt,
-                                               gtdiff, &ltbno, &ltlen,
-                                               &ltbnoa, &ltlena,
-                                               1 /* search left */);
-               }
-
-               if (error)
-                       goto error0;
-       }
-
-       /*
-        * If we couldn't get anything, give up.
-        */
-       if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
-               xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-
-               if (!forced++) {
-                       trace_xfs_alloc_near_busy(args);
-                       xfs_log_force(args->mp, XFS_LOG_SYNC);
-                       goto restart;
-               }
-               trace_xfs_alloc_size_neither(args);
-               args->agbno = NULLAGBLOCK;
-               return 0;
-       }
-
-       /*
-        * At this point we have selected a freespace entry, either to the
-        * left or to the right.  If it's on the right, copy all the
-        * useful variables to the "left" set so we only have one
-        * copy of this code.
-        */
-       if (bno_cur_gt) {
-               bno_cur_lt = bno_cur_gt;
-               bno_cur_gt = NULL;
-               ltbno = gtbno;
-               ltbnoa = gtbnoa;
-               ltlen = gtlen;
-               ltlena = gtlena;
-               j = 1;
-       } else
-               j = 0;
-
-       /*
-        * Fix up the length and compute the useful address.
-        */
-       args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
-       xfs_alloc_fix_len(args);
-       if (!xfs_alloc_fix_minleft(args)) {
-               trace_xfs_alloc_near_nominleft(args);
-               xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-               xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-               return 0;
-       }
-       rlen = args->len;
-       (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
-                                    args->userdata, ltbnoa, ltlena, &ltnew);
-       ASSERT(ltnew >= ltbno);
-       ASSERT(ltnew + rlen <= ltbnoa + ltlena);
-       ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
-       args->agbno = ltnew;
-
-       if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
-                       ltnew, rlen, XFSA_FIXUP_BNO_OK)))
-               goto error0;
-
-       if (j)
-               trace_xfs_alloc_near_greater(args);
-       else
-               trace_xfs_alloc_near_lesser(args);
-
-       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-       xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-       return 0;
-
- error0:
-       trace_xfs_alloc_near_error(args);
-       if (cnt_cur != NULL)
-               xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
-       if (bno_cur_lt != NULL)
-               xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
-       if (bno_cur_gt != NULL)
-               xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
-       return error;
-}
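
The "second algorithm" described above is a bidirectional scan: one cursor walks left and one walks right from the requested block number until either side yields an extent of at least minlen blocks, and a cursor is dropped when it runs off its edge of the tree. A minimal stand-alone sketch of that scan over a bno-sorted array (hypothetical types and names, not the btree cursor API):

struct ext { unsigned bno, len; };	/* hypothetical free-extent record */

/*
 * Walk left and right from index "start" (the lookup_le result) in a
 * bno-sorted array until either direction finds an extent of at least
 * "minlen" blocks.  Returns the index found, or -1 if both directions
 * run off the ends, mirroring the loop that deletes both cursors.
 */
static int search_both_ways(const struct ext *recs, int nrecs,
			    int start, unsigned minlen)
{
	int lt = start;		/* leftward cursor; -1 if lookup found nothing */
	int gt = start + 1;	/* rightward cursor */

	while (lt >= 0 || gt < nrecs) {
		if (lt >= 0) {
			if (recs[lt].len >= minlen)
				return lt;
			lt--;
		}
		if (gt < nrecs) {
			if (recs[gt].len >= minlen)
				return gt;
			gt++;
		}
	}
	return -1;
}

If both directions can satisfy minlen, the code above goes further and compares the two candidates with xfs_alloc_compute_diff(), keeping whichever lands closer to the requested block.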
-
-/*
- * Allocate a variable extent anywhere in the allocation group agno.
- * Extent's length (returned in len) will be between minlen and maxlen,
- * and of the form k * prod + mod unless there's nothing that large.
- * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
- */
-STATIC int                             /* error */
-xfs_alloc_ag_vextent_size(
-       xfs_alloc_arg_t *args)          /* allocation argument structure */
-{
-       xfs_btree_cur_t *bno_cur;       /* cursor for bno btree */
-       xfs_btree_cur_t *cnt_cur;       /* cursor for cnt btree */
-       int             error;          /* error result */
-       xfs_agblock_t   fbno;           /* start of found freespace */
-       xfs_extlen_t    flen;           /* length of found freespace */
-       int             i;              /* temp status variable */
-       xfs_agblock_t   rbno;           /* returned block number */
-       xfs_extlen_t    rlen;           /* length of returned extent */
-       int             forced = 0;
-
-restart:
-       /*
-        * Allocate and initialize a cursor for the by-size btree.
-        */
-       cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_CNT);
-       bno_cur = NULL;
-
-       /*
-        * Look for an entry >= maxlen+alignment-1 blocks.
-        */
-       if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
-                       args->maxlen + args->alignment - 1, &i)))
-               goto error0;
-
-       /*
-        * If none or we have busy extents that we cannot allocate from, then
-        * we have to settle for a smaller extent. In the case that there are
-        * no large extents, this will return the last entry in the tree unless
-        * the tree is empty. In the case that there are only busy large
-        * extents, this will return the largest small extent unless there
-        * are no smaller extents available.
-        */
-       if (!i || forced > 1) {
-               error = xfs_alloc_ag_vextent_small(args, cnt_cur,
-                                                  &fbno, &flen, &i);
-               if (error)
-                       goto error0;
-               if (i == 0 || flen == 0) {
-                       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-                       trace_xfs_alloc_size_noentry(args);
-                       return 0;
-               }
-               ASSERT(i == 1);
-               xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
-       } else {
-               /*
-                * Search for a non-busy extent that is large enough.
-                * If we are at low space, don't check; if we fall off
-                * the end of the btree, turn off the busy check and
-                * restart.
-                */
-               for (;;) {
-                       error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
-                       if (error)
-                               goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-                       xfs_alloc_compute_aligned(args, fbno, flen,
-                                                 &rbno, &rlen);
-
-                       if (rlen >= args->maxlen)
-                               break;
-
-                       error = xfs_btree_increment(cnt_cur, 0, &i);
-                       if (error)
-                               goto error0;
-                       if (i == 0) {
-                               /*
-                                * Our only valid extents must have been busy.
-                                * Make it unbusy by forcing the log out and
-                                * retrying. If we've been here before, forcing
-                                * the log isn't making the extents available,
-                                * which means they have probably been freed in
-                                * this transaction.  In that case, we have to
-                                * give up on them and we'll attempt a minlen
-                                * allocation the next time around.
-                                */
-                               xfs_btree_del_cursor(cnt_cur,
-                                                    XFS_BTREE_NOERROR);
-                               trace_xfs_alloc_size_busy(args);
-                               if (!forced++)
-                                       xfs_log_force(args->mp, XFS_LOG_SYNC);
-                               goto restart;
-                       }
-               }
-       }
-
-       /*
-        * In the first case above, we got the last entry in the
-        * by-size btree.  Now we check to see if the space hits maxlen
-        * once aligned; if not, we search left for something better.
-        * This can't happen in the second case above.
-        */
-       rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
-       XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
-                       (rlen <= flen && rbno + rlen <= fbno + flen), error0);
-       if (rlen < args->maxlen) {
-               xfs_agblock_t   bestfbno;
-               xfs_extlen_t    bestflen;
-               xfs_agblock_t   bestrbno;
-               xfs_extlen_t    bestrlen;
-
-               bestrlen = rlen;
-               bestrbno = rbno;
-               bestflen = flen;
-               bestfbno = fbno;
-               for (;;) {
-                       if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
-                               goto error0;
-                       if (i == 0)
-                               break;
-                       if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
-                                       &i)))
-                               goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                       if (flen < bestrlen)
-                               break;
-                       xfs_alloc_compute_aligned(args, fbno, flen,
-                                                 &rbno, &rlen);
-                       rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
-                       XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
-                               (rlen <= flen && rbno + rlen <= fbno + flen),
-                               error0);
-                       if (rlen > bestrlen) {
-                               bestrlen = rlen;
-                               bestrbno = rbno;
-                               bestflen = flen;
-                               bestfbno = fbno;
-                               if (rlen == args->maxlen)
-                                       break;
-                       }
-               }
-               if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
-                               &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               rlen = bestrlen;
-               rbno = bestrbno;
-               flen = bestflen;
-               fbno = bestfbno;
-       }
-       args->wasfromfl = 0;
-       /*
-        * Fix up the length.
-        */
-       args->len = rlen;
-       if (rlen < args->minlen) {
-               if (!forced++) {
-                       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-                       trace_xfs_alloc_size_busy(args);
-                       xfs_log_force(args->mp, XFS_LOG_SYNC);
-                       goto restart;
-               }
-               goto out_nominleft;
-       }
-       xfs_alloc_fix_len(args);
-
-       if (!xfs_alloc_fix_minleft(args))
-               goto out_nominleft;
-       rlen = args->len;
-       XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
-       /*
-        * Allocate and initialize a cursor for the by-block tree.
-        */
-       bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_BNO);
-       if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
-                       rbno, rlen, XFSA_FIXUP_CNT_OK)))
-               goto error0;
-       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-       xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-       cnt_cur = bno_cur = NULL;
-       args->len = rlen;
-       args->agbno = rbno;
-       XFS_WANT_CORRUPTED_GOTO(
-               args->agbno + args->len <=
-                       be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
-               error0);
-       trace_xfs_alloc_size_done(args);
-       return 0;
-
-error0:
-       trace_xfs_alloc_size_error(args);
-       if (cnt_cur)
-               xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
-       if (bno_cur)
-               xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
-       return error;
-
-out_nominleft:
-       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-       trace_xfs_alloc_size_nominleft(args);
-       args->agbno = NULLAGBLOCK;
-       return 0;
-}
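
Everywhere above, a raw candidate (fbno, flen) is only as useful as what survives rounding its start up to the allocation's alignment. A sketch of that arithmetic, assuming xfs_alloc_compute_aligned() reduces to a simple round-up when no extents are busy (simplified model, not the real helper):

/*
 * Round bno up to the alignment and report how many usable blocks
 * remain; a zero result means the extent cannot satisfy any aligned
 * allocation.  The real helper also trims away busy sub-ranges.
 */
static void compute_aligned(unsigned bno, unsigned len, unsigned align,
			    unsigned *abno, unsigned *alen)
{
	unsigned newbno = ((bno + align - 1) / align) * align;
	unsigned diff = newbno - bno;

	*abno = newbno;
	*alen = diff < len ? len - diff : 0;
}

For example, compute_aligned(13, 20, 8, ...) yields an aligned start of 16 with 17 usable blocks, which is why the lookup above asks for maxlen + alignment - 1 blocks in the first place.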
-
-/*
- * Deal with the case where only small freespaces remain.
- * Either return the contents of the last freespace record,
- * or allocate space from the freelist if there is nothing in the tree.
- */
-STATIC int                     /* error */
-xfs_alloc_ag_vextent_small(
-       xfs_alloc_arg_t *args,  /* allocation argument structure */
-       xfs_btree_cur_t *ccur,  /* by-size cursor */
-       xfs_agblock_t   *fbnop, /* result block number */
-       xfs_extlen_t    *flenp, /* result length */
-       int             *stat)  /* status: 0-freelist, 1-normal/none */
-{
-       int             error;
-       xfs_agblock_t   fbno;
-       xfs_extlen_t    flen;
-       int             i;
-
-       if ((error = xfs_btree_decrement(ccur, 0, &i)))
-               goto error0;
-       if (i) {
-               if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-       }
-       /*
-        * Nothing in the btree, try the freelist.  Make sure
-        * to respect minleft even when pulling from the
-        * freelist.
-        */
-       else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
-                (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
-                 > args->minleft)) {
-               error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
-               if (error)
-                       goto error0;
-               if (fbno != NULLAGBLOCK) {
-                       xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
-                                            args->userdata);
-
-                       if (args->userdata) {
-                               xfs_buf_t       *bp;
-
-                               bp = xfs_btree_get_bufs(args->mp, args->tp,
-                                       args->agno, fbno, 0);
-                               xfs_trans_binval(args->tp, bp);
-                       }
-                       args->len = 1;
-                       args->agbno = fbno;
-                       XFS_WANT_CORRUPTED_GOTO(
-                               args->agbno + args->len <=
-                               be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
-                               error0);
-                       args->wasfromfl = 1;
-                       trace_xfs_alloc_small_freelist(args);
-                       *stat = 0;
-                       return 0;
-               }
-               /*
-                * Nothing in the freelist.
-                */
-               else
-                       flen = 0;
-       }
-       /*
-        * Can't allocate from the freelist for some reason.
-        */
-       else {
-               fbno = NULLAGBLOCK;
-               flen = 0;
-       }
-       /*
-        * Can't do the allocation, give up.
-        */
-       if (flen < args->minlen) {
-               args->agbno = NULLAGBLOCK;
-               trace_xfs_alloc_small_notenough(args);
-               flen = 0;
-       }
-       *fbnop = fbno;
-       *flenp = flen;
-       *stat = 1;
-       trace_xfs_alloc_small_done(args);
-       return 0;
-
-error0:
-       trace_xfs_alloc_small_error(args);
-       return error;
-}
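
The function is a last-resort ladder: use whatever record remains in the by-size tree if it is big enough; failing that, hand out a single freelist block, but only for an unaligned single-block request that is not itself a freelist allocation and that leaves minleft blocks behind. The ladder compressed into one illustrative helper (not the real signature):

/*
 * 0 = give up (NULLAGBLOCK), 1 = use the btree record,
 * 2 = take one block off the freelist.
 */
static int small_alloc(int have_rec, unsigned reclen, unsigned minlen,
		       unsigned alignment, int isfl,
		       unsigned flcount, unsigned minleft)
{
	if (have_rec)
		return reclen >= minlen ? 1 : 0;
	if (minlen == 1 && alignment == 1 && !isfl && flcount > minleft)
		return 2;	/* caller gets len == 1, wasfromfl == 1 */
	return 0;
}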
-
-/*
- * Free the extent starting at agno/bno for length.
- */
-STATIC int                     /* error */
-xfs_free_ag_extent(
-       xfs_trans_t     *tp,    /* transaction pointer */
-       xfs_buf_t       *agbp,  /* buffer for a.g. freelist header */
-       xfs_agnumber_t  agno,   /* allocation group number */
-       xfs_agblock_t   bno,    /* starting block number */
-       xfs_extlen_t    len,    /* length of extent */
-       int             isfl)   /* set if these are freelist blocks - no sb acctg */
-{
-       xfs_btree_cur_t *bno_cur;       /* cursor for by-block btree */
-       xfs_btree_cur_t *cnt_cur;       /* cursor for by-size btree */
-       int             error;          /* error return value */
-       xfs_agblock_t   gtbno;          /* start of right neighbor block */
-       xfs_extlen_t    gtlen;          /* length of right neighbor block */
-       int             haveleft;       /* have a left neighbor block */
-       int             haveright;      /* have a right neighbor block */
-       int             i;              /* temp, result code */
-       xfs_agblock_t   ltbno;          /* start of left neighbor block */
-       xfs_extlen_t    ltlen;          /* length of left neighbor block */
-       xfs_mount_t     *mp;            /* mount point struct for filesystem */
-       xfs_agblock_t   nbno;           /* new starting block of freespace */
-       xfs_extlen_t    nlen;           /* new length of freespace */
-       xfs_perag_t     *pag;           /* per allocation group data */
-
-       mp = tp->t_mountp;
-       /*
-        * Allocate and initialize a cursor for the by-block btree.
-        */
-       bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
-       cnt_cur = NULL;
-       /*
-        * Look for a neighboring block on the left (lower block numbers)
-        * that is contiguous with this space.
-        */
-       if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
-               goto error0;
-       if (haveleft) {
-               /*
-                * There is a block to our left.
-                */
-               if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               /*
-                * It's not contiguous, though.
-                */
-               if (ltbno + ltlen < bno)
-                       haveleft = 0;
-               else {
-                       /*
-                        * If this failure happens the request to free this
-                        * space was invalid, it's (partly) already free.
-                        * Very bad.
-                        */
-                       XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0);
-               }
-       }
-       /*
-        * Look for a neighboring block on the right (higher block numbers)
-        * that is contiguous with this space.
-        */
-       if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
-               goto error0;
-       if (haveright) {
-               /*
-                * There is a block to our right.
-                */
-               if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               /*
-                * It's not contiguous, though.
-                */
-               if (bno + len < gtbno)
-                       haveright = 0;
-               else {
-                       /*
-                        * If this failure happens the request to free this
-                        * space was invalid, it's (partly) already free.
-                        * Very bad.
-                        */
-                       XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0);
-               }
-       }
-       /*
-        * Now allocate and initialize a cursor for the by-size tree.
-        */
-       cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
-       /*
-        * Have both left and right contiguous neighbors.
-        * Merge all three into a single free block.
-        */
-       if (haveleft && haveright) {
-               /*
-                * Delete the old by-size entry on the left.
-                */
-               if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_btree_delete(cnt_cur, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               /*
-                * Delete the old by-size entry on the right.
-                */
-               if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_btree_delete(cnt_cur, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               /*
-                * Delete the old by-block entry for the right block.
-                */
-               if ((error = xfs_btree_delete(bno_cur, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               /*
-                * Move the by-block cursor back to the left neighbor.
-                */
-               if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-#ifdef DEBUG
-               /*
-                * Check that this is the right record: delete didn't
-                * mangle the cursor.
-                */
-               {
-                       xfs_agblock_t   xxbno;
-                       xfs_extlen_t    xxlen;
-
-                       if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
-                                       &i)))
-                               goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(
-                               i == 1 && xxbno == ltbno && xxlen == ltlen,
-                               error0);
-               }
-#endif
-               /*
-                * Update remaining by-block entry to the new, joined block.
-                */
-               nbno = ltbno;
-               nlen = len + ltlen + gtlen;
-               if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
-                       goto error0;
-       }
-       /*
-        * Have only a left contiguous neighbor.
-        * Merge it together with the new freespace.
-        */
-       else if (haveleft) {
-               /*
-                * Delete the old by-size entry on the left.
-                */
-               if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_btree_delete(cnt_cur, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               /*
-                * Back up the by-block cursor to the left neighbor, and
-                * update its length.
-                */
-               if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               nbno = ltbno;
-               nlen = len + ltlen;
-               if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
-                       goto error0;
-       }
-       /*
-        * Have only a right contiguous neighbor.
-        * Merge it together with the new freespace.
-        */
-       else if (haveright) {
-               /*
-                * Delete the old by-size entry on the right.
-                */
-               if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_btree_delete(cnt_cur, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               /*
-                * Update the starting block and length of the right
-                * neighbor in the by-block tree.
-                */
-               nbno = bno;
-               nlen = len + gtlen;
-               if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
-                       goto error0;
-       }
-       /*
-        * No contiguous neighbors.
-        * Insert the new freespace into the by-block tree.
-        */
-       else {
-               nbno = bno;
-               nlen = len;
-               if ((error = xfs_btree_insert(bno_cur, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-       }
-       xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-       bno_cur = NULL;
-       /*
-        * In all cases we need to insert the new freespace in the by-size tree.
-        */
-       if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
-               goto error0;
-       XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
-       if ((error = xfs_btree_insert(cnt_cur, &i)))
-               goto error0;
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-       xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-       cnt_cur = NULL;
-
-       /*
-        * Update the freespace totals in the ag and superblock.
-        */
-       pag = xfs_perag_get(mp, agno);
-       error = xfs_alloc_update_counters(tp, pag, agbp, len);
-       xfs_perag_put(pag);
-       if (error)
-               goto error0;
-
-       if (!isfl)
-               xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
-       XFS_STATS_INC(xs_freex);
-       XFS_STATS_ADD(xs_freeb, len);
-
-       trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
-
-       return 0;
-
- error0:
-       trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
-       if (bno_cur)
-               xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
-       if (cnt_cur)
-               xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
-       return error;
-}
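
Stripped of the two btrees, the body above is a four-way case analysis on interval adjacency. A minimal sketch over plain intervals (the real code must also delete and re-insert the by-size records and keep both cursors consistent):

/*
 * Decide how a freed range [bno, bno+len) coalesces with its
 * neighbors.  haveleft/haveright say whether the left record
 * [ltbno, ltbno+ltlen) and the right record of length gtlen are
 * contiguous with the freed range; the result is [*nbno, *nbno+*nlen).
 */
static void coalesce(unsigned bno, unsigned len,
		     int haveleft, unsigned ltbno, unsigned ltlen,
		     int haveright, unsigned gtlen,
		     unsigned *nbno, unsigned *nlen)
{
	if (haveleft && haveright) {	/* bridge both neighbors */
		*nbno = ltbno;
		*nlen = ltlen + len + gtlen;
	} else if (haveleft) {		/* extend the left neighbor */
		*nbno = ltbno;
		*nlen = ltlen + len;
	} else if (haveright) {		/* pull the right neighbor back */
		*nbno = bno;
		*nlen = len + gtlen;
	} else {			/* isolated: insert a new record */
		*nbno = bno;
		*nlen = len;
	}
}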
-
-/*
- * Visible (exported) allocation/free functions.
- * Some of these are used just by xfs_alloc_btree.c and this file.
- */
-
-/*
- * Compute and fill in value of m_ag_maxlevels.
- */
-void
-xfs_alloc_compute_maxlevels(
-       xfs_mount_t     *mp)    /* file system mount structure */
-{
-       int             level;
-       uint            maxblocks;
-       uint            maxleafents;
-       int             minleafrecs;
-       int             minnoderecs;
-
-       maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
-       minleafrecs = mp->m_alloc_mnr[0];
-       minnoderecs = mp->m_alloc_mnr[1];
-       maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
-       for (level = 1; maxblocks > 1; level++)
-               maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
-       mp->m_ag_maxlevels = level;
-}
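
The loop is repeated ceiling division by the minimum node fan-out, seeded with the worst case of every other block free. Worked as a stand-alone program (the record counts below are made-up example values, not figures from a real superblock):

#include <stdio.h>

int main(void)
{
	unsigned agblocks = 1048576;			/* blocks per AG */
	unsigned maxleafents = (agblocks + 1) / 2;	/* alternating free/used */
	unsigned minleafrecs = 468, minnoderecs = 334;	/* example fan-outs */
	unsigned maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
	int level;

	for (level = 1; maxblocks > 1; level++)
		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
	printf("max btree levels: %d\n", level);	/* 524288 -> 1121 -> 4 -> 1: prints 3 */
	return 0;
}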
-
-/*
- * Find the length of the longest extent in an AG.
- */
-xfs_extlen_t
-xfs_alloc_longest_free_extent(
-       struct xfs_mount        *mp,
-       struct xfs_perag        *pag)
-{
-       xfs_extlen_t            need, delta = 0;
-
-       need = XFS_MIN_FREELIST_PAG(pag, mp);
-       if (need > pag->pagf_flcount)
-               delta = need - pag->pagf_flcount;
-
-       if (pag->pagf_longest > delta)
-               return pag->pagf_longest - delta;
-       return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
-}
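
In words: any blocks the freelist still needs (need - flcount) will be carved from the longest extent, so only the remainder is truly available, and the final line reports 1 rather than 0 whenever some free space exists at all. The same computation with example numbers:

/* longest_available(10, 4, 100) == 94: six blocks are spoken for. */
static unsigned longest_available(unsigned need, unsigned flcount,
				  unsigned longest)
{
	unsigned delta = need > flcount ? need - flcount : 0;

	if (longest > delta)
		return longest - delta;
	return flcount > 0 || longest > 0;	/* at most one block usable */
}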
-
-/*
- * Decide whether to use this allocation group for this allocation.
- * If so, fix up the btree freelist's size.
- */
-STATIC int                     /* error */
-xfs_alloc_fix_freelist(
-       xfs_alloc_arg_t *args,  /* allocation argument structure */
-       int             flags)  /* XFS_ALLOC_FLAG_... */
-{
-       xfs_buf_t       *agbp;  /* agf buffer pointer */
-       xfs_agf_t       *agf;   /* a.g. freespace structure pointer */
-       xfs_buf_t       *agflbp;/* agfl buffer pointer */
-       xfs_agblock_t   bno;    /* freelist block */
-       xfs_extlen_t    delta;  /* new blocks needed in freelist */
-       int             error;  /* error result code */
-       xfs_extlen_t    longest;/* longest extent in allocation group */
-       xfs_mount_t     *mp;    /* file system mount point structure */
-       xfs_extlen_t    need;   /* total blocks needed in freelist */
-       xfs_perag_t     *pag;   /* per-ag information structure */
-       xfs_alloc_arg_t targs;  /* local allocation arguments */
-       xfs_trans_t     *tp;    /* transaction pointer */
-
-       mp = args->mp;
-
-       pag = args->pag;
-       tp = args->tp;
-       if (!pag->pagf_init) {
-               if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
-                               &agbp)))
-                       return error;
-               if (!pag->pagf_init) {
-                       ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
-                       ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
-                       args->agbp = NULL;
-                       return 0;
-               }
-       } else
-               agbp = NULL;
-
-       /*
-        * If this is a metadata-preferred pag and we are allocating
-        * user data, try somewhere else, unless we are being asked
-        * to try harder at this point.
-        */
-       if (pag->pagf_metadata && args->userdata &&
-           (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
-               ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
-               args->agbp = NULL;
-               return 0;
-       }
-
-       if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
-               /*
-                * If it looks like there isn't a long enough extent, or enough
-                * total blocks, reject it.
-                */
-               need = XFS_MIN_FREELIST_PAG(pag, mp);
-               longest = xfs_alloc_longest_free_extent(mp, pag);
-               if ((args->minlen + args->alignment + args->minalignslop - 1) >
-                               longest ||
-                   ((int)(pag->pagf_freeblks + pag->pagf_flcount -
-                          need - args->total) < (int)args->minleft)) {
-                       if (agbp)
-                               xfs_trans_brelse(tp, agbp);
-                       args->agbp = NULL;
-                       return 0;
-               }
-       }
-
-       /*
-        * Get the a.g. freespace buffer.
-        * Can fail if we're not blocking on locks, and it's held.
-        */
-       if (agbp == NULL) {
-               if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
-                               &agbp)))
-                       return error;
-               if (agbp == NULL) {
-                       ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
-                       ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
-                       args->agbp = NULL;
-                       return 0;
-               }
-       }
-       /*
-        * Figure out how many blocks we should have in the freelist.
-        */
-       agf = XFS_BUF_TO_AGF(agbp);
-       need = XFS_MIN_FREELIST(agf, mp);
-       /*
-        * If there isn't a long enough extent, or enough total
-        * blocks, reject it.
-        */
-       if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
-               delta = need > be32_to_cpu(agf->agf_flcount) ?
-                       (need - be32_to_cpu(agf->agf_flcount)) : 0;
-               longest = be32_to_cpu(agf->agf_longest);
-               longest = (longest > delta) ? (longest - delta) :
-                       (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
-               if ((args->minlen + args->alignment + args->minalignslop - 1) >
-                               longest ||
-                   ((int)(be32_to_cpu(agf->agf_freeblks) +
-                    be32_to_cpu(agf->agf_flcount) - need - args->total) <
-                               (int)args->minleft)) {
-                       xfs_trans_brelse(tp, agbp);
-                       args->agbp = NULL;
-                       return 0;
-               }
-       }
-       /*
-        * Make the freelist shorter if it's too long.
-        */
-       while (be32_to_cpu(agf->agf_flcount) > need) {
-               xfs_buf_t       *bp;
-
-               error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
-               if (error)
-                       return error;
-               if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
-                       return error;
-               bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
-               xfs_trans_binval(tp, bp);
-       }
-       /*
-        * Initialize the args structure.
-        */
-       memset(&targs, 0, sizeof(targs));
-       targs.tp = tp;
-       targs.mp = mp;
-       targs.agbp = agbp;
-       targs.agno = args->agno;
-       targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
-       targs.type = XFS_ALLOCTYPE_THIS_AG;
-       targs.pag = pag;
-       if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
-               return error;
-       /*
-        * Make the freelist longer if it's too short.
-        */
-       while (be32_to_cpu(agf->agf_flcount) < need) {
-               targs.agbno = 0;
-               targs.maxlen = need - be32_to_cpu(agf->agf_flcount);
-               /*
-                * Allocate as many blocks as possible at once.
-                */
-               if ((error = xfs_alloc_ag_vextent(&targs))) {
-                       xfs_trans_brelse(tp, agflbp);
-                       return error;
-               }
-               /*
-                * Stop if we run out.  Won't happen if callers are obeying
-                * the restrictions correctly.  Can happen for free calls
-                * on a completely full ag.
-                */
-               if (targs.agbno == NULLAGBLOCK) {
-                       if (flags & XFS_ALLOC_FLAG_FREEING)
-                               break;
-                       xfs_trans_brelse(tp, agflbp);
-                       args->agbp = NULL;
-                       return 0;
-               }
-               /*
-                * Put each allocated block on the list.
-                */
-               for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
-                       error = xfs_alloc_put_freelist(tp, agbp,
-                                                       agflbp, bno, 0);
-                       if (error)
-                               return error;
-               }
-       }
-       xfs_trans_brelse(tp, agflbp);
-       args->agbp = agbp;
-       return 0;
-}
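
The rejection test appears twice above, once against the cached per-AG counters and once against the on-disk agf. Extracted with plain integers (all values are examples; "need" is the target freelist size):

/*
 * An AG is rejected if no single extent can hold an aligned minlen
 * allocation, or if using it would leave fewer than minleft blocks.
 */
static int ag_usable(unsigned minlen, unsigned alignment,
		     unsigned minalignslop, unsigned longest,
		     int freeblks, int flcount, int need,
		     int total, int minleft)
{
	if (minlen + alignment + minalignslop - 1 > longest)
		return 0;	/* no extent is big enough */
	if (freeblks + flcount - need - total < minleft)
		return 0;	/* would drain the AG too far */
	return 1;
}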
-
-/*
- * Get a block from the freelist.
- * Returns the block address in *bnop; NULLAGBLOCK if the freelist is empty.
- */
-int                            /* error */
-xfs_alloc_get_freelist(
-       xfs_trans_t     *tp,    /* transaction pointer */
-       xfs_buf_t       *agbp,  /* buffer containing the agf structure */
-       xfs_agblock_t   *bnop,  /* block address retrieved from freelist */
-       int             btreeblk) /* destination is an AGF btree */
-{
-       xfs_agf_t       *agf;   /* a.g. freespace structure */
-       xfs_buf_t       *agflbp;/* buffer for a.g. freelist structure */
-       xfs_agblock_t   bno;    /* block number returned */
-       __be32          *agfl_bno;
-       int             error;
-       int             logflags;
-       xfs_mount_t     *mp = tp->t_mountp;
-       xfs_perag_t     *pag;   /* per allocation group data */
-
-       /*
-        * Freelist is empty, give up.
-        */
-       agf = XFS_BUF_TO_AGF(agbp);
-       if (!agf->agf_flcount) {
-               *bnop = NULLAGBLOCK;
-               return 0;
-       }
-       /*
-        * Read the array of free blocks.
-        */
-       error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
-                                   &agflbp);
-       if (error)
-               return error;
-
-       /*
-        * Get the block number and update the data structures.
-        */
-       agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
-       bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
-       be32_add_cpu(&agf->agf_flfirst, 1);
-       xfs_trans_brelse(tp, agflbp);
-       if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
-               agf->agf_flfirst = 0;
-
-       pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
-       be32_add_cpu(&agf->agf_flcount, -1);
-       xfs_trans_agflist_delta(tp, -1);
-       pag->pagf_flcount--;
-
-       logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
-       if (btreeblk) {
-               be32_add_cpu(&agf->agf_btreeblks, 1);
-               pag->pagf_btreeblks++;
-               logflags |= XFS_AGF_BTREEBLKS;
-       }
-       xfs_perag_put(pag);
-
-       xfs_alloc_log_agf(tp, agbp, logflags);
-       *bnop = bno;
-
-       return 0;
-}
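
The AGFL is a fixed-size on-disk array used as a ring buffer: agf_flfirst chases agf_fllast and both wrap. The pop above in miniature (AGFL_SIZE here is an example constant; the real bound is XFS_AGFL_SIZE(mp), computed per mount):

#define AGFL_SIZE 118	/* example capacity */

/* Take the oldest block off the front of the ring, wrapping flfirst. */
static unsigned agfl_pop(const unsigned list[AGFL_SIZE],
			 unsigned *flfirst, unsigned *flcount)
{
	unsigned bno = list[*flfirst];

	if (++*flfirst == AGFL_SIZE)
		*flfirst = 0;
	(*flcount)--;
	return bno;
}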
-
-/*
- * Log the given fields from the agf structure.
- */
-void
-xfs_alloc_log_agf(
-       xfs_trans_t     *tp,    /* transaction pointer */
-       xfs_buf_t       *bp,    /* buffer for a.g. freelist header */
-       int             fields) /* mask of fields to be logged (XFS_AGF_...) */
-{
-       int     first;          /* first byte offset */
-       int     last;           /* last byte offset */
-       static const short      offsets[] = {
-               offsetof(xfs_agf_t, agf_magicnum),
-               offsetof(xfs_agf_t, agf_versionnum),
-               offsetof(xfs_agf_t, agf_seqno),
-               offsetof(xfs_agf_t, agf_length),
-               offsetof(xfs_agf_t, agf_roots[0]),
-               offsetof(xfs_agf_t, agf_levels[0]),
-               offsetof(xfs_agf_t, agf_flfirst),
-               offsetof(xfs_agf_t, agf_fllast),
-               offsetof(xfs_agf_t, agf_flcount),
-               offsetof(xfs_agf_t, agf_freeblks),
-               offsetof(xfs_agf_t, agf_longest),
-               offsetof(xfs_agf_t, agf_btreeblks),
-               offsetof(xfs_agf_t, agf_uuid),
-               sizeof(xfs_agf_t)
-       };
-
-       trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
-
-       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
-
-       xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
-       xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
-}
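
The offsets[] table pairs each loggable field with its byte offset and ends with sizeof() as a sentinel, so a bitmask of dirty fields maps to one contiguous byte range. A sketch of how such a table is consumed, in the spirit of xfs_btree_offsets() (demo struct and helper, not the real API):

#include <stddef.h>

struct agf_demo {			/* stand-in, not the real xfs_agf_t */
	unsigned magic, version, seqno, length;
};

static const size_t demo_offsets[] = {
	offsetof(struct agf_demo, magic),
	offsetof(struct agf_demo, version),
	offsetof(struct agf_demo, seqno),
	offsetof(struct agf_demo, length),
	sizeof(struct agf_demo)		/* sentinel: end of struct */
};

/*
 * Map a non-zero bitmask of dirty fields to the byte range
 * [*first, *last]: bit i covers demo_offsets[i] .. demo_offsets[i+1] - 1,
 * so the range runs from the lowest set bit to the highest.
 */
static void fields_to_range(unsigned fields, size_t *first, size_t *last)
{
	int i, lo = -1, hi = -1;

	for (i = 0; i < 4; i++) {
		if (!(fields & (1u << i)))
			continue;
		if (lo < 0)
			lo = i;
		hi = i;
	}
	*first = demo_offsets[lo];
	*last = demo_offsets[hi + 1] - 1;
}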
-
-/*
- * Interface for inode allocation to force the pag data to be initialized.
- */
-int                                    /* error */
-xfs_alloc_pagf_init(
-       xfs_mount_t             *mp,    /* file system mount structure */
-       xfs_trans_t             *tp,    /* transaction pointer */
-       xfs_agnumber_t          agno,   /* allocation group number */
-       int                     flags)  /* XFS_ALLOC_FLAGS_... */
-{
-       xfs_buf_t               *bp;
-       int                     error;
-
-       if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp)))
-               return error;
-       if (bp)
-               xfs_trans_brelse(tp, bp);
-       return 0;
-}
-
-/*
- * Put the block on the freelist for the allocation group.
- */
-int                                    /* error */
-xfs_alloc_put_freelist(
-       xfs_trans_t             *tp,    /* transaction pointer */
-       xfs_buf_t               *agbp,  /* buffer for a.g. freelist header */
-       xfs_buf_t               *agflbp,/* buffer for a.g. free block array */
-       xfs_agblock_t           bno,    /* block being freed */
-       int                     btreeblk) /* block came from a AGF btree */
-{
-       xfs_agf_t               *agf;   /* a.g. freespace structure */
-       __be32                  *blockp;/* pointer to array entry */
-       int                     error;
-       int                     logflags;
-       xfs_mount_t             *mp;    /* mount structure */
-       xfs_perag_t             *pag;   /* per allocation group data */
-       __be32                  *agfl_bno;
-       int                     startoff;
-
-       agf = XFS_BUF_TO_AGF(agbp);
-       mp = tp->t_mountp;
-
-       if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
-                       be32_to_cpu(agf->agf_seqno), &agflbp)))
-               return error;
-       be32_add_cpu(&agf->agf_fllast, 1);
-       if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
-               agf->agf_fllast = 0;
-
-       pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
-       be32_add_cpu(&agf->agf_flcount, 1);
-       xfs_trans_agflist_delta(tp, 1);
-       pag->pagf_flcount++;
-
-       logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
-       if (btreeblk) {
-               be32_add_cpu(&agf->agf_btreeblks, -1);
-               pag->pagf_btreeblks--;
-               logflags |= XFS_AGF_BTREEBLKS;
-       }
-       xfs_perag_put(pag);
-
-       xfs_alloc_log_agf(tp, agbp, logflags);
-
-       ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
-
-       agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
-       blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
-       *blockp = cpu_to_be32(bno);
-       startoff = (char *)blockp - (char *)agflbp->b_addr;
-
-       xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
-       xfs_trans_log_buf(tp, agflbp, startoff,
-                         startoff + sizeof(xfs_agblock_t) - 1);
-       return 0;
-}
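
The counterpart push: advance agf_fllast with wrap-around, then store the freed block number in the slot it now names, which is why only that single array entry needs logging afterwards. Continuing the toy ring from the pop sketch (same example AGFL_SIZE):

/* Append a freed block at the back of the ring, wrapping fllast. */
static void agfl_push(unsigned list[AGFL_SIZE], unsigned *fllast,
		      unsigned *flcount, unsigned bno)
{
	if (++*fllast == AGFL_SIZE)
		*fllast = 0;
	(*flcount)++;
	list[*fllast] = bno;
}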
-
-static bool
-xfs_agf_verify(
-       struct xfs_mount *mp,
-       struct xfs_buf  *bp)
-{
-       struct xfs_agf  *agf = XFS_BUF_TO_AGF(bp);
-
-       if (xfs_sb_version_hascrc(&mp->m_sb) &&
-           !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
-               return false;
-
-       if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
-             XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
-             be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
-             be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
-             be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
-             be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
-               return false;
-
-       /*
-        * during growfs operations, the perag is not fully initialised,
-        * so we can't use it for any useful checking. growfs ensures we can't
-        * use it by using uncached buffers that don't have the perag attached
-        * so we can detect and avoid this problem.
-        */
-       if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
-               return false;
-
-       if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
-           be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
-               return false;
-
-       return true;
-}
-
-static void
-xfs_agf_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb) &&
-           !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
-               xfs_buf_ioerror(bp, EFSBADCRC);
-       else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
-                               XFS_ERRTAG_ALLOC_READ_AGF,
-                               XFS_RANDOM_ALLOC_READ_AGF))
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-       if (bp->b_error)
-               xfs_verifier_error(bp);
-}
-
-static void
-xfs_agf_write_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-
-       if (!xfs_agf_verify(mp, bp)) {
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               xfs_verifier_error(bp);
-               return;
-       }
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (bip)
-               XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-       xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
-}
-
-const struct xfs_buf_ops xfs_agf_buf_ops = {
-       .verify_read = xfs_agf_read_verify,
-       .verify_write = xfs_agf_write_verify,
-};
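
The ops bundle fixes the ordering the two verifiers rely on: reads check the checksum first and the structure second, while writes check the structure, stamp the LSN, and regenerate the checksum last. A toy pair over a struct with a trailing checksum field (the XOR stands in for the CRC32c the real code uses; the error codes are analogues):

#include <stdint.h>

struct blk { uint32_t magic, payload, cksum; };

static uint32_t cksum_of(const struct blk *b)
{
	return b->magic ^ b->payload;	/* stand-in checksum */
}

static int read_verify(const struct blk *b)
{
	if (cksum_of(b) != b->cksum)
		return -1;		/* EFSBADCRC analogue */
	if (b->magic != 0x58414746)	/* "XAGF" magic */
		return -2;		/* EFSCORRUPTED analogue */
	return 0;
}

static void write_verify(struct blk *b)
{
	/* structure checks first, then regenerate before I/O */
	b->cksum = cksum_of(b);
}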
-
-/*
- * Read in the allocation group header (free/alloc section).
- */
-int                                    /* error */
-xfs_read_agf(
-       struct xfs_mount        *mp,    /* mount point structure */
-       struct xfs_trans        *tp,    /* transaction pointer */
-       xfs_agnumber_t          agno,   /* allocation group number */
-       int                     flags,  /* XFS_BUF_ */
-       struct xfs_buf          **bpp)  /* buffer for the ag freelist header */
-{
-       int             error;
-
-       trace_xfs_read_agf(mp, agno);
-
-       ASSERT(agno != NULLAGNUMBER);
-       error = xfs_trans_read_buf(
-                       mp, tp, mp->m_ddev_targp,
-                       XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-                       XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
-       if (error)
-               return error;
-       if (!*bpp)
-               return 0;
-
-       ASSERT(!(*bpp)->b_error);
-       xfs_buf_set_ref(*bpp, XFS_AGF_REF);
-       return 0;
-}
-
-/*
- * Read in the allocation group header (free/alloc section).
- */
-int                                    /* error */
-xfs_alloc_read_agf(
-       struct xfs_mount        *mp,    /* mount point structure */
-       struct xfs_trans        *tp,    /* transaction pointer */
-       xfs_agnumber_t          agno,   /* allocation group number */
-       int                     flags,  /* XFS_ALLOC_FLAG_... */
-       struct xfs_buf          **bpp)  /* buffer for the ag freelist header */
-{
-       struct xfs_agf          *agf;           /* ag freelist header */
-       struct xfs_perag        *pag;           /* per allocation group data */
-       int                     error;
-
-       trace_xfs_alloc_read_agf(mp, agno);
-
-       ASSERT(agno != NULLAGNUMBER);
-       error = xfs_read_agf(mp, tp, agno,
-                       (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
-                       bpp);
-       if (error)
-               return error;
-       if (!*bpp)
-               return 0;
-       ASSERT(!(*bpp)->b_error);
-
-       agf = XFS_BUF_TO_AGF(*bpp);
-       pag = xfs_perag_get(mp, agno);
-       if (!pag->pagf_init) {
-               pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
-               pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
-               pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
-               pag->pagf_longest = be32_to_cpu(agf->agf_longest);
-               pag->pagf_levels[XFS_BTNUM_BNOi] =
-                       be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
-               pag->pagf_levels[XFS_BTNUM_CNTi] =
-                       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
-               spin_lock_init(&pag->pagb_lock);
-               pag->pagb_count = 0;
-               pag->pagb_tree = RB_ROOT;
-               pag->pagf_init = 1;
-       }
-#ifdef DEBUG
-       else if (!XFS_FORCED_SHUTDOWN(mp)) {
-               ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
-               ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
-               ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
-               ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
-               ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
-                      be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
-               ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
-                      be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
-       }
-#endif
-       xfs_perag_put(pag);
-       return 0;
-}
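
The pagf_init dance is a fill-once cache: on-disk counters are mirrored into the per-AG structure the first time the AGF is read, and later reads merely assert (under DEBUG) that the mirror still matches. The pattern in isolation (trimmed-down hypothetical struct):

struct pag_demo {
	int init;
	unsigned freeblks, longest;
};

/* Mirror on-disk values into the cached per-AG structure exactly once. */
static void pag_fill(struct pag_demo *pag, unsigned freeblks,
		     unsigned longest)
{
	if (pag->init)
		return;
	pag->freeblks = freeblks;
	pag->longest = longest;
	pag->init = 1;
}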
-
-/*
- * Allocate an extent (variable-size).
- * Depending on the allocation type, we either look in a single allocation
- * group or loop over the allocation groups to find the result.
- */
-int                            /* error */
-xfs_alloc_vextent(
-       xfs_alloc_arg_t *args)  /* allocation argument structure */
-{
-       xfs_agblock_t   agsize; /* allocation group size */
-       int             error;
-       int             flags;  /* XFS_ALLOC_FLAG_... locking flags */
-       xfs_extlen_t    minleft;/* minimum left value, temp copy */
-       xfs_mount_t     *mp;    /* mount structure pointer */
-       xfs_agnumber_t  sagno;  /* starting allocation group number */
-       xfs_alloctype_t type;   /* input allocation type */
-       int             bump_rotor = 0;
-       int             no_min = 0;
-       xfs_agnumber_t  rotorstep = xfs_rotorstep; /* inode32 agf stepper */
-
-       mp = args->mp;
-       type = args->otype = args->type;
-       args->agbno = NULLAGBLOCK;
-       /*
-        * Just fix this up, for the case where the last a.g. is shorter
-        * (or there's only one a.g.) and the caller couldn't easily figure
-        * that out (xfs_bmap_alloc).
-        */
-       agsize = mp->m_sb.sb_agblocks;
-       if (args->maxlen > agsize)
-               args->maxlen = agsize;
-       if (args->alignment == 0)
-               args->alignment = 1;
-       ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
-       ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
-       ASSERT(args->minlen <= args->maxlen);
-       ASSERT(args->minlen <= agsize);
-       ASSERT(args->mod < args->prod);
-       if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
-           XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
-           args->minlen > args->maxlen || args->minlen > agsize ||
-           args->mod >= args->prod) {
-               args->fsbno = NULLFSBLOCK;
-               trace_xfs_alloc_vextent_badargs(args);
-               return 0;
-       }
-       minleft = args->minleft;
-
-       switch (type) {
-       case XFS_ALLOCTYPE_THIS_AG:
-       case XFS_ALLOCTYPE_NEAR_BNO:
-       case XFS_ALLOCTYPE_THIS_BNO:
-               /*
-                * These three force us into a single a.g.
-                */
-               args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-               args->pag = xfs_perag_get(mp, args->agno);
-               args->minleft = 0;
-               error = xfs_alloc_fix_freelist(args, 0);
-               args->minleft = minleft;
-               if (error) {
-                       trace_xfs_alloc_vextent_nofix(args);
-                       goto error0;
-               }
-               if (!args->agbp) {
-                       trace_xfs_alloc_vextent_noagbp(args);
-                       break;
-               }
-               args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
-               if ((error = xfs_alloc_ag_vextent(args)))
-                       goto error0;
-               break;
-       case XFS_ALLOCTYPE_START_BNO:
-               /*
-                * Try near allocation first, then anywhere-in-ag after
-                * the first a.g. fails.
-                */
-               if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) &&
-                   (mp->m_flags & XFS_MOUNT_32BITINODES)) {
-                       args->fsbno = XFS_AGB_TO_FSB(mp,
-                                       ((mp->m_agfrotor / rotorstep) %
-                                       mp->m_sb.sb_agcount), 0);
-                       bump_rotor = 1;
-               }
-               args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
-               args->type = XFS_ALLOCTYPE_NEAR_BNO;
-               /* FALLTHROUGH */
-       case XFS_ALLOCTYPE_ANY_AG:
-       case XFS_ALLOCTYPE_START_AG:
-       case XFS_ALLOCTYPE_FIRST_AG:
-               /*
-                * Rotate through the allocation groups looking for a winner.
-                */
-               if (type == XFS_ALLOCTYPE_ANY_AG) {
-                       /*
-                        * Start with the last place we left off.
-                        */
-                       args->agno = sagno = (mp->m_agfrotor / rotorstep) %
-                                       mp->m_sb.sb_agcount;
-                       args->type = XFS_ALLOCTYPE_THIS_AG;
-                       flags = XFS_ALLOC_FLAG_TRYLOCK;
-               } else if (type == XFS_ALLOCTYPE_FIRST_AG) {
-                       /*
-                        * Start with allocation group given by bno.
-                        */
-                       args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-                       args->type = XFS_ALLOCTYPE_THIS_AG;
-                       sagno = 0;
-                       flags = 0;
-               } else {
-                       if (type == XFS_ALLOCTYPE_START_AG)
-                               args->type = XFS_ALLOCTYPE_THIS_AG;
-                       /*
-                        * Start with the given allocation group.
-                        */
-                       args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-                       flags = XFS_ALLOC_FLAG_TRYLOCK;
-               }
-               /*
-                * Loop over allocation groups twice; first time with
-                * trylock set, second time without.
-                */
-               for (;;) {
-                       args->pag = xfs_perag_get(mp, args->agno);
-                       if (no_min)
-                               args->minleft = 0;
-                       error = xfs_alloc_fix_freelist(args, flags);
-                       args->minleft = minleft;
-                       if (error) {
-                               trace_xfs_alloc_vextent_nofix(args);
-                               goto error0;
-                       }
-                       /*
-                        * If we get a buffer back then the allocation will fly.
-                        */
-                       if (args->agbp) {
-                               if ((error = xfs_alloc_ag_vextent(args)))
-                                       goto error0;
-                               break;
-                       }
-
-                       trace_xfs_alloc_vextent_loopfailed(args);
-
-                       /*
-                        * Didn't work, figure out the next iteration.
-                        */
-                       if (args->agno == sagno &&
-                           type == XFS_ALLOCTYPE_START_BNO)
-                               args->type = XFS_ALLOCTYPE_THIS_AG;
-                       /*
-                        * For the first allocation, we can try any AG to get
-                        * space.  However, if we already have allocated a
-                        * block, we don't want to try AGs whose number is below
-                        * sagno. Otherwise, we may end up with out-of-order
-                        * locking of AGF, which might cause deadlock.
-                        */
-                       if (++(args->agno) == mp->m_sb.sb_agcount) {
-                               if (args->firstblock != NULLFSBLOCK)
-                                       args->agno = sagno;
-                               else
-                                       args->agno = 0;
-                       }
-                       /*
-                        * Reached the starting a.g., must either be done
-                        * or switch to non-trylock mode.
-                        */
-                       if (args->agno == sagno) {
-                               if (no_min == 1) {
-                                       args->agbno = NULLAGBLOCK;
-                                       trace_xfs_alloc_vextent_allfailed(args);
-                                       break;
-                               }
-                               if (flags == 0) {
-                                       no_min = 1;
-                               } else {
-                                       flags = 0;
-                                       if (type == XFS_ALLOCTYPE_START_BNO) {
-                                               args->agbno = XFS_FSB_TO_AGBNO(mp,
-                                                       args->fsbno);
-                                               args->type = XFS_ALLOCTYPE_NEAR_BNO;
-                                       }
-                               }
-                       }
-                       xfs_perag_put(args->pag);
-               }
-               if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
-                       if (args->agno == sagno)
-                               mp->m_agfrotor = (mp->m_agfrotor + 1) %
-                                       (mp->m_sb.sb_agcount * rotorstep);
-                       else
-                               mp->m_agfrotor = (args->agno * rotorstep + 1) %
-                                       (mp->m_sb.sb_agcount * rotorstep);
-               }
-               break;
-       default:
-               ASSERT(0);
-               /* NOTREACHED */
-       }
-       if (args->agbno == NULLAGBLOCK)
-               args->fsbno = NULLFSBLOCK;
-       else {
-               args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
-#ifdef DEBUG
-               ASSERT(args->len >= args->minlen);
-               ASSERT(args->len <= args->maxlen);
-               ASSERT(args->agbno % args->alignment == 0);
-               XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
-                       args->len);
-#endif
-       }
-       xfs_perag_put(args->pag);
-       return 0;
-error0:
-       xfs_perag_put(args->pag);
-       return error;
-}
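
The loop just removed is a generic two-pass pattern: sweep every AG starting
at the rotor with trylock semantics, and only if that full sweep fails, sweep
again with blocking locks. A minimal standalone sketch of the same control
flow (AG_COUNT, try_alloc() and the simulated contention are illustrative
stand-ins, not XFS symbols):

	#include <stdbool.h>
	#include <stdio.h>

	#define AG_COUNT 8

	/*
	 * Stand-in for xfs_alloc_ag_vextent(): pretend AG 5 has space but
	 * its AGF lock is contended, so the trylock pass skips it.
	 */
	static bool try_alloc(int ag, bool trylock)
	{
		return ag == 5 && !trylock;
	}

	static int alloc_rotor(int start)
	{
		bool trylock = true;	/* first pass: don't block on AGF locks */

		for (;;) {
			int ag = start;

			do {
				if (try_alloc(ag, trylock))
					return ag;
				ag = (ag + 1) % AG_COUNT;
			} while (ag != start);

			if (!trylock)
				return -1;	/* both passes failed: ENOSPC */
			trylock = false;	/* second pass: block on the locks */
		}
	}

	int main(void)
	{
		printf("allocated from AG %d\n", alloc_rotor(2));
		return 0;
	}

The point of the trylock pass is that a contended AGF simply advances the
rotor instead of stalling the allocation; only a completely failed sweep pays
the cost of blocking.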
-
-/*
- * Free an extent.
- * Just break up the extent address and hand off to xfs_free_ag_extent
- * after fixing up the freelist.
- */
-int                            /* error */
-xfs_free_extent(
-       xfs_trans_t     *tp,    /* transaction pointer */
-       xfs_fsblock_t   bno,    /* starting block number of extent */
-       xfs_extlen_t    len)    /* length of extent */
-{
-       xfs_alloc_arg_t args;
-       int             error;
-
-       ASSERT(len != 0);
-       memset(&args, 0, sizeof(xfs_alloc_arg_t));
-       args.tp = tp;
-       args.mp = tp->t_mountp;
-
-       /*
-        * validate that the block number is legal - this enables us to detect
-        * and handle silent filesystem corruption rather than crashing.
-        */
-       args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
-       if (args.agno >= args.mp->m_sb.sb_agcount)
-               return EFSCORRUPTED;
-
-       args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
-       if (args.agbno >= args.mp->m_sb.sb_agblocks)
-               return EFSCORRUPTED;
-
-       args.pag = xfs_perag_get(args.mp, args.agno);
-       ASSERT(args.pag);
-
-       error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
-       if (error)
-               goto error0;
-
-       /* validate the extent size is legal now we have the agf locked */
-       if (args.agbno + len >
-                       be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
-               error = EFSCORRUPTED;
-               goto error0;
-       }
-
-       error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
-       if (!error)
-               xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
-error0:
-       xfs_perag_put(args.pag);
-       return error;
-}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
deleted file mode 100644 (file)
index feacb06..0000000
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_ALLOC_H__
-#define        __XFS_ALLOC_H__
-
-struct xfs_buf;
-struct xfs_btree_cur;
-struct xfs_mount;
-struct xfs_perag;
-struct xfs_trans;
-
-extern struct workqueue_struct *xfs_alloc_wq;
-
-/*
- * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
- */
-#define XFS_ALLOCTYPE_ANY_AG   0x01    /* allocate anywhere, use rotor */
-#define XFS_ALLOCTYPE_FIRST_AG 0x02    /* ... start at ag 0 */
-#define XFS_ALLOCTYPE_START_AG 0x04    /* anywhere, start in this a.g. */
-#define XFS_ALLOCTYPE_THIS_AG  0x08    /* anywhere in this a.g. */
-#define XFS_ALLOCTYPE_START_BNO        0x10    /* near this block else anywhere */
-#define XFS_ALLOCTYPE_NEAR_BNO 0x20    /* in this a.g. and near this block */
-#define XFS_ALLOCTYPE_THIS_BNO 0x40    /* at exactly this block */
-
-/* this should become an enum again when the tracing code is fixed */
-typedef unsigned int xfs_alloctype_t;
-
-#define XFS_ALLOC_TYPES \
-       { XFS_ALLOCTYPE_ANY_AG,         "ANY_AG" }, \
-       { XFS_ALLOCTYPE_FIRST_AG,       "FIRST_AG" }, \
-       { XFS_ALLOCTYPE_START_AG,       "START_AG" }, \
-       { XFS_ALLOCTYPE_THIS_AG,        "THIS_AG" }, \
-       { XFS_ALLOCTYPE_START_BNO,      "START_BNO" }, \
-       { XFS_ALLOCTYPE_NEAR_BNO,       "NEAR_BNO" }, \
-       { XFS_ALLOCTYPE_THIS_BNO,       "THIS_BNO" }
-
-/*
- * Flags for xfs_alloc_fix_freelist.
- */
-#define        XFS_ALLOC_FLAG_TRYLOCK  0x00000001  /* use trylock for buffer locking */
-#define        XFS_ALLOC_FLAG_FREEING  0x00000002  /* indicate caller is freeing extents*/
-
-/*
- * In order to avoid ENOSPC-related deadlock caused by
- * out-of-order locking of AGF buffer (PV 947395), we place
- * constraints on the relationship among actual allocations for
- * data blocks, freelist blocks, and potential file data bmap
- * btree blocks. However, these restrictions may result in no
- * actual space allocated for a delayed extent, for example, a data
- * block in a certain AG is allocated but there is no additional
- * block for the additional bmap btree block due to a split of the
- * bmap btree of the file. This can lead to an infinite loop in
- * xfssyncd when the file gets flushed to disk and all delayed
- * extents need to be actually allocated. To get around this, we
- * explicitly set aside a few blocks which will not be reserved in
- * delayed allocation. Since the minimum number of needed freelist
- * blocks is 4 fsbs _per AG_ and a potential split of a file's bmap
- * btree requires 1 fsb, we set the number of set-aside blocks to
- * 4 + 4*agcount.
- */
-#define XFS_ALLOC_SET_ASIDE(mp)  (4 + ((mp)->m_sb.sb_agcount * 4))
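
As a worked example of the macro above: on a filesystem with sb_agcount = 16,
XFS_ALLOC_SET_ASIDE() evaluates to 4 + 16 * 4 = 68 blocks that delayed
allocation must leave untouched.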
-
-/*
- * When deciding how much space to allocate out of an AG, we limit the
- * maximum allocation size to the size of the AG. However, we cannot use all the
- * blocks in the AG - some are permanently used by metadata. These
- * blocks are generally:
- *     - the AG superblock, AGF, AGI and AGFL
- *     - the AGF (bno and cnt) and AGI btree root blocks
- *     - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
- *
- * The AG headers are sector sized, so the amount of space they take up is
- * dependent on filesystem geometry. The others are all single blocks.
- */
-#define XFS_ALLOC_AG_MAX_USABLE(mp)    \
-       ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
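
Decoding the expression: the XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) term
converts the four sector-sized AG headers (superblock, AGF, AGI, AGFL) into
whole filesystem blocks, and the literal 7 covers the three btree root blocks
(by-bno, by-cnt and AGI) plus the four AGFL blocks listed in the comment.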
-
-
-/*
- * Argument structure for xfs_alloc routines.
- * This is turned into a structure to avoid having 20 arguments passed
- * down several levels of the stack.
- */
-typedef struct xfs_alloc_arg {
-       struct xfs_trans *tp;           /* transaction pointer */
-       struct xfs_mount *mp;           /* file system mount point */
-       struct xfs_buf  *agbp;          /* buffer for a.g. freelist header */
-       struct xfs_perag *pag;          /* per-ag struct for this agno */
-       xfs_fsblock_t   fsbno;          /* file system block number */
-       xfs_agnumber_t  agno;           /* allocation group number */
-       xfs_agblock_t   agbno;          /* allocation group-relative block # */
-       xfs_extlen_t    minlen;         /* minimum size of extent */
-       xfs_extlen_t    maxlen;         /* maximum size of extent */
-       xfs_extlen_t    mod;            /* mod value for extent size */
-       xfs_extlen_t    prod;           /* prod value for extent size */
-       xfs_extlen_t    minleft;        /* min blocks must be left after us */
-       xfs_extlen_t    total;          /* total blocks needed in xaction */
-       xfs_extlen_t    alignment;      /* align answer to multiple of this */
-       xfs_extlen_t    minalignslop;   /* slop for minlen+alignment calcs */
-       xfs_extlen_t    len;            /* output: actual size of extent */
-       xfs_alloctype_t type;           /* allocation type XFS_ALLOCTYPE_... */
-       xfs_alloctype_t otype;          /* original allocation type */
-       char            wasdel;         /* set if allocation was prev delayed */
-       char            wasfromfl;      /* set if allocation is from freelist */
-       char            isfl;           /* set if is freelist blocks - !acctg */
-       char            userdata;       /* set if this is user data */
-       xfs_fsblock_t   firstblock;     /* io first block allocated */
-} xfs_alloc_arg_t;
-
-/*
- * Defines for userdata
- */
-#define XFS_ALLOC_USERDATA             1       /* allocation is for user data*/
-#define XFS_ALLOC_INITIAL_USER_DATA    2       /* special case start of file */
-
-/*
- * Find the length of the longest extent in an AG.
- */
-xfs_extlen_t
-xfs_alloc_longest_free_extent(struct xfs_mount *mp,
-               struct xfs_perag *pag);
-
-/*
- * Compute and fill in value of m_ag_maxlevels.
- */
-void
-xfs_alloc_compute_maxlevels(
-       struct xfs_mount        *mp);   /* file system mount structure */
-
-/*
- * Get a block from the freelist.
- * Returns with the buffer for the block gotten.
- */
-int                            /* error */
-xfs_alloc_get_freelist(
-       struct xfs_trans *tp,   /* transaction pointer */
-       struct xfs_buf  *agbp,  /* buffer containing the agf structure */
-       xfs_agblock_t   *bnop,  /* block address retrieved from freelist */
-       int             btreeblk); /* destination is an AGF btree */
-
-/*
- * Log the given fields from the agf structure.
- */
-void
-xfs_alloc_log_agf(
-       struct xfs_trans *tp,   /* transaction pointer */
-       struct xfs_buf  *bp,    /* buffer for a.g. freelist header */
-       int             fields);/* mask of fields to be logged (XFS_AGF_...) */
-
-/*
- * Interface for inode allocation to force the pag data to be initialized.
- */
-int                            /* error */
-xfs_alloc_pagf_init(
-       struct xfs_mount *mp,   /* file system mount structure */
-       struct xfs_trans *tp,   /* transaction pointer */
-       xfs_agnumber_t  agno,   /* allocation group number */
-       int             flags); /* XFS_ALLOC_FLAGS_... */
-
-/*
- * Put the block on the freelist for the allocation group.
- */
-int                            /* error */
-xfs_alloc_put_freelist(
-       struct xfs_trans *tp,   /* transaction pointer */
-       struct xfs_buf  *agbp,  /* buffer for a.g. freelist header */
-       struct xfs_buf  *agflbp,/* buffer for a.g. free block array */
-       xfs_agblock_t   bno,    /* block being freed */
-       int             btreeblk); /* owner was an AGF btree */
-
-/*
- * Read in the allocation group header (free/alloc section).
- */
-int                                    /* error  */
-xfs_alloc_read_agf(
-       struct xfs_mount *mp,           /* mount point structure */
-       struct xfs_trans *tp,           /* transaction pointer */
-       xfs_agnumber_t  agno,           /* allocation group number */
-       int             flags,          /* XFS_ALLOC_FLAG_... */
-       struct xfs_buf  **bpp);         /* buffer for the ag freelist header */
-
-/*
- * Allocate an extent (variable-size).
- */
-int                            /* error */
-xfs_alloc_vextent(
-       xfs_alloc_arg_t *args); /* allocation argument structure */
-
-/*
- * Free an extent.
- */
-int                            /* error */
-xfs_free_extent(
-       struct xfs_trans *tp,   /* transaction pointer */
-       xfs_fsblock_t   bno,    /* starting block number of extent */
-       xfs_extlen_t    len);   /* length of extent */
-
-int                                    /* error */
-xfs_alloc_lookup_le(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_agblock_t           bno,    /* starting block of extent */
-       xfs_extlen_t            len,    /* length of extent */
-       int                     *stat); /* success/failure */
-
-int                            /* error */
-xfs_alloc_lookup_ge(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_agblock_t           bno,    /* starting block of extent */
-       xfs_extlen_t            len,    /* length of extent */
-       int                     *stat); /* success/failure */
-
-int                                    /* error */
-xfs_alloc_get_rec(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_agblock_t           *bno,   /* output: starting block of extent */
-       xfs_extlen_t            *len,   /* output: length of extent */
-       int                     *stat); /* output: success/failure */
-
-#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
deleted file mode 100644 (file)
index 8358f1d..0000000
+++ /dev/null
@@ -1,504 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_extent_busy.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_trans.h"
-
-
-STATIC struct xfs_btree_cur *
-xfs_allocbt_dup_cursor(
-       struct xfs_btree_cur    *cur)
-{
-       return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
-                       cur->bc_private.a.agbp, cur->bc_private.a.agno,
-                       cur->bc_btnum);
-}
-
-STATIC void
-xfs_allocbt_set_root(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *ptr,
-       int                     inc)
-{
-       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
-       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
-       xfs_agnumber_t          seqno = be32_to_cpu(agf->agf_seqno);
-       int                     btnum = cur->bc_btnum;
-       struct xfs_perag        *pag = xfs_perag_get(cur->bc_mp, seqno);
-
-       ASSERT(ptr->s != 0);
-
-       agf->agf_roots[btnum] = ptr->s;
-       be32_add_cpu(&agf->agf_levels[btnum], inc);
-       pag->pagf_levels[btnum] += inc;
-       xfs_perag_put(pag);
-
-       xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
-}
-
-STATIC int
-xfs_allocbt_alloc_block(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *start,
-       union xfs_btree_ptr     *new,
-       int                     *stat)
-{
-       int                     error;
-       xfs_agblock_t           bno;
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-
-       /* Allocate the new block from the freelist. If we can't, give up.  */
-       error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
-                                      &bno, 1);
-       if (error) {
-               XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-               return error;
-       }
-
-       if (bno == NULLAGBLOCK) {
-               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-               *stat = 0;
-               return 0;
-       }
-
-       xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
-
-       xfs_trans_agbtree_delta(cur->bc_tp, 1);
-       new->s = cpu_to_be32(bno);
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 1;
-       return 0;
-}
-
-STATIC int
-xfs_allocbt_free_block(
-       struct xfs_btree_cur    *cur,
-       struct xfs_buf          *bp)
-{
-       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
-       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
-       xfs_agblock_t           bno;
-       int                     error;
-
-       bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
-       error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
-       if (error)
-               return error;
-
-       xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
-                             XFS_EXTENT_BUSY_SKIP_DISCARD);
-       xfs_trans_agbtree_delta(cur->bc_tp, -1);
-
-       xfs_trans_binval(cur->bc_tp, bp);
-       return 0;
-}
-
-/*
- * Update the longest extent in the AGF
- */
-STATIC void
-xfs_allocbt_update_lastrec(
-       struct xfs_btree_cur    *cur,
-       struct xfs_btree_block  *block,
-       union xfs_btree_rec     *rec,
-       int                     ptr,
-       int                     reason)
-{
-       struct xfs_agf          *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-       xfs_agnumber_t          seqno = be32_to_cpu(agf->agf_seqno);
-       struct xfs_perag        *pag;
-       __be32                  len;
-       int                     numrecs;
-
-       ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
-
-       switch (reason) {
-       case LASTREC_UPDATE:
-               /*
-                * If this is the last leaf block and it's the last record,
-                * then update the size of the longest extent in the AG.
-                */
-               if (ptr != xfs_btree_get_numrecs(block))
-                       return;
-               len = rec->alloc.ar_blockcount;
-               break;
-       case LASTREC_INSREC:
-               if (be32_to_cpu(rec->alloc.ar_blockcount) <=
-                   be32_to_cpu(agf->agf_longest))
-                       return;
-               len = rec->alloc.ar_blockcount;
-               break;
-       case LASTREC_DELREC:
-               numrecs = xfs_btree_get_numrecs(block);
-               if (ptr <= numrecs)
-                       return;
-               ASSERT(ptr == numrecs + 1);
-
-               if (numrecs) {
-                       xfs_alloc_rec_t *rrp;
-
-                       rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
-                       len = rrp->ar_blockcount;
-               } else {
-                       len = 0;
-               }
-
-               break;
-       default:
-               ASSERT(0);
-               return;
-       }
-
-       agf->agf_longest = len;
-       pag = xfs_perag_get(cur->bc_mp, seqno);
-       pag->pagf_longest = be32_to_cpu(len);
-       xfs_perag_put(pag);
-       xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
-}
-
-STATIC int
-xfs_allocbt_get_minrecs(
-       struct xfs_btree_cur    *cur,
-       int                     level)
-{
-       return cur->bc_mp->m_alloc_mnr[level != 0];
-}
-
-STATIC int
-xfs_allocbt_get_maxrecs(
-       struct xfs_btree_cur    *cur,
-       int                     level)
-{
-       return cur->bc_mp->m_alloc_mxr[level != 0];
-}
-
-STATIC void
-xfs_allocbt_init_key_from_rec(
-       union xfs_btree_key     *key,
-       union xfs_btree_rec     *rec)
-{
-       ASSERT(rec->alloc.ar_startblock != 0);
-
-       key->alloc.ar_startblock = rec->alloc.ar_startblock;
-       key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
-}
-
-STATIC void
-xfs_allocbt_init_rec_from_key(
-       union xfs_btree_key     *key,
-       union xfs_btree_rec     *rec)
-{
-       ASSERT(key->alloc.ar_startblock != 0);
-
-       rec->alloc.ar_startblock = key->alloc.ar_startblock;
-       rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
-}
-
-STATIC void
-xfs_allocbt_init_rec_from_cur(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_rec     *rec)
-{
-       ASSERT(cur->bc_rec.a.ar_startblock != 0);
-
-       rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
-       rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
-}
-
-STATIC void
-xfs_allocbt_init_ptr_from_cur(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *ptr)
-{
-       struct xfs_agf          *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-
-       ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
-       ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
-
-       ptr->s = agf->agf_roots[cur->bc_btnum];
-}
-
-STATIC __int64_t
-xfs_allocbt_key_diff(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_key     *key)
-{
-       xfs_alloc_rec_incore_t  *rec = &cur->bc_rec.a;
-       xfs_alloc_key_t         *kp = &key->alloc;
-       __int64_t               diff;
-
-       if (cur->bc_btnum == XFS_BTNUM_BNO) {
-               return (__int64_t)be32_to_cpu(kp->ar_startblock) -
-                               rec->ar_startblock;
-       }
-
-       diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
-       if (diff)
-               return diff;
-
-       return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
-}
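
xfs_allocbt_key_diff() above is a three-way comparator: the by-bno tree
orders on startblock alone, while the by-size (cnt) tree orders on blockcount
first with startblock as the tiebreaker. A standalone sketch of the cnt
ordering, with a plain struct standing in for the on-disk key (names are
illustrative):

	#include <stdint.h>
	#include <stdio.h>

	struct key {
		uint32_t startblock;
		uint32_t blockcount;
	};

	/* by-size ordering: extent length first, start block as tiebreaker */
	static int64_t cnt_key_diff(const struct key *k, const struct key *r)
	{
		int64_t diff = (int64_t)k->blockcount - (int64_t)r->blockcount;

		return diff ? diff : (int64_t)k->startblock - (int64_t)r->startblock;
	}

	int main(void)
	{
		struct key a = { .startblock = 10, .blockcount = 4 };
		struct key b = { .startblock = 50, .blockcount = 4 };

		/* equal lengths, so the lower start block sorts first */
		printf("%lld\n", (long long)cnt_key_diff(&a, &b));	/* -40 */
		return 0;
	}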
-
-static bool
-xfs_allocbt_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
-       struct xfs_perag        *pag = bp->b_pag;
-       unsigned int            level;
-
-       /*
-        * magic number and level verification
-        *
-        * During growfs operations, we can't verify the exact level or owner as
-        * the perag is not fully initialised and hence not attached to the
-        * buffer.  In this case, check against the maximum tree depth.
-        *
-        * Similarly, during log recovery we will have a perag structure
-        * attached, but the agf information will not yet have been initialised
-        * from the on disk AGF. Again, we can only check against maximum limits
-        * in this case.
-        */
-       level = be16_to_cpu(block->bb_level);
-       switch (block->bb_magic) {
-       case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
-               if (!xfs_sb_version_hascrc(&mp->m_sb))
-                       return false;
-               if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
-                       return false;
-               if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
-                       return false;
-               if (pag &&
-                   be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
-                       return false;
-               /* fall through */
-       case cpu_to_be32(XFS_ABTB_MAGIC):
-               if (pag && pag->pagf_init) {
-                       if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
-                               return false;
-               } else if (level >= mp->m_ag_maxlevels)
-                       return false;
-               break;
-       case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
-               if (!xfs_sb_version_hascrc(&mp->m_sb))
-                       return false;
-               if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
-                       return false;
-               if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
-                       return false;
-               if (pag &&
-                   be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
-                       return false;
-               /* fall through */
-       case cpu_to_be32(XFS_ABTC_MAGIC):
-               if (pag && pag->pagf_init) {
-                       if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
-                               return false;
-               } else if (level >= mp->m_ag_maxlevels)
-                       return false;
-               break;
-       default:
-               return false;
-       }
-
-       /* numrecs verification */
-       if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
-               return false;
-
-       /* sibling pointer verification */
-       if (!block->bb_u.s.bb_leftsib ||
-           (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
-            block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
-               return false;
-       if (!block->bb_u.s.bb_rightsib ||
-           (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
-            block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
-               return false;
-
-       return true;
-}
-
-static void
-xfs_allocbt_read_verify(
-       struct xfs_buf  *bp)
-{
-       if (!xfs_btree_sblock_verify_crc(bp))
-               xfs_buf_ioerror(bp, EFSBADCRC);
-       else if (!xfs_allocbt_verify(bp))
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-       if (bp->b_error) {
-               trace_xfs_btree_corrupt(bp, _RET_IP_);
-               xfs_verifier_error(bp);
-       }
-}
-
-static void
-xfs_allocbt_write_verify(
-       struct xfs_buf  *bp)
-{
-       if (!xfs_allocbt_verify(bp)) {
-               trace_xfs_btree_corrupt(bp, _RET_IP_);
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               xfs_verifier_error(bp);
-               return;
-       }
-       xfs_btree_sblock_calc_crc(bp);
-}
-
-const struct xfs_buf_ops xfs_allocbt_buf_ops = {
-       .verify_read = xfs_allocbt_read_verify,
-       .verify_write = xfs_allocbt_write_verify,
-};
-
-
-#if defined(DEBUG) || defined(XFS_WARN)
-STATIC int
-xfs_allocbt_keys_inorder(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_key     *k1,
-       union xfs_btree_key     *k2)
-{
-       if (cur->bc_btnum == XFS_BTNUM_BNO) {
-               return be32_to_cpu(k1->alloc.ar_startblock) <
-                      be32_to_cpu(k2->alloc.ar_startblock);
-       } else {
-               return be32_to_cpu(k1->alloc.ar_blockcount) <
-                       be32_to_cpu(k2->alloc.ar_blockcount) ||
-                       (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
-                        be32_to_cpu(k1->alloc.ar_startblock) <
-                        be32_to_cpu(k2->alloc.ar_startblock));
-       }
-}
-
-STATIC int
-xfs_allocbt_recs_inorder(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_rec     *r1,
-       union xfs_btree_rec     *r2)
-{
-       if (cur->bc_btnum == XFS_BTNUM_BNO) {
-               return be32_to_cpu(r1->alloc.ar_startblock) +
-                       be32_to_cpu(r1->alloc.ar_blockcount) <=
-                       be32_to_cpu(r2->alloc.ar_startblock);
-       } else {
-               return be32_to_cpu(r1->alloc.ar_blockcount) <
-                       be32_to_cpu(r2->alloc.ar_blockcount) ||
-                       (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
-                        be32_to_cpu(r1->alloc.ar_startblock) <
-                        be32_to_cpu(r2->alloc.ar_startblock));
-       }
-}
-#endif /* DEBUG || XFS_WARN */
-
-static const struct xfs_btree_ops xfs_allocbt_ops = {
-       .rec_len                = sizeof(xfs_alloc_rec_t),
-       .key_len                = sizeof(xfs_alloc_key_t),
-
-       .dup_cursor             = xfs_allocbt_dup_cursor,
-       .set_root               = xfs_allocbt_set_root,
-       .alloc_block            = xfs_allocbt_alloc_block,
-       .free_block             = xfs_allocbt_free_block,
-       .update_lastrec         = xfs_allocbt_update_lastrec,
-       .get_minrecs            = xfs_allocbt_get_minrecs,
-       .get_maxrecs            = xfs_allocbt_get_maxrecs,
-       .init_key_from_rec      = xfs_allocbt_init_key_from_rec,
-       .init_rec_from_key      = xfs_allocbt_init_rec_from_key,
-       .init_rec_from_cur      = xfs_allocbt_init_rec_from_cur,
-       .init_ptr_from_cur      = xfs_allocbt_init_ptr_from_cur,
-       .key_diff               = xfs_allocbt_key_diff,
-       .buf_ops                = &xfs_allocbt_buf_ops,
-#if defined(DEBUG) || defined(XFS_WARN)
-       .keys_inorder           = xfs_allocbt_keys_inorder,
-       .recs_inorder           = xfs_allocbt_recs_inorder,
-#endif
-};
-
-/*
- * Allocate a new allocation btree cursor.
- */
-struct xfs_btree_cur *                 /* new alloc btree cursor */
-xfs_allocbt_init_cursor(
-       struct xfs_mount        *mp,            /* file system mount point */
-       struct xfs_trans        *tp,            /* transaction pointer */
-       struct xfs_buf          *agbp,          /* buffer for agf structure */
-       xfs_agnumber_t          agno,           /* allocation group number */
-       xfs_btnum_t             btnum)          /* btree identifier */
-{
-       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
-       struct xfs_btree_cur    *cur;
-
-       ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
-
-       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
-
-       cur->bc_tp = tp;
-       cur->bc_mp = mp;
-       cur->bc_btnum = btnum;
-       cur->bc_blocklog = mp->m_sb.sb_blocklog;
-       cur->bc_ops = &xfs_allocbt_ops;
-
-       if (btnum == XFS_BTNUM_CNT) {
-               cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
-               cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
-       } else {
-               cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
-       }
-
-       cur->bc_private.a.agbp = agbp;
-       cur->bc_private.a.agno = agno;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb))
-               cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
-       return cur;
-}
-
-/*
- * Calculate number of records in an alloc btree block.
- */
-int
-xfs_allocbt_maxrecs(
-       struct xfs_mount        *mp,
-       int                     blocklen,
-       int                     leaf)
-{
-       blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
-
-       if (leaf)
-               return blocklen / sizeof(xfs_alloc_rec_t);
-       return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
-}
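
As a worked example of xfs_allocbt_maxrecs(), assuming the non-CRC on-disk
sizes (8-byte records and keys, 4-byte pointers, 16-byte short-form block
header): a 4096-byte block holds (4096 - 16) / 8 = 510 records in a leaf, and
(4096 - 16) / (8 + 4) = 340 key/pointer pairs in an interior node.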
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
deleted file mode 100644 (file)
index 45e189e..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_ALLOC_BTREE_H__
-#define        __XFS_ALLOC_BTREE_H__
-
-/*
- * Freespace on-disk structures
- */
-
-struct xfs_buf;
-struct xfs_btree_cur;
-struct xfs_mount;
-
-/*
- * Btree block header size depends on a superblock flag.
- */
-#define XFS_ALLOC_BLOCK_LEN(mp) \
-       (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
-               XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
-
-/*
- * Record, key, and pointer address macros for btree blocks.
- *
- * (note that some of these may appear unused, but they are used in userspace)
- */
-#define XFS_ALLOC_REC_ADDR(mp, block, index) \
-       ((xfs_alloc_rec_t *) \
-               ((char *)(block) + \
-                XFS_ALLOC_BLOCK_LEN(mp) + \
-                (((index) - 1) * sizeof(xfs_alloc_rec_t))))
-
-#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
-       ((xfs_alloc_key_t *) \
-               ((char *)(block) + \
-                XFS_ALLOC_BLOCK_LEN(mp) + \
-                ((index) - 1) * sizeof(xfs_alloc_key_t)))
-
-#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
-       ((xfs_alloc_ptr_t *) \
-               ((char *)(block) + \
-                XFS_ALLOC_BLOCK_LEN(mp) + \
-                (maxrecs) * sizeof(xfs_alloc_key_t) + \
-                ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
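
Note that these address macros use 1-based indices, matching the btree code:
record i of a block lives at (char *)block + XFS_ALLOC_BLOCK_LEN(mp) +
(i - 1) * sizeof(xfs_alloc_rec_t), and the pointer array only starts after
room for maxrecs keys, regardless of how many keys are actually in use.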
-
-extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
-               struct xfs_trans *, struct xfs_buf *,
-               xfs_agnumber_t, xfs_btnum_t);
-extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
-
-#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index faaf716e2080ad5d41cd86dd05c1ac8f4e3e2fad..11e9b4caa54f168f7e429f5d7e5a01302d026e1c 100644 (file)
@@ -240,7 +240,7 @@ xfs_end_io(
 
 done:
        if (error)
-               ioend->io_error = -error;
+               ioend->io_error = error;
        xfs_destroy_ioend(ioend);
 }
 
@@ -308,14 +308,14 @@ xfs_map_blocks(
        int                     nimaps = 1;
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return -XFS_ERROR(EIO);
+               return -EIO;
 
        if (type == XFS_IO_UNWRITTEN)
                bmapi_flags |= XFS_BMAPI_IGSTATE;
 
        if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
                if (nonblocking)
-                       return -XFS_ERROR(EAGAIN);
+                       return -EAGAIN;
                xfs_ilock(ip, XFS_ILOCK_SHARED);
        }
 
@@ -332,14 +332,14 @@ xfs_map_blocks(
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
        if (error)
-               return -XFS_ERROR(error);
+               return error;
 
        if (type == XFS_IO_DELALLOC &&
            (!nimaps || isnullstartblock(imap->br_startblock))) {
                error = xfs_iomap_write_allocate(ip, offset, imap);
                if (!error)
                        trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
-               return -XFS_ERROR(error);
+               return error;
        }
 
 #ifdef DEBUG
@@ -502,7 +502,7 @@ xfs_submit_ioend(
                 * time.
                 */
                if (fail) {
-                       ioend->io_error = -fail;
+                       ioend->io_error = fail;
                        xfs_finish_ioend(ioend);
                        continue;
                }
@@ -1253,7 +1253,7 @@ __xfs_get_blocks(
        int                     new = 0;
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return -XFS_ERROR(EIO);
+               return -EIO;
 
        offset = (xfs_off_t)iblock << inode->i_blkbits;
        ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1302,7 +1302,7 @@ __xfs_get_blocks(
                        error = xfs_iomap_write_direct(ip, offset, size,
                                                       &imap, nimaps);
                        if (error)
-                               return -error;
+                               return error;
                        new = 1;
                } else {
                        /*
@@ -1415,7 +1415,7 @@ __xfs_get_blocks(
 
 out_unlock:
        xfs_iunlock(ip, lockmode);
-       return -error;
+       return error;
 }
 
 int
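
The xfs_aops.c hunks above all follow one pattern: where XFS previously kept
positive errno values internally and negated them at the boundary (return
-XFS_ERROR(EIO), ioend->io_error = -error), the new code carries negative
errnos end to end, so both the negations and the XFS_ERROR() wrapper drop
out. A hypothetical before/after sketch of the two conventions:

	#include <errno.h>

	/* old convention: positive errno inside, negated when returned */
	static int old_style(void)
	{
		int error = EIO;	/* positive internally */

		return -error;		/* negate at the boundary */
	}

	/* new convention: negative errno everywhere, returned as-is */
	static int new_style(void)
	{
		int error = -EIO;

		return error;
	}

	int main(void)
	{
		return old_style() == new_style() ? 0 : 1;	/* exits 0 */
	}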
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
deleted file mode 100644 (file)
index bfe36fc..0000000
+++ /dev/null
@@ -1,1459 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_inode.h"
-#include "xfs_alloc.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_attr_remote.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_trans_space.h"
-#include "xfs_trace.h"
-#include "xfs_dinode.h"
-
-/*
- * xfs_attr.c
- *
- * Provide the external interfaces to manage attribute lists.
- */
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Internal routines when attribute list fits inside the inode.
- */
-STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
-
-/*
- * Internal routines when attribute list is one block.
- */
-STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
-STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
-STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
-
-/*
- * Internal routines when attribute list is more than one block.
- */
-STATIC int xfs_attr_node_get(xfs_da_args_t *args);
-STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
-STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
-STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
-STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
-
-
-STATIC int
-xfs_attr_args_init(
-       struct xfs_da_args      *args,
-       struct xfs_inode        *dp,
-       const unsigned char     *name,
-       int                     flags)
-{
-       if (!name)
-               return EINVAL;
-
-       memset(args, 0, sizeof(*args));
-       args->geo = dp->i_mount->m_attr_geo;
-       args->whichfork = XFS_ATTR_FORK;
-       args->dp = dp;
-       args->flags = flags;
-       args->name = name;
-       args->namelen = strlen((const char *)name);
-       if (args->namelen >= MAXNAMELEN)
-               return EFAULT;          /* match IRIX behaviour */
-
-       args->hashval = xfs_da_hashname(args->name, args->namelen);
-       return 0;
-}
-
-int
-xfs_inode_hasattr(
-       struct xfs_inode        *ip)
-{
-       if (!XFS_IFORK_Q(ip) ||
-           (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-            ip->i_d.di_anextents == 0))
-               return 0;
-       return 1;
-}
-
-/*========================================================================
- * Overall external interface routines.
- *========================================================================*/
-
-int
-xfs_attr_get(
-       struct xfs_inode        *ip,
-       const unsigned char     *name,
-       unsigned char           *value,
-       int                     *valuelenp,
-       int                     flags)
-{
-       struct xfs_da_args      args;
-       uint                    lock_mode;
-       int                     error;
-
-       XFS_STATS_INC(xs_attr_get);
-
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-               return EIO;
-
-       if (!xfs_inode_hasattr(ip))
-               return ENOATTR;
-
-       error = xfs_attr_args_init(&args, ip, name, flags);
-       if (error)
-               return error;
-
-       args.value = value;
-       args.valuelen = *valuelenp;
-
-       lock_mode = xfs_ilock_attr_map_shared(ip);
-       if (!xfs_inode_hasattr(ip))
-               error = ENOATTR;
-       else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
-               error = xfs_attr_shortform_getvalue(&args);
-       else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
-               error = xfs_attr_leaf_get(&args);
-       else
-               error = xfs_attr_node_get(&args);
-       xfs_iunlock(ip, lock_mode);
-
-       *valuelenp = args.valuelen;
-       return error == EEXIST ? 0 : error;
-}
-
-/*
- * Calculate how many blocks we need for the new attribute.
- */
-STATIC int
-xfs_attr_calc_size(
-       struct xfs_da_args      *args,
-       int                     *local)
-{
-       struct xfs_mount        *mp = args->dp->i_mount;
-       int                     size;
-       int                     nblks;
-
-       /*
-        * Determine the space the new attribute will use, and whether
-        * it will be "local" or "remote" (note: local != inline).
-        */
-       size = xfs_attr_leaf_newentsize(args, local);
-       nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
-       if (*local) {
-               if (size > (args->geo->blksize / 2)) {
-                       /* Double split possible */
-                       nblks *= 2;
-               }
-       } else {
-               /*
-                * Out of line attribute, cannot double split, but
-                * make room for the attribute value itself.
-                */
-               uint    dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen);
-               nblks += dblocks;
-               nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
-       }
-
-       return nblks;
-}
-
-int
-xfs_attr_set(
-       struct xfs_inode        *dp,
-       const unsigned char     *name,
-       unsigned char           *value,
-       int                     valuelen,
-       int                     flags)
-{
-       struct xfs_mount        *mp = dp->i_mount;
-       struct xfs_da_args      args;
-       struct xfs_bmap_free    flist;
-       struct xfs_trans_res    tres;
-       xfs_fsblock_t           firstblock;
-       int                     rsvd = (flags & ATTR_ROOT) != 0;
-       int                     error, err2, committed, local;
-
-       XFS_STATS_INC(xs_attr_set);
-
-       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-               return EIO;
-
-       error = xfs_attr_args_init(&args, dp, name, flags);
-       if (error)
-               return error;
-
-       args.value = value;
-       args.valuelen = valuelen;
-       args.firstblock = &firstblock;
-       args.flist = &flist;
-       args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
-       args.total = xfs_attr_calc_size(&args, &local);
-
-       error = xfs_qm_dqattach(dp, 0);
-       if (error)
-               return error;
-
-       /*
-        * If the inode doesn't have an attribute fork, add one.
-        * (inode must not be locked when we call this routine)
-        */
-       if (XFS_IFORK_Q(dp) == 0) {
-               int sf_size = sizeof(xfs_attr_sf_hdr_t) +
-                       XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen);
-
-               error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
-               if (error)
-                       return error;
-       }
-
-       /*
-        * Start our first transaction of the day.
-        *
-        * All future transactions during this code must be "chained" off
-        * this one via the trans_dup() call.  All transactions will contain
-        * the inode, and the inode will always be marked with trans_ihold().
-        * Since the inode will be locked in all transactions, we must log
-        * the inode in every transaction to let it float upward through
-        * the log.
-        */
-       args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
-
-       /*
-        * Root fork attributes can use reserved data blocks for this
-        * operation if necessary
-        */
-
-       if (rsvd)
-               args.trans->t_flags |= XFS_TRANS_RESERVE;
-
-       tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
-                        M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
-       tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
-       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-       error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
-       if (error) {
-               xfs_trans_cancel(args.trans, 0);
-               return error;
-       }
-       xfs_ilock(dp, XFS_ILOCK_EXCL);
-
-       error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
-                               rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
-                                      XFS_QMOPT_RES_REGBLKS);
-       if (error) {
-               xfs_iunlock(dp, XFS_ILOCK_EXCL);
-               xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
-               return error;
-       }
-
-       xfs_trans_ijoin(args.trans, dp, 0);
-
-       /*
-        * If the attribute list is non-existent or a shortform list,
-        * upgrade it to a single-leaf-block attribute list.
-        */
-       if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
-           (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-            dp->i_d.di_anextents == 0)) {
-
-               /*
-                * Build initial attribute list (if required).
-                */
-               if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
-                       xfs_attr_shortform_create(&args);
-
-               /*
-                * Try to add the attr to the attribute list in
-                * the inode.
-                */
-               error = xfs_attr_shortform_addname(&args);
-               if (error != ENOSPC) {
-                       /*
-                        * Commit the shortform mods, and we're done.
-                        * NOTE: this is also the error path (EEXIST, etc).
-                        */
-                       ASSERT(args.trans != NULL);
-
-                       /*
-                        * If this is a synchronous mount, make sure that
-                        * the transaction goes to disk before returning
-                        * to the user.
-                        */
-                       if (mp->m_flags & XFS_MOUNT_WSYNC)
-                               xfs_trans_set_sync(args.trans);
-
-                       if (!error && (flags & ATTR_KERNOTIME) == 0) {
-                               xfs_trans_ichgtime(args.trans, dp,
-                                                       XFS_ICHGTIME_CHG);
-                       }
-                       err2 = xfs_trans_commit(args.trans,
-                                                XFS_TRANS_RELEASE_LOG_RES);
-                       xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-                       return error ? error : err2;
-               }
-
-               /*
-                * It won't fit in the shortform, so transform to a leaf block.
-                * GROT: another possible req'mt for a double-split btree op.
-                */
-               xfs_bmap_init(args.flist, args.firstblock);
-               error = xfs_attr_shortform_to_leaf(&args);
-               if (!error) {
-                       error = xfs_bmap_finish(&args.trans, args.flist,
-                                               &committed);
-               }
-               if (error) {
-                       ASSERT(committed);
-                       args.trans = NULL;
-                       xfs_bmap_cancel(&flist);
-                       goto out;
-               }
-
-               /*
-                * bmap_finish() may have committed the last trans and started
-                * a new one.  We need the inode to be in all transactions.
-                */
-               if (committed)
-                       xfs_trans_ijoin(args.trans, dp, 0);
-
-               /*
-                * Commit the leaf transformation.  We'll need another (linked)
-                * transaction to add the new attribute to the leaf.
-                */
-
-               error = xfs_trans_roll(&args.trans, dp);
-               if (error)
-                       goto out;
-
-       }
-
-       if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
-               error = xfs_attr_leaf_addname(&args);
-       else
-               error = xfs_attr_node_addname(&args);
-       if (error)
-               goto out;
-
-       /*
-        * If this is a synchronous mount, make sure that the
-        * transaction goes to disk before returning to the user.
-        */
-       if (mp->m_flags & XFS_MOUNT_WSYNC)
-               xfs_trans_set_sync(args.trans);
-
-       if ((flags & ATTR_KERNOTIME) == 0)
-               xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
-
-       /*
-        * Commit the last in the sequence of transactions.
-        */
-       xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
-       error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
-       xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-       return error;
-
-out:
-       if (args.trans) {
-               xfs_trans_cancel(args.trans,
-                       XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-       }
-       xfs_iunlock(dp, XFS_ILOCK_EXCL);
-       return error;
-}
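
xfs_attr_set() above escalates through the three attribute formats: shortform
inside the inode, then a single leaf block, then a full node btree, converting
and retrying whenever the current format runs out of room (the ENOSPC paths).
A toy model of that escalation; the capacities are invented purely for
illustration:

	#include <errno.h>
	#include <stdio.h>

	enum fmt { SHORTFORM, LEAF, NODE };

	/* invented per-format capacities; real limits depend on geometry */
	static const int capacity[] = { 2, 8, 1000 };

	static int add_attr(enum fmt *f, int nattrs)
	{
		while (nattrs >= capacity[*f]) {
			if (*f == NODE)
				return -ENOSPC;	/* nothing larger to grow into */
			(*f)++;			/* shortform -> leaf -> node */
		}
		return 0;
	}

	int main(void)
	{
		enum fmt f = SHORTFORM;
		int i;

		for (i = 0; i < 10; i++)
			if (add_attr(&f, i) == 0)
				printf("attr %d stored in format %d\n", i, (int)f);
		return 0;
	}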
-
-/*
- * Generic handler routine to remove a name from an attribute list.
- * Transitions attribute list from Btree to shortform as necessary.
- */
-int
-xfs_attr_remove(
-       struct xfs_inode        *dp,
-       const unsigned char     *name,
-       int                     flags)
-{
-       struct xfs_mount        *mp = dp->i_mount;
-       struct xfs_da_args      args;
-       struct xfs_bmap_free    flist;
-       xfs_fsblock_t           firstblock;
-       int                     error;
-
-       XFS_STATS_INC(xs_attr_remove);
-
-       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-               return EIO;
-
-       if (!xfs_inode_hasattr(dp))
-               return ENOATTR;
-
-       error = xfs_attr_args_init(&args, dp, name, flags);
-       if (error)
-               return error;
-
-       args.firstblock = &firstblock;
-       args.flist = &flist;
-
-       /*
-        * we have no control over the attribute names that userspace passes us
-        * to remove, so we have to allow the name lookup prior to attribute
-        * removal to fail.
-        */
-       args.op_flags = XFS_DA_OP_OKNOENT;
-
-       error = xfs_qm_dqattach(dp, 0);
-       if (error)
-               return error;
-
-       /*
-        * Start our first transaction of the day.
-        *
-        * All future transactions during this code must be "chained" off
-        * this one via the trans_dup() call.  All transactions will contain
-        * the inode, and the inode will always be marked with trans_ihold().
-        * Since the inode will be locked in all transactions, we must log
-        * the inode in every transaction to let it float upward through
-        * the log.
-        */
-       args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
-
-       /*
-        * Root fork attributes can use reserved data blocks for this
-        * operation if necessary
-        */
-
-       if (flags & ATTR_ROOT)
-               args.trans->t_flags |= XFS_TRANS_RESERVE;
-
-       error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
-                                 XFS_ATTRRM_SPACE_RES(mp), 0);
-       if (error) {
-               xfs_trans_cancel(args.trans, 0);
-               return error;
-       }
-
-       xfs_ilock(dp, XFS_ILOCK_EXCL);
-       /*
-        * No need to make quota reservations here. We expect to release some
-        * blocks, not allocate them, in the common case.
-        */
-       xfs_trans_ijoin(args.trans, dp, 0);
-
-       if (!xfs_inode_hasattr(dp)) {
-               error = XFS_ERROR(ENOATTR);
-       } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
-               ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
-               error = xfs_attr_shortform_remove(&args);
-       } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
-               error = xfs_attr_leaf_removename(&args);
-       } else {
-               error = xfs_attr_node_removename(&args);
-       }
-
-       if (error)
-               goto out;
-
-       /*
-        * If this is a synchronous mount, make sure that the
-        * transaction goes to disk before returning to the user.
-        */
-       if (mp->m_flags & XFS_MOUNT_WSYNC)
-               xfs_trans_set_sync(args.trans);
-
-       if ((flags & ATTR_KERNOTIME) == 0)
-               xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
-
-       /*
-        * Commit the last in the sequence of transactions.
-        */
-       xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
-       error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
-       xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-       return error;
-
-out:
-       if (args.trans) {
-               xfs_trans_cancel(args.trans,
-                       XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-       }
-       xfs_iunlock(dp, XFS_ILOCK_EXCL);
-       return error;
-}
-
-/*========================================================================
- * External routines when attribute list is inside the inode
- *========================================================================*/
-
-/*
- * Add a name to the shortform attribute list structure
- * This is the external routine.
- */
-STATIC int
-xfs_attr_shortform_addname(xfs_da_args_t *args)
-{
-       int newsize, forkoff, retval;
-
-       trace_xfs_attr_sf_addname(args);
-
-       retval = xfs_attr_shortform_lookup(args);
-       if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
-               return(retval);
-       } else if (retval == EEXIST) {
-               if (args->flags & ATTR_CREATE)
-                       return(retval);
-               retval = xfs_attr_shortform_remove(args);
-               ASSERT(retval == 0);
-       }
-
-       if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
-           args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
-               return(XFS_ERROR(ENOSPC));
-
-       newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
-       newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
-
-       forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
-       if (!forkoff)
-               return(XFS_ERROR(ENOSPC));
-
-       xfs_attr_shortform_add(args, forkoff);
-       return(0);
-}
-
-
-/*========================================================================
- * External routines when attribute list is one block
- *========================================================================*/
-
-/*
- * Add a name to the leaf attribute list structure
- *
- * This leaf block cannot have a "remote" value; we only call this routine
- * if bmap_one_block() says there is only one block (i.e. no remote blks).
- */
-STATIC int
-xfs_attr_leaf_addname(xfs_da_args_t *args)
-{
-       xfs_inode_t *dp;
-       struct xfs_buf *bp;
-       int retval, error, committed, forkoff;
-
-       trace_xfs_attr_leaf_addname(args);
-
-       /*
-        * Read the (only) block in the attribute list in.
-        */
-       dp = args->dp;
-       args->blkno = 0;
-       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
-       if (error)
-               return error;
-
-       /*
-        * Look up the given attribute in the leaf block.  Figure out if
-        * the given flags produce an error or call for an atomic rename.
-        */
-       retval = xfs_attr3_leaf_lookup_int(bp, args);
-       if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
-               xfs_trans_brelse(args->trans, bp);
-               return retval;
-       } else if (retval == EEXIST) {
-               if (args->flags & ATTR_CREATE) {        /* pure create op */
-                       xfs_trans_brelse(args->trans, bp);
-                       return retval;
-               }
-
-               trace_xfs_attr_leaf_replace(args);
-
-               /* save the attribute state for later removal */
-               args->op_flags |= XFS_DA_OP_RENAME;     /* an atomic rename */
-               args->blkno2 = args->blkno;             /* set 2nd entry info */
-               args->index2 = args->index;
-               args->rmtblkno2 = args->rmtblkno;
-               args->rmtblkcnt2 = args->rmtblkcnt;
-               args->rmtvaluelen2 = args->rmtvaluelen;
-
-               /*
-                * clear the remote attr state now that it is saved so that the
-                * values reflect the state of the attribute we are about to
-                * add, not the attribute we just found and will remove later.
-                */
-               args->rmtblkno = 0;
-               args->rmtblkcnt = 0;
-               args->rmtvaluelen = 0;
-       }
-
-       /*
-        * Add the attribute to the leaf block, transitioning to a Btree
-        * if required.
-        */
-       retval = xfs_attr3_leaf_add(bp, args);
-       if (retval == ENOSPC) {
-               /*
-                * Promote the attribute list to the Btree format, then
-                * commit that transaction so that the node_addname() call
-                * can manage its own transactions.
-                */
-               xfs_bmap_init(args->flist, args->firstblock);
-               error = xfs_attr3_leaf_to_node(args);
-               if (!error) {
-                       error = xfs_bmap_finish(&args->trans, args->flist,
-                                               &committed);
-               }
-               if (error) {
-                       ASSERT(committed);
-                       args->trans = NULL;
-                       xfs_bmap_cancel(args->flist);
-                       return(error);
-               }
-
-               /*
-                * bmap_finish() may have committed the last trans and started
-                * a new one.  We need the inode to be in all transactions.
-                */
-               if (committed)
-                       xfs_trans_ijoin(args->trans, dp, 0);
-
-               /*
-                * Commit the current trans (including the inode) and start
-                * a new one.
-                */
-               error = xfs_trans_roll(&args->trans, dp);
-               if (error)
-                       return (error);
-
-               /*
-                * Fob the whole rest of the problem off on the Btree code.
-                */
-               error = xfs_attr_node_addname(args);
-               return(error);
-       }
-
-       /*
-        * Commit the transaction that added the attr name so that
-        * later routines can manage their own transactions.
-        */
-       error = xfs_trans_roll(&args->trans, dp);
-       if (error)
-               return (error);
-
-       /*
-        * If there was an out-of-line value, allocate the blocks we
-        * identified for its storage and copy the value.  This is done
-        * after we create the attribute so that we don't overflow the
-        * maximum size of a transaction and/or hit a deadlock.
-        */
-       if (args->rmtblkno > 0) {
-               error = xfs_attr_rmtval_set(args);
-               if (error)
-                       return(error);
-       }
-
-       /*
-        * If this is an atomic rename operation, we must "flip" the
-        * incomplete flags on the "new" and "old" attribute/value pairs
-        * so that one disappears and one appears atomically.  Then we
-        * must remove the "old" attribute/value pair.
-        */
-       if (args->op_flags & XFS_DA_OP_RENAME) {
-               /*
-                * In a separate transaction, set the incomplete flag on the
-                * "old" attr and clear the incomplete flag on the "new" attr.
-                */
-               error = xfs_attr3_leaf_flipflags(args);
-               if (error)
-                       return(error);
-
-               /*
-                * Dismantle the "old" attribute/value pair by removing
-                * a "remote" value (if it exists).
-                */
-               args->index = args->index2;
-               args->blkno = args->blkno2;
-               args->rmtblkno = args->rmtblkno2;
-               args->rmtblkcnt = args->rmtblkcnt2;
-               args->rmtvaluelen = args->rmtvaluelen2;
-               if (args->rmtblkno) {
-                       error = xfs_attr_rmtval_remove(args);
-                       if (error)
-                               return(error);
-               }
-
-               /*
-                * Read in the block containing the "old" attr, then
-                * remove the "old" attr from that block (neat, huh!)
-                */
-               error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
-                                          -1, &bp);
-               if (error)
-                       return error;
-
-               xfs_attr3_leaf_remove(bp, args);
-
-               /*
-                * If the result is small enough, shrink it all into the inode.
-                */
-               if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-                       xfs_bmap_init(args->flist, args->firstblock);
-                       error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
-                       /* bp is gone due to xfs_da_shrink_inode */
-                       if (!error) {
-                               error = xfs_bmap_finish(&args->trans,
-                                                       args->flist,
-                                                       &committed);
-                       }
-                       if (error) {
-                               ASSERT(committed);
-                               args->trans = NULL;
-                               xfs_bmap_cancel(args->flist);
-                               return(error);
-                       }
-
-                       /*
-                        * bmap_finish() may have committed the last trans
-                        * and started a new one.  We need the inode to be
-                        * in all transactions.
-                        */
-                       if (committed)
-                               xfs_trans_ijoin(args->trans, dp, 0);
-               }
-
-               /*
-                * Commit the remove and start the next trans in series.
-                */
-               error = xfs_trans_roll(&args->trans, dp);
-
-       } else if (args->rmtblkno > 0) {
-               /*
-                * Added a "remote" value, just clear the incomplete flag.
-                */
-               error = xfs_attr3_leaf_clearflag(args);
-       }
-       return error;
-}
-
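The XFS_DA_OP_RENAME path above gets its atomicity from the INCOMPLETE flag: lookups ignore flagged entries, so flipping the flags on the old and new copies switches which one is visible in a single step. A hedged toy model of that trick (invented entry layout, nothing kernel-specific):

#include <stdio.h>

#define INCOMPLETE 0x1

struct entry {
        const char *value;
        unsigned flags;
};

/* Readers only see an entry without the INCOMPLETE flag set. */
static const char *lookup(struct entry *e, int n)
{
        for (int i = 0; i < n; i++)
                if (!(e[i].flags & INCOMPLETE))
                        return e[i].value;
        return NULL;
}

int main(void)
{
        struct entry pair[2] = {
                { "old-value", 0 },             /* visible "old" attr */
                { "new-value", INCOMPLETE },    /* staged "new" attr */
        };

        printf("before flip: %s\n", lookup(pair, 2));
        /* The flip: old becomes incomplete, new becomes visible. */
        pair[0].flags |= INCOMPLETE;
        pair[1].flags &= ~INCOMPLETE;
        printf("after flip:  %s\n", lookup(pair, 2));
        /* The old entry can now be removed at leisure. */
        return 0;
}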
-/*
- * Remove a name from the leaf attribute list structure
- *
- * This leaf block cannot have a "remote" value; we only call this routine
- * if bmap_one_block() says there is only one block (i.e. no remote blocks).
- */
-STATIC int
-xfs_attr_leaf_removename(xfs_da_args_t *args)
-{
-       xfs_inode_t *dp;
-       struct xfs_buf *bp;
-       int error, committed, forkoff;
-
-       trace_xfs_attr_leaf_removename(args);
-
-       /*
-        * Remove the attribute.
-        */
-       dp = args->dp;
-       args->blkno = 0;
-       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
-       if (error)
-               return error;
-
-       error = xfs_attr3_leaf_lookup_int(bp, args);
-       if (error == ENOATTR) {
-               xfs_trans_brelse(args->trans, bp);
-               return error;
-       }
-
-       xfs_attr3_leaf_remove(bp, args);
-
-       /*
-        * If the result is small enough, shrink it all into the inode.
-        */
-       if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-               xfs_bmap_init(args->flist, args->firstblock);
-               error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
-               /* bp is gone due to xfs_da_shrink_inode */
-               if (!error) {
-                       error = xfs_bmap_finish(&args->trans, args->flist,
-                                               &committed);
-               }
-               if (error) {
-                       ASSERT(committed);
-                       args->trans = NULL;
-                       xfs_bmap_cancel(args->flist);
-                       return error;
-               }
-
-               /*
-                * bmap_finish() may have committed the last trans and started
-                * a new one.  We need the inode to be in all transactions.
-                */
-               if (committed)
-                       xfs_trans_ijoin(args->trans, dp, 0);
-       }
-       return 0;
-}
-
-/*
- * Look up a name in a leaf attribute list structure.
- *
- * This leaf block cannot have a "remote" value; we only call this routine
- * if bmap_one_block() says there is only one block (i.e. no remote blocks).
- */
-STATIC int
-xfs_attr_leaf_get(xfs_da_args_t *args)
-{
-       struct xfs_buf *bp;
-       int error;
-
-       trace_xfs_attr_leaf_get(args);
-
-       args->blkno = 0;
-       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
-       if (error)
-               return error;
-
-       error = xfs_attr3_leaf_lookup_int(bp, args);
-       if (error != EEXIST)  {
-               xfs_trans_brelse(args->trans, bp);
-               return error;
-       }
-       error = xfs_attr3_leaf_getvalue(bp, args);
-       xfs_trans_brelse(args->trans, bp);
-       if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
-               error = xfs_attr_rmtval_get(args);
-       }
-       return error;
-}
-
-/*========================================================================
- * External routines when attribute list size > geo->blksize
- *========================================================================*/
-
-/*
- * Add a name to a Btree-format attribute list.
- *
- * This will involve walking down the Btree, and may involve splitting
- * leaf nodes and even splitting intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- *
- * "Remote" attribute values confuse the issue and atomic rename operations
- * add a whole extra layer of confusion on top of that.
- */
-STATIC int
-xfs_attr_node_addname(xfs_da_args_t *args)
-{
-       xfs_da_state_t *state;
-       xfs_da_state_blk_t *blk;
-       xfs_inode_t *dp;
-       xfs_mount_t *mp;
-       int committed, retval, error;
-
-       trace_xfs_attr_node_addname(args);
-
-       /*
-        * Fill in bucket of arguments/results/context to carry around.
-        */
-       dp = args->dp;
-       mp = dp->i_mount;
-restart:
-       state = xfs_da_state_alloc();
-       state->args = args;
-       state->mp = mp;
-
-       /*
-        * Search to see if name already exists, and get back a pointer
-        * to where it should go.
-        */
-       error = xfs_da3_node_lookup_int(state, &retval);
-       if (error)
-               goto out;
-       blk = &state->path.blk[ state->path.active-1 ];
-       ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-       if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
-               goto out;
-       } else if (retval == EEXIST) {
-               if (args->flags & ATTR_CREATE)
-                       goto out;
-
-               trace_xfs_attr_node_replace(args);
-
-               /* save the attribute state for later removal */
-               args->op_flags |= XFS_DA_OP_RENAME;     /* atomic rename op */
-               args->blkno2 = args->blkno;             /* set 2nd entry info */
-               args->index2 = args->index;
-               args->rmtblkno2 = args->rmtblkno;
-               args->rmtblkcnt2 = args->rmtblkcnt;
-               args->rmtvaluelen2 = args->rmtvaluelen;
-
-               /*
-                * clear the remote attr state now that it is saved so that the
-                * values reflect the state of the attribute we are about to
-                * add, not the attribute we just found and will remove later.
-                */
-               args->rmtblkno = 0;
-               args->rmtblkcnt = 0;
-               args->rmtvaluelen = 0;
-       }
-
-       retval = xfs_attr3_leaf_add(blk->bp, state->args);
-       if (retval == ENOSPC) {
-               if (state->path.active == 1) {
-                       /*
-                        * It's really a single leaf node, but it had
-                        * out-of-line values so it looked like it *might*
-                        * have been a b-tree.
-                        */
-                       xfs_da_state_free(state);
-                       state = NULL;
-                       xfs_bmap_init(args->flist, args->firstblock);
-                       error = xfs_attr3_leaf_to_node(args);
-                       if (!error) {
-                               error = xfs_bmap_finish(&args->trans,
-                                                       args->flist,
-                                                       &committed);
-                       }
-                       if (error) {
-                               ASSERT(committed);
-                               args->trans = NULL;
-                               xfs_bmap_cancel(args->flist);
-                               goto out;
-                       }
-
-                       /*
-                        * bmap_finish() may have committed the last trans
-                        * and started a new one.  We need the inode to be
-                        * in all transactions.
-                        */
-                       if (committed)
-                               xfs_trans_ijoin(args->trans, dp, 0);
-
-                       /*
-                        * Commit the node conversion and start the next
-                        * trans in the chain.
-                        */
-                       error = xfs_trans_roll(&args->trans, dp);
-                       if (error)
-                               goto out;
-
-                       goto restart;
-               }
-
-               /*
-                * Split as many Btree elements as required.
-                * This code tracks the new and old attr's location
-                * in the index/blkno/rmtblkno/rmtblkcnt fields and
-                * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
-                */
-               xfs_bmap_init(args->flist, args->firstblock);
-               error = xfs_da3_split(state);
-               if (!error) {
-                       error = xfs_bmap_finish(&args->trans, args->flist,
-                                               &committed);
-               }
-               if (error) {
-                       ASSERT(committed);
-                       args->trans = NULL;
-                       xfs_bmap_cancel(args->flist);
-                       goto out;
-               }
-
-               /*
-                * bmap_finish() may have committed the last trans and started
-                * a new one.  We need the inode to be in all transactions.
-                */
-               if (committed)
-                       xfs_trans_ijoin(args->trans, dp, 0);
-       } else {
-               /*
-                * Addition succeeded, update Btree hashvals.
-                */
-               xfs_da3_fixhashpath(state, &state->path);
-       }
-
-       /*
-        * Kill the state structure, we're done with it and need to
-        * allow the buffers to come back later.
-        */
-       xfs_da_state_free(state);
-       state = NULL;
-
-       /*
-        * Commit the leaf addition or btree split and start the next
-        * trans in the chain.
-        */
-       error = xfs_trans_roll(&args->trans, dp);
-       if (error)
-               goto out;
-
-       /*
-        * If there was an out-of-line value, allocate the blocks we
-        * identified for its storage and copy the value.  This is done
-        * after we create the attribute so that we don't overflow the
-        * maximum size of a transaction and/or hit a deadlock.
-        */
-       if (args->rmtblkno > 0) {
-               error = xfs_attr_rmtval_set(args);
-               if (error)
-                       return(error);
-       }
-
-       /*
-        * If this is an atomic rename operation, we must "flip" the
-        * incomplete flags on the "new" and "old" attribute/value pairs
-        * so that one disappears and one appears atomically.  Then we
-        * must remove the "old" attribute/value pair.
-        */
-       if (args->op_flags & XFS_DA_OP_RENAME) {
-               /*
-                * In a separate transaction, set the incomplete flag on the
-                * "old" attr and clear the incomplete flag on the "new" attr.
-                */
-               error = xfs_attr3_leaf_flipflags(args);
-               if (error)
-                       goto out;
-
-               /*
-                * Dismantle the "old" attribute/value pair by removing
-                * a "remote" value (if it exists).
-                */
-               args->index = args->index2;
-               args->blkno = args->blkno2;
-               args->rmtblkno = args->rmtblkno2;
-               args->rmtblkcnt = args->rmtblkcnt2;
-               args->rmtvaluelen = args->rmtvaluelen2;
-               if (args->rmtblkno) {
-                       error = xfs_attr_rmtval_remove(args);
-                       if (error)
-                               return(error);
-               }
-
-               /*
-                * Re-find the "old" attribute entry after any split ops.
-                * The INCOMPLETE flag means that we will find the "old"
-                * attr, not the "new" one.
-                */
-               args->flags |= XFS_ATTR_INCOMPLETE;
-               state = xfs_da_state_alloc();
-               state->args = args;
-               state->mp = mp;
-               state->inleaf = 0;
-               error = xfs_da3_node_lookup_int(state, &retval);
-               if (error)
-                       goto out;
-
-               /*
-                * Remove the name and update the hashvals in the tree.
-                */
-               blk = &state->path.blk[ state->path.active-1 ];
-               ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-               error = xfs_attr3_leaf_remove(blk->bp, args);
-               xfs_da3_fixhashpath(state, &state->path);
-
-               /*
-                * Check to see if the tree needs to be collapsed.
-                */
-               if (retval && (state->path.active > 1)) {
-                       xfs_bmap_init(args->flist, args->firstblock);
-                       error = xfs_da3_join(state);
-                       if (!error) {
-                               error = xfs_bmap_finish(&args->trans,
-                                                       args->flist,
-                                                       &committed);
-                       }
-                       if (error) {
-                               ASSERT(committed);
-                               args->trans = NULL;
-                               xfs_bmap_cancel(args->flist);
-                               goto out;
-                       }
-
-                       /*
-                        * bmap_finish() may have committed the last trans
-                        * and started a new one.  We need the inode to be
-                        * in all transactions.
-                        */
-                       if (committed)
-                               xfs_trans_ijoin(args->trans, dp, 0);
-               }
-
-               /*
-                * Commit and start the next trans in the chain.
-                */
-               error = xfs_trans_roll(&args->trans, dp);
-               if (error)
-                       goto out;
-
-       } else if (args->rmtblkno > 0) {
-               /*
-                * Added a "remote" value, just clear the incomplete flag.
-                */
-               error = xfs_attr3_leaf_clearflag(args);
-               if (error)
-                       goto out;
-       }
-       retval = error = 0;
-
-out:
-       if (state)
-               xfs_da_state_free(state);
-       if (error)
-               return(error);
-       return(retval);
-}
-
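xfs_trans_roll(), used at every step above, commits the current transaction and opens a fresh one so a long multi-step change never outgrows a single transaction's reservation; the inode has to be re-joined to each new transaction to stay locked throughout. A rough userspace model of that pattern, with made-up types:

#include <stdio.h>

struct txn { int id; };
struct inode { int joined_txn; };

static int commit(struct txn *t)
{
        printf("commit txn %d\n", t->id);
        return 0;
}

/* Hypothetical roll: commit *t, then replace it with a new transaction. */
static int roll(struct txn *t, struct inode *ip)
{
        int err = commit(t);
        if (err)
                return err;
        t->id++;                        /* the "new" transaction */
        ip->joined_txn = t->id;         /* inode must join every txn */
        return 0;
}

int main(void)
{
        struct txn t = { 1 };
        struct inode ip = { 1 };
        for (int step = 0; step < 3; step++)
                if (roll(&t, &ip))
                        return 1;
        return 0;
}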
-/*
- * Remove a name from a Btree-format attribute list.
- *
- * This will involve walking down the Btree, and may involve joining
- * leaf nodes and even joining intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- */
-STATIC int
-xfs_attr_node_removename(xfs_da_args_t *args)
-{
-       xfs_da_state_t *state;
-       xfs_da_state_blk_t *blk;
-       xfs_inode_t *dp;
-       struct xfs_buf *bp;
-       int retval, error, committed, forkoff;
-
-       trace_xfs_attr_node_removename(args);
-
-       /*
-        * Tie a string around our finger to remind us where we are.
-        */
-       dp = args->dp;
-       state = xfs_da_state_alloc();
-       state->args = args;
-       state->mp = dp->i_mount;
-
-       /*
-        * Search to see if name exists, and get back a pointer to it.
-        */
-       error = xfs_da3_node_lookup_int(state, &retval);
-       if (error || (retval != EEXIST)) {
-               if (error == 0)
-                       error = retval;
-               goto out;
-       }
-
-       /*
-        * If there is an out-of-line value, de-allocate the blocks.
-        * This is done before we remove the attribute so that we don't
-        * overflow the maximum size of a transaction and/or hit a deadlock.
-        */
-       blk = &state->path.blk[ state->path.active-1 ];
-       ASSERT(blk->bp != NULL);
-       ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-       if (args->rmtblkno > 0) {
-               /*
-                * Fill in disk block numbers in the state structure
-                * so that we can get the buffers back after we commit
-                * several transactions in the following calls.
-                */
-               error = xfs_attr_fillstate(state);
-               if (error)
-                       goto out;
-
-               /*
-                * Mark the attribute as INCOMPLETE, then bunmapi() the
-                * remote value.
-                */
-               error = xfs_attr3_leaf_setflag(args);
-               if (error)
-                       goto out;
-               error = xfs_attr_rmtval_remove(args);
-               if (error)
-                       goto out;
-
-               /*
-                * Refill the state structure with buffers, the prior calls
-                * released our buffers.
-                */
-               error = xfs_attr_refillstate(state);
-               if (error)
-                       goto out;
-       }
-
-       /*
-        * Remove the name and update the hashvals in the tree.
-        */
-       blk = &state->path.blk[ state->path.active-1 ];
-       ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-       retval = xfs_attr3_leaf_remove(blk->bp, args);
-       xfs_da3_fixhashpath(state, &state->path);
-
-       /*
-        * Check to see if the tree needs to be collapsed.
-        */
-       if (retval && (state->path.active > 1)) {
-               xfs_bmap_init(args->flist, args->firstblock);
-               error = xfs_da3_join(state);
-               if (!error) {
-                       error = xfs_bmap_finish(&args->trans, args->flist,
-                                               &committed);
-               }
-               if (error) {
-                       ASSERT(committed);
-                       args->trans = NULL;
-                       xfs_bmap_cancel(args->flist);
-                       goto out;
-               }
-
-               /*
-                * bmap_finish() may have committed the last trans and started
-                * a new one.  We need the inode to be in all transactions.
-                */
-               if (committed)
-                       xfs_trans_ijoin(args->trans, dp, 0);
-
-               /*
-                * Commit the Btree join operation and start a new trans.
-                */
-               error = xfs_trans_roll(&args->trans, dp);
-               if (error)
-                       goto out;
-       }
-
-       /*
-        * If the result is small enough, push it all into the inode.
-        */
-       if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
-               /*
-                * Have to get rid of the copy of this dabuf in the state.
-                */
-               ASSERT(state->path.active == 1);
-               ASSERT(state->path.blk[0].bp);
-               state->path.blk[0].bp = NULL;
-
-               error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp);
-               if (error)
-                       goto out;
-
-               if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-                       xfs_bmap_init(args->flist, args->firstblock);
-                       error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
-                       /* bp is gone due to xfs_da_shrink_inode */
-                       if (!error) {
-                               error = xfs_bmap_finish(&args->trans,
-                                                       args->flist,
-                                                       &committed);
-                       }
-                       if (error) {
-                               ASSERT(committed);
-                               args->trans = NULL;
-                               xfs_bmap_cancel(args->flist);
-                               goto out;
-                       }
-
-                       /*
-                        * bmap_finish() may have committed the last trans
-                        * and started a new one.  We need the inode to be
-                        * in all transactions.
-                        */
-                       if (committed)
-                               xfs_trans_ijoin(args->trans, dp, 0);
-               } else
-                       xfs_trans_brelse(args->trans, bp);
-       }
-       error = 0;
-
-out:
-       xfs_da_state_free(state);
-       return(error);
-}
-
-/*
- * Fill in the disk block numbers in the state structure for the buffers
- * that are attached to the state structure.
- * This is done so that we can quickly reattach ourselves to those buffers
- * after some set of transaction commits have released these buffers.
- */
-STATIC int
-xfs_attr_fillstate(xfs_da_state_t *state)
-{
-       xfs_da_state_path_t *path;
-       xfs_da_state_blk_t *blk;
-       int level;
-
-       trace_xfs_attr_fillstate(state->args);
-
-       /*
-        * Roll down the "path" in the state structure, storing the on-disk
-        * block number for those buffers in the "path".
-        */
-       path = &state->path;
-       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
-       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
-               if (blk->bp) {
-                       blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
-                       blk->bp = NULL;
-               } else {
-                       blk->disk_blkno = 0;
-               }
-       }
-
-       /*
-        * Roll down the "altpath" in the state structure, storing the on-disk
-        * block number for those buffers in the "altpath".
-        */
-       path = &state->altpath;
-       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
-       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
-               if (blk->bp) {
-                       blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
-                       blk->bp = NULL;
-               } else {
-                       blk->disk_blkno = 0;
-               }
-       }
-
-       return(0);
-}
-
-/*
- * Reattach the buffers to the state structure based on the disk block
- * numbers stored in the state structure.
- * This is done after some set of transaction commits have released those
- * buffers from our grip.
- */
-STATIC int
-xfs_attr_refillstate(xfs_da_state_t *state)
-{
-       xfs_da_state_path_t *path;
-       xfs_da_state_blk_t *blk;
-       int level, error;
-
-       trace_xfs_attr_refillstate(state->args);
-
-       /*
-        * Roll down the "path" in the state structure, reattaching each
-        * buffer in the "path" via its stored on-disk block number.
-        */
-       path = &state->path;
-       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
-       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
-               if (blk->disk_blkno) {
-                       error = xfs_da3_node_read(state->args->trans,
-                                               state->args->dp,
-                                               blk->blkno, blk->disk_blkno,
-                                               &blk->bp, XFS_ATTR_FORK);
-                       if (error)
-                               return(error);
-               } else {
-                       blk->bp = NULL;
-               }
-       }
-
-       /*
-        * Roll down the "altpath" in the state structure, reattaching each
-        * buffer in the "altpath" via its stored on-disk block number.
-        */
-       path = &state->altpath;
-       ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
-       for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
-               if (blk->disk_blkno) {
-                       error = xfs_da3_node_read(state->args->trans,
-                                               state->args->dp,
-                                               blk->blkno, blk->disk_blkno,
-                                               &blk->bp, XFS_ATTR_FORK);
-                       if (error)
-                               return(error);
-               } else {
-                       blk->bp = NULL;
-               }
-       }
-
-       return(0);
-}
-
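The fillstate/refillstate pair above exists because a run of transaction commits releases our buffer references: before the commits we note each buffer's on-disk address and drop the pointer, and afterwards we read the buffers back by address. A sketch of just that bookkeeping, with a faked "read" and invented types:

#include <stddef.h>
#include <stdio.h>

struct blkstate {
        void *bp;               /* in-memory buffer, may be dropped */
        long  disk_blkno;       /* where to find it again */
};

static void fillstate(struct blkstate *blk, int n)
{
        for (int i = 0; i < n; i++) {
                /* pretend disk address; stands in for XFS_BUF_ADDR() */
                blk[i].disk_blkno = blk[i].bp ? (long)(i + 100) : 0;
                blk[i].bp = NULL;       /* release our reference */
        }
}

static void refillstate(struct blkstate *blk, int n)
{
        static char fake_buf[4];        /* stands in for a re-read buffer */
        for (int i = 0; i < n; i++)
                blk[i].bp = blk[i].disk_blkno ? &fake_buf[i] : NULL;
}

int main(void)
{
        char buf[2];
        struct blkstate path[2] = { { &buf[0], 0 }, { &buf[1], 0 } };

        fillstate(path, 2);
        printf("saved blkno %ld\n", path[0].disk_blkno);
        refillstate(path, 2);
        return 0;
}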
-/*
- * Look up a filename in a node attribute list.
- *
- * This routine gets called for any attribute fork that has more than one
- * block, i.e. both true Btree attr lists and single-leaf blocks with
- * "remote" values taking up more blocks.
- */
-STATIC int
-xfs_attr_node_get(xfs_da_args_t *args)
-{
-       xfs_da_state_t *state;
-       xfs_da_state_blk_t *blk;
-       int error, retval;
-       int i;
-
-       trace_xfs_attr_node_get(args);
-
-       state = xfs_da_state_alloc();
-       state->args = args;
-       state->mp = args->dp->i_mount;
-
-       /*
-        * Search to see if name exists, and get back a pointer to it.
-        */
-       error = xfs_da3_node_lookup_int(state, &retval);
-       if (error) {
-               retval = error;
-       } else if (retval == EEXIST) {
-               blk = &state->path.blk[ state->path.active-1 ];
-               ASSERT(blk->bp != NULL);
-               ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-
-               /*
-                * Get the value, local or "remote"
-                */
-               retval = xfs_attr3_leaf_getvalue(blk->bp, args);
-               if (!retval && (args->rmtblkno > 0)
-                   && !(args->flags & ATTR_KERNOVAL)) {
-                       retval = xfs_attr_rmtval_get(args);
-               }
-       }
-
-       /*
-        * If not in a transaction, we have to release all the buffers.
-        */
-       for (i = 0; i < state->path.active; i++) {
-               xfs_trans_brelse(args->trans, state->path.blk[i].bp);
-               state->path.blk[i].bp = NULL;
-       }
-
-       xfs_da_state_free(state);
-       return(retval);
-}
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 09480c57f06900ccb5fdba4817d561908f0deb1e..aa2a8b1838a2939d62ef104a25b4a4f98f1b9020 100644
@@ -76,7 +76,7 @@ xfs_attr3_leaf_freextent(
                error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
                                       &map, &nmap, XFS_BMAPI_ATTRFORK);
                if (error) {
-                       return(error);
+                       return error;
                }
                ASSERT(nmap == 1);
                ASSERT(map.br_startblock != DELAYSTARTBLOCK);
@@ -95,21 +95,21 @@ xfs_attr3_leaf_freextent(
                                        dp->i_mount->m_ddev_targp,
                                        dblkno, dblkcnt, 0);
                        if (!bp)
-                               return ENOMEM;
+                               return -ENOMEM;
                        xfs_trans_binval(*trans, bp);
                        /*
                         * Roll to next transaction.
                         */
                        error = xfs_trans_roll(trans, dp);
                        if (error)
-                               return (error);
+                               return error;
                }
 
                tblkno += map.br_blockcount;
                tblkcnt -= map.br_blockcount;
        }
 
-       return(0);
+       return 0;
 }
 
 /*
@@ -227,7 +227,7 @@ xfs_attr3_node_inactive(
         */
        if (level > XFS_DA_NODE_MAXDEPTH) {
                xfs_trans_brelse(*trans, bp);   /* no locks for later trans */
-               return XFS_ERROR(EIO);
+               return -EIO;
        }
 
        node = bp->b_addr;
@@ -256,7 +256,7 @@ xfs_attr3_node_inactive(
                error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
                                                XFS_ATTR_FORK);
                if (error)
-                       return(error);
+                       return error;
                if (child_bp) {
                                                /* save for re-read later */
                        child_blkno = XFS_BUF_ADDR(child_bp);
@@ -277,7 +277,7 @@ xfs_attr3_node_inactive(
                                                        child_bp);
                                break;
                        default:
-                               error = XFS_ERROR(EIO);
+                               error = -EIO;
                                xfs_trans_brelse(*trans, child_bp);
                                break;
                        }
@@ -360,7 +360,7 @@ xfs_attr3_root_inactive(
                error = xfs_attr3_leaf_inactive(trans, dp, bp);
                break;
        default:
-               error = XFS_ERROR(EIO);
+               error = -EIO;
                xfs_trans_brelse(*trans, bp);
                break;
        }
@@ -414,7 +414,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
        error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
        if (error) {
                xfs_trans_cancel(trans, 0);
-               return(error);
+               return error;
        }
        xfs_ilock(dp, XFS_ILOCK_EXCL);
 
@@ -443,10 +443,10 @@ xfs_attr_inactive(xfs_inode_t *dp)
        error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
        xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
-       return(error);
+       return error;
 
 out:
        xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
        xfs_iunlock(dp, XFS_ILOCK_EXCL);
-       return(error);
+       return error;
 }
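Every hunk in this file makes the same mechanical change: XFS's positive, XFS_ERROR()-wrapped return codes become plain negative errnos, the kernel-wide convention, so callers can test "if (error)" uniformly. A freestanding illustration using userspace errno values (not kernel code):

#include <errno.h>
#include <stdio.h>
#include <string.h>

static int do_step(int fail)
{
        if (fail)
                return -ENOMEM;         /* negative errno on failure */
        return 0;                       /* zero on success */
}

int main(void)
{
        int error = do_step(1);

        if (error)
                printf("failed: %s\n", strerror(-error));
        return 0;
}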
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
deleted file mode 100644
index 28712d2..0000000
--- a/fs/xfs/xfs_attr_leaf.c
+++ /dev/null
@@ -1,2697 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_attr_sf.h"
-#include "xfs_attr_remote.h"
-#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
-#include "xfs_dinode.h"
-#include "xfs_dir2.h"
-
-
-/*
- * xfs_attr_leaf.c
- *
- * Routines to implement leaf blocks of attributes as Btrees of hashed names.
- */
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Routines used for growing the Btree.
- */
-STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args,
-                                xfs_dablk_t which_block, struct xfs_buf **bpp);
-STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer,
-                                  struct xfs_attr3_icleaf_hdr *ichdr,
-                                  struct xfs_da_args *args, int freemap_index);
-STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args,
-                                  struct xfs_attr3_icleaf_hdr *ichdr,
-                                  struct xfs_buf *leaf_buffer);
-STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state,
-                                                  xfs_da_state_blk_t *blk1,
-                                                  xfs_da_state_blk_t *blk2);
-STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
-                       xfs_da_state_blk_t *leaf_blk_1,
-                       struct xfs_attr3_icleaf_hdr *ichdr1,
-                       xfs_da_state_blk_t *leaf_blk_2,
-                       struct xfs_attr3_icleaf_hdr *ichdr2,
-                       int *number_entries_in_blk1,
-                       int *number_usedbytes_in_blk1);
-
-/*
- * Utility routines.
- */
-STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
-                       struct xfs_attr_leafblock *src_leaf,
-                       struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
-                       struct xfs_attr_leafblock *dst_leaf,
-                       struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
-                       int move_count);
-STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
-
-void
-xfs_attr3_leaf_hdr_from_disk(
-       struct xfs_attr3_icleaf_hdr     *to,
-       struct xfs_attr_leafblock       *from)
-{
-       int     i;
-
-       ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
-              from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
-
-       if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
-               struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from;
-
-               to->forw = be32_to_cpu(hdr3->info.hdr.forw);
-               to->back = be32_to_cpu(hdr3->info.hdr.back);
-               to->magic = be16_to_cpu(hdr3->info.hdr.magic);
-               to->count = be16_to_cpu(hdr3->count);
-               to->usedbytes = be16_to_cpu(hdr3->usedbytes);
-               to->firstused = be16_to_cpu(hdr3->firstused);
-               to->holes = hdr3->holes;
-
-               for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
-                       to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base);
-                       to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size);
-               }
-               return;
-       }
-       to->forw = be32_to_cpu(from->hdr.info.forw);
-       to->back = be32_to_cpu(from->hdr.info.back);
-       to->magic = be16_to_cpu(from->hdr.info.magic);
-       to->count = be16_to_cpu(from->hdr.count);
-       to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
-       to->firstused = be16_to_cpu(from->hdr.firstused);
-       to->holes = from->hdr.holes;
-
-       for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
-               to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base);
-               to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size);
-       }
-}
-
-void
-xfs_attr3_leaf_hdr_to_disk(
-       struct xfs_attr_leafblock       *to,
-       struct xfs_attr3_icleaf_hdr     *from)
-{
-       int     i;
-
-       ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
-              from->magic == XFS_ATTR3_LEAF_MAGIC);
-
-       if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
-               struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to;
-
-               hdr3->info.hdr.forw = cpu_to_be32(from->forw);
-               hdr3->info.hdr.back = cpu_to_be32(from->back);
-               hdr3->info.hdr.magic = cpu_to_be16(from->magic);
-               hdr3->count = cpu_to_be16(from->count);
-               hdr3->usedbytes = cpu_to_be16(from->usedbytes);
-               hdr3->firstused = cpu_to_be16(from->firstused);
-               hdr3->holes = from->holes;
-               hdr3->pad1 = 0;
-
-               for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
-                       hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base);
-                       hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size);
-               }
-               return;
-       }
-       to->hdr.info.forw = cpu_to_be32(from->forw);
-       to->hdr.info.back = cpu_to_be32(from->back);
-       to->hdr.info.magic = cpu_to_be16(from->magic);
-       to->hdr.count = cpu_to_be16(from->count);
-       to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
-       to->hdr.firstused = cpu_to_be16(from->firstused);
-       to->hdr.holes = from->holes;
-       to->hdr.pad1 = 0;
-
-       for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
-               to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base);
-               to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size);
-       }
-}
-
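The two converters above copy each header field between its big-endian on-disk encoding and a native-endian in-core struct, field by field. A self-contained sketch of the same idea for a single 16-bit field, with hand-rolled helpers standing in for be16_to_cpu()/cpu_to_be16():

#include <stdint.h>
#include <stdio.h>

static uint16_t be16_to_host(const uint8_t b[2])
{
        return (uint16_t)((b[0] << 8) | b[1]);  /* on-disk -> in-core */
}

static void host_to_be16(uint16_t v, uint8_t b[2])
{
        b[0] = (uint8_t)(v >> 8);               /* in-core -> on-disk */
        b[1] = (uint8_t)(v & 0xff);
}

int main(void)
{
        uint8_t disk[2];

        host_to_be16(0xFAB5, disk);
        printf("0x%04x\n", be16_to_host(disk));
        return 0;
}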
-static bool
-xfs_attr3_leaf_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_attr_leafblock *leaf = bp->b_addr;
-       struct xfs_attr3_icleaf_hdr ichdr;
-
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
-               if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
-                       return false;
-
-               if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
-                       return false;
-               if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
-                       return false;
-       } else {
-               if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
-                       return false;
-       }
-       if (ichdr.count == 0)
-               return false;
-
-       /* XXX: need to range check rest of attr header values */
-       /* XXX: hash order check? */
-
-       return true;
-}
-
-static void
-xfs_attr3_leaf_write_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-       struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
-
-       if (!xfs_attr3_leaf_verify(bp)) {
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               xfs_verifier_error(bp);
-               return;
-       }
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (bip)
-               hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-       xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
-}
-
-/*
- * leaf/node format detection on trees is sketchy, so a node read can be done on
- * leaf level blocks when detection identifies the tree as a node format tree
- * incorrectly. In this case, we need to swap the verifier to match the correct
- * format of the block being read.
- */
-static void
-xfs_attr3_leaf_read_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb) &&
-            !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
-               xfs_buf_ioerror(bp, EFSBADCRC);
-       else if (!xfs_attr3_leaf_verify(bp))
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-       if (bp->b_error)
-               xfs_verifier_error(bp);
-}
-
-const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
-       .verify_read = xfs_attr3_leaf_read_verify,
-       .verify_write = xfs_attr3_leaf_write_verify,
-};
-
-int
-xfs_attr3_leaf_read(
-       struct xfs_trans        *tp,
-       struct xfs_inode        *dp,
-       xfs_dablk_t             bno,
-       xfs_daddr_t             mappedbno,
-       struct xfs_buf          **bpp)
-{
-       int                     err;
-
-       err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
-                               XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
-       if (!err && tp)
-               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
-       return err;
-}
-
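The verifier pair wired up above runs on every buffer crossing the disk boundary: the read hook rejects a block whose checksum or structure fails to verify, and the write hook re-stamps the checksum just before writeback. A toy model of that ops-table shape; the XOR "checksum" and all types are invented:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct buf { uint8_t data[8]; uint8_t cksum; int error; };

static uint8_t cksum(const struct buf *bp)
{
        uint8_t c = 0;
        for (size_t i = 0; i < sizeof(bp->data); i++)
                c ^= bp->data[i];
        return c;
}

static void verify_read(struct buf *bp)
{
        if (cksum(bp) != bp->cksum)
                bp->error = 1;          /* corrupt: reject the buffer */
}

static void verify_write(struct buf *bp)
{
        bp->cksum = cksum(bp);          /* re-stamp before writeback */
}

struct buf_ops {
        void (*verify_read)(struct buf *);
        void (*verify_write)(struct buf *);
};

static const struct buf_ops toy_leaf_buf_ops = {
        .verify_read  = verify_read,
        .verify_write = verify_write,
};

int main(void)
{
        struct buf b = { { 1, 2, 3, 4, 5, 6, 7, 8 }, 0, 0 };

        toy_leaf_buf_ops.verify_write(&b);
        toy_leaf_buf_ops.verify_read(&b);
        printf("error=%d\n", b.error);
        return 0;
}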
-/*========================================================================
- * Namespace helper routines
- *========================================================================*/
-
-/*
- * If namespace bits don't match return 0.
- * If all match then return 1.
- */
-STATIC int
-xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
-{
-       return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
-}
-
-
-/*========================================================================
- * External routines when attribute fork size < XFS_LITINO(mp).
- *========================================================================*/
-
-/*
- * Query whether the requested number of additional bytes of extended
- * attribute space will be able to fit inline.
- *
- * Returns zero if not, else the di_forkoff fork offset to be used in the
- * literal area for attribute data once the new bytes have been added.
- *
- * di_forkoff must be 8 byte aligned, hence is stored as a >>3 value;
- * special case for dev/uuid inodes: they have fixed-size data forks.
- */
-int
-xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
-{
-       int offset;
-       int minforkoff; /* lower limit on valid forkoff locations */
-       int maxforkoff; /* upper limit on valid forkoff locations */
-       int dsize;
-       xfs_mount_t *mp = dp->i_mount;
-
-       /* rounded down */
-       offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
-
-       switch (dp->i_d.di_format) {
-       case XFS_DINODE_FMT_DEV:
-               minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
-               return (offset >= minforkoff) ? minforkoff : 0;
-       case XFS_DINODE_FMT_UUID:
-               minforkoff = roundup(sizeof(uuid_t), 8) >> 3;
-               return (offset >= minforkoff) ? minforkoff : 0;
-       }
-
-       /*
-        * If the requested number of bytes is smaller than or equal to the
-        * current attribute fork size, we can always proceed.
-        *
-        * Note that if_bytes in the data fork might actually be larger than
-        * the current data fork size due to delalloc extents. In that
-        * case either the extent count will go down when they are converted
-        * to real extents, or the delalloc conversion will take care of the
-        * literal area rebalancing.
-        */
-       if (bytes <= XFS_IFORK_ASIZE(dp))
-               return dp->i_d.di_forkoff;
-
-       /*
-        * For attr2 we can try to move the forkoff if there is space in the
-        * literal area, but for the old format we are done if there is no
-        * space in the fixed attribute fork.
-        */
-       if (!(mp->m_flags & XFS_MOUNT_ATTR2))
-               return 0;
-
-       dsize = dp->i_df.if_bytes;
-
-       switch (dp->i_d.di_format) {
-       case XFS_DINODE_FMT_EXTENTS:
-               /*
-                * If there is no attr fork and the data fork is extents, 
-                * determine if creating the default attr fork will result
-                * in the extents form migrating to btree. If so, the
-                * minimum offset only needs to be the space required for
-                * the btree root.
-                */
-               if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
-                   xfs_default_attroffset(dp))
-                       dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
-               break;
-       case XFS_DINODE_FMT_BTREE:
-               /*
-                * If we have a data btree then keep the existing forkoff;
-                * otherwise we are adding a new attr, so set minforkoff to
-                * where the btree root can finish so we have plenty of
-                * room for attrs.
-                */
-               if (dp->i_d.di_forkoff) {
-                       if (offset < dp->i_d.di_forkoff)
-                               return 0;
-                       return dp->i_d.di_forkoff;
-               }
-               dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
-               break;
-       }
-
-       /*
-        * A data fork btree root must have space for at least
-        * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
-        */
-       minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
-       minforkoff = roundup(minforkoff, 8) >> 3;
-
-       /* attr fork btree root can have at least this many key/ptr pairs */
-       maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
-                       XFS_BMDR_SPACE_CALC(MINABTPTRS);
-       maxforkoff = maxforkoff >> 3;   /* rounded down */
-
-       if (offset >= maxforkoff)
-               return maxforkoff;
-       if (offset >= minforkoff)
-               return offset;
-       return 0;
-}
-
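The arithmetic above stores the fork split point 8-byte aligned and right-shifted by 3, returning 0 when the attribute bytes would collide with the data fork's minimum needs. A compact model with made-up sizes (not the real literal-area math):

#include <stdio.h>

#define LITINO  176     /* assumed inode literal area, in bytes */

static int bytesfit(int data_fork_min, int attr_bytes)
{
        int offset = (LITINO - attr_bytes) >> 3;        /* rounded down */
        int minforkoff = (data_fork_min + 7) >> 3;      /* rounded up  */

        return offset >= minforkoff ? offset : 0;
}

int main(void)
{
        printf("forkoff=%d\n", bytesfit(48, 64));   /* fits */
        printf("forkoff=%d\n", bytesfit(48, 160));  /* 0 = no room */
        return 0;
}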
-/*
- * Switch on the ATTR2 superblock bit (implies also FEATURES2)
- */
-STATIC void
-xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
-{
-       if ((mp->m_flags & XFS_MOUNT_ATTR2) &&
-           !(xfs_sb_version_hasattr2(&mp->m_sb))) {
-               spin_lock(&mp->m_sb_lock);
-               if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
-                       xfs_sb_version_addattr2(&mp->m_sb);
-                       spin_unlock(&mp->m_sb_lock);
-                       xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
-               } else
-                       spin_unlock(&mp->m_sb_lock);
-       }
-}
-
-/*
- * Create the initial contents of a shortform attribute list.
- */
-void
-xfs_attr_shortform_create(xfs_da_args_t *args)
-{
-       xfs_attr_sf_hdr_t *hdr;
-       xfs_inode_t *dp;
-       xfs_ifork_t *ifp;
-
-       trace_xfs_attr_sf_create(args);
-
-       dp = args->dp;
-       ASSERT(dp != NULL);
-       ifp = dp->i_afp;
-       ASSERT(ifp != NULL);
-       ASSERT(ifp->if_bytes == 0);
-       if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) {
-               ifp->if_flags &= ~XFS_IFEXTENTS;        /* just in case */
-               dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL;
-               ifp->if_flags |= XFS_IFINLINE;
-       } else {
-               ASSERT(ifp->if_flags & XFS_IFINLINE);
-       }
-       xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
-       hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
-       hdr->count = 0;
-       hdr->totsize = cpu_to_be16(sizeof(*hdr));
-       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
-}
-
-/*
- * Add a name/value pair to the shortform attribute list.
- * Overflow from the inode has already been checked for.
- */
-void
-xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
-{
-       xfs_attr_shortform_t *sf;
-       xfs_attr_sf_entry_t *sfe;
-       int i, offset, size;
-       xfs_mount_t *mp;
-       xfs_inode_t *dp;
-       xfs_ifork_t *ifp;
-
-       trace_xfs_attr_sf_add(args);
-
-       dp = args->dp;
-       mp = dp->i_mount;
-       dp->i_d.di_forkoff = forkoff;
-
-       ifp = dp->i_afp;
-       ASSERT(ifp->if_flags & XFS_IFINLINE);
-       sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
-       sfe = &sf->list[0];
-       for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
-#ifdef DEBUG
-               if (sfe->namelen != args->namelen)
-                       continue;
-               if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
-                       continue;
-               if (!xfs_attr_namesp_match(args->flags, sfe->flags))
-                       continue;
-               ASSERT(0);
-#endif
-       }
-
-       offset = (char *)sfe - (char *)sf;
-       size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
-       xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
-       sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
-       sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
-
-       sfe->namelen = args->namelen;
-       sfe->valuelen = args->valuelen;
-       sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
-       memcpy(sfe->nameval, args->name, args->namelen);
-       memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
-       sf->hdr.count++;
-       be16_add_cpu(&sf->hdr.totsize, size);
-       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
-
-       xfs_sbversion_add_attr2(mp, args->trans);
-}
-
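The append above packs name and value bytes back to back behind a small per-entry header, then bumps the list's count and total size. A freestanding sketch of that packing into a plain byte array; the layout is invented, not the on-disk xfs_attr_sf_entry format:

#include <stdio.h>
#include <string.h>

/* Append one entry; returns the new used size. */
static int sf_add(unsigned char *area, int used,
                  const char *name, const char *value)
{
        int namelen = (int)strlen(name);
        int valuelen = (int)strlen(value);

        area[used++] = (unsigned char)namelen;  /* tiny entry header */
        area[used++] = (unsigned char)valuelen;
        memcpy(area + used, name, namelen);     /* name bytes  */
        used += namelen;
        memcpy(area + used, value, valuelen);   /* value bytes */
        used += valuelen;
        return used;
}

int main(void)
{
        unsigned char area[128];
        int used = 0;

        used = sf_add(area, used, "user.color", "blue");
        printf("used=%d\n", used);
        return 0;
}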
-/*
- * After the last attribute is removed revert to original inode format,
- * making all literal area available to the data fork once more.
- */
-STATIC void
-xfs_attr_fork_reset(
-       struct xfs_inode        *ip,
-       struct xfs_trans        *tp)
-{
-       xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-       ip->i_d.di_forkoff = 0;
-       ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
-
-       ASSERT(ip->i_d.di_anextents == 0);
-       ASSERT(ip->i_afp == NULL);
-
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-}
-
-/*
- * Remove an attribute from the shortform attribute list structure.
- */
-int
-xfs_attr_shortform_remove(xfs_da_args_t *args)
-{
-       xfs_attr_shortform_t *sf;
-       xfs_attr_sf_entry_t *sfe;
-       int base, size=0, end, totsize, i;
-       xfs_mount_t *mp;
-       xfs_inode_t *dp;
-
-       trace_xfs_attr_sf_remove(args);
-
-       dp = args->dp;
-       mp = dp->i_mount;
-       base = sizeof(xfs_attr_sf_hdr_t);
-       sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
-       sfe = &sf->list[0];
-       end = sf->hdr.count;
-       for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
-                                       base += size, i++) {
-               size = XFS_ATTR_SF_ENTSIZE(sfe);
-               if (sfe->namelen != args->namelen)
-                       continue;
-               if (memcmp(sfe->nameval, args->name, args->namelen) != 0)
-                       continue;
-               if (!xfs_attr_namesp_match(args->flags, sfe->flags))
-                       continue;
-               break;
-       }
-       if (i == end)
-               return(XFS_ERROR(ENOATTR));
-
-       /*
-        * Fix up the attribute fork data, covering the hole
-        */
-       end = base + size;
-       totsize = be16_to_cpu(sf->hdr.totsize);
-       if (end != totsize)
-               memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end);
-       sf->hdr.count--;
-       be16_add_cpu(&sf->hdr.totsize, -size);
-
-       /*
-        * Fix up the start offset of the attribute fork
-        */
-       totsize -= size;
-       if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
-           (mp->m_flags & XFS_MOUNT_ATTR2) &&
-           (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
-           !(args->op_flags & XFS_DA_OP_ADDNAME)) {
-               xfs_attr_fork_reset(dp, args->trans);
-       } else {
-               xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
-               dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
-               ASSERT(dp->i_d.di_forkoff);
-               ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
-                               (args->op_flags & XFS_DA_OP_ADDNAME) ||
-                               !(mp->m_flags & XFS_MOUNT_ATTR2) ||
-                               dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
-               xfs_trans_log_inode(args->trans, dp,
-                                       XFS_ILOG_CORE | XFS_ILOG_ADATA);
-       }
-
-       xfs_sbversion_add_attr2(mp, args->trans);
-
-       return(0);
-}
-
-/*
- * Look up a name in a shortform attribute list structure.
- */
-/*ARGSUSED*/
-int
-xfs_attr_shortform_lookup(xfs_da_args_t *args)
-{
-       xfs_attr_shortform_t *sf;
-       xfs_attr_sf_entry_t *sfe;
-       int i;
-       xfs_ifork_t *ifp;
-
-       trace_xfs_attr_sf_lookup(args);
-
-       ifp = args->dp->i_afp;
-       ASSERT(ifp->if_flags & XFS_IFINLINE);
-       sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
-       sfe = &sf->list[0];
-       for (i = 0; i < sf->hdr.count;
-                               sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
-               if (sfe->namelen != args->namelen)
-                       continue;
-               if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
-                       continue;
-               if (!xfs_attr_namesp_match(args->flags, sfe->flags))
-                       continue;
-               return(XFS_ERROR(EEXIST));
-       }
-       return(XFS_ERROR(ENOATTR));
-}
-
-/*
- * Retrieve the value of a named attribute from a shortform attribute list.
- */
-/*ARGSUSED*/
-int
-xfs_attr_shortform_getvalue(xfs_da_args_t *args)
-{
-       xfs_attr_shortform_t *sf;
-       xfs_attr_sf_entry_t *sfe;
-       int i;
-
-       ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
-       sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
-       sfe = &sf->list[0];
-       for (i = 0; i < sf->hdr.count;
-                               sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
-               if (sfe->namelen != args->namelen)
-                       continue;
-               if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
-                       continue;
-               if (!xfs_attr_namesp_match(args->flags, sfe->flags))
-                       continue;
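-               /* ATTR_KERNOVAL callers only want the value length. */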
-               if (args->flags & ATTR_KERNOVAL) {
-                       args->valuelen = sfe->valuelen;
-                       return(XFS_ERROR(EEXIST));
-               }
-               if (args->valuelen < sfe->valuelen) {
-                       args->valuelen = sfe->valuelen;
-                       return(XFS_ERROR(ERANGE));
-               }
-               args->valuelen = sfe->valuelen;
-               memcpy(args->value, &sfe->nameval[args->namelen],
-                                                   args->valuelen);
-               return(XFS_ERROR(EEXIST));
-       }
-       return(XFS_ERROR(ENOATTR));
-}
-
-/*
- * Convert from using the shortform to the leaf.
- */
-int
-xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
-{
-       xfs_inode_t *dp;
-       xfs_attr_shortform_t *sf;
-       xfs_attr_sf_entry_t *sfe;
-       xfs_da_args_t nargs;
-       char *tmpbuffer;
-       int error, i, size;
-       xfs_dablk_t blkno;
-       struct xfs_buf *bp;
-       xfs_ifork_t *ifp;
-
-       trace_xfs_attr_sf_to_leaf(args);
-
-       dp = args->dp;
-       ifp = dp->i_afp;
-       sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
-       size = be16_to_cpu(sf->hdr.totsize);
-       tmpbuffer = kmem_alloc(size, KM_SLEEP);
-       ASSERT(tmpbuffer != NULL);
-       memcpy(tmpbuffer, ifp->if_u1.if_data, size);
-       sf = (xfs_attr_shortform_t *)tmpbuffer;
-
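-       /*
-        * Work from a private copy of the shortform data so the inline fork
-        * can be freed now and restored from the copy if the leaf allocation
-        * below fails.
-        */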
-       xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
-       xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
-
-       bp = NULL;
-       error = xfs_da_grow_inode(args, &blkno);
-       if (error) {
-               /*
-                * If we hit an IO error in the middle of the transaction
-                * inside grow_inode(), we may have inconsistent data. Bail
-                * out.
-                */
-               if (error == EIO)
-                       goto out;
-               xfs_idata_realloc(dp, size, XFS_ATTR_FORK);     /* try to put */
-               memcpy(ifp->if_u1.if_data, tmpbuffer, size);    /* it back */
-               goto out;
-       }
-
-       ASSERT(blkno == 0);
-       error = xfs_attr3_leaf_create(args, blkno, &bp);
-       if (error) {
-               error = xfs_da_shrink_inode(args, 0, bp);
-               bp = NULL;
-               if (error)
-                       goto out;
-               xfs_idata_realloc(dp, size, XFS_ATTR_FORK);     /* try to put */
-               memcpy(ifp->if_u1.if_data, tmpbuffer, size);    /* it back */
-               goto out;
-       }
-
-       memset((char *)&nargs, 0, sizeof(nargs));
-       nargs.dp = dp;
-       nargs.geo = args->geo;
-       nargs.firstblock = args->firstblock;
-       nargs.flist = args->flist;
-       nargs.total = args->total;
-       nargs.whichfork = XFS_ATTR_FORK;
-       nargs.trans = args->trans;
-       nargs.op_flags = XFS_DA_OP_OKNOENT;
-
-       sfe = &sf->list[0];
-       for (i = 0; i < sf->hdr.count; i++) {
-               nargs.name = sfe->nameval;
-               nargs.namelen = sfe->namelen;
-               nargs.value = &sfe->nameval[nargs.namelen];
-               nargs.valuelen = sfe->valuelen;
-               nargs.hashval = xfs_da_hashname(sfe->nameval,
-                                               sfe->namelen);
-               nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
-               error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
-               ASSERT(error == ENOATTR);
-               error = xfs_attr3_leaf_add(bp, &nargs);
-               ASSERT(error != ENOSPC);
-               if (error)
-                       goto out;
-               sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
-       }
-       error = 0;
-
-out:
-       kmem_free(tmpbuffer);
-       return(error);
-}
-
-/*
- * Check a leaf attribute block to see if all the entries would fit into
- * a shortform attribute list.
- */
-int
-xfs_attr_shortform_allfit(
-       struct xfs_buf          *bp,
-       struct xfs_inode        *dp)
-{
-       struct xfs_attr_leafblock *leaf;
-       struct xfs_attr_leaf_entry *entry;
-       xfs_attr_leaf_name_local_t *name_loc;
-       struct xfs_attr3_icleaf_hdr leafhdr;
-       int                     bytes;
-       int                     i;
-
-       leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
-       entry = xfs_attr3_leaf_entryp(leaf);
-
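-       /* Tally the shortform size of each complete, local entry. */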
-       bytes = sizeof(struct xfs_attr_sf_hdr);
-       for (i = 0; i < leafhdr.count; entry++, i++) {
-               if (entry->flags & XFS_ATTR_INCOMPLETE)
-                       continue;               /* don't copy partial entries */
-               if (!(entry->flags & XFS_ATTR_LOCAL))
-                       return(0);
-               name_loc = xfs_attr3_leaf_name_local(leaf, i);
-               if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
-                       return(0);
-               if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
-                       return(0);
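-               /* sizeof() already counts one byte of nameval[], hence the -1. */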
-               bytes += sizeof(struct xfs_attr_sf_entry) - 1
-                               + name_loc->namelen
-                               + be16_to_cpu(name_loc->valuelen);
-       }
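-       /*
-        * Returning -1 tells the caller the attribute fork can be removed
-        * entirely (ATTR2 and nothing left to copy back).
-        */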
-       if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
-           (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
-           (bytes == sizeof(struct xfs_attr_sf_hdr)))
-               return -1;
-       return xfs_attr_shortform_bytesfit(dp, bytes);
-}
-
-/*
- * Convert a leaf attribute list to shortform attribute list
- */
-int
-xfs_attr3_leaf_to_shortform(
-       struct xfs_buf          *bp,
-       struct xfs_da_args      *args,
-       int                     forkoff)
-{
-       struct xfs_attr_leafblock *leaf;
-       struct xfs_attr3_icleaf_hdr ichdr;
-       struct xfs_attr_leaf_entry *entry;
-       struct xfs_attr_leaf_name_local *name_loc;
-       struct xfs_da_args      nargs;
-       struct xfs_inode        *dp = args->dp;
-       char                    *tmpbuffer;
-       int                     error;
-       int                     i;
-
-       trace_xfs_attr_leaf_to_sf(args);
-
-       tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
-       if (!tmpbuffer)
-               return ENOMEM;
-
-       memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
-
-       leaf = (xfs_attr_leafblock_t *)tmpbuffer;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-       entry = xfs_attr3_leaf_entryp(leaf);
-
-       /* XXX (dgc): buffer is about to be marked stale - why zero it? */
-       memset(bp->b_addr, 0, args->geo->blksize);
-
-       /*
-        * Clean out the prior contents of the attribute list.
-        */
-       error = xfs_da_shrink_inode(args, 0, bp);
-       if (error)
-               goto out;
-
-       if (forkoff == -1) {
-               ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
-               ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
-               xfs_attr_fork_reset(dp, args->trans);
-               goto out;
-       }
-
-       xfs_attr_shortform_create(args);
-
-       /*
-        * Copy the attributes
-        */
-       memset((char *)&nargs, 0, sizeof(nargs));
-       nargs.geo = args->geo;
-       nargs.dp = dp;
-       nargs.firstblock = args->firstblock;
-       nargs.flist = args->flist;
-       nargs.total = args->total;
-       nargs.whichfork = XFS_ATTR_FORK;
-       nargs.trans = args->trans;
-       nargs.op_flags = XFS_DA_OP_OKNOENT;
-
-       for (i = 0; i < ichdr.count; entry++, i++) {
-               if (entry->flags & XFS_ATTR_INCOMPLETE)
-                       continue;       /* don't copy partial entries */
-               if (!entry->nameidx)
-                       continue;
-               ASSERT(entry->flags & XFS_ATTR_LOCAL);
-               name_loc = xfs_attr3_leaf_name_local(leaf, i);
-               nargs.name = name_loc->nameval;
-               nargs.namelen = name_loc->namelen;
-               nargs.value = &name_loc->nameval[nargs.namelen];
-               nargs.valuelen = be16_to_cpu(name_loc->valuelen);
-               nargs.hashval = be32_to_cpu(entry->hashval);
-               nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
-               xfs_attr_shortform_add(&nargs, forkoff);
-       }
-       error = 0;
-
-out:
-       kmem_free(tmpbuffer);
-       return error;
-}
-
-/*
- * Convert from using a single leaf to a root node and a leaf.
- */
-int
-xfs_attr3_leaf_to_node(
-       struct xfs_da_args      *args)
-{
-       struct xfs_attr_leafblock *leaf;
-       struct xfs_attr3_icleaf_hdr icleafhdr;
-       struct xfs_attr_leaf_entry *entries;
-       struct xfs_da_node_entry *btree;
-       struct xfs_da3_icnode_hdr icnodehdr;
-       struct xfs_da_intnode   *node;
-       struct xfs_inode        *dp = args->dp;
-       struct xfs_mount        *mp = dp->i_mount;
-       struct xfs_buf          *bp1 = NULL;
-       struct xfs_buf          *bp2 = NULL;
-       xfs_dablk_t             blkno;
-       int                     error;
-
-       trace_xfs_attr_leaf_to_node(args);
-
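-       /*
-        * Copy the existing leaf at block 0 into a newly allocated block,
-        * then reinitialise block 0 as a one-entry root node pointing at
-        * the copy.
-        */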
-       error = xfs_da_grow_inode(args, &blkno);
-       if (error)
-               goto out;
-       error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1);
-       if (error)
-               goto out;
-
-       error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK);
-       if (error)
-               goto out;
-
-       /* copy leaf to new buffer, update identifiers */
-       xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
-       bp2->b_ops = bp1->b_ops;
-       memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
-               hdr3->blkno = cpu_to_be64(bp2->b_bn);
-       }
-       xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
-
-       /*
-        * Set up the new root node.
-        */
-       error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
-       if (error)
-               goto out;
-       node = bp1->b_addr;
-       dp->d_ops->node_hdr_from_disk(&icnodehdr, node);
-       btree = dp->d_ops->node_tree_p(node);
-
-       leaf = bp2->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf);
-       entries = xfs_attr3_leaf_entryp(leaf);
-
-       /* both on-disk, don't endian-flip twice */
-       btree[0].hashval = entries[icleafhdr.count - 1].hashval;
-       btree[0].before = cpu_to_be32(blkno);
-       icnodehdr.count = 1;
-       dp->d_ops->node_hdr_to_disk(node, &icnodehdr);
-       xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1);
-       error = 0;
-out:
-       return error;
-}
-
-/*========================================================================
- * Routines used for growing the Btree.
- *========================================================================*/
-
-/*
- * Create the initial contents of a leaf attribute list
- * or a leaf in a node attribute list.
- */
-STATIC int
-xfs_attr3_leaf_create(
-       struct xfs_da_args      *args,
-       xfs_dablk_t             blkno,
-       struct xfs_buf          **bpp)
-{
-       struct xfs_attr_leafblock *leaf;
-       struct xfs_attr3_icleaf_hdr ichdr;
-       struct xfs_inode        *dp = args->dp;
-       struct xfs_mount        *mp = dp->i_mount;
-       struct xfs_buf          *bp;
-       int                     error;
-
-       trace_xfs_attr_leaf_create(args);
-
-       error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
-                                           XFS_ATTR_FORK);
-       if (error)
-               return error;
-       bp->b_ops = &xfs_attr3_leaf_buf_ops;
-       xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
-       leaf = bp->b_addr;
-       memset(leaf, 0, args->geo->blksize);
-
-       memset(&ichdr, 0, sizeof(ichdr));
-       ichdr.firstused = args->geo->blksize;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
-
-               ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
-
-               hdr3->blkno = cpu_to_be64(bp->b_bn);
-               hdr3->owner = cpu_to_be64(dp->i_ino);
-               uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
-
-               ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
-       } else {
-               ichdr.magic = XFS_ATTR_LEAF_MAGIC;
-               ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr);
-       }
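-       /*
-        * The single initial free region spans from the end of the header
-        * to the end of the block.
-        */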
-       ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
-
-       xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
-       xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
-
-       *bpp = bp;
-       return 0;
-}
-
-/*
- * Split the leaf node, rebalance, then add the new entry.
- */
-int
-xfs_attr3_leaf_split(
-       struct xfs_da_state     *state,
-       struct xfs_da_state_blk *oldblk,
-       struct xfs_da_state_blk *newblk)
-{
-       xfs_dablk_t blkno;
-       int error;
-
-       trace_xfs_attr_leaf_split(state->args);
-
-       /*
-        * Allocate space for a new leaf node.
-        */
-       ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
-       error = xfs_da_grow_inode(state->args, &blkno);
-       if (error)
-               return(error);
-       error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
-       if (error)
-               return(error);
-       newblk->blkno = blkno;
-       newblk->magic = XFS_ATTR_LEAF_MAGIC;
-
-       /*
-        * Rebalance the entries across the two leaves.
-        * NOTE: rebalance() currently depends on the 2nd block being empty.
-        */
-       xfs_attr3_leaf_rebalance(state, oldblk, newblk);
-       error = xfs_da3_blk_link(state, oldblk, newblk);
-       if (error)
-               return(error);
-
-       /*
-        * Save info on the "old" attribute for "atomic rename" ops; leaf_add()
-        * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
-        * "new" attr's info.  We will need the "old" info to remove it later.
-        *
-        * Insert the "new" entry in the correct block.
-        */
-       if (state->inleaf) {
-               trace_xfs_attr_leaf_add_old(state->args);
-               error = xfs_attr3_leaf_add(oldblk->bp, state->args);
-       } else {
-               trace_xfs_attr_leaf_add_new(state->args);
-               error = xfs_attr3_leaf_add(newblk->bp, state->args);
-       }
-
-       /*
-        * Update last hashval in each block since we added the name.
-        */
-       oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
-       newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
-       return(error);
-}
-
-/*
- * Add a name to the leaf attribute list structure.
- */
-int
-xfs_attr3_leaf_add(
-       struct xfs_buf          *bp,
-       struct xfs_da_args      *args)
-{
-       struct xfs_attr_leafblock *leaf;
-       struct xfs_attr3_icleaf_hdr ichdr;
-       int                     tablesize;
-       int                     entsize;
-       int                     sum;
-       int                     tmp;
-       int                     i;
-
-       trace_xfs_attr_leaf_add(args);
-
-       leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-       ASSERT(args->index >= 0 && args->index <= ichdr.count);
-       entsize = xfs_attr_leaf_newentsize(args, NULL);
-
-       /*
-        * Search through freemap for first-fit on new name length.
-        * (may need to figure in size of entry struct too)
-        */
-       tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t)
-                                       + xfs_attr3_leaf_hdr_size(leaf);
-       for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) {
-               if (tablesize > ichdr.firstused) {
-                       sum += ichdr.freemap[i].size;
-                       continue;
-               }
-               if (!ichdr.freemap[i].size)
-                       continue;       /* no space in this map */
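-               /*
-                * A free region that borders the entry table must also hold
-                * the new entry struct, not just the name/value bytes.
-                */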
-               tmp = entsize;
-               if (ichdr.freemap[i].base < ichdr.firstused)
-                       tmp += sizeof(xfs_attr_leaf_entry_t);
-               if (ichdr.freemap[i].size >= tmp) {
-                       tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i);
-                       goto out_log_hdr;
-               }
-               sum += ichdr.freemap[i].size;
-       }
-
-       /*
-        * If there are no holes in the address space of the block,
-        * and we don't have enough freespace, then compaction will do us
-        * no good and we should just give up.
-        */
-       if (!ichdr.holes && sum < entsize)
-               return XFS_ERROR(ENOSPC);
-
-       /*
-        * Compact the entries to coalesce free space.
-        * This may change the hdr->count via dropping INCOMPLETE entries.
-        */
-       xfs_attr3_leaf_compact(args, &ichdr, bp);
-
-       /*
-        * After compaction, the block is guaranteed to have only one
-        * free region, in freemap[0].  If it is not big enough, give up.
-        */
-       if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
-               tmp = ENOSPC;
-               goto out_log_hdr;
-       }
-
-       tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
-
-out_log_hdr:
-       xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
-       xfs_trans_log_buf(args->trans, bp,
-               XFS_DA_LOGRANGE(leaf, &leaf->hdr,
-                               xfs_attr3_leaf_hdr_size(leaf)));
-       return tmp;
-}
-
-/*
- * Add a name to a leaf attribute list structure.
- */
-STATIC int
-xfs_attr3_leaf_add_work(
-       struct xfs_buf          *bp,
-       struct xfs_attr3_icleaf_hdr *ichdr,
-       struct xfs_da_args      *args,
-       int                     mapindex)
-{
-       struct xfs_attr_leafblock *leaf;
-       struct xfs_attr_leaf_entry *entry;
-       struct xfs_attr_leaf_name_local *name_loc;
-       struct xfs_attr_leaf_name_remote *name_rmt;
-       struct xfs_mount        *mp;
-       int                     tmp;
-       int                     i;
-
-       trace_xfs_attr_leaf_add_work(args);
-
-       leaf = bp->b_addr;
-       ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE);
-       ASSERT(args->index >= 0 && args->index <= ichdr->count);
-
-       /*
-        * Force open some space in the entry array and fill it in.
-        */
-       entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
-       if (args->index < ichdr->count) {
-               tmp  = ichdr->count - args->index;
-               tmp *= sizeof(xfs_attr_leaf_entry_t);
-               memmove(entry + 1, entry, tmp);
-               xfs_trans_log_buf(args->trans, bp,
-                   XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
-       }
-       ichdr->count++;
-
-       /*
-        * Allocate space for the new string (at the end of the run).
-        */
-       mp = args->trans->t_mountp;
-       ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize);
-       ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
-       ASSERT(ichdr->freemap[mapindex].size >=
-               xfs_attr_leaf_newentsize(args, NULL));
-       ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize);
-       ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);
-
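-       /*
-        * xfs_attr_leaf_newentsize() also reports, in "tmp", whether the
-        * name/value pair fits locally in this leaf block.
-        */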
-       ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp);
-
-       entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
-                                    ichdr->freemap[mapindex].size);
-       entry->hashval = cpu_to_be32(args->hashval);
-       entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
-       entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
-       if (args->op_flags & XFS_DA_OP_RENAME) {
-               entry->flags |= XFS_ATTR_INCOMPLETE;
-               if ((args->blkno2 == args->blkno) &&
-                   (args->index2 <= args->index)) {
-                       args->index2++;
-               }
-       }
-       xfs_trans_log_buf(args->trans, bp,
-                         XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
-       ASSERT((args->index == 0) ||
-              (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
-       ASSERT((args->index == ichdr->count - 1) ||
-              (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
-
-       /*
-        * For "remote" attribute values, simply note that we need to
-        * allocate space for the "remote" value.  We can't actually
-        * allocate the extents in this transaction, and we can't decide
-        * which blocks they should be as we might allocate more blocks
-        * as part of this transaction (a split operation for example).
-        */
-       if (entry->flags & XFS_ATTR_LOCAL) {
-               name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
-               name_loc->namelen = args->namelen;
-               name_loc->valuelen = cpu_to_be16(args->valuelen);
-               memcpy((char *)name_loc->nameval, args->name, args->namelen);
-               memcpy((char *)&name_loc->nameval[args->namelen], args->value,
-                                  be16_to_cpu(name_loc->valuelen));
-       } else {
-               name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
-               name_rmt->namelen = args->namelen;
-               memcpy((char *)name_rmt->name, args->name, args->namelen);
-               entry->flags |= XFS_ATTR_INCOMPLETE;
-               /* just in case */
-               name_rmt->valuelen = 0;
-               name_rmt->valueblk = 0;
-               args->rmtblkno = 1;
-               args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
-               args->rmtvaluelen = args->valuelen;
-       }
-       xfs_trans_log_buf(args->trans, bp,
-            XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
-                                  xfs_attr_leaf_entsize(leaf, args->index)));
-
-       /*
-        * Update the control info for this leaf node
-        */
-       if (be16_to_cpu(entry->nameidx) < ichdr->firstused)
-               ichdr->firstused = be16_to_cpu(entry->nameidx);
-
-       ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t)
-                                       + xfs_attr3_leaf_hdr_size(leaf));
-       tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t)
-                                       + xfs_attr3_leaf_hdr_size(leaf);
-
-       for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
-               if (ichdr->freemap[i].base == tmp) {
-                       ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
-                       ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t);
-               }
-       }
-       ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
-       return 0;
-}
-
-/*
- * Garbage collect a leaf attribute list block by copying it to a new buffer.
- */
-STATIC void
-xfs_attr3_leaf_compact(
-       struct xfs_da_args      *args,
-       struct xfs_attr3_icleaf_hdr *ichdr_dst,
-       struct xfs_buf          *bp)
-{
-       struct xfs_attr_leafblock *leaf_src;
-       struct xfs_attr_leafblock *leaf_dst;
-       struct xfs_attr3_icleaf_hdr ichdr_src;
-       struct xfs_trans        *trans = args->trans;
-       char                    *tmpbuffer;
-
-       trace_xfs_attr_leaf_compact(args);
-
-       tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
-       memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
-       memset(bp->b_addr, 0, args->geo->blksize);
-       leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
-       leaf_dst = bp->b_addr;
-
-       /*
-        * Copy the on-disk header back into the destination buffer to ensure
-        * all the information in the header that is not part of the incore
-        * header structure is preserved.
-        */
-       memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
-
-       /* Initialise the incore headers */
-       ichdr_src = *ichdr_dst; /* struct copy */
-       ichdr_dst->firstused = args->geo->blksize;
-       ichdr_dst->usedbytes = 0;
-       ichdr_dst->count = 0;
-       ichdr_dst->holes = 0;
-       ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
-       ichdr_dst->freemap[0].size = ichdr_dst->firstused -
-                                               ichdr_dst->freemap[0].base;
-
-       /* write the header back to initialise the underlying buffer */
-       xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
-
-       /*
-        * Copy all entries in the same (sorted) order, but allocate the
-        * name/value pairs packed and in sequence.
-        */
-       xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0,
-                               leaf_dst, ichdr_dst, 0, ichdr_src.count);
-       /*
-        * This logs the entire buffer, but the caller must write the header
-        * back to the buffer when it is finished modifying it.
-        */
-       xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
-
-       kmem_free(tmpbuffer);
-}
-
-/*
- * Compare two leaf blocks "order".
- * Return 0 unless leaf2 should go before leaf1.
- */
-static int
-xfs_attr3_leaf_order(
-       struct xfs_buf  *leaf1_bp,
-       struct xfs_attr3_icleaf_hdr *leaf1hdr,
-       struct xfs_buf  *leaf2_bp,
-       struct xfs_attr3_icleaf_hdr *leaf2hdr)
-{
-       struct xfs_attr_leaf_entry *entries1;
-       struct xfs_attr_leaf_entry *entries2;
-
-       entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr);
-       entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr);
-       if (leaf1hdr->count > 0 && leaf2hdr->count > 0 &&
-           ((be32_to_cpu(entries2[0].hashval) <
-             be32_to_cpu(entries1[0].hashval)) ||
-            (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) <
-             be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) {
-               return 1;
-       }
-       return 0;
-}
-
-int
-xfs_attr_leaf_order(
-       struct xfs_buf  *leaf1_bp,
-       struct xfs_buf  *leaf2_bp)
-{
-       struct xfs_attr3_icleaf_hdr ichdr1;
-       struct xfs_attr3_icleaf_hdr ichdr2;
-
-       xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr);
-       xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr);
-       return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
-}
-
-/*
- * Redistribute the attribute list entries between two leaf nodes,
- * taking into account the size of the new entry.
- *
- * NOTE: if new block is empty, then it will get the upper half of the
- * old block.  At present, all (one) callers pass in an empty second block.
- *
- * This code adjusts the args->index/blkno and args->index2/blkno2 fields
- * to match what it is doing in splitting the attribute leaf block.  Those
- * values are used in "atomic rename" operations on attributes.  Note that
- * the "new" and "old" values can end up in different blocks.
- */
-STATIC void
-xfs_attr3_leaf_rebalance(
-       struct xfs_da_state     *state,
-       struct xfs_da_state_blk *blk1,
-       struct xfs_da_state_blk *blk2)
-{
-       struct xfs_da_args      *args;
-       struct xfs_attr_leafblock *leaf1;
-       struct xfs_attr_leafblock *leaf2;
-       struct xfs_attr3_icleaf_hdr ichdr1;
-       struct xfs_attr3_icleaf_hdr ichdr2;
-       struct xfs_attr_leaf_entry *entries1;
-       struct xfs_attr_leaf_entry *entries2;
-       int                     count;
-       int                     totallen;
-       int                     max;
-       int                     space;
-       int                     swap;
-
-       /*
-        * Set up environment.
-        */
-       ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
-       ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
-       leaf1 = blk1->bp->b_addr;
-       leaf2 = blk2->bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
-       xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
-       ASSERT(ichdr2.count == 0);
-       args = state->args;
-
-       trace_xfs_attr_leaf_rebalance(args);
-
-       /*
-        * Check ordering of blocks, reverse if it makes things simpler.
-        *
-        * NOTE: Given that all (current) callers pass in an empty
-        * second block, this code should never set "swap".
-        */
-       swap = 0;
-       if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) {
-               struct xfs_da_state_blk *tmp_blk;
-               struct xfs_attr3_icleaf_hdr tmp_ichdr;
-
-               tmp_blk = blk1;
-               blk1 = blk2;
-               blk2 = tmp_blk;
-
-               /* struct copies to swap them rather than reconverting */
-               tmp_ichdr = ichdr1;
-               ichdr1 = ichdr2;
-               ichdr2 = tmp_ichdr;
-
-               leaf1 = blk1->bp->b_addr;
-               leaf2 = blk2->bp->b_addr;
-               swap = 1;
-       }
-
-       /*
-        * Examine entries until we reduce the absolute difference in
-        * byte usage between the two blocks to a minimum.  Then get
-        * the direction to copy and the number of elements to move.
-        *
-        * "inleaf" is true if the new entry should be inserted into blk1.
-        * If "swap" is also true, then reverse the sense of "inleaf".
-        */
-       state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1,
-                                                     blk2, &ichdr2,
-                                                     &count, &totallen);
-       if (swap)
-               state->inleaf = !state->inleaf;
-
-       /*
-        * Move any entries required from leaf to leaf:
-        */
-       if (count < ichdr1.count) {
-               /*
-                * Figure the total bytes to be added to the destination leaf.
-                */
-               /* number entries being moved */
-               count = ichdr1.count - count;
-               space  = ichdr1.usedbytes - totallen;
-               space += count * sizeof(xfs_attr_leaf_entry_t);
-
-               /*
-                * leaf2 is the destination, compact it if it looks tight.
-                */
-               max  = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1);
-               max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t);
-               if (space > max)
-                       xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp);
-
-               /*
-                * Move high entries from leaf1 to low end of leaf2.
-                */
-               xfs_attr3_leaf_moveents(args, leaf1, &ichdr1,
-                               ichdr1.count - count, leaf2, &ichdr2, 0, count);
-
-       } else if (count > ichdr1.count) {
-               /*
-                * I assert that since all callers pass in an empty
-                * second buffer, this code should never execute.
-                */
-               ASSERT(0);
-
-               /*
-                * Figure the total bytes to be added to the destination leaf.
-                */
-               /* number entries being moved */
-               count -= ichdr1.count;
-               space  = totallen - ichdr1.usedbytes;
-               space += count * sizeof(xfs_attr_leaf_entry_t);
-
-               /*
-                * leaf1 is the destination, compact it if it looks tight.
-                */
-               max  = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1);
-               max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t);
-               if (space > max)
-                       xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp);
-
-               /*
-                * Move low entries from leaf2 to high end of leaf1.
-                */
-               xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1,
-                                       ichdr1.count, count);
-       }
-
-       xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1);
-       xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2);
-       xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
-       xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
-
-       /*
-        * Copy out last hashval in each block for B-tree code.
-        */
-       entries1 = xfs_attr3_leaf_entryp(leaf1);
-       entries2 = xfs_attr3_leaf_entryp(leaf2);
-       blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval);
-       blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval);
-
-       /*
-        * Adjust the expected index for insertion.
-        * NOTE: this code depends on the (current) situation that the
-        * second block was originally empty.
-        *
-        * If the insertion point moved to the 2nd block, we must adjust
-        * the index.  We must also track the entry just following the
-        * new entry for use in an "atomic rename" operation, that entry
-        * is always the "old" entry and the "new" entry is what we are
-        * inserting.  The index/blkno fields refer to the "old" entry,
-        * while the index2/blkno2 fields refer to the "new" entry.
-        */
-       if (blk1->index > ichdr1.count) {
-               ASSERT(state->inleaf == 0);
-               blk2->index = blk1->index - ichdr1.count;
-               args->index = args->index2 = blk2->index;
-               args->blkno = args->blkno2 = blk2->blkno;
-       } else if (blk1->index == ichdr1.count) {
-               if (state->inleaf) {
-                       args->index = blk1->index;
-                       args->blkno = blk1->blkno;
-                       args->index2 = 0;
-                       args->blkno2 = blk2->blkno;
-               } else {
-                       /*
-                        * On a double leaf split, the original attr location
-                        * is already stored in blkno2/index2, so don't
-                        * overwrite it; otherwise we corrupt the tree.
-                        */
-                       blk2->index = blk1->index - ichdr1.count;
-                       args->index = blk2->index;
-                       args->blkno = blk2->blkno;
-                       if (!state->extravalid) {
-                               /*
-                                * set the new attr location to match the old
-                                * one and let the higher level split code
-                                * decide where in the leaf to place it.
-                                */
-                               args->index2 = blk2->index;
-                               args->blkno2 = blk2->blkno;
-                       }
-               }
-       } else {
-               ASSERT(state->inleaf == 1);
-               args->index = args->index2 = blk1->index;
-               args->blkno = args->blkno2 = blk1->blkno;
-       }
-}
-
-/*
- * Examine entries until we reduce the absolute difference in
- * byte usage between the two blocks to a minimum.
- * GROT: Is this really necessary?  With other than a 512 byte blocksize,
- * GROT: there will always be enough room in either block for a new entry.
- * GROT: Do a double-split for this case?
- */
-STATIC int
-xfs_attr3_leaf_figure_balance(
-       struct xfs_da_state             *state,
-       struct xfs_da_state_blk         *blk1,
-       struct xfs_attr3_icleaf_hdr     *ichdr1,
-       struct xfs_da_state_blk         *blk2,
-       struct xfs_attr3_icleaf_hdr     *ichdr2,
-       int                             *countarg,
-       int                             *usedbytesarg)
-{
-       struct xfs_attr_leafblock       *leaf1 = blk1->bp->b_addr;
-       struct xfs_attr_leafblock       *leaf2 = blk2->bp->b_addr;
-       struct xfs_attr_leaf_entry      *entry;
-       int                             count;
-       int                             max;
-       int                             index;
-       int                             totallen = 0;
-       int                             half;
-       int                             lastdelta;
-       int                             foundit = 0;
-       int                             tmp;
-
-       /*
-        * Examine entries until we reduce the absolute difference in
-        * byte usage between the two blocks to a minimum.
-        */
-       max = ichdr1->count + ichdr2->count;
-       half = (max + 1) * sizeof(*entry);
-       half += ichdr1->usedbytes + ichdr2->usedbytes +
-                       xfs_attr_leaf_newentsize(state->args, NULL);
-       half /= 2;
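-       /*
-        * "half" is the byte-usage target for the lower block, counting
-        * the entry being added.
-        */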
-       lastdelta = state->args->geo->blksize;
-       entry = xfs_attr3_leaf_entryp(leaf1);
-       for (count = index = 0; count < max; entry++, index++, count++) {
-
-#define XFS_ATTR_ABS(A)        (((A) < 0) ? -(A) : (A))
-               /*
-                * The new entry is in the first block, account for it.
-                */
-               if (count == blk1->index) {
-                       tmp = totallen + sizeof(*entry) +
-                               xfs_attr_leaf_newentsize(state->args, NULL);
-                       if (XFS_ATTR_ABS(half - tmp) > lastdelta)
-                               break;
-                       lastdelta = XFS_ATTR_ABS(half - tmp);
-                       totallen = tmp;
-                       foundit = 1;
-               }
-
-               /*
-                * Wrap around into the second block if necessary.
-                */
-               if (count == ichdr1->count) {
-                       leaf1 = leaf2;
-                       entry = xfs_attr3_leaf_entryp(leaf1);
-                       index = 0;
-               }
-
-               /*
-                * Figure out if next leaf entry would be too much.
-                */
-               tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
-                                                                       index);
-               if (XFS_ATTR_ABS(half - tmp) > lastdelta)
-                       break;
-               lastdelta = XFS_ATTR_ABS(half - tmp);
-               totallen = tmp;
-#undef XFS_ATTR_ABS
-       }
-
-       /*
-        * Calculate the number of usedbytes that will end up in the lower
-        * block.  If the new entry is not in the lower block, fix up the count.
-        */
-       totallen -= count * sizeof(*entry);
-       if (foundit) {
-               totallen -= sizeof(*entry) +
-                               xfs_attr_leaf_newentsize(state->args, NULL);
-       }
-
-       *countarg = count;
-       *usedbytesarg = totallen;
-       return foundit;
-}
-
-/*========================================================================
- * Routines used for shrinking the Btree.
- *========================================================================*/
-
-/*
- * Check a leaf block and its neighbors to see if the block should be
- * collapsed into one or the other neighbor.  Always keep the block
- * with the smaller block number.
- * If the current block is over 50% full, don't try to join it, return 0.
- * If the block is empty, fill in the state structure and return 2.
- * If it can be collapsed, fill in the state structure and return 1.
- * If nothing can be done, return 0.
- *
- * GROT: allow for INCOMPLETE entries in calculation.
- */
-int
-xfs_attr3_leaf_toosmall(
-       struct xfs_da_state     *state,
-       int                     *action)
-{
-       struct xfs_attr_leafblock *leaf;
-       struct xfs_da_state_blk *blk;
-       struct xfs_attr3_icleaf_hdr ichdr;
-       struct xfs_buf          *bp;
-       xfs_dablk_t             blkno;
-       int                     bytes;
-       int                     forward;
-       int                     error;
-       int                     retval;
-       int                     i;
-
-       trace_xfs_attr_leaf_toosmall(state->args);
-
-       /*
-        * Check for the degenerate case of the block being over 50% full.
-        * If so, it's not worth even looking to see if we might be able
-        * to coalesce with a sibling.
-        */
-       blk = &state->path.blk[ state->path.active-1 ];
-       leaf = blk->bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-       bytes = xfs_attr3_leaf_hdr_size(leaf) +
-               ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
-               ichdr.usedbytes;
-       if (bytes > (state->args->geo->blksize >> 1)) {
-               *action = 0;    /* blk over 50%, don't try to join */
-               return(0);
-       }
-
-       /*
-        * Check for the degenerate case of the block being empty.
-        * If the block is empty, we'll simply delete it, no need to
-        * coalesce it with a sibling block.  We choose (arbitrarily)
-        * to merge with the forward block unless it is NULL.
-        */
-       if (ichdr.count == 0) {
-               /*
-                * Make altpath point to the block we want to keep and
-                * path point to the block we want to drop (this one).
-                */
-               forward = (ichdr.forw != 0);
-               memcpy(&state->altpath, &state->path, sizeof(state->path));
-               error = xfs_da3_path_shift(state, &state->altpath, forward,
-                                                0, &retval);
-               if (error)
-                       return(error);
-               if (retval) {
-                       *action = 0;
-               } else {
-                       *action = 2;
-               }
-               return 0;
-       }
-
-       /*
-        * Examine each sibling block to see if we can coalesce with
-        * at least 25% free space to spare.  We need to figure out
-        * whether to merge with the forward or the backward block.
-        * We prefer coalescing with the lower numbered sibling so as
-        * to shrink an attribute list over time.
-        */
-       /* start with smaller blk num */
-       forward = ichdr.forw < ichdr.back;
-       for (i = 0; i < 2; forward = !forward, i++) {
-               struct xfs_attr3_icleaf_hdr ichdr2;
-               if (forward)
-                       blkno = ichdr.forw;
-               else
-                       blkno = ichdr.back;
-               if (blkno == 0)
-                       continue;
-               error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
-                                       blkno, -1, &bp);
-               if (error)
-                       return(error);
-
-               xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
-
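-               /*
-                * "bytes" is what would remain after reserving 25% of the
-                * block; if it is non-negative, the two leaves can merge
-                * with at least 25% free space to spare.
-                */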
-               bytes = state->args->geo->blksize -
-                       (state->args->geo->blksize >> 2) -
-                       ichdr.usedbytes - ichdr2.usedbytes -
-                       ((ichdr.count + ichdr2.count) *
-                                       sizeof(xfs_attr_leaf_entry_t)) -
-                       xfs_attr3_leaf_hdr_size(leaf);
-
-               xfs_trans_brelse(state->args->trans, bp);
-               if (bytes >= 0)
-                       break;  /* fits with at least 25% to spare */
-       }
-       if (i >= 2) {
-               *action = 0;
-               return(0);
-       }
-
-       /*
-        * Make altpath point to the block we want to keep (the lower
-        * numbered block) and path point to the block we want to drop.
-        */
-       memcpy(&state->altpath, &state->path, sizeof(state->path));
-       if (blkno < blk->blkno) {
-               error = xfs_da3_path_shift(state, &state->altpath, forward,
-                                                0, &retval);
-       } else {
-               error = xfs_da3_path_shift(state, &state->path, forward,
-                                                0, &retval);
-       }
-       if (error)
-               return(error);
-       if (retval) {
-               *action = 0;
-       } else {
-               *action = 1;
-       }
-       return(0);
-}
-
-/*
- * Remove a name from the leaf attribute list structure.
- *
- * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
- * If two leaves are 37% full, when combined they will leave 25% free.
- */
-int
-xfs_attr3_leaf_remove(
-       struct xfs_buf          *bp,
-       struct xfs_da_args      *args)
-{
-       struct xfs_attr_leafblock *leaf;
-       struct xfs_attr3_icleaf_hdr ichdr;
-       struct xfs_attr_leaf_entry *entry;
-       int                     before;
-       int                     after;
-       int                     smallest;
-       int                     entsize;
-       int                     tablesize;
-       int                     tmp;
-       int                     i;
-
-       trace_xfs_attr_leaf_remove(args);
-
-       leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-
-       ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
-       ASSERT(args->index >= 0 && args->index < ichdr.count);
-       ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
-                                       xfs_attr3_leaf_hdr_size(leaf));
-
-       entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
-
-       ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
-       ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
-
-       /*
-        * Scan through free region table:
-        *    check for adjacency of free'd entry with an existing one,
-        *    find smallest free region in case we need to replace it,
-        *    adjust any map that borders the entry table,
-        */
-       tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t)
-                                       + xfs_attr3_leaf_hdr_size(leaf);
-       tmp = ichdr.freemap[0].size;
-       before = after = -1;
-       smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
-       entsize = xfs_attr_leaf_entsize(leaf, args->index);
-       for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
-               ASSERT(ichdr.freemap[i].base < args->geo->blksize);
-               ASSERT(ichdr.freemap[i].size < args->geo->blksize);
-               if (ichdr.freemap[i].base == tablesize) {
-                       ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
-                       ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
-               }
-
-               if (ichdr.freemap[i].base + ichdr.freemap[i].size ==
-                               be16_to_cpu(entry->nameidx)) {
-                       before = i;
-               } else if (ichdr.freemap[i].base ==
-                               (be16_to_cpu(entry->nameidx) + entsize)) {
-                       after = i;
-               } else if (ichdr.freemap[i].size < tmp) {
-                       tmp = ichdr.freemap[i].size;
-                       smallest = i;
-               }
-       }
-
-       /*
-        * Coalesce adjacent freemap regions,
-        * or replace the smallest region.
-        */
-       if ((before >= 0) || (after >= 0)) {
-               if ((before >= 0) && (after >= 0)) {
-                       ichdr.freemap[before].size += entsize;
-                       ichdr.freemap[before].size += ichdr.freemap[after].size;
-                       ichdr.freemap[after].base = 0;
-                       ichdr.freemap[after].size = 0;
-               } else if (before >= 0) {
-                       ichdr.freemap[before].size += entsize;
-               } else {
-                       ichdr.freemap[after].base = be16_to_cpu(entry->nameidx);
-                       ichdr.freemap[after].size += entsize;
-               }
-       } else {
-               /*
-                * Replace smallest region (if it is smaller than free'd entry)
-                */
-               if (ichdr.freemap[smallest].size < entsize) {
-                       ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx);
-                       ichdr.freemap[smallest].size = entsize;
-               }
-       }
-
-       /*
-        * Did we remove the first entry?
-        */
-       if (be16_to_cpu(entry->nameidx) == ichdr.firstused)
-               smallest = 1;
-       else
-               smallest = 0;
-
-       /*
-        * Compress the remaining entries and zero out the removed stuff.
-        */
-       memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize);
-       ichdr.usedbytes -= entsize;
-       xfs_trans_log_buf(args->trans, bp,
-            XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
-                                  entsize));
-
-       tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t);
-       memmove(entry, entry + 1, tmp);
-       ichdr.count--;
-       xfs_trans_log_buf(args->trans, bp,
-           XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t)));
-
-       entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count];
-       memset(entry, 0, sizeof(xfs_attr_leaf_entry_t));
-
-       /*
-        * If we removed the first entry, re-find the first used byte
-        * in the name area.  Note that if the entry was the "firstused",
-        * then we don't have a "hole" in our block resulting from
-        * removing the name.
-        */
-       if (smallest) {
-               tmp = args->geo->blksize;
-               entry = xfs_attr3_leaf_entryp(leaf);
-               for (i = ichdr.count - 1; i >= 0; entry++, i--) {
-                       ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
-                       ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
-
-                       if (be16_to_cpu(entry->nameidx) < tmp)
-                               tmp = be16_to_cpu(entry->nameidx);
-               }
-               ichdr.firstused = tmp;
-               if (!ichdr.firstused)
-                       ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN;
-       } else {
-               ichdr.holes = 1;        /* mark as needing compaction */
-       }
-       xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
-       xfs_trans_log_buf(args->trans, bp,
-                         XFS_DA_LOGRANGE(leaf, &leaf->hdr,
-                                         xfs_attr3_leaf_hdr_size(leaf)));
-
-       /*
-        * Check if the leaf is less than 37% full; if so, the caller may
-        * want to "join" the leaf with a sibling.
-        */
-       tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
-             ichdr.count * sizeof(xfs_attr_leaf_entry_t);
-
-       return tmp < args->geo->magicpct; /* leaf is < 37% full */
-}
-
-/*
- * Move all the attribute list entries from drop_leaf into save_leaf.
- */
-void
-xfs_attr3_leaf_unbalance(
-       struct xfs_da_state     *state,
-       struct xfs_da_state_blk *drop_blk,
-       struct xfs_da_state_blk *save_blk)
-{
-       struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr;
-       struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr;
-       struct xfs_attr3_icleaf_hdr drophdr;
-       struct xfs_attr3_icleaf_hdr savehdr;
-       struct xfs_attr_leaf_entry *entry;
-
-       trace_xfs_attr_leaf_unbalance(state->args);
-
-       drop_leaf = drop_blk->bp->b_addr;
-       save_leaf = save_blk->bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf);
-       xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf);
-       entry = xfs_attr3_leaf_entryp(drop_leaf);
-
-       /*
-        * Save last hashval from dying block for later Btree fixup.
-        */
-       drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval);
-
-       /*
-        * Check if we need a temp buffer, or can we do it in place.
-        * Note that we don't check "leaf" for holes because we will
-        * always be dropping it, toosmall() decided that for us already.
-        */
-       if (savehdr.holes == 0) {
-               /*
-                * dest leaf has no holes, so we add there.  May need
-                * to make some room in the entry array.
-                */
-               if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
-                                        drop_blk->bp, &drophdr)) {
-                       xfs_attr3_leaf_moveents(state->args,
-                                               drop_leaf, &drophdr, 0,
-                                               save_leaf, &savehdr, 0,
-                                               drophdr.count);
-               } else {
-                       xfs_attr3_leaf_moveents(state->args,
-                                               drop_leaf, &drophdr, 0,
-                                               save_leaf, &savehdr,
-                                               savehdr.count, drophdr.count);
-               }
-       } else {
-               /*
-                * Destination has holes, so we make a temporary copy
-                * of the leaf and add them both to that.
-                */
-               struct xfs_attr_leafblock *tmp_leaf;
-               struct xfs_attr3_icleaf_hdr tmphdr;
-
-               tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
-
-               /*
-                * Copy the header into the temp leaf so that all the stuff
-                * not in the incore header is present and gets copied back in
-                * once we've moved all the entries.
-                */
-               memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
-
-               memset(&tmphdr, 0, sizeof(tmphdr));
-               tmphdr.magic = savehdr.magic;
-               tmphdr.forw = savehdr.forw;
-               tmphdr.back = savehdr.back;
-               tmphdr.firstused = state->args->geo->blksize;
-
-               /* write the header to the temp buffer to initialise it */
-               xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
-
-               if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
-                                        drop_blk->bp, &drophdr)) {
-                       xfs_attr3_leaf_moveents(state->args,
-                                               drop_leaf, &drophdr, 0,
-                                               tmp_leaf, &tmphdr, 0,
-                                               drophdr.count);
-                       xfs_attr3_leaf_moveents(state->args,
-                                               save_leaf, &savehdr, 0,
-                                               tmp_leaf, &tmphdr, tmphdr.count,
-                                               savehdr.count);
-               } else {
-                       xfs_attr3_leaf_moveents(state->args,
-                                               save_leaf, &savehdr, 0,
-                                               tmp_leaf, &tmphdr, 0,
-                                               savehdr.count);
-                       xfs_attr3_leaf_moveents(state->args,
-                                               drop_leaf, &drophdr, 0,
-                                               tmp_leaf, &tmphdr, tmphdr.count,
-                                               drophdr.count);
-               }
-               memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
-               savehdr = tmphdr; /* struct copy */
-               kmem_free(tmp_leaf);
-       }
-
-       xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr);
-       xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
-                                          state->args->geo->blksize - 1);
-
-       /*
-        * Copy out last hashval in each block for B-tree code.
-        */
-       entry = xfs_attr3_leaf_entryp(save_leaf);
-       save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval);
-}
-
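The merge above exploits the fact that the hash ranges of two sibling leaves never overlap: xfs_attr3_leaf_order() decides which block sorts first, and the surviving leaf is then built as two contiguous runs of entries (via the temporary leaf when the destination already has holes). A minimal sketch of that shape, with a hypothetical type standing in for a leaf entry:

    #include <string.h>

    struct ent { unsigned int hashval; };   /* stand-in for a leaf entry */

    static void merge_blocks(const struct ent *lo, int nlo,
                             const struct ent *hi, int nhi,
                             struct ent *scratch)
    {
            /*
             * The two hash ranges are disjoint, so no interleaving is
             * needed: copy the lower-sorting block, then the higher one.
             */
            memcpy(scratch, lo, nlo * sizeof(*lo));
            memcpy(scratch + nlo, hi, nhi * sizeof(*hi));
    }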
-/*========================================================================
- * Routines used for finding things in the Btree.
- *========================================================================*/
-
-/*
- * Look up a name in a leaf attribute list structure.
- * This is the internal routine, it uses the caller's buffer.
- *
- * Note that duplicate keys are allowed, but only check within the
- * current leaf node.  The Btree code must check in adjacent leaf nodes.
- *
- * Return in args->index the index into the entry[] array of either
- * the found entry, or where the entry should have been (insert before
- * that entry).
- *
- * Don't change the args->value unless we find the attribute.
- */
-int
-xfs_attr3_leaf_lookup_int(
-       struct xfs_buf          *bp,
-       struct xfs_da_args      *args)
-{
-       struct xfs_attr_leafblock *leaf;
-       struct xfs_attr3_icleaf_hdr ichdr;
-       struct xfs_attr_leaf_entry *entry;
-       struct xfs_attr_leaf_entry *entries;
-       struct xfs_attr_leaf_name_local *name_loc;
-       struct xfs_attr_leaf_name_remote *name_rmt;
-       xfs_dahash_t            hashval;
-       int                     probe;
-       int                     span;
-
-       trace_xfs_attr_leaf_lookup(args);
-
-       leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-       entries = xfs_attr3_leaf_entryp(leaf);
-       ASSERT(ichdr.count < args->geo->blksize / 8);
-
-       /*
-        * Binary search.  (note: small blocks will skip this loop)
-        */
-       hashval = args->hashval;
-       probe = span = ichdr.count / 2;
-       for (entry = &entries[probe]; span > 4; entry = &entries[probe]) {
-               span /= 2;
-               if (be32_to_cpu(entry->hashval) < hashval)
-                       probe += span;
-               else if (be32_to_cpu(entry->hashval) > hashval)
-                       probe -= span;
-               else
-                       break;
-       }
-       ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count));
-       ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval);
-
-       /*
-        * Since we may have duplicate hashval's, find the first matching
-        * hashval in the leaf.
-        */
-       while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) {
-               entry--;
-               probe--;
-       }
-       while (probe < ichdr.count &&
-              be32_to_cpu(entry->hashval) < hashval) {
-               entry++;
-               probe++;
-       }
-       if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
-               args->index = probe;
-               return XFS_ERROR(ENOATTR);
-       }
-
-       /*
-        * Duplicate keys may be present, so search all of them for a match.
-        */
-       for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval);
-                       entry++, probe++) {
-/*
- * GROT: Add code to remove incomplete entries.
- */
-               /*
-                * Only consider entries whose INCOMPLETE flag state
-                * matches the state that the caller's flags asked for.
-                */
-               if ((args->flags & XFS_ATTR_INCOMPLETE) !=
-                   (entry->flags & XFS_ATTR_INCOMPLETE)) {
-                       continue;
-               }
-               if (entry->flags & XFS_ATTR_LOCAL) {
-                       name_loc = xfs_attr3_leaf_name_local(leaf, probe);
-                       if (name_loc->namelen != args->namelen)
-                               continue;
-                       if (memcmp(args->name, name_loc->nameval,
-                                                       args->namelen) != 0)
-                               continue;
-                       if (!xfs_attr_namesp_match(args->flags, entry->flags))
-                               continue;
-                       args->index = probe;
-                       return XFS_ERROR(EEXIST);
-               } else {
-                       name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
-                       if (name_rmt->namelen != args->namelen)
-                               continue;
-                       if (memcmp(args->name, name_rmt->name,
-                                                       args->namelen) != 0)
-                               continue;
-                       if (!xfs_attr_namesp_match(args->flags, entry->flags))
-                               continue;
-                       args->index = probe;
-                       args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
-                       args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
-                       args->rmtblkcnt = xfs_attr3_rmt_blocks(
-                                                       args->dp->i_mount,
-                                                       args->rmtvaluelen);
-                       return XFS_ERROR(EEXIST);
-               }
-       }
-       args->index = probe;
-       return XFS_ERROR(ENOATTR);
-}
-
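The probe above is a binary search that stops once the span is small, then steps linearly backwards so that runs of duplicate hashvals are always scanned from their first entry. The same pattern in isolation, over a plain sorted array (a hypothetical helper, not the kernel routine):

    #include <stdint.h>

    static int find_first_hash(const uint32_t *hashes, int count, uint32_t want)
    {
            int probe = count / 2;
            int span = count / 2;

            /* narrow until the span is small enough to walk linearly */
            while (span > 4) {
                    span /= 2;
                    if (hashes[probe] < want)
                            probe += span;
                    else if (hashes[probe] > want)
                            probe -= span;
                    else
                            break;
            }
            /* rewind to the start of a duplicate run ... */
            while (probe > 0 && hashes[probe] >= want)
                    probe--;
            /* ... then step forward to the first match, if any */
            while (probe < count && hashes[probe] < want)
                    probe++;
            if (probe == count || hashes[probe] != want)
                    return -1;      /* not found; probe was the insert point */
            return probe;
    }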
-/*
- * Get the value associated with an attribute name from a leaf attribute
- * list structure.
- */
-int
-xfs_attr3_leaf_getvalue(
-       struct xfs_buf          *bp,
-       struct xfs_da_args      *args)
-{
-       struct xfs_attr_leafblock *leaf;
-       struct xfs_attr3_icleaf_hdr ichdr;
-       struct xfs_attr_leaf_entry *entry;
-       struct xfs_attr_leaf_name_local *name_loc;
-       struct xfs_attr_leaf_name_remote *name_rmt;
-       int                     valuelen;
-
-       leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-       ASSERT(ichdr.count < args->geo->blksize / 8);
-       ASSERT(args->index < ichdr.count);
-
-       entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
-       if (entry->flags & XFS_ATTR_LOCAL) {
-               name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
-               ASSERT(name_loc->namelen == args->namelen);
-               ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
-               valuelen = be16_to_cpu(name_loc->valuelen);
-               if (args->flags & ATTR_KERNOVAL) {
-                       args->valuelen = valuelen;
-                       return 0;
-               }
-               if (args->valuelen < valuelen) {
-                       args->valuelen = valuelen;
-                       return XFS_ERROR(ERANGE);
-               }
-               args->valuelen = valuelen;
-               memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
-       } else {
-               name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
-               ASSERT(name_rmt->namelen == args->namelen);
-               ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
-               args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
-               args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
-               args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
-                                                      args->rmtvaluelen);
-               if (args->flags & ATTR_KERNOVAL) {
-                       args->valuelen = args->rmtvaluelen;
-                       return 0;
-               }
-               if (args->valuelen < args->rmtvaluelen) {
-                       args->valuelen = args->rmtvaluelen;
-                       return XFS_ERROR(ERANGE);
-               }
-               args->valuelen = args->rmtvaluelen;
-       }
-       return 0;
-}
-
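The ERANGE handling above doubles as a sizing handshake: ATTR_KERNOVAL returns only the length, and an undersized buffer fails with ERANGE after args->valuelen has been updated to the real size. A hedged sketch of a caller using that handshake; xfs_attr_get_sized() is a made-up stand-in for the real lookup call chain:

    static int get_attr_value(struct xfs_da_args *args)
    {
            int error;

            args->flags |= ATTR_KERNOVAL;           /* length query only */
            error = xfs_attr_get_sized(args);
            if (error)
                    return error;

            args->flags &= ~ATTR_KERNOVAL;
            args->value = kmem_alloc(args->valuelen, KM_SLEEP);
            return xfs_attr_get_sized(args);        /* ERANGE if it grew */
    }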
-/*========================================================================
- * Utility routines.
- *========================================================================*/
-
-/*
- * Move the indicated entries from one leaf to another.
- * NOTE: this routine modifies both source and destination leaves.
- */
-/*ARGSUSED*/
-STATIC void
-xfs_attr3_leaf_moveents(
-       struct xfs_da_args              *args,
-       struct xfs_attr_leafblock       *leaf_s,
-       struct xfs_attr3_icleaf_hdr     *ichdr_s,
-       int                             start_s,
-       struct xfs_attr_leafblock       *leaf_d,
-       struct xfs_attr3_icleaf_hdr     *ichdr_d,
-       int                             start_d,
-       int                             count)
-{
-       struct xfs_attr_leaf_entry      *entry_s;
-       struct xfs_attr_leaf_entry      *entry_d;
-       int                             desti;
-       int                             tmp;
-       int                             i;
-
-       /*
-        * Check for nothing to do.
-        */
-       if (count == 0)
-               return;
-
-       /*
-        * Set up environment.
-        */
-       ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
-              ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
-       ASSERT(ichdr_s->magic == ichdr_d->magic);
-       ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8);
-       ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
-                                       + xfs_attr3_leaf_hdr_size(leaf_s));
-       ASSERT(ichdr_d->count < args->geo->blksize / 8);
-       ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
-                                       + xfs_attr3_leaf_hdr_size(leaf_d));
-
-       ASSERT(start_s < ichdr_s->count);
-       ASSERT(start_d <= ichdr_d->count);
-       ASSERT(count <= ichdr_s->count);
-
-
-       /*
-        * Move the entries in the destination leaf up to make a hole?
-        */
-       if (start_d < ichdr_d->count) {
-               tmp  = ichdr_d->count - start_d;
-               tmp *= sizeof(xfs_attr_leaf_entry_t);
-               entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
-               entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count];
-               memmove(entry_d, entry_s, tmp);
-       }
-
-       /*
-        * Copy all entries in the same (sorted) order,
-        * but allocate attribute info packed and in sequence.
-        */
-       entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
-       entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
-       desti = start_d;
-       for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
-               ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused);
-               tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
-#ifdef GROT
-               /*
-                * Code to drop INCOMPLETE entries.  Difficult to use as we
-                * may also need to change the insertion index.  Code turned
-                * off for 6.2, should be revisited later.
-                */
-               if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
-                       memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
-                       ichdr_s->usedbytes -= tmp;
-                       ichdr_s->count -= 1;
-                       entry_d--;      /* to compensate for ++ in loop hdr */
-                       desti--;
-                       if ((start_s + i) < offset)
-                               result++;       /* insertion index adjustment */
-               } else {
-#endif /* GROT */
-                       ichdr_d->firstused -= tmp;
-                       /* both on-disk, don't endian flip twice */
-                       entry_d->hashval = entry_s->hashval;
-                       entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
-                       entry_d->flags = entry_s->flags;
-                       ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
-                                                       <= args->geo->blksize);
-                       memmove(xfs_attr3_leaf_name(leaf_d, desti),
-                               xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
-                       ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
-                                                       <= args->geo->blksize);
-                       memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
-                       ichdr_s->usedbytes -= tmp;
-                       ichdr_d->usedbytes += tmp;
-                       ichdr_s->count -= 1;
-                       ichdr_d->count += 1;
-                       tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t)
-                                       + xfs_attr3_leaf_hdr_size(leaf_d);
-                       ASSERT(ichdr_d->firstused >= tmp);
-#ifdef GROT
-               }
-#endif /* GROT */
-       }
-
-       /*
-        * Zero out the entries we just copied.
-        */
-       if (start_s == ichdr_s->count) {
-               tmp = count * sizeof(xfs_attr_leaf_entry_t);
-               entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
-               ASSERT(((char *)entry_s + tmp) <=
-                      ((char *)leaf_s + args->geo->blksize));
-               memset(entry_s, 0, tmp);
-       } else {
-               /*
-                * Move the remaining entries down to fill the hole,
-                * then zero the entries at the top.
-                */
-               tmp  = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t);
-               entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count];
-               entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
-               memmove(entry_d, entry_s, tmp);
-
-               tmp = count * sizeof(xfs_attr_leaf_entry_t);
-               entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
-               ASSERT(((char *)entry_s + tmp) <=
-                      ((char *)leaf_s + args->geo->blksize));
-               memset(entry_s, 0, tmp);
-       }
-
-       /*
-        * Fill in the freemap information
-        */
-       ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d);
-       ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t);
-       ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
-       ichdr_d->freemap[1].base = 0;
-       ichdr_d->freemap[2].base = 0;
-       ichdr_d->freemap[1].size = 0;
-       ichdr_d->freemap[2].size = 0;
-       ichdr_s->holes = 1;     /* leaf may not be compact */
-}
-
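The hole-opening step above is the classic overlapping-copy idiom: shift the tail of the array up with memmove (never memcpy, since source and destination overlap) before dropping the new entries in. In isolation, over a generic array:

    #include <string.h>

    static void open_gap(int *arr, int used, int at, int count)
    {
            /* overlapping ranges: memmove is required here */
            memmove(&arr[at + count], &arr[at],
                    (used - at) * sizeof(arr[0]));
            /* the gap is then filled by the caller */
            memset(&arr[at], 0, count * sizeof(arr[0]));
    }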
-/*
- * Pick up the last hashvalue from a leaf block.
- */
-xfs_dahash_t
-xfs_attr_leaf_lasthash(
-       struct xfs_buf  *bp,
-       int             *count)
-{
-       struct xfs_attr3_icleaf_hdr ichdr;
-       struct xfs_attr_leaf_entry *entries;
-
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr);
-       entries = xfs_attr3_leaf_entryp(bp->b_addr);
-       if (count)
-               *count = ichdr.count;
-       if (!ichdr.count)
-               return 0;
-       return be32_to_cpu(entries[ichdr.count - 1].hashval);
-}
-
-/*
- * Calculate the number of bytes used to store the indicated attribute
- * (whether local or remote, only calculate bytes in this block).
- */
-STATIC int
-xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
-{
-       struct xfs_attr_leaf_entry *entries;
-       xfs_attr_leaf_name_local_t *name_loc;
-       xfs_attr_leaf_name_remote_t *name_rmt;
-       int size;
-
-       entries = xfs_attr3_leaf_entryp(leaf);
-       if (entries[index].flags & XFS_ATTR_LOCAL) {
-               name_loc = xfs_attr3_leaf_name_local(leaf, index);
-               size = xfs_attr_leaf_entsize_local(name_loc->namelen,
-                                                  be16_to_cpu(name_loc->valuelen));
-       } else {
-               name_rmt = xfs_attr3_leaf_name_remote(leaf, index);
-               size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
-       }
-       return size;
-}
-
-/*
- * Calculate the number of bytes that would be required to store the new
- * attribute (whether local or remote, only calculate bytes in this block).
- * This routine decides as a side effect whether the attribute will be
- * a "local" or a "remote" attribute.
- */
-int
-xfs_attr_leaf_newentsize(
-       struct xfs_da_args      *args,
-       int                     *local)
-{
-       int                     size;
-
-       size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen);
-       if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) {
-               if (local)
-                       *local = 1;
-               return size;
-       }
-       if (local)
-               *local = 0;
-       return xfs_attr_leaf_entsize_remote(args->namelen);
-}
-
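The local/remote decision is purely size-driven: if the packed entry fits under the per-block local maximum, the value stays in the leaf; otherwise only the name does and the value moves to remote blocks. A simplified sketch with assumed constants; the real macros also account for struct padding and rounding, and the roughly three-quarters-of-a-block threshold is an assumption about this era's xfs_attr_leaf_entsize_local_max():

    #define LOCAL_HDR       3       /* valuelen(2) + namelen(1), assumed */
    #define REMOTE_HDR      11      /* valueblk + valuelen + namelen, assumed */

    static int newentsize(int namelen, int valuelen, int blksize, int *local)
    {
            int size = LOCAL_HDR + namelen + valuelen;
            int local_max = blksize / 2 + blksize / 4;  /* ~3/4 of a block */

            if (size < local_max) {
                    *local = 1;
                    return size;
            }
            *local = 0;
            return REMOTE_HDR + namelen;    /* value moves out of the leaf */
    }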
-
-/*========================================================================
- * Manage the INCOMPLETE flag in a leaf entry
- *========================================================================*/
-
-/*
- * Clear the INCOMPLETE flag on an entry in a leaf block.
- */
-int
-xfs_attr3_leaf_clearflag(
-       struct xfs_da_args      *args)
-{
-       struct xfs_attr_leafblock *leaf;
-       struct xfs_attr_leaf_entry *entry;
-       struct xfs_attr_leaf_name_remote *name_rmt;
-       struct xfs_buf          *bp;
-       int                     error;
-#ifdef DEBUG
-       struct xfs_attr3_icleaf_hdr ichdr;
-       xfs_attr_leaf_name_local_t *name_loc;
-       int namelen;
-       char *name;
-#endif /* DEBUG */
-
-       trace_xfs_attr_leaf_clearflag(args);
-       /*
-        * Set up the operation.
-        */
-       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
-       if (error)
-               return(error);
-
-       leaf = bp->b_addr;
-       entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
-       ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
-
-#ifdef DEBUG
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-       ASSERT(args->index < ichdr.count);
-       ASSERT(args->index >= 0);
-
-       if (entry->flags & XFS_ATTR_LOCAL) {
-               name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
-               namelen = name_loc->namelen;
-               name = (char *)name_loc->nameval;
-       } else {
-               name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
-               namelen = name_rmt->namelen;
-               name = (char *)name_rmt->name;
-       }
-       ASSERT(be32_to_cpu(entry->hashval) == args->hashval);
-       ASSERT(namelen == args->namelen);
-       ASSERT(memcmp(name, args->name, namelen) == 0);
-#endif /* DEBUG */
-
-       entry->flags &= ~XFS_ATTR_INCOMPLETE;
-       xfs_trans_log_buf(args->trans, bp,
-                        XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
-
-       if (args->rmtblkno) {
-               ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
-               name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
-               name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
-               name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
-               xfs_trans_log_buf(args->trans, bp,
-                        XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
-       }
-
-       /*
-        * Commit the flag value change and start the next trans in series.
-        */
-       return xfs_trans_roll(&args->trans, args->dp);
-}
-
-/*
- * Set the INCOMPLETE flag on an entry in a leaf block.
- */
-int
-xfs_attr3_leaf_setflag(
-       struct xfs_da_args      *args)
-{
-       struct xfs_attr_leafblock *leaf;
-       struct xfs_attr_leaf_entry *entry;
-       struct xfs_attr_leaf_name_remote *name_rmt;
-       struct xfs_buf          *bp;
-       int error;
-#ifdef DEBUG
-       struct xfs_attr3_icleaf_hdr ichdr;
-#endif
-
-       trace_xfs_attr_leaf_setflag(args);
-
-       /*
-        * Set up the operation.
-        */
-       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
-       if (error)
-               return(error);
-
-       leaf = bp->b_addr;
-#ifdef DEBUG
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-       ASSERT(args->index < ichdr.count);
-       ASSERT(args->index >= 0);
-#endif
-       entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
-
-       ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
-       entry->flags |= XFS_ATTR_INCOMPLETE;
-       xfs_trans_log_buf(args->trans, bp,
-                       XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
-       if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
-               name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
-               name_rmt->valueblk = 0;
-               name_rmt->valuelen = 0;
-               xfs_trans_log_buf(args->trans, bp,
-                        XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
-       }
-
-       /*
-        * Commit the flag value change and start the next trans in series.
-        */
-       return xfs_trans_roll(&args->trans, args->dp);
-}
-
-/*
- * In a single transaction, clear the INCOMPLETE flag on the leaf entry
- * given by args->blkno/index and set the INCOMPLETE flag on the leaf
- * entry given by args->blkno2/index2.
- *
- * Note that they could be in different blocks, or in the same block.
- */
-int
-xfs_attr3_leaf_flipflags(
-       struct xfs_da_args      *args)
-{
-       struct xfs_attr_leafblock *leaf1;
-       struct xfs_attr_leafblock *leaf2;
-       struct xfs_attr_leaf_entry *entry1;
-       struct xfs_attr_leaf_entry *entry2;
-       struct xfs_attr_leaf_name_remote *name_rmt;
-       struct xfs_buf          *bp1;
-       struct xfs_buf          *bp2;
-       int error;
-#ifdef DEBUG
-       struct xfs_attr3_icleaf_hdr ichdr1;
-       struct xfs_attr3_icleaf_hdr ichdr2;
-       xfs_attr_leaf_name_local_t *name_loc;
-       int namelen1, namelen2;
-       char *name1, *name2;
-#endif /* DEBUG */
-
-       trace_xfs_attr_leaf_flipflags(args);
-
-       /*
-        * Read the block containing the "old" attr
-        */
-       error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
-       if (error)
-               return error;
-
-       /*
-        * Read the block containing the "new" attr, if it is different
-        */
-       if (args->blkno2 != args->blkno) {
-               error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
-                                          -1, &bp2);
-               if (error)
-                       return error;
-       } else {
-               bp2 = bp1;
-       }
-
-       leaf1 = bp1->b_addr;
-       entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index];
-
-       leaf2 = bp2->b_addr;
-       entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
-
-#ifdef DEBUG
-       xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
-       ASSERT(args->index < ichdr1.count);
-       ASSERT(args->index >= 0);
-
-       xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
-       ASSERT(args->index2 < ichdr2.count);
-       ASSERT(args->index2 >= 0);
-
-       if (entry1->flags & XFS_ATTR_LOCAL) {
-               name_loc = xfs_attr3_leaf_name_local(leaf1, args->index);
-               namelen1 = name_loc->namelen;
-               name1 = (char *)name_loc->nameval;
-       } else {
-               name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
-               namelen1 = name_rmt->namelen;
-               name1 = (char *)name_rmt->name;
-       }
-       if (entry2->flags & XFS_ATTR_LOCAL) {
-               name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2);
-               namelen2 = name_loc->namelen;
-               name2 = (char *)name_loc->nameval;
-       } else {
-               name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
-               namelen2 = name_rmt->namelen;
-               name2 = (char *)name_rmt->name;
-       }
-       ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval));
-       ASSERT(namelen1 == namelen2);
-       ASSERT(memcmp(name1, name2, namelen1) == 0);
-#endif /* DEBUG */
-
-       ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
-       ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
-
-       entry1->flags &= ~XFS_ATTR_INCOMPLETE;
-       xfs_trans_log_buf(args->trans, bp1,
-                         XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
-       if (args->rmtblkno) {
-               ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
-               name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
-               name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
-               name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
-               xfs_trans_log_buf(args->trans, bp1,
-                        XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
-       }
-
-       entry2->flags |= XFS_ATTR_INCOMPLETE;
-       xfs_trans_log_buf(args->trans, bp2,
-                         XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
-       if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
-               name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
-               name_rmt->valueblk = 0;
-               name_rmt->valuelen = 0;
-               xfs_trans_log_buf(args->trans, bp2,
-                        XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
-       }
-
-       /*
-        * Commit the flag value change and start the next trans in series.
-        */
-       error = xfs_trans_roll(&args->trans, args->dp);
-
-       return error;
-}
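Taken together, setflag, clearflag and flipflags give higher layers a crash-safe replace: the new entry is created INCOMPLETE (invisible to lookups), its remote value is written and known to be on disk, and then one transaction flips both flags so the switch-over is atomic. A condensed sketch of that sequence; add_new_entry_incomplete() and remove_old_entry() are hypothetical stand-ins for the surrounding attr plumbing:

    static int replace_attr(struct xfs_da_args *args)
    {
            int error;

            /* 1: new entry goes in INCOMPLETE, lookups still see the old one */
            error = add_new_entry_incomplete(args);
            if (error)
                    return error;

            /* 2: remote value fully on disk before anything becomes visible */
            error = xfs_attr_rmtval_set(args);
            if (error)
                    return error;

            /* 3: one transaction makes the new entry live, the old one dead */
            error = xfs_attr3_leaf_flipflags(args);
            if (error)
                    return error;

            /* 4: tear down the old, now-INCOMPLETE entry */
            return remove_old_entry(args);
    }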
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
deleted file mode 100644 (file)
index e2929da..0000000
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_ATTR_LEAF_H__
-#define        __XFS_ATTR_LEAF_H__
-
-struct attrlist;
-struct attrlist_cursor_kern;
-struct xfs_attr_list_context;
-struct xfs_da_args;
-struct xfs_da_state;
-struct xfs_da_state_blk;
-struct xfs_inode;
-struct xfs_trans;
-
-/*
- * Used to keep a list of "remote value" extents when unlinking an inode.
- */
-typedef struct xfs_attr_inactive_list {
-       xfs_dablk_t     valueblk;       /* block number of value bytes */
-       int             valuelen;       /* number of bytes in value */
-} xfs_attr_inactive_list_t;
-
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Internal routines when attribute fork size < XFS_LITINO(mp).
- */
-void   xfs_attr_shortform_create(struct xfs_da_args *args);
-void   xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
-int    xfs_attr_shortform_lookup(struct xfs_da_args *args);
-int    xfs_attr_shortform_getvalue(struct xfs_da_args *args);
-int    xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
-int    xfs_attr_shortform_remove(struct xfs_da_args *args);
-int    xfs_attr_shortform_list(struct xfs_attr_list_context *context);
-int    xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
-int    xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
-
-
-/*
- * Internal routines when attribute fork size == XFS_LBSIZE(mp).
- */
-int    xfs_attr3_leaf_to_node(struct xfs_da_args *args);
-int    xfs_attr3_leaf_to_shortform(struct xfs_buf *bp,
-                                  struct xfs_da_args *args, int forkoff);
-int    xfs_attr3_leaf_clearflag(struct xfs_da_args *args);
-int    xfs_attr3_leaf_setflag(struct xfs_da_args *args);
-int    xfs_attr3_leaf_flipflags(struct xfs_da_args *args);
-
-/*
- * Routines used for growing the Btree.
- */
-int    xfs_attr3_leaf_split(struct xfs_da_state *state,
-                                  struct xfs_da_state_blk *oldblk,
-                                  struct xfs_da_state_blk *newblk);
-int    xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf,
-                                       struct xfs_da_args *args);
-int    xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args);
-int    xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer,
-                                struct xfs_da_args *args);
-int    xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
-                                   struct xfs_da_args *args);
-int    xfs_attr3_leaf_list_int(struct xfs_buf *bp,
-                                     struct xfs_attr_list_context *context);
-
-/*
- * Routines used for shrinking the Btree.
- */
-int    xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
-void   xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
-                                      struct xfs_da_state_blk *drop_blk,
-                                      struct xfs_da_state_blk *save_blk);
-int    xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
-
-/*
- * Utility routines.
- */
-xfs_dahash_t   xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count);
-int    xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
-                                  struct xfs_buf *leaf2_bp);
-int    xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
-int    xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
-                       xfs_dablk_t bno, xfs_daddr_t mappedbno,
-                       struct xfs_buf **bpp);
-void   xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
-                                    struct xfs_attr_leafblock *from);
-void   xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to,
-                                  struct xfs_attr3_icleaf_hdr *from);
-
-#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 90e2eeb2120758e24c5bf466ca80f98b6129ef9e..62db83ab6cbc5e45f7dc1527738897b653c98e0f 100644 (file)
@@ -50,11 +50,11 @@ xfs_attr_shortform_compare(const void *a, const void *b)
        sa = (xfs_attr_sf_sort_t *)a;
        sb = (xfs_attr_sf_sort_t *)b;
        if (sa->hash < sb->hash) {
-               return(-1);
+               return -1;
        } else if (sa->hash > sb->hash) {
-               return(1);
+               return 1;
        } else {
-               return(sa->entno - sb->entno);
+               return sa->entno - sb->entno;
        }
 }
 
@@ -86,7 +86,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
        sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
        ASSERT(sf != NULL);
        if (!sf->hdr.count)
-               return(0);
+               return 0;
        cursor = context->cursor;
        ASSERT(cursor != NULL);
 
@@ -124,7 +124,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
                        sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
                }
                trace_xfs_attr_list_sf_all(context);
-               return(0);
+               return 0;
        }
 
        /* do no more for a search callback */
@@ -150,7 +150,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
                                             XFS_ERRLEVEL_LOW,
                                             context->dp->i_mount, sfe);
                        kmem_free(sbuf);
-                       return XFS_ERROR(EFSCORRUPTED);
+                       return -EFSCORRUPTED;
                }
 
                sbp->entno = i;
@@ -188,7 +188,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
        }
        if (i == nsbuf) {
                kmem_free(sbuf);
-               return(0);
+               return 0;
        }
 
        /*
@@ -213,7 +213,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
        }
 
        kmem_free(sbuf);
-       return(0);
+       return 0;
 }
 
 STATIC int
@@ -243,8 +243,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
        if (cursor->blkno > 0) {
                error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1,
                                              &bp, XFS_ATTR_FORK);
-               if ((error != 0) && (error != EFSCORRUPTED))
-                       return(error);
+               if ((error != 0) && (error != -EFSCORRUPTED))
+                       return error;
                if (bp) {
                        struct xfs_attr_leaf_entry *entries;
 
@@ -295,7 +295,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
                                                      cursor->blkno, -1, &bp,
                                                      XFS_ATTR_FORK);
                        if (error)
-                               return(error);
+                               return error;
                        node = bp->b_addr;
                        magic = be16_to_cpu(node->hdr.info.magic);
                        if (magic == XFS_ATTR_LEAF_MAGIC ||
@@ -308,7 +308,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
                                                     context->dp->i_mount,
                                                     node);
                                xfs_trans_brelse(NULL, bp);
-                               return XFS_ERROR(EFSCORRUPTED);
+                               return -EFSCORRUPTED;
                        }
 
                        dp->d_ops->node_hdr_from_disk(&nodehdr, node);
@@ -496,11 +496,11 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
        context->cursor->blkno = 0;
        error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
        if (error)
-               return XFS_ERROR(error);
+               return error;
 
        error = xfs_attr3_leaf_list_int(bp, context);
        xfs_trans_brelse(NULL, bp);
-       return XFS_ERROR(error);
+       return error;
 }
 
 int
@@ -514,7 +514,7 @@ xfs_attr_list_int(
        XFS_STATS_INC(xs_attr_list);
 
        if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-               return EIO;
+               return -EIO;
 
        /*
         * Decide on what work routines to call based on the inode size.
@@ -616,16 +616,16 @@ xfs_attr_list(
         * Validate the cursor.
         */
        if (cursor->pad1 || cursor->pad2)
-               return(XFS_ERROR(EINVAL));
+               return -EINVAL;
        if ((cursor->initted == 0) &&
            (cursor->hashval || cursor->blkno || cursor->offset))
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        /*
         * Check for a properly aligned buffer.
         */
        if (((long)buffer) & (sizeof(int)-1))
-               return XFS_ERROR(EFAULT);
+               return -EFAULT;
        if (flags & ATTR_KERNOVAL)
                bufsize = 0;
 
@@ -648,6 +648,6 @@ xfs_attr_list(
        alist->al_offset[0] = context.bufsize;
 
        error = xfs_attr_list_int(&context);
-       ASSERT(error >= 0);
+       ASSERT(error <= 0);
        return error;
 }
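The hunks above are part of converting XFS from positive internal error codes, historically wrapped in XFS_ERROR() as a debugging hook, to the kernel-native negative-errno convention; the flipped ASSERT(error <= 0) at the end is the same change seen from the caller's side. In short:

    /* before: positive errno via the XFS_ERROR() wrapper */
    return XFS_ERROR(EFSCORRUPTED);

    /* after: kernel-native negative errno, no wrapper */
    return -EFSCORRUPTED;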
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
deleted file mode 100644 (file)
index b5adfec..0000000
+++ /dev/null
@@ -1,628 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_alloc.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_attr_remote.h"
-#include "xfs_trans_space.h"
-#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_buf_item.h"
-#include "xfs_error.h"
-
-#define ATTR_RMTVALUE_MAPSIZE  1       /* # of map entries at once */
-
-/*
- * Each contiguous block has a header, so it is not just a simple attribute
- * length to FSB conversion.
- */
-int
-xfs_attr3_rmt_blocks(
-       struct xfs_mount *mp,
-       int             attrlen)
-{
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
-               return (attrlen + buflen - 1) / buflen;
-       }
-       return XFS_B_TO_FSB(mp, attrlen);
-}
-
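A worked example of the calculation above, assuming 4096-byte blocks and a 56-byte struct xfs_attr3_rmt_hdr (the header size is an assumption here):

    int blksize = 4096;
    int hdr     = 56;                       /* assumed sizeof(rmt header) */
    int buflen  = blksize - hdr;            /* 4040 usable bytes per block */
    int attrlen = 65536;
    int blocks  = (attrlen + buflen - 1) / buflen;  /* 17 blocks */
    /* a plain byte-to-FSB conversion would have said 16 */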
-/*
- * Checking of the remote attribute header is split into two parts. The verifier
- * does CRC, location and bounds checking, the unpacking function checks the
- * attribute parameters and owner.
- */
-static bool
-xfs_attr3_rmt_hdr_ok(
-       void                    *ptr,
-       xfs_ino_t               ino,
-       uint32_t                offset,
-       uint32_t                size,
-       xfs_daddr_t             bno)
-{
-       struct xfs_attr3_rmt_hdr *rmt = ptr;
-
-       if (bno != be64_to_cpu(rmt->rm_blkno))
-               return false;
-       if (offset != be32_to_cpu(rmt->rm_offset))
-               return false;
-       if (size != be32_to_cpu(rmt->rm_bytes))
-               return false;
-       if (ino != be64_to_cpu(rmt->rm_owner))
-               return false;
-
-       /* ok */
-       return true;
-}
-
-static bool
-xfs_attr3_rmt_verify(
-       struct xfs_mount        *mp,
-       void                    *ptr,
-       int                     fsbsize,
-       xfs_daddr_t             bno)
-{
-       struct xfs_attr3_rmt_hdr *rmt = ptr;
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return false;
-       if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
-               return false;
-       if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
-               return false;
-       if (be64_to_cpu(rmt->rm_blkno) != bno)
-               return false;
-       if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
-               return false;
-       if (be32_to_cpu(rmt->rm_offset) +
-                               be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
-               return false;
-       if (rmt->rm_owner == 0)
-               return false;
-
-       return true;
-}
-
-static void
-xfs_attr3_rmt_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       char            *ptr;
-       int             len;
-       xfs_daddr_t     bno;
-       int             blksize = mp->m_attr_geo->blksize;
-
-       /* no verification of non-crc buffers */
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       ptr = bp->b_addr;
-       bno = bp->b_bn;
-       len = BBTOB(bp->b_length);
-       ASSERT(len >= blksize);
-
-       while (len > 0) {
-               if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
-                       xfs_buf_ioerror(bp, EFSBADCRC);
-                       break;
-               }
-               if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
-                       xfs_buf_ioerror(bp, EFSCORRUPTED);
-                       break;
-               }
-               len -= blksize;
-               ptr += blksize;
-               bno += BTOBB(blksize);
-       }
-
-       if (bp->b_error)
-               xfs_verifier_error(bp);
-       else
-               ASSERT(len == 0);
-}
-
-static void
-xfs_attr3_rmt_write_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-       char            *ptr;
-       int             len;
-       xfs_daddr_t     bno;
-       int             blksize = mp->m_attr_geo->blksize;
-
-       /* no verification of non-crc buffers */
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       ptr = bp->b_addr;
-       bno = bp->b_bn;
-       len = BBTOB(bp->b_length);
-       ASSERT(len >= blksize);
-
-       while (len > 0) {
-               if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
-                       xfs_buf_ioerror(bp, EFSCORRUPTED);
-                       xfs_verifier_error(bp);
-                       return;
-               }
-               if (bip) {
-                       struct xfs_attr3_rmt_hdr *rmt;
-
-                       rmt = (struct xfs_attr3_rmt_hdr *)ptr;
-                       rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-               }
-               xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF);
-
-               len -= blksize;
-               ptr += blksize;
-               bno += BTOBB(blksize);
-       }
-       ASSERT(len == 0);
-}
-
-const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
-       .verify_read = xfs_attr3_rmt_read_verify,
-       .verify_write = xfs_attr3_rmt_write_verify,
-};
-
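Both verifiers walk the buffer in attr-geometry block strides because one remote-value buffer can span several filesystem blocks, and the read side distinguishes a bad checksum (EFSBADCRC) from structurally bad contents (EFSCORRUPTED). The common shape, with check_one() as a hypothetical per-block check:

    static int verify_strided(char *ptr, int len, int blksize)
    {
            while (len > 0) {
                    int err = check_one(ptr, blksize); /* CRC, then fields */

                    if (err)
                            return err;     /* EFSBADCRC or EFSCORRUPTED */
                    len -= blksize;
                    ptr += blksize;
            }
            return 0;
    }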
-STATIC int
-xfs_attr3_rmt_hdr_set(
-       struct xfs_mount        *mp,
-       void                    *ptr,
-       xfs_ino_t               ino,
-       uint32_t                offset,
-       uint32_t                size,
-       xfs_daddr_t             bno)
-{
-       struct xfs_attr3_rmt_hdr *rmt = ptr;
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return 0;
-
-       rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
-       rmt->rm_offset = cpu_to_be32(offset);
-       rmt->rm_bytes = cpu_to_be32(size);
-       uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
-       rmt->rm_owner = cpu_to_be64(ino);
-       rmt->rm_blkno = cpu_to_be64(bno);
-
-       return sizeof(struct xfs_attr3_rmt_hdr);
-}
-
-/*
- * Helper functions to copy attribute data in and out of on-disk extents
- */
-STATIC int
-xfs_attr_rmtval_copyout(
-       struct xfs_mount *mp,
-       struct xfs_buf  *bp,
-       xfs_ino_t       ino,
-       int             *offset,
-       int             *valuelen,
-       __uint8_t       **dst)
-{
-       char            *src = bp->b_addr;
-       xfs_daddr_t     bno = bp->b_bn;
-       int             len = BBTOB(bp->b_length);
-       int             blksize = mp->m_attr_geo->blksize;
-
-       ASSERT(len >= blksize);
-
-       while (len > 0 && *valuelen > 0) {
-               int hdr_size = 0;
-               int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
-
-               byte_cnt = min(*valuelen, byte_cnt);
-
-               if (xfs_sb_version_hascrc(&mp->m_sb)) {
-                       if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset,
-                                                 byte_cnt, bno)) {
-                               xfs_alert(mp,
-"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/0x%x/0x%llx)",
-                                       bno, *offset, byte_cnt, ino);
-                               return EFSCORRUPTED;
-                       }
-                       hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
-               }
-
-               memcpy(*dst, src + hdr_size, byte_cnt);
-
-               /* roll buffer forwards */
-               len -= blksize;
-               src += blksize;
-               bno += BTOBB(blksize);
-
-               /* roll attribute data forwards */
-               *valuelen -= byte_cnt;
-               *dst += byte_cnt;
-               *offset += byte_cnt;
-       }
-       return 0;
-}
-
-STATIC void
-xfs_attr_rmtval_copyin(
-       struct xfs_mount *mp,
-       struct xfs_buf  *bp,
-       xfs_ino_t       ino,
-       int             *offset,
-       int             *valuelen,
-       __uint8_t       **src)
-{
-       char            *dst = bp->b_addr;
-       xfs_daddr_t     bno = bp->b_bn;
-       int             len = BBTOB(bp->b_length);
-       int             blksize = mp->m_attr_geo->blksize;
-
-       ASSERT(len >= blksize);
-
-       while (len > 0 && *valuelen > 0) {
-               int hdr_size;
-               int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
-
-               byte_cnt = min(*valuelen, byte_cnt);
-               hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
-                                                byte_cnt, bno);
-
-               memcpy(dst + hdr_size, *src, byte_cnt);
-
-               /*
-                * If this is the last block, zero the remainder of it.
-                * Check that this really is the last block, too.
-                */
-               if (byte_cnt + hdr_size < blksize) {
-                       ASSERT(*valuelen - byte_cnt == 0);
-                       ASSERT(len == blksize);
-                       memset(dst + hdr_size + byte_cnt, 0,
-                                       blksize - hdr_size - byte_cnt);
-               }
-
-               /* roll buffer forwards */
-               len -= blksize;
-               dst += blksize;
-               bno += BTOBB(blksize);
-
-               /* roll attribute data forwards */
-               *valuelen -= byte_cnt;
-               *src += byte_cnt;
-               *offset += byte_cnt;
-       }
-}
-
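The copy-in loop splits the value stream into per-block payloads, reserves room for the header on CRC filesystems, and zero-fills the tail of the final block so no stale memory reaches disk. The packing shape in isolation, with hdr_size passed in rather than derived from xfs_attr3_rmt_hdr_set():

    #include <string.h>

    static void pack_value(char *dst, int blksize, int hdr_size,
                           const char *src, int valuelen)
    {
            while (valuelen > 0) {
                    int chunk = blksize - hdr_size;

                    if (chunk > valuelen)
                            chunk = valuelen;
                    /* per-block header would be written at dst[0..hdr_size) */
                    memcpy(dst + hdr_size, src, chunk);
                    if (hdr_size + chunk < blksize) /* short last block */
                            memset(dst + hdr_size + chunk, 0,
                                   blksize - hdr_size - chunk);
                    dst += blksize;
                    src += chunk;
                    valuelen -= chunk;
            }
    }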
-/*
- * Read the value associated with an attribute from the out-of-line buffer
- * that we stored it in.
- */
-int
-xfs_attr_rmtval_get(
-       struct xfs_da_args      *args)
-{
-       struct xfs_bmbt_irec    map[ATTR_RMTVALUE_MAPSIZE];
-       struct xfs_mount        *mp = args->dp->i_mount;
-       struct xfs_buf          *bp;
-       xfs_dablk_t             lblkno = args->rmtblkno;
-       __uint8_t               *dst = args->value;
-       int                     valuelen;
-       int                     nmap;
-       int                     error;
-       int                     blkcnt = args->rmtblkcnt;
-       int                     i;
-       int                     offset = 0;
-
-       trace_xfs_attr_rmtval_get(args);
-
-       ASSERT(!(args->flags & ATTR_KERNOVAL));
-       ASSERT(args->rmtvaluelen == args->valuelen);
-
-       valuelen = args->rmtvaluelen;
-       while (valuelen > 0) {
-               nmap = ATTR_RMTVALUE_MAPSIZE;
-               error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
-                                      blkcnt, map, &nmap,
-                                      XFS_BMAPI_ATTRFORK);
-               if (error)
-                       return error;
-               ASSERT(nmap >= 1);
-
-               for (i = 0; (i < nmap) && (valuelen > 0); i++) {
-                       xfs_daddr_t     dblkno;
-                       int             dblkcnt;
-
-                       ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
-                              (map[i].br_startblock != HOLESTARTBLOCK));
-                       dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
-                       dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
-                       error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
-                                                  dblkno, dblkcnt, 0, &bp,
-                                                  &xfs_attr3_rmt_buf_ops);
-                       if (error)
-                               return error;
-
-                       error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
-                                                       &offset, &valuelen,
-                                                       &dst);
-                       xfs_buf_relse(bp);
-                       if (error)
-                               return error;
-
-                       /* roll attribute extent map forwards */
-                       lblkno += map[i].br_blockcount;
-                       blkcnt -= map[i].br_blockcount;
-               }
-       }
-       ASSERT(valuelen == 0);
-       return 0;
-}
-
-/*
- * Write the value associated with an attribute into the out-of-line buffer
- * that we have defined for it.
- */
-int
-xfs_attr_rmtval_set(
-       struct xfs_da_args      *args)
-{
-       struct xfs_inode        *dp = args->dp;
-       struct xfs_mount        *mp = dp->i_mount;
-       struct xfs_bmbt_irec    map;
-       xfs_dablk_t             lblkno;
-       xfs_fileoff_t           lfileoff = 0;
-       __uint8_t               *src = args->value;
-       int                     blkcnt;
-       int                     valuelen;
-       int                     nmap;
-       int                     error;
-       int                     offset = 0;
-
-       trace_xfs_attr_rmtval_set(args);
-
-       /*
-        * Find a "hole" in the attribute address space large enough for
- * us to drop the new attribute's value into. Because CRC-enabled
-        * attributes have headers, we can't just do a straight byte to FSB
-        * conversion and have to take the header space into account.
-        */
-       blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
-       error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
-                                                  XFS_ATTR_FORK);
-       if (error)
-               return error;
-
-       args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
-       args->rmtblkcnt = blkcnt;
-
-       /*
-        * Roll through the "value", allocating blocks on disk as required.
-        */
-       while (blkcnt > 0) {
-               int     committed;
-
-               /*
-                * Allocate a single extent, up to the size of the value.
-                */
-               xfs_bmap_init(args->flist, args->firstblock);
-               nmap = 1;
-               error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
-                                 blkcnt,
-                                 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-                                 args->firstblock, args->total, &map, &nmap,
-                                 args->flist);
-               if (!error) {
-                       error = xfs_bmap_finish(&args->trans, args->flist,
-                                               &committed);
-               }
-               if (error) {
-                       ASSERT(committed);
-                       args->trans = NULL;
-                       xfs_bmap_cancel(args->flist);
-                       return(error);
-               }
-
-               /*
-                * bmap_finish() may have committed the last trans and started
-                * a new one.  We need the inode to be in all transactions.
-                */
-               if (committed)
-                       xfs_trans_ijoin(args->trans, dp, 0);
-
-               ASSERT(nmap == 1);
-               ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
-                      (map.br_startblock != HOLESTARTBLOCK));
-               lblkno += map.br_blockcount;
-               blkcnt -= map.br_blockcount;
-
-               /*
-                * Start the next trans in the chain.
-                */
-               error = xfs_trans_roll(&args->trans, dp);
-               if (error)
-                       return (error);
-       }
-
-       /*
-        * Roll through the "value", copying the attribute value to the
-        * already-allocated blocks.  Blocks are written synchronously
-        * so that we can know they are all on disk before we turn off
-        * the INCOMPLETE flag.
-        */
-       lblkno = args->rmtblkno;
-       blkcnt = args->rmtblkcnt;
-       valuelen = args->rmtvaluelen;
-       while (valuelen > 0) {
-               struct xfs_buf  *bp;
-               xfs_daddr_t     dblkno;
-               int             dblkcnt;
-
-               ASSERT(blkcnt > 0);
-
-               xfs_bmap_init(args->flist, args->firstblock);
-               nmap = 1;
-               error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
-                                      blkcnt, &map, &nmap,
-                                      XFS_BMAPI_ATTRFORK);
-               if (error)
-                       return(error);
-               ASSERT(nmap == 1);
-               ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
-                      (map.br_startblock != HOLESTARTBLOCK));
-
-               dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
-               dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
-               bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
-               if (!bp)
-                       return ENOMEM;
-               bp->b_ops = &xfs_attr3_rmt_buf_ops;
-
-               xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
-                                      &valuelen, &src);
-
-               error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
-               xfs_buf_relse(bp);
-               if (error)
-                       return error;
-
-
-               /* roll attribute extent map forwards */
-               lblkno += map.br_blockcount;
-               blkcnt -= map.br_blockcount;
-       }
-       ASSERT(valuelen == 0);
-       return 0;
-}
-
-/*
- * Remove the value associated with an attribute by deleting the
- * out-of-line buffer that it is stored on.
- */
-int
-xfs_attr_rmtval_remove(
-       struct xfs_da_args      *args)
-{
-       struct xfs_mount        *mp = args->dp->i_mount;
-       xfs_dablk_t             lblkno;
-       int                     blkcnt;
-       int                     error;
-       int                     done;
-
-       trace_xfs_attr_rmtval_remove(args);
-
-       /*
-        * Roll through the "value", invalidating the attribute value's blocks.
-        */
-       lblkno = args->rmtblkno;
-       blkcnt = args->rmtblkcnt;
-       while (blkcnt > 0) {
-               struct xfs_bmbt_irec    map;
-               struct xfs_buf          *bp;
-               xfs_daddr_t             dblkno;
-               int                     dblkcnt;
-               int                     nmap;
-
-               /*
-                * Try to remember where we decided to put the value.
-                */
-               nmap = 1;
-               error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
-                                      blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
-               if (error)
-                       return(error);
-               ASSERT(nmap == 1);
-               ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
-                      (map.br_startblock != HOLESTARTBLOCK));
-
-               dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
-               dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
-               /*
-                * If the "remote" value is in the cache, remove it.
-                */
-               bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
-               if (bp) {
-                       xfs_buf_stale(bp);
-                       xfs_buf_relse(bp);
-                       bp = NULL;
-               }
-
-               lblkno += map.br_blockcount;
-               blkcnt -= map.br_blockcount;
-       }
-
-       /*
-        * Keep de-allocating extents until the remote-value region is gone.
-        */
-       lblkno = args->rmtblkno;
-       blkcnt = args->rmtblkcnt;
-       done = 0;
-       while (!done) {
-               int committed;
-
-               xfs_bmap_init(args->flist, args->firstblock);
-               error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
-                                   XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-                                   1, args->firstblock, args->flist,
-                                   &done);
-               if (!error) {
-                       error = xfs_bmap_finish(&args->trans, args->flist,
-                                               &committed);
-               }
-               if (error) {
-                       ASSERT(committed);
-                       args->trans = NULL;
-                       xfs_bmap_cancel(args->flist);
-                       return error;
-               }
-
-               /*
-                * bmap_finish() may have committed the last trans and started
-                * a new one.  We need the inode to be in all transactions.
-                */
-               if (committed)
-                       xfs_trans_ijoin(args->trans, args->dp, 0);
-
-               /*
-                * Close out trans and start the next one in the chain.
-                */
-               error = xfs_trans_roll(&args->trans, args->dp);
-               if (error)
-                       return error;
-       }
-       return 0;
-}
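
The routine above makes two passes over the remote value: one to invalidate any cached buffers, then a loop that unmaps as much as one transaction allows and rolls the transaction until xfs_bunmapi() reports it is done. A minimal userspace sketch of that second loop follows; unmap_some() is a made-up stand-in for xfs_bunmapi(), not a kernel API.

#include <stdio.h>

#define PER_TRANS 4	/* pretend one transaction can free 4 blocks */

/* Stand-in for xfs_bunmapi(): frees up to PER_TRANS blocks per call. */
static int unmap_some(int *blkcnt, int *done)
{
	int n = *blkcnt < PER_TRANS ? *blkcnt : PER_TRANS;

	*blkcnt -= n;
	*done = (*blkcnt == 0);
	return 0;			/* error code; always success here */
}

int main(void)
{
	int blkcnt = 10, done = 0, passes = 0;

	while (!done) {
		if (unmap_some(&blkcnt, &done))
			return 1;	/* cancel and bail, as the code does */
		passes++;		/* stands in for xfs_trans_roll() */
	}
	printf("freed in %d passes\n", passes);
	return 0;
}
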
diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h
deleted file mode 100644
index 5a9acfa..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_ATTR_REMOTE_H__
-#define        __XFS_ATTR_REMOTE_H__
-
-int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
-
-int xfs_attr_rmtval_get(struct xfs_da_args *args);
-int xfs_attr_rmtval_set(struct xfs_da_args *args);
-int xfs_attr_rmtval_remove(struct xfs_da_args *args);
-
-#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
deleted file mode 100644
index 919756e..0000000
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_ATTR_SF_H__
-#define        __XFS_ATTR_SF_H__
-
-/*
- * Attribute storage when stored inside the inode.
- *
- * Small attribute lists are packed as tightly as possible so as
- * to fit into the literal area of the inode.
- */
-
-/*
- * Entries are packed toward the top as tightly as possible.
- */
-typedef struct xfs_attr_shortform {
-       struct xfs_attr_sf_hdr {        /* constant-structure header block */
-               __be16  totsize;        /* total bytes in shortform list */
-               __u8    count;  /* count of active entries */
-       } hdr;
-       struct xfs_attr_sf_entry {
-               __uint8_t namelen;      /* actual length of name (no NULL) */
-               __uint8_t valuelen;     /* actual length of value (no NULL) */
-               __uint8_t flags;        /* flags bits (see xfs_attr_leaf.h) */
-               __uint8_t nameval[1];   /* name & value bytes concatenated */
-       } list[1];                      /* variable sized array */
-} xfs_attr_shortform_t;
-typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
-typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
-
-/*
- * We generate this then sort it, attr_list() must return things in hash-order.
- */
-typedef struct xfs_attr_sf_sort {
-       __uint8_t       entno;          /* entry number in original list */
-       __uint8_t       namelen;        /* length of name value (no null) */
-       __uint8_t       valuelen;       /* length of value */
-       __uint8_t       flags;          /* flags bits (see xfs_attr_leaf.h) */
-       xfs_dahash_t    hash;           /* this entry's hash value */
-       unsigned char   *name;          /* name value, pointer into buffer */
-} xfs_attr_sf_sort_t;
-
-#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen)  /* space name/value uses */ \
-       (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen)))
-#define XFS_ATTR_SF_ENTSIZE_MAX                        /* max space for name&value */ \
-       ((1 << (NBBY*(int)sizeof(__uint8_t))) - 1)
-#define XFS_ATTR_SF_ENTSIZE(sfep)              /* space an entry uses */ \
-       ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
-#define XFS_ATTR_SF_NEXTENTRY(sfep)            /* next entry in struct */ \
-       ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
-#define XFS_ATTR_SF_TOTSIZE(dp)                        /* total space in use */ \
-       (be16_to_cpu(((xfs_attr_shortform_t *)  \
-               ((dp)->i_afp->if_u1.if_data))->hdr.totsize))
-
-#endif /* __XFS_ATTR_SF_H__ */
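
The ENTSIZE/NEXTENTRY macros above define a walk over variable-length records packed back to back: a three-byte header followed by the name and value bytes. Below is a simplified, self-contained model of that walk; the sf_entry layout here is a userspace stand-in for illustration, not the on-disk format.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct sf_entry {		/* header of one packed entry */
	uint8_t namelen;
	uint8_t valuelen;
	uint8_t flags;
	uint8_t nameval[];	/* name bytes, then value bytes */
};

static size_t sf_entsize(const struct sf_entry *e)
{
	return sizeof(*e) + e->namelen + e->valuelen;
}

int main(void)
{
	uint8_t buf[64], *p = buf;
	const char *names[] = { "user.a", "user.bb" };
	const char *vals[]  = { "1", "22" };

	/* Pack two entries back to back, as the shortform format does. */
	for (int i = 0; i < 2; i++) {
		struct sf_entry *e = (struct sf_entry *)p;

		e->namelen = strlen(names[i]);
		e->valuelen = strlen(vals[i]);
		e->flags = 0;
		memcpy(e->nameval, names[i], e->namelen);
		memcpy(e->nameval + e->namelen, vals[i], e->valuelen);
		p += sf_entsize(e);
	}

	/* Walk them again using only the per-entry sizes. */
	for (struct sf_entry *e = (struct sf_entry *)buf; (uint8_t *)e < p;
	     e = (struct sf_entry *)((uint8_t *)e + sf_entsize(e)))
		printf("%.*s = %.*s\n",
		       e->namelen, (const char *)e->nameval,
		       e->valuelen, (const char *)e->nameval + e->namelen);
	return 0;
}
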
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
deleted file mode 100644
index e1649c0..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_BIT_H__
-#define        __XFS_BIT_H__
-
-/*
- * XFS bit manipulation routines.
- */
-
-/*
- * masks with n high/low bits set, 64-bit values
- */
-static inline __uint64_t xfs_mask64hi(int n)
-{
-       return (__uint64_t)-1 << (64 - (n));
-}
-static inline __uint32_t xfs_mask32lo(int n)
-{
-       return ((__uint32_t)1 << (n)) - 1;
-}
-static inline __uint64_t xfs_mask64lo(int n)
-{
-       return ((__uint64_t)1 << (n)) - 1;
-}
-
-/* Get high bit set out of 32-bit argument, -1 if none set */
-static inline int xfs_highbit32(__uint32_t v)
-{
-       return fls(v) - 1;
-}
-
-/* Get high bit set out of 64-bit argument, -1 if none set */
-static inline int xfs_highbit64(__uint64_t v)
-{
-       return fls64(v) - 1;
-}
-
-/* Get low bit set out of 32-bit argument, -1 if none set */
-static inline int xfs_lowbit32(__uint32_t v)
-{
-       return ffs(v) - 1;
-}
-
-/* Get low bit set out of 64-bit argument, -1 if none set */
-static inline int xfs_lowbit64(__uint64_t v)
-{
-       __uint32_t      w = (__uint32_t)v;
-       int             n = 0;
-
-       if (w) {        /* lower bits */
-               n = ffs(w);
-       } else {        /* upper bits */
-               w = (__uint32_t)(v >> 32);
-               if (w) {
-                       n = ffs(w);
-                       if (n)
-                               n += 32;
-               }
-       }
-       return n - 1;
-}
-
-/* Return whether bitmap is empty (1 == empty) */
-extern int xfs_bitmap_empty(uint *map, uint size);
-
-/* Count continuous one bits in map starting with start_bit */
-extern int xfs_contig_bits(uint *map, uint size, uint start_bit);
-
-/* Find next set bit in map */
-extern int xfs_next_bit(uint *map, uint size, uint start_bit);
-
-#endif /* __XFS_BIT_H__ */
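
These helpers are thin wrappers around fls()/ffs() index arithmetic. A standalone sketch of the same semantics, using GCC builtins in place of the kernel's fls()/ffs(); highbit32() and lowbit64() here are illustrative reimplementations, not the kernel code.

#include <stdint.h>
#include <stdio.h>

/* Index of the highest set bit, -1 if none (mirrors xfs_highbit32). */
static int highbit32(uint32_t v)
{
	return v ? 31 - __builtin_clz(v) : -1;
}

/* Index of the lowest set bit, -1 if none (mirrors xfs_lowbit64). */
static int lowbit64(uint64_t v)
{
	return v ? __builtin_ctzll(v) : -1;
}

int main(void)
{
	printf("highbit32(0x80000001) = %d\n", highbit32(0x80000001u)); /* 31 */
	printf("lowbit64(1ull << 32)  = %d\n", lowbit64(1ull << 32));   /* 32 */
	printf("lowbit64(0)           = %d\n", lowbit64(0));            /* -1 */
	return 0;
}
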
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
deleted file mode 100644
index 75c3fe5..0000000
+++ /dev/null
@@ -1,5606 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_extfree_item.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_trans_space.h"
-#include "xfs_buf_item.h"
-#include "xfs_trace.h"
-#include "xfs_symlink.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_dinode.h"
-#include "xfs_filestream.h"
-
-
-kmem_zone_t            *xfs_bmap_free_item_zone;
-
-/*
- * Miscellaneous helper functions
- */
-
-/*
- * Compute and fill in the value of the maximum depth of a bmap btree
- * in this filesystem.  Done once, during mount.
- */
-void
-xfs_bmap_compute_maxlevels(
-       xfs_mount_t     *mp,            /* file system mount structure */
-       int             whichfork)      /* data or attr fork */
-{
-       int             level;          /* btree level */
-       uint            maxblocks;      /* max blocks at this level */
-       uint            maxleafents;    /* max leaf entries possible */
-       int             maxrootrecs;    /* max records in root block */
-       int             minleafrecs;    /* min records in leaf block */
-       int             minnoderecs;    /* min records in node block */
-       int             sz;             /* root block size */
-
-       /*
-        * The maximum number of extents in a file, hence the maximum
-        * number of leaf entries, is controlled by the type of di_nextents
-        * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
-        * (a signed 16-bit number, xfs_aextnum_t).
-        *
-        * Note that we can no longer assume that if we are in ATTR1 that
-        * the fork offset of all the inodes will be
-        * (xfs_default_attroffset(ip) >> 3) because we could have mounted
-        * with ATTR2 and then mounted back with ATTR1, keeping the
-        * di_forkoff's fixed but probably at various positions. Therefore,
-        * for both ATTR1 and ATTR2 we have to assume the worst case scenario
-        * of a minimum size available.
-        */
-       if (whichfork == XFS_DATA_FORK) {
-               maxleafents = MAXEXTNUM;
-               sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
-       } else {
-               maxleafents = MAXAEXTNUM;
-               sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
-       }
-       maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
-       minleafrecs = mp->m_bmap_dmnr[0];
-       minnoderecs = mp->m_bmap_dmnr[1];
-       maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
-       for (level = 1; maxblocks > 1; level++) {
-               if (maxblocks <= maxrootrecs)
-                       maxblocks = 1;
-               else
-                       maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
-       }
-       mp->m_bm_maxlevels[whichfork] = level;
-}
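
The loop above is repeated ceiling division: the worst-case leaf-entry count is divided by the minimum per-block record count until what remains fits in the root. A userspace sketch of the same computation; the geometry numbers in main() are made up for illustration, where the real values come from the superblock.

#include <stdio.h>

static int compute_maxlevels(unsigned maxleafents, unsigned minleafrecs,
			     unsigned minnoderecs, unsigned maxrootrecs)
{
	unsigned maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
	int level;

	for (level = 1; maxblocks > 1; level++) {
		if (maxblocks <= maxrootrecs)
			maxblocks = 1;	/* fits in the inode root */
		else
			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
	}
	return level;
}

int main(void)
{
	/* e.g. 2^31-1 leaf entries, 125 min recs per block, 9 root recs */
	printf("max levels = %d\n",
	       compute_maxlevels(0x7fffffffu, 125, 125, 9));
	return 0;
}
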
-
-STATIC int                             /* error */
-xfs_bmbt_lookup_eq(
-       struct xfs_btree_cur    *cur,
-       xfs_fileoff_t           off,
-       xfs_fsblock_t           bno,
-       xfs_filblks_t           len,
-       int                     *stat)  /* success/failure */
-{
-       cur->bc_rec.b.br_startoff = off;
-       cur->bc_rec.b.br_startblock = bno;
-       cur->bc_rec.b.br_blockcount = len;
-       return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
-
-STATIC int                             /* error */
-xfs_bmbt_lookup_ge(
-       struct xfs_btree_cur    *cur,
-       xfs_fileoff_t           off,
-       xfs_fsblock_t           bno,
-       xfs_filblks_t           len,
-       int                     *stat)  /* success/failure */
-{
-       cur->bc_rec.b.br_startoff = off;
-       cur->bc_rec.b.br_startblock = bno;
-       cur->bc_rec.b.br_blockcount = len;
-       return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
-}
-
-/*
- * Check if the inode needs to be converted to btree format.
- */
-static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
-{
-       return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
-               XFS_IFORK_NEXTENTS(ip, whichfork) >
-                       XFS_IFORK_MAXEXT(ip, whichfork);
-}
-
-/*
- * Check if the inode should be converted to extent format.
- */
-static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
-{
-       return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
-               XFS_IFORK_NEXTENTS(ip, whichfork) <=
-                       XFS_IFORK_MAXEXT(ip, whichfork);
-}
-
-/*
- * Update the record referred to by cur to the value given
- * by [off, bno, len, state].
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-STATIC int
-xfs_bmbt_update(
-       struct xfs_btree_cur    *cur,
-       xfs_fileoff_t           off,
-       xfs_fsblock_t           bno,
-       xfs_filblks_t           len,
-       xfs_exntst_t            state)
-{
-       union xfs_btree_rec     rec;
-
-       xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
-       return xfs_btree_update(cur, &rec);
-}
-
-/*
- * Compute the worst-case number of indirect blocks that will be used
- * for ip's delayed extent of length "len".
- */
-STATIC xfs_filblks_t
-xfs_bmap_worst_indlen(
-       xfs_inode_t     *ip,            /* incore inode pointer */
-       xfs_filblks_t   len)            /* delayed extent length */
-{
-       int             level;          /* btree level number */
-       int             maxrecs;        /* maximum record count at this level */
-       xfs_mount_t     *mp;            /* mount structure */
-       xfs_filblks_t   rval;           /* return value */
-
-       mp = ip->i_mount;
-       maxrecs = mp->m_bmap_dmxr[0];
-       for (level = 0, rval = 0;
-            level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
-            level++) {
-               len += maxrecs - 1;
-               do_div(len, maxrecs);
-               rval += len;
-               if (len == 1)
-                       return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
-                               level - 1;
-               if (level == 0)
-                       maxrecs = mp->m_bmap_dmxr[1];
-       }
-       return rval;
-}
-
-/*
- * Calculate the default attribute fork offset for newly created inodes.
- */
-uint
-xfs_default_attroffset(
-       struct xfs_inode        *ip)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       uint                    offset;
-
-       if (mp->m_sb.sb_inodesize == 256) {
-               offset = XFS_LITINO(mp, ip->i_d.di_version) -
-                               XFS_BMDR_SPACE_CALC(MINABTPTRS);
-       } else {
-               offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
-       }
-
-       ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
-       return offset;
-}
-
-/*
- * Helper routine to reset inode di_forkoff field when switching
- * attribute fork from local to extent format - we reset it where
- * possible to make space available for inline data fork extents.
- */
-STATIC void
-xfs_bmap_forkoff_reset(
-       xfs_inode_t     *ip,
-       int             whichfork)
-{
-       if (whichfork == XFS_ATTR_FORK &&
-           ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
-           ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
-           ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
-               uint    dfl_forkoff = xfs_default_attroffset(ip) >> 3;
-
-               if (dfl_forkoff > ip->i_d.di_forkoff)
-                       ip->i_d.di_forkoff = dfl_forkoff;
-       }
-}
-
-/*
- * Debug/sanity checking code
- */
-
-STATIC int
-xfs_bmap_sanity_check(
-       struct xfs_mount        *mp,
-       struct xfs_buf          *bp,
-       int                     level)
-{
-       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
-
-       if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
-           block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
-               return 0;
-
-       if (be16_to_cpu(block->bb_level) != level ||
-           be16_to_cpu(block->bb_numrecs) == 0 ||
-           be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
-               return 0;
-
-       return 1;
-}
-
-#ifdef DEBUG
-STATIC struct xfs_buf *
-xfs_bmap_get_bp(
-       struct xfs_btree_cur    *cur,
-       xfs_fsblock_t           bno)
-{
-       struct xfs_log_item_desc *lidp;
-       int                     i;
-
-       if (!cur)
-               return NULL;
-
-       for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
-               if (!cur->bc_bufs[i])
-                       break;
-               if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
-                       return cur->bc_bufs[i];
-       }
-
-       /* Chase down all the log items to see if the bp is there */
-       list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
-               struct xfs_buf_log_item *bip;
-               bip = (struct xfs_buf_log_item *)lidp->lid_item;
-               if (bip->bli_item.li_type == XFS_LI_BUF &&
-                   XFS_BUF_ADDR(bip->bli_buf) == bno)
-                       return bip->bli_buf;
-       }
-
-       return NULL;
-}
-
-STATIC void
-xfs_check_block(
-       struct xfs_btree_block  *block,
-       xfs_mount_t             *mp,
-       int                     root,
-       short                   sz)
-{
-       int                     i, j, dmxr;
-       __be64                  *pp, *thispa;   /* pointer to block address */
-       xfs_bmbt_key_t          *prevp, *keyp;
-
-       ASSERT(be16_to_cpu(block->bb_level) > 0);
-
-       prevp = NULL;
-       for (i = 1; i <= xfs_btree_get_numrecs(block); i++) {
-               dmxr = mp->m_bmap_dmxr[0];
-               keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
-
-               if (prevp) {
-                       ASSERT(be64_to_cpu(prevp->br_startoff) <
-                              be64_to_cpu(keyp->br_startoff));
-               }
-               prevp = keyp;
-
-               /*
-                * Compare the block numbers to see if there are dups.
-                */
-               if (root)
-                       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
-               else
-                       pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
-
-               for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
-                       if (root)
-                               thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
-                       else
-                               thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
-                       if (*thispa == *pp) {
-                               xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
-                                       __func__, j, i,
-                                       (unsigned long long)be64_to_cpu(*thispa));
-                               panic("%s: ptrs are equal in node\n",
-                                       __func__);
-                       }
-               }
-       }
-}
-
-/*
- * Check that the extents for the inode ip are in the right order in all
- * btree leaves.
- */
-
-STATIC void
-xfs_bmap_check_leaf_extents(
-       xfs_btree_cur_t         *cur,   /* btree cursor or null */
-       xfs_inode_t             *ip,            /* incore inode pointer */
-       int                     whichfork)      /* data or attr fork */
-{
-       struct xfs_btree_block  *block; /* current btree block */
-       xfs_fsblock_t           bno;    /* block # of "block" */
-       xfs_buf_t               *bp;    /* buffer for "block" */
-       int                     error;  /* error return value */
-       xfs_extnum_t            i = 0, j; /* index into the extents list */
-       xfs_ifork_t             *ifp;   /* fork structure */
-       int                     level;  /* btree level, for checking */
-       xfs_mount_t             *mp;    /* file system mount structure */
-       __be64                  *pp;    /* pointer to block address */
-       xfs_bmbt_rec_t          *ep;    /* pointer to current extent */
-       xfs_bmbt_rec_t          last = {0, 0}; /* last extent in prev block */
-       xfs_bmbt_rec_t          *nextp; /* pointer to next extent */
-       int                     bp_release = 0;
-
-       if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
-               return;
-       }
-
-       bno = NULLFSBLOCK;
-       mp = ip->i_mount;
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       block = ifp->if_broot;
-       /*
-        * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
-        */
-       level = be16_to_cpu(block->bb_level);
-       ASSERT(level > 0);
-       xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
-       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
-       bno = be64_to_cpu(*pp);
-
-       ASSERT(bno != NULLDFSBNO);
-       ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
-       ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
-
-       /*
-        * Go down the tree until leaf level is reached, following the first
-        * pointer (leftmost) at each level.
-        */
-       while (level-- > 0) {
-               /* See if buf is in cur first */
-               bp_release = 0;
-               bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
-               if (!bp) {
-                       bp_release = 1;
-                       error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
-                                               XFS_BMAP_BTREE_REF,
-                                               &xfs_bmbt_buf_ops);
-                       if (error)
-                               goto error_norelse;
-               }
-               block = XFS_BUF_TO_BLOCK(bp);
-               XFS_WANT_CORRUPTED_GOTO(
-                       xfs_bmap_sanity_check(mp, bp, level),
-                       error0);
-               if (level == 0)
-                       break;
-
-               /*
-                * Check this block for basic sanity (increasing keys and
-                * no duplicate blocks).
-                */
-
-               xfs_check_block(block, mp, 0, 0);
-               pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
-               bno = be64_to_cpu(*pp);
-               XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
-               if (bp_release) {
-                       bp_release = 0;
-                       xfs_trans_brelse(NULL, bp);
-               }
-       }
-
-       /*
-        * Here with bp and block set to the leftmost leaf node in the tree.
-        */
-       i = 0;
-
-       /*
-        * Loop over all leaf nodes checking that all extents are in the right order.
-        */
-       for (;;) {
-               xfs_fsblock_t   nextbno;
-               xfs_extnum_t    num_recs;
-
-
-               num_recs = xfs_btree_get_numrecs(block);
-
-               /*
-                * Read-ahead the next leaf block, if any.
-                */
-
-               nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
-
-               /*
-                * Check all the extents to make sure they are OK.
-                * If we had a previous block, the last entry should
-                * conform with the first entry in this one.
-                */
-
-               ep = XFS_BMBT_REC_ADDR(mp, block, 1);
-               if (i) {
-                       ASSERT(xfs_bmbt_disk_get_startoff(&last) +
-                              xfs_bmbt_disk_get_blockcount(&last) <=
-                              xfs_bmbt_disk_get_startoff(ep));
-               }
-               for (j = 1; j < num_recs; j++) {
-                       nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
-                       ASSERT(xfs_bmbt_disk_get_startoff(ep) +
-                              xfs_bmbt_disk_get_blockcount(ep) <=
-                              xfs_bmbt_disk_get_startoff(nextp));
-                       ep = nextp;
-               }
-
-               last = *ep;
-               i += num_recs;
-               if (bp_release) {
-                       bp_release = 0;
-                       xfs_trans_brelse(NULL, bp);
-               }
-               bno = nextbno;
-               /*
-                * If we've reached the end, stop.
-                */
-               if (bno == NULLFSBLOCK)
-                       break;
-
-               bp_release = 0;
-               bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
-               if (!bp) {
-                       bp_release = 1;
-                       error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
-                                               XFS_BMAP_BTREE_REF,
-                                               &xfs_bmbt_buf_ops);
-                       if (error)
-                               goto error_norelse;
-               }
-               block = XFS_BUF_TO_BLOCK(bp);
-       }
-       if (bp_release) {
-               bp_release = 0;
-               xfs_trans_brelse(NULL, bp);
-       }
-       return;
-
-error0:
-       xfs_warn(mp, "%s: at error0", __func__);
-       if (bp_release)
-               xfs_trans_brelse(NULL, bp);
-error_norelse:
-       xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
-               __func__, i);
-       panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
-       return;
-}
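
The assertions above all encode one invariant: for consecutive records, startoff plus blockcount of one extent never passes the startoff of the next, so the list is sorted and non-overlapping. A compact userspace check of the same invariant, with hypothetical types.

#include <stdio.h>

struct irec { unsigned long startoff, blockcount; };

/* Return 1 if extents are sorted and non-overlapping, else 0. */
static int extents_ordered(const struct irec *r, int n)
{
	for (int i = 1; i < n; i++)
		if (r[i - 1].startoff + r[i - 1].blockcount > r[i].startoff)
			return 0;
	return 1;
}

int main(void)
{
	struct irec ok[]  = { {0, 4}, {4, 2}, {10, 1} };
	struct irec bad[] = { {0, 4}, {3, 2} };

	printf("%d %d\n", extents_ordered(ok, 3), extents_ordered(bad, 2));
	return 0;
}
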
-
-/*
- * Add bmap trace insert entries for all the contents of the extent records.
- */
-void
-xfs_bmap_trace_exlist(
-       xfs_inode_t     *ip,            /* incore inode pointer */
-       xfs_extnum_t    cnt,            /* count of entries in the list */
-       int             whichfork,      /* data or attr fork */
-       unsigned long   caller_ip)
-{
-       xfs_extnum_t    idx;            /* extent record index */
-       xfs_ifork_t     *ifp;           /* inode fork pointer */
-       int             state = 0;
-
-       if (whichfork == XFS_ATTR_FORK)
-               state |= BMAP_ATTRFORK;
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
-       for (idx = 0; idx < cnt; idx++)
-               trace_xfs_extlist(ip, idx, whichfork, caller_ip);
-}
-
-/*
- * Validate that the bmbt_irecs being returned from bmapi are valid
- * given the caller's original parameters.  Specifically check the
- * ranges of the returned irecs to ensure that they only extend beyond
- * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
- */
-STATIC void
-xfs_bmap_validate_ret(
-       xfs_fileoff_t           bno,
-       xfs_filblks_t           len,
-       int                     flags,
-       xfs_bmbt_irec_t         *mval,
-       int                     nmap,
-       int                     ret_nmap)
-{
-       int                     i;              /* index to map values */
-
-       ASSERT(ret_nmap <= nmap);
-
-       for (i = 0; i < ret_nmap; i++) {
-               ASSERT(mval[i].br_blockcount > 0);
-               if (!(flags & XFS_BMAPI_ENTIRE)) {
-                       ASSERT(mval[i].br_startoff >= bno);
-                       ASSERT(mval[i].br_blockcount <= len);
-                       ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
-                              bno + len);
-               } else {
-                       ASSERT(mval[i].br_startoff < bno + len);
-                       ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
-                              bno);
-               }
-               ASSERT(i == 0 ||
-                      mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
-                      mval[i].br_startoff);
-               ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
-                      mval[i].br_startblock != HOLESTARTBLOCK);
-               ASSERT(mval[i].br_state == XFS_EXT_NORM ||
-                      mval[i].br_state == XFS_EXT_UNWRITTEN);
-       }
-}
-
-#else
-#define xfs_bmap_check_leaf_extents(cur, ip, whichfork)                do { } while (0)
-#define        xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
-#endif /* DEBUG */
-
-/*
- * bmap free list manipulation functions
- */
-
-/*
- * Add the extent to the list of extents to be free at transaction end.
- * The list is maintained sorted (by block number).
- */
-void
-xfs_bmap_add_free(
-       xfs_fsblock_t           bno,            /* fs block number of extent */
-       xfs_filblks_t           len,            /* length of extent */
-       xfs_bmap_free_t         *flist,         /* list of extents */
-       xfs_mount_t             *mp)            /* mount point structure */
-{
-       xfs_bmap_free_item_t    *cur;           /* current (next) element */
-       xfs_bmap_free_item_t    *new;           /* new element */
-       xfs_bmap_free_item_t    *prev;          /* previous element */
-#ifdef DEBUG
-       xfs_agnumber_t          agno;
-       xfs_agblock_t           agbno;
-
-       ASSERT(bno != NULLFSBLOCK);
-       ASSERT(len > 0);
-       ASSERT(len <= MAXEXTLEN);
-       ASSERT(!isnullstartblock(bno));
-       agno = XFS_FSB_TO_AGNO(mp, bno);
-       agbno = XFS_FSB_TO_AGBNO(mp, bno);
-       ASSERT(agno < mp->m_sb.sb_agcount);
-       ASSERT(agbno < mp->m_sb.sb_agblocks);
-       ASSERT(len < mp->m_sb.sb_agblocks);
-       ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
-#endif
-       ASSERT(xfs_bmap_free_item_zone != NULL);
-       new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
-       new->xbfi_startblock = bno;
-       new->xbfi_blockcount = (xfs_extlen_t)len;
-       for (prev = NULL, cur = flist->xbf_first;
-            cur != NULL;
-            prev = cur, cur = cur->xbfi_next) {
-               if (cur->xbfi_startblock >= bno)
-                       break;
-       }
-       if (prev)
-               prev->xbfi_next = new;
-       else
-               flist->xbf_first = new;
-       new->xbfi_next = cur;
-       flist->xbf_count++;
-}
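
The insert above is a single forward scan to the first item whose start block is not smaller than the new one, followed by a splice. A self-contained sketch of the same pattern; the types are hypothetical, not the kernel's.

#include <stdio.h>
#include <stdlib.h>

struct free_item {
	unsigned long		startblock;
	struct free_item	*next;
};

/* Insert bno into the list, keeping it sorted by start block. */
static void add_free(struct free_item **head, unsigned long bno)
{
	struct free_item *new = malloc(sizeof(*new));
	struct free_item *prev = NULL, *cur;

	new->startblock = bno;
	for (cur = *head; cur; prev = cur, cur = cur->next)
		if (cur->startblock >= bno)
			break;
	if (prev)
		prev->next = new;
	else
		*head = new;
	new->next = cur;
}

int main(void)
{
	struct free_item *head = NULL, *p;

	add_free(&head, 30);
	add_free(&head, 10);
	add_free(&head, 20);
	for (p = head; p; p = p->next)
		printf("%lu\n", p->startblock);	/* 10 20 30 */
	while (head) {
		p = head;
		head = head->next;
		free(p);
	}
	return 0;
}
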
-
-/*
- * Remove the entry "free" from the free item list.  Prev points to the
- * previous entry, unless "free" is the head of the list.
- */
-void
-xfs_bmap_del_free(
-       xfs_bmap_free_t         *flist, /* free item list header */
-       xfs_bmap_free_item_t    *prev,  /* previous item on list, if any */
-       xfs_bmap_free_item_t    *free)  /* list item to be freed */
-{
-       if (prev)
-               prev->xbfi_next = free->xbfi_next;
-       else
-               flist->xbf_first = free->xbfi_next;
-       flist->xbf_count--;
-       kmem_zone_free(xfs_bmap_free_item_zone, free);
-}
-
-/*
- * Free up any items left in the list.
- */
-void
-xfs_bmap_cancel(
-       xfs_bmap_free_t         *flist) /* list of bmap_free_items */
-{
-       xfs_bmap_free_item_t    *free;  /* free list item */
-       xfs_bmap_free_item_t    *next;
-
-       if (flist->xbf_count == 0)
-               return;
-       ASSERT(flist->xbf_first != NULL);
-       for (free = flist->xbf_first; free; free = next) {
-               next = free->xbfi_next;
-               xfs_bmap_del_free(flist, NULL, free);
-       }
-       ASSERT(flist->xbf_count == 0);
-}
-
-/*
- * Inode fork format manipulation functions
- */
-
-/*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
- */
-STATIC int                             /* error */
-xfs_bmap_btree_to_extents(
-       xfs_trans_t             *tp,    /* transaction pointer */
-       xfs_inode_t             *ip,    /* incore inode pointer */
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     *logflagsp, /* inode logging flags */
-       int                     whichfork)  /* data or attr fork */
-{
-       /* REFERENCED */
-       struct xfs_btree_block  *cblock;/* child btree block */
-       xfs_fsblock_t           cbno;   /* child block number */
-       xfs_buf_t               *cbp;   /* child block's buffer */
-       int                     error;  /* error return value */
-       xfs_ifork_t             *ifp;   /* inode fork data */
-       xfs_mount_t             *mp;    /* mount point structure */
-       __be64                  *pp;    /* ptr to block address */
-       struct xfs_btree_block  *rblock;/* root btree block */
-
-       mp = ip->i_mount;
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
-       ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
-       rblock = ifp->if_broot;
-       ASSERT(be16_to_cpu(rblock->bb_level) == 1);
-       ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
-       ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
-       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
-       cbno = be64_to_cpu(*pp);
-       *logflagsp = 0;
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
-               return error;
-#endif
-       error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
-                               &xfs_bmbt_buf_ops);
-       if (error)
-               return error;
-       cblock = XFS_BUF_TO_BLOCK(cbp);
-       if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
-               return error;
-       xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
-       ip->i_d.di_nblocks--;
-       xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
-       xfs_trans_binval(tp, cbp);
-       if (cur->bc_bufs[0] == cbp)
-               cur->bc_bufs[0] = NULL;
-       xfs_iroot_realloc(ip, -1, whichfork);
-       ASSERT(ifp->if_broot == NULL);
-       ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
-       XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
-       *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
-       return 0;
-}
-
-/*
- * Convert an extents-format file into a btree-format file.
- * The new file will have a root block (in the inode) and a single child block.
- */
-STATIC int                                     /* error */
-xfs_bmap_extents_to_btree(
-       xfs_trans_t             *tp,            /* transaction pointer */
-       xfs_inode_t             *ip,            /* incore inode pointer */
-       xfs_fsblock_t           *firstblock,    /* first-block-allocated */
-       xfs_bmap_free_t         *flist,         /* blocks freed in xaction */
-       xfs_btree_cur_t         **curp,         /* cursor returned to caller */
-       int                     wasdel,         /* converting a delayed alloc */
-       int                     *logflagsp,     /* inode logging flags */
-       int                     whichfork)      /* data or attr fork */
-{
-       struct xfs_btree_block  *ablock;        /* allocated (child) bt block */
-       xfs_buf_t               *abp;           /* buffer for ablock */
-       xfs_alloc_arg_t         args;           /* allocation arguments */
-       xfs_bmbt_rec_t          *arp;           /* child record pointer */
-       struct xfs_btree_block  *block;         /* btree root block */
-       xfs_btree_cur_t         *cur;           /* bmap btree cursor */
-       xfs_bmbt_rec_host_t     *ep;            /* extent record pointer */
-       int                     error;          /* error return value */
-       xfs_extnum_t            i, cnt;         /* extent record index */
-       xfs_ifork_t             *ifp;           /* inode fork pointer */
-       xfs_bmbt_key_t          *kp;            /* root block key pointer */
-       xfs_mount_t             *mp;            /* mount structure */
-       xfs_extnum_t            nextents;       /* number of file extents */
-       xfs_bmbt_ptr_t          *pp;            /* root block address pointer */
-
-       mp = ip->i_mount;
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
-
-       /*
-        * Make space in the inode incore.
-        */
-       xfs_iroot_realloc(ip, 1, whichfork);
-       ifp->if_flags |= XFS_IFBROOT;
-
-       /*
-        * Fill in the root.
-        */
-       block = ifp->if_broot;
-       if (xfs_sb_version_hascrc(&mp->m_sb))
-               xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
-                                XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
-                                XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
-       else
-               xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
-                                XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
-                                XFS_BTREE_LONG_PTRS);
-
-       /*
-        * Need a cursor.  Can't allocate until bb_level is filled in.
-        */
-       cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
-       cur->bc_private.b.firstblock = *firstblock;
-       cur->bc_private.b.flist = flist;
-       cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
-       /*
-        * Convert to a btree with two levels, one record in root.
-        */
-       XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
-       memset(&args, 0, sizeof(args));
-       args.tp = tp;
-       args.mp = mp;
-       args.firstblock = *firstblock;
-       if (*firstblock == NULLFSBLOCK) {
-               args.type = XFS_ALLOCTYPE_START_BNO;
-               args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
-       } else if (flist->xbf_low) {
-               args.type = XFS_ALLOCTYPE_START_BNO;
-               args.fsbno = *firstblock;
-       } else {
-               args.type = XFS_ALLOCTYPE_NEAR_BNO;
-               args.fsbno = *firstblock;
-       }
-       args.minlen = args.maxlen = args.prod = 1;
-       args.wasdel = wasdel;
-       *logflagsp = 0;
-       if ((error = xfs_alloc_vextent(&args))) {
-               xfs_iroot_realloc(ip, -1, whichfork);
-               xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-               return error;
-       }
-       /*
-        * Allocation can't fail, the space was reserved.
-        */
-       ASSERT(args.fsbno != NULLFSBLOCK);
-       ASSERT(*firstblock == NULLFSBLOCK ||
-              args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
-              (flist->xbf_low &&
-               args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
-       *firstblock = cur->bc_private.b.firstblock = args.fsbno;
-       cur->bc_private.b.allocated++;
-       ip->i_d.di_nblocks++;
-       xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
-       abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
-       /*
-        * Fill in the child block.
-        */
-       abp->b_ops = &xfs_bmbt_buf_ops;
-       ablock = XFS_BUF_TO_BLOCK(abp);
-       if (xfs_sb_version_hascrc(&mp->m_sb))
-               xfs_btree_init_block_int(mp, ablock, abp->b_bn,
-                               XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
-                               XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
-       else
-               xfs_btree_init_block_int(mp, ablock, abp->b_bn,
-                               XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
-                               XFS_BTREE_LONG_PTRS);
-
-       arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       for (cnt = i = 0; i < nextents; i++) {
-               ep = xfs_iext_get_ext(ifp, i);
-               if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
-                       arp->l0 = cpu_to_be64(ep->l0);
-                       arp->l1 = cpu_to_be64(ep->l1);
-                       arp++;
-                       cnt++;
-               }
-       }
-       ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
-       xfs_btree_set_numrecs(ablock, cnt);
-
-       /*
-        * Fill in the root key and pointer.
-        */
-       kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
-       arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
-       kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
-       pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
-                                               be16_to_cpu(block->bb_level)));
-       *pp = cpu_to_be64(args.fsbno);
-
-       /*
-        * Do all this logging at the end so that
-        * the root is at the right level.
-        */
-       xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
-       xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
-       ASSERT(*curp == NULL);
-       *curp = cur;
-       *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
-       return 0;
-}
-
-/*
- * Convert a local file to an extents file.
- * This code is out of bounds for data forks of regular files,
- * since the file data needs to get logged so things will stay consistent.
- * (The bmap-level manipulations are ok, though).
- */
-void
-xfs_bmap_local_to_extents_empty(
-       struct xfs_inode        *ip,
-       int                     whichfork)
-{
-       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
-
-       ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
-       ASSERT(ifp->if_bytes == 0);
-       ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
-
-       xfs_bmap_forkoff_reset(ip, whichfork);
-       ifp->if_flags &= ~XFS_IFINLINE;
-       ifp->if_flags |= XFS_IFEXTENTS;
-       XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
-}
-
-
-STATIC int                             /* error */
-xfs_bmap_local_to_extents(
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_inode_t     *ip,            /* incore inode pointer */
-       xfs_fsblock_t   *firstblock,    /* first block allocated in xaction */
-       xfs_extlen_t    total,          /* total blocks needed by transaction */
-       int             *logflagsp,     /* inode logging flags */
-       int             whichfork,
-       void            (*init_fn)(struct xfs_trans *tp,
-                                  struct xfs_buf *bp,
-                                  struct xfs_inode *ip,
-                                  struct xfs_ifork *ifp))
-{
-       int             error = 0;
-       int             flags;          /* logging flags returned */
-       xfs_ifork_t     *ifp;           /* inode fork pointer */
-       xfs_alloc_arg_t args;           /* allocation arguments */
-       xfs_buf_t       *bp;            /* buffer for extent block */
-       xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
-
-       /*
-        * We don't want to deal with the case of keeping inode data inline yet.
-        * So converting the data fork of a regular file is invalid.
-        */
-       ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
-
-       if (!ifp->if_bytes) {
-               xfs_bmap_local_to_extents_empty(ip, whichfork);
-               flags = XFS_ILOG_CORE;
-               goto done;
-       }
-
-       flags = 0;
-       error = 0;
-       ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
-                                                               XFS_IFINLINE);
-       memset(&args, 0, sizeof(args));
-       args.tp = tp;
-       args.mp = ip->i_mount;
-       args.firstblock = *firstblock;
-       /*
-        * Allocate a block.  We know we need only one, since the
-        * file currently fits in an inode.
-        */
-       if (*firstblock == NULLFSBLOCK) {
-               args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
-               args.type = XFS_ALLOCTYPE_START_BNO;
-       } else {
-               args.fsbno = *firstblock;
-               args.type = XFS_ALLOCTYPE_NEAR_BNO;
-       }
-       args.total = total;
-       args.minlen = args.maxlen = args.prod = 1;
-       error = xfs_alloc_vextent(&args);
-       if (error)
-               goto done;
-
-       /* Can't fail, the space was reserved. */
-       ASSERT(args.fsbno != NULLFSBLOCK);
-       ASSERT(args.len == 1);
-       *firstblock = args.fsbno;
-       bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
-
-       /* initialise the block and copy the data */
-       init_fn(tp, bp, ip, ifp);
-
-       /* account for the change in fork size and log everything */
-       xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
-       xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
-       xfs_bmap_local_to_extents_empty(ip, whichfork);
-       flags |= XFS_ILOG_CORE;
-
-       xfs_iext_add(ifp, 0, 1);
-       ep = xfs_iext_get_ext(ifp, 0);
-       xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
-       trace_xfs_bmap_post_update(ip, 0,
-                       whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
-                       _THIS_IP_);
-       XFS_IFORK_NEXT_SET(ip, whichfork, 1);
-       ip->i_d.di_nblocks = 1;
-       xfs_trans_mod_dquot_byino(tp, ip,
-               XFS_TRANS_DQ_BCOUNT, 1L);
-       flags |= xfs_ilog_fext(whichfork);
-
-done:
-       *logflagsp = flags;
-       return error;
-}
-
-/*
- * Called from xfs_bmap_add_attrfork to handle btree format files.
- */
-STATIC int                                     /* error */
-xfs_bmap_add_attrfork_btree(
-       xfs_trans_t             *tp,            /* transaction pointer */
-       xfs_inode_t             *ip,            /* incore inode pointer */
-       xfs_fsblock_t           *firstblock,    /* first block allocated */
-       xfs_bmap_free_t         *flist,         /* blocks to free at commit */
-       int                     *flags)         /* inode logging flags */
-{
-       xfs_btree_cur_t         *cur;           /* btree cursor */
-       int                     error;          /* error return value */
-       xfs_mount_t             *mp;            /* file system mount struct */
-       int                     stat;           /* newroot status */
-
-       mp = ip->i_mount;
-       if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
-               *flags |= XFS_ILOG_DBROOT;
-       else {
-               cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
-               cur->bc_private.b.flist = flist;
-               cur->bc_private.b.firstblock = *firstblock;
-               if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
-                       goto error0;
-               /* must be at least one entry */
-               XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
-               if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
-                       goto error0;
-               if (stat == 0) {
-                       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-                       return XFS_ERROR(ENOSPC);
-               }
-               *firstblock = cur->bc_private.b.firstblock;
-               cur->bc_private.b.allocated = 0;
-               xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-       }
-       return 0;
-error0:
-       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-       return error;
-}
-
-/*
- * Called from xfs_bmap_add_attrfork to handle extents format files.
- */
-STATIC int                                     /* error */
-xfs_bmap_add_attrfork_extents(
-       xfs_trans_t             *tp,            /* transaction pointer */
-       xfs_inode_t             *ip,            /* incore inode pointer */
-       xfs_fsblock_t           *firstblock,    /* first block allocated */
-       xfs_bmap_free_t         *flist,         /* blocks to free at commit */
-       int                     *flags)         /* inode logging flags */
-{
-       xfs_btree_cur_t         *cur;           /* bmap btree cursor */
-       int                     error;          /* error return value */
-
-       if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
-               return 0;
-       cur = NULL;
-       error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0,
-               flags, XFS_DATA_FORK);
-       if (cur) {
-               cur->bc_private.b.allocated = 0;
-               xfs_btree_del_cursor(cur,
-                       error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
-       }
-       return error;
-}
-
-/*
- * Called from xfs_bmap_add_attrfork to handle local format files. Each
- * different data fork content type needs a different callout to do the
- * conversion. Some are basic and only require special block initialisation
- * callouts for the data formatting; others (directories) are so specialised they
- * handle everything themselves.
- *
- * XXX (dgc): investigate whether directory conversion can use the generic
- * formatting callout. It should be possible - it's just a very complex
- * formatter.
- */
-STATIC int                                     /* error */
-xfs_bmap_add_attrfork_local(
-       xfs_trans_t             *tp,            /* transaction pointer */
-       xfs_inode_t             *ip,            /* incore inode pointer */
-       xfs_fsblock_t           *firstblock,    /* first block allocated */
-       xfs_bmap_free_t         *flist,         /* blocks to free at commit */
-       int                     *flags)         /* inode logging flags */
-{
-       xfs_da_args_t           dargs;          /* args for dir/attr code */
-
-       if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
-               return 0;
-
-       if (S_ISDIR(ip->i_d.di_mode)) {
-               memset(&dargs, 0, sizeof(dargs));
-               dargs.geo = ip->i_mount->m_dir_geo;
-               dargs.dp = ip;
-               dargs.firstblock = firstblock;
-               dargs.flist = flist;
-               dargs.total = dargs.geo->fsbcount;
-               dargs.whichfork = XFS_DATA_FORK;
-               dargs.trans = tp;
-               return xfs_dir2_sf_to_block(&dargs);
-       }
-
-       if (S_ISLNK(ip->i_d.di_mode))
-               return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
-                                                flags, XFS_DATA_FORK,
-                                                xfs_symlink_local_to_remote);
-
-       /* should only be called for types that support local format data */
-       ASSERT(0);
-       return EFSCORRUPTED;
-}
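
The dispatch above picks a conversion strategy per content type: directories handle everything themselves, while symlinks pass a block-initialisation callout into the generic xfs_bmap_local_to_extents(). A toy illustration of that callout pattern, with entirely made-up names and no real allocation.

#include <stdio.h>
#include <string.h>

struct buf { char data[64]; };

typedef void (*init_fn)(struct buf *bp, const char *inline_data);

static void init_symlink(struct buf *bp, const char *d)
{
	snprintf(bp->data, sizeof(bp->data), "symlink:%s", d);
}

static void init_dir(struct buf *bp, const char *d)
{
	snprintf(bp->data, sizeof(bp->data), "dirblock:%s", d);
}

/* Generic conversion: "allocate" a block, then let the callout format it. */
static void local_to_extents(const char *inline_data, init_fn init,
			     struct buf *bp)
{
	memset(bp, 0, sizeof(*bp));	/* stands in for the block allocation */
	init(bp, inline_data);		/* type-specific formatting */
}

int main(void)
{
	struct buf b;

	local_to_extents("target", init_symlink, &b);
	printf("%s\n", b.data);
	local_to_extents("entries", init_dir, &b);
	printf("%s\n", b.data);
	return 0;
}
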
-
-/*
- * Convert inode from non-attributed to attributed.
- * Must not be in a transaction, ip must not be locked.
- */
-int                                            /* error code */
-xfs_bmap_add_attrfork(
-       xfs_inode_t             *ip,            /* incore inode pointer */
-       int                     size,           /* space new attribute needs */
-       int                     rsvd)           /* xact may use reserved blks */
-{
-       xfs_fsblock_t           firstblock;     /* 1st block/ag allocated */
-       xfs_bmap_free_t         flist;          /* freed extent records */
-       xfs_mount_t             *mp;            /* mount structure */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       int                     blks;           /* space reservation */
-       int                     version = 1;    /* superblock attr version */
-       int                     committed;      /* xaction was committed */
-       int                     logflags;       /* logging flags */
-       int                     error;          /* error return value */
-       int                     cancel_flags = 0;
-
-       ASSERT(XFS_IFORK_Q(ip) == 0);
-
-       mp = ip->i_mount;
-       ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
-       tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
-       blks = XFS_ADDAFORK_SPACE_RES(mp);
-       if (rsvd)
-               tp->t_flags |= XFS_TRANS_RESERVE;
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
-       if (error) {
-               xfs_trans_cancel(tp, 0);
-               return error;
-       }
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
-                       XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
-                       XFS_QMOPT_RES_REGBLKS);
-       if (error)
-               goto trans_cancel;
-       cancel_flags |= XFS_TRANS_ABORT;
-       if (XFS_IFORK_Q(ip))
-               goto trans_cancel;
-       if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
-               /*
-                * For inodes coming from pre-6.2 filesystems.
-                */
-               ASSERT(ip->i_d.di_aformat == 0);
-               ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
-       }
-       ASSERT(ip->i_d.di_anextents == 0);
-
-       xfs_trans_ijoin(tp, ip, 0);
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-       switch (ip->i_d.di_format) {
-       case XFS_DINODE_FMT_DEV:
-               ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
-               break;
-       case XFS_DINODE_FMT_UUID:
-               ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
-               break;
-       case XFS_DINODE_FMT_LOCAL:
-       case XFS_DINODE_FMT_EXTENTS:
-       case XFS_DINODE_FMT_BTREE:
-               ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
-               if (!ip->i_d.di_forkoff)
-                       ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
-               else if (mp->m_flags & XFS_MOUNT_ATTR2)
-                       version = 2;
-               break;
-       default:
-               ASSERT(0);
-               error = XFS_ERROR(EINVAL);
-               goto trans_cancel;
-       }
-
-       ASSERT(ip->i_afp == NULL);
-       ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
-       ip->i_afp->if_flags = XFS_IFEXTENTS;
-       logflags = 0;
-       xfs_bmap_init(&flist, &firstblock);
-       switch (ip->i_d.di_format) {
-       case XFS_DINODE_FMT_LOCAL:
-               error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
-                       &logflags);
-               break;
-       case XFS_DINODE_FMT_EXTENTS:
-               error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
-                       &flist, &logflags);
-               break;
-       case XFS_DINODE_FMT_BTREE:
-               error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
-                       &logflags);
-               break;
-       default:
-               error = 0;
-               break;
-       }
-       if (logflags)
-               xfs_trans_log_inode(tp, ip, logflags);
-       if (error)
-               goto bmap_cancel;
-       if (!xfs_sb_version_hasattr(&mp->m_sb) ||
-          (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
-               __int64_t sbfields = 0;
-
-               spin_lock(&mp->m_sb_lock);
-               if (!xfs_sb_version_hasattr(&mp->m_sb)) {
-                       xfs_sb_version_addattr(&mp->m_sb);
-                       sbfields |= XFS_SB_VERSIONNUM;
-               }
-               if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
-                       xfs_sb_version_addattr2(&mp->m_sb);
-                       sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
-               }
-               if (sbfields) {
-                       spin_unlock(&mp->m_sb_lock);
-                       xfs_mod_sb(tp, sbfields);
-               } else
-                       spin_unlock(&mp->m_sb_lock);
-       }
-
-       error = xfs_bmap_finish(&tp, &flist, &committed);
-       if (error)
-               goto bmap_cancel;
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       return error;
-
-bmap_cancel:
-       xfs_bmap_cancel(&flist);
-trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       return error;
-}
-
-/*
- * Internal and external extent tree search functions.
- */
-
-/*
- * Read in the extents to if_extents.
- * All inode fields are set up by the caller; we just traverse the btree
- * and copy the records in. If the file system cannot contain unwritten
- * extents, the records are checked to ensure no "state" flags are set.
- */
-int                                    /* error */
-xfs_bmap_read_extents(
-       xfs_trans_t             *tp,    /* transaction pointer */
-       xfs_inode_t             *ip,    /* incore inode */
-       int                     whichfork) /* data or attr fork */
-{
-       struct xfs_btree_block  *block; /* current btree block */
-       xfs_fsblock_t           bno;    /* block # of "block" */
-       xfs_buf_t               *bp;    /* buffer for "block" */
-       int                     error;  /* error return value */
-       xfs_exntfmt_t           exntf;  /* XFS_EXTFMT_NOSTATE, if checking */
-       xfs_extnum_t            i, j;   /* index into the extents list */
-       xfs_ifork_t             *ifp;   /* fork structure */
-       int                     level;  /* btree level, for checking */
-       xfs_mount_t             *mp;    /* file system mount structure */
-       __be64                  *pp;    /* pointer to block address */
-       /* REFERENCED */
-       xfs_extnum_t            room;   /* number of entries there's room for */
-
-       bno = NULLFSBLOCK;
-       mp = ip->i_mount;
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
-                                       XFS_EXTFMT_INODE(ip);
-       block = ifp->if_broot;
-       /*
-        * Root level must use XFS_BMAP_BROOT_PTR_ADDR macro to get ptr out.
-        */
-       level = be16_to_cpu(block->bb_level);
-       ASSERT(level > 0);
-       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
-       bno = be64_to_cpu(*pp);
-       ASSERT(bno != NULLDFSBNO);
-       ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
-       ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
-       /*
-        * Go down the tree until leaf level is reached, following the first
-        * pointer (leftmost) at each level.
-        */
-       while (level-- > 0) {
-               error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-                               XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
-               if (error)
-                       return error;
-               block = XFS_BUF_TO_BLOCK(bp);
-               XFS_WANT_CORRUPTED_GOTO(
-                       xfs_bmap_sanity_check(mp, bp, level),
-                       error0);
-               if (level == 0)
-                       break;
-               pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
-               bno = be64_to_cpu(*pp);
-               XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
-               xfs_trans_brelse(tp, bp);
-       }
-       /*
-        * Here with bp and block set to the leftmost leaf node in the tree.
-        */
-       room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       i = 0;
-       /*
-        * Loop over all leaf nodes.  Copy information to the extent records.
-        */
-       for (;;) {
-               xfs_bmbt_rec_t  *frp;
-               xfs_fsblock_t   nextbno;
-               xfs_extnum_t    num_recs;
-               xfs_extnum_t    start;
-
-               num_recs = xfs_btree_get_numrecs(block);
-               if (unlikely(i + num_recs > room)) {
-                       ASSERT(i + num_recs <= room);
-                       xfs_warn(ip->i_mount,
-                               "corrupt dinode %Lu, (btree extents).",
-                               (unsigned long long) ip->i_ino);
-                       XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
-                               XFS_ERRLEVEL_LOW, ip->i_mount, block);
-                       goto error0;
-               }
-               XFS_WANT_CORRUPTED_GOTO(
-                       xfs_bmap_sanity_check(mp, bp, 0),
-                       error0);
-               /*
-                * Read-ahead the next leaf block, if any.
-                */
-               nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
-               if (nextbno != NULLFSBLOCK)
-                       xfs_btree_reada_bufl(mp, nextbno, 1,
-                                            &xfs_bmbt_buf_ops);
-               /*
-                * Copy records into the extent records.
-                */
-               frp = XFS_BMBT_REC_ADDR(mp, block, 1);
-               start = i;
-               for (j = 0; j < num_recs; j++, i++, frp++) {
-                       xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
-                       trp->l0 = be64_to_cpu(frp->l0);
-                       trp->l1 = be64_to_cpu(frp->l1);
-               }
-               if (exntf == XFS_EXTFMT_NOSTATE) {
-                       /*
-                        * Check all attribute bmap btree records and
-                        * any "older" data bmap btree records for a
-                        * set bit in the "extent flag" position.
-                        */
-                       if (unlikely(xfs_check_nostate_extents(ifp,
-                                       start, num_recs))) {
-                               XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
-                                                XFS_ERRLEVEL_LOW,
-                                                ip->i_mount);
-                               goto error0;
-                       }
-               }
-               xfs_trans_brelse(tp, bp);
-               bno = nextbno;
-               /*
-                * If we've reached the end, stop.
-                */
-               if (bno == NULLFSBLOCK)
-                       break;
-               error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-                               XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
-               if (error)
-                       return error;
-               block = XFS_BUF_TO_BLOCK(bp);
-       }
-       ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
-       ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
-       XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
-       return 0;
-error0:
-       xfs_trans_brelse(tp, bp);
-       return XFS_ERROR(EFSCORRUPTED);
-}
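
xfs_bmap_read_extents above descends to the leftmost leaf and then walks the leaves left to right via bb_rightsib, copying each block's records into the in-core list and refusing to copy more records than the fork's byte size allows. A simplified, self-contained sketch of that leaf walk over a toy linked list (the types and fields here are invented stand-ins, not XFS's on-disk format; compile with -std=c99):

#include <stdio.h>

struct leaf {                           /* toy stand-in for a bmbt leaf block */
        int             numrecs;
        const long     *recs;
        struct leaf    *rightsib;       /* NULL plays the role of NULLFSBLOCK */
};

/* Copy records from all leaves into out[]; -1 if a leaf overflows 'room'. */
static int read_extents(const struct leaf *lf, long *out, int room)
{
        int i = 0;

        for (; lf != NULL; lf = lf->rightsib) {
                if (i + lf->numrecs > room)
                        return -1;      /* more records than the fork has room for */
                for (int j = 0; j < lf->numrecs; j++)
                        out[i++] = lf->recs[j];
        }
        return i;
}

int main(void)
{
        const long a[] = { 1, 2 }, b[] = { 3 };
        struct leaf l2 = { 1, b, NULL };
        struct leaf l1 = { 2, a, &l2 };
        long out[4];
        int n = read_extents(&l1, out, 4);

        printf("copied %d records, first %ld\n", n, out[0]);
        return 0;
}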
-
-
-/*
- * Search the extent records for the entry containing block bno.
- * If bno lies in a hole, point to the next entry.  If bno lies
- * past eof, *eofp will be set, and *prevp will contain the last
- * entry (null if none).  Else, *lastxp will be set to the index
- * of the found entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_host_t *           /* pointer to found extent entry */
-xfs_bmap_search_multi_extents(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_fileoff_t   bno,            /* block number searched for */
-       int             *eofp,          /* out: end of file found */
-       xfs_extnum_t    *lastxp,        /* out: last extent index */
-       xfs_bmbt_irec_t *gotp,          /* out: extent entry found */
-       xfs_bmbt_irec_t *prevp)         /* out: previous extent entry found */
-{
-       xfs_bmbt_rec_host_t *ep;                /* extent record pointer */
-       xfs_extnum_t    lastx;          /* last extent index */
-
-       /*
-        * Initialize the extent entry structure with poison values to catch
-        * access to uninitialized fields (br_startblock in particular).
-        */
-       gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
-       gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
-       gotp->br_state = XFS_EXT_INVALID;
-#if XFS_BIG_BLKNOS
-       gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
-#else
-       gotp->br_startblock = 0xffffa5a5;
-#endif
-       prevp->br_startoff = NULLFILEOFF;
-
-       ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
-       if (lastx > 0) {
-               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
-       }
-       if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
-               xfs_bmbt_get_all(ep, gotp);
-               *eofp = 0;
-       } else {
-               if (lastx > 0) {
-                       *gotp = *prevp;
-               }
-               *eofp = 1;
-               ep = NULL;
-       }
-       *lastxp = lastx;
-       return ep;
-}
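
The contract is easiest to see with concrete numbers: a bno inside a hole yields the next extent with *eofp clear, while a bno past the last extent sets *eofp and hands back the previous record. A toy version using a linear scan where the real code uses xfs_iext_bno_to_ext's lookup (types invented for illustration):

#include <stdio.h>

struct irec { long startoff, blockcount; };     /* toy extent record */

/* Find the extent containing bno, or the next one if bno is in a hole. */
static int search_extents(const struct irec *ext, int nextents, long bno,
                          int *eof, struct irec *got, struct irec *prev)
{
        int i;

        prev->startoff = -1;                    /* "no previous extent" */
        for (i = 0; i < nextents; i++) {
                if (bno < ext[i].startoff + ext[i].blockcount)
                        break;                  /* containing or next extent */
                *prev = ext[i];
        }
        *eof = (i == nextents);
        *got = *eof ? *prev : ext[i];
        return i;                               /* mirrors *lastxp */
}

int main(void)
{
        const struct irec ext[] = { { 0, 4 }, { 10, 2 } };
        struct irec got, prev;
        int eof;

        search_extents(ext, 2, 5, &eof, &got, &prev);   /* hole [4,10) */
        printf("hole: got startoff %ld, eof %d\n", got.startoff, eof);
        search_extents(ext, 2, 20, &eof, &got, &prev);  /* past eof */
        printf("eof:  got startoff %ld, eof %d\n", got.startoff, eof);
        return 0;
}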
-
-/*
- * Search the extents list for the inode, for the extent containing bno.
- * If bno lies in a hole, point to the next entry.  If bno lies past eof,
- * *eofp will be set, and *prevp will contain the last entry (null if none).
- * Else, *lastxp will be set to the index of the found
- * entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_host_t *                 /* pointer to found extent entry */
-xfs_bmap_search_extents(
-       xfs_inode_t     *ip,            /* incore inode pointer */
-       xfs_fileoff_t   bno,            /* block number searched for */
-       int             fork,           /* data or attr fork */
-       int             *eofp,          /* out: end of file found */
-       xfs_extnum_t    *lastxp,        /* out: last extent index */
-       xfs_bmbt_irec_t *gotp,          /* out: extent entry found */
-       xfs_bmbt_irec_t *prevp)         /* out: previous extent entry found */
-{
-       xfs_ifork_t     *ifp;           /* inode fork pointer */
-       xfs_bmbt_rec_host_t  *ep;            /* extent record pointer */
-
-       XFS_STATS_INC(xs_look_exlist);
-       ifp = XFS_IFORK_PTR(ip, fork);
-
-       ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
-
-       if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
-                    !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
-               xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
-                               "Access to block zero in inode %llu "
-                               "start_block: %llx start_off: %llx "
-                               "blkcnt: %llx extent-state: %x lastx: %x",
-                       (unsigned long long)ip->i_ino,
-                       (unsigned long long)gotp->br_startblock,
-                       (unsigned long long)gotp->br_startoff,
-                       (unsigned long long)gotp->br_blockcount,
-                       gotp->br_state, *lastxp);
-               *lastxp = NULLEXTNUM;
-               *eofp = 1;
-               return NULL;
-       }
-       return ep;
-}
-
-/*
- * Returns the file-relative block number of the first unused block(s)
- * in the file with at least "len" logically contiguous blocks free.
- * This is the lowest-address hole if the file has holes, else the first block
- * past the end of file.
- * Returns 0 in *first_unused if the file is currently local (in-inode).
- */
-int                                            /* error */
-xfs_bmap_first_unused(
-       xfs_trans_t     *tp,                    /* transaction pointer */
-       xfs_inode_t     *ip,                    /* incore inode */
-       xfs_extlen_t    len,                    /* size of hole to find */
-       xfs_fileoff_t   *first_unused,          /* unused block */
-       int             whichfork)              /* data or attr fork */
-{
-       int             error;                  /* error return value */
-       int             idx;                    /* extent record index */
-       xfs_ifork_t     *ifp;                   /* inode fork pointer */
-       xfs_fileoff_t   lastaddr;               /* last block number seen */
-       xfs_fileoff_t   lowest;                 /* lowest useful block */
-       xfs_fileoff_t   max;                    /* starting useful block */
-       xfs_fileoff_t   off;                    /* offset for this block */
-       xfs_extnum_t    nextents;               /* number of extent entries */
-
-       ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
-              XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
-              XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
-       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
-               *first_unused = 0;
-               return 0;
-       }
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       if (!(ifp->if_flags & XFS_IFEXTENTS) &&
-           (error = xfs_iread_extents(tp, ip, whichfork)))
-               return error;
-       lowest = *first_unused;
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
-               xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
-               off = xfs_bmbt_get_startoff(ep);
-               /*
-                * See if the hole before this extent will work.
-                */
-               if (off >= lowest + len && off - max >= len) {
-                       *first_unused = max;
-                       return 0;
-               }
-               lastaddr = off + xfs_bmbt_get_blockcount(ep);
-               max = XFS_FILEOFF_MAX(lastaddr, lowest);
-       }
-       *first_unused = max;
-       return 0;
-}
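
The scan above carries "max", the first block past everything seen so far, and accepts the hole in front of an extent only when that hole is long enough and starts at or beyond "lowest". The same logic as a standalone toy (simplified types, no transaction or fork handling; compile with -std=c99):

#include <stdio.h>

struct irec { long startoff, blockcount; };

/* Lowest offset >= lowest with at least len contiguous unmapped blocks. */
static long first_unused(const struct irec *ext, int n, long lowest, long len)
{
        long lastaddr = 0, max = lowest;

        for (int i = 0; i < n; i++) {
                long off = ext[i].startoff;

                /* Does the hole before this extent fit? */
                if (off >= lowest + len && off - max >= len)
                        return max;
                lastaddr = off + ext[i].blockcount;
                max = lastaddr > lowest ? lastaddr : lowest;
        }
        return max;     /* no hole fits: first block past the last extent */
}

int main(void)
{
        const struct irec ext[] = { { 0, 4 }, { 6, 2 }, { 20, 1 } };

        printf("%ld\n", first_unused(ext, 3, 0, 2));    /* 4: hole [4,6) */
        printf("%ld\n", first_unused(ext, 3, 0, 5));    /* 8: hole [8,20) */
        return 0;
}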
-
-/*
- * Returns the file-relative block number of the last block + 1 before
- * last_block (input value) in the file.
- * This is not based on i_size, it is based on the extent records.
- * Returns 0 for local files, as they do not have extent records.
- */
-int                                            /* error */
-xfs_bmap_last_before(
-       xfs_trans_t     *tp,                    /* transaction pointer */
-       xfs_inode_t     *ip,                    /* incore inode */
-       xfs_fileoff_t   *last_block,            /* last block */
-       int             whichfork)              /* data or attr fork */
-{
-       xfs_fileoff_t   bno;                    /* input file offset */
-       int             eof;                    /* hit end of file */
-       xfs_bmbt_rec_host_t *ep;                /* pointer to last extent */
-       int             error;                  /* error return value */
-       xfs_bmbt_irec_t got;                    /* current extent value */
-       xfs_ifork_t     *ifp;                   /* inode fork pointer */
-       xfs_extnum_t    lastx;                  /* last extent used */
-       xfs_bmbt_irec_t prev;                   /* previous extent value */
-
-       if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
-           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
-              return XFS_ERROR(EIO);
-       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
-               *last_block = 0;
-               return 0;
-       }
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       if (!(ifp->if_flags & XFS_IFEXTENTS) &&
-           (error = xfs_iread_extents(tp, ip, whichfork)))
-               return error;
-       bno = *last_block - 1;
-       ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
-               &prev);
-       if (eof || xfs_bmbt_get_startoff(ep) > bno) {
-               if (prev.br_startoff == NULLFILEOFF)
-                       *last_block = 0;
-               else
-                       *last_block = prev.br_startoff + prev.br_blockcount;
-       }
-       /*
-        * Otherwise *last_block is already the right answer.
-        */
-       return 0;
-}
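
Concretely: if the block just before the input *last_block falls in a hole or past EOF, *last_block is pulled back to the end of the preceding extent (or to 0 when there is none); if it falls inside an extent, the input value is already the answer. A toy rendering of that rule (linear scan, invented types):

#include <stdio.h>

struct irec { long startoff, blockcount; };

/* Clamp *last_block down to the end of the last extent before it. */
static void last_before(const struct irec *ext, int n, long *last_block)
{
        long bno = *last_block - 1;
        const struct irec *prev = NULL;
        int i;

        for (i = 0; i < n && ext[i].startoff + ext[i].blockcount <= bno; i++)
                prev = &ext[i];

        /* bno past eof or in a hole: fall back to the previous extent. */
        if (i == n || ext[i].startoff > bno)
                *last_block = prev ? prev->startoff + prev->blockcount : 0;
        /* Otherwise bno is inside ext[i] and *last_block is already right. */
}

int main(void)
{
        const struct irec ext[] = { { 0, 4 }, { 10, 2 } };
        long lb = 9;                    /* block 8 is in the hole [4,10) */

        last_before(ext, 2, &lb);
        printf("last_block = %ld\n", lb);       /* 4: end of extent [0,4) */
        return 0;
}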
-
-int
-xfs_bmap_last_extent(
-       struct xfs_trans        *tp,
-       struct xfs_inode        *ip,
-       int                     whichfork,
-       struct xfs_bmbt_irec    *rec,
-       int                     *is_empty)
-{
-       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
-       int                     error;
-       int                     nextents;
-
-       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-               error = xfs_iread_extents(tp, ip, whichfork);
-               if (error)
-                       return error;
-       }
-
-       nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
-       if (nextents == 0) {
-               *is_empty = 1;
-               return 0;
-       }
-
-       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
-       *is_empty = 0;
-       return 0;
-}
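
Note that the extent count is not stored separately here: it falls out of the in-core extent list's byte size, so an empty fork is exactly if_bytes == 0. A two-line illustration with a made-up 16-byte record type:

#include <stdio.h>

struct rec { unsigned long long l0, l1; };      /* 16-byte packed record */

int main(void)
{
        long if_bytes = 48;                     /* in-core extent list size */
        long nextents = if_bytes / (long)sizeof(struct rec);

        printf("nextents = %ld, empty = %d\n", nextents, nextents == 0);
        return 0;
}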
-
-/*
- * Check the last inode extent to determine whether this allocation will result
- * in blocks being allocated at the end of the file. When we allocate new data
- * blocks at the end of the file which do not start at the previous data block,
- * we will try to align the new blocks at stripe unit boundaries.
- *
- * Returns 1 in bma->aeof if the file (fork) is empty, as any new write will
- * be at or past the EOF.
- */
-STATIC int
-xfs_bmap_isaeof(
-       struct xfs_bmalloca     *bma,
-       int                     whichfork)
-{
-       struct xfs_bmbt_irec    rec;
-       int                     is_empty;
-       int                     error;
-
-       bma->aeof = 0;
-       error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
-                                    &is_empty);
-       if (error)
-               return error;
-
-       if (is_empty) {
-               bma->aeof = 1;
-               return 0;
-       }
-
-       /*
-        * Check if we are allocating at or past the last extent, or at least
-        * into the last delayed allocated extent.
-        */
-       bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
-               (bma->offset >= rec.br_startoff &&
-                isnullstartblock(rec.br_startblock));
-       return 0;
-}
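
The aeof predicate boils down to: the new offset is at or past the end of the last extent, or it lands inside that extent while the extent is still delayed-allocated. A toy version where a plain "delalloc" flag stands in for isnullstartblock():

#include <stdio.h>

struct irec { long startoff, blockcount; int delalloc; };

/* Will an allocation at 'offset' land at/past EOF (or in trailing delalloc)? */
static int is_aeof(const struct irec *last, long offset)
{
        if (last == NULL)               /* empty fork: any write is at EOF */
                return 1;
        return offset >= last->startoff + last->blockcount ||
               (offset >= last->startoff && last->delalloc);
}

int main(void)
{
        struct irec last = { 10, 4, 0 };

        printf("%d\n", is_aeof(&last, 14));     /* 1: first block past EOF */
        printf("%d\n", is_aeof(&last, 12));     /* 0: inside a real extent */
        last.delalloc = 1;
        printf("%d\n", is_aeof(&last, 12));     /* 1: inside trailing delalloc */
        return 0;
}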
-
-/*
- * Returns the file-relative block number of the first block past eof in
- * the file.  This is not based on i_size, it is based on the extent records.
- * Returns 0 for local files, as they do not have extent records.
- */
-int
-xfs_bmap_last_offset(
-       struct xfs_inode        *ip,
-       xfs_fileoff_t           *last_block,
-       int                     whichfork)
-{
-       struct xfs_bmbt_irec    rec;
-       int                     is_empty;
-       int                     error;
-
-       *last_block = 0;
-
-       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
-               return 0;
-
-       if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
-           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-              return XFS_ERROR(EIO);
-
-       error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
-       if (error || is_empty)
-               return error;
-
-       *last_block = rec.br_startoff + rec.br_blockcount;
-       return 0;
-}
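
So the returned offset is purely the end of the final extent record and can sit past (or short of) what i_size implies. For instance, with hypothetical extents [0,4) and [10,12), the last offset is 12 even though blocks 4..9 are a hole:

#include <stdio.h>

struct irec { long startoff, blockcount; };

int main(void)
{
        const struct irec last = { 10, 2 };     /* final extent [10,12) */

        printf("last offset = %ld\n", last.startoff + last.blockcount);
        return 0;
}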
-
-/*
- * Returns whether the selected fork of the inode has exactly one block.
- * For the data fork we also check that this matches the inode size,
- * implying the file's range is 0..bsize-1.
- */
-int                                    /* 1=>1 block, 0=>otherwise */
-xfs_bmap_one_block(
-       xfs_inode_t     *ip,            /* incore inode */
-       int             whichfork)      /* data or attr fork */
-{
-       xfs_bmbt_rec_host_t *ep;        /* ptr to fork's extent */
-       xfs_ifork_t     *ifp;           /* inode fork pointer */
-       int             rval;           /* return value */
-       xfs_bmbt_irec_t s;              /* internal version of extent */
-
-#ifndef DEBUG
-       if (whichfork == XFS_DATA_FORK)
-               return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
-#endif /* !DEBUG */
-       if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
-               return 0;
-       if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-               return 0;
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
-       ep = xfs_iext_get_ext(ifp, 0);
-       xfs_bmbt_get_all(ep, &s);
-       rval = s.br_startoff == 0 && s.br_blockcount == 1;
-       if (rval && whichfork == XFS_DATA_FORK)
-               ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
-       return rval;
-}
-
-/*
- * Extent tree manipulation functions used during allocation.
- */
-
-/*
- * Convert a delayed allocation to a real allocation.
- */
-STATIC int                             /* error */
-xfs_bmap_add_extent_delay_real(
-       struct xfs_bmalloca     *bma)
-{
-       struct xfs_bmbt_irec    *new = &bma->got;
-       int                     diff;   /* temp value */
-       xfs_bmbt_rec_host_t     *ep;    /* extent entry for idx */
-       int                     error;  /* error return value */
-       int                     i;      /* temp state */
-       xfs_ifork_t             *ifp;   /* inode fork pointer */
-       xfs_fileoff_t           new_endoff;     /* end offset of new entry */
-       xfs_bmbt_irec_t         r[3];   /* neighbor extent entries */
-                                       /* left is 0, right is 1, prev is 2 */
-       int                     rval=0; /* return value (logging flags) */
-       int                     state = 0;/* state bits, accessed thru macros */
-       xfs_filblks_t           da_new; /* new count del alloc blocks used */
-       xfs_filblks_t           da_old; /* old count del alloc blocks used */
-       xfs_filblks_t           temp=0; /* value for da_new calculations */
-       xfs_filblks_t           temp2=0;/* value for da_new calculations */
-       int                     tmp_rval;       /* partial logging flags */
-
-       ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
-
-       ASSERT(bma->idx >= 0);
-       ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
-       ASSERT(!isnullstartblock(new->br_startblock));
-       ASSERT(!bma->cur ||
-              (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
-
-       XFS_STATS_INC(xs_add_exlist);
-
-#define        LEFT            r[0]
-#define        RIGHT           r[1]
-#define        PREV            r[2]
-
-       /*
-        * Set up a bunch of variables to make the tests simpler.
-        */
-       ep = xfs_iext_get_ext(ifp, bma->idx);
-       xfs_bmbt_get_all(ep, &PREV);
-       new_endoff = new->br_startoff + new->br_blockcount;
-       ASSERT(PREV.br_startoff <= new->br_startoff);
-       ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
-
-       da_old = startblockval(PREV.br_startblock);
-       da_new = 0;
-
-       /*
-        * Set flags determining what part of the previous delayed allocation
-        * extent is being replaced by a real allocation.
-        */
-       if (PREV.br_startoff == new->br_startoff)
-               state |= BMAP_LEFT_FILLING;
-       if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
-               state |= BMAP_RIGHT_FILLING;
-
-       /*
-        * Check and set flags if this segment has a left neighbor.
-        * Don't set contiguous if the combined extent would be too large.
-        */
-       if (bma->idx > 0) {
-               state |= BMAP_LEFT_VALID;
-               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT);
-
-               if (isnullstartblock(LEFT.br_startblock))
-                       state |= BMAP_LEFT_DELAY;
-       }
-
-       if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
-           LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
-           LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
-           LEFT.br_state == new->br_state &&
-           LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
-               state |= BMAP_LEFT_CONTIG;
-
-       /*
-        * Check and set flags if this segment has a right neighbor.
-        * Don't set contiguous if the combined extent would be too large.
-        * Also check for all-three-contiguous being too large.
-        */
-       if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
-               state |= BMAP_RIGHT_VALID;
-               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
-
-               if (isnullstartblock(RIGHT.br_startblock))
-                       state |= BMAP_RIGHT_DELAY;
-       }
-
-       if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
-           new_endoff == RIGHT.br_startoff &&
-           new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
-           new->br_state == RIGHT.br_state &&
-           new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
-           ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
-                      BMAP_RIGHT_FILLING)) !=
-                     (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
-                      BMAP_RIGHT_FILLING) ||
-            LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
-                       <= MAXEXTLEN))
-               state |= BMAP_RIGHT_CONTIG;
-
-       error = 0;
-       /*
-        * Switch out based on the FILLING and CONTIG state bits.
-        */
-       switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
-                        BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
-       case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
-            BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
-               /*
-                * Filling in all of a previously delayed allocation extent.
-                * The left and right neighbors are both contiguous with new.
-                */
-               bma->idx--;
-               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
-                       LEFT.br_blockcount + PREV.br_blockcount +
-                       RIGHT.br_blockcount);
-               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-               xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
-               bma->ip->i_d.di_nextents--;
-               if (bma->cur == NULL)
-                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-               else {
-                       rval = XFS_ILOG_CORE;
-                       error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
-                                       RIGHT.br_startblock,
-                                       RIGHT.br_blockcount, &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       error = xfs_btree_delete(bma->cur, &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       error = xfs_btree_decrement(bma->cur, 0, &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
-                                       LEFT.br_startblock,
-                                       LEFT.br_blockcount +
-                                       PREV.br_blockcount +
-                                       RIGHT.br_blockcount, LEFT.br_state);
-                       if (error)
-                               goto done;
-               }
-               break;
-
-       case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
-               /*
-                * Filling in all of a previously delayed allocation extent.
-                * The left neighbor is contiguous, the right is not.
-                */
-               bma->idx--;
-
-               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
-                       LEFT.br_blockcount + PREV.br_blockcount);
-               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-               xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
-               if (bma->cur == NULL)
-                       rval = XFS_ILOG_DEXT;
-               else {
-                       rval = 0;
-                       error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
-                                       LEFT.br_startblock, LEFT.br_blockcount,
-                                       &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
-                                       LEFT.br_startblock,
-                                       LEFT.br_blockcount +
-                                       PREV.br_blockcount, LEFT.br_state);
-                       if (error)
-                               goto done;
-               }
-               break;
-
-       case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
-               /*
-                * Filling in all of a previously delayed allocation extent.
-                * The right neighbor is contiguous, the left is not.
-                */
-               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-               xfs_bmbt_set_startblock(ep, new->br_startblock);
-               xfs_bmbt_set_blockcount(ep,
-                       PREV.br_blockcount + RIGHT.br_blockcount);
-               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-               xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
-               if (bma->cur == NULL)
-                       rval = XFS_ILOG_DEXT;
-               else {
-                       rval = 0;
-                       error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
-                                       RIGHT.br_startblock,
-                                       RIGHT.br_blockcount, &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
-                                       new->br_startblock,
-                                       PREV.br_blockcount +
-                                       RIGHT.br_blockcount, PREV.br_state);
-                       if (error)
-                               goto done;
-               }
-               break;
-
-       case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
-               /*
-                * Filling in all of a previously delayed allocation extent.
-                * Neither the left nor right neighbors are contiguous with
-                * the new one.
-                */
-               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-               xfs_bmbt_set_startblock(ep, new->br_startblock);
-               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-               bma->ip->i_d.di_nextents++;
-               if (bma->cur == NULL)
-                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-               else {
-                       rval = XFS_ILOG_CORE;
-                       error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
-                                       new->br_startblock, new->br_blockcount,
-                                       &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-                       bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
-                       error = xfs_btree_insert(bma->cur, &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-               }
-               break;
-
-       case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
-               /*
-                * Filling in the first part of a previous delayed allocation.
-                * The left neighbor is contiguous.
-                */
-               trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1),
-                       LEFT.br_blockcount + new->br_blockcount);
-               xfs_bmbt_set_startoff(ep,
-                       PREV.br_startoff + new->br_blockcount);
-               trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
-
-               temp = PREV.br_blockcount - new->br_blockcount;
-               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(ep, temp);
-               if (bma->cur == NULL)
-                       rval = XFS_ILOG_DEXT;
-               else {
-                       rval = 0;
-                       error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
-                                       LEFT.br_startblock, LEFT.br_blockcount,
-                                       &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
-                                       LEFT.br_startblock,
-                                       LEFT.br_blockcount +
-                                       new->br_blockcount,
-                                       LEFT.br_state);
-                       if (error)
-                               goto done;
-               }
-               da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
-                       startblockval(PREV.br_startblock));
-               xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
-               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-               bma->idx--;
-               break;
-
-       case BMAP_LEFT_FILLING:
-               /*
-                * Filling in the first part of a previous delayed allocation.
-                * The left neighbor is not contiguous.
-                */
-               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-               xfs_bmbt_set_startoff(ep, new_endoff);
-               temp = PREV.br_blockcount - new->br_blockcount;
-               xfs_bmbt_set_blockcount(ep, temp);
-               xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
-               bma->ip->i_d.di_nextents++;
-               if (bma->cur == NULL)
-                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-               else {
-                       rval = XFS_ILOG_CORE;
-                       error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
-                                       new->br_startblock, new->br_blockcount,
-                                       &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-                       bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
-                       error = xfs_btree_insert(bma->cur, &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-               }
-
-               if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
-                       error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-                                       bma->firstblock, bma->flist,
-                                       &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
-                       rval |= tmp_rval;
-                       if (error)
-                               goto done;
-               }
-               da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
-                       startblockval(PREV.br_startblock) -
-                       (bma->cur ? bma->cur->bc_private.b.allocated : 0));
-               ep = xfs_iext_get_ext(ifp, bma->idx + 1);
-               xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
-               trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
-               break;
-
-       case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
-               /*
-                * Filling in the last part of a previous delayed allocation.
-                * The right neighbor is contiguous with the new allocation.
-                */
-               temp = PREV.br_blockcount - new->br_blockcount;
-               trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(ep, temp);
-               xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1),
-                       new->br_startoff, new->br_startblock,
-                       new->br_blockcount + RIGHT.br_blockcount,
-                       RIGHT.br_state);
-               trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
-               if (bma->cur == NULL)
-                       rval = XFS_ILOG_DEXT;
-               else {
-                       rval = 0;
-                       error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
-                                       RIGHT.br_startblock,
-                                       RIGHT.br_blockcount, &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       error = xfs_bmbt_update(bma->cur, new->br_startoff,
-                                       new->br_startblock,
-                                       new->br_blockcount +
-                                       RIGHT.br_blockcount,
-                                       RIGHT.br_state);
-                       if (error)
-                               goto done;
-               }
-
-               da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
-                       startblockval(PREV.br_startblock));
-               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-               xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
-               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-               bma->idx++;
-               break;
-
-       case BMAP_RIGHT_FILLING:
-               /*
-                * Filling in the last part of a previous delayed allocation.
-                * The right neighbor is not contiguous.
-                */
-               temp = PREV.br_blockcount - new->br_blockcount;
-               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(ep, temp);
-               xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
-               bma->ip->i_d.di_nextents++;
-               if (bma->cur == NULL)
-                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-               else {
-                       rval = XFS_ILOG_CORE;
-                       error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
-                                       new->br_startblock, new->br_blockcount,
-                                       &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-                       bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
-                       error = xfs_btree_insert(bma->cur, &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-               }
-
-               if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
-                       error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-                               bma->firstblock, bma->flist, &bma->cur, 1,
-                               &tmp_rval, XFS_DATA_FORK);
-                       rval |= tmp_rval;
-                       if (error)
-                               goto done;
-               }
-               da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
-                       startblockval(PREV.br_startblock) -
-                       (bma->cur ? bma->cur->bc_private.b.allocated : 0));
-               ep = xfs_iext_get_ext(ifp, bma->idx);
-               xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
-               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-               bma->idx++;
-               break;
-
-       case 0:
-               /*
-                * Filling in the middle part of a previous delayed allocation.
-                * Contiguity is impossible here.
-                * This case is avoided almost all the time.
-                *
-                * We start with a delayed allocation:
-                *
-                * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
-                *  PREV @ idx
-                *
-                * and we are allocating:
-                *                     +rrrrrrrrrrrrrrrrr+
-                *                            new
-                *
-                * and we set it up for insertion as:
-                * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
-                *                            new
-                *  PREV @ idx          LEFT              RIGHT
-                *                      inserted at idx + 1
-                */
-               temp = new->br_startoff - PREV.br_startoff;
-               temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
-               trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_);
-               xfs_bmbt_set_blockcount(ep, temp);      /* truncate PREV */
-               LEFT = *new;
-               RIGHT.br_state = PREV.br_state;
-               RIGHT.br_startblock = nullstartblock(
-                               (int)xfs_bmap_worst_indlen(bma->ip, temp2));
-               RIGHT.br_startoff = new_endoff;
-               RIGHT.br_blockcount = temp2;
-               /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
-               xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
-               bma->ip->i_d.di_nextents++;
-               if (bma->cur == NULL)
-                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-               else {
-                       rval = XFS_ILOG_CORE;
-                       error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
-                                       new->br_startblock, new->br_blockcount,
-                                       &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-                       bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
-                       error = xfs_btree_insert(bma->cur, &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-               }
-
-               if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
-                       error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-                                       bma->firstblock, bma->flist, &bma->cur,
-                                       1, &tmp_rval, XFS_DATA_FORK);
-                       rval |= tmp_rval;
-                       if (error)
-                               goto done;
-               }
-               temp = xfs_bmap_worst_indlen(bma->ip, temp);
-               temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
-               diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
-                       (bma->cur ? bma->cur->bc_private.b.allocated : 0));
-               if (diff > 0) {
-                       error = xfs_icsb_modify_counters(bma->ip->i_mount,
-                                       XFS_SBS_FDBLOCKS,
-                                       -((int64_t)diff), 0);
-                       ASSERT(!error);
-                       if (error)
-                               goto done;
-               }
-
-               ep = xfs_iext_get_ext(ifp, bma->idx);
-               xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-               trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
-               xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2),
-                       nullstartblock((int)temp2));
-               trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
-
-               bma->idx++;
-               da_new = temp + temp2;
-               break;
-
-       case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-       case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-       case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
-       case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
-       case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-       case BMAP_LEFT_CONTIG:
-       case BMAP_RIGHT_CONTIG:
-               /*
-                * These cases are all impossible.
-                */
-               ASSERT(0);
-       }
-
-       /* convert to a btree if necessary */
-       if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
-               int     tmp_logflags;   /* partial log flag return val */
-
-               ASSERT(bma->cur == NULL);
-               error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-                               bma->firstblock, bma->flist, &bma->cur,
-                               da_old > 0, &tmp_logflags, XFS_DATA_FORK);
-               bma->logflags |= tmp_logflags;
-               if (error)
-                       goto done;
-       }
-
-       /* adjust for changes in reserved delayed indirect blocks */
-       if (da_old || da_new) {
-               temp = da_new;
-               if (bma->cur)
-                       temp += bma->cur->bc_private.b.allocated;
-               ASSERT(temp <= da_old);
-               if (temp < da_old)
-                       xfs_icsb_modify_counters(bma->ip->i_mount,
-                                       XFS_SBS_FDBLOCKS,
-                                       (int64_t)(da_old - temp), 0);
-       }
-
-       /* clear out the allocated field, done with it now in any case. */
-       if (bma->cur)
-               bma->cur->bc_private.b.allocated = 0;
-
-       xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
-done:
-       bma->logflags |= rval;
-       return error;
-#undef LEFT
-#undef RIGHT
-#undef PREV
-}
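
Both this function and the unwritten-extent converter that follows dispatch through one switch on the same four state bits, one case per merge shape, with the combinations that cannot occur (contiguity on a side that is not being filled) funneled into an assertion. A compact standalone sketch of that dispatch style, with invented flag names and the case bodies reduced to descriptions:

#include <assert.h>
#include <stdio.h>

#define LEFT_FILLING   (1 << 0)  /* new extent starts where PREV starts */
#define RIGHT_FILLING  (1 << 1)  /* new extent ends where PREV ends */
#define LEFT_CONTIG    (1 << 2)  /* mergeable with the left neighbor */
#define RIGHT_CONTIG   (1 << 3)  /* mergeable with the right neighbor */

static const char *classify(int state)
{
        switch (state & (LEFT_FILLING | RIGHT_FILLING |
                         LEFT_CONTIG | RIGHT_CONTIG)) {
        case LEFT_FILLING | RIGHT_FILLING | LEFT_CONTIG | RIGHT_CONTIG:
                return "replace PREV, merging left and right neighbors";
        case LEFT_FILLING | RIGHT_FILLING | LEFT_CONTIG:
                return "replace PREV, merging into the left neighbor";
        case LEFT_FILLING | RIGHT_FILLING | RIGHT_CONTIG:
                return "replace PREV, merging into the right neighbor";
        case LEFT_FILLING | RIGHT_FILLING:
                return "replace PREV in place";
        case LEFT_FILLING | LEFT_CONTIG:
                return "trim PREV's front, growing the left neighbor";
        case LEFT_FILLING:
                return "trim PREV's front, inserting the new extent";
        case RIGHT_FILLING | RIGHT_CONTIG:
                return "trim PREV's back, growing the right neighbor";
        case RIGHT_FILLING:
                return "trim PREV's back, inserting the new extent";
        case 0:
                return "split PREV into three extents";
        default:
                assert(0 && "contiguous on a side that is not being filled");
                return NULL;
        }
}

int main(void)
{
        printf("%s\n", classify(LEFT_FILLING | LEFT_CONTIG));
        printf("%s\n", classify(0));
        return 0;
}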
-
-/*
- * Convert an unwritten allocation to a real allocation or vice versa.
- */
-STATIC int                             /* error */
-xfs_bmap_add_extent_unwritten_real(
-       struct xfs_trans        *tp,
-       xfs_inode_t             *ip,    /* incore inode pointer */
-       xfs_extnum_t            *idx,   /* extent number to update/insert */
-       xfs_btree_cur_t         **curp, /* if *curp is null, not a btree */
-       xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
-       xfs_fsblock_t           *first, /* pointer to firstblock variable */
-       xfs_bmap_free_t         *flist, /* list of extents to be freed */
-       int                     *logflagsp) /* inode logging flags */
-{
-       xfs_btree_cur_t         *cur;   /* btree cursor */
-       xfs_bmbt_rec_host_t     *ep;    /* extent entry for idx */
-       int                     error;  /* error return value */
-       int                     i;      /* temp state */
-       xfs_ifork_t             *ifp;   /* inode fork pointer */
-       xfs_fileoff_t           new_endoff;     /* end offset of new entry */
-       xfs_exntst_t            newext; /* new extent state */
-       xfs_exntst_t            oldext; /* old extent state */
-       xfs_bmbt_irec_t         r[3];   /* neighbor extent entries */
-                                       /* left is 0, right is 1, prev is 2 */
-       int                     rval=0; /* return value (logging flags) */
-       int                     state = 0;/* state bits, accessed thru macros */
-
-       *logflagsp = 0;
-
-       cur = *curp;
-       ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-
-       ASSERT(*idx >= 0);
-       ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
-       ASSERT(!isnullstartblock(new->br_startblock));
-
-       XFS_STATS_INC(xs_add_exlist);
-
-#define        LEFT            r[0]
-#define        RIGHT           r[1]
-#define        PREV            r[2]
-
-       /*
-        * Set up a bunch of variables to make the tests simpler.
-        */
-       error = 0;
-       ep = xfs_iext_get_ext(ifp, *idx);
-       xfs_bmbt_get_all(ep, &PREV);
-       newext = new->br_state;
-       oldext = (newext == XFS_EXT_UNWRITTEN) ?
-               XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
-       ASSERT(PREV.br_state == oldext);
-       new_endoff = new->br_startoff + new->br_blockcount;
-       ASSERT(PREV.br_startoff <= new->br_startoff);
-       ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
-
-       /*
-        * Set flags determining what part of the previous oldext allocation
-        * extent is being replaced by a newext allocation.
-        */
-       if (PREV.br_startoff == new->br_startoff)
-               state |= BMAP_LEFT_FILLING;
-       if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
-               state |= BMAP_RIGHT_FILLING;
-
-       /*
-        * Check and set flags if this segment has a left neighbor.
-        * Don't set contiguous if the combined extent would be too large.
-        */
-       if (*idx > 0) {
-               state |= BMAP_LEFT_VALID;
-               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
-
-               if (isnullstartblock(LEFT.br_startblock))
-                       state |= BMAP_LEFT_DELAY;
-       }
-
-       if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
-           LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
-           LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
-           LEFT.br_state == newext &&
-           LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
-               state |= BMAP_LEFT_CONTIG;
-
-       /*
-        * Check and set flags if this segment has a right neighbor.
-        * Don't set contiguous if the combined extent would be too large.
-        * Also check for all-three-contiguous being too large.
-        */
-       if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
-               state |= BMAP_RIGHT_VALID;
-               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
-               if (isnullstartblock(RIGHT.br_startblock))
-                       state |= BMAP_RIGHT_DELAY;
-       }
-
-       if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
-           new_endoff == RIGHT.br_startoff &&
-           new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
-           newext == RIGHT.br_state &&
-           new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
-           ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
-                      BMAP_RIGHT_FILLING)) !=
-                     (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
-                      BMAP_RIGHT_FILLING) ||
-            LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
-                       <= MAXEXTLEN))
-               state |= BMAP_RIGHT_CONTIG;
-
-       /*
-        * Switch out based on the FILLING and CONTIG state bits.
-        */
-       switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
-                        BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
-       case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
-            BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
-               /*
-                * Setting all of a previous oldext extent to newext.
-                * The left and right neighbors are both contiguous with new.
-                */
-               --*idx;
-
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
-                       LEFT.br_blockcount + PREV.br_blockcount +
-                       RIGHT.br_blockcount);
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-               xfs_iext_remove(ip, *idx + 1, 2, state);
-               ip->i_d.di_nextents -= 2;
-               if (cur == NULL)
-                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-               else {
-                       rval = XFS_ILOG_CORE;
-                       if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
-                                       RIGHT.br_startblock,
-                                       RIGHT.br_blockcount, &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_btree_delete(cur, &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_btree_decrement(cur, 0, &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_btree_delete(cur, &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_btree_decrement(cur, 0, &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
-                               LEFT.br_startblock,
-                               LEFT.br_blockcount + PREV.br_blockcount +
-                               RIGHT.br_blockcount, LEFT.br_state)))
-                               goto done;
-               }
-               break;
-
-       case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
-               /*
-                * Setting all of a previous oldext extent to newext.
-                * The left neighbor is contiguous, the right is not.
-                */
-               --*idx;
-
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
-                       LEFT.br_blockcount + PREV.br_blockcount);
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-               xfs_iext_remove(ip, *idx + 1, 1, state);
-               ip->i_d.di_nextents--;
-               if (cur == NULL)
-                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-               else {
-                       rval = XFS_ILOG_CORE;
-                       if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
-                                       PREV.br_startblock, PREV.br_blockcount,
-                                       &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_btree_delete(cur, &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_btree_decrement(cur, 0, &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
-                               LEFT.br_startblock,
-                               LEFT.br_blockcount + PREV.br_blockcount,
-                               LEFT.br_state)))
-                               goto done;
-               }
-               break;
-
-       case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
-               /*
-                * Setting all of a previous oldext extent to newext.
-                * The right neighbor is contiguous, the left is not.
-                */
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(ep,
-                       PREV.br_blockcount + RIGHT.br_blockcount);
-               xfs_bmbt_set_state(ep, newext);
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-               xfs_iext_remove(ip, *idx + 1, 1, state);
-               ip->i_d.di_nextents--;
-               if (cur == NULL)
-                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-               else {
-                       rval = XFS_ILOG_CORE;
-                       if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
-                                       RIGHT.br_startblock,
-                                       RIGHT.br_blockcount, &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_btree_delete(cur, &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_btree_decrement(cur, 0, &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_update(cur, new->br_startoff,
-                               new->br_startblock,
-                               new->br_blockcount + RIGHT.br_blockcount,
-                               newext)))
-                               goto done;
-               }
-               break;
-
-       case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
-               /*
-                * Setting all of a previous oldext extent to newext.
-                * Neither the left nor right neighbors are contiguous with
-                * the new one.
-                */
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               xfs_bmbt_set_state(ep, newext);
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-               if (cur == NULL)
-                       rval = XFS_ILOG_DEXT;
-               else {
-                       rval = 0;
-                       if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
-                                       new->br_startblock, new->br_blockcount,
-                                       &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_update(cur, new->br_startoff,
-                               new->br_startblock, new->br_blockcount,
-                               newext)))
-                               goto done;
-               }
-               break;
-
-       case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
-               /*
-                * Setting the first part of a previous oldext extent to newext.
-                * The left neighbor is contiguous.
-                */
-               trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
-                       LEFT.br_blockcount + new->br_blockcount);
-               xfs_bmbt_set_startoff(ep,
-                       PREV.br_startoff + new->br_blockcount);
-               trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
-
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               xfs_bmbt_set_startblock(ep,
-                       new->br_startblock + new->br_blockcount);
-               xfs_bmbt_set_blockcount(ep,
-                       PREV.br_blockcount - new->br_blockcount);
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-               --*idx;
-
-               if (cur == NULL)
-                       rval = XFS_ILOG_DEXT;
-               else {
-                       rval = 0;
-                       if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
-                                       PREV.br_startblock, PREV.br_blockcount,
-                                       &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_update(cur,
-                               PREV.br_startoff + new->br_blockcount,
-                               PREV.br_startblock + new->br_blockcount,
-                               PREV.br_blockcount - new->br_blockcount,
-                               oldext)))
-                               goto done;
-                       if ((error = xfs_btree_decrement(cur, 0, &i)))
-                               goto done;
-                       error = xfs_bmbt_update(cur, LEFT.br_startoff,
-                               LEFT.br_startblock,
-                               LEFT.br_blockcount + new->br_blockcount,
-                               LEFT.br_state);
-                       if (error)
-                               goto done;
-               }
-               break;
-
-       case BMAP_LEFT_FILLING:
-               /*
-                * Setting the first part of a previous oldext extent to newext.
-                * The left neighbor is not contiguous.
-                */
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
-               xfs_bmbt_set_startoff(ep, new_endoff);
-               xfs_bmbt_set_blockcount(ep,
-                       PREV.br_blockcount - new->br_blockcount);
-               xfs_bmbt_set_startblock(ep,
-                       new->br_startblock + new->br_blockcount);
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-               xfs_iext_insert(ip, *idx, 1, new, state);
-               ip->i_d.di_nextents++;
-               if (cur == NULL)
-                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-               else {
-                       rval = XFS_ILOG_CORE;
-                       if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
-                                       PREV.br_startblock, PREV.br_blockcount,
-                                       &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_update(cur,
-                               PREV.br_startoff + new->br_blockcount,
-                               PREV.br_startblock + new->br_blockcount,
-                               PREV.br_blockcount - new->br_blockcount,
-                               oldext)))
-                               goto done;
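-                       /*
-                        * xfs_btree_insert() takes the record to insert from
-                        * bc_rec.b, so seed it with the new extent first.
-                        */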
-                       cur->bc_rec.b = *new;
-                       if ((error = xfs_btree_insert(cur, &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-               }
-               break;
-
-       case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
-               /*
-                * Setting the last part of a previous oldext extent to newext.
-                * The right neighbor is contiguous with the new allocation.
-                */
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(ep,
-                       PREV.br_blockcount - new->br_blockcount);
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-               ++*idx;
-
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
-                       new->br_startoff, new->br_startblock,
-                       new->br_blockcount + RIGHT.br_blockcount, newext);
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-               if (cur == NULL)
-                       rval = XFS_ILOG_DEXT;
-               else {
-                       rval = 0;
-                       if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
-                                       PREV.br_startblock,
-                                       PREV.br_blockcount, &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
-                               PREV.br_startblock,
-                               PREV.br_blockcount - new->br_blockcount,
-                               oldext)))
-                               goto done;
-                       if ((error = xfs_btree_increment(cur, 0, &i)))
-                               goto done;
-                       if ((error = xfs_bmbt_update(cur, new->br_startoff,
-                               new->br_startblock,
-                               new->br_blockcount + RIGHT.br_blockcount,
-                               newext)))
-                               goto done;
-               }
-               break;
-
-       case BMAP_RIGHT_FILLING:
-               /*
-                * Setting the last part of a previous oldext extent to newext.
-                * The right neighbor is not contiguous.
-                */
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(ep,
-                       PREV.br_blockcount - new->br_blockcount);
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-               ++*idx;
-               xfs_iext_insert(ip, *idx, 1, new, state);
-
-               ip->i_d.di_nextents++;
-               if (cur == NULL)
-                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-               else {
-                       rval = XFS_ILOG_CORE;
-                       if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
-                                       PREV.br_startblock, PREV.br_blockcount,
-                                       &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
-                               PREV.br_startblock,
-                               PREV.br_blockcount - new->br_blockcount,
-                               oldext)))
-                               goto done;
-                       if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
-                                       new->br_startblock, new->br_blockcount,
-                                       &i)))
-                               goto done;
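-                       /*
-                        * The new extent is not in the tree yet, so this
-                        * lookup must miss (i == 0); it still leaves the
-                        * cursor positioned for the insert below.
-                        */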
-                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-                       cur->bc_rec.b.br_state = new->br_state;
-                       if ((error = xfs_btree_insert(cur, &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-               }
-               break;
-
-       case 0:
-               /*
-                * Setting the middle part of a previous oldext extent to
-                * newext.  Contiguity is impossible here.
-                * One extent becomes three extents.
-                */
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(ep,
-                       new->br_startoff - PREV.br_startoff);
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
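-               /*
-                * r[0] becomes the new middle (newext) extent, r[1] the
-                * remaining right-hand piece of PREV, which stays oldext.
-                */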
-               r[0] = *new;
-               r[1].br_startoff = new_endoff;
-               r[1].br_blockcount =
-                       PREV.br_startoff + PREV.br_blockcount - new_endoff;
-               r[1].br_startblock = new->br_startblock + new->br_blockcount;
-               r[1].br_state = oldext;
-
-               ++*idx;
-               xfs_iext_insert(ip, *idx, 2, &r[0], state);
-
-               ip->i_d.di_nextents += 2;
-               if (cur == NULL)
-                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-               else {
-                       rval = XFS_ILOG_CORE;
-                       if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
-                                       PREV.br_startblock, PREV.br_blockcount,
-                                       &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       /* new right extent - oldext */
-                       if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
-                               r[1].br_startblock, r[1].br_blockcount,
-                               r[1].br_state)))
-                               goto done;
-                       /* new left extent - oldext */
-                       cur->bc_rec.b = PREV;
-                       cur->bc_rec.b.br_blockcount =
-                               new->br_startoff - PREV.br_startoff;
-                       if ((error = xfs_btree_insert(cur, &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       /*
-                        * Reset the cursor to the position of the new extent
-                        * we are about to insert as we can't trust it after
-                        * the previous insert.
-                        */
-                       if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
-                                       new->br_startblock, new->br_blockcount,
-                                       &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-                       /* new middle extent - newext */
-                       cur->bc_rec.b.br_state = new->br_state;
-                       if ((error = xfs_btree_insert(cur, &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-               }
-               break;
-
-       case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-       case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-       case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
-       case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
-       case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-       case BMAP_LEFT_CONTIG:
-       case BMAP_RIGHT_CONTIG:
-               /*
-                * These cases are all impossible.
-                */
-               ASSERT(0);
-       }
-
-       /* convert to a btree if necessary */
-       if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
-               int     tmp_logflags;   /* partial log flag return val */
-
-               ASSERT(cur == NULL);
-               error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur,
-                               0, &tmp_logflags, XFS_DATA_FORK);
-               *logflagsp |= tmp_logflags;
-               if (error)
-                       goto done;
-       }
-
-       /* clear out the allocated field, done with it now in any case. */
-       if (cur) {
-               cur->bc_private.b.allocated = 0;
-               *curp = cur;
-       }
-
-       xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
-done:
-       *logflagsp |= rval;
-       return error;
-#undef LEFT
-#undef RIGHT
-#undef PREV
-}
-
-/*
- * Convert a hole to a delayed allocation.
- */
-STATIC void
-xfs_bmap_add_extent_hole_delay(
-       xfs_inode_t             *ip,    /* incore inode pointer */
-       xfs_extnum_t            *idx,   /* extent number to update/insert */
-       xfs_bmbt_irec_t         *new)   /* new data to add to file extents */
-{
-       xfs_ifork_t             *ifp;   /* inode fork pointer */
-       xfs_bmbt_irec_t         left;   /* left neighbor extent entry */
-       xfs_filblks_t           newlen=0;       /* new indirect size */
-       xfs_filblks_t           oldlen=0;       /* old indirect size */
-       xfs_bmbt_irec_t         right;  /* right neighbor extent entry */
-       int                     state;  /* state bits, accessed thru macros */
-       xfs_filblks_t           temp=0; /* temp for indirect calculations */
-
-       ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-       state = 0;
-       ASSERT(isnullstartblock(new->br_startblock));
-
-       /*
-        * Check and set flags if this segment has a left neighbor
-        */
-       if (*idx > 0) {
-               state |= BMAP_LEFT_VALID;
-               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
-
-               if (isnullstartblock(left.br_startblock))
-                       state |= BMAP_LEFT_DELAY;
-       }
-
-       /*
-        * Check and set flags if the current (right) segment exists.
-        * If it doesn't exist, we're converting the hole at end-of-file.
-        */
-       if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
-               state |= BMAP_RIGHT_VALID;
-               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
-
-               if (isnullstartblock(right.br_startblock))
-                       state |= BMAP_RIGHT_DELAY;
-       }
-
-       /*
-        * Set contiguity flags on the left and right neighbors.
-        * Don't let extents get too large, even if the pieces are contiguous.
-        */
-       if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
-           left.br_startoff + left.br_blockcount == new->br_startoff &&
-           left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
-               state |= BMAP_LEFT_CONTIG;
-
-       if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
-           new->br_startoff + new->br_blockcount == right.br_startoff &&
-           new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
-           (!(state & BMAP_LEFT_CONTIG) ||
-            (left.br_blockcount + new->br_blockcount +
-             right.br_blockcount <= MAXEXTLEN)))
-               state |= BMAP_RIGHT_CONTIG;
-
-       /*
-        * Switch out based on the contiguity flags.
-        */
-       switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
-       case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-               /*
-                * New allocation is contiguous with delayed allocations
-                * on the left and on the right.
-                * Merge all three into a single extent record.
-                */
-               --*idx;
-               temp = left.br_blockcount + new->br_blockcount +
-                       right.br_blockcount;
-
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
-               oldlen = startblockval(left.br_startblock) +
-                       startblockval(new->br_startblock) +
-                       startblockval(right.br_startblock);
-               newlen = xfs_bmap_worst_indlen(ip, temp);
-               xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
-                       nullstartblock((int)newlen));
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-               xfs_iext_remove(ip, *idx + 1, 1, state);
-               break;
-
-       case BMAP_LEFT_CONTIG:
-               /*
-                * New allocation is contiguous with a delayed allocation
-                * on the left.
-                * Merge the new allocation with the left neighbor.
-                */
-               --*idx;
-               temp = left.br_blockcount + new->br_blockcount;
-
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
-               oldlen = startblockval(left.br_startblock) +
-                       startblockval(new->br_startblock);
-               newlen = xfs_bmap_worst_indlen(ip, temp);
-               xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
-                       nullstartblock((int)newlen));
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-               break;
-
-       case BMAP_RIGHT_CONTIG:
-               /*
-                * New allocation is contiguous with a delayed allocation
-                * on the right.
-                * Merge the new allocation with the right neighbor.
-                */
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               temp = new->br_blockcount + right.br_blockcount;
-               oldlen = startblockval(new->br_startblock) +
-                       startblockval(right.br_startblock);
-               newlen = xfs_bmap_worst_indlen(ip, temp);
-               xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
-                       new->br_startoff,
-                       nullstartblock((int)newlen), temp, right.br_state);
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-               break;
-
-       case 0:
-               /*
-                * New allocation is not contiguous with another
-                * delayed allocation.
-                * Insert a new entry.
-                */
-               oldlen = newlen = 0;
-               xfs_iext_insert(ip, *idx, 1, new, state);
-               break;
-       }
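-       /*
-        * Merging delalloc extents can only shrink the worst-case
-        * indirect block reservation, so hand any surplus back to the
-        * free block counter.
-        */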
-       if (oldlen != newlen) {
-               ASSERT(oldlen > newlen);
-               xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
-                       (int64_t)(oldlen - newlen), 0);
-               /*
-                * Nothing to do for disk quota accounting here.
-                */
-       }
-}
-
-/*
- * Convert a hole to a real allocation.
- */
-STATIC int                             /* error */
-xfs_bmap_add_extent_hole_real(
-       struct xfs_bmalloca     *bma,
-       int                     whichfork)
-{
-       struct xfs_bmbt_irec    *new = &bma->got;
-       int                     error;  /* error return value */
-       int                     i;      /* temp state */
-       xfs_ifork_t             *ifp;   /* inode fork pointer */
-       xfs_bmbt_irec_t         left;   /* left neighbor extent entry */
-       xfs_bmbt_irec_t         right;  /* right neighbor extent entry */
-       int                     rval=0; /* return value (logging flags) */
-       int                     state;  /* state bits, accessed thru macros */
-
-       ifp = XFS_IFORK_PTR(bma->ip, whichfork);
-
-       ASSERT(bma->idx >= 0);
-       ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
-       ASSERT(!isnullstartblock(new->br_startblock));
-       ASSERT(!bma->cur ||
-              !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
-
-       XFS_STATS_INC(xs_add_exlist);
-
-       state = 0;
-       if (whichfork == XFS_ATTR_FORK)
-               state |= BMAP_ATTRFORK;
-
-       /*
-        * Check and set flags if this segment has a left neighbor.
-        */
-       if (bma->idx > 0) {
-               state |= BMAP_LEFT_VALID;
-               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
-               if (isnullstartblock(left.br_startblock))
-                       state |= BMAP_LEFT_DELAY;
-       }
-
-       /*
-        * Check and set flags if the current (right) segment exists.
-        * If it doesn't, we're inserting into the "hole" at eof.
-        */
-       if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
-               state |= BMAP_RIGHT_VALID;
-               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
-               if (isnullstartblock(right.br_startblock))
-                       state |= BMAP_RIGHT_DELAY;
-       }
-
-       /*
-        * We're inserting a real allocation between "left" and "right".
-        * Set the contiguity flags.  Don't let extents get too large.
-        */
-       if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
-           left.br_startoff + left.br_blockcount == new->br_startoff &&
-           left.br_startblock + left.br_blockcount == new->br_startblock &&
-           left.br_state == new->br_state &&
-           left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
-               state |= BMAP_LEFT_CONTIG;
-
-       if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
-           new->br_startoff + new->br_blockcount == right.br_startoff &&
-           new->br_startblock + new->br_blockcount == right.br_startblock &&
-           new->br_state == right.br_state &&
-           new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
-           (!(state & BMAP_LEFT_CONTIG) ||
-            left.br_blockcount + new->br_blockcount +
-            right.br_blockcount <= MAXEXTLEN))
-               state |= BMAP_RIGHT_CONTIG;
-
-       error = 0;
-       /*
-        * Select which case we're in here, and implement it.
-        */
-       switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
-       case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-               /*
-                * New allocation is contiguous with real allocations on the
-                * left and on the right.
-                * Merge all three into a single extent record.
-                */
-               --bma->idx;
-               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
-                       left.br_blockcount + new->br_blockcount +
-                       right.br_blockcount);
-               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-               xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
-
-               XFS_IFORK_NEXT_SET(bma->ip, whichfork,
-                       XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
-               if (bma->cur == NULL) {
-                       rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
-               } else {
-                       rval = XFS_ILOG_CORE;
-                       error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
-                                       right.br_startblock, right.br_blockcount,
-                                       &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       error = xfs_btree_delete(bma->cur, &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       error = xfs_btree_decrement(bma->cur, 0, &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       error = xfs_bmbt_update(bma->cur, left.br_startoff,
-                                       left.br_startblock,
-                                       left.br_blockcount +
-                                               new->br_blockcount +
-                                               right.br_blockcount,
-                                       left.br_state);
-                       if (error)
-                               goto done;
-               }
-               break;
-
-       case BMAP_LEFT_CONTIG:
-               /*
-                * New allocation is contiguous with a real allocation
-                * on the left.
-                * Merge the new allocation with the left neighbor.
-                */
-               --bma->idx;
-               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
-                       left.br_blockcount + new->br_blockcount);
-               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-               if (bma->cur == NULL) {
-                       rval = xfs_ilog_fext(whichfork);
-               } else {
-                       rval = 0;
-                       error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
-                                       left.br_startblock, left.br_blockcount,
-                                       &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       error = xfs_bmbt_update(bma->cur, left.br_startoff,
-                                       left.br_startblock,
-                                       left.br_blockcount +
-                                               new->br_blockcount,
-                                       left.br_state);
-                       if (error)
-                               goto done;
-               }
-               break;
-
-       case BMAP_RIGHT_CONTIG:
-               /*
-                * New allocation is contiguous with a real allocation
-                * on the right.
-                * Merge the new allocation with the right neighbor.
-                */
-               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-               xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
-                       new->br_startoff, new->br_startblock,
-                       new->br_blockcount + right.br_blockcount,
-                       right.br_state);
-               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-               if (bma->cur == NULL) {
-                       rval = xfs_ilog_fext(whichfork);
-               } else {
-                       rval = 0;
-                       error = xfs_bmbt_lookup_eq(bma->cur,
-                                       right.br_startoff,
-                                       right.br_startblock,
-                                       right.br_blockcount, &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       error = xfs_bmbt_update(bma->cur, new->br_startoff,
-                                       new->br_startblock,
-                                       new->br_blockcount +
-                                               right.br_blockcount,
-                                       right.br_state);
-                       if (error)
-                               goto done;
-               }
-               break;
-
-       case 0:
-               /*
-                * New allocation is not contiguous with another
-                * real allocation.
-                * Insert a new entry.
-                */
-               xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
-               XFS_IFORK_NEXT_SET(bma->ip, whichfork,
-                       XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
-               if (bma->cur == NULL) {
-                       rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
-               } else {
-                       rval = XFS_ILOG_CORE;
-                       error = xfs_bmbt_lookup_eq(bma->cur,
-                                       new->br_startoff,
-                                       new->br_startblock,
-                                       new->br_blockcount, &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-                       bma->cur->bc_rec.b.br_state = new->br_state;
-                       error = xfs_btree_insert(bma->cur, &i);
-                       if (error)
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-               }
-               break;
-       }
-
-       /* convert to a btree if necessary */
-       if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
-               int     tmp_logflags;   /* partial log flag return val */
-
-               ASSERT(bma->cur == NULL);
-               error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-                               bma->firstblock, bma->flist, &bma->cur,
-                               0, &tmp_logflags, whichfork);
-               bma->logflags |= tmp_logflags;
-               if (error)
-                       goto done;
-       }
-
-       /* clear out the allocated field, done with it now in any case. */
-       if (bma->cur)
-               bma->cur->bc_private.b.allocated = 0;
-
-       xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
-done:
-       bma->logflags |= rval;
-       return error;
-}
-
-/*
- * Functions used in the extent read, allocate and remove paths
- */
-
-/*
- * Adjust the size of the new extent based on di_extsize and rt extsize.
- */
-int
-xfs_bmap_extsize_align(
-       xfs_mount_t     *mp,
-       xfs_bmbt_irec_t *gotp,          /* next extent pointer */
-       xfs_bmbt_irec_t *prevp,         /* previous extent pointer */
-       xfs_extlen_t    extsz,          /* align to this extent size */
-       int             rt,             /* is this a realtime inode? */
-       int             eof,            /* is extent at end-of-file? */
-       int             delay,          /* creating delalloc extent? */
-       int             convert,        /* overwriting unwritten extent? */
-       xfs_fileoff_t   *offp,          /* in/out: aligned offset */
-       xfs_extlen_t    *lenp)          /* in/out: aligned length */
-{
-       xfs_fileoff_t   orig_off;       /* original offset */
-       xfs_extlen_t    orig_alen;      /* original length */
-       xfs_fileoff_t   orig_end;       /* original off+len */
-       xfs_fileoff_t   nexto;          /* next file offset */
-       xfs_fileoff_t   prevo;          /* previous file offset */
-       xfs_fileoff_t   align_off;      /* temp for offset */
-       xfs_extlen_t    align_alen;     /* temp for length */
-       xfs_extlen_t    temp;           /* temp for calculations */
-
-       if (convert)
-               return 0;
-
-       orig_off = align_off = *offp;
-       orig_alen = align_alen = *lenp;
-       orig_end = orig_off + orig_alen;
-
-       /*
-        * If this request overlaps an existing extent, then don't
-        * attempt to perform any additional alignment.
-        */
-       if (!delay && !eof &&
-           (orig_off >= gotp->br_startoff) &&
-           (orig_end <= gotp->br_startoff + gotp->br_blockcount)) {
-               return 0;
-       }
-
-       /*
-        * If the file offset is unaligned vs. the extent size
-        * we need to align it.  This will be possible unless
-        * the file was previously written with a kernel that didn't
-        * perform this alignment, or if a truncate shot us in the
-        * foot.
-        */
-       temp = do_mod(orig_off, extsz);
-       if (temp) {
-               align_alen += temp;
-               align_off -= temp;
-       }
-       /*
-        * Same adjustment for the end of the requested area.
-        */
-       if ((temp = (align_alen % extsz))) {
-               align_alen += extsz - temp;
-       }
-       /*
-        * If the previous block overlaps with this proposed allocation
-        * then move the start forward without adjusting the length.
-        */
-       if (prevp->br_startoff != NULLFILEOFF) {
-               if (prevp->br_startblock == HOLESTARTBLOCK)
-                       prevo = prevp->br_startoff;
-               else
-                       prevo = prevp->br_startoff + prevp->br_blockcount;
-       } else
-               prevo = 0;
-       if (align_off != orig_off && align_off < prevo)
-               align_off = prevo;
-       /*
-        * If the next block overlaps with this proposed allocation
-        * then move the start back without adjusting the length,
-        * but not before offset 0.
-        * This may of course make the start overlap the previous block,
-        * and if we hit the offset 0 limit then the next block
-        * can still overlap too.
-        */
-       if (!eof && gotp->br_startoff != NULLFILEOFF) {
-               if ((delay && gotp->br_startblock == HOLESTARTBLOCK) ||
-                   (!delay && gotp->br_startblock == DELAYSTARTBLOCK))
-                       nexto = gotp->br_startoff + gotp->br_blockcount;
-               else
-                       nexto = gotp->br_startoff;
-       } else
-               nexto = NULLFILEOFF;
-       if (!eof &&
-           align_off + align_alen != orig_end &&
-           align_off + align_alen > nexto)
-               align_off = nexto > align_alen ? nexto - align_alen : 0;
-       /*
-        * If we're now overlapping the next or previous extent that
-        * means we can't fit an extsz piece in this hole.  Just move
-        * the start forward to the first valid spot and set
-        * the length so we hit the end.
-        */
-       if (align_off != orig_off && align_off < prevo)
-               align_off = prevo;
-       if (align_off + align_alen != orig_end &&
-           align_off + align_alen > nexto &&
-           nexto != NULLFILEOFF) {
-               ASSERT(nexto > prevo);
-               align_alen = nexto - align_off;
-       }
-
-       /*
-        * If realtime, and the result isn't a multiple of the realtime
-        * extent size we need to remove blocks until it is.
-        */
-       if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) {
-               /*
-                * We're not covering the original request, or
-                * we won't be able to once we fix the length.
-                */
-               if (orig_off < align_off ||
-                   orig_end > align_off + align_alen ||
-                   align_alen - temp < orig_alen)
-                       return XFS_ERROR(EINVAL);
-               /*
-                * Try to fix it by moving the start up.
-                */
-               if (align_off + temp <= orig_off) {
-                       align_alen -= temp;
-                       align_off += temp;
-               }
-               /*
-                * Try to fix it by moving the end in.
-                */
-               else if (align_off + align_alen - temp >= orig_end)
-                       align_alen -= temp;
-               /*
-                * Set the start to the minimum then trim the length.
-                */
-               else {
-                       align_alen -= orig_off - align_off;
-                       align_off = orig_off;
-                       align_alen -= align_alen % mp->m_sb.sb_rextsize;
-               }
-               /*
-                * Result doesn't cover the request, fail it.
-                */
-               if (orig_off < align_off || orig_end > align_off + align_alen)
-                       return XFS_ERROR(EINVAL);
-       } else {
-               ASSERT(orig_off >= align_off);
-               ASSERT(orig_end <= align_off + align_alen);
-       }
-
-#ifdef DEBUG
-       if (!eof && gotp->br_startoff != NULLFILEOFF)
-               ASSERT(align_off + align_alen <= gotp->br_startoff);
-       if (prevp->br_startoff != NULLFILEOFF)
-               ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount);
-#endif
-
-       *lenp = align_alen;
-       *offp = align_off;
-       return 0;
-}
-
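-/*
- * Only chase a gap between extents if it is no larger than
- * XFS_ALLOC_GAP_UNITS times the size of the allocation.
- */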
-#define XFS_ALLOC_GAP_UNITS    4
-
-void
-xfs_bmap_adjacent(
-       struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
-{
-       xfs_fsblock_t   adjust;         /* adjustment to block numbers */
-       xfs_agnumber_t  fb_agno;        /* ag number of ap->firstblock */
-       xfs_mount_t     *mp;            /* mount point structure */
-       int             nullfb;         /* true if ap->firstblock isn't set */
-       int             rt;             /* true if inode is realtime */
-
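-/*
- * ISVALID checks that a candidate block actually exists: on the realtime
- * device it must lie below sb_rblocks, otherwise it must sit in the same
- * AG as the block it is paired with and within that AG's bounds.
- */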
-#define        ISVALID(x,y)    \
-       (rt ? \
-               (x) < mp->m_sb.sb_rblocks : \
-               XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \
-               XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
-               XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
-
-       mp = ap->ip->i_mount;
-       nullfb = *ap->firstblock == NULLFSBLOCK;
-       rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
-       fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
-       /*
-        * If allocating at eof, and there's a previous real block,
-        * try to use its last block as our starting point.
-        */
-       if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
-           !isnullstartblock(ap->prev.br_startblock) &&
-           ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
-                   ap->prev.br_startblock)) {
-               ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
-               /*
-                * Adjust for the gap between prevp and us.
-                */
-               adjust = ap->offset -
-                       (ap->prev.br_startoff + ap->prev.br_blockcount);
-               if (adjust &&
-                   ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
-                       ap->blkno += adjust;
-       }
-       /*
-        * If not at eof, then compare the two neighbor blocks.
-        * Figure out whether either one gives us a good starting point,
-        * and pick the better one.
-        */
-       else if (!ap->eof) {
-               xfs_fsblock_t   gotbno;         /* right side block number */
-               xfs_fsblock_t   gotdiff=0;      /* right side difference */
-               xfs_fsblock_t   prevbno;        /* left side block number */
-               xfs_fsblock_t   prevdiff=0;     /* left side difference */
-
-               /*
-                * If there's a previous (left) block, select a requested
-                * start block based on it.
-                */
-               if (ap->prev.br_startoff != NULLFILEOFF &&
-                   !isnullstartblock(ap->prev.br_startblock) &&
-                   (prevbno = ap->prev.br_startblock +
-                              ap->prev.br_blockcount) &&
-                   ISVALID(prevbno, ap->prev.br_startblock)) {
-                       /*
-                        * Calculate gap to end of previous block.
-                        */
-                       adjust = prevdiff = ap->offset -
-                               (ap->prev.br_startoff +
-                                ap->prev.br_blockcount);
-                       /*
-                        * Figure the startblock based on the previous block's
-                        * end and the gap size.
-                        * Heuristic!
-                        * If the gap is large relative to the piece we're
-                        * allocating, or using it gives us an invalid block
-                        * number, then just use the end of the previous block.
-                        */
-                       if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
-                           ISVALID(prevbno + prevdiff,
-                                   ap->prev.br_startblock))
-                               prevbno += adjust;
-                       else
-                               prevdiff += adjust;
-                       /*
-                        * If the firstblock forbids it, can't use it,
-                        * must use default.
-                        */
-                       if (!rt && !nullfb &&
-                           XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno)
-                               prevbno = NULLFSBLOCK;
-               }
-               /*
-                * No previous block or can't follow it, just default.
-                */
-               else
-                       prevbno = NULLFSBLOCK;
-               /*
-                * If there's a following (right) block, select a requested
-                * start block based on it.
-                */
-               if (!isnullstartblock(ap->got.br_startblock)) {
-                       /*
-                        * Calculate gap to start of next block.
-                        */
-                       adjust = gotdiff = ap->got.br_startoff - ap->offset;
-                       /*
-                        * Figure the startblock based on the next block's
-                        * start and the gap size.
-                        */
-                       gotbno = ap->got.br_startblock;
-                       /*
-                        * Heuristic!
-                        * If the gap is large relative to the piece we're
-                        * allocating, or using it gives us an invalid block
-                        * number, then just use the start of the next block
-                        * offset by our length.
-                        */
-                       if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
-                           ISVALID(gotbno - gotdiff, gotbno))
-                               gotbno -= adjust;
-                       else if (ISVALID(gotbno - ap->length, gotbno)) {
-                               gotbno -= ap->length;
-                               gotdiff += adjust - ap->length;
-                       } else
-                               gotdiff += adjust;
-                       /*
-                        * If the firstblock forbids it, can't use it,
-                        * must use default.
-                        */
-                       if (!rt && !nullfb &&
-                           XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno)
-                               gotbno = NULLFSBLOCK;
-               }
-               /*
-                * No next block, just default.
-                */
-               else
-                       gotbno = NULLFSBLOCK;
-               /*
-                * If both valid, pick the better one, else the only good
-                * one, else ap->blkno is already set (to 0 or the inode block).
-                */
-               if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
-                       ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
-               else if (prevbno != NULLFSBLOCK)
-                       ap->blkno = prevbno;
-               else if (gotbno != NULLFSBLOCK)
-                       ap->blkno = gotbno;
-       }
-#undef ISVALID
-}
-
-static int
-xfs_bmap_longest_free_extent(
-       struct xfs_trans        *tp,
-       xfs_agnumber_t          ag,
-       xfs_extlen_t            *blen,
-       int                     *notinit)
-{
-       struct xfs_mount        *mp = tp->t_mountp;
-       struct xfs_perag        *pag;
-       xfs_extlen_t            longest;
-       int                     error = 0;
-
-       pag = xfs_perag_get(mp, ag);
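-       /*
-        * Read in the AGF if it hasn't been initialised yet.  The trylock
-        * means this can fail without blocking, which is reported back to
-        * the caller through *notinit.
-        */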
-       if (!pag->pagf_init) {
-               error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
-               if (error)
-                       goto out;
-
-               if (!pag->pagf_init) {
-                       *notinit = 1;
-                       goto out;
-               }
-       }
-
-       longest = xfs_alloc_longest_free_extent(mp, pag);
-       if (*blen < longest)
-               *blen = longest;
-
-out:
-       xfs_perag_put(pag);
-       return error;
-}
-
-static void
-xfs_bmap_select_minlen(
-       struct xfs_bmalloca     *ap,
-       struct xfs_alloc_arg    *args,
-       xfs_extlen_t            *blen,
-       int                     notinit)
-{
-       if (notinit || *blen < ap->minlen) {
-               /*
-                * The search above used a trylock on each AGF, so some AGs
-                * may not have been examined and there could still be space
-                * for this request.
-                */
-               args->minlen = ap->minlen;
-       } else if (*blen < args->maxlen) {
-               /*
-                * If the best seen length is less than the request length,
-                * use the best as the minimum.
-                */
-               args->minlen = *blen;
-       } else {
-               /*
-                * Otherwise we've seen an extent as big as maxlen, use that
-                * as the minimum.
-                */
-               args->minlen = args->maxlen;
-       }
-}
-
-STATIC int
-xfs_bmap_btalloc_nullfb(
-       struct xfs_bmalloca     *ap,
-       struct xfs_alloc_arg    *args,
-       xfs_extlen_t            *blen)
-{
-       struct xfs_mount        *mp = ap->ip->i_mount;
-       xfs_agnumber_t          ag, startag;
-       int                     notinit = 0;
-       int                     error;
-
-       args->type = XFS_ALLOCTYPE_START_BNO;
-       args->total = ap->total;
-
-       startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
-       if (startag == NULLAGNUMBER)
-               startag = ag = 0;
-
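-       /*
-        * Scan the AGs from startag, wrapping at the end, and track the
-        * longest free extent seen until one satisfies maxlen.
-        */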
-       while (*blen < args->maxlen) {
-               error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
-                                                    &notinit);
-               if (error)
-                       return error;
-
-               if (++ag == mp->m_sb.sb_agcount)
-                       ag = 0;
-               if (ag == startag)
-                       break;
-       }
-
-       xfs_bmap_select_minlen(ap, args, blen, notinit);
-       return 0;
-}
-
-STATIC int
-xfs_bmap_btalloc_filestreams(
-       struct xfs_bmalloca     *ap,
-       struct xfs_alloc_arg    *args,
-       xfs_extlen_t            *blen)
-{
-       struct xfs_mount        *mp = ap->ip->i_mount;
-       xfs_agnumber_t          ag;
-       int                     notinit = 0;
-       int                     error;
-
-       args->type = XFS_ALLOCTYPE_NEAR_BNO;
-       args->total = ap->total;
-
-       ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
-       if (ag == NULLAGNUMBER)
-               ag = 0;
-
-       error = xfs_bmap_longest_free_extent(args->tp, ag, blen, &notinit);
-       if (error)
-               return error;
-
-       if (*blen < args->maxlen) {
-               error = xfs_filestream_new_ag(ap, &ag);
-               if (error)
-                       return error;
-
-               error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
-                                                    &notinit);
-               if (error)
-                       return error;
-
-       }
-
-       xfs_bmap_select_minlen(ap, args, blen, notinit);
-
-       /*
-        * Set the failure fallback case to look in the selected AG, as the
-        * stream may have moved.
-        */
-       ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
-       return 0;
-}
-
-STATIC int
-xfs_bmap_btalloc(
-       struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
-{
-       xfs_mount_t     *mp;            /* mount point structure */
-       xfs_alloctype_t atype = 0;      /* type for allocation routines */
-       xfs_extlen_t    align;          /* minimum allocation alignment */
-       xfs_agnumber_t  fb_agno;        /* ag number of ap->firstblock */
-       xfs_agnumber_t  ag;
-       xfs_alloc_arg_t args;
-       xfs_extlen_t    blen;
-       xfs_extlen_t    nextminlen = 0;
-       int             nullfb;         /* true if ap->firstblock isn't set */
-       int             isaligned;
-       int             tryagain;
-       int             error;
-       int             stripe_align;
-
-       ASSERT(ap->length);
-
-       mp = ap->ip->i_mount;
-
-       /* stripe alignment for allocation is determined by mount parameters */
-       stripe_align = 0;
-       if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
-               stripe_align = mp->m_swidth;
-       else if (mp->m_dalign)
-               stripe_align = mp->m_dalign;
-
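-       /* extent size hints only apply to user data allocations */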
-       align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
-       if (unlikely(align)) {
-               error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
-                                               align, 0, ap->eof, 0, ap->conv,
-                                               &ap->offset, &ap->length);
-               ASSERT(!error);
-               ASSERT(ap->length);
-       }
-
-       nullfb = *ap->firstblock == NULLFSBLOCK;
-       fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
-       if (nullfb) {
-               if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
-                       ag = xfs_filestream_lookup_ag(ap->ip);
-                       ag = (ag != NULLAGNUMBER) ? ag : 0;
-                       ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
-               } else {
-                       ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
-               }
-       } else
-               ap->blkno = *ap->firstblock;
-
-       xfs_bmap_adjacent(ap);
-
-       /*
-        * If allowed, use ap->blkno; otherwise must use firstblock since
-        * it's in the right allocation group.
-        */
-       if (!nullfb && XFS_FSB_TO_AGNO(mp, ap->blkno) != fb_agno)
-               ap->blkno = *ap->firstblock;
-       /*
-        * Normal allocation, done through xfs_alloc_vextent.
-        */
-       tryagain = isaligned = 0;
-       memset(&args, 0, sizeof(args));
-       args.tp = ap->tp;
-       args.mp = mp;
-       args.fsbno = ap->blkno;
-
-       /* Trim the allocation back to the maximum an AG can fit. */
-       args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
-       args.firstblock = *ap->firstblock;
-       blen = 0;
-       if (nullfb) {
-               /*
-                * Search for an allocation group with a single extent large
-                * enough for the request.  If one isn't found, then adjust
-                * the minimum allocation size to the largest space found.
-                */
-               if (ap->userdata && xfs_inode_is_filestream(ap->ip))
-                       error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
-               else
-                       error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
-               if (error)
-                       return error;
-       } else if (ap->flist->xbf_low) {
-               if (xfs_inode_is_filestream(ap->ip))
-                       args.type = XFS_ALLOCTYPE_FIRST_AG;
-               else
-                       args.type = XFS_ALLOCTYPE_START_BNO;
-               args.total = args.minlen = ap->minlen;
-       } else {
-               args.type = XFS_ALLOCTYPE_NEAR_BNO;
-               args.total = ap->total;
-               args.minlen = ap->minlen;
-       }
-       /* apply extent size hints if obtained earlier */
-       if (unlikely(align)) {
-               args.prod = align;
-               if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
-                       args.mod = (xfs_extlen_t)(args.prod - args.mod);
-       } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
-               args.prod = 1;
-               args.mod = 0;
-       } else {
-               args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
-               if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod))))
-                       args.mod = (xfs_extlen_t)(args.prod - args.mod);
-       }
-       /*
-        * If we are not low on available data blocks, and the
-        * underlying logical volume manager is a stripe, and
-        * file offset is zero, then try to allocate data
-        * blocks on a stripe unit boundary.
-        * NOTE: ap->aeof is only set if the allocation length
-        * is >= the stripe unit and the allocation offset is
-        * at the end of file.
-        */
-       if (!ap->flist->xbf_low && ap->aeof) {
-               if (!ap->offset) {
-                       args.alignment = stripe_align;
-                       atype = args.type;
-                       isaligned = 1;
-                       /*
-                        * Adjust for alignment
-                        */
-                       if (blen > args.alignment && blen <= args.maxlen)
-                               args.minlen = blen - args.alignment;
-                       args.minalignslop = 0;
-               } else {
-                       /*
-                        * First try an exact bno allocation.
-                        * If it fails then do a near or start bno
-                        * allocation with alignment turned on.
-                        */
-                       atype = args.type;
-                       tryagain = 1;
-                       args.type = XFS_ALLOCTYPE_THIS_BNO;
-                       args.alignment = 1;
-                       /*
-                        * Compute the minlen+alignment for the
-                        * next case.  Set slop so that the value
-                        * of minlen+alignment+slop doesn't go up
-                        * between the calls.
-                        */
-                       if (blen > stripe_align && blen <= args.maxlen)
-                               nextminlen = blen - stripe_align;
-                       else
-                               nextminlen = args.minlen;
-                       if (nextminlen + stripe_align > args.minlen + 1)
-                               args.minalignslop =
-                                       nextminlen + stripe_align -
-                                       args.minlen - 1;
-                       else
-                               args.minalignslop = 0;
-               }
-       } else {
-               args.alignment = 1;
-               args.minalignslop = 0;
-       }
-       args.minleft = ap->minleft;
-       args.wasdel = ap->wasdel;
-       args.isfl = 0;
-       args.userdata = ap->userdata;
-       if ((error = xfs_alloc_vextent(&args)))
-               return error;
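-       /*
-        * Fall back through progressively weaker allocations: aligned
-        * near/start bno, then unaligned, then the caller's minlen, and
-        * finally any AG with low space mode set.
-        */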
-       if (tryagain && args.fsbno == NULLFSBLOCK) {
-               /*
-                * Exact allocation failed. Now try with alignment
-                * turned on.
-                */
-               args.type = atype;
-               args.fsbno = ap->blkno;
-               args.alignment = stripe_align;
-               args.minlen = nextminlen;
-               args.minalignslop = 0;
-               isaligned = 1;
-               if ((error = xfs_alloc_vextent(&args)))
-                       return error;
-       }
-       if (isaligned && args.fsbno == NULLFSBLOCK) {
-               /*
-                * allocation failed, so turn off alignment and
-                * try again.
-                */
-               args.type = atype;
-               args.fsbno = ap->blkno;
-               args.alignment = 0;
-               if ((error = xfs_alloc_vextent(&args)))
-                       return error;
-       }
-       if (args.fsbno == NULLFSBLOCK && nullfb &&
-           args.minlen > ap->minlen) {
-               args.minlen = ap->minlen;
-               args.type = XFS_ALLOCTYPE_START_BNO;
-               args.fsbno = ap->blkno;
-               if ((error = xfs_alloc_vextent(&args)))
-                       return error;
-       }
-       if (args.fsbno == NULLFSBLOCK && nullfb) {
-               args.fsbno = 0;
-               args.type = XFS_ALLOCTYPE_FIRST_AG;
-               args.total = ap->minlen;
-               args.minleft = 0;
-               if ((error = xfs_alloc_vextent(&args)))
-                       return error;
-               ap->flist->xbf_low = 1;
-       }
-       if (args.fsbno != NULLFSBLOCK) {
-               /*
-                * Check that the allocation happened in the same or a higher
-                * AG than the first block that was allocated.
-                */
-               ASSERT(*ap->firstblock == NULLFSBLOCK ||
-                      XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
-                      XFS_FSB_TO_AGNO(mp, args.fsbno) ||
-                      (ap->flist->xbf_low &&
-                       XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
-                       XFS_FSB_TO_AGNO(mp, args.fsbno)));
-
-               ap->blkno = args.fsbno;
-               if (*ap->firstblock == NULLFSBLOCK)
-                       *ap->firstblock = args.fsbno;
-               ASSERT(nullfb || fb_agno == args.agno ||
-                      (ap->flist->xbf_low && fb_agno < args.agno));
-               ap->length = args.len;
-               ap->ip->i_d.di_nblocks += args.len;
-               xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
-               if (ap->wasdel)
-                       ap->ip->i_delayed_blks -= args.len;
-               /*
-                * Adjust the disk quota also. This was reserved
-                * earlier.
-                */
-               xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
-                       ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
-                                       XFS_TRANS_DQ_BCOUNT,
-                       (long) args.len);
-       } else {
-               ap->blkno = NULLFSBLOCK;
-               ap->length = 0;
-       }
-       return 0;
-}
-
-/*
- * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
- * It figures out where to ask the underlying allocator to put the new extent.
- */
-STATIC int
-xfs_bmap_alloc(
-       struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
-{
-       if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
-               return xfs_bmap_rtalloc(ap);
-       return xfs_bmap_btalloc(ap);
-}
-
-/*
- * Trim the returned map to the required bounds
- */
-STATIC void
-xfs_bmapi_trim_map(
-       struct xfs_bmbt_irec    *mval,
-       struct xfs_bmbt_irec    *got,
-       xfs_fileoff_t           *bno,
-       xfs_filblks_t           len,
-       xfs_fileoff_t           obno,
-       xfs_fileoff_t           end,
-       int                     n,
-       int                     flags)
-{
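-       /*
-        * If the caller wants the entire extent, or the extent ends at
-        * or before the original offset, return it untrimmed, mapping a
-        * delalloc start block to DELAYSTARTBLOCK.
-        */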
-       if ((flags & XFS_BMAPI_ENTIRE) ||
-           got->br_startoff + got->br_blockcount <= obno) {
-               *mval = *got;
-               if (isnullstartblock(got->br_startblock))
-                       mval->br_startblock = DELAYSTARTBLOCK;
-               return;
-       }
-
-       if (obno > *bno)
-               *bno = obno;
-       ASSERT((*bno >= obno) || (n == 0));
-       ASSERT(*bno < end);
-       mval->br_startoff = *bno;
-       if (isnullstartblock(got->br_startblock))
-               mval->br_startblock = DELAYSTARTBLOCK;
-       else
-               mval->br_startblock = got->br_startblock +
-                                       (*bno - got->br_startoff);
-       /*
-        * Return the minimum of what we got and what we asked for, for
-        * the length.  len may already be smaller than the original
-        * request if an earlier part of the allocation didn't overlap
-        * what was asked for.
-        */
-       mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno,
-                       got->br_blockcount - (*bno - got->br_startoff));
-       mval->br_state = got->br_state;
-       ASSERT(mval->br_blockcount <= len);
-       return;
-}
-
-/*
- * Update and validate the extent map to return
- */
-STATIC void
-xfs_bmapi_update_map(
-       struct xfs_bmbt_irec    **map,
-       xfs_fileoff_t           *bno,
-       xfs_filblks_t           *len,
-       xfs_fileoff_t           obno,
-       xfs_fileoff_t           end,
-       int                     *n,
-       int                     flags)
-{
-       xfs_bmbt_irec_t *mval = *map;
-
-       ASSERT((flags & XFS_BMAPI_ENTIRE) ||
-              ((mval->br_startoff + mval->br_blockcount) <= end));
-       ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) ||
-              (mval->br_startoff < obno));
-
-       *bno = mval->br_startoff + mval->br_blockcount;
-       *len = end - *bno;
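-       /*
-        * Merge this mapping into the previous one where possible: it
-        * may regrow the same extent, be physically contiguous with a
-        * real previous extent in the same state, or be a delalloc
-        * record adjacent to a previous delalloc record.  Otherwise
-        * advance to a fresh map entry.
-        */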
-       if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) {
-               /* update previous map with new information */
-               ASSERT(mval->br_startblock == mval[-1].br_startblock);
-               ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
-               ASSERT(mval->br_state == mval[-1].br_state);
-               mval[-1].br_blockcount = mval->br_blockcount;
-               mval[-1].br_state = mval->br_state;
-       } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
-                  mval[-1].br_startblock != DELAYSTARTBLOCK &&
-                  mval[-1].br_startblock != HOLESTARTBLOCK &&
-                  mval->br_startblock == mval[-1].br_startblock +
-                                         mval[-1].br_blockcount &&
-                  ((flags & XFS_BMAPI_IGSTATE) ||
-                       mval[-1].br_state == mval->br_state)) {
-               ASSERT(mval->br_startoff ==
-                      mval[-1].br_startoff + mval[-1].br_blockcount);
-               mval[-1].br_blockcount += mval->br_blockcount;
-       } else if (*n > 0 &&
-                  mval->br_startblock == DELAYSTARTBLOCK &&
-                  mval[-1].br_startblock == DELAYSTARTBLOCK &&
-                  mval->br_startoff ==
-                  mval[-1].br_startoff + mval[-1].br_blockcount) {
-               mval[-1].br_blockcount += mval->br_blockcount;
-               mval[-1].br_state = mval->br_state;
-       } else if (!((*n == 0) &&
-                    ((mval->br_startoff + mval->br_blockcount) <=
-                     obno))) {
-               mval++;
-               (*n)++;
-       }
-       *map = mval;
-}
-
-/*
- * Map file blocks to filesystem blocks without allocation.
- */
-int
-xfs_bmapi_read(
-       struct xfs_inode        *ip,
-       xfs_fileoff_t           bno,
-       xfs_filblks_t           len,
-       struct xfs_bmbt_irec    *mval,
-       int                     *nmap,
-       int                     flags)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_ifork        *ifp;
-       struct xfs_bmbt_irec    got;
-       struct xfs_bmbt_irec    prev;
-       xfs_fileoff_t           obno;
-       xfs_fileoff_t           end;
-       xfs_extnum_t            lastx;
-       int                     error;
-       int                     eof;
-       int                     n = 0;
-       int                     whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-                                               XFS_ATTR_FORK : XFS_DATA_FORK;
-
-       ASSERT(*nmap >= 1);
-       ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
-                          XFS_BMAPI_IGSTATE)));
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
-
-       if (unlikely(XFS_TEST_ERROR(
-           (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
-            XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
-            mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
-               XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       XFS_STATS_INC(xs_blk_mapr);
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-
-       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-               error = xfs_iread_extents(NULL, ip, whichfork);
-               if (error)
-                       return error;
-       }
-
-       xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev);
-       end = bno + len;
-       obno = bno;
-
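-       /*
-        * Walk the extent records, synthesizing HOLESTARTBLOCK mappings
-        * for any gaps, until the range is covered or *nmap entries are
-        * filled.
-        */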
-       while (bno < end && n < *nmap) {
-               /* Reading past eof, act as though there's a hole up to end. */
-               if (eof)
-                       got.br_startoff = end;
-               if (got.br_startoff > bno) {
-                       /* Reading in a hole.  */
-                       mval->br_startoff = bno;
-                       mval->br_startblock = HOLESTARTBLOCK;
-                       mval->br_blockcount =
-                               XFS_FILBLKS_MIN(len, got.br_startoff - bno);
-                       mval->br_state = XFS_EXT_NORM;
-                       bno += mval->br_blockcount;
-                       len -= mval->br_blockcount;
-                       mval++;
-                       n++;
-                       continue;
-               }
-
-               /* set up the extent map to return. */
-               xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
-               xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
-
-               /* If we're done, stop now. */
-               if (bno >= end || n >= *nmap)
-                       break;
-
-               /* Else go on to the next record. */
-               if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
-                       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
-               else
-                       eof = 1;
-       }
-       *nmap = n;
-       return 0;
-}
-
-STATIC int
-xfs_bmapi_reserve_delalloc(
-       struct xfs_inode        *ip,
-       xfs_fileoff_t           aoff,
-       xfs_filblks_t           len,
-       struct xfs_bmbt_irec    *got,
-       struct xfs_bmbt_irec    *prev,
-       xfs_extnum_t            *lastx,
-       int                     eof)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-       xfs_extlen_t            alen;
-       xfs_extlen_t            indlen;
-       char                    rt = XFS_IS_REALTIME_INODE(ip);
-       xfs_extlen_t            extsz;
-       int                     error;
-
-       alen = XFS_FILBLKS_MIN(len, MAXEXTLEN);
-       if (!eof)
-               alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
-
-       /* Figure out the extent size, adjust alen */
-       extsz = xfs_get_extsz_hint(ip);
-       if (extsz) {
-               /*
-                * Make sure we don't exceed a single extent length when
-                * we align the extent by reducing the length we are
-                * going to allocate by the maximum amount that extent
-                * size alignment may require.
-                */
-               alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1));
-               error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
-                                              1, 0, &aoff, &alen);
-               ASSERT(!error);
-       }
-
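-       /*
-        * Realtime accounting below is in units of realtime extents,
-        * not blocks, so convert alen accordingly.
-        */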
-       if (rt)
-               extsz = alen / mp->m_sb.sb_rextsize;
-
-       /*
-        * Make a transaction-less quota reservation for delayed allocation
-        * blocks.  This number gets adjusted later.  If the reservation
-        * fails, return the error to the caller's mapping loop.
-        */
-       error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
-                       rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
-       if (error)
-               return error;
-
-       /*
-        * Update the superblock counters for alen and indlen separately,
-        * since they may be drawn from different pools (rt extents vs.
-        * data blocks).
-        */
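-       /*
-        * indlen is the worst-case count of indirect (bmap btree) blocks
-        * needed when this delalloc extent is eventually converted.
-        */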
-       indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
-       ASSERT(indlen > 0);
-
-       if (rt) {
-               error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
-                                         -((int64_t)extsz), 0);
-       } else {
-               error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-                                                -((int64_t)alen), 0);
-       }
-
-       if (error)
-               goto out_unreserve_quota;
-
-       error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-                                        -((int64_t)indlen), 0);
-       if (error)
-               goto out_unreserve_blocks;
-
-
-       ip->i_delayed_blks += alen;
-
-       got->br_startoff = aoff;
-       got->br_startblock = nullstartblock(indlen);
-       got->br_blockcount = alen;
-       got->br_state = XFS_EXT_NORM;
-       xfs_bmap_add_extent_hole_delay(ip, lastx, got);
-
-       /*
-        * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
-        * might have merged it into one of the neighbouring ones.
-        */
-       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
-
-       ASSERT(got->br_startoff <= aoff);
-       ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
-       ASSERT(isnullstartblock(got->br_startblock));
-       ASSERT(got->br_state == XFS_EXT_NORM);
-       return 0;
-
-out_unreserve_blocks:
-       if (rt)
-               xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0);
-       else
-               xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
-out_unreserve_quota:
-       if (XFS_IS_QUOTA_ON(mp))
-               xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
-                               XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
-       return error;
-}
-
-/*
- * Map file blocks to filesystem blocks, adding delayed allocations as needed.
- */
-int
-xfs_bmapi_delay(
-       struct xfs_inode        *ip,    /* incore inode */
-       xfs_fileoff_t           bno,    /* starting file offs. mapped */
-       xfs_filblks_t           len,    /* length to map in file */
-       struct xfs_bmbt_irec    *mval,  /* output: map values */
-       int                     *nmap,  /* i/o: mval size/count */
-       int                     flags)  /* XFS_BMAPI_... */
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-       struct xfs_bmbt_irec    got;    /* current file extent record */
-       struct xfs_bmbt_irec    prev;   /* previous file extent record */
-       xfs_fileoff_t           obno;   /* old block number (offset) */
-       xfs_fileoff_t           end;    /* end of mapped file region */
-       xfs_extnum_t            lastx;  /* last useful extent number */
-       int                     eof;    /* we've hit the end of extents */
-       int                     n = 0;  /* current extent index */
-       int                     error = 0;
-
-       ASSERT(*nmap >= 1);
-       ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
-       ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-       if (unlikely(XFS_TEST_ERROR(
-           (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
-            XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
-            mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
-               XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       XFS_STATS_INC(xs_blk_mapw);
-
-       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-               error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
-               if (error)
-                       return error;
-       }
-
-       xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
-       end = bno + len;
-       obno = bno;
-
-       while (bno < end && n < *nmap) {
-               if (eof || got.br_startoff > bno) {
-                       error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
-                                                          &prev, &lastx, eof);
-                       if (error) {
-                               if (n == 0) {
-                                       *nmap = 0;
-                                       return error;
-                               }
-                               break;
-                       }
-               }
-
-               /* set up the extent map to return. */
-               xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
-               xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
-
-               /* If we're done, stop now. */
-               if (bno >= end || n >= *nmap)
-                       break;
-
-               /* Else go on to the next record. */
-               prev = got;
-               if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
-                       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
-               else
-                       eof = 1;
-       }
-
-       *nmap = n;
-       return 0;
-}
-
-
-static int
-xfs_bmapi_allocate(
-       struct xfs_bmalloca     *bma)
-{
-       struct xfs_mount        *mp = bma->ip->i_mount;
-       int                     whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
-                                               XFS_ATTR_FORK : XFS_DATA_FORK;
-       struct xfs_ifork        *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
-       int                     tmp_logflags = 0;
-       int                     error;
-
-       ASSERT(bma->length > 0);
-
-       /*
-        * For the wasdelay case, we could also just allocate the blocks
-        * asked for in this bmap call, but that wouldn't be as good.
-        */
-       if (bma->wasdel) {
-               bma->length = (xfs_extlen_t)bma->got.br_blockcount;
-               bma->offset = bma->got.br_startoff;
-               if (bma->idx != NULLEXTNUM && bma->idx) {
-                       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
-                                        &bma->prev);
-               }
-       } else {
-               bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
-               if (!bma->eof)
-                       bma->length = XFS_FILBLKS_MIN(bma->length,
-                                       bma->got.br_startoff - bma->offset);
-       }
-
-       /*
-        * Indicate if this is the first user data in the file, or just any
-        * user data.
-        */
-       if (!(bma->flags & XFS_BMAPI_METADATA)) {
-               bma->userdata = (bma->offset == 0) ?
-                       XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
-       }
-
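-       /*
-        * XFS_BMAPI_CONTIG callers need the whole range in one extent,
-        * so the minimum acceptable length is the full request.
-        */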
-       bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
-
-       /*
-        * We only want to do the alignment at EOF if it is userdata and
-        * the allocation length is larger than a stripe unit.
-        */
-       if (mp->m_dalign && bma->length >= mp->m_dalign &&
-           !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
-               error = xfs_bmap_isaeof(bma, whichfork);
-               if (error)
-                       return error;
-       }
-
-       error = xfs_bmap_alloc(bma);
-       if (error)
-               return error;
-
-       if (bma->flist->xbf_low)
-               bma->minleft = 0;
-       if (bma->cur)
-               bma->cur->bc_private.b.firstblock = *bma->firstblock;
-       if (bma->blkno == NULLFSBLOCK)
-               return 0;
-       if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
-               bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
-               bma->cur->bc_private.b.firstblock = *bma->firstblock;
-               bma->cur->bc_private.b.flist = bma->flist;
-       }
-       /*
-        * Bump the number of extents we've allocated
-        * in this call.
-        */
-       bma->nallocs++;
-
-       if (bma->cur)
-               bma->cur->bc_private.b.flags =
-                       bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
-
-       bma->got.br_startoff = bma->offset;
-       bma->got.br_startblock = bma->blkno;
-       bma->got.br_blockcount = bma->length;
-       bma->got.br_state = XFS_EXT_NORM;
-
-       /*
-        * A wasdelay extent has been initialized, so it shouldn't be flagged
-        * as unwritten.
-        */
-       if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
-           xfs_sb_version_hasextflgbit(&mp->m_sb))
-               bma->got.br_state = XFS_EXT_UNWRITTEN;
-
-       if (bma->wasdel)
-               error = xfs_bmap_add_extent_delay_real(bma);
-       else
-               error = xfs_bmap_add_extent_hole_real(bma, whichfork);
-
-       bma->logflags |= tmp_logflags;
-       if (error)
-               return error;
-
-       /*
-        * Update our extent pointer, given that xfs_bmap_add_extent_delay_real
-        * or xfs_bmap_add_extent_hole_real might have merged it into one of
-        * the neighbouring ones.
-        */
-       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
-
-       ASSERT(bma->got.br_startoff <= bma->offset);
-       ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
-              bma->offset + bma->length);
-       ASSERT(bma->got.br_state == XFS_EXT_NORM ||
-              bma->got.br_state == XFS_EXT_UNWRITTEN);
-       return 0;
-}
-
-STATIC int
-xfs_bmapi_convert_unwritten(
-       struct xfs_bmalloca     *bma,
-       struct xfs_bmbt_irec    *mval,
-       xfs_filblks_t           len,
-       int                     flags)
-{
-       int                     whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-                                               XFS_ATTR_FORK : XFS_DATA_FORK;
-       struct xfs_ifork        *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
-       int                     tmp_logflags = 0;
-       int                     error;
-
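-       /*
-        * Only two conversions are possible here: unwritten->real for a
-        * plain write, and real->unwritten when both XFS_BMAPI_PREALLOC
-        * and XFS_BMAPI_CONVERT are set.  Anything else is a no-op.
-        */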
-       /* check if we need to do unwritten->real conversion */
-       if (mval->br_state == XFS_EXT_UNWRITTEN &&
-           (flags & XFS_BMAPI_PREALLOC))
-               return 0;
-
-       /* check if we need to do real->unwritten conversion */
-       if (mval->br_state == XFS_EXT_NORM &&
-           (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) !=
-                       (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
-               return 0;
-
-       /*
-        * Toggle the extent state between written and unwritten.
-        */
-       ASSERT(mval->br_blockcount <= len);
-       if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
-               bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
-                                       bma->ip, whichfork);
-               bma->cur->bc_private.b.firstblock = *bma->firstblock;
-               bma->cur->bc_private.b.flist = bma->flist;
-       }
-       mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
-                               ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
-
-       error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
-                       &bma->cur, mval, bma->firstblock, bma->flist,
-                       &tmp_logflags);
-       bma->logflags |= tmp_logflags;
-       if (error)
-               return error;
-
-       /*
-        * Update our extent pointer, given that
-        * xfs_bmap_add_extent_unwritten_real might have merged it into one
-        * of the neighbouring ones.
-        */
-       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
-
-       /*
-        * We may have combined previously unwritten space with written space,
-        * so generate another request.
-        */
-       if (mval->br_blockcount < len)
-               return EAGAIN;
-       return 0;
-}
-
-/*
- * Map file blocks to filesystem blocks, and allocate blocks or convert the
- * extent state if necessary.  Detailed behaviour is controlled by the flags
- * parameter.  Only allocates blocks from a single allocation group, to avoid
- * locking problems.
- *
- * The returned value in "firstblock" from the first call in a transaction
- * must be remembered and presented to subsequent calls in "firstblock".
- * An upper bound for the number of blocks to be allocated is supplied to
- * the first call in "total"; if no allocation group has that many free
- * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
- */
-int
-xfs_bmapi_write(
-       struct xfs_trans        *tp,            /* transaction pointer */
-       struct xfs_inode        *ip,            /* incore inode */
-       xfs_fileoff_t           bno,            /* starting file offs. mapped */
-       xfs_filblks_t           len,            /* length to map in file */
-       int                     flags,          /* XFS_BMAPI_... */
-       xfs_fsblock_t           *firstblock,    /* first allocated block
-                                                  controls a.g. for allocs */
-       xfs_extlen_t            total,          /* total blocks needed */
-       struct xfs_bmbt_irec    *mval,          /* output: map values */
-       int                     *nmap,          /* i/o: mval size/count */
-       struct xfs_bmap_free    *flist)         /* i/o: list extents to free */
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_ifork        *ifp;
-       struct xfs_bmalloca     bma = { NULL }; /* args for xfs_bmap_alloc */
-       xfs_fileoff_t           end;            /* end of mapped file region */
-       int                     eof;            /* after the end of extents */
-       int                     error;          /* error return */
-       int                     n;              /* current extent index */
-       xfs_fileoff_t           obno;           /* old block number (offset) */
-       int                     whichfork;      /* data or attr fork */
-       char                    inhole;         /* current location is hole in file */
-       char                    wasdelay;       /* old extent was delayed */
-
-#ifdef DEBUG
-       xfs_fileoff_t           orig_bno;       /* original block number value */
-       int                     orig_flags;     /* original flags arg value */
-       xfs_filblks_t           orig_len;       /* original value of len arg */
-       struct xfs_bmbt_irec    *orig_mval;     /* original value of mval */
-       int                     orig_nmap;      /* original value of *nmap */
-
-       orig_bno = bno;
-       orig_len = len;
-       orig_flags = flags;
-       orig_mval = mval;
-       orig_nmap = *nmap;
-#endif
-       whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-               XFS_ATTR_FORK : XFS_DATA_FORK;
-
-       ASSERT(*nmap >= 1);
-       ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
-       ASSERT(!(flags & XFS_BMAPI_IGSTATE));
-       ASSERT(tp != NULL);
-       ASSERT(len > 0);
-       ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-       if (unlikely(XFS_TEST_ERROR(
-           (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
-            XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
-            mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
-               XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-
-       XFS_STATS_INC(xs_blk_mapw);
-
-       if (*firstblock == NULLFSBLOCK) {
-               if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
-                       bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
-               else
-                       bma.minleft = 1;
-       } else {
-               bma.minleft = 0;
-       }
-
-       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-               error = xfs_iread_extents(tp, ip, whichfork);
-               if (error)
-                       goto error0;
-       }
-
-       xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got,
-                               &bma.prev);
-       n = 0;
-       end = bno + len;
-       obno = bno;
-
-       bma.tp = tp;
-       bma.ip = ip;
-       bma.total = total;
-       bma.userdata = 0;
-       bma.flist = flist;
-       bma.firstblock = firstblock;
-
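-       /*
-        * Walk the range: allocate space where we find a hole or a
-        * delayed allocation, then trim, convert and record each
-        * mapping.
-        */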
-       while (bno < end && n < *nmap) {
-               inhole = eof || bma.got.br_startoff > bno;
-               wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
-
-               /*
-                * First, deal with the hole before the allocated space
-                * that we found, if any.
-                */
-               if (inhole || wasdelay) {
-                       bma.eof = eof;
-                       bma.conv = !!(flags & XFS_BMAPI_CONVERT);
-                       bma.wasdel = wasdelay;
-                       bma.offset = bno;
-                       bma.flags = flags;
-
-                       /*
-                        * There's a 32/64 bit type mismatch between the
-                        * allocation length request (which can be 64 bits in
-                        * length) and the bma length request, which is
-                        * xfs_extlen_t and therefore 32 bits. Hence we have to
-                        * check for 32-bit overflows and handle them here.
-                        */
-                       if (len > (xfs_filblks_t)MAXEXTLEN)
-                               bma.length = MAXEXTLEN;
-                       else
-                               bma.length = len;
-
-                       ASSERT(len > 0);
-                       ASSERT(bma.length > 0);
-                       error = xfs_bmapi_allocate(&bma);
-                       if (error)
-                               goto error0;
-                       if (bma.blkno == NULLFSBLOCK)
-                               break;
-               }
-
-               /* Deal with the allocated space we found.  */
-               xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno,
-                                                       end, n, flags);
-
-               /* Execute unwritten extent conversion if necessary */
-               error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
-               if (error == EAGAIN)
-                       continue;
-               if (error)
-                       goto error0;
-
-               /* update the extent map to return */
-               xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
-
-               /*
-                * If we're done, stop now.  Stop when we've allocated
-                * XFS_BMAP_MAX_NMAP extents no matter what.  Otherwise
-                * the transaction may get too big.
-                */
-               if (bno >= end || n >= *nmap || bma.nallocs >= *nmap)
-                       break;
-
-               /* Else go on to the next record. */
-               bma.prev = bma.got;
-               if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) {
-                       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx),
-                                        &bma.got);
-               } else
-                       eof = 1;
-       }
-       *nmap = n;
-
-       /*
-        * Transform from btree to extents, give it cur.
-        */
-       if (xfs_bmap_wants_extents(ip, whichfork)) {
-               int             tmp_logflags = 0;
-
-               ASSERT(bma.cur);
-               error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
-                       &tmp_logflags, whichfork);
-               bma.logflags |= tmp_logflags;
-               if (error)
-                       goto error0;
-       }
-
-       ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
-              XFS_IFORK_NEXTENTS(ip, whichfork) >
-               XFS_IFORK_MAXEXT(ip, whichfork));
-       error = 0;
-error0:
-       /*
-        * Log everything.  Do this after conversion; there's no point in
-        * logging the extent records if we've converted to btree format.
-        */
-       if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
-           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-               bma.logflags &= ~xfs_ilog_fext(whichfork);
-       else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
-                XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
-               bma.logflags &= ~xfs_ilog_fbroot(whichfork);
-       /*
-        * Log whatever the flags say, even on error.  Otherwise we might
-        * miss a case where the data is changed, an error occurs, and the
-        * change isn't logged, so we don't shut down when we should.
-        */
-       if (bma.logflags)
-               xfs_trans_log_inode(tp, ip, bma.logflags);
-
-       if (bma.cur) {
-               if (!error) {
-                       ASSERT(*firstblock == NULLFSBLOCK ||
-                              XFS_FSB_TO_AGNO(mp, *firstblock) ==
-                              XFS_FSB_TO_AGNO(mp,
-                                      bma.cur->bc_private.b.firstblock) ||
-                              (flist->xbf_low &&
-                               XFS_FSB_TO_AGNO(mp, *firstblock) <
-                               XFS_FSB_TO_AGNO(mp,
-                                       bma.cur->bc_private.b.firstblock)));
-                       *firstblock = bma.cur->bc_private.b.firstblock;
-               }
-               xfs_btree_del_cursor(bma.cur,
-                       error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
-       }
-       if (!error)
-               xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
-                       orig_nmap, *nmap);
-       return error;
-}
-
-/*
- * Called by xfs_bmapi to update file extent records and the btree
- * after removing space (or undoing a delayed allocation).
- */
-STATIC int                             /* error */
-xfs_bmap_del_extent(
-       xfs_inode_t             *ip,    /* incore inode pointer */
-       xfs_trans_t             *tp,    /* current transaction pointer */
-       xfs_extnum_t            *idx,   /* extent number to update/delete */
-       xfs_bmap_free_t         *flist, /* list of extents to be freed */
-       xfs_btree_cur_t         *cur,   /* if null, not a btree */
-       xfs_bmbt_irec_t         *del,   /* data to remove from extents */
-       int                     *logflagsp, /* inode logging flags */
-       int                     whichfork) /* data or attr fork */
-{
-       xfs_filblks_t           da_new; /* new delay-alloc indirect blocks */
-       xfs_filblks_t           da_old; /* old delay-alloc indirect blocks */
-       xfs_fsblock_t           del_endblock=0; /* first block past del */
-       xfs_fileoff_t           del_endoff;     /* first offset past del */
-       int                     delay;  /* current block is delayed allocated */
-       int                     do_fx;  /* free extent at end of routine */
-       xfs_bmbt_rec_host_t     *ep;    /* current extent entry pointer */
-       int                     error;  /* error return value */
-       int                     flags;  /* inode logging flags */
-       xfs_bmbt_irec_t         got;    /* current extent entry */
-       xfs_fileoff_t           got_endoff;     /* first offset past got */
-       int                     i;      /* temp state */
-       xfs_ifork_t             *ifp;   /* inode fork pointer */
-       xfs_mount_t             *mp;    /* mount structure */
-       xfs_filblks_t           nblks;  /* quota/sb block count */
-       xfs_bmbt_irec_t         new;    /* new record to be inserted */
-       /* REFERENCED */
-       uint                    qfield; /* quota field to update */
-       xfs_filblks_t           temp;   /* for indirect length calculations */
-       xfs_filblks_t           temp2;  /* for indirect length calculations */
-       int                     state = 0;
-
-       XFS_STATS_INC(xs_del_exlist);
-
-       if (whichfork == XFS_ATTR_FORK)
-               state |= BMAP_ATTRFORK;
-
-       mp = ip->i_mount;
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
-               (uint)sizeof(xfs_bmbt_rec_t)));
-       ASSERT(del->br_blockcount > 0);
-       ep = xfs_iext_get_ext(ifp, *idx);
-       xfs_bmbt_get_all(ep, &got);
-       ASSERT(got.br_startoff <= del->br_startoff);
-       del_endoff = del->br_startoff + del->br_blockcount;
-       got_endoff = got.br_startoff + got.br_blockcount;
-       ASSERT(got_endoff >= del_endoff);
-       delay = isnullstartblock(got.br_startblock);
-       ASSERT(isnullstartblock(del->br_startblock) == delay);
-       flags = 0;
-       qfield = 0;
-       error = 0;
-       /*
-        * If deleting a real allocation, must free up the disk space.
-        */
-       if (!delay) {
-               flags = XFS_ILOG_CORE;
-               /*
-                * Realtime allocation.  Free it and record di_nblocks update.
-                */
-               if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
-                       xfs_fsblock_t   bno;
-                       xfs_filblks_t   len;
-
-                       ASSERT(do_mod(del->br_blockcount,
-                                     mp->m_sb.sb_rextsize) == 0);
-                       ASSERT(do_mod(del->br_startblock,
-                                     mp->m_sb.sb_rextsize) == 0);
-                       bno = del->br_startblock;
-                       len = del->br_blockcount;
-                       do_div(bno, mp->m_sb.sb_rextsize);
-                       do_div(len, mp->m_sb.sb_rextsize);
-                       error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
-                       if (error)
-                               goto done;
-                       do_fx = 0;
-                       nblks = len * mp->m_sb.sb_rextsize;
-                       qfield = XFS_TRANS_DQ_RTBCOUNT;
-               }
-               /*
-                * Ordinary allocation.
-                */
-               else {
-                       do_fx = 1;
-                       nblks = del->br_blockcount;
-                       qfield = XFS_TRANS_DQ_BCOUNT;
-               }
-               /*
-                * Set up del_endblock and cur for later.
-                */
-               del_endblock = del->br_startblock + del->br_blockcount;
-               if (cur) {
-                       if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
-                                       got.br_startblock, got.br_blockcount,
-                                       &i)))
-                               goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-               }
-               da_old = da_new = 0;
-       } else {
-               da_old = startblockval(got.br_startblock);
-               da_new = 0;
-               nblks = 0;
-               do_fx = 0;
-       }
-       /*
-        * Set the flag value to use in the switch statement: bit 1
-        * (value 2) is set if the deletion starts at the start of the
-        * extent, bit 0 (value 1) if it ends at the end of the extent.
-        */
-       switch (((got.br_startoff == del->br_startoff) << 1) |
-               (got_endoff == del_endoff)) {
-       case 3:
-               /*
-                * Matches the whole extent.  Delete the entry.
-                */
-               xfs_iext_remove(ip, *idx, 1,
-                               whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
-               --*idx;
-               if (delay)
-                       break;
-
-               XFS_IFORK_NEXT_SET(ip, whichfork,
-                       XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
-               flags |= XFS_ILOG_CORE;
-               if (!cur) {
-                       flags |= xfs_ilog_fext(whichfork);
-                       break;
-               }
-               if ((error = xfs_btree_delete(cur, &i)))
-                       goto done;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-               break;
-
-       case 2:
-               /*
-                * Deleting the first part of the extent.
-                */
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               xfs_bmbt_set_startoff(ep, del_endoff);
-               temp = got.br_blockcount - del->br_blockcount;
-               xfs_bmbt_set_blockcount(ep, temp);
-               if (delay) {
-                       temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-                               da_old);
-                       xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-                       trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-                       da_new = temp;
-                       break;
-               }
-               xfs_bmbt_set_startblock(ep, del_endblock);
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-               if (!cur) {
-                       flags |= xfs_ilog_fext(whichfork);
-                       break;
-               }
-               if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
-                               got.br_blockcount - del->br_blockcount,
-                               got.br_state)))
-                       goto done;
-               break;
-
-       case 1:
-               /*
-                * Deleting the last part of the extent.
-                */
-               temp = got.br_blockcount - del->br_blockcount;
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(ep, temp);
-               if (delay) {
-                       temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-                               da_old);
-                       xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-                       trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-                       da_new = temp;
-                       break;
-               }
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-               if (!cur) {
-                       flags |= xfs_ilog_fext(whichfork);
-                       break;
-               }
-               if ((error = xfs_bmbt_update(cur, got.br_startoff,
-                               got.br_startblock,
-                               got.br_blockcount - del->br_blockcount,
-                               got.br_state)))
-                       goto done;
-               break;
-
-       case 0:
-               /*
-                * Deleting the middle of the extent.
-                */
-               temp = del->br_startoff - got.br_startoff;
-               trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-               xfs_bmbt_set_blockcount(ep, temp);
-               new.br_startoff = del_endoff;
-               temp2 = got_endoff - del_endoff;
-               new.br_blockcount = temp2;
-               new.br_state = got.br_state;
-               if (!delay) {
-                       new.br_startblock = del_endblock;
-                       flags |= XFS_ILOG_CORE;
-                       if (cur) {
-                               if ((error = xfs_bmbt_update(cur,
-                                               got.br_startoff,
-                                               got.br_startblock, temp,
-                                               got.br_state)))
-                                       goto done;
-                               if ((error = xfs_btree_increment(cur, 0, &i)))
-                                       goto done;
-                               cur->bc_rec.b = new;
-                               error = xfs_btree_insert(cur, &i);
-                               if (error && error != ENOSPC)
-                                       goto done;
-                               /*
-                                * If we get no-space back from the btree insert,
-                                * it tried a split, and we have a zero
-                                * block reservation.
-                                * Fix up our state and return the error.
-                                */
-                               if (error == ENOSPC) {
-                                       /*
-                                        * Reset the cursor, don't trust
-                                        * it after any insert operation.
-                                        */
-                                       if ((error = xfs_bmbt_lookup_eq(cur,
-                                                       got.br_startoff,
-                                                       got.br_startblock,
-                                                       temp, &i)))
-                                               goto done;
-                                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                                       /*
-                                        * Update the btree record back
-                                        * to the original value.
-                                        */
-                                       if ((error = xfs_bmbt_update(cur,
-                                                       got.br_startoff,
-                                                       got.br_startblock,
-                                                       got.br_blockcount,
-                                                       got.br_state)))
-                                               goto done;
-                                       /*
-                                        * Reset the extent record back
-                                        * to the original value.
-                                        */
-                                       xfs_bmbt_set_blockcount(ep,
-                                               got.br_blockcount);
-                                       flags = 0;
-                                       error = XFS_ERROR(ENOSPC);
-                                       goto done;
-                               }
-                               XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       } else
-                               flags |= xfs_ilog_fext(whichfork);
-                       XFS_IFORK_NEXT_SET(ip, whichfork,
-                               XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
-               } else {
-                       ASSERT(whichfork == XFS_DATA_FORK);
-                       temp = xfs_bmap_worst_indlen(ip, temp);
-                       xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-                       temp2 = xfs_bmap_worst_indlen(ip, temp2);
-                       new.br_startblock = nullstartblock((int)temp2);
-                       da_new = temp + temp2;
-                       while (da_new > da_old) {
-                               if (temp) {
-                                       temp--;
-                                       da_new--;
-                                       xfs_bmbt_set_startblock(ep,
-                                               nullstartblock((int)temp));
-                               }
-                               if (da_new == da_old)
-                                       break;
-                               if (temp2) {
-                                       temp2--;
-                                       da_new--;
-                                       new.br_startblock =
-                                               nullstartblock((int)temp2);
-                               }
-                       }
-               }
-               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-               xfs_iext_insert(ip, *idx + 1, 1, &new, state);
-               ++*idx;
-               break;
-       }
-       /*
-        * If we need to, add to the list of extents to be freed.
-        */
-       if (do_fx)
-               xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
-                       mp);
-       /*
-        * Adjust inode # blocks in the file.
-        */
-       if (nblks)
-               ip->i_d.di_nblocks -= nblks;
-       /*
-        * Adjust quota data.
-        */
-       if (qfield)
-               xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
-
-       /*
-        * Account for change in delayed indirect blocks.
-        * Nothing to do for disk quota accounting here.
-        */
-       ASSERT(da_old >= da_new);
-       if (da_old > da_new) {
-               xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-                       (int64_t)(da_old - da_new), 0);
-       }
-done:
-       *logflagsp = flags;
-       return error;
-}
-
-/*
- * Unmap (remove) blocks from a file.
- * If nexts is nonzero then the number of extents to remove is limited to
- * that value.  *done is set once all extents in the given block range
- * have been removed.
- */
-int                                            /* error */
-xfs_bunmapi(
-       xfs_trans_t             *tp,            /* transaction pointer */
-       struct xfs_inode        *ip,            /* incore inode */
-       xfs_fileoff_t           bno,            /* starting offset to unmap */
-       xfs_filblks_t           len,            /* length to unmap in file */
-       int                     flags,          /* misc flags */
-       xfs_extnum_t            nexts,          /* number of extents max */
-       xfs_fsblock_t           *firstblock,    /* first allocated block
-                                                  controls a.g. for allocs */
-       xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
-       int                     *done)          /* set when unmap is complete */
-{
-       xfs_btree_cur_t         *cur;           /* bmap btree cursor */
-       xfs_bmbt_irec_t         del;            /* extent being deleted */
-       int                     eof;            /* is deleting at eof */
-       xfs_bmbt_rec_host_t     *ep;            /* extent record pointer */
-       int                     error;          /* error return value */
-       xfs_extnum_t            extno;          /* extent number in list */
-       xfs_bmbt_irec_t         got;            /* current extent record */
-       xfs_ifork_t             *ifp;           /* inode fork pointer */
-       int                     isrt;           /* freeing in rt area */
-       xfs_extnum_t            lastx;          /* last extent index used */
-       int                     logflags;       /* transaction logging flags */
-       xfs_extlen_t            mod;            /* rt extent offset */
-       xfs_mount_t             *mp;            /* mount structure */
-       xfs_extnum_t            nextents;       /* number of file extents */
-       xfs_bmbt_irec_t         prev;           /* previous extent record */
-       xfs_fileoff_t           start;          /* first file offset deleted */
-       int                     tmp_logflags;   /* partial logging flags */
-       int                     wasdel;         /* was a delayed alloc extent */
-       int                     whichfork;      /* data or attribute fork */
-       xfs_fsblock_t           sum;
-
-       trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
-
-       whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-               XFS_ATTR_FORK : XFS_DATA_FORK;
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       if (unlikely(
-           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
-               XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
-                                ip->i_mount);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-       mp = ip->i_mount;
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-       ASSERT(len > 0);
-       ASSERT(nexts >= 0);
-
-       if (!(ifp->if_flags & XFS_IFEXTENTS) &&
-           (error = xfs_iread_extents(tp, ip, whichfork)))
-               return error;
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       if (nextents == 0) {
-               *done = 1;
-               return 0;
-       }
-       XFS_STATS_INC(xs_blk_unmap);
-       isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
-       start = bno;
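-       /*
-        * Unmapping proceeds backwards from the last block of the range,
-        * so start bno at the end.
-        */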
-       bno = start + len - 1;
-       ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
-               &prev);
-
-       /*
-        * Check to see if the given block number is past the end of the
-        * file; if so, back up to the last block of the last extent.
-        */
-       if (eof) {
-               ep = xfs_iext_get_ext(ifp, --lastx);
-               xfs_bmbt_get_all(ep, &got);
-               bno = got.br_startoff + got.br_blockcount - 1;
-       }
-       logflags = 0;
-       if (ifp->if_flags & XFS_IFBROOT) {
-               ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
-               cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
-               cur->bc_private.b.firstblock = *firstblock;
-               cur->bc_private.b.flist = flist;
-               cur->bc_private.b.flags = 0;
-       } else
-               cur = NULL;
-
-       if (isrt) {
-               /*
-                * Synchronize by locking the bitmap inode.
-                */
-               xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
-               xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
-       }
-
-       extno = 0;
-       while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
-              (nexts == 0 || extno < nexts)) {
-               /*
-                * Is the found extent after a hole in which bno lives?
-                * If so, just back up to the previous extent.
-                */
-               if (got.br_startoff > bno) {
-                       if (--lastx < 0)
-                               break;
-                       ep = xfs_iext_get_ext(ifp, lastx);
-                       xfs_bmbt_get_all(ep, &got);
-               }
-               /*
-                * Is the last block of this extent before the range
-                * we're supposed to delete?  If so, we're done.
-                */
-               bno = XFS_FILEOFF_MIN(bno,
-                       got.br_startoff + got.br_blockcount - 1);
-               if (bno < start)
-                       break;
-               /*
-                * Then deal with the (possibly delayed) allocated space
-                * we found.
-                */
-               ASSERT(ep != NULL);
-               del = got;
-               wasdel = isnullstartblock(del.br_startblock);
-               if (got.br_startoff < start) {
-                       del.br_startoff = start;
-                       del.br_blockcount -= start - got.br_startoff;
-                       if (!wasdel)
-                               del.br_startblock += start - got.br_startoff;
-               }
-               if (del.br_startoff + del.br_blockcount > bno + 1)
-                       del.br_blockcount = bno + 1 - del.br_startoff;
-               sum = del.br_startblock + del.br_blockcount;
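-               /*
-                * Realtime extents can only be freed whole, so first
-                * check whether the end of the deletion is rt-extent
-                * aligned.
-                */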
-               if (isrt &&
-                   (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
-                       /*
-                        * Realtime extent not lined up at the end.
-                        * The extent could have been split into written
-                        * and unwritten pieces, or we could just be
-                        * unmapping part of it.  But we can't really
-                        * get rid of part of a realtime extent.
-                        */
-                       if (del.br_state == XFS_EXT_UNWRITTEN ||
-                           !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
-                               /*
-                                * This piece is unwritten, or we're not
-                                * using unwritten extents.  Skip over it.
-                                */
-                               ASSERT(bno >= mod);
-                               bno -= mod > del.br_blockcount ?
-                                       del.br_blockcount : mod;
-                               if (bno < got.br_startoff) {
-                                       if (--lastx >= 0)
-                                               xfs_bmbt_get_all(xfs_iext_get_ext(
-                                                       ifp, lastx), &got);
-                               }
-                               continue;
-                       }
-                       /*
-                        * It's written, turn it unwritten.
-                        * This is better than zeroing it.
-                        */
-                       ASSERT(del.br_state == XFS_EXT_NORM);
-                       ASSERT(xfs_trans_get_block_res(tp) > 0);
-                       /*
-                        * If this spans a realtime extent boundary,
-                        * chop it back to the start of the one we end at.
-                        */
-                       if (del.br_blockcount > mod) {
-                               del.br_startoff += del.br_blockcount - mod;
-                               del.br_startblock += del.br_blockcount - mod;
-                               del.br_blockcount = mod;
-                       }
-                       del.br_state = XFS_EXT_UNWRITTEN;
-                       error = xfs_bmap_add_extent_unwritten_real(tp, ip,
-                                       &lastx, &cur, &del, firstblock, flist,
-                                       &logflags);
-                       if (error)
-                               goto error0;
-                       goto nodelete;
-               }
-               if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) {
-                       /*
-                        * Realtime extent is lined up at the end but not
-                        * at the front.  We'll get rid of full extents if
-                        * we can.
-                        */
-                       mod = mp->m_sb.sb_rextsize - mod;
-                       if (del.br_blockcount > mod) {
-                               del.br_blockcount -= mod;
-                               del.br_startoff += mod;
-                               del.br_startblock += mod;
-                       } else if ((del.br_startoff == start &&
-                                   (del.br_state == XFS_EXT_UNWRITTEN ||
-                                    xfs_trans_get_block_res(tp) == 0)) ||
-                                  !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
-                               /*
-                                * Can't make it unwritten.  There isn't
-                                * a full extent here so just skip it.
-                                */
-                               ASSERT(bno >= del.br_blockcount);
-                               bno -= del.br_blockcount;
-                               if (got.br_startoff > bno) {
-                                       if (--lastx >= 0) {
-                                               ep = xfs_iext_get_ext(ifp,
-                                                                     lastx);
-                                               xfs_bmbt_get_all(ep, &got);
-                                       }
-                               }
-                               continue;
-                       } else if (del.br_state == XFS_EXT_UNWRITTEN) {
-                               /*
-                                * This one is already unwritten.
-                                * It must have a written left neighbor.
-                                * Unwrite the killed part of that one and
-                                * try again.
-                                */
-                               ASSERT(lastx > 0);
-                               xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
-                                               lastx - 1), &prev);
-                               ASSERT(prev.br_state == XFS_EXT_NORM);
-                               ASSERT(!isnullstartblock(prev.br_startblock));
-                               ASSERT(del.br_startblock ==
-                                      prev.br_startblock + prev.br_blockcount);
-                               if (prev.br_startoff < start) {
-                                       mod = start - prev.br_startoff;
-                                       prev.br_blockcount -= mod;
-                                       prev.br_startblock += mod;
-                                       prev.br_startoff = start;
-                               }
-                               prev.br_state = XFS_EXT_UNWRITTEN;
-                               lastx--;
-                               error = xfs_bmap_add_extent_unwritten_real(tp,
-                                               ip, &lastx, &cur, &prev,
-                                               firstblock, flist, &logflags);
-                               if (error)
-                                       goto error0;
-                               goto nodelete;
-                       } else {
-                               ASSERT(del.br_state == XFS_EXT_NORM);
-                               del.br_state = XFS_EXT_UNWRITTEN;
-                               error = xfs_bmap_add_extent_unwritten_real(tp,
-                                               ip, &lastx, &cur, &del,
-                                               firstblock, flist, &logflags);
-                               if (error)
-                                       goto error0;
-                               goto nodelete;
-                       }
-               }
-               if (wasdel) {
-                       ASSERT(startblockval(del.br_startblock) > 0);
-                       /* Update realtime/data freespace, unreserve quota */
-                       if (isrt) {
-                               xfs_filblks_t rtexts;
-
-                               rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
-                               do_div(rtexts, mp->m_sb.sb_rextsize);
-                               xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
-                                               (int64_t)rtexts, 0);
-                               (void)xfs_trans_reserve_quota_nblks(NULL,
-                                       ip, -((long)del.br_blockcount), 0,
-                                       XFS_QMOPT_RES_RTBLKS);
-                       } else {
-                               xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-                                               (int64_t)del.br_blockcount, 0);
-                               (void)xfs_trans_reserve_quota_nblks(NULL,
-                                       ip, -((long)del.br_blockcount), 0,
-                                       XFS_QMOPT_RES_REGBLKS);
-                       }
-                       ip->i_delayed_blks -= del.br_blockcount;
-                       if (cur)
-                               cur->bc_private.b.flags |=
-                                       XFS_BTCUR_BPRV_WASDEL;
-               } else if (cur)
-                       cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
-               /*
-                * If the directory code is running with no block
-                * reservation, the deleted block is in the middle of its
-                * extent, and the resulting insert of an extent would
-                * cause transformation to btree format, then reject it.
-                * The calling code will then swap blocks around instead.
-                * We have to do this now, rather than waiting for the
-                * conversion to btree format, since the transaction
-                * will be dirty.
-                */
-               if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
-                   XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
-                   XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
-                       XFS_IFORK_MAXEXT(ip, whichfork) &&
-                   del.br_startoff > got.br_startoff &&
-                   del.br_startoff + del.br_blockcount <
-                   got.br_startoff + got.br_blockcount) {
-                       error = XFS_ERROR(ENOSPC);
-                       goto error0;
-               }
-               error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
-                               &tmp_logflags, whichfork);
-               logflags |= tmp_logflags;
-               if (error)
-                       goto error0;
-               bno = del.br_startoff - 1;
-nodelete:
-               /*
-                * If not done, go on to the next (previous) record.
-                */
-               if (bno != (xfs_fileoff_t)-1 && bno >= start) {
-                       if (lastx >= 0) {
-                               ep = xfs_iext_get_ext(ifp, lastx);
-                               if (xfs_bmbt_get_startoff(ep) > bno) {
-                                       if (--lastx >= 0)
-                                               ep = xfs_iext_get_ext(ifp,
-                                                                     lastx);
-                               }
-                               xfs_bmbt_get_all(ep, &got);
-                       }
-                       extno++;
-               }
-       }
-       *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
-
-       /*
-        * Convert to a btree if necessary.
-        */
-       if (xfs_bmap_needs_btree(ip, whichfork)) {
-               ASSERT(cur == NULL);
-               error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
-                       &cur, 0, &tmp_logflags, whichfork);
-               logflags |= tmp_logflags;
-               if (error)
-                       goto error0;
-       }
-       /*
-        * Transform from btree to extents, give it cur.
-        */
-       else if (xfs_bmap_wants_extents(ip, whichfork)) {
-               ASSERT(cur != NULL);
-               error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
-                       whichfork);
-               logflags |= tmp_logflags;
-               if (error)
-                       goto error0;
-       }
-       /*
-        * Transform from extents to local?
-        */
-       error = 0;
-error0:
-       /*
-        * Log everything.  Do this after conversion, there's no point in
-        * logging the extent records if we've converted to btree format.
-        */
-       if ((logflags & xfs_ilog_fext(whichfork)) &&
-           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-               logflags &= ~xfs_ilog_fext(whichfork);
-       else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
-                XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
-               logflags &= ~xfs_ilog_fbroot(whichfork);
-       /*
-        * Log inode even in the error case, if the transaction
-        * is dirty we'll need to shut down the filesystem.
-        */
-       if (logflags)
-               xfs_trans_log_inode(tp, ip, logflags);
-       if (cur) {
-               if (!error) {
-                       *firstblock = cur->bc_private.b.firstblock;
-                       cur->bc_private.b.allocated = 0;
-               }
-               xfs_btree_del_cursor(cur,
-                       error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
-       }
-       return error;
-}
-
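The realtime branches in xfs_bunmapi() above only ever unmap whole realtime extents: the first do_mod() tests whether the end of the deletion range is aligned to sb_rextsize, the second tests the front. A minimal standalone sketch of that alignment arithmetic (hypothetical extent size and range, plain modulo instead of the kernel's do_mod() helper):

        #include <assert.h>
        #include <stdint.h>

        int main(void)
        {
                uint64_t rextsize = 4;          /* hypothetical rt extent size */
                uint64_t startblock = 10, blockcount = 7;

                /* "not lined up at the end": trim back or convert */
                uint64_t end_mod = (startblock + blockcount) % rextsize;
                /* "lined up at the end but not the front": skip ahead */
                uint64_t front_mod = startblock % rextsize;

                assert(end_mod == 1 && front_mod == 2);
                return 0;
        }
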
-/*
- * Shift extent records to the left to cover a hole.
- *
- * The maximum number of extents to be shifted in a single operation
- * is @num_exts, and @current_ext keeps track of the current extent
- * index we have shifted. @offset_shift_fsb is the length by which each
- * extent is shifted. If there is no hole to shift the extents
- * into, this is considered an invalid operation and we abort immediately.
- */
-int
-xfs_bmap_shift_extents(
-       struct xfs_trans        *tp,
-       struct xfs_inode        *ip,
-       int                     *done,
-       xfs_fileoff_t           start_fsb,
-       xfs_fileoff_t           offset_shift_fsb,
-       xfs_extnum_t            *current_ext,
-       xfs_fsblock_t           *firstblock,
-       struct xfs_bmap_free    *flist,
-       int                     num_exts)
-{
-       struct xfs_btree_cur            *cur;
-       struct xfs_bmbt_rec_host        *gotp;
-       struct xfs_bmbt_irec            got;
-       struct xfs_bmbt_irec            left;
-       struct xfs_mount                *mp = ip->i_mount;
-       struct xfs_ifork                *ifp;
-       xfs_extnum_t                    nexts = 0;
-       xfs_fileoff_t                   startoff;
-       int                             error = 0;
-       int                             i;
-       int                             whichfork = XFS_DATA_FORK;
-       int                             logflags;
-       xfs_filblks_t                   blockcount = 0;
-       int                             total_extents;
-
-       if (unlikely(XFS_TEST_ERROR(
-           (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
-            XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
-            mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
-               XFS_ERROR_REPORT("xfs_bmap_shift_extents",
-                                XFS_ERRLEVEL_LOW, mp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       ASSERT(current_ext != NULL);
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-               /* Read in all the extents */
-               error = xfs_iread_extents(tp, ip, whichfork);
-               if (error)
-                       return error;
-       }
-
-       /*
-        * If *current_ext is 0, we need to look up the extent
-        * from which we start shifting and store it in gotp.
-        */
-       if (!*current_ext) {
-               gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
-               /*
-                * gotp can be null in 2 cases: 1) if there are no extents
-                * or 2) start_fsb lies in a hole beyond which there are
-                * no extents. Either way, we are done.
-                */
-               if (!gotp) {
-                       *done = 1;
-                       return 0;
-               }
-       }
-
-       /* We are going to change the core inode */
-       logflags = XFS_ILOG_CORE;
-       if (ifp->if_flags & XFS_IFBROOT) {
-               cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
-               cur->bc_private.b.firstblock = *firstblock;
-               cur->bc_private.b.flist = flist;
-               cur->bc_private.b.flags = 0;
-       } else {
-               cur = NULL;
-               logflags |= XFS_ILOG_DEXT;
-       }
-
-       /*
-        * There may be delalloc extents in the data fork before the range
-        * we are collapsing out, so we cannot use the count of real extents
-        * here. Instead we have to calculate it from the incore fork.
-        */
-       total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
-       while (nexts++ < num_exts && *current_ext < total_extents) {
-
-               gotp = xfs_iext_get_ext(ifp, *current_ext);
-               xfs_bmbt_get_all(gotp, &got);
-               startoff = got.br_startoff - offset_shift_fsb;
-
-               /*
-                * Before shifting an extent into the hole, make sure that
-                * the hole is large enough to accommodate the shift.
-                */
-               if (*current_ext) {
-                       xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
-                                               *current_ext - 1), &left);
-
-                       if (startoff < left.br_startoff + left.br_blockcount)
-                               error = XFS_ERROR(EINVAL);
-               } else if (offset_shift_fsb > got.br_startoff) {
-                       /*
-                        * When the first extent is shifted, offset_shift_fsb
-                        * must not exceed the starting offset of the
-                        * first extent.
-                        */
-                       error = XFS_ERROR(EINVAL);
-               }
-
-               if (error)
-                       goto del_cursor;
-
-               if (cur) {
-                       error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
-                                                  got.br_startblock,
-                                                  got.br_blockcount,
-                                                  &i);
-                       if (error)
-                               goto del_cursor;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
-               }
-
-               /* Check if we can merge 2 adjacent extents */
-               if (*current_ext &&
-                   left.br_startoff + left.br_blockcount == startoff &&
-                   left.br_startblock + left.br_blockcount ==
-                               got.br_startblock &&
-                   left.br_state == got.br_state &&
-                   left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
-                       blockcount = left.br_blockcount +
-                               got.br_blockcount;
-                       xfs_iext_remove(ip, *current_ext, 1, 0);
-                       if (cur) {
-                               error = xfs_btree_delete(cur, &i);
-                               if (error)
-                                       goto del_cursor;
-                               XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
-                       }
-                       XFS_IFORK_NEXT_SET(ip, whichfork,
-                               XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
-                       gotp = xfs_iext_get_ext(ifp, --*current_ext);
-                       xfs_bmbt_get_all(gotp, &got);
-
-                       /* Make cursor point to the extent we will update */
-                       if (cur) {
-                               error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
-                                                          got.br_startblock,
-                                                          got.br_blockcount,
-                                                          &i);
-                               if (error)
-                                       goto del_cursor;
-                               XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
-                       }
-
-                       xfs_bmbt_set_blockcount(gotp, blockcount);
-                       got.br_blockcount = blockcount;
-               } else {
-                       /* We have to update the startoff */
-                       xfs_bmbt_set_startoff(gotp, startoff);
-                       got.br_startoff = startoff;
-               }
-
-               if (cur) {
-                       error = xfs_bmbt_update(cur, got.br_startoff,
-                                               got.br_startblock,
-                                               got.br_blockcount,
-                                               got.br_state);
-                       if (error)
-                               goto del_cursor;
-               }
-
-               (*current_ext)++;
-               total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
-       }
-
-       /* Check if we are done */
-       if (*current_ext == total_extents)
-               *done = 1;
-
-del_cursor:
-       if (cur)
-               xfs_btree_del_cursor(cur,
-                       error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
-
-       xfs_trans_log_inode(tp, ip, logflags);
-       return error;
-}
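A note on usage: xfs_bmap_shift_extents() shifts at most num_exts extents per call and relies on the caller to loop, one transaction at a time, until *done is set. A simplified sketch of that calling pattern, assuming tp, ip, start_fsb and shift_fsb are already set up, and eliding locking and transaction commit details:

        int                     done = 0;
        int                     error;
        xfs_extnum_t            current_ext = 0;
        xfs_fsblock_t           firstblock;
        struct xfs_bmap_free    free_list;

        while (!done) {
                xfs_bmap_init(&free_list, &firstblock);
                error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
                                shift_fsb, &current_ext, &firstblock,
                                &free_list, XFS_BMAP_MAX_SHIFT_EXTENTS);
                if (error)
                        break;
                /* ...commit tp and allocate a new transaction here... */
        }
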
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
deleted file mode 100644 (file)
index b879ca5..0000000
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_BMAP_H__
-#define        __XFS_BMAP_H__
-
-struct getbmap;
-struct xfs_bmbt_irec;
-struct xfs_ifork;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_trans;
-
-extern kmem_zone_t     *xfs_bmap_free_item_zone;
-
-/*
- * List of extents to be free "later".
- * The list is kept sorted on xbf_startblock.
- */
-typedef struct xfs_bmap_free_item
-{
-       xfs_fsblock_t           xbfi_startblock;/* starting fs block number */
-       xfs_extlen_t            xbfi_blockcount;/* number of blocks in extent */
-       struct xfs_bmap_free_item *xbfi_next;   /* link to next entry */
-} xfs_bmap_free_item_t;
-
-/*
- * Header for free extent list.
- *
- * xbf_low is used by the allocator to activate the lowspace algorithm -
- * when free space is running low the extent allocator may choose to
- * allocate an extent from an AG without leaving sufficient space for
- * a btree split when inserting the new extent.  In this case the allocator
- * will enable the lowspace algorithm which is supposed to allow further
- * allocations (such as btree splits and newroots) to allocate from
- * sequential AGs.  In order to avoid locking AGs out of order the lowspace
- * algorithm will start searching for free space from AG 0.  If the correct
- * transaction reservations have been made then this algorithm will eventually
- * find all the space it needs.
- */
-typedef        struct xfs_bmap_free
-{
-       xfs_bmap_free_item_t    *xbf_first;     /* list of to-be-free extents */
-       int                     xbf_count;      /* count of items on list */
-       int                     xbf_low;        /* alloc in low mode */
-} xfs_bmap_free_t;
-
-#define        XFS_BMAP_MAX_NMAP       4
-
-/*
- * Flags for xfs_bmapi_*
- */
-#define XFS_BMAPI_ENTIRE       0x001   /* return entire extent, not trimmed */
-#define XFS_BMAPI_METADATA     0x002   /* mapping metadata not user data */
-#define XFS_BMAPI_ATTRFORK     0x004   /* use attribute fork not data */
-#define XFS_BMAPI_PREALLOC     0x008   /* preallocation op: unwritten space */
-#define XFS_BMAPI_IGSTATE      0x010   /* Ignore state - combine contig. space */
-#define XFS_BMAPI_CONTIG       0x020   /* must allocate only one extent */
-/*
- * Unwritten extent conversion - this needs write cache flushing and no
- * additional allocation alignments. When specified with XFS_BMAPI_PREALLOC
- * it converts from written to unwritten, otherwise it converts from
- * unwritten to written.
- */
-#define XFS_BMAPI_CONVERT      0x040
-
-#define XFS_BMAPI_FLAGS \
-       { XFS_BMAPI_ENTIRE,     "ENTIRE" }, \
-       { XFS_BMAPI_METADATA,   "METADATA" }, \
-       { XFS_BMAPI_ATTRFORK,   "ATTRFORK" }, \
-       { XFS_BMAPI_PREALLOC,   "PREALLOC" }, \
-       { XFS_BMAPI_IGSTATE,    "IGSTATE" }, \
-       { XFS_BMAPI_CONTIG,     "CONTIG" }, \
-       { XFS_BMAPI_CONVERT,    "CONVERT" }
-
-
-static inline int xfs_bmapi_aflag(int w)
-{
-       return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
-}
-
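These flag values are single bits intended to be OR'd together per call. A small hedged example, using only names defined in this header, of requesting an attribute-fork mapping that returns whole extents:

        int flags = xfs_bmapi_aflag(XFS_ATTR_FORK) | XFS_BMAPI_ENTIRE;

        /* 'flags' would then be passed to xfs_bmapi_read(); the ENTIRE
         * bit asks for untrimmed extents rather than ones clipped to
         * the requested range. */
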
-/*
- * Special values for xfs_bmbt_irec_t br_startblock field.
- */
-#define        DELAYSTARTBLOCK         ((xfs_fsblock_t)-1LL)
-#define        HOLESTARTBLOCK          ((xfs_fsblock_t)-2LL)
-
-static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
-{
-       ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
-               (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK);
-}
-
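xfs_bmap_init() pairs with xfs_bmap_cancel() (declared further down) on error paths: the free list accumulates to-be-freed extents during a mapping operation and must be cancelled if the transaction is abandoned. A minimal sketch of the pairing, with the mapping work itself elided:

        xfs_bmap_free_t free_list;
        xfs_fsblock_t   firstblock;
        int             error;

        xfs_bmap_init(&free_list, &firstblock);
        error = 0;      /* ... xfs_bmapi_write()/xfs_bunmapi() here ... */
        if (error)
                xfs_bmap_cancel(&free_list);    /* drop queued extents */
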
-/*
- * Flags for xfs_bmap_add_extent*.
- */
-#define BMAP_LEFT_CONTIG       (1 << 0)
-#define BMAP_RIGHT_CONTIG      (1 << 1)
-#define BMAP_LEFT_FILLING      (1 << 2)
-#define BMAP_RIGHT_FILLING     (1 << 3)
-#define BMAP_LEFT_DELAY                (1 << 4)
-#define BMAP_RIGHT_DELAY       (1 << 5)
-#define BMAP_LEFT_VALID                (1 << 6)
-#define BMAP_RIGHT_VALID       (1 << 7)
-#define BMAP_ATTRFORK          (1 << 8)
-
-#define XFS_BMAP_EXT_FLAGS \
-       { BMAP_LEFT_CONTIG,     "LC" }, \
-       { BMAP_RIGHT_CONTIG,    "RC" }, \
-       { BMAP_LEFT_FILLING,    "LF" }, \
-       { BMAP_RIGHT_FILLING,   "RF" }, \
-       { BMAP_ATTRFORK,        "ATTR" }
-
-
-/*
- * This macro determines how many extents will be shifted in one
- * write transaction. A single shift could require two splits: an
- * extent move on the first and an extent merge on the second, so
- * it is safest to shift only one extent per write transaction.
- */
-#define XFS_BMAP_MAX_SHIFT_EXTENTS     1
-
-#ifdef DEBUG
-void   xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
-               int whichfork, unsigned long caller_ip);
-#define        XFS_BMAP_TRACE_EXLIST(ip,c,w)   \
-       xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
-#else
-#define        XFS_BMAP_TRACE_EXLIST(ip,c,w)
-#endif
-
-int    xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
-void   xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
-void   xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
-               struct xfs_bmap_free *flist, struct xfs_mount *mp);
-void   xfs_bmap_cancel(struct xfs_bmap_free *flist);
-void   xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
-int    xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
-               xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
-int    xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
-               xfs_fileoff_t *last_block, int whichfork);
-int    xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
-               int whichfork);
-int    xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
-int    xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
-               int whichfork);
-int    xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
-               xfs_filblks_t len, struct xfs_bmbt_irec *mval,
-               int *nmap, int flags);
-int    xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
-               xfs_filblks_t len, struct xfs_bmbt_irec *mval,
-               int *nmap, int flags);
-int    xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
-               xfs_fileoff_t bno, xfs_filblks_t len, int flags,
-               xfs_fsblock_t *firstblock, xfs_extlen_t total,
-               struct xfs_bmbt_irec *mval, int *nmap,
-               struct xfs_bmap_free *flist);
-int    xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
-               xfs_fileoff_t bno, xfs_filblks_t len, int flags,
-               xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
-               struct xfs_bmap_free *flist, int *done);
-int    xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
-               xfs_extnum_t num);
-uint   xfs_default_attroffset(struct xfs_inode *ip);
-int    xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
-               int *done, xfs_fileoff_t start_fsb,
-               xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
-               xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
-               int num_exts);
-
-#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
deleted file mode 100644 (file)
index 948836c..0000000
+++ /dev/null
@@ -1,967 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_dinode.h"
-
-/*
- * Determine the extent state.
- */
-/* ARGSUSED */
-STATIC xfs_exntst_t
-xfs_extent_state(
-       xfs_filblks_t           blks,
-       int                     extent_flag)
-{
-       if (extent_flag) {
-               ASSERT(blks != 0);      /* saved for DMIG */
-               return XFS_EXT_UNWRITTEN;
-       }
-       return XFS_EXT_NORM;
-}
-
-/*
- * Convert on-disk form of btree root to in-memory form.
- */
-void
-xfs_bmdr_to_bmbt(
-       struct xfs_inode        *ip,
-       xfs_bmdr_block_t        *dblock,
-       int                     dblocklen,
-       struct xfs_btree_block  *rblock,
-       int                     rblocklen)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       int                     dmxr;
-       xfs_bmbt_key_t          *fkp;
-       __be64                  *fpp;
-       xfs_bmbt_key_t          *tkp;
-       __be64                  *tpp;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb))
-               xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
-                                XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
-                                XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
-       else
-               xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
-                                XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
-                                XFS_BTREE_LONG_PTRS);
-
-       rblock->bb_level = dblock->bb_level;
-       ASSERT(be16_to_cpu(rblock->bb_level) > 0);
-       rblock->bb_numrecs = dblock->bb_numrecs;
-       dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
-       fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
-       tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
-       fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
-       tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
-       dmxr = be16_to_cpu(dblock->bb_numrecs);
-       memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
-       memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
-}
-
-/*
- * Convert a compressed bmap extent record to an uncompressed form.
- * This code must be in sync with the routines xfs_bmbt_get_startoff,
- * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
- */
-STATIC void
-__xfs_bmbt_get_all(
-               __uint64_t l0,
-               __uint64_t l1,
-               xfs_bmbt_irec_t *s)
-{
-       int     ext_flag;
-       xfs_exntst_t st;
-
-       ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
-       s->br_startoff = ((xfs_fileoff_t)l0 &
-                          xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
-#if XFS_BIG_BLKNOS
-       s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
-                          (((xfs_fsblock_t)l1) >> 21);
-#else
-#ifdef DEBUG
-       {
-               xfs_dfsbno_t    b;
-
-               b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
-                   (((xfs_dfsbno_t)l1) >> 21);
-               ASSERT((b >> 32) == 0 || isnulldstartblock(b));
-               s->br_startblock = (xfs_fsblock_t)b;
-       }
-#else  /* !DEBUG */
-       s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
-#endif /* DEBUG */
-#endif /* XFS_BIG_BLKNOS */
-       s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
-       /* This is xfs_extent_state() in-line */
-       if (ext_flag) {
-               ASSERT(s->br_blockcount != 0);  /* saved for DMIG */
-               st = XFS_EXT_UNWRITTEN;
-       } else
-               st = XFS_EXT_NORM;
-       s->br_state = st;
-}
-
-void
-xfs_bmbt_get_all(
-       xfs_bmbt_rec_host_t *r,
-       xfs_bmbt_irec_t *s)
-{
-       __xfs_bmbt_get_all(r->l0, r->l1, s);
-}
-
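The unpacking above reflects the 128-bit extent record layout: bit 63 of l0 is the unwritten flag, the next 54 bits are br_startoff, the low 9 bits of l0 plus the high 43 bits of l1 form the 52-bit br_startblock, and the low 21 bits of l1 are br_blockcount. A self-contained sketch of the same bit slicing (host-endian words, XFS_BIG_BLKNOS layout, arbitrary values):

        #include <assert.h>
        #include <stdint.h>

        int main(void)
        {
                uint64_t l0 = (12345ULL << 9), l1 = (678901ULL << 21) | 42;

                assert((l0 >> 63) == 0);                        /* written */
                assert(((l0 & ~(1ULL << 63)) >> 9) == 12345);   /* startoff */
                assert((((l0 & 0x1ff) << 43) | (l1 >> 21)) == 678901);
                assert((l1 & ((1ULL << 21) - 1)) == 42);        /* blockcount */
                return 0;
        }
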
-/*
- * Extract the blockcount field from an in memory bmap extent record.
- */
-xfs_filblks_t
-xfs_bmbt_get_blockcount(
-       xfs_bmbt_rec_host_t     *r)
-{
-       return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
-}
-
-/*
- * Extract the startblock field from an in memory bmap extent record.
- */
-xfs_fsblock_t
-xfs_bmbt_get_startblock(
-       xfs_bmbt_rec_host_t     *r)
-{
-#if XFS_BIG_BLKNOS
-       return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
-              (((xfs_fsblock_t)r->l1) >> 21);
-#else
-#ifdef DEBUG
-       xfs_dfsbno_t    b;
-
-       b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
-           (((xfs_dfsbno_t)r->l1) >> 21);
-       ASSERT((b >> 32) == 0 || isnulldstartblock(b));
-       return (xfs_fsblock_t)b;
-#else  /* !DEBUG */
-       return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
-#endif /* DEBUG */
-#endif /* XFS_BIG_BLKNOS */
-}
-
-/*
- * Extract the startoff field from an in memory bmap extent record.
- */
-xfs_fileoff_t
-xfs_bmbt_get_startoff(
-       xfs_bmbt_rec_host_t     *r)
-{
-       return ((xfs_fileoff_t)r->l0 &
-                xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
-}
-
-xfs_exntst_t
-xfs_bmbt_get_state(
-       xfs_bmbt_rec_host_t     *r)
-{
-       int     ext_flag;
-
-       ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN));
-       return xfs_extent_state(xfs_bmbt_get_blockcount(r),
-                               ext_flag);
-}
-
-/*
- * Extract the blockcount field from an on disk bmap extent record.
- */
-xfs_filblks_t
-xfs_bmbt_disk_get_blockcount(
-       xfs_bmbt_rec_t  *r)
-{
-       return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
-}
-
-/*
- * Extract the startoff field from a disk format bmap extent record.
- */
-xfs_fileoff_t
-xfs_bmbt_disk_get_startoff(
-       xfs_bmbt_rec_t  *r)
-{
-       return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
-                xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
-}
-
-
-/*
- * Set all the fields in a bmap extent record from the arguments.
- */
-void
-xfs_bmbt_set_allf(
-       xfs_bmbt_rec_host_t     *r,
-       xfs_fileoff_t           startoff,
-       xfs_fsblock_t           startblock,
-       xfs_filblks_t           blockcount,
-       xfs_exntst_t            state)
-{
-       int             extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
-
-       ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
-       ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
-       ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
-
-#if XFS_BIG_BLKNOS
-       ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
-
-       r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
-               ((xfs_bmbt_rec_base_t)startoff << 9) |
-               ((xfs_bmbt_rec_base_t)startblock >> 43);
-       r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
-               ((xfs_bmbt_rec_base_t)blockcount &
-               (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-#else  /* !XFS_BIG_BLKNOS */
-       if (isnullstartblock(startblock)) {
-               r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
-                       ((xfs_bmbt_rec_base_t)startoff << 9) |
-                        (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
-               r->l1 = xfs_mask64hi(11) |
-                         ((xfs_bmbt_rec_base_t)startblock << 21) |
-                         ((xfs_bmbt_rec_base_t)blockcount &
-                          (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-       } else {
-               r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
-                       ((xfs_bmbt_rec_base_t)startoff << 9);
-               r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
-                        ((xfs_bmbt_rec_base_t)blockcount &
-                        (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-       }
-#endif /* XFS_BIG_BLKNOS */
-}
-
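One subtlety in the !XFS_BIG_BLKNOS branch above: a delayed-allocation (isnullstartblock()) extent is encoded by forcing the low 9 bits of l0 and the high 11 bits of l1 to ones, the bit positions that would otherwise carry the upper part of a large block number. A hedged standalone sketch of that marker, assuming the mask helpers reduce to plain shifts:

        #include <assert.h>
        #include <stdint.h>

        static const uint64_t LO9  = (1ULL << 9) - 1;     /* xfs_mask64lo(9)  */
        static const uint64_t HI11 = ~((1ULL << 53) - 1); /* xfs_mask64hi(11) */

        int main(void)
        {
                uint64_t l0 = (100ULL << 9) | LO9;  /* startoff=100, null marker */
                uint64_t l1 = HI11 | 7;             /* blockcount=7 */

                /* decoded startblock has bits set above 2^32, the pattern
                 * the DEBUG branch above allows only for null (delayed)
                 * blocks */
                uint64_t sb = ((l0 & LO9) << 43) | (l1 >> 21);
                assert((sb >> 32) != 0);
                return 0;
        }
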
-/*
- * Set all the fields in a bmap extent record from the uncompressed form.
- */
-void
-xfs_bmbt_set_all(
-       xfs_bmbt_rec_host_t *r,
-       xfs_bmbt_irec_t *s)
-{
-       xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock,
-                            s->br_blockcount, s->br_state);
-}
-
-
-/*
- * Set all the fields in a disk format bmap extent record from the arguments.
- */
-void
-xfs_bmbt_disk_set_allf(
-       xfs_bmbt_rec_t          *r,
-       xfs_fileoff_t           startoff,
-       xfs_fsblock_t           startblock,
-       xfs_filblks_t           blockcount,
-       xfs_exntst_t            state)
-{
-       int                     extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
-
-       ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
-       ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
-       ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
-
-#if XFS_BIG_BLKNOS
-       ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
-
-       r->l0 = cpu_to_be64(
-               ((xfs_bmbt_rec_base_t)extent_flag << 63) |
-                ((xfs_bmbt_rec_base_t)startoff << 9) |
-                ((xfs_bmbt_rec_base_t)startblock >> 43));
-       r->l1 = cpu_to_be64(
-               ((xfs_bmbt_rec_base_t)startblock << 21) |
-                ((xfs_bmbt_rec_base_t)blockcount &
-                 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
-#else  /* !XFS_BIG_BLKNOS */
-       if (isnullstartblock(startblock)) {
-               r->l0 = cpu_to_be64(
-                       ((xfs_bmbt_rec_base_t)extent_flag << 63) |
-                        ((xfs_bmbt_rec_base_t)startoff << 9) |
-                         (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
-               r->l1 = cpu_to_be64(xfs_mask64hi(11) |
-                         ((xfs_bmbt_rec_base_t)startblock << 21) |
-                         ((xfs_bmbt_rec_base_t)blockcount &
-                          (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
-       } else {
-               r->l0 = cpu_to_be64(
-                       ((xfs_bmbt_rec_base_t)extent_flag << 63) |
-                        ((xfs_bmbt_rec_base_t)startoff << 9));
-               r->l1 = cpu_to_be64(
-                       ((xfs_bmbt_rec_base_t)startblock << 21) |
-                        ((xfs_bmbt_rec_base_t)blockcount &
-                         (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
-       }
-#endif /* XFS_BIG_BLKNOS */
-}
-
-/*
- * Set all the fields in a disk format bmap extent record from the
- * uncompressed form.
- */
-STATIC void
-xfs_bmbt_disk_set_all(
-       xfs_bmbt_rec_t  *r,
-       xfs_bmbt_irec_t *s)
-{
-       xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock,
-                                 s->br_blockcount, s->br_state);
-}
-
-/*
- * Set the blockcount field in a bmap extent record.
- */
-void
-xfs_bmbt_set_blockcount(
-       xfs_bmbt_rec_host_t *r,
-       xfs_filblks_t   v)
-{
-       ASSERT((v & xfs_mask64hi(43)) == 0);
-       r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) |
-                 (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21));
-}
-
-/*
- * Set the startblock field in a bmap extent record.
- */
-void
-xfs_bmbt_set_startblock(
-       xfs_bmbt_rec_host_t *r,
-       xfs_fsblock_t   v)
-{
-#if XFS_BIG_BLKNOS
-       ASSERT((v & xfs_mask64hi(12)) == 0);
-       r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
-                 (xfs_bmbt_rec_base_t)(v >> 43);
-       r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
-                 (xfs_bmbt_rec_base_t)(v << 21);
-#else  /* !XFS_BIG_BLKNOS */
-       if (isnullstartblock(v)) {
-               r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
-               r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
-                         ((xfs_bmbt_rec_base_t)v << 21) |
-                         (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-       } else {
-               r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9);
-               r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
-                         (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-       }
-#endif /* XFS_BIG_BLKNOS */
-}
-
-/*
- * Set the startoff field in a bmap extent record.
- */
-void
-xfs_bmbt_set_startoff(
-       xfs_bmbt_rec_host_t *r,
-       xfs_fileoff_t   v)
-{
-       ASSERT((v & xfs_mask64hi(9)) == 0);
-       r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) |
-               ((xfs_bmbt_rec_base_t)v << 9) |
-                 (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
-}
-
-/*
- * Set the extent state field in a bmap extent record.
- */
-void
-xfs_bmbt_set_state(
-       xfs_bmbt_rec_host_t *r,
-       xfs_exntst_t    v)
-{
-       ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
-       if (v == XFS_EXT_NORM)
-               r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN);
-       else
-               r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN);
-}
-
-/*
- * Convert in-memory form of btree root to on-disk form.
- */
-void
-xfs_bmbt_to_bmdr(
-       struct xfs_mount        *mp,
-       struct xfs_btree_block  *rblock,
-       int                     rblocklen,
-       xfs_bmdr_block_t        *dblock,
-       int                     dblocklen)
-{
-       int                     dmxr;
-       xfs_bmbt_key_t          *fkp;
-       __be64                  *fpp;
-       xfs_bmbt_key_t          *tkp;
-       __be64                  *tpp;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
-               ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
-               ASSERT(rblock->bb_u.l.bb_blkno ==
-                      cpu_to_be64(XFS_BUF_DADDR_NULL));
-       } else
-               ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
-       ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO));
-       ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO));
-       ASSERT(rblock->bb_level != 0);
-       dblock->bb_level = rblock->bb_level;
-       dblock->bb_numrecs = rblock->bb_numrecs;
-       dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
-       fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
-       tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
-       fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
-       tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
-       dmxr = be16_to_cpu(dblock->bb_numrecs);
-       memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
-       memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
-}
-
-/*
- * Check extent records, which have just been read, for
- * any bit in the extent flag field. ASSERT on debug
- * kernels, as this condition should not occur.
- * Return an error condition (1) if any flags found,
- * otherwise return 0.
- */
-
-int
-xfs_check_nostate_extents(
-       xfs_ifork_t             *ifp,
-       xfs_extnum_t            idx,
-       xfs_extnum_t            num)
-{
-       for (; num > 0; num--, idx++) {
-               xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
-               if ((ep->l0 >>
-                    (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
-                       ASSERT(0);
-                       return 1;
-               }
-       }
-       return 0;
-}
-
-
-STATIC struct xfs_btree_cur *
-xfs_bmbt_dup_cursor(
-       struct xfs_btree_cur    *cur)
-{
-       struct xfs_btree_cur    *new;
-
-       new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
-                       cur->bc_private.b.ip, cur->bc_private.b.whichfork);
-
-       /*
-        * Copy the firstblock, flist, and flags values,
-        * since init cursor doesn't get them.
-        */
-       new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
-       new->bc_private.b.flist = cur->bc_private.b.flist;
-       new->bc_private.b.flags = cur->bc_private.b.flags;
-
-       return new;
-}
-
-STATIC void
-xfs_bmbt_update_cursor(
-       struct xfs_btree_cur    *src,
-       struct xfs_btree_cur    *dst)
-{
-       ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
-              (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
-       ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
-
-       dst->bc_private.b.allocated += src->bc_private.b.allocated;
-       dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
-
-       src->bc_private.b.allocated = 0;
-}
-
-STATIC int
-xfs_bmbt_alloc_block(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *start,
-       union xfs_btree_ptr     *new,
-       int                     *stat)
-{
-       xfs_alloc_arg_t         args;           /* block allocation args */
-       int                     error;          /* error return value */
-
-       memset(&args, 0, sizeof(args));
-       args.tp = cur->bc_tp;
-       args.mp = cur->bc_mp;
-       args.fsbno = cur->bc_private.b.firstblock;
-       args.firstblock = args.fsbno;
-
-       if (args.fsbno == NULLFSBLOCK) {
-               args.fsbno = be64_to_cpu(start->l);
-               args.type = XFS_ALLOCTYPE_START_BNO;
-               /*
-                * Make sure there is sufficient room left in the AG to
-                * complete a full tree split for an extent insert.  If
-                * we are converting the middle part of an extent then
-                * we may need space for two tree splits.
-                *
-                * We are relying on the caller to make the correct block
-                * reservation for this operation to succeed.  If the
-                * reservation amount is insufficient then we may fail a
-                * block allocation here and corrupt the filesystem.
-                */
-               args.minleft = xfs_trans_get_block_res(args.tp);
-       } else if (cur->bc_private.b.flist->xbf_low) {
-               args.type = XFS_ALLOCTYPE_START_BNO;
-       } else {
-               args.type = XFS_ALLOCTYPE_NEAR_BNO;
-       }
-
-       args.minlen = args.maxlen = args.prod = 1;
-       args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
-       if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
-               error = XFS_ERROR(ENOSPC);
-               goto error0;
-       }
-       error = xfs_alloc_vextent(&args);
-       if (error)
-               goto error0;
-
-       if (args.fsbno == NULLFSBLOCK && args.minleft) {
-               /*
-                * Could not find an AG with enough free space to satisfy
-                * a full btree split.  Try again without minleft and if
-                * successful activate the lowspace algorithm.
-                */
-               args.fsbno = 0;
-               args.type = XFS_ALLOCTYPE_FIRST_AG;
-               args.minleft = 0;
-               error = xfs_alloc_vextent(&args);
-               if (error)
-                       goto error0;
-               cur->bc_private.b.flist->xbf_low = 1;
-       }
-       if (args.fsbno == NULLFSBLOCK) {
-               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-               *stat = 0;
-               return 0;
-       }
-       ASSERT(args.len == 1);
-       cur->bc_private.b.firstblock = args.fsbno;
-       cur->bc_private.b.allocated++;
-       cur->bc_private.b.ip->i_d.di_nblocks++;
-       xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
-       xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
-                       XFS_TRANS_DQ_BCOUNT, 1L);
-
-       new->l = cpu_to_be64(args.fsbno);
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 1;
-       return 0;
-
- error0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-       return error;
-}
-
-STATIC int
-xfs_bmbt_free_block(
-       struct xfs_btree_cur    *cur,
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = cur->bc_mp;
-       struct xfs_inode        *ip = cur->bc_private.b.ip;
-       struct xfs_trans        *tp = cur->bc_tp;
-       xfs_fsblock_t           fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
-
-       xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
-       ip->i_d.di_nblocks--;
-
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-       xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
-       xfs_trans_binval(tp, bp);
-       return 0;
-}
-
-STATIC int
-xfs_bmbt_get_minrecs(
-       struct xfs_btree_cur    *cur,
-       int                     level)
-{
-       if (level == cur->bc_nlevels - 1) {
-               struct xfs_ifork        *ifp;
-
-               ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
-                                   cur->bc_private.b.whichfork);
-
-               return xfs_bmbt_maxrecs(cur->bc_mp,
-                                       ifp->if_broot_bytes, level == 0) / 2;
-       }
-
-       return cur->bc_mp->m_bmap_dmnr[level != 0];
-}
-
-int
-xfs_bmbt_get_maxrecs(
-       struct xfs_btree_cur    *cur,
-       int                     level)
-{
-       if (level == cur->bc_nlevels - 1) {
-               struct xfs_ifork        *ifp;
-
-               ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
-                                   cur->bc_private.b.whichfork);
-
-               return xfs_bmbt_maxrecs(cur->bc_mp,
-                                       ifp->if_broot_bytes, level == 0);
-       }
-
-       return cur->bc_mp->m_bmap_dmxr[level != 0];
-}
-
-/*
- * Get the maximum records we could store in the on-disk format.
- *
- * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
- * for the root node this checks the available space in the dinode fork
- * so that we can resize the in-memory buffer to match it.  After a
- * resize to the maximum size this function returns the same value
- * as xfs_bmbt_get_maxrecs for the root node, too.
- */
-STATIC int
-xfs_bmbt_get_dmaxrecs(
-       struct xfs_btree_cur    *cur,
-       int                     level)
-{
-       if (level != cur->bc_nlevels - 1)
-               return cur->bc_mp->m_bmap_dmxr[level != 0];
-       return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0);
-}
-
-STATIC void
-xfs_bmbt_init_key_from_rec(
-       union xfs_btree_key     *key,
-       union xfs_btree_rec     *rec)
-{
-       key->bmbt.br_startoff =
-               cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
-}
-
-STATIC void
-xfs_bmbt_init_rec_from_key(
-       union xfs_btree_key     *key,
-       union xfs_btree_rec     *rec)
-{
-       ASSERT(key->bmbt.br_startoff != 0);
-
-       xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
-                              0, 0, XFS_EXT_NORM);
-}
-
-STATIC void
-xfs_bmbt_init_rec_from_cur(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_rec     *rec)
-{
-       xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
-}
-
-STATIC void
-xfs_bmbt_init_ptr_from_cur(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *ptr)
-{
-       ptr->l = 0;
-}
-
-STATIC __int64_t
-xfs_bmbt_key_diff(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_key     *key)
-{
-       return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
-                                     cur->bc_rec.b.br_startoff;
-}
-
-static bool
-xfs_bmbt_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
-       unsigned int            level;
-
-       switch (block->bb_magic) {
-       case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
-               if (!xfs_sb_version_hascrc(&mp->m_sb))
-                       return false;
-               if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid))
-                       return false;
-               if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn)
-                       return false;
-               /*
-                * XXX: need a better way of verifying the owner here. Right now
-                * just make sure there has been one set.
-                */
-               if (be64_to_cpu(block->bb_u.l.bb_owner) == 0)
-                       return false;
-               /* fall through */
-       case cpu_to_be32(XFS_BMAP_MAGIC):
-               break;
-       default:
-               return false;
-       }
-
-       /*
-        * numrecs and level verification.
-        *
-        * We don't know what fork we belong to, so just verify that the level
-        * is less than the maximum of the two. Later checks will be more
-        * precise.
-        */
-       level = be16_to_cpu(block->bb_level);
-       if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
-               return false;
-       if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
-               return false;
-
-       /* sibling pointer verification */
-       if (!block->bb_u.l.bb_leftsib ||
-           (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) &&
-            !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))))
-               return false;
-       if (!block->bb_u.l.bb_rightsib ||
-           (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) &&
-            !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))))
-               return false;
-
-       return true;
-}
-
-static void
-xfs_bmbt_read_verify(
-       struct xfs_buf  *bp)
-{
-       if (!xfs_btree_lblock_verify_crc(bp))
-               xfs_buf_ioerror(bp, EFSBADCRC);
-       else if (!xfs_bmbt_verify(bp))
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-       if (bp->b_error) {
-               trace_xfs_btree_corrupt(bp, _RET_IP_);
-               xfs_verifier_error(bp);
-       }
-}
-
-static void
-xfs_bmbt_write_verify(
-       struct xfs_buf  *bp)
-{
-       if (!xfs_bmbt_verify(bp)) {
-               trace_xfs_btree_corrupt(bp, _RET_IP_);
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               xfs_verifier_error(bp);
-               return;
-       }
-       xfs_btree_lblock_calc_crc(bp);
-}
-
-const struct xfs_buf_ops xfs_bmbt_buf_ops = {
-       .verify_read = xfs_bmbt_read_verify,
-       .verify_write = xfs_bmbt_write_verify,
-};
-
-
-#if defined(DEBUG) || defined(XFS_WARN)
-STATIC int
-xfs_bmbt_keys_inorder(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_key     *k1,
-       union xfs_btree_key     *k2)
-{
-       return be64_to_cpu(k1->bmbt.br_startoff) <
-               be64_to_cpu(k2->bmbt.br_startoff);
-}
-
-STATIC int
-xfs_bmbt_recs_inorder(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_rec     *r1,
-       union xfs_btree_rec     *r2)
-{
-       return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
-               xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
-               xfs_bmbt_disk_get_startoff(&r2->bmbt);
-}
-#endif /* DEBUG */
-
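keys_inorder() and recs_inorder() state the leaf invariants these DEBUG hooks enforce: startoff values strictly increase, and each record must end at or before the next one begins. A standalone restatement of both checks with hypothetical values:

        #include <assert.h>
        #include <stdint.h>

        int main(void)
        {
                /* two adjacent records: (startoff, blockcount) */
                uint64_t o1 = 100, c1 = 50, o2 = 150;

                assert(o1 < o2);        /* keys_inorder: strictly increasing */
                assert(o1 + c1 <= o2);  /* recs_inorder: no overlap allowed */
                return 0;
        }
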
-static const struct xfs_btree_ops xfs_bmbt_ops = {
-       .rec_len                = sizeof(xfs_bmbt_rec_t),
-       .key_len                = sizeof(xfs_bmbt_key_t),
-
-       .dup_cursor             = xfs_bmbt_dup_cursor,
-       .update_cursor          = xfs_bmbt_update_cursor,
-       .alloc_block            = xfs_bmbt_alloc_block,
-       .free_block             = xfs_bmbt_free_block,
-       .get_maxrecs            = xfs_bmbt_get_maxrecs,
-       .get_minrecs            = xfs_bmbt_get_minrecs,
-       .get_dmaxrecs           = xfs_bmbt_get_dmaxrecs,
-       .init_key_from_rec      = xfs_bmbt_init_key_from_rec,
-       .init_rec_from_key      = xfs_bmbt_init_rec_from_key,
-       .init_rec_from_cur      = xfs_bmbt_init_rec_from_cur,
-       .init_ptr_from_cur      = xfs_bmbt_init_ptr_from_cur,
-       .key_diff               = xfs_bmbt_key_diff,
-       .buf_ops                = &xfs_bmbt_buf_ops,
-#if defined(DEBUG) || defined(XFS_WARN)
-       .keys_inorder           = xfs_bmbt_keys_inorder,
-       .recs_inorder           = xfs_bmbt_recs_inorder,
-#endif
-};
-
-/*
- * Allocate a new bmap btree cursor.
- */
-struct xfs_btree_cur *                         /* new bmap btree cursor */
-xfs_bmbt_init_cursor(
-       struct xfs_mount        *mp,            /* file system mount point */
-       struct xfs_trans        *tp,            /* transaction pointer */
-       struct xfs_inode        *ip,            /* inode owning the btree */
-       int                     whichfork)      /* data or attr fork */
-{
-       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
-       struct xfs_btree_cur    *cur;
-
-       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
-
-       cur->bc_tp = tp;
-       cur->bc_mp = mp;
-       cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
-       cur->bc_btnum = XFS_BTNUM_BMAP;
-       cur->bc_blocklog = mp->m_sb.sb_blocklog;
-
-       cur->bc_ops = &xfs_bmbt_ops;
-       cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
-       if (xfs_sb_version_hascrc(&mp->m_sb))
-               cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
-       cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
-       cur->bc_private.b.ip = ip;
-       cur->bc_private.b.firstblock = NULLFSBLOCK;
-       cur->bc_private.b.flist = NULL;
-       cur->bc_private.b.allocated = 0;
-       cur->bc_private.b.flags = 0;
-       cur->bc_private.b.whichfork = whichfork;
-
-       return cur;
-}
-
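Cursor lifetime throughout this diff follows an init/use/delete pattern: xfs_bmbt_init_cursor() leaves firstblock as NULLFSBLOCK and flist NULL, the caller fills both in, and teardown goes through xfs_btree_del_cursor() with an error-dependent flag. A hedged sketch of that pattern (mp, tp, ip, firstblock and flist assumed caller-provided), mirroring the callers above:

        struct xfs_btree_cur    *cur;
        int                     error = 0;

        cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
        cur->bc_private.b.firstblock = *firstblock;
        cur->bc_private.b.flist = flist;
        /* ... xfs_bmbt_lookup_eq()/xfs_bmbt_update() calls ... */
        xfs_btree_del_cursor(cur,
                        error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
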
-/*
- * Calculate number of records in a bmap btree block.
- */
-int
-xfs_bmbt_maxrecs(
-       struct xfs_mount        *mp,
-       int                     blocklen,
-       int                     leaf)
-{
-       blocklen -= XFS_BMBT_BLOCK_LEN(mp);
-
-       if (leaf)
-               return blocklen / sizeof(xfs_bmbt_rec_t);
-       return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
-}
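
As a worked example of the record-count arithmetic above, a minimal stand-alone C sketch; the 72-byte long-form CRC header and the 16/8/8-byte record, key and pointer sizes are assumptions for illustration, not values taken from the XFS headers:

#include <stdio.h>

#define LBLOCK_CRC_LEN  72      /* assumed v5 long-form header size */
#define REC_LEN         16      /* assumed on-disk record size */
#define KEY_LEN          8      /* assumed on-disk key size */
#define PTR_LEN          8      /* assumed on-disk pointer size */

/* mirrors the xfs_bmbt_maxrecs() arithmetic above */
static int bmbt_maxrecs(int blocklen, int leaf)
{
        blocklen -= LBLOCK_CRC_LEN;             /* payload space in the block */
        if (leaf)
                return blocklen / REC_LEN;      /* leaf blocks hold records */
        return blocklen / (KEY_LEN + PTR_LEN);  /* nodes hold key/ptr pairs */
}

int main(void)
{
        /* 4096-byte blocks: (4096 - 72) / 16 = 251 entries either way */
        printf("leaf %d, node %d\n",
               bmbt_maxrecs(4096, 1), bmbt_maxrecs(4096, 0));
        return 0;
}
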
-
-/*
- * Calculate number of records in a bmap btree inode root.
- */
-int
-xfs_bmdr_maxrecs(
-       int                     blocklen,
-       int                     leaf)
-{
-       blocklen -= sizeof(xfs_bmdr_block_t);
-
-       if (leaf)
-               return blocklen / sizeof(xfs_bmdr_rec_t);
-       return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
-}
-
-/*
- * Change the owner of a btree format fork of the inode passed in. Change it to
- * the owner that is passed in so that we can change owners before or after
- * we switch forks between inodes. The operation that the caller is doing will
- * determine whether it needs to change owner before or after the switch.
- *
- * For demand paged transactional modification, the fork switch should be done
- * after reading in all the blocks, modifying them and pinning them in the
- * transaction. For modification when the buffers are already pinned in memory,
- * the fork switch can be done before changing the owner as we won't need to
- * validate the owner until the btree buffers are unpinned and writes can occur
- * again.
- *
- * For recovery based ownership change, there is no transactional context and
- * so a buffer list must be supplied so that we can record the buffers that we
- * modified for the caller to issue IO on.
- */
-int
-xfs_bmbt_change_owner(
-       struct xfs_trans        *tp,
-       struct xfs_inode        *ip,
-       int                     whichfork,
-       xfs_ino_t               new_owner,
-       struct list_head        *buffer_list)
-{
-       struct xfs_btree_cur    *cur;
-       int                     error;
-
-       ASSERT(tp || buffer_list);
-       ASSERT(!(tp && buffer_list));
-       if (whichfork == XFS_DATA_FORK)
-               ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
-       else
-               ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
-
-       cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
-       if (!cur)
-               return ENOMEM;
-
-       error = xfs_btree_change_owner(cur, new_owner, buffer_list);
-       xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
-       return error;
-}
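
A hedged sketch of the calling contract described in the comment above: callers supply exactly one of a transaction or a buffer list, never both (the stand-in types and names are invented):

#include <assert.h>
#include <stddef.h>

struct trans    { int dummy; };         /* stand-in for struct xfs_trans */
struct buf_list { int dummy; };         /* stand-in for the buffer list */

static void change_owner(struct trans *tp, struct buf_list *buffer_list)
{
        /* transactional path passes tp; recovery path passes buffer_list */
        assert(tp || buffer_list);      /* one context must be supplied */
        assert(!(tp && buffer_list));   /* ...but never both at once */
}

int main(void)
{
        struct buf_list bl = { 0 };

        change_owner(NULL, &bl);        /* recovery-style invocation */
        return 0;
}
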
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
deleted file mode 100644 (file)
index 819a8a4..0000000
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_BMAP_BTREE_H__
-#define __XFS_BMAP_BTREE_H__
-
-struct xfs_btree_cur;
-struct xfs_btree_block;
-struct xfs_mount;
-struct xfs_inode;
-struct xfs_trans;
-
-/*
- * Extent state and extent format macros.
- */
-#define XFS_EXTFMT_INODE(x)    \
-       (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \
-               XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
-#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN)
-
-/*
- * Btree block header size depends on a superblock flag.
- */
-#define XFS_BMBT_BLOCK_LEN(mp) \
-       (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
-               XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
-
-#define XFS_BMBT_REC_ADDR(mp, block, index) \
-       ((xfs_bmbt_rec_t *) \
-               ((char *)(block) + \
-                XFS_BMBT_BLOCK_LEN(mp) + \
-                ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
-
-#define XFS_BMBT_KEY_ADDR(mp, block, index) \
-       ((xfs_bmbt_key_t *) \
-               ((char *)(block) + \
-                XFS_BMBT_BLOCK_LEN(mp) + \
-                ((index) - 1) * sizeof(xfs_bmbt_key_t)))
-
-#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
-       ((xfs_bmbt_ptr_t *) \
-               ((char *)(block) + \
-                XFS_BMBT_BLOCK_LEN(mp) + \
-                (maxrecs) * sizeof(xfs_bmbt_key_t) + \
-                ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
-
-#define XFS_BMDR_REC_ADDR(block, index) \
-       ((xfs_bmdr_rec_t *) \
-               ((char *)(block) + \
-                sizeof(struct xfs_bmdr_block) + \
-                ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
-
-#define XFS_BMDR_KEY_ADDR(block, index) \
-       ((xfs_bmdr_key_t *) \
-               ((char *)(block) + \
-                sizeof(struct xfs_bmdr_block) + \
-                ((index) - 1) * sizeof(xfs_bmdr_key_t)))
-
-#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
-       ((xfs_bmdr_ptr_t *) \
-               ((char *)(block) + \
-                sizeof(struct xfs_bmdr_block) + \
-                (maxrecs) * sizeof(xfs_bmdr_key_t) + \
-                ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
-
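
All the *_ADDR macros above share one pattern: skip the block header, then index a fixed-size array starting from one. A generic hedged rendering (the helper name and the 72-byte header in the demo are invented/assumed):

#include <stdio.h>
#include <stddef.h>

/* 1-based entry addressing, as in the XFS_BMBT_*_ADDR macros above */
static void *btree_entry_addr(void *block, size_t hdr_len,
                              size_t entry_len, int index)
{
        return (char *)block + hdr_len + (size_t)(index - 1) * entry_len;
}

int main(void)
{
        char block[4096];

        /* key 2 of a block with an assumed 72-byte header and 8-byte keys */
        printf("offset %ld\n",
               (long)((char *)btree_entry_addr(block, 72, 8, 2) - block));
        return 0;
}
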
-/*
- * These are to be used when we know the size of the block and
- * we don't have a cursor.
- */
-#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
-       XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
-
-#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
-       (int)(XFS_BMBT_BLOCK_LEN(mp) + \
-              ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
-
-#define XFS_BMAP_BROOT_SPACE(mp, bb) \
-       (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))
-#define XFS_BMDR_SPACE_CALC(nrecs) \
-       (int)(sizeof(xfs_bmdr_block_t) + \
-              ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
-#define XFS_BMAP_BMDR_SPACE(bb) \
-       (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
-
-/*
- * Maximum number of bmap btree levels.
- */
-#define XFS_BM_MAXLEVELS(mp,w)         ((mp)->m_bm_maxlevels[(w)])
-
-/*
- * Prototypes for xfs_bmap.c to call.
- */
-extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
-                       struct xfs_btree_block *, int);
-extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
-extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
-extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
-extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
-extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
-
-extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
-extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
-
-extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
-extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
-                       xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
-extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v);
-extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
-extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
-extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
-
-extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
-                       xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
-
-extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
-                       xfs_bmdr_block_t *, int);
-
-extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
-extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
-extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
-
-extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
-                                int whichfork, xfs_ino_t new_owner,
-                                struct list_head *buffer_list);
-
-extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
-               struct xfs_trans *, struct xfs_inode *, int);
-
-#endif /* __XFS_BMAP_BTREE_H__ */
index 64731ef3324d4b44a938aeac30fc3b816d890222..2f1e30d39a3540f9d545e8ab9c5afd52af0047a9 100644 (file)
@@ -133,7 +133,7 @@ xfs_bmap_finish(
                        mp = ntp->t_mountp;
                        if (!XFS_FORCED_SHUTDOWN(mp))
                                xfs_force_shutdown(mp,
-                                                  (error == EFSCORRUPTED) ?
+                                                  (error == -EFSCORRUPTED) ?
                                                   SHUTDOWN_CORRUPT_INCORE :
                                                   SHUTDOWN_META_IO_ERROR);
                        return error;
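
The sign flips in this and the following hunks reflect XFS's switch from positive internal error codes wrapped in XFS_ERROR() to the kernel-wide convention of returning negative errnos directly; a minimal sketch of that convention:

#include <errno.h>
#include <stdio.h>

/* kernel convention: 0 on success, negative errno on failure */
static int do_io(int failed)
{
        if (failed)
                return -EIO;    /* previously: return XFS_ERROR(EIO); */
        return 0;
}

int main(void)
{
        int error = do_io(1);

        if (error)
                printf("error %d (-EIO)\n", error);
        return 0;
}
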
@@ -365,7 +365,7 @@ xfs_bmap_count_tree(
                        xfs_trans_brelse(tp, bp);
                        XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
                                         XFS_ERRLEVEL_LOW, mp);
-                       return XFS_ERROR(EFSCORRUPTED);
+                       return -EFSCORRUPTED;
                }
                xfs_trans_brelse(tp, bp);
        } else {
@@ -425,14 +425,14 @@ xfs_bmap_count_blocks(
        ASSERT(level > 0);
        pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
        bno = be64_to_cpu(*pp);
-       ASSERT(bno != NULLDFSBNO);
+       ASSERT(bno != NULLFSBLOCK);
        ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
        ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
 
        if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
                XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
                                 mp);
-               return XFS_ERROR(EFSCORRUPTED);
+               return -EFSCORRUPTED;
        }
 
        return 0;
@@ -524,13 +524,13 @@ xfs_getbmap(
                        if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
                            ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
                            ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
-                               return XFS_ERROR(EINVAL);
+                               return -EINVAL;
                } else if (unlikely(
                           ip->i_d.di_aformat != 0 &&
                           ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
                        XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
                                         ip->i_mount);
-                       return XFS_ERROR(EFSCORRUPTED);
+                       return -EFSCORRUPTED;
                }
 
                prealloced = 0;
@@ -539,7 +539,7 @@ xfs_getbmap(
                if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
                    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
                    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
 
                if (xfs_get_extsz_hint(ip) ||
                    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
@@ -559,26 +559,26 @@ xfs_getbmap(
                bmv->bmv_entries = 0;
                return 0;
        } else if (bmv->bmv_length < 0) {
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        }
 
        nex = bmv->bmv_count - 1;
        if (nex <= 0)
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        bmvend = bmv->bmv_offset + bmv->bmv_length;
 
 
        if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
-               return XFS_ERROR(ENOMEM);
+               return -ENOMEM;
        out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
        if (!out)
-               return XFS_ERROR(ENOMEM);
+               return -ENOMEM;
 
        xfs_ilock(ip, XFS_IOLOCK_SHARED);
        if (whichfork == XFS_DATA_FORK) {
                if (!(iflags & BMV_IF_DELALLOC) &&
                    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
-                       error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
+                       error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
                        if (error)
                                goto out_unlock_iolock;
 
@@ -611,7 +611,7 @@ xfs_getbmap(
        /*
         * Allocate enough space to handle "subnex" maps at a time.
         */
-       error = ENOMEM;
+       error = -ENOMEM;
        subnex = 16;
        map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
        if (!map)
@@ -809,7 +809,7 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
         * have speculative prealloc/delalloc blocks to remove.
         */
        if (VFS_I(ip)->i_size == 0 &&
-           VN_CACHED(VFS_I(ip)) == 0 &&
+           VFS_I(ip)->i_mapping->nrpages == 0 &&
            ip->i_delayed_blks == 0)
                return false;
 
@@ -882,7 +882,7 @@ xfs_free_eofblocks(
                if (need_iolock) {
                        if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
                                xfs_trans_cancel(tp, 0);
-                               return EAGAIN;
+                               return -EAGAIN;
                        }
                }
 
@@ -955,14 +955,14 @@ xfs_alloc_file_space(
        trace_xfs_alloc_file_space(ip);
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        error = xfs_qm_dqattach(ip, 0);
        if (error)
                return error;
 
        if (len <= 0)
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        rt = XFS_IS_REALTIME_INODE(ip);
        extsz = xfs_get_extsz_hint(ip);
@@ -1028,7 +1028,7 @@ xfs_alloc_file_space(
                        /*
                         * Free the transaction structure.
                         */
-                       ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+                       ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        break;
                }
@@ -1065,7 +1065,7 @@ xfs_alloc_file_space(
                allocated_fsb = imapp->br_blockcount;
 
                if (nimaps == 0) {
-                       error = XFS_ERROR(ENOSPC);
+                       error = -ENOSPC;
                        break;
                }
 
@@ -1126,7 +1126,7 @@ xfs_zero_remaining_bytes(
                                        mp->m_rtdev_targp : mp->m_ddev_targp,
                                  BTOBB(mp->m_sb.sb_blocksize), 0);
        if (!bp)
-               return XFS_ERROR(ENOMEM);
+               return -ENOMEM;
 
        xfs_buf_unlock(bp);
 
@@ -1158,7 +1158,7 @@ xfs_zero_remaining_bytes(
                XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
 
                if (XFS_FORCED_SHUTDOWN(mp)) {
-                       error = XFS_ERROR(EIO);
+                       error = -EIO;
                        break;
                }
                xfs_buf_iorequest(bp);
@@ -1176,7 +1176,7 @@ xfs_zero_remaining_bytes(
                XFS_BUF_WRITE(bp);
 
                if (XFS_FORCED_SHUTDOWN(mp)) {
-                       error = XFS_ERROR(EIO);
+                       error = -EIO;
                        break;
                }
                xfs_buf_iorequest(bp);
@@ -1234,7 +1234,7 @@ xfs_free_file_space(
 
        rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
        ioffset = offset & ~(rounding - 1);
-       error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+       error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
                                              ioffset, -1);
        if (error)
                goto out;
@@ -1315,7 +1315,7 @@ xfs_free_file_space(
                        /*
                         * Free the transaction structure.
                         */
-                       ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+                       ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        break;
                }
@@ -1557,14 +1557,14 @@ xfs_swap_extents_check_format(
        /* Should never get a local format */
        if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
            tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
-               return EINVAL;
+               return -EINVAL;
 
        /*
         * if the target inode has fewer extents than the temporary inode then
         * why did userspace call us?
         */
        if (ip->i_d.di_nextents < tip->i_d.di_nextents)
-               return EINVAL;
+               return -EINVAL;
 
        /*
         * if the target inode is in extent form and the temp inode is in btree
@@ -1573,19 +1573,19 @@ xfs_swap_extents_check_format(
         */
        if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
            tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
-               return EINVAL;
+               return -EINVAL;
 
        /* Check temp in extent form to max in target */
        if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
            XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
                        XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
-               return EINVAL;
+               return -EINVAL;
 
        /* Check target in extent form to max in temp */
        if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
            XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
                        XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
-               return EINVAL;
+               return -EINVAL;
 
        /*
         * If we are in a btree format, check that the temp root block will fit
@@ -1599,25 +1599,49 @@ xfs_swap_extents_check_format(
        if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
                if (XFS_IFORK_BOFF(ip) &&
                    XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
-                       return EINVAL;
+                       return -EINVAL;
                if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
                    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
-                       return EINVAL;
+                       return -EINVAL;
        }
 
        /* Reciprocal target->temp btree format checks */
        if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
                if (XFS_IFORK_BOFF(tip) &&
                    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
-                       return EINVAL;
+                       return -EINVAL;
                if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
                    XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
-                       return EINVAL;
+                       return -EINVAL;
        }
 
        return 0;
 }
 
+int
+xfs_swap_extent_flush(
+       struct xfs_inode        *ip)
+{
+       int     error;
+
+       error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+       if (error)
+               return error;
+       truncate_pagecache_range(VFS_I(ip), 0, -1);
+
+       /* Verify O_DIRECT semantics: the page cache must now be empty */
+       if (VFS_I(ip)->i_mapping->nrpages)
+               return -EINVAL;
+
+       /*
+        * Don't try to swap extents on mmap()d files because we can't lock
+        * out races against page faults safely.
+        */
+       if (mapping_mapped(VFS_I(ip)->i_mapping))
+               return -EBUSY;
+       return 0;
+}
+
 int
 xfs_swap_extents(
        xfs_inode_t     *ip,    /* target inode */
@@ -1633,51 +1657,57 @@ xfs_swap_extents(
        int             aforkblks = 0;
        int             taforkblks = 0;
        __uint64_t      tmp;
+       int             lock_flags;
 
        tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
        if (!tempifp) {
-               error = XFS_ERROR(ENOMEM);
+               error = -ENOMEM;
                goto out;
        }
 
        /*
-        * we have to do two separate lock calls here to keep lockdep
-        * happy. If we try to get all the locks in one call, lock will
-        * report false positives when we drop the ILOCK and regain them
-        * below.
+        * Lock up the inodes against other IO and truncation to begin with.
+        * Then we can safely ensure the inodes are flushed and have no page
+        * cache. Once we have done this we can take the ilocks and do the rest
+        * of the checks.
         */
+       lock_flags = XFS_IOLOCK_EXCL;
        xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
-       xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
 
        /* Verify that both files have the same format */
        if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
-               error = XFS_ERROR(EINVAL);
+               error = -EINVAL;
                goto out_unlock;
        }
 
        /* Verify both files are either real-time or non-realtime */
        if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
-               error = XFS_ERROR(EINVAL);
+               error = -EINVAL;
                goto out_unlock;
        }
 
-       error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
+       error = xfs_swap_extent_flush(ip);
+       if (error)
+               goto out_unlock;
+       error = xfs_swap_extent_flush(tip);
        if (error)
                goto out_unlock;
-       truncate_pagecache_range(VFS_I(tip), 0, -1);
 
-       /* Verify O_DIRECT for ftmp */
-       if (VN_CACHED(VFS_I(tip)) != 0) {
-               error = XFS_ERROR(EINVAL);
+       tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+       if (error) {
+               xfs_trans_cancel(tp, 0);
                goto out_unlock;
        }
+       xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+       lock_flags |= XFS_ILOCK_EXCL;
 
        /* Verify all data are being swapped */
        if (sxp->sx_offset != 0 ||
            sxp->sx_length != ip->i_d.di_size ||
            sxp->sx_length != tip->i_d.di_size) {
-               error = XFS_ERROR(EFAULT);
-               goto out_unlock;
+               error = -EFAULT;
+               goto out_trans_cancel;
        }
 
        trace_xfs_swap_extent_before(ip, 0);
@@ -1689,7 +1719,7 @@ xfs_swap_extents(
                xfs_notice(mp,
                    "%s: inode 0x%llx format is incompatible for exchanging.",
                                __func__, ip->i_ino);
-               goto out_unlock;
+               goto out_trans_cancel;
        }
 
        /*
@@ -1703,43 +1733,9 @@ xfs_swap_extents(
            (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
            (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
            (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
-               error = XFS_ERROR(EBUSY);
-               goto out_unlock;
-       }
-
-       /* We need to fail if the file is memory mapped.  Once we have tossed
-        * all existing pages, the page fault will have no option
-        * but to go to the filesystem for pages. By making the page fault call
-        * vop_read (or write in the case of autogrow) they block on the iolock
-        * until we have switched the extents.
-        */
-       if (VN_MAPPED(VFS_I(ip))) {
-               error = XFS_ERROR(EBUSY);
-               goto out_unlock;
-       }
-
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       xfs_iunlock(tip, XFS_ILOCK_EXCL);
-
-       /*
-        * There is a race condition here since we gave up the
-        * ilock.  However, the data fork will not change since
-        * we have the iolock (locked for truncation too) so we
-        * are safe.  We don't really care if non-io related
-        * fields change.
-        */
-       truncate_pagecache_range(VFS_I(ip), 0, -1);
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
-       if (error) {
-               xfs_iunlock(ip,  XFS_IOLOCK_EXCL);
-               xfs_iunlock(tip, XFS_IOLOCK_EXCL);
-               xfs_trans_cancel(tp, 0);
-               goto out;
+               error = -EBUSY;
+               goto out_trans_cancel;
        }
-       xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
-
        /*
         * Count the number of extended attribute blocks
         */
@@ -1757,8 +1753,8 @@ xfs_swap_extents(
                        goto out_trans_cancel;
        }
 
-       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-       xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, lock_flags);
+       xfs_trans_ijoin(tp, tip, lock_flags);
 
        /*
         * Before we've swapped the forks, lets set the owners of the forks
@@ -1887,8 +1883,8 @@ out:
        return error;
 
 out_unlock:
-       xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-       xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+       xfs_iunlock(ip, lock_flags);
+       xfs_iunlock(tip, lock_flags);
        goto out;
 
 out_trans_cancel:
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
deleted file mode 100644 (file)
index cf893bc..0000000
+++ /dev/null
@@ -1,4069 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_buf_item.h"
-#include "xfs_btree.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_alloc.h"
-
-/*
- * Cursor allocation zone.
- */
-kmem_zone_t    *xfs_btree_cur_zone;
-
-/*
- * Btree magic numbers.
- */
-static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
-       { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
-         XFS_FIBT_MAGIC },
-       { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
-         XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
-};
-#define xfs_btree_magic(cur) \
-       xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
-
-
-STATIC int                             /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lblock(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       struct xfs_btree_block  *block, /* btree long form block pointer */
-       int                     level,  /* level of the btree block */
-       struct xfs_buf          *bp)    /* buffer for block, if any */
-{
-       int                     lblock_ok = 1; /* block passes checks */
-       struct xfs_mount        *mp;    /* file system mount point */
-
-       mp = cur->bc_mp;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               lblock_ok = lblock_ok &&
-                       uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
-                       block->bb_u.l.bb_blkno == cpu_to_be64(
-                               bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
-       }
-
-       lblock_ok = lblock_ok &&
-               be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
-               be16_to_cpu(block->bb_level) == level &&
-               be16_to_cpu(block->bb_numrecs) <=
-                       cur->bc_ops->get_maxrecs(cur, level) &&
-               block->bb_u.l.bb_leftsib &&
-               (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
-                XFS_FSB_SANITY_CHECK(mp,
-                       be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
-               block->bb_u.l.bb_rightsib &&
-               (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
-                XFS_FSB_SANITY_CHECK(mp,
-                       be64_to_cpu(block->bb_u.l.bb_rightsib)));
-
-       if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
-                       XFS_ERRTAG_BTREE_CHECK_LBLOCK,
-                       XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
-               if (bp)
-                       trace_xfs_btree_corrupt(bp, _RET_IP_);
-               XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-       return 0;
-}
-
-STATIC int                             /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sblock(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       struct xfs_btree_block  *block, /* btree short form block pointer */
-       int                     level,  /* level of the btree block */
-       struct xfs_buf          *bp)    /* buffer containing block */
-{
-       struct xfs_mount        *mp;    /* file system mount point */
-       struct xfs_buf          *agbp;  /* buffer for ag. freespace struct */
-       struct xfs_agf          *agf;   /* ag. freespace structure */
-       xfs_agblock_t           agflen; /* native ag. freespace length */
-       int                     sblock_ok = 1; /* block passes checks */
-
-       mp = cur->bc_mp;
-       agbp = cur->bc_private.a.agbp;
-       agf = XFS_BUF_TO_AGF(agbp);
-       agflen = be32_to_cpu(agf->agf_length);
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               sblock_ok = sblock_ok &&
-                       uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
-                       block->bb_u.s.bb_blkno == cpu_to_be64(
-                               bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
-       }
-
-       sblock_ok = sblock_ok &&
-               be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
-               be16_to_cpu(block->bb_level) == level &&
-               be16_to_cpu(block->bb_numrecs) <=
-                       cur->bc_ops->get_maxrecs(cur, level) &&
-               (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
-                be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
-               block->bb_u.s.bb_leftsib &&
-               (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
-                be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
-               block->bb_u.s.bb_rightsib;
-
-       if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
-                       XFS_ERRTAG_BTREE_CHECK_SBLOCK,
-                       XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
-               if (bp)
-                       trace_xfs_btree_corrupt(bp, _RET_IP_);
-               XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-       return 0;
-}
-
-/*
- * Debug routine: check that block header is ok.
- */
-int
-xfs_btree_check_block(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       struct xfs_btree_block  *block, /* generic btree block pointer */
-       int                     level,  /* level of the btree block */
-       struct xfs_buf          *bp)    /* buffer containing block, if any */
-{
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-               return xfs_btree_check_lblock(cur, block, level, bp);
-       else
-               return xfs_btree_check_sblock(cur, block, level, bp);
-}
-
-/*
- * Check that (long) pointer is ok.
- */
-int                                    /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lptr(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_dfsbno_t            bno,    /* btree block disk address */
-       int                     level)  /* btree block level */
-{
-       XFS_WANT_CORRUPTED_RETURN(
-               level > 0 &&
-               bno != NULLDFSBNO &&
-               XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
-       return 0;
-}
-
-#ifdef DEBUG
-/*
- * Check that (short) pointer is ok.
- */
-STATIC int                             /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sptr(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_agblock_t           bno,    /* btree block disk address */
-       int                     level)  /* btree block level */
-{
-       xfs_agblock_t           agblocks = cur->bc_mp->m_sb.sb_agblocks;
-
-       XFS_WANT_CORRUPTED_RETURN(
-               level > 0 &&
-               bno != NULLAGBLOCK &&
-               bno != 0 &&
-               bno < agblocks);
-       return 0;
-}
-
-/*
- * Check that block ptr is ok.
- */
-STATIC int                             /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_ptr(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       union xfs_btree_ptr     *ptr,   /* btree block disk address */
-       int                     index,  /* offset from ptr to check */
-       int                     level)  /* btree block level */
-{
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-               return xfs_btree_check_lptr(cur,
-                               be64_to_cpu((&ptr->l)[index]), level);
-       } else {
-               return xfs_btree_check_sptr(cur,
-                               be32_to_cpu((&ptr->s)[index]), level);
-       }
-}
-#endif
-
-/*
- * Calculate CRC on the whole btree block and stuff it into the
- * long-form btree header.
- *
- * Prior to calculating the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modification was that made
- * it to disk.
- */
-void
-xfs_btree_lblock_calc_crc(
-       struct xfs_buf          *bp)
-{
-       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-
-       if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
-               return;
-       if (bip)
-               block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-       xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
-}
-
-bool
-xfs_btree_lblock_verify_crc(
-       struct xfs_buf          *bp)
-{
-       if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
-               return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
-
-       return true;
-}
-
-/*
- * Calculate CRC on the whole btree block and stuff it into the
- * short-form btree header.
- *
- * Prior to calculating the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modification was that made
- * it to disk.
- */
-void
-xfs_btree_sblock_calc_crc(
-       struct xfs_buf          *bp)
-{
-       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-
-       if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
-               return;
-       if (bip)
-               block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-       xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
-}
-
-bool
-xfs_btree_sblock_verify_crc(
-       struct xfs_buf          *bp)
-{
-       if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
-               return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
-
-       return true;
-}
-
-/*
- * Delete the btree cursor.
- */
-void
-xfs_btree_del_cursor(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       int             error)          /* del because of error */
-{
-       int             i;              /* btree level */
-
-       /*
-        * Clear the buffer pointers, and release the buffers.
-        * If we're doing this in the face of an error, we
-        * need to make sure to inspect all of the entries
-        * in the bc_bufs array for buffers to be unlocked.
-        * This is because some of the btree code works from
-        * level n down to 0, and if we get an error along
-        * the way we won't have initialized all the entries
-        * down to 0.
-        */
-       for (i = 0; i < cur->bc_nlevels; i++) {
-               if (cur->bc_bufs[i])
-                       xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
-               else if (!error)
-                       break;
-       }
-       /*
-        * Can't free a bmap cursor without having dealt with the
-        * allocated indirect blocks' accounting.
-        */
-       ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
-              cur->bc_private.b.allocated == 0);
-       /*
-        * Free the cursor.
-        */
-       kmem_zone_free(xfs_btree_cur_zone, cur);
-}
-
-/*
- * Duplicate the btree cursor.
- * Allocate a new one, copy the record, re-get the buffers.
- */
-int                                    /* error */
-xfs_btree_dup_cursor(
-       xfs_btree_cur_t *cur,           /* input cursor */
-       xfs_btree_cur_t **ncur)         /* output cursor */
-{
-       xfs_buf_t       *bp;            /* btree block's buffer pointer */
-       int             error;          /* error return value */
-       int             i;              /* level number of btree block */
-       xfs_mount_t     *mp;            /* mount structure for filesystem */
-       xfs_btree_cur_t *new;           /* new cursor value */
-       xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
-
-       tp = cur->bc_tp;
-       mp = cur->bc_mp;
-
-       /*
-        * Allocate a new cursor like the old one.
-        */
-       new = cur->bc_ops->dup_cursor(cur);
-
-       /*
-        * Copy the record currently in the cursor.
-        */
-       new->bc_rec = cur->bc_rec;
-
-       /*
- * For each level of the current cursor, re-get the buffer and copy the ptr value.
-        */
-       for (i = 0; i < new->bc_nlevels; i++) {
-               new->bc_ptrs[i] = cur->bc_ptrs[i];
-               new->bc_ra[i] = cur->bc_ra[i];
-               bp = cur->bc_bufs[i];
-               if (bp) {
-                       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-                                                  XFS_BUF_ADDR(bp), mp->m_bsize,
-                                                  0, &bp,
-                                                  cur->bc_ops->buf_ops);
-                       if (error) {
-                               xfs_btree_del_cursor(new, error);
-                               *ncur = NULL;
-                               return error;
-                       }
-               }
-               new->bc_bufs[i] = bp;
-       }
-       *ncur = new;
-       return 0;
-}
-
-/*
- * XFS btree block layout and addressing:
- *
- * There are two types of blocks in the btree: leaf and non-leaf blocks.
- *
- * A leaf block starts with a header, followed by records containing
- * the values.  A non-leaf block also starts with the same header, and
- * then contains lookup keys followed by an equal number of pointers
- * to the btree blocks at the previous level.
- *
- *             +--------+-------+-------+-------+-------+-------+-------+
- * Leaf:       | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
- *             +--------+-------+-------+-------+-------+-------+-------+
- *
- *             +--------+-------+-------+-------+-------+-------+-------+
- * Non-Leaf:   | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
- *             +--------+-------+-------+-------+-------+-------+-------+
- *
- * The header is called struct xfs_btree_block for reasons better left unknown
- * and comes in different versions for short (32bit) and long (64bit) block
- * pointers.  The record and key structures are defined by the btree instances
- * and opaque to the btree core.  The block pointers are simple disk endian
- * integers, available in a short (32bit) and long (64bit) variant.
- *
- * The helpers below calculate the offset of a given record, key or pointer
- * into a btree block (xfs_btree_*_offset) or return a pointer to the given
- * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
- * inside the btree block is done using indices starting at one, not zero!
- */
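
To make the one-based addressing concrete, a small hedged sketch; the 72-byte header and 16-byte record size are assumed values for illustration:

#include <stdio.h>

int main(void)
{
        const int hdr = 72, rec_len = 16;       /* assumed sizes */
        int n;

        /* record n starts at hdr + (n - 1) * rec_len; indices start at 1 */
        for (n = 1; n <= 3; n++)
                printf("rec %d at byte %d\n", n, hdr + (n - 1) * rec_len);
        return 0;
}
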
-
-/*
- * Return size of the btree block header for this btree instance.
- */
-static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
-{
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-               if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
-                       return XFS_BTREE_LBLOCK_CRC_LEN;
-               return XFS_BTREE_LBLOCK_LEN;
-       }
-       if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
-               return XFS_BTREE_SBLOCK_CRC_LEN;
-       return XFS_BTREE_SBLOCK_LEN;
-}
-
-/*
- * Return size of btree block pointers for this btree instance.
- */
-static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
-{
-       return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
-               sizeof(__be64) : sizeof(__be32);
-}
-
-/*
- * Calculate offset of the n-th record in a btree block.
- */
-STATIC size_t
-xfs_btree_rec_offset(
-       struct xfs_btree_cur    *cur,
-       int                     n)
-{
-       return xfs_btree_block_len(cur) +
-               (n - 1) * cur->bc_ops->rec_len;
-}
-
-/*
- * Calculate offset of the n-th key in a btree block.
- */
-STATIC size_t
-xfs_btree_key_offset(
-       struct xfs_btree_cur    *cur,
-       int                     n)
-{
-       return xfs_btree_block_len(cur) +
-               (n - 1) * cur->bc_ops->key_len;
-}
-
-/*
- * Calculate offset of the n-th block pointer in a btree block.
- */
-STATIC size_t
-xfs_btree_ptr_offset(
-       struct xfs_btree_cur    *cur,
-       int                     n,
-       int                     level)
-{
-       return xfs_btree_block_len(cur) +
-               cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
-               (n - 1) * xfs_btree_ptr_len(cur);
-}
-
-/*
- * Return a pointer to the n-th record in the btree block.
- */
-STATIC union xfs_btree_rec *
-xfs_btree_rec_addr(
-       struct xfs_btree_cur    *cur,
-       int                     n,
-       struct xfs_btree_block  *block)
-{
-       return (union xfs_btree_rec *)
-               ((char *)block + xfs_btree_rec_offset(cur, n));
-}
-
-/*
- * Return a pointer to the n-th key in the btree block.
- */
-STATIC union xfs_btree_key *
-xfs_btree_key_addr(
-       struct xfs_btree_cur    *cur,
-       int                     n,
-       struct xfs_btree_block  *block)
-{
-       return (union xfs_btree_key *)
-               ((char *)block + xfs_btree_key_offset(cur, n));
-}
-
-/*
- * Return a pointer to the n-th block pointer in the btree block.
- */
-STATIC union xfs_btree_ptr *
-xfs_btree_ptr_addr(
-       struct xfs_btree_cur    *cur,
-       int                     n,
-       struct xfs_btree_block  *block)
-{
-       int                     level = xfs_btree_get_level(block);
-
-       ASSERT(block->bb_level != 0);
-
-       return (union xfs_btree_ptr *)
-               ((char *)block + xfs_btree_ptr_offset(cur, n, level));
-}
-
-/*
- * Get the root block which is stored in the inode.
- *
- * For now this btree implementation assumes the btree root is always
- * stored in the if_broot field of an inode fork.
- */
-STATIC struct xfs_btree_block *
-xfs_btree_get_iroot(
-       struct xfs_btree_cur    *cur)
-{
-       struct xfs_ifork        *ifp;
-
-       ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
-       return (struct xfs_btree_block *)ifp->if_broot;
-}
-
-/*
- * Retrieve the block pointer from the cursor at the given level.
- * This may be an inode btree root or from a buffer.
- */
-STATIC struct xfs_btree_block *                /* generic btree block pointer */
-xfs_btree_get_block(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       int                     level,  /* level in btree */
-       struct xfs_buf          **bpp)  /* buffer containing the block */
-{
-       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-           (level == cur->bc_nlevels - 1)) {
-               *bpp = NULL;
-               return xfs_btree_get_iroot(cur);
-       }
-
-       *bpp = cur->bc_bufs[level];
-       return XFS_BUF_TO_BLOCK(*bpp);
-}
-
-/*
- * Get a buffer for the block, return it with no data read.
- * Long-form addressing.
- */
-xfs_buf_t *                            /* buffer for fsbno */
-xfs_btree_get_bufl(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_fsblock_t   fsbno,          /* file system block number */
-       uint            lock)           /* lock flags for get_buf */
-{
-       xfs_daddr_t             d;              /* real disk block address */
-
-       ASSERT(fsbno != NULLFSBLOCK);
-       d = XFS_FSB_TO_DADDR(mp, fsbno);
-       return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
-}
-
-/*
- * Get a buffer for the block, return it with no data read.
- * Short-form addressing.
- */
-xfs_buf_t *                            /* buffer for agno/agbno */
-xfs_btree_get_bufs(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_agnumber_t  agno,           /* allocation group number */
-       xfs_agblock_t   agbno,          /* allocation group block number */
-       uint            lock)           /* lock flags for get_buf */
-{
-       xfs_daddr_t             d;              /* real disk block address */
-
-       ASSERT(agno != NULLAGNUMBER);
-       ASSERT(agbno != NULLAGBLOCK);
-       d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-       return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
-}
-
-/*
- * Check for the cursor referring to the last block at the given level.
- */
-int                                    /* 1=is last block, 0=not last block */
-xfs_btree_islastblock(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level)  /* level to check */
-{
-       struct xfs_btree_block  *block; /* generic btree block pointer */
-       xfs_buf_t               *bp;    /* buffer containing block */
-
-       block = xfs_btree_get_block(cur, level, &bp);
-       xfs_btree_check_block(cur, block, level, bp);
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-               return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO);
-       else
-               return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
-}
-
-/*
- * Change the cursor to point to the first record at the given level.
- * Other levels are unaffected.
- */
-STATIC int                             /* success=1, failure=0 */
-xfs_btree_firstrec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level)  /* level to change */
-{
-       struct xfs_btree_block  *block; /* generic btree block pointer */
-       xfs_buf_t               *bp;    /* buffer containing block */
-
-       /*
-        * Get the block pointer for this level.
-        */
-       block = xfs_btree_get_block(cur, level, &bp);
-       xfs_btree_check_block(cur, block, level, bp);
-       /*
-        * It's empty, there is no such record.
-        */
-       if (!block->bb_numrecs)
-               return 0;
-       /*
-        * Set the ptr value to 1, that's the first record/key.
-        */
-       cur->bc_ptrs[level] = 1;
-       return 1;
-}
-
-/*
- * Change the cursor to point to the last record in the current block
- * at the given level.  Other levels are unaffected.
- */
-STATIC int                             /* success=1, failure=0 */
-xfs_btree_lastrec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level)  /* level to change */
-{
-       struct xfs_btree_block  *block; /* generic btree block pointer */
-       xfs_buf_t               *bp;    /* buffer containing block */
-
-       /*
-        * Get the block pointer for this level.
-        */
-       block = xfs_btree_get_block(cur, level, &bp);
-       xfs_btree_check_block(cur, block, level, bp);
-       /*
-        * It's empty, there is no such record.
-        */
-       if (!block->bb_numrecs)
-               return 0;
-       /*
-        * Set the ptr value to numrecs, that's the last record/key.
-        */
-       cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
-       return 1;
-}
-
-/*
- * Compute first and last byte offsets for the fields given.
- * Interprets the offsets table, which contains struct field offsets.
- */
-void
-xfs_btree_offsets(
-       __int64_t       fields,         /* bitmask of fields */
-       const short     *offsets,       /* table of field offsets */
-       int             nbits,          /* number of bits to inspect */
-       int             *first,         /* output: first byte offset */
-       int             *last)          /* output: last byte offset */
-{
-       int             i;              /* current bit number */
-       __int64_t       imask;          /* mask for current bit number */
-
-       ASSERT(fields != 0);
-       /*
-        * Find the lowest bit, so the first byte offset.
-        */
-       for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
-               if (imask & fields) {
-                       *first = offsets[i];
-                       break;
-               }
-       }
-       /*
-        * Find the highest bit, so the last byte offset.
-        */
-       for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
-               if (imask & fields) {
-                       *last = offsets[i + 1] - 1;
-                       break;
-               }
-       }
-}
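
A hedged usage sketch of the scan implemented above: with an offsets table ending in a one-past-the-end sentinel, a mask selecting the first two fields yields the byte range covering both (the struct, table, and names are invented):

#include <stddef.h>
#include <stdio.h>

struct demo { int a; long b; char c; };

/* offsets[i] = byte offset of field i, ending in a one-past-the-end sentinel */
static const short demo_offsets[] = {
        offsetof(struct demo, a),
        offsetof(struct demo, b),
        offsetof(struct demo, c),
        sizeof(struct demo),
};

/* same scan as xfs_btree_offsets() above */
static void field_offsets(long long fields, const short *offsets, int nbits,
                          int *first, int *last)
{
        long long imask;
        int i;

        for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
                if (imask & fields) {
                        *first = offsets[i];
                        break;
                }
        }
        for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
                if (imask & fields) {
                        *last = offsets[i + 1] - 1;
                        break;
                }
        }
}

int main(void)
{
        int first, last;

        field_offsets(0x3, demo_offsets, 3, &first, &last); /* fields a and b */
        printf("bytes %d..%d\n", first, last);  /* 0 .. offsetof(c) - 1 */
        return 0;
}
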
-
-/*
- * Get a buffer for the block, return it read in.
- * Long-form addressing.
- */
-int
-xfs_btree_read_bufl(
-       struct xfs_mount        *mp,            /* file system mount point */
-       struct xfs_trans        *tp,            /* transaction pointer */
-       xfs_fsblock_t           fsbno,          /* file system block number */
-       uint                    lock,           /* lock flags for read_buf */
-       struct xfs_buf          **bpp,          /* buffer for fsbno */
-       int                     refval,         /* ref count value for buffer */
-       const struct xfs_buf_ops *ops)
-{
-       struct xfs_buf          *bp;            /* return value */
-       xfs_daddr_t             d;              /* real disk block address */
-       int                     error;
-
-       ASSERT(fsbno != NULLFSBLOCK);
-       d = XFS_FSB_TO_DADDR(mp, fsbno);
-       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
-                                  mp->m_bsize, lock, &bp, ops);
-       if (error)
-               return error;
-       if (bp)
-               xfs_buf_set_ref(bp, refval);
-       *bpp = bp;
-       return 0;
-}
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Long-form addressing.
- */
-/* ARGSUSED */
-void
-xfs_btree_reada_bufl(
-       struct xfs_mount        *mp,            /* file system mount point */
-       xfs_fsblock_t           fsbno,          /* file system block number */
-       xfs_extlen_t            count,          /* count of filesystem blocks */
-       const struct xfs_buf_ops *ops)
-{
-       xfs_daddr_t             d;
-
-       ASSERT(fsbno != NULLFSBLOCK);
-       d = XFS_FSB_TO_DADDR(mp, fsbno);
-       xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
-}
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Short-form addressing.
- */
-/* ARGSUSED */
-void
-xfs_btree_reada_bufs(
-       struct xfs_mount        *mp,            /* file system mount point */
-       xfs_agnumber_t          agno,           /* allocation group number */
-       xfs_agblock_t           agbno,          /* allocation group block number */
-       xfs_extlen_t            count,          /* count of filesystem blocks */
-       const struct xfs_buf_ops *ops)
-{
-       xfs_daddr_t             d;
-
-       ASSERT(agno != NULLAGNUMBER);
-       ASSERT(agbno != NULLAGBLOCK);
-       d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-       xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
-}
-
-STATIC int
-xfs_btree_readahead_lblock(
-       struct xfs_btree_cur    *cur,
-       int                     lr,
-       struct xfs_btree_block  *block)
-{
-       int                     rval = 0;
-       xfs_dfsbno_t            left = be64_to_cpu(block->bb_u.l.bb_leftsib);
-       xfs_dfsbno_t            right = be64_to_cpu(block->bb_u.l.bb_rightsib);
-
-       if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
-               xfs_btree_reada_bufl(cur->bc_mp, left, 1,
-                                    cur->bc_ops->buf_ops);
-               rval++;
-       }
-
-       if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
-               xfs_btree_reada_bufl(cur->bc_mp, right, 1,
-                                    cur->bc_ops->buf_ops);
-               rval++;
-       }
-
-       return rval;
-}
-
-STATIC int
-xfs_btree_readahead_sblock(
-       struct xfs_btree_cur    *cur,
-       int                     lr,
-       struct xfs_btree_block *block)
-{
-       int                     rval = 0;
-       xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
-       xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
-
-
-       if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
-               xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-                                    left, 1, cur->bc_ops->buf_ops);
-               rval++;
-       }
-
-       if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
-               xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-                                    right, 1, cur->bc_ops->buf_ops);
-               rval++;
-       }
-
-       return rval;
-}
-
-/*
- * Read-ahead btree blocks, at the given level.
- * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
- */
-STATIC int
-xfs_btree_readahead(
-       struct xfs_btree_cur    *cur,           /* btree cursor */
-       int                     lev,            /* level in btree */
-       int                     lr)             /* left/right bits */
-{
-       struct xfs_btree_block  *block;
-
-       /*
-        * No readahead needed if we are at the root level and the
-        * btree root is stored in the inode.
-        */
-       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-           (lev == cur->bc_nlevels - 1))
-               return 0;
-
-       if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
-               return 0;
-
-       cur->bc_ra[lev] |= lr;
-       block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
-
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-               return xfs_btree_readahead_lblock(cur, lr, block);
-       return xfs_btree_readahead_sblock(cur, lr, block);
-}
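
The early return above is the standard "all requested bits already set" idiom: readahead bits accumulate in bc_ra[lev], so each sibling direction is prefetched at most once per cursor position. Spelled out stand-alone (names invented):

#include <stdio.h>

/* true iff every bit of "want" is already present in "have" */
static int already_set(unsigned have, unsigned want)
{
        return (have | want) == have;
}

int main(void)
{
        printf("%d %d\n", already_set(0x3, 0x1), already_set(0x1, 0x2));
        return 0;
}
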
-
-STATIC xfs_daddr_t
-xfs_btree_ptr_to_daddr(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *ptr)
-{
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-               ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
-
-               return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
-       } else {
-               ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
-               ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
-
-               return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
-                                       be32_to_cpu(ptr->s));
-       }
-}
-
-/*
- * Readahead @count btree blocks at the given @ptr location.
- *
- * We don't need to care about long or short form btrees here as we have a
- * method of converting the ptr directly to a daddr available to us.
- */
-STATIC void
-xfs_btree_readahead_ptr(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *ptr,
-       xfs_extlen_t            count)
-{
-       xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
-                         xfs_btree_ptr_to_daddr(cur, ptr),
-                         cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
-}
-
-/*
- * Set the buffer for level "lev" in the cursor to bp, releasing
- * any previous buffer.
- */
-STATIC void
-xfs_btree_setbuf(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     lev,    /* level in btree */
-       xfs_buf_t               *bp)    /* new buffer to set */
-{
-       struct xfs_btree_block  *b;     /* btree block */
-
-       if (cur->bc_bufs[lev])
-               xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
-       cur->bc_bufs[lev] = bp;
-       cur->bc_ra[lev] = 0;
-
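-       /*
-        * If the new block has no sibling on one side, pre-set that
-        * direction's readahead bit so xfs_btree_readahead() never
-        * issues I/O past the edge of the tree.
-        */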
-       b = XFS_BUF_TO_BLOCK(bp);
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-               if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO))
-                       cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
-               if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO))
-                       cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
-       } else {
-               if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
-                       cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
-               if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
-                       cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
-       }
-}
-
-STATIC int
-xfs_btree_ptr_is_null(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *ptr)
-{
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-               return ptr->l == cpu_to_be64(NULLDFSBNO);
-       else
-               return ptr->s == cpu_to_be32(NULLAGBLOCK);
-}
-
-STATIC void
-xfs_btree_set_ptr_null(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *ptr)
-{
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-               ptr->l = cpu_to_be64(NULLDFSBNO);
-       else
-               ptr->s = cpu_to_be32(NULLAGBLOCK);
-}
-
-/*
- * Get/set/init sibling pointers
- */
-STATIC void
-xfs_btree_get_sibling(
-       struct xfs_btree_cur    *cur,
-       struct xfs_btree_block  *block,
-       union xfs_btree_ptr     *ptr,
-       int                     lr)
-{
-       ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
-
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-               if (lr == XFS_BB_RIGHTSIB)
-                       ptr->l = block->bb_u.l.bb_rightsib;
-               else
-                       ptr->l = block->bb_u.l.bb_leftsib;
-       } else {
-               if (lr == XFS_BB_RIGHTSIB)
-                       ptr->s = block->bb_u.s.bb_rightsib;
-               else
-                       ptr->s = block->bb_u.s.bb_leftsib;
-       }
-}
-
-STATIC void
-xfs_btree_set_sibling(
-       struct xfs_btree_cur    *cur,
-       struct xfs_btree_block  *block,
-       union xfs_btree_ptr     *ptr,
-       int                     lr)
-{
-       ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
-
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-               if (lr == XFS_BB_RIGHTSIB)
-                       block->bb_u.l.bb_rightsib = ptr->l;
-               else
-                       block->bb_u.l.bb_leftsib = ptr->l;
-       } else {
-               if (lr == XFS_BB_RIGHTSIB)
-                       block->bb_u.s.bb_rightsib = ptr->s;
-               else
-                       block->bb_u.s.bb_leftsib = ptr->s;
-       }
-}
-
-void
-xfs_btree_init_block_int(
-       struct xfs_mount        *mp,
-       struct xfs_btree_block  *buf,
-       xfs_daddr_t             blkno,
-       __u32                   magic,
-       __u16                   level,
-       __u16                   numrecs,
-       __u64                   owner,
-       unsigned int            flags)
-{
-       buf->bb_magic = cpu_to_be32(magic);
-       buf->bb_level = cpu_to_be16(level);
-       buf->bb_numrecs = cpu_to_be16(numrecs);
-
-       if (flags & XFS_BTREE_LONG_PTRS) {
-               buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
-               buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
-               if (flags & XFS_BTREE_CRC_BLOCKS) {
-                       buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
-                       buf->bb_u.l.bb_owner = cpu_to_be64(owner);
-                       uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
-                       buf->bb_u.l.bb_pad = 0;
-                       buf->bb_u.l.bb_lsn = 0;
-               }
-       } else {
-               /* owner is a 32 bit value on short blocks */
-               __u32 __owner = (__u32)owner;
-
-               buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-               buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-               if (flags & XFS_BTREE_CRC_BLOCKS) {
-                       buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
-                       buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
-                       uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
-                       buf->bb_u.s.bb_lsn = 0;
-               }
-       }
-}
-
-void
-xfs_btree_init_block(
-       struct xfs_mount *mp,
-       struct xfs_buf  *bp,
-       __u32           magic,
-       __u16           level,
-       __u16           numrecs,
-       __u64           owner,
-       unsigned int    flags)
-{
-       xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
-                                magic, level, numrecs, owner, flags);
-}
-
-STATIC void
-xfs_btree_init_block_cur(
-       struct xfs_btree_cur    *cur,
-       struct xfs_buf          *bp,
-       int                     level,
-       int                     numrecs)
-{
-       __u64 owner;
-
-       /*
-        * We can pull the owner from the cursor right now as the different
-        * owners align directly with the pointer size of the btree. This may
-        * change in the future, but is safe for current users of the generic
-        * btree code.
-        */
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-               owner = cur->bc_private.b.ip->i_ino;
-       else
-               owner = cur->bc_private.a.agno;
-
-       xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
-                                xfs_btree_magic(cur), level, numrecs,
-                                owner, cur->bc_flags);
-}
-
-/*
- * Return true if block is the rightmost leaf block in the btree and
- * we need to track updates to its last record.  The decision
- * will be further refined in the update_lastrec method.
- */
-STATIC int
-xfs_btree_is_lastrec(
-       struct xfs_btree_cur    *cur,
-       struct xfs_btree_block  *block,
-       int                     level)
-{
-       union xfs_btree_ptr     ptr;
-
-       if (level > 0)
-               return 0;
-       if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
-               return 0;
-
-       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
-       if (!xfs_btree_ptr_is_null(cur, &ptr))
-               return 0;
-       return 1;
-}
-
-STATIC void
-xfs_btree_buf_to_ptr(
-       struct xfs_btree_cur    *cur,
-       struct xfs_buf          *bp,
-       union xfs_btree_ptr     *ptr)
-{
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-               ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
-                                       XFS_BUF_ADDR(bp)));
-       else {
-               ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
-                                       XFS_BUF_ADDR(bp)));
-       }
-}
-
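-/*
- * Set the buffer's LRU reference count hint according to the btree type,
- * so the more heavily reused btree buffers are held in the buffer cache
- * longer.  (The exact reclaim behaviour depends on the buffer cache LRU
- * implementation.)
- */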
-STATIC void
-xfs_btree_set_refs(
-       struct xfs_btree_cur    *cur,
-       struct xfs_buf          *bp)
-{
-       switch (cur->bc_btnum) {
-       case XFS_BTNUM_BNO:
-       case XFS_BTNUM_CNT:
-               xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
-               break;
-       case XFS_BTNUM_INO:
-       case XFS_BTNUM_FINO:
-               xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
-               break;
-       case XFS_BTNUM_BMAP:
-               xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
-               break;
-       default:
-               ASSERT(0);
-       }
-}
-
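-/*
- * Get a buffer for the block at the given ptr without reading it in from
- * disk, for use when the caller is about to initialise a freshly
- * allocated block.
- */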
-STATIC int
-xfs_btree_get_buf_block(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *ptr,
-       int                     flags,
-       struct xfs_btree_block  **block,
-       struct xfs_buf          **bpp)
-{
-       struct xfs_mount        *mp = cur->bc_mp;
-       xfs_daddr_t             d;
-
-       /* need to sort out how callers deal with failures first */
-       ASSERT(!(flags & XBF_TRYLOCK));
-
-       d = xfs_btree_ptr_to_daddr(cur, ptr);
-       *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
-                                mp->m_bsize, flags);
-
-       if (!*bpp)
-               return ENOMEM;
-
-       (*bpp)->b_ops = cur->bc_ops->buf_ops;
-       *block = XFS_BUF_TO_BLOCK(*bpp);
-       return 0;
-}
-
-/*
- * Read in the buffer at the given ptr and return the buffer and
- * the block pointer within the buffer.
- */
-STATIC int
-xfs_btree_read_buf_block(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *ptr,
-       int                     flags,
-       struct xfs_btree_block  **block,
-       struct xfs_buf          **bpp)
-{
-       struct xfs_mount        *mp = cur->bc_mp;
-       xfs_daddr_t             d;
-       int                     error;
-
-       /* need to sort out how callers deal with failures first */
-       ASSERT(!(flags & XBF_TRYLOCK));
-
-       d = xfs_btree_ptr_to_daddr(cur, ptr);
-       error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
-                                  mp->m_bsize, flags, bpp,
-                                  cur->bc_ops->buf_ops);
-       if (error)
-               return error;
-
-       xfs_btree_set_refs(cur, *bpp);
-       *block = XFS_BUF_TO_BLOCK(*bpp);
-       return 0;
-}
-
-/*
- * Copy keys from one btree block to another.
- */
-STATIC void
-xfs_btree_copy_keys(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_key     *dst_key,
-       union xfs_btree_key     *src_key,
-       int                     numkeys)
-{
-       ASSERT(numkeys >= 0);
-       memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
-}
-
-/*
- * Copy records from one btree block to another.
- */
-STATIC void
-xfs_btree_copy_recs(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_rec     *dst_rec,
-       union xfs_btree_rec     *src_rec,
-       int                     numrecs)
-{
-       ASSERT(numrecs >= 0);
-       memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
-}
-
-/*
- * Copy block pointers from one btree block to another.
- */
-STATIC void
-xfs_btree_copy_ptrs(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *dst_ptr,
-       union xfs_btree_ptr     *src_ptr,
-       int                     numptrs)
-{
-       ASSERT(numptrs >= 0);
-       memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
-}
-
-/*
- * Shift keys one index left/right inside a single btree block.
- */
-STATIC void
-xfs_btree_shift_keys(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_key     *key,
-       int                     dir,
-       int                     numkeys)
-{
-       char                    *dst_key;
-
-       ASSERT(numkeys >= 0);
-       ASSERT(dir == 1 || dir == -1);
-
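-       /*
-        * dir == 1 moves the keys one slot to the right, opening a hole
-        * at the start of the range; dir == -1 moves them one slot to
-        * the left, closing a hole (and overwriting the preceding key).
-        */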
-       dst_key = (char *)key + (dir * cur->bc_ops->key_len);
-       memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
-}
-
-/*
- * Shift records one index left/right inside a single btree block.
- */
-STATIC void
-xfs_btree_shift_recs(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_rec     *rec,
-       int                     dir,
-       int                     numrecs)
-{
-       char                    *dst_rec;
-
-       ASSERT(numrecs >= 0);
-       ASSERT(dir == 1 || dir == -1);
-
-       dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
-       memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
-}
-
-/*
- * Shift block pointers one index left/right inside a single btree block.
- */
-STATIC void
-xfs_btree_shift_ptrs(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *ptr,
-       int                     dir,
-       int                     numptrs)
-{
-       char                    *dst_ptr;
-
-       ASSERT(numptrs >= 0);
-       ASSERT(dir == 1 || dir == -1);
-
-       dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
-       memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
-}
-
-/*
- * Log key values from the btree block.
- */
-STATIC void
-xfs_btree_log_keys(
-       struct xfs_btree_cur    *cur,
-       struct xfs_buf          *bp,
-       int                     first,
-       int                     last)
-{
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
-
-       if (bp) {
-               xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
-               xfs_trans_log_buf(cur->bc_tp, bp,
-                                 xfs_btree_key_offset(cur, first),
-                                 xfs_btree_key_offset(cur, last + 1) - 1);
-       } else {
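-               /*
-                * No buffer means the keys live in the btree root block in
-                * the inode fork, so log the owning inode's fork root.
-                */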
-               xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
-                               xfs_ilog_fbroot(cur->bc_private.b.whichfork));
-       }
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-}
-
-/*
- * Log record values from the btree block.
- */
-void
-xfs_btree_log_recs(
-       struct xfs_btree_cur    *cur,
-       struct xfs_buf          *bp,
-       int                     first,
-       int                     last)
-{
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
-
-       xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
-       xfs_trans_log_buf(cur->bc_tp, bp,
-                         xfs_btree_rec_offset(cur, first),
-                         xfs_btree_rec_offset(cur, last + 1) - 1);
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-}
-
-/*
- * Log block pointer fields from a btree block (nonleaf).
- */
-STATIC void
-xfs_btree_log_ptrs(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       struct xfs_buf          *bp,    /* buffer containing btree block */
-       int                     first,  /* index of first pointer to log */
-       int                     last)   /* index of last pointer to log */
-{
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
-
-       if (bp) {
-               struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
-               int                     level = xfs_btree_get_level(block);
-
-               xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
-               xfs_trans_log_buf(cur->bc_tp, bp,
-                               xfs_btree_ptr_offset(cur, first, level),
-                               xfs_btree_ptr_offset(cur, last + 1, level) - 1);
-       } else {
-               xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
-                       xfs_ilog_fbroot(cur->bc_private.b.whichfork));
-       }
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-}
-
-/*
- * Log fields from a btree block header.
- */
-void
-xfs_btree_log_block(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       struct xfs_buf          *bp,    /* buffer containing btree block */
-       int                     fields) /* mask of fields: XFS_BB_... */
-{
-       int                     first;  /* first byte offset logged */
-       int                     last;   /* last byte offset logged */
-       static const short      soffsets[] = {  /* table of offsets (short) */
-               offsetof(struct xfs_btree_block, bb_magic),
-               offsetof(struct xfs_btree_block, bb_level),
-               offsetof(struct xfs_btree_block, bb_numrecs),
-               offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
-               offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
-               offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
-               offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
-               offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
-               offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
-               offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
-               XFS_BTREE_SBLOCK_CRC_LEN
-       };
-       static const short      loffsets[] = {  /* table of offsets (long) */
-               offsetof(struct xfs_btree_block, bb_magic),
-               offsetof(struct xfs_btree_block, bb_level),
-               offsetof(struct xfs_btree_block, bb_numrecs),
-               offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
-               offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
-               offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
-               offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
-               offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
-               offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
-               offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
-               offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
-               XFS_BTREE_LBLOCK_CRC_LEN
-       };
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
-
-       if (bp) {
-               int nbits;
-
-               if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
-                       /*
-                        * We don't log the CRC when updating a btree
-                        * block but instead recreate it during log
-                        * recovery.  As the log buffers have checksums
-                        * of their own this is safe and avoids logging a crc
-                        * update in a lot of places.
-                        */
-                       if (fields == XFS_BB_ALL_BITS)
-                               fields = XFS_BB_ALL_BITS_CRC;
-                       nbits = XFS_BB_NUM_BITS_CRC;
-               } else {
-                       nbits = XFS_BB_NUM_BITS;
-               }
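-               /*
-                * Map the set bits in fields through the offset table to
-                * the first and last byte offsets that need logging, so
-                * only the modified byte range of the header is dirtied.
-                */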
-               xfs_btree_offsets(fields,
-                                 (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
-                                       loffsets : soffsets,
-                                 nbits, &first, &last);
-               xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
-               xfs_trans_log_buf(cur->bc_tp, bp, first, last);
-       } else {
-               xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
-                       xfs_ilog_fbroot(cur->bc_private.b.whichfork));
-       }
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-}
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int                                            /* error */
-xfs_btree_increment(
-       struct xfs_btree_cur    *cur,
-       int                     level,
-       int                     *stat)          /* success/failure */
-{
-       struct xfs_btree_block  *block;
-       union xfs_btree_ptr     ptr;
-       struct xfs_buf          *bp;
-       int                     error;          /* error return value */
-       int                     lev;
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_TRACE_ARGI(cur, level);
-
-       ASSERT(level < cur->bc_nlevels);
-
-       /* Read-ahead to the right at this level. */
-       xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
-
-       /* Get a pointer to the btree block. */
-       block = xfs_btree_get_block(cur, level, &bp);
-
-#ifdef DEBUG
-       error = xfs_btree_check_block(cur, block, level, bp);
-       if (error)
-               goto error0;
-#endif
-
-       /* We're done if we remain in the block after the increment. */
-       if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
-               goto out1;
-
-       /* Fail if we just went off the right edge of the tree. */
-       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
-       if (xfs_btree_ptr_is_null(cur, &ptr))
-               goto out0;
-
-       XFS_BTREE_STATS_INC(cur, increment);
-
-       /*
-        * March up the tree incrementing pointers.
-        * Stop when we don't go off the right edge of a block.
-        */
-       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-               block = xfs_btree_get_block(cur, lev, &bp);
-
-#ifdef DEBUG
-               error = xfs_btree_check_block(cur, block, lev, bp);
-               if (error)
-                       goto error0;
-#endif
-
-               if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
-                       break;
-
-               /* Read-ahead the right block for the next loop. */
-               xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
-       }
-
-       /*
-        * If we went off the root then we are either seriously
-        * confused or have the tree root in an inode.
-        */
-       if (lev == cur->bc_nlevels) {
-               if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
-                       goto out0;
-               ASSERT(0);
-               error = EFSCORRUPTED;
-               goto error0;
-       }
-       ASSERT(lev < cur->bc_nlevels);
-
-       /*
-        * Now walk back down the tree, fixing up the cursor's buffer
-        * pointers and key numbers.
-        */
-       for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
-               union xfs_btree_ptr     *ptrp;
-
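-               /*
-                * Fetch the child pointer at this level's current position,
-                * then descend one level and point the cursor at the first
-                * entry of the child block.
-                */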
-               ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
-               --lev;
-               error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
-               if (error)
-                       goto error0;
-
-               xfs_btree_setbuf(cur, lev, bp);
-               cur->bc_ptrs[lev] = 1;
-       }
-out1:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 1;
-       return 0;
-
-out0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 0;
-       return 0;
-
-error0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-       return error;
-}
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int                                            /* error */
-xfs_btree_decrement(
-       struct xfs_btree_cur    *cur,
-       int                     level,
-       int                     *stat)          /* success/failure */
-{
-       struct xfs_btree_block  *block;
-       xfs_buf_t               *bp;
-       int                     error;          /* error return value */
-       int                     lev;
-       union xfs_btree_ptr     ptr;
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_TRACE_ARGI(cur, level);
-
-       ASSERT(level < cur->bc_nlevels);
-
-       /* Read-ahead to the left at this level. */
-       xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
-
-       /* We're done if we remain in the block after the decrement. */
-       if (--cur->bc_ptrs[level] > 0)
-               goto out1;
-
-       /* Get a pointer to the btree block. */
-       block = xfs_btree_get_block(cur, level, &bp);
-
-#ifdef DEBUG
-       error = xfs_btree_check_block(cur, block, level, bp);
-       if (error)
-               goto error0;
-#endif
-
-       /* Fail if we just went off the left edge of the tree. */
-       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
-       if (xfs_btree_ptr_is_null(cur, &ptr))
-               goto out0;
-
-       XFS_BTREE_STATS_INC(cur, decrement);
-
-       /*
-        * March up the tree decrementing pointers.
-        * Stop when we don't go off the left edge of a block.
-        */
-       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-               if (--cur->bc_ptrs[lev] > 0)
-                       break;
-               /* Read-ahead the left block for the next loop. */
-               xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
-       }
-
-       /*
-        * If we went off the root then we are either seriously
-        * confused or have the tree root in an inode.
-        */
-       if (lev == cur->bc_nlevels) {
-               if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
-                       goto out0;
-               ASSERT(0);
-               error = EFSCORRUPTED;
-               goto error0;
-       }
-       ASSERT(lev < cur->bc_nlevels);
-
-       /*
-        * Now walk back down the tree, fixing up the cursor's buffer
-        * pointers and key numbers.
-        */
-       for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
-               union xfs_btree_ptr     *ptrp;
-
-               ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
-               --lev;
-               error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
-               if (error)
-                       goto error0;
-               xfs_btree_setbuf(cur, lev, bp);
-               cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
-       }
-out1:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 1;
-       return 0;
-
-out0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 0;
-       return 0;
-
-error0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-       return error;
-}
-
-STATIC int
-xfs_btree_lookup_get_block(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       int                     level,  /* level in the btree */
-       union xfs_btree_ptr     *pp,    /* ptr to btree block */
-       struct xfs_btree_block  **blkp) /* return btree block */
-{
-       struct xfs_buf          *bp;    /* buffer pointer for btree block */
-       int                     error = 0;
-
-       /* special case the root block if in an inode */
-       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-           (level == cur->bc_nlevels - 1)) {
-               *blkp = xfs_btree_get_iroot(cur);
-               return 0;
-       }
-
-       /*
-        * If the old buffer at this level is for the disk address we are
-        * looking for, re-use it.
-        *
-        * Otherwise throw it away and get a new one.
-        */
-       bp = cur->bc_bufs[level];
-       if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
-               *blkp = XFS_BUF_TO_BLOCK(bp);
-               return 0;
-       }
-
-       error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
-       if (error)
-               return error;
-
-       xfs_btree_setbuf(cur, level, bp);
-       return 0;
-}
-
-/*
- * Get current search key.  For level 0 we don't actually have a key
- * structure so we make one up from the record.  For all other levels
- * we just return the key at the requested index.
- */
-STATIC union xfs_btree_key *
-xfs_lookup_get_search_key(
-       struct xfs_btree_cur    *cur,
-       int                     level,
-       int                     keyno,
-       struct xfs_btree_block  *block,
-       union xfs_btree_key     *kp)
-{
-       if (level == 0) {
-               cur->bc_ops->init_key_from_rec(kp,
-                               xfs_btree_rec_addr(cur, keyno, block));
-               return kp;
-       }
-
-       return xfs_btree_key_addr(cur, keyno, block);
-}
-
-/*
- * Lookup the record.  The cursor is made to point to it, based on dir.
- * stat is set to 0 if we can't find a matching record, 1 for success.
- */
-int                                    /* error */
-xfs_btree_lookup(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_lookup_t            dir,    /* <=, ==, or >= */
-       int                     *stat)  /* success/failure */
-{
-       struct xfs_btree_block  *block; /* current btree block */
-       __int64_t               diff;   /* difference for the current key */
-       int                     error;  /* error return value */
-       int                     keyno;  /* current key number */
-       int                     level;  /* level in the btree */
-       union xfs_btree_ptr     *pp;    /* ptr to btree block */
-       union xfs_btree_ptr     ptr;    /* ptr to btree block */
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_TRACE_ARGI(cur, dir);
-
-       XFS_BTREE_STATS_INC(cur, lookup);
-
-       block = NULL;
-       keyno = 0;
-
-       /* initialise start pointer from cursor */
-       cur->bc_ops->init_ptr_from_cur(cur, &ptr);
-       pp = &ptr;
-
-       /*
-        * Iterate over each level in the btree, starting at the root.
-        * For each level above the leaves, find the key we need, based
-        * on the lookup record, then follow the corresponding block
-        * pointer down to the next level.
-        */
-       for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
-               /* Get the block we need to do the lookup on. */
-               error = xfs_btree_lookup_get_block(cur, level, pp, &block);
-               if (error)
-                       goto error0;
-
-               if (diff == 0) {
-                       /*
-                        * If we already had a key match at a higher level, we
-                        * know we need to use the first entry in this block.
-                        */
-                       keyno = 1;
-               } else {
-                       /* Otherwise search this block. Do a binary search. */
-
-                       int     high;   /* high entry number */
-                       int     low;    /* low entry number */
-
-                       /* Set low and high entry numbers, 1-based. */
-                       low = 1;
-                       high = xfs_btree_get_numrecs(block);
-                       if (!high) {
-                               /* Block is empty, must be an empty leaf. */
-                               ASSERT(level == 0 && cur->bc_nlevels == 1);
-
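-                               /*
-                                * An LE lookup leaves the cursor before
-                                * the (nonexistent) first record, slot 0;
-                                * EQ/GE lookups leave it at slot 1.
-                                */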
-                               cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
-                               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-                               *stat = 0;
-                               return 0;
-                       }
-
-                       /* Binary search the block. */
-                       while (low <= high) {
-                               union xfs_btree_key     key;
-                               union xfs_btree_key     *kp;
-
-                               XFS_BTREE_STATS_INC(cur, compare);
-
-                               /* keyno is average of low and high. */
-                               keyno = (low + high) >> 1;
-
-                               /* Get current search key */
-                               kp = xfs_lookup_get_search_key(cur, level,
-                                               keyno, block, &key);
-
-                               /*
-                                * Compute difference to get next direction:
-                                *  - less than, move right
-                                *  - greater than, move left
-                                *  - equal, we're done
-                                */
-                               diff = cur->bc_ops->key_diff(cur, kp);
-                               if (diff < 0)
-                                       low = keyno + 1;
-                               else if (diff > 0)
-                                       high = keyno - 1;
-                               else
-                                       break;
-                       }
-               }
-
-               /*
-                * If there are more levels, set up for the next level
-                * by getting the block number and filling in the cursor.
-                */
-               if (level > 0) {
-                       /*
-                        * If we moved left, need the previous key number,
-                        * unless there isn't one.
-                        */
-                       if (diff > 0 && --keyno < 1)
-                               keyno = 1;
-                       pp = xfs_btree_ptr_addr(cur, keyno, block);
-
-#ifdef DEBUG
-                       error = xfs_btree_check_ptr(cur, pp, 0, level);
-                       if (error)
-                               goto error0;
-#endif
-                       cur->bc_ptrs[level] = keyno;
-               }
-       }
-
-       /* Done with the search. See if we need to adjust the results. */
-       if (dir != XFS_LOOKUP_LE && diff < 0) {
-               keyno++;
-               /*
-                * If this is a GE search and we went off the end of the
-                * block, but it's not the last block, we're in the wrong block.
-                */
-               xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
-               if (dir == XFS_LOOKUP_GE &&
-                   keyno > xfs_btree_get_numrecs(block) &&
-                   !xfs_btree_ptr_is_null(cur, &ptr)) {
-                       int     i;
-
-                       cur->bc_ptrs[0] = keyno;
-                       error = xfs_btree_increment(cur, 0, &i);
-                       if (error)
-                               goto error0;
-                       XFS_WANT_CORRUPTED_RETURN(i == 1);
-                       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-                       *stat = 1;
-                       return 0;
-               }
-       } else if (dir == XFS_LOOKUP_LE && diff > 0)
-               keyno--;
-       cur->bc_ptrs[0] = keyno;
-
-       /* Return if we succeeded or not. */
-       if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
-               *stat = 0;
-       else if (dir != XFS_LOOKUP_EQ || diff == 0)
-               *stat = 1;
-       else
-               *stat = 0;
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       return 0;
-
-error0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-       return error;
-}
-
-/*
- * Update keys at all levels from here to the root along the cursor's path.
- */
-STATIC int
-xfs_btree_updkey(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_key     *keyp,
-       int                     level)
-{
-       struct xfs_btree_block  *block;
-       struct xfs_buf          *bp;
-       union xfs_btree_key     *kp;
-       int                     ptr;
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
-
-       ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
-
-       /*
-        * Go up the tree from this level toward the root.
-        * At each level, update the key value to the value input.
-        * Stop when we reach a level where the cursor isn't pointing
-        * at the first entry in the block.
-        */
-       for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
-#ifdef DEBUG
-               int             error;
-#endif
-               block = xfs_btree_get_block(cur, level, &bp);
-#ifdef DEBUG
-               error = xfs_btree_check_block(cur, block, level, bp);
-               if (error) {
-                       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-                       return error;
-               }
-#endif
-               ptr = cur->bc_ptrs[level];
-               kp = xfs_btree_key_addr(cur, ptr, block);
-               xfs_btree_copy_keys(cur, kp, keyp, 1);
-               xfs_btree_log_keys(cur, bp, ptr, ptr);
-       }
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       return 0;
-}
-
-/*
- * Update the record referred to by cur to the value in the
- * given record. This either works (return 0) or gets an
- * EFSCORRUPTED error.
- */
-int
-xfs_btree_update(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_rec     *rec)
-{
-       struct xfs_btree_block  *block;
-       struct xfs_buf          *bp;
-       int                     error;
-       int                     ptr;
-       union xfs_btree_rec     *rp;
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_TRACE_ARGR(cur, rec);
-
-       /* Pick up the current block. */
-       block = xfs_btree_get_block(cur, 0, &bp);
-
-#ifdef DEBUG
-       error = xfs_btree_check_block(cur, block, 0, bp);
-       if (error)
-               goto error0;
-#endif
-       /* Get the address of the rec to be updated. */
-       ptr = cur->bc_ptrs[0];
-       rp = xfs_btree_rec_addr(cur, ptr, block);
-
-       /* Fill in the new contents and log them. */
-       xfs_btree_copy_recs(cur, rp, rec, 1);
-       xfs_btree_log_recs(cur, bp, ptr, ptr);
-
-       /*
-        * If we are tracking the last record in the tree and
-        * we are at the far right edge of the tree, update it.
-        */
-       if (xfs_btree_is_lastrec(cur, block, 0)) {
-               cur->bc_ops->update_lastrec(cur, block, rec,
-                                           ptr, LASTREC_UPDATE);
-       }
-
-       /* Updating first rec in leaf. Pass new key value up to our parent. */
-       if (ptr == 1) {
-               union xfs_btree_key     key;
-
-               cur->bc_ops->init_key_from_rec(&key, rec);
-               error = xfs_btree_updkey(cur, &key, 1);
-               if (error)
-                       goto error0;
-       }
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       return 0;
-
-error0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-       return error;
-}
-
-/*
- * Move 1 record left from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int                                     /* error */
-xfs_btree_lshift(
-       struct xfs_btree_cur    *cur,
-       int                     level,
-       int                     *stat)          /* success/failure */
-{
-       union xfs_btree_key     key;            /* btree key */
-       struct xfs_buf          *lbp;           /* left buffer pointer */
-       struct xfs_btree_block  *left;          /* left btree block */
-       int                     lrecs;          /* left record count */
-       struct xfs_buf          *rbp;           /* right buffer pointer */
-       struct xfs_btree_block  *right;         /* right btree block */
-       int                     rrecs;          /* right record count */
-       union xfs_btree_ptr     lptr;           /* left btree pointer */
-       union xfs_btree_key     *rkp = NULL;    /* right btree key */
-       union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
-       union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
-       int                     error;          /* error return value */
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_TRACE_ARGI(cur, level);
-
-       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-           level == cur->bc_nlevels - 1)
-               goto out0;
-
-       /* Set up variables for this block as "right". */
-       right = xfs_btree_get_block(cur, level, &rbp);
-
-#ifdef DEBUG
-       error = xfs_btree_check_block(cur, right, level, rbp);
-       if (error)
-               goto error0;
-#endif
-
-       /* If we've got no left sibling then we can't shift an entry left. */
-       xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
-       if (xfs_btree_ptr_is_null(cur, &lptr))
-               goto out0;
-
-       /*
-        * If the cursor entry is the one that would be moved, don't
-        * do it... it's too complicated.
-        */
-       if (cur->bc_ptrs[level] <= 1)
-               goto out0;
-
-       /* Set up the left neighbor as "left". */
-       error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
-       if (error)
-               goto error0;
-
-       /* If it's full, it can't take another entry. */
-       lrecs = xfs_btree_get_numrecs(left);
-       if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
-               goto out0;
-
-       rrecs = xfs_btree_get_numrecs(right);
-
-       /*
-        * We add one entry to the left side and remove one for the right side.
-        * Account for it here; the changes will be updated on disk and logged
-        * later.
-        */
-       lrecs++;
-       rrecs--;
-
-       XFS_BTREE_STATS_INC(cur, lshift);
-       XFS_BTREE_STATS_ADD(cur, moves, 1);
-
-       /*
-        * If non-leaf, copy a key and a ptr to the left block.
-        * Log the changes to the left block.
-        */
-       if (level > 0) {
-               /* It's a non-leaf.  Move keys and pointers. */
-               union xfs_btree_key     *lkp;   /* left btree key */
-               union xfs_btree_ptr     *lpp;   /* left address pointer */
-
-               lkp = xfs_btree_key_addr(cur, lrecs, left);
-               rkp = xfs_btree_key_addr(cur, 1, right);
-
-               lpp = xfs_btree_ptr_addr(cur, lrecs, left);
-               rpp = xfs_btree_ptr_addr(cur, 1, right);
-#ifdef DEBUG
-               error = xfs_btree_check_ptr(cur, rpp, 0, level);
-               if (error)
-                       goto error0;
-#endif
-               xfs_btree_copy_keys(cur, lkp, rkp, 1);
-               xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
-
-               xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
-               xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
-
-               ASSERT(cur->bc_ops->keys_inorder(cur,
-                       xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
-       } else {
-               /* It's a leaf.  Move records.  */
-               union xfs_btree_rec     *lrp;   /* left record pointer */
-
-               lrp = xfs_btree_rec_addr(cur, lrecs, left);
-               rrp = xfs_btree_rec_addr(cur, 1, right);
-
-               xfs_btree_copy_recs(cur, lrp, rrp, 1);
-               xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
-
-               ASSERT(cur->bc_ops->recs_inorder(cur,
-                       xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
-       }
-
-       xfs_btree_set_numrecs(left, lrecs);
-       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
-
-       xfs_btree_set_numrecs(right, rrecs);
-       xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
-
-       /*
-        * Slide the contents of right down one entry.
-        */
-       XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
-       if (level > 0) {
-               /* It's a non-leaf.  Operate on keys and ptrs. */
-#ifdef DEBUG
-               int                     i;              /* loop index */
-
-               for (i = 0; i < rrecs; i++) {
-                       error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
-                       if (error)
-                               goto error0;
-               }
-#endif
-               xfs_btree_shift_keys(cur,
-                               xfs_btree_key_addr(cur, 2, right),
-                               -1, rrecs);
-               xfs_btree_shift_ptrs(cur,
-                               xfs_btree_ptr_addr(cur, 2, right),
-                               -1, rrecs);
-
-               xfs_btree_log_keys(cur, rbp, 1, rrecs);
-               xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
-       } else {
-               /* It's a leaf.  Operate on records. */
-               xfs_btree_shift_recs(cur,
-                       xfs_btree_rec_addr(cur, 2, right),
-                       -1, rrecs);
-               xfs_btree_log_recs(cur, rbp, 1, rrecs);
-
-               /*
-                * If it's the first record in the block, we'll need a key
-                * structure to pass up to the next level (updkey).
-                */
-               cur->bc_ops->init_key_from_rec(&key,
-                       xfs_btree_rec_addr(cur, 1, right));
-               rkp = &key;
-       }
-
-       /* Update the parent key values of right. */
-       error = xfs_btree_updkey(cur, rkp, level + 1);
-       if (error)
-               goto error0;
-
-       /* Slide the cursor value left one. */
-       cur->bc_ptrs[level]--;
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 1;
-       return 0;
-
-out0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 0;
-       return 0;
-
-error0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-       return error;
-}
-
-/*
- * Move 1 record right from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int                                     /* error */
-xfs_btree_rshift(
-       struct xfs_btree_cur    *cur,
-       int                     level,
-       int                     *stat)          /* success/failure */
-{
-       union xfs_btree_key     key;            /* btree key */
-       struct xfs_buf          *lbp;           /* left buffer pointer */
-       struct xfs_btree_block  *left;          /* left btree block */
-       struct xfs_buf          *rbp;           /* right buffer pointer */
-       struct xfs_btree_block  *right;         /* right btree block */
-       struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
-       union xfs_btree_ptr     rptr;           /* right block pointer */
-       union xfs_btree_key     *rkp;           /* right btree key */
-       int                     rrecs;          /* right record count */
-       int                     lrecs;          /* left record count */
-       int                     error;          /* error return value */
-       int                     i;              /* loop counter */
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_TRACE_ARGI(cur, level);
-
-       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-           (level == cur->bc_nlevels - 1))
-               goto out0;
-
-       /* Set up variables for this block as "left". */
-       left = xfs_btree_get_block(cur, level, &lbp);
-
-#ifdef DEBUG
-       error = xfs_btree_check_block(cur, left, level, lbp);
-       if (error)
-               goto error0;
-#endif
-
-       /* If we've got no right sibling then we can't shift an entry right. */
-       xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
-       if (xfs_btree_ptr_is_null(cur, &rptr))
-               goto out0;
-
-       /*
-        * If the cursor entry is the one that would be moved, don't
-        * do it... it's too complicated.
-        */
-       lrecs = xfs_btree_get_numrecs(left);
-       if (cur->bc_ptrs[level] >= lrecs)
-               goto out0;
-
-       /* Set up the right neighbor as "right". */
-       error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
-       if (error)
-               goto error0;
-
-       /* If it's full, it can't take another entry. */
-       rrecs = xfs_btree_get_numrecs(right);
-       if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
-               goto out0;
-
-       XFS_BTREE_STATS_INC(cur, rshift);
-       XFS_BTREE_STATS_ADD(cur, moves, rrecs);
-
-       /*
-        * Make a hole at the start of the right neighbor block, then
-        * copy the last left block entry to the hole.
-        */
-       if (level > 0) {
-               /* It's a non-leaf.  Make a hole in the keys and ptrs. */
-               union xfs_btree_key     *lkp;
-               union xfs_btree_ptr     *lpp;
-               union xfs_btree_ptr     *rpp;
-
-               lkp = xfs_btree_key_addr(cur, lrecs, left);
-               lpp = xfs_btree_ptr_addr(cur, lrecs, left);
-               rkp = xfs_btree_key_addr(cur, 1, right);
-               rpp = xfs_btree_ptr_addr(cur, 1, right);
-
-#ifdef DEBUG
-               for (i = rrecs - 1; i >= 0; i--) {
-                       error = xfs_btree_check_ptr(cur, rpp, i, level);
-                       if (error)
-                               goto error0;
-               }
-#endif
-
-               xfs_btree_shift_keys(cur, rkp, 1, rrecs);
-               xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
-
-#ifdef DEBUG
-               error = xfs_btree_check_ptr(cur, lpp, 0, level);
-               if (error)
-                       goto error0;
-#endif
-
-               /* Now put the new data in, and log it. */
-               xfs_btree_copy_keys(cur, rkp, lkp, 1);
-               xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
-
-               xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
-               xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
-
-               ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
-                       xfs_btree_key_addr(cur, 2, right)));
-       } else {
-               /* It's a leaf.  Make a hole in the records. */
-               union xfs_btree_rec     *lrp;
-               union xfs_btree_rec     *rrp;
-
-               lrp = xfs_btree_rec_addr(cur, lrecs, left);
-               rrp = xfs_btree_rec_addr(cur, 1, right);
-
-               xfs_btree_shift_recs(cur, rrp, 1, rrecs);
-
-               /* Now put the new data in, and log it. */
-               xfs_btree_copy_recs(cur, rrp, lrp, 1);
-               xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
-
-               cur->bc_ops->init_key_from_rec(&key, rrp);
-               rkp = &key;
-
-               ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
-                       xfs_btree_rec_addr(cur, 2, right)));
-       }
-
-       /*
-        * Decrement and log left's numrecs, bump and log right's numrecs.
-        */
-       xfs_btree_set_numrecs(left, --lrecs);
-       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
-
-       xfs_btree_set_numrecs(right, ++rrecs);
-       xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
-
-       /*
-        * Using a temporary cursor, update the parent key values of the
-        * block on the right.  The right block may hang off a different
-        * parent, so step a duplicate cursor onto its first entry rather
-        * than disturbing the original cursor's position.
-        */
-       error = xfs_btree_dup_cursor(cur, &tcur);
-       if (error)
-               goto error0;
-       i = xfs_btree_lastrec(tcur, level);
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-       error = xfs_btree_increment(tcur, level, &i);
-       if (error)
-               goto error1;
-
-       error = xfs_btree_updkey(tcur, rkp, level + 1);
-       if (error)
-               goto error1;
-
-       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 1;
-       return 0;
-
-out0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 0;
-       return 0;
-
-error0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-       return error;
-
-error1:
-       XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
-       xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-       return error;
-}
-
-/*
- * Split cur/level block in half.
- * Return new block number and the key to its first
- * record (to be inserted into parent).
- */
-STATIC int                                     /* error */
-__xfs_btree_split(
-       struct xfs_btree_cur    *cur,
-       int                     level,
-       union xfs_btree_ptr     *ptrp,
-       union xfs_btree_key     *key,
-       struct xfs_btree_cur    **curp,
-       int                     *stat)          /* success/failure */
-{
-       union xfs_btree_ptr     lptr;           /* left sibling block ptr */
-       struct xfs_buf          *lbp;           /* left buffer pointer */
-       struct xfs_btree_block  *left;          /* left btree block */
-       union xfs_btree_ptr     rptr;           /* right sibling block ptr */
-       struct xfs_buf          *rbp;           /* right buffer pointer */
-       struct xfs_btree_block  *right;         /* right btree block */
-       union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
-       struct xfs_buf          *rrbp;          /* right-right buffer pointer */
-       struct xfs_btree_block  *rrblock;       /* right-right btree block */
-       int                     lrecs;
-       int                     rrecs;
-       int                     src_index;
-       int                     error;          /* error return value */
-#ifdef DEBUG
-       int                     i;
-#endif
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
-
-       XFS_BTREE_STATS_INC(cur, split);
-
-       /* Set up left block (current one). */
-       left = xfs_btree_get_block(cur, level, &lbp);
-
-#ifdef DEBUG
-       error = xfs_btree_check_block(cur, left, level, lbp);
-       if (error)
-               goto error0;
-#endif
-
-       xfs_btree_buf_to_ptr(cur, lbp, &lptr);
-
-       /* Allocate the new block. If we can't do it, we're toast. Give up. */
-       error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
-       if (error)
-               goto error0;
-       if (*stat == 0)
-               goto out0;
-       XFS_BTREE_STATS_INC(cur, alloc);
-
-       /* Set up the new block as "right". */
-       error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
-       if (error)
-               goto error0;
-
-       /* Fill in the btree header for the new right block. */
-       xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
-
-       /*
-        * Split the entries between the old and the new block evenly.
-        * Make sure that if there's an odd number of entries now, that
-        * each new block will have the same number of entries.
-        */
-       lrecs = xfs_btree_get_numrecs(left);
-       rrecs = lrecs / 2;
-       if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
-               rrecs++;
-       src_index = (lrecs - rrecs + 1);
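-       /*
-        * e.g. with lrecs == 7: normally rrecs == 3 and entries 5..7 move
-        * right; but if the new record is bound for the left half
-        * (bc_ptrs <= 4), move entries 4..7 instead so the left block
-        * plus the new record balances the right block.
-        */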
-
-       XFS_BTREE_STATS_ADD(cur, moves, rrecs);
-
-       /*
-        * Copy btree block entries from the left block over to the
-        * new block, the right. Update the right block and log the
-        * changes.
-        */
-       if (level > 0) {
-               /* It's a non-leaf.  Move keys and pointers. */
-               union xfs_btree_key     *lkp;   /* left btree key */
-               union xfs_btree_ptr     *lpp;   /* left address pointer */
-               union xfs_btree_key     *rkp;   /* right btree key */
-               union xfs_btree_ptr     *rpp;   /* right address pointer */
-
-               lkp = xfs_btree_key_addr(cur, src_index, left);
-               lpp = xfs_btree_ptr_addr(cur, src_index, left);
-               rkp = xfs_btree_key_addr(cur, 1, right);
-               rpp = xfs_btree_ptr_addr(cur, 1, right);
-
-#ifdef DEBUG
-               for (i = src_index; i < rrecs; i++) {
-                       error = xfs_btree_check_ptr(cur, lpp, i, level);
-                       if (error)
-                               goto error0;
-               }
-#endif
-
-               xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
-               xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
-
-               xfs_btree_log_keys(cur, rbp, 1, rrecs);
-               xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
-
-               /* Grab the keys to the entries moved to the right block */
-               xfs_btree_copy_keys(cur, key, rkp, 1);
-       } else {
-               /* It's a leaf.  Move records.  */
-               union xfs_btree_rec     *lrp;   /* left record pointer */
-               union xfs_btree_rec     *rrp;   /* right record pointer */
-
-               lrp = xfs_btree_rec_addr(cur, src_index, left);
-               rrp = xfs_btree_rec_addr(cur, 1, right);
-
-               xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
-               xfs_btree_log_recs(cur, rbp, 1, rrecs);
-
-               cur->bc_ops->init_key_from_rec(key,
-                       xfs_btree_rec_addr(cur, 1, right));
-       }
-
-       /*
-        * Find the left block number by looking in the buffer.
-        * Adjust numrecs, sibling pointers.
-        */
-       xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
-       xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
-       xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
-       xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
-
-       lrecs -= rrecs;
-       xfs_btree_set_numrecs(left, lrecs);
-       xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
-
-       xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
-       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-
-       /*
-        * If there's a block to the new block's right, make that block
-        * point back to right instead of to left.
-        */
-       if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
-               error = xfs_btree_read_buf_block(cur, &rrptr,
-                                                       0, &rrblock, &rrbp);
-               if (error)
-                       goto error0;
-               xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
-               xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
-       }
-       /*
-        * If the cursor is really in the right block, move it there.
-        * If it's just pointing past the last entry in left, then we'll
-        * insert there, so don't change anything in that case.
-        */
-       if (cur->bc_ptrs[level] > lrecs + 1) {
-               xfs_btree_setbuf(cur, level, rbp);
-               cur->bc_ptrs[level] -= lrecs;
-       }
-       /*
-        * If there are more levels, we'll need another cursor which refers
-        * to the right block, no matter where this cursor was.
-        */
-       if (level + 1 < cur->bc_nlevels) {
-               error = xfs_btree_dup_cursor(cur, curp);
-               if (error)
-                       goto error0;
-               (*curp)->bc_ptrs[level + 1]++;
-       }
-       *ptrp = rptr;
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 1;
-       return 0;
-out0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 0;
-       return 0;
-
-error0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-       return error;
-}
-
-struct xfs_btree_split_args {
-       struct xfs_btree_cur    *cur;
-       int                     level;
-       union xfs_btree_ptr     *ptrp;
-       union xfs_btree_key     *key;
-       struct xfs_btree_cur    **curp;
-       int                     *stat;          /* success/failure */
-       int                     result;
-       bool                    kswapd; /* allocation in kswapd context */
-       struct completion       *done;
-       struct work_struct      work;
-};
-
-/*
- * Stack switching interfaces for allocation
- */
-static void
-xfs_btree_split_worker(
-       struct work_struct      *work)
-{
-       struct xfs_btree_split_args     *args = container_of(work,
-                                               struct xfs_btree_split_args, work);
-       unsigned long           pflags;
-       unsigned long           new_pflags = PF_FSTRANS;
-
-       /*
-        * we are in a transaction context here, but may also be doing work
-        * in kswapd context, and hence we may need to inherit that state
-        * temporarily to ensure that we don't block waiting for memory reclaim
-        * in any way.
-        */
-       if (args->kswapd)
-               new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
-
-       current_set_flags_nested(&pflags, new_pflags);
-
-       args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
-                                        args->key, args->curp, args->stat);
-       complete(args->done);
-
-       current_restore_flags_nested(&pflags, new_pflags);
-}
-
-/*
- * BMBT split requests often come in with little stack to work on. Push
- * them off to a worker thread so there is lots of stack to use. For the other
- * btree types, just call directly to avoid the context switch overhead here.
- */
-STATIC int                                     /* error */
-xfs_btree_split(
-       struct xfs_btree_cur    *cur,
-       int                     level,
-       union xfs_btree_ptr     *ptrp,
-       union xfs_btree_key     *key,
-       struct xfs_btree_cur    **curp,
-       int                     *stat)          /* success/failure */
-{
-       struct xfs_btree_split_args     args;
-       DECLARE_COMPLETION_ONSTACK(done);
-
-       if (cur->bc_btnum != XFS_BTNUM_BMAP)
-               return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
-
-       args.cur = cur;
-       args.level = level;
-       args.ptrp = ptrp;
-       args.key = key;
-       args.curp = curp;
-       args.stat = stat;
-       args.done = &done;
-       args.kswapd = current_is_kswapd();
-       INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
-       queue_work(xfs_alloc_wq, &args.work);
-       wait_for_completion(&done);
-       destroy_work_on_stack(&args.work);
-       return args.result;
-}
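
The stack-switching pattern above is more general than btree splits: any
call chain that needs more stack than the caller has left can be bounced
to a workqueue thread and waited on synchronously. A minimal sketch of
the same pattern, assuming hypothetical names (deep_work_args,
do_deep_work(), run_with_fresh_stack()) that are not part of this file:

	#include <linux/completion.h>
	#include <linux/workqueue.h>

	struct deep_work_args {
		int			input;
		int			result;
		struct completion	*done;
		struct work_struct	work;
	};

	static void deep_work_worker(struct work_struct *work)
	{
		struct deep_work_args *args = container_of(work,
					struct deep_work_args, work);

		/* Runs on the workqueue thread's fresh, deep stack. */
		args->result = do_deep_work(args->input);
		complete(args->done);
	}

	static int run_with_fresh_stack(struct workqueue_struct *wq, int input)
	{
		struct deep_work_args args;
		DECLARE_COMPLETION_ONSTACK(done);

		args.input = input;
		args.done = &done;
		INIT_WORK_ONSTACK(&args.work, deep_work_worker);
		queue_work(wq, &args.work);

		/* Block until the worker has filled in args.result. */
		wait_for_completion(&done);
		destroy_work_on_stack(&args.work);
		return args.result;
	}

Because args and the completion live on the caller's stack, the
synchronous wait_for_completion() is what makes the on-stack work item
safe: the caller cannot return while the worker still references it.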
-
-
-/*
- * Copy the old inode root contents into a real block and make the
- * broot point to it.
- */
-int                                            /* error */
-xfs_btree_new_iroot(
-       struct xfs_btree_cur    *cur,           /* btree cursor */
-       int                     *logflags,      /* logging flags for inode */
-       int                     *stat)          /* return status - 0 fail */
-{
-       struct xfs_buf          *cbp;           /* buffer for cblock */
-       struct xfs_btree_block  *block;         /* btree block */
-       struct xfs_btree_block  *cblock;        /* child btree block */
-       union xfs_btree_key     *ckp;           /* child key pointer */
-       union xfs_btree_ptr     *cpp;           /* child ptr pointer */
-       union xfs_btree_key     *kp;            /* pointer to btree key */
-       union xfs_btree_ptr     *pp;            /* pointer to block addr */
-       union xfs_btree_ptr     nptr;           /* new block addr */
-       int                     level;          /* btree level */
-       int                     error;          /* error return code */
-#ifdef DEBUG
-       int                     i;              /* loop counter */
-#endif
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_STATS_INC(cur, newroot);
-
-       ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
-
-       level = cur->bc_nlevels - 1;
-
-       block = xfs_btree_get_iroot(cur);
-       pp = xfs_btree_ptr_addr(cur, 1, block);
-
-       /* Allocate the new block. If we can't do it, we're toast. Give up. */
-       error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
-       if (error)
-               goto error0;
-       if (*stat == 0) {
-               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-               return 0;
-       }
-       XFS_BTREE_STATS_INC(cur, alloc);
-
-       /* Copy the root into a real block. */
-       error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
-       if (error)
-               goto error0;
-
-       /*
-        * We can't just memcpy() the root in for CRC-enabled btree blocks.
-        * In that case we also have to ensure the blkno remains correct.
-        */
-       memcpy(cblock, block, xfs_btree_block_len(cur));
-       if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
-               if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-                       cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
-               else
-                       cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
-       }
-
-       be16_add_cpu(&block->bb_level, 1);
-       xfs_btree_set_numrecs(block, 1);
-       cur->bc_nlevels++;
-       cur->bc_ptrs[level + 1] = 1;
-
-       kp = xfs_btree_key_addr(cur, 1, block);
-       ckp = xfs_btree_key_addr(cur, 1, cblock);
-       xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
-
-       cpp = xfs_btree_ptr_addr(cur, 1, cblock);
-#ifdef DEBUG
-       for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
-               error = xfs_btree_check_ptr(cur, pp, i, level);
-               if (error)
-                       goto error0;
-       }
-#endif
-       xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
-
-#ifdef DEBUG
-       error = xfs_btree_check_ptr(cur, &nptr, 0, level);
-       if (error)
-               goto error0;
-#endif
-       xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
-
-       xfs_iroot_realloc(cur->bc_private.b.ip,
-                         1 - xfs_btree_get_numrecs(cblock),
-                         cur->bc_private.b.whichfork);
-
-       xfs_btree_setbuf(cur, level, cbp);
-
-       /*
-        * Do all this logging at the end so that
-        * the root is at the right level.
-        */
-       xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
-       xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
-       xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
-
-       *logflags |=
-               XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
-       *stat = 1;
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       return 0;
-error0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-       return error;
-}
-
-/*
- * Allocate a new root block, fill it in.
- */
-STATIC int                             /* error */
-xfs_btree_new_root(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       int                     *stat)  /* success/failure */
-{
-       struct xfs_btree_block  *block; /* one half of the old root block */
-       struct xfs_buf          *bp;    /* buffer containing block */
-       int                     error;  /* error return value */
-       struct xfs_buf          *lbp;   /* left buffer pointer */
-       struct xfs_btree_block  *left;  /* left btree block */
-       struct xfs_buf          *nbp;   /* new (root) buffer */
-       struct xfs_btree_block  *new;   /* new (root) btree block */
-       int                     nptr;   /* new value for key index, 1 or 2 */
-       struct xfs_buf          *rbp;   /* right buffer pointer */
-       struct xfs_btree_block  *right; /* right btree block */
-       union xfs_btree_ptr     rptr;
-       union xfs_btree_ptr     lptr;
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_STATS_INC(cur, newroot);
-
-       /* initialise our start point from the cursor */
-       cur->bc_ops->init_ptr_from_cur(cur, &rptr);
-
-       /* Allocate the new block. If we can't do it, we're toast. Give up. */
-       error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
-       if (error)
-               goto error0;
-       if (*stat == 0)
-               goto out0;
-       XFS_BTREE_STATS_INC(cur, alloc);
-
-       /* Set up the new block. */
-       error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
-       if (error)
-               goto error0;
-
-       /* Set the root in the holding structure, increasing the level by 1. */
-       cur->bc_ops->set_root(cur, &lptr, 1);
-
-       /*
-        * At the previous root level there are now two blocks: the old root,
-        * and the new block generated when it was split.  We don't know which
-        * one the cursor is pointing at, so we set up variables "left" and
-        * "right" for each case.
-        */
-       block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
-
-#ifdef DEBUG
-       error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
-       if (error)
-               goto error0;
-#endif
-
-       xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
-       if (!xfs_btree_ptr_is_null(cur, &rptr)) {
-               /* Our block is left, pick up the right block. */
-               lbp = bp;
-               xfs_btree_buf_to_ptr(cur, lbp, &lptr);
-               left = block;
-               error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
-               if (error)
-                       goto error0;
-               bp = rbp;
-               nptr = 1;
-       } else {
-               /* Our block is right, pick up the left block. */
-               rbp = bp;
-               xfs_btree_buf_to_ptr(cur, rbp, &rptr);
-               right = block;
-               xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
-               error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
-               if (error)
-                       goto error0;
-               bp = lbp;
-               nptr = 2;
-       }
-       /* Fill in the new block's btree header and log it. */
-       xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
-       xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
-       ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
-                       !xfs_btree_ptr_is_null(cur, &rptr));
-
-       /* Fill in the key data in the new root. */
-       if (xfs_btree_get_level(left) > 0) {
-               xfs_btree_copy_keys(cur,
-                               xfs_btree_key_addr(cur, 1, new),
-                               xfs_btree_key_addr(cur, 1, left), 1);
-               xfs_btree_copy_keys(cur,
-                               xfs_btree_key_addr(cur, 2, new),
-                               xfs_btree_key_addr(cur, 1, right), 1);
-       } else {
-               cur->bc_ops->init_key_from_rec(
-                               xfs_btree_key_addr(cur, 1, new),
-                               xfs_btree_rec_addr(cur, 1, left));
-               cur->bc_ops->init_key_from_rec(
-                               xfs_btree_key_addr(cur, 2, new),
-                               xfs_btree_rec_addr(cur, 1, right));
-       }
-       xfs_btree_log_keys(cur, nbp, 1, 2);
-
-       /* Fill in the pointer data in the new root. */
-       xfs_btree_copy_ptrs(cur,
-               xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
-       xfs_btree_copy_ptrs(cur,
-               xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
-       xfs_btree_log_ptrs(cur, nbp, 1, 2);
-
-       /* Fix up the cursor. */
-       xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
-       cur->bc_ptrs[cur->bc_nlevels] = nptr;
-       cur->bc_nlevels++;
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 1;
-       return 0;
-error0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-       return error;
-out0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 0;
-       return 0;
-}
-
-STATIC int
-xfs_btree_make_block_unfull(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       int                     level,  /* btree level */
-       int                     numrecs,/* # of recs in block */
-       int                     *oindex,/* old tree index */
-       int                     *index, /* new tree index */
-       union xfs_btree_ptr     *nptr,  /* new btree ptr */
-       struct xfs_btree_cur    **ncur, /* new btree cursor */
-       union xfs_btree_rec     *nrec,  /* new record */
-       int                     *stat)
-{
-       union xfs_btree_key     key;    /* new btree key value */
-       int                     error = 0;
-
-       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-           level == cur->bc_nlevels - 1) {
-               struct xfs_inode *ip = cur->bc_private.b.ip;
-
-               if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
-                       /* A root block that can be made bigger. */
-                       xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
-               } else {
-                       /* A root block that needs replacing */
-                       int     logflags = 0;
-
-                       error = xfs_btree_new_iroot(cur, &logflags, stat);
-                       if (error || *stat == 0)
-                               return error;
-
-                       xfs_trans_log_inode(cur->bc_tp, ip, logflags);
-               }
-
-               return 0;
-       }
-
-       /* First, try shifting an entry to the right neighbor. */
-       error = xfs_btree_rshift(cur, level, stat);
-       if (error || *stat)
-               return error;
-
-       /* Next, try shifting an entry to the left neighbor. */
-       error = xfs_btree_lshift(cur, level, stat);
-       if (error)
-               return error;
-
-       if (*stat) {
-               *oindex = *index = cur->bc_ptrs[level];
-               return 0;
-       }
-
-       /*
-        * Next, try splitting the current block in half.
-        *
-        * If this works we have to re-set our variables because we
-        * could be in a different block now.
-        */
-       error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
-       if (error || *stat == 0)
-               return error;
-
-
-       *index = cur->bc_ptrs[level];
-       cur->bc_ops->init_rec_from_key(&key, nrec);
-       return 0;
-}
-
-/*
- * Insert one record/level.  Return information to the caller
- * allowing the next level up to proceed if necessary.
- */
-STATIC int
-xfs_btree_insrec(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       int                     level,  /* level to insert record at */
-       union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
-       union xfs_btree_rec     *recp,  /* i/o: record data inserted */
-       struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
-       int                     *stat)  /* success/failure */
-{
-       struct xfs_btree_block  *block; /* btree block */
-       struct xfs_buf          *bp;    /* buffer for block */
-       union xfs_btree_key     key;    /* btree key */
-       union xfs_btree_ptr     nptr;   /* new block ptr */
-       struct xfs_btree_cur    *ncur;  /* new btree cursor */
-       union xfs_btree_rec     nrec;   /* new record (split result) */
-       int                     optr;   /* old key/record index */
-       int                     ptr;    /* key/record index */
-       int                     numrecs;/* number of records */
-       int                     error;  /* error return value */
-#ifdef DEBUG
-       int                     i;
-#endif
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
-
-       ncur = NULL;
-
-       /*
-        * If we have an external root pointer, and we've made it to the
-        * root level, allocate a new root block and we're done.
-        */
-       if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-           (level >= cur->bc_nlevels)) {
-               error = xfs_btree_new_root(cur, stat);
-               xfs_btree_set_ptr_null(cur, ptrp);
-
-               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-               return error;
-       }
-
-       /* If we're off the left edge, return failure. */
-       ptr = cur->bc_ptrs[level];
-       if (ptr == 0) {
-               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-               *stat = 0;
-               return 0;
-       }
-
-       /* Make a key out of the record data to be inserted, and save it. */
-       cur->bc_ops->init_key_from_rec(&key, recp);
-
-       optr = ptr;
-
-       XFS_BTREE_STATS_INC(cur, insrec);
-
-       /* Get pointers to the btree buffer and block. */
-       block = xfs_btree_get_block(cur, level, &bp);
-       numrecs = xfs_btree_get_numrecs(block);
-
-#ifdef DEBUG
-       error = xfs_btree_check_block(cur, block, level, bp);
-       if (error)
-               goto error0;
-
-       /* Check that the new entry is being inserted in the right place. */
-       if (ptr <= numrecs) {
-               if (level == 0) {
-                       ASSERT(cur->bc_ops->recs_inorder(cur, recp,
-                               xfs_btree_rec_addr(cur, ptr, block)));
-               } else {
-                       ASSERT(cur->bc_ops->keys_inorder(cur, &key,
-                               xfs_btree_key_addr(cur, ptr, block)));
-               }
-       }
-#endif
-
-       /*
-        * If the block is full, we can't insert the new entry until we
-        * make the block un-full.
-        */
-       xfs_btree_set_ptr_null(cur, &nptr);
-       if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
-               error = xfs_btree_make_block_unfull(cur, level, numrecs,
-                                       &optr, &ptr, &nptr, &ncur, &nrec, stat);
-               if (error || *stat == 0)
-                       goto error0;
-       }
-
-       /*
-        * The current block may have changed if the block was
-        * previously full and we have just made space in it.
-        */
-       block = xfs_btree_get_block(cur, level, &bp);
-       numrecs = xfs_btree_get_numrecs(block);
-
-#ifdef DEBUG
-       error = xfs_btree_check_block(cur, block, level, bp);
-       if (error)
-               return error;
-#endif
-
-       /*
-        * At this point we know there's room for our new entry in the block
-        * we're pointing at.
-        */
-       XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
-
-       if (level > 0) {
-               /* It's a nonleaf. Make a hole in the keys and ptrs. */
-               union xfs_btree_key     *kp;
-               union xfs_btree_ptr     *pp;
-
-               kp = xfs_btree_key_addr(cur, ptr, block);
-               pp = xfs_btree_ptr_addr(cur, ptr, block);
-
-#ifdef DEBUG
-               for (i = numrecs - ptr; i >= 0; i--) {
-                       error = xfs_btree_check_ptr(cur, pp, i, level);
-                       if (error)
-                               return error;
-               }
-#endif
-
-               xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
-               xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
-
-#ifdef DEBUG
-               error = xfs_btree_check_ptr(cur, ptrp, 0, level);
-               if (error)
-                       goto error0;
-#endif
-
-               /* Now put the new data in, bump numrecs and log it. */
-               xfs_btree_copy_keys(cur, kp, &key, 1);
-               xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
-               numrecs++;
-               xfs_btree_set_numrecs(block, numrecs);
-               xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
-               xfs_btree_log_keys(cur, bp, ptr, numrecs);
-#ifdef DEBUG
-               if (ptr < numrecs) {
-                       ASSERT(cur->bc_ops->keys_inorder(cur, kp,
-                               xfs_btree_key_addr(cur, ptr + 1, block)));
-               }
-#endif
-       } else {
-               /* It's a leaf. Make a hole in the records. */
-               union xfs_btree_rec             *rp;
-
-               rp = xfs_btree_rec_addr(cur, ptr, block);
-
-               xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
-
-               /* Now put the new data in, bump numrecs and log it. */
-               xfs_btree_copy_recs(cur, rp, recp, 1);
-               xfs_btree_set_numrecs(block, ++numrecs);
-               xfs_btree_log_recs(cur, bp, ptr, numrecs);
-#ifdef DEBUG
-               if (ptr < numrecs) {
-                       ASSERT(cur->bc_ops->recs_inorder(cur, rp,
-                               xfs_btree_rec_addr(cur, ptr + 1, block)));
-               }
-#endif
-       }
-
-       /* Log the new number of records in the btree header. */
-       xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
-
-       /* If we inserted at the start of a block, update the parents' keys. */
-       if (optr == 1) {
-               error = xfs_btree_updkey(cur, &key, level + 1);
-               if (error)
-                       goto error0;
-       }
-
-       /*
-        * If we are tracking the last record in the tree and
-        * we are at the far right edge of the tree, update it.
-        */
-       if (xfs_btree_is_lastrec(cur, block, level)) {
-               cur->bc_ops->update_lastrec(cur, block, recp,
-                                           ptr, LASTREC_INSREC);
-       }
-
-       /*
-        * Return the new block number, if any.
-        * If there is one, give back a record value and a cursor too.
-        */
-       *ptrp = nptr;
-       if (!xfs_btree_ptr_is_null(cur, &nptr)) {
-               *recp = nrec;
-               *curp = ncur;
-       }
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 1;
-       return 0;
-
-error0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-       return error;
-}
-
-/*
- * Insert the record at the point referenced by cur.
- *
- * A multi-level split of the tree on insert will invalidate the original
- * cursor.  All callers of this function should assume that the cursor is
- * no longer valid and revalidate it.
- */
-int
-xfs_btree_insert(
-       struct xfs_btree_cur    *cur,
-       int                     *stat)
-{
-       int                     error;  /* error return value */
-       int                     i;      /* result value, 0 for failure */
-       int                     level;  /* current level number in btree */
-       union xfs_btree_ptr     nptr;   /* new block number (split result) */
-       struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
-       struct xfs_btree_cur    *pcur;  /* previous level's cursor */
-       union xfs_btree_rec     rec;    /* record to insert */
-
-       level = 0;
-       ncur = NULL;
-       pcur = cur;
-
-       xfs_btree_set_ptr_null(cur, &nptr);
-       cur->bc_ops->init_rec_from_cur(cur, &rec);
-
-       /*
-        * Loop going up the tree, starting at the leaf level.
-        * Stop when we don't get a split block; that must mean that
-        * the insert is finished with this level.
-        */
-       do {
-               /*
-                * Insert nrec/nptr into this level of the tree.
-                * Note if we fail, nptr will be null.
-                */
-               error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
-               if (error) {
-                       if (pcur != cur)
-                               xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
-                       goto error0;
-               }
-
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               level++;
-
-               /*
-                * See if the cursor we just used should be trashed.
-                * We can't trash the caller's cursor, but we should trash it
-                * if ncur is a new cursor or we're about to be done.
-                */
-               if (pcur != cur &&
-                   (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
-                       /* Save the state from the cursor before we trash it */
-                       if (cur->bc_ops->update_cursor)
-                               cur->bc_ops->update_cursor(pcur, cur);
-                       cur->bc_nlevels = pcur->bc_nlevels;
-                       xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
-               }
-               /* If we got a new cursor, switch to it. */
-               if (ncur) {
-                       pcur = ncur;
-                       ncur = NULL;
-               }
-       } while (!xfs_btree_ptr_is_null(cur, &nptr));
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = i;
-       return 0;
-error0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-       return error;
-}
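
Note the practical consequence of the invalidation rule above: callers
cannot cache the cursor position across a successful insert. A
hypothetical caller sketch (the lookup-first pattern and the EEXIST
policy are assumptions for illustration, not taken from this file):

	int	error;
	int	stat;

	/* Position the cursor at the insertion point first. */
	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);
	if (error)
		goto out;
	if (stat) {
		error = EEXIST;		/* record already present */
		goto out;
	}

	error = xfs_btree_insert(cur, &stat);
	if (error)
		goto out;

	/*
	 * A multi-level split may have re-pointed or replaced the
	 * cursor internals: re-lookup before the next btree operation
	 * rather than reusing the old position.
	 */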
-
-/*
- * Try to merge a non-leaf block back into the inode root.
- *
- * Note: the killroot name comes from the fact that we're effectively
- * killing the old root block.  But because we can't just delete the
- * inode, we have to copy the single block it was pointing to into the
- * inode.
- */
-STATIC int
-xfs_btree_kill_iroot(
-       struct xfs_btree_cur    *cur)
-{
-       int                     whichfork = cur->bc_private.b.whichfork;
-       struct xfs_inode        *ip = cur->bc_private.b.ip;
-       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
-       struct xfs_btree_block  *block;
-       struct xfs_btree_block  *cblock;
-       union xfs_btree_key     *kp;
-       union xfs_btree_key     *ckp;
-       union xfs_btree_ptr     *pp;
-       union xfs_btree_ptr     *cpp;
-       struct xfs_buf          *cbp;
-       int                     level;
-       int                     index;
-       int                     numrecs;
-#ifdef DEBUG
-       union xfs_btree_ptr     ptr;
-       int                     i;
-#endif
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-
-       ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
-       ASSERT(cur->bc_nlevels > 1);
-
-       /*
-        * Don't deal with the case where the root block needs to be a leaf.
-        * We're just going to turn the thing back into extents anyway.
-        */
-       level = cur->bc_nlevels - 1;
-       if (level == 1)
-               goto out0;
-
-       /*
-        * Give up if the root has multiple children.
-        */
-       block = xfs_btree_get_iroot(cur);
-       if (xfs_btree_get_numrecs(block) != 1)
-               goto out0;
-
-       cblock = xfs_btree_get_block(cur, level - 1, &cbp);
-       numrecs = xfs_btree_get_numrecs(cblock);
-
-       /*
-        * Only do this if the next level will fit.
-        * Then the data must be copied up to the inode;
-        * instead of freeing the root, you free the next level.
-        */
-       if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
-               goto out0;
-
-       XFS_BTREE_STATS_INC(cur, killroot);
-
-#ifdef DEBUG
-       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
-       ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
-       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
-       ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
-#endif
-
-       index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
-       if (index) {
-               xfs_iroot_realloc(cur->bc_private.b.ip, index,
-                                 cur->bc_private.b.whichfork);
-               block = ifp->if_broot;
-       }
-
-       be16_add_cpu(&block->bb_numrecs, index);
-       ASSERT(block->bb_numrecs == cblock->bb_numrecs);
-
-       kp = xfs_btree_key_addr(cur, 1, block);
-       ckp = xfs_btree_key_addr(cur, 1, cblock);
-       xfs_btree_copy_keys(cur, kp, ckp, numrecs);
-
-       pp = xfs_btree_ptr_addr(cur, 1, block);
-       cpp = xfs_btree_ptr_addr(cur, 1, cblock);
-#ifdef DEBUG
-       for (i = 0; i < numrecs; i++) {
-               int             error;
-
-               error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
-               if (error) {
-                       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-                       return error;
-               }
-       }
-#endif
-       xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
-
-       cur->bc_ops->free_block(cur, cbp);
-       XFS_BTREE_STATS_INC(cur, free);
-
-       cur->bc_bufs[level - 1] = NULL;
-       be16_add_cpu(&block->bb_level, -1);
-       xfs_trans_log_inode(cur->bc_tp, ip,
-               XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
-       cur->bc_nlevels--;
-out0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       return 0;
-}
-
-/*
- * Kill the current root node, and replace it with its only child node.
- */
-STATIC int
-xfs_btree_kill_root(
-       struct xfs_btree_cur    *cur,
-       struct xfs_buf          *bp,
-       int                     level,
-       union xfs_btree_ptr     *newroot)
-{
-       int                     error;
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_STATS_INC(cur, killroot);
-
-       /*
-        * Update the root pointer, decreasing the level by 1, and then
-        * free the old root.
-        */
-       cur->bc_ops->set_root(cur, newroot, -1);
-
-       error = cur->bc_ops->free_block(cur, bp);
-       if (error) {
-               XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-               return error;
-       }
-
-       XFS_BTREE_STATS_INC(cur, free);
-
-       cur->bc_bufs[level] = NULL;
-       cur->bc_ra[level] = 0;
-       cur->bc_nlevels--;
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       return 0;
-}
-
-STATIC int
-xfs_btree_dec_cursor(
-       struct xfs_btree_cur    *cur,
-       int                     level,
-       int                     *stat)
-{
-       int                     error;
-       int                     i;
-
-       if (level > 0) {
-               error = xfs_btree_decrement(cur, level, &i);
-               if (error)
-                       return error;
-       }
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = 1;
-       return 0;
-}
-
-/*
- * Single level of the btree record deletion routine.
- * Delete record pointed to by cur/level.
- * Remove the record from its block then rebalance the tree.
- * Return 0 for error, 1 for done, 2 to go on to the next level.
- */
-STATIC int                                     /* error */
-xfs_btree_delrec(
-       struct xfs_btree_cur    *cur,           /* btree cursor */
-       int                     level,          /* level removing record from */
-       int                     *stat)          /* fail/done/go-on */
-{
-       struct xfs_btree_block  *block;         /* btree block */
-       union xfs_btree_ptr     cptr;           /* current block ptr */
-       struct xfs_buf          *bp;            /* buffer for block */
-       int                     error;          /* error return value */
-       int                     i;              /* loop counter */
-       union xfs_btree_key     key;            /* storage for keyp */
-       union xfs_btree_key     *keyp = &key;   /* passed to the next level */
-       union xfs_btree_ptr     lptr;           /* left sibling block ptr */
-       struct xfs_buf          *lbp;           /* left buffer pointer */
-       struct xfs_btree_block  *left;          /* left btree block */
-       int                     lrecs = 0;      /* left record count */
-       int                     ptr;            /* key/record index */
-       union xfs_btree_ptr     rptr;           /* right sibling block ptr */
-       struct xfs_buf          *rbp;           /* right buffer pointer */
-       struct xfs_btree_block  *right;         /* right btree block */
-       struct xfs_btree_block  *rrblock;       /* right-right btree block */
-       struct xfs_buf          *rrbp;          /* right-right buffer pointer */
-       int                     rrecs = 0;      /* right record count */
-       struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
-       int                     numrecs;        /* temporary numrec count */
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_TRACE_ARGI(cur, level);
-
-       tcur = NULL;
-
-       /* Get the index of the entry being deleted, check for nothing there. */
-       ptr = cur->bc_ptrs[level];
-       if (ptr == 0) {
-               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-               *stat = 0;
-               return 0;
-       }
-
-       /* Get the buffer & block containing the record or key/ptr. */
-       block = xfs_btree_get_block(cur, level, &bp);
-       numrecs = xfs_btree_get_numrecs(block);
-
-#ifdef DEBUG
-       error = xfs_btree_check_block(cur, block, level, bp);
-       if (error)
-               goto error0;
-#endif
-
-       /* Fail if we're off the end of the block. */
-       if (ptr > numrecs) {
-               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-               *stat = 0;
-               return 0;
-       }
-
-       XFS_BTREE_STATS_INC(cur, delrec);
-       XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
-
-       /* Excise the entries being deleted. */
-       if (level > 0) {
-               /* It's a nonleaf. Operate on keys and ptrs. */
-               union xfs_btree_key     *lkp;
-               union xfs_btree_ptr     *lpp;
-
-               lkp = xfs_btree_key_addr(cur, ptr + 1, block);
-               lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
-
-#ifdef DEBUG
-               for (i = 0; i < numrecs - ptr; i++) {
-                       error = xfs_btree_check_ptr(cur, lpp, i, level);
-                       if (error)
-                               goto error0;
-               }
-#endif
-
-               if (ptr < numrecs) {
-                       xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
-                       xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
-                       xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
-                       xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
-               }
-
-               /*
-                * If it's the first record in the block, we'll need to pass a
-                * key up to the next level (updkey).
-                */
-               if (ptr == 1)
-                       keyp = xfs_btree_key_addr(cur, 1, block);
-       } else {
-               /* It's a leaf. Operate on records. */
-               if (ptr < numrecs) {
-                       xfs_btree_shift_recs(cur,
-                               xfs_btree_rec_addr(cur, ptr + 1, block),
-                               -1, numrecs - ptr);
-                       xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
-               }
-
-               /*
-                * If it's the first record in the block, we'll need a key
-                * structure to pass up to the next level (updkey).
-                */
-               if (ptr == 1) {
-                       cur->bc_ops->init_key_from_rec(&key,
-                                       xfs_btree_rec_addr(cur, 1, block));
-                       keyp = &key;
-               }
-       }
-
-       /*
-        * Decrement and log the number of entries in the block.
-        */
-       xfs_btree_set_numrecs(block, --numrecs);
-       xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
-
-       /*
-        * If we are tracking the last record in the tree and
-        * we are at the far right edge of the tree, update it.
-        */
-       if (xfs_btree_is_lastrec(cur, block, level)) {
-               cur->bc_ops->update_lastrec(cur, block, NULL,
-                                           ptr, LASTREC_DELREC);
-       }
-
-       /*
-        * We're at the root level.  First, shrink the root block in-memory.
-        * Try to get rid of the next level down.  If we can't, then there's
-        * nothing left to do.
-        */
-       if (level == cur->bc_nlevels - 1) {
-               if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
-                       xfs_iroot_realloc(cur->bc_private.b.ip, -1,
-                                         cur->bc_private.b.whichfork);
-
-                       error = xfs_btree_kill_iroot(cur);
-                       if (error)
-                               goto error0;
-
-                       error = xfs_btree_dec_cursor(cur, level, stat);
-                       if (error)
-                               goto error0;
-                       *stat = 1;
-                       return 0;
-               }
-
-               /*
-                * If this is the root level, and there's only one entry left,
-                * and it's NOT the leaf level, then we can get rid of this
-                * level.
-                */
-               if (numrecs == 1 && level > 0) {
-                       union xfs_btree_ptr     *pp;
-                       /*
-                        * pp is still set to the first pointer in the block.
-                        * Make it the new root of the btree.
-                        */
-                       pp = xfs_btree_ptr_addr(cur, 1, block);
-                       error = xfs_btree_kill_root(cur, bp, level, pp);
-                       if (error)
-                               goto error0;
-               } else if (level > 0) {
-                       error = xfs_btree_dec_cursor(cur, level, stat);
-                       if (error)
-                               goto error0;
-               }
-               *stat = 1;
-               return 0;
-       }
-
-       /*
-        * If we deleted the leftmost entry in the block, update the
-        * key values above us in the tree.
-        */
-       if (ptr == 1) {
-               error = xfs_btree_updkey(cur, keyp, level + 1);
-               if (error)
-                       goto error0;
-       }
-
-       /*
-        * If the number of records remaining in the block is at least
-        * the minimum, we're done.
-        */
-       if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
-               error = xfs_btree_dec_cursor(cur, level, stat);
-               if (error)
-                       goto error0;
-               return 0;
-       }
-
-       /*
-        * Otherwise, we have to move some records around to keep the
-        * tree balanced.  Look at the left and right sibling blocks to
-        * see if we can re-balance by moving only one record.
-        */
-       xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
-       xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
-
-       if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
-               /*
-                * The root has one child; we need a chance to copy its
-                * contents into the root and delete the child.  We can't
-                * go up to the next level; there's nothing to delete there.
-                */
-               if (xfs_btree_ptr_is_null(cur, &rptr) &&
-                   xfs_btree_ptr_is_null(cur, &lptr) &&
-                   level == cur->bc_nlevels - 2) {
-                       error = xfs_btree_kill_iroot(cur);
-                       if (!error)
-                               error = xfs_btree_dec_cursor(cur, level, stat);
-                       if (error)
-                               goto error0;
-                       return 0;
-               }
-       }
-
-       ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
-              !xfs_btree_ptr_is_null(cur, &lptr));
-
-       /*
-        * Duplicate the cursor so our btree manipulations here won't
-        * disrupt the next level up.
-        */
-       error = xfs_btree_dup_cursor(cur, &tcur);
-       if (error)
-               goto error0;
-
-       /*
-        * If there's a right sibling, see if it's ok to shift an entry
-        * out of it.
-        */
-       if (!xfs_btree_ptr_is_null(cur, &rptr)) {
-               /*
-                * Move the temp cursor to the last entry in the next block.
-                * Actually any entry but the first would suffice.
-                */
-               i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-               error = xfs_btree_increment(tcur, level, &i);
-               if (error)
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-               i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-               /* Grab a pointer to the block. */
-               right = xfs_btree_get_block(tcur, level, &rbp);
-#ifdef DEBUG
-               error = xfs_btree_check_block(tcur, right, level, rbp);
-               if (error)
-                       goto error0;
-#endif
-               /* Grab the current block number, for future use. */
-               xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
-
-               /*
-                * If the right block is full enough that removing one entry
-                * won't make it too empty, and left-shifting an entry out
-                * of right to us works, we're done.
-                */
-               if (xfs_btree_get_numrecs(right) - 1 >=
-                   cur->bc_ops->get_minrecs(tcur, level)) {
-                       error = xfs_btree_lshift(tcur, level, &i);
-                       if (error)
-                               goto error0;
-                       if (i) {
-                               ASSERT(xfs_btree_get_numrecs(block) >=
-                                      cur->bc_ops->get_minrecs(tcur, level));
-
-                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-                               tcur = NULL;
-
-                               error = xfs_btree_dec_cursor(cur, level, stat);
-                               if (error)
-                                       goto error0;
-                               return 0;
-                       }
-               }
-
-               /*
-                * Otherwise, grab the number of records in right for
-                * future reference, and fix up the temp cursor to point
-                * to our block again (last record).
-                */
-               rrecs = xfs_btree_get_numrecs(right);
-               if (!xfs_btree_ptr_is_null(cur, &lptr)) {
-                       i = xfs_btree_firstrec(tcur, level);
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-                       error = xfs_btree_decrement(tcur, level, &i);
-                       if (error)
-                               goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               }
-       }
-
-       /*
-        * If there's a left sibling, see if it's ok to shift an entry
-        * out of it.
-        */
-       if (!xfs_btree_ptr_is_null(cur, &lptr)) {
-               /*
-                * Move the temp cursor to the first entry in the
-                * previous block.
-                */
-               i = xfs_btree_firstrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-               error = xfs_btree_decrement(tcur, level, &i);
-               if (error)
-                       goto error0;
-               i = xfs_btree_firstrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-               /* Grab a pointer to the block. */
-               left = xfs_btree_get_block(tcur, level, &lbp);
-#ifdef DEBUG
-               error = xfs_btree_check_block(cur, left, level, lbp);
-               if (error)
-                       goto error0;
-#endif
-               /* Grab the current block number, for future use. */
-               xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
-
-               /*
-                * If the left block is full enough that removing one entry
-                * won't make it too empty, and right-shifting an entry out
-                * of left to us works, we're done.
-                */
-               if (xfs_btree_get_numrecs(left) - 1 >=
-                   cur->bc_ops->get_minrecs(tcur, level)) {
-                       error = xfs_btree_rshift(tcur, level, &i);
-                       if (error)
-                               goto error0;
-                       if (i) {
-                               ASSERT(xfs_btree_get_numrecs(block) >=
-                                      cur->bc_ops->get_minrecs(tcur, level));
-                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-                               tcur = NULL;
-                               if (level == 0)
-                                       cur->bc_ptrs[0]++;
-                               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-                               *stat = 1;
-                               return 0;
-                       }
-               }
-
-               /*
-                * Otherwise, grab the number of records in left for
-                * future reference.
-                */
-               lrecs = xfs_btree_get_numrecs(left);
-       }
-
-       /* Delete the temp cursor, we're done with it. */
-       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-       tcur = NULL;
-
-       /* If here, we need to do a join to keep the tree balanced. */
-       ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
-
-       if (!xfs_btree_ptr_is_null(cur, &lptr) &&
-           lrecs + xfs_btree_get_numrecs(block) <=
-                       cur->bc_ops->get_maxrecs(cur, level)) {
-               /*
-                * Set "right" to be the starting block,
-                * "left" to be the left neighbor.
-                */
-               rptr = cptr;
-               right = block;
-               rbp = bp;
-               error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
-               if (error)
-                       goto error0;
-
-       /*
-        * If that won't work, see if we can join with the right neighbor block.
-        */
-       } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
-                  rrecs + xfs_btree_get_numrecs(block) <=
-                       cur->bc_ops->get_maxrecs(cur, level)) {
-               /*
-                * Set "left" to be the starting block,
-                * "right" to be the right neighbor.
-                */
-               lptr = cptr;
-               left = block;
-               lbp = bp;
-               error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
-               if (error)
-                       goto error0;
-
-       /*
-        * Otherwise, we can't fix the imbalance.
-        * Just return.  This is probably a logic error, but it's not fatal.
-        */
-       } else {
-               error = xfs_btree_dec_cursor(cur, level, stat);
-               if (error)
-                       goto error0;
-               return 0;
-       }
-
-       rrecs = xfs_btree_get_numrecs(right);
-       lrecs = xfs_btree_get_numrecs(left);
-
-       /*
-        * We're now going to join "left" and "right" by moving all the stuff
-        * in "right" to "left" and deleting "right".
-        */
-       XFS_BTREE_STATS_ADD(cur, moves, rrecs);
-       if (level > 0) {
-               /* It's a non-leaf.  Move keys and pointers. */
-               union xfs_btree_key     *lkp;   /* left btree key */
-               union xfs_btree_ptr     *lpp;   /* left address pointer */
-               union xfs_btree_key     *rkp;   /* right btree key */
-               union xfs_btree_ptr     *rpp;   /* right address pointer */
-
-               lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
-               lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
-               rkp = xfs_btree_key_addr(cur, 1, right);
-               rpp = xfs_btree_ptr_addr(cur, 1, right);
-#ifdef DEBUG
-               for (i = 1; i < rrecs; i++) {
-                       error = xfs_btree_check_ptr(cur, rpp, i, level);
-                       if (error)
-                               goto error0;
-               }
-#endif
-               xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
-               xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
-
-               xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
-               xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
-       } else {
-               /* It's a leaf.  Move records.  */
-               union xfs_btree_rec     *lrp;   /* left record pointer */
-               union xfs_btree_rec     *rrp;   /* right record pointer */
-
-               lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
-               rrp = xfs_btree_rec_addr(cur, 1, right);
-
-               xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
-               xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
-       }
-
-       XFS_BTREE_STATS_INC(cur, join);
-
-       /*
-        * Fix up the number of records and right block pointer in the
-        * surviving block, and log it.
-        */
-       xfs_btree_set_numrecs(left, lrecs + rrecs);
-       xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
-       xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
-       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-
-       /* If there is a right sibling, point it to the remaining block. */
-       xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
-       if (!xfs_btree_ptr_is_null(cur, &cptr)) {
-               error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
-               if (error)
-                       goto error0;
-               xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
-               xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
-       }
-
-       /* Free the deleted block. */
-       error = cur->bc_ops->free_block(cur, rbp);
-       if (error)
-               goto error0;
-       XFS_BTREE_STATS_INC(cur, free);
-
-       /*
-        * If we joined with the left neighbor, set the buffer in the
-        * cursor to the left block, and fix up the index.
-        */
-       if (bp != lbp) {
-               cur->bc_bufs[level] = lbp;
-               cur->bc_ptrs[level] += lrecs;
-               cur->bc_ra[level] = 0;
-       }
-       /*
-        * If we joined with the right neighbor and there's a level above
-        * us, increment the cursor at that level.
-        */
-       else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
-                  (level + 1 < cur->bc_nlevels)) {
-               error = xfs_btree_increment(cur, level + 1, &i);
-               if (error)
-                       goto error0;
-       }
-
-       /*
-        * Readjust the ptr at this level if it's not a leaf, since it's
-        * still pointing at the deletion point, which makes the cursor
-        * inconsistent.  If this makes the ptr 0, the caller fixes it up.
-        * We can't use decrement because it would change the next level up.
-        */
-       if (level > 0)
-               cur->bc_ptrs[level]--;
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       /* Return value means the next level up has something to do. */
-       *stat = 2;
-       return 0;
-
-error0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-       if (tcur)
-               xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-       return error;
-}
-
-/*
- * Delete the record pointed to by cur.
- * On return, the cursor refers to the place where the record was
- * (and could be re-inserted).
- */
-int                                    /* error */
-xfs_btree_delete(
-       struct xfs_btree_cur    *cur,
-       int                     *stat)  /* success/failure */
-{
-       int                     error;  /* error return value */
-       int                     level;
-       int                     i;
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-
-       /*
-        * Go up the tree, starting at leaf level.
-        *
-        * If 2 is returned then a join was done; go to the next level.
-        * Otherwise we are done.
-        */
-       for (level = 0, i = 2; i == 2; level++) {
-               error = xfs_btree_delrec(cur, level, &i);
-               if (error)
-                       goto error0;
-       }
-
-       if (i == 0) {
-               for (level = 1; level < cur->bc_nlevels; level++) {
-                       if (cur->bc_ptrs[level] == 0) {
-                               error = xfs_btree_decrement(cur, level, &i);
-                               if (error)
-                                       goto error0;
-                               break;
-                       }
-               }
-       }
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-       *stat = i;
-       return 0;
-error0:
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-       return error;
-}
-
-/*
- * Get the data from the pointed-to record.
- */
-int                                    /* error */
-xfs_btree_get_rec(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       union xfs_btree_rec     **recp, /* output: btree record */
-       int                     *stat)  /* output: success/failure */
-{
-       struct xfs_btree_block  *block; /* btree block */
-       struct xfs_buf          *bp;    /* buffer pointer */
-       int                     ptr;    /* record number */
-#ifdef DEBUG
-       int                     error;  /* error return value */
-#endif
-
-       ptr = cur->bc_ptrs[0];
-       block = xfs_btree_get_block(cur, 0, &bp);
-
-#ifdef DEBUG
-       error = xfs_btree_check_block(cur, block, 0, bp);
-       if (error)
-               return error;
-#endif
-
-       /*
-        * Off the right end or left end, return failure.
-        */
-       if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
-               *stat = 0;
-               return 0;
-       }
-
-       /*
-        * Point to the record and extract its data.
-        */
-       *recp = xfs_btree_rec_addr(cur, ptr, block);
-       *stat = 1;
-       return 0;
-}
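
Taken together with xfs_btree_increment(), this is enough to build a
full leaf-level walk. A sketch under the assumption that the cursor has
already been positioned at the first record, with process_rec() standing
in for a hypothetical consumer:

	union xfs_btree_rec	*rec;
	int			error;
	int			stat = 1;

	while (stat) {
		error = xfs_btree_get_rec(cur, &rec, &stat);
		if (error || !stat)
			break;
		process_rec(rec);	/* hypothetical consumer */

		/* Step right; stat == 0 means we ran off the end. */
		error = xfs_btree_increment(cur, 0, &stat);
		if (error)
			break;
	}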
-
-/*
- * Change the owner of a btree.
- *
- * The mechanism we use here is ordered buffer logging. Because we don't know
- * how many buffers we are going to need to modify, we don't really want to
- * have to make transaction reservations for the worst case of every buffer in a
- * full-size btree, as that may be more space than we can fit in the log...
- *
- * We do the btree walk in the most efficient manner possible - we have sibling
- * pointers so we can just walk all the blocks on each level from left to right
- * in a single pass, and then move to the next level and do the same. We can
- * also do readahead on the sibling pointers to get IO moving more quickly,
- * though for slow disks this is unlikely to make much difference to performance
- * as the amount of CPU work we have to do before moving to the next block is
- * relatively small.
- *
- * For each btree block that we load, modify the owner appropriately, set the
- * buffer as an ordered buffer and log it appropriately. We need to ensure that
- * we mark the region we change dirty so that if the buffer is relogged in
- * a subsequent transaction the changes we make here as an ordered buffer are
- * correctly relogged in that transaction.  If we are in recovery context, then
- * just queue the modified buffer as a delayed write buffer so the transaction
- * recovery completion writes the changes to disk.
- */
-static int
-xfs_btree_block_change_owner(
-       struct xfs_btree_cur    *cur,
-       int                     level,
-       __uint64_t              new_owner,
-       struct list_head        *buffer_list)
-{
-       struct xfs_btree_block  *block;
-       struct xfs_buf          *bp;
-       union xfs_btree_ptr     rptr;
-
-       /* do right sibling readahead */
-       xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
-
-       /* modify the owner */
-       block = xfs_btree_get_block(cur, level, &bp);
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-               block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
-       else
-               block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
-
-       /*
-        * If the block is a root block hosted in an inode, we might not have a
-        * buffer pointer here and we shouldn't attempt to log the change as the
-        * information is already held in the inode and discarded when the root
-        * block is formatted into the on-disk inode fork. We still change it,
-        * though, so everything is consistent in memory.
-        */
-       if (bp) {
-               if (cur->bc_tp) {
-                       xfs_trans_ordered_buf(cur->bc_tp, bp);
-                       xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
-               } else {
-                       xfs_buf_delwri_queue(bp, buffer_list);
-               }
-       } else {
-               ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
-               ASSERT(level == cur->bc_nlevels - 1);
-       }
-
-       /* now read the right sibling block for the next iteration */
-       xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
-       if (xfs_btree_ptr_is_null(cur, &rptr))
-               return ENOENT;
-
-       return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
-}
-
-int
-xfs_btree_change_owner(
-       struct xfs_btree_cur    *cur,
-       __uint64_t              new_owner,
-       struct list_head        *buffer_list)
-{
-       union xfs_btree_ptr     lptr;
-       int                     level;
-       struct xfs_btree_block  *block = NULL;
-       int                     error = 0;
-
-       cur->bc_ops->init_ptr_from_cur(cur, &lptr);
-
-       /* for each level */
-       for (level = cur->bc_nlevels - 1; level >= 0; level--) {
-               /* grab the left hand block */
-               error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
-               if (error)
-                       return error;
-
-               /* readahead the left most block for the next level down */
-               if (level > 0) {
-                       union xfs_btree_ptr     *ptr;
-
-                       ptr = xfs_btree_ptr_addr(cur, 1, block);
-                       xfs_btree_readahead_ptr(cur, ptr, 1);
-
-                       /* save for the next iteration of the loop */
-                       lptr = *ptr;
-               }
-
-               /* for each buffer in the level */
-               do {
-                       error = xfs_btree_block_change_owner(cur, level,
-                                                            new_owner,
-                                                            buffer_list);
-               } while (!error);
-
-               if (error != ENOENT)
-                       return error;
-       }
-
-       return 0;
-}
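
The owner change above sweeps each btree level left to right via sibling pointers before dropping down to the next level. A minimal userspace sketch of that traversal pattern follows; the types and names are hypothetical illustrations, not XFS APIs.

#include <stdint.h>

struct demo_block {
	uint64_t		owner;
	struct demo_block	*right_sibling;	/* NULL terminates a level */
	struct demo_block	*first_child;	/* leftmost child, NULL at leaves */
};

static void demo_change_owner(struct demo_block *root, uint64_t new_owner)
{
	struct demo_block *level;
	struct demo_block *blk;

	/* start at the root level and work down towards the leaves */
	for (level = root; level; level = level->first_child) {
		/* sweep every block on this level via sibling pointers */
		for (blk = level; blk; blk = blk->right_sibling)
			blk->owner = new_owner;
	}
}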
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
deleted file mode 100644 (file)
index a04b694..0000000
+++ /dev/null
@@ -1,468 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_BTREE_H__
-#define        __XFS_BTREE_H__
-
-struct xfs_buf;
-struct xfs_bmap_free;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_trans;
-
-extern kmem_zone_t     *xfs_btree_cur_zone;
-
-/*
- * Generic key, ptr and record wrapper structures.
- *
- * These are disk format structures, and are converted where necessary
- * by the btree specific code that needs to interpret them.
- */
-union xfs_btree_ptr {
-       __be32                  s;      /* short form ptr */
-       __be64                  l;      /* long form ptr */
-};
-
-union xfs_btree_key {
-       xfs_bmbt_key_t          bmbt;
-       xfs_bmdr_key_t          bmbr;   /* bmbt root block */
-       xfs_alloc_key_t         alloc;
-       xfs_inobt_key_t         inobt;
-};
-
-union xfs_btree_rec {
-       xfs_bmbt_rec_t          bmbt;
-       xfs_bmdr_rec_t          bmbr;   /* bmbt root block */
-       xfs_alloc_rec_t         alloc;
-       xfs_inobt_rec_t         inobt;
-};
-
-/*
- * This nonsense is to make -wlint happy.
- */
-#define        XFS_LOOKUP_EQ   ((xfs_lookup_t)XFS_LOOKUP_EQi)
-#define        XFS_LOOKUP_LE   ((xfs_lookup_t)XFS_LOOKUP_LEi)
-#define        XFS_LOOKUP_GE   ((xfs_lookup_t)XFS_LOOKUP_GEi)
-
-#define        XFS_BTNUM_BNO   ((xfs_btnum_t)XFS_BTNUM_BNOi)
-#define        XFS_BTNUM_CNT   ((xfs_btnum_t)XFS_BTNUM_CNTi)
-#define        XFS_BTNUM_BMAP  ((xfs_btnum_t)XFS_BTNUM_BMAPi)
-#define        XFS_BTNUM_INO   ((xfs_btnum_t)XFS_BTNUM_INOi)
-#define        XFS_BTNUM_FINO  ((xfs_btnum_t)XFS_BTNUM_FINOi)
-
-/*
- * For logging record fields.
- */
-#define        XFS_BB_MAGIC            (1 << 0)
-#define        XFS_BB_LEVEL            (1 << 1)
-#define        XFS_BB_NUMRECS          (1 << 2)
-#define        XFS_BB_LEFTSIB          (1 << 3)
-#define        XFS_BB_RIGHTSIB         (1 << 4)
-#define        XFS_BB_BLKNO            (1 << 5)
-#define        XFS_BB_LSN              (1 << 6)
-#define        XFS_BB_UUID             (1 << 7)
-#define        XFS_BB_OWNER            (1 << 8)
-#define        XFS_BB_NUM_BITS         5
-#define        XFS_BB_ALL_BITS         ((1 << XFS_BB_NUM_BITS) - 1)
-#define        XFS_BB_NUM_BITS_CRC     9
-#define        XFS_BB_ALL_BITS_CRC     ((1 << XFS_BB_NUM_BITS_CRC) - 1)
-
-/*
- * Generic stats interface
- */
-#define __XFS_BTREE_STATS_INC(type, stat) \
-       XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
-#define XFS_BTREE_STATS_INC(cur, stat)  \
-do {    \
-       switch (cur->bc_btnum) {  \
-       case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break;   \
-       case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break;   \
-       case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break;  \
-       case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break;    \
-       case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break;  \
-       case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break;       \
-       }       \
-} while (0)
-
-#define __XFS_BTREE_STATS_ADD(type, stat, val) \
-       XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
-#define XFS_BTREE_STATS_ADD(cur, stat, val)  \
-do {    \
-       switch (cur->bc_btnum) {  \
-       case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
-       case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
-       case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
-       case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
-       case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \
-       case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break;       \
-       }       \
-} while (0)
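
The stats macros above fan a single call out to per-btree-type counters by pasting the type prefix into the counter name with the ## operator. A self-contained illustration of the same token-pasting dispatch, with counter names invented for the example:

#include <stdio.h>

/* Hypothetical per-type counters standing in for the XFS stats fields. */
static unsigned long xs_abtb_2_lookup, xs_bmbt_2_lookup;

/* Paste the type prefix into the counter name, as the XFS macros do. */
#define DEMO_STATS_INC(type, stat)	(xs_ ## type ## _2_ ## stat)++

int main(void)
{
	DEMO_STATS_INC(abtb, lookup);	/* increments xs_abtb_2_lookup */
	DEMO_STATS_INC(bmbt, lookup);	/* increments xs_bmbt_2_lookup */
	printf("%lu %lu\n", xs_abtb_2_lookup, xs_bmbt_2_lookup);
	return 0;
}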
-
-#define        XFS_BTREE_MAXLEVELS     8       /* max of all btrees */
-
-struct xfs_btree_ops {
-       /* size of the key and record structures */
-       size_t  key_len;
-       size_t  rec_len;
-
-       /* cursor operations */
-       struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
-       void    (*update_cursor)(struct xfs_btree_cur *src,
-                                struct xfs_btree_cur *dst);
-
-       /* update btree root pointer */
-       void    (*set_root)(struct xfs_btree_cur *cur,
-                           union xfs_btree_ptr *nptr, int level_change);
-
-       /* block allocation / freeing */
-       int     (*alloc_block)(struct xfs_btree_cur *cur,
-                              union xfs_btree_ptr *start_bno,
-                              union xfs_btree_ptr *new_bno,
-                              int *stat);
-       int     (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
-
-       /* update last record information */
-       void    (*update_lastrec)(struct xfs_btree_cur *cur,
-                                 struct xfs_btree_block *block,
-                                 union xfs_btree_rec *rec,
-                                 int ptr, int reason);
-
-       /* records in block/level */
-       int     (*get_minrecs)(struct xfs_btree_cur *cur, int level);
-       int     (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
-
-       /* records on disk.  Matters for the root-in-inode case. */
-       int     (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
-
-       /* init values of btree structures */
-       void    (*init_key_from_rec)(union xfs_btree_key *key,
-                                    union xfs_btree_rec *rec);
-       void    (*init_rec_from_key)(union xfs_btree_key *key,
-                                    union xfs_btree_rec *rec);
-       void    (*init_rec_from_cur)(struct xfs_btree_cur *cur,
-                                    union xfs_btree_rec *rec);
-       void    (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
-                                    union xfs_btree_ptr *ptr);
-
-       /* difference between key value and cursor value */
-       __int64_t (*key_diff)(struct xfs_btree_cur *cur,
-                             union xfs_btree_key *key);
-
-       const struct xfs_buf_ops        *buf_ops;
-
-#if defined(DEBUG) || defined(XFS_WARN)
-       /* check that k1 is lower than k2 */
-       int     (*keys_inorder)(struct xfs_btree_cur *cur,
-                               union xfs_btree_key *k1,
-                               union xfs_btree_key *k2);
-
-       /* check that r1 is lower than r2 */
-       int     (*recs_inorder)(struct xfs_btree_cur *cur,
-                               union xfs_btree_rec *r1,
-                               union xfs_btree_rec *r2);
-#endif
-};
-
-/*
- * Reasons for the update_lastrec method to be called.
- */
-#define LASTREC_UPDATE 0
-#define LASTREC_INSREC 1
-#define LASTREC_DELREC 2
-
-
-/*
- * Btree cursor structure.
- * This collects all information needed by the btree code in one place.
- */
-typedef struct xfs_btree_cur
-{
-       struct xfs_trans        *bc_tp; /* transaction we're in, if any */
-       struct xfs_mount        *bc_mp; /* file system mount struct */
-       const struct xfs_btree_ops *bc_ops;
-       uint                    bc_flags; /* btree features - below */
-       union {
-               xfs_alloc_rec_incore_t  a;
-               xfs_bmbt_irec_t         b;
-               xfs_inobt_rec_incore_t  i;
-       }               bc_rec;         /* current insert/search record value */
-       struct xfs_buf  *bc_bufs[XFS_BTREE_MAXLEVELS];  /* buf ptr per level */
-       int             bc_ptrs[XFS_BTREE_MAXLEVELS];   /* key/record # */
-       __uint8_t       bc_ra[XFS_BTREE_MAXLEVELS];     /* readahead bits */
-#define        XFS_BTCUR_LEFTRA        1       /* left sibling has been read-ahead */
-#define        XFS_BTCUR_RIGHTRA       2       /* right sibling has been read-ahead */
-       __uint8_t       bc_nlevels;     /* number of levels in the tree */
-       __uint8_t       bc_blocklog;    /* log2(blocksize) of btree blocks */
-       xfs_btnum_t     bc_btnum;       /* identifies which btree type */
-       union {
-               struct {                        /* needed for BNO, CNT, INO */
-                       struct xfs_buf  *agbp;  /* agf/agi buffer pointer */
-                       xfs_agnumber_t  agno;   /* ag number */
-               } a;
-               struct {                        /* needed for BMAP */
-                       struct xfs_inode *ip;   /* pointer to our inode */
-                       struct xfs_bmap_free *flist;    /* list to free after */
-                       xfs_fsblock_t   firstblock;     /* 1st blk allocated */
-                       int             allocated;      /* count of alloced */
-                       short           forksize;       /* fork's inode space */
-                       char            whichfork;      /* data or attr fork */
-                       char            flags;          /* flags */
-#define        XFS_BTCUR_BPRV_WASDEL   1                       /* was delayed */
-               } b;
-       }               bc_private;     /* per-btree type data */
-} xfs_btree_cur_t;
-
-/* cursor flags */
-#define XFS_BTREE_LONG_PTRS            (1<<0)  /* pointers are 64bits long */
-#define XFS_BTREE_ROOT_IN_INODE                (1<<1)  /* root may be variable size */
-#define XFS_BTREE_LASTREC_UPDATE       (1<<2)  /* track last rec externally */
-#define XFS_BTREE_CRC_BLOCKS           (1<<3)  /* uses extended btree blocks */
-
-
-#define        XFS_BTREE_NOERROR       0
-#define        XFS_BTREE_ERROR         1
-
-/*
- * Convert from buffer to btree block header.
- */
-#define        XFS_BUF_TO_BLOCK(bp)    ((struct xfs_btree_block *)((bp)->b_addr))
-
-
-/*
- * Check that block header is ok.
- */
-int
-xfs_btree_check_block(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       struct xfs_btree_block  *block, /* generic btree block pointer */
-       int                     level,  /* level of the btree block */
-       struct xfs_buf          *bp);   /* buffer containing block, if any */
-
-/*
- * Check that (long) pointer is ok.
- */
-int                                    /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lptr(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_dfsbno_t            ptr,    /* btree block disk address */
-       int                     level); /* btree block level */
-
-/*
- * Delete the btree cursor.
- */
-void
-xfs_btree_del_cursor(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     error); /* del because of error */
-
-/*
- * Duplicate the btree cursor.
- * Allocate a new one, copy the record, re-get the buffers.
- */
-int                                    /* error */
-xfs_btree_dup_cursor(
-       xfs_btree_cur_t         *cur,   /* input cursor */
-       xfs_btree_cur_t         **ncur);/* output cursor */
-
-/*
- * Get a buffer for the block, return it with no data read.
- * Long-form addressing.
- */
-struct xfs_buf *                               /* buffer for fsbno */
-xfs_btree_get_bufl(
-       struct xfs_mount        *mp,    /* file system mount point */
-       struct xfs_trans        *tp,    /* transaction pointer */
-       xfs_fsblock_t           fsbno,  /* file system block number */
-       uint                    lock);  /* lock flags for get_buf */
-
-/*
- * Get a buffer for the block, return it with no data read.
- * Short-form addressing.
- */
-struct xfs_buf *                               /* buffer for agno/agbno */
-xfs_btree_get_bufs(
-       struct xfs_mount        *mp,    /* file system mount point */
-       struct xfs_trans        *tp,    /* transaction pointer */
-       xfs_agnumber_t          agno,   /* allocation group number */
-       xfs_agblock_t           agbno,  /* allocation group block number */
-       uint                    lock);  /* lock flags for get_buf */
-
-/*
- * Check for the cursor referring to the last block at the given level.
- */
-int                                    /* 1=is last block, 0=not last block */
-xfs_btree_islastblock(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level); /* level to check */
-
-/*
- * Compute first and last byte offsets for the fields given.
- * Interprets the offsets table, which contains struct field offsets.
- */
-void
-xfs_btree_offsets(
-       __int64_t               fields, /* bitmask of fields */
-       const short             *offsets,/* table of field offsets */
-       int                     nbits,  /* number of bits to inspect */
-       int                     *first, /* output: first byte offset */
-       int                     *last); /* output: last byte offset */
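
xfs_btree_offsets() turns a bitmask of dirtied fields into a single contiguous byte range by indexing a table of field offsets: the lowest set bit gives the first dirty byte, and the range ends where the field after the highest set bit begins. A small sketch of that idea, with the field layout invented for illustration:

#include <stdio.h>

/*
 * offsets[i] is the byte offset of field i; offsets[nbits] holds the
 * total structure size so the last field has a defined end.
 */
static void demo_offsets(long long fields, const short *offsets, int nbits,
			 int *first, int *last)
{
	int i;

	/* lowest set bit gives the first dirty byte */
	for (i = 0; i < nbits; i++) {
		if (fields & (1LL << i)) {
			*first = offsets[i];
			break;
		}
	}
	/* highest set bit: the dirty range ends where the next field starts */
	for (i = nbits - 1; i >= 0; i--) {
		if (fields & (1LL << i)) {
			*last = offsets[i + 1] - 1;
			break;
		}
	}
}

int main(void)
{
	/* three fields at offsets 0, 4, 8 in a 16-byte structure */
	static const short offsets[] = { 0, 4, 8, 16 };
	int first = 0, last = 0;

	demo_offsets(0x6, offsets, 3, &first, &last);	/* fields 1 and 2 */
	printf("bytes %d..%d\n", first, last);		/* bytes 4..15 */
	return 0;
}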
-
-/*
- * Get a buffer for the block, return it read in.
- * Long-form addressing.
- */
-int                                    /* error */
-xfs_btree_read_bufl(
-       struct xfs_mount        *mp,    /* file system mount point */
-       struct xfs_trans        *tp,    /* transaction pointer */
-       xfs_fsblock_t           fsbno,  /* file system block number */
-       uint                    lock,   /* lock flags for read_buf */
-       struct xfs_buf          **bpp,  /* buffer for fsbno */
-       int                     refval, /* ref count value for buffer */
-       const struct xfs_buf_ops *ops);
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Long-form addressing.
- */
-void                                   /* error */
-xfs_btree_reada_bufl(
-       struct xfs_mount        *mp,    /* file system mount point */
-       xfs_fsblock_t           fsbno,  /* file system block number */
-       xfs_extlen_t            count,  /* count of filesystem blocks */
-       const struct xfs_buf_ops *ops);
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Short-form addressing.
- */
-void                                   /* error */
-xfs_btree_reada_bufs(
-       struct xfs_mount        *mp,    /* file system mount point */
-       xfs_agnumber_t          agno,   /* allocation group number */
-       xfs_agblock_t           agbno,  /* allocation group block number */
-       xfs_extlen_t            count,  /* count of filesystem blocks */
-       const struct xfs_buf_ops *ops);
-
-/*
- * Initialise a new btree block header
- */
-void
-xfs_btree_init_block(
-       struct xfs_mount *mp,
-       struct xfs_buf  *bp,
-       __u32           magic,
-       __u16           level,
-       __u16           numrecs,
-       __u64           owner,
-       unsigned int    flags);
-
-void
-xfs_btree_init_block_int(
-       struct xfs_mount        *mp,
-       struct xfs_btree_block  *buf,
-       xfs_daddr_t             blkno,
-       __u32                   magic,
-       __u16                   level,
-       __u16                   numrecs,
-       __u64                   owner,
-       unsigned int            flags);
-
-/*
- * Common btree core entry points.
- */
-int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
-int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
-int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
-int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
-int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
-int xfs_btree_insert(struct xfs_btree_cur *, int *);
-int xfs_btree_delete(struct xfs_btree_cur *, int *);
-int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
-int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner,
-                          struct list_head *buffer_list);
-
-/*
- * btree block CRC helpers
- */
-void xfs_btree_lblock_calc_crc(struct xfs_buf *);
-bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
-void xfs_btree_sblock_calc_crc(struct xfs_buf *);
-bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
-
-/*
- * Internal btree helpers also used by xfs_bmap.c.
- */
-void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
-void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
-
-/*
- * Helpers.
- */
-static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
-{
-       return be16_to_cpu(block->bb_numrecs);
-}
-
-static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
-               __uint16_t numrecs)
-{
-       block->bb_numrecs = cpu_to_be16(numrecs);
-}
-
-static inline int xfs_btree_get_level(struct xfs_btree_block *block)
-{
-       return be16_to_cpu(block->bb_level);
-}
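
These helpers exist because btree block headers are stored big-endian on disk, so every access converts through be16_to_cpu()/cpu_to_be16(). A userspace sketch of the same round-trip using glibc's <endian.h>, with a hypothetical header layout:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct demo_block {
	uint16_t bb_numrecs;	/* big-endian on disk */
};

static int demo_get_numrecs(const struct demo_block *block)
{
	return be16toh(block->bb_numrecs);
}

static void demo_set_numrecs(struct demo_block *block, uint16_t numrecs)
{
	block->bb_numrecs = htobe16(numrecs);
}

int main(void)
{
	struct demo_block b;

	demo_set_numrecs(&b, 42);
	printf("%d\n", demo_get_numrecs(&b));	/* 42 on any endianness */
	return 0;
}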
-
-
-/*
- * Min and max functions for extlen, agblock, fileoff, and filblks types.
- */
-#define        XFS_EXTLEN_MIN(a,b)     min_t(xfs_extlen_t, (a), (b))
-#define        XFS_EXTLEN_MAX(a,b)     max_t(xfs_extlen_t, (a), (b))
-#define        XFS_AGBLOCK_MIN(a,b)    min_t(xfs_agblock_t, (a), (b))
-#define        XFS_AGBLOCK_MAX(a,b)    max_t(xfs_agblock_t, (a), (b))
-#define        XFS_FILEOFF_MIN(a,b)    min_t(xfs_fileoff_t, (a), (b))
-#define        XFS_FILEOFF_MAX(a,b)    max_t(xfs_fileoff_t, (a), (b))
-#define        XFS_FILBLKS_MIN(a,b)    min_t(xfs_filblks_t, (a), (b))
-#define        XFS_FILBLKS_MAX(a,b)    max_t(xfs_filblks_t, (a), (b))
-
-#define        XFS_FSB_SANITY_CHECK(mp,fsb)    \
-       (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
-               XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks)
-
-/*
- * Trace hooks.  Currently not implemented as they need to be ported
- * over to the generic tracing functionality, which is some effort.
- *
- * i,j = integer (32 bit)
- * b = btree block buffer (xfs_buf_t)
- * p = btree ptr
- * r = btree record
- * k = btree key
- */
-#define        XFS_BTREE_TRACE_ARGBI(c, b, i)
-#define        XFS_BTREE_TRACE_ARGBII(c, b, i, j)
-#define        XFS_BTREE_TRACE_ARGI(c, i)
-#define        XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
-#define        XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
-#define        XFS_BTREE_TRACE_ARGIK(c, i, k)
-#define XFS_BTREE_TRACE_ARGR(c, r)
-#define        XFS_BTREE_TRACE_CURSOR(c, t)
-
-#endif /* __XFS_BTREE_H__ */
index 7a34a1ae655246e6e07fb0b71fabb3bac4c1b6c2..cd7b8ca9b06410c5d34e92161906532a6221ef66 100644 (file)
@@ -130,7 +130,7 @@ xfs_buf_get_maps(
        bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
                                KM_NOFS);
        if (!bp->b_maps)
-               return ENOMEM;
+               return -ENOMEM;
        return 0;
 }
 
@@ -344,7 +344,7 @@ retry:
                if (unlikely(page == NULL)) {
                        if (flags & XBF_READ_AHEAD) {
                                bp->b_page_count = i;
-                               error = ENOMEM;
+                               error = -ENOMEM;
                                goto out_free_pages;
                        }
 
@@ -465,7 +465,7 @@ _xfs_buf_find(
        eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
        if (blkno >= eofs) {
                /*
-                * XXX (dgc): we should really be returning EFSCORRUPTED here,
+                * XXX (dgc): we should really be returning -EFSCORRUPTED here,
                 * but none of the higher level infrastructure supports
                 * returning a specific error on buffer lookup failures.
                 */
@@ -1052,8 +1052,8 @@ xfs_buf_ioerror(
        xfs_buf_t               *bp,
        int                     error)
 {
-       ASSERT(error >= 0 && error <= 0xffff);
-       bp->b_error = (unsigned short)error;
+       ASSERT(error <= 0 && error >= -1000);
+       bp->b_error = error;
        trace_xfs_buf_ioerror(bp, error, _RET_IP_);
 }
 
@@ -1064,7 +1064,7 @@ xfs_buf_ioerror_alert(
 {
        xfs_alert(bp->b_target->bt_mount,
 "metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
-               (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length);
+               (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
 }
 
 /*
@@ -1083,7 +1083,7 @@ xfs_bioerror(
        /*
         * No need to wait until the buffer is unpinned, we aren't flushing it.
         */
-       xfs_buf_ioerror(bp, EIO);
+       xfs_buf_ioerror(bp, -EIO);
 
        /*
         * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
@@ -1094,7 +1094,7 @@ xfs_bioerror(
 
        xfs_buf_ioend(bp, 0);
 
-       return EIO;
+       return -EIO;
 }
 
 /*
@@ -1127,13 +1127,13 @@ xfs_bioerror_relse(
                 * There's no reason to mark error for
                 * ASYNC buffers.
                 */
-               xfs_buf_ioerror(bp, EIO);
+               xfs_buf_ioerror(bp, -EIO);
                complete(&bp->b_iowait);
        } else {
                xfs_buf_relse(bp);
        }
 
-       return EIO;
+       return -EIO;
 }
 
 STATIC int
@@ -1199,7 +1199,7 @@ xfs_buf_bio_end_io(
         * buffers that require multiple bios to complete.
         */
        if (!bp->b_error)
-               xfs_buf_ioerror(bp, -error);
+               xfs_buf_ioerror(bp, error);
 
        if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
                invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
@@ -1286,7 +1286,7 @@ next_chunk:
                 * because the caller (xfs_buf_iorequest) holds a count itself.
                 */
                atomic_dec(&bp->b_io_remaining);
-               xfs_buf_ioerror(bp, EIO);
+               xfs_buf_ioerror(bp, -EIO);
                bio_put(bio);
        }
 
@@ -1330,6 +1330,20 @@ _xfs_buf_ioapply(
                                                   SHUTDOWN_CORRUPT_INCORE);
                                return;
                        }
+               } else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
+                       struct xfs_mount *mp = bp->b_target->bt_mount;
+
+                       /*
+                        * non-crc filesystems don't attach verifiers during
+                        * log recovery, so don't warn for such filesystems.
+                        */
+                       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                               xfs_warn(mp,
+                                       "%s: no ops on block 0x%llx/0x%x",
+                                       __func__, bp->b_bn, bp->b_length);
+                               xfs_hex_dump(bp->b_addr, 64);
+                               dump_stack();
+                       }
                }
        } else if (bp->b_flags & XBF_READ_AHEAD) {
                rw = READA;
@@ -1628,7 +1642,7 @@ xfs_setsize_buftarg(
                xfs_warn(btp->bt_mount,
                        "Cannot set_blocksize to %u on device %s",
                        sectorsize, name);
-               return EINVAL;
+               return -EINVAL;
        }
 
        /* Set up device logical sector size mask */
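
The hunks above are part of the wider conversion of XFS from positive internal error values to the kernel's standard negative-errno convention, which is why b_error becomes a signed int and each EIO/ENOMEM/EINVAL gains a minus sign. A tiny userspace sketch of the convention (the helper name is invented):

#include <errno.h>
#include <stdio.h>

/* Convention sketch: return 0 on success, a negated errno on failure. */
static int demo_read_block(int simulate_failure)
{
	if (simulate_failure)
		return -EIO;
	return 0;
}

int main(void)
{
	int error = demo_read_block(1);

	if (error < 0)
		printf("failed with errno %d\n", -error);	/* 5 == EIO on Linux */
	return 0;
}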
index 3a7a5523d3dc8c7df8b2d22bfbb5216b93326a86..c753183900b369b9dbbb377194c3e7b0fc3c9343 100644 (file)
@@ -178,7 +178,7 @@ typedef struct xfs_buf {
        atomic_t                b_io_remaining; /* #outstanding I/O requests */
        unsigned int            b_page_count;   /* size of page array */
        unsigned int            b_offset;       /* page offset in first page */
-       unsigned short          b_error;        /* error code on I/O */
+       int                     b_error;        /* error code on I/O */
        const struct xfs_buf_ops        *b_ops;
 
 #ifdef XFS_BUF_LOCK_TRACKING
index 4654338b03fc5a160247fbc9d0a52ffb26b79cf9..76007deed31fe4455d80956f5debf82faa8baf96 100644 (file)
@@ -488,7 +488,7 @@ xfs_buf_item_unpin(
                xfs_buf_lock(bp);
                xfs_buf_hold(bp);
                bp->b_flags |= XBF_ASYNC;
-               xfs_buf_ioerror(bp, EIO);
+               xfs_buf_ioerror(bp, -EIO);
                XFS_BUF_UNDONE(bp);
                xfs_buf_stale(bp);
                xfs_buf_ioend(bp, 0);
@@ -725,7 +725,7 @@ xfs_buf_item_get_format(
        bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
                                KM_SLEEP);
        if (!bip->bli_formats)
-               return ENOMEM;
+               return -ENOMEM;
        return 0;
 }
 
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h
deleted file mode 100644 (file)
index fad1676..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-#ifndef _XFS_CKSUM_H
-#define _XFS_CKSUM_H 1
-
-#define XFS_CRC_SEED   (~(__uint32_t)0)
-
-/*
- * Calculate the intermediate checksum for a buffer that has the CRC field
- * inside it.  The offset of the 32-bit CRC field is passed as the
- * cksum_offset parameter.
- */
-static inline __uint32_t
-xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
-{
-       __uint32_t zero = 0;
-       __uint32_t crc;
-
-       /* Calculate CRC up to the checksum. */
-       crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
-
-       /* Skip checksum field */
-       crc = crc32c(crc, &zero, sizeof(__u32));
-
-       /* Calculate the rest of the CRC. */
-       return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
-                     length - (cksum_offset + sizeof(__be32)));
-}
-
-/*
- * Convert the intermediate checksum to the final ondisk format.
- *
- * The CRC32c calculation uses LE format even on BE machines, but returns the
- * result in host endian format. Hence we need to byte swap it back to LE format
- * so that it is consistent on disk.
- */
-static inline __le32
-xfs_end_cksum(__uint32_t crc)
-{
-       return ~cpu_to_le32(crc);
-}
-
-/*
- * Helper to generate the checksum for a buffer.
- */
-static inline void
-xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
-{
-       __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
-
-       *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
-}
-
-/*
- * Helper to verify the checksum for a buffer.
- */
-static inline int
-xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
-{
-       __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
-
-       return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
-}
-
-#endif /* _XFS_CKSUM_H */
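
The helpers above checksum a structure that embeds its own CRC field by hashing up to the field, feeding zeroes in its place, then hashing the rest, so the stored value never influences the computation. A self-contained userspace sketch of that pattern, with a minimal bitwise CRC-32C standing in for the kernel's crc32c(); everything here is illustrative, not the XFS API:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Minimal bitwise CRC-32C (Castagnoli), in place of kernel crc32c(). */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

/* Checksum a buffer whose CRC field lives at cksum_offset, as above. */
static uint32_t demo_cksum(const char *buffer, size_t length, size_t off)
{
	uint32_t zero = 0;
	uint32_t crc = ~0U;				/* like XFS_CRC_SEED */

	crc = crc32c(crc, buffer, off);			/* up to the field */
	crc = crc32c(crc, &zero, sizeof(uint32_t));	/* zeroed CRC field */
	crc = crc32c(crc, buffer + off + sizeof(uint32_t),
		     length - (off + sizeof(uint32_t)));
	return ~crc;					/* finalise */
}

int main(void)
{
	char block[64] = "demo block";
	size_t off = 16;				/* CRC field offset */
	uint32_t crc = demo_cksum(block, sizeof(block), off);

	/* store the checksum (host endian here; the real helpers store LE) */
	memcpy(block + off, &crc, sizeof(crc));
	printf("verify: %d\n",				/* 1 == matches */
	       demo_cksum(block, sizeof(block), off) == crc);
	return 0;
}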
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
deleted file mode 100644 (file)
index a514ab6..0000000
+++ /dev/null
@@ -1,2665 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
-#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_buf_item.h"
-
-/*
- * xfs_da_btree.c
- *
- * Routines to implement directories as Btrees of hashed names.
- */
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Routines used for growing the Btree.
- */
-STATIC int xfs_da3_root_split(xfs_da_state_t *state,
-                                           xfs_da_state_blk_t *existing_root,
-                                           xfs_da_state_blk_t *new_child);
-STATIC int xfs_da3_node_split(xfs_da_state_t *state,
-                                           xfs_da_state_blk_t *existing_blk,
-                                           xfs_da_state_blk_t *split_blk,
-                                           xfs_da_state_blk_t *blk_to_add,
-                                           int treelevel,
-                                           int *result);
-STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
-                                        xfs_da_state_blk_t *node_blk_1,
-                                        xfs_da_state_blk_t *node_blk_2);
-STATIC void xfs_da3_node_add(xfs_da_state_t *state,
-                                  xfs_da_state_blk_t *old_node_blk,
-                                  xfs_da_state_blk_t *new_node_blk);
-
-/*
- * Routines used for shrinking the Btree.
- */
-STATIC int xfs_da3_root_join(xfs_da_state_t *state,
-                                          xfs_da_state_blk_t *root_blk);
-STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
-STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
-                                             xfs_da_state_blk_t *drop_blk);
-STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
-                                        xfs_da_state_blk_t *src_node_blk,
-                                        xfs_da_state_blk_t *dst_node_blk);
-
-/*
- * Utility routines.
- */
-STATIC int     xfs_da3_blk_unlink(xfs_da_state_t *state,
-                                 xfs_da_state_blk_t *drop_blk,
-                                 xfs_da_state_blk_t *save_blk);
-
-
-kmem_zone_t *xfs_da_state_zone;        /* anchor for state struct zone */
-
-/*
- * Allocate a dir-state structure.
- * We don't put them on the stack since they're large.
- */
-xfs_da_state_t *
-xfs_da_state_alloc(void)
-{
-       return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
-}
-
-/*
- * Kill the altpath contents of a da-state structure.
- */
-STATIC void
-xfs_da_state_kill_altpath(xfs_da_state_t *state)
-{
-       int     i;
-
-       for (i = 0; i < state->altpath.active; i++)
-               state->altpath.blk[i].bp = NULL;
-       state->altpath.active = 0;
-}
-
-/*
- * Free a da-state structure.
- */
-void
-xfs_da_state_free(xfs_da_state_t *state)
-{
-       xfs_da_state_kill_altpath(state);
-#ifdef DEBUG
-       memset((char *)state, 0, sizeof(*state));
-#endif /* DEBUG */
-       kmem_zone_free(xfs_da_state_zone, state);
-}
-
-static bool
-xfs_da3_node_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_da_intnode   *hdr = bp->b_addr;
-       struct xfs_da3_icnode_hdr ichdr;
-       const struct xfs_dir_ops *ops;
-
-       ops = xfs_dir_get_ops(mp, NULL);
-
-       ops->node_hdr_from_disk(&ichdr, hdr);
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
-               if (ichdr.magic != XFS_DA3_NODE_MAGIC)
-                       return false;
-
-               if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
-                       return false;
-               if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
-                       return false;
-       } else {
-               if (ichdr.magic != XFS_DA_NODE_MAGIC)
-                       return false;
-       }
-       if (ichdr.level == 0)
-               return false;
-       if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
-               return false;
-       if (ichdr.count == 0)
-               return false;
-
-       /*
-        * we don't know if the node is for an attribute or directory tree,
-        * so only fail if the count is outside both bounds
-        */
-       if (ichdr.count > mp->m_dir_geo->node_ents &&
-           ichdr.count > mp->m_attr_geo->node_ents)
-               return false;
-
-       /* XXX: hash order check? */
-
-       return true;
-}
-
-static void
-xfs_da3_node_write_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-       struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
-       if (!xfs_da3_node_verify(bp)) {
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               xfs_verifier_error(bp);
-               return;
-       }
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (bip)
-               hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-       xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
-}
-
-/*
- * Leaf/node format detection on trees is sketchy, so a node read can land on
- * leaf level blocks when detection incorrectly identifies the tree as being
- * in node format. In this case, we need to swap the verifier to match the
- * correct format of the block being read.
- */
-static void
-xfs_da3_node_read_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_da_blkinfo   *info = bp->b_addr;
-
-       switch (be16_to_cpu(info->magic)) {
-               case XFS_DA3_NODE_MAGIC:
-                       if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
-                               xfs_buf_ioerror(bp, EFSBADCRC);
-                               break;
-                       }
-                       /* fall through */
-               case XFS_DA_NODE_MAGIC:
-                       if (!xfs_da3_node_verify(bp)) {
-                               xfs_buf_ioerror(bp, EFSCORRUPTED);
-                               break;
-                       }
-                       return;
-               case XFS_ATTR_LEAF_MAGIC:
-               case XFS_ATTR3_LEAF_MAGIC:
-                       bp->b_ops = &xfs_attr3_leaf_buf_ops;
-                       bp->b_ops->verify_read(bp);
-                       return;
-               case XFS_DIR2_LEAFN_MAGIC:
-               case XFS_DIR3_LEAFN_MAGIC:
-                       bp->b_ops = &xfs_dir3_leafn_buf_ops;
-                       bp->b_ops->verify_read(bp);
-                       return;
-               default:
-                       break;
-       }
-
-       /* corrupt block */
-       xfs_verifier_error(bp);
-}
-
-const struct xfs_buf_ops xfs_da3_node_buf_ops = {
-       .verify_read = xfs_da3_node_read_verify,
-       .verify_write = xfs_da3_node_write_verify,
-};
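
xfs_da3_node_read_verify() above dispatches on the block's on-disk magic number and, when the block turns out to be a leaf, swaps in the matching leaf verifier. A small sketch of magic-based verifier dispatch, with all magics and types invented for the example:

#include <stdint.h>
#include <stdio.h>

#define DEMO_NODE_MAGIC	0xfebe
#define DEMO_LEAF_MAGIC	0xd2ff

struct demo_buf {
	uint16_t	magic;		/* first field of the block */
	int		error;
};

static void demo_verify_node(struct demo_buf *bp) { (void)bp; /* node checks */ }
static void demo_verify_leaf(struct demo_buf *bp) { (void)bp; /* leaf checks */ }

/* Dispatch to the right verifier based on the on-disk magic. */
static void demo_read_verify(struct demo_buf *bp)
{
	switch (bp->magic) {
	case DEMO_NODE_MAGIC:
		demo_verify_node(bp);
		break;
	case DEMO_LEAF_MAGIC:
		/* detection was wrong: re-verify as a leaf block instead */
		demo_verify_leaf(bp);
		break;
	default:
		bp->error = -1;		/* corrupt block */
		break;
	}
}

int main(void)
{
	struct demo_buf bp = { .magic = DEMO_LEAF_MAGIC, .error = 0 };

	demo_read_verify(&bp);
	printf("error=%d\n", bp.error);
	return 0;
}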
-
-int
-xfs_da3_node_read(
-       struct xfs_trans        *tp,
-       struct xfs_inode        *dp,
-       xfs_dablk_t             bno,
-       xfs_daddr_t             mappedbno,
-       struct xfs_buf          **bpp,
-       int                     which_fork)
-{
-       int                     err;
-
-       err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
-                                       which_fork, &xfs_da3_node_buf_ops);
-       if (!err && tp) {
-               struct xfs_da_blkinfo   *info = (*bpp)->b_addr;
-               int                     type;
-
-               switch (be16_to_cpu(info->magic)) {
-               case XFS_DA_NODE_MAGIC:
-               case XFS_DA3_NODE_MAGIC:
-                       type = XFS_BLFT_DA_NODE_BUF;
-                       break;
-               case XFS_ATTR_LEAF_MAGIC:
-               case XFS_ATTR3_LEAF_MAGIC:
-                       type = XFS_BLFT_ATTR_LEAF_BUF;
-                       break;
-               case XFS_DIR2_LEAFN_MAGIC:
-               case XFS_DIR3_LEAFN_MAGIC:
-                       type = XFS_BLFT_DIR_LEAFN_BUF;
-                       break;
-               default:
-                       type = 0;
-                       ASSERT(0);
-                       break;
-               }
-               xfs_trans_buf_set_type(tp, *bpp, type);
-       }
-       return err;
-}
-
-/*========================================================================
- * Routines used for growing the Btree.
- *========================================================================*/
-
-/*
- * Create the initial contents of an intermediate node.
- */
-int
-xfs_da3_node_create(
-       struct xfs_da_args      *args,
-       xfs_dablk_t             blkno,
-       int                     level,
-       struct xfs_buf          **bpp,
-       int                     whichfork)
-{
-       struct xfs_da_intnode   *node;
-       struct xfs_trans        *tp = args->trans;
-       struct xfs_mount        *mp = tp->t_mountp;
-       struct xfs_da3_icnode_hdr ichdr = {0};
-       struct xfs_buf          *bp;
-       int                     error;
-       struct xfs_inode        *dp = args->dp;
-
-       trace_xfs_da_node_create(args);
-       ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
-
-       error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
-       if (error)
-               return(error);
-       bp->b_ops = &xfs_da3_node_buf_ops;
-       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
-       node = bp->b_addr;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
-               ichdr.magic = XFS_DA3_NODE_MAGIC;
-               hdr3->info.blkno = cpu_to_be64(bp->b_bn);
-               hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
-               uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
-       } else {
-               ichdr.magic = XFS_DA_NODE_MAGIC;
-       }
-       ichdr.level = level;
-
-       dp->d_ops->node_hdr_to_disk(node, &ichdr);
-       xfs_trans_log_buf(tp, bp,
-               XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
-
-       *bpp = bp;
-       return(0);
-}
-
-/*
- * Split a leaf node, rebalance, then possibly split
- * intermediate nodes, rebalance, etc.
- */
-int                                                    /* error */
-xfs_da3_split(
-       struct xfs_da_state     *state)
-{
-       struct xfs_da_state_blk *oldblk;
-       struct xfs_da_state_blk *newblk;
-       struct xfs_da_state_blk *addblk;
-       struct xfs_da_intnode   *node;
-       struct xfs_buf          *bp;
-       int                     max;
-       int                     action = 0;
-       int                     error;
-       int                     i;
-
-       trace_xfs_da_split(state->args);
-
-       /*
-        * Walk back up the tree splitting/inserting/adjusting as necessary.
-        * If we need to insert and there isn't room, split the node, then
-        * decide which fragment to insert the new block from below into.
-        * Note that we may split the root this way, but we need more fixup.
-        */
-       max = state->path.active - 1;
-       ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
-       ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
-              state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
-
-       addblk = &state->path.blk[max];         /* initial dummy value */
-       for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
-               oldblk = &state->path.blk[i];
-               newblk = &state->altpath.blk[i];
-
-               /*
-                * If a leaf node then
-                *     Allocate a new leaf node, then rebalance across them.
-                * else if an intermediate node then
-                *     We split on the last layer, must we split the node?
-                */
-               switch (oldblk->magic) {
-               case XFS_ATTR_LEAF_MAGIC:
-                       error = xfs_attr3_leaf_split(state, oldblk, newblk);
-                       if ((error != 0) && (error != ENOSPC)) {
-                               return(error);  /* GROT: attr is inconsistent */
-                       }
-                       if (!error) {
-                               addblk = newblk;
-                               break;
-                       }
-                       /*
-                        * Entry wouldn't fit, split the leaf again.
-                        */
-                       state->extravalid = 1;
-                       if (state->inleaf) {
-                               state->extraafter = 0;  /* before newblk */
-                               trace_xfs_attr_leaf_split_before(state->args);
-                               error = xfs_attr3_leaf_split(state, oldblk,
-                                                           &state->extrablk);
-                       } else {
-                               state->extraafter = 1;  /* after newblk */
-                               trace_xfs_attr_leaf_split_after(state->args);
-                               error = xfs_attr3_leaf_split(state, newblk,
-                                                           &state->extrablk);
-                       }
-                       if (error)
-                               return(error);  /* GROT: attr inconsistent */
-                       addblk = newblk;
-                       break;
-               case XFS_DIR2_LEAFN_MAGIC:
-                       error = xfs_dir2_leafn_split(state, oldblk, newblk);
-                       if (error)
-                               return error;
-                       addblk = newblk;
-                       break;
-               case XFS_DA_NODE_MAGIC:
-                       error = xfs_da3_node_split(state, oldblk, newblk, addblk,
-                                                        max - i, &action);
-                       addblk->bp = NULL;
-                       if (error)
-                               return(error);  /* GROT: dir is inconsistent */
-                       /*
-                        * Record the newly split block for the next time thru?
-                        */
-                       if (action)
-                               addblk = newblk;
-                       else
-                               addblk = NULL;
-                       break;
-               }
-
-               /*
-                * Update the btree to show the new hashval for this child.
-                */
-               xfs_da3_fixhashpath(state, &state->path);
-       }
-       if (!addblk)
-               return(0);
-
-       /*
-        * Split the root node.
-        */
-       ASSERT(state->path.active == 0);
-       oldblk = &state->path.blk[0];
-       error = xfs_da3_root_split(state, oldblk, addblk);
-       if (error) {
-               addblk->bp = NULL;
-               return(error);  /* GROT: dir is inconsistent */
-       }
-
-       /*
-        * Update pointers to the node which used to be block 0 and
-        * just got bumped because of the addition of a new root node.
-        * There might be three blocks involved if a double split occurred,
-        * and the original block 0 could be at any position in the list.
-        *
-        * Note: the magic numbers and sibling pointers are in the same
-        * physical place for both v2 and v3 headers (by design). Hence it
-        * doesn't matter which version of the xfs_da_intnode structure we use
-        * here as the result will be the same using either structure.
-        */
-       node = oldblk->bp->b_addr;
-       if (node->hdr.info.forw) {
-               if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
-                       bp = addblk->bp;
-               } else {
-                       ASSERT(state->extravalid);
-                       bp = state->extrablk.bp;
-               }
-               node = bp->b_addr;
-               node->hdr.info.back = cpu_to_be32(oldblk->blkno);
-               xfs_trans_log_buf(state->args->trans, bp,
-                   XFS_DA_LOGRANGE(node, &node->hdr.info,
-                   sizeof(node->hdr.info)));
-       }
-       node = oldblk->bp->b_addr;
-       if (node->hdr.info.back) {
-               if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
-                       bp = addblk->bp;
-               } else {
-                       ASSERT(state->extravalid);
-                       bp = state->extrablk.bp;
-               }
-               node = bp->b_addr;
-               node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
-               xfs_trans_log_buf(state->args->trans, bp,
-                   XFS_DA_LOGRANGE(node, &node->hdr.info,
-                   sizeof(node->hdr.info)));
-       }
-       addblk->bp = NULL;
-       return(0);
-}
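
The tail of xfs_da3_split() above patches the back/forw sibling pointers of whichever blocks ended up adjacent to the relocated block 0. A sketch of the same fixup on a plain doubly linked list; in XFS the links are block numbers, so the right buffer must be located first, whereas here they are ordinary pointers and the node type is hypothetical:

#include <stddef.h>

struct demo_node {
	struct demo_node *forw;
	struct demo_node *back;
};

/* After relocating "moved", repoint both neighbours at its new address. */
static void demo_relink(struct demo_node *moved)
{
	if (moved->forw)
		moved->forw->back = moved;
	if (moved->back)
		moved->back->forw = moved;
}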
-
-/*
- * Split the root.  We have to create a new root and point to the two
- * parts (the split old root) that we just created.  Copy block zero to
- * the EOF, extending the inode in process.
- */
-STATIC int                                             /* error */
-xfs_da3_root_split(
-       struct xfs_da_state     *state,
-       struct xfs_da_state_blk *blk1,
-       struct xfs_da_state_blk *blk2)
-{
-       struct xfs_da_intnode   *node;
-       struct xfs_da_intnode   *oldroot;
-       struct xfs_da_node_entry *btree;
-       struct xfs_da3_icnode_hdr nodehdr;
-       struct xfs_da_args      *args;
-       struct xfs_buf          *bp;
-       struct xfs_inode        *dp;
-       struct xfs_trans        *tp;
-       struct xfs_mount        *mp;
-       struct xfs_dir2_leaf    *leaf;
-       xfs_dablk_t             blkno;
-       int                     level;
-       int                     error;
-       int                     size;
-
-       trace_xfs_da_root_split(state->args);
-
-       /*
-        * Copy the existing (incorrect) block from the root node position
-        * to a free space somewhere.
-        */
-       args = state->args;
-       error = xfs_da_grow_inode(args, &blkno);
-       if (error)
-               return error;
-
-       dp = args->dp;
-       tp = args->trans;
-       mp = state->mp;
-       error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
-       if (error)
-               return error;
-       node = bp->b_addr;
-       oldroot = blk1->bp->b_addr;
-       if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
-           oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
-               struct xfs_da3_icnode_hdr nodehdr;
-
-               dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot);
-               btree = dp->d_ops->node_tree_p(oldroot);
-               size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot);
-               level = nodehdr.level;
-
-               /*
-                * we are about to copy oldroot to bp, so set up the type
-                * of bp while we know exactly what it will be.
-                */
-               xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
-       } else {
-               struct xfs_dir3_icleaf_hdr leafhdr;
-               struct xfs_dir2_leaf_entry *ents;
-
-               leaf = (xfs_dir2_leaf_t *)oldroot;
-               dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-               ents = dp->d_ops->leaf_ents_p(leaf);
-
-               ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
-                      leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
-               size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
-               level = 0;
-
-               /*
-                * we are about to copy oldroot to bp, so set up the type
-                * of bp while we know exactly what it will be.
-                */
-               xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
-       }
-
-       /*
-        * we can copy most of the information in the node from one block to
-        * another, but for CRC enabled headers we have to make sure that the
-        * block specific identifiers are kept intact. We update the buffer
-        * directly for this.
-        */
-       memcpy(node, oldroot, size);
-       if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
-           oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
-               struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
-
-               node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
-       }
-       xfs_trans_log_buf(tp, bp, 0, size - 1);
-
-       bp->b_ops = blk1->bp->b_ops;
-       xfs_trans_buf_copy_type(bp, blk1->bp);
-       blk1->bp = bp;
-       blk1->blkno = blkno;
-
-       /*
-        * Set up the new root node.
-        */
-       error = xfs_da3_node_create(args,
-               (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0,
-               level + 1, &bp, args->whichfork);
-       if (error)
-               return error;
-
-       node = bp->b_addr;
-       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-       btree = dp->d_ops->node_tree_p(node);
-       btree[0].hashval = cpu_to_be32(blk1->hashval);
-       btree[0].before = cpu_to_be32(blk1->blkno);
-       btree[1].hashval = cpu_to_be32(blk2->hashval);
-       btree[1].before = cpu_to_be32(blk2->blkno);
-       nodehdr.count = 2;
-       dp->d_ops->node_hdr_to_disk(node, &nodehdr);
-
-#ifdef DEBUG
-       if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-           oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
-               ASSERT(blk1->blkno >= args->geo->leafblk &&
-                      blk1->blkno < args->geo->freeblk);
-               ASSERT(blk2->blkno >= args->geo->leafblk &&
-                      blk2->blkno < args->geo->freeblk);
-       }
-#endif
-
-       /* Header is already logged by xfs_da_node_create */
-       xfs_trans_log_buf(tp, bp,
-               XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
-
-       return 0;
-}
-
-/*
- * Split the node, rebalance, then add the new entry.
- */
-STATIC int                                             /* error */
-xfs_da3_node_split(
-       struct xfs_da_state     *state,
-       struct xfs_da_state_blk *oldblk,
-       struct xfs_da_state_blk *newblk,
-       struct xfs_da_state_blk *addblk,
-       int                     treelevel,
-       int                     *result)
-{
-       struct xfs_da_intnode   *node;
-       struct xfs_da3_icnode_hdr nodehdr;
-       xfs_dablk_t             blkno;
-       int                     newcount;
-       int                     error;
-       int                     useextra;
-       struct xfs_inode        *dp = state->args->dp;
-
-       trace_xfs_da_node_split(state->args);
-
-       node = oldblk->bp->b_addr;
-       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-
-       /*
-        * With V2 dirs the extra block is data or freespace.
-        */
-       useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
-       newcount = 1 + useextra;
-       /*
-        * Do we have to split the node?
-        */
-       if (nodehdr.count + newcount > state->args->geo->node_ents) {
-               /*
-                * Allocate a new node, add to the doubly linked chain of
-                * nodes, then move some of our excess entries into it.
-                */
-               error = xfs_da_grow_inode(state->args, &blkno);
-               if (error)
-                       return(error);  /* GROT: dir is inconsistent */
-
-               error = xfs_da3_node_create(state->args, blkno, treelevel,
-                                          &newblk->bp, state->args->whichfork);
-               if (error)
-                       return(error);  /* GROT: dir is inconsistent */
-               newblk->blkno = blkno;
-               newblk->magic = XFS_DA_NODE_MAGIC;
-               xfs_da3_node_rebalance(state, oldblk, newblk);
-               error = xfs_da3_blk_link(state, oldblk, newblk);
-               if (error)
-                       return(error);
-               *result = 1;
-       } else {
-               *result = 0;
-       }
-
-       /*
-        * Insert the new entry(s) into the correct block
-        * (updating last hashval in the process).
-        *
-        * xfs_da3_node_add() inserts BEFORE the given index,
-        * and as a result of using node_lookup_int() we always
-        * point to a valid entry (not after one), but a split
-        * operation always results in a new block whose hashvals
-        * FOLLOW the current block.
-        *
-        * If we had double-split op below us, then add the extra block too.
-        */
-       node = oldblk->bp->b_addr;
-       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-       if (oldblk->index <= nodehdr.count) {
-               oldblk->index++;
-               xfs_da3_node_add(state, oldblk, addblk);
-               if (useextra) {
-                       if (state->extraafter)
-                               oldblk->index++;
-                       xfs_da3_node_add(state, oldblk, &state->extrablk);
-                       state->extravalid = 0;
-               }
-       } else {
-               newblk->index++;
-               xfs_da3_node_add(state, newblk, addblk);
-               if (useextra) {
-                       if (state->extraafter)
-                               newblk->index++;
-                       xfs_da3_node_add(state, newblk, &state->extrablk);
-                       state->extravalid = 0;
-               }
-       }
-
-       return(0);
-}
-
-/*
- * Balance the btree elements between two intermediate nodes,
- * usually one full and one empty.
- *
- * NOTE: if blk2 is empty, then it will get the upper half of blk1.
- */
-STATIC void
-xfs_da3_node_rebalance(
-       struct xfs_da_state     *state,
-       struct xfs_da_state_blk *blk1,
-       struct xfs_da_state_blk *blk2)
-{
-       struct xfs_da_intnode   *node1;
-       struct xfs_da_intnode   *node2;
-       struct xfs_da_intnode   *tmpnode;
-       struct xfs_da_node_entry *btree1;
-       struct xfs_da_node_entry *btree2;
-       struct xfs_da_node_entry *btree_s;
-       struct xfs_da_node_entry *btree_d;
-       struct xfs_da3_icnode_hdr nodehdr1;
-       struct xfs_da3_icnode_hdr nodehdr2;
-       struct xfs_trans        *tp;
-       int                     count;
-       int                     tmp;
-       int                     swap = 0;
-       struct xfs_inode        *dp = state->args->dp;
-
-       trace_xfs_da_node_rebalance(state->args);
-
-       node1 = blk1->bp->b_addr;
-       node2 = blk2->bp->b_addr;
-       dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
-       dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
-       btree1 = dp->d_ops->node_tree_p(node1);
-       btree2 = dp->d_ops->node_tree_p(node2);
-
-       /*
-        * Figure out how many entries need to move, and in which direction.
-        * Swap the nodes around if that makes it simpler.
-        */
-       if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
-           ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
-            (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
-                       be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
-               tmpnode = node1;
-               node1 = node2;
-               node2 = tmpnode;
-               dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
-               dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
-               btree1 = dp->d_ops->node_tree_p(node1);
-               btree2 = dp->d_ops->node_tree_p(node2);
-               swap = 1;
-       }
-
-       count = (nodehdr1.count - nodehdr2.count) / 2;
-       if (count == 0)
-               return;
-       tp = state->args->trans;
-       /*
-        * Two cases: high-to-low and low-to-high.
-        */
-       if (count > 0) {
-               /*
-                * Move elements in node2 up to make a hole.
-                */
-               tmp = nodehdr2.count;
-               if (tmp > 0) {
-                       tmp *= (uint)sizeof(xfs_da_node_entry_t);
-                       btree_s = &btree2[0];
-                       btree_d = &btree2[count];
-                       memmove(btree_d, btree_s, tmp);
-               }
-
-               /*
-                * Move the req'd B-tree elements from high in node1 to
-                * low in node2.
-                */
-               nodehdr2.count += count;
-               tmp = count * (uint)sizeof(xfs_da_node_entry_t);
-               btree_s = &btree1[nodehdr1.count - count];
-               btree_d = &btree2[0];
-               memcpy(btree_d, btree_s, tmp);
-               nodehdr1.count -= count;
-       } else {
-               /*
-                * Move the req'd B-tree elements from low in node2 to
-                * high in node1.
-                */
-               count = -count;
-               tmp = count * (uint)sizeof(xfs_da_node_entry_t);
-               btree_s = &btree2[0];
-               btree_d = &btree1[nodehdr1.count];
-               memcpy(btree_d, btree_s, tmp);
-               nodehdr1.count += count;
-
-               xfs_trans_log_buf(tp, blk1->bp,
-                       XFS_DA_LOGRANGE(node1, btree_d, tmp));
-
-               /*
-                * Move elements in node2 down to fill the hole.
-                */
-               tmp  = nodehdr2.count - count;
-               tmp *= (uint)sizeof(xfs_da_node_entry_t);
-               btree_s = &btree2[count];
-               btree_d = &btree2[0];
-               memmove(btree_d, btree_s, tmp);
-               nodehdr2.count -= count;
-       }
-
-       /*
-        * Log header of node 1 and all current bits of node 2.
-        */
-       dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
-       xfs_trans_log_buf(tp, blk1->bp,
-               XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size));
-
-       dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
-       xfs_trans_log_buf(tp, blk2->bp,
-               XFS_DA_LOGRANGE(node2, &node2->hdr,
-                               dp->d_ops->node_hdr_size +
-                               (sizeof(btree2[0]) * nodehdr2.count)));
-
-       /*
-        * Record the last hashval from each block for upward propagation.
-        * (note: don't use the swapped node pointers)
-        */
-       if (swap) {
-               node1 = blk1->bp->b_addr;
-               node2 = blk2->bp->b_addr;
-               dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
-               dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
-               btree1 = dp->d_ops->node_tree_p(node1);
-               btree2 = dp->d_ops->node_tree_p(node2);
-       }
-       blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
-       blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
-
-       /*
-        * Adjust the expected index for insertion.
-        */
-       if (blk1->index >= nodehdr1.count) {
-               blk2->index = blk1->index - nodehdr1.count;
-               blk1->index = nodehdr1.count + 1;       /* make it invalid */
-       }
-}
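
The split point above is plain arithmetic; the same computation as a runnable C fragment (bare ints in place of the icnode headers). A positive result moves entries from node1 to node2, a negative one moves them the other way, and zero leaves both nodes alone:

    #include <stdio.h>

    static int rebalance_count(int count1, int count2)
    {
            return (count1 - count2) / 2;
    }

    int main(void)
    {
            printf("%d\n", rebalance_count(8, 2)); /*  3: node1 -> node2 */
            printf("%d\n", rebalance_count(2, 8)); /* -3: node2 -> node1 */
            printf("%d\n", rebalance_count(5, 4)); /*  0: already balanced */
            return 0;
    }
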
-
-/*
- * Add a new entry to an intermediate node.
- */
-STATIC void
-xfs_da3_node_add(
-       struct xfs_da_state     *state,
-       struct xfs_da_state_blk *oldblk,
-       struct xfs_da_state_blk *newblk)
-{
-       struct xfs_da_intnode   *node;
-       struct xfs_da3_icnode_hdr nodehdr;
-       struct xfs_da_node_entry *btree;
-       int                     tmp;
-       struct xfs_inode        *dp = state->args->dp;
-
-       trace_xfs_da_node_add(state->args);
-
-       node = oldblk->bp->b_addr;
-       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-       btree = dp->d_ops->node_tree_p(node);
-
-       ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
-       ASSERT(newblk->blkno != 0);
-       if (state->args->whichfork == XFS_DATA_FORK)
-               ASSERT(newblk->blkno >= state->args->geo->leafblk &&
-                      newblk->blkno < state->args->geo->freeblk);
-
-       /*
-        * We may need to make some room before we insert the new node.
-        */
-       tmp = 0;
-       if (oldblk->index < nodehdr.count) {
-               tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree);
-               memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp);
-       }
-       btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval);
-       btree[oldblk->index].before = cpu_to_be32(newblk->blkno);
-       xfs_trans_log_buf(state->args->trans, oldblk->bp,
-               XFS_DA_LOGRANGE(node, &btree[oldblk->index],
-                               tmp + sizeof(*btree)));
-
-       nodehdr.count += 1;
-       dp->d_ops->node_hdr_to_disk(node, &nodehdr);
-       xfs_trans_log_buf(state->args->trans, oldblk->bp,
-               XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
-
-       /*
-        * Copy the last hash value from the oldblk to propagate upwards.
-        */
-       oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
-}
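
The same make-room-then-insert pattern, reduced to a userspace C sketch over a flat array instead of an on-disk node (entry_insert() and MAXENT are invented for the example):

    #include <stdio.h>
    #include <string.h>

    #define MAXENT 8

    struct entry { unsigned int hashval; unsigned int before; };

    /* Insert BEFORE 'index'; caller guarantees *count < MAXENT. */
    static void entry_insert(struct entry *btree, int *count, int index,
                             struct entry newent)
    {
            if (index < *count)
                    memmove(&btree[index + 1], &btree[index],
                            (*count - index) * sizeof(*btree));
            btree[index] = newent;
            (*count)++;
    }

    int main(void)
    {
            struct entry btree[MAXENT] = { {10, 1}, {30, 2}, {40, 3} };
            int count = 3;

            entry_insert(btree, &count, 1, (struct entry){ 20, 9 });
            for (int i = 0; i < count; i++)
                    printf("%u ", btree[i].hashval); /* 10 20 30 40 */
            printf("\n");
            return 0;
    }
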
-
-/*========================================================================
- * Routines used for shrinking the Btree.
- *========================================================================*/
-
-/*
- * Deallocate an empty leaf node, remove it from its parent,
- * possibly deallocating that block, etc...
- */
-int
-xfs_da3_join(
-       struct xfs_da_state     *state)
-{
-       struct xfs_da_state_blk *drop_blk;
-       struct xfs_da_state_blk *save_blk;
-       int                     action = 0;
-       int                     error;
-
-       trace_xfs_da_join(state->args);
-
-       drop_blk = &state->path.blk[state->path.active - 1];
-       save_blk = &state->altpath.blk[state->path.active - 1];
-       ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
-       ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
-              drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
-
-       /*
-        * Walk back up the tree joining/deallocating as necessary.
-        * When we stop dropping blocks, break out.
-        */
-       for (  ; state->path.active >= 2; drop_blk--, save_blk--,
-                state->path.active--) {
-               /*
-                * See if we can combine the block with a neighbor.
-                *   (action == 0) => no options, just leave
-                *   (action == 1) => coalesce, then unlink
-                *   (action == 2) => block empty, unlink it
-                */
-               switch (drop_blk->magic) {
-               case XFS_ATTR_LEAF_MAGIC:
-                       error = xfs_attr3_leaf_toosmall(state, &action);
-                       if (error)
-                               return error;
-                       if (action == 0)
-                               return 0;
-                       xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
-                       break;
-               case XFS_DIR2_LEAFN_MAGIC:
-                       error = xfs_dir2_leafn_toosmall(state, &action);
-                       if (error)
-                               return error;
-                       if (action == 0)
-                               return 0;
-                       xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
-                       break;
-               case XFS_DA_NODE_MAGIC:
-                       /*
-                        * Remove the offending node, fixup hashvals,
-                        * check for a toosmall neighbor.
-                        */
-                       xfs_da3_node_remove(state, drop_blk);
-                       xfs_da3_fixhashpath(state, &state->path);
-                       error = xfs_da3_node_toosmall(state, &action);
-                       if (error)
-                               return error;
-                       if (action == 0)
-                               return 0;
-                       xfs_da3_node_unbalance(state, drop_blk, save_blk);
-                       break;
-               }
-               xfs_da3_fixhashpath(state, &state->altpath);
-               error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
-               xfs_da_state_kill_altpath(state);
-               if (error)
-                       return error;
-               error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
-                                                        drop_blk->bp);
-               drop_blk->bp = NULL;
-               if (error)
-                       return error;
-       }
-       /*
-        * We joined all the way to the top.  If it turns out that
-        * we only have one entry in the root, make the child block
-        * the new root.
-        */
-       xfs_da3_node_remove(state, drop_blk);
-       xfs_da3_fixhashpath(state, &state->path);
-       error = xfs_da3_root_join(state, &state->path.blk[0]);
-       return error;
-}
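
A toy model of the walk-up loop above, using the same action codes (0 = no options, 1 = coalesce then unlink, 2 = block empty, unlink); toosmall() is a made-up stand-in for the per-magic checks:

    #include <stdio.h>

    /* Pretend only the two deepest levels can be coalesced away. */
    static int toosmall(int level)
    {
            return level >= 3 ? 1 : 0;
    }

    int main(void)
    {
            for (int active = 5; active >= 2; active--) {
                    int action = toosmall(active - 1);

                    if (action == 0) {
                            printf("level %d: nothing to join, stop\n",
                                   active - 1);
                            break;
                    }
                    printf("level %d: unlink block (action %d)\n",
                           active - 1, action);
            }
            return 0;
    }
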
-
-#ifdef DEBUG
-static void
-xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
-{
-       __be16  magic = blkinfo->magic;
-
-       if (level == 1) {
-               ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-                      magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
-                      magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
-                      magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
-       } else {
-               ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
-                      magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
-       }
-       ASSERT(!blkinfo->forw);
-       ASSERT(!blkinfo->back);
-}
-#else  /* !DEBUG */
-#define        xfs_da_blkinfo_onlychild_validate(blkinfo, level)
-#endif /* !DEBUG */
-
-/*
- * We have only one entry in the root.  Copy the only remaining child of
- * the old root to block 0 as the new root node.
- */
-STATIC int
-xfs_da3_root_join(
-       struct xfs_da_state     *state,
-       struct xfs_da_state_blk *root_blk)
-{
-       struct xfs_da_intnode   *oldroot;
-       struct xfs_da_args      *args;
-       xfs_dablk_t             child;
-       struct xfs_buf          *bp;
-       struct xfs_da3_icnode_hdr oldroothdr;
-       struct xfs_da_node_entry *btree;
-       int                     error;
-       struct xfs_inode        *dp = state->args->dp;
-
-       trace_xfs_da_root_join(state->args);
-
-       ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
-
-       args = state->args;
-       oldroot = root_blk->bp->b_addr;
-       dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot);
-       ASSERT(oldroothdr.forw == 0);
-       ASSERT(oldroothdr.back == 0);
-
-       /*
-        * If the root has more than one child, then don't do anything.
-        */
-       if (oldroothdr.count > 1)
-               return 0;
-
-       /*
-        * Read in the (only) child block, then copy those bytes into
-        * the root block's buffer and free the original child block.
-        */
-       btree = dp->d_ops->node_tree_p(oldroot);
-       child = be32_to_cpu(btree[0].before);
-       ASSERT(child != 0);
-       error = xfs_da3_node_read(args->trans, dp, child, -1, &bp,
-                                            args->whichfork);
-       if (error)
-               return error;
-       xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
-
-       /*
-        * This could be copying a leaf back into the root block in the case of
-        * there only being a single leaf block left in the tree. Hence we have
-        * to update the b_ops pointer as well to match the buffer type change
-        * that could occur. For dir3 blocks we also need to update the block
-        * number in the buffer header.
-        */
-       memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
-       root_blk->bp->b_ops = bp->b_ops;
-       xfs_trans_buf_copy_type(root_blk->bp, bp);
-       if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
-               struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
-               da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
-       }
-       xfs_trans_log_buf(args->trans, root_blk->bp, 0,
-                         args->geo->blksize - 1);
-       error = xfs_da_shrink_inode(args, child, bp);
-       return error;
-}
-
-/*
- * Check a node block and its neighbors to see if the block should be
- * collapsed into one or the other neighbor.  Always keep the block
- * with the smaller block number.
- * If the current block is over 50% full, don't try to join it; return 0.
- * If the block is empty, fill in the state structure and return 2.
- * If it can be collapsed, fill in the state structure and return 1.
- * If nothing can be done, return 0.
- */
-STATIC int
-xfs_da3_node_toosmall(
-       struct xfs_da_state     *state,
-       int                     *action)
-{
-       struct xfs_da_intnode   *node;
-       struct xfs_da_state_blk *blk;
-       struct xfs_da_blkinfo   *info;
-       xfs_dablk_t             blkno;
-       struct xfs_buf          *bp;
-       struct xfs_da3_icnode_hdr nodehdr;
-       int                     count;
-       int                     forward;
-       int                     error;
-       int                     retval;
-       int                     i;
-       struct xfs_inode        *dp = state->args->dp;
-
-       trace_xfs_da_node_toosmall(state->args);
-
-       /*
-        * Check for the degenerate case of the block being over 50% full.
-        * If so, it's not worth even looking to see if we might be able
-        * to coalesce with a sibling.
-        */
-       blk = &state->path.blk[state->path.active - 1];
-       info = blk->bp->b_addr;
-       node = (xfs_da_intnode_t *)info;
-       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-       if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
-               *action = 0;    /* blk over 50%, don't try to join */
-               return 0;
-       }
-
-       /*
-        * Check for the degenerate case of the block being empty.
-        * If the block is empty, we'll simply delete it, no need to
-        * coalesce it with a sibling block.  We choose (arbitrarily)
-        * to merge with the forward block unless it is NULL.
-        */
-       if (nodehdr.count == 0) {
-               /*
-                * Make altpath point to the block we want to keep and
-                * path point to the block we want to drop (this one).
-                */
-               forward = (info->forw != 0);
-               memcpy(&state->altpath, &state->path, sizeof(state->path));
-               error = xfs_da3_path_shift(state, &state->altpath, forward,
-                                                0, &retval);
-               if (error)
-                       return error;
-               if (retval)
-                       *action = 0;
-               else
-                       *action = 2;
-               return 0;
-       }
-
-       /*
-        * Examine each sibling block to see if we can coalesce with
-        * at least 25% free space to spare.  We need to figure out
-        * whether to merge with the forward or the backward block.
-        * We prefer coalescing with the lower numbered sibling so as
-        * to shrink a directory over time.
-        */
-       count  = state->args->geo->node_ents;
-       count -= state->args->geo->node_ents >> 2;
-       count -= nodehdr.count;
-
-       /* start with smaller blk num */
-       forward = nodehdr.forw < nodehdr.back;
-       for (i = 0; i < 2; forward = !forward, i++) {
-               struct xfs_da3_icnode_hdr thdr;
-               if (forward)
-                       blkno = nodehdr.forw;
-               else
-                       blkno = nodehdr.back;
-               if (blkno == 0)
-                       continue;
-               error = xfs_da3_node_read(state->args->trans, dp,
-                                       blkno, -1, &bp, state->args->whichfork);
-               if (error)
-                       return error;
-
-               node = bp->b_addr;
-               dp->d_ops->node_hdr_from_disk(&thdr, node);
-               xfs_trans_brelse(state->args->trans, bp);
-
-               if (count - thdr.count >= 0)
-                       break;  /* fits with at least 25% to spare */
-       }
-       if (i >= 2) {
-               *action = 0;
-               return 0;
-       }
-
-       /*
-        * Make altpath point to the block we want to keep (the lower
-        * numbered block) and path point to the block we want to drop.
-        */
-       memcpy(&state->altpath, &state->path, sizeof(state->path));
-       if (blkno < blk->blkno) {
-               error = xfs_da3_path_shift(state, &state->altpath, forward,
-                                                0, &retval);
-       } else {
-               error = xfs_da3_path_shift(state, &state->path, forward,
-                                                0, &retval);
-       }
-       if (error)
-               return error;
-       if (retval) {
-               *action = 0;
-               return 0;
-       }
-       *action = 1;
-       return 0;
-}
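
The coalesce test above works out to simple arithmetic: merging is allowed only if the combined entry count leaves at least 25% of a node free. A worked version in plain C, where node_ents is an assumed geometry value rather than the real xfs_da_geometry field:

    #include <stdio.h>

    static int can_coalesce(int node_ents, int mycount, int sibcount)
    {
            int room = node_ents - (node_ents >> 2) - mycount;

            return room - sibcount >= 0;
    }

    int main(void)
    {
            /* 64-entry nodes: at most 48 entries after the 25% reserve. */
            printf("%d\n", can_coalesce(64, 20, 25)); /* 1: 45 entries fit */
            printf("%d\n", can_coalesce(64, 30, 25)); /* 0: 55 is too many */
            return 0;
    }
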
-
-/*
- * Pick up the last hashvalue from an intermediate node.
- */
-STATIC uint
-xfs_da3_node_lasthash(
-       struct xfs_inode        *dp,
-       struct xfs_buf          *bp,
-       int                     *count)
-{
-       struct xfs_da_intnode    *node;
-       struct xfs_da_node_entry *btree;
-       struct xfs_da3_icnode_hdr nodehdr;
-
-       node = bp->b_addr;
-       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-       if (count)
-               *count = nodehdr.count;
-       if (!nodehdr.count)
-               return 0;
-       btree = dp->d_ops->node_tree_p(node);
-       return be32_to_cpu(btree[nodehdr.count - 1].hashval);
-}
-
-/*
- * Walk back up the tree adjusting hash values as necessary,
- * when we stop making changes, return.
- */
-void
-xfs_da3_fixhashpath(
-       struct xfs_da_state     *state,
-       struct xfs_da_state_path *path)
-{
-       struct xfs_da_state_blk *blk;
-       struct xfs_da_intnode   *node;
-       struct xfs_da_node_entry *btree;
-       xfs_dahash_t            lasthash = 0;
-       int                     level;
-       int                     count;
-       struct xfs_inode        *dp = state->args->dp;
-
-       trace_xfs_da_fixhashpath(state->args);
-
-       level = path->active-1;
-       blk = &path->blk[level];
-       switch (blk->magic) {
-       case XFS_ATTR_LEAF_MAGIC:
-               lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
-               if (count == 0)
-                       return;
-               break;
-       case XFS_DIR2_LEAFN_MAGIC:
-               lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count);
-               if (count == 0)
-                       return;
-               break;
-       case XFS_DA_NODE_MAGIC:
-               lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count);
-               if (count == 0)
-                       return;
-               break;
-       }
-       for (blk--, level--; level >= 0; blk--, level--) {
-               struct xfs_da3_icnode_hdr nodehdr;
-
-               node = blk->bp->b_addr;
-               dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-               btree = dp->d_ops->node_tree_p(node);
-               if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
-                       break;
-               blk->hashval = lasthash;
-               btree[blk->index].hashval = cpu_to_be32(lasthash);
-               xfs_trans_log_buf(state->args->trans, blk->bp,
-                                 XFS_DA_LOGRANGE(node, &btree[blk->index],
-                                                 sizeof(*btree)));
-
-               lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
-       }
-}
-
-/*
- * Remove an entry from an intermediate node.
- */
-STATIC void
-xfs_da3_node_remove(
-       struct xfs_da_state     *state,
-       struct xfs_da_state_blk *drop_blk)
-{
-       struct xfs_da_intnode   *node;
-       struct xfs_da3_icnode_hdr nodehdr;
-       struct xfs_da_node_entry *btree;
-       int                     index;
-       int                     tmp;
-       struct xfs_inode        *dp = state->args->dp;
-
-       trace_xfs_da_node_remove(state->args);
-
-       node = drop_blk->bp->b_addr;
-       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-       ASSERT(drop_blk->index < nodehdr.count);
-       ASSERT(drop_blk->index >= 0);
-
-       /*
-        * Copy over the offending entry, or just zero it out.
-        */
-       index = drop_blk->index;
-       btree = dp->d_ops->node_tree_p(node);
-       if (index < nodehdr.count - 1) {
-               tmp  = nodehdr.count - index - 1;
-               tmp *= (uint)sizeof(xfs_da_node_entry_t);
-               memmove(&btree[index], &btree[index + 1], tmp);
-               xfs_trans_log_buf(state->args->trans, drop_blk->bp,
-                   XFS_DA_LOGRANGE(node, &btree[index], tmp));
-               index = nodehdr.count - 1;
-       }
-       memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
-       xfs_trans_log_buf(state->args->trans, drop_blk->bp,
-           XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
-       nodehdr.count -= 1;
-       dp->d_ops->node_hdr_to_disk(node, &nodehdr);
-       xfs_trans_log_buf(state->args->trans, drop_blk->bp,
-           XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
-
-       /*
-        * Copy the last hash value from the block to propagate upwards.
-        */
-       drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
-}
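
The removal is the mirror image of xfs_da3_node_add(): close the hole with memmove(), then clear the stale last slot. A self-contained C sketch with bare hashvals in place of node entries (entry_remove() is invented for the example):

    #include <stdio.h>
    #include <string.h>

    static void entry_remove(unsigned int *btree, int *count, int index)
    {
            if (index < *count - 1)
                    memmove(&btree[index], &btree[index + 1],
                            (*count - index - 1) * sizeof(*btree));
            btree[*count - 1] = 0;          /* zero the stale last slot */
            (*count)--;
    }

    int main(void)
    {
            unsigned int btree[4] = { 10, 20, 30, 40 };
            int count = 4;

            entry_remove(btree, &count, 1);
            for (int i = 0; i < 4; i++)
                    printf("%u ", btree[i]); /* 10 30 40 0 */
            printf("\n");
            return 0;
    }
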
-
-/*
- * Unbalance the elements between two intermediate nodes,
- * moving all Btree elements from one node into the other.
- */
-STATIC void
-xfs_da3_node_unbalance(
-       struct xfs_da_state     *state,
-       struct xfs_da_state_blk *drop_blk,
-       struct xfs_da_state_blk *save_blk)
-{
-       struct xfs_da_intnode   *drop_node;
-       struct xfs_da_intnode   *save_node;
-       struct xfs_da_node_entry *drop_btree;
-       struct xfs_da_node_entry *save_btree;
-       struct xfs_da3_icnode_hdr drop_hdr;
-       struct xfs_da3_icnode_hdr save_hdr;
-       struct xfs_trans        *tp;
-       int                     sindex;
-       int                     tmp;
-       struct xfs_inode        *dp = state->args->dp;
-
-       trace_xfs_da_node_unbalance(state->args);
-
-       drop_node = drop_blk->bp->b_addr;
-       save_node = save_blk->bp->b_addr;
-       dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node);
-       dp->d_ops->node_hdr_from_disk(&save_hdr, save_node);
-       drop_btree = dp->d_ops->node_tree_p(drop_node);
-       save_btree = dp->d_ops->node_tree_p(save_node);
-       tp = state->args->trans;
-
-       /*
-        * If the dying block has lower hashvals, then move all the
-        * elements in the remaining block up to make a hole.
-        */
-       if ((be32_to_cpu(drop_btree[0].hashval) <
-                       be32_to_cpu(save_btree[0].hashval)) ||
-           (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
-                       be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) {
-               /* XXX: check this - is memmove dst correct? */
-               tmp = save_hdr.count * sizeof(xfs_da_node_entry_t);
-               memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp);
-
-               sindex = 0;
-               xfs_trans_log_buf(tp, save_blk->bp,
-                       XFS_DA_LOGRANGE(save_node, &save_btree[0],
-                               (save_hdr.count + drop_hdr.count) *
-                                               sizeof(xfs_da_node_entry_t)));
-       } else {
-               sindex = save_hdr.count;
-               xfs_trans_log_buf(tp, save_blk->bp,
-                       XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
-                               drop_hdr.count * sizeof(xfs_da_node_entry_t)));
-       }
-
-       /*
-        * Move all the B-tree elements from drop_blk to save_blk.
-        */
-       tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
-       memcpy(&save_btree[sindex], &drop_btree[0], tmp);
-       save_hdr.count += drop_hdr.count;
-
-       dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
-       xfs_trans_log_buf(tp, save_blk->bp,
-               XFS_DA_LOGRANGE(save_node, &save_node->hdr,
-                               dp->d_ops->node_hdr_size));
-
-       /*
-        * Save the last hashval in the remaining block for upward propagation.
-        */
-       save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
-}
-
-/*========================================================================
- * Routines used for finding things in the Btree.
- *========================================================================*/
-
-/*
- * Walk down the Btree looking for a particular filename, filling
- * in the state structure as we go.
- *
- * We will set the state structure to point to each of the elements
- * in each of the nodes where either the hashval is or should be.
- *
- * We support duplicate hashvals, so for each entry in the current
- * node that could contain the desired hashval, we descend.  This is a
- * pruned depth-first tree search.
- */
-int                                                    /* error */
-xfs_da3_node_lookup_int(
-       struct xfs_da_state     *state,
-       int                     *result)
-{
-       struct xfs_da_state_blk *blk;
-       struct xfs_da_blkinfo   *curr;
-       struct xfs_da_intnode   *node;
-       struct xfs_da_node_entry *btree;
-       struct xfs_da3_icnode_hdr nodehdr;
-       struct xfs_da_args      *args;
-       xfs_dablk_t             blkno;
-       xfs_dahash_t            hashval;
-       xfs_dahash_t            btreehashval;
-       int                     probe;
-       int                     span;
-       int                     max;
-       int                     error;
-       int                     retval;
-       struct xfs_inode        *dp = state->args->dp;
-
-       args = state->args;
-
-       /*
-        * Descend through the B-tree, searching each level for the right
-        * node to use, until the right hashval is found.
-        */
-       blkno = (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0;
-       for (blk = &state->path.blk[0], state->path.active = 1;
-                        state->path.active <= XFS_DA_NODE_MAXDEPTH;
-                        blk++, state->path.active++) {
-               /*
-                * Read the next node down in the tree.
-                */
-               blk->blkno = blkno;
-               error = xfs_da3_node_read(args->trans, args->dp, blkno,
-                                       -1, &blk->bp, args->whichfork);
-               if (error) {
-                       blk->blkno = 0;
-                       state->path.active--;
-                       return error;
-               }
-               curr = blk->bp->b_addr;
-               blk->magic = be16_to_cpu(curr->magic);
-
-               if (blk->magic == XFS_ATTR_LEAF_MAGIC ||
-                   blk->magic == XFS_ATTR3_LEAF_MAGIC) {
-                       blk->magic = XFS_ATTR_LEAF_MAGIC;
-                       blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
-                       break;
-               }
-
-               if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
-                   blk->magic == XFS_DIR3_LEAFN_MAGIC) {
-                       blk->magic = XFS_DIR2_LEAFN_MAGIC;
-                       blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
-                                                              blk->bp, NULL);
-                       break;
-               }
-
-               blk->magic = XFS_DA_NODE_MAGIC;
-
-               /*
-                * Search an intermediate node for a match.
-                */
-               node = blk->bp->b_addr;
-               dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-               btree = dp->d_ops->node_tree_p(node);
-
-               max = nodehdr.count;
-               blk->hashval = be32_to_cpu(btree[max - 1].hashval);
-
-               /*
-                * Binary search.  (note: small blocks will skip loop)
-                */
-               probe = span = max / 2;
-               hashval = args->hashval;
-               while (span > 4) {
-                       span /= 2;
-                       btreehashval = be32_to_cpu(btree[probe].hashval);
-                       if (btreehashval < hashval)
-                               probe += span;
-                       else if (btreehashval > hashval)
-                               probe -= span;
-                       else
-                               break;
-               }
-               ASSERT((probe >= 0) && (probe < max));
-               ASSERT((span <= 4) ||
-                       (be32_to_cpu(btree[probe].hashval) == hashval));
-
-               /*
-                * Since we may have duplicate hashvals, find the first
-                * matching hashval in the node.
-                */
-               while (probe > 0 &&
-                      be32_to_cpu(btree[probe].hashval) >= hashval) {
-                       probe--;
-               }
-               while (probe < max &&
-                      be32_to_cpu(btree[probe].hashval) < hashval) {
-                       probe++;
-               }
-
-               /*
-                * Pick the right block to descend on.
-                */
-               if (probe == max) {
-                       blk->index = max - 1;
-                       blkno = be32_to_cpu(btree[max - 1].before);
-               } else {
-                       blk->index = probe;
-                       blkno = be32_to_cpu(btree[probe].before);
-               }
-       }
-
-       /*
-        * A leaf block that ends in the hashval that we are interested in
-        * (final hashval == search hashval) means that the next block may
-        * contain more entries with the same hashval, shift upward to the
-        * next leaf and keep searching.
-        */
-       for (;;) {
-               if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
-                       retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
-                                                       &blk->index, state);
-               } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
-                       retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
-                       blk->index = args->index;
-                       args->blkno = blk->blkno;
-               } else {
-                       ASSERT(0);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
-               if (((retval == ENOENT) || (retval == ENOATTR)) &&
-                   (blk->hashval == args->hashval)) {
-                       error = xfs_da3_path_shift(state, &state->path, 1, 1,
-                                                        &retval);
-                       if (error)
-                               return error;
-                       if (retval == 0) {
-                               continue;
-                       } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
-                               /* path_shift() gives ENOENT */
-                               retval = XFS_ERROR(ENOATTR);
-                       }
-               }
-               break;
-       }
-       *result = retval;
-       return 0;
-}
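
The probe logic above is worth seeing in isolation: binary-search until the span is small, then scan linearly to the first entry whose hash is greater than or equal to the target, so runs of duplicate hashvals are always entered at their first slot. A minimal standalone model (first_match() is invented for the example):

    #include <stdio.h>

    static int first_match(const unsigned int *hash, int max,
                           unsigned int want)
    {
            int probe = max / 2, span = max / 2;

            while (span > 4) {
                    span /= 2;
                    if (hash[probe] < want)
                            probe += span;
                    else if (hash[probe] > want)
                            probe -= span;
                    else
                            break;
            }
            /* Back up to the first duplicate, then skip smaller hashes. */
            while (probe > 0 && hash[probe] >= want)
                    probe--;
            while (probe < max && hash[probe] < want)
                    probe++;
            return probe;   /* == max means "descend via the last entry" */
    }

    int main(void)
    {
            unsigned int hash[] = { 10, 20, 20, 20, 30, 40, 50, 60 };

            printf("%d\n", first_match(hash, 8, 20)); /* 1: first dup */
            printf("%d\n", first_match(hash, 8, 70)); /* 8: off the end */
            return 0;
    }
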
-
-/*========================================================================
- * Utility routines.
- *========================================================================*/
-
-/*
- * Compare two intermediate nodes for "order".
- */
-STATIC int
-xfs_da3_node_order(
-       struct xfs_inode *dp,
-       struct xfs_buf  *node1_bp,
-       struct xfs_buf  *node2_bp)
-{
-       struct xfs_da_intnode   *node1;
-       struct xfs_da_intnode   *node2;
-       struct xfs_da_node_entry *btree1;
-       struct xfs_da_node_entry *btree2;
-       struct xfs_da3_icnode_hdr node1hdr;
-       struct xfs_da3_icnode_hdr node2hdr;
-
-       node1 = node1_bp->b_addr;
-       node2 = node2_bp->b_addr;
-       dp->d_ops->node_hdr_from_disk(&node1hdr, node1);
-       dp->d_ops->node_hdr_from_disk(&node2hdr, node2);
-       btree1 = dp->d_ops->node_tree_p(node1);
-       btree2 = dp->d_ops->node_tree_p(node2);
-
-       if (node1hdr.count > 0 && node2hdr.count > 0 &&
-           ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
-            (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
-             be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) {
-               return 1;
-       }
-       return 0;
-}
-
-/*
- * Link a new block into a doubly linked list of blocks (of whatever type).
- */
-int                                                    /* error */
-xfs_da3_blk_link(
-       struct xfs_da_state     *state,
-       struct xfs_da_state_blk *old_blk,
-       struct xfs_da_state_blk *new_blk)
-{
-       struct xfs_da_blkinfo   *old_info;
-       struct xfs_da_blkinfo   *new_info;
-       struct xfs_da_blkinfo   *tmp_info;
-       struct xfs_da_args      *args;
-       struct xfs_buf          *bp;
-       int                     before = 0;
-       int                     error;
-       struct xfs_inode        *dp = state->args->dp;
-
-       /*
-        * Set up environment.
-        */
-       args = state->args;
-       ASSERT(args != NULL);
-       old_info = old_blk->bp->b_addr;
-       new_info = new_blk->bp->b_addr;
-       ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
-              old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
-              old_blk->magic == XFS_ATTR_LEAF_MAGIC);
-
-       switch (old_blk->magic) {
-       case XFS_ATTR_LEAF_MAGIC:
-               before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
-               break;
-       case XFS_DIR2_LEAFN_MAGIC:
-               before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp);
-               break;
-       case XFS_DA_NODE_MAGIC:
-               before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp);
-               break;
-       }
-
-       /*
-        * Link blocks in appropriate order.
-        */
-       if (before) {
-               /*
-                * Link new block in before existing block.
-                */
-               trace_xfs_da_link_before(args);
-               new_info->forw = cpu_to_be32(old_blk->blkno);
-               new_info->back = old_info->back;
-               if (old_info->back) {
-                       error = xfs_da3_node_read(args->trans, dp,
-                                               be32_to_cpu(old_info->back),
-                                               -1, &bp, args->whichfork);
-                       if (error)
-                               return error;
-                       ASSERT(bp != NULL);
-                       tmp_info = bp->b_addr;
-                       ASSERT(tmp_info->magic == old_info->magic);
-                       ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
-                       tmp_info->forw = cpu_to_be32(new_blk->blkno);
-                       xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
-               }
-               old_info->back = cpu_to_be32(new_blk->blkno);
-       } else {
-               /*
-                * Link new block in after existing block.
-                */
-               trace_xfs_da_link_after(args);
-               new_info->forw = old_info->forw;
-               new_info->back = cpu_to_be32(old_blk->blkno);
-               if (old_info->forw) {
-                       error = xfs_da3_node_read(args->trans, dp,
-                                               be32_to_cpu(old_info->forw),
-                                               -1, &bp, args->whichfork);
-                       if (error)
-                               return error;
-                       ASSERT(bp != NULL);
-                       tmp_info = bp->b_addr;
-                       ASSERT(tmp_info->magic == old_info->magic);
-                       ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
-                       tmp_info->back = cpu_to_be32(new_blk->blkno);
-                       xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
-               }
-               old_info->forw = cpu_to_be32(new_blk->blkno);
-       }
-
-       xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
-       xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
-       return 0;
-}
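
Stripped of buffers and logging, the link step is ordinary doubly linked list surgery. A pointer-based C sketch of the "link after" case (blk_link_after() is invented; the on-disk code stores block numbers rather than pointers):

    #include <stdio.h>

    struct blk {
            struct blk *forw, *back;
            int blkno;
    };

    static void blk_link_after(struct blk *old, struct blk *newblk)
    {
            newblk->forw = old->forw;
            newblk->back = old;
            if (old->forw)
                    old->forw->back = newblk;
            old->forw = newblk;
    }

    int main(void)
    {
            struct blk a = { .blkno = 1 }, b = { .blkno = 2 },
                       c = { .blkno = 3 };

            blk_link_after(&a, &c);                 /* 1 <-> 3 */
            blk_link_after(&a, &b);                 /* 1 <-> 2 <-> 3 */
            for (struct blk *p = &a; p; p = p->forw)
                    printf("%d ", p->blkno);        /* 1 2 3 */
            printf("\n");
            return 0;
    }
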
-
-/*
- * Unlink a block from a doubly linked list of blocks.
- */
-STATIC int                                             /* error */
-xfs_da3_blk_unlink(
-       struct xfs_da_state     *state,
-       struct xfs_da_state_blk *drop_blk,
-       struct xfs_da_state_blk *save_blk)
-{
-       struct xfs_da_blkinfo   *drop_info;
-       struct xfs_da_blkinfo   *save_info;
-       struct xfs_da_blkinfo   *tmp_info;
-       struct xfs_da_args      *args;
-       struct xfs_buf          *bp;
-       int                     error;
-
-       /*
-        * Set up environment.
-        */
-       args = state->args;
-       ASSERT(args != NULL);
-       save_info = save_blk->bp->b_addr;
-       drop_info = drop_blk->bp->b_addr;
-       ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
-              save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
-              save_blk->magic == XFS_ATTR_LEAF_MAGIC);
-       ASSERT(save_blk->magic == drop_blk->magic);
-       ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
-              (be32_to_cpu(save_info->back) == drop_blk->blkno));
-       ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) ||
-              (be32_to_cpu(drop_info->back) == save_blk->blkno));
-
-       /*
-        * Unlink the leaf block from the doubly linked chain of leaves.
-        */
-       if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
-               trace_xfs_da_unlink_back(args);
-               save_info->back = drop_info->back;
-               if (drop_info->back) {
-                       error = xfs_da3_node_read(args->trans, args->dp,
-                                               be32_to_cpu(drop_info->back),
-                                               -1, &bp, args->whichfork);
-                       if (error)
-                               return error;
-                       ASSERT(bp != NULL);
-                       tmp_info = bp->b_addr;
-                       ASSERT(tmp_info->magic == save_info->magic);
-                       ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
-                       tmp_info->forw = cpu_to_be32(save_blk->blkno);
-                       xfs_trans_log_buf(args->trans, bp, 0,
-                                                   sizeof(*tmp_info) - 1);
-               }
-       } else {
-               trace_xfs_da_unlink_forward(args);
-               save_info->forw = drop_info->forw;
-               if (drop_info->forw) {
-                       error = xfs_da3_node_read(args->trans, args->dp,
-                                               be32_to_cpu(drop_info->forw),
-                                               -1, &bp, args->whichfork);
-                       if (error)
-                               return error;
-                       ASSERT(bp != NULL);
-                       tmp_info = bp->b_addr;
-                       ASSERT(tmp_info->magic == save_info->magic);
-                       ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
-                       tmp_info->back = cpu_to_be32(save_blk->blkno);
-                       xfs_trans_log_buf(args->trans, bp, 0,
-                                                   sizeof(*tmp_info) - 1);
-               }
-       }
-
-       xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
-       return 0;
-}
-
-/*
- * Move a path "forward" or "!forward" one block at the current level.
- *
- * This routine will adjust a "path" to point to the next block
- * "forward" (higher hashvals) or "!forward" (lower hashvals) in the
- * Btree, including updating pointers to the intermediate nodes between
- * the new bottom and the root.
- */
-int                                                    /* error */
-xfs_da3_path_shift(
-       struct xfs_da_state     *state,
-       struct xfs_da_state_path *path,
-       int                     forward,
-       int                     release,
-       int                     *result)
-{
-       struct xfs_da_state_blk *blk;
-       struct xfs_da_blkinfo   *info;
-       struct xfs_da_intnode   *node;
-       struct xfs_da_args      *args;
-       struct xfs_da_node_entry *btree;
-       struct xfs_da3_icnode_hdr nodehdr;
-       xfs_dablk_t             blkno = 0;
-       int                     level;
-       int                     error;
-       struct xfs_inode        *dp = state->args->dp;
-
-       trace_xfs_da_path_shift(state->args);
-
-       /*
-        * Roll up the Btree looking for the first block where our
-        * current index is not at the edge of the block.  Note that
-        * we skip the bottom layer because we want the sibling block.
-        */
-       args = state->args;
-       ASSERT(args != NULL);
-       ASSERT(path != NULL);
-       ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
-       level = (path->active-1) - 1;   /* skip bottom layer in path */
-       for (blk = &path->blk[level]; level >= 0; blk--, level--) {
-               node = blk->bp->b_addr;
-               dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-               btree = dp->d_ops->node_tree_p(node);
-
-               if (forward && (blk->index < nodehdr.count - 1)) {
-                       blk->index++;
-                       blkno = be32_to_cpu(btree[blk->index].before);
-                       break;
-               } else if (!forward && (blk->index > 0)) {
-                       blk->index--;
-                       blkno = be32_to_cpu(btree[blk->index].before);
-                       break;
-               }
-       }
-       if (level < 0) {
-               *result = XFS_ERROR(ENOENT);    /* we're out of our tree */
-               ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-               return 0;
-       }
-
-       /*
-        * Roll down the edge of the subtree until we reach the
-        * same depth we were at originally.
-        */
-       for (blk++, level++; level < path->active; blk++, level++) {
-               /*
-                * Release the old block.
-                * (if it's dirty, trans won't actually let go)
-                */
-               if (release)
-                       xfs_trans_brelse(args->trans, blk->bp);
-
-               /*
-                * Read the next child block.
-                */
-               blk->blkno = blkno;
-               error = xfs_da3_node_read(args->trans, dp, blkno, -1,
-                                       &blk->bp, args->whichfork);
-               if (error)
-                       return error;
-               info = blk->bp->b_addr;
-               ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
-                      info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
-                      info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-                      info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
-                      info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
-                      info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
-
-               /*
-                * Note: we flatten the magic number to a single type so we
-                * don't have to compare against crc/non-crc types elsewhere.
-                */
-               switch (be16_to_cpu(info->magic)) {
-               case XFS_DA_NODE_MAGIC:
-               case XFS_DA3_NODE_MAGIC:
-                       blk->magic = XFS_DA_NODE_MAGIC;
-                       node = (xfs_da_intnode_t *)info;
-                       dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-                       btree = dp->d_ops->node_tree_p(node);
-                       blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
-                       if (forward)
-                               blk->index = 0;
-                       else
-                               blk->index = nodehdr.count - 1;
-                       blkno = be32_to_cpu(btree[blk->index].before);
-                       break;
-               case XFS_ATTR_LEAF_MAGIC:
-               case XFS_ATTR3_LEAF_MAGIC:
-                       blk->magic = XFS_ATTR_LEAF_MAGIC;
-                       ASSERT(level == path->active-1);
-                       blk->index = 0;
-                       blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
-                       break;
-               case XFS_DIR2_LEAFN_MAGIC:
-               case XFS_DIR3_LEAFN_MAGIC:
-                       blk->magic = XFS_DIR2_LEAFN_MAGIC;
-                       ASSERT(level == path->active-1);
-                       blk->index = 0;
-                       blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
-                                                              blk->bp, NULL);
-                       break;
-               default:
-                       ASSERT(0);
-                       break;
-               }
-       }
-       *result = 0;
-       return 0;
-}
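
A toy version of the shift, with one index per level and the leaf level folded in for simplicity: climb until some level's index can still move forward, then descend taking the leftmost entry at each lower level (counts[] and idx[] are invented):

    #include <stdio.h>

    int main(void)
    {
            int counts[3] = { 4, 4, 4 };    /* entries per node, root first */
            int idx[3]    = { 1, 3, 3 };    /* current path through the tree */
            int level;

            /* Climb until an index can still move forward at its level. */
            for (level = 2; level >= 0; level--) {
                    if (idx[level] < counts[level] - 1) {
                            idx[level]++;
                            break;
                    }
            }
            if (level < 0) {
                    printf("ENOENT: walked off the end of the tree\n");
                    return 0;
            }
            /* Descend the new subtree along its leftmost edge. */
            for (level++; level < 3; level++)
                    idx[level] = 0;
            printf("new path: %d %d %d\n", idx[0], idx[1], idx[2]);
            return 0;
    }
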
-
-/*========================================================================
- * Utility routines.
- *========================================================================*/
-
-/*
- * Implement a simple hash on a character string.
- * Rotate the hash value by 7 bits, then XOR each character in.
- * This is implemented with some source-level loop unrolling.
- */
-xfs_dahash_t
-xfs_da_hashname(const __uint8_t *name, int namelen)
-{
-       xfs_dahash_t hash;
-
-       /*
-        * Do four characters at a time as long as we can.
-        */
-       for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
-               hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
-                      (name[3] << 0) ^ rol32(hash, 7 * 4);
-
-       /*
-        * Now do the rest of the characters.
-        */
-       switch (namelen) {
-       case 3:
-               return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
-                      rol32(hash, 7 * 3);
-       case 2:
-               return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
-       case 1:
-               return (name[0] << 0) ^ rol32(hash, 7 * 1);
-       default: /* case 0: */
-               return hash;
-       }
-}
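
The hash is easy to lift out and run on its own; a userspace copy with rol32() spelled out, since the kernel helper isn't available outside the tree:

    #include <inttypes.h>
    #include <stdio.h>

    static uint32_t rol32(uint32_t word, unsigned int shift)
    {
            return (word << shift) | (word >> (32 - shift));
    }

    static uint32_t da_hashname(const uint8_t *name, int namelen)
    {
            uint32_t hash;

            /* Four characters at a time, rotating by 7 bits per char. */
            for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
                    hash = (name[0] << 21) ^ (name[1] << 14) ^
                           (name[2] << 7) ^ (name[3] << 0) ^
                           rol32(hash, 7 * 4);

            switch (namelen) {
            case 3:
                    return (name[0] << 14) ^ (name[1] << 7) ^
                           (name[2] << 0) ^ rol32(hash, 7 * 3);
            case 2:
                    return (name[0] << 7) ^ (name[1] << 0) ^
                           rol32(hash, 7 * 2);
            case 1:
                    return (name[0] << 0) ^ rol32(hash, 7 * 1);
            default:
                    return hash;
            }
    }

    int main(void)
    {
            printf("%#" PRIx32 "\n",
                   da_hashname((const uint8_t *)"lost+found", 10));
            return 0;
    }
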
-
-enum xfs_dacmp
-xfs_da_compname(
-       struct xfs_da_args *args,
-       const unsigned char *name,
-       int             len)
-{
-       return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
-                                       XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
-}
-
-static xfs_dahash_t
-xfs_default_hashname(
-       struct xfs_name *name)
-{
-       return xfs_da_hashname(name->name, name->len);
-}
-
-const struct xfs_nameops xfs_default_nameops = {
-       .hashname       = xfs_default_hashname,
-       .compname       = xfs_da_compname,
-};
-
-int
-xfs_da_grow_inode_int(
-       struct xfs_da_args      *args,
-       xfs_fileoff_t           *bno,
-       int                     count)
-{
-       struct xfs_trans        *tp = args->trans;
-       struct xfs_inode        *dp = args->dp;
-       int                     w = args->whichfork;
-       xfs_drfsbno_t           nblks = dp->i_d.di_nblocks;
-       struct xfs_bmbt_irec    map, *mapp;
-       int                     nmap, error, got, i, mapi;
-
-       /*
-        * Find a spot in the file space to put the new block.
-        */
-       error = xfs_bmap_first_unused(tp, dp, count, bno, w);
-       if (error)
-               return error;
-
-       /*
-        * Try mapping it in one filesystem block.
-        */
-       nmap = 1;
-       ASSERT(args->firstblock != NULL);
-       error = xfs_bmapi_write(tp, dp, *bno, count,
-                       xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
-                       args->firstblock, args->total, &map, &nmap,
-                       args->flist);
-       if (error)
-               return error;
-
-       ASSERT(nmap <= 1);
-       if (nmap == 1) {
-               mapp = &map;
-               mapi = 1;
-       } else if (nmap == 0 && count > 1) {
-               xfs_fileoff_t           b;
-               int                     c;
-
-               /*
-                * If we didn't get it and the block might work if fragmented,
-                * try without the CONTIG flag.  Loop until we get it all.
-                */
-               mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
-               for (b = *bno, mapi = 0; b < *bno + count; ) {
-                       nmap = MIN(XFS_BMAP_MAX_NMAP, count);
-                       c = (int)(*bno + count - b);
-                       error = xfs_bmapi_write(tp, dp, b, c,
-                                       xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
-                                       args->firstblock, args->total,
-                                       &mapp[mapi], &nmap, args->flist);
-                       if (error)
-                               goto out_free_map;
-                       if (nmap < 1)
-                               break;
-                       mapi += nmap;
-                       b = mapp[mapi - 1].br_startoff +
-                           mapp[mapi - 1].br_blockcount;
-               }
-       } else {
-               mapi = 0;
-               mapp = NULL;
-       }
-
-       /*
-        * Count the blocks we got, make sure it matches the total.
-        */
-       for (i = 0, got = 0; i < mapi; i++)
-               got += mapp[i].br_blockcount;
-       if (got != count || mapp[0].br_startoff != *bno ||
-           mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
-           *bno + count) {
-               error = XFS_ERROR(ENOSPC);
-               goto out_free_map;
-       }
-
-       /* account for newly allocated blocks in reserved blocks total */
-       args->total -= dp->i_d.di_nblocks - nblks;
-
-out_free_map:
-       if (mapp != &map)
-               kmem_free(mapp);
-       return error;
-}
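
The shape of the fallback above is: try one contiguous grab, and if that fails, take whatever extents are available until the range is covered. A toy C model where alloc_extent() is an invented stand-in for xfs_bmapi_write():

    #include <stdio.h>

    /* Pretend the allocator can hand out at most 2 blocks at a time. */
    static int alloc_extent(int want)
    {
            return want > 2 ? 0 : want;     /* 0 = contiguous grab failed */
    }

    int main(void)
    {
            int count = 5, got = 0;

            if (alloc_extent(count)) {
                    got = count;            /* one contiguous mapping */
            } else {
                    while (got < count) {   /* loop until we get it all */
                            int c = count - got > 2 ? 2 : count - got;

                            got += alloc_extent(c);
                    }
            }
            printf("mapped %d of %d blocks\n", got, count);
            return 0;
    }
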
-
-/*
- * Add a block to the btree ahead of the file.
- * Return the new block number to the caller.
- */
-int
-xfs_da_grow_inode(
-       struct xfs_da_args      *args,
-       xfs_dablk_t             *new_blkno)
-{
-       xfs_fileoff_t           bno;
-       int                     error;
-
-       trace_xfs_da_grow_inode(args);
-
-       bno = args->geo->leafblk;
-       error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount);
-       if (!error)
-               *new_blkno = (xfs_dablk_t)bno;
-       return error;
-}
-
-/*
- * Ick.  We need to always be able to remove a btree block, even
- * if there's no space reservation because the filesystem is full.
- * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
- * It swaps the target block with the last block in the file.  The
- * last block in the file can always be removed, since removing it
- * cannot cause a bmap btree split.
- */
-STATIC int
-xfs_da3_swap_lastblock(
-       struct xfs_da_args      *args,
-       xfs_dablk_t             *dead_blknop,
-       struct xfs_buf          **dead_bufp)
-{
-       struct xfs_da_blkinfo   *dead_info;
-       struct xfs_da_blkinfo   *sib_info;
-       struct xfs_da_intnode   *par_node;
-       struct xfs_da_intnode   *dead_node;
-       struct xfs_dir2_leaf    *dead_leaf2;
-       struct xfs_da_node_entry *btree;
-       struct xfs_da3_icnode_hdr par_hdr;
-       struct xfs_inode        *dp;
-       struct xfs_trans        *tp;
-       struct xfs_mount        *mp;
-       struct xfs_buf          *dead_buf;
-       struct xfs_buf          *last_buf;
-       struct xfs_buf          *sib_buf;
-       struct xfs_buf          *par_buf;
-       xfs_dahash_t            dead_hash;
-       xfs_fileoff_t           lastoff;
-       xfs_dablk_t             dead_blkno;
-       xfs_dablk_t             last_blkno;
-       xfs_dablk_t             sib_blkno;
-       xfs_dablk_t             par_blkno;
-       int                     error;
-       int                     w;
-       int                     entno;
-       int                     level;
-       int                     dead_level;
-
-       trace_xfs_da_swap_lastblock(args);
-
-       dead_buf = *dead_bufp;
-       dead_blkno = *dead_blknop;
-       tp = args->trans;
-       dp = args->dp;
-       w = args->whichfork;
-       ASSERT(w == XFS_DATA_FORK);
-       mp = dp->i_mount;
-       lastoff = args->geo->freeblk;
-       error = xfs_bmap_last_before(tp, dp, &lastoff, w);
-       if (error)
-               return error;
-       if (unlikely(lastoff == 0)) {
-               XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
-                                mp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-       /*
-        * Read the last block in the btree space.
-        */
-       last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
-       error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
-       if (error)
-               return error;
-       /*
-        * Copy the last block into the dead buffer and log it.
-        */
-       memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
-       xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
-       dead_info = dead_buf->b_addr;
-       /*
-        * Get values from the moved block.
-        */
-       if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-           dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
-               struct xfs_dir3_icleaf_hdr leafhdr;
-               struct xfs_dir2_leaf_entry *ents;
-
-               dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
-               dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
-               ents = dp->d_ops->leaf_ents_p(dead_leaf2);
-               dead_level = 0;
-               dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
-       } else {
-               struct xfs_da3_icnode_hdr deadhdr;
-
-               dead_node = (xfs_da_intnode_t *)dead_info;
-               dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
-               btree = dp->d_ops->node_tree_p(dead_node);
-               dead_level = deadhdr.level;
-               dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
-       }
-       sib_buf = par_buf = NULL;
-       /*
-        * If the moved block has a left sibling, fix up the pointers.
-        */
-       if ((sib_blkno = be32_to_cpu(dead_info->back))) {
-               error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
-               if (error)
-                       goto done;
-               sib_info = sib_buf->b_addr;
-               if (unlikely(
-                   be32_to_cpu(sib_info->forw) != last_blkno ||
-                   sib_info->magic != dead_info->magic)) {
-                       XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
-                                        XFS_ERRLEVEL_LOW, mp);
-                       error = XFS_ERROR(EFSCORRUPTED);
-                       goto done;
-               }
-               sib_info->forw = cpu_to_be32(dead_blkno);
-               xfs_trans_log_buf(tp, sib_buf,
-                       XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
-                                       sizeof(sib_info->forw)));
-               sib_buf = NULL;
-       }
-       /*
-        * If the moved block has a right sibling, fix up the pointers.
-        */
-       if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
-               error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
-               if (error)
-                       goto done;
-               sib_info = sib_buf->b_addr;
-               if (unlikely(
-                      be32_to_cpu(sib_info->back) != last_blkno ||
-                      sib_info->magic != dead_info->magic)) {
-                       XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
-                                        XFS_ERRLEVEL_LOW, mp);
-                       error = XFS_ERROR(EFSCORRUPTED);
-                       goto done;
-               }
-               sib_info->back = cpu_to_be32(dead_blkno);
-               xfs_trans_log_buf(tp, sib_buf,
-                       XFS_DA_LOGRANGE(sib_info, &sib_info->back,
-                                       sizeof(sib_info->back)));
-               sib_buf = NULL;
-       }
-       par_blkno = args->geo->leafblk;
-       level = -1;
-       /*
-        * Walk down the tree looking for the parent of the moved block.
-        */
-       for (;;) {
-               error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
-               if (error)
-                       goto done;
-               par_node = par_buf->b_addr;
-               dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
-               if (level >= 0 && level != par_hdr.level + 1) {
-                       XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
-                                        XFS_ERRLEVEL_LOW, mp);
-                       error = XFS_ERROR(EFSCORRUPTED);
-                       goto done;
-               }
-               level = par_hdr.level;
-               btree = dp->d_ops->node_tree_p(par_node);
-               for (entno = 0;
-                    entno < par_hdr.count &&
-                    be32_to_cpu(btree[entno].hashval) < dead_hash;
-                    entno++)
-                       continue;
-               if (entno == par_hdr.count) {
-                       XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
-                                        XFS_ERRLEVEL_LOW, mp);
-                       error = XFS_ERROR(EFSCORRUPTED);
-                       goto done;
-               }
-               par_blkno = be32_to_cpu(btree[entno].before);
-               if (level == dead_level + 1)
-                       break;
-               xfs_trans_brelse(tp, par_buf);
-               par_buf = NULL;
-       }
-       /*
-        * We're in the right parent block.
-        * Look for the right entry.
-        */
-       for (;;) {
-               for (;
-                    entno < par_hdr.count &&
-                    be32_to_cpu(btree[entno].before) != last_blkno;
-                    entno++)
-                       continue;
-               if (entno < par_hdr.count)
-                       break;
-               par_blkno = par_hdr.forw;
-               xfs_trans_brelse(tp, par_buf);
-               par_buf = NULL;
-               if (unlikely(par_blkno == 0)) {
-                       XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
-                                        XFS_ERRLEVEL_LOW, mp);
-                       error = XFS_ERROR(EFSCORRUPTED);
-                       goto done;
-               }
-               error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
-               if (error)
-                       goto done;
-               par_node = par_buf->b_addr;
-               dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
-               if (par_hdr.level != level) {
-                       XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
-                                        XFS_ERRLEVEL_LOW, mp);
-                       error = XFS_ERROR(EFSCORRUPTED);
-                       goto done;
-               }
-               btree = dp->d_ops->node_tree_p(par_node);
-               entno = 0;
-       }
-       /*
-        * Update the parent entry pointing to the moved block.
-        */
-       btree[entno].before = cpu_to_be32(dead_blkno);
-       xfs_trans_log_buf(tp, par_buf,
-               XFS_DA_LOGRANGE(par_node, &btree[entno].before,
-                               sizeof(btree[entno].before)));
-       *dead_blknop = last_blkno;
-       *dead_bufp = last_buf;
-       return 0;
-done:
-       if (par_buf)
-               xfs_trans_brelse(tp, par_buf);
-       if (sib_buf)
-               xfs_trans_brelse(tp, sib_buf);
-       xfs_trans_brelse(tp, last_buf);
-       return error;
-}
-
-/*
- * Remove a btree block from a directory or attribute.
- */
-int
-xfs_da_shrink_inode(
-       xfs_da_args_t   *args,
-       xfs_dablk_t     dead_blkno,
-       struct xfs_buf  *dead_buf)
-{
-       xfs_inode_t *dp;
-       int done, error, w, count;
-       xfs_trans_t *tp;
-       xfs_mount_t *mp;
-
-       trace_xfs_da_shrink_inode(args);
-
-       dp = args->dp;
-       w = args->whichfork;
-       tp = args->trans;
-       mp = dp->i_mount;
-       count = args->geo->fsbcount;
-       for (;;) {
-               /*
-                * Remove extents.  If we get ENOSPC for a dir we have to move
-                * the last block into the place of the block we want to kill.
-                */
-               error = xfs_bunmapi(tp, dp, dead_blkno, count,
-                                   xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
-                                   0, args->firstblock, args->flist, &done);
-               if (error == ENOSPC) {
-                       if (w != XFS_DATA_FORK)
-                               break;
-                       error = xfs_da3_swap_lastblock(args, &dead_blkno,
-                                                     &dead_buf);
-                       if (error)
-                               break;
-               } else {
-                       break;
-               }
-       }
-       xfs_trans_binval(tp, dead_buf);
-       return error;
-}
-
-/*
- * See if the mapping(s) for this btree block are valid, i.e.
- * don't contain holes, are logically contiguous, and cover the whole range.
- */
-STATIC int
-xfs_da_map_covers_blocks(
-       int             nmap,
-       xfs_bmbt_irec_t *mapp,
-       xfs_dablk_t     bno,
-       int             count)
-{
-       int             i;
-       xfs_fileoff_t   off;
-
-       for (i = 0, off = bno; i < nmap; i++) {
-               if (mapp[i].br_startblock == HOLESTARTBLOCK ||
-                   mapp[i].br_startblock == DELAYSTARTBLOCK) {
-                       return 0;
-               }
-               if (off != mapp[i].br_startoff) {
-                       return 0;
-               }
-               off += mapp[i].br_blockcount;
-       }
-       return off == bno + count;
-}
-
-/*
- * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map.
- *
- * For the single map case, it is assumed that the caller has provided a pointer
- * to a valid xfs_buf_map.  For the multiple map case, this function will
- * allocate the xfs_buf_map to hold all the maps and replace the caller's single
- * map pointer with the allocated map.
- */
-static int
-xfs_buf_map_from_irec(
-       struct xfs_mount        *mp,
-       struct xfs_buf_map      **mapp,
-       int                     *nmaps,
-       struct xfs_bmbt_irec    *irecs,
-       int                     nirecs)
-{
-       struct xfs_buf_map      *map;
-       int                     i;
-
-       ASSERT(*nmaps == 1);
-       ASSERT(nirecs >= 1);
-
-       if (nirecs > 1) {
-               map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
-                                 KM_SLEEP | KM_NOFS);
-               if (!map)
-                       return ENOMEM;
-               *mapp = map;
-       }
-
-       *nmaps = nirecs;
-       map = *mapp;
-       for (i = 0; i < *nmaps; i++) {
-               ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK &&
-                      irecs[i].br_startblock != HOLESTARTBLOCK);
-               map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
-               map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
-       }
-       return 0;
-}
-
-/*
- * Map the block we are given, ready for reading. There are three possible
- * return values:
- *     -1 - will be returned if we land in a hole and mappedbno == -2 so the
- *          caller knows not to execute a subsequent read.
- *      0 - if we mapped the block successfully
- *     >0 - positive error number if there was an error.
- */
-static int
-xfs_dabuf_map(
-       struct xfs_inode        *dp,
-       xfs_dablk_t             bno,
-       xfs_daddr_t             mappedbno,
-       int                     whichfork,
-       struct xfs_buf_map      **map,
-       int                     *nmaps)
-{
-       struct xfs_mount        *mp = dp->i_mount;
-       int                     nfsb;
-       int                     error = 0;
-       struct xfs_bmbt_irec    irec;
-       struct xfs_bmbt_irec    *irecs = &irec;
-       int                     nirecs;
-
-       ASSERT(map && *map);
-       ASSERT(*nmaps == 1);
-
-       if (whichfork == XFS_DATA_FORK)
-               nfsb = mp->m_dir_geo->fsbcount;
-       else
-               nfsb = mp->m_attr_geo->fsbcount;
-
-       /*
-        * Caller doesn't have a mapping.  -2 means don't complain
-        * if we land in a hole.
-        */
-       if (mappedbno == -1 || mappedbno == -2) {
-               /*
-                * Optimize the one-block case.
-                */
-               if (nfsb != 1)
-                       irecs = kmem_zalloc(sizeof(irec) * nfsb,
-                                           KM_SLEEP | KM_NOFS);
-
-               nirecs = nfsb;
-               error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
-                                      &nirecs, xfs_bmapi_aflag(whichfork));
-               if (error)
-                       goto out;
-       } else {
-               irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
-               irecs->br_startoff = (xfs_fileoff_t)bno;
-               irecs->br_blockcount = nfsb;
-               irecs->br_state = 0;
-               nirecs = 1;
-       }
-
-       if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
-               error = mappedbno == -2 ? -1 : XFS_ERROR(EFSCORRUPTED);
-               if (unlikely(error == EFSCORRUPTED)) {
-                       if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
-                               int i;
-                               xfs_alert(mp, "%s: bno %lld dir: inode %lld",
-                                       __func__, (long long)bno,
-                                       (long long)dp->i_ino);
-                               for (i = 0; i < *nmaps; i++) {
-                                       xfs_alert(mp,
-"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
-                                               i,
-                                               (long long)irecs[i].br_startoff,
-                                               (long long)irecs[i].br_startblock,
-                                               (long long)irecs[i].br_blockcount,
-                                               irecs[i].br_state);
-                               }
-                       }
-                       XFS_ERROR_REPORT("xfs_da_do_buf(1)",
-                                        XFS_ERRLEVEL_LOW, mp);
-               }
-               goto out;
-       }
-       error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs);
-out:
-       if (irecs != &irec)
-               kmem_free(irecs);
-       return error;
-}
-
-/*
- * Get a buffer for the dir/attr block.
- */
-int
-xfs_da_get_buf(
-       struct xfs_trans        *trans,
-       struct xfs_inode        *dp,
-       xfs_dablk_t             bno,
-       xfs_daddr_t             mappedbno,
-       struct xfs_buf          **bpp,
-       int                     whichfork)
-{
-       struct xfs_buf          *bp;
-       struct xfs_buf_map      map;
-       struct xfs_buf_map      *mapp;
-       int                     nmap;
-       int                     error;
-
-       *bpp = NULL;
-       mapp = &map;
-       nmap = 1;
-       error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
-                               &mapp, &nmap);
-       if (error) {
-               /* mapping a hole is not an error, but we don't continue */
-               if (error == -1)
-                       error = 0;
-               goto out_free;
-       }
-
-       bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
-                                   mapp, nmap, 0);
-       error = bp ? bp->b_error : XFS_ERROR(EIO);
-       if (error) {
-               xfs_trans_brelse(trans, bp);
-               goto out_free;
-       }
-
-       *bpp = bp;
-
-out_free:
-       if (mapp != &map)
-               kmem_free(mapp);
-
-       return error;
-}
-
-/*
- * Get a buffer for the dir/attr block, fill in the contents.
- */
-int
-xfs_da_read_buf(
-       struct xfs_trans        *trans,
-       struct xfs_inode        *dp,
-       xfs_dablk_t             bno,
-       xfs_daddr_t             mappedbno,
-       struct xfs_buf          **bpp,
-       int                     whichfork,
-       const struct xfs_buf_ops *ops)
-{
-       struct xfs_buf          *bp;
-       struct xfs_buf_map      map;
-       struct xfs_buf_map      *mapp;
-       int                     nmap;
-       int                     error;
-
-       *bpp = NULL;
-       mapp = &map;
-       nmap = 1;
-       error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
-                               &mapp, &nmap);
-       if (error) {
-               /* mapping a hole is not an error, but we don't continue */
-               if (error == -1)
-                       error = 0;
-               goto out_free;
-       }
-
-       error = xfs_trans_read_buf_map(dp->i_mount, trans,
-                                       dp->i_mount->m_ddev_targp,
-                                       mapp, nmap, 0, &bp, ops);
-       if (error)
-               goto out_free;
-
-       if (whichfork == XFS_ATTR_FORK)
-               xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
-       else
-               xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
-       *bpp = bp;
-out_free:
-       if (mapp != &map)
-               kmem_free(mapp);
-
-       return error;
-}
-
-/*
- * Readahead the dir/attr block.
- */
-xfs_daddr_t
-xfs_da_reada_buf(
-       struct xfs_inode        *dp,
-       xfs_dablk_t             bno,
-       xfs_daddr_t             mappedbno,
-       int                     whichfork,
-       const struct xfs_buf_ops *ops)
-{
-       struct xfs_buf_map      map;
-       struct xfs_buf_map      *mapp;
-       int                     nmap;
-       int                     error;
-
-       mapp = &map;
-       nmap = 1;
-       error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
-                               &mapp, &nmap);
-       if (error) {
-               /* mapping a hole is not an error, but we don't continue */
-               if (error == -1)
-                       error = 0;
-               goto out_free;
-       }
-
-       mappedbno = mapp[0].bm_bn;
-       xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
-
-out_free:
-       if (mapp != &map)
-               kmem_free(mapp);
-
-       if (error)
-               return -1;
-       return mappedbno;
-}
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
deleted file mode 100644 (file)
index 6e153e3..0000000
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DA_BTREE_H__
-#define        __XFS_DA_BTREE_H__
-
-struct xfs_bmap_free;
-struct xfs_inode;
-struct xfs_trans;
-struct zone;
-struct xfs_dir_ops;
-
-/*
- * Directory/attribute geometry information. There will be one of these for each
- * data fork type, and it will be passed around via the xfs_da_args. Global
- * structures will be attached to the xfs_mount.
- */
-struct xfs_da_geometry {
-       int             blksize;        /* da block size in bytes */
-       int             fsbcount;       /* da block size in filesystem blocks */
-       uint8_t         fsblog;         /* log2 of _filesystem_ block size */
-       uint8_t         blklog;         /* log2 of da block size */
-       uint            node_ents;      /* # of entries in a danode */
-       int             magicpct;       /* 37% of block size in bytes */
-       xfs_dablk_t     datablk;        /* blockno of dir data v2 */
-       xfs_dablk_t     leafblk;        /* blockno of leaf data v2 */
-       xfs_dablk_t     freeblk;        /* blockno of free data v2 */
-};
-
-/*========================================================================
- * Btree searching and modification structure definitions.
- *========================================================================*/
-
-/*
- * Search comparison results
- */
-enum xfs_dacmp {
-       XFS_CMP_DIFFERENT,      /* names are completely different */
-       XFS_CMP_EXACT,          /* names are exactly the same */
-       XFS_CMP_CASE            /* names are same but differ in case */
-};
-
-/*
- * Structure to ease passing around component names.
- */
-typedef struct xfs_da_args {
-       struct xfs_da_geometry *geo;    /* da block geometry */
-       const __uint8_t *name;          /* string (maybe not NULL terminated) */
-       int             namelen;        /* length of string (maybe no NULL) */
-       __uint8_t       filetype;       /* filetype of inode for directories */
-       __uint8_t       *value;         /* set of bytes (maybe contain NULLs) */
-       int             valuelen;       /* length of value */
-       int             flags;          /* argument flags (eg: ATTR_NOCREATE) */
-       xfs_dahash_t    hashval;        /* hash value of name */
-       xfs_ino_t       inumber;        /* input/output inode number */
-       struct xfs_inode *dp;           /* directory inode to manipulate */
-       xfs_fsblock_t   *firstblock;    /* ptr to firstblock for bmap calls */
-       struct xfs_bmap_free *flist;    /* ptr to freelist for bmap_finish */
-       struct xfs_trans *trans;        /* current trans (changes over time) */
-       xfs_extlen_t    total;          /* total blocks needed, for 1st bmap */
-       int             whichfork;      /* data or attribute fork */
-       xfs_dablk_t     blkno;          /* blkno of attr leaf of interest */
-       int             index;          /* index of attr of interest in blk */
-       xfs_dablk_t     rmtblkno;       /* remote attr value starting blkno */
-       int             rmtblkcnt;      /* remote attr value block count */
-       int             rmtvaluelen;    /* remote attr value length in bytes */
-       xfs_dablk_t     blkno2;         /* blkno of 2nd attr leaf of interest */
-       int             index2;         /* index of 2nd attr in blk */
-       xfs_dablk_t     rmtblkno2;      /* remote attr value starting blkno */
-       int             rmtblkcnt2;     /* remote attr value block count */
-       int             rmtvaluelen2;   /* remote attr value length in bytes */
-       int             op_flags;       /* operation flags */
-       enum xfs_dacmp  cmpresult;      /* name compare result for lookups */
-} xfs_da_args_t;
-
-/*
- * Operation flags:
- */
-#define XFS_DA_OP_JUSTCHECK    0x0001  /* check for ok with no space */
-#define XFS_DA_OP_RENAME       0x0002  /* this is an atomic rename op */
-#define XFS_DA_OP_ADDNAME      0x0004  /* this is an add operation */
-#define XFS_DA_OP_OKNOENT      0x0008  /* lookup/add op, ENOENT ok, else die */
-#define XFS_DA_OP_CILOOKUP     0x0010  /* lookup to return CI name if found */
-
-#define XFS_DA_OP_FLAGS \
-       { XFS_DA_OP_JUSTCHECK,  "JUSTCHECK" }, \
-       { XFS_DA_OP_RENAME,     "RENAME" }, \
-       { XFS_DA_OP_ADDNAME,    "ADDNAME" }, \
-       { XFS_DA_OP_OKNOENT,    "OKNOENT" }, \
-       { XFS_DA_OP_CILOOKUP,   "CILOOKUP" }
-
-/*
- * Storage for holding state during Btree searches and split/join ops.
- *
- * Only need space for 5 intermediate nodes.  With a minimum of 62-way
- * fanout to the Btree, we can support over 900 million directory blocks,
- * which is slightly more than enough.
- */
-typedef struct xfs_da_state_blk {
-       struct xfs_buf  *bp;            /* buffer containing block */
-       xfs_dablk_t     blkno;          /* filesystem blkno of buffer */
-       xfs_daddr_t     disk_blkno;     /* on-disk blkno (in BBs) of buffer */
-       int             index;          /* relevant index into block */
-       xfs_dahash_t    hashval;        /* last hash value in block */
-       int             magic;          /* blk's magic number, ie: blk type */
-} xfs_da_state_blk_t;
-
-typedef struct xfs_da_state_path {
-       int                     active;         /* number of active levels */
-       xfs_da_state_blk_t      blk[XFS_DA_NODE_MAXDEPTH];
-} xfs_da_state_path_t;
-
-typedef struct xfs_da_state {
-       xfs_da_args_t           *args;          /* filename arguments */
-       struct xfs_mount        *mp;            /* filesystem mount point */
-       xfs_da_state_path_t     path;           /* search/split paths */
-       xfs_da_state_path_t     altpath;        /* alternate path for join */
-       unsigned char           inleaf;         /* insert into 1->lf, 0->splf */
-       unsigned char           extravalid;     /* T/F: extrablk is in use */
-       unsigned char           extraafter;     /* T/F: extrablk is after new */
-       xfs_da_state_blk_t      extrablk;       /* for double-splits on leaves */
-                                               /* for dirv2 extrablk is data */
-} xfs_da_state_t;
-
-/*
- * Utility macros to aid in logging changed structure fields.
- */
-#define XFS_DA_LOGOFF(BASE, ADDR)      ((char *)(ADDR) - (char *)(BASE))
-#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE)      \
-               (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
-               (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
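
Usage, as seen in the sibling-pointer fixups earlier in this patch: the macro
expands to a (first byte, last byte) offset pair for xfs_trans_log_buf(), so
only the modified field is logged:

	sib_info->forw = cpu_to_be32(dead_blkno);
	xfs_trans_log_buf(tp, sib_buf,
		XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
				sizeof(sib_info->forw)));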
-
-/*
- * Name ops for directory and/or attr name operations
- */
-struct xfs_nameops {
-       xfs_dahash_t    (*hashname)(struct xfs_name *);
-       enum xfs_dacmp  (*compname)(struct xfs_da_args *,
-                                       const unsigned char *, int);
-};
-
-
-/*========================================================================
- * Function prototypes.
- *========================================================================*/
-
-/*
- * Routines used for growing the Btree.
- */
-int    xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno,
-                           int level, struct xfs_buf **bpp, int whichfork);
-int    xfs_da3_split(xfs_da_state_t *state);
-
-/*
- * Routines used for shrinking the Btree.
- */
-int    xfs_da3_join(xfs_da_state_t *state);
-void   xfs_da3_fixhashpath(struct xfs_da_state *state,
-                           struct xfs_da_state_path *path_to_to_fix);
-
-/*
- * Routines used for finding things in the Btree.
- */
-int    xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result);
-int    xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
-                                        int forward, int release, int *result);
-/*
- * Utility routines.
- */
-int    xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
-                                      xfs_da_state_blk_t *new_blk);
-int    xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
-                        xfs_dablk_t bno, xfs_daddr_t mappedbno,
-                        struct xfs_buf **bpp, int which_fork);
-
-/*
- * Utility routines.
- */
-int    xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
-int    xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
-                             int count);
-int    xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
-                             xfs_dablk_t bno, xfs_daddr_t mappedbno,
-                             struct xfs_buf **bp, int whichfork);
-int    xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
-                              xfs_dablk_t bno, xfs_daddr_t mappedbno,
-                              struct xfs_buf **bpp, int whichfork,
-                              const struct xfs_buf_ops *ops);
-xfs_daddr_t    xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
-                               xfs_daddr_t mapped_bno, int whichfork,
-                               const struct xfs_buf_ops *ops);
-int    xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
-                                         struct xfs_buf *dead_buf);
-
-uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
-enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
-                               const unsigned char *name, int len);
-
-
-xfs_da_state_t *xfs_da_state_alloc(void);
-void xfs_da_state_free(xfs_da_state_t *state);
-
-extern struct kmem_zone *xfs_da_state_zone;
-extern const struct xfs_nameops xfs_default_nameops;
-
-#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
deleted file mode 100644 (file)
index c9aee52..0000000
+++ /dev/null
@@ -1,911 +0,0 @@
-/*
- * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-
-/*
- * Shortform directory ops
- */
-static int
-xfs_dir2_sf_entsize(
-       struct xfs_dir2_sf_hdr  *hdr,
-       int                     len)
-{
-       int count = sizeof(struct xfs_dir2_sf_entry);   /* namelen + offset */
-
-       count += len;                                   /* name */
-       count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
-                               sizeof(xfs_dir2_ino4_t); /* ino # */
-       return count;
-}
-
-static int
-xfs_dir3_sf_entsize(
-       struct xfs_dir2_sf_hdr  *hdr,
-       int                     len)
-{
-       return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t);
-}
-
-static struct xfs_dir2_sf_entry *
-xfs_dir2_sf_nextentry(
-       struct xfs_dir2_sf_hdr  *hdr,
-       struct xfs_dir2_sf_entry *sfep)
-{
-       return (struct xfs_dir2_sf_entry *)
-               ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen));
-}
-
-static struct xfs_dir2_sf_entry *
-xfs_dir3_sf_nextentry(
-       struct xfs_dir2_sf_hdr  *hdr,
-       struct xfs_dir2_sf_entry *sfep)
-{
-       return (struct xfs_dir2_sf_entry *)
-               ((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen));
-}
-
-
-/*
- * For filetype enabled shortform directories, the file type field is stored at
- * the end of the name.  Because it's only a single byte, endian conversion is
- * not necessary. For non-filetype enabled directories, the type is always
- * unknown and we never store the value.
- */
-static __uint8_t
-xfs_dir2_sfe_get_ftype(
-       struct xfs_dir2_sf_entry *sfep)
-{
-       return XFS_DIR3_FT_UNKNOWN;
-}
-
-static void
-xfs_dir2_sfe_put_ftype(
-       struct xfs_dir2_sf_entry *sfep,
-       __uint8_t               ftype)
-{
-       ASSERT(ftype < XFS_DIR3_FT_MAX);
-}
-
-static __uint8_t
-xfs_dir3_sfe_get_ftype(
-       struct xfs_dir2_sf_entry *sfep)
-{
-       __uint8_t       ftype;
-
-       ftype = sfep->name[sfep->namelen];
-       if (ftype >= XFS_DIR3_FT_MAX)
-               return XFS_DIR3_FT_UNKNOWN;
-       return ftype;
-}
-
-static void
-xfs_dir3_sfe_put_ftype(
-       struct xfs_dir2_sf_entry *sfep,
-       __uint8_t               ftype)
-{
-       ASSERT(ftype < XFS_DIR3_FT_MAX);
-
-       sfep->name[sfep->namelen] = ftype;
-}
-
-/*
- * Inode numbers in short-form directories can come in two versions,
- * either 4 bytes or 8 bytes wide.  These helpers deal with the
- * two forms transparently by looking at the headers i8count field.
- *
- * For 64-bit inode numbers the most significant byte must be zero.
- */
-static xfs_ino_t
-xfs_dir2_sf_get_ino(
-       struct xfs_dir2_sf_hdr  *hdr,
-       xfs_dir2_inou_t         *from)
-{
-       if (hdr->i8count)
-               return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
-       else
-               return get_unaligned_be32(&from->i4.i);
-}
-
-static void
-xfs_dir2_sf_put_ino(
-       struct xfs_dir2_sf_hdr  *hdr,
-       xfs_dir2_inou_t         *to,
-       xfs_ino_t               ino)
-{
-       ASSERT((ino & 0xff00000000000000ULL) == 0);
-
-       if (hdr->i8count)
-               put_unaligned_be64(ino, &to->i8.i);
-       else
-               put_unaligned_be32(ino, &to->i4.i);
-}
-
-static xfs_ino_t
-xfs_dir2_sf_get_parent_ino(
-       struct xfs_dir2_sf_hdr  *hdr)
-{
-       return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
-}
-
-static void
-xfs_dir2_sf_put_parent_ino(
-       struct xfs_dir2_sf_hdr  *hdr,
-       xfs_ino_t               ino)
-{
-       xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
-}
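
A standalone userspace sketch of the unpacking rule (illustrative only:
get_be64() stands in for the kernel's get_unaligned_be64(), and the on-disk
bytes are invented):

#include <stdint.h>
#include <stdio.h>

/* Read 8 big-endian bytes into a host-order value. */
static uint64_t get_be64(const unsigned char *p)
{
	uint64_t v = 0;
	int i;

	for (i = 0; i < 8; i++)
		v = (v << 8) | p[i];
	return v;
}

int main(void)
{
	unsigned char disk[8] = { 0, 0, 0, 1, 2, 3, 4, 5 };
	/* i8count != 0 case: 8-byte number, most significant byte masked. */
	uint64_t ino = get_be64(disk) & 0x00ffffffffffffffULL;

	printf("ino = 0x%llx\n", (unsigned long long)ino);	/* 0x102030405 */
	return 0;
}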
-
-/*
- * In short-form directory entries the inode numbers are stored at variable
- * offset behind the entry name. If the entry stores a filetype value, then it
- * sits between the name and the inode number. Hence the inode numbers may only
- * be accessed through the helpers below.
- */
-static xfs_ino_t
-xfs_dir2_sfe_get_ino(
-       struct xfs_dir2_sf_hdr  *hdr,
-       struct xfs_dir2_sf_entry *sfep)
-{
-       return xfs_dir2_sf_get_ino(hdr,
-                               (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
-}
-
-static void
-xfs_dir2_sfe_put_ino(
-       struct xfs_dir2_sf_hdr  *hdr,
-       struct xfs_dir2_sf_entry *sfep,
-       xfs_ino_t               ino)
-{
-       xfs_dir2_sf_put_ino(hdr,
-                           (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
-}
-
-static xfs_ino_t
-xfs_dir3_sfe_get_ino(
-       struct xfs_dir2_sf_hdr  *hdr,
-       struct xfs_dir2_sf_entry *sfep)
-{
-       return xfs_dir2_sf_get_ino(hdr,
-                       (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
-}
-
-static void
-xfs_dir3_sfe_put_ino(
-       struct xfs_dir2_sf_hdr  *hdr,
-       struct xfs_dir2_sf_entry *sfep,
-       xfs_ino_t               ino)
-{
-       xfs_dir2_sf_put_ino(hdr,
-                       (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
-}
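
Putting the accessor families together, the short-form entry layout is
(editorial byte-order sketch; "offset" is the entry's tag offset mentioned in
xfs_dir2_sf_entsize() above):

	v2 entry:  namelen | offset | name[namelen]         | inum (4 or 8 bytes)
	v3 entry:  namelen | offset | name[namelen] | ftype | inum (4 or 8 bytes)

which is why the v3 accessors take the inode number at
&sfep->name[sfep->namelen + 1] while the v2 ones use &sfep->name[sfep->namelen].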
-
-
-/*
- * Directory data block operations
- */
-
-/*
- * In some special situations the dirent size ends up fixed because we always
- * know the size of the entry. That is true for "." and "..", so their sizes,
- * and hence their offsets, are constant, as is the offset of the first entry.
- *
- * Hence this calculation is written as a macro so that it can be evaluated at
- * compile time and certain offsets can be computed directly in the structure
- * initialiser via the macro. There are two macros - one for dirents with
- * ftype and one for those without - so there are no unresolvable conditionals
- * in the calculations. We also use round_up() because XFS_DIR2_DATA_ALIGN is
- * always a power of 2 and the compiler doesn't reject it (unlike roundup()).
- */
-#define XFS_DIR2_DATA_ENTSIZE(n)                                       \
-       round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
-                sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN)
-
-#define XFS_DIR3_DATA_ENTSIZE(n)                                       \
-       round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
-                sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)),      \
-               XFS_DIR2_DATA_ALIGN)
-
-static int
-xfs_dir2_data_entsize(
-       int                     n)
-{
-       return XFS_DIR2_DATA_ENTSIZE(n);
-}
-
-static int
-xfs_dir3_data_entsize(
-       int                     n)
-{
-       return XFS_DIR3_DATA_ENTSIZE(n);
-}
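
Worked numbers, assuming the usual v2 data entry layout (an 8-byte inumber
plus a 1-byte namelen, so name[] starts at byte 9, a 2-byte tag, and
XFS_DIR2_DATA_ALIGN == 8 - those values come from the on-disk format, not
from this patch):

	XFS_DIR2_DATA_ENTSIZE(1) = round_up(9 + 1 + 2, 8)     = 16
	XFS_DIR2_DATA_ENTSIZE(2) = round_up(9 + 2 + 2, 8)     = 16
	XFS_DIR3_DATA_ENTSIZE(1) = round_up(9 + 1 + 2 + 1, 8) = 16

so "." and ".." each occupy 16 bytes in both formats, which is what lets the
data_dot_offset/data_dotdot_offset initialisers below be compile-time
constants.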
-
-static __uint8_t
-xfs_dir2_data_get_ftype(
-       struct xfs_dir2_data_entry *dep)
-{
-       return XFS_DIR3_FT_UNKNOWN;
-}
-
-static void
-xfs_dir2_data_put_ftype(
-       struct xfs_dir2_data_entry *dep,
-       __uint8_t               ftype)
-{
-       ASSERT(ftype < XFS_DIR3_FT_MAX);
-}
-
-static __uint8_t
-xfs_dir3_data_get_ftype(
-       struct xfs_dir2_data_entry *dep)
-{
-       __uint8_t       ftype = dep->name[dep->namelen];
-
-       ASSERT(ftype < XFS_DIR3_FT_MAX);
-       if (ftype >= XFS_DIR3_FT_MAX)
-               return XFS_DIR3_FT_UNKNOWN;
-       return ftype;
-}
-
-static void
-xfs_dir3_data_put_ftype(
-       struct xfs_dir2_data_entry *dep,
-       __uint8_t               type)
-{
-       ASSERT(type < XFS_DIR3_FT_MAX);
-       ASSERT(dep->namelen != 0);
-
-       dep->name[dep->namelen] = type;
-}
-
-/*
- * Pointer to an entry's tag word.
- */
-static __be16 *
-xfs_dir2_data_entry_tag_p(
-       struct xfs_dir2_data_entry *dep)
-{
-       return (__be16 *)((char *)dep +
-               xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
-}
-
-static __be16 *
-xfs_dir3_data_entry_tag_p(
-       struct xfs_dir2_data_entry *dep)
-{
-       return (__be16 *)((char *)dep +
-               xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16));
-}
-
-/*
- * location of . and .. in data space (always block 0)
- */
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_dot_entry_p(
-       struct xfs_dir2_data_hdr *hdr)
-{
-       return (struct xfs_dir2_data_entry *)
-               ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_dotdot_entry_p(
-       struct xfs_dir2_data_hdr *hdr)
-{
-       return (struct xfs_dir2_data_entry *)
-               ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
-                               XFS_DIR2_DATA_ENTSIZE(1));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_first_entry_p(
-       struct xfs_dir2_data_hdr *hdr)
-{
-       return (struct xfs_dir2_data_entry *)
-               ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
-                               XFS_DIR2_DATA_ENTSIZE(1) +
-                               XFS_DIR2_DATA_ENTSIZE(2));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_ftype_data_dotdot_entry_p(
-       struct xfs_dir2_data_hdr *hdr)
-{
-       return (struct xfs_dir2_data_entry *)
-               ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
-                               XFS_DIR3_DATA_ENTSIZE(1));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_ftype_data_first_entry_p(
-       struct xfs_dir2_data_hdr *hdr)
-{
-       return (struct xfs_dir2_data_entry *)
-               ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
-                               XFS_DIR3_DATA_ENTSIZE(1) +
-                               XFS_DIR3_DATA_ENTSIZE(2));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_dot_entry_p(
-       struct xfs_dir2_data_hdr *hdr)
-{
-       return (struct xfs_dir2_data_entry *)
-               ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_dotdot_entry_p(
-       struct xfs_dir2_data_hdr *hdr)
-{
-       return (struct xfs_dir2_data_entry *)
-               ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
-                               XFS_DIR3_DATA_ENTSIZE(1));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_first_entry_p(
-       struct xfs_dir2_data_hdr *hdr)
-{
-       return (struct xfs_dir2_data_entry *)
-               ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
-                               XFS_DIR3_DATA_ENTSIZE(1) +
-                               XFS_DIR3_DATA_ENTSIZE(2));
-}
-
-static struct xfs_dir2_data_free *
-xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
-{
-       return hdr->bestfree;
-}
-
-static struct xfs_dir2_data_free *
-xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
-{
-       return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr)
-{
-       return (struct xfs_dir2_data_entry *)
-               ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
-}
-
-static struct xfs_dir2_data_unused *
-xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr)
-{
-       return (struct xfs_dir2_data_unused *)
-               ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
-{
-       return (struct xfs_dir2_data_entry *)
-               ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
-}
-
-static struct xfs_dir2_data_unused *
-xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
-{
-       return (struct xfs_dir2_data_unused *)
-               ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
-}
-
-
-/*
- * Directory Leaf block operations
- */
-static int
-xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo)
-{
-       return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) /
-               (uint)sizeof(struct xfs_dir2_leaf_entry);
-}
-
-static struct xfs_dir2_leaf_entry *
-xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp)
-{
-       return lp->__ents;
-}
-
-static int
-xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo)
-{
-       return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) /
-               (uint)sizeof(struct xfs_dir2_leaf_entry);
-}
-
-static struct xfs_dir2_leaf_entry *
-xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
-{
-       return ((struct xfs_dir3_leaf *)lp)->__ents;
-}
-
-static void
-xfs_dir2_leaf_hdr_from_disk(
-       struct xfs_dir3_icleaf_hdr      *to,
-       struct xfs_dir2_leaf            *from)
-{
-       to->forw = be32_to_cpu(from->hdr.info.forw);
-       to->back = be32_to_cpu(from->hdr.info.back);
-       to->magic = be16_to_cpu(from->hdr.info.magic);
-       to->count = be16_to_cpu(from->hdr.count);
-       to->stale = be16_to_cpu(from->hdr.stale);
-
-       ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
-              to->magic == XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir2_leaf_hdr_to_disk(
-       struct xfs_dir2_leaf            *to,
-       struct xfs_dir3_icleaf_hdr      *from)
-{
-       ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
-              from->magic == XFS_DIR2_LEAFN_MAGIC);
-
-       to->hdr.info.forw = cpu_to_be32(from->forw);
-       to->hdr.info.back = cpu_to_be32(from->back);
-       to->hdr.info.magic = cpu_to_be16(from->magic);
-       to->hdr.count = cpu_to_be16(from->count);
-       to->hdr.stale = cpu_to_be16(from->stale);
-}
-
-static void
-xfs_dir3_leaf_hdr_from_disk(
-       struct xfs_dir3_icleaf_hdr      *to,
-       struct xfs_dir2_leaf            *from)
-{
-       struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
-
-       to->forw = be32_to_cpu(hdr3->info.hdr.forw);
-       to->back = be32_to_cpu(hdr3->info.hdr.back);
-       to->magic = be16_to_cpu(hdr3->info.hdr.magic);
-       to->count = be16_to_cpu(hdr3->count);
-       to->stale = be16_to_cpu(hdr3->stale);
-
-       ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
-              to->magic == XFS_DIR3_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leaf_hdr_to_disk(
-       struct xfs_dir2_leaf            *to,
-       struct xfs_dir3_icleaf_hdr      *from)
-{
-       struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
-
-       ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
-              from->magic == XFS_DIR3_LEAFN_MAGIC);
-
-       hdr3->info.hdr.forw = cpu_to_be32(from->forw);
-       hdr3->info.hdr.back = cpu_to_be32(from->back);
-       hdr3->info.hdr.magic = cpu_to_be16(from->magic);
-       hdr3->count = cpu_to_be16(from->count);
-       hdr3->stale = cpu_to_be16(from->stale);
-}
-
-
-/*
- * Directory/Attribute Node block operations
- */
-static struct xfs_da_node_entry *
-xfs_da2_node_tree_p(struct xfs_da_intnode *dap)
-{
-       return dap->__btree;
-}
-
-static struct xfs_da_node_entry *
-xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
-{
-       return ((struct xfs_da3_intnode *)dap)->__btree;
-}
-
-static void
-xfs_da2_node_hdr_from_disk(
-       struct xfs_da3_icnode_hdr       *to,
-       struct xfs_da_intnode           *from)
-{
-       ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
-       to->forw = be32_to_cpu(from->hdr.info.forw);
-       to->back = be32_to_cpu(from->hdr.info.back);
-       to->magic = be16_to_cpu(from->hdr.info.magic);
-       to->count = be16_to_cpu(from->hdr.__count);
-       to->level = be16_to_cpu(from->hdr.__level);
-}
-
-static void
-xfs_da2_node_hdr_to_disk(
-       struct xfs_da_intnode           *to,
-       struct xfs_da3_icnode_hdr       *from)
-{
-       ASSERT(from->magic == XFS_DA_NODE_MAGIC);
-       to->hdr.info.forw = cpu_to_be32(from->forw);
-       to->hdr.info.back = cpu_to_be32(from->back);
-       to->hdr.info.magic = cpu_to_be16(from->magic);
-       to->hdr.__count = cpu_to_be16(from->count);
-       to->hdr.__level = cpu_to_be16(from->level);
-}
-
-static void
-xfs_da3_node_hdr_from_disk(
-       struct xfs_da3_icnode_hdr       *to,
-       struct xfs_da_intnode           *from)
-{
-       struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
-
-       ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
-       to->forw = be32_to_cpu(hdr3->info.hdr.forw);
-       to->back = be32_to_cpu(hdr3->info.hdr.back);
-       to->magic = be16_to_cpu(hdr3->info.hdr.magic);
-       to->count = be16_to_cpu(hdr3->__count);
-       to->level = be16_to_cpu(hdr3->__level);
-}
-
-static void
-xfs_da3_node_hdr_to_disk(
-       struct xfs_da_intnode           *to,
-       struct xfs_da3_icnode_hdr       *from)
-{
-       struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
-
-       ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
-       hdr3->info.hdr.forw = cpu_to_be32(from->forw);
-       hdr3->info.hdr.back = cpu_to_be32(from->back);
-       hdr3->info.hdr.magic = cpu_to_be16(from->magic);
-       hdr3->__count = cpu_to_be16(from->count);
-       hdr3->__level = cpu_to_be16(from->level);
-}
-
-
-/*
- * Directory free space block operations
- */
-static int
-xfs_dir2_free_max_bests(struct xfs_da_geometry *geo)
-{
-       return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) /
-               sizeof(xfs_dir2_data_off_t);
-}
-
-static __be16 *
-xfs_dir2_free_bests_p(struct xfs_dir2_free *free)
-{
-       return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr));
-}
-
-/*
- * Convert data space db to the corresponding free db.
- */
-static xfs_dir2_db_t
-xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
-       return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
-                       (db / xfs_dir2_free_max_bests(geo));
-}
-
-/*
- * Convert data space db to the corresponding index in a free db.
- */
-static int
-xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
-       return db % xfs_dir2_free_max_bests(geo);
-}
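
A worked example, assuming 4096-byte directory blocks and a 16-byte
xfs_dir2_free_hdr with 2-byte best-free entries (assumptions, not stated in
this patch): max_bests = (4096 - 16) / 2 = 2040, so data block 5000 maps to
free block xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + 5000 / 2040 =
base + 2, at index 5000 % 2040 = 920 within that block.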
-
-static int
-xfs_dir3_free_max_bests(struct xfs_da_geometry *geo)
-{
-       return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) /
-               sizeof(xfs_dir2_data_off_t);
-}
-
-static __be16 *
-xfs_dir3_free_bests_p(struct xfs_dir2_free *free)
-{
-       return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr));
-}
-
-/*
- * Convert data space db to the corresponding free db.
- */
-static xfs_dir2_db_t
-xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
-       return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
-                       (db / xfs_dir3_free_max_bests(geo));
-}
-
-/*
- * Convert data space db to the corresponding index in a free db.
- */
-static int
-xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
-       return db % xfs_dir3_free_max_bests(geo);
-}
-
-static void
-xfs_dir2_free_hdr_from_disk(
-       struct xfs_dir3_icfree_hdr      *to,
-       struct xfs_dir2_free            *from)
-{
-       to->magic = be32_to_cpu(from->hdr.magic);
-       to->firstdb = be32_to_cpu(from->hdr.firstdb);
-       to->nvalid = be32_to_cpu(from->hdr.nvalid);
-       to->nused = be32_to_cpu(from->hdr.nused);
-       ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
-}
-
-static void
-xfs_dir2_free_hdr_to_disk(
-       struct xfs_dir2_free            *to,
-       struct xfs_dir3_icfree_hdr      *from)
-{
-       ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
-
-       to->hdr.magic = cpu_to_be32(from->magic);
-       to->hdr.firstdb = cpu_to_be32(from->firstdb);
-       to->hdr.nvalid = cpu_to_be32(from->nvalid);
-       to->hdr.nused = cpu_to_be32(from->nused);
-}
-
-static void
-xfs_dir3_free_hdr_from_disk(
-       struct xfs_dir3_icfree_hdr      *to,
-       struct xfs_dir2_free            *from)
-{
-       struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
-
-       to->magic = be32_to_cpu(hdr3->hdr.magic);
-       to->firstdb = be32_to_cpu(hdr3->firstdb);
-       to->nvalid = be32_to_cpu(hdr3->nvalid);
-       to->nused = be32_to_cpu(hdr3->nused);
-
-       ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
-}
-
-static void
-xfs_dir3_free_hdr_to_disk(
-       struct xfs_dir2_free            *to,
-       struct xfs_dir3_icfree_hdr      *from)
-{
-       struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
-
-       ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
-
-       hdr3->hdr.magic = cpu_to_be32(from->magic);
-       hdr3->firstdb = cpu_to_be32(from->firstdb);
-       hdr3->nvalid = cpu_to_be32(from->nvalid);
-       hdr3->nused = cpu_to_be32(from->nused);
-}
-
-static const struct xfs_dir_ops xfs_dir2_ops = {
-       .sf_entsize = xfs_dir2_sf_entsize,
-       .sf_nextentry = xfs_dir2_sf_nextentry,
-       .sf_get_ftype = xfs_dir2_sfe_get_ftype,
-       .sf_put_ftype = xfs_dir2_sfe_put_ftype,
-       .sf_get_ino = xfs_dir2_sfe_get_ino,
-       .sf_put_ino = xfs_dir2_sfe_put_ino,
-       .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
-       .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
-
-       .data_entsize = xfs_dir2_data_entsize,
-       .data_get_ftype = xfs_dir2_data_get_ftype,
-       .data_put_ftype = xfs_dir2_data_put_ftype,
-       .data_entry_tag_p = xfs_dir2_data_entry_tag_p,
-       .data_bestfree_p = xfs_dir2_data_bestfree_p,
-
-       .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
-       .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
-                               XFS_DIR2_DATA_ENTSIZE(1),
-       .data_first_offset =  sizeof(struct xfs_dir2_data_hdr) +
-                               XFS_DIR2_DATA_ENTSIZE(1) +
-                               XFS_DIR2_DATA_ENTSIZE(2),
-       .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
-
-       .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
-       .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
-       .data_first_entry_p = xfs_dir2_data_first_entry_p,
-       .data_entry_p = xfs_dir2_data_entry_p,
-       .data_unused_p = xfs_dir2_data_unused_p,
-
-       .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
-       .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
-       .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
-       .leaf_max_ents = xfs_dir2_max_leaf_ents,
-       .leaf_ents_p = xfs_dir2_leaf_ents_p,
-
-       .node_hdr_size = sizeof(struct xfs_da_node_hdr),
-       .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
-       .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
-       .node_tree_p = xfs_da2_node_tree_p,
-
-       .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
-       .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
-       .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
-       .free_max_bests = xfs_dir2_free_max_bests,
-       .free_bests_p = xfs_dir2_free_bests_p,
-       .db_to_fdb = xfs_dir2_db_to_fdb,
-       .db_to_fdindex = xfs_dir2_db_to_fdindex,
-};
-
-static const struct xfs_dir_ops xfs_dir2_ftype_ops = {
-       .sf_entsize = xfs_dir3_sf_entsize,
-       .sf_nextentry = xfs_dir3_sf_nextentry,
-       .sf_get_ftype = xfs_dir3_sfe_get_ftype,
-       .sf_put_ftype = xfs_dir3_sfe_put_ftype,
-       .sf_get_ino = xfs_dir3_sfe_get_ino,
-       .sf_put_ino = xfs_dir3_sfe_put_ino,
-       .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
-       .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
-
-       .data_entsize = xfs_dir3_data_entsize,
-       .data_get_ftype = xfs_dir3_data_get_ftype,
-       .data_put_ftype = xfs_dir3_data_put_ftype,
-       .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
-       .data_bestfree_p = xfs_dir2_data_bestfree_p,
-
-       .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
-       .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
-                               XFS_DIR3_DATA_ENTSIZE(1),
-       .data_first_offset =  sizeof(struct xfs_dir2_data_hdr) +
-                               XFS_DIR3_DATA_ENTSIZE(1) +
-                               XFS_DIR3_DATA_ENTSIZE(2),
-       .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
-
-       .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
-       .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p,
-       .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p,
-       .data_entry_p = xfs_dir2_data_entry_p,
-       .data_unused_p = xfs_dir2_data_unused_p,
-
-       .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
-       .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
-       .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
-       .leaf_max_ents = xfs_dir2_max_leaf_ents,
-       .leaf_ents_p = xfs_dir2_leaf_ents_p,
-
-       .node_hdr_size = sizeof(struct xfs_da_node_hdr),
-       .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
-       .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
-       .node_tree_p = xfs_da2_node_tree_p,
-
-       .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
-       .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
-       .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
-       .free_max_bests = xfs_dir2_free_max_bests,
-       .free_bests_p = xfs_dir2_free_bests_p,
-       .db_to_fdb = xfs_dir2_db_to_fdb,
-       .db_to_fdindex = xfs_dir2_db_to_fdindex,
-};
-
-static const struct xfs_dir_ops xfs_dir3_ops = {
-       .sf_entsize = xfs_dir3_sf_entsize,
-       .sf_nextentry = xfs_dir3_sf_nextentry,
-       .sf_get_ftype = xfs_dir3_sfe_get_ftype,
-       .sf_put_ftype = xfs_dir3_sfe_put_ftype,
-       .sf_get_ino = xfs_dir3_sfe_get_ino,
-       .sf_put_ino = xfs_dir3_sfe_put_ino,
-       .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
-       .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
-
-       .data_entsize = xfs_dir3_data_entsize,
-       .data_get_ftype = xfs_dir3_data_get_ftype,
-       .data_put_ftype = xfs_dir3_data_put_ftype,
-       .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
-       .data_bestfree_p = xfs_dir3_data_bestfree_p,
-
-       .data_dot_offset = sizeof(struct xfs_dir3_data_hdr),
-       .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) +
-                               XFS_DIR3_DATA_ENTSIZE(1),
-       .data_first_offset =  sizeof(struct xfs_dir3_data_hdr) +
-                               XFS_DIR3_DATA_ENTSIZE(1) +
-                               XFS_DIR3_DATA_ENTSIZE(2),
-       .data_entry_offset = sizeof(struct xfs_dir3_data_hdr),
-
-       .data_dot_entry_p = xfs_dir3_data_dot_entry_p,
-       .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p,
-       .data_first_entry_p = xfs_dir3_data_first_entry_p,
-       .data_entry_p = xfs_dir3_data_entry_p,
-       .data_unused_p = xfs_dir3_data_unused_p,
-
-       .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr),
-       .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk,
-       .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk,
-       .leaf_max_ents = xfs_dir3_max_leaf_ents,
-       .leaf_ents_p = xfs_dir3_leaf_ents_p,
-
-       .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
-       .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
-       .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
-       .node_tree_p = xfs_da3_node_tree_p,
-
-       .free_hdr_size = sizeof(struct xfs_dir3_free_hdr),
-       .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk,
-       .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk,
-       .free_max_bests = xfs_dir3_free_max_bests,
-       .free_bests_p = xfs_dir3_free_bests_p,
-       .db_to_fdb = xfs_dir3_db_to_fdb,
-       .db_to_fdindex = xfs_dir3_db_to_fdindex,
-};
-
-static const struct xfs_dir_ops xfs_dir2_nondir_ops = {
-       .node_hdr_size = sizeof(struct xfs_da_node_hdr),
-       .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
-       .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
-       .node_tree_p = xfs_da2_node_tree_p,
-};
-
-static const struct xfs_dir_ops xfs_dir3_nondir_ops = {
-       .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
-       .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
-       .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
-       .node_tree_p = xfs_da3_node_tree_p,
-};
-
-/*
- * Return the ops structure according to the current config.  If we are passed
- * an inode, then that overrides the default config, which is based on
- * superblock feature bits.
- */
-const struct xfs_dir_ops *
-xfs_dir_get_ops(
-       struct xfs_mount        *mp,
-       struct xfs_inode        *dp)
-{
-       if (dp)
-               return dp->d_ops;
-       if (mp->m_dir_inode_ops)
-               return mp->m_dir_inode_ops;
-       if (xfs_sb_version_hascrc(&mp->m_sb))
-               return &xfs_dir3_ops;
-       if (xfs_sb_version_hasftype(&mp->m_sb))
-               return &xfs_dir2_ftype_ops;
-       return &xfs_dir2_ops;
-}
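
A hedged sketch of how the result is typically consumed (the mount-time
caching into mp->m_dir_inode_ops is implied by the lookup above; the exact
call sites are outside this patch):

	/* Resolve once from superblock feature bits and cache it ... */
	mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);

	/* ... so later users can go straight through the cached table. */
	size = dp->d_ops->data_entsize(dep->namelen);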
-
-const struct xfs_dir_ops *
-xfs_nondir_get_ops(
-       struct xfs_mount        *mp,
-       struct xfs_inode        *dp)
-{
-       if (dp)
-               return dp->d_ops;
-       if (mp->m_nondir_inode_ops)
-               return mp->m_nondir_inode_ops;
-       if (xfs_sb_version_hascrc(&mp->m_sb))
-               return &xfs_dir3_nondir_ops;
-       return &xfs_dir2_nondir_ops;
-}
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h
deleted file mode 100644 (file)
index 0a49b02..0000000
+++ /dev/null
@@ -1,861 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DA_FORMAT_H__
-#define __XFS_DA_FORMAT_H__
-
-/*
- * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
- *
- * It is used to manage a doubly linked list of all blocks at the same
- * level in the Btree, and to identify which type of block this is.
- */
-#define XFS_DA_NODE_MAGIC      0xfebe  /* magic number: non-leaf blocks */
-#define XFS_ATTR_LEAF_MAGIC    0xfbee  /* magic number: attribute leaf blks */
-#define        XFS_DIR2_LEAF1_MAGIC    0xd2f1  /* magic number: v2 dirlf single blks */
-#define        XFS_DIR2_LEAFN_MAGIC    0xd2ff  /* magic number: v2 dirlf multi blks */
-
-typedef struct xfs_da_blkinfo {
-       __be32          forw;                   /* following block in list */
-       __be32          back;                   /* previous block in list */
-       __be16          magic;                  /* validity check on block */
-       __be16          pad;                    /* unused */
-} xfs_da_blkinfo_t;
-
-/*
- * CRC enabled directory structure types
- *
- * The headers change size for the additional verification information, but
- * otherwise the tree layouts and contents are unchanged. Hence the da btree
- * code can use the struct xfs_da_blkinfo for manipulating the tree links and
- * magic numbers without modification for both v2 and v3 nodes.
- */
-#define XFS_DA3_NODE_MAGIC     0x3ebe  /* magic number: non-leaf blocks */
-#define XFS_ATTR3_LEAF_MAGIC   0x3bee  /* magic number: attribute leaf blks */
-#define        XFS_DIR3_LEAF1_MAGIC    0x3df1  /* magic number: v2 dirlf single blks */
-#define        XFS_DIR3_LEAFN_MAGIC    0x3dff  /* magic number: v2 dirlf multi blks */
-
-struct xfs_da3_blkinfo {
-       /*
-        * the node link manipulation code relies on the fact that the first
-        * element of this structure is the struct xfs_da_blkinfo so it can
-        * ignore the differences in the rest of the structures.
-        */
-       struct xfs_da_blkinfo   hdr;
-       __be32                  crc;    /* CRC of block */
-       __be64                  blkno;  /* first block of the buffer */
-       __be64                  lsn;    /* sequence number of last write */
-       uuid_t                  uuid;   /* filesystem we belong to */
-       __be64                  owner;  /* inode that owns the block */
-};
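
Because struct xfs_da3_blkinfo embeds struct xfs_da_blkinfo as its first
member, link-manipulation code can address either version through the v2
type. A minimal sketch of that pattern (illustrative only):

	/* Valid for both v2 and v3 headers: the v3 struct starts with the
	 * v2 one, so first-member aliasing lets us read the links here. */
	static __be32 example_forw_link(void *hdr)
	{
		struct xfs_da_blkinfo *info = hdr;

		return info->forw;
	}
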
-
-/*
- * This is the structure of the root and intermediate nodes in the Btree.
- * The leaf nodes are defined above.
- *
- * Entries are not packed.
- *
- * Since we have duplicate keys, use a binary search but always follow
- * all matches in the block, not just the first match found.
- */
-#define        XFS_DA_NODE_MAXDEPTH    5       /* max depth of Btree */
-
-typedef struct xfs_da_node_hdr {
-       struct xfs_da_blkinfo   info;   /* block type, links, etc. */
-       __be16                  __count; /* count of active entries */
-       __be16                  __level; /* level above leaves (leaf == 0) */
-} xfs_da_node_hdr_t;
-
-struct xfs_da3_node_hdr {
-       struct xfs_da3_blkinfo  info;   /* block type, links, etc. */
-       __be16                  __count; /* count of active entries */
-       __be16                  __level; /* level above leaves (leaf == 0) */
-       __be32                  __pad32;
-};
-
-#define XFS_DA3_NODE_CRC_OFF   (offsetof(struct xfs_da3_node_hdr, info.crc))
-
-typedef struct xfs_da_node_entry {
-       __be32  hashval;        /* hash value for this descendant */
-       __be32  before;         /* Btree block before this key */
-} xfs_da_node_entry_t;
-
-typedef struct xfs_da_intnode {
-       struct xfs_da_node_hdr  hdr;
-       struct xfs_da_node_entry __btree[];
-} xfs_da_intnode_t;
-
-struct xfs_da3_intnode {
-       struct xfs_da3_node_hdr hdr;
-       struct xfs_da_node_entry __btree[];
-};
-
-/*
- * In-core version of the node header to abstract the differences in the v2 and
- * v3 disk format of the headers. Callers need to convert to/from disk format as
- * appropriate.
- */
-struct xfs_da3_icnode_hdr {
-       __uint32_t      forw;
-       __uint32_t      back;
-       __uint16_t      magic;
-       __uint16_t      count;
-       __uint16_t      level;
-};
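
As a hedged sketch of the conversion the comment above describes, the v2
from-disk helper is essentially a field-by-field endian swap into the
in-core header (the real helpers live in the da btree code; this only
shows their shape):

	static void example_node_hdr_from_disk(struct xfs_da3_icnode_hdr *to,
					       struct xfs_da_intnode *from)
	{
		to->forw = be32_to_cpu(from->hdr.info.forw);
		to->back = be32_to_cpu(from->hdr.info.back);
		to->magic = be16_to_cpu(from->hdr.info.magic);
		to->count = be16_to_cpu(from->hdr.__count);
		to->level = be16_to_cpu(from->hdr.__level);
	}
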
-
-/*
- * Directory version 2.
- *
- * There are 4 possible formats:
- *  - shortform - embedded into the inode
- *  - single block - data with embedded leaf at the end
- *  - multiple data blocks, single leaf+freeindex block
- *  - data blocks, node and leaf blocks (btree), freeindex blocks
- *
- * Note: many node block structures and constants are shared with the attr
- * code and defined in xfs_da_btree.h.
- */
-
-#define        XFS_DIR2_BLOCK_MAGIC    0x58443242      /* XD2B: single block dirs */
-#define        XFS_DIR2_DATA_MAGIC     0x58443244      /* XD2D: multiblock dirs */
-#define        XFS_DIR2_FREE_MAGIC     0x58443246      /* XD2F: free index blocks */
-
-/*
- * Directory Version 3 With CRCs.
- *
- * The tree formats are the same as for version 2 directories.  The difference
- * is in the block header and dirent formats. In many cases the v3 structures
- * use v2 definitions as they are no different and this makes code sharing much
- * easier.
- *
- * Also, the xfs_dir3_*() functions handle both v2 and v3 formats: if the
- * format is v2 they fall through to the existing v2 code, and if the format
- * is v3 they implement the v3 functionality. This means the existing dir2
- * code is a mix of xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3
- * functions are called where the formats differ; otherwise the code is
- * unchanged.
- *
- * Where possible, the code decides what to do based on the magic numbers
- * in the blocks rather than feature bits in the superblock. This keeps the
- * code as independent of the external XFS code as possible and doesn't
- * require passing struct xfs_mount pointers into places where they aren't
- * really necessary.
- *
- * Version 3 includes:
- *
- *     - a larger block header for CRC and identification purposes and so the
- *     offsets of all the structures inside the blocks are different.
- *
- *     - new magic numbers to be able to detect the v2/v3 types on the fly.
- */
-
-#define        XFS_DIR3_BLOCK_MAGIC    0x58444233      /* XDB3: single block dirs */
-#define        XFS_DIR3_DATA_MAGIC     0x58444433      /* XDD3: multiblock dirs */
-#define        XFS_DIR3_FREE_MAGIC     0x58444633      /* XDF3: free index blocks */
-
-/*
- * Dirents in version 3 directories have a file type field. Additions to this
- * list are an on-disk format change, requiring feature bits. Valid values
- * are as follows:
- */
-#define XFS_DIR3_FT_UNKNOWN            0
-#define XFS_DIR3_FT_REG_FILE           1
-#define XFS_DIR3_FT_DIR                        2
-#define XFS_DIR3_FT_CHRDEV             3
-#define XFS_DIR3_FT_BLKDEV             4
-#define XFS_DIR3_FT_FIFO               5
-#define XFS_DIR3_FT_SOCK               6
-#define XFS_DIR3_FT_SYMLINK            7
-#define XFS_DIR3_FT_WHT                        8
-
-#define XFS_DIR3_FT_MAX                        9
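
Since these values are part of the on-disk format, readdir must translate
them to the VFS DT_* constants (from the kernel's fs headers) before
handing entries to userspace. A sketch of the table-based mapping (the
table name here is illustrative):

	static const unsigned char example_ft_to_dtype[XFS_DIR3_FT_MAX] = {
		[XFS_DIR3_FT_UNKNOWN]	= DT_UNKNOWN,
		[XFS_DIR3_FT_REG_FILE]	= DT_REG,
		[XFS_DIR3_FT_DIR]	= DT_DIR,
		[XFS_DIR3_FT_CHRDEV]	= DT_CHR,
		[XFS_DIR3_FT_BLKDEV]	= DT_BLK,
		[XFS_DIR3_FT_FIFO]	= DT_FIFO,
		[XFS_DIR3_FT_SOCK]	= DT_SOCK,
		[XFS_DIR3_FT_SYMLINK]	= DT_LNK,
		[XFS_DIR3_FT_WHT]	= DT_WHT,
	};
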
-
-/*
- * Byte offset in data block and shortform entry.
- */
-typedef        __uint16_t      xfs_dir2_data_off_t;
-#define        NULLDATAOFF     0xffffU
-typedef uint           xfs_dir2_data_aoff_t;   /* argument form */
-
-/*
- * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
- * Only 16 bits are needed; this is the byte offset into the single block form.
- */
-typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
-
-/*
- * Offset in data space of a data entry.
- */
-typedef        __uint32_t      xfs_dir2_dataptr_t;
-#define        XFS_DIR2_MAX_DATAPTR    ((xfs_dir2_dataptr_t)0xffffffff)
-#define        XFS_DIR2_NULL_DATAPTR   ((xfs_dir2_dataptr_t)0)
-
-/*
- * Byte offset in a directory.
- */
-typedef        xfs_off_t       xfs_dir2_off_t;
-
-/*
- * Directory block number (logical dirblk in file)
- */
-typedef        __uint32_t      xfs_dir2_db_t;
-
-/*
- * Inode number stored as 8 8-bit values.
- */
-typedef        struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
-
-/*
- * Inode number stored as 4 8-bit values.
- * Works a lot of the time, when all the inode numbers in a directory
- * fit in 32 bits.
- */
-typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
-
-typedef union {
-       xfs_dir2_ino8_t i8;
-       xfs_dir2_ino4_t i4;
-} xfs_dir2_inou_t;
-#define        XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
-
-/*
- * Directory layout when stored internal to an inode.
- *
- * Small directories are packed as tightly as possible so as to fit into the
- * literal area of the inode.  These "shortform" directories consist of a
- * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry
- * structures.  Due to the different inode number storage sizes and the
- * variable-length name field in the xfs_dir2_sf_entry, all these structures
- * are variable length, and the accessors in this file should be used to
- * iterate over them.
- */
-typedef struct xfs_dir2_sf_hdr {
-       __uint8_t               count;          /* count of entries */
-       __uint8_t               i8count;        /* count of 8-byte inode #s */
-       xfs_dir2_inou_t         parent;         /* parent dir inode number */
-} __arch_pack xfs_dir2_sf_hdr_t;
-
-typedef struct xfs_dir2_sf_entry {
-       __u8                    namelen;        /* actual name length */
-       xfs_dir2_sf_off_t       offset;         /* saved offset */
-       __u8                    name[];         /* name, variable size */
-       /*
-        * A single byte containing the file type field follows the inode
-        * number for version 3 directory entries.
-        *
-        * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
-        * variable offset after the name.
-        */
-} __arch_pack xfs_dir2_sf_entry_t;
-
-static inline int xfs_dir2_sf_hdr_size(int i8count)
-{
-       return sizeof(struct xfs_dir2_sf_hdr) -
-               (i8count == 0) *
-               (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
-}
-
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
-{
-       return get_unaligned_be16(&sfep->offset.i);
-}
-
-static inline void
-xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
-{
-       put_unaligned_be16(off, &sfep->offset.i);
-}
-
-static inline struct xfs_dir2_sf_entry *
-xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
-{
-       return (struct xfs_dir2_sf_entry *)
-               ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
-}
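
A hedged sketch of the iteration the comment above calls for. The
next-entry stride is worked out inline here under the stated assumption of
4-byte inode numbers and no file type byte; the real code gets the stride
from the per-inode dir ops instead:

	/* Assumes i4 inode numbers and a v2 (no-ftype) entry layout. */
	static struct xfs_dir2_sf_entry *
	example_sf_nextentry(struct xfs_dir2_sf_entry *sfep)
	{
		return (struct xfs_dir2_sf_entry *)
			((char *)sfep + sizeof(__u8) +		/* namelen */
			 sizeof(xfs_dir2_sf_off_t) +		/* offset */
			 sfep->namelen +			/* name */
			 sizeof(xfs_dir2_ino4_t));		/* inumber */
	}

	static void example_walk_sf(struct xfs_dir2_sf_hdr *sfp)
	{
		struct xfs_dir2_sf_entry *sfep = xfs_dir2_sf_firstentry(sfp);
		int i;

		for (i = 0; i < sfp->count; i++) {
			/* sfep->name[0..namelen-1] is valid here */
			sfep = example_sf_nextentry(sfep);
		}
	}
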
-
-/*
- * Data block structures.
- *
- * A pure data block looks like the following drawing on disk:
- *
- *    +-------------------------------------------------+
- *    | xfs_dir2_data_hdr_t                             |
- *    +-------------------------------------------------+
- *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- *    | ...                                             |
- *    +-------------------------------------------------+
- *    | unused space                                    |
- *    +-------------------------------------------------+
- *
- * As all the entries are variable size structures the accessors below should
- * be used to iterate over them.
- *
- * In addition to the pure data blocks for the data and node formats,
- * most structures are also used for the combined data/freespace "block"
- * format below.
- */
-
-#define        XFS_DIR2_DATA_ALIGN_LOG 3               /* i.e., 8 bytes */
-#define        XFS_DIR2_DATA_ALIGN     (1 << XFS_DIR2_DATA_ALIGN_LOG)
-#define        XFS_DIR2_DATA_FREE_TAG  0xffff
-#define        XFS_DIR2_DATA_FD_COUNT  3
-
-/*
- * The directory address space is divided into sections; each space starts
- * at a 32GB boundary (XFS_DIR2_SPACE_SIZE apart).
- */
-#define        XFS_DIR2_SPACE_SIZE     (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
-#define        XFS_DIR2_DATA_SPACE     0
-#define        XFS_DIR2_DATA_OFFSET    (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
-
-/*
- * Describe a free area in the data block.
- *
- * The freespace will be formatted as a xfs_dir2_data_unused_t.
- */
-typedef struct xfs_dir2_data_free {
-       __be16                  offset;         /* start of freespace */
-       __be16                  length;         /* length of freespace */
-} xfs_dir2_data_free_t;
-
-/*
- * Header for the data blocks.
- *
- * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
- */
-typedef struct xfs_dir2_data_hdr {
-       __be32                  magic;          /* XFS_DIR2_DATA_MAGIC or */
-                                               /* XFS_DIR2_BLOCK_MAGIC */
-       xfs_dir2_data_free_t    bestfree[XFS_DIR2_DATA_FD_COUNT];
-} xfs_dir2_data_hdr_t;
-
-/*
- * Define a structure for all the verification fields we are adding to the
- * directory block structures. This will be used in several structures.
- * The magic number must be the first entry to align with all the dir2
- * structures so we can determine how to decode them just by the magic number.
- */
-struct xfs_dir3_blk_hdr {
-       __be32                  magic;  /* magic number */
-       __be32                  crc;    /* CRC of block */
-       __be64                  blkno;  /* first block of the buffer */
-       __be64                  lsn;    /* sequence number of last write */
-       uuid_t                  uuid;   /* filesystem we belong to */
-       __be64                  owner;  /* inode that owns the block */
-};
-
-struct xfs_dir3_data_hdr {
-       struct xfs_dir3_blk_hdr hdr;
-       xfs_dir2_data_free_t    best_free[XFS_DIR2_DATA_FD_COUNT];
-       __be32                  pad;    /* 64 bit alignment */
-};
-
-#define XFS_DIR3_DATA_CRC_OFF  offsetof(struct xfs_dir3_data_hdr, hdr.crc)
-
-/*
- * Active entry in a data block.
- *
- * Aligned to 8 bytes.  After the variable length name field there is a
- * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p.
- *
- * For dir3 structures, there is a file type field between the name and the tag.
- * This can only be manipulated by helper functions. It is packed hard against
- * the end of the name so any padding for rounding is between the file type and
- * the tag.
- */
-typedef struct xfs_dir2_data_entry {
-       __be64                  inumber;        /* inode number */
-       __u8                    namelen;        /* name length */
-       __u8                    name[];         /* name bytes, no null */
-     /* __u8                   filetype; */    /* type of inode we point to */
-     /*        __be16                  tag; */         /* starting offset of us */
-} xfs_dir2_data_entry_t;
-
-/*
- * Unused entry in a data block.
- *
- * Aligned to 8 bytes.  Tag appears as the last 2 bytes and must be accessed
- * using xfs_dir2_data_unused_tag_p.
- */
-typedef struct xfs_dir2_data_unused {
-       __be16                  freetag;        /* XFS_DIR2_DATA_FREE_TAG */
-       __be16                  length;         /* total free length */
-                                               /* variable offset */
-       __be16                  tag;            /* starting offset of us */
-} xfs_dir2_data_unused_t;
-
-/*
- * Pointer to a freespace's tag word.
- */
-static inline __be16 *
-xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
-{
-       return (__be16 *)((char *)dup +
-                       be16_to_cpu(dup->length) - sizeof(__be16));
-}
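
Walking a data block means looking at each slot and testing the freetag to
decide whether it holds a live entry or free space; a minimal sketch of
that test (a real walker also needs the per-entry size helpers from the
dir ops to advance to the next slot):

	static int example_slot_is_free(void *ptr)
	{
		struct xfs_dir2_data_unused *dup = ptr;

		return dup->freetag == cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
	}
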
-
-/*
- * Leaf block structures.
- *
- * A pure leaf block looks like the following drawing on disk:
- *
- *    +---------------------------+
- *    | xfs_dir2_leaf_hdr_t       |
- *    +---------------------------+
- *    | xfs_dir2_leaf_entry_t     |
- *    | xfs_dir2_leaf_entry_t     |
- *    | xfs_dir2_leaf_entry_t     |
- *    | xfs_dir2_leaf_entry_t     |
- *    | ...                       |
- *    +---------------------------+
- *    | xfs_dir2_data_off_t       |
- *    | xfs_dir2_data_off_t       |
- *    | xfs_dir2_data_off_t       |
- *    | ...                       |
- *    +---------------------------+
- *    | xfs_dir2_leaf_tail_t      |
- *    +---------------------------+
- *
- * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block
- * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present
- * for directories with separate leaf nodes and free space blocks
- * (magic = XFS_DIR2_LEAFN_MAGIC).
- *
- * As all the entries are variable size structures the accessors below should
- * be used to iterate over them.
- */
-
-/*
- * Offset of the leaf/node space.  First block in this space
- * is the btree root.
- */
-#define        XFS_DIR2_LEAF_SPACE     1
-#define        XFS_DIR2_LEAF_OFFSET    (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
-
-/*
- * Leaf block header.
- */
-typedef struct xfs_dir2_leaf_hdr {
-       xfs_da_blkinfo_t        info;           /* header for da routines */
-       __be16                  count;          /* count of entries */
-       __be16                  stale;          /* count of stale entries */
-} xfs_dir2_leaf_hdr_t;
-
-struct xfs_dir3_leaf_hdr {
-       struct xfs_da3_blkinfo  info;           /* header for da routines */
-       __be16                  count;          /* count of entries */
-       __be16                  stale;          /* count of stale entries */
-       __be32                  pad;            /* 64 bit alignment */
-};
-
-struct xfs_dir3_icleaf_hdr {
-       __uint32_t              forw;
-       __uint32_t              back;
-       __uint16_t              magic;
-       __uint16_t              count;
-       __uint16_t              stale;
-};
-
-/*
- * Leaf block entry.
- */
-typedef struct xfs_dir2_leaf_entry {
-       __be32                  hashval;        /* hash value of name */
-       __be32                  address;        /* address of data entry */
-} xfs_dir2_leaf_entry_t;
-
-/*
- * Leaf block tail.
- */
-typedef struct xfs_dir2_leaf_tail {
-       __be32                  bestcount;
-} xfs_dir2_leaf_tail_t;
-
-/*
- * Leaf block.
- */
-typedef struct xfs_dir2_leaf {
-       xfs_dir2_leaf_hdr_t     hdr;                    /* leaf header */
-       xfs_dir2_leaf_entry_t   __ents[];               /* entries */
-} xfs_dir2_leaf_t;
-
-struct xfs_dir3_leaf {
-       struct xfs_dir3_leaf_hdr        hdr;            /* leaf header */
-       struct xfs_dir2_leaf_entry      __ents[];       /* entries */
-};
-
-#define XFS_DIR3_LEAF_CRC_OFF  offsetof(struct xfs_dir3_leaf_hdr, info.crc)
-
-/*
- * Get address of the bests array in the single-leaf block.
- */
-static inline __be16 *
-xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
-{
-       return (__be16 *)ltp - be32_to_cpu(ltp->bestcount);
-}
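
The tail itself lives in the last bytes of a single-leaf block, so finding
the bests array is a two-step lookup; a hedged sketch of the arithmetic
(blksize would come from the directory geometry):

	static __be16 *example_leaf_bests(void *block, unsigned int blksize)
	{
		struct xfs_dir2_leaf_tail *ltp;

		ltp = (struct xfs_dir2_leaf_tail *)
			((char *)block + blksize) - 1;
		return xfs_dir2_leaf_bests_p(ltp);
	}
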
-
-/*
- * Free space block definitions for the node format.
- */
-
-/*
- * Offset of the freespace index.
- */
-#define        XFS_DIR2_FREE_SPACE     2
-#define        XFS_DIR2_FREE_OFFSET    (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
-
-typedef        struct xfs_dir2_free_hdr {
-       __be32                  magic;          /* XFS_DIR2_FREE_MAGIC */
-       __be32                  firstdb;        /* db of first entry */
-       __be32                  nvalid;         /* count of valid entries */
-       __be32                  nused;          /* count of used entries */
-} xfs_dir2_free_hdr_t;
-
-typedef struct xfs_dir2_free {
-       xfs_dir2_free_hdr_t     hdr;            /* block header */
-       __be16                  bests[];        /* best free counts */
-                                               /* unused entries are -1 */
-} xfs_dir2_free_t;
-
-struct xfs_dir3_free_hdr {
-       struct xfs_dir3_blk_hdr hdr;
-       __be32                  firstdb;        /* db of first entry */
-       __be32                  nvalid;         /* count of valid entries */
-       __be32                  nused;          /* count of used entries */
-       __be32                  pad;            /* 64 bit alignment */
-};
-
-struct xfs_dir3_free {
-       struct xfs_dir3_free_hdr hdr;
-       __be16                  bests[];        /* best free counts */
-                                               /* unused entries are -1 */
-};
-
-#define XFS_DIR3_FREE_CRC_OFF  offsetof(struct xfs_dir3_free, hdr.hdr.crc)
-
-/*
- * In-core version of the free block header, abstracted away from on-disk
- * format differences. Use this in the code, and convert to/from the disk
- * version using xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
- */
-struct xfs_dir3_icfree_hdr {
-       __uint32_t      magic;
-       __uint32_t      firstdb;
-       __uint32_t      nvalid;
-       __uint32_t      nused;
-};
-
-/*
- * Single block format.
- *
- * The single block format looks like the following drawing on disk:
- *
- *    +-------------------------------------------------+
- *    | xfs_dir2_data_hdr_t                             |
- *    +-------------------------------------------------+
- *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- *    | ...                                             |
- *    +-------------------------------------------------+
- *    | unused space                                    |
- *    +-------------------------------------------------+
- *    | ...                                             |
- *    | xfs_dir2_leaf_entry_t                           |
- *    | xfs_dir2_leaf_entry_t                           |
- *    +-------------------------------------------------+
- *    | xfs_dir2_block_tail_t                           |
- *    +-------------------------------------------------+
- *
- * As all the entries are variable size structures the accessors below should
- * be used to iterate over them.
- */
-
-typedef struct xfs_dir2_block_tail {
-       __be32          count;                  /* count of leaf entries */
-       __be32          stale;                  /* count of stale lf entries */
-} xfs_dir2_block_tail_t;
-
-/*
- * Pointer to the leaf entries embedded in a data block (1-block format)
- */
-static inline struct xfs_dir2_leaf_entry *
-xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
-{
-       return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count);
-}
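
As with the leaf tail, the block tail occupies the last bytes of a
single-block directory; a hedged sketch of locating it before reading the
embedded leaf entries:

	static struct xfs_dir2_block_tail *
	example_block_tail(void *block, unsigned int blksize)
	{
		return (struct xfs_dir2_block_tail *)
			((char *)block + blksize) - 1;
	}
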
-
-
-/*
- * Attribute storage layout
- *
- * Attribute lists are structured around Btrees where all the data
- * elements are in the leaf nodes.  Attribute names are hashed into an int,
- * then that int is used as the index into the Btree.  Since the hashval
- * of an attribute name may not be unique, we may have duplicate keys.  The
- * internal links in the Btree are logical block offsets into the file.
- *
- * Struct leaf_entry's are packed from the top.  Name/values grow from the
- * bottom but are not packed.  The freemap contains run-length-encoded entries
- * for the free bytes after the leaf_entry's, but only the N largest such runs;
- * smaller runs are dropped.  When the freemap doesn't show enough space
- * for an allocation, we compact the name/value area and try again.  If we
- * still don't have enough space, then we have to split the block.  The
- * name/value structs (both local and remote versions) must be 32bit aligned.
- *
- * Since we have duplicate hash keys, for each key that matches, compare
- * the actual name string.  The root and intermediate node search always
- * takes the first-in-the-block key match found, so we should only have
- * to work "forw"ard.  If none matches, continue with the "forw"ard leaf
- * nodes until the hash key changes or the attribute name is found.
- *
- * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
- * the leaf_entry.  The namespaces are independent only because we also look
- * at the namespace bit when we are looking for a matching attribute name.
- *
- * We also store an "incomplete" bit in the leaf_entry.  It shows that an
- * attribute is in the middle of being created and should not be shown to
- * the user if we crash during the time that the bit is set.  We clear the
- * bit when we have finished setting up the attribute.  We do this because
- * we cannot create some large attributes inside a single transaction, and we
- * need some indication that we weren't finished if we crash in the middle.
- */
-#define XFS_ATTR_LEAF_MAPSIZE  3       /* how many freespace slots */
-
-typedef struct xfs_attr_leaf_map {     /* RLE map of free bytes */
-       __be16  base;                     /* base of free region */
-       __be16  size;                     /* length of free region */
-} xfs_attr_leaf_map_t;
-
-typedef struct xfs_attr_leaf_hdr {     /* constant-structure header block */
-       xfs_da_blkinfo_t info;          /* block type, links, etc. */
-       __be16  count;                  /* count of active leaf_entry's */
-       __be16  usedbytes;              /* num bytes of names/values stored */
-       __be16  firstused;              /* first used byte in name area */
-       __u8    holes;                  /* != 0 if blk needs compaction */
-       __u8    pad1;
-       xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
-                                       /* N largest free regions */
-} xfs_attr_leaf_hdr_t;
-
-typedef struct xfs_attr_leaf_entry {   /* sorted on key, not name */
-       __be32  hashval;                /* hash value of name */
-       __be16  nameidx;                /* index into buffer of name/value */
-       __u8    flags;                  /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
-       __u8    pad2;                   /* unused pad byte */
-} xfs_attr_leaf_entry_t;
-
-typedef struct xfs_attr_leaf_name_local {
-       __be16  valuelen;               /* number of bytes in value */
-       __u8    namelen;                /* length of name bytes */
-       __u8    nameval[1];             /* name/value bytes */
-} xfs_attr_leaf_name_local_t;
-
-typedef struct xfs_attr_leaf_name_remote {
-       __be32  valueblk;               /* block number of value bytes */
-       __be32  valuelen;               /* number of bytes in value */
-       __u8    namelen;                /* length of name bytes */
-       __u8    name[1];                /* name bytes */
-} xfs_attr_leaf_name_remote_t;
-
-typedef struct xfs_attr_leafblock {
-       xfs_attr_leaf_hdr_t     hdr;    /* constant-structure header block */
-       xfs_attr_leaf_entry_t   entries[1];     /* sorted on key, not name */
-       xfs_attr_leaf_name_local_t namelist;    /* grows from bottom of buf */
-       xfs_attr_leaf_name_remote_t valuelist;  /* grows from bottom of buf */
-} xfs_attr_leafblock_t;
-
-/*
- * CRC enabled leaf structures. Called "version 3" structures to match the
- * version number of the directory and dablk structures for this feature, and
- * attr2 is already taken by the variable inode attribute fork size feature.
- */
-struct xfs_attr3_leaf_hdr {
-       struct xfs_da3_blkinfo  info;
-       __be16                  count;
-       __be16                  usedbytes;
-       __be16                  firstused;
-       __u8                    holes;
-       __u8                    pad1;
-       struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
-       __be32                  pad2;           /* 64 bit alignment */
-};
-
-#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc))
-
-struct xfs_attr3_leafblock {
-       struct xfs_attr3_leaf_hdr       hdr;
-       struct xfs_attr_leaf_entry      entries[1];
-
-       /*
-        * The rest of the block contains the following structures after the
-        * leaf entries, growing from the bottom up. The variables are never
-        * referenced directly; the locations are accessed purely via
-        * helper functions.
-        *
-        * struct xfs_attr_leaf_name_local
-        * struct xfs_attr_leaf_name_remote
-        */
-};
-
-/*
- * incore, neutral version of the attribute leaf header
- */
-struct xfs_attr3_icleaf_hdr {
-       __uint32_t      forw;
-       __uint32_t      back;
-       __uint16_t      magic;
-       __uint16_t      count;
-       __uint16_t      usedbytes;
-       __uint16_t      firstused;
-       __u8            holes;
-       struct {
-               __uint16_t      base;
-               __uint16_t      size;
-       } freemap[XFS_ATTR_LEAF_MAPSIZE];
-};
-
-/*
- * Flags used in the leaf_entry[i].flags field.
- * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
- * on the system call, they are "or"ed together for various operations.
- */
-#define        XFS_ATTR_LOCAL_BIT      0       /* attr is stored locally */
-#define        XFS_ATTR_ROOT_BIT       1       /* limit access to trusted attrs */
-#define        XFS_ATTR_SECURE_BIT     2       /* limit access to secure attrs */
-#define        XFS_ATTR_INCOMPLETE_BIT 7       /* attr in middle of create/delete */
-#define XFS_ATTR_LOCAL         (1 << XFS_ATTR_LOCAL_BIT)
-#define XFS_ATTR_ROOT          (1 << XFS_ATTR_ROOT_BIT)
-#define XFS_ATTR_SECURE                (1 << XFS_ATTR_SECURE_BIT)
-#define XFS_ATTR_INCOMPLETE    (1 << XFS_ATTR_INCOMPLETE_BIT)
-
-/*
- * Conversion macros for converting namespace bits from argument flags
- * to ondisk flags.
- */
-#define XFS_ATTR_NSP_ARGS_MASK         (ATTR_ROOT | ATTR_SECURE)
-#define XFS_ATTR_NSP_ONDISK_MASK       (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
-#define XFS_ATTR_NSP_ONDISK(flags)     ((flags) & XFS_ATTR_NSP_ONDISK_MASK)
-#define XFS_ATTR_NSP_ARGS(flags)       ((flags) & XFS_ATTR_NSP_ARGS_MASK)
-#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
-                                        ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
-#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
-                                        ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
-
-/*
- * Alignment for namelist and valuelist entries (since they are mixed
- * there can be only one alignment value)
- */
-#define        XFS_ATTR_LEAF_NAME_ALIGN        ((uint)sizeof(xfs_dablk_t))
-
-static inline int
-xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp)
-{
-       if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
-               return sizeof(struct xfs_attr3_leaf_hdr);
-       return sizeof(struct xfs_attr_leaf_hdr);
-}
-
-static inline struct xfs_attr_leaf_entry *
-xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp)
-{
-       if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
-               return &((struct xfs_attr3_leafblock *)leafp)->entries[0];
-       return &leafp->entries[0];
-}
-
-/*
- * Cast typed pointers for "local" and "remote" name/value structs.
- */
-static inline char *
-xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
-{
-       struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
-
-       return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
-}
-
-static inline xfs_attr_leaf_name_remote_t *
-xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
-{
-       return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx);
-}
-
-static inline xfs_attr_leaf_name_local_t *
-xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
-{
-       return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx);
-}
-
-/*
- * Calculate total bytes used (including trailing pad for alignment) for
- * a "local" name/value structure, a "remote" name/value structure, and
- * a pointer which might be either.
- */
-static inline int xfs_attr_leaf_entsize_remote(int nlen)
-{
-       return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
-               XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
-}
-
-static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
-{
-       return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
-               XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
-}
-
-static inline int xfs_attr_leaf_entsize_local_max(int bsize)
-{
-       return (((bsize) >> 1) + ((bsize) >> 2));
-}
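
Worked example for xfs_attr_leaf_entsize_local() above (assuming
xfs_dablk_t is 4 bytes, so XFS_ATTR_LEAF_NAME_ALIGN is 4): a local
attribute with a 4-byte name and a 12-byte value needs
sizeof(xfs_attr_leaf_name_local_t) - 1 + 4 + 12 = 3 + 16 = 19 bytes, which
the rounding mask pads up to 20.
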
-
-/*
- * Remote attribute block format definition
- *
- * There is one of these headers per filesystem block in a remote attribute.
- * This is done to ensure there is a 1:1 mapping between the attribute value
- * length and the number of blocks needed to store the attribute. This makes the
- * verification of a buffer a little more complex, but greatly simplifies the
- * allocation, reading and writing of these attributes as we don't have to guess
- * the number of blocks needed to store the attribute data.
- */
-#define XFS_ATTR3_RMT_MAGIC    0x5841524d      /* XARM */
-
-struct xfs_attr3_rmt_hdr {
-       __be32  rm_magic;
-       __be32  rm_offset;
-       __be32  rm_bytes;
-       __be32  rm_crc;
-       uuid_t  rm_uuid;
-       __be64  rm_owner;
-       __be64  rm_blkno;
-       __be64  rm_lsn;
-};
-
-#define XFS_ATTR3_RMT_CRC_OFF  offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
-
-#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize)   \
-       ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
-                       sizeof(struct xfs_attr3_rmt_hdr) : 0))
-
-#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
deleted file mode 100644 (file)
index 623bbe8..0000000
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DINODE_H__
-#define        __XFS_DINODE_H__
-
-#define        XFS_DINODE_MAGIC                0x494e  /* 'IN' */
-#define XFS_DINODE_GOOD_VERSION(v)     ((v) >= 1 && (v) <= 3)
-
-typedef struct xfs_timestamp {
-       __be32          t_sec;          /* timestamp seconds */
-       __be32          t_nsec;         /* timestamp nanoseconds */
-} xfs_timestamp_t;
-
-/*
- * On-disk inode structure.
- *
- * This is just the header or "dinode core"; the inode is expanded to fill a
- * variable size, with the leftover area split into a data fork and an
- * attribute fork.
- * The format of the data and attribute fork depends on the format of the
- * inode as indicated by di_format and di_aformat.  To access the data and
- * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
- * below.
- *
- * There is a very similar struct icdinode in xfs_inode which matches the
- * layout of the first 96 bytes of this structure, but is kept in native
- * format instead of big endian.
- *
- * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
- * padding field for v3 inodes.
- */
-typedef struct xfs_dinode {
-       __be16          di_magic;       /* inode magic # = XFS_DINODE_MAGIC */
-       __be16          di_mode;        /* mode and type of file */
-       __u8            di_version;     /* inode version */
-       __u8            di_format;      /* format of di_c data */
-       __be16          di_onlink;      /* old number of links to file */
-       __be32          di_uid;         /* owner's user id */
-       __be32          di_gid;         /* owner's group id */
-       __be32          di_nlink;       /* number of links to file */
-       __be16          di_projid_lo;   /* lower part of owner's project id */
-       __be16          di_projid_hi;   /* higher part of owner's project id */
-       __u8            di_pad[6];      /* unused, zeroed space */
-       __be16          di_flushiter;   /* incremented on flush */
-       xfs_timestamp_t di_atime;       /* time last accessed */
-       xfs_timestamp_t di_mtime;       /* time last modified */
-       xfs_timestamp_t di_ctime;       /* time created/inode modified */
-       __be64          di_size;        /* number of bytes in file */
-       __be64          di_nblocks;     /* # of direct & btree blocks used */
-       __be32          di_extsize;     /* basic/minimum extent size for file */
-       __be32          di_nextents;    /* number of extents in data fork */
-       __be16          di_anextents;   /* number of extents in attribute fork*/
-       __u8            di_forkoff;     /* attr fork offs, <<3 for 64b align */
-       __s8            di_aformat;     /* format of attr fork's data */
-       __be32          di_dmevmask;    /* DMIG event mask */
-       __be16          di_dmstate;     /* DMIG state info */
-       __be16          di_flags;       /* random flags, XFS_DIFLAG_... */
-       __be32          di_gen;         /* generation number */
-
-       /* di_next_unlinked is the only non-core field in the old dinode */
-       __be32          di_next_unlinked;/* agi unlinked list ptr */
-
-       /* start of the extended dinode, writable fields */
-       __le32          di_crc;         /* CRC of the inode */
-       __be64          di_changecount; /* number of attribute changes */
-       __be64          di_lsn;         /* flush sequence */
-       __be64          di_flags2;      /* more random flags */
-       __u8            di_pad2[16];    /* more padding for future expansion */
-
-       /* fields only written to during inode creation */
-       xfs_timestamp_t di_crtime;      /* time created */
-       __be64          di_ino;         /* inode number */
-       uuid_t          di_uuid;        /* UUID of the filesystem */
-
-       /* structure must be padded to 64 bit alignment */
-} xfs_dinode_t;
-
-#define XFS_DINODE_CRC_OFF     offsetof(struct xfs_dinode, di_crc)
-
-#define DI_MAX_FLUSH 0xffff
-
-/*
- * Size of the core inode on disk.  Version 1 and 2 inodes have
- * the same size, but version 3 has grown a few additional fields.
- */
-static inline uint xfs_dinode_size(int version)
-{
-       if (version == 3)
-               return sizeof(struct xfs_dinode);
-       return offsetof(struct xfs_dinode, di_crc);
-}
-
-/*
- * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
- * Since the pathconf interface is signed, we use 2^31 - 1 instead.
- * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
- */
-#define        XFS_MAXLINK             ((1U << 31) - 1U)
-#define        XFS_MAXLINK_1           65535U
-
-/*
- * Values for di_format
- */
-typedef enum xfs_dinode_fmt {
-       XFS_DINODE_FMT_DEV,             /* xfs_dev_t */
-       XFS_DINODE_FMT_LOCAL,           /* bulk data */
-       XFS_DINODE_FMT_EXTENTS,         /* struct xfs_bmbt_rec */
-       XFS_DINODE_FMT_BTREE,           /* struct xfs_bmdr_block */
-       XFS_DINODE_FMT_UUID             /* uuid_t */
-} xfs_dinode_fmt_t;
-
-/*
- * Inode minimum and maximum sizes.
- */
-#define        XFS_DINODE_MIN_LOG      8
-#define        XFS_DINODE_MAX_LOG      11
-#define        XFS_DINODE_MIN_SIZE     (1 << XFS_DINODE_MIN_LOG)
-#define        XFS_DINODE_MAX_SIZE     (1 << XFS_DINODE_MAX_LOG)
-
-/*
- * Inode size for given fs.
- */
-#define XFS_LITINO(mp, version) \
-       ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
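
Worked example: with the historical default of 256-byte inodes and a v1/v2
dinode, xfs_dinode_size() is offsetof(struct xfs_dinode, di_crc) = 100
bytes, so XFS_LITINO yields 256 - 100 = 156 bytes of literal area for the
data and attribute forks.
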
-
-/*
- * Inode data & attribute fork sizes, per inode.
- */
-#define XFS_DFORK_Q(dip)               ((dip)->di_forkoff != 0)
-#define XFS_DFORK_BOFF(dip)            ((int)((dip)->di_forkoff << 3))
-
-#define XFS_DFORK_DSIZE(dip,mp) \
-       (XFS_DFORK_Q(dip) ? \
-               XFS_DFORK_BOFF(dip) : \
-               XFS_LITINO(mp, (dip)->di_version))
-#define XFS_DFORK_ASIZE(dip,mp) \
-       (XFS_DFORK_Q(dip) ? \
-               XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
-               0)
-#define XFS_DFORK_SIZE(dip,mp,w) \
-       ((w) == XFS_DATA_FORK ? \
-               XFS_DFORK_DSIZE(dip, mp) : \
-               XFS_DFORK_ASIZE(dip, mp))
-
-/*
- * Return pointers to the data or attribute forks.
- */
-#define XFS_DFORK_DPTR(dip) \
-       ((char *)dip + xfs_dinode_size(dip->di_version))
-#define XFS_DFORK_APTR(dip)    \
-       (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
-#define XFS_DFORK_PTR(dip,w)   \
-       ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
-
-#define XFS_DFORK_FORMAT(dip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               (dip)->di_format : \
-               (dip)->di_aformat)
-#define XFS_DFORK_NEXTENTS(dip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               be32_to_cpu((dip)->di_nextents) : \
-               be16_to_cpu((dip)->di_anextents))
-
-#define        XFS_BUF_TO_DINODE(bp)   ((xfs_dinode_t *)((bp)->b_addr))
-
-/*
- * For block and character special files the 32bit dev_t is stored at the
- * beginning of the data fork.
- */
-static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
-{
-       return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
-}
-
-static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
-{
-       *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
-}
-
-/*
- * Values for di_flags
- * There should be a one-to-one correspondence between these flags and the
- * XFS_XFLAG_s.
- */
-#define XFS_DIFLAG_REALTIME_BIT  0     /* file's blocks come from rt area */
-#define XFS_DIFLAG_PREALLOC_BIT  1     /* file space has been preallocated */
-#define XFS_DIFLAG_NEWRTBM_BIT   2     /* for rtbitmap inode, new format */
-#define XFS_DIFLAG_IMMUTABLE_BIT 3     /* inode is immutable */
-#define XFS_DIFLAG_APPEND_BIT    4     /* inode is append-only */
-#define XFS_DIFLAG_SYNC_BIT      5     /* inode is written synchronously */
-#define XFS_DIFLAG_NOATIME_BIT   6     /* do not update atime */
-#define XFS_DIFLAG_NODUMP_BIT    7     /* do not dump */
-#define XFS_DIFLAG_RTINHERIT_BIT 8     /* create with realtime bit set */
-#define XFS_DIFLAG_PROJINHERIT_BIT   9 /* create with parents projid */
-#define XFS_DIFLAG_NOSYMLINKS_BIT   10 /* disallow symlink creation */
-#define XFS_DIFLAG_EXTSIZE_BIT      11 /* inode extent size allocator hint */
-#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
-#define XFS_DIFLAG_NODEFRAG_BIT     13 /* do not reorganize/defragment */
-#define XFS_DIFLAG_FILESTREAM_BIT   14  /* use filestream allocator */
-#define XFS_DIFLAG_REALTIME      (1 << XFS_DIFLAG_REALTIME_BIT)
-#define XFS_DIFLAG_PREALLOC      (1 << XFS_DIFLAG_PREALLOC_BIT)
-#define XFS_DIFLAG_NEWRTBM       (1 << XFS_DIFLAG_NEWRTBM_BIT)
-#define XFS_DIFLAG_IMMUTABLE     (1 << XFS_DIFLAG_IMMUTABLE_BIT)
-#define XFS_DIFLAG_APPEND        (1 << XFS_DIFLAG_APPEND_BIT)
-#define XFS_DIFLAG_SYNC          (1 << XFS_DIFLAG_SYNC_BIT)
-#define XFS_DIFLAG_NOATIME       (1 << XFS_DIFLAG_NOATIME_BIT)
-#define XFS_DIFLAG_NODUMP        (1 << XFS_DIFLAG_NODUMP_BIT)
-#define XFS_DIFLAG_RTINHERIT     (1 << XFS_DIFLAG_RTINHERIT_BIT)
-#define XFS_DIFLAG_PROJINHERIT   (1 << XFS_DIFLAG_PROJINHERIT_BIT)
-#define XFS_DIFLAG_NOSYMLINKS    (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
-#define XFS_DIFLAG_EXTSIZE       (1 << XFS_DIFLAG_EXTSIZE_BIT)
-#define XFS_DIFLAG_EXTSZINHERIT  (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
-#define XFS_DIFLAG_NODEFRAG      (1 << XFS_DIFLAG_NODEFRAG_BIT)
-#define XFS_DIFLAG_FILESTREAM    (1 << XFS_DIFLAG_FILESTREAM_BIT)
-
-#ifdef CONFIG_XFS_RT
-#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
-#else
-#define XFS_IS_REALTIME_INODE(ip) (0)
-#endif
-
-#define XFS_DIFLAG_ANY \
-       (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
-        XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
-        XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
-        XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
-        XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
-
-#endif /* __XFS_DINODE_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
deleted file mode 100644 (file)
index 79670cd..0000000
+++ /dev/null
@@ -1,762 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_dinode.h"
-
-struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
-
-
-/*
- * ASCII case-insensitive (i.e. A-Z) support for directories, as was
- * used in IRIX.
- */
-STATIC xfs_dahash_t
-xfs_ascii_ci_hashname(
-       struct xfs_name *name)
-{
-       xfs_dahash_t    hash;
-       int             i;
-
-       for (i = 0, hash = 0; i < name->len; i++)
-               hash = tolower(name->name[i]) ^ rol32(hash, 7);
-
-       return hash;
-}
-
-STATIC enum xfs_dacmp
-xfs_ascii_ci_compname(
-       struct xfs_da_args *args,
-       const unsigned char *name,
-       int             len)
-{
-       enum xfs_dacmp  result;
-       int             i;
-
-       if (args->namelen != len)
-               return XFS_CMP_DIFFERENT;
-
-       result = XFS_CMP_EXACT;
-       for (i = 0; i < len; i++) {
-               if (args->name[i] == name[i])
-                       continue;
-               if (tolower(args->name[i]) != tolower(name[i]))
-                       return XFS_CMP_DIFFERENT;
-               result = XFS_CMP_CASE;
-       }
-
-       return result;
-}
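
A standalone userspace sketch of the case-folding hash above, showing that
names differing only in ASCII case land in the same hash bucket (rol32 is
reimplemented here so the snippet stands alone):

	#include <ctype.h>
	#include <stdint.h>

	static uint32_t example_rol32(uint32_t w, int s)
	{
		return (w << s) | (w >> (32 - s));
	}

	static uint32_t example_ci_hash(const unsigned char *name, int len)
	{
		uint32_t hash = 0;
		int i;

		for (i = 0; i < len; i++)
			hash = tolower(name[i]) ^ example_rol32(hash, 7);
		/* example_ci_hash("FOO", 3) == example_ci_hash("foo", 3) */
		return hash;
	}
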
-
-static struct xfs_nameops xfs_ascii_ci_nameops = {
-       .hashname       = xfs_ascii_ci_hashname,
-       .compname       = xfs_ascii_ci_compname,
-};
-
-int
-xfs_da_mount(
-       struct xfs_mount        *mp)
-{
-       struct xfs_da_geometry  *dageo;
-       int                     nodehdr_size;
-
-       ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
-       ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
-              XFS_MAX_BLOCKSIZE);
-
-       mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
-       mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL);
-
-       nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
-       mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
-                                   KM_SLEEP | KM_MAYFAIL);
-       mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
-                                    KM_SLEEP | KM_MAYFAIL);
-       if (!mp->m_dir_geo || !mp->m_attr_geo) {
-               kmem_free(mp->m_dir_geo);
-               kmem_free(mp->m_attr_geo);
-               return ENOMEM;
-       }
-
-       /* set up directory geometry */
-       dageo = mp->m_dir_geo;
-       dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog;
-       dageo->fsblog = mp->m_sb.sb_blocklog;
-       dageo->blksize = 1 << dageo->blklog;
-       dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
-
-       /*
-        * Now that we've set up the block conversion variables, we can
-        * calculate the segment block constants using the geometry structure.
-        */
-       dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET);
-       dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET);
-       dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
-       dageo->node_ents = (dageo->blksize - nodehdr_size) /
-                               (uint)sizeof(xfs_da_node_entry_t);
-       dageo->magicpct = (dageo->blksize * 37) / 100;
-
-       /* set up attribute geometry - single fsb only */
-       dageo = mp->m_attr_geo;
-       dageo->blklog = mp->m_sb.sb_blocklog;
-       dageo->fsblog = mp->m_sb.sb_blocklog;
-       dageo->blksize = 1 << dageo->blklog;
-       dageo->fsbcount = 1;
-       dageo->node_ents = (dageo->blksize - nodehdr_size) /
-                               (uint)sizeof(xfs_da_node_entry_t);
-       dageo->magicpct = (dageo->blksize * 37) / 100;
-
-       if (xfs_sb_version_hasasciici(&mp->m_sb))
-               mp->m_dirnameops = &xfs_ascii_ci_nameops;
-       else
-               mp->m_dirnameops = &xfs_default_nameops;
-
-       return 0;
-}
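
Worked example of the geometry setup above: with 4 KiB filesystem blocks
(sb_blocklog = 12) and sb_dirblklog = 2, directory blocks are 16 KiB
(blklog = 14, fsbcount = 4); with the 16-byte v2 node header, node_ents =
(16384 - 16) / 8 = 2046 entries per node block, and magicpct =
16384 * 37 / 100 = 6062 bytes.
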
-
-void
-xfs_da_unmount(
-       struct xfs_mount        *mp)
-{
-       kmem_free(mp->m_dir_geo);
-       kmem_free(mp->m_attr_geo);
-}
-
-/*
- * Return 1 if the directory contains only "." and "..".
- */
-int
-xfs_dir_isempty(
-       xfs_inode_t     *dp)
-{
-       xfs_dir2_sf_hdr_t       *sfp;
-
-       ASSERT(S_ISDIR(dp->i_d.di_mode));
-       if (dp->i_d.di_size == 0)       /* might happen during shutdown. */
-               return 1;
-       if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
-               return 0;
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       return !sfp->count;
-}
-
-/*
- * Validate a given inode number.
- */
-int
-xfs_dir_ino_validate(
-       xfs_mount_t     *mp,
-       xfs_ino_t       ino)
-{
-       xfs_agblock_t   agblkno;
-       xfs_agino_t     agino;
-       xfs_agnumber_t  agno;
-       int             ino_ok;
-       int             ioff;
-
-       agno = XFS_INO_TO_AGNO(mp, ino);
-       agblkno = XFS_INO_TO_AGBNO(mp, ino);
-       ioff = XFS_INO_TO_OFFSET(mp, ino);
-       agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
-       ino_ok =
-               agno < mp->m_sb.sb_agcount &&
-               agblkno < mp->m_sb.sb_agblocks &&
-               agblkno != 0 &&
-               ioff < (1 << mp->m_sb.sb_inopblog) &&
-               XFS_AGINO_TO_INO(mp, agno, agino) == ino;
-       if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
-                       XFS_RANDOM_DIR_INO_VALIDATE))) {
-               xfs_warn(mp, "Invalid inode number 0x%Lx",
-                               (unsigned long long) ino);
-               XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-       return 0;
-}
-
-/*
- * Initialize a directory with its "." and ".." entries.
- */
-int
-xfs_dir_init(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *dp,
-       xfs_inode_t     *pdp)
-{
-       struct xfs_da_args *args;
-       int             error;
-
-       ASSERT(S_ISDIR(dp->i_d.di_mode));
-       error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
-       if (error)
-               return error;
-
-       args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
-       if (!args)
-               return ENOMEM;
-
-       args->geo = dp->i_mount->m_dir_geo;
-       args->dp = dp;
-       args->trans = tp;
-       error = xfs_dir2_sf_create(args, pdp->i_ino);
-       kmem_free(args);
-       return error;
-}
-
-/*
- * Enter a name in a directory.
- */
-int
-xfs_dir_createname(
-       xfs_trans_t             *tp,
-       xfs_inode_t             *dp,
-       struct xfs_name         *name,
-       xfs_ino_t               inum,           /* new entry inode number */
-       xfs_fsblock_t           *first,         /* bmap's firstblock */
-       xfs_bmap_free_t         *flist,         /* bmap's freeblock list */
-       xfs_extlen_t            total)          /* bmap's total block count */
-{
-       struct xfs_da_args      *args;
-       int                     rval;
-       int                     v;              /* type-checking value */
-
-       ASSERT(S_ISDIR(dp->i_d.di_mode));
-       rval = xfs_dir_ino_validate(tp->t_mountp, inum);
-       if (rval)
-               return rval;
-       XFS_STATS_INC(xs_dir_create);
-
-       args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
-       if (!args)
-               return ENOMEM;
-
-       args->geo = dp->i_mount->m_dir_geo;
-       args->name = name->name;
-       args->namelen = name->len;
-       args->filetype = name->type;
-       args->hashval = dp->i_mount->m_dirnameops->hashname(name);
-       args->inumber = inum;
-       args->dp = dp;
-       args->firstblock = first;
-       args->flist = flist;
-       args->total = total;
-       args->whichfork = XFS_DATA_FORK;
-       args->trans = tp;
-       args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
-
-       if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-               rval = xfs_dir2_sf_addname(args);
-               goto out_free;
-       }
-
-       rval = xfs_dir2_isblock(args, &v);
-       if (rval)
-               goto out_free;
-       if (v) {
-               rval = xfs_dir2_block_addname(args);
-               goto out_free;
-       }
-
-       rval = xfs_dir2_isleaf(args, &v);
-       if (rval)
-               goto out_free;
-       if (v)
-               rval = xfs_dir2_leaf_addname(args);
-       else
-               rval = xfs_dir2_node_addname(args);
-
-out_free:
-       kmem_free(args);
-       return rval;
-}
-
-/*
- * If doing a CI lookup and the match was case-insensitive, duplicate the
- * actual name into args->value. Return EEXIST for success (i.e. name found)
- * or an error.
- */
-int
-xfs_dir_cilookup_result(
-       struct xfs_da_args *args,
-       const unsigned char *name,
-       int             len)
-{
-       if (args->cmpresult == XFS_CMP_DIFFERENT)
-               return ENOENT;
-       if (args->cmpresult != XFS_CMP_CASE ||
-                                       !(args->op_flags & XFS_DA_OP_CILOOKUP))
-               return EEXIST;
-
-       args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
-       if (!args->value)
-               return ENOMEM;
-
-       memcpy(args->value, name, len);
-       args->valuelen = len;
-       return EEXIST;
-}
-
-/*
- * Look up a name in a directory, giving back the inode number.
- * If ci_name is not NULL, the actual name is returned in ci_name if it
- * differs from name, or ci_name->name is set to NULL for an exact match.
- */
-int
-xfs_dir_lookup(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *dp,
-       struct xfs_name *name,
-       xfs_ino_t       *inum,          /* out: inode number */
-       struct xfs_name *ci_name)       /* out: actual name if CI match */
-{
-       struct xfs_da_args *args;
-       int             rval;
-       int             v;              /* type-checking value */
-
-       ASSERT(S_ISDIR(dp->i_d.di_mode));
-       XFS_STATS_INC(xs_dir_lookup);
-
-       /*
-        * We need to use KM_NOFS here so that lockdep will not throw false
-        * positive deadlock warnings on a non-transactional lookup path. It is
-        * safe to recurse into inode reclaim in that case, but lockdep can't
-        * easily be taught about it. Hence using KM_NOFS avoids having to add
-        * a bunch of lockdep class annotations into the reclaim path for the
-        * ilock.
-        */
-       args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
-       args->geo = dp->i_mount->m_dir_geo;
-       args->name = name->name;
-       args->namelen = name->len;
-       args->filetype = name->type;
-       args->hashval = dp->i_mount->m_dirnameops->hashname(name);
-       args->dp = dp;
-       args->whichfork = XFS_DATA_FORK;
-       args->trans = tp;
-       args->op_flags = XFS_DA_OP_OKNOENT;
-       if (ci_name)
-               args->op_flags |= XFS_DA_OP_CILOOKUP;
-
-       if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-               rval = xfs_dir2_sf_lookup(args);
-               goto out_check_rval;
-       }
-
-       rval = xfs_dir2_isblock(args, &v);
-       if (rval)
-               goto out_free;
-       if (v) {
-               rval = xfs_dir2_block_lookup(args);
-               goto out_check_rval;
-       }
-
-       rval = xfs_dir2_isleaf(args, &v);
-       if (rval)
-               goto out_free;
-       if (v)
-               rval = xfs_dir2_leaf_lookup(args);
-       else
-               rval = xfs_dir2_node_lookup(args);
-
-out_check_rval:
-       if (rval == EEXIST)
-               rval = 0;
-       if (!rval) {
-               *inum = args->inumber;
-               if (ci_name) {
-                       ci_name->name = args->value;
-                       ci_name->len = args->valuelen;
-               }
-       }
-out_free:
-       kmem_free(args);
-       return rval;
-}
-
-/*
- * Remove an entry from a directory.
- */
-int
-xfs_dir_removename(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *dp,
-       struct xfs_name *name,
-       xfs_ino_t       ino,
-       xfs_fsblock_t   *first,         /* bmap's firstblock */
-       xfs_bmap_free_t *flist,         /* bmap's freeblock list */
-       xfs_extlen_t    total)          /* bmap's total block count */
-{
-       struct xfs_da_args *args;
-       int             rval;
-       int             v;              /* type-checking value */
-
-       ASSERT(S_ISDIR(dp->i_d.di_mode));
-       XFS_STATS_INC(xs_dir_remove);
-
-       args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
-       if (!args)
-               return ENOMEM;
-
-       args->geo = dp->i_mount->m_dir_geo;
-       args->name = name->name;
-       args->namelen = name->len;
-       args->filetype = name->type;
-       args->hashval = dp->i_mount->m_dirnameops->hashname(name);
-       args->inumber = ino;
-       args->dp = dp;
-       args->firstblock = first;
-       args->flist = flist;
-       args->total = total;
-       args->whichfork = XFS_DATA_FORK;
-       args->trans = tp;
-
-       if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-               rval = xfs_dir2_sf_removename(args);
-               goto out_free;
-       }
-
-       rval = xfs_dir2_isblock(args, &v);
-       if (rval)
-               goto out_free;
-       if (v) {
-               rval = xfs_dir2_block_removename(args);
-               goto out_free;
-       }
-
-       rval = xfs_dir2_isleaf(args, &v);
-       if (rval)
-               goto out_free;
-       if (v)
-               rval = xfs_dir2_leaf_removename(args);
-       else
-               rval = xfs_dir2_node_removename(args);
-out_free:
-       kmem_free(args);
-       return rval;
-}
-
-/*
- * Replace the inode number of a directory entry.
- */
-int
-xfs_dir_replace(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *dp,
-       struct xfs_name *name,          /* name of entry to replace */
-       xfs_ino_t       inum,           /* new inode number */
-       xfs_fsblock_t   *first,         /* bmap's firstblock */
-       xfs_bmap_free_t *flist,         /* bmap's freeblock list */
-       xfs_extlen_t    total)          /* bmap's total block count */
-{
-       struct xfs_da_args *args;
-       int             rval;
-       int             v;              /* type-checking value */
-
-       ASSERT(S_ISDIR(dp->i_d.di_mode));
-
-       rval = xfs_dir_ino_validate(tp->t_mountp, inum);
-       if (rval)
-               return rval;
-
-       args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
-       if (!args)
-               return ENOMEM;
-
-       args->geo = dp->i_mount->m_dir_geo;
-       args->name = name->name;
-       args->namelen = name->len;
-       args->filetype = name->type;
-       args->hashval = dp->i_mount->m_dirnameops->hashname(name);
-       args->inumber = inum;
-       args->dp = dp;
-       args->firstblock = first;
-       args->flist = flist;
-       args->total = total;
-       args->whichfork = XFS_DATA_FORK;
-       args->trans = tp;
-
-       if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-               rval = xfs_dir2_sf_replace(args);
-               goto out_free;
-       }
-
-       rval = xfs_dir2_isblock(args, &v);
-       if (rval)
-               goto out_free;
-       if (v) {
-               rval = xfs_dir2_block_replace(args);
-               goto out_free;
-       }
-
-       rval = xfs_dir2_isleaf(args, &v);
-       if (rval)
-               goto out_free;
-       if (v)
-               rval = xfs_dir2_leaf_replace(args);
-       else
-               rval = xfs_dir2_node_replace(args);
-out_free:
-       kmem_free(args);
-       return rval;
-}
-
-/*
- * See if this entry can be added to the directory without allocating space.
- * The check is only done when the caller has no space reservation
- * (resblks == 0); with a reservation the add is assumed to be safe.
- */
-int
-xfs_dir_canenter(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *dp,
-       struct xfs_name *name,          /* name of entry to add */
-       uint            resblks)
-{
-       struct xfs_da_args *args;
-       int             rval;
-       int             v;              /* type-checking value */
-
-       if (resblks)
-               return 0;
-
-       ASSERT(S_ISDIR(dp->i_d.di_mode));
-
-       args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
-       if (!args)
-               return ENOMEM;
-
-       args->geo = dp->i_mount->m_dir_geo;
-       args->name = name->name;
-       args->namelen = name->len;
-       args->filetype = name->type;
-       args->hashval = dp->i_mount->m_dirnameops->hashname(name);
-       args->dp = dp;
-       args->whichfork = XFS_DATA_FORK;
-       args->trans = tp;
-       args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
-                                                       XFS_DA_OP_OKNOENT;
-
-       if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-               rval = xfs_dir2_sf_addname(args);
-               goto out_free;
-       }
-
-       rval = xfs_dir2_isblock(args, &v);
-       if (rval)
-               goto out_free;
-       if (v) {
-               rval = xfs_dir2_block_addname(args);
-               goto out_free;
-       }
-
-       rval = xfs_dir2_isleaf(args, &v);
-       if (rval)
-               goto out_free;
-       if (v)
-               rval = xfs_dir2_leaf_addname(args);
-       else
-               rval = xfs_dir2_node_addname(args);
-out_free:
-       kmem_free(args);
-       return rval;
-}
-
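
A usage sketch (a hypothetical caller, modelled on the rename path): when no
blocks could be reserved, the caller probes with canenter first so that a
later addname cannot fail with ENOSPC halfway through a multi-step operation:

    /*
     * resblks == 0 here: no block reservation, so check that the target
     * entry fits without allocation before modifying anything.
     */
    error = xfs_dir_canenter(tp, target_dp, target_name, resblks);
    if (error)
            goto error_return;              /* hypothetical label */
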
-/*
- * Utility routines.
- */
-
-/*
- * Add a block to the directory.
- *
- * This routine is for data and free blocks, not leaf/node blocks which are
- * handled by xfs_da_grow_inode.
- */
-int
-xfs_dir2_grow_inode(
-       struct xfs_da_args      *args,
-       int                     space,  /* v2 dir's space XFS_DIR2_xxx_SPACE */
-       xfs_dir2_db_t           *dbp)   /* out: block number added */
-{
-       struct xfs_inode        *dp = args->dp;
-       struct xfs_mount        *mp = dp->i_mount;
-       xfs_fileoff_t           bno;    /* directory offset of new block */
-       int                     count;  /* count of filesystem blocks */
-       int                     error;
-
-       trace_xfs_dir2_grow_inode(args, space);
-
-       /*
-        * Set lowest possible block in the space requested.
-        */
-       bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
-       count = args->geo->fsbcount;
-
-       error = xfs_da_grow_inode_int(args, &bno, count);
-       if (error)
-               return error;
-
-       *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno);
-
-       /*
-        * Update file's size if this is the data space and it grew.
-        */
-       if (space == XFS_DIR2_DATA_SPACE) {
-               xfs_fsize_t     size;           /* directory file (data) size */
-
-               size = XFS_FSB_TO_B(mp, bno + count);
-               if (size > dp->i_d.di_size) {
-                       dp->i_d.di_size = size;
-                       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
-               }
-       }
-       return 0;
-}
-
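
The multiplication above relies on the fixed v2 directory address-space
layout: the data, leaf, and free segments each occupy one XFS_DIR2_SPACE_SIZE
slice of the file (1ULL << 35, i.e. 32 GiB, per the contemporary headers). A
worked example of the resulting starting block numbers, assuming 4 KiB
filesystem blocks:

    bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
    /* XFS_DIR2_DATA_SPACE (0):  bno = (0 << 35) >> 12 = 0         */
    /* XFS_DIR2_LEAF_SPACE (1):  bno = (1 << 35) >> 12 = 0x800000  */
    /* XFS_DIR2_FREE_SPACE (2):  bno = (2 << 35) >> 12 = 0x1000000 */
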
-/*
- * See if the directory is a single-block form directory.
- */
-int
-xfs_dir2_isblock(
-       struct xfs_da_args      *args,
-       int                     *vp)    /* out: 1 is block, 0 is not block */
-{
-       xfs_fileoff_t           last;   /* last file offset */
-       int                     rval;
-
-       if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
-               return rval;
-       rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
-       ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize);
-       *vp = rval;
-       return 0;
-}
-
-/*
- * See if the directory is a single-leaf form directory.
- */
-int
-xfs_dir2_isleaf(
-       struct xfs_da_args      *args,
-       int                     *vp)    /* out: 1 is leaf, 0 is not leaf */
-{
-       xfs_fileoff_t           last;   /* last file offset */
-       int                     rval;
-
-       if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
-               return rval;
-       *vp = last == args->geo->leafblk + args->geo->fsbcount;
-       return 0;
-}
-
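
Both predicates classify the directory from the offset of the last mapped
block in the data fork alone, so no directory buffers need to be read. A
sketch of the three outcomes under the usual geometry (these are the meanings
of the comparisons above, not extra checks):

    /* XFS_FSB_TO_B(mp, last) == geo->blksize   -> single-block form   */
    /* last == geo->leafblk + geo->fsbcount     -> single-leaf form    */
    /* last beyond the single-leaf offset       -> node/btree form     */
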
-/*
- * Remove the given block from the directory.
- * This routine is used for data and free blocks, leaf/node are done
- * by xfs_da_shrink_inode.
- */
-int
-xfs_dir2_shrink_inode(
-       xfs_da_args_t   *args,
-       xfs_dir2_db_t   db,
-       struct xfs_buf  *bp)
-{
-       xfs_fileoff_t   bno;            /* directory file offset */
-       xfs_dablk_t     da;             /* directory file offset */
-       int             done;           /* bunmap is finished */
-       xfs_inode_t     *dp;
-       int             error;
-       xfs_mount_t     *mp;
-       xfs_trans_t     *tp;
-
-       trace_xfs_dir2_shrink_inode(args, db);
-
-       dp = args->dp;
-       mp = dp->i_mount;
-       tp = args->trans;
-       da = xfs_dir2_db_to_da(args->geo, db);
-       /*
-        * Unmap the fsblock(s).
-        */
-       if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
-                       XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
-                       &done))) {
-               /*
-                * ENOSPC actually can happen if we're in a removename with
-                * no space reservation, and the resulting block removal
-                * would cause a bmap btree split or conversion from extents
-                * to btree.  This can only happen for un-fragmented
-                * directory blocks, since you need to be punching out
-                * the middle of an extent.
-                * In this case we need to leave the block in the file,
-                * and not binval it.
-                * So the block has to be in a consistent empty state
-                * and appropriately logged.
-                * We don't free up the buffer; the caller can tell the
-                * removal hasn't happened because it got an error back.
-                */
-               return error;
-       }
-       ASSERT(done);
-       /*
-        * Invalidate the buffer from the transaction.
-        */
-       xfs_trans_binval(tp, bp);
-       /*
-        * If it's not a data block, we're done.
-        */
-       if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET))
-               return 0;
-       /*
-        * If the block isn't the last one in the directory, we're done.
-        */
-       if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
-               return 0;
-       bno = da;
-       if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
-               /*
-                * This can't really happen unless there's kernel corruption.
-                */
-               return error;
-       }
-       if (db == args->geo->datablk)
-               ASSERT(bno == 0);
-       else
-               ASSERT(bno > 0);
-       /*
-        * Set the size to the new last block.
-        */
-       dp->i_d.di_size = XFS_FSB_TO_B(mp, bno);
-       xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-       return 0;
-}
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
deleted file mode 100644 (file)
index c8e86b0..0000000
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DIR2_H__
-#define __XFS_DIR2_H__
-
-struct xfs_bmap_free;
-struct xfs_da_args;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_trans;
-struct xfs_dir2_sf_hdr;
-struct xfs_dir2_sf_entry;
-struct xfs_dir2_data_hdr;
-struct xfs_dir2_data_entry;
-struct xfs_dir2_data_unused;
-
-extern struct xfs_name xfs_name_dotdot;
-
-/*
- * directory operations vector for encode/decode routines
- */
-struct xfs_dir_ops {
-       int     (*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len);
-       struct xfs_dir2_sf_entry *
-               (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr,
-                               struct xfs_dir2_sf_entry *sfep);
-       __uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep);
-       void    (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep,
-                               __uint8_t ftype);
-       xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr,
-                               struct xfs_dir2_sf_entry *sfep);
-       void    (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr,
-                             struct xfs_dir2_sf_entry *sfep,
-                             xfs_ino_t ino);
-       xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr);
-       void    (*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr,
-                                    xfs_ino_t ino);
-
-       int     (*data_entsize)(int len);
-       __uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep);
-       void    (*data_put_ftype)(struct xfs_dir2_data_entry *dep,
-                               __uint8_t ftype);
-       __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep);
-       struct xfs_dir2_data_free *
-               (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr);
-
-       xfs_dir2_data_aoff_t data_dot_offset;
-       xfs_dir2_data_aoff_t data_dotdot_offset;
-       xfs_dir2_data_aoff_t data_first_offset;
-       size_t  data_entry_offset;
-
-       struct xfs_dir2_data_entry *
-               (*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr);
-       struct xfs_dir2_data_entry *
-               (*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr);
-       struct xfs_dir2_data_entry *
-               (*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr);
-       struct xfs_dir2_data_entry *
-               (*data_entry_p)(struct xfs_dir2_data_hdr *hdr);
-       struct xfs_dir2_data_unused *
-               (*data_unused_p)(struct xfs_dir2_data_hdr *hdr);
-
-       int     leaf_hdr_size;
-       void    (*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to,
-                                   struct xfs_dir3_icleaf_hdr *from);
-       void    (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to,
-                                     struct xfs_dir2_leaf *from);
-       int     (*leaf_max_ents)(struct xfs_da_geometry *geo);
-       struct xfs_dir2_leaf_entry *
-               (*leaf_ents_p)(struct xfs_dir2_leaf *lp);
-
-       int     node_hdr_size;
-       void    (*node_hdr_to_disk)(struct xfs_da_intnode *to,
-                                   struct xfs_da3_icnode_hdr *from);
-       void    (*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to,
-                                     struct xfs_da_intnode *from);
-       struct xfs_da_node_entry *
-               (*node_tree_p)(struct xfs_da_intnode *dap);
-
-       int     free_hdr_size;
-       void    (*free_hdr_to_disk)(struct xfs_dir2_free *to,
-                                   struct xfs_dir3_icfree_hdr *from);
-       void    (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to,
-                                     struct xfs_dir2_free *from);
-       int     (*free_max_bests)(struct xfs_da_geometry *geo);
-       __be16 * (*free_bests_p)(struct xfs_dir2_free *free);
-       xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo,
-                                  xfs_dir2_db_t db);
-       int     (*db_to_fdindex)(struct xfs_da_geometry *geo,
-                                xfs_dir2_db_t db);
-};
-
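
The vector exists because the v2 and v3/ftype on-disk layouts differ (most
visibly by a one-byte file-type field in each data entry), so every size and
offset has to go through a callback instead of a shared macro. A minimal
sketch of the data_entsize arithmetic, assuming the v2 entry layout (8-byte
inumber, 1-byte namelen, the name, an optional ftype byte, a 2-byte tag, all
rounded up to 8-byte alignment):

    static int sketch_data_entsize(int namelen, bool has_ftype)
    {
            int     raw = 8 + 1 + namelen + (has_ftype ? 1 : 0) + 2;

            return (raw + 7) & ~7;          /* XFS_DIR2_DATA_ALIGN == 8 */
    }
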
-extern const struct xfs_dir_ops *
-       xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
-extern const struct xfs_dir_ops *
-       xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
-
-/*
- * Generic directory interface routines
- */
-extern void xfs_dir_startup(void);
-extern int xfs_da_mount(struct xfs_mount *mp);
-extern void xfs_da_unmount(struct xfs_mount *mp);
-
-extern int xfs_dir_isempty(struct xfs_inode *dp);
-extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
-                               struct xfs_inode *pdp);
-extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
-                               struct xfs_name *name, xfs_ino_t inum,
-                               xfs_fsblock_t *first,
-                               struct xfs_bmap_free *flist, xfs_extlen_t tot);
-extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
-                               struct xfs_name *name, xfs_ino_t *inum,
-                               struct xfs_name *ci_name);
-extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
-                               struct xfs_name *name, xfs_ino_t ino,
-                               xfs_fsblock_t *first,
-                               struct xfs_bmap_free *flist, xfs_extlen_t tot);
-extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
-                               struct xfs_name *name, xfs_ino_t inum,
-                               xfs_fsblock_t *first,
-                               struct xfs_bmap_free *flist, xfs_extlen_t tot);
-extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
-                               struct xfs_name *name, uint resblks);
-
-/*
- * Direct call from the bmap code, bypassing the generic directory layer.
- */
-extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
-
-/*
- * Interface routines used by userspace utilities
- */
-extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r);
-extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
-extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
-                               struct xfs_buf *bp);
-
-extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
-               struct xfs_dir2_data_hdr *hdr, int *loghead);
-extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
-               struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
-extern void xfs_dir2_data_log_header(struct xfs_da_args *args,
-               struct xfs_buf *bp);
-extern void xfs_dir2_data_log_unused(struct xfs_da_args *args,
-               struct xfs_buf *bp, struct xfs_dir2_data_unused *dup);
-extern void xfs_dir2_data_make_free(struct xfs_da_args *args,
-               struct xfs_buf *bp, xfs_dir2_data_aoff_t offset,
-               xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
-extern void xfs_dir2_data_use_free(struct xfs_da_args *args,
-               struct xfs_buf *bp, struct xfs_dir2_data_unused *dup,
-               xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
-               int *needlogp, int *needscanp);
-
-extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
-               struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
-               struct xfs_dir2_data_unused *dup);
-
-extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
-extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
-extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
-extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
-extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
-
-#endif /* __XFS_DIR2_H__ */
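
For orientation, the non-transactional lookup path mentioned in the KM_NOFS
comment earlier is the VFS-level lookup, which calls the generic interface
with a NULL transaction (a sketch modelled on xfs_lookup()):

    error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
    if (error)
            goto out_free_name;             /* hypothetical label */
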
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
deleted file mode 100644 (file)
index c7cd315..0000000
+++ /dev/null
@@ -1,1265 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_buf_item.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_dinode.h"
-
-/*
- * Local function prototypes.
- */
-static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp,
-                                   int first, int last);
-static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp);
-static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp,
-                                    int *entno);
-static int xfs_dir2_block_sort(const void *a, const void *b);
-
-static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
-
-/*
- * One-time startup routine called from xfs_init().
- */
-void
-xfs_dir_startup(void)
-{
-       xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
-       xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
-}
-
-static bool
-xfs_dir3_block_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
-                       return false;
-               if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
-                       return false;
-               if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
-                       return false;
-       } else {
-               if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
-                       return false;
-       }
-       if (__xfs_dir3_data_check(NULL, bp))
-               return false;
-       return true;
-}
-
-static void
-xfs_dir3_block_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb) &&
-            !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
-               xfs_buf_ioerror(bp, EFSBADCRC);
-       else if (!xfs_dir3_block_verify(bp))
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-       if (bp->b_error)
-               xfs_verifier_error(bp);
-}
-
-static void
-xfs_dir3_block_write_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-       struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
-
-       if (!xfs_dir3_block_verify(bp)) {
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               xfs_verifier_error(bp);
-               return;
-       }
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (bip)
-               hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-       xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
-}
-
-const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
-       .verify_read = xfs_dir3_block_read_verify,
-       .verify_write = xfs_dir3_block_write_verify,
-};
-
-int
-xfs_dir3_block_read(
-       struct xfs_trans        *tp,
-       struct xfs_inode        *dp,
-       struct xfs_buf          **bpp)
-{
-       struct xfs_mount        *mp = dp->i_mount;
-       int                     err;
-
-       err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
-                               XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
-       if (!err && tp)
-               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
-       return err;
-}
-
-static void
-xfs_dir3_block_init(
-       struct xfs_mount        *mp,
-       struct xfs_trans        *tp,
-       struct xfs_buf          *bp,
-       struct xfs_inode        *dp)
-{
-       struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
-
-       bp->b_ops = &xfs_dir3_block_buf_ops;
-       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               memset(hdr3, 0, sizeof(*hdr3));
-               hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
-               hdr3->blkno = cpu_to_be64(bp->b_bn);
-               hdr3->owner = cpu_to_be64(dp->i_ino);
-               uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
-               return;
-       }
-       hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
-}
-
-static void
-xfs_dir2_block_need_space(
-       struct xfs_inode                *dp,
-       struct xfs_dir2_data_hdr        *hdr,
-       struct xfs_dir2_block_tail      *btp,
-       struct xfs_dir2_leaf_entry      *blp,
-       __be16                          **tagpp,
-       struct xfs_dir2_data_unused     **dupp,
-       struct xfs_dir2_data_unused     **enddupp,
-       int                             *compact,
-       int                             len)
-{
-       struct xfs_dir2_data_free       *bf;
-       __be16                          *tagp = NULL;
-       struct xfs_dir2_data_unused     *dup = NULL;
-       struct xfs_dir2_data_unused     *enddup = NULL;
-
-       *compact = 0;
-       bf = dp->d_ops->data_bestfree_p(hdr);
-
-       /*
-        * If there are stale entries we'll use one for the leaf.
-        */
-       if (btp->stale) {
-               if (be16_to_cpu(bf[0].length) >= len) {
-                       /*
-                        * The biggest entry is big enough to avoid compaction.
-                        */
-                       dup = (xfs_dir2_data_unused_t *)
-                             ((char *)hdr + be16_to_cpu(bf[0].offset));
-                       goto out;
-               }
-
-               /*
-                * Will need to compact to make this work.
-                * Tag just before the first leaf entry.
-                */
-               *compact = 1;
-               tagp = (__be16 *)blp - 1;
-
-               /* Data object just before the first leaf entry.  */
-               dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
-
-               /*
-                * If it's not free then the data will go where the
-                * leaf data starts now, if it works at all.
-                */
-               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-                       if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
-                           (uint)sizeof(*blp) < len)
-                               dup = NULL;
-               } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
-                       dup = NULL;
-               else
-                       dup = (xfs_dir2_data_unused_t *)blp;
-               goto out;
-       }
-
-       /*
-        * No stale entries, so just use free space.
-        * Tag just before the first leaf entry.
-        */
-       tagp = (__be16 *)blp - 1;
-
-       /* Data object just before the first leaf entry.  */
-       enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
-
-       /*
-        * If it's not free then can't do this add without cleaning up:
-        * the space before the first leaf entry needs to be free so it
-        * can be expanded to hold the pointer to the new entry.
-        */
-       if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-               /*
-                * Check out the biggest freespace and see if it's the same one.
-                */
-               dup = (xfs_dir2_data_unused_t *)
-                     ((char *)hdr + be16_to_cpu(bf[0].offset));
-               if (dup != enddup) {
-                       /*
-                        * Not the same free entry, just check its length.
-                        */
-                       if (be16_to_cpu(dup->length) < len)
-                               dup = NULL;
-                       goto out;
-               }
-
-               /*
-                * It is the biggest freespace, can it hold the leaf too?
-                */
-               if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
-                       /*
-                        * No, it can't; use the second-largest entry instead
-                        * if that one is big enough.
-                        */
-                       if (be16_to_cpu(bf[1].length) >= len)
-                               dup = (xfs_dir2_data_unused_t *)
-                                     ((char *)hdr + be16_to_cpu(bf[1].offset));
-                       else
-                               dup = NULL;
-               }
-       }
-out:
-       *tagpp = tagp;
-       *dupp = dup;
-       *enddupp = enddup;
-}
-
-/*
- * Compact the leaf entries.
- * Leave the highest-numbered stale entry stale.
- * XXX should be the one closest to mid but mid is not yet computed.
- */
-static void
-xfs_dir2_block_compact(
-       struct xfs_da_args              *args,
-       struct xfs_buf                  *bp,
-       struct xfs_dir2_data_hdr        *hdr,
-       struct xfs_dir2_block_tail      *btp,
-       struct xfs_dir2_leaf_entry      *blp,
-       int                             *needlog,
-       int                             *lfloghigh,
-       int                             *lfloglow)
-{
-       int                     fromidx;        /* source leaf index */
-       int                     toidx;          /* target leaf index */
-       int                     needscan = 0;
-       int                     highstale;      /* high stale index */
-
-       fromidx = toidx = be32_to_cpu(btp->count) - 1;
-       highstale = *lfloghigh = -1;
-       for (; fromidx >= 0; fromidx--) {
-               if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
-                       if (highstale == -1)
-                               highstale = toidx;
-                       else {
-                               if (*lfloghigh == -1)
-                                       *lfloghigh = toidx;
-                               continue;
-                       }
-               }
-               if (fromidx < toidx)
-                       blp[toidx] = blp[fromidx];
-               toidx--;
-       }
-       *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
-       *lfloghigh -= be32_to_cpu(btp->stale) - 1;
-       be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
-       xfs_dir2_data_make_free(args, bp,
-               (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
-               (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
-               needlog, &needscan);
-       btp->stale = cpu_to_be32(1);
-       /*
-        * If we now need to rebuild the bestfree map, do so.
-        * This needs to happen before the next call to use_free.
-        */
-       if (needscan)
-               xfs_dir2_data_freescan(args->dp, hdr, needlog);
-}
-
-/*
- * Add an entry to a block directory.
- */
-int                                            /* error */
-xfs_dir2_block_addname(
-       xfs_da_args_t           *args)          /* directory op arguments */
-{
-       xfs_dir2_data_hdr_t     *hdr;           /* block header */
-       xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
-       struct xfs_buf          *bp;            /* buffer for block */
-       xfs_dir2_block_tail_t   *btp;           /* block tail */
-       int                     compact;        /* need to compact leaf ents */
-       xfs_dir2_data_entry_t   *dep;           /* block data entry */
-       xfs_inode_t             *dp;            /* directory inode */
-       xfs_dir2_data_unused_t  *dup;           /* block unused entry */
-       int                     error;          /* error return value */
-       xfs_dir2_data_unused_t  *enddup=NULL;   /* unused at end of data */
-       xfs_dahash_t            hash;           /* hash value of found entry */
-       int                     high;           /* high index for binary srch */
-       int                     highstale;      /* high stale index */
-       int                     lfloghigh=0;    /* last final leaf to log */
-       int                     lfloglow=0;     /* first final leaf to log */
-       int                     len;            /* length of the new entry */
-       int                     low;            /* low index for binary srch */
-       int                     lowstale;       /* low stale index */
-       int                     mid=0;          /* midpoint for binary srch */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       int                     needlog;        /* need to log header */
-       int                     needscan;       /* need to rescan freespace */
-       __be16                  *tagp;          /* pointer to tag value */
-       xfs_trans_t             *tp;            /* transaction structure */
-
-       trace_xfs_dir2_block_addname(args);
-
-       dp = args->dp;
-       tp = args->trans;
-       mp = dp->i_mount;
-
-       /* Read the (one and only) directory block into bp. */
-       error = xfs_dir3_block_read(tp, dp, &bp);
-       if (error)
-               return error;
-
-       len = dp->d_ops->data_entsize(args->namelen);
-
-       /*
-        * Set up pointers to parts of the block.
-        */
-       hdr = bp->b_addr;
-       btp = xfs_dir2_block_tail_p(args->geo, hdr);
-       blp = xfs_dir2_block_leaf_p(btp);
-
-       /*
-        * Find out if we can reuse stale entries or whether we need extra
-        * space for entry and new leaf.
-        */
-       xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup,
-                                 &enddup, &compact, len);
-
-       /*
-        * That's everything we need for a space-only check.
-        */
-       if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
-               xfs_trans_brelse(tp, bp);
-               if (!dup)
-                       return XFS_ERROR(ENOSPC);
-               return 0;
-       }
-
-       /*
-        * If we don't have space for the new entry & leaf ...
-        */
-       if (!dup) {
-               /* Don't have a space reservation: return no-space.  */
-               if (args->total == 0)
-                       return XFS_ERROR(ENOSPC);
-               /*
-                * Convert to the next larger format.
-                * Then add the new entry in that format.
-                */
-               error = xfs_dir2_block_to_leaf(args, bp);
-               if (error)
-                       return error;
-               return xfs_dir2_leaf_addname(args);
-       }
-
-       needlog = needscan = 0;
-
-       /*
-        * If need to compact the leaf entries, do it now.
-        */
-       if (compact) {
-               xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog,
-                                     &lfloghigh, &lfloglow);
-               /* recalculate blp post-compaction */
-               blp = xfs_dir2_block_leaf_p(btp);
-       } else if (btp->stale) {
-               /*
-                * Set leaf logging boundaries to impossible state.
-                * For the no-stale case they're set explicitly.
-                */
-               lfloglow = be32_to_cpu(btp->count);
-               lfloghigh = -1;
-       }
-
-       /*
-        * Find the last slot with a hash value lower than ours, -1 if none.
-        */
-       for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) {
-               mid = (low + high) >> 1;
-               if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
-                       break;
-               if (hash < args->hashval)
-                       low = mid + 1;
-               else
-                       high = mid - 1;
-       }
-       while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) {
-               mid--;
-       }
-       /*
-        * No stale entries, will use enddup space to hold new leaf.
-        */
-       if (!btp->stale) {
-               /*
-                * Mark the space needed for the new leaf entry, now in use.
-                */
-               xfs_dir2_data_use_free(args, bp, enddup,
-                       (xfs_dir2_data_aoff_t)
-                       ((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) -
-                        sizeof(*blp)),
-                       (xfs_dir2_data_aoff_t)sizeof(*blp),
-                       &needlog, &needscan);
-               /*
-                * Update the tail (entry count).
-                */
-               be32_add_cpu(&btp->count, 1);
-               /*
-                * If we now need to rebuild the bestfree map, do so.
-                * This needs to happen before the next call to use_free.
-                */
-               if (needscan) {
-                       xfs_dir2_data_freescan(dp, hdr, &needlog);
-                       needscan = 0;
-               }
-               /*
-                * Adjust pointer to the first leaf entry, we're about to move
-                * the table up one to open up space for the new leaf entry.
-                * Then adjust our index to match.
-                */
-               blp--;
-               mid++;
-               if (mid)
-                       memmove(blp, &blp[1], mid * sizeof(*blp));
-               lfloglow = 0;
-               lfloghigh = mid;
-       }
-       /*
-        * Use a stale leaf for our new entry.
-        */
-       else {
-               for (lowstale = mid;
-                    lowstale >= 0 &&
-                       blp[lowstale].address !=
-                       cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
-                    lowstale--)
-                       continue;
-               for (highstale = mid + 1;
-                    highstale < be32_to_cpu(btp->count) &&
-                       blp[highstale].address !=
-                       cpu_to_be32(XFS_DIR2_NULL_DATAPTR) &&
-                       (lowstale < 0 || mid - lowstale > highstale - mid);
-                    highstale++)
-                       continue;
-               /*
-                * Move entries toward the low-numbered stale entry.
-                */
-               if (lowstale >= 0 &&
-                   (highstale == be32_to_cpu(btp->count) ||
-                    mid - lowstale <= highstale - mid)) {
-                       if (mid - lowstale)
-                               memmove(&blp[lowstale], &blp[lowstale + 1],
-                                       (mid - lowstale) * sizeof(*blp));
-                       lfloglow = MIN(lowstale, lfloglow);
-                       lfloghigh = MAX(mid, lfloghigh);
-               }
-               /*
-                * Move entries toward the high-numbered stale entry.
-                */
-               else {
-                       ASSERT(highstale < be32_to_cpu(btp->count));
-                       mid++;
-                       if (highstale - mid)
-                               memmove(&blp[mid + 1], &blp[mid],
-                                       (highstale - mid) * sizeof(*blp));
-                       lfloglow = MIN(mid, lfloglow);
-                       lfloghigh = MAX(highstale, lfloghigh);
-               }
-               be32_add_cpu(&btp->stale, -1);
-       }
-       /*
-        * Point to the new data entry.
-        */
-       dep = (xfs_dir2_data_entry_t *)dup;
-       /*
-        * Fill in the leaf entry.
-        */
-       blp[mid].hashval = cpu_to_be32(args->hashval);
-       blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
-                               (char *)dep - (char *)hdr));
-       xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
-       /*
-        * Mark space for the data entry used.
-        */
-       xfs_dir2_data_use_free(args, bp, dup,
-               (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
-               (xfs_dir2_data_aoff_t)len, &needlog, &needscan);
-       /*
-        * Create the new data entry.
-        */
-       dep->inumber = cpu_to_be64(args->inumber);
-       dep->namelen = args->namelen;
-       memcpy(dep->name, args->name, args->namelen);
-       dp->d_ops->data_put_ftype(dep, args->filetype);
-       tagp = dp->d_ops->data_entry_tag_p(dep);
-       *tagp = cpu_to_be16((char *)dep - (char *)hdr);
-       /*
-        * Clean up the bestfree array and log the header, tail, and entry.
-        */
-       if (needscan)
-               xfs_dir2_data_freescan(dp, hdr, &needlog);
-       if (needlog)
-               xfs_dir2_data_log_header(args, bp);
-       xfs_dir2_block_log_tail(tp, bp);
-       xfs_dir2_data_log_entry(args, bp, dep);
-       xfs_dir3_data_check(dp, bp);
-       return 0;
-}
-
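
The insertion-point search above is easy to get wrong, so here it is as a
standalone sketch: binary-search a hash-sorted array that may contain
duplicates, then back up to the last index whose hash is strictly lower than
the target (or -1), which is exactly where the loops above leave mid:

    static int find_insert_slot(const uint32_t *hashval, int count,
                                uint32_t target)
    {
            int     low = 0, high = count - 1, mid = 0;

            while (low <= high) {
                    mid = (low + high) >> 1;
                    if (hashval[mid] == target)
                            break;
                    if (hashval[mid] < target)
                            low = mid + 1;
                    else
                            high = mid - 1;
            }
            while (mid >= 0 && hashval[mid] >= target)
                    mid--;
            return mid;                     /* new entry goes at mid + 1 */
    }
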
-/*
- * Log leaf entries from the block.
- */
-static void
-xfs_dir2_block_log_leaf(
-       xfs_trans_t             *tp,            /* transaction structure */
-       struct xfs_buf          *bp,            /* block buffer */
-       int                     first,          /* index of first logged leaf */
-       int                     last)           /* index of last logged leaf */
-{
-       xfs_dir2_data_hdr_t     *hdr = bp->b_addr;
-       xfs_dir2_leaf_entry_t   *blp;
-       xfs_dir2_block_tail_t   *btp;
-
-       btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
-       blp = xfs_dir2_block_leaf_p(btp);
-       xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr),
-               (uint)((char *)&blp[last + 1] - (char *)hdr - 1));
-}
-
-/*
- * Log the block tail.
- */
-static void
-xfs_dir2_block_log_tail(
-       xfs_trans_t             *tp,            /* transaction structure */
-       struct xfs_buf          *bp)            /* block buffer */
-{
-       xfs_dir2_data_hdr_t     *hdr = bp->b_addr;
-       xfs_dir2_block_tail_t   *btp;
-
-       btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
-       xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr),
-               (uint)((char *)(btp + 1) - (char *)hdr - 1));
-}
-
-/*
- * Look up an entry in the block.  This is the external routine,
- * xfs_dir2_block_lookup_int does the real work.
- */
-int                                            /* error */
-xfs_dir2_block_lookup(
-       xfs_da_args_t           *args)          /* dir lookup arguments */
-{
-       xfs_dir2_data_hdr_t     *hdr;           /* block header */
-       xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
-       struct xfs_buf          *bp;            /* block buffer */
-       xfs_dir2_block_tail_t   *btp;           /* block tail */
-       xfs_dir2_data_entry_t   *dep;           /* block data entry */
-       xfs_inode_t             *dp;            /* incore inode */
-       int                     ent;            /* entry index */
-       int                     error;          /* error return value */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-
-       trace_xfs_dir2_block_lookup(args);
-
-       /*
-        * Get the buffer, look up the entry.
-        * If not found (ENOENT) then return; no buffer is held in that case.
-        */
-       if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
-               return error;
-       dp = args->dp;
-       mp = dp->i_mount;
-       hdr = bp->b_addr;
-       xfs_dir3_data_check(dp, bp);
-       btp = xfs_dir2_block_tail_p(args->geo, hdr);
-       blp = xfs_dir2_block_leaf_p(btp);
-       /*
-        * Get the offset from the leaf entry, to point to the data.
-        */
-       dep = (xfs_dir2_data_entry_t *)((char *)hdr +
-                       xfs_dir2_dataptr_to_off(args->geo,
-                                               be32_to_cpu(blp[ent].address)));
-       /*
-        * Fill in inode number, CI name if appropriate, release the block.
-        */
-       args->inumber = be64_to_cpu(dep->inumber);
-       args->filetype = dp->d_ops->data_get_ftype(dep);
-       error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
-       xfs_trans_brelse(args->trans, bp);
-       return XFS_ERROR(error);
-}
-
-/*
- * Internal block lookup routine.
- */
-static int                                     /* error */
-xfs_dir2_block_lookup_int(
-       xfs_da_args_t           *args,          /* dir lookup arguments */
-       struct xfs_buf          **bpp,          /* returned block buffer */
-       int                     *entno)         /* returned entry number */
-{
-       xfs_dir2_dataptr_t      addr;           /* data entry address */
-       xfs_dir2_data_hdr_t     *hdr;           /* block header */
-       xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
-       struct xfs_buf          *bp;            /* block buffer */
-       xfs_dir2_block_tail_t   *btp;           /* block tail */
-       xfs_dir2_data_entry_t   *dep;           /* block data entry */
-       xfs_inode_t             *dp;            /* incore inode */
-       int                     error;          /* error return value */
-       xfs_dahash_t            hash;           /* found hash value */
-       int                     high;           /* binary search high index */
-       int                     low;            /* binary search low index */
-       int                     mid;            /* binary search current idx */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       enum xfs_dacmp          cmp;            /* comparison result */
-
-       dp = args->dp;
-       tp = args->trans;
-       mp = dp->i_mount;
-
-       error = xfs_dir3_block_read(tp, dp, &bp);
-       if (error)
-               return error;
-
-       hdr = bp->b_addr;
-       xfs_dir3_data_check(dp, bp);
-       btp = xfs_dir2_block_tail_p(args->geo, hdr);
-       blp = xfs_dir2_block_leaf_p(btp);
-       /*
-        * Loop doing a binary search for our hash value.
-        * Find our entry, ENOENT if it's not there.
-        */
-       for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) {
-               ASSERT(low <= high);
-               mid = (low + high) >> 1;
-               if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
-                       break;
-               if (hash < args->hashval)
-                       low = mid + 1;
-               else
-                       high = mid - 1;
-               if (low > high) {
-                       ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-                       xfs_trans_brelse(tp, bp);
-                       return XFS_ERROR(ENOENT);
-               }
-       }
-       /*
-        * Back up to the first one with the right hash value.
-        */
-       while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) {
-               mid--;
-       }
-       /*
-        * Now loop forward through all the entries with the
-        * right hash value looking for our name.
-        */
-       do {
-               if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR)
-                       continue;
-               /*
-                * Get pointer to the entry from the leaf.
-                */
-               dep = (xfs_dir2_data_entry_t *)
-                       ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr));
-               /*
-                * Compare name and if it's an exact match, return the index
-                * and buffer. If it's the first case-insensitive match, store
-                * the index and buffer and continue looking for an exact match.
-                */
-               cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
-               if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
-                       args->cmpresult = cmp;
-                       *bpp = bp;
-                       *entno = mid;
-                       if (cmp == XFS_CMP_EXACT)
-                               return 0;
-               }
-       } while (++mid < be32_to_cpu(btp->count) &&
-                       be32_to_cpu(blp[mid].hashval) == hash);
-
-       ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-       /*
-        * Here, we can only be doing a lookup (not a rename or replace).
-        * If a case-insensitive match was found earlier, return success.
-        */
-       if (args->cmpresult == XFS_CMP_CASE)
-               return 0;
-       /*
-        * No match, release the buffer and return ENOENT.
-        */
-       xfs_trans_brelse(tp, bp);
-       return XFS_ERROR(ENOENT);
-}
-
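
The comparison protocol used above deserves a standalone statement: compname
returns XFS_CMP_DIFFERENT, XFS_CMP_CASE, or XFS_CMP_EXACT, and the scan
remembers the first case-insensitive hit while continuing through the run of
equal hash values, because an exact match always wins. A simplified sketch
with hypothetical types:

    enum cmp_result { CMP_DIFFERENT, CMP_CASE, CMP_EXACT };

    /* Scan entries [first, last]; return a match index or -1 (ENOENT). */
    static int scan_hash_run(enum cmp_result (*compare)(int idx),
                             int first, int last)
    {
            int     i, ci_match = -1;

            for (i = first; i <= last; i++) {
                    enum cmp_result c = compare(i);

                    if (c == CMP_EXACT)
                            return i;       /* exact match wins outright */
                    if (c == CMP_CASE && ci_match == -1)
                            ci_match = i;   /* remember first CI match */
            }
            return ci_match;
    }
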
-/*
- * Remove an entry from a block format directory.
- * If that makes the block small enough to fit in shortform, transform it.
- */
-int                                            /* error */
-xfs_dir2_block_removename(
-       xfs_da_args_t           *args)          /* directory operation args */
-{
-       xfs_dir2_data_hdr_t     *hdr;           /* block header */
-       xfs_dir2_leaf_entry_t   *blp;           /* block leaf pointer */
-       struct xfs_buf          *bp;            /* block buffer */
-       xfs_dir2_block_tail_t   *btp;           /* block tail */
-       xfs_dir2_data_entry_t   *dep;           /* block data entry */
-       xfs_inode_t             *dp;            /* incore inode */
-       int                     ent;            /* block leaf entry index */
-       int                     error;          /* error return value */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       int                     needlog;        /* need to log block header */
-       int                     needscan;       /* need to fixup bestfree */
-       xfs_dir2_sf_hdr_t       sfh;            /* shortform header */
-       int                     size;           /* shortform size */
-       xfs_trans_t             *tp;            /* transaction pointer */
-
-       trace_xfs_dir2_block_removename(args);
-
-       /*
-        * Look up the entry in the block.  Gets the buffer and entry index.
-        * It will always be there; the vnodeops level does a lookup first.
-        */
-       if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
-               return error;
-       }
-       dp = args->dp;
-       tp = args->trans;
-       mp = dp->i_mount;
-       hdr = bp->b_addr;
-       btp = xfs_dir2_block_tail_p(args->geo, hdr);
-       blp = xfs_dir2_block_leaf_p(btp);
-       /*
-        * Point to the data entry using the leaf entry.
-        */
-       dep = (xfs_dir2_data_entry_t *)((char *)hdr +
-                       xfs_dir2_dataptr_to_off(args->geo,
-                                               be32_to_cpu(blp[ent].address)));
-       /*
-        * Mark the data entry's space free.
-        */
-       needlog = needscan = 0;
-       xfs_dir2_data_make_free(args, bp,
-               (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
-               dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
-       /*
-        * Fix up the block tail.
-        */
-       be32_add_cpu(&btp->stale, 1);
-       xfs_dir2_block_log_tail(tp, bp);
-       /*
-        * Remove the leaf entry by marking it stale.
-        */
-       blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
-       xfs_dir2_block_log_leaf(tp, bp, ent, ent);
-       /*
-        * Fix up bestfree, log the header if necessary.
-        */
-       if (needscan)
-               xfs_dir2_data_freescan(dp, hdr, &needlog);
-       if (needlog)
-               xfs_dir2_data_log_header(args, bp);
-       xfs_dir3_data_check(dp, bp);
-       /*
-        * See if the size as a shortform is good enough.
-        */
-       size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
-       if (size > XFS_IFORK_DSIZE(dp))
-               return 0;
-
-       /*
-        * If it works, do the conversion.
-        */
-       return xfs_dir2_block_to_sf(args, bp, size, &sfh);
-}
-
-/*
- * Replace an entry in a V2 block directory.
- * Change the inode number to the new value.
- */
-int                                            /* error */
-xfs_dir2_block_replace(
-       xfs_da_args_t           *args)          /* directory operation args */
-{
-       xfs_dir2_data_hdr_t     *hdr;           /* block header */
-       xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
-       struct xfs_buf          *bp;            /* block buffer */
-       xfs_dir2_block_tail_t   *btp;           /* block tail */
-       xfs_dir2_data_entry_t   *dep;           /* block data entry */
-       xfs_inode_t             *dp;            /* incore inode */
-       int                     ent;            /* leaf entry index */
-       int                     error;          /* error return value */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-
-       trace_xfs_dir2_block_replace(args);
-
-       /*
-        * Look up the entry in the directory.  Get buffer and entry index.
-        * This will always succeed since the caller has already done a lookup.
-        */
-       if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
-               return error;
-       }
-       dp = args->dp;
-       mp = dp->i_mount;
-       hdr = bp->b_addr;
-       btp = xfs_dir2_block_tail_p(args->geo, hdr);
-       blp = xfs_dir2_block_leaf_p(btp);
-       /*
-        * Point to the data entry we need to change.
-        */
-       dep = (xfs_dir2_data_entry_t *)((char *)hdr +
-                       xfs_dir2_dataptr_to_off(args->geo,
-                                               be32_to_cpu(blp[ent].address)));
-       ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
-       /*
-        * Change the inode number to the new value.
-        */
-       dep->inumber = cpu_to_be64(args->inumber);
-       dp->d_ops->data_put_ftype(dep, args->filetype);
-       xfs_dir2_data_log_entry(args, bp, dep);
-       xfs_dir3_data_check(dp, bp);
-       return 0;
-}
-
-/*
- * Qsort comparison routine for the block leaf entries.
- */
-static int                                     /* sort order */
-xfs_dir2_block_sort(
-       const void                      *a,     /* first leaf entry */
-       const void                      *b)     /* second leaf entry */
-{
-       const xfs_dir2_leaf_entry_t     *la;    /* first leaf entry */
-       const xfs_dir2_leaf_entry_t     *lb;    /* second leaf entry */
-
-       la = a;
-       lb = b;
-       return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 :
-               (be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0);
-}
-
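
The lookup paths above never call this comparator; the leaf array is kept
hash-sorted on disk. It exists for the shortform-to-block conversion below,
where a freshly assembled leaf table must be sorted once, presumably invoked
along the lines of:

    xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp),
             xfs_dir2_block_sort);
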
-/*
- * Convert a V2 leaf directory to a V2 block directory if possible.
- */
-int                                            /* error */
-xfs_dir2_leaf_to_block(
-       xfs_da_args_t           *args,          /* operation arguments */
-       struct xfs_buf          *lbp,           /* leaf buffer */
-       struct xfs_buf          *dbp)           /* data buffer */
-{
-       __be16                  *bestsp;        /* leaf bests table */
-       xfs_dir2_data_hdr_t     *hdr;           /* block header */
-       xfs_dir2_block_tail_t   *btp;           /* block tail */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       xfs_dir2_data_unused_t  *dup;           /* unused data entry */
-       int                     error;          /* error return value */
-       int                     from;           /* leaf from index */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
-       xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
-       xfs_mount_t             *mp;            /* file system mount point */
-       int                     needlog;        /* need to log data header */
-       int                     needscan;       /* need to scan for bestfree */
-       xfs_dir2_sf_hdr_t       sfh;            /* shortform header */
-       int                     size;           /* bytes used */
-       __be16                  *tagp;          /* end of entry (tag) */
-       int                     to;             /* block/leaf to index */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       struct xfs_dir2_leaf_entry *ents;
-       struct xfs_dir3_icleaf_hdr leafhdr;
-
-       trace_xfs_dir2_leaf_to_block(args);
-
-       dp = args->dp;
-       tp = args->trans;
-       mp = dp->i_mount;
-       leaf = lbp->b_addr;
-       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-       ents = dp->d_ops->leaf_ents_p(leaf);
-       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-
-       ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
-              leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
-       /*
-        * If there are data blocks other than the first one, take this
-        * opportunity to remove trailing empty data blocks that may have
-        * been left behind during no-space-reservation operations.
-        * These will show up in the leaf bests table.
-        */
-       while (dp->i_d.di_size > args->geo->blksize) {
-               int hdrsz;
-
-               hdrsz = dp->d_ops->data_entry_offset;
-               bestsp = xfs_dir2_leaf_bests_p(ltp);
-               if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
-                                           args->geo->blksize - hdrsz) {
-			error = xfs_dir2_leaf_trim_data(args, lbp,
-				    (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1));
-			if (error)
-				return error;
-               } else
-                       return 0;
-       }
-       /*
-        * Read the data block if we don't already have it, give up if it fails.
-        */
-       if (!dbp) {
-               error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp);
-               if (error)
-                       return error;
-       }
-       hdr = dbp->b_addr;
-       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
-
-       /*
-        * Size of the "leaf" area in the block.
-        */
-       size = (uint)sizeof(xfs_dir2_block_tail_t) +
-              (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale);
-       /*
-        * Look at the last data entry.
-        */
-       tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1;
-       dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
-       /*
-        * If it's not free or is too short we can't do it.
-        */
-       if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG ||
-           be16_to_cpu(dup->length) < size)
-               return 0;
-
-       /*
-        * Start converting it to block form.
-        */
-       xfs_dir3_block_init(mp, tp, dbp, dp);
-
-       needlog = 1;
-       needscan = 0;
-       /*
-        * Use up the space at the end of the block (blp/btp).
-        */
-       xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size,
-               &needlog, &needscan);
-       /*
-        * Initialize the block tail.
-        */
-       btp = xfs_dir2_block_tail_p(args->geo, hdr);
-       btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
-       btp->stale = 0;
-       xfs_dir2_block_log_tail(tp, dbp);
-       /*
-        * Initialize the block leaf area.  We compact out stale entries.
-        */
-       lep = xfs_dir2_block_leaf_p(btp);
-       for (from = to = 0; from < leafhdr.count; from++) {
-               if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-                       continue;
-               lep[to++] = ents[from];
-       }
-       ASSERT(to == be32_to_cpu(btp->count));
-       xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
-       /*
-        * Scan the bestfree if we need it and log the data block header.
-        */
-       if (needscan)
-               xfs_dir2_data_freescan(dp, hdr, &needlog);
-       if (needlog)
-               xfs_dir2_data_log_header(args, dbp);
-       /*
-        * Pitch the old leaf block.
-        */
-       error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp);
-       if (error)
-               return error;
-
-       /*
-        * Now see if the resulting block can be shrunken to shortform.
-        */
-       size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
-       if (size > XFS_IFORK_DSIZE(dp))
-               return 0;
-
-       return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
-}
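
The core fit test in the function above is the size computation: the block's leaf area needs room for a tail plus one leaf entry per non-stale leaf record, and the conversion only proceeds if the trailing unused entry is free and at least that big. A toy calculation with assumed sizes (4096-byte directory block, 8-byte tail, 8-byte leaf entries; these numbers are illustrative, not the authoritative on-disk sizes):

#include <stdio.h>

int main(void)
{
	unsigned tail_size = 8;		/* block tail: count + stale */
	unsigned lep_size  = 8;		/* leaf entry: hashval + address */
	unsigned count = 100, stale = 12;
	unsigned size = tail_size + lep_size * (count - stale);	/* 712 */
	unsigned last_unused_len = 900;	/* trailing free space in block */

	printf("need %u bytes, have %u: %s\n", size, last_unused_len,
	       last_unused_len >= size ? "convert" : "stay in leaf form");
	return 0;
}
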
-
-/*
- * Convert the shortform directory to block form.
- */
-int                                            /* error */
-xfs_dir2_sf_to_block(
-       xfs_da_args_t           *args)          /* operation arguments */
-{
-       xfs_dir2_db_t           blkno;          /* dir-relative block # (0) */
-       xfs_dir2_data_hdr_t     *hdr;           /* block header */
-       xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
-       struct xfs_buf          *bp;            /* block buffer */
-       xfs_dir2_block_tail_t   *btp;           /* block tail pointer */
-       xfs_dir2_data_entry_t   *dep;           /* data entry pointer */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     dummy;          /* trash */
-       xfs_dir2_data_unused_t  *dup;           /* unused entry pointer */
-       int                     endoffset;      /* end of data objects */
-       int                     error;          /* error return value */
-       int                     i;              /* index */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       int                     needlog;        /* need to log block header */
-       int                     needscan;       /* need to scan block freespc */
-       int                     newoffset;      /* offset from current entry */
-       int                     offset;         /* target block offset */
-       xfs_dir2_sf_entry_t     *sfep;          /* sf entry pointer */
-       xfs_dir2_sf_hdr_t       *oldsfp;        /* old shortform header  */
-       xfs_dir2_sf_hdr_t       *sfp;           /* shortform header  */
-       __be16                  *tagp;          /* end of data entry */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       struct xfs_name         name;
-       struct xfs_ifork        *ifp;
-
-       trace_xfs_dir2_sf_to_block(args);
-
-       dp = args->dp;
-       tp = args->trans;
-       mp = dp->i_mount;
-       ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
-       ASSERT(ifp->if_flags & XFS_IFINLINE);
-       /*
-        * Bomb out if the shortform directory is way too short.
-        */
-       if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
-               ASSERT(XFS_FORCED_SHUTDOWN(mp));
-               return XFS_ERROR(EIO);
-       }
-
-       oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
-
-       ASSERT(ifp->if_bytes == dp->i_d.di_size);
-       ASSERT(ifp->if_u1.if_data != NULL);
-       ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
-       ASSERT(dp->i_d.di_nextents == 0);
-
-       /*
-        * Copy the directory into a temporary buffer.
-        * Then pitch the incore inode data so we can make extents.
-        */
-       sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
-       memcpy(sfp, oldsfp, ifp->if_bytes);
-
-       xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
-       xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
-       dp->i_d.di_size = 0;
-
-       /*
-        * Add block 0 to the inode.
-        */
-       error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
-       if (error) {
-               kmem_free(sfp);
-               return error;
-       }
-       /*
-        * Initialize the data block, then convert it to block format.
-        */
-       error = xfs_dir3_data_init(args, blkno, &bp);
-       if (error) {
-               kmem_free(sfp);
-               return error;
-       }
-       xfs_dir3_block_init(mp, tp, bp, dp);
-       hdr = bp->b_addr;
-
-       /*
-        * Compute size of block "tail" area.
-        */
-       i = (uint)sizeof(*btp) +
-           (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
-       /*
-        * The whole thing is initialized to free by the init routine.
-        * Say we're using the leaf and tail area.
-        */
-       dup = dp->d_ops->data_unused_p(hdr);
-       needlog = needscan = 0;
-       xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
-                              i, &needlog, &needscan);
-       ASSERT(needscan == 0);
-       /*
-        * Fill in the tail.
-        */
-       btp = xfs_dir2_block_tail_p(args->geo, hdr);
-       btp->count = cpu_to_be32(sfp->count + 2);       /* ., .. */
-       btp->stale = 0;
-       blp = xfs_dir2_block_leaf_p(btp);
-       endoffset = (uint)((char *)blp - (char *)hdr);
-       /*
-        * Remove the freespace, we'll manage it.
-        */
-       xfs_dir2_data_use_free(args, bp, dup,
-               (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
-               be16_to_cpu(dup->length), &needlog, &needscan);
-       /*
-        * Create entry for .
-        */
-       dep = dp->d_ops->data_dot_entry_p(hdr);
-       dep->inumber = cpu_to_be64(dp->i_ino);
-       dep->namelen = 1;
-       dep->name[0] = '.';
-       dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
-       tagp = dp->d_ops->data_entry_tag_p(dep);
-       *tagp = cpu_to_be16((char *)dep - (char *)hdr);
-       xfs_dir2_data_log_entry(args, bp, dep);
-       blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
-       blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
-                               (char *)dep - (char *)hdr));
-       /*
-        * Create entry for ..
-        */
-       dep = dp->d_ops->data_dotdot_entry_p(hdr);
-       dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp));
-       dep->namelen = 2;
-       dep->name[0] = dep->name[1] = '.';
-       dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
-       tagp = dp->d_ops->data_entry_tag_p(dep);
-       *tagp = cpu_to_be16((char *)dep - (char *)hdr);
-       xfs_dir2_data_log_entry(args, bp, dep);
-       blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
-       blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
-                               (char *)dep - (char *)hdr));
-       offset = dp->d_ops->data_first_offset;
-       /*
-        * Loop over existing entries, stuff them in.
-        */
-       i = 0;
-       if (!sfp->count)
-               sfep = NULL;
-       else
-               sfep = xfs_dir2_sf_firstentry(sfp);
-       /*
-        * Need to preserve the existing offset values in the sf directory.
-        * Insert holes (unused entries) where necessary.
-        */
-       while (offset < endoffset) {
-               /*
-                * sfep is null when we reach the end of the list.
-                */
-               if (sfep == NULL)
-                       newoffset = endoffset;
-               else
-                       newoffset = xfs_dir2_sf_get_offset(sfep);
-               /*
-                * There should be a hole here, make one.
-                */
-               if (offset < newoffset) {
-                       dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
-                       dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
-                       dup->length = cpu_to_be16(newoffset - offset);
-                       *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
-                               ((char *)dup - (char *)hdr));
-                       xfs_dir2_data_log_unused(args, bp, dup);
-                       xfs_dir2_data_freeinsert(hdr,
-                                                dp->d_ops->data_bestfree_p(hdr),
-                                                dup, &dummy);
-                       offset += be16_to_cpu(dup->length);
-                       continue;
-               }
-               /*
-                * Copy a real entry.
-                */
-               dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
-               dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep));
-               dep->namelen = sfep->namelen;
-               dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep));
-               memcpy(dep->name, sfep->name, dep->namelen);
-               tagp = dp->d_ops->data_entry_tag_p(dep);
-               *tagp = cpu_to_be16((char *)dep - (char *)hdr);
-               xfs_dir2_data_log_entry(args, bp, dep);
-               name.name = sfep->name;
-               name.len = sfep->namelen;
-               blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
-                                                       hashname(&name));
-               blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
-                                                (char *)dep - (char *)hdr));
-               offset = (int)((char *)(tagp + 1) - (char *)hdr);
-               if (++i == sfp->count)
-                       sfep = NULL;
-               else
-                       sfep = dp->d_ops->sf_nextentry(sfp, sfep);
-       }
-       /* Done with the temporary buffer */
-       kmem_free(sfp);
-       /*
-        * Sort the leaf entries by hash value.
-        */
-       xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort);
-       /*
-        * Log the leaf entry area and tail.
-        * Already logged the header in data_init, ignore needlog.
-        */
-       ASSERT(needscan == 0);
-       xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
-       xfs_dir2_block_log_tail(tp, bp);
-       xfs_dir3_data_check(dp, bp);
-       return 0;
-}
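
The copy loop above is driven by the requirement that each name keep its original shortform offset in the new block, so gaps between entries become explicit unused records. A standalone model of that loop, with simplified stand-in types and invented entry data:

#include <stdio.h>

struct sf_ent { unsigned offset; unsigned size; const char *name; };

int main(void)
{
	struct sf_ent ents[] = {
		{ 64, 24, "foo" }, { 112, 32, "barbaz" }, { 176, 24, "qux" },
	};
	unsigned offset = 64;		/* first data offset after header */
	unsigned endoffset = 240;	/* start of the leaf/tail area */
	unsigned i = 0, n = 3;

	while (offset < endoffset) {
		/* Past the last entry, the remaining gap runs to the end. */
		unsigned newoffset = (i < n) ? ents[i].offset : endoffset;

		if (offset < newoffset) {
			/* There should be a hole here: emit unused space. */
			printf("unused: off %u len %u\n", offset,
			       newoffset - offset);
			offset = newoffset;
			continue;
		}
		/* Copy a real entry at its preserved offset. */
		printf("entry : off %u len %u (%s)\n", offset,
		       ents[i].size, ents[i].name);
		offset += ents[i].size;
		i++;
	}
	return 0;
}
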
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
deleted file mode 100644 (file)
index 8c2f642..0000000
+++ /dev/null
@@ -1,1050 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_error.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
-
-/*
- * Check the consistency of the data block.
- * The input can also be a block-format directory.
- * Return 0 if the buffer is good, otherwise an error.
- */
-int
-__xfs_dir3_data_check(
-       struct xfs_inode        *dp,            /* incore inode pointer */
-       struct xfs_buf          *bp)            /* data block's buffer */
-{
-       xfs_dir2_dataptr_t      addr;           /* addr for leaf lookup */
-       xfs_dir2_data_free_t    *bf;            /* bestfree table */
-       xfs_dir2_block_tail_t   *btp=NULL;      /* block tail */
-       int                     count;          /* count of entries found */
-       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
-       xfs_dir2_data_entry_t   *dep;           /* data entry */
-       xfs_dir2_data_free_t    *dfp;           /* bestfree entry */
-       xfs_dir2_data_unused_t  *dup;           /* unused entry */
-       char                    *endp;          /* end of useful data */
-       int                     freeseen;       /* mask of bestfrees seen */
-       xfs_dahash_t            hash;           /* hash of current name */
-       int                     i;              /* leaf index */
-       int                     lastfree;       /* last entry was unused */
-       xfs_dir2_leaf_entry_t   *lep=NULL;      /* block leaf entries */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       char                    *p;             /* current data position */
-       int                     stale;          /* count of stale leaves */
-       struct xfs_name         name;
-       const struct xfs_dir_ops *ops;
-       struct xfs_da_geometry  *geo;
-
-       mp = bp->b_target->bt_mount;
-       geo = mp->m_dir_geo;
-
-       /*
-        * We can be passed a null dp here from a verifier, so we need to go the
-        * hard way to get the directory ops.
-        */
-       ops = xfs_dir_get_ops(mp, dp);
-
-       hdr = bp->b_addr;
-       p = (char *)ops->data_entry_p(hdr);
-
-       switch (hdr->magic) {
-       case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
-       case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
-               btp = xfs_dir2_block_tail_p(geo, hdr);
-               lep = xfs_dir2_block_leaf_p(btp);
-               endp = (char *)lep;
-
-               /*
-                * The number of leaf entries is limited by the size of the
-                * block and the amount of space used by the data entries.
-                * We don't know how much space is used by the data entries yet,
-                * so just ensure that the count falls somewhere inside the
-                * block right now.
-                */
-               XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) <
-                       ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
-               break;
-       case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
-       case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
-               endp = (char *)hdr + geo->blksize;
-               break;
-       default:
-               XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
-               return EFSCORRUPTED;
-       }
-
-       /*
-        * Account for zero bestfree entries.
-        */
-       bf = ops->data_bestfree_p(hdr);
-       count = lastfree = freeseen = 0;
-       if (!bf[0].length) {
-               XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
-               freeseen |= 1 << 0;
-       }
-       if (!bf[1].length) {
-               XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
-               freeseen |= 1 << 1;
-       }
-       if (!bf[2].length) {
-               XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
-               freeseen |= 1 << 2;
-       }
-
-       XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
-                                               be16_to_cpu(bf[1].length));
-       XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
-                                               be16_to_cpu(bf[2].length));
-       /*
-        * Loop over the data/unused entries.
-        */
-       while (p < endp) {
-               dup = (xfs_dir2_data_unused_t *)p;
-               /*
-                * If it's unused, look for the space in the bestfree table.
-                * If we find it, account for that, else make sure it
-                * doesn't need to be there.
-                */
-               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-                       XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
-                       XFS_WANT_CORRUPTED_RETURN(
-                               be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
-                                              (char *)dup - (char *)hdr);
-                       dfp = xfs_dir2_data_freefind(hdr, bf, dup);
-                       if (dfp) {
-                               i = (int)(dfp - bf);
-                               XFS_WANT_CORRUPTED_RETURN(
-                                       (freeseen & (1 << i)) == 0);
-                               freeseen |= 1 << i;
-                       } else {
-                               XFS_WANT_CORRUPTED_RETURN(
-                                       be16_to_cpu(dup->length) <=
-                                               be16_to_cpu(bf[2].length));
-                       }
-                       p += be16_to_cpu(dup->length);
-                       lastfree = 1;
-                       continue;
-               }
-               /*
-                * It's a real entry.  Validate the fields.
-                * If this is a block directory then make sure it's
-                * in the leaf section of the block.
-                * The linear search is crude but this is DEBUG code.
-                */
-               dep = (xfs_dir2_data_entry_t *)p;
-               XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
-               XFS_WANT_CORRUPTED_RETURN(
-                       !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
-               XFS_WANT_CORRUPTED_RETURN(
-                       be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
-                                              (char *)dep - (char *)hdr);
-               XFS_WANT_CORRUPTED_RETURN(
-                               ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
-               count++;
-               lastfree = 0;
-               if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-                   hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
-                       addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
-                                               (xfs_dir2_data_aoff_t)
-                                               ((char *)dep - (char *)hdr));
-                       name.name = dep->name;
-                       name.len = dep->namelen;
-                       hash = mp->m_dirnameops->hashname(&name);
-                       for (i = 0; i < be32_to_cpu(btp->count); i++) {
-                               if (be32_to_cpu(lep[i].address) == addr &&
-                                   be32_to_cpu(lep[i].hashval) == hash)
-                                       break;
-                       }
-                       XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
-               }
-               p += ops->data_entsize(dep->namelen);
-       }
-       /*
-        * Need to have seen all the entries and all the bestfree slots.
-        */
-       XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
-       if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-           hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
-               for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
-                       if (lep[i].address ==
-                           cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-                               stale++;
-                       if (i > 0)
-                               XFS_WANT_CORRUPTED_RETURN(
-                                       be32_to_cpu(lep[i].hashval) >=
-                                               be32_to_cpu(lep[i - 1].hashval));
-               }
-               XFS_WANT_CORRUPTED_RETURN(count ==
-                       be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
-               XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
-       }
-       return 0;
-}
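
The consistency walk above tracks the three bestfree slots with one bit each in "freeseen": an empty slot is marked up front, a matched slot is marked when its free space is found in the block, and the table is only consistent if all three bits end up set. A minimal sketch of that accounting:

#include <stdio.h>

int main(void)
{
	int freeseen = 0;

	/* Slot 2 is empty, so mark it seen up front, as the checker does. */
	freeseen |= 1 << 2;

	/* Walking the block, we matched free space against slots 0 and 1. */
	freeseen |= 1 << 0;
	freeseen |= 1 << 1;

	/* All three bits set (0b111 == 7) means the table is consistent. */
	printf("consistent: %s\n", freeseen == 7 ? "yes" : "no");
	return 0;
}
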
-
-static bool
-xfs_dir3_data_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
-                       return false;
-               if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
-                       return false;
-               if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
-                       return false;
-       } else {
-               if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
-                       return false;
-       }
-       if (__xfs_dir3_data_check(NULL, bp))
-               return false;
-       return true;
-}
-
-/*
- * Readahead of the first block of the directory when it is opened is completely
- * oblivious to the format of the directory. Hence we can either get a block
- * format buffer or a data format buffer on readahead.
- */
-static void
-xfs_dir3_data_reada_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_dir2_data_hdr *hdr = bp->b_addr;
-
-       switch (hdr->magic) {
-       case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
-       case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
-               bp->b_ops = &xfs_dir3_block_buf_ops;
-               bp->b_ops->verify_read(bp);
-               return;
-       case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
-       case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
-               xfs_dir3_data_verify(bp);
-               return;
-       default:
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               xfs_verifier_error(bp);
-               break;
-       }
-}
-
-static void
-xfs_dir3_data_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb) &&
-            !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
-                xfs_buf_ioerror(bp, EFSBADCRC);
-       else if (!xfs_dir3_data_verify(bp))
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-       if (bp->b_error)
-               xfs_verifier_error(bp);
-}
-
-static void
-xfs_dir3_data_write_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-       struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
-
-       if (!xfs_dir3_data_verify(bp)) {
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               xfs_verifier_error(bp);
-               return;
-       }
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (bip)
-               hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-       xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
-}
-
-const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
-       .verify_read = xfs_dir3_data_read_verify,
-       .verify_write = xfs_dir3_data_write_verify,
-};
-
-static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
-       .verify_read = xfs_dir3_data_reada_verify,
-       .verify_write = xfs_dir3_data_write_verify,
-};
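
The two ops tables above pair a read verifier with a write verifier, and the readahead path re-dispatches to the block-format ops when the magic says the buffer is really block format. A reduced userspace model of that dispatch pattern; the names, magic values, and signatures here are illustrative placeholders, not the kernel API:

#include <stdio.h>
#include <stdint.h>

struct buf;
struct buf_ops {
	void (*verify_read)(struct buf *bp);
	void (*verify_write)(struct buf *bp);
};
struct buf {
	uint32_t magic;			/* already host order in this model */
	const struct buf_ops *ops;
};

#define BLOCK_MAGIC 0x42424242u		/* placeholder magic numbers */
#define DATA_MAGIC  0x44444444u

static void block_read(struct buf *bp) { (void)bp; printf("block verify\n"); }
static void data_read(struct buf *bp)  { (void)bp; printf("data verify\n"); }
static void data_write(struct buf *bp) { (void)bp; printf("write verify\n"); }

static const struct buf_ops block_ops = { block_read, data_write };
static const struct buf_ops data_ops  = { data_read,  data_write };

static void reada_verify(struct buf *bp)
{
	switch (bp->magic) {
	case BLOCK_MAGIC:
		bp->ops = &block_ops;	/* re-dispatch with the right ops */
		bp->ops->verify_read(bp);
		return;
	case DATA_MAGIC:
		bp->ops->verify_read(bp);
		return;
	default:
		printf("corrupt buffer\n");
	}
}

int main(void)
{
	struct buf bp = { BLOCK_MAGIC, &data_ops };

	reada_verify(&bp);	/* switches to block ops, then verifies */
	return 0;
}
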
-
-
-int
-xfs_dir3_data_read(
-       struct xfs_trans        *tp,
-       struct xfs_inode        *dp,
-       xfs_dablk_t             bno,
-       xfs_daddr_t             mapped_bno,
-       struct xfs_buf          **bpp)
-{
-       int                     err;
-
-       err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
-                               XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
-       if (!err && tp)
-               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
-       return err;
-}
-
-int
-xfs_dir3_data_readahead(
-       struct xfs_inode        *dp,
-       xfs_dablk_t             bno,
-       xfs_daddr_t             mapped_bno)
-{
-       return xfs_da_reada_buf(dp, bno, mapped_bno,
-                               XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops);
-}
-
-/*
- * Given a data block and an unused entry from that block,
- * return the bestfree entry if any that corresponds to it.
- */
-xfs_dir2_data_free_t *
-xfs_dir2_data_freefind(
-       struct xfs_dir2_data_hdr *hdr,          /* data block header */
-       struct xfs_dir2_data_free *bf,          /* bestfree table pointer */
-       struct xfs_dir2_data_unused *dup)       /* unused space */
-{
-       xfs_dir2_data_free_t    *dfp;           /* bestfree entry */
-       xfs_dir2_data_aoff_t    off;            /* offset value needed */
-#ifdef DEBUG
-       int                     matched;        /* matched the value */
-       int                     seenzero;       /* saw a 0 bestfree entry */
-#endif
-
-       off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
-
-#ifdef DEBUG
-       /*
-        * Validate some consistency in the bestfree table.
-        * Check order, non-overlapping entries, and if we find the
-        * one we're looking for it has to be exact.
-        */
-       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-       for (dfp = &bf[0], seenzero = matched = 0;
-            dfp < &bf[XFS_DIR2_DATA_FD_COUNT];
-            dfp++) {
-               if (!dfp->offset) {
-                       ASSERT(!dfp->length);
-                       seenzero = 1;
-                       continue;
-               }
-               ASSERT(seenzero == 0);
-               if (be16_to_cpu(dfp->offset) == off) {
-                       matched = 1;
-                       ASSERT(dfp->length == dup->length);
-               } else if (off < be16_to_cpu(dfp->offset))
-                       ASSERT(off + be16_to_cpu(dup->length) <= be16_to_cpu(dfp->offset));
-               else
-                       ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off);
-               ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length));
-               if (dfp > &bf[0])
-                       ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length));
-       }
-#endif
-       /*
-        * If this is smaller than the smallest bestfree entry,
-        * it can't be there since they're sorted.
-        */
-       if (be16_to_cpu(dup->length) <
-           be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
-               return NULL;
-       /*
-        * Look at the three bestfree entries for our guy.
-        */
-       for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
-               if (!dfp->offset)
-                       return NULL;
-               if (be16_to_cpu(dfp->offset) == off)
-                       return dfp;
-       }
-       /*
-        * Didn't find it.  This only happens if there are duplicate lengths.
-        */
-       return NULL;
-}
-
-/*
- * Insert an unused-space entry into the bestfree table.
- */
-xfs_dir2_data_free_t *                         /* entry inserted */
-xfs_dir2_data_freeinsert(
-       struct xfs_dir2_data_hdr *hdr,          /* data block pointer */
-       struct xfs_dir2_data_free *dfp,         /* bestfree table pointer */
-       struct xfs_dir2_data_unused *dup,       /* unused space */
-       int                     *loghead)       /* log the data header (out) */
-{
-       xfs_dir2_data_free_t    new;            /* new bestfree entry */
-
-       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-
-       new.length = dup->length;
-       new.offset = cpu_to_be16((char *)dup - (char *)hdr);
-
-       /*
-        * Insert at position 0, 1, or 2; or not at all.
-        */
-       if (be16_to_cpu(new.length) > be16_to_cpu(dfp[0].length)) {
-               dfp[2] = dfp[1];
-               dfp[1] = dfp[0];
-               dfp[0] = new;
-               *loghead = 1;
-               return &dfp[0];
-       }
-       if (be16_to_cpu(new.length) > be16_to_cpu(dfp[1].length)) {
-               dfp[2] = dfp[1];
-               dfp[1] = new;
-               *loghead = 1;
-               return &dfp[1];
-       }
-       if (be16_to_cpu(new.length) > be16_to_cpu(dfp[2].length)) {
-               dfp[2] = new;
-               *loghead = 1;
-               return &dfp[2];
-       }
-       return NULL;
-}
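
The insert above keeps the three-slot table sorted by descending length, shifting smaller entries down and dropping whatever falls off the end. A userspace sketch of the same logic, with host-order fields replacing the on-disk big-endian ones:

#include <stdio.h>
#include <stdint.h>

struct bestfree { uint16_t offset; uint16_t length; };

static struct bestfree *freeinsert(struct bestfree bf[3],
				   uint16_t offset, uint16_t length)
{
	struct bestfree new = { offset, length };

	/* Insert at position 0, 1, or 2; or not at all. */
	if (new.length > bf[0].length) {
		bf[2] = bf[1]; bf[1] = bf[0]; bf[0] = new;
		return &bf[0];
	}
	if (new.length > bf[1].length) {
		bf[2] = bf[1]; bf[1] = new;
		return &bf[1];
	}
	if (new.length > bf[2].length) {
		bf[2] = new;
		return &bf[2];
	}
	return NULL;		/* too small to be worth tracking */
}

int main(void)
{
	struct bestfree bf[3] = { { 64, 200 }, { 400, 80 }, { 0, 0 } };

	freeinsert(bf, 800, 120);	/* lands in slot 1 */
	for (int i = 0; i < 3; i++)
		printf("slot %d: off %d len %d\n", i, bf[i].offset,
		       bf[i].length);
	return 0;
}
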
-
-/*
- * Remove a bestfree entry from the table.
- */
-STATIC void
-xfs_dir2_data_freeremove(
-       struct xfs_dir2_data_hdr *hdr,          /* data block header */
-       struct xfs_dir2_data_free *bf,          /* bestfree table pointer */
-       struct xfs_dir2_data_free *dfp,         /* bestfree entry pointer */
-       int                     *loghead)       /* out: log data header */
-{
-
-       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-
-       /*
-        * It's the first entry, slide the next 2 up.
-        */
-       if (dfp == &bf[0]) {
-               bf[0] = bf[1];
-               bf[1] = bf[2];
-       }
-       /*
-        * It's the second entry, slide the 3rd entry up.
-        */
-       else if (dfp == &bf[1])
-               bf[1] = bf[2];
-       /*
-        * Must be the last entry.
-        */
-       else
-               ASSERT(dfp == &bf[2]);
-       /*
-        * Clear the 3rd entry, must be zero now.
-        */
-       bf[2].length = 0;
-       bf[2].offset = 0;
-       *loghead = 1;
-}
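
The companion removal slides the smaller entries up and zeroes the last slot, preserving the descending-length invariant. In miniature, using the same simplified table as the insert sketch above:

#include <stdio.h>
#include <stdint.h>

struct bestfree { uint16_t offset; uint16_t length; };

static void freeremove(struct bestfree bf[3], int slot)
{
	for (int i = slot; i < 2; i++)
		bf[i] = bf[i + 1];	/* slide the tail entries up */
	bf[2].offset = 0;
	bf[2].length = 0;		/* last slot must end up zero */
}

int main(void)
{
	struct bestfree bf[3] = { { 64, 200 }, { 800, 120 }, { 400, 80 } };

	freeremove(bf, 0);
	for (int i = 0; i < 3; i++)
		printf("slot %d: off %d len %d\n", i, bf[i].offset,
		       bf[i].length);
	return 0;
}
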
-
-/*
- * Given a data block, reconstruct its bestfree map.
- */
-void
-xfs_dir2_data_freescan(
-       struct xfs_inode        *dp,
-       struct xfs_dir2_data_hdr *hdr,
-       int                     *loghead)
-{
-       xfs_dir2_block_tail_t   *btp;           /* block tail */
-       xfs_dir2_data_entry_t   *dep;           /* active data entry */
-       xfs_dir2_data_unused_t  *dup;           /* unused data entry */
-       struct xfs_dir2_data_free *bf;
-       char                    *endp;          /* end of block's data */
-       char                    *p;             /* current entry pointer */
-       struct xfs_da_geometry  *geo = dp->i_mount->m_dir_geo;
-
-       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-
-       /*
-        * Start by clearing the table.
-        */
-       bf = dp->d_ops->data_bestfree_p(hdr);
-       memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
-       *loghead = 1;
-       /*
-        * Set up pointers.
-        */
-       p = (char *)dp->d_ops->data_entry_p(hdr);
-       if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-           hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
-               btp = xfs_dir2_block_tail_p(geo, hdr);
-               endp = (char *)xfs_dir2_block_leaf_p(btp);
-       } else
-               endp = (char *)hdr + geo->blksize;
-       /*
-        * Loop over the block's entries.
-        */
-       while (p < endp) {
-               dup = (xfs_dir2_data_unused_t *)p;
-               /*
-                * If it's a free entry, insert it.
-                */
-               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-                       ASSERT((char *)dup - (char *)hdr ==
-                              be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
-                       xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
-                       p += be16_to_cpu(dup->length);
-               }
-               /*
-                * For active entries, check their tags and skip them.
-                */
-               else {
-                       dep = (xfs_dir2_data_entry_t *)p;
-                       ASSERT((char *)dep - (char *)hdr ==
-                              be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep)));
-                       p += dp->d_ops->data_entsize(dep->namelen);
-               }
-       }
-}
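
The rescan above is just "clear the table, walk every record, re-insert each free one"; the real walk steps by entry size and validates tags along the way. A compact model of the rebuild over a flat, invented record list:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct bestfree { uint16_t offset; uint16_t length; };
struct rec { int is_free; uint16_t offset; uint16_t length; };

static void insert(struct bestfree bf[3], uint16_t off, uint16_t len)
{
	for (int i = 0; i < 3; i++) {
		if (len > bf[i].length) {
			/* Shift smaller entries down; the last one drops. */
			memmove(&bf[i + 1], &bf[i], (2 - i) * sizeof(*bf));
			bf[i].offset = off;
			bf[i].length = len;
			return;
		}
	}
}

int main(void)
{
	struct rec recs[] = {
		{ 0, 64, 24 }, { 1, 88, 40 }, { 0, 128, 32 },
		{ 1, 160, 96 }, { 1, 256, 16 },
	};
	struct bestfree bf[3];

	memset(bf, 0, sizeof(bf));	/* start by clearing the table */
	for (int i = 0; i < 5; i++)
		if (recs[i].is_free)
			insert(bf, recs[i].offset, recs[i].length);
	for (int i = 0; i < 3; i++)
		printf("slot %d: off %d len %d\n", i, bf[i].offset,
		       bf[i].length);
	return 0;
}
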
-
-/*
- * Initialize a data block at the given block number in the directory.
- * Give back the buffer for the created block.
- */
-int                                            /* error */
-xfs_dir3_data_init(
-       xfs_da_args_t           *args,          /* directory operation args */
-       xfs_dir2_db_t           blkno,          /* logical dir block number */
-       struct xfs_buf          **bpp)          /* output block buffer */
-{
-       struct xfs_buf          *bp;            /* block buffer */
-       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       xfs_dir2_data_unused_t  *dup;           /* unused entry pointer */
-       struct xfs_dir2_data_free *bf;
-       int                     error;          /* error return value */
-       int                     i;              /* bestfree index */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       int                     t;              /* temp */
-
-       dp = args->dp;
-       mp = dp->i_mount;
-       tp = args->trans;
-       /*
-        * Get the buffer set up for the block.
-        */
-       error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
-                              -1, &bp, XFS_DATA_FORK);
-       if (error)
-               return error;
-       bp->b_ops = &xfs_dir3_data_buf_ops;
-       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF);
-
-       /*
-        * Initialize the header.
-        */
-       hdr = bp->b_addr;
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
-
-               memset(hdr3, 0, sizeof(*hdr3));
-               hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
-               hdr3->blkno = cpu_to_be64(bp->b_bn);
-               hdr3->owner = cpu_to_be64(dp->i_ino);
-               uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
-
-       } else
-               hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
-
-       bf = dp->d_ops->data_bestfree_p(hdr);
-       bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset);
-       for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
-               bf[i].length = 0;
-               bf[i].offset = 0;
-       }
-
-       /*
-        * Set up an unused entry for the block's body.
-        */
-       dup = dp->d_ops->data_unused_p(hdr);
-       dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
-
-       t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset;
-       bf[0].length = cpu_to_be16(t);
-       dup->length = cpu_to_be16(t);
-       *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
-       /*
-        * Log it and return it.
-        */
-       xfs_dir2_data_log_header(args, bp);
-       xfs_dir2_data_log_unused(args, bp, dup);
-       *bpp = bp;
-       return 0;
-}
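
After init, the whole body of the block past the header is one unused entry, and bestfree slot 0 describes exactly that span. With assumed example numbers (4096-byte block, 64-byte header offset; both illustrative):

#include <stdio.h>

int main(void)
{
	unsigned blksize = 4096;	/* assumed directory block size */
	unsigned hdr = 64;		/* assumed data_entry_offset */

	/* One free span covers everything after the header. */
	printf("bf[0] = { offset %u, length %u }\n", hdr, blksize - hdr);
	return 0;
}
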
-
-/*
- * Log an active data entry from the block.
- */
-void
-xfs_dir2_data_log_entry(
-       struct xfs_da_args      *args,
-       struct xfs_buf          *bp,
-       xfs_dir2_data_entry_t   *dep)           /* data entry pointer */
-{
-       struct xfs_dir2_data_hdr *hdr = bp->b_addr;
-
-       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-
-       xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr),
-               (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) -
-                      (char *)hdr - 1));
-}
-
-/*
- * Log a data block header.
- */
-void
-xfs_dir2_data_log_header(
-       struct xfs_da_args      *args,
-       struct xfs_buf          *bp)
-{
-#ifdef DEBUG
-       struct xfs_dir2_data_hdr *hdr = bp->b_addr;
-
-       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-#endif
-
-       xfs_trans_log_buf(args->trans, bp, 0,
-                         args->dp->d_ops->data_entry_offset - 1);
-}
-
-/*
- * Log a data unused entry.
- */
-void
-xfs_dir2_data_log_unused(
-       struct xfs_da_args      *args,
-       struct xfs_buf          *bp,
-       xfs_dir2_data_unused_t  *dup)           /* data unused pointer */
-{
-       xfs_dir2_data_hdr_t     *hdr = bp->b_addr;
-
-       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-
-       /*
-        * Log the first part of the unused entry.
-        */
-       xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr),
-               (uint)((char *)&dup->length + sizeof(dup->length) -
-                      1 - (char *)hdr));
-       /*
-        * Log the end (tag) of the unused entry.
-        */
-       xfs_trans_log_buf(args->trans, bp,
-               (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr),
-               (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr +
-                      sizeof(xfs_dir2_data_off_t) - 1));
-}
-
-/*
- * Make a byte range in the data block unused.
- * Its current contents are unimportant.
- */
-void
-xfs_dir2_data_make_free(
-       struct xfs_da_args      *args,
-       struct xfs_buf          *bp,
-       xfs_dir2_data_aoff_t    offset,         /* starting byte offset */
-       xfs_dir2_data_aoff_t    len,            /* length in bytes */
-       int                     *needlogp,      /* out: log header */
-       int                     *needscanp)     /* out: regen bestfree */
-{
-       xfs_dir2_data_hdr_t     *hdr;           /* data block pointer */
-       xfs_dir2_data_free_t    *dfp;           /* bestfree pointer */
-       char                    *endptr;        /* end of data area */
-       int                     needscan;       /* need to regen bestfree */
-       xfs_dir2_data_unused_t  *newdup;        /* new unused entry */
-       xfs_dir2_data_unused_t  *postdup;       /* unused entry after us */
-       xfs_dir2_data_unused_t  *prevdup;       /* unused entry before us */
-       struct xfs_dir2_data_free *bf;
-
-       hdr = bp->b_addr;
-
-       /*
-        * Figure out where the end of the data area is.
-        */
-       if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-           hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC))
-               endptr = (char *)hdr + args->geo->blksize;
-       else {
-               xfs_dir2_block_tail_t   *btp;   /* block tail */
-
-               ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-                       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-               btp = xfs_dir2_block_tail_p(args->geo, hdr);
-               endptr = (char *)xfs_dir2_block_leaf_p(btp);
-       }
-       /*
-        * If this isn't the start of the block, then back up to
-        * the previous entry and see if it's free.
-        */
-       if (offset > args->dp->d_ops->data_entry_offset) {
-               __be16                  *tagp;  /* tag just before us */
-
-               tagp = (__be16 *)((char *)hdr + offset) - 1;
-               prevdup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
-               if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
-                       prevdup = NULL;
-       } else
-               prevdup = NULL;
-       /*
-        * If this isn't the end of the block, see if the entry after
-        * us is free.
-        */
-       if ((char *)hdr + offset + len < endptr) {
-               postdup =
-                       (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
-               if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
-                       postdup = NULL;
-       } else
-               postdup = NULL;
-       ASSERT(*needscanp == 0);
-       needscan = 0;
-       /*
-        * Previous and following entries are both free,
-        * merge everything into a single free entry.
-        */
-       bf = args->dp->d_ops->data_bestfree_p(hdr);
-       if (prevdup && postdup) {
-               xfs_dir2_data_free_t    *dfp2;  /* another bestfree pointer */
-
-               /*
-                * See if prevdup and/or postdup are in bestfree table.
-                */
-               dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
-               dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup);
-               /*
-                * We need a rescan unless there are exactly two free entries,
-                * namely our two.  In that case we know exactly what is in
-                * the table; otherwise, since the third bestfree slot is
-                * occupied, there may be further free entries it can't track.
-                */
-               needscan = (bf[2].length != 0);
-               /*
-                * Fix up the new big freespace.
-                */
-               be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
-               *xfs_dir2_data_unused_tag_p(prevdup) =
-                       cpu_to_be16((char *)prevdup - (char *)hdr);
-               xfs_dir2_data_log_unused(args, bp, prevdup);
-               if (!needscan) {
-                       /*
-                        * Has to be the case that entries 0 and 1 are
-                        * dfp and dfp2 (don't know which is which), and
-                        * entry 2 is empty.
-                        * Remove entry 1 first then entry 0.
-                        */
-                       ASSERT(dfp && dfp2);
-                       if (dfp == &bf[1]) {
-                               dfp = &bf[0];
-                               ASSERT(dfp2 == dfp);
-                               dfp2 = &bf[1];
-                       }
-                       xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp);
-                       xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
-                       /*
-                        * Now insert the new entry.
-                        */
-                       dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup,
-                                                      needlogp);
-                       ASSERT(dfp == &bf[0]);
-                       ASSERT(dfp->length == prevdup->length);
-                       ASSERT(!dfp[1].length);
-                       ASSERT(!dfp[2].length);
-               }
-       }
-       /*
-        * The entry before us is free, merge with it.
-        */
-       else if (prevdup) {
-               dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
-               be16_add_cpu(&prevdup->length, len);
-               *xfs_dir2_data_unused_tag_p(prevdup) =
-                       cpu_to_be16((char *)prevdup - (char *)hdr);
-               xfs_dir2_data_log_unused(args, bp, prevdup);
-               /*
-                * If the previous entry was in the table, the new entry
-                * is longer, so it will be in the table too.  Remove
-                * the old one and add the new one.
-                */
-               if (dfp) {
-                       xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
-                       xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp);
-               }
-               /*
-                * Otherwise we need a scan if the new entry is big enough.
-                */
-               else {
-                       needscan = be16_to_cpu(prevdup->length) >
-                                  be16_to_cpu(bf[2].length);
-               }
-       }
-       /*
-        * The following entry is free, merge with it.
-        */
-       else if (postdup) {
-               dfp = xfs_dir2_data_freefind(hdr, bf, postdup);
-               newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
-               newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
-               newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
-               *xfs_dir2_data_unused_tag_p(newdup) =
-                       cpu_to_be16((char *)newdup - (char *)hdr);
-               xfs_dir2_data_log_unused(args, bp, newdup);
-               /*
-                * If the following entry was in the table, the new entry
-                * is longer, so it will be in the table too.  Remove
-                * the old one and add the new one.
-                */
-               if (dfp) {
-                       xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
-                       xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
-               }
-               /*
-                * Otherwise we need a scan if the new entry is big enough.
-                */
-               else {
-                       needscan = be16_to_cpu(newdup->length) >
-                                  be16_to_cpu(bf[2].length);
-               }
-       }
-       /*
-        * Neither neighbor is free.  Make a new entry.
-        */
-       else {
-               newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
-               newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
-               newdup->length = cpu_to_be16(len);
-               *xfs_dir2_data_unused_tag_p(newdup) =
-                       cpu_to_be16((char *)newdup - (char *)hdr);
-               xfs_dir2_data_log_unused(args, bp, newdup);
-               xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
-       }
-       *needscanp = needscan;
-}
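
Setting aside the bestfree bookkeeping, the structural core of the function above is coalescing: freeing a range absorbs a free neighbor on either side so the block never holds two adjacent unused entries. A minimal model of the merge, using a byte map ('F' free, 'U' used) purely for illustration:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char map[32];
	int offset = 12, len = 6;	/* range being made free */
	int start = offset, end = offset + len;

	memset(map, 'U', sizeof(map));
	memset(map + 8, 'F', 4);	/* free neighbor just before */
	memset(map + 18, 'F', 5);	/* free neighbor just after */

	/* Merge backwards then forwards into one contiguous free span. */
	while (start > 0 && map[start - 1] == 'F')
		start--;
	while (end < (int)sizeof(map) && map[end] == 'F')
		end++;
	memset(map + start, 'F', end - start);

	printf("freed [%d, %d): one span of %d bytes\n", start, end,
	       end - start);
	return 0;
}
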
-
-/*
- * Take a byte range out of an existing unused space and make it un-free.
- */
-void
-xfs_dir2_data_use_free(
-       struct xfs_da_args      *args,
-       struct xfs_buf          *bp,
-       xfs_dir2_data_unused_t  *dup,           /* unused entry */
-       xfs_dir2_data_aoff_t    offset,         /* starting offset to use */
-       xfs_dir2_data_aoff_t    len,            /* length to use */
-       int                     *needlogp,      /* out: need to log header */
-       int                     *needscanp)     /* out: need regen bestfree */
-{
-       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
-       xfs_dir2_data_free_t    *dfp;           /* bestfree pointer */
-       int                     matchback;      /* matches end of freespace */
-       int                     matchfront;     /* matches start of freespace */
-       int                     needscan;       /* need to regen bestfree */
-       xfs_dir2_data_unused_t  *newdup;        /* new unused entry */
-       xfs_dir2_data_unused_t  *newdup2;       /* another new unused entry */
-       int                     oldlen;         /* old unused entry's length */
-       struct xfs_dir2_data_free *bf;
-
-       hdr = bp->b_addr;
-       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-       ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
-       ASSERT(offset >= (char *)dup - (char *)hdr);
-       ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr);
-       ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
-       /*
-        * Look up the entry in the bestfree table.
-        */
-       oldlen = be16_to_cpu(dup->length);
-       bf = args->dp->d_ops->data_bestfree_p(hdr);
-       dfp = xfs_dir2_data_freefind(hdr, bf, dup);
-       ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
-       /*
-        * Check for alignment with front and back of the entry.
-        */
-       matchfront = (char *)dup - (char *)hdr == offset;
-       matchback = (char *)dup + oldlen - (char *)hdr == offset + len;
-       ASSERT(*needscanp == 0);
-       needscan = 0;
-       /*
-        * If we matched it exactly we just need to get rid of it from
-        * the bestfree table.
-        */
-       if (matchfront && matchback) {
-               if (dfp) {
-                       needscan = (bf[2].offset != 0);
-                       if (!needscan)
-                               xfs_dir2_data_freeremove(hdr, bf, dfp,
-                                                        needlogp);
-               }
-       }
-       /*
-        * We match the first part of the entry.
-        * Make a new entry with the remaining freespace.
-        */
-       else if (matchfront) {
-               newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
-               newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
-               newdup->length = cpu_to_be16(oldlen - len);
-               *xfs_dir2_data_unused_tag_p(newdup) =
-                       cpu_to_be16((char *)newdup - (char *)hdr);
-               xfs_dir2_data_log_unused(args, bp, newdup);
-               /*
-                * If it was in the table, remove it and add the new one.
-                */
-               if (dfp) {
-                       xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
-                       dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
-                                                      needlogp);
-                       ASSERT(dfp != NULL);
-                       ASSERT(dfp->length == newdup->length);
-                       ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
-                       /*
-                        * If we were inserted at the last slot, we can't
-                        * tell whether a better candidate for that slot
-                        * exists elsewhere in the block.  Rescan.
-                        */
-                       needscan = dfp == &bf[2];
-               }
-       }
-       /*
-        * We match the last part of the entry.
-        * Trim the allocated space off the tail of the entry.
-        */
-       else if (matchback) {
-               newdup = dup;
-               newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
-               *xfs_dir2_data_unused_tag_p(newdup) =
-                       cpu_to_be16((char *)newdup - (char *)hdr);
-               xfs_dir2_data_log_unused(args, bp, newdup);
-               /*
-                * If it was in the table, remove it and add the new one.
-                */
-               if (dfp) {
-                       xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
-                       dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
-                                                      needlogp);
-                       ASSERT(dfp != NULL);
-                       ASSERT(dfp->length == newdup->length);
-                       ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
-                       /*
-                        * If we were inserted at the last slot, we can't
-                        * tell whether a better candidate for that slot
-                        * exists elsewhere in the block.  Rescan.
-                        */
-                       needscan = dfp == &bf[2];
-               }
-       }
-       /*
-        * Poking out the middle of an entry.
-        * Make two new entries.
-        */
-       else {
-               newdup = dup;
-               newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
-               *xfs_dir2_data_unused_tag_p(newdup) =
-                       cpu_to_be16((char *)newdup - (char *)hdr);
-               xfs_dir2_data_log_unused(args, bp, newdup);
-               newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
-               newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
-               newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
-               *xfs_dir2_data_unused_tag_p(newdup2) =
-                       cpu_to_be16((char *)newdup2 - (char *)hdr);
-               xfs_dir2_data_log_unused(args, bp, newdup2);
-               /*
-                * If the old entry was in the table, we need to scan
-                * if the 3rd entry was valid, since these entries
-                * are smaller than the old one.
-                * If we don't need to scan that means there were 1 or 2
-                * entries in the table, and removing the old and adding
-                * the 2 new will work.
-                */
-               if (dfp) {
-                       needscan = (bf[2].length != 0);
-                       if (!needscan) {
-                               xfs_dir2_data_freeremove(hdr, bf, dfp,
-                                                        needlogp);
-                               xfs_dir2_data_freeinsert(hdr, bf, newdup,
-                                                        needlogp);
-                               xfs_dir2_data_freeinsert(hdr, bf, newdup2,
-                                                        needlogp);
-                       }
-               }
-       }
-       *needscanp = needscan;
-}
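
The function above reduces to four cases: the allocation consumes the free extent exactly, trims its front, trims its back, or splits it in two. A plain-integer sketch of that case analysis, with invented ranges:

#include <stdio.h>

static void use_free(int dup_off, int dup_len, int offset, int len)
{
	int matchfront = (offset == dup_off);
	int matchback  = (offset + len == dup_off + dup_len);

	if (matchfront && matchback)
		printf("exact match: free extent consumed\n");
	else if (matchfront)
		printf("trim front: free [%d, %d) remains\n",
		       offset + len, dup_off + dup_len);
	else if (matchback)
		printf("trim back: free [%d, %d) remains\n",
		       dup_off, offset);
	else
		printf("split: free [%d, %d) and [%d, %d) remain\n",
		       dup_off, offset, offset + len, dup_off + dup_len);
}

int main(void)
{
	use_free(100, 80, 100, 80);	/* exact */
	use_free(100, 80, 100, 24);	/* matchfront */
	use_free(100, 80, 156, 24);	/* matchback */
	use_free(100, 80, 120, 24);	/* middle: two pieces */
	return 0;
}
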
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
deleted file mode 100644 (file)
index fb0aad4..0000000
+++ /dev/null
@@ -1,1831 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
-
-/*
- * Local function declarations.
- */
-static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
-                                   int *indexp, struct xfs_buf **dbpp);
-static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args,
-                                   struct xfs_buf *bp, int first, int last);
-static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args,
-                                  struct xfs_buf *bp);
-
-/*
- * Check the internal consistency of a leaf1 block.
- * Pop an assert if something is wrong.
- */
-#ifdef DEBUG
-#define        xfs_dir3_leaf_check(dp, bp) \
-do { \
-       if (!xfs_dir3_leaf1_check((dp), (bp))) \
-               ASSERT(0); \
-} while (0)
-
-STATIC bool
-xfs_dir3_leaf1_check(
-       struct xfs_inode        *dp,
-       struct xfs_buf          *bp)
-{
-       struct xfs_dir2_leaf    *leaf = bp->b_addr;
-       struct xfs_dir3_icleaf_hdr leafhdr;
-
-       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-
-       if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
-               struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
-               if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
-                       return false;
-       } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
-               return false;
-
-       return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
-}
-#else
-#define        xfs_dir3_leaf_check(dp, bp)
-#endif
-
-bool
-xfs_dir3_leaf_check_int(
-       struct xfs_mount        *mp,
-       struct xfs_inode        *dp,
-       struct xfs_dir3_icleaf_hdr *hdr,
-       struct xfs_dir2_leaf    *leaf)
-{
-       struct xfs_dir2_leaf_entry *ents;
-       xfs_dir2_leaf_tail_t    *ltp;
-       int                     stale;
-       int                     i;
-       const struct xfs_dir_ops *ops;
-       struct xfs_dir3_icleaf_hdr leafhdr;
-       struct xfs_da_geometry  *geo = mp->m_dir_geo;
-
-       /*
-        * We can be passed a null dp here from a verifier, so we need to go
-        * the hard way to get the directory ops.
-        */
-       ops = xfs_dir_get_ops(mp, dp);
-
-       if (!hdr) {
-               ops->leaf_hdr_from_disk(&leafhdr, leaf);
-               hdr = &leafhdr;
-       }
-
-       ents = ops->leaf_ents_p(leaf);
-       ltp = xfs_dir2_leaf_tail_p(geo, leaf);
-
-       /*
-        * XXX (dgc): This value is not restrictive enough.
-        * Should factor in the size of the bests table as well.
-        * We can deduce a value for that from di_size.
-        */
-       if (hdr->count > ops->leaf_max_ents(geo))
-               return false;
-
-       /* Leaves and bests don't overlap in leaf format. */
-       if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
-            hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
-           (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
-               return false;
-
-       /* Check hash value order, count stale entries.  */
-       for (i = stale = 0; i < hdr->count; i++) {
-               if (i + 1 < hdr->count) {
-                       if (be32_to_cpu(ents[i].hashval) >
-                                       be32_to_cpu(ents[i + 1].hashval))
-                               return false;
-               }
-               if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-                       stale++;
-       }
-       if (hdr->stale != stale)
-               return false;
-       return true;
-}
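A minimal standalone sketch of the two invariants checked above, ascending
hash values and an accurate stale count. The ent struct and NULL_ADDR below
are simplified stand-ins for the on-disk leaf entry format, not the real
XFS structures:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NULL_ADDR 0xffffffffu   /* stand-in for XFS_DIR2_NULL_DATAPTR */

struct ent { uint32_t hashval; uint32_t address; };

/* Same checks as the loop in xfs_dir3_leaf_check_int(). */
static bool leaf_check_int(const struct ent *ents, int count, int hdr_stale)
{
        int i, stale = 0;

        for (i = 0; i < count; i++) {
                if (i + 1 < count && ents[i].hashval > ents[i + 1].hashval)
                        return false;           /* hash order violated */
                if (ents[i].address == NULL_ADDR)
                        stale++;
        }
        return stale == hdr_stale;              /* header must agree */
}

int main(void)
{
        struct ent ents[] = { {1, 100}, {2, NULL_ADDR}, {2, 200}, {5, 300} };

        printf("valid: %d\n", leaf_check_int(ents, 4, 1));      /* valid: 1 */
        return 0;
}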
-
-/*
- * We verify the magic numbers before decoding the leaf header so that on debug
- * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due
- * to incorrect magic numbers.
- */
-static bool
-xfs_dir3_leaf_verify(
-       struct xfs_buf          *bp,
-       __uint16_t              magic)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_dir2_leaf    *leaf = bp->b_addr;
-
-       ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
-               __uint16_t              magic3;
-
-               magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
-                                                        : XFS_DIR3_LEAFN_MAGIC;
-
-               if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
-                       return false;
-               if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
-                       return false;
-               if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
-                       return false;
-       } else {
-               if (leaf->hdr.info.magic != cpu_to_be16(magic))
-                       return false;
-       }
-
-       return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
-}
-
-static void
-__read_verify(
-       struct xfs_buf  *bp,
-       __uint16_t      magic)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb) &&
-            !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
-               xfs_buf_ioerror(bp, EFSBADCRC);
-       else if (!xfs_dir3_leaf_verify(bp, magic))
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-       if (bp->b_error)
-               xfs_verifier_error(bp);
-}
-
-static void
-__write_verify(
-       struct xfs_buf  *bp,
-       __uint16_t      magic)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-       struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
-
-       if (!xfs_dir3_leaf_verify(bp, magic)) {
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               xfs_verifier_error(bp);
-               return;
-       }
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (bip)
-               hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-       xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
-}
-
-static void
-xfs_dir3_leaf1_read_verify(
-       struct xfs_buf  *bp)
-{
-       __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_write_verify(
-       struct xfs_buf  *bp)
-{
-       __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_read_verify(
-       struct xfs_buf  *bp)
-{
-       __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_write_verify(
-       struct xfs_buf  *bp)
-{
-       __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
-       .verify_read = xfs_dir3_leaf1_read_verify,
-       .verify_write = xfs_dir3_leaf1_write_verify,
-};
-
-const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
-       .verify_read = xfs_dir3_leafn_read_verify,
-       .verify_write = xfs_dir3_leafn_write_verify,
-};
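The read/write verifier split above follows a fixed ordering: on read the
checksum is validated before any structural checks, on write the structure
is checked first, the LSN is stamped, and the checksum is recomputed last.
A rough userspace sketch of that ordering, with a trivial additive checksum
standing in for the CRC32c that xfs_buf_{verify,update}_cksum() use:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct buf { uint8_t data[64]; uint32_t cksum; int error; };

/* Toy checksum; the real code uses CRC32c over the buffer. */
static uint32_t cksum(const struct buf *bp)
{
        uint32_t sum = 0;

        for (size_t i = 0; i < sizeof(bp->data); i++)
                sum += bp->data[i];
        return sum;
}

static int structure_ok(const struct buf *bp) { return bp->data[0] == 0x3d; }

static void read_verify(struct buf *bp)
{
        if (cksum(bp) != bp->cksum)
                bp->error = 74;                 /* EFSBADCRC */
        else if (!structure_ok(bp))
                bp->error = 117;                /* EFSCORRUPTED */
}

static void write_verify(struct buf *bp, uint64_t lsn)
{
        if (!structure_ok(bp)) {
                bp->error = 117;                /* corrupt in memory, don't write */
                return;
        }
        memcpy(&bp->data[8], &lsn, sizeof(lsn));        /* stamp the LSN... */
        bp->cksum = cksum(bp);                          /* ...checksum goes last */
}

int main(void)
{
        struct buf bp = { .data = { 0x3d } };

        write_verify(&bp, 42);
        read_verify(&bp);
        printf("error after round trip: %d\n", bp.error);       /* 0 */
        return 0;
}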
-
-static int
-xfs_dir3_leaf_read(
-       struct xfs_trans        *tp,
-       struct xfs_inode        *dp,
-       xfs_dablk_t             fbno,
-       xfs_daddr_t             mappedbno,
-       struct xfs_buf          **bpp)
-{
-       int                     err;
-
-       err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
-                               XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
-       if (!err && tp)
-               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
-       return err;
-}
-
-int
-xfs_dir3_leafn_read(
-       struct xfs_trans        *tp,
-       struct xfs_inode        *dp,
-       xfs_dablk_t             fbno,
-       xfs_daddr_t             mappedbno,
-       struct xfs_buf          **bpp)
-{
-       int                     err;
-
-       err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
-                               XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
-       if (!err && tp)
-               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
-       return err;
-}
-
-/*
- * Initialize a new leaf block, leaf1 or leafn magic accepted.
- */
-static void
-xfs_dir3_leaf_init(
-       struct xfs_mount        *mp,
-       struct xfs_trans        *tp,
-       struct xfs_buf          *bp,
-       xfs_ino_t               owner,
-       __uint16_t              type)
-{
-       struct xfs_dir2_leaf    *leaf = bp->b_addr;
-
-       ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
-
-               memset(leaf3, 0, sizeof(*leaf3));
-
-               leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC)
-                                        ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
-                                        : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
-               leaf3->info.blkno = cpu_to_be64(bp->b_bn);
-               leaf3->info.owner = cpu_to_be64(owner);
-               uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
-       } else {
-               memset(leaf, 0, sizeof(*leaf));
-               leaf->hdr.info.magic = cpu_to_be16(type);
-       }
-
-       /*
-        * If it's a leaf-format directory, initialize the tail.
-        * The caller is responsible for initializing the bests table.
-        */
-       if (type == XFS_DIR2_LEAF1_MAGIC) {
-               struct xfs_dir2_leaf_tail *ltp;
-
-               ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf);
-               ltp->bestcount = 0;
-               bp->b_ops = &xfs_dir3_leaf1_buf_ops;
-               xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF);
-       } else {
-               bp->b_ops = &xfs_dir3_leafn_buf_ops;
-               xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
-       }
-}
-
-int
-xfs_dir3_leaf_get_buf(
-       xfs_da_args_t           *args,
-       xfs_dir2_db_t           bno,
-       struct xfs_buf          **bpp,
-       __uint16_t              magic)
-{
-       struct xfs_inode        *dp = args->dp;
-       struct xfs_trans        *tp = args->trans;
-       struct xfs_mount        *mp = dp->i_mount;
-       struct xfs_buf          *bp;
-       int                     error;
-
-       ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
-       ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) &&
-              bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
-
-       error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno),
-                              -1, &bp, XFS_DATA_FORK);
-       if (error)
-               return error;
-
-       xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
-       xfs_dir3_leaf_log_header(args, bp);
-       if (magic == XFS_DIR2_LEAF1_MAGIC)
-               xfs_dir3_leaf_log_tail(args, bp);
-       *bpp = bp;
-       return 0;
-}
-
-/*
- * Convert a block form directory to a leaf form directory.
- */
-int                                            /* error */
-xfs_dir2_block_to_leaf(
-       xfs_da_args_t           *args,          /* operation arguments */
-       struct xfs_buf          *dbp)           /* input block's buffer */
-{
-       __be16                  *bestsp;        /* leaf's bestsp entries */
-       xfs_dablk_t             blkno;          /* leaf block's bno */
-       xfs_dir2_data_hdr_t     *hdr;           /* block header */
-       xfs_dir2_leaf_entry_t   *blp;           /* block's leaf entries */
-       xfs_dir2_block_tail_t   *btp;           /* block's tail */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     error;          /* error return code */
-       struct xfs_buf          *lbp;           /* leaf block's buffer */
-       xfs_dir2_db_t           ldb;            /* leaf block's bno */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       xfs_dir2_leaf_tail_t    *ltp;           /* leaf's tail */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       int                     needlog;        /* need to log block header */
-       int                     needscan;       /* need to rescan bestfree */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       struct xfs_dir2_data_free *bf;
-       struct xfs_dir2_leaf_entry *ents;
-       struct xfs_dir3_icleaf_hdr leafhdr;
-
-       trace_xfs_dir2_block_to_leaf(args);
-
-       dp = args->dp;
-       mp = dp->i_mount;
-       tp = args->trans;
-       /*
-        * Add the leaf block to the inode.
-        * This interface will only put blocks in the leaf/node range.
-        * Since that's empty now, we'll get the root (block 0 in range).
-        */
-       if ((error = xfs_da_grow_inode(args, &blkno))) {
-               return error;
-       }
-       ldb = xfs_dir2_da_to_db(args->geo, blkno);
-       ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET));
-       /*
-        * Initialize the leaf block, get a buffer for it.
-        */
-       error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC);
-       if (error)
-               return error;
-
-       leaf = lbp->b_addr;
-       hdr = dbp->b_addr;
-       xfs_dir3_data_check(dp, dbp);
-       btp = xfs_dir2_block_tail_p(args->geo, hdr);
-       blp = xfs_dir2_block_leaf_p(btp);
-       bf = dp->d_ops->data_bestfree_p(hdr);
-       ents = dp->d_ops->leaf_ents_p(leaf);
-
-       /*
-        * Set the counts in the leaf header.
-        */
-       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-       leafhdr.count = be32_to_cpu(btp->count);
-       leafhdr.stale = be32_to_cpu(btp->stale);
-       dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
-       xfs_dir3_leaf_log_header(args, lbp);
-
-       /*
-        * Could compact these but I think we always do the conversion
-        * after squeezing out stale entries.
-        */
-       memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
-       xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1);
-       needscan = 0;
-       needlog = 1;
-       /*
-        * Make the space formerly occupied by the leaf entries and block
-        * tail be free.
-        */
-       xfs_dir2_data_make_free(args, dbp,
-               (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
-               (xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize -
-                                      (char *)blp),
-               &needlog, &needscan);
-       /*
-        * Fix up the block header, make it a data block.
-        */
-       dbp->b_ops = &xfs_dir3_data_buf_ops;
-       xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF);
-       if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
-               hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
-       else
-               hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
-
-       if (needscan)
-               xfs_dir2_data_freescan(dp, hdr, &needlog);
-       /*
-        * Set up leaf tail and bests table.
-        */
-       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-       ltp->bestcount = cpu_to_be32(1);
-       bestsp = xfs_dir2_leaf_bests_p(ltp);
-       bestsp[0] = bf[0].length;
-       /*
-        * Log the data header and leaf bests table.
-        */
-       if (needlog)
-               xfs_dir2_data_log_header(args, dbp);
-       xfs_dir3_leaf_check(dp, lbp);
-       xfs_dir3_data_check(dp, dbp);
-       xfs_dir3_leaf_log_bests(args, lbp, 0, 0);
-       return 0;
-}
-
-STATIC void
-xfs_dir3_leaf_find_stale(
-       struct xfs_dir3_icleaf_hdr *leafhdr,
-       struct xfs_dir2_leaf_entry *ents,
-       int                     index,
-       int                     *lowstale,
-       int                     *highstale)
-{
-       /*
-        * Find the first stale entry before our index, if any.
-        */
-       for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) {
-               if (ents[*lowstale].address ==
-                   cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-                       break;
-       }
-
-       /*
-        * Find the first stale entry at or after our index, if any.
-        * Stop if the result would require moving more entries than using
-        * lowstale.
-        */
-       for (*highstale = index; *highstale < leafhdr->count; ++*highstale) {
-               if (ents[*highstale].address ==
-                   cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-                       break;
-               if (*lowstale >= 0 && index - *lowstale <= *highstale - index)
-                       break;
-       }
-}
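The cutoff in the second loop above is a cost bound: the upward scan stops
as soon as reusing a high stale slot would move at least as many entries as
the low slot already found. The same search on a plain int array (0 marks a
stale slot); a sketch, not the kernel code:

#include <stdio.h>

static void find_stale(const int *ents, int count, int index,
                       int *lowstale, int *highstale)
{
        /* First stale entry below the insertion point, if any. */
        for (*lowstale = index - 1; *lowstale >= 0; --*lowstale)
                if (ents[*lowstale] == 0)
                        break;

        /* First stale entry at or above it, bounded by lowstale's cost. */
        for (*highstale = index; *highstale < count; ++*highstale) {
                if (ents[*highstale] == 0)
                        break;
                if (*lowstale >= 0 && index - *lowstale <= *highstale - index)
                        break;
        }
}

int main(void)
{
        int ents[] = { 7, 0, 9, 11, 13, 0, 17 };
        int lo, hi;

        find_stale(ents, 7, 3, &lo, &hi);
        printf("lowstale=%d highstale=%d\n", lo, hi);   /* lowstale=1 highstale=5 */
        return 0;
}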
-
-struct xfs_dir2_leaf_entry *
-xfs_dir3_leaf_find_entry(
-       struct xfs_dir3_icleaf_hdr *leafhdr,
-       struct xfs_dir2_leaf_entry *ents,
-       int                     index,          /* leaf table position */
-       int                     compact,        /* need to compact leaves */
-       int                     lowstale,       /* index of prev stale leaf */
-       int                     highstale,      /* index of next stale leaf */
-       int                     *lfloglow,      /* low leaf logging index */
-       int                     *lfloghigh)     /* high leaf logging index */
-{
-       if (!leafhdr->stale) {
-               xfs_dir2_leaf_entry_t   *lep;   /* leaf entry table pointer */
-
-               /*
-                * Now we need to make room to insert the leaf entry.
-                *
-                * If there are no stale entries, just insert a hole at index.
-                */
-               lep = &ents[index];
-               if (index < leafhdr->count)
-                       memmove(lep + 1, lep,
-                               (leafhdr->count - index) * sizeof(*lep));
-
-               /*
-                * Record low and high logging indices for the leaf.
-                */
-               *lfloglow = index;
-               *lfloghigh = leafhdr->count++;
-               return lep;
-       }
-
-       /*
-        * There are stale entries.
-        *
-        * We will use one of them for the new entry.  It's probably not at
-        * the right location, so we'll have to shift some up or down first.
-        *
-        * If we didn't compact before, we need to find the nearest stale
-        * entries before and after our insertion point.
-        */
-       if (compact == 0)
-               xfs_dir3_leaf_find_stale(leafhdr, ents, index,
-                                        &lowstale, &highstale);
-
-       /*
-        * If the low one is better, use it.
-        */
-       if (lowstale >= 0 &&
-           (highstale == leafhdr->count ||
-            index - lowstale - 1 < highstale - index)) {
-               ASSERT(index - lowstale - 1 >= 0);
-               ASSERT(ents[lowstale].address ==
-                      cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
-
-               /*
-                * Copy entries up to cover the stale entry and make room
-                * for the new entry.
-                */
-               if (index - lowstale - 1 > 0) {
-                       memmove(&ents[lowstale], &ents[lowstale + 1],
-                               (index - lowstale - 1) *
-                                       sizeof(xfs_dir2_leaf_entry_t));
-               }
-               *lfloglow = MIN(lowstale, *lfloglow);
-               *lfloghigh = MAX(index - 1, *lfloghigh);
-               leafhdr->stale--;
-               return &ents[index - 1];
-       }
-
-       /*
-        * The high one is better, so use that one.
-        */
-       ASSERT(highstale - index >= 0);
-       ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
-
-       /*
-        * Copy entries down to cover the stale entry and make room for the
-        * new entry.
-        */
-       if (highstale - index > 0) {
-               memmove(&ents[index + 1], &ents[index],
-                       (highstale - index) * sizeof(xfs_dir2_leaf_entry_t));
-       }
-       *lfloglow = MIN(index, *lfloglow);
-       *lfloghigh = MAX(highstale, *lfloghigh);
-       leafhdr->stale--;
-       return &ents[index];
-}
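When the low stale slot wins, the entries between it and the insertion point
shift down one position and the new entry lands at index - 1; when the high
slot wins, they shift up and the new entry lands at index itself. The
low-side case on an int array, as a sketch:

#include <stdio.h>
#include <string.h>

int main(void)
{
        /* Slot 1 is stale (0); insert the new entry in front of index 4. */
        int ents[] = { 7, 0, 9, 11, 13 };
        int lowstale = 1, index = 4;

        /* Slide ents[lowstale+1 .. index-1] down onto the stale slot. */
        memmove(&ents[lowstale], &ents[lowstale + 1],
                (index - lowstale - 1) * sizeof(ents[0]));
        ents[index - 1] = 12;                   /* the new entry */

        for (int i = 0; i < 5; i++)
                printf("%d ", ents[i]);         /* 7 9 11 12 13 */
        printf("\n");
        return 0;
}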
-
-/*
- * Add an entry to a leaf form directory.
- */
-int                                            /* error */
-xfs_dir2_leaf_addname(
-       xfs_da_args_t           *args)          /* operation arguments */
-{
-       __be16                  *bestsp;        /* freespace table in leaf */
-       int                     compact;        /* need to compact leaves */
-       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
-       struct xfs_buf          *dbp;           /* data block buffer */
-       xfs_dir2_data_entry_t   *dep;           /* data block entry */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       xfs_dir2_data_unused_t  *dup;           /* data unused entry */
-       int                     error;          /* error return value */
-       int                     grown;          /* allocated new data block */
-       int                     highstale;      /* index of next stale leaf */
-       int                     i;              /* temporary, index */
-       int                     index;          /* leaf table position */
-       struct xfs_buf          *lbp;           /* leaf's buffer */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       int                     length;         /* length of new entry */
-       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry table pointer */
-       int                     lfloglow;       /* low leaf logging index */
-       int                     lfloghigh;      /* high leaf logging index */
-       int                     lowstale;       /* index of prev stale leaf */
-       xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail pointer */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       int                     needbytes;      /* leaf block bytes needed */
-       int                     needlog;        /* need to log data header */
-       int                     needscan;       /* need to rescan data free */
-       __be16                  *tagp;          /* end of data entry */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       xfs_dir2_db_t           use_block;      /* data block number */
-       struct xfs_dir2_data_free *bf;          /* bestfree table */
-       struct xfs_dir2_leaf_entry *ents;
-       struct xfs_dir3_icleaf_hdr leafhdr;
-
-       trace_xfs_dir2_leaf_addname(args);
-
-       dp = args->dp;
-       tp = args->trans;
-       mp = dp->i_mount;
-
-       error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
-       if (error)
-               return error;
-
-       /*
-        * Look up the entry by hash value and name.
-        * Our caller has already done a lookup, so we know it's not there.
-        * The index is therefore that of the entry to insert in front of;
-        * if there are duplicate hash values, it is the first of those.
-        */
-       index = xfs_dir2_leaf_search_hash(args, lbp);
-       leaf = lbp->b_addr;
-       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-       ents = dp->d_ops->leaf_ents_p(leaf);
-       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-       bestsp = xfs_dir2_leaf_bests_p(ltp);
-       length = dp->d_ops->data_entsize(args->namelen);
-
-       /*
-        * See if there are any entries with the same hash value
-        * and space in their block for the new entry.
-        * This is good because it puts multiple same-hash value entries
-        * in a data block, improving the lookup of those entries.
-        */
-       for (use_block = -1, lep = &ents[index];
-            index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
-            index++, lep++) {
-               if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
-                       continue;
-               i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
-               ASSERT(i < be32_to_cpu(ltp->bestcount));
-               ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF));
-               if (be16_to_cpu(bestsp[i]) >= length) {
-                       use_block = i;
-                       break;
-               }
-       }
-       /*
-        * Didn't find a block yet, linear search all the data blocks.
-        */
-       if (use_block == -1) {
-               for (i = 0; i < be32_to_cpu(ltp->bestcount); i++) {
-                       /*
-                        * Remember a block we see that's missing.
-                        */
-                       if (bestsp[i] == cpu_to_be16(NULLDATAOFF) &&
-                           use_block == -1)
-                               use_block = i;
-                       else if (be16_to_cpu(bestsp[i]) >= length) {
-                               use_block = i;
-                               break;
-                       }
-               }
-       }
-       /*
-        * How many bytes do we need in the leaf block?
-        */
-       needbytes = 0;
-       if (!leafhdr.stale)
-               needbytes += sizeof(xfs_dir2_leaf_entry_t);
-       if (use_block == -1)
-               needbytes += sizeof(xfs_dir2_data_off_t);
-
-       /*
-        * Now kill use_block if it refers to a missing block, so we
-        * can use it as an indication of allocation needed.
-        */
-       if (use_block != -1 && bestsp[use_block] == cpu_to_be16(NULLDATAOFF))
-               use_block = -1;
-       /*
-        * If we don't have enough free bytes but we can make enough
-        * by compacting out stale entries, we'll do that.
-        */
-       if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes &&
-           leafhdr.stale > 1)
-               compact = 1;
-
-       /*
-        * Otherwise if we don't have enough free bytes we need to
-        * convert to node form.
-        */
-       else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) {
-               /*
-                * Just checking or no space reservation, give up.
-                */
-               if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
-                                                       args->total == 0) {
-                       xfs_trans_brelse(tp, lbp);
-                       return XFS_ERROR(ENOSPC);
-               }
-               /*
-                * Convert to node form.
-                */
-               error = xfs_dir2_leaf_to_node(args, lbp);
-               if (error)
-                       return error;
-               /*
-                * Then add the new entry.
-                */
-               return xfs_dir2_node_addname(args);
-       }
-       /*
-        * Otherwise it will fit without compaction.
-        */
-       else
-               compact = 0;
-       /*
-        * If just checking, then it will fit unless we needed to allocate
-        * a new data block.
-        */
-       if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
-               xfs_trans_brelse(tp, lbp);
-               return use_block == -1 ? XFS_ERROR(ENOSPC) : 0;
-       }
-       /*
-        * If no allocations are allowed, return now before we've
-        * changed anything.
-        */
-       if (args->total == 0 && use_block == -1) {
-               xfs_trans_brelse(tp, lbp);
-               return XFS_ERROR(ENOSPC);
-       }
-       /*
-        * Need to compact the leaf entries, removing stale ones.
-        * Leave one stale entry behind - the one closest to our
-        * insertion index - and we'll shift that one to our insertion
-        * point later.
-        */
-       if (compact) {
-               xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
-                       &highstale, &lfloglow, &lfloghigh);
-       }
-       /*
-        * There are stale entries but no compaction was done, so seed the
-        * log-low and log-high indices with impossible values; they will
-        * be narrowed when a stale slot is actually reused.
-        */
-       else if (leafhdr.stale) {
-               lfloglow = leafhdr.count;
-               lfloghigh = -1;
-       }
-       /*
-        * If there was no data block space found, we need to allocate
-        * a new one.
-        */
-       if (use_block == -1) {
-               /*
-                * Add the new data block.
-                */
-               if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
-                               &use_block))) {
-                       xfs_trans_brelse(tp, lbp);
-                       return error;
-               }
-               /*
-                * Initialize the block.
-                */
-               if ((error = xfs_dir3_data_init(args, use_block, &dbp))) {
-                       xfs_trans_brelse(tp, lbp);
-                       return error;
-               }
-               /*
-                * If we're adding a new data block on the end we need to
-                * extend the bests table.  Copy it up one entry.
-                */
-               if (use_block >= be32_to_cpu(ltp->bestcount)) {
-                       bestsp--;
-                       memmove(&bestsp[0], &bestsp[1],
-                               be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
-                       be32_add_cpu(&ltp->bestcount, 1);
-                       xfs_dir3_leaf_log_tail(args, lbp);
-                       xfs_dir3_leaf_log_bests(args, lbp, 0,
-                                               be32_to_cpu(ltp->bestcount) - 1);
-               }
-               /*
-                * If we're filling in a previously empty block just log it.
-                */
-               else
-                       xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
-               hdr = dbp->b_addr;
-               bf = dp->d_ops->data_bestfree_p(hdr);
-               bestsp[use_block] = bf[0].length;
-               grown = 1;
-       } else {
-               /*
-                * Already had space in some data block.
-                * Just read that one in.
-                */
-               error = xfs_dir3_data_read(tp, dp,
-                                  xfs_dir2_db_to_da(args->geo, use_block),
-                                  -1, &dbp);
-               if (error) {
-                       xfs_trans_brelse(tp, lbp);
-                       return error;
-               }
-               hdr = dbp->b_addr;
-               bf = dp->d_ops->data_bestfree_p(hdr);
-               grown = 0;
-       }
-       /*
-        * Point to the biggest freespace in our data block.
-        */
-       dup = (xfs_dir2_data_unused_t *)
-             ((char *)hdr + be16_to_cpu(bf[0].offset));
-       ASSERT(be16_to_cpu(dup->length) >= length);
-       needscan = needlog = 0;
-       /*
-        * Mark the initial part of our freespace in use for the new entry.
-        */
-       xfs_dir2_data_use_free(args, dbp, dup,
-               (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
-               &needlog, &needscan);
-       /*
-        * Initialize our new entry (at last).
-        */
-       dep = (xfs_dir2_data_entry_t *)dup;
-       dep->inumber = cpu_to_be64(args->inumber);
-       dep->namelen = args->namelen;
-       memcpy(dep->name, args->name, dep->namelen);
-       dp->d_ops->data_put_ftype(dep, args->filetype);
-       tagp = dp->d_ops->data_entry_tag_p(dep);
-       *tagp = cpu_to_be16((char *)dep - (char *)hdr);
-       /*
-        * Need to rescan and fix up the bestfree table.
-        */
-       if (needscan)
-               xfs_dir2_data_freescan(dp, hdr, &needlog);
-       /*
-        * Need to log the data block's header.
-        */
-       if (needlog)
-               xfs_dir2_data_log_header(args, dbp);
-       xfs_dir2_data_log_entry(args, dbp, dep);
-       /*
-        * If the bests table needs to be changed, do it.
-        * Log the change unless we've already done that.
-        */
-       if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) {
-               bestsp[use_block] = bf[0].length;
-               if (!grown)
-                       xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
-       }
-
-       lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
-                                      highstale, &lfloglow, &lfloghigh);
-
-       /*
-        * Fill in the new leaf entry.
-        */
-       lep->hashval = cpu_to_be32(args->hashval);
-       lep->address = cpu_to_be32(
-                               xfs_dir2_db_off_to_dataptr(args->geo, use_block,
-                               be16_to_cpu(*tagp)));
-       /*
-        * Log the leaf fields and give up the buffers.
-        */
-       dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
-       xfs_dir3_leaf_log_header(args, lbp);
-       xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh);
-       xfs_dir3_leaf_check(dp, lbp);
-       xfs_dir3_data_check(dp, dbp);
-       return 0;
-}
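Block selection above runs in two phases: first prefer a data block that
already holds entries with the same hash value, then fall back to a linear
scan of the bests table for any block with enough free space (the kernel
loop also remembers the first missing block so its slot can be reused when
allocating). The fallback scan alone, sketched with made-up values; NULLOFF
stands in for NULLDATAOFF:

#include <stdint.h>
#include <stdio.h>

#define NULLOFF 0xffffu

/* Return the first block with at least `length` free bytes, else -1
 * (meaning the caller must allocate a new data block). */
static int pick_block(const uint16_t *bestsp, int bestcount, int length)
{
        int i;

        for (i = 0; i < bestcount; i++)
                if (bestsp[i] != NULLOFF && bestsp[i] >= length)
                        return i;
        return -1;
}

int main(void)
{
        uint16_t bests[] = { 24, NULLOFF, 80, 16 };

        printf("use_block=%d\n", pick_block(bests, 4, 64));     /* use_block=2 */
        return 0;
}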
-
-/*
- * Compact out any stale entries in the leaf.
- * Log the header and changed leaf entries, if any.
- */
-void
-xfs_dir3_leaf_compact(
-       xfs_da_args_t   *args,          /* operation arguments */
-       struct xfs_dir3_icleaf_hdr *leafhdr,
-       struct xfs_buf  *bp)            /* leaf buffer */
-{
-       int             from;           /* source leaf index */
-       xfs_dir2_leaf_t *leaf;          /* leaf structure */
-       int             loglow;         /* first leaf entry to log */
-       int             to;             /* target leaf index */
-       struct xfs_dir2_leaf_entry *ents;
-       struct xfs_inode *dp = args->dp;
-
-       leaf = bp->b_addr;
-       if (!leafhdr->stale)
-               return;
-
-       /*
-        * Compress out the stale entries in place.
-        */
-       ents = dp->d_ops->leaf_ents_p(leaf);
-       for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
-               if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-                       continue;
-               /*
-                * Only actually copy the entries that are different.
-                */
-               if (from > to) {
-                       if (loglow == -1)
-                               loglow = to;
-                       ents[to] = ents[from];
-               }
-               to++;
-       }
-       /*
-        * Update and log the header, log the leaf entries.
-        */
-       ASSERT(leafhdr->stale == from - to);
-       leafhdr->count -= leafhdr->stale;
-       leafhdr->stale = 0;
-
-       dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr);
-       xfs_dir3_leaf_log_header(args, bp);
-       if (loglow != -1)
-               xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1);
-}
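The compaction above is a stable in-place filter: live entries slide down
over stale ones, and loglow records the first index that actually moved so
only the dirtied range has to be logged. The same two-finger walk on an int
array (0 marks a stale entry):

#include <stdio.h>

int main(void)
{
        int ents[] = { 7, 0, 9, 0, 11 };
        int count = 5, from, to, loglow = -1;

        for (from = to = 0; from < count; from++) {
                if (ents[from] == 0)
                        continue;               /* drop stale entries */
                if (from > to) {
                        if (loglow == -1)
                                loglow = to;    /* first slot that changed */
                        ents[to] = ents[from];
                }
                to++;
        }

        printf("count=%d loglow=%d:", to, loglow);
        for (int i = 0; i < to; i++)
                printf(" %d", ents[i]);         /* count=3 loglow=1: 7 9 11 */
        printf("\n");
        return 0;
}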
-
-/*
- * Compact the leaf entries, removing stale ones.
- * Leave one stale entry behind - the one closest to our
- * insertion index - and the caller will shift that one to our insertion
- * point later.
- * Return new insertion index, where the remaining stale entry is,
- * and leaf logging indices.
- */
-void
-xfs_dir3_leaf_compact_x1(
-       struct xfs_dir3_icleaf_hdr *leafhdr,
-       struct xfs_dir2_leaf_entry *ents,
-       int             *indexp,        /* insertion index */
-       int             *lowstalep,     /* out: stale entry before us */
-       int             *highstalep,    /* out: stale entry after us */
-       int             *lowlogp,       /* out: low log index */
-       int             *highlogp)      /* out: high log index */
-{
-       int             from;           /* source copy index */
-       int             highstale;      /* stale entry at/after index */
-       int             index;          /* insertion index */
-       int             keepstale;      /* source index of kept stale */
-       int             lowstale;       /* stale entry before index */
-       int             newindex = 0;   /* new insertion index */
-       int             to;             /* destination copy index */
-
-       ASSERT(leafhdr->stale > 1);
-       index = *indexp;
-
-       xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale);
-
-       /*
-        * Pick the better of lowstale and highstale.
-        */
-       if (lowstale >= 0 &&
-           (highstale == leafhdr->count ||
-            index - lowstale <= highstale - index))
-               keepstale = lowstale;
-       else
-               keepstale = highstale;
-       /*
-        * Copy the entries in place, removing all the stale entries
-        * except keepstale.
-        */
-       for (from = to = 0; from < leafhdr->count; from++) {
-               /*
-                * Notice the new value of index.
-                */
-               if (index == from)
-                       newindex = to;
-               if (from != keepstale &&
-                   ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
-                       if (from == to)
-                               *lowlogp = to;
-                       continue;
-               }
-               /*
-                * Record the new keepstale value for the insertion.
-                */
-               if (from == keepstale)
-                       lowstale = highstale = to;
-               /*
-                * Copy only the entries that have moved.
-                */
-               if (from > to)
-                       ents[to] = ents[from];
-               to++;
-       }
-       ASSERT(from > to);
-       /*
-        * If the insertion point was past the last entry,
-        * set the new insertion point accordingly.
-        */
-       if (index == from)
-               newindex = to;
-       *indexp = newindex;
-       /*
-        * Adjust the leaf header values.
-        */
-       leafhdr->count -= from - to;
-       leafhdr->stale = 1;
-       /*
-        * Remember the low/high stale value only in the "right"
-        * direction.
-        */
-       if (lowstale >= newindex)
-               lowstale = -1;
-       else
-               highstale = leafhdr->count;
-       *highlogp = leafhdr->count - 1;
-       *lowstalep = lowstale;
-       *highstalep = highstale;
-}
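Unlike the plain compaction, compact_x1 keeps exactly one stale slot (the
one nearest the insertion point) and remaps the insertion index as entries
slide. A sketch of just the remapping, with keepstale chosen beforehand:

#include <stdio.h>

int main(void)
{
        int ents[] = { 7, 0, 9, 0, 11, 0, 13 };
        int count = 7, index = 4;       /* insert in front of the 11 */
        int keepstale = 3;              /* stale slot nearest the index */
        int from, to, newindex = 0;

        for (from = to = 0; from < count; from++) {
                if (index == from)
                        newindex = to;  /* remap the insertion point */
                if (from != keepstale && ents[from] == 0)
                        continue;       /* drop every other stale slot */
                if (from > to)
                        ents[to] = ents[from];
                to++;
        }
        if (index == from)
                newindex = to;          /* insertion was past the end */

        printf("count=%d newindex=%d\n", to, newindex); /* count=5 newindex=3 */
        return 0;
}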
-
-/*
- * Log the bests entries indicated from a leaf1 block.
- */
-static void
-xfs_dir3_leaf_log_bests(
-       struct xfs_da_args      *args,
-       struct xfs_buf          *bp,            /* leaf buffer */
-       int                     first,          /* first entry to log */
-       int                     last)           /* last entry to log */
-{
-       __be16                  *firstb;        /* pointer to first entry */
-       __be16                  *lastb;         /* pointer to last entry */
-       struct xfs_dir2_leaf    *leaf = bp->b_addr;
-       xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
-
-       ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
-              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC));
-
-       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-       firstb = xfs_dir2_leaf_bests_p(ltp) + first;
-       lastb = xfs_dir2_leaf_bests_p(ltp) + last;
-       xfs_trans_log_buf(args->trans, bp,
-               (uint)((char *)firstb - (char *)leaf),
-               (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
-}
-
-/*
- * Log the leaf entries indicated from a leaf1 or leafn block.
- */
-void
-xfs_dir3_leaf_log_ents(
-       struct xfs_da_args      *args,
-       struct xfs_buf          *bp,
-       int                     first,
-       int                     last)
-{
-       xfs_dir2_leaf_entry_t   *firstlep;      /* pointer to first entry */
-       xfs_dir2_leaf_entry_t   *lastlep;       /* pointer to last entry */
-       struct xfs_dir2_leaf    *leaf = bp->b_addr;
-       struct xfs_dir2_leaf_entry *ents;
-
-       ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
-              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
-              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
-
-       ents = args->dp->d_ops->leaf_ents_p(leaf);
-       firstlep = &ents[first];
-       lastlep = &ents[last];
-       xfs_trans_log_buf(args->trans, bp,
-               (uint)((char *)firstlep - (char *)leaf),
-               (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
-}
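Both logging helpers reduce a [first, last] entry range to a byte range in
the buffer: the offset of the first entry through the last byte of the last
one. The pointer arithmetic in isolation; the 32-entry block layout below
is invented for the example:

#include <stdint.h>
#include <stdio.h>

struct lep { uint32_t hashval; uint32_t address; };
struct leaf { uint64_t hdr; struct lep ents[32]; };

int main(void)
{
        struct leaf l;
        int first = 2, last = 4;
        struct lep *firstlep = &l.ents[first];
        struct lep *lastlep = &l.ents[last];

        /* Same computation as xfs_dir3_leaf_log_ents(). */
        unsigned int lo = (unsigned int)((char *)firstlep - (char *)&l);
        unsigned int hi = (unsigned int)((char *)lastlep - (char *)&l +
                                         sizeof(*lastlep) - 1);

        printf("log bytes [%u, %u]\n", lo, hi); /* log bytes [24, 47] */
        return 0;
}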
-
-/*
- * Log the header of the leaf1 or leafn block.
- */
-void
-xfs_dir3_leaf_log_header(
-       struct xfs_da_args      *args,
-       struct xfs_buf          *bp)
-{
-       struct xfs_dir2_leaf    *leaf = bp->b_addr;
-
-       ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
-              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
-              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
-
-       xfs_trans_log_buf(args->trans, bp,
-                         (uint)((char *)&leaf->hdr - (char *)leaf),
-                         args->dp->d_ops->leaf_hdr_size - 1);
-}
-
-/*
- * Log the tail of the leaf1 block.
- */
-STATIC void
-xfs_dir3_leaf_log_tail(
-       struct xfs_da_args      *args,
-       struct xfs_buf          *bp)
-{
-       struct xfs_dir2_leaf    *leaf = bp->b_addr;
-       xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
-
-       ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
-              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
-              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-              leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
-
-       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-       xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf),
-               (uint)(args->geo->blksize - 1));
-}
-
-/*
- * Look up the entry referred to by args in the leaf format directory.
- * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which
- * is also used by the node-format code.
- */
-int
-xfs_dir2_leaf_lookup(
-       xfs_da_args_t           *args)          /* operation arguments */
-{
-       struct xfs_buf          *dbp;           /* data block buffer */
-       xfs_dir2_data_entry_t   *dep;           /* data block entry */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     error;          /* error return code */
-       int                     index;          /* found entry index */
-       struct xfs_buf          *lbp;           /* leaf buffer */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       struct xfs_dir2_leaf_entry *ents;
-
-       trace_xfs_dir2_leaf_lookup(args);
-
-       /*
-        * Look up name in the leaf block, returning both buffers and index.
-        */
-       if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
-               return error;
-       }
-       tp = args->trans;
-       dp = args->dp;
-       xfs_dir3_leaf_check(dp, lbp);
-       leaf = lbp->b_addr;
-       ents = dp->d_ops->leaf_ents_p(leaf);
-       /*
-        * Get to the leaf entry and contained data entry address.
-        */
-       lep = &ents[index];
-
-       /*
-        * Point to the data entry.
-        */
-       dep = (xfs_dir2_data_entry_t *)
-             ((char *)dbp->b_addr +
-              xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
-       /*
-        * Return the found inode number & CI name if appropriate
-        */
-       args->inumber = be64_to_cpu(dep->inumber);
-       args->filetype = dp->d_ops->data_get_ftype(dep);
-       error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
-       xfs_trans_brelse(tp, dbp);
-       xfs_trans_brelse(tp, lbp);
-       return XFS_ERROR(error);
-}
-
-/*
- * Look up name/hash in the leaf block.
- * Fill in indexp with the found index, and dbpp with the data buffer.
- * If not found dbpp will be NULL, and ENOENT comes back.
- * lbpp will always be filled in with the leaf buffer unless there's an error.
- */
-static int                                     /* error */
-xfs_dir2_leaf_lookup_int(
-       xfs_da_args_t           *args,          /* operation arguments */
-       struct xfs_buf          **lbpp,         /* out: leaf buffer */
-       int                     *indexp,        /* out: index in leaf block */
-       struct xfs_buf          **dbpp)         /* out: data buffer */
-{
-       xfs_dir2_db_t           curdb = -1;     /* current data block number */
-       struct xfs_buf          *dbp = NULL;    /* data buffer */
-       xfs_dir2_data_entry_t   *dep;           /* data entry */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     error;          /* error return code */
-       int                     index;          /* index in leaf block */
-       struct xfs_buf          *lbp;           /* leaf buffer */
-       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       xfs_dir2_db_t           newdb;          /* new data block number */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       xfs_dir2_db_t           cidb = -1;      /* case match data block no. */
-       enum xfs_dacmp          cmp;            /* name compare result */
-       struct xfs_dir2_leaf_entry *ents;
-       struct xfs_dir3_icleaf_hdr leafhdr;
-
-       dp = args->dp;
-       tp = args->trans;
-       mp = dp->i_mount;
-
-       error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
-       if (error)
-               return error;
-
-       *lbpp = lbp;
-       leaf = lbp->b_addr;
-       xfs_dir3_leaf_check(dp, lbp);
-       ents = dp->d_ops->leaf_ents_p(leaf);
-       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-
-       /*
-        * Look for the first leaf entry with our hash value.
-        */
-       index = xfs_dir2_leaf_search_hash(args, lbp);
-       /*
-        * Loop over all the entries with the right hash value
-        * looking to match the name.
-        */
-       for (lep = &ents[index];
-            index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
-            lep++, index++) {
-               /*
-                * Skip over stale leaf entries.
-                */
-               if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
-                       continue;
-               /*
-                * Get the new data block number.
-                */
-               newdb = xfs_dir2_dataptr_to_db(args->geo,
-                                              be32_to_cpu(lep->address));
-               /*
-                * If it's not the same as the old data block number,
-                * need to pitch the old one and read the new one.
-                */
-               if (newdb != curdb) {
-                       if (dbp)
-                               xfs_trans_brelse(tp, dbp);
-                       error = xfs_dir3_data_read(tp, dp,
-                                          xfs_dir2_db_to_da(args->geo, newdb),
-                                          -1, &dbp);
-                       if (error) {
-                               xfs_trans_brelse(tp, lbp);
-                               return error;
-                       }
-                       curdb = newdb;
-               }
-               /*
-                * Point to the data entry.
-                */
-               dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr +
-                       xfs_dir2_dataptr_to_off(args->geo,
-                                               be32_to_cpu(lep->address)));
-               /*
-                * Compare name and if it's an exact match, return the index
-                * and buffer. If it's the first case-insensitive match, store
-                * the index and buffer and continue looking for an exact match.
-                */
-               cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
-               if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
-                       args->cmpresult = cmp;
-                       *indexp = index;
-                       /* case exact match: return the current buffer. */
-                       if (cmp == XFS_CMP_EXACT) {
-                               *dbpp = dbp;
-                               return 0;
-                       }
-                       cidb = curdb;
-               }
-       }
-       ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-       /*
-        * Here, we can only be doing a lookup (not a rename or remove).
-        * If a case-insensitive match was found earlier, re-read the
-        * appropriate data block if required and return it.
-        */
-       if (args->cmpresult == XFS_CMP_CASE) {
-               ASSERT(cidb != -1);
-               if (cidb != curdb) {
-                       xfs_trans_brelse(tp, dbp);
-                       error = xfs_dir3_data_read(tp, dp,
-                                          xfs_dir2_db_to_da(args->geo, cidb),
-                                          -1, &dbp);
-                       if (error) {
-                               xfs_trans_brelse(tp, lbp);
-                               return error;
-                       }
-               }
-               *dbpp = dbp;
-               return 0;
-       }
-       /*
-        * No match found, return ENOENT.
-        */
-       ASSERT(cidb == -1);
-       if (dbp)
-               xfs_trans_brelse(tp, dbp);
-       xfs_trans_brelse(tp, lbp);
-       return XFS_ERROR(ENOENT);
-}
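The loop above encodes the case-insensitive lookup policy: an exact match
returns immediately, while the first case-only match is remembered and used
only if no exact match follows. A self-contained sketch of that policy; the
three-way compname() here is a plain ASCII comparison, not the XFS one:

#include <ctype.h>
#include <stdio.h>
#include <string.h>

enum cmp { DIFFERENT, CASE, EXACT };

static enum cmp compname(const char *want, const char *have)
{
        size_t i, len = strlen(want);

        if (len != strlen(have))
                return DIFFERENT;
        for (i = 0; i < len; i++)
                if (tolower((unsigned char)want[i]) !=
                    tolower((unsigned char)have[i]))
                        return DIFFERENT;
        return strcmp(want, have) == 0 ? EXACT : CASE;
}

/* Index of the best match: exact beats case-insensitive, -1 if neither. */
static int lookup(const char *want, const char **names, int count)
{
        int i, ci = -1;

        for (i = 0; i < count; i++) {
                enum cmp cmp = compname(want, names[i]);

                if (cmp == EXACT)
                        return i;       /* exact match: stop looking */
                if (cmp == CASE && ci == -1)
                        ci = i;         /* remember the first CI match */
        }
        return ci;
}

int main(void)
{
        const char *names[] = { "README", "readme", "Readme.md" };

        printf("match=%d\n", lookup("readme", names, 3));       /* match=1 */
        return 0;
}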
-
-/*
- * Remove an entry from a leaf format directory.
- */
-int                                            /* error */
-xfs_dir2_leaf_removename(
-       xfs_da_args_t           *args)          /* operation arguments */
-{
-       __be16                  *bestsp;        /* leaf block best freespace */
-       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
-       xfs_dir2_db_t           db;             /* data block number */
-       struct xfs_buf          *dbp;           /* data block buffer */
-       xfs_dir2_data_entry_t   *dep;           /* data entry structure */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     error;          /* error return code */
-       xfs_dir2_db_t           i;              /* temporary data block # */
-       int                     index;          /* index into leaf entries */
-       struct xfs_buf          *lbp;           /* leaf buffer */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
-       xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       int                     needlog;        /* need to log data header */
-       int                     needscan;       /* need to rescan data frees */
-       xfs_dir2_data_off_t     oldbest;        /* old value of best free */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       struct xfs_dir2_data_free *bf;          /* bestfree table */
-       struct xfs_dir2_leaf_entry *ents;
-       struct xfs_dir3_icleaf_hdr leafhdr;
-
-       trace_xfs_dir2_leaf_removename(args);
-
-       /*
-        * Lookup the leaf entry, get the leaf and data blocks read in.
-        */
-       if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
-               return error;
-       }
-       dp = args->dp;
-       tp = args->trans;
-       mp = dp->i_mount;
-       leaf = lbp->b_addr;
-       hdr = dbp->b_addr;
-       xfs_dir3_data_check(dp, dbp);
-       bf = dp->d_ops->data_bestfree_p(hdr);
-       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-       ents = dp->d_ops->leaf_ents_p(leaf);
-       /*
-        * Point to the leaf entry, use that to point to the data entry.
-        */
-       lep = &ents[index];
-       db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
-       dep = (xfs_dir2_data_entry_t *)((char *)hdr +
-               xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
-       needscan = needlog = 0;
-       oldbest = be16_to_cpu(bf[0].length);
-       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-       bestsp = xfs_dir2_leaf_bests_p(ltp);
-       ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
-       /*
-        * Mark the former data entry unused.
-        */
-       xfs_dir2_data_make_free(args, dbp,
-               (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
-               dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
-       /*
-        * We just mark the leaf entry stale by putting a null in it.
-        */
-       leafhdr.stale++;
-       dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
-       xfs_dir3_leaf_log_header(args, lbp);
-
-       lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
-       xfs_dir3_leaf_log_ents(args, lbp, index, index);
-
-       /*
-        * Scan the freespace in the data block again if necessary,
-        * log the data block header if necessary.
-        */
-       if (needscan)
-               xfs_dir2_data_freescan(dp, hdr, &needlog);
-       if (needlog)
-               xfs_dir2_data_log_header(args, dbp);
-       /*
-        * If the longest freespace in the data block has changed,
-        * put the new value in the bests table and log that.
-        */
-       if (be16_to_cpu(bf[0].length) != oldbest) {
-               bestsp[db] = bf[0].length;
-               xfs_dir3_leaf_log_bests(args, lbp, db, db);
-       }
-       xfs_dir3_data_check(dp, dbp);
-       /*
-        * If the data block is now empty then get rid of the data block.
-        */
-       if (be16_to_cpu(bf[0].length) ==
-                       args->geo->blksize - dp->d_ops->data_entry_offset) {
-               ASSERT(db != args->geo->datablk);
-               if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
-                       /*
-                        * We can't get rid of the block: shrinking would
-                        * have required allocating a bmap btree block.
-                        * Just go on, returning success and leaving the
-                        * empty block in place.
-                        */
-                       if (error == ENOSPC && args->total == 0)
-                               error = 0;
-                       xfs_dir3_leaf_check(dp, lbp);
-                       return error;
-               }
-               dbp = NULL;
-               /*
-                * If this is the last data block then compact the
-                * bests table by getting rid of entries.
-                */
-               if (db == be32_to_cpu(ltp->bestcount) - 1) {
-                       /*
-                        * Look for the last active entry (i).
-                        */
-                       for (i = db - 1; i > 0; i--) {
-                               if (bestsp[i] != cpu_to_be16(NULLDATAOFF))
-                                       break;
-                       }
-                       /*
-                        * Copy the table down so inactive entries at the
-                        * end are removed.
-                        */
-                       memmove(&bestsp[db - i], bestsp,
-                               (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
-                       be32_add_cpu(&ltp->bestcount, -(db - i));
-                       xfs_dir3_leaf_log_tail(args, lbp);
-                       xfs_dir3_leaf_log_bests(args, lbp, 0,
-                                               be32_to_cpu(ltp->bestcount) - 1);
-               } else
-                       bestsp[db] = cpu_to_be16(NULLDATAOFF);
-       }
-       /*
-        * If the data block was not the first one, drop it.
-        */
-       else if (db != args->geo->datablk)
-               dbp = NULL;
-
-       xfs_dir3_leaf_check(dp, lbp);
-       /*
-        * See if we can convert to block form.
-        */
-       return xfs_dir2_leaf_to_block(args, lbp, dbp);
-}
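The bests table sits at the end of the leaf block and grows downward, so
both growing it and trimming trailing NULLDATAOFF entries are memmoves
against a base that shifts with the count. A sketch with a moving pointer
standing in for the base that the kernel recomputes from the leaf tail:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NULLOFF 0xffffu         /* stand-in for NULLDATAOFF */

int main(void)
{
        uint16_t block[8];
        uint16_t *bestsp = &block[8];   /* table base, just past the end */
        int bestcount = 0;

        /* Grow: add four blocks' bestfree values, base moving down. */
        for (int v = 0; v < 4; v++) {
                bestsp--;
                memmove(&bestsp[0], &bestsp[1], bestcount * sizeof(*bestsp));
                bestsp[bestcount++] = (uint16_t)(10 * (v + 1));
        }
        bestsp[1] = NULLOFF;            /* block 1 goes missing */
        bestsp[3] = NULLOFF;            /* the last block goes missing */

        /* Trim: last active index is i = 2, so drop db - i = 1 entry. */
        int db = 3, i = 2;
        memmove(&bestsp[db - i], bestsp,
                (bestcount - (db - i)) * sizeof(*bestsp));
        bestsp += db - i;
        bestcount -= db - i;

        for (int j = 0; j < bestcount; j++)
                printf("%u ", bestsp[j]);       /* 10 65535 30 */
        printf("\n");
        return 0;
}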
-
-/*
- * Replace the inode number in a leaf format directory entry.
- */
-int                                            /* error */
-xfs_dir2_leaf_replace(
-       xfs_da_args_t           *args)          /* operation arguments */
-{
-       struct xfs_buf          *dbp;           /* data block buffer */
-       xfs_dir2_data_entry_t   *dep;           /* data block entry */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     error;          /* error return code */
-       int                     index;          /* index of leaf entry */
-       struct xfs_buf          *lbp;           /* leaf buffer */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       struct xfs_dir2_leaf_entry *ents;
-
-       trace_xfs_dir2_leaf_replace(args);
-
-       /*
-        * Look up the entry.
-        */
-       if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
-               return error;
-       }
-       dp = args->dp;
-       leaf = lbp->b_addr;
-       ents = dp->d_ops->leaf_ents_p(leaf);
-       /*
-        * Point to the leaf entry, get data address from it.
-        */
-       lep = &ents[index];
-       /*
-        * Point to the data entry.
-        */
-       dep = (xfs_dir2_data_entry_t *)
-             ((char *)dbp->b_addr +
-              xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
-       ASSERT(args->inumber != be64_to_cpu(dep->inumber));
-       /*
-        * Put the new inode number in, log it.
-        */
-       dep->inumber = cpu_to_be64(args->inumber);
-       dp->d_ops->data_put_ftype(dep, args->filetype);
-       tp = args->trans;
-       xfs_dir2_data_log_entry(args, dbp, dep);
-       xfs_dir3_leaf_check(dp, lbp);
-       xfs_trans_brelse(tp, lbp);
-       return 0;
-}
-
-/*
- * Return the index in the leaf block (lbp) of either the first entry
- * with this hash value or, if there are none, the insert point for
- * that hash value.
- */
-int                                            /* index value */
-xfs_dir2_leaf_search_hash(
-       xfs_da_args_t           *args,          /* operation arguments */
-       struct xfs_buf          *lbp)           /* leaf buffer */
-{
-       xfs_dahash_t            hash=0;         /* hash from this entry */
-       xfs_dahash_t            hashwant;       /* hash value looking for */
-       int                     high;           /* high leaf index */
-       int                     low;            /* low leaf index */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
-       int                     mid=0;          /* current leaf index */
-       struct xfs_dir2_leaf_entry *ents;
-       struct xfs_dir3_icleaf_hdr leafhdr;
-
-       leaf = lbp->b_addr;
-       ents = args->dp->d_ops->leaf_ents_p(leaf);
-       args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-
-       /*
-        * Note, the table cannot be empty, so we have to go through the loop.
-        * Binary search the leaf entries looking for our hash value.
-        */
-       for (lep = ents, low = 0, high = leafhdr.count - 1,
-               hashwant = args->hashval;
-            low <= high; ) {
-               mid = (low + high) >> 1;
-               if ((hash = be32_to_cpu(lep[mid].hashval)) == hashwant)
-                       break;
-               if (hash < hashwant)
-                       low = mid + 1;
-               else
-                       high = mid - 1;
-       }
-       /*
-        * Found one, back up through all the equal hash values.
-        */
-       if (hash == hashwant) {
-               while (mid > 0 && be32_to_cpu(lep[mid - 1].hashval) == hashwant) {
-                       mid--;
-               }
-       }
-       /*
-        * Need to point to an entry higher than ours.
-        */
-       else if (hash < hashwant)
-               mid++;
-       return mid;
-}
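
For reference, the back-up-to-first-duplicate search above can be modeled standalone. A minimal sketch, assuming a non-empty array of plain unsigned hash values in place of the on-disk be32 entries (all names here are hypothetical):

        #include <stdio.h>

        /*
         * Binary search a sorted array, then back up to the first
         * duplicate, or return the insertion point when the value is
         * absent.  The array is assumed non-empty, as the comment in the
         * real code notes for its table.
         */
        static int
        hash_search(const unsigned int *hashes, int count, unsigned int want)
        {
                int             low = 0;
                int             high = count - 1;
                int             mid = 0;
                unsigned int    hash = 0;

                while (low <= high) {
                        mid = (low + high) >> 1;
                        hash = hashes[mid];
                        if (hash == want)
                                break;
                        if (hash < want)
                                low = mid + 1;
                        else
                                high = mid - 1;
                }
                if (hash == want) {
                        /* back up through all the equal hash values */
                        while (mid > 0 && hashes[mid - 1] == want)
                                mid--;
                } else if (hash < want)
                        mid++;
                return mid;
        }

        int
        main(void)
        {
                unsigned int    h[] = { 10, 20, 20, 20, 30 };

                printf("%d\n", hash_search(h, 5, 20));  /* first 20 -> 1 */
                printf("%d\n", hash_search(h, 5, 25));  /* insert point -> 4 */
                return 0;
        }
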
-
-/*
- * Trim off a trailing data block.  We know it's empty since the leaf
- * freespace table says so.
- */
-int                                            /* error */
-xfs_dir2_leaf_trim_data(
-       xfs_da_args_t           *args,          /* operation arguments */
-       struct xfs_buf          *lbp,           /* leaf buffer */
-       xfs_dir2_db_t           db)             /* data block number */
-{
-       __be16                  *bestsp;        /* leaf bests table */
-       struct xfs_buf          *dbp;           /* data block buffer */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     error;          /* error return value */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       xfs_trans_t             *tp;            /* transaction pointer */
-
-       dp = args->dp;
-       mp = dp->i_mount;
-       tp = args->trans;
-       /*
-        * Read the offending data block.  We need its buffer.
-        */
-       error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db),
-                                  -1, &dbp);
-       if (error)
-               return error;
-
-       leaf = lbp->b_addr;
-       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-
-#ifdef DEBUG
-{
-       struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
-       struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr);
-
-       ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-              hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
-       ASSERT(be16_to_cpu(bf[0].length) ==
-              args->geo->blksize - dp->d_ops->data_entry_offset);
-       ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
-}
-#endif
-
-       /*
-        * Get rid of the data block.
-        */
-       if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
-               ASSERT(error != ENOSPC);
-               xfs_trans_brelse(tp, dbp);
-               return error;
-       }
-       /*
-        * Eliminate the last bests entry from the table.
-        */
-       bestsp = xfs_dir2_leaf_bests_p(ltp);
-       be32_add_cpu(&ltp->bestcount, -1);
-       memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
-       xfs_dir3_leaf_log_tail(args, lbp);
-       xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
-       return 0;
-}
-
-static inline size_t
-xfs_dir3_leaf_size(
-       struct xfs_dir3_icleaf_hdr      *hdr,
-       int                             counts)
-{
-       int     entries;
-       int     hdrsize;
-
-       entries = hdr->count - hdr->stale;
-       if (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
-           hdr->magic == XFS_DIR2_LEAFN_MAGIC)
-               hdrsize = sizeof(struct xfs_dir2_leaf_hdr);
-       else
-               hdrsize = sizeof(struct xfs_dir3_leaf_hdr);
-
-       return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t)
-                      + counts * sizeof(xfs_dir2_data_off_t)
-                      + sizeof(xfs_dir2_leaf_tail_t);
-}
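
The fit test this size computation feeds can be worked through with invented numbers; the 16/8/2/4 byte figures below are assumptions for the sketch, not the real on-disk sizes:

        #include <stdio.h>

        /*
         * Worked instance of the leaf-size arithmetic:
         * header + live entries + bests table + tail.
         */
        int
        main(void)
        {
                int     hdrsize = 16;           /* leaf header size (assumed) */
                int     entsize = 8;            /* one leaf entry (assumed) */
                int     bestsize = 2;           /* one bests entry (assumed) */
                int     tailsize = 4;           /* leaf tail (assumed) */
                int     count = 100, stale = 10, bestcount = 20;
                int     blksize = 4096;
                int     need;

                /* stale entries are compacted away, so only live ones count */
                need = hdrsize + (count - stale) * entsize +
                       bestcount * bestsize + tailsize;
                printf("need %d bytes: %s a %d byte block\n", need,
                       need <= blksize ? "fits in" : "does not fit in", blksize);
                return 0;
        }
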
-
-/*
- * Convert node form directory to leaf form directory.
- * The root of the node form dir needs to already be a LEAFN block.
- * Just return if we can't do anything.
- */
-int                                            /* error */
-xfs_dir2_node_to_leaf(
-       xfs_da_state_t          *state)         /* directory operation state */
-{
-       xfs_da_args_t           *args;          /* operation arguments */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     error;          /* error return code */
-       struct xfs_buf          *fbp;           /* buffer for freespace block */
-       xfs_fileoff_t           fo;             /* freespace file offset */
-       xfs_dir2_free_t         *free;          /* freespace structure */
-       struct xfs_buf          *lbp;           /* buffer for leaf block */
-       xfs_dir2_leaf_tail_t    *ltp;           /* tail of leaf structure */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       int                     rval;           /* successful free trim? */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       struct xfs_dir3_icleaf_hdr leafhdr;
-       struct xfs_dir3_icfree_hdr freehdr;
-
-       /*
-        * There's more than a leaf level in the btree, so there must
-        * be multiple leafn blocks.  Give up.
-        */
-       if (state->path.active > 1)
-               return 0;
-       args = state->args;
-
-       trace_xfs_dir2_node_to_leaf(args);
-
-       mp = state->mp;
-       dp = args->dp;
-       tp = args->trans;
-       /*
-        * Get the last offset in the file.
-        */
-       if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) {
-               return error;
-       }
-       fo -= args->geo->fsbcount;
-       /*
-        * If there are freespace blocks other than the first one,
-        * take this opportunity to remove trailing empty freespace blocks
-        * that may have been left behind during no-space-reservation
-        * operations.
-        */
-       while (fo > args->geo->freeblk) {
-               if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
-                       return error;
-               }
-               if (rval)
-                       fo -= args->geo->fsbcount;
-               else
-                       return 0;
-       }
-       /*
-        * Now find the block just before the freespace block.
-        */
-       if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
-               return error;
-       }
-       /*
-        * If it's not the single leaf block, give up.
-        */
-       if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize)
-               return 0;
-       lbp = state->path.blk[0].bp;
-       leaf = lbp->b_addr;
-       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-
-       ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
-              leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
-
-       /*
-        * Read the freespace block.
-        */
-       error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp);
-       if (error)
-               return error;
-       free = fbp->b_addr;
-       dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
-       ASSERT(!freehdr.firstdb);
-
-       /*
-        * Now see if the leafn and free data will fit in a leaf1.
-        * If not, release the buffer and give up.
-        */
-       if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) {
-               xfs_trans_brelse(tp, fbp);
-               return 0;
-       }
-
-       /*
-        * If the leaf has any stale entries in it, compress them out.
-        */
-       if (leafhdr.stale)
-               xfs_dir3_leaf_compact(args, &leafhdr, lbp);
-
-       lbp->b_ops = &xfs_dir3_leaf1_buf_ops;
-       xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF);
-       leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC)
-                                       ? XFS_DIR2_LEAF1_MAGIC
-                                       : XFS_DIR3_LEAF1_MAGIC;
-
-       /*
-        * Set up the leaf tail from the freespace block.
-        */
-       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-       ltp->bestcount = cpu_to_be32(freehdr.nvalid);
-
-       /*
-        * Set up the leaf bests table.
-        */
-       memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free),
-               freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
-
-       dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
-       xfs_dir3_leaf_log_header(args, lbp);
-       xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
-       xfs_dir3_leaf_log_tail(args, lbp);
-       xfs_dir3_leaf_check(dp, lbp);
-
-       /*
-        * Get rid of the freespace block.
-        */
-       error = xfs_dir2_shrink_inode(args,
-                       xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET),
-                       fbp);
-       if (error) {
-               /*
-                * Shrinking can't return ENOSPC here: that can only
-                * happen when punching out the middle of an extent,
-                * and this is an isolated block.
-                */
-               ASSERT(error != ENOSPC);
-               return error;
-       }
-       fbp = NULL;
-       /*
-        * Now see if we can convert the single-leaf directory
-        * down to a block form directory.
-        * This routine always kills the dabuf for the leaf, so
-        * eliminate it from the path.
-        */
-       error = xfs_dir2_leaf_to_block(args, lbp, NULL);
-       state->path.blk[0].bp = NULL;
-       return error;
-}
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
deleted file mode 100644 (file)
index da43d30..0000000
+++ /dev/null
@@ -1,2284 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
-
-/*
- * Function declarations.
- */
-static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args,
-                             int index);
-static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
-                                    xfs_da_state_blk_t *blk1,
-                                    xfs_da_state_blk_t *blk2);
-static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
-                                int index, xfs_da_state_blk_t *dblk,
-                                int *rval);
-static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
-                                    xfs_da_state_blk_t *fblk);
-
-/*
- * Check internal consistency of a leafn block.
- */
-#ifdef DEBUG
-#define        xfs_dir3_leaf_check(dp, bp) \
-do { \
-       if (!xfs_dir3_leafn_check((dp), (bp))) \
-               ASSERT(0); \
-} while (0)
-
-static bool
-xfs_dir3_leafn_check(
-       struct xfs_inode        *dp,
-       struct xfs_buf          *bp)
-{
-       struct xfs_dir2_leaf    *leaf = bp->b_addr;
-       struct xfs_dir3_icleaf_hdr leafhdr;
-
-       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-
-       if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
-               struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
-               if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
-                       return false;
-       } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
-               return false;
-
-       return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
-}
-#else
-#define        xfs_dir3_leaf_check(dp, bp)
-#endif
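
The do { ... } while (0) wrapper (with no trailing semicolon in the definition, as fixed above) is what lets the macro behave as a single statement. A small standalone illustration, where the hypothetical check_leaf() stands in for the debug check:

        #include <assert.h>
        #include <stdio.h>

        /*
         * With the do/while wrapper the invocation takes its own ';',
         * so the macro parses correctly in an if/else.  If the
         * definition itself ended in ';', the else below would not
         * compile.
         */
        #define check_leaf(ok) \
        do { \
                if (!(ok)) \
                        assert(0); \
        } while (0)

        int
        main(void)
        {
                int     valid = 1;

                if (valid)
                        check_leaf(valid);      /* the ';' ends the statement */
                else
                        printf("skipped\n");
                return 0;
        }
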
-
-static bool
-xfs_dir3_free_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_dir2_free_hdr *hdr = bp->b_addr;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
-
-               if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
-                       return false;
-               if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
-                       return false;
-               if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
-                       return false;
-       } else {
-               if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
-                       return false;
-       }
-
-       /* XXX: should bounds check the xfs_dir3_icfree_hdr here */
-
-       return true;
-}
-
-static void
-xfs_dir3_free_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb) &&
-           !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
-               xfs_buf_ioerror(bp, EFSBADCRC);
-       else if (!xfs_dir3_free_verify(bp))
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-       if (bp->b_error)
-               xfs_verifier_error(bp);
-}
-
-static void
-xfs_dir3_free_write_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-       struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
-
-       if (!xfs_dir3_free_verify(bp)) {
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               xfs_verifier_error(bp);
-               return;
-       }
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (bip)
-               hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-       xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
-}
-
-const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
-       .verify_read = xfs_dir3_free_read_verify,
-       .verify_write = xfs_dir3_free_write_verify,
-};
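
The ordering these two verifiers follow, checksum before structure on reads; structure checks, LSN stamp, then checksum last on writes, can be sketched with an invented block layout and a toy checksum. Nothing below is the real XFS on-disk format:

        #include <stdint.h>
        #include <stdio.h>

        struct blk {
                uint32_t        magic;
                uint32_t        cksum;
                uint64_t        lsn;
        };

        /* toy checksum covering everything but the cksum field itself */
        static uint32_t
        toy_cksum(const struct blk *b)
        {
                return b->magic * 31 + (uint32_t)b->lsn;
        }

        static int
        read_verify(const struct blk *b, uint32_t magic)
        {
                if (b->cksum != toy_cksum(b))
                        return -1;              /* EFSBADCRC analogue */
                if (b->magic != magic)
                        return -2;              /* EFSCORRUPTED analogue */
                return 0;
        }

        static void
        write_verify(struct blk *b, uint64_t lsn)
        {
                /* structure checks would go here, then stamp and re-checksum */
                b->lsn = lsn;
                b->cksum = toy_cksum(b);
        }

        int
        main(void)
        {
                struct blk      b = { .magic = 0x58443346 };    /* assumed magic */

                write_verify(&b, 7);
                printf("%d\n", read_verify(&b, 0x58443346));    /* 0: clean */
                b.lsn++;                                        /* corrupt it */
                printf("%d\n", read_verify(&b, 0x58443346));    /* -1: bad crc */
                return 0;
        }
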
-
-
-static int
-__xfs_dir3_free_read(
-       struct xfs_trans        *tp,
-       struct xfs_inode        *dp,
-       xfs_dablk_t             fbno,
-       xfs_daddr_t             mappedbno,
-       struct xfs_buf          **bpp)
-{
-       int                     err;
-
-       err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
-                               XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
-
-       /* a try-read that lands in a hole returns no error and a NULL *bpp */
-       if (!err && tp && *bpp)
-               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
-       return err;
-}
-
-int
-xfs_dir2_free_read(
-       struct xfs_trans        *tp,
-       struct xfs_inode        *dp,
-       xfs_dablk_t             fbno,
-       struct xfs_buf          **bpp)
-{
-       return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp);
-}
-
-static int
-xfs_dir2_free_try_read(
-       struct xfs_trans        *tp,
-       struct xfs_inode        *dp,
-       xfs_dablk_t             fbno,
-       struct xfs_buf          **bpp)
-{
-       return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp);
-}
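
The mapped-block convention the two wrappers encode (-1 meaning the block must exist, -2 meaning a hole is not an error, just no buffer) can be modeled in isolation. Block numbers, the hole rule, and the error value below are all invented:

        #include <stdio.h>

        #define MUST_EXIST      (-1)
        #define TRY_READ        (-2)

        static int
        demo_read(int blkno, int mode, int *bufp)
        {
                int     is_hole = (blkno % 2) != 0;     /* pretend odd = hole */

                if (is_hole) {
                        *bufp = 0;                      /* no buffer returned */
                        return mode == TRY_READ ? 0 : -5;       /* EIO-ish */
                }
                *bufp = 1;                              /* pretend buffer */
                return 0;
        }

        int
        main(void)
        {
                int     buf;

                printf("read hole: %d\n", demo_read(3, MUST_EXIST, &buf));
                printf("try  hole: %d buf=%d\n",
                       demo_read(3, TRY_READ, &buf), buf);
                return 0;
        }
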
-
-static int
-xfs_dir3_free_get_buf(
-       xfs_da_args_t           *args,
-       xfs_dir2_db_t           fbno,
-       struct xfs_buf          **bpp)
-{
-       struct xfs_trans        *tp = args->trans;
-       struct xfs_inode        *dp = args->dp;
-       struct xfs_mount        *mp = dp->i_mount;
-       struct xfs_buf          *bp;
-       int                     error;
-       struct xfs_dir3_icfree_hdr hdr;
-
-       error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno),
-                                  -1, &bp, XFS_DATA_FORK);
-       if (error)
-               return error;
-
-       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF);
-       bp->b_ops = &xfs_dir3_free_buf_ops;
-
-       /*
-        * Initialize the new block to be empty, and set up the
-        * in-core header to match.
-        */
-       memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
-       memset(&hdr, 0, sizeof(hdr));
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
-
-               hdr.magic = XFS_DIR3_FREE_MAGIC;
-
-               hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
-               hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
-               uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
-       } else
-               hdr.magic = XFS_DIR2_FREE_MAGIC;
-       dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
-       *bpp = bp;
-       return 0;
-}
-
-/*
- * Log entries from a freespace block.
- */
-STATIC void
-xfs_dir2_free_log_bests(
-       struct xfs_da_args      *args,
-       struct xfs_buf          *bp,
-       int                     first,          /* first entry to log */
-       int                     last)           /* last entry to log */
-{
-       xfs_dir2_free_t         *free;          /* freespace structure */
-       __be16                  *bests;
-
-       free = bp->b_addr;
-       bests = args->dp->d_ops->free_bests_p(free);
-       ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
-              free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
-       xfs_trans_log_buf(args->trans, bp,
-               (uint)((char *)&bests[first] - (char *)free),
-               (uint)((char *)&bests[last] - (char *)free +
-                      sizeof(bests[0]) - 1));
-}
-
-/*
- * Log header from a freespace block.
- */
-static void
-xfs_dir2_free_log_header(
-       struct xfs_da_args      *args,
-       struct xfs_buf          *bp)
-{
-#ifdef DEBUG
-       xfs_dir2_free_t         *free;          /* freespace structure */
-
-       free = bp->b_addr;
-       ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
-              free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
-#endif
-       xfs_trans_log_buf(args->trans, bp, 0,
-                         args->dp->d_ops->free_hdr_size - 1);
-}
-
-/*
- * Convert a leaf-format directory to a node-format directory.
- * We need to change the magic number of the leaf block, and copy
- * the freespace table out of the leaf block into its own block.
- */
-int                                            /* error */
-xfs_dir2_leaf_to_node(
-       xfs_da_args_t           *args,          /* operation arguments */
-       struct xfs_buf          *lbp)           /* leaf buffer */
-{
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     error;          /* error return value */
-       struct xfs_buf          *fbp;           /* freespace buffer */
-       xfs_dir2_db_t           fdb;            /* freespace block number */
-       xfs_dir2_free_t         *free;          /* freespace structure */
-       __be16                  *from;          /* pointer to freespace entry */
-       int                     i;              /* leaf freespace index */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       int                     n;              /* count of live freespc ents */
-       xfs_dir2_data_off_t     off;            /* freespace entry value */
-       __be16                  *to;            /* pointer to freespace entry */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       struct xfs_dir3_icfree_hdr freehdr;
-
-       trace_xfs_dir2_leaf_to_node(args);
-
-       dp = args->dp;
-       mp = dp->i_mount;
-       tp = args->trans;
-       /*
-        * Add a freespace block to the directory.
-        */
-       if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
-               return error;
-       }
-       ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
-       /*
-        * Get the buffer for the new freespace block.
-        */
-       error = xfs_dir3_free_get_buf(args, fdb, &fbp);
-       if (error)
-               return error;
-
-       free = fbp->b_addr;
-       dp->d_ops->free_hdr_from_disk(&freehdr, free);
-       leaf = lbp->b_addr;
-       ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-       ASSERT(be32_to_cpu(ltp->bestcount) <=
-                               (uint)dp->i_d.di_size / args->geo->blksize);
-
-       /*
-        * Copy freespace entries from the leaf block to the new block.
-        * Count active entries.
-        */
-       from = xfs_dir2_leaf_bests_p(ltp);
-       to = dp->d_ops->free_bests_p(free);
-       for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
-               if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
-                       n++;
-               *to = cpu_to_be16(off);
-       }
-
-       /*
-        * Now initialize the freespace block header.
-        */
-       freehdr.nused = n;
-       freehdr.nvalid = be32_to_cpu(ltp->bestcount);
-
-       dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
-       xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1);
-       xfs_dir2_free_log_header(args, fbp);
-
-       /*
-        * Converting the leaf to a leafnode is just a matter of changing the
-        * magic number and the ops. Do the change directly to the buffer as
-        * it's less work (and less code) than decoding the header to host
-        * format and back again.
-        */
-       if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC))
-               leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
-       else
-               leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
-       lbp->b_ops = &xfs_dir3_leafn_buf_ops;
-       xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
-       xfs_dir3_leaf_log_header(args, lbp);
-       xfs_dir3_leaf_check(dp, lbp);
-       return 0;
-}
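
The bests-copy loop above both migrates the table and counts the live entries. A standalone model, using 0xffff as a stand-in for NULLDATAOFF:

        #include <stdint.h>
        #include <stdio.h>

        #define NULLOFF 0xffff

        int
        main(void)
        {
                uint16_t        from[] = { 48, NULLOFF, 480, NULLOFF, 2032 };
                uint16_t        to[5];
                int             i, n = 0;

                for (i = 0; i < 5; i++) {
                        if (from[i] != NULLOFF)
                                n++;            /* live freespace entry */
                        to[i] = from[i];        /* stale entries copied too */
                }
                printf("nvalid=%d nused=%d to[0]=%u\n", 5, n, (unsigned)to[0]);
                return 0;
        }
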
-
-/*
- * Add a leaf entry to a leaf block in a node-form directory.
- * The other work necessary is done from the caller.
- */
-static int                                     /* error */
-xfs_dir2_leafn_add(
-       struct xfs_buf          *bp,            /* leaf buffer */
-       xfs_da_args_t           *args,          /* operation arguments */
-       int                     index)          /* insertion pt for new entry */
-{
-       int                     compact;        /* compacting stale leaves */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     highstale;      /* next stale entry */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
-       int                     lfloghigh;      /* high leaf entry logging */
-       int                     lfloglow;       /* low leaf entry logging */
-       int                     lowstale;       /* previous stale entry */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       struct xfs_dir3_icleaf_hdr leafhdr;
-       struct xfs_dir2_leaf_entry *ents;
-
-       trace_xfs_dir2_leafn_add(args, index);
-
-       dp = args->dp;
-       mp = dp->i_mount;
-       tp = args->trans;
-       leaf = bp->b_addr;
-       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-       ents = dp->d_ops->leaf_ents_p(leaf);
-
-       /*
-        * Quick check just to make sure we are not going to index
-        * into other people's memory.
-        */
-       if (index < 0)
-               return XFS_ERROR(EFSCORRUPTED);
-
-       /*
-        * If the block already holds the maximum number of leaf
-        * entries and none of them are stale, the new entry won't fit
-        * and the caller will do a split.  If there are stale entries
-        * we'll do a compact.
-        */
-
-       if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
-               if (!leafhdr.stale)
-                       return XFS_ERROR(ENOSPC);
-               compact = leafhdr.stale > 1;
-       } else
-               compact = 0;
-       ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval);
-       ASSERT(index == leafhdr.count ||
-              be32_to_cpu(ents[index].hashval) >= args->hashval);
-
-       if (args->op_flags & XFS_DA_OP_JUSTCHECK)
-               return 0;
-
-       /*
-        * Compact out all but one stale leaf entry.  Leaves behind
-        * the entry closest to index.
-        */
-       if (compact)
-               xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
-                                        &highstale, &lfloglow, &lfloghigh);
-       else if (leafhdr.stale) {
-               /*
-                * Set impossible logging indices for this case.
-                */
-               lfloglow = leafhdr.count;
-               lfloghigh = -1;
-       }
-
-       /*
-        * Insert the new entry, log everything.
-        */
-       lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
-                                      highstale, &lfloglow, &lfloghigh);
-
-       lep->hashval = cpu_to_be32(args->hashval);
-       lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo,
-                               args->blkno, args->index));
-
-       dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
-       xfs_dir3_leaf_log_header(args, bp);
-       xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh);
-       xfs_dir3_leaf_check(dp, bp);
-       return 0;
-}
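
When no stale slot is recycled, the insert reduces to opening a hole in a sorted array; the real code prefers to reuse a nearby stale entry so fewer entries have to move. A toy version with invented hash values:

        #include <stdio.h>
        #include <string.h>

        int
        main(void)
        {
                unsigned int    ents[8] = { 10, 20, 30, 40 };
                int             count = 4;
                int             index = 2;      /* insert 25 before 30 */
                int             i;

                /* open up a hole at the insertion point */
                memmove(&ents[index + 1], &ents[index],
                        (count - index) * sizeof(ents[0]));
                ents[index] = 25;
                count++;

                for (i = 0; i < count; i++)
                        printf("%u ", ents[i]);
                printf("\n");                   /* 10 20 25 30 40 */
                return 0;
        }
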
-
-#ifdef DEBUG
-static void
-xfs_dir2_free_hdr_check(
-       struct xfs_inode *dp,
-       struct xfs_buf  *bp,
-       xfs_dir2_db_t   db)
-{
-       struct xfs_dir3_icfree_hdr hdr;
-
-       dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr);
-
-       ASSERT((hdr.firstdb %
-               dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0);
-       ASSERT(hdr.firstdb <= db);
-       ASSERT(db < hdr.firstdb + hdr.nvalid);
-}
-#else
-#define xfs_dir2_free_hdr_check(dp, bp, db)
-#endif /* DEBUG */
-
-/*
- * Return the last hash value in the leaf.
- * Stale entries are ok.
- */
-xfs_dahash_t                                   /* hash value */
-xfs_dir2_leafn_lasthash(
-       struct xfs_inode *dp,
-       struct xfs_buf  *bp,                    /* leaf buffer */
-       int             *count)                 /* count of entries in leaf */
-{
-       struct xfs_dir2_leaf    *leaf = bp->b_addr;
-       struct xfs_dir2_leaf_entry *ents;
-       struct xfs_dir3_icleaf_hdr leafhdr;
-
-       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-
-       ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
-              leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
-
-       if (count)
-               *count = leafhdr.count;
-       if (!leafhdr.count)
-               return 0;
-
-       ents = dp->d_ops->leaf_ents_p(leaf);
-       return be32_to_cpu(ents[leafhdr.count - 1].hashval);
-}
-
-/*
- * Look up a leaf entry for space to add a name in a node-format leaf block.
- * The extrablk in state is a freespace block.
- */
-STATIC int
-xfs_dir2_leafn_lookup_for_addname(
-       struct xfs_buf          *bp,            /* leaf buffer */
-       xfs_da_args_t           *args,          /* operation arguments */
-       int                     *indexp,        /* out: leaf entry index */
-       xfs_da_state_t          *state)         /* state to fill in */
-{
-       struct xfs_buf          *curbp = NULL;  /* current data/free buffer */
-       xfs_dir2_db_t           curdb = -1;     /* current data block number */
-       xfs_dir2_db_t           curfdb = -1;    /* current free block number */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     error;          /* error return value */
-       int                     fi;             /* free entry index */
-       xfs_dir2_free_t         *free = NULL;   /* free block structure */
-       int                     index;          /* leaf entry index */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       int                     length;         /* length of new data entry */
-       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       xfs_dir2_db_t           newdb;          /* new data block number */
-       xfs_dir2_db_t           newfdb;         /* new free block number */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       struct xfs_dir2_leaf_entry *ents;
-       struct xfs_dir3_icleaf_hdr leafhdr;
-
-       dp = args->dp;
-       tp = args->trans;
-       mp = dp->i_mount;
-       leaf = bp->b_addr;
-       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-       ents = dp->d_ops->leaf_ents_p(leaf);
-
-       xfs_dir3_leaf_check(dp, bp);
-       ASSERT(leafhdr.count > 0);
-
-       /*
-        * Look up the hash value in the leaf entries.
-        */
-       index = xfs_dir2_leaf_search_hash(args, bp);
-       /*
-        * Do we have a buffer coming in?
-        */
-       if (state->extravalid) {
-               /* If so, it's a free block buffer, get the block number. */
-               curbp = state->extrablk.bp;
-               curfdb = state->extrablk.blkno;
-               free = curbp->b_addr;
-               ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
-                      free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
-       }
-       length = dp->d_ops->data_entsize(args->namelen);
-       /*
-        * Loop over leaf entries with the right hash value.
-        */
-       for (lep = &ents[index];
-            index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
-            lep++, index++) {
-               /*
-                * Skip stale leaf entries.
-                */
-               if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
-                       continue;
-               /*
-                * Pull the data block number from the entry.
-                */
-               newdb = xfs_dir2_dataptr_to_db(args->geo,
-                                              be32_to_cpu(lep->address));
-               /*
-                * For addname, we're looking for a place to put the new entry.
-                * We want to use a data block with an entry of equal
-                * hash value to ours if there is one with room.
-                *
-                * If this block isn't the data block we already have
-                * in hand, take a look at it.
-                */
-               if (newdb != curdb) {
-                       __be16 *bests;
-
-                       curdb = newdb;
-                       /*
-                        * Convert the data block to the free block
-                        * holding its freespace information.
-                        */
-                       newfdb = dp->d_ops->db_to_fdb(args->geo, newdb);
-                       /*
-                        * If it's not the one we have in hand, read it in.
-                        */
-                       if (newfdb != curfdb) {
-                               /*
-                                * If we had one before, drop it.
-                                */
-                               if (curbp)
-                                       xfs_trans_brelse(tp, curbp);
-
-                               error = xfs_dir2_free_read(tp, dp,
-                                               xfs_dir2_db_to_da(args->geo,
-                                                                 newfdb),
-                                               &curbp);
-                               if (error)
-                                       return error;
-                               free = curbp->b_addr;
-
-                               xfs_dir2_free_hdr_check(dp, curbp, curdb);
-                       }
-                       /*
-                        * Get the index for our entry.
-                        */
-                       fi = dp->d_ops->db_to_fdindex(args->geo, curdb);
-                       /*
-                        * If it has room, return it.
-                        */
-                       bests = dp->d_ops->free_bests_p(free);
-                       if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
-                               XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
-                                                       XFS_ERRLEVEL_LOW, mp);
-                               if (curfdb != newfdb)
-                                       xfs_trans_brelse(tp, curbp);
-                               return XFS_ERROR(EFSCORRUPTED);
-                       }
-                       curfdb = newfdb;
-                       if (be16_to_cpu(bests[fi]) >= length)
-                               goto out;
-               }
-       }
-       /* Didn't find any space */
-       fi = -1;
-out:
-       ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-       if (curbp) {
-               /* Giving back a free block. */
-               state->extravalid = 1;
-               state->extrablk.bp = curbp;
-               state->extrablk.index = fi;
-               state->extrablk.blkno = curfdb;
-
-               /*
-                * Important: this magic number is not in the buffer - it's for
-                * buffer type information and therefore only the free/data type
-                * matters here, not whether CRCs are enabled or not.
-                */
-               state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
-       } else {
-               state->extravalid = 0;
-       }
-       /*
-        * Return the index; it will be the insertion point.
-        */
-       *indexp = index;
-       return XFS_ERROR(ENOENT);
-}
-
-/*
- * Look up a leaf entry in a node-format leaf block.
- * The extrablk in state is a data block.
- */
-STATIC int
-xfs_dir2_leafn_lookup_for_entry(
-       struct xfs_buf          *bp,            /* leaf buffer */
-       xfs_da_args_t           *args,          /* operation arguments */
-       int                     *indexp,        /* out: leaf entry index */
-       xfs_da_state_t          *state)         /* state to fill in */
-{
-       struct xfs_buf          *curbp = NULL;  /* current data/free buffer */
-       xfs_dir2_db_t           curdb = -1;     /* current data block number */
-       xfs_dir2_data_entry_t   *dep;           /* data block entry */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     error;          /* error return value */
-       int                     index;          /* leaf entry index */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       xfs_dir2_db_t           newdb;          /* new data block number */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       enum xfs_dacmp          cmp;            /* comparison result */
-       struct xfs_dir2_leaf_entry *ents;
-       struct xfs_dir3_icleaf_hdr leafhdr;
-
-       dp = args->dp;
-       tp = args->trans;
-       mp = dp->i_mount;
-       leaf = bp->b_addr;
-       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-       ents = dp->d_ops->leaf_ents_p(leaf);
-
-       xfs_dir3_leaf_check(dp, bp);
-       ASSERT(leafhdr.count > 0);
-
-       /*
-        * Look up the hash value in the leaf entries.
-        */
-       index = xfs_dir2_leaf_search_hash(args, bp);
-       /*
-        * Do we have a buffer coming in?
-        */
-       if (state->extravalid) {
-               curbp = state->extrablk.bp;
-               curdb = state->extrablk.blkno;
-       }
-       /*
-        * Loop over leaf entries with the right hash value.
-        */
-       for (lep = &ents[index];
-            index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
-            lep++, index++) {
-               /*
-                * Skip stale leaf entries.
-                */
-               if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
-                       continue;
-               /*
-                * Pull the data block number from the entry.
-                */
-               newdb = xfs_dir2_dataptr_to_db(args->geo,
-                                              be32_to_cpu(lep->address));
-               /*
-                * Not adding a new entry, so we really want to find
-                * the name given to us.
-                *
-                * If it's a different data block, go get it.
-                */
-               if (newdb != curdb) {
-                       /*
-                        * If we had a block before that we aren't saving
-                        * for a CI name, drop it.
-                        */
-                       if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT ||
-                                               curdb != state->extrablk.blkno))
-                               xfs_trans_brelse(tp, curbp);
-                       /*
-                        * If we need the block that is saved with a CI match,
-                        * use it; otherwise read in the new data block.
-                        */
-                       if (args->cmpresult != XFS_CMP_DIFFERENT &&
-                                       newdb == state->extrablk.blkno) {
-                               ASSERT(state->extravalid);
-                               curbp = state->extrablk.bp;
-                       } else {
-                               error = xfs_dir3_data_read(tp, dp,
-                                               xfs_dir2_db_to_da(args->geo,
-                                                                 newdb),
-                                               -1, &curbp);
-                               if (error)
-                                       return error;
-                       }
-                       xfs_dir3_data_check(dp, curbp);
-                       curdb = newdb;
-               }
-               /*
-                * Point to the data entry.
-                */
-               dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr +
-                       xfs_dir2_dataptr_to_off(args->geo,
-                                               be32_to_cpu(lep->address)));
-               /*
-                * Compare the entry and if it's an exact match, return
-                * EEXIST immediately. If it's the first case-insensitive
-                * match, store the block & inode number and continue looking.
-                */
-               cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
-               if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
-                       /* If there is a CI match block, drop it */
-                       if (args->cmpresult != XFS_CMP_DIFFERENT &&
-                                               curdb != state->extrablk.blkno)
-                               xfs_trans_brelse(tp, state->extrablk.bp);
-                       args->cmpresult = cmp;
-                       args->inumber = be64_to_cpu(dep->inumber);
-                       args->filetype = dp->d_ops->data_get_ftype(dep);
-                       *indexp = index;
-                       state->extravalid = 1;
-                       state->extrablk.bp = curbp;
-                       state->extrablk.blkno = curdb;
-                       state->extrablk.index = (int)((char *)dep -
-                                                       (char *)curbp->b_addr);
-                       state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
-                       curbp->b_ops = &xfs_dir3_data_buf_ops;
-                       xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
-                       if (cmp == XFS_CMP_EXACT)
-                               return XFS_ERROR(EEXIST);
-               }
-       }
-       ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
-       if (curbp) {
-               if (args->cmpresult == XFS_CMP_DIFFERENT) {
-                       /* Giving back last used data block. */
-                       state->extravalid = 1;
-                       state->extrablk.bp = curbp;
-                       state->extrablk.index = -1;
-                       state->extrablk.blkno = curdb;
-                       state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
-                       curbp->b_ops = &xfs_dir3_data_buf_ops;
-                       xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
-               } else {
-                       /* If the curbp is not the CI match block, drop it */
-                       if (state->extrablk.bp != curbp)
-                               xfs_trans_brelse(tp, curbp);
-               }
-       } else {
-               state->extravalid = 0;
-       }
-       *indexp = index;
-       return XFS_ERROR(ENOENT);
-}
-
-/*
- * Look up a leaf entry in a node-format leaf block.
- * If this is an addname then the extrablk in state is a freespace block,
- * otherwise it's a data block.
- */
-int
-xfs_dir2_leafn_lookup_int(
-       struct xfs_buf          *bp,            /* leaf buffer */
-       xfs_da_args_t           *args,          /* operation arguments */
-       int                     *indexp,        /* out: leaf entry index */
-       xfs_da_state_t          *state)         /* state to fill in */
-{
-       if (args->op_flags & XFS_DA_OP_ADDNAME)
-               return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp,
-                                                       state);
-       return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state);
-}
-
-/*
- * Move count leaf entries from source to destination leaf.
- * Log entries and headers.  Stale entries are preserved.
- */
-static void
-xfs_dir3_leafn_moveents(
-       xfs_da_args_t                   *args,  /* operation arguments */
-       struct xfs_buf                  *bp_s,  /* source */
-       struct xfs_dir3_icleaf_hdr      *shdr,
-       struct xfs_dir2_leaf_entry      *sents,
-       int                             start_s,/* source leaf index */
-       struct xfs_buf                  *bp_d,  /* destination */
-       struct xfs_dir3_icleaf_hdr      *dhdr,
-       struct xfs_dir2_leaf_entry      *dents,
-       int                             start_d,/* destination leaf index */
-       int                             count)  /* count of leaves to copy */
-{
-       int                             stale;  /* count stale leaves copied */
-
-       trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
-
-       /*
-        * Silently return if nothing to do.
-        */
-       if (count == 0)
-               return;
-
-       /*
-        * If the destination index is not the end of the current
-        * destination leaf entries, open up a hole in the destination
-        * to hold the new entries.
-        */
-       if (start_d < dhdr->count) {
-               memmove(&dents[start_d + count], &dents[start_d],
-                       (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
-               xfs_dir3_leaf_log_ents(args, bp_d, start_d + count,
-                                      count + dhdr->count - 1);
-       }
-       /*
-        * If the source has stale leaves, count the ones in the copy range
-        * so we can update the header correctly.
-        */
-       if (shdr->stale) {
-               int     i;                      /* temp leaf index */
-
-               for (i = start_s, stale = 0; i < start_s + count; i++) {
-                       if (sents[i].address ==
-                                       cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-                               stale++;
-               }
-       } else
-               stale = 0;
-       /*
-        * Copy the leaf entries from source to destination.
-        */
-       memcpy(&dents[start_d], &sents[start_s],
-               count * sizeof(xfs_dir2_leaf_entry_t));
-       xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1);
-
-       /*
-        * If there are source entries after the ones we copied,
-        * delete the ones we copied by sliding the next ones down.
-        */
-       if (start_s + count < shdr->count) {
-               memmove(&sents[start_s], &sents[start_s + count],
-                       count * sizeof(xfs_dir2_leaf_entry_t));
-               xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1);
-       }
-
-       /*
-        * Update the headers and log them.
-        */
-       shdr->count -= count;
-       shdr->stale -= stale;
-       dhdr->count += count;
-       dhdr->stale += stale;
-}
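
The move itself is a copy plus bookkeeping. A standalone model where 0 plays the role of XFS_DIR2_NULL_DATAPTR, moving the tail of one array to the front of another while counting stale entries so both headers stay correct:

        #include <stdio.h>
        #include <string.h>

        int
        main(void)
        {
                unsigned int    src[6] = { 10, 0, 30, 40, 0, 60 };
                unsigned int    dst[6] = { 0 };
                int             scount = 6, dcount = 0;
                int             start_s = 3, count = 3, stale = 0, i;

                /* count stale entries in the copy range */
                for (i = start_s; i < start_s + count; i++)
                        if (src[i] == 0)
                                stale++;

                memcpy(&dst[dcount], &src[start_s], count * sizeof(src[0]));
                scount -= count;
                dcount += count;
                printf("src=%d dst=%d stale moved=%d\n", scount, dcount, stale);
                return 0;
        }
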
-
-/*
- * Determine the sort order of two leaf blocks.
- * Returns 1 if both are valid and leaf2 should be before leaf1, else 0.
- */
-int                                            /* sort order */
-xfs_dir2_leafn_order(
-       struct xfs_inode        *dp,
-       struct xfs_buf          *leaf1_bp,              /* leaf1 buffer */
-       struct xfs_buf          *leaf2_bp)              /* leaf2 buffer */
-{
-       struct xfs_dir2_leaf    *leaf1 = leaf1_bp->b_addr;
-       struct xfs_dir2_leaf    *leaf2 = leaf2_bp->b_addr;
-       struct xfs_dir2_leaf_entry *ents1;
-       struct xfs_dir2_leaf_entry *ents2;
-       struct xfs_dir3_icleaf_hdr hdr1;
-       struct xfs_dir3_icleaf_hdr hdr2;
-
-       dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
-       dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
-       ents1 = dp->d_ops->leaf_ents_p(leaf1);
-       ents2 = dp->d_ops->leaf_ents_p(leaf2);
-
-       if (hdr1.count > 0 && hdr2.count > 0 &&
-           (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
-            be32_to_cpu(ents2[hdr2.count - 1].hashval) <
-                               be32_to_cpu(ents1[hdr1.count - 1].hashval)))
-               return 1;
-       return 0;
-}
-
-/*
- * Rebalance leaf entries between two leaf blocks.
- * This is actually only called when the second block is new,
- * though the code deals with the general case.
- * A new entry will be inserted in one of the blocks, and that
- * entry is taken into account when balancing.
- */
-static void
-xfs_dir2_leafn_rebalance(
-       xfs_da_state_t          *state,         /* btree cursor */
-       xfs_da_state_blk_t      *blk1,          /* first btree block */
-       xfs_da_state_blk_t      *blk2)          /* second btree block */
-{
-       xfs_da_args_t           *args;          /* operation arguments */
-       int                     count;          /* count (& direction) leaves */
-       int                     isleft;         /* new goes in left leaf */
-       xfs_dir2_leaf_t         *leaf1;         /* first leaf structure */
-       xfs_dir2_leaf_t         *leaf2;         /* second leaf structure */
-       int                     mid;            /* midpoint leaf index */
-#if defined(DEBUG) || defined(XFS_WARN)
-       int                     oldstale;       /* old count of stale leaves */
-#endif
-       int                     oldsum;         /* old total leaf count */
-       int                     swap;           /* swapped leaf blocks */
-       struct xfs_dir2_leaf_entry *ents1;
-       struct xfs_dir2_leaf_entry *ents2;
-       struct xfs_dir3_icleaf_hdr hdr1;
-       struct xfs_dir3_icleaf_hdr hdr2;
-       struct xfs_inode        *dp = state->args->dp;
-
-       args = state->args;
-       /*
-        * If the block order is wrong, swap the arguments.
-        */
-       if ((swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp))) {
-               xfs_da_state_blk_t      *tmp;   /* temp for block swap */
-
-               tmp = blk1;
-               blk1 = blk2;
-               blk2 = tmp;
-       }
-       leaf1 = blk1->bp->b_addr;
-       leaf2 = blk2->bp->b_addr;
-       dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
-       dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
-       ents1 = dp->d_ops->leaf_ents_p(leaf1);
-       ents2 = dp->d_ops->leaf_ents_p(leaf2);
-
-       oldsum = hdr1.count + hdr2.count;
-#if defined(DEBUG) || defined(XFS_WARN)
-       oldstale = hdr1.stale + hdr2.stale;
-#endif
-       mid = oldsum >> 1;
-
-       /*
-        * If the old leaf count was odd then the new one will be even,
-        * so we need to divide the new count evenly.
-        */
-       if (oldsum & 1) {
-               xfs_dahash_t    midhash;        /* middle entry hash value */
-
-               if (mid >= hdr1.count)
-                       midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval);
-               else
-                       midhash = be32_to_cpu(ents1[mid].hashval);
-               isleft = args->hashval <= midhash;
-       }
-       /*
-        * If the old count is even then the new count is odd, so there's
-        * no preferred side for the new entry.
-        * Pick the left one.
-        */
-       else
-               isleft = 1;
-       /*
-        * Calculate moved entry count.  Positive means left-to-right,
-        * negative means right-to-left.  Then move the entries.
-        */
-       count = hdr1.count - mid + (isleft == 0);
-       if (count > 0)
-               xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1,
-                                       hdr1.count - count, blk2->bp,
-                                       &hdr2, ents2, 0, count);
-       else if (count < 0)
-               xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0,
-                                       blk1->bp, &hdr1, ents1,
-                                       hdr1.count, count = -count);
-
-       ASSERT(hdr1.count + hdr2.count == oldsum);
-       ASSERT(hdr1.stale + hdr2.stale == oldstale);
-
-       /* log the changes made when moving the entries */
-       dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1);
-       dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2);
-       xfs_dir3_leaf_log_header(args, blk1->bp);
-       xfs_dir3_leaf_log_header(args, blk2->bp);
-
-       xfs_dir3_leaf_check(dp, blk1->bp);
-       xfs_dir3_leaf_check(dp, blk2->bp);
-
-       /*
-        * Mark whether we're inserting into the old or new leaf.
-        */
-       if (hdr1.count < hdr2.count)
-               state->inleaf = swap;
-       else if (hdr1.count > hdr2.count)
-               state->inleaf = !swap;
-       else
-               state->inleaf = swap ^ (blk1->index <= hdr1.count);
-       /*
-        * Adjust the expected index for insertion.
-        */
-       if (!state->inleaf)
-               blk2->index = blk1->index - hdr1.count;
-
-       /*
-        * Finally, sanity check just to make sure we are not returning
-        * a negative index.
-        */
-       if (blk2->index < 0) {
-               state->inleaf = 1;
-               blk2->index = 0;
-               xfs_alert(dp->i_mount,
-       "%s: picked the wrong leaf? reverting original leaf: blk1->index %d",
-                       __func__, blk1->index);
-       }
-}
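
The count arithmetic above can be traced with invented numbers: with an odd combined total the new entry's hash picks its side, with an even total the left block is preferred; positive counts move entries left-to-right, negative ones right-to-left:

        #include <stdio.h>

        int
        main(void)
        {
                int             count1 = 7, count2 = 2; /* entries per block */
                int             oldsum = count1 + count2;
                int             mid = oldsum >> 1;      /* 4 */
                int             isleft = 1;             /* even sum: prefer left */
                unsigned int    hashval = 123, midhash = 100;

                if (oldsum & 1)                         /* odd sum: hash decides */
                        isleft = hashval <= midhash;

                printf("move %d entries\n", count1 - mid + (isleft == 0));
                return 0;                               /* prints: move 4 entries */
        }
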
-
-static int
-xfs_dir3_data_block_free(
-       xfs_da_args_t           *args,
-       struct xfs_dir2_data_hdr *hdr,
-       struct xfs_dir2_free    *free,
-       xfs_dir2_db_t           fdb,
-       int                     findex,
-       struct xfs_buf          *fbp,
-       int                     longest)
-{
-       int                     logfree = 0;
-       __be16                  *bests;
-       struct xfs_dir3_icfree_hdr freehdr;
-       struct xfs_inode        *dp = args->dp;
-
-       dp->d_ops->free_hdr_from_disk(&freehdr, free);
-       bests = dp->d_ops->free_bests_p(free);
-       if (hdr) {
-               /*
-                * Data block is not empty, just set the free entry to the new
-                * value.
-                */
-               bests[findex] = cpu_to_be16(longest);
-               xfs_dir2_free_log_bests(args, fbp, findex, findex);
-               return 0;
-       }
-
-       /* One less used entry in the free table. */
-       freehdr.nused--;
-
-       /*
-        * If this was the last entry in the table, we can trim the table size
-        * back.  There might be other entries at the end referring to
-        * non-existent data blocks, get those too.
-        */
-       if (findex == freehdr.nvalid - 1) {
-               int     i;              /* free entry index */
-
-               for (i = findex - 1; i >= 0; i--) {
-                       if (bests[i] != cpu_to_be16(NULLDATAOFF))
-                               break;
-               }
-               freehdr.nvalid = i + 1;
-               logfree = 0;
-       } else {
-               /* Not the last entry, just punch it out.  */
-               bests[findex] = cpu_to_be16(NULLDATAOFF);
-               logfree = 1;
-       }
-
-       dp->d_ops->free_hdr_to_disk(free, &freehdr);
-       xfs_dir2_free_log_header(args, fbp);
-
-       /*
-        * If there are no useful entries left in the block, get rid of the
-        * block if we can.
-        */
-       if (!freehdr.nused) {
-               int error;
-
-               error = xfs_dir2_shrink_inode(args, fdb, fbp);
-               if (error == 0) {
-                       fbp = NULL;
-                       logfree = 0;
-               } else if (error != ENOSPC || args->total != 0)
-                       return error;
-               /*
-                * It's possible to get ENOSPC if there is no
-        * space reservation.  In this case someone
-                * else will eventually get rid of this block.
-                */
-       }
-
-       /* Log the free entry that changed, unless we got rid of it.  */
-       if (logfree)
-               xfs_dir2_free_log_bests(args, fbp, findex, findex);
-       return 0;
-}
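One subtlety in xfs_dir3_data_block_free above: clearing the last valid slot also sweeps backwards over any older NULLDATAOFF slots, so the bests table never ends in dead entries. A rough standalone model using host-endian shorts instead of on-disk __be16 values (the SK_NULLDATAOFF constant is an assumption of the sketch):

        #define SK_NULLDATAOFF  ((short)-1)     /* stand-in for NULLDATAOFF */

        /* Clear slot 'findex'; return the table's new nvalid. */
        static int sketch_free_punch(short *bests, int nvalid, int findex)
        {
                int i;

                bests[findex] = SK_NULLDATAOFF;
                if (findex != nvalid - 1)
                        return nvalid;  /* interior slot, size unchanged */
                for (i = findex - 1; i >= 0; i--)
                        if (bests[i] != SK_NULLDATAOFF)
                                break;
                return i + 1;           /* trim trailing dead slots */
        }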
-
-/*
- * Remove an entry from a node directory.
- * This removes the leaf entry and the data entry,
- * and updates the free block if necessary.
- */
-static int                                     /* error */
-xfs_dir2_leafn_remove(
-       xfs_da_args_t           *args,          /* operation arguments */
-       struct xfs_buf          *bp,            /* leaf buffer */
-       int                     index,          /* leaf entry index */
-       xfs_da_state_blk_t      *dblk,          /* data block */
-       int                     *rval)          /* resulting block needs join */
-{
-       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
-       xfs_dir2_db_t           db;             /* data block number */
-       struct xfs_buf          *dbp;           /* data block buffer */
-       xfs_dir2_data_entry_t   *dep;           /* data block entry */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
-       int                     longest;        /* longest data free entry */
-       int                     off;            /* data block entry offset */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       int                     needlog;        /* need to log data header */
-       int                     needscan;       /* need to rescan data frees */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       struct xfs_dir2_data_free *bf;          /* bestfree table */
-       struct xfs_dir3_icleaf_hdr leafhdr;
-       struct xfs_dir2_leaf_entry *ents;
-
-       trace_xfs_dir2_leafn_remove(args, index);
-
-       dp = args->dp;
-       tp = args->trans;
-       mp = dp->i_mount;
-       leaf = bp->b_addr;
-       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-       ents = dp->d_ops->leaf_ents_p(leaf);
-
-       /*
-        * Point to the entry we're removing.
-        */
-       lep = &ents[index];
-
-       /*
-        * Extract the data block and offset from the entry.
-        */
-       db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
-       ASSERT(dblk->blkno == db);
-       off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address));
-       ASSERT(dblk->index == off);
-
-       /*
-        * Kill the leaf entry by marking it stale.
-        * Log the leaf block changes.
-        */
-       leafhdr.stale++;
-       dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
-       xfs_dir3_leaf_log_header(args, bp);
-
-       lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
-       xfs_dir3_leaf_log_ents(args, bp, index, index);
-
-       /*
-        * Make the data entry free.  Keep track of the longest freespace
-        * in the data block in case it changes.
-        */
-       dbp = dblk->bp;
-       hdr = dbp->b_addr;
-       dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
-       bf = dp->d_ops->data_bestfree_p(hdr);
-       longest = be16_to_cpu(bf[0].length);
-       needlog = needscan = 0;
-       xfs_dir2_data_make_free(args, dbp, off,
-               dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
-       /*
-        * Rescan the data block freespaces for bestfree.
-        * Log the data block header if needed.
-        */
-       if (needscan)
-               xfs_dir2_data_freescan(dp, hdr, &needlog);
-       if (needlog)
-               xfs_dir2_data_log_header(args, dbp);
-       xfs_dir3_data_check(dp, dbp);
-       /*
-        * If the longest data block freespace changes, we need to update
-        * the corresponding freeblock entry.
-        */
-       if (longest < be16_to_cpu(bf[0].length)) {
-               int             error;          /* error return value */
-               struct xfs_buf  *fbp;           /* freeblock buffer */
-               xfs_dir2_db_t   fdb;            /* freeblock block number */
-               int             findex;         /* index in freeblock entries */
-               xfs_dir2_free_t *free;          /* freeblock structure */
-
-               /*
-                * Convert the data block number to a free block,
-                * read in the free block.
-                */
-               fdb = dp->d_ops->db_to_fdb(args->geo, db);
-               error = xfs_dir2_free_read(tp, dp,
-                                          xfs_dir2_db_to_da(args->geo, fdb),
-                                          &fbp);
-               if (error)
-                       return error;
-               free = fbp->b_addr;
-#ifdef DEBUG
-       {
-               struct xfs_dir3_icfree_hdr freehdr;
-               dp->d_ops->free_hdr_from_disk(&freehdr, free);
-               ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) *
-                       (fdb - xfs_dir2_byte_to_db(args->geo,
-                                                  XFS_DIR2_FREE_OFFSET)));
-       }
-#endif
-               /*
-                * Calculate which entry we need to fix.
-                */
-               findex = dp->d_ops->db_to_fdindex(args->geo, db);
-               longest = be16_to_cpu(bf[0].length);
-               /*
-                * If the data block is now empty we can get rid of it
-                * (usually).
-                */
-               if (longest == args->geo->blksize -
-                              dp->d_ops->data_entry_offset) {
-                       /*
-                        * Try to punch out the data block.
-                        */
-                       error = xfs_dir2_shrink_inode(args, db, dbp);
-                       if (error == 0) {
-                               dblk->bp = NULL;
-                               hdr = NULL;
-                       } else if (!(error == ENOSPC && args->total == 0))
-                               return error;
-                       /*
-                        * We can get ENOSPC if there's no space reservation.
-                        * In this case just drop the buffer and someone else
-                        * will eventually get rid of the empty block.
-                        */
-               }
-               /*
-                * If we got rid of the data block, we can eliminate that entry
-                * in the free block.
-                */
-               error = xfs_dir3_data_block_free(args, hdr, free,
-                                                fdb, findex, fbp, longest);
-               if (error)
-                       return error;
-       }
-
-       xfs_dir3_leaf_check(dp, bp);
-       /*
-        * Return indication of whether this leaf block is empty enough
-        * to justify trying to join it with a neighbor.
-        */
-       *rval = (dp->d_ops->leaf_hdr_size +
-                (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
-               args->geo->magicpct;
-       return 0;
-}
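The *rval computation at the end of xfs_dir2_leafn_remove is the only signal the caller gets that a join may be worthwhile: the bytes occupied by live leaf entries are compared against the geometry's magicpct threshold (about 37% of the block size in this era of the code, though treat the exact ratio as an assumption here). In isolation:

        /* Sketch: should the caller try joining this leaf with a sibling? */
        static int leaf_wants_join(int hdr_size, int ent_size,
                                   int count, int stale, int magicpct)
        {
                return hdr_size + ent_size * (count - stale) < magicpct;
        }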
-
-/*
- * Split the leaf entries in the old block into old and new blocks.
- */
-int                                            /* error */
-xfs_dir2_leafn_split(
-       xfs_da_state_t          *state,         /* btree cursor */
-       xfs_da_state_blk_t      *oldblk,        /* original block */
-       xfs_da_state_blk_t      *newblk)        /* newly created block */
-{
-       xfs_da_args_t           *args;          /* operation arguments */
-       xfs_dablk_t             blkno;          /* new leaf block number */
-       int                     error;          /* error return value */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       struct xfs_inode        *dp;
-
-       /*
-        * Allocate space for a new leaf node.
-        */
-       args = state->args;
-       dp = args->dp;
-       mp = dp->i_mount;
-       ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
-       error = xfs_da_grow_inode(args, &blkno);
-       if (error)
-               return error;
-       /*
-        * Initialize the new leaf block.
-        */
-       error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno),
-                                     &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
-       if (error)
-               return error;
-
-       newblk->blkno = blkno;
-       newblk->magic = XFS_DIR2_LEAFN_MAGIC;
-       /*
-        * Rebalance the entries across the two leaves, link the new
-        * block into the leaves.
-        */
-       xfs_dir2_leafn_rebalance(state, oldblk, newblk);
-       error = xfs_da3_blk_link(state, oldblk, newblk);
-       if (error)
-               return error;
-       /*
-        * Insert the new entry in the correct block.
-        */
-       if (state->inleaf)
-               error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index);
-       else
-               error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index);
-       /*
-        * Update last hashval in each block since we added the name.
-        */
-       oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL);
-       newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL);
-       xfs_dir3_leaf_check(dp, oldblk->bp);
-       xfs_dir3_leaf_check(dp, newblk->bp);
-       return error;
-}
-
-/*
- * Check a leaf block and its neighbors to see if the block should be
- * collapsed into one or the other neighbor.  Always keep the block
- * with the smaller block number.
- * If the current block is over 50% full, don't try to join it, return 0.
- * If the block is empty, fill in the state structure and return 2.
- * If it can be collapsed, fill in the state structure and return 1.
- * If nothing can be done, return 0.
- */
-int                                            /* error */
-xfs_dir2_leafn_toosmall(
-       xfs_da_state_t          *state,         /* btree cursor */
-       int                     *action)        /* resulting action to take */
-{
-       xfs_da_state_blk_t      *blk;           /* leaf block */
-       xfs_dablk_t             blkno;          /* leaf block number */
-       struct xfs_buf          *bp;            /* leaf buffer */
-       int                     bytes;          /* bytes in use */
-       int                     count;          /* leaf live entry count */
-       int                     error;          /* error return value */
-       int                     forward;        /* sibling block direction */
-       int                     i;              /* sibling counter */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       int                     rval;           /* result from path_shift */
-       struct xfs_dir3_icleaf_hdr leafhdr;
-       struct xfs_dir2_leaf_entry *ents;
-       struct xfs_inode        *dp = state->args->dp;
-
-       /*
-        * Check for the degenerate case of the block being over 50% full.
-        * If so, it's not worth even looking to see if we might be able
-        * to coalesce with a sibling.
-        */
-       blk = &state->path.blk[state->path.active - 1];
-       leaf = blk->bp->b_addr;
-       dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-       ents = dp->d_ops->leaf_ents_p(leaf);
-       xfs_dir3_leaf_check(dp, blk->bp);
-
-       count = leafhdr.count - leafhdr.stale;
-       bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]);
-       if (bytes > (state->args->geo->blksize >> 1)) {
-               /*
-                * Block is over 50% full, don't try to join.
-                */
-               *action = 0;
-               return 0;
-       }
-       /*
-        * Check for the degenerate case of the block being empty.
-        * If the block is empty, we'll simply delete it, no need to
-        * coalesce it with a sibling block.  We choose (arbitrarily)
-        * to merge with the forward block unless it is NULL.
-        */
-       if (count == 0) {
-               /*
-                * Make altpath point to the block we want to keep and
-                * path point to the block we want to drop (this one).
-                */
-               forward = (leafhdr.forw != 0);
-               memcpy(&state->altpath, &state->path, sizeof(state->path));
-               error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
-                       &rval);
-               if (error)
-                       return error;
-               *action = rval ? 2 : 0;
-               return 0;
-       }
-       /*
-        * Examine each sibling block to see if we can coalesce with
-        * at least 25% free space to spare.  We need to figure out
-        * whether to merge with the forward or the backward block.
-        * We prefer coalescing with the lower numbered sibling so as
-        * to shrink a directory over time.
-        */
-       forward = leafhdr.forw < leafhdr.back;
-       for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
-               struct xfs_dir3_icleaf_hdr hdr2;
-
-               blkno = forward ? leafhdr.forw : leafhdr.back;
-               if (blkno == 0)
-                       continue;
-               /*
-                * Read the sibling leaf block.
-                */
-               error = xfs_dir3_leafn_read(state->args->trans, dp,
-                                           blkno, -1, &bp);
-               if (error)
-                       return error;
-
-               /*
-                * Count bytes in the two blocks combined.
-                */
-               count = leafhdr.count - leafhdr.stale;
-               bytes = state->args->geo->blksize -
-                       (state->args->geo->blksize >> 2);
-
-               leaf = bp->b_addr;
-               dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf);
-               ents = dp->d_ops->leaf_ents_p(leaf);
-               count += hdr2.count - hdr2.stale;
-               bytes -= count * sizeof(ents[0]);
-
-               /*
-                * Fits with at least 25% to spare.
-                */
-               if (bytes >= 0)
-                       break;
-               xfs_trans_brelse(state->args->trans, bp);
-       }
-       /*
-        * Didn't like either block, give up.
-        */
-       if (i >= 2) {
-               *action = 0;
-               return 0;
-       }
-
-       /*
-        * Make altpath point to the block we want to keep (the lower
-        * numbered block) and path point to the block we want to drop.
-        */
-       memcpy(&state->altpath, &state->path, sizeof(state->path));
-       if (blkno < blk->blkno)
-               error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
-                       &rval);
-       else
-               error = xfs_da3_path_shift(state, &state->path, forward, 0,
-                       &rval);
-       if (error)
-               return error;
-       *action = rval ? 0 : 1;
-       return 0;
-}
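xfs_dir2_leafn_toosmall encodes its "fits with at least 25% to spare" rule as a byte budget: three quarters of a block must hold the combined live entries of both siblings. As standalone arithmetic (a sketch, not the on-disk layout):

        /*
         * Sketch: can live1 + live2 entries coalesce into one block while
         * leaving at least a quarter of it free?
         */
        static int leaves_can_merge(int blksize, int ent_size,
                                    int live1, int live2)
        {
                int budget = blksize - (blksize >> 2);

                return (live1 + live2) * ent_size <= budget;
        }

With 4096-byte blocks and 8-byte leaf entries (hash plus address), this admits up to 384 combined live entries.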
-
-/*
- * Move all the leaf entries from drop_blk to save_blk.
- * This is done as part of a join operation.
- */
-void
-xfs_dir2_leafn_unbalance(
-       xfs_da_state_t          *state,         /* cursor */
-       xfs_da_state_blk_t      *drop_blk,      /* dead block */
-       xfs_da_state_blk_t      *save_blk)      /* surviving block */
-{
-       xfs_da_args_t           *args;          /* operation arguments */
-       xfs_dir2_leaf_t         *drop_leaf;     /* dead leaf structure */
-       xfs_dir2_leaf_t         *save_leaf;     /* surviving leaf structure */
-       struct xfs_dir3_icleaf_hdr savehdr;
-       struct xfs_dir3_icleaf_hdr drophdr;
-       struct xfs_dir2_leaf_entry *sents;
-       struct xfs_dir2_leaf_entry *dents;
-       struct xfs_inode        *dp = state->args->dp;
-
-       args = state->args;
-       ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
-       ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
-       drop_leaf = drop_blk->bp->b_addr;
-       save_leaf = save_blk->bp->b_addr;
-
-       dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf);
-       dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf);
-       sents = dp->d_ops->leaf_ents_p(save_leaf);
-       dents = dp->d_ops->leaf_ents_p(drop_leaf);
-
-       /*
-        * If there are any stale leaf entries, take this opportunity
-        * to purge them.
-        */
-       if (drophdr.stale)
-               xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp);
-       if (savehdr.stale)
-               xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp);
-
-       /*
-        * Move the entries from drop to the appropriate end of save.
-        */
-       drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval);
-       if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp))
-               xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
-                                       save_blk->bp, &savehdr, sents, 0,
-                                       drophdr.count);
-       else
-               xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
-                                       save_blk->bp, &savehdr, sents,
-                                       savehdr.count, drophdr.count);
-       save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
-
-       /* log the changes made when moving the entries */
-       dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr);
-       dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr);
-       xfs_dir3_leaf_log_header(args, save_blk->bp);
-       xfs_dir3_leaf_log_header(args, drop_blk->bp);
-
-       xfs_dir3_leaf_check(dp, save_blk->bp);
-       xfs_dir3_leaf_check(dp, drop_blk->bp);
-}
-
-/*
- * Top-level node form directory addname routine.
- */
-int                                            /* error */
-xfs_dir2_node_addname(
-       xfs_da_args_t           *args)          /* operation arguments */
-{
-       xfs_da_state_blk_t      *blk;           /* leaf block for insert */
-       int                     error;          /* error return value */
-       int                     rval;           /* sub-return value */
-       xfs_da_state_t          *state;         /* btree cursor */
-
-       trace_xfs_dir2_node_addname(args);
-
-       /*
-        * Allocate and initialize the state (btree cursor).
-        */
-       state = xfs_da_state_alloc();
-       state->args = args;
-       state->mp = args->dp->i_mount;
-       /*
-        * Look up the name.  We're not supposed to find it, but
-        * this gives us the insertion point.
-        */
-       error = xfs_da3_node_lookup_int(state, &rval);
-       if (error)
-               rval = error;
-       if (rval != ENOENT)
-               goto done;
-       /*
-        * Add the data entry to a data block.
-        * Extravalid is set to a freeblock found by lookup.
-        */
-       rval = xfs_dir2_node_addname_int(args,
-               state->extravalid ? &state->extrablk : NULL);
-       if (rval)
-               goto done;
-       blk = &state->path.blk[state->path.active - 1];
-       ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
-       /*
-        * Add the new leaf entry.
-        */
-       rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
-       if (rval == 0) {
-               /*
-                * It worked, fix the hash values up the btree.
-                */
-               if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
-                       xfs_da3_fixhashpath(state, &state->path);
-       } else {
-               /*
-                * It didn't work, we need to split the leaf block.
-                */
-               if (args->total == 0) {
-                       ASSERT(rval == ENOSPC);
-                       goto done;
-               }
-               /*
-                * Split the leaf block and insert the new entry.
-                */
-               rval = xfs_da3_split(state);
-       }
-done:
-       xfs_da_state_free(state);
-       return rval;
-}
-
-/*
- * Add the data entry for a node-format directory name addition.
- * The leaf entry is added in xfs_dir2_leafn_add.
- * We may enter with a freespace block that the lookup found.
- */
-static int                                     /* error */
-xfs_dir2_node_addname_int(
-       xfs_da_args_t           *args,          /* operation arguments */
-       xfs_da_state_blk_t      *fblk)          /* optional freespace block */
-{
-       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
-       xfs_dir2_db_t           dbno;           /* data block number */
-       struct xfs_buf          *dbp;           /* data block buffer */
-       xfs_dir2_data_entry_t   *dep;           /* data entry pointer */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       xfs_dir2_data_unused_t  *dup;           /* data unused entry pointer */
-       int                     error;          /* error return value */
-       xfs_dir2_db_t           fbno;           /* freespace block number */
-       struct xfs_buf          *fbp;           /* freespace buffer */
-       int                     findex;         /* freespace entry index */
-       xfs_dir2_free_t         *free = NULL;   /* freespace block structure */
-       xfs_dir2_db_t           ifbno;          /* initial freespace block no */
-       xfs_dir2_db_t           lastfbno = 0;   /* highest freespace block no */
-       int                     length;         /* length of the new entry */
-       int                     logfree;        /* need to log free entry */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       int                     needlog;        /* need to log data header */
-       int                     needscan;       /* need to rescan data frees */
-       __be16                  *tagp;          /* data entry tag pointer */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       __be16                  *bests;
-       struct xfs_dir3_icfree_hdr freehdr;
-       struct xfs_dir2_data_free *bf;
-
-       dp = args->dp;
-       mp = dp->i_mount;
-       tp = args->trans;
-       length = dp->d_ops->data_entsize(args->namelen);
-       /*
-        * If we came in with a freespace block that means that lookup
-        * found an entry with our hash value.  This is the freespace
-        * block for that data entry.
-        */
-       if (fblk) {
-               fbp = fblk->bp;
-               /*
-                * Remember initial freespace block number.
-                */
-               ifbno = fblk->blkno;
-               free = fbp->b_addr;
-               findex = fblk->index;
-               bests = dp->d_ops->free_bests_p(free);
-               dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
-               /*
-                * This means the free entry showed that the data block had
-                * space for our entry, so we remembered it.
-                * Use that data block.
-                */
-               if (findex >= 0) {
-                       ASSERT(findex < freehdr.nvalid);
-                       ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
-                       ASSERT(be16_to_cpu(bests[findex]) >= length);
-                       dbno = freehdr.firstdb + findex;
-               } else {
-                       /*
-                        * The data block we looked at didn't have enough room.
-                        * We'll start at the beginning of the freespace entries.
-                        */
-                       dbno = -1;
-                       findex = 0;
-               }
-       } else {
-               /*
-                * Didn't come in with a freespace block, so no data block.
-                */
-               ifbno = dbno = -1;
-               fbp = NULL;
-               findex = 0;
-       }
-
-       /*
-        * If we don't have a data block yet, we're going to scan the
-        * freespace blocks looking for one.  Figure out what the
-        * highest freespace block number is.
-        */
-       if (dbno == -1) {
-               xfs_fileoff_t   fo;             /* freespace block number */
-
-               error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK);
-               if (error)
-                       return error;
-               lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
-               fbno = ifbno;
-       }
-       /*
-        * While we haven't identified a data block, search the freeblock
-        * data for a good data block.  If we find a null freeblock entry,
-        * indicating a hole in the data blocks, remember that.
-        */
-       while (dbno == -1) {
-               /*
-                * If we don't have a freeblock in hand, get the next one.
-                */
-               if (fbp == NULL) {
-                       /*
-                        * Happens the first time through unless lookup gave
-                        * us a freespace block to start with.
-                        */
-                       if (++fbno == 0)
-                               fbno = xfs_dir2_byte_to_db(args->geo,
-                                                       XFS_DIR2_FREE_OFFSET);
-                       /*
-                        * If it's ifbno we already looked at it.
-                        */
-                       if (fbno == ifbno)
-                               fbno++;
-                       /*
-                        * If it's off the end we're done.
-                        */
-                       if (fbno >= lastfbno)
-                               break;
-                       /*
-                        * Read the block.  There can be holes in the
-                        * freespace blocks, so this might not succeed.
-                        * This should be really rare, so there's no reason
-                        * to avoid it.
-                        */
-                       error = xfs_dir2_free_try_read(tp, dp,
-                                       xfs_dir2_db_to_da(args->geo, fbno),
-                                       &fbp);
-                       if (error)
-                               return error;
-                       if (!fbp)
-                               continue;
-                       free = fbp->b_addr;
-                       findex = 0;
-               }
-               /*
-                * Look at the current free entry.  Is it good enough?
-                *
-                * The bests initialisation should be where the buffer is read in
-                * the above branch. But gcc is too stupid to realise that bests
-                * and the freehdr are actually initialised if they are placed
-                * there, so we have to do it here to avoid warnings. Blech.
-                */
-               bests = dp->d_ops->free_bests_p(free);
-               dp->d_ops->free_hdr_from_disk(&freehdr, free);
-               if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
-                   be16_to_cpu(bests[findex]) >= length)
-                       dbno = freehdr.firstdb + findex;
-               else {
-                       /*
-                        * Are we done with the freeblock?
-                        */
-                       if (++findex == freehdr.nvalid) {
-                               /*
-                                * Drop the block.
-                                */
-                               xfs_trans_brelse(tp, fbp);
-                               fbp = NULL;
-                               if (fblk && fblk->bp)
-                                       fblk->bp = NULL;
-                       }
-               }
-       }
-       /*
-        * If we don't have a data block, we need to allocate one and make
-        * the freespace entries refer to it.
-        */
-       if (unlikely(dbno == -1)) {
-               /*
-                * Not allowed to allocate, return failure.
-                */
-               if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
-                       return XFS_ERROR(ENOSPC);
-
-               /*
-                * Allocate and initialize the new data block.
-                */
-               error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &dbno);
-               if (error)
-                       return error;
-               error = xfs_dir3_data_init(args, dbno, &dbp);
-               if (error)
-                       return error;
-
-               /*
-                * If (somehow) we have a freespace block, get rid of it.
-                */
-               if (fbp)
-                       xfs_trans_brelse(tp, fbp);
-               if (fblk && fblk->bp)
-                       fblk->bp = NULL;
-
-               /*
-                * Get the freespace block corresponding to the data block
-                * that was just allocated.
-                */
-               fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
-               error = xfs_dir2_free_try_read(tp, dp,
-                                      xfs_dir2_db_to_da(args->geo, fbno),
-                                      &fbp);
-               if (error)
-                       return error;
-
-               /*
-                * If there wasn't a freespace block, the read will
-                * return a NULL fbp.  Allocate and initialize a new one.
-                */
-               if (!fbp) {
-                       error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
-                                                   &fbno);
-                       if (error)
-                               return error;
-
-                       if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
-                               xfs_alert(mp,
-                       "%s: dir ino %llu needed freesp block %lld for\n"
-                       "  data block %lld, got %lld ifbno %llu lastfbno %d",
-                                       __func__, (unsigned long long)dp->i_ino,
-                                       (long long)dp->d_ops->db_to_fdb(
-                                                               args->geo, dbno),
-                                       (long long)dbno, (long long)fbno,
-                                       (unsigned long long)ifbno, lastfbno);
-                               if (fblk) {
-                                       xfs_alert(mp,
-                               " fblk 0x%p blkno %llu index %d magic 0x%x",
-                                               fblk,
-                                               (unsigned long long)fblk->blkno,
-                                               fblk->index,
-                                               fblk->magic);
-                               } else {
-                                       xfs_alert(mp, " ... fblk is NULL");
-                               }
-                               XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
-                                                XFS_ERRLEVEL_LOW, mp);
-                               return XFS_ERROR(EFSCORRUPTED);
-                       }
-
-                       /*
-                        * Get a buffer for the new block.
-                        */
-                       error = xfs_dir3_free_get_buf(args, fbno, &fbp);
-                       if (error)
-                               return error;
-                       free = fbp->b_addr;
-                       bests = dp->d_ops->free_bests_p(free);
-                       dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
-                       /*
-                        * Remember the first slot as our empty slot.
-                        */
-                       freehdr.firstdb =
-                               (fbno - xfs_dir2_byte_to_db(args->geo,
-                                                       XFS_DIR2_FREE_OFFSET)) *
-                                       dp->d_ops->free_max_bests(args->geo);
-               } else {
-                       free = fbp->b_addr;
-                       bests = dp->d_ops->free_bests_p(free);
-                       dp->d_ops->free_hdr_from_disk(&freehdr, free);
-               }
-
-               /*
-                * Set the freespace block index from the data block number.
-                */
-               findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
-               /*
-                * If it's after the end of the current entries in the
-                * freespace block, extend that table.
-                */
-               if (findex >= freehdr.nvalid) {
-                       ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
-                       freehdr.nvalid = findex + 1;
-                       /*
-                        * Tag new entry so nused will go up.
-                        */
-                       bests[findex] = cpu_to_be16(NULLDATAOFF);
-               }
-               /*
-                * If this entry was for an empty data block
-                * (this should always be true) then update the header.
-                */
-               if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
-                       freehdr.nused++;
-                       dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
-                       xfs_dir2_free_log_header(args, fbp);
-               }
-               /*
-                * Update the real value in the table.
-                * We haven't allocated the data entry yet so this will
-                * change again.
-                */
-               hdr = dbp->b_addr;
-               bf = dp->d_ops->data_bestfree_p(hdr);
-               bests[findex] = bf[0].length;
-               logfree = 1;
-       } else {
-               /*
-                * We had a data block so we don't have to make a new one.
-                */
-               /*
-                * If just checking, we succeeded.
-                */
-               if (args->op_flags & XFS_DA_OP_JUSTCHECK)
-                       return 0;
-
-               /*
-                * Read the data block in.
-                */
-               error = xfs_dir3_data_read(tp, dp,
-                                          xfs_dir2_db_to_da(args->geo, dbno),
-                                          -1, &dbp);
-               if (error)
-                       return error;
-               hdr = dbp->b_addr;
-               bf = dp->d_ops->data_bestfree_p(hdr);
-               logfree = 0;
-       }
-       ASSERT(be16_to_cpu(bf[0].length) >= length);
-       /*
-        * Point to the existing unused space.
-        */
-       dup = (xfs_dir2_data_unused_t *)
-             ((char *)hdr + be16_to_cpu(bf[0].offset));
-       needscan = needlog = 0;
-       /*
-        * Mark the first part of the unused space, inuse for us.
-        */
-       xfs_dir2_data_use_free(args, dbp, dup,
-               (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
-               &needlog, &needscan);
-       /*
-        * Fill in the new entry and log it.
-        */
-       dep = (xfs_dir2_data_entry_t *)dup;
-       dep->inumber = cpu_to_be64(args->inumber);
-       dep->namelen = args->namelen;
-       memcpy(dep->name, args->name, dep->namelen);
-       dp->d_ops->data_put_ftype(dep, args->filetype);
-       tagp = dp->d_ops->data_entry_tag_p(dep);
-       *tagp = cpu_to_be16((char *)dep - (char *)hdr);
-       xfs_dir2_data_log_entry(args, dbp, dep);
-       /*
-        * Rescan the block for bestfree if needed.
-        */
-       if (needscan)
-               xfs_dir2_data_freescan(dp, hdr, &needlog);
-       /*
-        * Log the data block header if needed.
-        */
-       if (needlog)
-               xfs_dir2_data_log_header(args, dbp);
-       /*
-        * If the freespace entry is now wrong, update it.
-        */
-       bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */
-       if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
-               bests[findex] = bf[0].length;
-               logfree = 1;
-       }
-       /*
-        * Log the freespace entry if needed.
-        */
-       if (logfree)
-               xfs_dir2_free_log_bests(args, fbp, findex, findex);
-       /*
-        * Return the data block and offset in args, then drop the data block.
-        */
-       args->blkno = (xfs_dablk_t)dbno;
-       args->index = be16_to_cpu(*tagp);
-       return 0;
-}
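The scan loop in xfs_dir2_node_addname_int is easier to follow without the buffer management: walk the free-index blocks in order, skip holes, and take the first data block whose recorded longest free span can hold the new entry. A flattened model (the array layout and SK_MAX_BESTS value are assumptions of the sketch, not the on-disk geometry):

        #include <stddef.h>

        #define SK_MAX_BESTS    60              /* assumed entries per free block */
        #define SK_NULLOFF      ((short)-1)     /* stand-in for NULLDATAOFF */

        /*
         * bests[fb] is NULL for a hole in the freespace; otherwise it holds
         * nvalid[fb] longest-free-span records.  Returns a data block number,
         * or -1 if the caller must allocate a fresh data block.
         */
        static int sketch_find_data_block(short **bests, const int *nvalid,
                                          int nfree, int length)
        {
                int fb, i;

                for (fb = 0; fb < nfree; fb++) {
                        if (!bests[fb])
                                continue;
                        for (i = 0; i < nvalid[fb]; i++)
                                if (bests[fb][i] != SK_NULLOFF &&
                                    bests[fb][i] >= length)
                                        return fb * SK_MAX_BESTS + i;
                }
                return -1;
        }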
-
-/*
- * Lookup an entry in a node-format directory.
- * All the real work happens in xfs_da3_node_lookup_int.
- * The only real output is the inode number of the entry.
- */
-int                                            /* error */
-xfs_dir2_node_lookup(
-       xfs_da_args_t   *args)                  /* operation arguments */
-{
-       int             error;                  /* error return value */
-       int             i;                      /* btree level */
-       int             rval;                   /* operation return value */
-       xfs_da_state_t  *state;                 /* btree cursor */
-
-       trace_xfs_dir2_node_lookup(args);
-
-       /*
-        * Allocate and initialize the btree cursor.
-        */
-       state = xfs_da_state_alloc();
-       state->args = args;
-       state->mp = args->dp->i_mount;
-       /*
-        * Fill in the path to the entry in the cursor.
-        */
-       error = xfs_da3_node_lookup_int(state, &rval);
-       if (error)
-               rval = error;
-       else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) {
-               /* If a CI match, dup the actual name and return EEXIST */
-               xfs_dir2_data_entry_t   *dep;
-
-               dep = (xfs_dir2_data_entry_t *)
-                       ((char *)state->extrablk.bp->b_addr +
-                                                state->extrablk.index);
-               rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
-       }
-       /*
-        * Release the btree blocks and leaf block.
-        */
-       for (i = 0; i < state->path.active; i++) {
-               xfs_trans_brelse(args->trans, state->path.blk[i].bp);
-               state->path.blk[i].bp = NULL;
-       }
-       /*
-        * Release the data block if we have it.
-        */
-       if (state->extravalid && state->extrablk.bp) {
-               xfs_trans_brelse(args->trans, state->extrablk.bp);
-               state->extrablk.bp = NULL;
-       }
-       xfs_da_state_free(state);
-       return rval;
-}
-
-/*
- * Remove an entry from a node-format directory.
- */
-int                                            /* error */
-xfs_dir2_node_removename(
-       struct xfs_da_args      *args)          /* operation arguments */
-{
-       struct xfs_da_state_blk *blk;           /* leaf block */
-       int                     error;          /* error return value */
-       int                     rval;           /* operation return value */
-       struct xfs_da_state     *state;         /* btree cursor */
-
-       trace_xfs_dir2_node_removename(args);
-
-       /*
-        * Allocate and initialize the btree cursor.
-        */
-       state = xfs_da_state_alloc();
-       state->args = args;
-       state->mp = args->dp->i_mount;
-
-       /* Look up the entry we're deleting, set up the cursor. */
-       error = xfs_da3_node_lookup_int(state, &rval);
-       if (error)
-               goto out_free;
-
-       /* Didn't find it, upper layer screwed up. */
-       if (rval != EEXIST) {
-               error = rval;
-               goto out_free;
-       }
-
-       blk = &state->path.blk[state->path.active - 1];
-       ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
-       ASSERT(state->extravalid);
-       /*
-        * Remove the leaf and data entries.
-        * Extrablk refers to the data block.
-        */
-       error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
-               &state->extrablk, &rval);
-       if (error)
-               goto out_free;
-       /*
-        * Fix the hash values up the btree.
-        */
-       xfs_da3_fixhashpath(state, &state->path);
-       /*
-        * If we need to join leaf blocks, do it.
-        */
-       if (rval && state->path.active > 1)
-               error = xfs_da3_join(state);
-       /*
-        * If no errors so far, try conversion to leaf format.
-        */
-       if (!error)
-               error = xfs_dir2_node_to_leaf(state);
-out_free:
-       xfs_da_state_free(state);
-       return error;
-}
-
-/*
- * Replace an entry's inode number in a node-format directory.
- */
-int                                            /* error */
-xfs_dir2_node_replace(
-       xfs_da_args_t           *args)          /* operation arguments */
-{
-       xfs_da_state_blk_t      *blk;           /* leaf block */
-       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
-       xfs_dir2_data_entry_t   *dep;           /* data entry changed */
-       int                     error;          /* error return value */
-       int                     i;              /* btree level */
-       xfs_ino_t               inum;           /* new inode number */
-       xfs_dir2_leaf_t         *leaf;          /* leaf structure */
-       xfs_dir2_leaf_entry_t   *lep;           /* leaf entry being changed */
-       int                     rval;           /* internal return value */
-       xfs_da_state_t          *state;         /* btree cursor */
-
-       trace_xfs_dir2_node_replace(args);
-
-       /*
-        * Allocate and initialize the btree cursor.
-        */
-       state = xfs_da_state_alloc();
-       state->args = args;
-       state->mp = args->dp->i_mount;
-       inum = args->inumber;
-       /*
-        * Lookup the entry to change in the btree.
-        */
-       error = xfs_da3_node_lookup_int(state, &rval);
-       if (error)
-               rval = error;
-       /*
-        * It should be found, since the vnodeops layer has looked it up
-        * and locked it.  But paranoia is good.
-        */
-       if (rval == EEXIST) {
-               struct xfs_dir2_leaf_entry *ents;
-               /*
-                * Find the leaf entry.
-                */
-               blk = &state->path.blk[state->path.active - 1];
-               ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
-               leaf = blk->bp->b_addr;
-               ents = args->dp->d_ops->leaf_ents_p(leaf);
-               lep = &ents[blk->index];
-               ASSERT(state->extravalid);
-               /*
-                * Point to the data entry.
-                */
-               hdr = state->extrablk.bp->b_addr;
-               ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-                      hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
-               dep = (xfs_dir2_data_entry_t *)
-                     ((char *)hdr +
-                      xfs_dir2_dataptr_to_off(args->geo,
-                                              be32_to_cpu(lep->address)));
-               ASSERT(inum != be64_to_cpu(dep->inumber));
-               /*
-                * Fill in the new inode number and log the entry.
-                */
-               dep->inumber = cpu_to_be64(inum);
-               args->dp->d_ops->data_put_ftype(dep, args->filetype);
-               xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
-               rval = 0;
-       } else if (state->extravalid) {
-               /*
-                * Didn't find it, and we're holding a data block.  Drop it.
-                */
-               xfs_trans_brelse(args->trans, state->extrablk.bp);
-               state->extrablk.bp = NULL;
-       }
-       /*
-        * Release all the buffers in the cursor.
-        */
-       for (i = 0; i < state->path.active; i++) {
-               xfs_trans_brelse(args->trans, state->path.blk[i].bp);
-               state->path.blk[i].bp = NULL;
-       }
-       xfs_da_state_free(state);
-       return rval;
-}
-
-/*
- * Trim off a trailing empty freespace block.
- * Return (in rvalp) 1 if we did it, 0 if not.
- */
-int                                            /* error */
-xfs_dir2_node_trim_free(
-       xfs_da_args_t           *args,          /* operation arguments */
-       xfs_fileoff_t           fo,             /* free block number */
-       int                     *rvalp)         /* out: did something */
-{
-       struct xfs_buf          *bp;            /* freespace buffer */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     error;          /* error return code */
-       xfs_dir2_free_t         *free;          /* freespace structure */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       xfs_trans_t             *tp;            /* transaction pointer */
-       struct xfs_dir3_icfree_hdr freehdr;
-
-       dp = args->dp;
-       mp = dp->i_mount;
-       tp = args->trans;
-       /*
-        * Read the freespace block.
-        */
-       error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
-       if (error)
-               return error;
-       /*
-        * There can be holes in freespace.  If fo is a hole, there's
-        * nothing to do.
-        */
-       if (!bp)
-               return 0;
-       free = bp->b_addr;
-       dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
-       /*
-        * If there are used entries, there's nothing to do.
-        */
-       if (freehdr.nused > 0) {
-               xfs_trans_brelse(tp, bp);
-               *rvalp = 0;
-               return 0;
-       }
-       /*
-        * Blow the block away.
-        */
-       error = xfs_dir2_shrink_inode(args,
-                       xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp);
-       if (error) {
-               /*
-                * Can't fail with ENOSPC since that only happens with no
-                * space reservation, when breaking up an extent into two
-                * pieces.  This is the last block of an extent.
-                */
-               ASSERT(error != ENOSPC);
-               xfs_trans_brelse(tp, bp);
-               return error;
-       }
-       /*
-        * Return that we succeeded.
-        */
-       *rvalp = 1;
-       return 0;
-}
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
deleted file mode 100644 (file)
index 27ce079..0000000
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DIR2_PRIV_H__
-#define __XFS_DIR2_PRIV_H__
-
-struct dir_context;
-
-/*
- * Directory offset/block conversion functions.
- *
- * DB blocks here are logical directory block numbers, not filesystem blocks.
- */
-
-/*
- * Convert dataptr to byte in file space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
-{
-       return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
-}
-
-/*
- * Convert byte in file space to dataptr.  It had better be aligned.
- */
-static inline xfs_dir2_dataptr_t
-xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
-{
-       return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
-}
-
-/*
- * Convert byte in space to (DB) block
- */
-static inline xfs_dir2_db_t
-xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
-{
-       return (xfs_dir2_db_t)(by >> geo->blklog);
-}
-
-/*
- * Convert dataptr to a block number
- */
-static inline xfs_dir2_db_t
-xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
-{
-       return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
-}
-
-/*
- * Convert byte in space to offset in a block
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
-{
-       return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
-}
-
-/*
- * Convert dataptr to a byte offset in a block
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
-{
-       return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
-}
-
-/*
- * Convert block and offset to byte in space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
-                       xfs_dir2_data_aoff_t o)
-{
-       return ((xfs_dir2_off_t)db << geo->blklog) + o;
-}
-
-/*
- * Convert block (DB) to block (dablk)
- */
-static inline xfs_dablk_t
-xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
-       return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
-}
-
-/*
- * Convert byte in space to (DA) block
- */
-static inline xfs_dablk_t
-xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
-{
-       return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
-}
-
-/*
- * Convert block and offset to dataptr
- */
-static inline xfs_dir2_dataptr_t
-xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
-                          xfs_dir2_data_aoff_t o)
-{
-       return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
-}
-
-/*
- * Convert block (dablk) to block (DB)
- */
-static inline xfs_dir2_db_t
-xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
-{
-       return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
-}
-
-/*
- * Convert block (dablk) to byte offset in space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
-{
-       return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
-}
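All of these helpers are shift-and-mask arithmetic over the same byte address. A worked round trip, assuming 4096-byte directory blocks (blklog = 12) and the usual 8-byte dataptr alignment (XFS_DIR2_DATA_ALIGN_LOG = 3):

        #include <assert.h>

        static void dataptr_round_trip(void)
        {
                unsigned long by  = 0x208UL << 3;               /* byte 0x1040 */
                unsigned int  db  = by >> 12;                   /* block 1 */
                unsigned int  off = by & ((1u << 12) - 1);      /* offset 0x40 */

                assert(db == 1 && off == 0x40 && (by >> 3) == 0x208);
        }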
-
-/*
- * Directory tail pointer accessor functions. Based on block geometry.
- */
-static inline struct xfs_dir2_block_tail *
-xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
-{
-       return ((struct xfs_dir2_block_tail *)
-               ((char *)hdr + geo->blksize)) - 1;
-}
-
-static inline struct xfs_dir2_leaf_tail *
-xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
-{
-       return (struct xfs_dir2_leaf_tail *)
-               ((char *)lp + geo->blksize -
-                 sizeof(struct xfs_dir2_leaf_tail));
-}
-
-/* xfs_dir2.c */
-extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
-extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
-                               xfs_dir2_db_t *dbp);
-extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
-                               const unsigned char *name, int len);
-
-#define S_SHIFT 12
-extern const unsigned char xfs_mode_to_ftype[];
-
-extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp,
-                                       __uint8_t filetype);
-
-
-/* xfs_dir2_block.c */
-extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
-                              struct xfs_buf **bpp);
-extern int xfs_dir2_block_addname(struct xfs_da_args *args);
-extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
-extern int xfs_dir2_block_removename(struct xfs_da_args *args);
-extern int xfs_dir2_block_replace(struct xfs_da_args *args);
-extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
-               struct xfs_buf *lbp, struct xfs_buf *dbp);
-
-/* xfs_dir2_data.c */
-#ifdef DEBUG
-#define        xfs_dir3_data_check(dp, bp)     __xfs_dir3_data_check(dp, bp)
-#else
-#define        xfs_dir3_data_check(dp, bp)
-#endif
-
-extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
-extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
-               xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
-extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
-               xfs_daddr_t mapped_bno);
-
-extern struct xfs_dir2_data_free *
-xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
-               struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup,
-               int *loghead);
-extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
-               struct xfs_buf **bpp);
-
-/* xfs_dir2_leaf.c */
-extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
-               xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
-extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
-               struct xfs_buf *dbp);
-extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
-extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
-               struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp);
-extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
-               struct xfs_dir2_leaf_entry *ents, int *indexp,
-               int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
-extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
-               struct xfs_buf **bpp, __uint16_t magic);
-extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
-               struct xfs_buf *bp, int first, int last);
-extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
-               struct xfs_buf *bp);
-extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
-extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
-extern int xfs_dir2_leaf_replace(struct xfs_da_args *args);
-extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
-               struct xfs_buf *lbp);
-extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
-               struct xfs_buf *lbp, xfs_dir2_db_t db);
-extern struct xfs_dir2_leaf_entry *
-xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
-               struct xfs_dir2_leaf_entry *ents, int index, int compact,
-               int lowstale, int highstale, int *lfloglow, int *lfloghigh);
-extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
-
-extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp,
-               struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
-
-/* xfs_dir2_node.c */
-extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
-               struct xfs_buf *lbp);
-extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp,
-               struct xfs_buf *bp, int *count);
-extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp,
-               struct xfs_da_args *args, int *indexp,
-               struct xfs_da_state *state);
-extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp,
-               struct xfs_buf *leaf2_bp);
-extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
-       struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk);
-extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
-extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
-               struct xfs_da_state_blk *drop_blk,
-               struct xfs_da_state_blk *save_blk);
-extern int xfs_dir2_node_addname(struct xfs_da_args *args);
-extern int xfs_dir2_node_lookup(struct xfs_da_args *args);
-extern int xfs_dir2_node_removename(struct xfs_da_args *args);
-extern int xfs_dir2_node_replace(struct xfs_da_args *args);
-extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
-               int *rvalp);
-extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
-               xfs_dablk_t fbno, struct xfs_buf **bpp);
-
-/* xfs_dir2_sf.c */
-extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
-               struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
-extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
-               int size, xfs_dir2_sf_hdr_t *sfhp);
-extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
-extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
-extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
-extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
-extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
-
-/* xfs_dir2_readdir.c */
-extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
-                      size_t bufsize);
-
-#endif /* __XFS_DIR2_PRIV_H__ */
index 48e99afb9cb0ceff340e4842009d67d6c063d2f1..f1b69edcdf310fc0342c0aff40bacfa2682f13c2 100644 (file)
@@ -95,7 +95,7 @@ xfs_dir2_sf_getdents(
         */
        if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
                ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-               return XFS_ERROR(EIO);
+               return -EIO;
        }
 
        ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
@@ -677,7 +677,7 @@ xfs_readdir(
        trace_xfs_readdir(dp);
 
        if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        ASSERT(S_ISDIR(dp->i_d.di_mode));
        XFS_STATS_INC(xs_dir_getdents);
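The XFS_ERROR(EIO) -> -EIO hunks above reflect XFS's move from its historical convention of positive internal error codes, negated at the VFS boundary, to the kernel-wide convention of negative errnos throughout. A minimal sketch of the two conventions, with hypothetical function names:

/* Hypothetical sketch contrasting the two errno conventions. */
#include <errno.h>

/* Old style: positive errno internally, negated once at the boundary. */
static int old_internal(void) { return EIO; }
static int old_boundary(void) { return -old_internal(); }

/* New style: negative errno everywhere; nothing to flip at the boundary. */
static int new_internal(void) { return -EIO; }
static int new_boundary(void) { return new_internal(); }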
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
deleted file mode 100644 (file)
index 53c3be6..0000000
+++ /dev/null
@@ -1,1184 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_error.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_trace.h"
-#include "xfs_dinode.h"
-
-/*
- * Prototypes for internal functions.
- */
-static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
-                                    xfs_dir2_sf_entry_t *sfep,
-                                    xfs_dir2_data_aoff_t offset,
-                                    int new_isize);
-static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
-                                    int new_isize);
-static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
-                                   xfs_dir2_sf_entry_t **sfepp,
-                                   xfs_dir2_data_aoff_t *offsetp);
-#ifdef DEBUG
-static void xfs_dir2_sf_check(xfs_da_args_t *args);
-#else
-#define        xfs_dir2_sf_check(args)
-#endif /* DEBUG */
-#if XFS_BIG_INUMS
-static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
-static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
-#endif /* XFS_BIG_INUMS */
-
-/*
- * Given a block directory (dp/block), calculate its size as a shortform (sf)
- * directory and a header for the sf directory, if it will fit in the
- * space currently present in the inode.  If it won't fit, the output
- * size is too big (but not an accurate size).
- */
-int                                            /* size for sf form */
-xfs_dir2_block_sfsize(
-       xfs_inode_t             *dp,            /* incore inode pointer */
-       xfs_dir2_data_hdr_t     *hdr,           /* block directory data */
-       xfs_dir2_sf_hdr_t       *sfhp)          /* output: header for sf form */
-{
-       xfs_dir2_dataptr_t      addr;           /* data entry address */
-       xfs_dir2_leaf_entry_t   *blp;           /* leaf area of the block */
-       xfs_dir2_block_tail_t   *btp;           /* tail area of the block */
-       int                     count;          /* shortform entry count */
-       xfs_dir2_data_entry_t   *dep;           /* data entry in the block */
-       int                     i;              /* block entry index */
-       int                     i8count;        /* count of big-inode entries */
-       int                     isdot;          /* entry is "." */
-       int                     isdotdot;       /* entry is ".." */
-       xfs_mount_t             *mp;            /* mount structure pointer */
-       int                     namelen;        /* total name bytes */
-       xfs_ino_t               parent = 0;     /* parent inode number */
-       int                     size=0;         /* total computed size */
-       int                     has_ftype;
-       struct xfs_da_geometry  *geo;
-
-       mp = dp->i_mount;
-       geo = mp->m_dir_geo;
-
-       /*
-        * if there is a filetype field, add the extra byte to the namelen
-        * for each entry that we see.
-        */
-       has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
-
-       count = i8count = namelen = 0;
-       btp = xfs_dir2_block_tail_p(geo, hdr);
-       blp = xfs_dir2_block_leaf_p(btp);
-
-       /*
-        * Iterate over the block's data entries by using the leaf pointers.
-        */
-       for (i = 0; i < be32_to_cpu(btp->count); i++) {
-               if ((addr = be32_to_cpu(blp[i].address)) == XFS_DIR2_NULL_DATAPTR)
-                       continue;
-               /*
-                * Calculate the pointer to the entry at hand.
-                */
-               dep = (xfs_dir2_data_entry_t *)((char *)hdr +
-                               xfs_dir2_dataptr_to_off(geo, addr));
-               /*
-                * Detect . and .., so we can special-case them.
-                * . is not included in sf directories.
-                * .. is included by just the parent inode number.
-                */
-               isdot = dep->namelen == 1 && dep->name[0] == '.';
-               isdotdot =
-                       dep->namelen == 2 &&
-                       dep->name[0] == '.' && dep->name[1] == '.';
-#if XFS_BIG_INUMS
-               if (!isdot)
-                       i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
-#endif
-               /* take into account the file type field */
-               if (!isdot && !isdotdot) {
-                       count++;
-                       namelen += dep->namelen + has_ftype;
-               } else if (isdotdot)
-                       parent = be64_to_cpu(dep->inumber);
-               /*
-                * Calculate the new size, see if we should give up yet.
-                */
-               size = xfs_dir2_sf_hdr_size(i8count) +          /* header */
-                      count +                                  /* namelen */
-                      count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
-                      namelen +                                /* name */
-                      (i8count ?                               /* inumber */
-                               (uint)sizeof(xfs_dir2_ino8_t) * count :
-                               (uint)sizeof(xfs_dir2_ino4_t) * count);
-               if (size > XFS_IFORK_DSIZE(dp))
-                       return size;            /* size value is a failure */
-       }
-       /*
-        * Create the output header, if it worked.
-        */
-       sfhp->count = count;
-       sfhp->i8count = i8count;
-       dp->d_ops->sf_put_parent_ino(sfhp, parent);
-       return size;
-}
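The running size computation above can be checked in isolation. A standalone sketch with assumed field sizes (a 2-byte fixed header plus a 4- or 8-byte parent inode number, and 2-byte offsets; the authoritative layout lives in the on-disk format headers):

#include <stdio.h>

/* Assumed sizes for illustration only; see the xfs_da_format headers
 * for the authoritative on-disk layout. */
static int sf_hdr_size(int i8count) { return 2 + (i8count ? 8 : 4); }

static int sf_size(int count, int i8count, int namelen)
{
	return sf_hdr_size(i8count) +		/* header */
	       count +				/* one namelen byte each */
	       count * 2 +			/* one 2-byte offset each */
	       namelen +			/* name bytes */
	       count * (i8count ? 8 : 4);	/* inode numbers */
}

int main(void)
{
	/* Three entries, 20 name bytes, small inumbers: 6+3+6+20+12 */
	printf("%d\n", sf_size(3, 0, 20));	/* prints 47 */
	return 0;
}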
-
-/*
- * Convert a block format directory to shortform.
- * Caller has already checked that it will fit, and built us a header.
- */
-int                                            /* error */
-xfs_dir2_block_to_sf(
-       xfs_da_args_t           *args,          /* operation arguments */
-       struct xfs_buf          *bp,
-       int                     size,           /* shortform directory size */
-       xfs_dir2_sf_hdr_t       *sfhp)          /* shortform directory hdr */
-{
-       xfs_dir2_data_hdr_t     *hdr;           /* block header */
-       xfs_dir2_block_tail_t   *btp;           /* block tail pointer */
-       xfs_dir2_data_entry_t   *dep;           /* data entry pointer */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       xfs_dir2_data_unused_t  *dup;           /* unused data pointer */
-       char                    *endptr;        /* end of data entries */
-       int                     error;          /* error return value */
-       int                     logflags;       /* inode logging flags */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       char                    *ptr;           /* current data pointer */
-       xfs_dir2_sf_entry_t     *sfep;          /* shortform entry */
-       xfs_dir2_sf_hdr_t       *sfp;           /* shortform directory header */
-       xfs_dir2_sf_hdr_t       *dst;           /* temporary data buffer */
-
-       trace_xfs_dir2_block_to_sf(args);
-
-       dp = args->dp;
-       mp = dp->i_mount;
-
-       /*
-        * allocate a temporary destination buffer the size of the inode
-        * to format the data into. Once we have formatted the data, we
-        * can free the block and copy the formatted data into the inode literal
-        * area.
-        */
-       dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
-       hdr = bp->b_addr;
-
-       /*
-        * Copy the header into the newly allocated local space.
-        */
-       sfp = (xfs_dir2_sf_hdr_t *)dst;
-       memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
-
-       /*
-        * Set up to loop over the block's entries.
-        */
-       btp = xfs_dir2_block_tail_p(args->geo, hdr);
-       ptr = (char *)dp->d_ops->data_entry_p(hdr);
-       endptr = (char *)xfs_dir2_block_leaf_p(btp);
-       sfep = xfs_dir2_sf_firstentry(sfp);
-       /*
-        * Loop over the active and unused entries.
-        * Stop when we reach the leaf/tail portion of the block.
-        */
-       while (ptr < endptr) {
-               /*
-                * If it's unused, just skip over it.
-                */
-               dup = (xfs_dir2_data_unused_t *)ptr;
-               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-                       ptr += be16_to_cpu(dup->length);
-                       continue;
-               }
-               dep = (xfs_dir2_data_entry_t *)ptr;
-               /*
-                * Skip .
-                */
-               if (dep->namelen == 1 && dep->name[0] == '.')
-                       ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino);
-               /*
-                * Skip .., but make sure the inode number is right.
-                */
-               else if (dep->namelen == 2 &&
-                        dep->name[0] == '.' && dep->name[1] == '.')
-                       ASSERT(be64_to_cpu(dep->inumber) ==
-                              dp->d_ops->sf_get_parent_ino(sfp));
-               /*
-                * Normal entry, copy it into shortform.
-                */
-               else {
-                       sfep->namelen = dep->namelen;
-                       xfs_dir2_sf_put_offset(sfep,
-                               (xfs_dir2_data_aoff_t)
-                               ((char *)dep - (char *)hdr));
-                       memcpy(sfep->name, dep->name, dep->namelen);
-                       dp->d_ops->sf_put_ino(sfp, sfep,
-                                             be64_to_cpu(dep->inumber));
-                       dp->d_ops->sf_put_ftype(sfep,
-                                       dp->d_ops->data_get_ftype(dep));
-
-                       sfep = dp->d_ops->sf_nextentry(sfp, sfep);
-               }
-               ptr += dp->d_ops->data_entsize(dep->namelen);
-       }
-       ASSERT((char *)sfep - (char *)sfp == size);
-
-       /* now we are done with the block, we can shrink the inode */
-       logflags = XFS_ILOG_CORE;
-       error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
-       if (error) {
-               ASSERT(error != ENOSPC);
-               goto out;
-       }
-
-       /*
-        * The buffer is now unconditionally gone, whether
-        * xfs_dir2_shrink_inode worked or not.
-        *
-        * Convert the inode to local format and copy the data in.
-        */
-       dp->i_df.if_flags &= ~XFS_IFEXTENTS;
-       dp->i_df.if_flags |= XFS_IFINLINE;
-       dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
-       ASSERT(dp->i_df.if_bytes == 0);
-       xfs_idata_realloc(dp, size, XFS_DATA_FORK);
-
-       logflags |= XFS_ILOG_DDATA;
-       memcpy(dp->i_df.if_u1.if_data, dst, size);
-       dp->i_d.di_size = size;
-       xfs_dir2_sf_check(args);
-out:
-       xfs_trans_log_inode(args->trans, dp, logflags);
-       kmem_free(dst);
-       return error;
-}
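The conversion loop above walks variable-length records where unused space is marked by a free tag and a skip length. A self-contained model of that traversal; the record layout here is an assumption for illustration, not the on-disk format:

#include <stdint.h>
#include <stdio.h>

#define FREE_TAG 0xffffu		/* assumed marker for unused space */

struct rec {				/* assumed, fixed-size for the model */
	uint16_t tag;			/* FREE_TAG if this region is a hole */
	uint16_t len;			/* bytes to skip when it is a hole */
	uint32_t payload;
};

static void walk(const uint8_t *p, const uint8_t *end)
{
	while (p < end) {
		const struct rec *cur = (const struct rec *)p;

		if (cur->tag == FREE_TAG) {
			p += cur->len;	/* skip the hole entirely */
			continue;
		}
		printf("live entry: %u\n", (unsigned int)cur->payload);
		p += sizeof(*cur);	/* real code computes per-entry sizes */
	}
}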
-
-/*
- * Add a name to a shortform directory.
- * There are two algorithms, "easy" and "hard" which we decide on
- * before changing anything.
- * Convert to block form if necessary, if the new entry won't fit.
- */
-int                                            /* error */
-xfs_dir2_sf_addname(
-       xfs_da_args_t           *args)          /* operation arguments */
-{
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     error;          /* error return value */
-       int                     incr_isize;     /* total change in size */
-       int                     new_isize;      /* di_size after adding name */
-       int                     objchange;      /* changing to 8-byte inodes */
-       xfs_dir2_data_aoff_t    offset = 0;     /* offset for new entry */
-       int                     pick;           /* which algorithm to use */
-       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
-       xfs_dir2_sf_entry_t     *sfep = NULL;   /* shortform entry */
-
-       trace_xfs_dir2_sf_addname(args);
-
-       ASSERT(xfs_dir2_sf_lookup(args) == ENOENT);
-       dp = args->dp;
-       ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-       /*
-        * Make sure the shortform directory contains at least the fixed header.
-        */
-       if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
-               ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-               return XFS_ERROR(EIO);
-       }
-       ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-       ASSERT(dp->i_df.if_u1.if_data != NULL);
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
-       /*
-        * Compute entry (and change in) size.
-        */
-       incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
-       objchange = 0;
-#if XFS_BIG_INUMS
-       /*
-        * Do we have to change to 8 byte inodes?
-        */
-       if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
-               /*
-                * Yes, adjust the inode size: every old entry plus the
-                * parent and the new entry grow by the size difference.
-                */
-               incr_isize +=
-                       (sfp->count + 2) *
-                       ((uint)sizeof(xfs_dir2_ino8_t) -
-                        (uint)sizeof(xfs_dir2_ino4_t));
-               objchange = 1;
-       }
-#endif
-       new_isize = (int)dp->i_d.di_size + incr_isize;
-       /*
-        * Won't fit as shortform any more (due to size),
-        * or the pick routine says it won't (due to offset values).
-        */
-       if (new_isize > XFS_IFORK_DSIZE(dp) ||
-           (pick =
-            xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
-               /*
-                * Just checking or no space reservation, it doesn't fit.
-                */
-               if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
-                       return XFS_ERROR(ENOSPC);
-               /*
-                * Convert to block form then add the name.
-                */
-               error = xfs_dir2_sf_to_block(args);
-               if (error)
-                       return error;
-               return xfs_dir2_block_addname(args);
-       }
-       /*
-        * Just checking, it fits.
-        */
-       if (args->op_flags & XFS_DA_OP_JUSTCHECK)
-               return 0;
-       /*
-        * Do it the easy way - just add it at the end.
-        */
-       if (pick == 1)
-               xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize);
-       /*
-        * Do it the hard way - look for a place to insert the new entry.
-        * Convert to 8 byte inode numbers first if necessary.
-        */
-       else {
-               ASSERT(pick == 2);
-#if XFS_BIG_INUMS
-               if (objchange)
-                       xfs_dir2_sf_toino8(args);
-#endif
-               xfs_dir2_sf_addname_hard(args, objchange, new_isize);
-       }
-       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-       return 0;
-}
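The control flow above never lets the directory outgrow the inode literal area: if the projected size or the pick routine says the new entry will not fit, the directory is converted to block form and the insert is retried there. A compressed model with hypothetical helper names:

#include <errno.h>

/* Hypothetical stand-ins for the real conversion and block-insert paths. */
static int to_block_form(void)  { return 0; }
static int block_addname(void)  { return 0; }

static int sf_addname(int new_size, int capacity, int pick, int just_check)
{
	int error;

	if (new_size > capacity || pick == 0) {
		if (just_check)
			return -ENOSPC;		/* check-only caller: won't fit */
		error = to_block_form();	/* escalate the format... */
		if (error)
			return error;
		return block_addname();		/* ...and add the name there */
	}
	if (just_check)
		return 0;			/* it fits; nothing to change */
	/* pick == 1: append at the end; pick == 2: insert into a hole */
	return 0;
}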
-
-/*
- * Add the new entry the "easy" way.
- * This is copying the old directory and adding the new entry at the end.
- * Since it's sorted by "offset" we need room after the last offset
- * that's already there, and then room to convert to a block directory.
- * This is already checked by the pick routine.
- */
-static void
-xfs_dir2_sf_addname_easy(
-       xfs_da_args_t           *args,          /* operation arguments */
-       xfs_dir2_sf_entry_t     *sfep,          /* pointer to new entry */
-       xfs_dir2_data_aoff_t    offset,         /* offset to use for new ent */
-       int                     new_isize)      /* new directory size */
-{
-       int                     byteoff;        /* byte offset in sf dir */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
-
-       dp = args->dp;
-
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       byteoff = (int)((char *)sfep - (char *)sfp);
-       /*
-        * Grow the in-inode space.
-        */
-       xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen),
-                         XFS_DATA_FORK);
-       /*
-        * Need to set up again due to realloc of the inode data.
-        */
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
-       /*
-        * Fill in the new entry.
-        */
-       sfep->namelen = args->namelen;
-       xfs_dir2_sf_put_offset(sfep, offset);
-       memcpy(sfep->name, args->name, sfep->namelen);
-       dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
-       dp->d_ops->sf_put_ftype(sfep, args->filetype);
-
-       /*
-        * Update the header and inode.
-        */
-       sfp->count++;
-#if XFS_BIG_INUMS
-       if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
-               sfp->i8count++;
-#endif
-       dp->i_d.di_size = new_isize;
-       xfs_dir2_sf_check(args);
-}
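Note the byteoff dance above: xfs_idata_realloc() may move the in-inode data, so the entry pointer is saved as a byte offset and rebuilt after the grow. The same pattern as a minimal sketch in portable C:

#include <stdlib.h>
#include <stddef.h>

/* Recompute a pointer into a buffer across a realloc() that may move it. */
static char *grow_and_refind(char **buf, size_t newlen, char *cursor)
{
	ptrdiff_t byteoff = cursor - *buf;	/* save a position, not a pointer */
	char *nbuf = realloc(*buf, newlen);

	if (!nbuf)
		return NULL;
	*buf = nbuf;
	return nbuf + byteoff;			/* the pointer is valid again */
}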
-
-/*
- * Add the new entry the "hard" way.
- * The caller has already converted to 8 byte inode numbers if necessary,
- * in which case we need to leave the i8count at 1.
- * Find a hole that the new entry will fit into, and copy
- * the first part of the entries, the new entry, and the last part of
- * the entries.
- */
-/* ARGSUSED */
-static void
-xfs_dir2_sf_addname_hard(
-       xfs_da_args_t           *args,          /* operation arguments */
-       int                     objchange,      /* changing inode number size */
-       int                     new_isize)      /* new directory size */
-{
-       int                     add_datasize;   /* data size need for new ent */
-       char                    *buf;           /* buffer for old */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     eof;            /* reached end of old dir */
-       int                     nbytes;         /* temp for byte copies */
-       xfs_dir2_data_aoff_t    new_offset;     /* next offset value */
-       xfs_dir2_data_aoff_t    offset;         /* current offset value */
-       int                     old_isize;      /* previous di_size */
-       xfs_dir2_sf_entry_t     *oldsfep;       /* entry in original dir */
-       xfs_dir2_sf_hdr_t       *oldsfp;        /* original shortform dir */
-       xfs_dir2_sf_entry_t     *sfep;          /* entry in new dir */
-       xfs_dir2_sf_hdr_t       *sfp;           /* new shortform dir */
-       struct xfs_mount        *mp;
-
-       /*
-        * Copy the old directory to a temporary buffer.
-        */
-       dp = args->dp;
-       mp = dp->i_mount;
-
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       old_isize = (int)dp->i_d.di_size;
-       buf = kmem_alloc(old_isize, KM_SLEEP);
-       oldsfp = (xfs_dir2_sf_hdr_t *)buf;
-       memcpy(oldsfp, sfp, old_isize);
-       /*
-        * Loop over the old directory finding the place we're going
-        * to insert the new entry.
-        * If it's going to end up at the end then oldsfep will point there.
-        */
-       for (offset = dp->d_ops->data_first_offset,
-             oldsfep = xfs_dir2_sf_firstentry(oldsfp),
-             add_datasize = dp->d_ops->data_entsize(args->namelen),
-             eof = (char *)oldsfep == &buf[old_isize];
-            !eof;
-            offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen),
-             oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep),
-             eof = (char *)oldsfep == &buf[old_isize]) {
-               new_offset = xfs_dir2_sf_get_offset(oldsfep);
-               if (offset + add_datasize <= new_offset)
-                       break;
-       }
-       /*
-        * Get rid of the old directory, then allocate space for
-        * the new one.  We do this so xfs_idata_realloc won't copy
-        * the data.
-        */
-       xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
-       xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
-       /*
-        * Reset the pointer since the buffer was reallocated.
-        */
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       /*
-        * Copy the first part of the directory, including the header.
-        */
-       nbytes = (int)((char *)oldsfep - (char *)oldsfp);
-       memcpy(sfp, oldsfp, nbytes);
-       sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes);
-       /*
-        * Fill in the new entry, and update the header counts.
-        */
-       sfep->namelen = args->namelen;
-       xfs_dir2_sf_put_offset(sfep, offset);
-       memcpy(sfep->name, args->name, sfep->namelen);
-       dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
-       dp->d_ops->sf_put_ftype(sfep, args->filetype);
-       sfp->count++;
-#if XFS_BIG_INUMS
-       if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
-               sfp->i8count++;
-#endif
-       /*
-        * If there's more left to copy, do that.
-        */
-       if (!eof) {
-               sfep = dp->d_ops->sf_nextentry(sfp, sfep);
-               memcpy(sfep, oldsfep, old_isize - nbytes);
-       }
-       kmem_free(buf);
-       dp->i_d.di_size = new_isize;
-       xfs_dir2_sf_check(args);
-}
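The hard case reduces to a three-part splice: copy everything before the insertion point, write the new entry, then copy the remainder. In isolation, assuming a pre-sized destination buffer:

#include <string.h>

/*
 * Insert 'ent' (entlen bytes) at byte offset 'pos' while copying 'old'
 * (oldlen bytes) into 'dst', which must hold oldlen + entlen bytes.
 */
static void splice_insert(char *dst, const char *old, size_t oldlen,
			  size_t pos, const char *ent, size_t entlen)
{
	memcpy(dst, old, pos);					/* head */
	memcpy(dst + pos, ent, entlen);				/* new entry */
	memcpy(dst + pos + entlen, old + pos, oldlen - pos);	/* tail */
}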
-
-/*
- * Decide if the new entry will fit at all.
- * If it will fit, pick between adding the new entry to the end (easy)
- * or somewhere else (hard).
- * Return 0 (won't fit), 1 (easy), 2 (hard).
- */
-/*ARGSUSED*/
-static int                                     /* pick result */
-xfs_dir2_sf_addname_pick(
-       xfs_da_args_t           *args,          /* operation arguments */
-       int                     objchange,      /* inode # size changes */
-       xfs_dir2_sf_entry_t     **sfepp,        /* out(1): new entry ptr */
-       xfs_dir2_data_aoff_t    *offsetp)       /* out(1): new offset */
-{
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     holefit;        /* found hole it will fit in */
-       int                     i;              /* entry number */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       xfs_dir2_data_aoff_t    offset;         /* data block offset */
-       xfs_dir2_sf_entry_t     *sfep;          /* shortform entry */
-       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
-       int                     size;           /* entry's data size */
-       int                     used;           /* data bytes used */
-
-       dp = args->dp;
-       mp = dp->i_mount;
-
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       size = dp->d_ops->data_entsize(args->namelen);
-       offset = dp->d_ops->data_first_offset;
-       sfep = xfs_dir2_sf_firstentry(sfp);
-       holefit = 0;
-       /*
-        * Loop over sf entries.
-        * Keep track of data offset and whether we've seen a place
-        * to insert the new entry.
-        */
-       for (i = 0; i < sfp->count; i++) {
-               if (!holefit)
-                       holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
-               offset = xfs_dir2_sf_get_offset(sfep) +
-                        dp->d_ops->data_entsize(sfep->namelen);
-               sfep = dp->d_ops->sf_nextentry(sfp, sfep);
-       }
-       /*
-        * Calculate the data bytes that would be used, excluding the new
-        * entry's data, if this were a data block (block form directory).
-        * The (count + 3) below counts a leaf entry for each existing
-        * name plus ".", ".." and the new entry.
-        */
-       used = offset +
-              (sfp->count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
-              (uint)sizeof(xfs_dir2_block_tail_t);
-       /*
-        * If it won't fit even in block form then we can't insert it here;
-        * the caller will convert to block form, retry the insert there,
-        * and convert on to leaf form as needed.
-        */
-       if (used + (holefit ? 0 : size) > args->geo->blksize)
-               return 0;
-       /*
-        * If changing the inode number size, do it the hard way.
-        */
-#if XFS_BIG_INUMS
-       if (objchange) {
-               return 2;
-       }
-#else
-       ASSERT(objchange == 0);
-#endif
-       /*
-        * If it won't fit at the end then do it the hard way (use the hole).
-        */
-       if (used + size > args->geo->blksize)
-               return 2;
-       /*
-        * Do it the easy way.
-        */
-       *sfepp = sfep;
-       *offsetp = offset;
-       return 1;
-}
-
-#ifdef DEBUG
-/*
- * Check consistency of shortform directory, assert if bad.
- */
-static void
-xfs_dir2_sf_check(
-       xfs_da_args_t           *args)          /* operation arguments */
-{
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     i;              /* entry number */
-       int                     i8count;        /* number of big inode#s */
-       xfs_ino_t               ino;            /* entry inode number */
-       int                     offset;         /* data offset */
-       xfs_dir2_sf_entry_t     *sfep;          /* shortform dir entry */
-       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
-       struct xfs_mount        *mp;
-
-       dp = args->dp;
-       mp = dp->i_mount;
-
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       offset = dp->d_ops->data_first_offset;
-       ino = dp->d_ops->sf_get_parent_ino(sfp);
-       i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
-
-       for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
-            i < sfp->count;
-            i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
-               ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
-               ino = dp->d_ops->sf_get_ino(sfp, sfep);
-               i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
-               offset =
-                       xfs_dir2_sf_get_offset(sfep) +
-                       dp->d_ops->data_entsize(sfep->namelen);
-               ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
-       }
-       ASSERT(i8count == sfp->i8count);
-       ASSERT(XFS_BIG_INUMS || i8count == 0);
-       ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
-       ASSERT(offset +
-              (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
-              (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize);
-}
-#endif /* DEBUG */
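The walk above asserts that entry offsets never move backwards and that the big-inode count matches the header. A compact model of the same invariant check over an assumed entry array:

#include <assert.h>
#include <stddef.h>

struct ent {			/* assumed shape for the model */
	unsigned int offset;	/* where the entry's data would live */
	unsigned int size;	/* bytes of data it would occupy */
};

static void check_entries(const struct ent *e, size_t n, unsigned int first)
{
	unsigned int offset = first;
	size_t i;

	for (i = 0; i < n; i++) {
		assert(e[i].offset >= offset);	/* no overlap, never backwards */
		offset = e[i].offset + e[i].size;
	}
}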
-
-/*
- * Create a new (shortform) directory.
- */
-int                                    /* error, always 0 */
-xfs_dir2_sf_create(
-       xfs_da_args_t   *args,          /* operation arguments */
-       xfs_ino_t       pino)           /* parent inode number */
-{
-       xfs_inode_t     *dp;            /* incore directory inode */
-       int             i8count;        /* parent inode is an 8-byte number */
-       xfs_dir2_sf_hdr_t *sfp;         /* shortform structure */
-       int             size;           /* directory size */
-
-       trace_xfs_dir2_sf_create(args);
-
-       dp = args->dp;
-
-       ASSERT(dp != NULL);
-       ASSERT(dp->i_d.di_size == 0);
-       /*
-        * If it's currently a zero-length extent file,
-        * convert it to local format.
-        */
-       if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
-               dp->i_df.if_flags &= ~XFS_IFEXTENTS;    /* just in case */
-               dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
-               xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
-               dp->i_df.if_flags |= XFS_IFINLINE;
-       }
-       ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-       ASSERT(dp->i_df.if_bytes == 0);
-       i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
-       size = xfs_dir2_sf_hdr_size(i8count);
-       /*
-        * Make a buffer for the data.
-        */
-       xfs_idata_realloc(dp, size, XFS_DATA_FORK);
-       /*
-        * Fill in the header.
-        */
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       sfp->i8count = i8count;
-       /*
-        * Now we can put in the parent inode number, since i8count is set.
-        */
-       dp->d_ops->sf_put_parent_ino(sfp, pino);
-       sfp->count = 0;
-       dp->i_d.di_size = size;
-       xfs_dir2_sf_check(args);
-       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-       return 0;
-}
-
-/*
- * Lookup an entry in a shortform directory.
- * Returns EEXIST if found, ENOENT if not found.
- */
-int                                            /* error */
-xfs_dir2_sf_lookup(
-       xfs_da_args_t           *args)          /* operation arguments */
-{
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     i;              /* entry index */
-       int                     error;
-       xfs_dir2_sf_entry_t     *sfep;          /* shortform directory entry */
-       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
-       enum xfs_dacmp          cmp;            /* comparison result */
-       xfs_dir2_sf_entry_t     *ci_sfep;       /* case-insens. entry */
-
-       trace_xfs_dir2_sf_lookup(args);
-
-       xfs_dir2_sf_check(args);
-       dp = args->dp;
-
-       ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-       /*
-        * Bail out if the directory is way too short.
-        */
-       if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
-               ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-               return XFS_ERROR(EIO);
-       }
-       ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-       ASSERT(dp->i_df.if_u1.if_data != NULL);
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
-       /*
-        * Special case for .
-        */
-       if (args->namelen == 1 && args->name[0] == '.') {
-               args->inumber = dp->i_ino;
-               args->cmpresult = XFS_CMP_EXACT;
-               args->filetype = XFS_DIR3_FT_DIR;
-               return XFS_ERROR(EEXIST);
-       }
-       /*
-        * Special case for ..
-        */
-       if (args->namelen == 2 &&
-           args->name[0] == '.' && args->name[1] == '.') {
-               args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
-               args->cmpresult = XFS_CMP_EXACT;
-               args->filetype = XFS_DIR3_FT_DIR;
-               return XFS_ERROR(EEXIST);
-       }
-       /*
-        * Loop over all the entries trying to match ours.
-        */
-       ci_sfep = NULL;
-       for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
-            i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
-               /*
-                * Compare name and if it's an exact match, return the inode
-                * number. If it's the first case-insensitive match, store the
-                * inode number and continue looking for an exact match.
-                */
-               cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name,
-                                                               sfep->namelen);
-               if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
-                       args->cmpresult = cmp;
-                       args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
-                       args->filetype = dp->d_ops->sf_get_ftype(sfep);
-                       if (cmp == XFS_CMP_EXACT)
-                               return XFS_ERROR(EEXIST);
-                       ci_sfep = sfep;
-               }
-       }
-       ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-       /*
-        * Here, we can only be doing a lookup (not a rename or replace).
-        * If a case-insensitive match was not found, return ENOENT.
-        */
-       if (!ci_sfep)
-               return XFS_ERROR(ENOENT);
-       /* otherwise process the CI match as required by the caller */
-       error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
-       return XFS_ERROR(error);
-}
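The loop above is a common case-insensitive lookup shape: an exact match ends the scan immediately, while the first case-insensitive hit is only remembered as a fallback. The same shape with standard library calls:

#include <string.h>
#include <strings.h>	/* strcasecmp (POSIX) */

/* Return index of exact match, else first case-insensitive match, else -1. */
static int ci_lookup(const char *want, const char *const names[], int n)
{
	int ci = -1;
	int i;

	for (i = 0; i < n; i++) {
		if (strcmp(names[i], want) == 0)
			return i;		/* exact match wins immediately */
		if (ci < 0 && strcasecmp(names[i], want) == 0)
			ci = i;			/* remember the first CI match */
	}
	return ci;
}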
-
-/*
- * Remove an entry from a shortform directory.
- */
-int                                            /* error */
-xfs_dir2_sf_removename(
-       xfs_da_args_t           *args)
-{
-       int                     byteoff;        /* offset of removed entry */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     entsize;        /* this entry's size */
-       int                     i;              /* shortform entry index */
-       int                     newsize;        /* new inode size */
-       int                     oldsize;        /* old inode size */
-       xfs_dir2_sf_entry_t     *sfep;          /* shortform directory entry */
-       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
-
-       trace_xfs_dir2_sf_removename(args);
-
-       dp = args->dp;
-
-       ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-       oldsize = (int)dp->i_d.di_size;
-       /*
-        * Bail out if the directory is way too short.
-        */
-       if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
-               ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-               return XFS_ERROR(EIO);
-       }
-       ASSERT(dp->i_df.if_bytes == oldsize);
-       ASSERT(dp->i_df.if_u1.if_data != NULL);
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count));
-       /*
-        * Loop over the old directory entries.
-        * Find the one we're deleting.
-        */
-       for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
-            i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
-               if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
-                                                               XFS_CMP_EXACT) {
-                       ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) ==
-                              args->inumber);
-                       break;
-               }
-       }
-       /*
-        * Didn't find it.
-        */
-       if (i == sfp->count)
-               return XFS_ERROR(ENOENT);
-       /*
-        * Calculate sizes.
-        */
-       byteoff = (int)((char *)sfep - (char *)sfp);
-       entsize = dp->d_ops->sf_entsize(sfp, args->namelen);
-       newsize = oldsize - entsize;
-       /*
-        * Copy the part, if any, after the removed entry, sliding it down.
-        */
-       if (byteoff + entsize < oldsize)
-               memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize,
-                       oldsize - (byteoff + entsize));
-       /*
-        * Fix up the header and file size.
-        */
-       sfp->count--;
-       dp->i_d.di_size = newsize;
-       /*
-        * Reallocate, making it smaller.
-        */
-       xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-#if XFS_BIG_INUMS
-       /*
-        * Are we changing inode number size?
-        */
-       if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
-               if (sfp->i8count == 1)
-                       xfs_dir2_sf_toino4(args);
-               else
-                       sfp->i8count--;
-       }
-#endif
-       xfs_dir2_sf_check(args);
-       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-       return 0;
-}
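Removal is the mirror of insertion: one memmove() slides the tail down over the hole, then the allocation shrinks. In isolation:

#include <string.h>

/*
 * Delete entlen bytes at byte offset 'pos' from a buffer of 'len' bytes;
 * returns the new length.  The caller shrinks the allocation afterwards.
 */
static size_t splice_delete(char *buf, size_t len, size_t pos, size_t entlen)
{
	memmove(buf + pos, buf + pos + entlen, len - (pos + entlen));
	return len - entlen;
}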
-
-/*
- * Replace the inode number of an entry in a shortform directory.
- */
-int                                            /* error */
-xfs_dir2_sf_replace(
-       xfs_da_args_t           *args)          /* operation arguments */
-{
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     i;              /* entry index */
-#if XFS_BIG_INUMS || defined(DEBUG)
-       xfs_ino_t               ino=0;          /* entry old inode number */
-#endif
-#if XFS_BIG_INUMS
-       int                     i8elevated;     /* sf_toino8 set i8count=1 */
-#endif
-       xfs_dir2_sf_entry_t     *sfep;          /* shortform directory entry */
-       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
-
-       trace_xfs_dir2_sf_replace(args);
-
-       dp = args->dp;
-
-       ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-       /*
-        * Bail out if the shortform directory is way too small.
-        */
-       if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
-               ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-               return XFS_ERROR(EIO);
-       }
-       ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-       ASSERT(dp->i_df.if_u1.if_data != NULL);
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
-#if XFS_BIG_INUMS
-       /*
-        * The new inode number is large, so we need to convert to 8-byte inodes.
-        */
-       if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
-               int     error;                  /* error return value */
-               int     newsize;                /* new inode size */
-
-               newsize =
-                       dp->i_df.if_bytes +
-                       (sfp->count + 1) *
-                       ((uint)sizeof(xfs_dir2_ino8_t) -
-                        (uint)sizeof(xfs_dir2_ino4_t));
-               /*
-                * Won't fit as shortform, convert to block then do replace.
-                */
-               if (newsize > XFS_IFORK_DSIZE(dp)) {
-                       error = xfs_dir2_sf_to_block(args);
-                       if (error) {
-                               return error;
-                       }
-                       return xfs_dir2_block_replace(args);
-               }
-               /*
-                * Still fits, convert to 8-byte now.
-                */
-               xfs_dir2_sf_toino8(args);
-               i8elevated = 1;
-               sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       } else
-               i8elevated = 0;
-#endif
-       ASSERT(args->namelen != 1 || args->name[0] != '.');
-       /*
-        * Replace ..'s entry.
-        */
-       if (args->namelen == 2 &&
-           args->name[0] == '.' && args->name[1] == '.') {
-#if XFS_BIG_INUMS || defined(DEBUG)
-               ino = dp->d_ops->sf_get_parent_ino(sfp);
-               ASSERT(args->inumber != ino);
-#endif
-               dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
-       }
-       /*
-        * Normal entry, look for the name.
-        */
-       else {
-               for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
-                    i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
-                       if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
-                                                               XFS_CMP_EXACT) {
-#if XFS_BIG_INUMS || defined(DEBUG)
-                               ino = dp->d_ops->sf_get_ino(sfp, sfep);
-                               ASSERT(args->inumber != ino);
-#endif
-                               dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
-                               dp->d_ops->sf_put_ftype(sfep, args->filetype);
-                               break;
-                       }
-               }
-               /*
-                * Didn't find it.
-                */
-               if (i == sfp->count) {
-                       ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-#if XFS_BIG_INUMS
-                       if (i8elevated)
-                               xfs_dir2_sf_toino4(args);
-#endif
-                       return XFS_ERROR(ENOENT);
-               }
-       }
-#if XFS_BIG_INUMS
-       /*
-        * See if the old number was large and the new number is small.
-        */
-       if (ino > XFS_DIR2_MAX_SHORT_INUM &&
-           args->inumber <= XFS_DIR2_MAX_SHORT_INUM) {
-               /*
-                * And the old count was one, so we need to convert to small.
-                */
-               if (sfp->i8count == 1)
-                       xfs_dir2_sf_toino4(args);
-               else
-                       sfp->i8count--;
-       }
-       /*
-        * See if the old number was small and the new number is large.
-        */
-       if (ino <= XFS_DIR2_MAX_SHORT_INUM &&
-           args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
-               /*
-                * add to the i8count unless we just converted to 8-byte
-                * inodes (which does an implied i8count = 1)
-                */
-               ASSERT(sfp->i8count != 0);
-               if (!i8elevated)
-                       sfp->i8count++;
-       }
-#endif
-       xfs_dir2_sf_check(args);
-       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
-       return 0;
-}
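Most of the function above is i8count bookkeeping: the count of entries needing 8-byte inode numbers drives conversion between the two widths when it reaches or leaves zero. A reduced model; the limit and conversion helpers are hypothetical stand-ins:

#define MAX4 0xffffffffULL			/* assumed 4-byte inumber limit */

static void narrow_to_4byte(void) { /* like sf_toino4: shrink every entry */ }
static void widen_to_8byte(void)  { /* like sf_toino8: grow every entry */ }

static void replace_ino(unsigned long long oldv, unsigned long long newv,
			int *i8count)
{
	if (oldv > MAX4 && newv <= MAX4) {
		if (--*i8count == 0)
			narrow_to_4byte();	/* the last big number is gone */
	} else if (oldv <= MAX4 && newv > MAX4) {
		if ((*i8count)++ == 0)
			widen_to_8byte();	/* the first big number arrives */
	}
}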
-
-#if XFS_BIG_INUMS
-/*
- * Convert from 8-byte inode numbers to 4-byte inode numbers.
- * The last 8-byte inode number is gone, but the count is still 1.
- */
-static void
-xfs_dir2_sf_toino4(
-       xfs_da_args_t           *args)          /* operation arguments */
-{
-       char                    *buf;           /* old dir's buffer */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     i;              /* entry index */
-       int                     newsize;        /* new inode size */
-       xfs_dir2_sf_entry_t     *oldsfep;       /* old sf entry */
-       xfs_dir2_sf_hdr_t       *oldsfp;        /* old sf directory */
-       int                     oldsize;        /* old inode size */
-       xfs_dir2_sf_entry_t     *sfep;          /* new sf entry */
-       xfs_dir2_sf_hdr_t       *sfp;           /* new sf directory */
-       struct xfs_mount        *mp;
-
-       trace_xfs_dir2_sf_toino4(args);
-
-       dp = args->dp;
-       mp = dp->i_mount;
-
-       /*
-        * Copy the old directory to the buffer.
-        * Then nuke it from the inode, and add the new buffer to the inode.
-        * Don't want xfs_idata_realloc copying the data here.
-        */
-       oldsize = dp->i_df.if_bytes;
-       buf = kmem_alloc(oldsize, KM_SLEEP);
-       oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       ASSERT(oldsfp->i8count == 1);
-       memcpy(buf, oldsfp, oldsize);
-       /*
-        * Compute the new inode size.
-        */
-       newsize =
-               oldsize -
-               (oldsfp->count + 1) *
-               ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
-       xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
-       xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
-       /*
-        * Reset our pointers, the data has moved.
-        */
-       oldsfp = (xfs_dir2_sf_hdr_t *)buf;
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       /*
-        * Fill in the new header.
-        */
-       sfp->count = oldsfp->count;
-       sfp->i8count = 0;
-       dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
-       /*
-        * Copy the entries field by field.
-        */
-       for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
-                   oldsfep = xfs_dir2_sf_firstentry(oldsfp);
-            i < sfp->count;
-            i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
-                 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
-               sfep->namelen = oldsfep->namelen;
-               sfep->offset = oldsfep->offset;
-               memcpy(sfep->name, oldsfep->name, sfep->namelen);
-               dp->d_ops->sf_put_ino(sfp, sfep,
-                                     dp->d_ops->sf_get_ino(oldsfp, oldsfep));
-               dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
-       }
-       /*
-        * Clean up the inode.
-        */
-       kmem_free(buf);
-       dp->i_d.di_size = newsize;
-       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-}
-
-/*
- * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers.
- * The new entry with an 8-byte inode number is not there yet; we leave with
- * i8count set to 1, but no corresponding 8-byte entry.
- */
-static void
-xfs_dir2_sf_toino8(
-       xfs_da_args_t           *args)          /* operation arguments */
-{
-       char                    *buf;           /* old dir's buffer */
-       xfs_inode_t             *dp;            /* incore directory inode */
-       int                     i;              /* entry index */
-       int                     newsize;        /* new inode size */
-       xfs_dir2_sf_entry_t     *oldsfep;       /* old sf entry */
-       xfs_dir2_sf_hdr_t       *oldsfp;        /* old sf directory */
-       int                     oldsize;        /* old inode size */
-       xfs_dir2_sf_entry_t     *sfep;          /* new sf entry */
-       xfs_dir2_sf_hdr_t       *sfp;           /* new sf directory */
-       struct xfs_mount        *mp;
-
-       trace_xfs_dir2_sf_toino8(args);
-
-       dp = args->dp;
-       mp = dp->i_mount;
-
-       /*
-        * Copy the old directory to the buffer.
-        * Then nuke it from the inode, and add the new buffer to the inode.
-        * Don't want xfs_idata_realloc copying the data here.
-        */
-       oldsize = dp->i_df.if_bytes;
-       buf = kmem_alloc(oldsize, KM_SLEEP);
-       oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       ASSERT(oldsfp->i8count == 0);
-       memcpy(buf, oldsfp, oldsize);
-       /*
-        * Compute the new inode size (nb: entry count + 1 for parent)
-        */
-       newsize =
-               oldsize +
-               (oldsfp->count + 1) *
-               ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
-       xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
-       xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
-       /*
-        * Reset our pointers, the data has moved.
-        */
-       oldsfp = (xfs_dir2_sf_hdr_t *)buf;
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       /*
-        * Fill in the new header.
-        */
-       sfp->count = oldsfp->count;
-       sfp->i8count = 1;
-       dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
-       /*
-        * Copy the entries field by field.
-        */
-       for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
-                   oldsfep = xfs_dir2_sf_firstentry(oldsfp);
-            i < sfp->count;
-            i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
-                 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
-               sfep->namelen = oldsfep->namelen;
-               sfep->offset = oldsfep->offset;
-               memcpy(sfep->name, oldsfep->name, sfep->namelen);
-               dp->d_ops->sf_put_ino(sfp, sfep,
-                                     dp->d_ops->sf_get_ino(oldsfp, oldsfep));
-               dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
-       }
-       /*
-        * Clean up the inode.
-        */
-       kmem_free(buf);
-       dp->i_d.di_size = newsize;
-       xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-}
-#endif /* XFS_BIG_INUMS */
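Both converters above use the same scratch-buffer approach: snapshot the old image, resize the inode data (which destroys the old image), then re-emit every entry at the new width. A standalone model widening an array of 32-bit values:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/*
 * Widen n 32-bit values to 64 bits via a snapshot, so the original
 * storage can be freed before the new storage exists, as in toino8.
 */
static uint64_t *widen(uint32_t *vals, size_t n)
{
	uint32_t *snap = malloc(n * sizeof(*snap));
	uint64_t *out = NULL;
	size_t i;

	if (!snap)
		return NULL;
	memcpy(snap, vals, n * sizeof(*snap));
	free(vals);			/* old image gone before the new exists */

	out = malloc(n * sizeof(*out));
	if (out)
		for (i = 0; i < n; i++)
			out[i] = snap[i];	/* re-emit entry by entry */
	free(snap);
	return out;
}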
index 4f11ef0111395bbb7775407e9039e92cad0a932d..13d08a1b390e6a756c34927af4797576a809ad77 100644 (file)
@@ -124,7 +124,7 @@ xfs_trim_extents(
                }
 
                trace_xfs_discard_extent(mp, agno, fbno, flen);
-               error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
+               error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
                if (error)
                        goto out_del_cursor;
                *blocks_trimmed += flen;
@@ -166,11 +166,11 @@ xfs_ioc_trim(
        int                     error, last_error = 0;
 
        if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
+               return -EPERM;
        if (!blk_queue_discard(q))
-               return -XFS_ERROR(EOPNOTSUPP);
+               return -EOPNOTSUPP;
        if (copy_from_user(&range, urange, sizeof(range)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
 
        /*
         * Truncating down the len isn't actually quite correct, but using
@@ -182,7 +182,7 @@ xfs_ioc_trim(
        if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
            range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
            range.len < mp->m_sb.sb_blocksize)
-               return -XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        start = BTOBB(range.start);
        end = start + BTOBBT(range.len) - 1;
@@ -195,7 +195,7 @@ xfs_ioc_trim(
        end_agno = xfs_daddr_to_agno(mp, end);
 
        for (agno = start_agno; agno <= end_agno; agno++) {
-               error = -xfs_trim_extents(mp, agno, start, end, minlen,
+               error = xfs_trim_extents(mp, agno, start, end, minlen,
                                          &blocks_trimmed);
                if (error)
                        last_error = error;
@@ -206,7 +206,7 @@ xfs_ioc_trim(
 
        range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
        if (copy_to_user(urange, &range, sizeof(range)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        return 0;
 }
 
@@ -222,11 +222,11 @@ xfs_discard_extents(
                trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
                                         busyp->length);
 
-               error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+               error = blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
                                XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
                                XFS_FSB_TO_BB(mp, busyp->length),
                                GFP_NOFS, 0);
-               if (error && error != EOPNOTSUPP) {
+               if (error && error != -EOPNOTSUPP) {
                        xfs_info(mp,
         "discard failed for extent [0x%llu,%u], error %d",
                                 (unsigned long long)busyp->bno,
index 3ee0cd43edc00a00889c94fd08b87b0087ab7568..63c2de49f61de1782e066156dbec263addcfbbec 100644 (file)
@@ -327,7 +327,7 @@ xfs_qm_dqalloc(
         */
        if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
                xfs_iunlock(quotip, XFS_ILOCK_EXCL);
-               return (ESRCH);
+               return -ESRCH;
        }
 
        xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
@@ -354,7 +354,7 @@ xfs_qm_dqalloc(
                               mp->m_quotainfo->qi_dqchunklen,
                               0);
        if (!bp) {
-               error = ENOMEM;
+               error = -ENOMEM;
                goto error1;
        }
        bp->b_ops = &xfs_dquot_buf_ops;
@@ -400,7 +400,7 @@ xfs_qm_dqalloc(
       error0:
        xfs_iunlock(quotip, XFS_ILOCK_EXCL);
 
-       return (error);
+       return error;
 }
 
 STATIC int
@@ -426,7 +426,7 @@ xfs_qm_dqrepair(
 
        if (error) {
                ASSERT(*bpp == NULL);
-               return XFS_ERROR(error);
+               return error;
        }
        (*bpp)->b_ops = &xfs_dquot_buf_ops;
 
@@ -442,7 +442,7 @@ xfs_qm_dqrepair(
                if (error) {
                        /* repair failed, we're screwed */
                        xfs_trans_brelse(tp, *bpp);
-                       return XFS_ERROR(EIO);
+                       return -EIO;
                }
        }
 
@@ -480,7 +480,7 @@ xfs_qm_dqtobp(
                 * didn't have the quota inode lock.
                 */
                xfs_iunlock(quotip, lock_mode);
-               return ESRCH;
+               return -ESRCH;
        }
 
        /*
@@ -508,7 +508,7 @@ xfs_qm_dqtobp(
                 * We don't allocate unless we're asked to
                 */
                if (!(flags & XFS_QMOPT_DQALLOC))
-                       return ENOENT;
+                       return -ENOENT;
 
                ASSERT(tp);
                error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
@@ -530,7 +530,7 @@ xfs_qm_dqtobp(
                                           mp->m_quotainfo->qi_dqchunklen,
                                           0, &bp, &xfs_dquot_buf_ops);
 
-               if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
+               if (error == -EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
                        xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
                                                mp->m_quotainfo->qi_dqperchunk;
                        ASSERT(bp == NULL);
@@ -539,7 +539,7 @@ xfs_qm_dqtobp(
 
                if (error) {
                        ASSERT(bp == NULL);
-                       return XFS_ERROR(error);
+                       return error;
                }
        }
 
@@ -547,7 +547,7 @@ xfs_qm_dqtobp(
        *O_bpp = bp;
        *O_ddpp = bp->b_addr + dqp->q_bufoffset;
 
-       return (0);
+       return 0;
 }
 
 
@@ -715,7 +715,7 @@ xfs_qm_dqget(
        if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
            (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
            (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
-               return (ESRCH);
+               return -ESRCH;
        }
 
 #ifdef DEBUG
@@ -723,7 +723,7 @@ xfs_qm_dqget(
                if ((xfs_dqerror_target == mp->m_ddev_targp) &&
                    (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
                        xfs_debug(mp, "Returning error in dqget");
-                       return (EIO);
+                       return -EIO;
                }
        }
 
@@ -796,14 +796,14 @@ restart:
                } else {
                        /* inode stays locked on return */
                        xfs_qm_dqdestroy(dqp);
-                       return XFS_ERROR(ESRCH);
+                       return -ESRCH;
                }
        }
 
        mutex_lock(&qi->qi_tree_lock);
-       error = -radix_tree_insert(tree, id, dqp);
+       error = radix_tree_insert(tree, id, dqp);
        if (unlikely(error)) {
-               WARN_ON(error != EEXIST);
+               WARN_ON(error != -EEXIST);
 
                /*
                 * Duplicate found. Just throw away the new dquot and start
@@ -829,7 +829,7 @@ restart:
        ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
        trace_xfs_dqget_miss(dqp);
        *O_dqpp = dqp;
-       return (0);
+       return 0;
 }
 
 /*
@@ -966,7 +966,7 @@ xfs_qm_dqflush(
                                             SHUTDOWN_CORRUPT_INCORE);
                else
                        spin_unlock(&mp->m_ail->xa_lock);
-               error = XFS_ERROR(EIO);
+               error = -EIO;
                goto out_unlock;
        }
 
@@ -974,7 +974,8 @@ xfs_qm_dqflush(
         * Get the buffer containing the on-disk dquot
         */
        error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
-                                  mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
+                                  mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+                                  &xfs_dquot_buf_ops);
        if (error)
                goto out_unlock;
 
@@ -992,7 +993,7 @@ xfs_qm_dqflush(
                xfs_buf_relse(bp);
                xfs_dqfunlock(dqp);
                xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-               return XFS_ERROR(EIO);
+               return -EIO;
        }
 
        /* This is the only portion of data that needs to persist */
@@ -1045,7 +1046,7 @@ xfs_qm_dqflush(
 
 out_unlock:
        xfs_dqfunlock(dqp);
-       return XFS_ERROR(EIO);
+       return -EIO;
 }
 
 /*
index 68a68f7048374a21b186077381e055981e3967c9..c24c67e22a2a53b18bf11b00b503381fd45998f7 100644 (file)
@@ -139,6 +139,21 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
        }
 }
 
+/*
+ * Check whether a dquot is under low free space conditions. We assume the quota
+ * is enabled and enforced.
+ */
+static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp)
+{
+       int64_t freesp;
+
+       freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount;
+       if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT])
+               return true;
+
+       return false;
+}
+
 #define XFS_DQ_IS_LOCKED(dqp)  (mutex_is_locked(&((dqp)->q_qlock)))
 #define XFS_DQ_IS_DIRTY(dqp)   ((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)      ((dqp)->dq_flags & XFS_DQ_USER)
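For context on where xfs_dquot_lowsp()'s thresholds come from: a sketch of how the q_low_space array could be populated as fixed percentages of the block hard limit. The helper name and the 1/3/5 percent split are assumptions for illustration, not taken from this patch (only XFS_QLOWSP_1_PCNT appears above).

static void dquot_set_low_space(struct xfs_dquot *dqp)
{
	uint64_t space = be64_to_cpu(dqp->q_core.d_blk_hardlimit);

	do_div(space, 100);			/* one percent of the hard limit */
	dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space;
	dqp->q_low_space[XFS_QLOWSP_3_PCNT] = 3 * space;	/* assumed index */
	dqp->q_low_space[XFS_QLOWSP_5_PCNT] = 5 * space;	/* assumed index */
}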
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c
deleted file mode 100644 (file)
index c2ac0c6..0000000
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_quota.h"
-#include "xfs_trans.h"
-#include "xfs_qm.h"
-#include "xfs_error.h"
-#include "xfs_cksum.h"
-#include "xfs_trace.h"
-
-int
-xfs_calc_dquots_per_chunk(
-       unsigned int            nbblks) /* basic block units */
-{
-       unsigned int    ndquots;
-
-       ASSERT(nbblks > 0);
-       ndquots = BBTOB(nbblks);
-       do_div(ndquots, sizeof(xfs_dqblk_t));
-
-       return ndquots;
-}
-
-/*
- * Do some primitive error checking on ondisk dquot data structures.
- */
-int
-xfs_dqcheck(
-       struct xfs_mount *mp,
-       xfs_disk_dquot_t *ddq,
-       xfs_dqid_t       id,
-       uint             type,    /* used only when IO_dorepair is true */
-       uint             flags,
-       char             *str)
-{
-       xfs_dqblk_t      *d = (xfs_dqblk_t *)ddq;
-       int             errs = 0;
-
-       /*
-        * We can encounter an uninitialized dquot buffer for 2 reasons:
-        * 1. If we crash while deleting the quotainode(s), and those blks got
-        *    used for user data. This is because we take the path of regular
-        *    file deletion; however, the size field of quotainodes is never
-        *    updated, so all the tricks that we play in itruncate_finish
-        *    don't quite matter.
-        *
-        * 2. We don't replay the quota buffers when there's a quotaoff logitem.
-        *    But the allocation will be replayed so we'll end up with an
-        *    uninitialized quota block.
-        *
-        * This is all fine; things are still consistent, and we haven't lost
-        * any quota information. Just don't complain about bad dquot blks.
-        */
-       if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
-               if (flags & XFS_QMOPT_DOWARN)
-                       xfs_alert(mp,
-                       "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
-                       str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
-               errs++;
-       }
-       if (ddq->d_version != XFS_DQUOT_VERSION) {
-               if (flags & XFS_QMOPT_DOWARN)
-                       xfs_alert(mp,
-                       "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
-                       str, id, ddq->d_version, XFS_DQUOT_VERSION);
-               errs++;
-       }
-
-       if (ddq->d_flags != XFS_DQ_USER &&
-           ddq->d_flags != XFS_DQ_PROJ &&
-           ddq->d_flags != XFS_DQ_GROUP) {
-               if (flags & XFS_QMOPT_DOWARN)
-                       xfs_alert(mp,
-                       "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
-                       str, id, ddq->d_flags);
-               errs++;
-       }
-
-       if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
-               if (flags & XFS_QMOPT_DOWARN)
-                       xfs_alert(mp,
-                       "%s : ondisk-dquot 0x%p, ID mismatch: "
-                       "0x%x expected, found id 0x%x",
-                       str, ddq, id, be32_to_cpu(ddq->d_id));
-               errs++;
-       }
-
-       if (!errs && ddq->d_id) {
-               if (ddq->d_blk_softlimit &&
-                   be64_to_cpu(ddq->d_bcount) >
-                               be64_to_cpu(ddq->d_blk_softlimit)) {
-                       if (!ddq->d_btimer) {
-                               if (flags & XFS_QMOPT_DOWARN)
-                                       xfs_alert(mp,
-                       "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
-                                       str, (int)be32_to_cpu(ddq->d_id), ddq);
-                               errs++;
-                       }
-               }
-               if (ddq->d_ino_softlimit &&
-                   be64_to_cpu(ddq->d_icount) >
-                               be64_to_cpu(ddq->d_ino_softlimit)) {
-                       if (!ddq->d_itimer) {
-                               if (flags & XFS_QMOPT_DOWARN)
-                                       xfs_alert(mp,
-                       "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
-                                       str, (int)be32_to_cpu(ddq->d_id), ddq);
-                               errs++;
-                       }
-               }
-               if (ddq->d_rtb_softlimit &&
-                   be64_to_cpu(ddq->d_rtbcount) >
-                               be64_to_cpu(ddq->d_rtb_softlimit)) {
-                       if (!ddq->d_rtbtimer) {
-                               if (flags & XFS_QMOPT_DOWARN)
-                                       xfs_alert(mp,
-                       "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
-                                       str, (int)be32_to_cpu(ddq->d_id), ddq);
-                               errs++;
-                       }
-               }
-       }
-
-       if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
-               return errs;
-
-       if (flags & XFS_QMOPT_DOWARN)
-               xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
-
-       /*
-        * Typically, a repair is only requested by quotacheck.
-        */
-       ASSERT(id != -1);
-       ASSERT(flags & XFS_QMOPT_DQREPAIR);
-       memset(d, 0, sizeof(xfs_dqblk_t));
-
-       d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
-       d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
-       d->dd_diskdq.d_flags = type;
-       d->dd_diskdq.d_id = cpu_to_be32(id);
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
-               xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
-                                XFS_DQUOT_CRC_OFF);
-       }
-
-       return errs;
-}
-
-STATIC bool
-xfs_dquot_buf_verify_crc(
-       struct xfs_mount        *mp,
-       struct xfs_buf          *bp)
-{
-       struct xfs_dqblk        *d = (struct xfs_dqblk *)bp->b_addr;
-       int                     ndquots;
-       int                     i;
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return true;
-
-       /*
-        * if we are in log recovery, the quota subsystem has not been
-        * initialised so we have no quotainfo structure. In that case, we need
-        * to manually calculate the number of dquots in the buffer.
-        */
-       if (mp->m_quotainfo)
-               ndquots = mp->m_quotainfo->qi_dqperchunk;
-       else
-               ndquots = xfs_calc_dquots_per_chunk(
-                                       XFS_BB_TO_FSB(mp, bp->b_length));
-
-       for (i = 0; i < ndquots; i++, d++) {
-               if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
-                                XFS_DQUOT_CRC_OFF))
-                       return false;
-               if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
-                       return false;
-       }
-       return true;
-}
-
-STATIC bool
-xfs_dquot_buf_verify(
-       struct xfs_mount        *mp,
-       struct xfs_buf          *bp)
-{
-       struct xfs_dqblk        *d = (struct xfs_dqblk *)bp->b_addr;
-       xfs_dqid_t              id = 0;
-       int                     ndquots;
-       int                     i;
-
-       /*
-        * if we are in log recovery, the quota subsystem has not been
-        * initialised so we have no quotainfo structure. In that case, we need
-        * to manually calculate the number of dquots in the buffer.
-        */
-       if (mp->m_quotainfo)
-               ndquots = mp->m_quotainfo->qi_dqperchunk;
-       else
-               ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
-
-       /*
-        * On the first read of the buffer, verify that each dquot is valid.
-        * We don't know what the id of each dquot is supposed to be, just
-        * that the ids should increase monotonically within the buffer. If
-        * the first id is corrupt, verification will fail on the second
-        * dquot in the buffer, so corruption could be reported against the
-        * wrong dquot in this case.
-        */
-       for (i = 0; i < ndquots; i++) {
-               struct xfs_disk_dquot   *ddq;
-               int                     error;
-
-               ddq = &d[i].dd_diskdq;
-
-               if (i == 0)
-                       id = be32_to_cpu(ddq->d_id);
-
-               error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
-                                      "xfs_dquot_buf_verify");
-               if (error)
-                       return false;
-       }
-       return true;
-}
-
-static void
-xfs_dquot_buf_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-
-       if (!xfs_dquot_buf_verify_crc(mp, bp))
-               xfs_buf_ioerror(bp, EFSBADCRC);
-       else if (!xfs_dquot_buf_verify(mp, bp))
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-       if (bp->b_error)
-               xfs_verifier_error(bp);
-}
-
-/*
- * we don't calculate the CRC here as that is done when the dquot is flushed to
- * the buffer after the update is done. This ensures that the dquot in the
- * buffer always has an up-to-date CRC value.
- */
-static void
-xfs_dquot_buf_write_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-
-       if (!xfs_dquot_buf_verify(mp, bp)) {
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               xfs_verifier_error(bp);
-               return;
-       }
-}
-
-const struct xfs_buf_ops xfs_dquot_buf_ops = {
-       .verify_read = xfs_dquot_buf_read_verify,
-       .verify_write = xfs_dquot_buf_write_verify,
-};
-
index edac5b057d28790b5f0c6f767b12b1360a1488b7..b92fd7bc49e358159eff5ab24db6fd6a78be2ced 100644 (file)
 
 #ifdef DEBUG
 
-int    xfs_etrap[XFS_ERROR_NTRAP] = {
-       0,
-};
-
-int
-xfs_error_trap(int e)
-{
-       int i;
-
-       if (!e)
-               return 0;
-       for (i = 0; i < XFS_ERROR_NTRAP; i++) {
-               if (xfs_etrap[i] == 0)
-                       break;
-               if (e != xfs_etrap[i])
-                       continue;
-               xfs_notice(NULL, "%s: error %d", __func__, e);
-               BUG();
-               break;
-       }
-       return e;
-}
-
 int    xfs_etest[XFS_NUM_INJECT_ERROR];
 int64_t        xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
 char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
@@ -190,7 +167,7 @@ xfs_verifier_error(
        struct xfs_mount *mp = bp->b_target->bt_mount;
 
        xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
-                 bp->b_error == EFSBADCRC ? "CRC error" : "corruption",
+                 bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
                  __return_address, bp->b_bn);
 
        xfs_alert(mp, "Unmount and run xfs_repair");
index c1c57d4a4b5db6f37a626d7e78a48f32facf8dcd..279a76e52791fcee2bb8d40b4724231dc35d92bb 100644 (file)
 #ifndef        __XFS_ERROR_H__
 #define        __XFS_ERROR_H__
 
-#ifdef DEBUG
-#define        XFS_ERROR_NTRAP 10
-extern int     xfs_etrap[XFS_ERROR_NTRAP];
-extern int     xfs_error_trap(int);
-#define        XFS_ERROR(e)    xfs_error_trap(e)
-#else
-#define        XFS_ERROR(e)    (e)
-#endif
-
 struct xfs_mount;
 
 extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
@@ -56,7 +47,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
                if (unlikely(!fs_is_ok)) { \
                        XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
                                         XFS_ERRLEVEL_LOW, NULL); \
-                       error = XFS_ERROR(EFSCORRUPTED); \
+                       error = -EFSCORRUPTED; \
                        goto l; \
                } \
        }
@@ -68,7 +59,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
                if (unlikely(!fs_is_ok)) { \
                        XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
                                         XFS_ERRLEVEL_LOW, NULL); \
-                       return XFS_ERROR(EFSCORRUPTED); \
+                       return -EFSCORRUPTED; \
                } \
        }
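Every conversion in this series follows the same rule: internal XFS functions now return negative errnos like the rest of the kernel, so the XFS_ERROR() trap wrapper and the sign flips at the VFS boundary both disappear. A minimal sketch of the before/after shape; the function names here are illustrative only:

static int xfs_do_thing(struct xfs_mount *mp)
{
	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;		/* was: return XFS_ERROR(EIO); */
	return 0;
}

/* VFS entry points no longer negate on the way out */
static int xfs_vfs_op(struct xfs_mount *mp)
{
	return xfs_do_thing(mp);	/* was: return -xfs_do_thing(mp); */
}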
 
index 753e467aa1a5991d0175087284ff9cde35591c40..5a6bd5d8779ab0029818c5880b15c1b260fb37fb 100644 (file)
@@ -147,9 +147,9 @@ xfs_nfs_get_inode(
                 * We don't use ESTALE directly down the chain so as not to
                 * confuse applications using bulkstat that expect EINVAL.
                 */
-               if (error == EINVAL || error == ENOENT)
-                       error = ESTALE;
-               return ERR_PTR(-error);
+               if (error == -EINVAL || error == -ENOENT)
+                       error = -ESTALE;
+               return ERR_PTR(error);
        }
 
        if (ip->i_d.di_gen != generation) {
@@ -217,7 +217,7 @@ xfs_fs_get_parent(
 
        error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL);
        if (unlikely(error))
-               return ERR_PTR(-error);
+               return ERR_PTR(error);
 
        return d_obtain_alias(VFS_I(cip));
 }
@@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata(
 
        if (!lsn)
                return 0;
-       return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+       return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
 }
 
 const struct export_operations xfs_export_operations = {
index fb7a4c1ce1c537255294d66e79639699658544b1..c4327419dc5cbc5f3b0e33f00ae6a01afd24e300 100644 (file)
@@ -298,7 +298,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
                }
                return 0;
        }
-       return EFSCORRUPTED;
+       return -EFSCORRUPTED;
 }
 
 /*
index 1f66779d7a46628cf3a068dd5c08b36368fb6545..076b1708d1345474ec2fd4e3a4c2e3bc605cb75a 100644 (file)
@@ -38,6 +38,7 @@
 #include "xfs_trace.h"
 #include "xfs_log.h"
 #include "xfs_dinode.h"
+#include "xfs_icache.h"
 
 #include <linux/aio.h>
 #include <linux/dcache.h>
@@ -155,7 +156,7 @@ xfs_dir_fsync(
 
        if (!lsn)
                return 0;
-       return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+       return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
 }
 
 STATIC int
@@ -179,7 +180,7 @@ xfs_file_fsync(
                return error;
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return -XFS_ERROR(EIO);
+               return -EIO;
 
        xfs_iflags_clear(ip, XFS_ITRUNCATED);
 
@@ -225,7 +226,7 @@ xfs_file_fsync(
            !log_flushed)
                xfs_blkdev_issue_flush(mp->m_ddev_targp);
 
-       return -error;
+       return error;
 }
 
 STATIC ssize_t
@@ -246,11 +247,11 @@ xfs_file_read_iter(
        XFS_STATS_INC(xs_read_calls);
 
        if (unlikely(file->f_flags & O_DIRECT))
-               ioflags |= IO_ISDIRECT;
+               ioflags |= XFS_IO_ISDIRECT;
        if (file->f_mode & FMODE_NOCMTIME)
-               ioflags |= IO_INVIS;
+               ioflags |= XFS_IO_INVIS;
 
-       if (unlikely(ioflags & IO_ISDIRECT)) {
+       if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
                xfs_buftarg_t   *target =
                        XFS_IS_REALTIME_INODE(ip) ?
                                mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -258,7 +259,7 @@ xfs_file_read_iter(
                if ((pos | size) & target->bt_logical_sectormask) {
                        if (pos == i_size_read(inode))
                                return 0;
-                       return -XFS_ERROR(EINVAL);
+                       return -EINVAL;
                }
        }
 
@@ -283,7 +284,7 @@ xfs_file_read_iter(
         * proceed concurrently without serialisation.
         */
        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-       if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
+       if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
                xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
                xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
@@ -325,7 +326,7 @@ xfs_file_splice_read(
        XFS_STATS_INC(xs_read_calls);
 
        if (infilp->f_mode & FMODE_NOCMTIME)
-               ioflags |= IO_INVIS;
+               ioflags |= XFS_IO_INVIS;
 
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
@@ -524,7 +525,7 @@ restart:
                        xfs_rw_ilock(ip, *iolock);
                        goto restart;
                }
-               error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
+               error = xfs_zero_eof(ip, *pos, i_size_read(inode));
                if (error)
                        return error;
        }
@@ -594,7 +595,7 @@ xfs_file_dio_aio_write(
 
        /* DIO must be aligned to device logical sector size */
        if ((pos | count) & target->bt_logical_sectormask)
-               return -XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        /* "unaligned" here means not aligned to a filesystem block */
        if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
@@ -689,14 +690,28 @@ write_retry:
        ret = generic_perform_write(file, from, pos);
        if (likely(ret >= 0))
                iocb->ki_pos = pos + ret;
+
        /*
-        * If we just got an ENOSPC, try to write back all dirty inodes to
-        * convert delalloc space to free up some of the excess reserved
-        * metadata space.
+        * If we hit a space limit, try to free up some lingering preallocated
+        * space before returning an error. In the case of ENOSPC, first try to
+        * write back all dirty inodes to free up some of the excess reserved
+        * metadata space. This reduces the chances that the eofblocks scan
+        * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
+        * also behaves as a filter to prevent too many eofblocks scans from
+        * running at the same time.
         */
-       if (ret == -ENOSPC && !enospc) {
+       if (ret == -EDQUOT && !enospc) {
+               enospc = xfs_inode_free_quota_eofblocks(ip);
+               if (enospc)
+                       goto write_retry;
+       } else if (ret == -ENOSPC && !enospc) {
+               struct xfs_eofblocks eofb = {0};
+
                enospc = 1;
                xfs_flush_inodes(ip->i_mount);
+               eofb.eof_scan_owner = ip->i_ino; /* for locking */
+               eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
+               xfs_icache_free_eofblocks(ip->i_mount, &eofb);
                goto write_retry;
        }
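The retry logic added above runs at most one reclaim pass per class of space error, with enospc doubling as a been-here flag. Stripped to its shape, under assumed names, the pattern looks like this:

static ssize_t write_with_reclaim(struct file *file, struct iov_iter *from,
				  loff_t pos)
{
	int retried = 0;
	ssize_t ret;

retry:
	ret = generic_perform_write(file, from, pos);
	if ((ret == -EDQUOT || ret == -ENOSPC) && !retried) {
		retried = 1;
		/* free speculative preallocations / flush dirty inodes here */
		goto retry;
	}
	return ret;
}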
 
@@ -772,7 +787,7 @@ xfs_file_fallocate(
                unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
 
                if (offset & blksize_mask || len & blksize_mask) {
-                       error = EINVAL;
+                       error = -EINVAL;
                        goto out_unlock;
                }
 
@@ -781,7 +796,7 @@ xfs_file_fallocate(
                 * in which case it is effectively a truncate operation
                 */
                if (offset + len >= i_size_read(inode)) {
-                       error = EINVAL;
+                       error = -EINVAL;
                        goto out_unlock;
                }
 
@@ -794,7 +809,7 @@ xfs_file_fallocate(
                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
                    offset + len > i_size_read(inode)) {
                        new_size = offset + len;
-                       error = -inode_newsize_ok(inode, new_size);
+                       error = inode_newsize_ok(inode, new_size);
                        if (error)
                                goto out_unlock;
                }
@@ -844,7 +859,7 @@ xfs_file_fallocate(
 
 out_unlock:
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-       return -error;
+       return error;
 }
 
 
@@ -889,7 +904,7 @@ xfs_file_release(
        struct inode    *inode,
        struct file     *filp)
 {
-       return -xfs_release(XFS_I(inode));
+       return xfs_release(XFS_I(inode));
 }
 
 STATIC int
@@ -918,7 +933,7 @@ xfs_file_readdir(
 
        error = xfs_readdir(ip, ctx, bufsize);
        if (error)
-               return -error;
+               return error;
        return 0;
 }
 
@@ -1184,7 +1199,7 @@ xfs_seek_data(
 
        isize = i_size_read(inode);
        if (start >= isize) {
-               error = ENXIO;
+               error = -ENXIO;
                goto out_unlock;
        }
 
@@ -1206,7 +1221,7 @@ xfs_seek_data(
 
                /* No extents at given offset, must be beyond EOF */
                if (nmap == 0) {
-                       error = ENXIO;
+                       error = -ENXIO;
                        goto out_unlock;
                }
 
@@ -1237,7 +1252,7 @@ xfs_seek_data(
                 * we are reading after EOF if nothing in map[1].
                 */
                if (nmap == 1) {
-                       error = ENXIO;
+                       error = -ENXIO;
                        goto out_unlock;
                }
 
@@ -1250,7 +1265,7 @@ xfs_seek_data(
                fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
                start = XFS_FSB_TO_B(mp, fsbno);
                if (start >= isize) {
-                       error = ENXIO;
+                       error = -ENXIO;
                        goto out_unlock;
                }
        }
@@ -1262,7 +1277,7 @@ out_unlock:
        xfs_iunlock(ip, lock);
 
        if (error)
-               return -error;
+               return error;
        return offset;
 }
 
@@ -1282,13 +1297,13 @@ xfs_seek_hole(
        int                     error;
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return -XFS_ERROR(EIO);
+               return -EIO;
 
        lock = xfs_ilock_data_map_shared(ip);
 
        isize = i_size_read(inode);
        if (start >= isize) {
-               error = ENXIO;
+               error = -ENXIO;
                goto out_unlock;
        }
 
@@ -1307,7 +1322,7 @@ xfs_seek_hole(
 
                /* No extents at given offset, must be beyond EOF */
                if (nmap == 0) {
-                       error = ENXIO;
+                       error = -ENXIO;
                        goto out_unlock;
                }
 
@@ -1370,7 +1385,7 @@ out_unlock:
        xfs_iunlock(ip, lock);
 
        if (error)
-               return -error;
+               return error;
        return offset;
 }
 
index 8ec81bed7992149420efd5781cfb12c9596633f5..e92730c1d3ca6d7ba7f32938154d261285f7d1ef 100644 (file)
@@ -258,7 +258,7 @@ next_ag:
        if (*agp == NULLAGNUMBER)
                return 0;
 
-       err = ENOMEM;
+       err = -ENOMEM;
        item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
        if (!item)
                goto out_put_ag;
@@ -268,7 +268,7 @@ next_ag:
 
        err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru);
        if (err) {
-               if (err == EEXIST)
+               if (err == -EEXIST)
                        err = 0;
                goto out_free_item;
        }
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h
deleted file mode 100644 (file)
index 34d85ac..0000000
+++ /dev/null
@@ -1,428 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_FORMAT_H__
-#define __XFS_FORMAT_H__
-
-/*
- * XFS On Disk Format Definitions
- *
- * This header file defines all the on-disk format definitions for 
- * general XFS objects. Directory and attribute-related objects are defined in
- * xfs_da_format.h, while log and log item formats are defined in
- * xfs_log_format.h. Everything else goes here.
- */
-
-struct xfs_mount;
-struct xfs_trans;
-struct xfs_inode;
-struct xfs_buf;
-struct xfs_ifork;
-
-/*
- * RealTime Device format definitions
- */
-
-/* Min and max rt extent sizes, specified in bytes */
-#define        XFS_MAX_RTEXTSIZE       (1024 * 1024 * 1024)    /* 1GB */
-#define        XFS_DFL_RTEXTSIZE       (64 * 1024)             /* 64kB */
-#define        XFS_MIN_RTEXTSIZE       (4 * 1024)              /* 4kB */
-
-#define        XFS_BLOCKSIZE(mp)       ((mp)->m_sb.sb_blocksize)
-#define        XFS_BLOCKMASK(mp)       ((mp)->m_blockmask)
-#define        XFS_BLOCKWSIZE(mp)      ((mp)->m_blockwsize)
-#define        XFS_BLOCKWMASK(mp)      ((mp)->m_blockwmask)
-
-/*
- * RT Summary and bit manipulation macros.
- */
-#define        XFS_SUMOFFS(mp,ls,bb)   ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
-#define        XFS_SUMOFFSTOBLOCK(mp,s)        \
-       (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
-#define        XFS_SUMPTR(mp,bp,so)    \
-       ((xfs_suminfo_t *)((bp)->b_addr + \
-               (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
-
-#define        XFS_BITTOBLOCK(mp,bi)   ((bi) >> (mp)->m_blkbit_log)
-#define        XFS_BLOCKTOBIT(mp,bb)   ((bb) << (mp)->m_blkbit_log)
-#define        XFS_BITTOWORD(mp,bi)    \
-       ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
-
-#define        XFS_RTMIN(a,b)  ((a) < (b) ? (a) : (b))
-#define        XFS_RTMAX(a,b)  ((a) > (b) ? (a) : (b))
-
-#define        XFS_RTLOBIT(w)  xfs_lowbit32(w)
-#define        XFS_RTHIBIT(w)  xfs_highbit32(w)
-
-#if XFS_BIG_BLKNOS
-#define        XFS_RTBLOCKLOG(b)       xfs_highbit64(b)
-#else
-#define        XFS_RTBLOCKLOG(b)       xfs_highbit32(b)
-#endif
-
-/*
- * Dquot and dquot block format definitions
- */
-#define XFS_DQUOT_MAGIC                0x4451          /* 'DQ' */
-#define XFS_DQUOT_VERSION      (u_int8_t)0x01  /* latest version number */
-
-/*
- * This is the main portion of the on-disk representation of quota
- * information for a user. This is the q_core of the xfs_dquot_t that
- * is kept in kernel memory. We pad this with some more expansion room
- * to construct the on disk structure.
- */
-typedef struct xfs_disk_dquot {
-       __be16          d_magic;        /* dquot magic = XFS_DQUOT_MAGIC */
-       __u8            d_version;      /* dquot version */
-       __u8            d_flags;        /* XFS_DQ_USER/PROJ/GROUP */
-       __be32          d_id;           /* user,project,group id */
-       __be64          d_blk_hardlimit;/* absolute limit on disk blks */
-       __be64          d_blk_softlimit;/* preferred limit on disk blks */
-       __be64          d_ino_hardlimit;/* maximum # allocated inodes */
-       __be64          d_ino_softlimit;/* preferred inode limit */
-       __be64          d_bcount;       /* disk blocks owned by the user */
-       __be64          d_icount;       /* inodes owned by the user */
-       __be32          d_itimer;       /* zero if within inode limits if not,
-                                          this is when we refuse service */
-       __be32          d_btimer;       /* similar to above; for disk blocks */
-       __be16          d_iwarns;       /* warnings issued wrt num inodes */
-       __be16          d_bwarns;       /* warnings issued wrt disk blocks */
-       __be32          d_pad0;         /* 64 bit align */
-       __be64          d_rtb_hardlimit;/* absolute limit on realtime blks */
-       __be64          d_rtb_softlimit;/* preferred limit on RT disk blks */
-       __be64          d_rtbcount;     /* realtime blocks owned */
-       __be32          d_rtbtimer;     /* similar to above; for RT disk blocks */
-       __be16          d_rtbwarns;     /* warnings issued wrt RT disk blocks */
-       __be16          d_pad;
-} xfs_disk_dquot_t;
-
-/*
- * This is what goes on disk. This is separated from the xfs_disk_dquot because
- * carrying the unnecessary padding would be a waste of memory.
- */
-typedef struct xfs_dqblk {
-       xfs_disk_dquot_t  dd_diskdq;    /* portion that lives incore as well */
-       char              dd_fill[4];   /* filling for posterity */
-
-       /*
-        * These two are only present on filesystems with the CRC bits set.
-        */
-       __be32            dd_crc;       /* checksum */
-       __be64            dd_lsn;       /* last modification in log */
-       uuid_t            dd_uuid;      /* location information */
-} xfs_dqblk_t;
-
-#define XFS_DQUOT_CRC_OFF      offsetof(struct xfs_dqblk, dd_crc)
-
-/*
- * Remote symlink format and access functions.
- */
-#define XFS_SYMLINK_MAGIC      0x58534c4d      /* XSLM */
-
-struct xfs_dsymlink_hdr {
-       __be32  sl_magic;
-       __be32  sl_offset;
-       __be32  sl_bytes;
-       __be32  sl_crc;
-       uuid_t  sl_uuid;
-       __be64  sl_owner;
-       __be64  sl_blkno;
-       __be64  sl_lsn;
-};
-
-#define XFS_SYMLINK_CRC_OFF    offsetof(struct xfs_dsymlink_hdr, sl_crc)
-
-/*
- * The maximum pathlen is 1024 bytes. Since the minimum file system
- * blocksize is 512 bytes, we can get a max of 3 extents back from
- * bmapi when crc headers are taken into account.
- */
-#define XFS_SYMLINK_MAPS 3
-
-#define XFS_SYMLINK_BUF_SPACE(mp, bufsize)     \
-       ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
-                       sizeof(struct xfs_dsymlink_hdr) : 0))
-
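Worked example for the XFS_SYMLINK_MAPS limit above: sizeof(struct xfs_dsymlink_hdr) is 56 bytes, so a minimum-sized 512-byte block carries 512 - 56 = 456 payload bytes, and a maximal 1024-byte symlink target therefore needs at most ceil(1024 / 456) = 3 extents.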
-
-/*
- * Allocation Btree format definitions
- *
- * There are two on-disk btrees, one sorted by blockno and one sorted
- * by blockcount and blockno.  All blocks look the same to make the code
- * simpler; if we have time later, we'll make the optimizations.
- */
-#define        XFS_ABTB_MAGIC          0x41425442      /* 'ABTB' for bno tree */
-#define        XFS_ABTB_CRC_MAGIC      0x41423342      /* 'AB3B' */
-#define        XFS_ABTC_MAGIC          0x41425443      /* 'ABTC' for cnt tree */
-#define        XFS_ABTC_CRC_MAGIC      0x41423343      /* 'AB3C' */
-
-/*
- * Data record/key structure
- */
-typedef struct xfs_alloc_rec {
-       __be32          ar_startblock;  /* starting block number */
-       __be32          ar_blockcount;  /* count of free blocks */
-} xfs_alloc_rec_t, xfs_alloc_key_t;
-
-typedef struct xfs_alloc_rec_incore {
-       xfs_agblock_t   ar_startblock;  /* starting block number */
-       xfs_extlen_t    ar_blockcount;  /* count of free blocks */
-} xfs_alloc_rec_incore_t;
-
-/* btree pointer type */
-typedef __be32 xfs_alloc_ptr_t;
-
-/*
- * Block numbers in the AG:
- * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
- */
-#define        XFS_BNO_BLOCK(mp)       ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
-#define        XFS_CNT_BLOCK(mp)       ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
-
-
-/*
- * Inode Allocation Btree format definitions
- *
- * There is a btree for the inode map per allocation group.
- */
-#define        XFS_IBT_MAGIC           0x49414254      /* 'IABT' */
-#define        XFS_IBT_CRC_MAGIC       0x49414233      /* 'IAB3' */
-#define        XFS_FIBT_MAGIC          0x46494254      /* 'FIBT' */
-#define        XFS_FIBT_CRC_MAGIC      0x46494233      /* 'FIB3' */
-
-typedef        __uint64_t      xfs_inofree_t;
-#define        XFS_INODES_PER_CHUNK            (NBBY * sizeof(xfs_inofree_t))
-#define        XFS_INODES_PER_CHUNK_LOG        (XFS_NBBYLOG + 3)
-#define        XFS_INOBT_ALL_FREE              ((xfs_inofree_t)-1)
-#define        XFS_INOBT_MASK(i)               ((xfs_inofree_t)1 << (i))
-
-static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
-{
-       return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
-}
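Worked example for xfs_inobt_maskn() above: xfs_inobt_maskn(3, 2) evaluates to ((1 << 2) - 1) << 3 = 0x18, a mask covering inodes 3 and 4 of the chunk, while n == XFS_INODES_PER_CHUNK wraps the subtraction to produce an all-ones mask.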
-
-/*
- * Data record structure
- */
-typedef struct xfs_inobt_rec {
-       __be32          ir_startino;    /* starting inode number */
-       __be32          ir_freecount;   /* count of free inodes (set bits) */
-       __be64          ir_free;        /* free inode mask */
-} xfs_inobt_rec_t;
-
-typedef struct xfs_inobt_rec_incore {
-       xfs_agino_t     ir_startino;    /* starting inode number */
-       __int32_t       ir_freecount;   /* count of free inodes (set bits) */
-       xfs_inofree_t   ir_free;        /* free inode mask */
-} xfs_inobt_rec_incore_t;
-
-
-/*
- * Key structure
- */
-typedef struct xfs_inobt_key {
-       __be32          ir_startino;    /* starting inode number */
-} xfs_inobt_key_t;
-
-/* btree pointer type */
-typedef __be32 xfs_inobt_ptr_t;
-
-/*
- * block numbers in the AG.
- */
-#define        XFS_IBT_BLOCK(mp)               ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
-#define        XFS_FIBT_BLOCK(mp)              ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
-
-/*
- * The first data block of an AG depends on whether the filesystem was formatted
- * with the finobt feature. If so, account for the finobt reserved root btree
- * block.
- */
-#define XFS_PREALLOC_BLOCKS(mp) \
-       (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \
-        XFS_FIBT_BLOCK(mp) + 1 : \
-        XFS_IBT_BLOCK(mp) + 1)
-
-
-
-/*
- * BMAP Btree format definitions
- *
- * This includes both the root block definition that sits inside an inode fork
- * and the record/pointer formats for the leaf/node in the blocks.
- */
-#define XFS_BMAP_MAGIC         0x424d4150      /* 'BMAP' */
-#define XFS_BMAP_CRC_MAGIC     0x424d4133      /* 'BMA3' */
-
-/*
- * Bmap root header, on-disk form only.
- */
-typedef struct xfs_bmdr_block {
-       __be16          bb_level;       /* 0 is a leaf */
-       __be16          bb_numrecs;     /* current # of data records */
-} xfs_bmdr_block_t;
-
-/*
- * Bmap btree record and extent descriptor.
- *  l0:63 is an extent flag (value 1 indicates non-normal).
- *  l0:9-62 are startoff.
- *  l0:0-8 and l1:21-63 are startblock.
- *  l1:0-20 are blockcount.
- */
-#define BMBT_EXNTFLAG_BITLEN   1
-#define BMBT_STARTOFF_BITLEN   54
-#define BMBT_STARTBLOCK_BITLEN 52
-#define BMBT_BLOCKCOUNT_BITLEN 21
-
-typedef struct xfs_bmbt_rec {
-       __be64                  l0, l1;
-} xfs_bmbt_rec_t;
-
-typedef __uint64_t     xfs_bmbt_rec_base_t;    /* use this for casts */
-typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
-
-typedef struct xfs_bmbt_rec_host {
-       __uint64_t              l0, l1;
-} xfs_bmbt_rec_host_t;
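As a check on the bit layout documented above, a standalone sketch of unpacking a CPU-endian record; this helper is illustrative only and is not one of XFS's own accessors:

#include <stdint.h>

struct bmbt_irec_raw {
	uint64_t startoff;	/* 54 bits */
	uint64_t startblock;	/* 52 bits */
	uint32_t blockcount;	/* 21 bits */
	unsigned extflag;	/*  1 bit  */
};

static void bmbt_unpack(uint64_t l0, uint64_t l1, struct bmbt_irec_raw *r)
{
	r->extflag    = l0 >> 63;
	r->startoff   = (l0 >> 9) & ((1ULL << 54) - 1);
	r->startblock = ((l0 & ((1ULL << 9) - 1)) << 43) | (l1 >> 21);
	r->blockcount = l1 & ((1UL << 21) - 1);
}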
-
-/*
- * Values and macros for delayed-allocation startblock fields.
- */
-#define STARTBLOCKVALBITS      17
-#define STARTBLOCKMASKBITS     (15 + XFS_BIG_BLKNOS * 20)
-#define DSTARTBLOCKMASKBITS    (15 + 20)
-#define STARTBLOCKMASK         \
-       (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
-#define DSTARTBLOCKMASK                \
-       (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
-
-static inline int isnullstartblock(xfs_fsblock_t x)
-{
-       return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
-}
-
-static inline int isnulldstartblock(xfs_dfsbno_t x)
-{
-       return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
-}
-
-static inline xfs_fsblock_t nullstartblock(int k)
-{
-       ASSERT(k < (1 << STARTBLOCKVALBITS));
-       return STARTBLOCKMASK | (k);
-}
-
-static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
-{
-       return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
-}
-
-/*
- * Possible extent formats.
- */
-typedef enum {
-       XFS_EXTFMT_NOSTATE = 0,
-       XFS_EXTFMT_HASSTATE
-} xfs_exntfmt_t;
-
-/*
- * Possible extent states.
- */
-typedef enum {
-       XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
-       XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
-} xfs_exntst_t;
-
-/*
- * Incore version of above.
- */
-typedef struct xfs_bmbt_irec
-{
-       xfs_fileoff_t   br_startoff;    /* starting file offset */
-       xfs_fsblock_t   br_startblock;  /* starting block number */
-       xfs_filblks_t   br_blockcount;  /* number of blocks */
-       xfs_exntst_t    br_state;       /* extent state */
-} xfs_bmbt_irec_t;
-
-/*
- * Key structure for non-leaf levels of the tree.
- */
-typedef struct xfs_bmbt_key {
-       __be64          br_startoff;    /* starting file offset */
-} xfs_bmbt_key_t, xfs_bmdr_key_t;
-
-/* btree pointer type */
-typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
-
-
-/*
- * Generic Btree block format definitions
- *
- * This is a combination of the actual format used on disk for short and long
- * format btrees.  The first three fields are shared by both formats, but the
- * pointers are different and should be used with care.
- *
- * To get the size of the actual short or long form headers please use the size
- * macros below.  Never use sizeof(xfs_btree_block).
- *
- * The blkno, crc, lsn, owner and uuid fields are only available in filesystems
- * with the crc feature bit, and all accesses to them must be conditional on
- * that flag.
- */
-struct xfs_btree_block {
-       __be32          bb_magic;       /* magic number for block type */
-       __be16          bb_level;       /* 0 is a leaf */
-       __be16          bb_numrecs;     /* current # of data records */
-       union {
-               struct {
-                       __be32          bb_leftsib;
-                       __be32          bb_rightsib;
-
-                       __be64          bb_blkno;
-                       __be64          bb_lsn;
-                       uuid_t          bb_uuid;
-                       __be32          bb_owner;
-                       __le32          bb_crc;
-               } s;                    /* short form pointers */
-               struct  {
-                       __be64          bb_leftsib;
-                       __be64          bb_rightsib;
-
-                       __be64          bb_blkno;
-                       __be64          bb_lsn;
-                       uuid_t          bb_uuid;
-                       __be64          bb_owner;
-                       __le32          bb_crc;
-                       __be32          bb_pad; /* padding for alignment */
-               } l;                    /* long form pointers */
-       } bb_u;                         /* rest */
-};
-
-#define XFS_BTREE_SBLOCK_LEN   16      /* size of a short form block */
-#define XFS_BTREE_LBLOCK_LEN   24      /* size of a long form block */
-
-/* sizes of CRC enabled btree blocks */
-#define XFS_BTREE_SBLOCK_CRC_LEN       (XFS_BTREE_SBLOCK_LEN + 40)
-#define XFS_BTREE_LBLOCK_CRC_LEN       (XFS_BTREE_LBLOCK_LEN + 48)
-
-#define XFS_BTREE_SBLOCK_CRC_OFF \
-       offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
-#define XFS_BTREE_LBLOCK_CRC_OFF \
-       offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
-
-#endif /* __XFS_FORMAT_H__ */
index d34703dbcb423b7fdbb52438b2525dd74417f1eb..18dc721ca19f85f7436ab7c3241e28506521abee 100644 (file)
@@ -255,8 +255,8 @@ typedef struct xfs_fsop_resblks {
        ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES)
 
 /* Used for sanity checks on superblock */
-#define XFS_MAX_DBLOCKS(s) ((xfs_drfsbno_t)(s)->sb_agcount * (s)->sb_agblocks)
-#define XFS_MIN_DBLOCKS(s) ((xfs_drfsbno_t)((s)->sb_agcount - 1) *     \
+#define XFS_MAX_DBLOCKS(s) ((xfs_rfsblock_t)(s)->sb_agcount * (s)->sb_agblocks)
+#define XFS_MIN_DBLOCKS(s) ((xfs_rfsblock_t)((s)->sb_agcount - 1) *    \
                         (s)->sb_agblocks + XFS_MIN_AG_BLOCKS)
 
 /*
@@ -375,6 +375,9 @@ struct xfs_fs_eofblocks {
 #define XFS_EOF_FLAGS_GID              (1 << 2) /* filter by gid */
 #define XFS_EOF_FLAGS_PRID             (1 << 3) /* filter by project id */
 #define XFS_EOF_FLAGS_MINFILESIZE      (1 << 4) /* filter by min file size */
+#define XFS_EOF_FLAGS_UNION            (1 << 5) /* union filter algorithm;
+                                                 * kernel only, not included in
+                                                 * valid mask */
 #define XFS_EOF_FLAGS_VALID    \
        (XFS_EOF_FLAGS_SYNC |   \
         XFS_EOF_FLAGS_UID |    \
index d2295561570af03ffc5971dcf7cf67630cd4f6e5..f91de1ef05e1024b24e2f13046f828f3be497d15 100644 (file)
@@ -168,7 +168,7 @@ xfs_growfs_data_private(
        nb = in->newblocks;
        pct = in->imaxpct;
        if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
                return error;
        dpct = pct - mp->m_sb.sb_imax_pct;
@@ -176,7 +176,7 @@ xfs_growfs_data_private(
                                XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
                                XFS_FSS_TO_BB(mp, 1), 0, NULL);
        if (!bp)
-               return EIO;
+               return -EIO;
        if (bp->b_error) {
                error = bp->b_error;
                xfs_buf_relse(bp);
@@ -191,7 +191,7 @@ xfs_growfs_data_private(
                nagcount--;
                nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
                if (nb < mp->m_sb.sb_dblocks)
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
        }
        new = nb - mp->m_sb.sb_dblocks;
        oagcount = mp->m_sb.sb_agcount;
@@ -229,7 +229,7 @@ xfs_growfs_data_private(
                                XFS_FSS_TO_BB(mp, 1), 0,
                                &xfs_agf_buf_ops);
                if (!bp) {
-                       error = ENOMEM;
+                       error = -ENOMEM;
                        goto error0;
                }
 
@@ -270,7 +270,7 @@ xfs_growfs_data_private(
                                XFS_FSS_TO_BB(mp, 1), 0,
                                &xfs_agfl_buf_ops);
                if (!bp) {
-                       error = ENOMEM;
+                       error = -ENOMEM;
                        goto error0;
                }
 
@@ -298,7 +298,7 @@ xfs_growfs_data_private(
                                XFS_FSS_TO_BB(mp, 1), 0,
                                &xfs_agi_buf_ops);
                if (!bp) {
-                       error = ENOMEM;
+                       error = -ENOMEM;
                        goto error0;
                }
 
@@ -336,7 +336,7 @@ xfs_growfs_data_private(
                                &xfs_allocbt_buf_ops);
 
                if (!bp) {
-                       error = ENOMEM;
+                       error = -ENOMEM;
                        goto error0;
                }
 
@@ -365,7 +365,7 @@ xfs_growfs_data_private(
                                BTOBB(mp->m_sb.sb_blocksize), 0,
                                &xfs_allocbt_buf_ops);
                if (!bp) {
-                       error = ENOMEM;
+                       error = -ENOMEM;
                        goto error0;
                }
 
@@ -395,7 +395,7 @@ xfs_growfs_data_private(
                                BTOBB(mp->m_sb.sb_blocksize), 0,
                                &xfs_inobt_buf_ops);
                if (!bp) {
-                       error = ENOMEM;
+                       error = -ENOMEM;
                        goto error0;
                }
 
@@ -420,7 +420,7 @@ xfs_growfs_data_private(
                                BTOBB(mp->m_sb.sb_blocksize), 0,
                                &xfs_inobt_buf_ops);
                        if (!bp) {
-                               error = ENOMEM;
+                               error = -ENOMEM;
                                goto error0;
                        }
 
@@ -531,7 +531,7 @@ xfs_growfs_data_private(
                                bp->b_ops = &xfs_sb_buf_ops;
                                xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
                        } else
-                               error = ENOMEM;
+                               error = -ENOMEM;
                }
 
                /*
@@ -576,17 +576,17 @@ xfs_growfs_log_private(
 
        nb = in->newblocks;
        if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        if (nb == mp->m_sb.sb_logblocks &&
            in->isint == (mp->m_sb.sb_logstart != 0))
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        /*
         * Moving the log is hard, need new interfaces to sync
         * the log first, hold off all activity while moving it.
         * Can have shorter or longer log in the same space,
         * or transform internal to external log or vice versa.
         */
-       return XFS_ERROR(ENOSYS);
+       return -ENOSYS;
 }
 
 /*
@@ -604,9 +604,9 @@ xfs_growfs_data(
        int error;
 
        if (!capable(CAP_SYS_ADMIN))
-               return XFS_ERROR(EPERM);
+               return -EPERM;
        if (!mutex_trylock(&mp->m_growlock))
-               return XFS_ERROR(EWOULDBLOCK);
+               return -EWOULDBLOCK;
        error = xfs_growfs_data_private(mp, in);
        mutex_unlock(&mp->m_growlock);
        return error;
@@ -620,9 +620,9 @@ xfs_growfs_log(
        int error;
 
        if (!capable(CAP_SYS_ADMIN))
-               return XFS_ERROR(EPERM);
+               return -EPERM;
        if (!mutex_trylock(&mp->m_growlock))
-               return XFS_ERROR(EWOULDBLOCK);
+               return -EWOULDBLOCK;
        error = xfs_growfs_log_private(mp, in);
        mutex_unlock(&mp->m_growlock);
        return error;
@@ -674,7 +674,7 @@ xfs_reserve_blocks(
        /* If inval is null, report current values and return */
        if (inval == (__uint64_t *)NULL) {
                if (!outval)
-                       return EINVAL;
+                       return -EINVAL;
                outval->resblks = mp->m_resblks;
                outval->resblks_avail = mp->m_resblks_avail;
                return 0;
@@ -757,7 +757,7 @@ out:
                int error;
                error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
                                                 fdblks_delta, 0);
-               if (error == ENOSPC)
+               if (error == -ENOSPC)
                        goto retry;
        }
        return 0;
@@ -818,7 +818,7 @@ xfs_fs_goingdown(
                                SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
                break;
        default:
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        }
 
        return 0;
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
deleted file mode 100644 (file)
index 5960e55..0000000
+++ /dev/null
@@ -1,2189 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_bmap.h"
-#include "xfs_cksum.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
-#include "xfs_icreate_item.h"
-#include "xfs_icache.h"
-#include "xfs_dinode.h"
-#include "xfs_trace.h"
-
-
-/*
- * Allocation group level functions.
- */
-static inline int
-xfs_ialloc_cluster_alignment(
-       xfs_alloc_arg_t *args)
-{
-       if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
-           args->mp->m_sb.sb_inoalignmt >=
-            XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))
-               return args->mp->m_sb.sb_inoalignmt;
-       return 1;
-}
-
-/*
- * Lookup a record by ino in the btree given by cur.
- */
-int                                    /* error */
-xfs_inobt_lookup(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_agino_t             ino,    /* starting inode of chunk */
-       xfs_lookup_t            dir,    /* <=, >=, == */
-       int                     *stat)  /* success/failure */
-{
-       cur->bc_rec.i.ir_startino = ino;
-       cur->bc_rec.i.ir_freecount = 0;
-       cur->bc_rec.i.ir_free = 0;
-       return xfs_btree_lookup(cur, dir, stat);
-}
-
-/*
- * Update the record referred to by cur to the value given.
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-STATIC int                             /* error */
-xfs_inobt_update(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_inobt_rec_incore_t  *irec)  /* btree record */
-{
-       union xfs_btree_rec     rec;
-
-       rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
-       rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
-       rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
-       return xfs_btree_update(cur, &rec);
-}
-
-/*
- * Get the data from the pointed-to record.
- */
-int                                    /* error */
-xfs_inobt_get_rec(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_inobt_rec_incore_t  *irec,  /* btree record */
-       int                     *stat)  /* output: success/failure */
-{
-       union xfs_btree_rec     *rec;
-       int                     error;
-
-       error = xfs_btree_get_rec(cur, &rec, stat);
-       if (!error && *stat == 1) {
-               irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
-               irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
-               irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
-       }
-       return error;
-}
-
-/*
- * Insert a single inobt record. Cursor must already point to desired location.
- */
-STATIC int
-xfs_inobt_insert_rec(
-       struct xfs_btree_cur    *cur,
-       __int32_t               freecount,
-       xfs_inofree_t           free,
-       int                     *stat)
-{
-       cur->bc_rec.i.ir_freecount = freecount;
-       cur->bc_rec.i.ir_free = free;
-       return xfs_btree_insert(cur, stat);
-}
-
-/*
- * Insert records describing a newly allocated inode chunk into the inobt.
- */
-STATIC int
-xfs_inobt_insert(
-       struct xfs_mount        *mp,
-       struct xfs_trans        *tp,
-       struct xfs_buf          *agbp,
-       xfs_agino_t             newino,
-       xfs_agino_t             newlen,
-       xfs_btnum_t             btnum)
-{
-       struct xfs_btree_cur    *cur;
-       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
-       xfs_agnumber_t          agno = be32_to_cpu(agi->agi_seqno);
-       xfs_agino_t             thisino;
-       int                     i;
-       int                     error;
-
-       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
-
-       for (thisino = newino;
-            thisino < newino + newlen;
-            thisino += XFS_INODES_PER_CHUNK) {
-               error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
-               if (error) {
-                       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-                       return error;
-               }
-               ASSERT(i == 0);
-
-               error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
-                                            XFS_INOBT_ALL_FREE, &i);
-               if (error) {
-                       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-                       return error;
-               }
-               ASSERT(i == 1);
-       }
-
-       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-
-       return 0;
-}
-
-/*
- * Verify that the number of free inodes in the AGI is correct.
- */
-#ifdef DEBUG
-STATIC int
-xfs_check_agi_freecount(
-       struct xfs_btree_cur    *cur,
-       struct xfs_agi          *agi)
-{
-       if (cur->bc_nlevels == 1) {
-               xfs_inobt_rec_incore_t rec;
-               int             freecount = 0;
-               int             error;
-               int             i;
-
-               error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
-               if (error)
-                       return error;
-
-               do {
-                       error = xfs_inobt_get_rec(cur, &rec, &i);
-                       if (error)
-                               return error;
-
-                       if (i) {
-                               freecount += rec.ir_freecount;
-                               error = xfs_btree_increment(cur, 0, &i);
-                               if (error)
-                                       return error;
-                       }
-               } while (i == 1);
-
-               if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
-                       ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
-       }
-       return 0;
-}
-#else
-#define xfs_check_agi_freecount(cur, agi)      0
-#endif
-
-/*
- * Initialise a new set of inodes. When called without a transaction context
- * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
- * than logging them (which in a transaction context puts them into the AIL
- * for writeback rather than the xfsbufd queue).
- */
-int
-xfs_ialloc_inode_init(
-       struct xfs_mount        *mp,
-       struct xfs_trans        *tp,
-       struct list_head        *buffer_list,
-       xfs_agnumber_t          agno,
-       xfs_agblock_t           agbno,
-       xfs_agblock_t           length,
-       unsigned int            gen)
-{
-       struct xfs_buf          *fbuf;
-       struct xfs_dinode       *free;
-       int                     nbufs, blks_per_cluster, inodes_per_cluster;
-       int                     version;
-       int                     i, j;
-       xfs_daddr_t             d;
-       xfs_ino_t               ino = 0;
-
-       /*
-        * Loop over the new block(s), filling in the inodes.  For small block
-        * sizes, manipulate the inodes in buffers which are multiples of the
-        * block size.
-        */
-       blks_per_cluster = xfs_icluster_size_fsb(mp);
-       inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
-       nbufs = length / blks_per_cluster;
-
-       /*
-        * Figure out what version number to use in the inodes we create.  If
-        * the superblock version has caught up to the one that supports the new
-        * inode format, then use the new inode version.  Otherwise use the old
-        * version so that old kernels will continue to be able to use the file
-        * system.
-        *
-        * For v3 inodes, we also need to write the inode number into the inode,
-        * so calculate the first inode number of the chunk here as
-        * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
-        * across multiple filesystem blocks (such as a cluster) and so cannot
-        * be used in the cluster buffer loop below.
-        *
-        * Further, because we are writing the inode directly into the buffer
-        * and calculating a CRC on the entire inode, we have to log the entire
-        * inode so that the entire range the CRC covers is present in the log.
-        * That means for v3 inodes we log the entire buffer rather than just the
-        * inode cores.
-        */
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               version = 3;
-               ino = XFS_AGINO_TO_INO(mp, agno,
-                                      XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
-
-               /*
-                * Log the initialisation that is about to take place as a
-                * logical operation. This means the transaction does not
-                * need to log the physical changes to the inode buffers as log
-                * recovery will know what initialisation is actually needed.
-                * Hence we only need to log the buffers as "ordered" buffers
-                * so they are tracked in the AIL as if they were physically
-                * logged.
-                */
-               if (tp)
-                       xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
-                                       mp->m_sb.sb_inodesize, length, gen);
-       } else
-               version = 2;
-
-       for (j = 0; j < nbufs; j++) {
-               /*
-                * Get the block.
-                */
-               d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
-               fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
-                                        mp->m_bsize * blks_per_cluster,
-                                        XBF_UNMAPPED);
-               if (!fbuf)
-                       return ENOMEM;
-
-               /* Initialize the inode buffers and log them appropriately. */
-               fbuf->b_ops = &xfs_inode_buf_ops;
-               xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
-               for (i = 0; i < inodes_per_cluster; i++) {
-                       int     ioffset = i << mp->m_sb.sb_inodelog;
-                       uint    isize = xfs_dinode_size(version);
-
-                       free = xfs_make_iptr(mp, fbuf, i);
-                       free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
-                       free->di_version = version;
-                       free->di_gen = cpu_to_be32(gen);
-                       free->di_next_unlinked = cpu_to_be32(NULLAGINO);
-
-                       if (version == 3) {
-                               free->di_ino = cpu_to_be64(ino);
-                               ino++;
-                               uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
-                               xfs_dinode_calc_crc(mp, free);
-                       } else if (tp) {
-                               /* just log the inode core */
-                               xfs_trans_log_buf(tp, fbuf, ioffset,
-                                                 ioffset + isize - 1);
-                       }
-               }
-
-               if (tp) {
-                       /*
-                        * Mark the buffer as an inode allocation buffer so it
-                        * sticks in the AIL at the point of this allocation
-                        * transaction. This ensures it is on disk before
-                        * the tail of the log can be moved past this
-                        * transaction (i.e. by preventing relogging from moving
-                        * it forward in the log).
-                        */
-                       xfs_trans_inode_alloc_buf(tp, fbuf);
-                       if (version == 3) {
-                               /*
-                                * Mark the buffer as ordered so that it is
-                                * not physically logged in the transaction but
-                                * is still tracked in the AIL as part of the
-                                * transaction and pins the log appropriately.
-                                */
-                               xfs_trans_ordered_buf(tp, fbuf);
-                               xfs_trans_log_buf(tp, fbuf, 0,
-                                                 BBTOB(fbuf->b_length) - 1);
-                       }
-               } else {
-                       fbuf->b_flags |= XBF_DONE;
-                       xfs_buf_delwri_queue(fbuf, buffer_list);
-                       xfs_buf_relse(fbuf);
-               }
-       }
-       return 0;
-}
-
-/*
- * Allocate new inodes in the allocation group specified by agbp.
- * Return 0 for success, else error code.
- */
-STATIC int                             /* error code or 0 */
-xfs_ialloc_ag_alloc(
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_buf_t       *agbp,          /* alloc group buffer */
-       int             *alloc)
-{
-       xfs_agi_t       *agi;           /* allocation group header */
-       xfs_alloc_arg_t args;           /* allocation argument structure */
-       xfs_agnumber_t  agno;
-       int             error;
-       xfs_agino_t     newino;         /* new first inode's number */
-       xfs_agino_t     newlen;         /* new number of inodes */
-       int             isaligned = 0;  /* inode allocation at stripe unit */
-                                       /* boundary */
-       struct xfs_perag *pag;
-
-       memset(&args, 0, sizeof(args));
-       args.tp = tp;
-       args.mp = tp->t_mountp;
-
-       /*
-        * Locking will ensure that we don't have two callers in here
-        * at one time.
-        */
-       newlen = args.mp->m_ialloc_inos;
-       if (args.mp->m_maxicount &&
-           args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
-               return XFS_ERROR(ENOSPC);
-       args.minlen = args.maxlen = args.mp->m_ialloc_blks;
-       /*
-        * First try to allocate inodes contiguous with the last-allocated
-        * chunk of inodes.  If the filesystem is striped, this will fill
-        * an entire stripe unit with inodes.
-        */
-       agi = XFS_BUF_TO_AGI(agbp);
-       newino = be32_to_cpu(agi->agi_newino);
-       agno = be32_to_cpu(agi->agi_seqno);
-       args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
-                    args.mp->m_ialloc_blks;
-       if (likely(newino != NULLAGINO &&
-                 (args.agbno < be32_to_cpu(agi->agi_length)))) {
-               args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
-               args.type = XFS_ALLOCTYPE_THIS_BNO;
-               args.prod = 1;
-
-               /*
-                * We need to take into account alignment here to ensure that
-                * we don't modify the free list if we fail to have an exact
-                * block. If we don't have an exact match, and every other
-                * allocation attempt fails, we'll end up cancelling a dirty
-                * transaction and shutting down.
-                *
-                * For an exact allocation, alignment must be 1,
-                * however we need to take cluster alignment into account when
-                * fixing up the freelist. Use the minalignslop field to
-                * indicate that extra blocks might be required for alignment,
-                * but not to use them in the actual exact allocation.
-                */
-               args.alignment = 1;
-               args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
-
-               /* Allow space for the inode btree to split. */
-               args.minleft = args.mp->m_in_maxlevels - 1;
-               if ((error = xfs_alloc_vextent(&args)))
-                       return error;
-
-               /*
-                * This request might have dirtied the transaction if the AG can
-                * satisfy the request, but the exact block was not available.
-                * If the allocation did fail, subsequent requests will relax
-                * the exact agbno requirement and increase the alignment
-                * instead. It is critical that the total size of the request
-                * (len + alignment + slop) does not increase from this point
-                * on, so reset minalignslop to ensure it is not included in
-                * subsequent requests.
-                */
-               args.minalignslop = 0;
-       } else
-               args.fsbno = NULLFSBLOCK;
-
-       if (unlikely(args.fsbno == NULLFSBLOCK)) {
-               /*
-                * Set the alignment for the allocation.
-                * If stripe alignment is turned on then align at stripe unit
-                * boundary.
-                * If the cluster size is smaller than a filesystem block
-                * then we're doing I/O for inodes in filesystem block size
-                * pieces, so don't need alignment anyway.
-                */
-               isaligned = 0;
-               if (args.mp->m_sinoalign) {
-                       ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
-                       args.alignment = args.mp->m_dalign;
-                       isaligned = 1;
-               } else
-                       args.alignment = xfs_ialloc_cluster_alignment(&args);
-               /*
-                * Need to figure out where to allocate the inode blocks.
-                * Ideally they should be spaced out through the a.g.
-                * For now, just allocate blocks up front.
-                */
-               args.agbno = be32_to_cpu(agi->agi_root);
-               args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
-               /*
-                * Allocate a fixed-size extent of inodes.
-                */
-               args.type = XFS_ALLOCTYPE_NEAR_BNO;
-               args.prod = 1;
-               /*
-                * Allow space for the inode btree to split.
-                */
-               args.minleft = args.mp->m_in_maxlevels - 1;
-               if ((error = xfs_alloc_vextent(&args)))
-                       return error;
-       }
-
-       /*
-        * If stripe alignment is turned on, then try again with cluster
-        * alignment.
-        */
-       if (isaligned && args.fsbno == NULLFSBLOCK) {
-               args.type = XFS_ALLOCTYPE_NEAR_BNO;
-               args.agbno = be32_to_cpu(agi->agi_root);
-               args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
-               args.alignment = xfs_ialloc_cluster_alignment(&args);
-               if ((error = xfs_alloc_vextent(&args)))
-                       return error;
-       }
-
-       if (args.fsbno == NULLFSBLOCK) {
-               *alloc = 0;
-               return 0;
-       }
-       ASSERT(args.len == args.minlen);
-
-       /*
-        * Stamp and write the inode buffers.
-        *
-        * Seed the new inode cluster with a random generation number. This
-        * prevents short-term reuse of generation numbers if a chunk is
-        * freed and then immediately reallocated. We use random numbers
-        * rather than a linear progression to prevent the next generation
-        * number from being easily guessable.
-        */
-       error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
-                       args.len, prandom_u32());
-
-       if (error)
-               return error;
-       /*
-        * Convert the results.
-        */
-       newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
-       be32_add_cpu(&agi->agi_count, newlen);
-       be32_add_cpu(&agi->agi_freecount, newlen);
-       pag = xfs_perag_get(args.mp, agno);
-       pag->pagi_freecount += newlen;
-       xfs_perag_put(pag);
-       agi->agi_newino = cpu_to_be32(newino);
-
-       /*
-        * Insert records describing the new inode chunk into the btrees.
-        */
-       error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-                                XFS_BTNUM_INO);
-       if (error)
-               return error;
-
-       if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
-               error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-                                        XFS_BTNUM_FINO);
-               if (error)
-                       return error;
-       }
-       /*
-        * Log allocation group header fields
-        */
-       xfs_ialloc_log_agi(tp, agbp,
-               XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
-       /*
-        * Modify/log superblock values for inode count and inode free count.
-        */
-       xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
-       xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
-       *alloc = 1;
-       return 0;
-}
-
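-/*
- * Return the AG to start the next inode allocation in, advancing the
- * per-mount rotor so that allocations are spread across all AGs.
- */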
-STATIC xfs_agnumber_t
-xfs_ialloc_next_ag(
-       xfs_mount_t     *mp)
-{
-       xfs_agnumber_t  agno;
-
-       spin_lock(&mp->m_agirotor_lock);
-       agno = mp->m_agirotor;
-       if (++mp->m_agirotor >= mp->m_maxagi)
-               mp->m_agirotor = 0;
-       spin_unlock(&mp->m_agirotor_lock);
-
-       return agno;
-}
-
-/*
- * Select an allocation group to look for a free inode in, based on the parent
- * inode and the mode.  Return the allocation group buffer.
- */
-STATIC xfs_agnumber_t
-xfs_ialloc_ag_select(
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_ino_t       parent,         /* parent directory inode number */
-       umode_t         mode,           /* bits set to indicate file type */
-       int             okalloc)        /* ok to allocate more space */
-{
-       xfs_agnumber_t  agcount;        /* number of ag's in the filesystem */
-       xfs_agnumber_t  agno;           /* current ag number */
-       int             flags;          /* alloc buffer locking flags */
-       xfs_extlen_t    ineed;          /* blocks needed for inode allocation */
-       xfs_extlen_t    longest = 0;    /* longest extent available */
-       xfs_mount_t     *mp;            /* mount point structure */
-       int             needspace;      /* file mode implies space allocated */
-       xfs_perag_t     *pag;           /* per allocation group data */
-       xfs_agnumber_t  pagno;          /* parent (starting) ag number */
-       int             error;
-
-       /*
-        * Files of these types need at least one block if length > 0
-        * (and they won't fit in the inode, but that's hard to figure out).
-        */
-       needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
-       mp = tp->t_mountp;
-       agcount = mp->m_maxagi;
-       if (S_ISDIR(mode))
-               pagno = xfs_ialloc_next_ag(mp);
-       else {
-               pagno = XFS_INO_TO_AGNO(mp, parent);
-               if (pagno >= agcount)
-                       pagno = 0;
-       }
-
-       ASSERT(pagno < agcount);
-
-       /*
-        * Loop through allocation groups, looking for one with a little
-        * free space in it.  Note we don't look for free inodes, exactly.
-        * Instead, if new inodes may need to be allocated, we also require
-        * that blocks can be allocated to hold them when none are
-        * currently free.
-        */
-       agno = pagno;
-       flags = XFS_ALLOC_FLAG_TRYLOCK;
-       for (;;) {
-               pag = xfs_perag_get(mp, agno);
-               if (!pag->pagi_inodeok) {
-                       xfs_ialloc_next_ag(mp);
-                       goto nextag;
-               }
-
-               if (!pag->pagi_init) {
-                       error = xfs_ialloc_pagi_init(mp, tp, agno);
-                       if (error)
-                               goto nextag;
-               }
-
-               if (pag->pagi_freecount) {
-                       xfs_perag_put(pag);
-                       return agno;
-               }
-
-               if (!okalloc)
-                       goto nextag;
-
-               if (!pag->pagf_init) {
-                       error = xfs_alloc_pagf_init(mp, tp, agno, flags);
-                       if (error)
-                               goto nextag;
-               }
-
-               /*
-                * Is there enough free space for the file plus a block of
-                * inodes (if we need to allocate some)?
-                */
-               ineed = mp->m_ialloc_blks;
-               longest = pag->pagf_longest;
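-               /* a non-empty AGFL counts as a single free block */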
-               if (!longest)
-                       longest = pag->pagf_flcount > 0;
-
-               if (pag->pagf_freeblks >= needspace + ineed &&
-                   longest >= ineed) {
-                       xfs_perag_put(pag);
-                       return agno;
-               }
-nextag:
-               xfs_perag_put(pag);
-               /*
-                * No point in iterating over the rest, if we're shutting
-                * down.
-                */
-               if (XFS_FORCED_SHUTDOWN(mp))
-                       return NULLAGNUMBER;
-               agno++;
-               if (agno >= agcount)
-                       agno = 0;
-               if (agno == pagno) {
-                       if (flags == 0)
-                               return NULLAGNUMBER;
-                       flags = 0;
-               }
-       }
-}
-
-/*
- * Try to retrieve the next record to the left/right from the current one.
- */
-STATIC int
-xfs_ialloc_next_rec(
-       struct xfs_btree_cur    *cur,
-       xfs_inobt_rec_incore_t  *rec,
-       int                     *done,
-       int                     left)
-{
-       int                     error;
-       int                     i;
-
-       if (left)
-               error = xfs_btree_decrement(cur, 0, &i);
-       else
-               error = xfs_btree_increment(cur, 0, &i);
-
-       if (error)
-               return error;
-       *done = !i;
-       if (i) {
-               error = xfs_inobt_get_rec(cur, rec, &i);
-               if (error)
-                       return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
-       }
-
-       return 0;
-}
-
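-/*
- * Look up the inode chunk record starting at agino; set *done if no
- * such record exists.
- */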
-STATIC int
-xfs_ialloc_get_rec(
-       struct xfs_btree_cur    *cur,
-       xfs_agino_t             agino,
-       xfs_inobt_rec_incore_t  *rec,
-       int                     *done)
-{
-       int                     error;
-       int                     i;
-
-       error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
-       if (error)
-               return error;
-       *done = !i;
-       if (i) {
-               error = xfs_inobt_get_rec(cur, rec, &i);
-               if (error)
-                       return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
-       }
-
-       return 0;
-}
-
-/*
- * Allocate an inode using the inobt-only algorithm.
- */
-STATIC int
-xfs_dialloc_ag_inobt(
-       struct xfs_trans        *tp,
-       struct xfs_buf          *agbp,
-       xfs_ino_t               parent,
-       xfs_ino_t               *inop)
-{
-       struct xfs_mount        *mp = tp->t_mountp;
-       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
-       xfs_agnumber_t          agno = be32_to_cpu(agi->agi_seqno);
-       xfs_agnumber_t          pagno = XFS_INO_TO_AGNO(mp, parent);
-       xfs_agino_t             pagino = XFS_INO_TO_AGINO(mp, parent);
-       struct xfs_perag        *pag;
-       struct xfs_btree_cur    *cur, *tcur;
-       struct xfs_inobt_rec_incore rec, trec;
-       xfs_ino_t               ino;
-       int                     error;
-       int                     offset;
-       int                     i, j;
-
-       pag = xfs_perag_get(mp, agno);
-
-       ASSERT(pag->pagi_init);
-       ASSERT(pag->pagi_inodeok);
-       ASSERT(pag->pagi_freecount > 0);
-
- restart_pagno:
-       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
-       /*
-        * If pagino is 0 (this is the root inode allocation) use newino.
-        * This must work because we've just allocated some.
-        */
-       if (!pagino)
-               pagino = be32_to_cpu(agi->agi_newino);
-
-       error = xfs_check_agi_freecount(cur, agi);
-       if (error)
-               goto error0;
-
-       /*
-        * If in the same AG as the parent, try to get near the parent.
-        */
-       if (pagno == agno) {
-               int             doneleft;       /* done, to the left */
-               int             doneright;      /* done, to the right */
-               int             searchdistance = 10;
-
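-               /*
-                * Find the chunk containing pagino, then walk left and
-                * right of it looking for a chunk with free inodes; the
-                * walk below is bounded by searchdistance steps in total.
-                */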
-               error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
-               if (error)
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-               error = xfs_inobt_get_rec(cur, &rec, &j);
-               if (error)
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
-
-               if (rec.ir_freecount > 0) {
-                       /*
-                        * Found a free inode in the same chunk
-                        * as the parent, done.
-                        */
-                       goto alloc_inode;
-               }
-
-               /*
-                * In the same AG as parent, but parent's chunk is full.
-                */
-
-               /* duplicate the cursor, search left & right simultaneously */
-               error = xfs_btree_dup_cursor(cur, &tcur);
-               if (error)
-                       goto error0;
-
-               /*
-                * Skip to last blocks looked up if same parent inode.
-                */
-               if (pagino != NULLAGINO &&
-                   pag->pagl_pagino == pagino &&
-                   pag->pagl_leftrec != NULLAGINO &&
-                   pag->pagl_rightrec != NULLAGINO) {
-                       error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
-                                                  &trec, &doneleft);
-                       if (error)
-                               goto error1;
-
-                       error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
-                                                  &rec, &doneright);
-                       if (error)
-                               goto error1;
-               } else {
-                       /* search left with tcur, back up 1 record */
-                       error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
-                       if (error)
-                               goto error1;
-
-                       /* search right with cur, go forward 1 record. */
-                       error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
-                       if (error)
-                               goto error1;
-               }
-
-               /*
-                * Loop until we find an inode chunk with a free inode.
-                */
-               while (!doneleft || !doneright) {
-                       int     useleft;  /* using left inode chunk this time */
-
-                       if (!--searchdistance) {
-                               /*
-                                * Not in range - save last search
-                                * location and allocate a new inode
-                                */
-                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-                               pag->pagl_leftrec = trec.ir_startino;
-                               pag->pagl_rightrec = rec.ir_startino;
-                               pag->pagl_pagino = pagino;
-                               goto newino;
-                       }
-
-                       /* figure out the closer block if both are valid. */
-                       if (!doneleft && !doneright) {
-                               useleft = pagino -
-                                (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
-                                 rec.ir_startino - pagino;
-                       } else {
-                               useleft = !doneleft;
-                       }
-
-                       /* free inodes to the left? */
-                       if (useleft && trec.ir_freecount) {
-                               rec = trec;
-                               xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-                               cur = tcur;
-
-                               pag->pagl_leftrec = trec.ir_startino;
-                               pag->pagl_rightrec = rec.ir_startino;
-                               pag->pagl_pagino = pagino;
-                               goto alloc_inode;
-                       }
-
-                       /* free inodes to the right? */
-                       if (!useleft && rec.ir_freecount) {
-                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-
-                               pag->pagl_leftrec = trec.ir_startino;
-                               pag->pagl_rightrec = rec.ir_startino;
-                               pag->pagl_pagino = pagino;
-                               goto alloc_inode;
-                       }
-
-                       /* get next record to check */
-                       if (useleft) {
-                               error = xfs_ialloc_next_rec(tcur, &trec,
-                                                                &doneleft, 1);
-                       } else {
-                               error = xfs_ialloc_next_rec(cur, &rec,
-                                                                &doneright, 0);
-                       }
-                       if (error)
-                               goto error1;
-               }
-
-               /*
-                * We've reached the end of the btree. Because we only
-                * search a small chunk of the btree on each lookup, there
-                * are obviously free inodes closer to the parent inode
-                * than where we are now. Restart the search.
-               pag->pagl_pagino = NULLAGINO;
-               pag->pagl_leftrec = NULLAGINO;
-               pag->pagl_rightrec = NULLAGINO;
-               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-               xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-               goto restart_pagno;
-       }
-
-       /*
-        * In a different AG from the parent.
-        * See if the most recently allocated block has any free.
-        */
-newino:
-       if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
-               error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
-                                        XFS_LOOKUP_EQ, &i);
-               if (error)
-                       goto error0;
-
-               if (i == 1) {
-                       error = xfs_inobt_get_rec(cur, &rec, &j);
-                       if (error)
-                               goto error0;
-
-                       if (j == 1 && rec.ir_freecount > 0) {
-                               /*
-                                * The last chunk allocated in the group
-                                * still has a free inode.
-                                */
-                               goto alloc_inode;
-                       }
-               }
-       }
-
-       /*
-        * None left in the last group, search the whole AG
-        */
-       error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
-       if (error)
-               goto error0;
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-       for (;;) {
-               error = xfs_inobt_get_rec(cur, &rec, &i);
-               if (error)
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if (rec.ir_freecount > 0)
-                       break;
-               error = xfs_btree_increment(cur, 0, &i);
-               if (error)
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-       }
-
-alloc_inode:
-       offset = xfs_lowbit64(rec.ir_free);
-       ASSERT(offset >= 0);
-       ASSERT(offset < XFS_INODES_PER_CHUNK);
-       ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
-                                  XFS_INODES_PER_CHUNK) == 0);
-       ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
-       rec.ir_free &= ~XFS_INOBT_MASK(offset);
-       rec.ir_freecount--;
-       error = xfs_inobt_update(cur, &rec);
-       if (error)
-               goto error0;
-       be32_add_cpu(&agi->agi_freecount, -1);
-       xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
-       pag->pagi_freecount--;
-
-       error = xfs_check_agi_freecount(cur, agi);
-       if (error)
-               goto error0;
-
-       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-       xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
-       xfs_perag_put(pag);
-       *inop = ino;
-       return 0;
-error1:
-       xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-error0:
-       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-       xfs_perag_put(pag);
-       return error;
-}
-
-/*
- * Use the free inode btree to allocate an inode based on distance from the
- * parent. Note that the provided cursor may be deleted and replaced.
- */
-STATIC int
-xfs_dialloc_ag_finobt_near(
-       xfs_agino_t                     pagino,
-       struct xfs_btree_cur            **ocur,
-       struct xfs_inobt_rec_incore     *rec)
-{
-       struct xfs_btree_cur            *lcur = *ocur;  /* left search cursor */
-       struct xfs_btree_cur            *rcur;  /* right search cursor */
-       struct xfs_inobt_rec_incore     rrec;
-       int                             error;
-       int                             i, j;
-
-       error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
-       if (error)
-               return error;
-
-       if (i == 1) {
-               error = xfs_inobt_get_rec(lcur, rec, &i);
-               if (error)
-                       return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
-
-               /*
-                * See if we've landed in the parent inode record. The finobt
-                * only tracks chunks with at least one free inode, so record
-                * existence is enough.
-                */
-               if (pagino >= rec->ir_startino &&
-                   pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
-                       return 0;
-       }
-
-       error = xfs_btree_dup_cursor(lcur, &rcur);
-       if (error)
-               return error;
-
-       error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
-       if (error)
-               goto error_rcur;
-       if (j == 1) {
-               error = xfs_inobt_get_rec(rcur, &rrec, &j);
-               if (error)
-                       goto error_rcur;
-               XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur);
-       }
-
-       XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur);
-       if (i == 1 && j == 1) {
-               /*
-                * Both the left and right records are valid. Choose the closer
-                * inode chunk to the target.
-                */
-               if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
-                   (rrec.ir_startino - pagino)) {
-                       *rec = rrec;
-                       xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
-                       *ocur = rcur;
-               } else {
-                       xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
-               }
-       } else if (j == 1) {
-               /* only the right record is valid */
-               *rec = rrec;
-               xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
-               *ocur = rcur;
-       } else if (i == 1) {
-               /* only the left record is valid */
-               xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
-       }
-
-       return 0;
-
-error_rcur:
-       xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
-       return error;
-}
-
-/*
- * Use the free inode btree to find a free inode based on a newino hint. If
- * the hint is NULLAGINO, find the first free inode in the AG.
- */
-STATIC int
-xfs_dialloc_ag_finobt_newino(
-       struct xfs_agi                  *agi,
-       struct xfs_btree_cur            *cur,
-       struct xfs_inobt_rec_incore     *rec)
-{
-       int error;
-       int i;
-
-       if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
-               error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
-                                        XFS_LOOKUP_EQ, &i);
-               if (error)
-                       return error;
-               if (i == 1) {
-                       error = xfs_inobt_get_rec(cur, rec, &i);
-                       if (error)
-                               return error;
-                       XFS_WANT_CORRUPTED_RETURN(i == 1);
-
-                       return 0;
-               }
-       }
-
-       /*
-        * Find the first inode available in the AG.
-        */
-       error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
-       if (error)
-               return error;
-       XFS_WANT_CORRUPTED_RETURN(i == 1);
-
-       error = xfs_inobt_get_rec(cur, rec, &i);
-       if (error)
-               return error;
-       XFS_WANT_CORRUPTED_RETURN(i == 1);
-
-       return 0;
-}
-
-/*
- * Update the inobt based on a modification made to the finobt. Also ensure that
- * the records from both trees are equivalent post-modification.
- */
-STATIC int
-xfs_dialloc_ag_update_inobt(
-       struct xfs_btree_cur            *cur,   /* inobt cursor */
-       struct xfs_inobt_rec_incore     *frec,  /* finobt record */
-       int                             offset) /* inode offset */
-{
-       struct xfs_inobt_rec_incore     rec;
-       int                             error;
-       int                             i;
-
-       error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
-       if (error)
-               return error;
-       XFS_WANT_CORRUPTED_RETURN(i == 1);
-
-       error = xfs_inobt_get_rec(cur, &rec, &i);
-       if (error)
-               return error;
-       XFS_WANT_CORRUPTED_RETURN(i == 1);
-       ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
-                                  XFS_INODES_PER_CHUNK) == 0);
-
-       rec.ir_free &= ~XFS_INOBT_MASK(offset);
-       rec.ir_freecount--;
-
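-       /* the updated inobt record must match the modified finobt record */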
-       XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
-                                 (rec.ir_freecount == frec->ir_freecount));
-
-       error = xfs_inobt_update(cur, &rec);
-       if (error)
-               return error;
-
-       return 0;
-}
-
-/*
- * Allocate an inode using the free inode btree, if available. Otherwise, fall
- * back to the inobt search algorithm.
- *
- * The caller selected an AG for us, and made sure that free inodes are
- * available.
- */
-STATIC int
-xfs_dialloc_ag(
-       struct xfs_trans        *tp,
-       struct xfs_buf          *agbp,
-       xfs_ino_t               parent,
-       xfs_ino_t               *inop)
-{
-       struct xfs_mount                *mp = tp->t_mountp;
-       struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
-       xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
-       xfs_agnumber_t                  pagno = XFS_INO_TO_AGNO(mp, parent);
-       xfs_agino_t                     pagino = XFS_INO_TO_AGINO(mp, parent);
-       struct xfs_perag                *pag;
-       struct xfs_btree_cur            *cur;   /* finobt cursor */
-       struct xfs_btree_cur            *icur;  /* inobt cursor */
-       struct xfs_inobt_rec_incore     rec;
-       xfs_ino_t                       ino;
-       int                             error;
-       int                             offset;
-       int                             i;
-
-       if (!xfs_sb_version_hasfinobt(&mp->m_sb))
-               return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
-
-       pag = xfs_perag_get(mp, agno);
-
-       /*
-        * If pagino is 0 (this is the root inode allocation) use newino.
-        * This must work because we've just allocated some.
-        */
-       if (!pagino)
-               pagino = be32_to_cpu(agi->agi_newino);
-
-       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
-
-       error = xfs_check_agi_freecount(cur, agi);
-       if (error)
-               goto error_cur;
-
-       /*
-        * The search algorithm depends on whether we're in the same AG as the
-        * parent. If so, find the closest available inode to the parent. If
-        * not, consider the agi hint or find the first free inode in the AG.
-        */
-       if (agno == pagno)
-               error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
-       else
-               error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
-       if (error)
-               goto error_cur;
-
-       offset = xfs_lowbit64(rec.ir_free);
-       ASSERT(offset >= 0);
-       ASSERT(offset < XFS_INODES_PER_CHUNK);
-       ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
-                                  XFS_INODES_PER_CHUNK) == 0);
-       ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
-
-       /*
-        * Modify or remove the finobt record.
-        */
-       rec.ir_free &= ~XFS_INOBT_MASK(offset);
-       rec.ir_freecount--;
-       if (rec.ir_freecount)
-               error = xfs_inobt_update(cur, &rec);
-       else
-               error = xfs_btree_delete(cur, &i);
-       if (error)
-               goto error_cur;
-
-       /*
-        * The finobt has now been updated appropriately. We haven't updated the
-        * agi and superblock yet, so we can create an inobt cursor and validate
-        * the original freecount. If all is well, make the equivalent update to
-        * the inobt using the finobt record and offset information.
-        */
-       icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
-
-       error = xfs_check_agi_freecount(icur, agi);
-       if (error)
-               goto error_icur;
-
-       error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
-       if (error)
-               goto error_icur;
-
-       /*
-        * Both trees have now been updated. We must update the perag and
-        * superblock before we can check the freecount for each btree.
-        */
-       be32_add_cpu(&agi->agi_freecount, -1);
-       xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
-       pag->pagi_freecount--;
-
-       xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
-
-       error = xfs_check_agi_freecount(icur, agi);
-       if (error)
-               goto error_icur;
-       error = xfs_check_agi_freecount(cur, agi);
-       if (error)
-               goto error_icur;
-
-       xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
-       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-       xfs_perag_put(pag);
-       *inop = ino;
-       return 0;
-
-error_icur:
-       xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
-error_cur:
-       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-       xfs_perag_put(pag);
-       return error;
-}
-
-/*
- * Allocate an inode on disk.
- *
- * Mode is used to tell whether the new inode will need space, and whether it
- * is a directory.
- *
- * This function is designed to be called twice if it has to do an allocation
- * to make more free inodes.  On the first call, *IO_agbp should be set to NULL.
- * If an inode is available without having to perform an allocation, an inode
- * number is returned.  In this case, *IO_agbp is set to NULL.  If an allocation
- * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
- * The caller should then commit the current transaction, allocate a
- * new transaction, and call xfs_dialloc() again, passing in the previous value
- * of *IO_agbp.  IO_agbp should be held across the transactions. Since the AGI
- * buffer is locked across the two calls, the second call is guaranteed to have
- * a free inode available.
- *
- * Once we successfully pick an inode its number is returned and the on-disk
- * data structures are updated.  The inode itself is not read in, since doing so
- * would break ordering constraints with xfs_reclaim.
- */
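-/*
- * A typical calling sequence is therefore, as a sketch (transaction
- * setup, commit and error handling elided):
- *
- *     agbp = NULL;
- *     error = xfs_dialloc(tp, parent, mode, okalloc, &agbp, &ino);
- *     if (!error && agbp != NULL) {
- *             <commit tp and allocate a new transaction, keeping agbp>
- *             error = xfs_dialloc(tp, parent, mode, okalloc, &agbp, &ino);
- *     }
- */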
-int
-xfs_dialloc(
-       struct xfs_trans        *tp,
-       xfs_ino_t               parent,
-       umode_t                 mode,
-       int                     okalloc,
-       struct xfs_buf          **IO_agbp,
-       xfs_ino_t               *inop)
-{
-       struct xfs_mount        *mp = tp->t_mountp;
-       struct xfs_buf          *agbp;
-       xfs_agnumber_t          agno;
-       int                     error;
-       int                     ialloced;
-       int                     noroom = 0;
-       xfs_agnumber_t          start_agno;
-       struct xfs_perag        *pag;
-
-       if (*IO_agbp) {
-               /*
-                * If the caller passes in a pointer to the AGI buffer,
-                * continue where we left off before.  In this case, we
-                * know that the allocation group has free inodes.
-                */
-               agbp = *IO_agbp;
-               goto out_alloc;
-       }
-
-       /*
-        * We do not have an agbp, so select an initial allocation
-        * group for inode allocation.
-        */
-       start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
-       if (start_agno == NULLAGNUMBER) {
-               *inop = NULLFSINO;
-               return 0;
-       }
-
-       /*
-        * If we have already hit the ceiling of inode blocks then clear
-        * okalloc so we scan all available agi structures for a free
-        * inode.
-        */
-       if (mp->m_maxicount &&
-           mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
-               noroom = 1;
-               okalloc = 0;
-       }
-
-       /*
-        * Loop until we find an allocation group that either has free inodes
-        * or in which we can allocate some inodes.  Iterate through the
-        * allocation groups upward, wrapping at the end.
-        */
-       agno = start_agno;
-       for (;;) {
-               pag = xfs_perag_get(mp, agno);
-               if (!pag->pagi_inodeok) {
-                       xfs_ialloc_next_ag(mp);
-                       goto nextag;
-               }
-
-               if (!pag->pagi_init) {
-                       error = xfs_ialloc_pagi_init(mp, tp, agno);
-                       if (error)
-                               goto out_error;
-               }
-
-               /*
-                * Do a first racy fast path check if this AG is usable.
-                */
-               if (!pag->pagi_freecount && !okalloc)
-                       goto nextag;
-
-               /*
-                * Then read in the AGI buffer and recheck with the AGI buffer
-                * lock held.
-                */
-               error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
-               if (error)
-                       goto out_error;
-
-               if (pag->pagi_freecount) {
-                       xfs_perag_put(pag);
-                       goto out_alloc;
-               }
-
-               if (!okalloc)
-                       goto nextag_relse_buffer;
-
-               error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
-               if (error) {
-                       xfs_trans_brelse(tp, agbp);
-
-                       if (error != ENOSPC)
-                               goto out_error;
-
-                       xfs_perag_put(pag);
-                       *inop = NULLFSINO;
-                       return 0;
-               }
-
-               if (ialloced) {
-                       /*
-                        * We successfully allocated some inodes, return
-                        * the current context to the caller so that it
-                        * can commit the current transaction and call
-                        * us again where we left off.
-                        */
-                       ASSERT(pag->pagi_freecount > 0);
-                       xfs_perag_put(pag);
-
-                       *IO_agbp = agbp;
-                       *inop = NULLFSINO;
-                       return 0;
-               }
-
-nextag_relse_buffer:
-               xfs_trans_brelse(tp, agbp);
-nextag:
-               xfs_perag_put(pag);
-               if (++agno == mp->m_sb.sb_agcount)
-                       agno = 0;
-               if (agno == start_agno) {
-                       *inop = NULLFSINO;
-                       return noroom ? ENOSPC : 0;
-               }
-       }
-
-out_alloc:
-       *IO_agbp = NULL;
-       return xfs_dialloc_ag(tp, agbp, parent, inop);
-out_error:
-       xfs_perag_put(pag);
-       return XFS_ERROR(error);
-}
-
-STATIC int
-xfs_difree_inobt(
-       struct xfs_mount                *mp,
-       struct xfs_trans                *tp,
-       struct xfs_buf                  *agbp,
-       xfs_agino_t                     agino,
-       struct xfs_bmap_free            *flist,
-       int                             *deleted,
-       xfs_ino_t                       *first_ino,
-       struct xfs_inobt_rec_incore     *orec)
-{
-       struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
-       xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
-       struct xfs_perag                *pag;
-       struct xfs_btree_cur            *cur;
-       struct xfs_inobt_rec_incore     rec;
-       int                             ilen;
-       int                             error;
-       int                             i;
-       int                             off;
-
-       ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
-       ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
-
-       /*
-        * Initialize the cursor.
-        */
-       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
-
-       error = xfs_check_agi_freecount(cur, agi);
-       if (error)
-               goto error0;
-
-       /*
-        * Look for the entry describing this inode.
-        */
-       if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
-               xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
-                       __func__, error);
-               goto error0;
-       }
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-       error = xfs_inobt_get_rec(cur, &rec, &i);
-       if (error) {
-               xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
-                       __func__, error);
-               goto error0;
-       }
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-       /*
-        * Get the offset in the inode chunk.
-        */
-       off = agino - rec.ir_startino;
-       ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
-       ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
-       /*
-        * Mark the inode free & increment the count.
-        */
-       rec.ir_free |= XFS_INOBT_MASK(off);
-       rec.ir_freecount++;
-
-       /*
-        * When an inode cluster is free, it becomes eligible for removal
-        */
-       if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
-           (rec.ir_freecount == mp->m_ialloc_inos)) {
-
-               *deleted = 1;
-               *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
-
-               /*
-                * Remove the inode cluster from the AGI B+Tree, adjust the
-                * AGI and Superblock inode counts, and mark the disk space
-                * to be freed when the transaction is committed.
-                */
-               ilen = mp->m_ialloc_inos;
-               be32_add_cpu(&agi->agi_count, -ilen);
-               be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
-               xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
-               pag = xfs_perag_get(mp, agno);
-               pag->pagi_freecount -= ilen - 1;
-               xfs_perag_put(pag);
-               xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
-               xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
-
-               if ((error = xfs_btree_delete(cur, &i))) {
-                       xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
-                               __func__, error);
-                       goto error0;
-               }
-
-               xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
-                                 XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
-                                 mp->m_ialloc_blks, flist, mp);
-       } else {
-               *deleted = 0;
-
-               error = xfs_inobt_update(cur, &rec);
-               if (error) {
-                       xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
-                               __func__, error);
-                       goto error0;
-               }
-
-               /*
-                * Change the inode free counts and log the ag/sb changes.
-                */
-               be32_add_cpu(&agi->agi_freecount, 1);
-               xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
-               pag = xfs_perag_get(mp, agno);
-               pag->pagi_freecount++;
-               xfs_perag_put(pag);
-               xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
-       }
-
-       error = xfs_check_agi_freecount(cur, agi);
-       if (error)
-               goto error0;
-
-       *orec = rec;
-       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-       return 0;
-
-error0:
-       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-       return error;
-}
-
-/*
- * Free an inode in the free inode btree.
- */
-STATIC int
-xfs_difree_finobt(
-       struct xfs_mount                *mp,
-       struct xfs_trans                *tp,
-       struct xfs_buf                  *agbp,
-       xfs_agino_t                     agino,
-       struct xfs_inobt_rec_incore     *ibtrec) /* inobt record */
-{
-       struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
-       xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
-       struct xfs_btree_cur            *cur;
-       struct xfs_inobt_rec_incore     rec;
-       int                             offset = agino - ibtrec->ir_startino;
-       int                             error;
-       int                             i;
-
-       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
-
-       error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
-       if (error)
-               goto error;
-       if (i == 0) {
-               /*
-                * If the record does not exist in the finobt, we must have just
-                * freed an inode in a previously fully allocated chunk. If not,
-                * something is out of sync.
-                */
-               XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error);
-
-               error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
-                                            ibtrec->ir_free, &i);
-               if (error)
-                       goto error;
-               ASSERT(i == 1);
-
-               goto out;
-       }
-
-       /*
-        * Read and update the existing record. We could just copy the ibtrec
-        * across here, but that would defeat the purpose of having redundant
-        * metadata. By making the modifications independently, we can catch
-        * corruptions that we wouldn't see if we just copied from one record
-        * to another.
-        */
-       error = xfs_inobt_get_rec(cur, &rec, &i);
-       if (error)
-               goto error;
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error);
-
-       rec.ir_free |= XFS_INOBT_MASK(offset);
-       rec.ir_freecount++;
-
-       XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) &&
-                               (rec.ir_freecount == ibtrec->ir_freecount),
-                               error);
-
-       /*
-        * The content of inobt records should always match between the inobt
-        * and finobt. The lifecycle of records in the finobt is different from
-        * the inobt in that the finobt only tracks records with at least one
-        * free inode. Hence, if all of the inodes are free and we aren't
-        * keeping inode chunks permanently on disk, remove the record.
-        * Otherwise, update the record with the new information.
-        */
-       if (rec.ir_freecount == mp->m_ialloc_inos &&
-           !(mp->m_flags & XFS_MOUNT_IKEEP)) {
-               error = xfs_btree_delete(cur, &i);
-               if (error)
-                       goto error;
-               ASSERT(i == 1);
-       } else {
-               error = xfs_inobt_update(cur, &rec);
-               if (error)
-                       goto error;
-       }
-
-out:
-       error = xfs_check_agi_freecount(cur, agi);
-       if (error)
-               goto error;
-
-       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-       return 0;
-
-error:
-       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-       return error;
-}
-
-/*
- * Free disk inode.  Carefully avoids touching the incore inode, all
- * manipulations incore are the caller's responsibility.
- * The on-disk inode is not changed by this operation, only the
- * btree (free inode mask) is changed.
- */
-int
-xfs_difree(
-       struct xfs_trans        *tp,            /* transaction pointer */
-       xfs_ino_t               inode,          /* inode to be freed */
-       struct xfs_bmap_free    *flist,         /* extents to free */
-       int                     *deleted,       /* set if inode cluster was deleted */
-       xfs_ino_t               *first_ino)     /* first inode in deleted cluster */
-{
-       /* REFERENCED */
-       xfs_agblock_t           agbno;  /* block number containing inode */
-       struct xfs_buf          *agbp;  /* buffer for allocation group header */
-       xfs_agino_t             agino;  /* allocation group inode number */
-       xfs_agnumber_t          agno;   /* allocation group number */
-       int                     error;  /* error return value */
-       struct xfs_mount        *mp;    /* mount structure for filesystem */
-       struct xfs_inobt_rec_incore rec;/* btree record */
-
-       mp = tp->t_mountp;
-
-       /*
-        * Break up inode number into its components.
-        */
-       agno = XFS_INO_TO_AGNO(mp, inode);
-       if (agno >= mp->m_sb.sb_agcount)  {
-               xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
-                       __func__, agno, mp->m_sb.sb_agcount);
-               ASSERT(0);
-               return XFS_ERROR(EINVAL);
-       }
-       agino = XFS_INO_TO_AGINO(mp, inode);
-       if (inode != XFS_AGINO_TO_INO(mp, agno, agino))  {
-               xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
-                       __func__, (unsigned long long)inode,
-                       (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
-               ASSERT(0);
-               return XFS_ERROR(EINVAL);
-       }
-       agbno = XFS_AGINO_TO_AGBNO(mp, agino);
-       if (agbno >= mp->m_sb.sb_agblocks)  {
-               xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
-                       __func__, agbno, mp->m_sb.sb_agblocks);
-               ASSERT(0);
-               return XFS_ERROR(EINVAL);
-       }
-       /*
-        * Get the allocation group header.
-        */
-       error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
-       if (error) {
-               xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
-                       __func__, error);
-               return error;
-       }
-
-       /*
-        * Fix up the inode allocation btree.
-        */
-       error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
-                                &rec);
-       if (error)
-               goto error0;
-
-       /*
-        * Fix up the free inode btree.
-        */
-       if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
-               error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
-               if (error)
-                       goto error0;
-       }
-
-       return 0;
-
-error0:
-       return error;
-}
-
-STATIC int
-xfs_imap_lookup(
-       struct xfs_mount        *mp,
-       struct xfs_trans        *tp,
-       xfs_agnumber_t          agno,
-       xfs_agino_t             agino,
-       xfs_agblock_t           agbno,
-       xfs_agblock_t           *chunk_agbno,
-       xfs_agblock_t           *offset_agbno,
-       int                     flags)
-{
-       struct xfs_inobt_rec_incore rec;
-       struct xfs_btree_cur    *cur;
-       struct xfs_buf          *agbp;
-       int                     error;
-       int                     i;
-
-       error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
-       if (error) {
-               xfs_alert(mp,
-                       "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
-                       __func__, error, agno);
-               return error;
-       }
-
-       /*
-        * Lookup the inode record for the given agino. If the record cannot be
-        * found, then it's an invalid inode number and we should abort. Once
-        * we have a record, we need to ensure it contains the inode number
-        * we are looking up.
-        */
-       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
-       error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
-       if (!error) {
-               if (i)
-                       error = xfs_inobt_get_rec(cur, &rec, &i);
-               if (!error && i == 0)
-                       error = EINVAL;
-       }
-
-       xfs_trans_brelse(tp, agbp);
-       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-       if (error)
-               return error;
-
-       /* check that the returned record contains the required inode */
-       if (rec.ir_startino > agino ||
-           rec.ir_startino + mp->m_ialloc_inos <= agino)
-               return EINVAL;
-
-       /* for untrusted inodes check it is allocated first */
-       if ((flags & XFS_IGET_UNTRUSTED) &&
-           (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
-               return EINVAL;
-
-       *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
-       *offset_agbno = agbno - *chunk_agbno;
-       return 0;
-}
-
-/*
- * Return the location of the inode in imap, for mapping it into a buffer.
- */
-int
-xfs_imap(
-       xfs_mount_t      *mp,   /* file system mount structure */
-       xfs_trans_t      *tp,   /* transaction pointer */
-       xfs_ino_t       ino,    /* inode to locate */
-       struct xfs_imap *imap,  /* location map structure */
-       uint            flags)  /* flags for inode btree lookup */
-{
-       xfs_agblock_t   agbno;  /* block number of inode in the alloc group */
-       xfs_agino_t     agino;  /* inode number within alloc group */
-       xfs_agnumber_t  agno;   /* allocation group number */
-       int             blks_per_cluster; /* num blocks per inode cluster */
-       xfs_agblock_t   chunk_agbno;    /* first block in inode chunk */
-       xfs_agblock_t   cluster_agbno;  /* first block in inode cluster */
-       int             error;  /* error code */
-       int             offset; /* index of inode in its buffer */
-       xfs_agblock_t   offset_agbno;   /* blks from chunk start to inode */
-
-       ASSERT(ino != NULLFSINO);
-
-       /*
-        * Split up the inode number into its parts.
-        */
-       agno = XFS_INO_TO_AGNO(mp, ino);
-       agino = XFS_INO_TO_AGINO(mp, ino);
-       agbno = XFS_AGINO_TO_AGBNO(mp, agino);
-       if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
-           ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
-#ifdef DEBUG
-               /*
-                * Don't output diagnostic information for untrusted inodes
-                * as they can be invalid without implying corruption.
-                */
-               if (flags & XFS_IGET_UNTRUSTED)
-                       return XFS_ERROR(EINVAL);
-               if (agno >= mp->m_sb.sb_agcount) {
-                       xfs_alert(mp,
-                               "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
-                               __func__, agno, mp->m_sb.sb_agcount);
-               }
-               if (agbno >= mp->m_sb.sb_agblocks) {
-                       xfs_alert(mp,
-               "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
-                               __func__, (unsigned long long)agbno,
-                               (unsigned long)mp->m_sb.sb_agblocks);
-               }
-               if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
-                       xfs_alert(mp,
-               "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
-                               __func__, ino,
-                               XFS_AGINO_TO_INO(mp, agno, agino));
-               }
-               xfs_stack_trace();
-#endif /* DEBUG */
-               return XFS_ERROR(EINVAL);
-       }
-
-       blks_per_cluster = xfs_icluster_size_fsb(mp);
-
-       /*
-        * For bulkstat and handle lookups, we have an untrusted inode number
-        * that we have to verify is valid. We cannot do this just by reading
-        * the inode buffer as it may have been unlinked and removed leaving
-        * inodes in stale state on disk. Hence we have to do a btree lookup
-        * in all cases where an untrusted inode number is passed.
-        */
-       if (flags & XFS_IGET_UNTRUSTED) {
-               error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
-                                       &chunk_agbno, &offset_agbno, flags);
-               if (error)
-                       return error;
-               goto out_map;
-       }
-
-       /*
-        * If the inode cluster size is the same as the blocksize or
-        * smaller, we get to the buffer by simple arithmetic.
-        */
-       if (blks_per_cluster == 1) {
-               offset = XFS_INO_TO_OFFSET(mp, ino);
-               ASSERT(offset < mp->m_sb.sb_inopblock);
-
-               imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
-               imap->im_len = XFS_FSB_TO_BB(mp, 1);
-               imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
-               return 0;
-       }
-
-       /*
-        * If the inode chunks are aligned then use simple maths to
-        * find the location. Otherwise we have to do a btree
-        * lookup to find the location.
-        */
-       if (mp->m_inoalign_mask) {
-               offset_agbno = agbno & mp->m_inoalign_mask;
-               chunk_agbno = agbno - offset_agbno;
-       } else {
-               error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
-                                       &chunk_agbno, &offset_agbno, flags);
-               if (error)
-                       return error;
-       }
-
-out_map:
-       ASSERT(agbno >= chunk_agbno);
-       cluster_agbno = chunk_agbno +
-               ((offset_agbno / blks_per_cluster) * blks_per_cluster);
-       offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
-               XFS_INO_TO_OFFSET(mp, ino);
-
-       imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
-       imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
-       imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
-
-       /*
-        * If the inode number maps to a block outside the bounds
-        * of the file system then return an error rather than calling
-        * read_buf and panicking when we get an error from the
-        * driver.
-        */
-       if ((imap->im_blkno + imap->im_len) >
-           XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
-               xfs_alert(mp,
-       "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
-                       __func__, (unsigned long long) imap->im_blkno,
-                       (unsigned long long) imap->im_len,
-                       XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
-               return XFS_ERROR(EINVAL);
-       }
-       return 0;
-}
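The out_map arithmetic above first rounds the inode's AG block down to the start of its cluster, then counts how many inodes precede it within that cluster buffer. A standalone sketch of the same arithmetic, using assumed, purely illustrative geometry (4-block clusters, 32 inodes per block, a chunk starting at AG block 64), not values taken from this patch:

	#include <stdio.h>

	int main(void)
	{
		int blks_per_cluster = 4;	/* assumed cluster size in fs blocks */
		int inopblock = 32;		/* assumed inodes per block */
		int chunk_agbno = 64;		/* assumed first block of the chunk */
		int agbno = 70;			/* block holding the inode */
		int ino_in_block = 13;		/* stand-in for XFS_INO_TO_OFFSET() */

		int offset_agbno = agbno - chunk_agbno;			/* 6 */
		int cluster_agbno = chunk_agbno +
			(offset_agbno / blks_per_cluster) * blks_per_cluster; /* 68 */
		int offset = (agbno - cluster_agbno) * inopblock + ino_in_block;

		printf("cluster at agbno %d, inode index %d in buffer\n",
		       cluster_agbno, offset);		/* prints 68, 77 */
		return 0;
	}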
-
-/*
- * Compute and fill in value of m_in_maxlevels.
- */
-void
-xfs_ialloc_compute_maxlevels(
-       xfs_mount_t     *mp)            /* file system mount structure */
-{
-       int             level;
-       uint            maxblocks;
-       uint            maxleafents;
-       int             minleafrecs;
-       int             minnoderecs;
-
-       maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
-               XFS_INODES_PER_CHUNK_LOG;
-       minleafrecs = mp->m_inobt_mnr[0];
-       minnoderecs = mp->m_inobt_mnr[1];
-       maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
-       for (level = 1; maxblocks > 1; level++)
-               maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
-       mp->m_in_maxlevels = level;
-}
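The loop divides the worst-case number of leaf entries by the minimum node fan-out until a single root block remains; the number of divisions, plus the leaf level, gives the maximum tree height. A standalone sketch with assumed record counts (the minimum-records values here are illustrative, not the real mount geometry):

	#include <stdio.h>

	int main(void)
	{
		unsigned int maxleafents = 1u << 20;	/* assumed inode-count bound */
		int minleafrecs = 16, minnoderecs = 8;	/* assumed minimums per block */
		unsigned int maxblocks =
			(maxleafents + minleafrecs - 1) / minleafrecs; /* 65536 */
		int level;

		/* 65536 -> 8192 -> 1024 -> 128 -> 16 -> 2 -> 1 */
		for (level = 1; maxblocks > 1; level++)
			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;

		printf("m_in_maxlevels = %d\n", level);	/* prints 7 */
		return 0;
	}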
-
-/*
- * Log specified fields for the ag hdr (inode section). The growth of the agi
- * structure over time requires that we interpret the buffer as two logical
- * regions delineated by the end of the unlinked list. This is due to the size
- * of the hash table and its location in the middle of the agi.
- *
- * For example, a request to log a field before agi_unlinked and a field after
- * agi_unlinked could cause us to log the entire hash table and use an excessive
- * amount of log space. To avoid this behavior, log the region up through
- * agi_unlinked in one call and the region after agi_unlinked through the end of
- * the structure in another.
- */
-void
-xfs_ialloc_log_agi(
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_buf_t       *bp,            /* allocation group header buffer */
-       int             fields)         /* bitmask of fields to log */
-{
-       int                     first;          /* first byte number */
-       int                     last;           /* last byte number */
-       static const short      offsets[] = {   /* field starting offsets */
-                                       /* keep in sync with bit definitions */
-               offsetof(xfs_agi_t, agi_magicnum),
-               offsetof(xfs_agi_t, agi_versionnum),
-               offsetof(xfs_agi_t, agi_seqno),
-               offsetof(xfs_agi_t, agi_length),
-               offsetof(xfs_agi_t, agi_count),
-               offsetof(xfs_agi_t, agi_root),
-               offsetof(xfs_agi_t, agi_level),
-               offsetof(xfs_agi_t, agi_freecount),
-               offsetof(xfs_agi_t, agi_newino),
-               offsetof(xfs_agi_t, agi_dirino),
-               offsetof(xfs_agi_t, agi_unlinked),
-               offsetof(xfs_agi_t, agi_free_root),
-               offsetof(xfs_agi_t, agi_free_level),
-               sizeof(xfs_agi_t)
-       };
-#ifdef DEBUG
-       xfs_agi_t               *agi;   /* allocation group header */
-
-       agi = XFS_BUF_TO_AGI(bp);
-       ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
-#endif
-
-       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
-
-       /*
-        * Compute byte offsets for the first and last fields in the first
-        * region and log the agi buffer. This only logs up through
-        * agi_unlinked.
-        */
-       if (fields & XFS_AGI_ALL_BITS_R1) {
-               xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
-                                 &first, &last);
-               xfs_trans_log_buf(tp, bp, first, last);
-       }
-
-       /*
-        * Mask off the bits in the first region and calculate the first and
-        * last field offsets for any bits in the second region.
-        */
-       fields &= ~XFS_AGI_ALL_BITS_R1;
-       if (fields) {
-               xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
-                                 &first, &last);
-               xfs_trans_log_buf(tp, bp, first, last);
-       }
-}
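As an illustrative call (not taken from this patch; tp and agbp are assumed to be in scope), logging one field from each side of agi_unlinked, say agi_count and agi_free_root, exercises both branches above. Without the two-region split, the single byte range would span the entire unlinked hash table:

	/* Illustrative only: one field from each logical region. */
	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREE_ROOT);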
-
-#ifdef DEBUG
-STATIC void
-xfs_check_agi_unlinked(
-       struct xfs_agi          *agi)
-{
-       int                     i;
-
-       for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
-               ASSERT(agi->agi_unlinked[i]);
-}
-#else
-#define xfs_check_agi_unlinked(agi)
-#endif
-
-static bool
-xfs_agi_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       struct xfs_agi  *agi = XFS_BUF_TO_AGI(bp);
-
-       if (xfs_sb_version_hascrc(&mp->m_sb) &&
-           !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
-                       return false;
-       /*
-        * Validate the magic number of the agi block.
-        */
-       if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
-               return false;
-       if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
-               return false;
-
-       /*
-        * during growfs operations, the perag is not fully initialised,
-        * so we can't use it for any useful checking. growfs ensures we can't
-        * use it by using uncached buffers that don't have the perag attached
-        * so we can detect and avoid this problem.
-        */
-       if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
-               return false;
-
-       xfs_check_agi_unlinked(agi);
-       return true;
-}
-
-static void
-xfs_agi_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb) &&
-           !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
-               xfs_buf_ioerror(bp, EFSBADCRC);
-       else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
-                               XFS_ERRTAG_IALLOC_READ_AGI,
-                               XFS_RANDOM_IALLOC_READ_AGI))
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-       if (bp->b_error)
-               xfs_verifier_error(bp);
-}
-
-static void
-xfs_agi_write_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-
-       if (!xfs_agi_verify(bp)) {
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               xfs_verifier_error(bp);
-               return;
-       }
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (bip)
-               XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-       xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
-}
-
-const struct xfs_buf_ops xfs_agi_buf_ops = {
-       .verify_read = xfs_agi_read_verify,
-       .verify_write = xfs_agi_write_verify,
-};
-
-/*
- * Read in the allocation group header (inode allocation section)
- */
-int
-xfs_read_agi(
-       struct xfs_mount        *mp,    /* file system mount structure */
-       struct xfs_trans        *tp,    /* transaction pointer */
-       xfs_agnumber_t          agno,   /* allocation group number */
-       struct xfs_buf          **bpp)  /* allocation group hdr buf */
-{
-       int                     error;
-
-       trace_xfs_read_agi(mp, agno);
-
-       ASSERT(agno != NULLAGNUMBER);
-       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-                       XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-                       XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
-       if (error)
-               return error;
-
-       xfs_buf_set_ref(*bpp, XFS_AGI_REF);
-       return 0;
-}
-
-int
-xfs_ialloc_read_agi(
-       struct xfs_mount        *mp,    /* file system mount structure */
-       struct xfs_trans        *tp,    /* transaction pointer */
-       xfs_agnumber_t          agno,   /* allocation group number */
-       struct xfs_buf          **bpp)  /* allocation group hdr buf */
-{
-       struct xfs_agi          *agi;   /* allocation group header */
-       struct xfs_perag        *pag;   /* per allocation group data */
-       int                     error;
-
-       trace_xfs_ialloc_read_agi(mp, agno);
-
-       error = xfs_read_agi(mp, tp, agno, bpp);
-       if (error)
-               return error;
-
-       agi = XFS_BUF_TO_AGI(*bpp);
-       pag = xfs_perag_get(mp, agno);
-       if (!pag->pagi_init) {
-               pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
-               pag->pagi_count = be32_to_cpu(agi->agi_count);
-               pag->pagi_init = 1;
-       }
-
-       /*
-        * It's possible for these to be out of sync if
-        * we are in the middle of a forced shutdown.
-        */
-       ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
-               XFS_FORCED_SHUTDOWN(mp));
-       xfs_perag_put(pag);
-       return 0;
-}
-
-/*
- * Read in the agi to initialise the per-ag data in the mount structure
- */
-int
-xfs_ialloc_pagi_init(
-       xfs_mount_t     *mp,            /* file system mount structure */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_agnumber_t  agno)           /* allocation group number */
-{
-       xfs_buf_t       *bp = NULL;
-       int             error;
-
-       error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
-       if (error)
-               return error;
-       if (bp)
-               xfs_trans_brelse(tp, bp);
-       return 0;
-}
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
deleted file mode 100644 (file)
index 95ad1c0..0000000
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_IALLOC_H__
-#define        __XFS_IALLOC_H__
-
-struct xfs_buf;
-struct xfs_dinode;
-struct xfs_imap;
-struct xfs_mount;
-struct xfs_trans;
-struct xfs_btree_cur;
-
-/* Move inodes in clusters of this size */
-#define        XFS_INODE_BIG_CLUSTER_SIZE      8192
-
-/* Calculate and return the number of filesystem blocks per inode cluster */
-static inline int
-xfs_icluster_size_fsb(
-       struct xfs_mount        *mp)
-{
-       if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size)
-               return 1;
-       return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
-}
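A worked example of this helper, with assumed geometry rather than values from the patch:

	/*
	 * sb_blocksize = 4096 (sb_blocklog = 12), inode cluster = 8192
	 *	=> 8192 >> 12 = 2 fs blocks per inode cluster
	 * sb_blocksize = 8192, inode cluster = 8192
	 *	=> blocksize >= cluster size, so 1 block per cluster
	 */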
-
-/*
- * Make an inode pointer out of the buffer/offset.
- */
-static inline struct xfs_dinode *
-xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
-{
-       return (struct xfs_dinode *)
-               (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog));
-}
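Likewise for the pointer helper, under an assumed 512-byte inode size:

	/*
	 * With sb_inodelog = 9 (512-byte inodes, an assumption), inode
	 * index o = 3 becomes byte offset 3 << 9 = 1536 into the cluster
	 * buffer, which xfs_buf_offset() converts to a pointer.
	 */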
-
-/*
- * Allocate an inode on disk.
- * Mode is used to tell whether the new inode will need space, and whether
- * it is a directory.
- *
- * To work within the constraint of one allocation per transaction,
- * xfs_dialloc() is designed to be called twice if it has to do an
- * allocation to make more free inodes.  If an inode is available
- * without a further allocation, *agbp is set to NULL.  If an
- * allocation had to be done, *agbp is set to the buffer of the
- * allocation group header and no inode number is returned.
- * The caller should then commit the current transaction, allocate a
- * new transaction, and call xfs_dialloc() again, passing in the agbp
- * value returned from the previous call.
- *
- * Once we successfully pick an inode its number is returned and the
- * on-disk data structures are updated.  The inode itself is not read
- * in, since doing so would break ordering constraints with xfs_reclaim.
- *
- * *agbp should be set to NULL on the first call.
- */
-int                                    /* error */
-xfs_dialloc(
-       struct xfs_trans *tp,           /* transaction pointer */
-       xfs_ino_t       parent,         /* parent inode (directory) */
-       umode_t         mode,           /* mode bits for new inode */
-       int             okalloc,        /* ok to allocate more space */
-       struct xfs_buf  **agbp,         /* buf for a.g. inode header */
-       xfs_ino_t       *inop);         /* inode number allocated */
-
-/*
- * Free disk inode.  Carefully avoids touching the incore inode, all
- * manipulations incore are the caller's responsibility.
- * The on-disk inode is not changed by this operation, only the
- * btree (free inode mask) is changed.
- */
-int                                    /* error */
-xfs_difree(
-       struct xfs_trans *tp,           /* transaction pointer */
-       xfs_ino_t       inode,          /* inode to be freed */
-       struct xfs_bmap_free *flist,    /* extents to free */
-       int             *deleted,       /* set if inode cluster was deleted */
-       xfs_ino_t       *first_ino);    /* first inode in deleted cluster */
-
-/*
- * Return the location of the inode in imap, for mapping it into a buffer.
- */
-int
-xfs_imap(
-       struct xfs_mount *mp,           /* file system mount structure */
-       struct xfs_trans *tp,           /* transaction pointer */
-       xfs_ino_t       ino,            /* inode to locate */
-       struct xfs_imap *imap,          /* location map structure */
-       uint            flags);         /* flags for inode btree lookup */
-
-/*
- * Compute and fill in value of m_in_maxlevels.
- */
-void
-xfs_ialloc_compute_maxlevels(
-       struct xfs_mount *mp);          /* file system mount structure */
-
-/*
- * Log specified fields for the ag hdr (inode section)
- */
-void
-xfs_ialloc_log_agi(
-       struct xfs_trans *tp,           /* transaction pointer */
-       struct xfs_buf  *bp,            /* allocation group header buffer */
-       int             fields);        /* bitmask of fields to log */
-
-/*
- * Read in the allocation group header (inode allocation section)
- */
-int                                    /* error */
-xfs_ialloc_read_agi(
-       struct xfs_mount *mp,           /* file system mount structure */
-       struct xfs_trans *tp,           /* transaction pointer */
-       xfs_agnumber_t  agno,           /* allocation group number */
-       struct xfs_buf  **bpp);         /* allocation group hdr buf */
-
-/*
- * Read in the allocation group header to initialise the per-ag data
- * in the mount structure
- */
-int
-xfs_ialloc_pagi_init(
-       struct xfs_mount *mp,           /* file system mount structure */
-       struct xfs_trans *tp,           /* transaction pointer */
-       xfs_agnumber_t  agno);          /* allocation group number */
-
-/*
- * Lookup a record by ino in the btree given by cur.
- */
-int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
-               xfs_lookup_t dir, int *stat);
-
-/*
- * Get the data from the pointed-to record.
- */
-int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
-               xfs_inobt_rec_incore_t *rec, int *stat);
-
-/*
- * Inode chunk initialisation routine
- */
-int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
-                         struct list_head *buffer_list,
-                         xfs_agnumber_t agno, xfs_agblock_t agbno,
-                         xfs_agblock_t length, unsigned int gen);
-
-#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
deleted file mode 100644 (file)
index 726f83a..0000000
+++ /dev/null
@@ -1,422 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_trans.h"
-
-
-STATIC int
-xfs_inobt_get_minrecs(
-       struct xfs_btree_cur    *cur,
-       int                     level)
-{
-       return cur->bc_mp->m_inobt_mnr[level != 0];
-}
-
-STATIC struct xfs_btree_cur *
-xfs_inobt_dup_cursor(
-       struct xfs_btree_cur    *cur)
-{
-       return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
-                       cur->bc_private.a.agbp, cur->bc_private.a.agno,
-                       cur->bc_btnum);
-}
-
-STATIC void
-xfs_inobt_set_root(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *nptr,
-       int                     inc)    /* level change */
-{
-       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
-       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
-
-       agi->agi_root = nptr->s;
-       be32_add_cpu(&agi->agi_level, inc);
-       xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
-}
-
-STATIC void
-xfs_finobt_set_root(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *nptr,
-       int                     inc)    /* level change */
-{
-       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
-       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
-
-       agi->agi_free_root = nptr->s;
-       be32_add_cpu(&agi->agi_free_level, inc);
-       xfs_ialloc_log_agi(cur->bc_tp, agbp,
-                          XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
-}
-
-STATIC int
-xfs_inobt_alloc_block(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *start,
-       union xfs_btree_ptr     *new,
-       int                     *stat)
-{
-       xfs_alloc_arg_t         args;           /* block allocation args */
-       int                     error;          /* error return value */
-       xfs_agblock_t           sbno = be32_to_cpu(start->s);
-
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-
-       memset(&args, 0, sizeof(args));
-       args.tp = cur->bc_tp;
-       args.mp = cur->bc_mp;
-       args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
-       args.minlen = 1;
-       args.maxlen = 1;
-       args.prod = 1;
-       args.type = XFS_ALLOCTYPE_NEAR_BNO;
-
-       error = xfs_alloc_vextent(&args);
-       if (error) {
-               XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-               return error;
-       }
-       if (args.fsbno == NULLFSBLOCK) {
-               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-               *stat = 0;
-               return 0;
-       }
-       ASSERT(args.len == 1);
-       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-
-       new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
-       *stat = 1;
-       return 0;
-}
-
-STATIC int
-xfs_inobt_free_block(
-       struct xfs_btree_cur    *cur,
-       struct xfs_buf          *bp)
-{
-       xfs_fsblock_t           fsbno;
-       int                     error;
-
-       fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
-       error = xfs_free_extent(cur->bc_tp, fsbno, 1);
-       if (error)
-               return error;
-
-       xfs_trans_binval(cur->bc_tp, bp);
-       return error;
-}
-
-STATIC int
-xfs_inobt_get_maxrecs(
-       struct xfs_btree_cur    *cur,
-       int                     level)
-{
-       return cur->bc_mp->m_inobt_mxr[level != 0];
-}
-
-STATIC void
-xfs_inobt_init_key_from_rec(
-       union xfs_btree_key     *key,
-       union xfs_btree_rec     *rec)
-{
-       key->inobt.ir_startino = rec->inobt.ir_startino;
-}
-
-STATIC void
-xfs_inobt_init_rec_from_key(
-       union xfs_btree_key     *key,
-       union xfs_btree_rec     *rec)
-{
-       rec->inobt.ir_startino = key->inobt.ir_startino;
-}
-
-STATIC void
-xfs_inobt_init_rec_from_cur(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_rec     *rec)
-{
-       rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
-       rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
-       rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
-}
-
-/*
- * initial value of ptr for lookup
- */
-STATIC void
-xfs_inobt_init_ptr_from_cur(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *ptr)
-{
-       struct xfs_agi          *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
-
-       ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
-
-       ptr->s = agi->agi_root;
-}
-
-STATIC void
-xfs_finobt_init_ptr_from_cur(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *ptr)
-{
-       struct xfs_agi          *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
-
-       ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
-       ptr->s = agi->agi_free_root;
-}
-
-STATIC __int64_t
-xfs_inobt_key_diff(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_key     *key)
-{
-       return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
-                         cur->bc_rec.i.ir_startino;
-}
-
-static int
-xfs_inobt_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
-       struct xfs_perag        *pag = bp->b_pag;
-       unsigned int            level;
-
-       /*
-        * During growfs operations, we can't verify the exact owner as the
-        * perag is not fully initialised and hence not attached to the buffer.
-        *
-        * Similarly, during log recovery we will have a perag structure
-        * attached, but the agi information will not yet have been initialised
-        * from the on disk AGI. We don't currently use any of this information,
-        * but beware of the landmine (i.e. need to check pag->pagi_init) if we
-        * ever do.
-        */
-       switch (block->bb_magic) {
-       case cpu_to_be32(XFS_IBT_CRC_MAGIC):
-       case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
-               if (!xfs_sb_version_hascrc(&mp->m_sb))
-                       return false;
-               if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
-                       return false;
-               if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
-                       return false;
-               if (pag &&
-                   be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
-                       return false;
-               /* fall through */
-       case cpu_to_be32(XFS_IBT_MAGIC):
-       case cpu_to_be32(XFS_FIBT_MAGIC):
-               break;
-       default:
-               return false;
-       }
-
-       /* numrecs and level verification */
-       level = be16_to_cpu(block->bb_level);
-       if (level >= mp->m_in_maxlevels)
-               return false;
-       if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0])
-               return false;
-
-       /* sibling pointer verification */
-       if (!block->bb_u.s.bb_leftsib ||
-           (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
-            block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
-               return false;
-       if (!block->bb_u.s.bb_rightsib ||
-           (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
-            block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
-               return false;
-
-       return true;
-}
-
-static void
-xfs_inobt_read_verify(
-       struct xfs_buf  *bp)
-{
-       if (!xfs_btree_sblock_verify_crc(bp))
-               xfs_buf_ioerror(bp, EFSBADCRC);
-       else if (!xfs_inobt_verify(bp))
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-       if (bp->b_error) {
-               trace_xfs_btree_corrupt(bp, _RET_IP_);
-               xfs_verifier_error(bp);
-       }
-}
-
-static void
-xfs_inobt_write_verify(
-       struct xfs_buf  *bp)
-{
-       if (!xfs_inobt_verify(bp)) {
-               trace_xfs_btree_corrupt(bp, _RET_IP_);
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               xfs_verifier_error(bp);
-               return;
-       }
-       xfs_btree_sblock_calc_crc(bp);
-}
-
-const struct xfs_buf_ops xfs_inobt_buf_ops = {
-       .verify_read = xfs_inobt_read_verify,
-       .verify_write = xfs_inobt_write_verify,
-};
-
-#if defined(DEBUG) || defined(XFS_WARN)
-STATIC int
-xfs_inobt_keys_inorder(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_key     *k1,
-       union xfs_btree_key     *k2)
-{
-       return be32_to_cpu(k1->inobt.ir_startino) <
-               be32_to_cpu(k2->inobt.ir_startino);
-}
-
-STATIC int
-xfs_inobt_recs_inorder(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_rec     *r1,
-       union xfs_btree_rec     *r2)
-{
-       return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
-               be32_to_cpu(r2->inobt.ir_startino);
-}
-#endif /* DEBUG || XFS_WARN */
-
-static const struct xfs_btree_ops xfs_inobt_ops = {
-       .rec_len                = sizeof(xfs_inobt_rec_t),
-       .key_len                = sizeof(xfs_inobt_key_t),
-
-       .dup_cursor             = xfs_inobt_dup_cursor,
-       .set_root               = xfs_inobt_set_root,
-       .alloc_block            = xfs_inobt_alloc_block,
-       .free_block             = xfs_inobt_free_block,
-       .get_minrecs            = xfs_inobt_get_minrecs,
-       .get_maxrecs            = xfs_inobt_get_maxrecs,
-       .init_key_from_rec      = xfs_inobt_init_key_from_rec,
-       .init_rec_from_key      = xfs_inobt_init_rec_from_key,
-       .init_rec_from_cur      = xfs_inobt_init_rec_from_cur,
-       .init_ptr_from_cur      = xfs_inobt_init_ptr_from_cur,
-       .key_diff               = xfs_inobt_key_diff,
-       .buf_ops                = &xfs_inobt_buf_ops,
-#if defined(DEBUG) || defined(XFS_WARN)
-       .keys_inorder           = xfs_inobt_keys_inorder,
-       .recs_inorder           = xfs_inobt_recs_inorder,
-#endif
-};
-
-static const struct xfs_btree_ops xfs_finobt_ops = {
-       .rec_len                = sizeof(xfs_inobt_rec_t),
-       .key_len                = sizeof(xfs_inobt_key_t),
-
-       .dup_cursor             = xfs_inobt_dup_cursor,
-       .set_root               = xfs_finobt_set_root,
-       .alloc_block            = xfs_inobt_alloc_block,
-       .free_block             = xfs_inobt_free_block,
-       .get_minrecs            = xfs_inobt_get_minrecs,
-       .get_maxrecs            = xfs_inobt_get_maxrecs,
-       .init_key_from_rec      = xfs_inobt_init_key_from_rec,
-       .init_rec_from_key      = xfs_inobt_init_rec_from_key,
-       .init_rec_from_cur      = xfs_inobt_init_rec_from_cur,
-       .init_ptr_from_cur      = xfs_finobt_init_ptr_from_cur,
-       .key_diff               = xfs_inobt_key_diff,
-       .buf_ops                = &xfs_inobt_buf_ops,
-#if defined(DEBUG) || defined(XFS_WARN)
-       .keys_inorder           = xfs_inobt_keys_inorder,
-       .recs_inorder           = xfs_inobt_recs_inorder,
-#endif
-};
-
-/*
- * Allocate a new inode btree cursor.
- */
-struct xfs_btree_cur *                         /* new inode btree cursor */
-xfs_inobt_init_cursor(
-       struct xfs_mount        *mp,            /* file system mount point */
-       struct xfs_trans        *tp,            /* transaction pointer */
-       struct xfs_buf          *agbp,          /* buffer for agi structure */
-       xfs_agnumber_t          agno,           /* allocation group number */
-       xfs_btnum_t             btnum)          /* ialloc or free ino btree */
-{
-       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
-       struct xfs_btree_cur    *cur;
-
-       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
-
-       cur->bc_tp = tp;
-       cur->bc_mp = mp;
-       cur->bc_btnum = btnum;
-       if (btnum == XFS_BTNUM_INO) {
-               cur->bc_nlevels = be32_to_cpu(agi->agi_level);
-               cur->bc_ops = &xfs_inobt_ops;
-       } else {
-               cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
-               cur->bc_ops = &xfs_finobt_ops;
-       }
-
-       cur->bc_blocklog = mp->m_sb.sb_blocklog;
-
-       if (xfs_sb_version_hascrc(&mp->m_sb))
-               cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
-       cur->bc_private.a.agbp = agbp;
-       cur->bc_private.a.agno = agno;
-
-       return cur;
-}
-
-/*
- * Calculate number of records in an inobt btree block.
- */
-int
-xfs_inobt_maxrecs(
-       struct xfs_mount        *mp,
-       int                     blocklen,
-       int                     leaf)
-{
-       blocklen -= XFS_INOBT_BLOCK_LEN(mp);
-
-       if (leaf)
-               return blocklen / sizeof(xfs_inobt_rec_t);
-       return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
-}
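With an assumed v4 geometry, a worked example of the record-count arithmetic (header and record sizes here are stated assumptions, not taken from this patch):

	#include <stdio.h>

	int main(void)
	{
		int blocklen = 4096 - 16;	/* assumed block minus short header */
		int rec_size = 16;		/* startino + freecount + free mask */
		int key_size = 4, ptr_size = 4;

		printf("leaf maxrecs = %d\n", blocklen / rec_size);	/* 255 */
		printf("node maxrecs = %d\n",
		       blocklen / (key_size + ptr_size));		/* 510 */
		return 0;
	}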
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
deleted file mode 100644 (file)
index d7ebea7..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_IALLOC_BTREE_H__
-#define        __XFS_IALLOC_BTREE_H__
-
-/*
- * Inode map on-disk structures
- */
-
-struct xfs_buf;
-struct xfs_btree_cur;
-struct xfs_mount;
-
-/*
- * Btree block header size depends on a superblock flag.
- */
-#define XFS_INOBT_BLOCK_LEN(mp) \
-       (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
-               XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
-
-/*
- * Record, key, and pointer address macros for btree blocks.
- *
- * (note that some of these may appear unused, but they are used in userspace)
- */
-#define XFS_INOBT_REC_ADDR(mp, block, index) \
-       ((xfs_inobt_rec_t *) \
-               ((char *)(block) + \
-                XFS_INOBT_BLOCK_LEN(mp) + \
-                (((index) - 1) * sizeof(xfs_inobt_rec_t))))
-
-#define XFS_INOBT_KEY_ADDR(mp, block, index) \
-       ((xfs_inobt_key_t *) \
-               ((char *)(block) + \
-                XFS_INOBT_BLOCK_LEN(mp) + \
-                ((index) - 1) * sizeof(xfs_inobt_key_t)))
-
-#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
-       ((xfs_inobt_ptr_t *) \
-               ((char *)(block) + \
-                XFS_INOBT_BLOCK_LEN(mp) + \
-                (maxrecs) * sizeof(xfs_inobt_key_t) + \
-                ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
-
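Continuing the same assumed geometry, the address macros reduce to fixed byte offsets inside the block:

	/*
	 * Assumed: 16-byte short header, 16-byte records, 4-byte keys and
	 * pointers, node maxrecs = 510.
	 *	rec 2: 16 + (2 - 1) * 16 = byte 32
	 *	key 3: 16 + (3 - 1) * 4  = byte 24
	 *	ptr 1: 16 + 510 * 4      = byte 2056
	 */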
-extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
-               struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t,
-               xfs_btnum_t);
-extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
-
-#endif /* __XFS_IALLOC_BTREE_H__ */
index c48df5f25b9f460e2fd312c9557b69d1db4209a9..981b2cf519853f72c91dff609c90b78eb1626c4e 100644 (file)
@@ -33,6 +33,9 @@
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_bmap_util.h"
+#include "xfs_quota.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -158,7 +161,7 @@ xfs_iget_cache_hit(
        if (ip->i_ino != ino) {
                trace_xfs_iget_skip(ip);
                XFS_STATS_INC(xs_ig_frecycle);
-               error = EAGAIN;
+               error = -EAGAIN;
                goto out_error;
        }
 
@@ -176,7 +179,7 @@ xfs_iget_cache_hit(
        if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
                trace_xfs_iget_skip(ip);
                XFS_STATS_INC(xs_ig_frecycle);
-               error = EAGAIN;
+               error = -EAGAIN;
                goto out_error;
        }
 
@@ -184,7 +187,7 @@ xfs_iget_cache_hit(
         * If lookup is racing with unlink return an error immediately.
         */
        if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-               error = ENOENT;
+               error = -ENOENT;
                goto out_error;
        }
 
@@ -206,7 +209,7 @@ xfs_iget_cache_hit(
                spin_unlock(&ip->i_flags_lock);
                rcu_read_unlock();
 
-               error = -inode_init_always(mp->m_super, inode);
+               error = inode_init_always(mp->m_super, inode);
                if (error) {
                        /*
                         * Re-initializing the inode failed, and we are in deep
@@ -243,7 +246,7 @@ xfs_iget_cache_hit(
                /* If the VFS inode is being torn down, pause and try again. */
                if (!igrab(inode)) {
                        trace_xfs_iget_skip(ip);
-                       error = EAGAIN;
+                       error = -EAGAIN;
                        goto out_error;
                }
 
@@ -285,7 +288,7 @@ xfs_iget_cache_miss(
 
        ip = xfs_inode_alloc(mp, ino);
        if (!ip)
-               return ENOMEM;
+               return -ENOMEM;
 
        error = xfs_iread(mp, tp, ip, flags);
        if (error)
@@ -294,7 +297,7 @@ xfs_iget_cache_miss(
        trace_xfs_iget_miss(ip);
 
        if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
-               error = ENOENT;
+               error = -ENOENT;
                goto out_destroy;
        }
 
@@ -305,7 +308,7 @@ xfs_iget_cache_miss(
         * recurse into the file system.
         */
        if (radix_tree_preload(GFP_NOFS)) {
-               error = EAGAIN;
+               error = -EAGAIN;
                goto out_destroy;
        }
 
@@ -341,7 +344,7 @@ xfs_iget_cache_miss(
        if (unlikely(error)) {
                WARN_ON(error != -EEXIST);
                XFS_STATS_INC(xs_ig_dup);
-               error = EAGAIN;
+               error = -EAGAIN;
                goto out_preload_end;
        }
        spin_unlock(&pag->pag_ici_lock);
@@ -408,7 +411,7 @@ xfs_iget(
 
        /* reject inode numbers outside existing AGs */
        if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
-               return EINVAL;
+               return -EINVAL;
 
        /* get the perag structure and ensure that it's inode capable */
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
@@ -445,7 +448,7 @@ again:
        return 0;
 
 out_error_or_again:
-       if (error == EAGAIN) {
+       if (error == -EAGAIN) {
                delay(1);
                goto again;
        }
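The sign flips in these hunks are part of XFS's conversion to the kernel's native error convention: helpers return 0 or a negative errno, and callers compare against the negative value. A minimal sketch of that convention (names illustrative, not from the patch):

	#include <errno.h>

	static int do_lookup(const void *obj)
	{
		if (!obj)
			return -EINVAL;		/* failure: negative errno */
		return 0;			/* success */
	}

	static int caller(const void *obj)
	{
		int error = do_lookup(obj);

		if (error == -EAGAIN)		/* comparisons flip sign too */
			return 0;
		return error;			/* propagate unmodified */
	}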
@@ -489,18 +492,18 @@ xfs_inode_ag_walk_grab(
 
        /* nothing to sync during shutdown */
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-               return EFSCORRUPTED;
+               return -EFSCORRUPTED;
 
        /* If we can't grab the inode, it must be on its way to reclaim. */
        if (!igrab(inode))
-               return ENOENT;
+               return -ENOENT;
 
        /* inode is valid */
        return 0;
 
 out_unlock_noent:
        spin_unlock(&ip->i_flags_lock);
-       return ENOENT;
+       return -ENOENT;
 }
 
 STATIC int
@@ -583,16 +586,16 @@ restart:
                                continue;
                        error = execute(batch[i], flags, args);
                        IRELE(batch[i]);
-                       if (error == EAGAIN) {
+                       if (error == -EAGAIN) {
                                skipped++;
                                continue;
                        }
-                       if (error && last_error != EFSCORRUPTED)
+                       if (error && last_error != -EFSCORRUPTED)
                                last_error = error;
                }
 
                /* bail out if the filesystem is corrupted.  */
-               if (error == EFSCORRUPTED)
+               if (error == -EFSCORRUPTED)
                        break;
 
                cond_resched();
@@ -652,11 +655,11 @@ xfs_inode_ag_iterator(
                xfs_perag_put(pag);
                if (error) {
                        last_error = error;
-                       if (error == EFSCORRUPTED)
+                       if (error == -EFSCORRUPTED)
                                break;
                }
        }
-       return XFS_ERROR(last_error);
+       return last_error;
 }
 
 int
@@ -680,11 +683,11 @@ xfs_inode_ag_iterator_tag(
                xfs_perag_put(pag);
                if (error) {
                        last_error = error;
-                       if (error == EFSCORRUPTED)
+                       if (error == -EFSCORRUPTED)
                                break;
                }
        }
-       return XFS_ERROR(last_error);
+       return last_error;
 }
 
 /*
@@ -944,7 +947,7 @@ restart:
         * see the stale flag set on the inode.
         */
        error = xfs_iflush(ip, &bp);
-       if (error == EAGAIN) {
+       if (error == -EAGAIN) {
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                /* backoff longer than in xfs_ifree_cluster */
                delay(2);
@@ -997,7 +1000,7 @@ out:
        xfs_iflags_clear(ip, XFS_IRECLAIM);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        /*
-        * We could return EAGAIN here to make reclaim rescan the inode tree in
+        * We could return -EAGAIN here to make reclaim rescan the inode tree in
         * a short while. However, this just burns CPU time scanning the tree
         * waiting for IO to complete and the reclaim work never goes back to
         * the idle state. Instead, return 0 to let the next scheduled
@@ -1100,7 +1103,7 @@ restart:
                                if (!batch[i])
                                        continue;
                                error = xfs_reclaim_inode(batch[i], pag, flags);
-                               if (error && last_error != EFSCORRUPTED)
+                               if (error && last_error != -EFSCORRUPTED)
                                        last_error = error;
                        }
 
@@ -1129,7 +1132,7 @@ restart:
                trylock = 0;
                goto restart;
        }
-       return XFS_ERROR(last_error);
+       return last_error;
 }
 
 int
@@ -1203,6 +1206,30 @@ xfs_inode_match_id(
        return 1;
 }
 
+/*
+ * A union-based inode filtering algorithm. Process the inode if any of the
+ * criteria match. This is for global/internal scans only.
+ */
+STATIC int
+xfs_inode_match_id_union(
+       struct xfs_inode        *ip,
+       struct xfs_eofblocks    *eofb)
+{
+       if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
+           uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
+               return 1;
+
+       if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
+           gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
+               return 1;
+
+       if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
+           xfs_get_projid(ip) == eofb->eof_prid)
+               return 1;
+
+       return 0;
+}
+
 STATIC int
 xfs_inode_free_eofblocks(
        struct xfs_inode        *ip,
@@ -1211,6 +1238,10 @@ xfs_inode_free_eofblocks(
 {
        int ret;
        struct xfs_eofblocks *eofb = args;
+       bool need_iolock = true;
+       int match;
+
+       ASSERT(!eofb || eofb->eof_scan_owner != 0);
 
        if (!xfs_can_free_eofblocks(ip, false)) {
                /* inode could be preallocated or append-only */
@@ -1228,19 +1259,31 @@ xfs_inode_free_eofblocks(
                return 0;
 
        if (eofb) {
-               if (!xfs_inode_match_id(ip, eofb))
+               if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
+                       match = xfs_inode_match_id_union(ip, eofb);
+               else
+                       match = xfs_inode_match_id(ip, eofb);
+               if (!match)
                        return 0;
 
                /* skip the inode if the file size is too small */
                if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
                    XFS_ISIZE(ip) < eofb->eof_min_file_size)
                        return 0;
+
+               /*
+                * A scan owner implies we already hold the iolock. Skip it in
+                * xfs_free_eofblocks() to avoid deadlock. This also eliminates
+                * the possibility of EAGAIN being returned.
+                */
+               if (eofb->eof_scan_owner == ip->i_ino)
+                       need_iolock = false;
        }
 
-       ret = xfs_free_eofblocks(ip->i_mount, ip, true);
+       ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock);
 
        /* don't revisit the inode if we're not waiting */
-       if (ret == EAGAIN && !(flags & SYNC_WAIT))
+       if (ret == -EAGAIN && !(flags & SYNC_WAIT))
                ret = 0;
 
        return ret;
@@ -1260,6 +1303,55 @@ xfs_icache_free_eofblocks(
                                         eofb, XFS_ICI_EOFBLOCKS_TAG);
 }
 
+/*
+ * Run eofblocks scans on the quotas applicable to the inode. For inodes with
+ * multiple quotas, we don't know exactly which quota caused an allocation
+ * failure. We make a best effort by including each quota under low free space
+ * conditions (less than 1% free space) in the scan.
+ */
+int
+xfs_inode_free_quota_eofblocks(
+       struct xfs_inode *ip)
+{
+       int scan = 0;
+       struct xfs_eofblocks eofb = {0};
+       struct xfs_dquot *dq;
+
+       ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+
+       /*
+        * Set the scan owner to avoid a potential livelock. Otherwise, the scan
+        * can repeatedly trylock on the inode we're currently processing. We
+        * run a sync scan to increase effectiveness and use the union filter to
+        * cover all applicable quotas in a single scan.
+        */
+       eofb.eof_scan_owner = ip->i_ino;
+       eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
+
+       if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
+               dq = xfs_inode_dquot(ip, XFS_DQ_USER);
+               if (dq && xfs_dquot_lowsp(dq)) {
+                       eofb.eof_uid = VFS_I(ip)->i_uid;
+                       eofb.eof_flags |= XFS_EOF_FLAGS_UID;
+                       scan = 1;
+               }
+       }
+
+       if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
+               dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
+               if (dq && xfs_dquot_lowsp(dq)) {
+                       eofb.eof_gid = VFS_I(ip)->i_gid;
+                       eofb.eof_flags |= XFS_EOF_FLAGS_GID;
+                       scan = 1;
+               }
+       }
+
+       if (scan)
+               xfs_icache_free_eofblocks(ip->i_mount, &eofb);
+
+       return scan;
+}
+
 void
 xfs_inode_set_eofblocks_tag(
        xfs_inode_t     *ip)
index 9cf017b899be9d531e46d6d7b33a3a110c051cb8..46748b86b12f5511b7239e08d88825511ddefa08 100644 (file)
@@ -27,6 +27,7 @@ struct xfs_eofblocks {
        kgid_t          eof_gid;
        prid_t          eof_prid;
        __u64           eof_min_file_size;
+       xfs_ino_t       eof_scan_owner;
 };
 
 #define SYNC_WAIT              0x0001  /* wait for i/o to complete */
@@ -57,6 +58,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
 void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
 void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
 int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
+int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
 void xfs_eofblocks_worker(struct work_struct *);
 
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
@@ -72,31 +74,32 @@ xfs_fs_eofblocks_from_user(
        struct xfs_eofblocks            *dst)
 {
        if (src->eof_version != XFS_EOFBLOCKS_VERSION)
-               return EINVAL;
+               return -EINVAL;
 
        if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
-               return EINVAL;
+               return -EINVAL;
 
        if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
            memchr_inv(src->pad64, 0, sizeof(src->pad64)))
-               return EINVAL;
+               return -EINVAL;
 
        dst->eof_flags = src->eof_flags;
        dst->eof_prid = src->eof_prid;
        dst->eof_min_file_size = src->eof_min_file_size;
+       dst->eof_scan_owner = NULLFSINO;
 
        dst->eof_uid = INVALID_UID;
        if (src->eof_flags & XFS_EOF_FLAGS_UID) {
                dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
                if (!uid_valid(dst->eof_uid))
-                       return EINVAL;
+                       return -EINVAL;
        }
 
        dst->eof_gid = INVALID_GID;
        if (src->eof_flags & XFS_EOF_FLAGS_GID) {
                dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
                if (!gid_valid(dst->eof_gid))
-                       return EINVAL;
+                       return -EINVAL;
        }
        return 0;
 }
index a6115fe1ac948a4b57efb6bf00830bb9a4845ed6..fea3c92fb3f0603b5ca44bb86a5a18dcaaac6240 100644 (file)
@@ -583,7 +583,7 @@ xfs_lookup(
        trace_xfs_lookup(dp, name);
 
        if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        lock_mode = xfs_ilock_data_map_shared(dp);
        error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
@@ -893,7 +893,7 @@ xfs_dir_ialloc(
        }
        if (!ialloc_context && !ip) {
                *ipp = NULL;
-               return XFS_ERROR(ENOSPC);
+               return -ENOSPC;
        }
 
        /*
@@ -1088,7 +1088,7 @@ xfs_create(
        trace_xfs_create(dp, name);
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        prid = xfs_get_initial_prid(dp);
 
@@ -1125,12 +1125,12 @@ xfs_create(
         */
        tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
        error = xfs_trans_reserve(tp, &tres, resblks, 0);
-       if (error == ENOSPC) {
+       if (error == -ENOSPC) {
                /* flush outstanding delalloc blocks and retry */
                xfs_flush_inodes(mp);
                error = xfs_trans_reserve(tp, &tres, resblks, 0);
        }
-       if (error == ENOSPC) {
+       if (error == -ENOSPC) {
                /* No space at all so try a "no-allocation" reservation */
                resblks = 0;
                error = xfs_trans_reserve(tp, &tres, 0, 0);
@@ -1165,7 +1165,7 @@ xfs_create(
        error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
                               prid, resblks > 0, &ip, &committed);
        if (error) {
-               if (error == ENOSPC)
+               if (error == -ENOSPC)
                        goto out_trans_cancel;
                goto out_trans_abort;
        }
@@ -1184,7 +1184,7 @@ xfs_create(
                                        &first_block, &free_list, resblks ?
                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
        if (error) {
-               ASSERT(error != ENOSPC);
+               ASSERT(error != -ENOSPC);
                goto out_trans_abort;
        }
        xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -1274,7 +1274,7 @@ xfs_create_tmpfile(
        uint                    resblks;
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        prid = xfs_get_initial_prid(dp);
 
@@ -1293,7 +1293,7 @@ xfs_create_tmpfile(
 
        tres = &M_RES(mp)->tr_create_tmpfile;
        error = xfs_trans_reserve(tp, tres, resblks, 0);
-       if (error == ENOSPC) {
+       if (error == -ENOSPC) {
                /* No space at all so try a "no-allocation" reservation */
                resblks = 0;
                error = xfs_trans_reserve(tp, tres, 0, 0);
@@ -1311,7 +1311,7 @@ xfs_create_tmpfile(
        error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
                                prid, resblks > 0, &ip, NULL);
        if (error) {
-               if (error == ENOSPC)
+               if (error == -ENOSPC)
                        goto out_trans_cancel;
                goto out_trans_abort;
        }
@@ -1382,7 +1382,7 @@ xfs_link(
        ASSERT(!S_ISDIR(sip->i_d.di_mode));
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        error = xfs_qm_dqattach(sip, 0);
        if (error)
@@ -1396,7 +1396,7 @@ xfs_link(
        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
        resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
-       if (error == ENOSPC) {
+       if (error == -ENOSPC) {
                resblks = 0;
                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
        }
@@ -1417,7 +1417,7 @@ xfs_link(
         */
        if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
                     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
-               error = XFS_ERROR(EXDEV);
+               error = -EXDEV;
                goto error_return;
        }
 
@@ -1635,8 +1635,8 @@ xfs_release(
                truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
                if (truncated) {
                        xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
-                       if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
-                               error = -filemap_flush(VFS_I(ip)->i_mapping);
+                       if (ip->i_delayed_blks > 0) {
+                               error = filemap_flush(VFS_I(ip)->i_mapping);
                                if (error)
                                        return error;
                        }
@@ -1673,7 +1673,7 @@ xfs_release(
                        return 0;
 
                error = xfs_free_eofblocks(mp, ip, true);
-               if (error && error != EAGAIN)
+               if (error && error != -EAGAIN)
                        return error;
 
                /* delalloc blocks after truncation means it really is dirty */
@@ -1772,7 +1772,7 @@ xfs_inactive_ifree(
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
                                  XFS_IFREE_SPACE_RES(mp), 0);
        if (error) {
-               if (error == ENOSPC) {
+               if (error == -ENOSPC) {
                        xfs_warn_ratelimited(mp,
                        "Failed to remove inode(s) from unlinked list. "
                        "Please free space, unmount and run xfs_repair.");
@@ -2219,7 +2219,7 @@ xfs_ifree_cluster(
                                        XBF_UNMAPPED);
 
                if (!bp)
-                       return ENOMEM;
+                       return -ENOMEM;
 
                /*
                 * This buffer may not have been correctly initialised as we
@@ -2491,7 +2491,7 @@ xfs_remove(
        trace_xfs_remove(dp, name);
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        error = xfs_qm_dqattach(dp, 0);
        if (error)
@@ -2521,12 +2521,12 @@ xfs_remove(
         */
        resblks = XFS_REMOVE_SPACE_RES(mp);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
-       if (error == ENOSPC) {
+       if (error == -ENOSPC) {
                resblks = 0;
                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
        }
        if (error) {
-               ASSERT(error != ENOSPC);
+               ASSERT(error != -ENOSPC);
                cancel_flags = 0;
                goto out_trans_cancel;
        }
@@ -2543,11 +2543,11 @@ xfs_remove(
        if (is_dir) {
                ASSERT(ip->i_d.di_nlink >= 2);
                if (ip->i_d.di_nlink != 2) {
-                       error = XFS_ERROR(ENOTEMPTY);
+                       error = -ENOTEMPTY;
                        goto out_trans_cancel;
                }
                if (!xfs_dir_isempty(ip)) {
-                       error = XFS_ERROR(ENOTEMPTY);
+                       error = -ENOTEMPTY;
                        goto out_trans_cancel;
                }
 
@@ -2582,7 +2582,7 @@ xfs_remove(
        error = xfs_dir_removename(tp, dp, name, ip->i_ino,
                                        &first_block, &free_list, resblks);
        if (error) {
-               ASSERT(error != ENOENT);
+               ASSERT(error != -ENOENT);
                goto out_bmap_cancel;
        }
 
@@ -2702,7 +2702,7 @@ xfs_rename(
        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
        spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
-       if (error == ENOSPC) {
+       if (error == -ENOSPC) {
                spaceres = 0;
                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
        }
@@ -2747,7 +2747,7 @@ xfs_rename(
         */
        if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
                     (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
-               error = XFS_ERROR(EXDEV);
+               error = -EXDEV;
                goto error_return;
        }
 
@@ -2770,7 +2770,7 @@ xfs_rename(
                error = xfs_dir_createname(tp, target_dp, target_name,
                                                src_ip->i_ino, &first_block,
                                                &free_list, spaceres);
-               if (error == ENOSPC)
+               if (error == -ENOSPC)
                        goto error_return;
                if (error)
                        goto abort_return;
@@ -2795,7 +2795,7 @@ xfs_rename(
                         */
                        if (!(xfs_dir_isempty(target_ip)) ||
                            (target_ip->i_d.di_nlink > 2)) {
-                               error = XFS_ERROR(EEXIST);
+                               error = -EEXIST;
                                goto error_return;
                        }
                }
@@ -2847,7 +2847,7 @@ xfs_rename(
                error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
                                        target_dp->i_ino,
                                        &first_block, &free_list, spaceres);
-               ASSERT(error != EEXIST);
+               ASSERT(error != -EEXIST);
                if (error)
                        goto abort_return;
        }
@@ -3055,7 +3055,7 @@ cluster_corrupt_out:
                if (bp->b_iodone) {
                        XFS_BUF_UNDONE(bp);
                        xfs_buf_stale(bp);
-                       xfs_buf_ioerror(bp, EIO);
+                       xfs_buf_ioerror(bp, -EIO);
                        xfs_buf_ioend(bp, 0);
                } else {
                        xfs_buf_stale(bp);
@@ -3069,7 +3069,7 @@ cluster_corrupt_out:
        xfs_iflush_abort(iq, false);
        kmem_free(ilist);
        xfs_perag_put(pag);
-       return XFS_ERROR(EFSCORRUPTED);
+       return -EFSCORRUPTED;
 }
 
 /*
@@ -3124,7 +3124,7 @@ xfs_iflush(
         * as we wait for an empty AIL as part of the unmount process.
         */
        if (XFS_FORCED_SHUTDOWN(mp)) {
-               error = XFS_ERROR(EIO);
+               error = -EIO;
                goto abort_out;
        }
 
@@ -3167,7 +3167,7 @@ corrupt_out:
        xfs_buf_relse(bp);
        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 cluster_corrupt_out:
-       error = XFS_ERROR(EFSCORRUPTED);
+       error = -EFSCORRUPTED;
 abort_out:
        /*
         * Unlocks the flush lock
@@ -3331,5 +3331,5 @@ xfs_iflush_int(
        return 0;
 
 corrupt_out:
-       return XFS_ERROR(EFSCORRUPTED);
+       return -EFSCORRUPTED;
 }
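
The hunks above apply one mechanical conversion throughout: XFS drops its old
positive-errno XFS_ERROR() convention and returns kernel-standard negative
errnos, so every comparison and ASSERT flips sign along with the return sites.
A minimal standalone sketch of the resulting caller pattern follows; the
reserve helper and its limit are hypothetical stand-ins, not taken from the
patch:

#include <errno.h>
#include <stdio.h>

#define MAX_RESBLKS 1024	/* hypothetical reservation limit */

/* Stand-in for a transaction reservation: 0 on success, -errno on failure. */
static int do_reserve(unsigned int resblks)
{
	if (resblks > MAX_RESBLKS)
		return -ENOSPC;
	return 0;
}

int main(void)
{
	int error = do_reserve(4096);

	if (error == -ENOSPC)		/* callers now test the negated value */
		error = do_reserve(0);	/* "no-allocation" retry, as in xfs_create() */
	if (error)
		fprintf(stderr, "reserve failed: %d\n", error);
	return 0;
}
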
index f72bffa6726628224c2f9f0e9fbee6480b3995dd..c10e3fadd9af659f58d3a512e9706d93bcf123ff 100644 (file)
@@ -398,4 +398,14 @@ do { \
 
 extern struct kmem_zone        *xfs_inode_zone;
 
+/*
+ * Flags for read/write calls
+ */
+#define XFS_IO_ISDIRECT        0x00001         /* bypass page cache */
+#define XFS_IO_INVIS   0x00002         /* don't update inode timestamps */
+
+#define XFS_IO_FLAGS \
+       { XFS_IO_ISDIRECT,      "DIRECT" }, \
+       { XFS_IO_INVIS,         "INVIS"}
+
 #endif /* __XFS_INODE_H__ */
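
The XFS_IO_FLAGS table added above pairs each flag bit with a printable name;
that { mask, "NAME" } shape exists so a flag word can be decoded into readable
trace output. The hunk does not show a consumer, so the decoder below is only
an illustrative sketch over the same table layout:

#include <stdio.h>

#define XFS_IO_ISDIRECT	0x00001		/* bypass page cache */
#define XFS_IO_INVIS	0x00002		/* don't update inode timestamps */

static const struct {
	unsigned int	mask;
	const char	*name;
} io_flags[] = {
	{ XFS_IO_ISDIRECT,	"DIRECT" },
	{ XFS_IO_INVIS,		"INVIS" },
};

/* Print the name of every flag set in the given word. */
static void print_io_flags(unsigned int flags)
{
	size_t i;

	for (i = 0; i < sizeof(io_flags) / sizeof(io_flags[0]); i++)
		if (flags & io_flags[i].mask)
			printf("%s ", io_flags[i].name);
	printf("\n");
}

int main(void)
{
	print_io_flags(XFS_IO_ISDIRECT | XFS_IO_INVIS);	/* DIRECT INVIS */
	return 0;
}
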
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
deleted file mode 100644 (file)
index cb35ae4..0000000
+++ /dev/null
@@ -1,479 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_error.h"
-#include "xfs_cksum.h"
-#include "xfs_icache.h"
-#include "xfs_trans.h"
-#include "xfs_ialloc.h"
-#include "xfs_dinode.h"
-
-/*
- * Check that none of the inodes in the buffer have a next
- * unlinked field of 0.
- */
-#if defined(DEBUG)
-void
-xfs_inobp_check(
-       xfs_mount_t     *mp,
-       xfs_buf_t       *bp)
-{
-       int             i;
-       int             j;
-       xfs_dinode_t    *dip;
-
-       j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
-
-       for (i = 0; i < j; i++) {
-               dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-                                       i * mp->m_sb.sb_inodesize);
-               if (!dip->di_next_unlinked)  {
-                       xfs_alert(mp,
-       "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
-                               i, (long long)bp->b_bn);
-               }
-       }
-}
-#endif
-
-/*
- * If we are doing readahead on an inode buffer, we might be in log recovery
- * reading an inode allocation buffer that hasn't yet been replayed, and hence
- * has not had the inode cores stamped into it. For readahead, therefore, the
- * buffer may be invalid.
- *
- * If the readahead buffer is invalid, we don't want to mark it with an error,
- * but we do want to clear the DONE status of the buffer so that a followup read
- * will re-read it from disk. This will ensure that we don't get unnecessary
- * warnings during log recovery and we don't get unnecessary panics on debug
- * kernels.
- */
-static void
-xfs_inode_buf_verify(
-       struct xfs_buf  *bp,
-       bool            readahead)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       int             i;
-       int             ni;
-
-       /*
-        * Validate the magic number and version of every inode in the buffer
-        */
-       ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
-       for (i = 0; i < ni; i++) {
-               int             di_ok;
-               xfs_dinode_t    *dip;
-
-               dip = (struct xfs_dinode *)xfs_buf_offset(bp,
-                                       (i << mp->m_sb.sb_inodelog));
-               di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
-                           XFS_DINODE_GOOD_VERSION(dip->di_version);
-               if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
-                                               XFS_ERRTAG_ITOBP_INOTOBP,
-                                               XFS_RANDOM_ITOBP_INOTOBP))) {
-                       if (readahead) {
-                               bp->b_flags &= ~XBF_DONE;
-                               return;
-                       }
-
-                       xfs_buf_ioerror(bp, EFSCORRUPTED);
-                       xfs_verifier_error(bp);
-#ifdef DEBUG
-                       xfs_alert(mp,
-                               "bad inode magic/vsn daddr %lld #%d (magic=%x)",
-                               (unsigned long long)bp->b_bn, i,
-                               be16_to_cpu(dip->di_magic));
-#endif
-               }
-       }
-       xfs_inobp_check(mp, bp);
-}
-
-
-static void
-xfs_inode_buf_read_verify(
-       struct xfs_buf  *bp)
-{
-       xfs_inode_buf_verify(bp, false);
-}
-
-static void
-xfs_inode_buf_readahead_verify(
-       struct xfs_buf  *bp)
-{
-       xfs_inode_buf_verify(bp, true);
-}
-
-static void
-xfs_inode_buf_write_verify(
-       struct xfs_buf  *bp)
-{
-       xfs_inode_buf_verify(bp, false);
-}
-
-const struct xfs_buf_ops xfs_inode_buf_ops = {
-       .verify_read = xfs_inode_buf_read_verify,
-       .verify_write = xfs_inode_buf_write_verify,
-};
-
-const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
-       .verify_read = xfs_inode_buf_readahead_verify,
-       .verify_write = xfs_inode_buf_write_verify,
-};
-
-
-/*
- * This routine is called to map an inode to the buffer containing the on-disk
- * version of the inode.  It returns a pointer to the buffer containing the
- * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
- * pointer to the on-disk inode within that buffer.
- *
- * If a non-zero error is returned, then the contents of bpp and dipp are
- * undefined.
- */
-int
-xfs_imap_to_bp(
-       struct xfs_mount        *mp,
-       struct xfs_trans        *tp,
-       struct xfs_imap         *imap,
-       struct xfs_dinode       **dipp,
-       struct xfs_buf          **bpp,
-       uint                    buf_flags,
-       uint                    iget_flags)
-{
-       struct xfs_buf          *bp;
-       int                     error;
-
-       buf_flags |= XBF_UNMAPPED;
-       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
-                                  (int)imap->im_len, buf_flags, &bp,
-                                  &xfs_inode_buf_ops);
-       if (error) {
-               if (error == EAGAIN) {
-                       ASSERT(buf_flags & XBF_TRYLOCK);
-                       return error;
-               }
-
-               if (error == EFSCORRUPTED &&
-                   (iget_flags & XFS_IGET_UNTRUSTED))
-                       return XFS_ERROR(EINVAL);
-
-               xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
-                       __func__, error);
-               return error;
-       }
-
-       *bpp = bp;
-       *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
-       return 0;
-}
-
-void
-xfs_dinode_from_disk(
-       xfs_icdinode_t          *to,
-       xfs_dinode_t            *from)
-{
-       to->di_magic = be16_to_cpu(from->di_magic);
-       to->di_mode = be16_to_cpu(from->di_mode);
-       to->di_version = from->di_version;
-       to->di_format = from->di_format;
-       to->di_onlink = be16_to_cpu(from->di_onlink);
-       to->di_uid = be32_to_cpu(from->di_uid);
-       to->di_gid = be32_to_cpu(from->di_gid);
-       to->di_nlink = be32_to_cpu(from->di_nlink);
-       to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
-       to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
-       memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
-       to->di_flushiter = be16_to_cpu(from->di_flushiter);
-       to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
-       to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
-       to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
-       to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
-       to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
-       to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
-       to->di_size = be64_to_cpu(from->di_size);
-       to->di_nblocks = be64_to_cpu(from->di_nblocks);
-       to->di_extsize = be32_to_cpu(from->di_extsize);
-       to->di_nextents = be32_to_cpu(from->di_nextents);
-       to->di_anextents = be16_to_cpu(from->di_anextents);
-       to->di_forkoff = from->di_forkoff;
-       to->di_aformat  = from->di_aformat;
-       to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
-       to->di_dmstate  = be16_to_cpu(from->di_dmstate);
-       to->di_flags    = be16_to_cpu(from->di_flags);
-       to->di_gen      = be32_to_cpu(from->di_gen);
-
-       if (to->di_version == 3) {
-               to->di_changecount = be64_to_cpu(from->di_changecount);
-               to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
-               to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
-               to->di_flags2 = be64_to_cpu(from->di_flags2);
-               to->di_ino = be64_to_cpu(from->di_ino);
-               to->di_lsn = be64_to_cpu(from->di_lsn);
-               memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
-               uuid_copy(&to->di_uuid, &from->di_uuid);
-       }
-}
-
-void
-xfs_dinode_to_disk(
-       xfs_dinode_t            *to,
-       xfs_icdinode_t          *from)
-{
-       to->di_magic = cpu_to_be16(from->di_magic);
-       to->di_mode = cpu_to_be16(from->di_mode);
-       to->di_version = from->di_version;
-       to->di_format = from->di_format;
-       to->di_onlink = cpu_to_be16(from->di_onlink);
-       to->di_uid = cpu_to_be32(from->di_uid);
-       to->di_gid = cpu_to_be32(from->di_gid);
-       to->di_nlink = cpu_to_be32(from->di_nlink);
-       to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
-       to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
-       memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
-       to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
-       to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
-       to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
-       to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
-       to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
-       to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
-       to->di_size = cpu_to_be64(from->di_size);
-       to->di_nblocks = cpu_to_be64(from->di_nblocks);
-       to->di_extsize = cpu_to_be32(from->di_extsize);
-       to->di_nextents = cpu_to_be32(from->di_nextents);
-       to->di_anextents = cpu_to_be16(from->di_anextents);
-       to->di_forkoff = from->di_forkoff;
-       to->di_aformat = from->di_aformat;
-       to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
-       to->di_dmstate = cpu_to_be16(from->di_dmstate);
-       to->di_flags = cpu_to_be16(from->di_flags);
-       to->di_gen = cpu_to_be32(from->di_gen);
-
-       if (from->di_version == 3) {
-               to->di_changecount = cpu_to_be64(from->di_changecount);
-               to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
-               to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
-               to->di_flags2 = cpu_to_be64(from->di_flags2);
-               to->di_ino = cpu_to_be64(from->di_ino);
-               to->di_lsn = cpu_to_be64(from->di_lsn);
-               memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
-               uuid_copy(&to->di_uuid, &from->di_uuid);
-               to->di_flushiter = 0;
-       } else {
-               to->di_flushiter = cpu_to_be16(from->di_flushiter);
-       }
-}
-
-static bool
-xfs_dinode_verify(
-       struct xfs_mount        *mp,
-       struct xfs_inode        *ip,
-       struct xfs_dinode       *dip)
-{
-       if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
-               return false;
-
-       /* only version 3 or greater inodes are extensively verified here */
-       if (dip->di_version < 3)
-               return true;
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return false;
-       if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
-                             XFS_DINODE_CRC_OFF))
-               return false;
-       if (be64_to_cpu(dip->di_ino) != ip->i_ino)
-               return false;
-       if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
-               return false;
-       return true;
-}
-
-void
-xfs_dinode_calc_crc(
-       struct xfs_mount        *mp,
-       struct xfs_dinode       *dip)
-{
-       __uint32_t              crc;
-
-       if (dip->di_version < 3)
-               return;
-
-       ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
-       crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
-                             XFS_DINODE_CRC_OFF);
-       dip->di_crc = xfs_end_cksum(crc);
-}
-
-/*
- * Read the disk inode attributes into the in-core inode structure.
- *
- * For version 5 superblocks, if we are initialising a new inode and we are not
- * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simply build the new
- * inode core with a random generation number. If we are keeping inodes around,
- * we need to read the inode cluster to get the existing generation number off
- * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
- * format) then log recovery is dependent on the di_flushiter field being
- * initialised from the current on-disk value and hence we must also read the
- * inode off disk.
- */
-int
-xfs_iread(
-       xfs_mount_t     *mp,
-       xfs_trans_t     *tp,
-       xfs_inode_t     *ip,
-       uint            iget_flags)
-{
-       xfs_buf_t       *bp;
-       xfs_dinode_t    *dip;
-       int             error;
-
-       /*
-        * Fill in the location information in the in-core inode.
-        */
-       error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
-       if (error)
-               return error;
-
-       /* shortcut IO on inode allocation if possible */
-       if ((iget_flags & XFS_IGET_CREATE) &&
-           xfs_sb_version_hascrc(&mp->m_sb) &&
-           !(mp->m_flags & XFS_MOUNT_IKEEP)) {
-               /* initialise the on-disk inode core */
-               memset(&ip->i_d, 0, sizeof(ip->i_d));
-               ip->i_d.di_magic = XFS_DINODE_MAGIC;
-               ip->i_d.di_gen = prandom_u32();
-               if (xfs_sb_version_hascrc(&mp->m_sb)) {
-                       ip->i_d.di_version = 3;
-                       ip->i_d.di_ino = ip->i_ino;
-                       uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
-               } else
-                       ip->i_d.di_version = 2;
-               return 0;
-       }
-
-       /*
-        * Get pointers to the on-disk inode and the buffer containing it.
-        */
-       error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
-       if (error)
-               return error;
-
-       /* even unallocated inodes are verified */
-       if (!xfs_dinode_verify(mp, ip, dip)) {
-               xfs_alert(mp, "%s: validation failed for inode %lld failed",
-                               __func__, ip->i_ino);
-
-               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
-               error = XFS_ERROR(EFSCORRUPTED);
-               goto out_brelse;
-       }
-
-       /*
-        * If the on-disk inode is already linked to a directory
-        * entry, copy all of the inode into the in-core inode.
-        * xfs_iformat_fork() handles copying in the inode format
-        * specific information.
-        * Otherwise, just get the truly permanent information.
-        */
-       if (dip->di_mode) {
-               xfs_dinode_from_disk(&ip->i_d, dip);
-               error = xfs_iformat_fork(ip, dip);
-               if (error)  {
-#ifdef DEBUG
-                       xfs_alert(mp, "%s: xfs_iformat() returned error %d",
-                               __func__, error);
-#endif /* DEBUG */
-                       goto out_brelse;
-               }
-       } else {
-               /*
-                * Partial initialisation of the in-core inode. Just the bits
-                * that xfs_ialloc won't overwrite or relies on being correct.
-                */
-               ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
-               ip->i_d.di_version = dip->di_version;
-               ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
-               ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
-
-               if (dip->di_version == 3) {
-                       ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
-                       uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
-               }
-
-               /*
-                * Make sure to pull in the mode here as well in
-                * case the inode is released without being used.
-                * This ensures that xfs_inactive() will see that
-                * the inode is already free and not try to mess
-                * with the uninitialized part of it.
-                */
-               ip->i_d.di_mode = 0;
-       }
-
-       /*
-        * Automatically convert version 1 inode formats in memory to version 2
-        * inode format. If the inode is modified, it will get logged and
-        * rewritten as a version 2 inode. We can do this because we set the
-        * superblock feature bit for v2 inodes unconditionally during mount
- * and it means the rest of the code can assume the inode version is 2
-        * or higher.
-        */
-       if (ip->i_d.di_version == 1) {
-               ip->i_d.di_version = 2;
-               memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
-               ip->i_d.di_nlink = ip->i_d.di_onlink;
-               ip->i_d.di_onlink = 0;
-               xfs_set_projid(ip, 0);
-       }
-
-       ip->i_delayed_blks = 0;
-
-       /*
-        * Mark the buffer containing the inode as something to keep
-        * around for a while.  This helps to keep recently accessed
-        * meta-data in-core longer.
-        */
-       xfs_buf_set_ref(bp, XFS_INO_REF);
-
-       /*
-        * Use xfs_trans_brelse() to release the buffer containing the on-disk
-        * inode, because it was acquired with xfs_trans_read_buf() in
-        * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
-        * brelse().  If we're within a transaction, then xfs_trans_brelse()
-        * will only release the buffer if it is not dirty within the
-        * transaction.  It will be OK to release the buffer in this case,
-        * because inodes on disk are never destroyed and we will be locking the
-        * new in-core inode before putting it in the cache where other
-        * processes can find it.  Thus we don't have to worry about the inode
-        * being changed just because we released the buffer.
-        */
- out_brelse:
-       xfs_trans_brelse(tp, bp);
-       return error;
-}
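
The deleted xfs_dinode_from_disk()/xfs_dinode_to_disk() pair above is a
field-by-field translation between the big-endian on-disk inode and its
CPU-native in-core copy. Below is a portable sketch of that round-trip
pattern; the byte-access helpers stand in for the kernel's cpu_to_be32()
and be32_to_cpu(), which are not available outside the tree:

#include <stdint.h>
#include <stdio.h>

/* Store a 32-bit value in big-endian byte order, host-independent. */
static void put_be32(uint8_t *p, uint32_t v)
{
	p[0] = v >> 24;
	p[1] = v >> 16;
	p[2] = v >> 8;
	p[3] = v;
}

/* Load a big-endian 32-bit value back into CPU-native form. */
static uint32_t get_be32(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

int main(void)
{
	uint8_t disk[4];	/* on-disk image: always big-endian */
	uint32_t di_uid = 1000;	/* in-core value: CPU-native */

	put_be32(disk, di_uid);				/* ..._to_disk */
	printf("round-trip uid: %u\n", get_be32(disk));	/* ..._from_disk */
	return 0;
}
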
diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h
deleted file mode 100644 (file)
index 9308c47..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef        __XFS_INODE_BUF_H__
-#define        __XFS_INODE_BUF_H__
-
-struct xfs_inode;
-struct xfs_dinode;
-struct xfs_icdinode;
-
-/*
- * Inode location information.  Stored in the inode and passed to
- * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
- */
-struct xfs_imap {
-       xfs_daddr_t     im_blkno;       /* starting BB of inode chunk */
-       ushort          im_len;         /* length in BBs of inode chunk */
-       ushort          im_boffset;     /* inode offset in block in bytes */
-};
-
-int    xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
-                      struct xfs_imap *, struct xfs_dinode **,
-                      struct xfs_buf **, uint, uint);
-int    xfs_iread(struct xfs_mount *, struct xfs_trans *,
-                 struct xfs_inode *, uint);
-void   xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
-void   xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from);
-void   xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from);
-
-#if defined(DEBUG)
-void   xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
-#else
-#define        xfs_inobp_check(mp, bp)
-#endif /* DEBUG */
-
-#endif /* __XFS_INODE_BUF_H__ */
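
The header deleted above ends with a common compile-out idiom: xfs_inobp_check()
is a real function on DEBUG builds and an empty macro otherwise, so call sites
never need #ifdef guards. A minimal sketch of the same idiom, with an
illustrative checker in place of xfs_inobp_check():

#include <assert.h>
#include <stdio.h>

#if defined(DEBUG)
/* Heavyweight consistency checks, compiled in only for debug builds. */
static void check_buffer(int nblocks)
{
	assert(nblocks >= 0);
}
#else
#define check_buffer(nblocks)	/* expands to nothing on release builds */
#endif

int main(void)
{
	check_buffer(42);	/* the call site stays #ifdef-free */
	printf("ok\n");
	return 0;
}
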
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
deleted file mode 100644 (file)
index b031e8d..0000000
+++ /dev/null
@@ -1,1906 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include <linux/log2.h>
-
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-
-kmem_zone_t *xfs_ifork_zone;
-
-STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
-STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
-STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
-
-#ifdef DEBUG
-/*
- * Make sure that the extents in the given memory buffer
- * are valid.
- */
-void
-xfs_validate_extents(
-       xfs_ifork_t             *ifp,
-       int                     nrecs,
-       xfs_exntfmt_t           fmt)
-{
-       xfs_bmbt_irec_t         irec;
-       xfs_bmbt_rec_host_t     rec;
-       int                     i;
-
-       for (i = 0; i < nrecs; i++) {
-               xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-               rec.l0 = get_unaligned(&ep->l0);
-               rec.l1 = get_unaligned(&ep->l1);
-               xfs_bmbt_get_all(&rec, &irec);
-               if (fmt == XFS_EXTFMT_NOSTATE)
-                       ASSERT(irec.br_state == XFS_EXT_NORM);
-       }
-}
-#else /* DEBUG */
-#define xfs_validate_extents(ifp, nrecs, fmt)
-#endif /* DEBUG */
-
-
-/*
- * Move inode type and inode format specific information from the
- * on-disk inode to the in-core inode.  For fifos, devs, and sockets
- * this means setting if_rdev to the proper value.  For files, directories,
- * and symlinks this means bringing in the in-line data or extent
- * pointers.  For a file in B-tree format, only the root is immediately
- * brought in-core.  The rest will be in-lined in if_extents when it
- * is first referenced (see xfs_iread_extents()).
- */
-int
-xfs_iformat_fork(
-       xfs_inode_t             *ip,
-       xfs_dinode_t            *dip)
-{
-       xfs_attr_shortform_t    *atp;
-       int                     size;
-       int                     error = 0;
-       xfs_fsize_t             di_size;
-
-       if (unlikely(be32_to_cpu(dip->di_nextents) +
-                    be16_to_cpu(dip->di_anextents) >
-                    be64_to_cpu(dip->di_nblocks))) {
-               xfs_warn(ip->i_mount,
-                       "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
-                       (unsigned long long)ip->i_ino,
-                       (int)(be32_to_cpu(dip->di_nextents) +
-                             be16_to_cpu(dip->di_anextents)),
-                       (unsigned long long)
-                               be64_to_cpu(dip->di_nblocks));
-               XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
-                                    ip->i_mount, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
-               xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
-                       (unsigned long long)ip->i_ino,
-                       dip->di_forkoff);
-               XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
-                                    ip->i_mount, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
-                    !ip->i_mount->m_rtdev_targp)) {
-               xfs_warn(ip->i_mount,
-                       "corrupt dinode %Lu, has realtime flag set.",
-                       ip->i_ino);
-               XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
-                                    XFS_ERRLEVEL_LOW, ip->i_mount, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       switch (ip->i_d.di_mode & S_IFMT) {
-       case S_IFIFO:
-       case S_IFCHR:
-       case S_IFBLK:
-       case S_IFSOCK:
-               if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
-                       XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
-                                             ip->i_mount, dip);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
-               ip->i_d.di_size = 0;
-               ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
-               break;
-
-       case S_IFREG:
-       case S_IFLNK:
-       case S_IFDIR:
-               switch (dip->di_format) {
-               case XFS_DINODE_FMT_LOCAL:
-                       /*
-                        * no local regular files yet
-                        */
-                       if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
-                               xfs_warn(ip->i_mount,
-                       "corrupt inode %Lu (local format for regular file).",
-                                       (unsigned long long) ip->i_ino);
-                               XFS_CORRUPTION_ERROR("xfs_iformat(4)",
-                                                    XFS_ERRLEVEL_LOW,
-                                                    ip->i_mount, dip);
-                               return XFS_ERROR(EFSCORRUPTED);
-                       }
-
-                       di_size = be64_to_cpu(dip->di_size);
-                       if (unlikely(di_size < 0 ||
-                                    di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
-                               xfs_warn(ip->i_mount,
-                       "corrupt inode %Lu (bad size %Ld for local inode).",
-                                       (unsigned long long) ip->i_ino,
-                                       (long long) di_size);
-                               XFS_CORRUPTION_ERROR("xfs_iformat(5)",
-                                                    XFS_ERRLEVEL_LOW,
-                                                    ip->i_mount, dip);
-                               return XFS_ERROR(EFSCORRUPTED);
-                       }
-
-                       size = (int)di_size;
-                       error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
-                       break;
-               case XFS_DINODE_FMT_EXTENTS:
-                       error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
-                       break;
-               case XFS_DINODE_FMT_BTREE:
-                       error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
-                       break;
-               default:
-                       XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
-                                        ip->i_mount);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
-               break;
-
-       default:
-               XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-       if (error) {
-               return error;
-       }
-       if (!XFS_DFORK_Q(dip))
-               return 0;
-
-       ASSERT(ip->i_afp == NULL);
-       ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
-
-       switch (dip->di_aformat) {
-       case XFS_DINODE_FMT_LOCAL:
-               atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
-               size = be16_to_cpu(atp->hdr.totsize);
-
-               if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
-                       xfs_warn(ip->i_mount,
-                               "corrupt inode %Lu (bad attr fork size %Ld).",
-                               (unsigned long long) ip->i_ino,
-                               (long long) size);
-                       XFS_CORRUPTION_ERROR("xfs_iformat(8)",
-                                            XFS_ERRLEVEL_LOW,
-                                            ip->i_mount, dip);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
-
-               error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
-               break;
-       case XFS_DINODE_FMT_EXTENTS:
-               error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
-               break;
-       case XFS_DINODE_FMT_BTREE:
-               error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
-               break;
-       default:
-               error = XFS_ERROR(EFSCORRUPTED);
-               break;
-       }
-       if (error) {
-               kmem_zone_free(xfs_ifork_zone, ip->i_afp);
-               ip->i_afp = NULL;
-               xfs_idestroy_fork(ip, XFS_DATA_FORK);
-       }
-       return error;
-}
-
-/*
- * The file is in-lined in the on-disk inode.
- * If it fits into if_inline_data, then copy
- * it there, otherwise allocate a buffer for it
- * and copy the data there.  Either way, set
- * if_data to point at the data.
- * If we allocate a buffer for the data, make
- * sure that its size is a multiple of 4 and
- * record the real size in i_real_bytes.
- */
-STATIC int
-xfs_iformat_local(
-       xfs_inode_t     *ip,
-       xfs_dinode_t    *dip,
-       int             whichfork,
-       int             size)
-{
-       xfs_ifork_t     *ifp;
-       int             real_size;
-
-       /*
-        * If the size is unreasonable, then something
-        * is wrong and we just bail out rather than crash in
-        * kmem_alloc() or memcpy() below.
-        */
-       if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
-               xfs_warn(ip->i_mount,
-       "corrupt inode %Lu (bad size %d for local fork, size = %d).",
-                       (unsigned long long) ip->i_ino, size,
-                       XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
-               XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
-                                    ip->i_mount, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       real_size = 0;
-       if (size == 0)
-               ifp->if_u1.if_data = NULL;
-       else if (size <= sizeof(ifp->if_u2.if_inline_data))
-               ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-       else {
-               real_size = roundup(size, 4);
-               ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
-       }
-       ifp->if_bytes = size;
-       ifp->if_real_bytes = real_size;
-       if (size)
-               memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
-       ifp->if_flags &= ~XFS_IFEXTENTS;
-       ifp->if_flags |= XFS_IFINLINE;
-       return 0;
-}
-
-/*
- * The file consists of a set of extents all
- * of which fit into the on-disk inode.
- * If there are few enough extents to fit into
- * the if_inline_ext, then copy them there.
- * Otherwise allocate a buffer for them and copy
- * them into it.  Either way, set if_extents
- * to point at the extents.
- */
-STATIC int
-xfs_iformat_extents(
-       xfs_inode_t     *ip,
-       xfs_dinode_t    *dip,
-       int             whichfork)
-{
-       xfs_bmbt_rec_t  *dp;
-       xfs_ifork_t     *ifp;
-       int             nex;
-       int             size;
-       int             i;
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       nex = XFS_DFORK_NEXTENTS(dip, whichfork);
-       size = nex * (uint)sizeof(xfs_bmbt_rec_t);
-
-       /*
-        * If the number of extents is unreasonable, then something
-        * is wrong and we just bail out rather than crash in
-        * kmem_alloc() or memcpy() below.
-        */
-       if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
-               xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
-                       (unsigned long long) ip->i_ino, nex);
-               XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
-                                    ip->i_mount, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       ifp->if_real_bytes = 0;
-       if (nex == 0)
-               ifp->if_u1.if_extents = NULL;
-       else if (nex <= XFS_INLINE_EXTS)
-               ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
-       else
-               xfs_iext_add(ifp, 0, nex);
-
-       ifp->if_bytes = size;
-       if (size) {
-               dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
-               xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
-               for (i = 0; i < nex; i++, dp++) {
-                       xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-                       ep->l0 = get_unaligned_be64(&dp->l0);
-                       ep->l1 = get_unaligned_be64(&dp->l1);
-               }
-               XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
-               if (whichfork != XFS_DATA_FORK ||
-                       XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
-                               if (unlikely(xfs_check_nostate_extents(
-                                   ifp, 0, nex))) {
-                                       XFS_ERROR_REPORT("xfs_iformat_extents(2)",
-                                                        XFS_ERRLEVEL_LOW,
-                                                        ip->i_mount);
-                                       return XFS_ERROR(EFSCORRUPTED);
-                               }
-       }
-       ifp->if_flags |= XFS_IFEXTENTS;
-       return 0;
-}
-
-/*
- * The file has too many extents to fit into
- * the inode, so they are in B-tree format.
- * Allocate a buffer for the root of the B-tree
- * and copy the root into it.  The i_extents
- * field will remain NULL until all of the
- * extents are read in (when they are needed).
- */
-STATIC int
-xfs_iformat_btree(
-       xfs_inode_t             *ip,
-       xfs_dinode_t            *dip,
-       int                     whichfork)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_bmdr_block_t        *dfp;
-       xfs_ifork_t             *ifp;
-       /* REFERENCED */
-       int                     nrecs;
-       int                     size;
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
-       size = XFS_BMAP_BROOT_SPACE(mp, dfp);
-       nrecs = be16_to_cpu(dfp->bb_numrecs);
-
-       /*
- * blow out if -- fork has fewer extents than can fit in
-        * fork (fork shouldn't be a btree format), root btree
-        * block has more records than can fit into the fork,
-        * or the number of extents is greater than the number of
-        * blocks.
-        */
-       if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
-                                       XFS_IFORK_MAXEXT(ip, whichfork) ||
-                    XFS_BMDR_SPACE_CALC(nrecs) >
-                                       XFS_DFORK_SIZE(dip, mp, whichfork) ||
-                    XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
-               xfs_warn(mp, "corrupt inode %Lu (btree).",
-                                       (unsigned long long) ip->i_ino);
-               XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
-                                        mp, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       ifp->if_broot_bytes = size;
-       ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
-       ASSERT(ifp->if_broot != NULL);
-       /*
-        * Copy and convert from the on-disk structure
-        * to the in-memory structure.
-        */
-       xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
-                        ifp->if_broot, size);
-       ifp->if_flags &= ~XFS_IFEXTENTS;
-       ifp->if_flags |= XFS_IFBROOT;
-
-       return 0;
-}
-
-/*
- * Read in extents from a btree-format inode.
- * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
- */
-int
-xfs_iread_extents(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *ip,
-       int             whichfork)
-{
-       int             error;
-       xfs_ifork_t     *ifp;
-       xfs_extnum_t    nextents;
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-       if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
-               XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
-                                ip->i_mount);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-       nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-
-       /*
-        * We know that the size is valid (it's checked in iformat_btree)
-        */
-       ifp->if_bytes = ifp->if_real_bytes = 0;
-       ifp->if_flags |= XFS_IFEXTENTS;
-       xfs_iext_add(ifp, 0, nextents);
-       error = xfs_bmap_read_extents(tp, ip, whichfork);
-       if (error) {
-               xfs_iext_destroy(ifp);
-               ifp->if_flags &= ~XFS_IFEXTENTS;
-               return error;
-       }
-       xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
-       return 0;
-}
-
-/*
- * Reallocate the space for if_broot based on the number of records
- * being added or deleted as indicated in rec_diff.  Move the records
- * and pointers in if_broot to fit the new size.  When shrinking this
- * will eliminate holes between the records and pointers created by
- * the caller.  When growing this will create holes to be filled in
- * by the caller.
- *
- * The caller must not request to add more records than would fit in
- * the on-disk inode root.  If the if_broot is currently NULL, then
- * if we are adding records, one will be allocated.  The caller must also
- * not request that the number of records go below zero, although
- * it can go to zero.
- *
- * ip -- the inode whose if_broot area is changing
- * rec_diff -- the change in the number of records, positive or negative,
- *      requested for the if_broot array.
- */
-void
-xfs_iroot_realloc(
-       xfs_inode_t             *ip,
-       int                     rec_diff,
-       int                     whichfork)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       int                     cur_max;
-       xfs_ifork_t             *ifp;
-       struct xfs_btree_block  *new_broot;
-       int                     new_max;
-       size_t                  new_size;
-       char                    *np;
-       char                    *op;
-
-       /*
-        * Handle the degenerate case quietly.
-        */
-       if (rec_diff == 0) {
-               return;
-       }
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       if (rec_diff > 0) {
-               /*
-                * If there wasn't any memory allocated before, just
-                * allocate it now and get out.
-                */
-               if (ifp->if_broot_bytes == 0) {
-                       new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
-                       ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
-                       ifp->if_broot_bytes = (int)new_size;
-                       return;
-               }
-
-               /*
-                * If there is already an existing if_broot, then we need
-                * to realloc() it and shift the pointers to their new
-                * location.  The records don't change location because
-                * they are kept butted up against the btree block header.
-                */
-               cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
-               new_max = cur_max + rec_diff;
-               new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
-               ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
-                               XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
-                               KM_SLEEP | KM_NOFS);
-               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
-                                                    ifp->if_broot_bytes);
-               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
-                                                    (int)new_size);
-               ifp->if_broot_bytes = (int)new_size;
-               ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
-                       XFS_IFORK_SIZE(ip, whichfork));
-               memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
-               return;
-       }
-
-       /*
-        * rec_diff is less than 0.  In this case, we are shrinking the
-        * if_broot buffer.  It must already exist.  If we go to zero
-        * records, just get rid of the root and clear the status bit.
-        */
-       ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
-       cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
-       new_max = cur_max + rec_diff;
-       ASSERT(new_max >= 0);
-       if (new_max > 0)
-               new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
-       else
-               new_size = 0;
-       if (new_size > 0) {
-               new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
-               /*
-                * First copy over the btree block header.
-                */
-               memcpy(new_broot, ifp->if_broot,
-                       XFS_BMBT_BLOCK_LEN(ip->i_mount));
-       } else {
-               new_broot = NULL;
-               ifp->if_flags &= ~XFS_IFBROOT;
-       }
-
-       /*
-        * Only copy the records and pointers if there are any.
-        */
-       if (new_max > 0) {
-               /*
-                * First copy the records.
-                */
-               op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
-               np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
-               memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
-
-               /*
-                * Then copy the pointers.
-                */
-               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
-                                                    ifp->if_broot_bytes);
-               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
-                                                    (int)new_size);
-               memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
-       }
-       kmem_free(ifp->if_broot);
-       ifp->if_broot = new_broot;
-       ifp->if_broot_bytes = (int)new_size;
-       if (ifp->if_broot)
-               ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
-                       XFS_IFORK_SIZE(ip, whichfork));
-       return;
-}
-
-
-/*
- * This is called when the amount of space needed for if_data
- * is increased or decreased.  The change in size is indicated by
- * the number of bytes that need to be added or deleted in the
- * byte_diff parameter.
- *
- * If the amount of space needed has decreased below the size of the
- * inline buffer, then switch to using the inline buffer.  Otherwise,
- * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
- * to what is needed.
- *
- * ip -- the inode whose if_data area is changing
- * byte_diff -- the change in the number of bytes, positive or negative,
- *      requested for the if_data array.
- */
-void
-xfs_idata_realloc(
-       xfs_inode_t     *ip,
-       int             byte_diff,
-       int             whichfork)
-{
-       xfs_ifork_t     *ifp;
-       int             new_size;
-       int             real_size;
-
-       if (byte_diff == 0) {
-               return;
-       }
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       new_size = (int)ifp->if_bytes + byte_diff;
-       ASSERT(new_size >= 0);
-
-       if (new_size == 0) {
-               if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
-                       kmem_free(ifp->if_u1.if_data);
-               }
-               ifp->if_u1.if_data = NULL;
-               real_size = 0;
-       } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
-               /*
-                * If the valid extents/data can fit in if_inline_ext/data,
-                * copy them from the malloc'd vector and free it.
-                */
-               if (ifp->if_u1.if_data == NULL) {
-                       ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-               } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
-                       ASSERT(ifp->if_real_bytes != 0);
-                       memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
-                             new_size);
-                       kmem_free(ifp->if_u1.if_data);
-                       ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-               }
-               real_size = 0;
-       } else {
-               /*
-                * Stuck with malloc/realloc.
-                * For inline data, the underlying buffer must be
-                * a multiple of 4 bytes in size so that it can be
-                * logged and stay on word boundaries.  We enforce
-                * that here.
-                */
-               real_size = roundup(new_size, 4);
-               if (ifp->if_u1.if_data == NULL) {
-                       ASSERT(ifp->if_real_bytes == 0);
-                       ifp->if_u1.if_data = kmem_alloc(real_size,
-                                                       KM_SLEEP | KM_NOFS);
-               } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
-                       /*
-                        * Only do the realloc if the underlying size
-                        * is really changing.
-                        */
-                       if (ifp->if_real_bytes != real_size) {
-                               ifp->if_u1.if_data =
-                                       kmem_realloc(ifp->if_u1.if_data,
-                                                       real_size,
-                                                       ifp->if_real_bytes,
-                                                       KM_SLEEP | KM_NOFS);
-                       }
-               } else {
-                       ASSERT(ifp->if_real_bytes == 0);
-                       ifp->if_u1.if_data = kmem_alloc(real_size,
-                                                       KM_SLEEP | KM_NOFS);
-                       memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
-                               ifp->if_bytes);
-               }
-       }
-       ifp->if_real_bytes = real_size;
-       ifp->if_bytes = new_size;
-       ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
-}
-
-void
-xfs_idestroy_fork(
-       xfs_inode_t     *ip,
-       int             whichfork)
-{
-       xfs_ifork_t     *ifp;
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       if (ifp->if_broot != NULL) {
-               kmem_free(ifp->if_broot);
-               ifp->if_broot = NULL;
-       }
-
-       /*
-        * If the format is local, then we can't have an extents
-        * array so just look for an inline data array.  If we're
-        * not local then we may or may not have an extents list,
-        * so check and free it up if we do.
-        */
-       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
-               if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
-                   (ifp->if_u1.if_data != NULL)) {
-                       ASSERT(ifp->if_real_bytes != 0);
-                       kmem_free(ifp->if_u1.if_data);
-                       ifp->if_u1.if_data = NULL;
-                       ifp->if_real_bytes = 0;
-               }
-       } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
-                  ((ifp->if_flags & XFS_IFEXTIREC) ||
-                   ((ifp->if_u1.if_extents != NULL) &&
-                    (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
-               ASSERT(ifp->if_real_bytes != 0);
-               xfs_iext_destroy(ifp);
-       }
-       ASSERT(ifp->if_u1.if_extents == NULL ||
-              ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
-       ASSERT(ifp->if_real_bytes == 0);
-       if (whichfork == XFS_ATTR_FORK) {
-               kmem_zone_free(xfs_ifork_zone, ip->i_afp);
-               ip->i_afp = NULL;
-       }
-}
-
-/*
- * Convert in-core extents to on-disk form
- *
- * For either the data or attr fork in extent format, we need to endian convert
- * the in-core extent as we place them into the on-disk inode.
- *
- * In the case of the data fork, the in-core and on-disk fork sizes can be
- * different due to delayed allocation extents. We only copy on-disk extents
- * here, so callers must always use the physical fork size to determine the
- * size of the buffer passed to this routine.  We will return the size actually
- * used.
- */
-int
-xfs_iextents_copy(
-       xfs_inode_t             *ip,
-       xfs_bmbt_rec_t          *dp,
-       int                     whichfork)
-{
-       int                     copied;
-       int                     i;
-       xfs_ifork_t             *ifp;
-       int                     nrecs;
-       xfs_fsblock_t           start_block;
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-       ASSERT(ifp->if_bytes > 0);
-
-       nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
-       ASSERT(nrecs > 0);
-
-       /*
-        * There are some delayed allocation extents in the
-        * inode, so copy the extents one at a time and skip
-        * the delayed ones.  There must be at least one
-        * non-delayed extent.
-        */
-       copied = 0;
-       for (i = 0; i < nrecs; i++) {
-               xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-               start_block = xfs_bmbt_get_startblock(ep);
-               if (isnullstartblock(start_block)) {
-                       /*
-                        * It's a delayed allocation extent, so skip it.
-                        */
-                       continue;
-               }
-
-               /* Translate to on disk format */
-               put_unaligned_be64(ep->l0, &dp->l0);
-               put_unaligned_be64(ep->l1, &dp->l1);
-               dp++;
-               copied++;
-       }
-       ASSERT(copied != 0);
-       xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
-
-       return copied * (uint)sizeof(xfs_bmbt_rec_t);
-}
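
As a concrete illustration of the copy-and-convert loop above, here is a self-contained sketch (not the kernel code): records are two 64-bit host-endian words, delayed-allocation entries are identified by a caller-supplied predicate standing in for isnullstartblock(), and each surviving word is stored big-endian the way put_unaligned_be64() would:

#include <stddef.h>
#include <stdint.h>

struct rec { uint64_t l0, l1; };        /* host-endian in-core record */

static void put_be64(uint64_t v, unsigned char *b)
{
        for (int i = 7; i >= 0; i--, v >>= 8)
                b[i] = (unsigned char)(v & 0xff);
}

/* Copy non-delayed records to 'disk'; return the bytes written. */
static size_t copy_extents(const struct rec *in, int nrecs,
                           int (*is_delayed)(const struct rec *),
                           unsigned char *disk)
{
        size_t off = 0;

        for (int i = 0; i < nrecs; i++) {
                if (is_delayed(&in[i]))
                        continue;               /* skip delalloc extents */
                put_be64(in[i].l0, disk + off);
                put_be64(in[i].l1, disk + off + 8);
                off += 16;                      /* on-disk record size */
        }
        return off;
}
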
-
-/*
- * Each of the following cases stores data into the same region
- * of the on-disk inode, so only one of them can be valid at
- * any given time. While it is possible to have conflicting formats
- * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
- * in EXTENTS format, this can only happen when the fork has
- * changed formats after being modified but before being flushed.
- * In these cases, the format always takes precedence, because the
- * format indicates the current state of the fork.
- */
-void
-xfs_iflush_fork(
-       xfs_inode_t             *ip,
-       xfs_dinode_t            *dip,
-       xfs_inode_log_item_t    *iip,
-       int                     whichfork)
-{
-       char                    *cp;
-       xfs_ifork_t             *ifp;
-       xfs_mount_t             *mp;
-       static const short      brootflag[2] =
-               { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
-       static const short      dataflag[2] =
-               { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
-       static const short      extflag[2] =
-               { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
-
-       if (!iip)
-               return;
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       /*
-        * This can happen if we gave up in iformat in an error path,
-        * for the attribute fork.
-        */
-       if (!ifp) {
-               ASSERT(whichfork == XFS_ATTR_FORK);
-               return;
-       }
-       cp = XFS_DFORK_PTR(dip, whichfork);
-       mp = ip->i_mount;
-       switch (XFS_IFORK_FORMAT(ip, whichfork)) {
-       case XFS_DINODE_FMT_LOCAL:
-               if ((iip->ili_fields & dataflag[whichfork]) &&
-                   (ifp->if_bytes > 0)) {
-                       ASSERT(ifp->if_u1.if_data != NULL);
-                       ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
-                       memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
-               }
-               break;
-
-       case XFS_DINODE_FMT_EXTENTS:
-               ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
-                      !(iip->ili_fields & extflag[whichfork]));
-               if ((iip->ili_fields & extflag[whichfork]) &&
-                   (ifp->if_bytes > 0)) {
-                       ASSERT(xfs_iext_get_ext(ifp, 0));
-                       ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
-                       (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
-                               whichfork);
-               }
-               break;
-
-       case XFS_DINODE_FMT_BTREE:
-               if ((iip->ili_fields & brootflag[whichfork]) &&
-                   (ifp->if_broot_bytes > 0)) {
-                       ASSERT(ifp->if_broot != NULL);
-                       ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
-                               XFS_IFORK_SIZE(ip, whichfork));
-                       xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
-                               (xfs_bmdr_block_t *)cp,
-                               XFS_DFORK_SIZE(dip, mp, whichfork));
-               }
-               break;
-
-       case XFS_DINODE_FMT_DEV:
-               if (iip->ili_fields & XFS_ILOG_DEV) {
-                       ASSERT(whichfork == XFS_DATA_FORK);
-                       xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
-               }
-               break;
-
-       case XFS_DINODE_FMT_UUID:
-               if (iip->ili_fields & XFS_ILOG_UUID) {
-                       ASSERT(whichfork == XFS_DATA_FORK);
-                       memcpy(XFS_DFORK_DPTR(dip),
-                              &ip->i_df.if_u2.if_uuid,
-                              sizeof(uuid_t));
-               }
-               break;
-
-       default:
-               ASSERT(0);
-               break;
-       }
-}
-
-/*
- * Return a pointer to the extent record at file index idx.
- */
-xfs_bmbt_rec_host_t *
-xfs_iext_get_ext(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    idx)            /* index of target extent */
-{
-       ASSERT(idx >= 0);
-       ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
-
-       if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
-               return ifp->if_u1.if_ext_irec->er_extbuf;
-       } else if (ifp->if_flags & XFS_IFEXTIREC) {
-               xfs_ext_irec_t  *erp;           /* irec pointer */
-               int             erp_idx = 0;    /* irec index */
-               xfs_extnum_t    page_idx = idx; /* ext index in target list */
-
-               erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
-               return &erp->er_extbuf[page_idx];
-       } else if (ifp->if_bytes) {
-               return &ifp->if_u1.if_extents[idx];
-       } else {
-               return NULL;
-       }
-}
-
-/*
- * Insert new item(s) into the extent records for incore inode
- * fork 'ifp'.  'count' new items are inserted at index 'idx'.
- */
-void
-xfs_iext_insert(
-       xfs_inode_t     *ip,            /* incore inode pointer */
-       xfs_extnum_t    idx,            /* starting index of new items */
-       xfs_extnum_t    count,          /* number of inserted items */
-       xfs_bmbt_irec_t *new,           /* items to insert */
-       int             state)          /* type of extent conversion */
-{
-       xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
-       xfs_extnum_t    i;              /* extent record index */
-
-       trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
-
-       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
-       xfs_iext_add(ifp, idx, count);
-       for (i = idx; i < idx + count; i++, new++)
-               xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
-}
-
-/*
- * This is called when the amount of space required for incore file
- * extents needs to be increased. The ext_diff parameter stores the
- * number of new extents being added and the idx parameter contains
- * the extent index where the new extents will be added. If the new
- * extents are being appended, then we just need to (re)allocate and
- * initialize the space. Otherwise, if the new extents are being
- * inserted into the middle of the existing entries, a bit more work
- * is required to make room for the new extents to be inserted. The
- * caller is responsible for filling in the new extent entries upon
- * return.
- */
-void
-xfs_iext_add(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    idx,            /* index to begin adding exts */
-       int             ext_diff)       /* number of extents to add */
-{
-       int             byte_diff;      /* new bytes being added */
-       int             new_size;       /* size of extents after adding */
-       xfs_extnum_t    nextents;       /* number of extents in file */
-
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       ASSERT((idx >= 0) && (idx <= nextents));
-       byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
-       new_size = ifp->if_bytes + byte_diff;
-       /*
-        * If the new number of extents (nextents + ext_diff)
-        * fits inside the inode, then continue to use the inline
-        * extent buffer.
-        */
-       if (nextents + ext_diff <= XFS_INLINE_EXTS) {
-               if (idx < nextents) {
-                       memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
-                               &ifp->if_u2.if_inline_ext[idx],
-                               (nextents - idx) * sizeof(xfs_bmbt_rec_t));
-                       memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
-               }
-               ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
-               ifp->if_real_bytes = 0;
-       }
-       /*
-        * Otherwise use a linear (direct) extent list.
-        * If the extents are currently inside the inode,
-        * xfs_iext_realloc_direct will switch us from
-        * inline to direct extent allocation mode.
-        */
-       else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
-               xfs_iext_realloc_direct(ifp, new_size);
-               if (idx < nextents) {
-                       memmove(&ifp->if_u1.if_extents[idx + ext_diff],
-                               &ifp->if_u1.if_extents[idx],
-                               (nextents - idx) * sizeof(xfs_bmbt_rec_t));
-                       memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
-               }
-       }
-       /* Indirection array */
-       else {
-               xfs_ext_irec_t  *erp;
-               int             erp_idx = 0;
-               int             page_idx = idx;
-
-               ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
-               if (ifp->if_flags & XFS_IFEXTIREC) {
-                       erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
-               } else {
-                       xfs_iext_irec_init(ifp);
-                       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-                       erp = ifp->if_u1.if_ext_irec;
-               }
-               /* Extents fit in target extent page */
-               if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
-                       if (page_idx < erp->er_extcount) {
-                               memmove(&erp->er_extbuf[page_idx + ext_diff],
-                                       &erp->er_extbuf[page_idx],
-                                       (erp->er_extcount - page_idx) *
-                                       sizeof(xfs_bmbt_rec_t));
-                               memset(&erp->er_extbuf[page_idx], 0, byte_diff);
-                       }
-                       erp->er_extcount += ext_diff;
-                       xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
-               }
-               /* Insert a new extent page */
-               else if (erp) {
-                       xfs_iext_add_indirect_multi(ifp,
-                               erp_idx, page_idx, ext_diff);
-               }
-               /*
-                * If extent(s) are being appended to the last page in
-                * the indirection array and the new extent(s) don't fit
-                * in the page, then erp is NULL and erp_idx is set to
-                * the next index needed in the indirection array.
-                */
-               else {
-                       uint    count = ext_diff;
-
-                       while (count) {
-                               erp = xfs_iext_irec_new(ifp, erp_idx);
-                               erp->er_extcount = min(count, XFS_LINEAR_EXTS);
-                               count -= erp->er_extcount;
-                               if (count)
-                                       erp_idx++;
-                       }
-               }
-       }
-       ifp->if_bytes = new_size;
-}
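
The three-way branch above reduces to a small decision function. A rough sketch, assuming the capacities from the fork header (XFS_INLINE_EXTS = 2, and XFS_LINEAR_EXTS = 4096 / 16 = 256 given the usual 16-byte xfs_bmbt_rec_t); the enum is illustrative, not a kernel type:

enum ext_storage { EXT_INLINE, EXT_DIRECT, EXT_INDIRECT };

static enum ext_storage pick_storage(int nextents, int ext_diff)
{
        if (nextents + ext_diff <= 2)      /* fits in the inode itself */
                return EXT_INLINE;
        if (nextents + ext_diff <= 256)    /* fits in one 4k buffer */
                return EXT_DIRECT;
        return EXT_INDIRECT;               /* needs the indirection array */
}
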
-
-/*
- * This is called when incore extents are being added to the indirection
- * array and the new extents do not fit in the target extent list. The
- * erp_idx parameter contains the irec index for the target extent list
- * in the indirection array, and the idx parameter contains the extent
- * index within the list. The number of extents being added is stored
- * in the count parameter.
- *
- *    |-------|   |-------|
- *    |       |   |       |    idx - number of extents before idx
- *    |  idx  |   | count |
- *    |       |   |       |    count - number of extents being inserted at idx
- *    |-------|   |-------|
- *    | count |   | nex2  |    nex2 - number of extents after idx + count
- *    |-------|   |-------|
- */
-void
-xfs_iext_add_indirect_multi(
-       xfs_ifork_t     *ifp,                   /* inode fork pointer */
-       int             erp_idx,                /* target extent irec index */
-       xfs_extnum_t    idx,                    /* index within target list */
-       int             count)                  /* new extents being added */
-{
-       int             byte_diff;              /* new bytes being added */
-       xfs_ext_irec_t  *erp;                   /* pointer to irec entry */
-       xfs_extnum_t    ext_diff;               /* number of extents to add */
-       xfs_extnum_t    ext_cnt;                /* new extents still needed */
-       xfs_extnum_t    nex2;                   /* extents after idx + count */
-       xfs_bmbt_rec_t  *nex2_ep = NULL;        /* temp list for nex2 extents */
-       int             nlists;                 /* number of irec's (lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       erp = &ifp->if_u1.if_ext_irec[erp_idx];
-       nex2 = erp->er_extcount - idx;
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-
-       /*
-        * Save second part of the target extent list
-        * (all extents past idx + count).
-        */
-       if (nex2) {
-               byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
-               nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
-               memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
-               erp->er_extcount -= nex2;
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
-               memset(&erp->er_extbuf[idx], 0, byte_diff);
-       }
-
-       /*
-        * Add the new extents to the end of the target
-        * list, then allocate new irec record(s) and
-        * extent buffer(s) as needed to store the rest
-        * of the new extents.
-        */
-       ext_cnt = count;
-       ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
-       if (ext_diff) {
-               erp->er_extcount += ext_diff;
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
-               ext_cnt -= ext_diff;
-       }
-       while (ext_cnt) {
-               erp_idx++;
-               erp = xfs_iext_irec_new(ifp, erp_idx);
-               ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
-               erp->er_extcount = ext_diff;
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
-               ext_cnt -= ext_diff;
-       }
-
-       /* Add nex2 extents back to indirection array */
-       if (nex2) {
-               xfs_extnum_t    ext_avail;
-               int             i;
-
-               byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
-               ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
-               i = 0;
-               /*
-                * If nex2 extents fit in the current page, append
-                * nex2_ep after the new extents.
-                */
-               if (nex2 <= ext_avail) {
-                       i = erp->er_extcount;
-               }
-               /*
-                * Otherwise, check if space is available in the
-                * next page.
-                */
-               else if ((erp_idx < nlists - 1) &&
-                        (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
-                         ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
-                       erp_idx++;
-                       erp++;
-                       /* Create a hole for nex2 extents */
-                       memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
-                               erp->er_extcount * sizeof(xfs_bmbt_rec_t));
-               }
-               /*
-                * Final choice, create a new extent page for
-                * nex2 extents.
-                */
-               else {
-                       erp_idx++;
-                       erp = xfs_iext_irec_new(ifp, erp_idx);
-               }
-               memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
-               kmem_free(nex2_ep);
-               erp->er_extcount += nex2;
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
-       }
-}
-
-/*
- * This is called when the amount of space required for incore file
- * extents needs to be decreased. The ext_diff parameter stores the
- * number of extents to be removed and the idx parameter contains
- * the extent index where the extents will be removed from.
- *
- * If the amount of space needed has decreased below the linear
- * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
- * extent array.  Otherwise, use kmem_realloc() to adjust the
- * size to what is needed.
- */
-void
-xfs_iext_remove(
-       xfs_inode_t     *ip,            /* incore inode pointer */
-       xfs_extnum_t    idx,            /* index to begin removing exts */
-       int             ext_diff,       /* number of extents to remove */
-       int             state)          /* type of extent conversion */
-{
-       xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
-       xfs_extnum_t    nextents;       /* number of extents in file */
-       int             new_size;       /* size of extents after removal */
-
-       trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
-
-       ASSERT(ext_diff > 0);
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
-
-       if (new_size == 0) {
-               xfs_iext_destroy(ifp);
-       } else if (ifp->if_flags & XFS_IFEXTIREC) {
-               xfs_iext_remove_indirect(ifp, idx, ext_diff);
-       } else if (ifp->if_real_bytes) {
-               xfs_iext_remove_direct(ifp, idx, ext_diff);
-       } else {
-               xfs_iext_remove_inline(ifp, idx, ext_diff);
-       }
-       ifp->if_bytes = new_size;
-}
-
-/*
- * This removes ext_diff extents from the inline buffer, beginning
- * at extent index idx.
- */
-void
-xfs_iext_remove_inline(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    idx,            /* index to begin removing exts */
-       int             ext_diff)       /* number of extents to remove */
-{
-       int             nextents;       /* number of extents in file */
-
-       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
-       ASSERT(idx < XFS_INLINE_EXTS);
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       ASSERT(((nextents - ext_diff) > 0) &&
-               (nextents - ext_diff) < XFS_INLINE_EXTS);
-
-       if (idx + ext_diff < nextents) {
-               memmove(&ifp->if_u2.if_inline_ext[idx],
-                       &ifp->if_u2.if_inline_ext[idx + ext_diff],
-                       (nextents - (idx + ext_diff)) *
-                        sizeof(xfs_bmbt_rec_t));
-               memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
-                       0, ext_diff * sizeof(xfs_bmbt_rec_t));
-       } else {
-               memset(&ifp->if_u2.if_inline_ext[idx], 0,
-                       ext_diff * sizeof(xfs_bmbt_rec_t));
-       }
-}
-
-/*
- * This removes ext_diff extents from a linear (direct) extent list,
- * beginning at extent index idx. If the extents are being removed
- * from the end of the list (i.e. truncate) then we just need to re-
- * allocate the list to remove the extra space. Otherwise, if the
- * extents are being removed from the middle of the existing extent
- * entries, then we first need to move the extent records beginning
- * at idx + ext_diff up in the list to overwrite the records being
- * removed, then remove the extra space via kmem_realloc.
- */
-void
-xfs_iext_remove_direct(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    idx,            /* index to begin removing exts */
-       int             ext_diff)       /* number of extents to remove */
-{
-       xfs_extnum_t    nextents;       /* number of extents in file */
-       int             new_size;       /* size of extents after removal */
-
-       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
-       new_size = ifp->if_bytes -
-               (ext_diff * sizeof(xfs_bmbt_rec_t));
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-
-       if (new_size == 0) {
-               xfs_iext_destroy(ifp);
-               return;
-       }
-       /* Move extents up in the list (if needed) */
-       if (idx + ext_diff < nextents) {
-               memmove(&ifp->if_u1.if_extents[idx],
-                       &ifp->if_u1.if_extents[idx + ext_diff],
-                       (nextents - (idx + ext_diff)) *
-                        sizeof(xfs_bmbt_rec_t));
-       }
-       memset(&ifp->if_u1.if_extents[nextents - ext_diff],
-               0, ext_diff * sizeof(xfs_bmbt_rec_t));
-       /*
-        * Reallocate the direct extent list. If the extents
-        * will fit inside the inode then xfs_iext_realloc_direct
-        * will switch from direct to inline extent allocation
-        * mode for us.
-        */
-       xfs_iext_realloc_direct(ifp, new_size);
-       ifp->if_bytes = new_size;
-}
-
-/*
- * This is called when incore extents are being removed from the
- * indirection array and the extents being removed span multiple extent
- * buffers. The idx parameter contains the file extent index where we
- * want to begin removing extents, and the count parameter contains
- * how many extents need to be removed.
- *
- *    |-------|   |-------|
- *    | nex1  |   |       |    nex1 - number of extents before idx
- *    |-------|   | count |
- *    |       |   |       |    count - number of extents being removed at idx
- *    | count |   |-------|
- *    |       |   | nex2  |    nex2 - number of extents after idx + count
- *    |-------|   |-------|
- */
-void
-xfs_iext_remove_indirect(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    idx,            /* index to begin removing extents */
-       int             count)          /* number of extents to remove */
-{
-       xfs_ext_irec_t  *erp;           /* indirection array pointer */
-       int             erp_idx = 0;    /* indirection array index */
-       xfs_extnum_t    ext_cnt;        /* extents left to remove */
-       xfs_extnum_t    ext_diff;       /* extents to remove in current list */
-       xfs_extnum_t    nex1;           /* number of extents before idx */
-       xfs_extnum_t    nex2;           /* extents after idx + count */
-       int             page_idx = idx; /* index in target extent list */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
-       ASSERT(erp != NULL);
-       nex1 = page_idx;
-       ext_cnt = count;
-       while (ext_cnt) {
-               nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
-               ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
-               /*
-                * Check for deletion of entire list;
-                * xfs_iext_irec_remove() updates extent offsets.
-                */
-               if (ext_diff == erp->er_extcount) {
-                       xfs_iext_irec_remove(ifp, erp_idx);
-                       ext_cnt -= ext_diff;
-                       nex1 = 0;
-                       if (ext_cnt) {
-                               ASSERT(erp_idx < ifp->if_real_bytes /
-                                       XFS_IEXT_BUFSZ);
-                               erp = &ifp->if_u1.if_ext_irec[erp_idx];
-                               nex1 = 0;
-                               continue;
-                       } else {
-                               break;
-                       }
-               }
-               /* Move extents up (if needed) */
-               if (nex2) {
-                       memmove(&erp->er_extbuf[nex1],
-                               &erp->er_extbuf[nex1 + ext_diff],
-                               nex2 * sizeof(xfs_bmbt_rec_t));
-               }
-               /* Zero out rest of page */
-               memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
-                       ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
-               /* Update remaining counters */
-               erp->er_extcount -= ext_diff;
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
-               ext_cnt -= ext_diff;
-               nex1 = 0;
-               erp_idx++;
-               erp++;
-       }
-       ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
-       xfs_iext_irec_compact(ifp);
-}
-
-/*
- * Create, destroy, or resize a linear (direct) block of extents.
- */
-void
-xfs_iext_realloc_direct(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             new_size)       /* new size of extents after adding */
-{
-       int             rnew_size;      /* real new size of extents */
-
-       rnew_size = new_size;
-
-       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
-               ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
-                (new_size != ifp->if_real_bytes)));
-
-       /* Free extent records */
-       if (new_size == 0) {
-               xfs_iext_destroy(ifp);
-       }
-       /* Resize direct extent list and zero any new bytes */
-       else if (ifp->if_real_bytes) {
-               /* Check if extents will fit inside the inode */
-               if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
-                       xfs_iext_direct_to_inline(ifp, new_size /
-                               (uint)sizeof(xfs_bmbt_rec_t));
-                       ifp->if_bytes = new_size;
-                       return;
-               }
-               if (!is_power_of_2(new_size)) {
-                       rnew_size = roundup_pow_of_two(new_size);
-               }
-               if (rnew_size != ifp->if_real_bytes) {
-                       ifp->if_u1.if_extents =
-                               kmem_realloc(ifp->if_u1.if_extents,
-                                               rnew_size,
-                                               ifp->if_real_bytes, KM_NOFS);
-               }
-               if (rnew_size > ifp->if_real_bytes) {
-                       memset(&ifp->if_u1.if_extents[ifp->if_bytes /
-                               (uint)sizeof(xfs_bmbt_rec_t)], 0,
-                               rnew_size - ifp->if_real_bytes);
-               }
-       }
-       /* Switch from the inline extent buffer to a direct extent list */
-       else {
-               if (!is_power_of_2(new_size)) {
-                       rnew_size = roundup_pow_of_two(new_size);
-               }
-               xfs_iext_inline_to_direct(ifp, rnew_size);
-       }
-       ifp->if_real_bytes = rnew_size;
-       ifp->if_bytes = new_size;
-}
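
The rounding used above is the kernel's roundup_pow_of_two(). For reference, a portable sketch of the same operation on 32-bit sizes (valid for x >= 1, and mapping exact powers of two to themselves):

static unsigned int roundup_p2(unsigned int x)
{
        x--;                    /* so exact powers of two map to themselves */
        x |= x >> 1;            /* smear the top set bit downward... */
        x |= x >> 2;
        x |= x >> 4;
        x |= x >> 8;
        x |= x >> 16;
        return x + 1;           /* ...then step up to the next power */
}
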
-
-/*
- * Switch from linear (direct) extent records to inline buffer.
- */
-void
-xfs_iext_direct_to_inline(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    nextents)       /* number of extents in file */
-{
-       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
-       ASSERT(nextents <= XFS_INLINE_EXTS);
-       /*
-        * The inline buffer was zeroed when we switched
-        * from inline to direct extent allocation mode,
-        * so we don't need to clear it here.
-        */
-       memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
-               nextents * sizeof(xfs_bmbt_rec_t));
-       kmem_free(ifp->if_u1.if_extents);
-       ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
-       ifp->if_real_bytes = 0;
-}
-
-/*
- * Switch from inline buffer to linear (direct) extent records.
- * new_size should already be rounded up to the next power of 2
- * by the caller (when appropriate), so use new_size as it is.
- * However, since new_size may be rounded up, we can't update
- * if_bytes here. It is the caller's responsibility to update
- * if_bytes upon return.
- */
-void
-xfs_iext_inline_to_direct(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             new_size)       /* number of extents in file */
-{
-       ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
-       memset(ifp->if_u1.if_extents, 0, new_size);
-       if (ifp->if_bytes) {
-               memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
-                       ifp->if_bytes);
-               memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
-                       sizeof(xfs_bmbt_rec_t));
-       }
-       ifp->if_real_bytes = new_size;
-}
-
-/*
- * Resize an extent indirection array to new_size bytes.
- */
-STATIC void
-xfs_iext_realloc_indirect(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             new_size)       /* new indirection array size */
-{
-       int             nlists;         /* number of irec's (ex lists) */
-       int             size;           /* current indirection array size */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       size = nlists * sizeof(xfs_ext_irec_t);
-       ASSERT(ifp->if_real_bytes);
-       ASSERT((new_size >= 0) && (new_size != size));
-       if (new_size == 0) {
-               xfs_iext_destroy(ifp);
-       } else {
-               ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
-                       kmem_realloc(ifp->if_u1.if_ext_irec,
-                               new_size, size, KM_NOFS);
-       }
-}
-
-/*
- * Switch from indirection array to linear (direct) extent allocations.
- */
-STATIC void
-xfs_iext_indirect_to_direct(
-        xfs_ifork_t    *ifp)           /* inode fork pointer */
-{
-       xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
-       xfs_extnum_t    nextents;       /* number of extents in file */
-       int             size;           /* size of file extents */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       ASSERT(nextents <= XFS_LINEAR_EXTS);
-       size = nextents * sizeof(xfs_bmbt_rec_t);
-
-       xfs_iext_irec_compact_pages(ifp);
-       ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
-
-       ep = ifp->if_u1.if_ext_irec->er_extbuf;
-       kmem_free(ifp->if_u1.if_ext_irec);
-       ifp->if_flags &= ~XFS_IFEXTIREC;
-       ifp->if_u1.if_extents = ep;
-       ifp->if_bytes = size;
-       if (nextents < XFS_LINEAR_EXTS) {
-               xfs_iext_realloc_direct(ifp, size);
-       }
-}
-
-/*
- * Free incore file extents.
- */
-void
-xfs_iext_destroy(
-       xfs_ifork_t     *ifp)           /* inode fork pointer */
-{
-       if (ifp->if_flags & XFS_IFEXTIREC) {
-               int     erp_idx;
-               int     nlists;
-
-               nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-               for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
-                       xfs_iext_irec_remove(ifp, erp_idx);
-               }
-               ifp->if_flags &= ~XFS_IFEXTIREC;
-       } else if (ifp->if_real_bytes) {
-               kmem_free(ifp->if_u1.if_extents);
-       } else if (ifp->if_bytes) {
-               memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
-                       sizeof(xfs_bmbt_rec_t));
-       }
-       ifp->if_u1.if_extents = NULL;
-       ifp->if_real_bytes = 0;
-       ifp->if_bytes = 0;
-}
-
-/*
- * Return a pointer to the extent record for file system block bno.
- */
-xfs_bmbt_rec_host_t *                  /* pointer to found extent record */
-xfs_iext_bno_to_ext(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_fileoff_t   bno,            /* block number to search for */
-       xfs_extnum_t    *idxp)          /* index of target extent */
-{
-       xfs_bmbt_rec_host_t *base;      /* pointer to first extent */
-       xfs_filblks_t   blockcount = 0; /* number of blocks in extent */
-       xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
-       xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
-       int             high;           /* upper boundary in search */
-       xfs_extnum_t    idx = 0;        /* index of target extent */
-       int             low;            /* lower boundary in search */
-       xfs_extnum_t    nextents;       /* number of file extents */
-       xfs_fileoff_t   startoff = 0;   /* start offset of extent */
-
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       if (nextents == 0) {
-               *idxp = 0;
-               return NULL;
-       }
-       low = 0;
-       if (ifp->if_flags & XFS_IFEXTIREC) {
-               /* Find target extent list */
-               int     erp_idx = 0;
-               erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
-               base = erp->er_extbuf;
-               high = erp->er_extcount - 1;
-       } else {
-               base = ifp->if_u1.if_extents;
-               high = nextents - 1;
-       }
-       /* Binary search extent records */
-       while (low <= high) {
-               idx = (low + high) >> 1;
-               ep = base + idx;
-               startoff = xfs_bmbt_get_startoff(ep);
-               blockcount = xfs_bmbt_get_blockcount(ep);
-               if (bno < startoff) {
-                       high = idx - 1;
-               } else if (bno >= startoff + blockcount) {
-                       low = idx + 1;
-               } else {
-                       /* Convert back to file-based extent index */
-                       if (ifp->if_flags & XFS_IFEXTIREC) {
-                               idx += erp->er_extoff;
-                       }
-                       *idxp = idx;
-                       return ep;
-               }
-       }
-       /* Convert back to file-based extent index */
-       if (ifp->if_flags & XFS_IFEXTIREC) {
-               idx += erp->er_extoff;
-       }
-       if (bno >= startoff + blockcount) {
-               if (++idx == nextents) {
-                       ep = NULL;
-               } else {
-                       ep = xfs_iext_get_ext(ifp, idx);
-               }
-       }
-       *idxp = idx;
-       return ep;
-}
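
The search above is an ordinary binary search over non-overlapping, sorted (startoff, blockcount) ranges. A minimal sketch on a flat array, ignoring the fork's inline/direct/indirect layouts; it returns the index of the extent containing bno, or the index of the first extent past it (n if none), mirroring the found/next-extent behaviour of the function above:

#include <stddef.h>

struct range { unsigned long long startoff, blockcount; };

static size_t bno_to_idx(const struct range *r, size_t n,
                         unsigned long long bno)
{
        size_t low = 0, high = n;           /* half-open [low, high) */

        while (low < high) {
                size_t mid = low + (high - low) / 2;

                if (bno < r[mid].startoff)
                        high = mid;
                else if (bno >= r[mid].startoff + r[mid].blockcount)
                        low = mid + 1;
                else
                        return mid;         /* bno inside r[mid] */
        }
        return low;                         /* first extent past bno */
}
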
-
-/*
- * Return a pointer to the indirection array entry containing the
- * extent record for filesystem block bno. Store the index of the
- * target irec in *erp_idxp.
- */
-xfs_ext_irec_t *                       /* pointer to found extent record */
-xfs_iext_bno_to_irec(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_fileoff_t   bno,            /* block number to search for */
-       int             *erp_idxp)      /* irec index of target ext list */
-{
-       xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
-       xfs_ext_irec_t  *erp_next;      /* next indirection array entry */
-       int             erp_idx;        /* indirection array index */
-       int             nlists;         /* number of extent irec's (lists) */
-       int             high;           /* binary search upper limit */
-       int             low;            /* binary search lower limit */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       erp_idx = 0;
-       low = 0;
-       high = nlists - 1;
-       while (low <= high) {
-               erp_idx = (low + high) >> 1;
-               erp = &ifp->if_u1.if_ext_irec[erp_idx];
-               erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
-               if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
-                       high = erp_idx - 1;
-               } else if (erp_next && bno >=
-                          xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
-                       low = erp_idx + 1;
-               } else {
-                       break;
-               }
-       }
-       *erp_idxp = erp_idx;
-       return erp;
-}
-
-/*
- * Return a pointer to the indirection array entry containing the
- * extent record at file extent index *idxp. Store the index of the
- * target irec in *erp_idxp and store the page index of the target
- * extent record in *idxp.
- */
-xfs_ext_irec_t *
-xfs_iext_idx_to_irec(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    *idxp,          /* extent index (file -> page) */
-       int             *erp_idxp,      /* pointer to target irec */
-       int             realloc)        /* new bytes were just added */
-{
-       xfs_ext_irec_t  *prev;          /* pointer to previous irec */
-       xfs_ext_irec_t  *erp = NULL;    /* pointer to current irec */
-       int             erp_idx;        /* indirection array index */
-       int             nlists;         /* number of irec's (ex lists) */
-       int             high;           /* binary search upper limit */
-       int             low;            /* binary search lower limit */
-       xfs_extnum_t    page_idx = *idxp; /* extent index in target list */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       ASSERT(page_idx >= 0);
-       ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
-       ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
-
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       erp_idx = 0;
-       low = 0;
-       high = nlists - 1;
-
-       /* Binary search extent irec's */
-       while (low <= high) {
-               erp_idx = (low + high) >> 1;
-               erp = &ifp->if_u1.if_ext_irec[erp_idx];
-               prev = erp_idx > 0 ? erp - 1 : NULL;
-               if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
-                    realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
-                       high = erp_idx - 1;
-               } else if (page_idx > erp->er_extoff + erp->er_extcount ||
-                          (page_idx == erp->er_extoff + erp->er_extcount &&
-                           !realloc)) {
-                       low = erp_idx + 1;
-               } else if (page_idx == erp->er_extoff + erp->er_extcount &&
-                          erp->er_extcount == XFS_LINEAR_EXTS) {
-                       ASSERT(realloc);
-                       page_idx = 0;
-                       erp_idx++;
-                       erp = erp_idx < nlists ? erp + 1 : NULL;
-                       break;
-               } else {
-                       page_idx -= erp->er_extoff;
-                       break;
-               }
-       }
-       *idxp = page_idx;
-       *erp_idxp = erp_idx;
-       return erp;
-}
-
-/*
- * Allocate and initialize an indirection array once the space needed
- * for incore extents increases above XFS_IEXT_BUFSZ.
- */
-void
-xfs_iext_irec_init(
-       xfs_ifork_t     *ifp)           /* inode fork pointer */
-{
-       xfs_ext_irec_t  *erp;           /* indirection array pointer */
-       xfs_extnum_t    nextents;       /* number of extents in file */
-
-       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       ASSERT(nextents <= XFS_LINEAR_EXTS);
-
-       erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
-
-       if (nextents == 0) {
-               ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
-       } else if (!ifp->if_real_bytes) {
-               xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
-       } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
-               xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
-       }
-       erp->er_extbuf = ifp->if_u1.if_extents;
-       erp->er_extcount = nextents;
-       erp->er_extoff = 0;
-
-       ifp->if_flags |= XFS_IFEXTIREC;
-       ifp->if_real_bytes = XFS_IEXT_BUFSZ;
-       ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
-       ifp->if_u1.if_ext_irec = erp;
-}
-
-/*
- * Allocate and initialize a new entry in the indirection array.
- */
-xfs_ext_irec_t *
-xfs_iext_irec_new(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             erp_idx)        /* index for new irec */
-{
-       xfs_ext_irec_t  *erp;           /* indirection array pointer */
-       int             i;              /* loop counter */
-       int             nlists;         /* number of irec's (ex lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-
-       /* Resize indirection array */
-       xfs_iext_realloc_indirect(ifp, ++nlists *
-                                 sizeof(xfs_ext_irec_t));
-       /*
-        * Move records down in the array so the
-        * new page can use erp_idx.
-        */
-       erp = ifp->if_u1.if_ext_irec;
-       for (i = nlists - 1; i > erp_idx; i--) {
-               memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
-       }
-       ASSERT(i == erp_idx);
-
-       /* Initialize new extent record */
-       erp = ifp->if_u1.if_ext_irec;
-       erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
-       ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
-       memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
-       erp[erp_idx].er_extcount = 0;
-       erp[erp_idx].er_extoff = erp_idx > 0 ?
-               erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
-       return &erp[erp_idx];
-}
-
-/*
- * Remove a record from the indirection array.
- */
-void
-xfs_iext_irec_remove(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             erp_idx)        /* irec index to remove */
-{
-       xfs_ext_irec_t  *erp;           /* indirection array pointer */
-       int             i;              /* loop counter */
-       int             nlists;         /* number of irec's (ex lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       erp = &ifp->if_u1.if_ext_irec[erp_idx];
-       if (erp->er_extbuf) {
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
-                       -erp->er_extcount);
-               kmem_free(erp->er_extbuf);
-       }
-       /* Compact extent records */
-       erp = ifp->if_u1.if_ext_irec;
-       for (i = erp_idx; i < nlists - 1; i++) {
-               memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
-       }
-       /*
-        * Manually free the last extent record from the indirection
-        * array.  A call to xfs_iext_realloc_indirect() with a size
-        * of zero would result in a call to xfs_iext_destroy() which
-        * would in turn call this function again, creating a nasty
-        * infinite loop.
-        */
-       if (--nlists) {
-               xfs_iext_realloc_indirect(ifp,
-                       nlists * sizeof(xfs_ext_irec_t));
-       } else {
-               kmem_free(ifp->if_u1.if_ext_irec);
-       }
-       ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
-}
-
-/*
- * This is called to clean up large amounts of unused memory allocated
- * by the indirection array.  Before compacting anything though, verify
- * that the indirection array is still needed and switch back to the
- * linear extent list (or even the inline buffer) if possible.  The
- * compaction policy is as follows:
- *
- *    Full Compaction: Extents fit into a single page (or inline buffer)
- * Partial Compaction: Extents occupy less than 50% of allocated space
- *      No Compaction: Extents occupy at least 50% of allocated space
- */
-void
-xfs_iext_irec_compact(
-       xfs_ifork_t     *ifp)           /* inode fork pointer */
-{
-       xfs_extnum_t    nextents;       /* number of extents in file */
-       int             nlists;         /* number of irec's (ex lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-
-       if (nextents == 0) {
-               xfs_iext_destroy(ifp);
-       } else if (nextents <= XFS_INLINE_EXTS) {
-               xfs_iext_indirect_to_direct(ifp);
-               xfs_iext_direct_to_inline(ifp, nextents);
-       } else if (nextents <= XFS_LINEAR_EXTS) {
-               xfs_iext_indirect_to_direct(ifp);
-       } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
-               xfs_iext_irec_compact_pages(ifp);
-       }
-}
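
The policy comment above maps directly onto a pure decision function. A sketch with illustrative names, again assuming 2 inline extents and 256 extents per 4k buffer:

enum compact_action { DESTROY, TO_INLINE, TO_DIRECT, PARTIAL, NONE };

static enum compact_action compact_policy(int nextents, int nlists)
{
        if (nextents == 0)
                return DESTROY;                 /* free everything */
        if (nextents <= 2)                      /* XFS_INLINE_EXTS */
                return TO_INLINE;
        if (nextents <= 256)                    /* XFS_LINEAR_EXTS */
                return TO_DIRECT;
        if (nextents < (nlists * 256) / 2)      /* under 50% utilised */
                return PARTIAL;
        return NONE;
}
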
-
-/*
- * Combine extents from neighboring extent pages.
- */
-void
-xfs_iext_irec_compact_pages(
-       xfs_ifork_t     *ifp)           /* inode fork pointer */
-{
-       xfs_ext_irec_t  *erp, *erp_next;/* pointers to irec entries */
-       int             erp_idx = 0;    /* indirection array index */
-       int             nlists;         /* number of irec's (ex lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       while (erp_idx < nlists - 1) {
-               erp = &ifp->if_u1.if_ext_irec[erp_idx];
-               erp_next = erp + 1;
-               if (erp_next->er_extcount <=
-                   (XFS_LINEAR_EXTS - erp->er_extcount)) {
-                       memcpy(&erp->er_extbuf[erp->er_extcount],
-                               erp_next->er_extbuf, erp_next->er_extcount *
-                               sizeof(xfs_bmbt_rec_t));
-                       erp->er_extcount += erp_next->er_extcount;
-                       /*
-                        * Free page before removing extent record
-                        * so er_extoffs don't get modified in
-                        * xfs_iext_irec_remove.
-                        */
-                       kmem_free(erp_next->er_extbuf);
-                       erp_next->er_extbuf = NULL;
-                       xfs_iext_irec_remove(ifp, erp_idx + 1);
-                       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-               } else {
-                       erp_idx++;
-               }
-       }
-}
-
-/*
- * This is called to update the er_extoff field in the indirection
- * array when extents have been added or removed from one of the
- * extent lists. erp_idx contains the irec index to begin updating
- * at and ext_diff contains the number of extents that were added
- * or removed.
- */
-void
-xfs_iext_irec_update_extoffs(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             erp_idx,        /* irec index to update */
-       int             ext_diff)       /* number of new extents */
-{
-       int             i;              /* loop counter */
-       int             nlists;         /* number of irec's (ex lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       for (i = erp_idx; i < nlists; i++) {
-               ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
-       }
-}
diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/xfs_inode_fork.h
deleted file mode 100644 (file)
index 7d3b1ed..0000000
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef        __XFS_INODE_FORK_H__
-#define        __XFS_INODE_FORK_H__
-
-struct xfs_inode_log_item;
-struct xfs_dinode;
-
-/*
- * The following xfs_ext_irec_t struct introduces a second (top) level
- * to the in-core extent allocation scheme. These structs are allocated
- * in a contiguous block, creating an indirection array where each entry
- * (irec) contains a pointer to a buffer of in-core extent records which
- * it manages. Each extent buffer is 4k in size, since 4k is the system
- * page size on Linux i386 and systems with larger page sizes don't seem
- * to gain much, if anything, by using their native page size as the
- * extent buffer size. Also, using 4k extent buffers everywhere provides
- * a consistent interface for CXFS across different platforms.
- *
- * There is currently no limit on the number of irec's (extent lists)
- * allowed, so heavily fragmented files may require an indirection array
- * which spans multiple system pages of memory. The number of extents
- * which would require this amount of contiguous memory is very large
- * and should not cause problems in the foreseeable future. However,
- * if the memory needed for the contiguous array ever becomes a problem,
- * it is possible that a third level of indirection may be required.
- */
-typedef struct xfs_ext_irec {
-       xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */
-       xfs_extnum_t    er_extoff;      /* extent offset in file */
-       xfs_extnum_t    er_extcount;    /* number of extents in page/block */
-} xfs_ext_irec_t;
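
To make the two-level mapping concrete: each irec covers extents [er_extoff, er_extoff + er_extcount). A simple (non-kernel) linear walk from a file-wide extent index to an (irec, page index) pair looks like this; the kernel's xfs_iext_idx_to_irec() binary-searches er_extoff instead:

struct irec_view { int er_extoff, er_extcount; };

static int idx_to_irec(const struct irec_view *irec, int nlists,
                       int idx, int *page_idx)
{
        for (int i = 0; i < nlists; i++) {
                if (idx < irec[i].er_extoff + irec[i].er_extcount) {
                        *page_idx = idx - irec[i].er_extoff;
                        return i;               /* irec index */
                }
        }
        return -1;                              /* idx out of range */
}
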
-
-/*
- * File incore extent information, present for each of data & attr forks.
- */
-#define        XFS_IEXT_BUFSZ          4096
-#define        XFS_LINEAR_EXTS         (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
-#define        XFS_INLINE_EXTS         2
-#define        XFS_INLINE_DATA         32
-typedef struct xfs_ifork {
-       int                     if_bytes;       /* bytes in if_u1 */
-       int                     if_real_bytes;  /* bytes allocated in if_u1 */
-       struct xfs_btree_block  *if_broot;      /* file's incore btree root */
-       short                   if_broot_bytes; /* bytes allocated for root */
-       unsigned char           if_flags;       /* per-fork flags */
-       union {
-               xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
-               xfs_ext_irec_t  *if_ext_irec;   /* irec map file exts */
-               char            *if_data;       /* inline file data */
-       } if_u1;
-       union {
-               xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
-                                               /* very small file extents */
-               char            if_inline_data[XFS_INLINE_DATA];
-                                               /* very small file data */
-               xfs_dev_t       if_rdev;        /* dev number if special */
-               uuid_t          if_uuid;        /* mount point value */
-       } if_u2;
-} xfs_ifork_t;
-
-/*
- * Per-fork incore inode flags.
- */
-#define        XFS_IFINLINE    0x01    /* Inline data is read in */
-#define        XFS_IFEXTENTS   0x02    /* All extent pointers are read in */
-#define        XFS_IFBROOT     0x04    /* i_broot points to the bmap b-tree root */
-#define        XFS_IFEXTIREC   0x08    /* Indirection array of extent blocks */
-
-/*
- * Fork handling.
- */
-
-#define XFS_IFORK_Q(ip)                        ((ip)->i_d.di_forkoff != 0)
-#define XFS_IFORK_BOFF(ip)             ((int)((ip)->i_d.di_forkoff << 3))
-
-#define XFS_IFORK_PTR(ip,w)            \
-       ((w) == XFS_DATA_FORK ? \
-               &(ip)->i_df : \
-               (ip)->i_afp)
-#define XFS_IFORK_DSIZE(ip) \
-       (XFS_IFORK_Q(ip) ? \
-               XFS_IFORK_BOFF(ip) : \
-               XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
-#define XFS_IFORK_ASIZE(ip) \
-       (XFS_IFORK_Q(ip) ? \
-               XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
-                       XFS_IFORK_BOFF(ip) : \
-               0)
-#define XFS_IFORK_SIZE(ip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               XFS_IFORK_DSIZE(ip) : \
-               XFS_IFORK_ASIZE(ip))
-#define XFS_IFORK_FORMAT(ip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               (ip)->i_d.di_format : \
-               (ip)->i_d.di_aformat)
-#define XFS_IFORK_FMT_SET(ip,w,n) \
-       ((w) == XFS_DATA_FORK ? \
-               ((ip)->i_d.di_format = (n)) : \
-               ((ip)->i_d.di_aformat = (n)))
-#define XFS_IFORK_NEXTENTS(ip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               (ip)->i_d.di_nextents : \
-               (ip)->i_d.di_anextents)
-#define XFS_IFORK_NEXT_SET(ip,w,n) \
-       ((w) == XFS_DATA_FORK ? \
-               ((ip)->i_d.di_nextents = (n)) : \
-               ((ip)->i_d.di_anextents = (n)))
-#define XFS_IFORK_MAXEXT(ip, w) \
-       (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
-
-int            xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
-void           xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
-                               struct xfs_inode_log_item *, int);
-void           xfs_idestroy_fork(struct xfs_inode *, int);
-void           xfs_idata_realloc(struct xfs_inode *, int, int);
-void           xfs_iroot_realloc(struct xfs_inode *, int, int);
-int            xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
-int            xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
-                                 int);
-
-struct xfs_bmbt_rec_host *
-               xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
-void           xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t,
-                               struct xfs_bmbt_irec *, int);
-void           xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
-void           xfs_iext_add_indirect_multi(struct xfs_ifork *, int,
-                                           xfs_extnum_t, int);
-void           xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int);
-void           xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int);
-void           xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int);
-void           xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int);
-void           xfs_iext_realloc_direct(struct xfs_ifork *, int);
-void           xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t);
-void           xfs_iext_inline_to_direct(struct xfs_ifork *, int);
-void           xfs_iext_destroy(struct xfs_ifork *);
-struct xfs_bmbt_rec_host *
-               xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *);
-struct xfs_ext_irec *
-               xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *);
-struct xfs_ext_irec *
-               xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *,
-                                    int);
-void           xfs_iext_irec_init(struct xfs_ifork *);
-struct xfs_ext_irec *
-               xfs_iext_irec_new(struct xfs_ifork *, int);
-void           xfs_iext_irec_remove(struct xfs_ifork *, int);
-void           xfs_iext_irec_compact(struct xfs_ifork *);
-void           xfs_iext_irec_compact_pages(struct xfs_ifork *);
-void           xfs_iext_irec_compact_full(struct xfs_ifork *);
-void           xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
-
-extern struct kmem_zone        *xfs_ifork_zone;
-
-#endif /* __XFS_INODE_FORK_H__ */
index a640137b357326de1d4df03ac63b9b918ec92588..de5a7be36e603e5525ac5ba3c49264aa7af6d5d1 100644 (file)
@@ -788,5 +788,5 @@ xfs_inode_item_format_convert(
                in_f->ilf_boffset = in_f64->ilf_boffset;
                return 0;
        }
-       return EFSCORRUPTED;
+       return -EFSCORRUPTED;
 }
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
deleted file mode 100644 (file)
index 90efdaf..0000000
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_INUM_H__
-#define        __XFS_INUM_H__
-
-/*
- * Inode number format:
- * low inopblog bits - offset in block
- * next agblklog bits - block number in ag
- * next agno_log bits - ag number
- * high agno_log-agblklog-inopblog bits - 0
- */
-
-struct xfs_mount;
-
-#define        XFS_INO_MASK(k)                 (__uint32_t)((1ULL << (k)) - 1)
-#define        XFS_INO_OFFSET_BITS(mp)         (mp)->m_sb.sb_inopblog
-#define        XFS_INO_AGBNO_BITS(mp)          (mp)->m_sb.sb_agblklog
-#define        XFS_INO_AGINO_BITS(mp)          (mp)->m_agino_log
-#define        XFS_INO_AGNO_BITS(mp)           (mp)->m_agno_log
-#define        XFS_INO_BITS(mp)                \
-       XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)
-#define        XFS_INO_TO_AGNO(mp,i)           \
-       ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
-#define        XFS_INO_TO_AGINO(mp,i)          \
-       ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
-#define        XFS_INO_TO_AGBNO(mp,i)          \
-       (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
-               XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
-#define        XFS_INO_TO_OFFSET(mp,i)         \
-       ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
-#define        XFS_INO_TO_FSB(mp,i)            \
-       XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
-#define        XFS_AGINO_TO_INO(mp,a,i)        \
-       (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
-#define        XFS_AGINO_TO_AGBNO(mp,i)        ((i) >> XFS_INO_OFFSET_BITS(mp))
-#define        XFS_AGINO_TO_OFFSET(mp,i)       \
-       ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
-#define        XFS_OFFBNO_TO_AGINO(mp,b,o)     \
-       ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
-
-#if XFS_BIG_INUMS
-#define        XFS_MAXINUMBER          ((xfs_ino_t)((1ULL << 56) - 1ULL))
-#else
-#define        XFS_MAXINUMBER          ((xfs_ino_t)((1ULL << 32) - 1ULL))
-#endif
-#define        XFS_MAXINUMBER_32       ((xfs_ino_t)((1ULL << 32) - 1ULL))
-
-#endif /* __XFS_INUM_H__ */
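The deleted xfs_inum.h spelled out the inode number encoding: the low sb_inopblog bits give the inode's offset within its block, the next sb_agblklog bits the block within the allocation group, and the bits above those the AG number. A self-contained demo of that unpacking, using made-up geometry (3 offset bits, 16 AG-block bits) in place of real superblock values:

#include <stdint.h>
#include <stdio.h>

/* Illustrative geometry: 8 inodes per block (inopblog = 3) and 2^16
 * blocks per AG (agblklog = 16). Real values come from the superblock. */
#define INOPBLOG	3
#define AGBLKLOG	16
#define AGINO_BITS	(INOPBLOG + AGBLKLOG)
#define MASK(k)		((1ULL << (k)) - 1)

int main(void)
{
	/* AG 2, block 100 within the AG, inode 5 within the block */
	uint64_t ino = (2ULL << AGINO_BITS) | (100 << INOPBLOG) | 5;

	printf("agno   = %llu\n", (unsigned long long)(ino >> AGINO_BITS));
	printf("agbno  = %llu\n",
	       (unsigned long long)((ino >> INOPBLOG) & MASK(AGBLKLOG)));
	printf("offset = %llu\n", (unsigned long long)(ino & MASK(INOPBLOG)));
	return 0;
}

These are the same shift-and-mask operations XFS_INO_TO_AGNO, XFS_INO_TO_AGBNO and XFS_INO_TO_OFFSET performed above, with the logs hard-coded for the demo.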
index 8bc1bbce74517c95675dc661412e946e456fedbe..3799695b92495a02e7a15722fbe77b842b77efa2 100644 (file)
@@ -207,7 +207,7 @@ xfs_open_by_handle(
        struct path             path;
 
        if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
+               return -EPERM;
 
        dentry = xfs_handlereq_to_dentry(parfilp, hreq);
        if (IS_ERR(dentry))
@@ -216,7 +216,7 @@ xfs_open_by_handle(
 
        /* Restrict xfs_open_by_handle to directories & regular files. */
        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
-               error = -XFS_ERROR(EPERM);
+               error = -EPERM;
                goto out_dput;
        }
 
@@ -228,18 +228,18 @@ xfs_open_by_handle(
        fmode = OPEN_FMODE(permflag);
        if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
            (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
-               error = -XFS_ERROR(EPERM);
+               error = -EPERM;
                goto out_dput;
        }
 
        if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
-               error = -XFS_ERROR(EACCES);
+               error = -EACCES;
                goto out_dput;
        }
 
        /* Can't write directories. */
        if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
-               error = -XFS_ERROR(EISDIR);
+               error = -EISDIR;
                goto out_dput;
        }
 
@@ -282,7 +282,7 @@ xfs_readlink_by_handle(
        int                     error;
 
        if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
+               return -EPERM;
 
        dentry = xfs_handlereq_to_dentry(parfilp, hreq);
        if (IS_ERR(dentry))
@@ -290,22 +290,22 @@ xfs_readlink_by_handle(
 
        /* Restrict this handle operation to symlinks only. */
        if (!S_ISLNK(dentry->d_inode->i_mode)) {
-               error = -XFS_ERROR(EINVAL);
+               error = -EINVAL;
                goto out_dput;
        }
 
        if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
-               error = -XFS_ERROR(EFAULT);
+               error = -EFAULT;
                goto out_dput;
        }
 
        link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
        if (!link) {
-               error = -XFS_ERROR(ENOMEM);
+               error = -ENOMEM;
                goto out_dput;
        }
 
-       error = -xfs_readlink(XFS_I(dentry->d_inode), link);
+       error = xfs_readlink(XFS_I(dentry->d_inode), link);
        if (error)
                goto out_kfree;
        error = readlink_copy(hreq->ohandle, olen, link);
@@ -330,10 +330,10 @@ xfs_set_dmattrs(
        int             error;
 
        if (!capable(CAP_SYS_ADMIN))
-               return XFS_ERROR(EPERM);
+               return -EPERM;
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
@@ -364,9 +364,9 @@ xfs_fssetdm_by_handle(
        struct dentry           *dentry;
 
        if (!capable(CAP_MKNOD))
-               return -XFS_ERROR(EPERM);
+               return -EPERM;
        if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
 
        error = mnt_want_write_file(parfilp);
        if (error)
@@ -379,16 +379,16 @@ xfs_fssetdm_by_handle(
        }
 
        if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
-               error = -XFS_ERROR(EPERM);
+               error = -EPERM;
                goto out;
        }
 
        if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
-               error = -XFS_ERROR(EFAULT);
+               error = -EFAULT;
                goto out;
        }
 
-       error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
+       error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
                                 fsd.fsd_dmstate);
 
  out:
@@ -409,18 +409,18 @@ xfs_attrlist_by_handle(
        char                    *kbuf;
 
        if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
+               return -EPERM;
        if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        if (al_hreq.buflen < sizeof(struct attrlist) ||
            al_hreq.buflen > XATTR_LIST_MAX)
-               return -XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        /*
         * Reject flags, only allow namespaces.
         */
        if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
-               return -XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
        if (IS_ERR(dentry))
@@ -431,7 +431,7 @@ xfs_attrlist_by_handle(
                goto out_dput;
 
        cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-       error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
+       error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
                                        al_hreq.flags, cursor);
        if (error)
                goto out_kfree;
@@ -455,20 +455,20 @@ xfs_attrmulti_attr_get(
        __uint32_t              flags)
 {
        unsigned char           *kbuf;
-       int                     error = EFAULT;
+       int                     error = -EFAULT;
 
        if (*len > XATTR_SIZE_MAX)
-               return EINVAL;
+               return -EINVAL;
        kbuf = kmem_zalloc_large(*len, KM_SLEEP);
        if (!kbuf)
-               return ENOMEM;
+               return -ENOMEM;
 
        error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
        if (error)
                goto out_kfree;
 
        if (copy_to_user(ubuf, kbuf, *len))
-               error = EFAULT;
+               error = -EFAULT;
 
 out_kfree:
        kmem_free(kbuf);
@@ -484,20 +484,17 @@ xfs_attrmulti_attr_set(
        __uint32_t              flags)
 {
        unsigned char           *kbuf;
-       int                     error = EFAULT;
 
        if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-               return EPERM;
+               return -EPERM;
        if (len > XATTR_SIZE_MAX)
-               return EINVAL;
+               return -EINVAL;
 
        kbuf = memdup_user(ubuf, len);
        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);
 
-       error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
-
-       return error;
+       return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
 }
 
 int
@@ -507,7 +504,7 @@ xfs_attrmulti_attr_remove(
        __uint32_t              flags)
 {
        if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-               return EPERM;
+               return -EPERM;
        return xfs_attr_remove(XFS_I(inode), name, flags);
 }
 
@@ -524,9 +521,9 @@ xfs_attrmulti_by_handle(
        unsigned char           *attr_name;
 
        if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
+               return -EPERM;
        if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
 
        /* overflow check */
        if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
@@ -536,18 +533,18 @@ xfs_attrmulti_by_handle(
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);
 
-       error = E2BIG;
+       error = -E2BIG;
        size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
        if (!size || size > 16 * PAGE_SIZE)
                goto out_dput;
 
        ops = memdup_user(am_hreq.ops, size);
        if (IS_ERR(ops)) {
-               error = -PTR_ERR(ops);
+               error = PTR_ERR(ops);
                goto out_dput;
        }
 
-       error = ENOMEM;
+       error = -ENOMEM;
        attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
        if (!attr_name)
                goto out_kfree_ops;
@@ -557,7 +554,7 @@ xfs_attrmulti_by_handle(
                ops[i].am_error = strncpy_from_user((char *)attr_name,
                                ops[i].am_attrname, MAXNAMELEN);
                if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
-                       error = ERANGE;
+                       error = -ERANGE;
                if (ops[i].am_error < 0)
                        break;
 
@@ -588,19 +585,19 @@ xfs_attrmulti_by_handle(
                        mnt_drop_write_file(parfilp);
                        break;
                default:
-                       ops[i].am_error = EINVAL;
+                       ops[i].am_error = -EINVAL;
                }
        }
 
        if (copy_to_user(am_hreq.ops, ops, size))
-               error = XFS_ERROR(EFAULT);
+               error = -EFAULT;
 
        kfree(attr_name);
  out_kfree_ops:
        kfree(ops);
  out_dput:
        dput(dentry);
-       return -error;
+       return error;
 }
 
 int
@@ -625,16 +622,16 @@ xfs_ioc_space(
         */
        if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
            !capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
+               return -EPERM;
 
        if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
-               return -XFS_ERROR(EPERM);
+               return -EPERM;
 
        if (!(filp->f_mode & FMODE_WRITE))
-               return -XFS_ERROR(EBADF);
+               return -EBADF;
 
        if (!S_ISREG(inode->i_mode))
-               return -XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        error = mnt_want_write_file(filp);
        if (error)
@@ -652,7 +649,7 @@ xfs_ioc_space(
                bf->l_start += XFS_ISIZE(ip);
                break;
        default:
-               error = XFS_ERROR(EINVAL);
+               error = -EINVAL;
                goto out_unlock;
        }
 
@@ -669,7 +666,7 @@ xfs_ioc_space(
        case XFS_IOC_UNRESVSP:
        case XFS_IOC_UNRESVSP64:
                if (bf->l_len <= 0) {
-                       error = XFS_ERROR(EINVAL);
+                       error = -EINVAL;
                        goto out_unlock;
                }
                break;
@@ -682,7 +679,7 @@ xfs_ioc_space(
            bf->l_start > mp->m_super->s_maxbytes ||
            bf->l_start + bf->l_len < 0 ||
            bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) {
-               error = XFS_ERROR(EINVAL);
+               error = -EINVAL;
                goto out_unlock;
        }
 
@@ -723,7 +720,7 @@ xfs_ioc_space(
                break;
        default:
                ASSERT(0);
-               error = XFS_ERROR(EINVAL);
+               error = -EINVAL;
        }
 
        if (error)
@@ -739,7 +736,7 @@ xfs_ioc_space(
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
-       if (!(ioflags & IO_INVIS)) {
+       if (!(ioflags & XFS_IO_INVIS)) {
                ip->i_d.di_mode &= ~S_ISUID;
                if (ip->i_d.di_mode & S_IXGRP)
                        ip->i_d.di_mode &= ~S_ISGID;
@@ -759,7 +756,7 @@ xfs_ioc_space(
 out_unlock:
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
        mnt_drop_write_file(filp);
-       return -error;
+       return error;
 }
 
 STATIC int
@@ -781,41 +778,41 @@ xfs_ioc_bulkstat(
                return -EPERM;
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return -XFS_ERROR(EIO);
+               return -EIO;
 
        if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
 
        if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
 
        if ((count = bulkreq.icount) <= 0)
-               return -XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        if (bulkreq.ubuffer == NULL)
-               return -XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        if (cmd == XFS_IOC_FSINUMBERS)
                error = xfs_inumbers(mp, &inlast, &count,
                                        bulkreq.ubuffer, xfs_inumbers_fmt);
        else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
-               error = xfs_bulkstat_single(mp, &inlast,
-                                               bulkreq.ubuffer, &done);
+               error = xfs_bulkstat_one(mp, inlast, bulkreq.ubuffer,
+                                       sizeof(xfs_bstat_t), NULL, &done);
        else    /* XFS_IOC_FSBULKSTAT */
                error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
                                     sizeof(xfs_bstat_t), bulkreq.ubuffer,
                                     &done);
 
        if (error)
-               return -error;
+               return error;
 
        if (bulkreq.ocount != NULL) {
                if (copy_to_user(bulkreq.lastip, &inlast,
                                                sizeof(xfs_ino_t)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
 
                if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
        }
 
        return 0;
@@ -831,7 +828,7 @@ xfs_ioc_fsgeometry_v1(
 
        error = xfs_fs_geometry(mp, &fsgeo, 3);
        if (error)
-               return -error;
+               return error;
 
        /*
         * Caller should have passed an argument of type
@@ -839,7 +836,7 @@ xfs_ioc_fsgeometry_v1(
         * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
         */
        if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        return 0;
 }
 
@@ -853,10 +850,10 @@ xfs_ioc_fsgeometry(
 
        error = xfs_fs_geometry(mp, &fsgeo, 4);
        if (error)
-               return -error;
+               return error;
 
        if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        return 0;
 }
 
@@ -1041,16 +1038,16 @@ xfs_ioctl_setattr(
        trace_xfs_ioctl_setattr(ip);
 
        if (mp->m_flags & XFS_MOUNT_RDONLY)
-               return XFS_ERROR(EROFS);
+               return -EROFS;
        if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        /*
         * Disallow 32bit project ids when projid32bit feature is not enabled.
         */
        if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
                        !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        /*
         * If disk quotas is on, we make sure that the dquots do exist on disk,
@@ -1088,7 +1085,7 @@ xfs_ioctl_setattr(
         * CAP_FSETID capability is applicable.
         */
        if (!inode_owner_or_capable(VFS_I(ip))) {
-               code = XFS_ERROR(EPERM);
+               code = -EPERM;
                goto error_return;
        }
 
@@ -1099,7 +1096,7 @@ xfs_ioctl_setattr(
         */
        if (mask & FSX_PROJID) {
                if (current_user_ns() != &init_user_ns) {
-                       code = XFS_ERROR(EINVAL);
+                       code = -EINVAL;
                        goto error_return;
                }
 
@@ -1122,7 +1119,7 @@ xfs_ioctl_setattr(
                if (ip->i_d.di_nextents &&
                    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
                     fa->fsx_extsize)) {
-                       code = XFS_ERROR(EINVAL);       /* EFBIG? */
+                       code = -EINVAL; /* EFBIG? */
                        goto error_return;
                }
 
@@ -1141,7 +1138,7 @@ xfs_ioctl_setattr(
 
                        extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
                        if (extsize_fsb > MAXEXTLEN) {
-                               code = XFS_ERROR(EINVAL);
+                               code = -EINVAL;
                                goto error_return;
                        }
 
@@ -1153,13 +1150,13 @@ xfs_ioctl_setattr(
                        } else {
                                size = mp->m_sb.sb_blocksize;
                                if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
-                                       code = XFS_ERROR(EINVAL);
+                                       code = -EINVAL;
                                        goto error_return;
                                }
                        }
 
                        if (fa->fsx_extsize % size) {
-                               code = XFS_ERROR(EINVAL);
+                               code = -EINVAL;
                                goto error_return;
                        }
                }
@@ -1173,7 +1170,7 @@ xfs_ioctl_setattr(
                if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
                    (XFS_IS_REALTIME_INODE(ip)) !=
                    (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
-                       code = XFS_ERROR(EINVAL);       /* EFBIG? */
+                       code = -EINVAL; /* EFBIG? */
                        goto error_return;
                }
 
@@ -1184,7 +1181,7 @@ xfs_ioctl_setattr(
                        if ((mp->m_sb.sb_rblocks == 0) ||
                            (mp->m_sb.sb_rextsize == 0) ||
                            (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
-                               code = XFS_ERROR(EINVAL);
+                               code = -EINVAL;
                                goto error_return;
                        }
                }
@@ -1198,7 +1195,7 @@ xfs_ioctl_setattr(
                     (fa->fsx_xflags &
                                (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
                    !capable(CAP_LINUX_IMMUTABLE)) {
-                       code = XFS_ERROR(EPERM);
+                       code = -EPERM;
                        goto error_return;
                }
        }
@@ -1301,7 +1298,7 @@ xfs_ioc_fssetxattr(
                return error;
        error = xfs_ioctl_setattr(ip, &fa, mask);
        mnt_drop_write_file(filp);
-       return -error;
+       return error;
 }
 
 STATIC int
@@ -1346,7 +1343,7 @@ xfs_ioc_setxflags(
                return error;
        error = xfs_ioctl_setattr(ip, &fa, mask);
        mnt_drop_write_file(filp);
-       return -error;
+       return error;
 }
 
 STATIC int
@@ -1356,7 +1353,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
 
        /* copy only getbmap portion (not getbmapx) */
        if (copy_to_user(base, bmv, sizeof(struct getbmap)))
-               return XFS_ERROR(EFAULT);
+               return -EFAULT;
 
        *ap += sizeof(struct getbmap);
        return 0;
@@ -1373,23 +1370,23 @@ xfs_ioc_getbmap(
        int                     error;
 
        if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
 
        if (bmx.bmv_count < 2)
-               return -XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
-       if (ioflags & IO_INVIS)
+       if (ioflags & XFS_IO_INVIS)
                bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
 
        error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
                            (struct getbmap *)arg+1);
        if (error)
-               return -error;
+               return error;
 
        /* copy back header - only size of getbmap */
        if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        return 0;
 }
 
@@ -1399,7 +1396,7 @@ xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
        struct getbmapx __user  *base = *ap;
 
        if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
-               return XFS_ERROR(EFAULT);
+               return -EFAULT;
 
        *ap += sizeof(struct getbmapx);
        return 0;
@@ -1414,22 +1411,22 @@ xfs_ioc_getbmapx(
        int                     error;
 
        if (copy_from_user(&bmx, arg, sizeof(bmx)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
 
        if (bmx.bmv_count < 2)
-               return -XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        if (bmx.bmv_iflags & (~BMV_IF_VALID))
-               return -XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
                            (struct getbmapx *)arg+1);
        if (error)
-               return -error;
+               return error;
 
        /* copy back header */
        if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
 
        return 0;
 }
@@ -1445,33 +1442,33 @@ xfs_ioc_swapext(
        /* Pull information for the target fd */
        f = fdget((int)sxp->sx_fdtarget);
        if (!f.file) {
-               error = XFS_ERROR(EINVAL);
+               error = -EINVAL;
                goto out;
        }
 
        if (!(f.file->f_mode & FMODE_WRITE) ||
            !(f.file->f_mode & FMODE_READ) ||
            (f.file->f_flags & O_APPEND)) {
-               error = XFS_ERROR(EBADF);
+               error = -EBADF;
                goto out_put_file;
        }
 
        tmp = fdget((int)sxp->sx_fdtmp);
        if (!tmp.file) {
-               error = XFS_ERROR(EINVAL);
+               error = -EINVAL;
                goto out_put_file;
        }
 
        if (!(tmp.file->f_mode & FMODE_WRITE) ||
            !(tmp.file->f_mode & FMODE_READ) ||
            (tmp.file->f_flags & O_APPEND)) {
-               error = XFS_ERROR(EBADF);
+               error = -EBADF;
                goto out_put_tmp_file;
        }
 
        if (IS_SWAPFILE(file_inode(f.file)) ||
            IS_SWAPFILE(file_inode(tmp.file))) {
-               error = XFS_ERROR(EINVAL);
+               error = -EINVAL;
                goto out_put_tmp_file;
        }
 
@@ -1479,17 +1476,17 @@ xfs_ioc_swapext(
        tip = XFS_I(file_inode(tmp.file));
 
        if (ip->i_mount != tip->i_mount) {
-               error = XFS_ERROR(EINVAL);
+               error = -EINVAL;
                goto out_put_tmp_file;
        }
 
        if (ip->i_ino == tip->i_ino) {
-               error = XFS_ERROR(EINVAL);
+               error = -EINVAL;
                goto out_put_tmp_file;
        }
 
        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-               error = XFS_ERROR(EIO);
+               error = -EIO;
                goto out_put_tmp_file;
        }
 
@@ -1523,7 +1520,7 @@ xfs_file_ioctl(
        int                     error;
 
        if (filp->f_mode & FMODE_NOCMTIME)
-               ioflags |= IO_INVIS;
+               ioflags |= XFS_IO_INVIS;
 
        trace_xfs_file_ioctl(ip);
 
@@ -1542,7 +1539,7 @@ xfs_file_ioctl(
                xfs_flock64_t           bf;
 
                if (copy_from_user(&bf, arg, sizeof(bf)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
                return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
        }
        case XFS_IOC_DIOINFO: {
@@ -1555,7 +1552,7 @@ xfs_file_ioctl(
                da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
 
                if (copy_to_user(arg, &da, sizeof(da)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
                return 0;
        }
 
@@ -1588,7 +1585,7 @@ xfs_file_ioctl(
                struct fsdmidata        dmi;
 
                if (copy_from_user(&dmi, arg, sizeof(dmi)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
 
                error = mnt_want_write_file(filp);
                if (error)
@@ -1597,7 +1594,7 @@ xfs_file_ioctl(
                error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
                                dmi.fsd_dmstate);
                mnt_drop_write_file(filp);
-               return -error;
+               return error;
        }
 
        case XFS_IOC_GETBMAP:
@@ -1613,14 +1610,14 @@ xfs_file_ioctl(
                xfs_fsop_handlereq_t    hreq;
 
                if (copy_from_user(&hreq, arg, sizeof(hreq)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
                return xfs_find_handle(cmd, &hreq);
        }
        case XFS_IOC_OPEN_BY_HANDLE: {
                xfs_fsop_handlereq_t    hreq;
 
                if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
                return xfs_open_by_handle(filp, &hreq);
        }
        case XFS_IOC_FSSETDM_BY_HANDLE:
@@ -1630,7 +1627,7 @@ xfs_file_ioctl(
                xfs_fsop_handlereq_t    hreq;
 
                if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
                return xfs_readlink_by_handle(filp, &hreq);
        }
        case XFS_IOC_ATTRLIST_BY_HANDLE:
@@ -1643,13 +1640,13 @@ xfs_file_ioctl(
                struct xfs_swapext      sxp;
 
                if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
                error = mnt_want_write_file(filp);
                if (error)
                        return error;
                error = xfs_ioc_swapext(&sxp);
                mnt_drop_write_file(filp);
-               return -error;
+               return error;
        }
 
        case XFS_IOC_FSCOUNTS: {
@@ -1657,10 +1654,10 @@ xfs_file_ioctl(
 
                error = xfs_fs_counts(mp, &out);
                if (error)
-                       return -error;
+                       return error;
 
                if (copy_to_user(arg, &out, sizeof(out)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
                return 0;
        }
 
@@ -1672,10 +1669,10 @@ xfs_file_ioctl(
                        return -EPERM;
 
                if (mp->m_flags & XFS_MOUNT_RDONLY)
-                       return -XFS_ERROR(EROFS);
+                       return -EROFS;
 
                if (copy_from_user(&inout, arg, sizeof(inout)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
 
                error = mnt_want_write_file(filp);
                if (error)
@@ -1686,10 +1683,10 @@ xfs_file_ioctl(
                error = xfs_reserve_blocks(mp, &in, &inout);
                mnt_drop_write_file(filp);
                if (error)
-                       return -error;
+                       return error;
 
                if (copy_to_user(arg, &inout, sizeof(inout)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
                return 0;
        }
 
@@ -1701,10 +1698,10 @@ xfs_file_ioctl(
 
                error = xfs_reserve_blocks(mp, NULL, &out);
                if (error)
-                       return -error;
+                       return error;
 
                if (copy_to_user(arg, &out, sizeof(out)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
 
                return 0;
        }
@@ -1713,42 +1710,42 @@ xfs_file_ioctl(
                xfs_growfs_data_t in;
 
                if (copy_from_user(&in, arg, sizeof(in)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
 
                error = mnt_want_write_file(filp);
                if (error)
                        return error;
                error = xfs_growfs_data(mp, &in);
                mnt_drop_write_file(filp);
-               return -error;
+               return error;
        }
 
        case XFS_IOC_FSGROWFSLOG: {
                xfs_growfs_log_t in;
 
                if (copy_from_user(&in, arg, sizeof(in)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
 
                error = mnt_want_write_file(filp);
                if (error)
                        return error;
                error = xfs_growfs_log(mp, &in);
                mnt_drop_write_file(filp);
-               return -error;
+               return error;
        }
 
        case XFS_IOC_FSGROWFSRT: {
                xfs_growfs_rt_t in;
 
                if (copy_from_user(&in, arg, sizeof(in)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
 
                error = mnt_want_write_file(filp);
                if (error)
                        return error;
                error = xfs_growfs_rt(mp, &in);
                mnt_drop_write_file(filp);
-               return -error;
+               return error;
        }
 
        case XFS_IOC_GOINGDOWN: {
@@ -1758,10 +1755,9 @@ xfs_file_ioctl(
                        return -EPERM;
 
                if (get_user(in, (__uint32_t __user *)arg))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
 
-               error = xfs_fs_goingdown(mp, in);
-               return -error;
+               return xfs_fs_goingdown(mp, in);
        }
 
        case XFS_IOC_ERROR_INJECTION: {
@@ -1771,18 +1767,16 @@ xfs_file_ioctl(
                        return -EPERM;
 
                if (copy_from_user(&in, arg, sizeof(in)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
 
-               error = xfs_errortag_add(in.errtag, mp);
-               return -error;
+               return xfs_errortag_add(in.errtag, mp);
        }
 
        case XFS_IOC_ERROR_CLEARALL:
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
 
-               error = xfs_errortag_clearall(mp, 1);
-               return -error;
+               return xfs_errortag_clearall(mp, 1);
 
        case XFS_IOC_FREE_EOFBLOCKS: {
                struct xfs_fs_eofblocks eofb;
@@ -1792,16 +1786,16 @@ xfs_file_ioctl(
                        return -EPERM;
 
                if (mp->m_flags & XFS_MOUNT_RDONLY)
-                       return -XFS_ERROR(EROFS);
+                       return -EROFS;
 
                if (copy_from_user(&eofb, arg, sizeof(eofb)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
 
                error = xfs_fs_eofblocks_from_user(&eofb, &keofb);
                if (error)
-                       return -error;
+                       return error;
 
-               return -xfs_icache_free_eofblocks(mp, &keofb);
+               return xfs_icache_free_eofblocks(mp, &keofb);
        }
 
        default:
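Two idioms disappear throughout the xfs_ioctl.c hunks above: the re-negation of helper results (error = -xfs_attr_list(...)) and the final flip on the way out (return -error). Once every XFS function returns negative errnos natively, values pass straight through to the VFS. A hedged sketch of the after state (toy_ioctl and xfs_do_op are hypothetical names):

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-in for a converted XFS helper. */
static int xfs_do_op(int fail)
{
	return fail ? -EINVAL : 0;
}

static long toy_ioctl(int fail)
{
	int error = xfs_do_op(fail);

	return error;	/* already negative: no "return -error" flip */
}

int main(void)
{
	printf("ok=%ld fail=%ld\n", toy_ioctl(0), toy_ioctl(1));
	return 0;
}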
index 944d5baa710a39da9aa1d6a82a27013c15813d64..a554646ff141479eeea453ff7899bdd8e91d92b8 100644 (file)
@@ -28,7 +28,6 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_vnode.h"
 #include "xfs_inode.h"
 #include "xfs_itable.h"
 #include "xfs_error.h"
@@ -56,7 +55,7 @@ xfs_compat_flock64_copyin(
            get_user(bf->l_sysid,       &arg32->l_sysid) ||
            get_user(bf->l_pid,         &arg32->l_pid) ||
            copy_from_user(bf->l_pad,   &arg32->l_pad,  4*sizeof(u32)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        return 0;
 }
 
@@ -70,10 +69,10 @@ xfs_compat_ioc_fsgeometry_v1(
 
        error = xfs_fs_geometry(mp, &fsgeo, 3);
        if (error)
-               return -error;
+               return error;
        /* The 32-bit variant simply has some padding at the end */
        if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        return 0;
 }
 
@@ -84,7 +83,7 @@ xfs_compat_growfs_data_copyin(
 {
        if (get_user(in->newblocks, &arg32->newblocks) ||
            get_user(in->imaxpct,   &arg32->imaxpct))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        return 0;
 }
 
@@ -95,14 +94,14 @@ xfs_compat_growfs_rt_copyin(
 {
        if (get_user(in->newblocks, &arg32->newblocks) ||
            get_user(in->extsize,   &arg32->extsize))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        return 0;
 }
 
 STATIC int
 xfs_inumbers_fmt_compat(
        void                    __user *ubuffer,
-       const xfs_inogrp_t      *buffer,
+       const struct xfs_inogrp *buffer,
        long                    count,
        long                    *written)
 {
@@ -113,7 +112,7 @@ xfs_inumbers_fmt_compat(
                if (put_user(buffer[i].xi_startino,   &p32[i].xi_startino) ||
                    put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
                    put_user(buffer[i].xi_allocmask,  &p32[i].xi_allocmask))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
        }
        *written = count * sizeof(*p32);
        return 0;
@@ -132,7 +131,7 @@ xfs_ioctl32_bstime_copyin(
 
        if (get_user(sec32,             &bstime32->tv_sec)      ||
            get_user(bstime->tv_nsec,   &bstime32->tv_nsec))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        bstime->tv_sec = sec32;
        return 0;
 }
@@ -164,7 +163,7 @@ xfs_ioctl32_bstat_copyin(
            get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
            get_user(bstat->bs_dmstate, &bstat32->bs_dmstate)   ||
            get_user(bstat->bs_aextents, &bstat32->bs_aextents))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        return 0;
 }
 
@@ -180,7 +179,7 @@ xfs_bstime_store_compat(
        sec32 = p->tv_sec;
        if (put_user(sec32, &p32->tv_sec) ||
            put_user(p->tv_nsec, &p32->tv_nsec))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        return 0;
 }
 
@@ -195,7 +194,7 @@ xfs_bulkstat_one_fmt_compat(
        compat_xfs_bstat_t      __user *p32 = ubuffer;
 
        if (ubsize < sizeof(*p32))
-               return XFS_ERROR(ENOMEM);
+               return -ENOMEM;
 
        if (put_user(buffer->bs_ino,      &p32->bs_ino)         ||
            put_user(buffer->bs_mode,     &p32->bs_mode)        ||
@@ -218,7 +217,7 @@ xfs_bulkstat_one_fmt_compat(
            put_user(buffer->bs_dmevmask, &p32->bs_dmevmask)    ||
            put_user(buffer->bs_dmstate,  &p32->bs_dmstate)     ||
            put_user(buffer->bs_aextents, &p32->bs_aextents))
-               return XFS_ERROR(EFAULT);
+               return -EFAULT;
        if (ubused)
                *ubused = sizeof(*p32);
        return 0;
@@ -256,30 +255,30 @@ xfs_compat_ioc_bulkstat(
        /* should be called again (unused here, but used in dmapi) */
 
        if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
+               return -EPERM;
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return -XFS_ERROR(EIO);
+               return -EIO;
 
        if (get_user(addr, &p32->lastip))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        bulkreq.lastip = compat_ptr(addr);
        if (get_user(bulkreq.icount, &p32->icount) ||
            get_user(addr, &p32->ubuffer))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        bulkreq.ubuffer = compat_ptr(addr);
        if (get_user(addr, &p32->ocount))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        bulkreq.ocount = compat_ptr(addr);
 
        if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
 
        if ((count = bulkreq.icount) <= 0)
-               return -XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        if (bulkreq.ubuffer == NULL)
-               return -XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        if (cmd == XFS_IOC_FSINUMBERS_32) {
                error = xfs_inumbers(mp, &inlast, &count,
@@ -294,17 +293,17 @@ xfs_compat_ioc_bulkstat(
                        xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
                        bulkreq.ubuffer, &done);
        } else
-               error = XFS_ERROR(EINVAL);
+               error = -EINVAL;
        if (error)
-               return -error;
+               return error;
 
        if (bulkreq.ocount != NULL) {
                if (copy_to_user(bulkreq.lastip, &inlast,
                                                sizeof(xfs_ino_t)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
 
                if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
        }
 
        return 0;
@@ -318,7 +317,7 @@ xfs_compat_handlereq_copyin(
        compat_xfs_fsop_handlereq_t     hreq32;
 
        if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
 
        hreq->fd = hreq32.fd;
        hreq->path = compat_ptr(hreq32.path);
@@ -352,19 +351,19 @@ xfs_compat_attrlist_by_handle(
        char                    *kbuf;
 
        if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
+               return -EPERM;
        if (copy_from_user(&al_hreq, arg,
                           sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
        if (al_hreq.buflen < sizeof(struct attrlist) ||
            al_hreq.buflen > XATTR_LIST_MAX)
-               return -XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        /*
         * Reject flags, only allow namespaces.
         */
        if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
-               return -XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
        if (IS_ERR(dentry))
@@ -376,7 +375,7 @@ xfs_compat_attrlist_by_handle(
                goto out_dput;
 
        cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-       error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
+       error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
                                        al_hreq.flags, cursor);
        if (error)
                goto out_kfree;
@@ -404,10 +403,10 @@ xfs_compat_attrmulti_by_handle(
        unsigned char                           *attr_name;
 
        if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
+               return -EPERM;
        if (copy_from_user(&am_hreq, arg,
                           sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
 
        /* overflow check */
        if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
@@ -417,7 +416,7 @@ xfs_compat_attrmulti_by_handle(
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);
 
-       error = E2BIG;
+       error = -E2BIG;
        size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
        if (!size || size > 16 * PAGE_SIZE)
                goto out_dput;
@@ -428,7 +427,7 @@ xfs_compat_attrmulti_by_handle(
                goto out_dput;
        }
 
-       error = ENOMEM;
+       error = -ENOMEM;
        attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
        if (!attr_name)
                goto out_kfree_ops;
@@ -439,7 +438,7 @@ xfs_compat_attrmulti_by_handle(
                                compat_ptr(ops[i].am_attrname),
                                MAXNAMELEN);
                if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
-                       error = ERANGE;
+                       error = -ERANGE;
                if (ops[i].am_error < 0)
                        break;
 
@@ -470,19 +469,19 @@ xfs_compat_attrmulti_by_handle(
                        mnt_drop_write_file(parfilp);
                        break;
                default:
-                       ops[i].am_error = EINVAL;
+                       ops[i].am_error = -EINVAL;
                }
        }
 
        if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
-               error = XFS_ERROR(EFAULT);
+               error = -EFAULT;
 
        kfree(attr_name);
  out_kfree_ops:
        kfree(ops);
  out_dput:
        dput(dentry);
-       return -error;
+       return error;
 }
 
 STATIC int
@@ -496,26 +495,26 @@ xfs_compat_fssetdm_by_handle(
        struct dentry           *dentry;
 
        if (!capable(CAP_MKNOD))
-               return -XFS_ERROR(EPERM);
+               return -EPERM;
        if (copy_from_user(&dmhreq, arg,
                           sizeof(compat_xfs_fsop_setdm_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
+               return -EFAULT;
 
        dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);
 
        if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
-               error = -XFS_ERROR(EPERM);
+               error = -EPERM;
                goto out;
        }
 
        if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
-               error = -XFS_ERROR(EFAULT);
+               error = -EFAULT;
                goto out;
        }
 
-       error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
+       error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
                                 fsd.fsd_dmstate);
 
 out:
@@ -537,7 +536,7 @@ xfs_file_compat_ioctl(
        int                     error;
 
        if (filp->f_mode & FMODE_NOCMTIME)
-               ioflags |= IO_INVIS;
+               ioflags |= XFS_IO_INVIS;
 
        trace_xfs_file_compat_ioctl(ip);
 
@@ -588,7 +587,7 @@ xfs_file_compat_ioctl(
                struct xfs_flock64      bf;
 
                if (xfs_compat_flock64_copyin(&bf, arg))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
                cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
                return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
        }
@@ -598,25 +597,25 @@ xfs_file_compat_ioctl(
                struct xfs_growfs_data  in;
 
                if (xfs_compat_growfs_data_copyin(&in, arg))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
                error = mnt_want_write_file(filp);
                if (error)
                        return error;
                error = xfs_growfs_data(mp, &in);
                mnt_drop_write_file(filp);
-               return -error;
+               return error;
        }
        case XFS_IOC_FSGROWFSRT_32: {
                struct xfs_growfs_rt    in;
 
                if (xfs_compat_growfs_rt_copyin(&in, arg))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
                error = mnt_want_write_file(filp);
                if (error)
                        return error;
                error = xfs_growfs_rt(mp, &in);
                mnt_drop_write_file(filp);
-               return -error;
+               return error;
        }
 #endif
 	/* long changes size, but xfs only copies out 32 bits */
@@ -633,13 +632,13 @@ xfs_file_compat_ioctl(
                if (copy_from_user(&sxp, sxu,
                                   offsetof(struct xfs_swapext, sx_stat)) ||
                    xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
                error = mnt_want_write_file(filp);
                if (error)
                        return error;
                error = xfs_ioc_swapext(&sxp);
                mnt_drop_write_file(filp);
-               return -error;
+               return error;
        }
        case XFS_IOC_FSBULKSTAT_32:
        case XFS_IOC_FSBULKSTAT_SINGLE_32:
@@ -651,7 +650,7 @@ xfs_file_compat_ioctl(
                struct xfs_fsop_handlereq       hreq;
 
                if (xfs_compat_handlereq_copyin(&hreq, arg))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
                cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
                return xfs_find_handle(cmd, &hreq);
        }
@@ -659,14 +658,14 @@ xfs_file_compat_ioctl(
                struct xfs_fsop_handlereq       hreq;
 
                if (xfs_compat_handlereq_copyin(&hreq, arg))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
                return xfs_open_by_handle(filp, &hreq);
        }
        case XFS_IOC_READLINK_BY_HANDLE_32: {
                struct xfs_fsop_handlereq       hreq;
 
                if (xfs_compat_handlereq_copyin(&hreq, arg))
-                       return -XFS_ERROR(EFAULT);
+                       return -EFAULT;
                return xfs_readlink_by_handle(filp, &hreq);
        }
        case XFS_IOC_ATTRLIST_BY_HANDLE_32:
@@ -676,6 +675,6 @@ xfs_file_compat_ioctl(
        case XFS_IOC_FSSETDM_BY_HANDLE_32:
                return xfs_compat_fssetdm_by_handle(filp, arg);
        default:
-               return -XFS_ERROR(ENOIOCTLCMD);
+               return -ENOIOCTLCMD;
        }
 }
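The compat path above keeps the usual copy-in shape: each 32-bit field is fetched with get_user(), the calls are ||-chained so the first fault wins, and the helpers now report -EFAULT directly instead of -XFS_ERROR(EFAULT). A userspace approximation of that shape (struct layouts and names are invented; memcpy stands in for get_user, which can actually fault):

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct toy_flock32 { int l_type; int l_whence; long long l_start; };
struct toy_flock64 { int l_type; int l_whence; long long l_start; };

static int get_field(void *dst, const void *src, size_t n)
{
	memcpy(dst, src, n);	/* the real get_user() can fail mid-chain */
	return 0;
}

static int toy_flock64_copyin(struct toy_flock64 *bf,
			      const struct toy_flock32 *arg32)
{
	if (get_field(&bf->l_type,   &arg32->l_type,   sizeof(bf->l_type)) ||
	    get_field(&bf->l_whence, &arg32->l_whence, sizeof(bf->l_whence)) ||
	    get_field(&bf->l_start,  &arg32->l_start,  sizeof(bf->l_start)))
		return -EFAULT;
	return 0;
}

int main(void)
{
	struct toy_flock32 in = { 1, 0, 4096 };
	struct toy_flock64 out;

	printf("copyin -> %d, l_start=%lld\n",
	       toy_flock64_copyin(&out, &in), out.l_start);
	return 0;
}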
index 6d3ec2b6ee294c7ec38e28fd32376162276f1005..e9c47b6f5e5a97f3b100a4a6262a1c26a1b01de9 100644 (file)
@@ -110,7 +110,7 @@ xfs_alert_fsblock_zero(
                (unsigned long long)imap->br_startoff,
                (unsigned long long)imap->br_blockcount,
                imap->br_state);
-       return EFSCORRUPTED;
+       return -EFSCORRUPTED;
 }
 
 int
@@ -138,7 +138,7 @@ xfs_iomap_write_direct(
 
        error = xfs_qm_dqattach(ip, 0);
        if (error)
-               return XFS_ERROR(error);
+               return error;
 
        rt = XFS_IS_REALTIME_INODE(ip);
        extsz = xfs_get_extsz_hint(ip);
@@ -148,7 +148,7 @@ xfs_iomap_write_direct(
        if ((offset + count) > XFS_ISIZE(ip)) {
                error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
                if (error)
-                       return XFS_ERROR(error);
+                       return error;
        } else {
                if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
                        last_fsb = MIN(last_fsb, (xfs_fileoff_t)
@@ -188,7 +188,7 @@ xfs_iomap_write_direct(
         */
        if (error) {
                xfs_trans_cancel(tp, 0);
-               return XFS_ERROR(error);
+               return error;
        }
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -225,7 +225,7 @@ xfs_iomap_write_direct(
         * Copy any maps to caller's array and return any error.
         */
        if (nimaps == 0) {
-               error = XFS_ERROR(ENOSPC);
+               error = -ENOSPC;
                goto out_unlock;
        }
 
@@ -397,7 +397,8 @@ xfs_quota_calc_throttle(
        struct xfs_inode *ip,
        int type,
        xfs_fsblock_t *qblocks,
-       int *qshift)
+       int *qshift,
+       int64_t *qfreesp)
 {
        int64_t freesp;
        int shift = 0;
@@ -406,6 +407,7 @@ xfs_quota_calc_throttle(
        /* over hi wmark, squash the prealloc completely */
        if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
                *qblocks = 0;
+               *qfreesp = 0;
                return;
        }
 
@@ -418,6 +420,9 @@ xfs_quota_calc_throttle(
                        shift += 2;
        }
 
+       if (freesp < *qfreesp)
+               *qfreesp = freesp;
+
        /* only overwrite the throttle values if we are more aggressive */
        if ((freesp >> shift) < (*qblocks >> *qshift)) {
                *qblocks = freesp;
@@ -476,15 +481,18 @@ xfs_iomap_prealloc_size(
        }
 
        /*
-        * Check each quota to cap the prealloc size and provide a shift
-        * value to throttle with.
+        * Check each quota to cap the prealloc size, provide a shift value to
+        * throttle with and adjust amount of available space.
         */
        if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
-               xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift);
+               xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift,
+                                       &freesp);
        if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
-               xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift);
+               xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift,
+                                       &freesp);
        if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
-               xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift);
+               xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift,
+                                       &freesp);
 
        /*
         * The final prealloc size is set to the minimum of free space available
@@ -552,7 +560,7 @@ xfs_iomap_write_delay(
         */
        error = xfs_qm_dqattach_locked(ip, 0);
        if (error)
-               return XFS_ERROR(error);
+               return error;
 
        extsz = xfs_get_extsz_hint(ip);
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -596,11 +604,11 @@ retry:
                                imap, &nimaps, XFS_BMAPI_ENTIRE);
        switch (error) {
        case 0:
-       case ENOSPC:
-       case EDQUOT:
+       case -ENOSPC:
+       case -EDQUOT:
                break;
        default:
-               return XFS_ERROR(error);
+               return error;
        }
 
        /*
@@ -614,7 +622,7 @@ retry:
                        error = 0;
                        goto retry;
                }
-               return XFS_ERROR(error ? error : ENOSPC);
+               return error ? error : -ENOSPC;
        }
 
        if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
@@ -663,7 +671,7 @@ xfs_iomap_write_allocate(
         */
        error = xfs_qm_dqattach(ip, 0);
        if (error)
-               return XFS_ERROR(error);
+               return error;
 
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
        count_fsb = imap->br_blockcount;
@@ -690,7 +698,7 @@ xfs_iomap_write_allocate(
                                                  nres, 0);
                        if (error) {
                                xfs_trans_cancel(tp, 0);
-                               return XFS_ERROR(error);
+                               return error;
                        }
                        xfs_ilock(ip, XFS_ILOCK_EXCL);
                        xfs_trans_ijoin(tp, ip, 0);
@@ -739,7 +747,7 @@ xfs_iomap_write_allocate(
                        if ((map_start_fsb + count_fsb) > last_block) {
                                count_fsb = last_block - map_start_fsb;
                                if (count_fsb == 0) {
-                                       error = EAGAIN;
+                                       error = -EAGAIN;
                                        goto trans_cancel;
                                }
                        }
@@ -793,7 +801,7 @@ trans_cancel:
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
 error0:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       return XFS_ERROR(error);
+       return error;
 }
 
 int
@@ -853,7 +861,7 @@ xfs_iomap_write_unwritten(
                                          resblks, 0);
                if (error) {
                        xfs_trans_cancel(tp, 0);
-                       return XFS_ERROR(error);
+                       return error;
                }
 
                xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -892,7 +900,7 @@ xfs_iomap_write_unwritten(
                error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                if (error)
-                       return XFS_ERROR(error);
+                       return error;
 
                if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
                        return xfs_alert_fsblock_zero(ip, &imap);
@@ -915,5 +923,5 @@ error_on_bmapi_transaction:
        xfs_bmap_cancel(&free_list);
        xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       return XFS_ERROR(error);
+       return error;
 }
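The xfs_iomap.c hunks above do more than flip error signs: xfs_quota_calc_throttle() gains a qfreesp out-parameter so each quota clamps the shared free-space estimate to the most constrained value seen, and a quota over its hi watermark now squashes that estimate to zero as well. A simplified model of the clamping (the real code derives the shift from per-quota low/hi watermarks; values here are illustrative):

#include <stdint.h>
#include <stdio.h>

static void quota_calc_throttle(int64_t q_free, int64_t *qblocks,
				int *qshift, int64_t *qfreesp)
{
	if (q_free <= 0) {		/* over the hi watermark: squash */
		*qblocks = 0;
		*qfreesp = 0;
		return;
	}
	if (q_free < *qfreesp)		/* the new clamp from this hunk */
		*qfreesp = q_free;
	if (q_free < *qblocks) {	/* only throttle if more aggressive */
		*qblocks = q_free;
		*qshift += 2;
	}
}

int main(void)
{
	int64_t freesp = 1000000, qblocks = 1 << 20;
	int qshift = 0;

	quota_calc_throttle(800000, &qblocks, &qshift, &freesp); /* user  */
	quota_calc_throttle(50000,  &qblocks, &qshift, &freesp); /* group */

	printf("freesp=%lld qblocks=%lld qshift=%d\n",
	       (long long)freesp, (long long)qblocks, qshift);
	return 0;
}

With the user quota at 800000 and the group quota at 50000 free blocks, the final prealloc sizing sees freesp clamped to 50000, matching the intent of the added "if (freesp < *qfreesp)" check.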
index 205613a06068c6226cb68c7a8fd3499540202614..72129493e9d3563687a6606223a123692fe13a5c 100644 (file)
@@ -72,7 +72,7 @@ xfs_initxattrs(
        int                     error = 0;
 
        for (xattr = xattr_array; xattr->name != NULL; xattr++) {
-               error = -xfs_attr_set(ip, xattr->name, xattr->value,
+               error = xfs_attr_set(ip, xattr->name, xattr->value,
                                      xattr->value_len, ATTR_SECURE);
                if (error < 0)
                        break;
@@ -93,7 +93,7 @@ xfs_init_security(
        struct inode    *dir,
        const struct qstr *qstr)
 {
-       return -security_inode_init_security(inode, dir, qstr,
+       return security_inode_init_security(inode, dir, qstr,
                                             &xfs_initxattrs, NULL);
 }
 
@@ -173,12 +173,12 @@ xfs_generic_create(
 
 #ifdef CONFIG_XFS_POSIX_ACL
        if (default_acl) {
-               error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+               error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
                if (error)
                        goto out_cleanup_inode;
        }
        if (acl) {
-               error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+               error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
                if (error)
                        goto out_cleanup_inode;
        }
@@ -194,7 +194,7 @@ xfs_generic_create(
                posix_acl_release(default_acl);
        if (acl)
                posix_acl_release(acl);
-       return -error;
+       return error;
 
  out_cleanup_inode:
        if (!tmpfile)
@@ -248,8 +248,8 @@ xfs_vn_lookup(
        xfs_dentry_to_name(&name, dentry, 0);
        error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
        if (unlikely(error)) {
-               if (unlikely(error != ENOENT))
-                       return ERR_PTR(-error);
+               if (unlikely(error != -ENOENT))
+                       return ERR_PTR(error);
                d_add(dentry, NULL);
                return NULL;
        }
@@ -275,8 +275,8 @@ xfs_vn_ci_lookup(
        xfs_dentry_to_name(&xname, dentry, 0);
        error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
        if (unlikely(error)) {
-               if (unlikely(error != ENOENT))
-                       return ERR_PTR(-error);
+               if (unlikely(error != -ENOENT))
+                       return ERR_PTR(error);
                /*
                 * call d_add(dentry, NULL) here when d_drop_negative_children
                 * is called in xfs_vn_mknod (ie. allow negative dentries
@@ -311,7 +311,7 @@ xfs_vn_link(
 
        error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
        if (unlikely(error))
-               return -error;
+               return error;
 
        ihold(inode);
        d_instantiate(dentry, inode);
@@ -328,7 +328,7 @@ xfs_vn_unlink(
 
        xfs_dentry_to_name(&name, dentry, 0);
 
-       error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
+       error = xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
        if (error)
                return error;
 
@@ -375,7 +375,7 @@ xfs_vn_symlink(
        xfs_cleanup_inode(dir, inode, dentry);
        iput(inode);
  out:
-       return -error;
+       return error;
 }
 
 STATIC int
@@ -392,8 +392,8 @@ xfs_vn_rename(
        xfs_dentry_to_name(&oname, odentry, 0);
        xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);
 
-       return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
-                          XFS_I(ndir), &nname, new_inode ?
+       return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
+                         XFS_I(ndir), &nname, new_inode ?
                                                XFS_I(new_inode) : NULL);
 }
 
@@ -414,7 +414,7 @@ xfs_vn_follow_link(
        if (!link)
                goto out_err;
 
-       error = -xfs_readlink(XFS_I(dentry->d_inode), link);
+       error = xfs_readlink(XFS_I(dentry->d_inode), link);
        if (unlikely(error))
                goto out_kfree;
 
@@ -441,7 +441,7 @@ xfs_vn_getattr(
        trace_xfs_getattr(ip);
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return -XFS_ERROR(EIO);
+               return -EIO;
 
        stat->size = XFS_ISIZE(ip);
        stat->dev = inode->i_sb->s_dev;
@@ -546,14 +546,14 @@ xfs_setattr_nonsize(
        /* If acls are being inherited, we already have this checked */
        if (!(flags & XFS_ATTR_NOACL)) {
                if (mp->m_flags & XFS_MOUNT_RDONLY)
-                       return XFS_ERROR(EROFS);
+                       return -EROFS;
 
                if (XFS_FORCED_SHUTDOWN(mp))
-                       return XFS_ERROR(EIO);
+                       return -EIO;
 
-               error = -inode_change_ok(inode, iattr);
+               error = inode_change_ok(inode, iattr);
                if (error)
-                       return XFS_ERROR(error);
+                       return error;
        }
 
        ASSERT((mask & ATTR_SIZE) == 0);
@@ -703,7 +703,7 @@ xfs_setattr_nonsize(
        xfs_qm_dqrele(gdqp);
 
        if (error)
-               return XFS_ERROR(error);
+               return error;
 
        /*
         * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
@@ -713,9 +713,9 @@ xfs_setattr_nonsize(
         *           Posix ACL code seems to care about this issue either.
         */
        if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
-               error = -posix_acl_chmod(inode, inode->i_mode);
+               error = posix_acl_chmod(inode, inode->i_mode);
                if (error)
-                       return XFS_ERROR(error);
+                       return error;
        }
 
        return 0;
@@ -748,14 +748,14 @@ xfs_setattr_size(
        trace_xfs_setattr(ip);
 
        if (mp->m_flags & XFS_MOUNT_RDONLY)
-               return XFS_ERROR(EROFS);
+               return -EROFS;
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
-       error = -inode_change_ok(inode, iattr);
+       error = inode_change_ok(inode, iattr);
        if (error)
-               return XFS_ERROR(error);
+               return error;
 
        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
        ASSERT(S_ISREG(ip->i_d.di_mode));
@@ -818,7 +818,7 @@ xfs_setattr_size(
         * care about here.
         */
        if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
-               error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+               error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
                                                      ip->i_d.di_size, newsize);
                if (error)
                        return error;
@@ -844,7 +844,7 @@ xfs_setattr_size(
         * much we can do about this, except to hope that the caller sees ENOMEM
         * and retries the truncate operation.
         */
-       error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
+       error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
        if (error)
                return error;
        truncate_setsize(inode, newsize);
@@ -950,7 +950,7 @@ xfs_vn_setattr(
                error = xfs_setattr_nonsize(ip, iattr, 0);
        }
 
-       return -error;
+       return error;
 }
 
 STATIC int
@@ -970,7 +970,7 @@ xfs_vn_update_time(
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
        if (error) {
                xfs_trans_cancel(tp, 0);
-               return -error;
+               return error;
        }
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -991,7 +991,7 @@ xfs_vn_update_time(
        }
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
-       return -xfs_trans_commit(tp, 0);
+       return xfs_trans_commit(tp, 0);
 }
 
 #define XFS_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -1036,7 +1036,7 @@ xfs_fiemap_format(
                *full = 1;      /* user array now full */
        }
 
-       return -error;
+       return error;
 }
 
 STATIC int
@@ -1055,12 +1055,12 @@ xfs_vn_fiemap(
                return error;
 
        /* Set up bmap header for xfs internal routine */
-       bm.bmv_offset = BTOBB(start);
+       bm.bmv_offset = BTOBBT(start);
        /* Special case for whole file */
        if (length == FIEMAP_MAX_OFFSET)
                bm.bmv_length = -1LL;
        else
-               bm.bmv_length = BTOBB(length);
+               bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
 
        /* We add one because in getbmap world count includes the header */
        bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
@@ -1075,7 +1075,7 @@ xfs_vn_fiemap(
 
        error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
        if (error)
-               return -error;
+               return error;
 
        return 0;
 }
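
Aside on the bmv_offset/bmv_length fix above: BTOBB() converts bytes to
512-byte basic blocks rounding up, while BTOBBT() truncates.  Rounding the
offset up can skip past the block that contains the start of the requested
range; the fix truncates the offset and derives the length from the
rounded-up end of the range.  A standalone illustration using the standard
definitions of the two macros:

	#include <stdio.h>

	#define BBSHIFT		9		/* 512-byte basic blocks */
	#define BBSIZE		(1 << BBSHIFT)
	#define BTOBB(bytes)	(((bytes) + BBSIZE - 1) >> BBSHIFT) /* round up */
	#define BTOBBT(bytes)	((bytes) >> BBSHIFT)		    /* truncate */

	int main(void)
	{
		long long start = 700, length = 100;	/* byte range [700, 800) */

		/* Old mapping: the offset rounds up to BB 2, so BB 1, the
		 * block that actually contains the range, is never examined. */
		printf("old: offset %lld, length %lld BBs\n",
		       (long long)BTOBB(start), (long long)BTOBB(length));

		/* Fixed mapping: truncate the offset to BB 1 and extend the
		 * length to the rounded-up end of the byte range. */
		printf("new: offset %lld, length %lld BBs\n",
		       (long long)BTOBBT(start),
		       (long long)(BTOBB(start + length) - BTOBBT(start)));
		return 0;
	}
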
index cb64f222d607438e4b6f8d6bfe4c627e3bda4d22..f71be9c68017ea993808827918201b5eff31a833 100644 (file)
@@ -67,19 +67,17 @@ xfs_bulkstat_one_int(
        *stat = BULKSTAT_RV_NOTHING;
 
        if (!buffer || xfs_internal_inum(mp, ino))
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
 
        buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
        if (!buf)
-               return XFS_ERROR(ENOMEM);
+               return -ENOMEM;
 
        error = xfs_iget(mp, NULL, ino,
                         (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
                         XFS_ILOCK_SHARED, &ip);
-       if (error) {
-               *stat = BULKSTAT_RV_NOTHING;
+       if (error)
                goto out_free;
-       }
 
        ASSERT(ip != NULL);
        ASSERT(ip->i_imap.im_blkno != 0);
@@ -136,7 +134,6 @@ xfs_bulkstat_one_int(
        IRELE(ip);
 
        error = formatter(buffer, ubsize, ubused, buf);
-
        if (!error)
                *stat = BULKSTAT_RV_DIDONE;
 
@@ -154,9 +151,9 @@ xfs_bulkstat_one_fmt(
        const xfs_bstat_t       *buffer)
 {
        if (ubsize < sizeof(*buffer))
-               return XFS_ERROR(ENOMEM);
+               return -ENOMEM;
        if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
-               return XFS_ERROR(EFAULT);
+               return -EFAULT;
        if (ubused)
                *ubused = sizeof(*buffer);
        return 0;
@@ -175,8 +172,169 @@ xfs_bulkstat_one(
                                    xfs_bulkstat_one_fmt, ubused, stat);
 }
 
+/*
+ * Loop over all clusters in a chunk for a given incore inode allocation btree
+ * record.  Do a readahead if there are any allocated inodes in that cluster.
+ */
+STATIC void
+xfs_bulkstat_ichunk_ra(
+       struct xfs_mount                *mp,
+       xfs_agnumber_t                  agno,
+       struct xfs_inobt_rec_incore     *irec)
+{
+       xfs_agblock_t                   agbno;
+       struct blk_plug                 plug;
+       int                             blks_per_cluster;
+       int                             inodes_per_cluster;
+       int                             i;      /* inode chunk index */
+
+       agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
+       blks_per_cluster = xfs_icluster_size_fsb(mp);
+       inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+
+       blk_start_plug(&plug);
+       for (i = 0; i < XFS_INODES_PER_CHUNK;
+            i += inodes_per_cluster, agbno += blks_per_cluster) {
+               if (xfs_inobt_maskn(i, inodes_per_cluster) & ~irec->ir_free) {
+                       xfs_btree_reada_bufs(mp, agno, agbno, blks_per_cluster,
+                                            &xfs_inode_buf_ops);
+               }
+       }
+       blk_finish_plug(&plug);
+}
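
Aside on the readahead test above: ir_free keeps one bit per inode in the
64-inode chunk, set when the inode is free, and xfs_inobt_maskn() builds a
mask of n consecutive bits.  A cluster is read ahead only when the mask
covers at least one allocated (non-free) inode.  A standalone sketch of the
bit test, where inobt_maskn() is a local stand-in for the real helper:

	#include <stdint.h>
	#include <stdio.h>

	/* stand-in for xfs_inobt_maskn(): n consecutive bits from bit i */
	static uint64_t inobt_maskn(int i, int n)
	{
		return (n >= 64 ? ~0ULL : (1ULL << n) - 1) << i;
	}

	int main(void)
	{
		uint64_t ir_free = ~0ULL << 4;	/* inodes 0-3 allocated, rest free */
		int inodes_per_cluster = 16;	/* example cluster geometry */

		for (int i = 0; i < 64; i += inodes_per_cluster) {
			/* read ahead only if the cluster holds an allocated inode */
			int want_ra =
				(inobt_maskn(i, inodes_per_cluster) & ~ir_free) != 0;

			printf("cluster at inode %2d: %s\n",
			       i, want_ra ? "readahead" : "skip");
		}
		return 0;
	}
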
+
+/*
+ * Lookup the inode chunk that the given inode lives in and then get the record
+ * if we found the chunk.  If the inode was not the last in the chunk and some
+ * later inodes in the chunk are still allocated, update the pointed-to record
+ * and return the count of grabbed inodes.
+ */
+STATIC int
+xfs_bulkstat_grab_ichunk(
+       struct xfs_btree_cur            *cur,   /* btree cursor */
+       xfs_agino_t                     agino,  /* starting inode of chunk */
+       int                             *icount,/* return # of inodes grabbed */
+       struct xfs_inobt_rec_incore     *irec)  /* btree record */
+{
+       int                             idx;    /* index into inode chunk */
+       int                             stat;
+       int                             error = 0;
+
+       /* Lookup the inode chunk that this inode lives in */
+       error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &stat);
+       if (error)
+               return error;
+       if (!stat) {
+               *icount = 0;
+               return error;
+       }
+
+       /* Get the record, should always work */
+       error = xfs_inobt_get_rec(cur, irec, &stat);
+       if (error)
+               return error;
+       XFS_WANT_CORRUPTED_RETURN(stat == 1);
+
+       /* Check if the record contains the requested inode */
+       if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
+               return -EINVAL;
+
+       idx = agino - irec->ir_startino + 1;
+       if (idx < XFS_INODES_PER_CHUNK &&
+           (xfs_inobt_maskn(idx, XFS_INODES_PER_CHUNK - idx) & ~irec->ir_free)) {
+               int     i;
+
+               /* The right chunk still has some inodes allocated at or
+                * after our start point.  Grab the chunk record and mark
+                * all the uninteresting (earlier) inodes free.
+                */
+               for (i = 0; i < idx; i++) {
+                       if (XFS_INOBT_MASK(i) & ~irec->ir_free)
+                               irec->ir_freecount++;
+               }
+
+               irec->ir_free |= xfs_inobt_maskn(0, idx);
+               *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount;
+       }
+
+       return 0;
+}
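
Aside on the trimming branch above: inodes that precede the requested start
are counted into ir_freecount and forced to "free" in the copied record, so
the later walk skips them and *icount reports only the usable tail of the
chunk.  A standalone sketch of that step; grab_icount() is a hypothetical
distillation, not the kernel function:

	#include <stdint.h>
	#include <stdio.h>

	#define INODES_PER_CHUNK	64

	static int grab_icount(uint64_t *ir_free, int *ir_freecount, int idx)
	{
		/* count allocated inodes before idx as if they were free ... */
		for (int i = 0; i < idx; i++)
			if ((1ULL << i) & ~*ir_free)
				(*ir_freecount)++;
		/* ... and mark them free so the walk ignores them */
		*ir_free |= (idx >= 64) ? ~0ULL : ((1ULL << idx) - 1);
		return INODES_PER_CHUNK - *ir_freecount;
	}

	int main(void)
	{
		uint64_t ir_free = 0;	/* whole chunk allocated */
		int ir_freecount = 0;

		/* start 10 inodes into the chunk: 54 remain interesting */
		printf("icount = %d\n", grab_icount(&ir_free, &ir_freecount, 10));
		return 0;
	}
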
+
 #define XFS_BULKSTAT_UBLEFT(ubleft)    ((ubleft) >= statstruct_size)
 
+/*
+ * Process the inodes in a chunk with a pointer to a formatter function
+ * that will iget each inode and fill in the appropriate structure.
+ */
+int
+xfs_bulkstat_ag_ichunk(
+       struct xfs_mount                *mp,
+       xfs_agnumber_t                  agno,
+       struct xfs_inobt_rec_incore     *irbp,
+       bulkstat_one_pf                 formatter,
+       size_t                          statstruct_size,
+       struct xfs_bulkstat_agichunk    *acp)
+{
+       xfs_ino_t                       lastino = acp->ac_lastino;
+       char                            __user **ubufp = acp->ac_ubuffer;
+       int                             ubleft = acp->ac_ubleft;
+       int                             ubelem = acp->ac_ubelem;
+       int                             chunkidx, clustidx;
+       int                             error = 0;
+       xfs_agino_t                     agino;
+
+       for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
+            XFS_BULKSTAT_UBLEFT(ubleft) &&
+            irbp->ir_freecount < XFS_INODES_PER_CHUNK;
+            chunkidx++, clustidx++, agino++) {
+               int             fmterror;       /* bulkstat formatter result */
+               int             ubused;
+               xfs_ino_t       ino = XFS_AGINO_TO_INO(mp, agno, agino);
+
+               ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
+
+               /* Skip if this inode is free */
+               if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
+                       lastino = ino;
+                       continue;
+               }
+
+               /*
+                * Count used inodes as free so we can tell when the
+                * chunk is used up.
+                */
+               irbp->ir_freecount++;
+
+               /* Get the inode and fill in a single buffer */
+               ubused = statstruct_size;
+               error = formatter(mp, ino, *ubufp, ubleft, &ubused, &fmterror);
+               if (fmterror == BULKSTAT_RV_NOTHING) {
+                       if (error && error != -ENOENT && error != -EINVAL) {
+                               ubleft = 0;
+                               break;
+                       }
+                       lastino = ino;
+                       continue;
+               }
+               if (fmterror == BULKSTAT_RV_GIVEUP) {
+                       ubleft = 0;
+                       ASSERT(error);
+                       break;
+               }
+               if (*ubufp)
+                       *ubufp += ubused;
+               ubleft -= ubused;
+               ubelem++;
+               lastino = ino;
+       }
+
+       acp->ac_lastino = lastino;
+       acp->ac_ubleft = ubleft;
+       acp->ac_ubelem = ubelem;
+
+       return error;
+}
+
 /*
  * Return stat information in bulk (by-inode) for the filesystem.
  */
@@ -190,13 +348,10 @@ xfs_bulkstat(
        char                    __user *ubuffer, /* buffer with inode stats */
        int                     *done)  /* 1 if there are more stats to get */
 {
-       xfs_agblock_t           agbno=0;/* allocation group block number */
        xfs_buf_t               *agbp;  /* agi header buffer */
        xfs_agi_t               *agi;   /* agi header data */
        xfs_agino_t             agino;  /* inode # in allocation group */
        xfs_agnumber_t          agno;   /* allocation group number */
-       int                     chunkidx; /* current index into inode chunk */
-       int                     clustidx; /* current index into inode cluster */
        xfs_btree_cur_t         *cur;   /* btree cursor for ialloc btree */
        int                     end_of_ag; /* set if we've seen the ag end */
        int                     error;  /* error code */
@@ -209,8 +364,6 @@ xfs_bulkstat(
        xfs_inobt_rec_incore_t  *irbuf; /* start of irec buffer */
        xfs_inobt_rec_incore_t  *irbufend; /* end of good irec buffer entries */
        xfs_ino_t               lastino; /* last inode number returned */
-       int                     blks_per_cluster; /* # of blocks per cluster */
-       int                     inodes_per_cluster;/* # of inodes per cluster */
        int                     nirbuf; /* size of irbuf */
        int                     rval;   /* return value error code */
        int                     tmp;    /* result value from btree calls */
@@ -218,7 +371,6 @@ xfs_bulkstat(
        int                     ubleft; /* bytes left in user's buffer */
        char                    __user *ubufp;  /* pointer into user's buffer */
        int                     ubelem; /* spaces used in user's buffer */
-       int                     ubused; /* bytes used by formatter */
 
        /*
         * Get the last inode value, see if there's nothing to do.
@@ -233,20 +385,16 @@ xfs_bulkstat(
                *ubcountp = 0;
                return 0;
        }
-       if (!ubcountp || *ubcountp <= 0) {
-               return EINVAL;
-       }
+
        ubcount = *ubcountp; /* statstruct's */
        ubleft = ubcount * statstruct_size; /* bytes */
        *ubcountp = ubelem = 0;
        *done = 0;
        fmterror = 0;
        ubufp = ubuffer;
-       blks_per_cluster = xfs_icluster_size_fsb(mp);
-       inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
        irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
        if (!irbuf)
-               return ENOMEM;
+               return -ENOMEM;
 
        nirbuf = irbsize / sizeof(*irbuf);
 
@@ -258,14 +406,8 @@ xfs_bulkstat(
        while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
                cond_resched();
                error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
-               if (error) {
-                       /*
-                        * Skip this allocation group and go to the next one.
-                        */
-                       agno++;
-                       agino = 0;
-                       continue;
-               }
+               if (error)
+                       break;
                agi = XFS_BUF_TO_AGI(agbp);
                /*
                 * Allocate and initialize a btree cursor for ialloc btree.
@@ -275,96 +417,39 @@ xfs_bulkstat(
                irbp = irbuf;
                irbufend = irbuf + nirbuf;
                end_of_ag = 0;
-               /*
-                * If we're returning in the middle of an allocation group,
-                * we need to get the remainder of the chunk we're in.
-                */
+               icount = 0;
                if (agino > 0) {
-                       xfs_inobt_rec_incore_t r;
-
                        /*
-                        * Lookup the inode chunk that this inode lives in.
+                        * In the middle of an allocation group, we need to get
+                        * the remainder of the chunk we're in.
                         */
-                       error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE,
-                                                &tmp);
-                       if (!error &&   /* no I/O error */
-                           tmp &&      /* lookup succeeded */
-                                       /* got the record, should always work */
-                           !(error = xfs_inobt_get_rec(cur, &r, &i)) &&
-                           i == 1 &&
-                                       /* this is the right chunk */
-                           agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
-                                       /* lastino was not last in chunk */
-                           (chunkidx = agino - r.ir_startino + 1) <
-                                   XFS_INODES_PER_CHUNK &&
-                                       /* there are some left allocated */
-                           xfs_inobt_maskn(chunkidx,
-                                   XFS_INODES_PER_CHUNK - chunkidx) &
-                                   ~r.ir_free) {
-                               /*
-                                * Grab the chunk record.  Mark all the
-                                * uninteresting inodes (because they're
-                                * before our start point) free.
-                                */
-                               for (i = 0; i < chunkidx; i++) {
-                                       if (XFS_INOBT_MASK(i) & ~r.ir_free)
-                                               r.ir_freecount++;
-                               }
-                               r.ir_free |= xfs_inobt_maskn(0, chunkidx);
+                       struct xfs_inobt_rec_incore     r;
+
+                       error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r);
+                       if (error)
+                               break;
+                       if (icount) {
                                irbp->ir_startino = r.ir_startino;
                                irbp->ir_freecount = r.ir_freecount;
                                irbp->ir_free = r.ir_free;
                                irbp++;
                                agino = r.ir_startino + XFS_INODES_PER_CHUNK;
-                               icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
-                       } else {
-                               /*
-                                * If any of those tests failed, bump the
-                                * inode number (just in case).
-                                */
-                               agino++;
-                               icount = 0;
                        }
-                       /*
-                        * In any case, increment to the next record.
-                        */
-                       if (!error)
-                               error = xfs_btree_increment(cur, 0, &tmp);
+                       /* Increment to the next record */
+                       error = xfs_btree_increment(cur, 0, &tmp);
                } else {
-                       /*
-                        * Start of ag.  Lookup the first inode chunk.
-                        */
+                       /* Start of ag.  Lookup the first inode chunk */
                        error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
-                       icount = 0;
                }
+               if (error)
+                       break;
+
                /*
                 * Loop through inode btree records in this ag,
                 * until we run out of inodes or space in the buffer.
                 */
                while (irbp < irbufend && icount < ubcount) {
-                       xfs_inobt_rec_incore_t r;
-
-                       /*
-                        * Loop as long as we're unable to read the
-                        * inode btree.
-                        */
-                       while (error) {
-                               agino += XFS_INODES_PER_CHUNK;
-                               if (XFS_AGINO_TO_AGBNO(mp, agino) >=
-                                               be32_to_cpu(agi->agi_length))
-                                       break;
-                               error = xfs_inobt_lookup(cur, agino,
-                                                        XFS_LOOKUP_GE, &tmp);
-                               cond_resched();
-                       }
-                       /*
-                        * If ran off the end of the ag either with an error,
-                        * or the normal way, set end and stop collecting.
-                        */
-                       if (error) {
-                               end_of_ag = 1;
-                               break;
-                       }
+                       struct xfs_inobt_rec_incore     r;
 
                        error = xfs_inobt_get_rec(cur, &r, &i);
                        if (error || i == 0) {
@@ -377,25 +462,7 @@ xfs_bulkstat(
                         * Also start read-ahead now for this chunk.
                         */
                        if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
-                               struct blk_plug plug;
-                               /*
-                                * Loop over all clusters in the next chunk.
-                                * Do a readahead if there are any allocated
-                                * inodes in that cluster.
-                                */
-                               blk_start_plug(&plug);
-                               agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
-                               for (chunkidx = 0;
-                                    chunkidx < XFS_INODES_PER_CHUNK;
-                                    chunkidx += inodes_per_cluster,
-                                    agbno += blks_per_cluster) {
-                                       if (xfs_inobt_maskn(chunkidx,
-                                           inodes_per_cluster) & ~r.ir_free)
-                                               xfs_btree_reada_bufs(mp, agno,
-                                                       agbno, blks_per_cluster,
-                                                       &xfs_inode_buf_ops);
-                               }
-                               blk_finish_plug(&plug);
+                               xfs_bulkstat_ichunk_ra(mp, agno, &r);
                                irbp->ir_startino = r.ir_startino;
                                irbp->ir_freecount = r.ir_freecount;
                                irbp->ir_free = r.ir_free;
@@ -422,57 +489,20 @@ xfs_bulkstat(
                irbufend = irbp;
                for (irbp = irbuf;
                     irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) {
-                       /*
-                        * Now process this chunk of inodes.
-                        */
-                       for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
-                            XFS_BULKSTAT_UBLEFT(ubleft) &&
-                               irbp->ir_freecount < XFS_INODES_PER_CHUNK;
-                            chunkidx++, clustidx++, agino++) {
-                               ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
-
-                               ino = XFS_AGINO_TO_INO(mp, agno, agino);
-                               /*
-                                * Skip if this inode is free.
-                                */
-                               if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
-                                       lastino = ino;
-                                       continue;
-                               }
-                               /*
-                                * Count used inodes as free so we can tell
-                                * when the chunk is used up.
-                                */
-                               irbp->ir_freecount++;
-
-                               /*
-                                * Get the inode and fill in a single buffer.
-                                */
-                               ubused = statstruct_size;
-                               error = formatter(mp, ino, ubufp, ubleft,
-                                                 &ubused, &fmterror);
-                               if (fmterror == BULKSTAT_RV_NOTHING) {
-                                       if (error && error != ENOENT &&
-                                               error != EINVAL) {
-                                               ubleft = 0;
-                                               rval = error;
-                                               break;
-                                       }
-                                       lastino = ino;
-                                       continue;
-                               }
-                               if (fmterror == BULKSTAT_RV_GIVEUP) {
-                                       ubleft = 0;
-                                       ASSERT(error);
-                                       rval = error;
-                                       break;
-                               }
-                               if (ubufp)
-                                       ubufp += ubused;
-                               ubleft -= ubused;
-                               ubelem++;
-                               lastino = ino;
-                       }
+                       struct xfs_bulkstat_agichunk ac;
+
+                       ac.ac_lastino = lastino;
+                       ac.ac_ubuffer = &ubuffer;
+                       ac.ac_ubleft = ubleft;
+                       ac.ac_ubelem = ubelem;
+                       error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
+                                       formatter, statstruct_size, &ac);
+                       if (error)
+                               rval = error;
+
+                       lastino = ac.ac_lastino;
+                       ubleft = ac.ac_ubleft;
+                       ubelem = ac.ac_ubelem;
 
                        cond_resched();
                }
@@ -512,58 +542,10 @@ xfs_bulkstat(
        return rval;
 }
 
-/*
- * Return stat information in bulk (by-inode) for the filesystem.
- * Special case for non-sequential one inode bulkstat.
- */
-int                                    /* error status */
-xfs_bulkstat_single(
-       xfs_mount_t             *mp,    /* mount point for filesystem */
-       xfs_ino_t               *lastinop, /* inode to return */
-       char                    __user *buffer, /* buffer with inode stats */
-       int                     *done)  /* 1 if there are more stats to get */
-{
-       int                     count;  /* count value for bulkstat call */
-       int                     error;  /* return value */
-       xfs_ino_t               ino;    /* filesystem inode number */
-       int                     res;    /* result from bs1 */
-
-       /*
-        * note that requesting valid inode numbers which are not allocated
-        * to inodes will most likely cause xfs_imap_to_bp to generate warning
-        * messages about bad magic numbers. This is ok. The fact that
-        * the inode isn't actually an inode is handled by the
-        * error check below. Done this way to make the usual case faster
-        * at the expense of the error case.
-        */
-
-       ino = *lastinop;
-       error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
-                                NULL, &res);
-       if (error) {
-               /*
-                * Special case way failed, do it the "long" way
-                * to see if that works.
-                */
-               (*lastinop)--;
-               count = 1;
-               if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one,
-                               sizeof(xfs_bstat_t), buffer, done))
-                       return error;
-               if (count == 0 || (xfs_ino_t)*lastinop != ino)
-                       return error == EFSCORRUPTED ?
-                               XFS_ERROR(EINVAL) : error;
-               else
-                       return 0;
-       }
-       *done = 0;
-       return 0;
-}
-
 int
 xfs_inumbers_fmt(
        void                    __user *ubuffer, /* buffer to write to */
-       const xfs_inogrp_t      *buffer,        /* buffer to read from */
+       const struct xfs_inogrp *buffer,        /* buffer to read from */
        long                    count,          /* # of elements to read */
        long                    *written)       /* # of bytes written */
 {
@@ -578,127 +560,104 @@ xfs_inumbers_fmt(
  */
 int                                    /* error status */
 xfs_inumbers(
-       xfs_mount_t     *mp,            /* mount point for filesystem */
-       xfs_ino_t       *lastino,       /* last inode returned */
-       int             *count,         /* size of buffer/count returned */
-       void            __user *ubuffer,/* buffer with inode descriptions */
-       inumbers_fmt_pf formatter)
+       struct xfs_mount        *mp,/* mount point for filesystem */
+       xfs_ino_t               *lastino,/* last inode returned */
+       int                     *count,/* size of buffer/count returned */
+       void                    __user *ubuffer,/* buffer with inode descriptions */
+       inumbers_fmt_pf         formatter)
 {
-       xfs_buf_t       *agbp;
-       xfs_agino_t     agino;
-       xfs_agnumber_t  agno;
-       int             bcount;
-       xfs_inogrp_t    *buffer;
-       int             bufidx;
-       xfs_btree_cur_t *cur;
-       int             error;
-       xfs_inobt_rec_incore_t r;
-       int             i;
-       xfs_ino_t       ino;
-       int             left;
-       int             tmp;
-
-       ino = (xfs_ino_t)*lastino;
-       agno = XFS_INO_TO_AGNO(mp, ino);
-       agino = XFS_INO_TO_AGINO(mp, ino);
-       left = *count;
+       xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, *lastino);
+       xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, *lastino);
+       struct xfs_btree_cur    *cur = NULL;
+       struct xfs_buf          *agbp = NULL;
+       struct xfs_inogrp       *buffer;
+       int                     bcount;
+       int                     left = *count;
+       int                     bufidx = 0;
+       int                     error = 0;
+
        *count = 0;
+       if (agno >= mp->m_sb.sb_agcount ||
+           *lastino != XFS_AGINO_TO_INO(mp, agno, agino))
+               return error;
+
        bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer)));
        buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP);
-       error = bufidx = 0;
-       cur = NULL;
-       agbp = NULL;
-       while (left > 0 && agno < mp->m_sb.sb_agcount) {
-               if (agbp == NULL) {
+       do {
+               struct xfs_inobt_rec_incore     r;
+               int                             stat;
+
+               if (!agbp) {
                        error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
-                       if (error) {
-                               /*
-                                * If we can't read the AGI of this ag,
-                                * then just skip to the next one.
-                                */
-                               ASSERT(cur == NULL);
-                               agbp = NULL;
-                               agno++;
-                               agino = 0;
-                               continue;
-                       }
+                       if (error)
+                               break;
+
                        cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
                                                    XFS_BTNUM_INO);
                        error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
-                                                &tmp);
-                       if (error) {
-                               xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-                               cur = NULL;
-                               xfs_buf_relse(agbp);
-                               agbp = NULL;
-                               /*
-                                * Move up the last inode in the current
-                                * chunk.  The lookup_ge will always get
-                                * us the first inode in the next chunk.
-                                */
-                               agino += XFS_INODES_PER_CHUNK - 1;
-                               continue;
-                       }
-               }
-               error = xfs_inobt_get_rec(cur, &r, &i);
-               if (error || i == 0) {
-                       xfs_buf_relse(agbp);
-                       agbp = NULL;
-                       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-                       cur = NULL;
-                       agno++;
-                       agino = 0;
-                       continue;
+                                                &stat);
+                       if (error)
+                               break;
+                       if (!stat)
+                               goto next_ag;
                }
+
+               error = xfs_inobt_get_rec(cur, &r, &stat);
+               if (error)
+                       break;
+               if (!stat)
+                       goto next_ag;
+
                agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
                buffer[bufidx].xi_startino =
                        XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
                buffer[bufidx].xi_alloccount =
                        XFS_INODES_PER_CHUNK - r.ir_freecount;
                buffer[bufidx].xi_allocmask = ~r.ir_free;
-               bufidx++;
-               left--;
-               if (bufidx == bcount) {
-                       long written;
-                       if (formatter(ubuffer, buffer, bufidx, &written)) {
-                               error = XFS_ERROR(EFAULT);
+               if (++bufidx == bcount) {
+                       long    written;
+
+                       error = formatter(ubuffer, buffer, bufidx, &written);
+                       if (error)
                                break;
-                       }
                        ubuffer += written;
                        *count += bufidx;
                        bufidx = 0;
                }
-               if (left) {
-                       error = xfs_btree_increment(cur, 0, &tmp);
-                       if (error) {
-                               xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-                               cur = NULL;
-                               xfs_buf_relse(agbp);
-                               agbp = NULL;
-                               /*
-                                * The agino value has already been bumped.
-                                * Just try to skip up to it.
-                                */
-                               agino += XFS_INODES_PER_CHUNK;
-                               continue;
-                       }
-               }
-       }
+               if (!--left)
+                       break;
+
+               error = xfs_btree_increment(cur, 0, &stat);
+               if (error)
+                       break;
+               if (stat)
+                       continue;
+
+next_ag:
+               xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+               cur = NULL;
+               xfs_buf_relse(agbp);
+               agbp = NULL;
+               agino = 0;
+       } while (++agno < mp->m_sb.sb_agcount);
+
        if (!error) {
                if (bufidx) {
-                       long written;
-                       if (formatter(ubuffer, buffer, bufidx, &written))
-                               error = XFS_ERROR(EFAULT);
-                       else
+                       long    written;
+
+                       error = formatter(ubuffer, buffer, bufidx, &written);
+                       if (!error)
                                *count += bufidx;
                }
                *lastino = XFS_AGINO_TO_INO(mp, agno, agino);
        }
+
        kmem_free(buffer);
        if (cur)
                xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR :
                                           XFS_BTREE_NOERROR));
        if (agbp)
                xfs_buf_relse(agbp);
+
        return error;
 }
index 97295d91d1703c21cbeacacc876f9eded07aeb38..aaed08022eb9e9cd7e271d86d54e308e01427c77 100644 (file)
@@ -30,6 +30,22 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount      *mp,
                               int              *ubused,
                               int              *stat);
 
+struct xfs_bulkstat_agichunk {
+       xfs_ino_t       ac_lastino;     /* last inode returned */
+       char            __user **ac_ubuffer;/* pointer into user's buffer */
+       int             ac_ubleft;      /* bytes left in user's buffer */
+       int             ac_ubelem;      /* spaces used in user's buffer */
+};
+
+int
+xfs_bulkstat_ag_ichunk(
+       struct xfs_mount                *mp,
+       xfs_agnumber_t                  agno,
+       struct xfs_inobt_rec_incore     *irbp,
+       bulkstat_one_pf                 formatter,
+       size_t                          statstruct_size,
+       struct xfs_bulkstat_agichunk    *acp);
+
 /*
  * Values for stat return value.
  */
@@ -50,13 +66,6 @@ xfs_bulkstat(
        char            __user *ubuffer,/* buffer with inode stats */
        int             *done);         /* 1 if there are more stats to get */
 
-int
-xfs_bulkstat_single(
-       xfs_mount_t             *mp,
-       xfs_ino_t               *lastinop,
-       char                    __user *buffer,
-       int                     *done);
-
 typedef int (*bulkstat_one_fmt_pf)(  /* used size in bytes or negative error */
        void                    __user *ubuffer, /* buffer to write to */
        int                     ubsize,          /* remaining user buffer sz */
index 825249d2dfc1a740b6c5523ac1808af53776efb0..d10dc8f397c970e7dacc6ec404bf932c36d0434c 100644 (file)
 
 #include <linux/types.h>
 
-/*
- * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
- * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
- */
-#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
-# define XFS_BIG_BLKNOS        1
-# define XFS_BIG_INUMS 1
-#else
-# define XFS_BIG_BLKNOS        0
-# define XFS_BIG_INUMS 0
-#endif
-
 /*
  * Kernel specific type declarations for XFS
  */
@@ -113,7 +101,7 @@ typedef __uint64_t __psunsigned_t;
 #include <asm/byteorder.h>
 #include <asm/unaligned.h>
 
-#include "xfs_vnode.h"
+#include "xfs_fs.h"
 #include "xfs_stats.h"
 #include "xfs_sysctl.h"
 #include "xfs_iops.h"
@@ -191,6 +179,17 @@ typedef __uint64_t __psunsigned_t;
 #define MAX(a,b)       (max(a,b))
 #define howmany(x, y)  (((x)+((y)-1))/(y))
 
+/*
+ * XFS wrapper structure for sysfs support. It depends on external data
+ * structures and is embedded in various internal data structures to implement
+ * the XFS sysfs object hierarchy. Define it here for broad access throughout
+ * the codebase.
+ */
+struct xfs_kobj {
+       struct kobject          kobject;
+       struct completion       complete;
+};
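
Aside on struct xfs_kobj: pairing a kobject with a completion is the usual
way to make teardown synchronous, since kobject_put() only drops a reference
and the release callback may run later.  A sketch of the lifetime pattern
the xfs_sysfs_init()/xfs_sysfs_del() calls added in this series rely on; the
function bodies here are illustrative, not the exact fs/xfs implementation:

	#include <linux/kobject.h>
	#include <linux/completion.h>

	static void example_sysfs_release(struct kobject *kobject)
	{
		struct xfs_kobj	*kobj = container_of(kobject, struct xfs_kobj,
						     kobject);

		complete(&kobj->complete);	/* wake the waiter in teardown */
	}

	static void example_sysfs_del(struct xfs_kobj *kobj)
	{
		kobject_del(&kobj->kobject);
		kobject_put(&kobj->kobject);	/* may not be the final put */
		wait_for_completion(&kobj->complete);	/* now it is gone */
	}
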
+
 /* Kernel uid/gid conversion. These are used to convert to/from the on disk
  * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
  * The conversion here is type only, the value will remain the same since we
@@ -331,7 +330,7 @@ static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
 {
        x += y - 1;
        do_div(x, y);
-       return(x * y);
+       return x * y;
 }
 
 static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
index 292308dede6da566500883ccfb97122adb59b89f..ca4fd5bd8522ced2b17363487a1d0d39d60f4a8f 100644 (file)
@@ -34,6 +34,7 @@
 #include "xfs_trace.h"
 #include "xfs_fsops.h"
 #include "xfs_cksum.h"
+#include "xfs_sysfs.h"
 
 kmem_zone_t    *xfs_log_ticket_zone;
 
@@ -283,7 +284,7 @@ xlog_grant_head_wait(
        return 0;
 shutdown:
        list_del_init(&tic->t_queue);
-       return XFS_ERROR(EIO);
+       return -EIO;
 }
 
 /*
@@ -377,7 +378,7 @@ xfs_log_regrant(
        int                     error = 0;
 
        if (XLOG_FORCED_SHUTDOWN(log))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        XFS_STATS_INC(xs_try_logspace);
 
@@ -446,7 +447,7 @@ xfs_log_reserve(
        ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
 
        if (XLOG_FORCED_SHUTDOWN(log))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        XFS_STATS_INC(xs_try_logspace);
 
@@ -454,7 +455,7 @@ xfs_log_reserve(
        tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
                                KM_SLEEP | KM_MAYFAIL);
        if (!tic)
-               return XFS_ERROR(ENOMEM);
+               return -ENOMEM;
 
        tic->t_trans_type = t_type;
        *ticp = tic;
@@ -590,7 +591,7 @@ xfs_log_release_iclog(
 {
        if (xlog_state_release_iclog(mp->m_log, iclog)) {
                xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
-               return EIO;
+               return -EIO;
        }
 
        return 0;
@@ -628,7 +629,7 @@ xfs_log_mount(
 
        mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
        if (IS_ERR(mp->m_log)) {
-               error = -PTR_ERR(mp->m_log);
+               error = PTR_ERR(mp->m_log);
                goto out;
        }
 
@@ -652,18 +653,18 @@ xfs_log_mount(
                xfs_warn(mp,
                "Log size %d blocks too small, minimum size is %d blocks",
                         mp->m_sb.sb_logblocks, min_logfsbs);
-               error = EINVAL;
+               error = -EINVAL;
        } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
                xfs_warn(mp,
                "Log size %d blocks too large, maximum size is %lld blocks",
                         mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
-               error = EINVAL;
+               error = -EINVAL;
        } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
                xfs_warn(mp,
                "log size %lld bytes too large, maximum size is %lld bytes",
                         XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
                         XFS_MAX_LOG_BYTES);
-               error = EINVAL;
+               error = -EINVAL;
        }
        if (error) {
                if (xfs_sb_version_hascrc(&mp->m_sb)) {
@@ -707,6 +708,11 @@ xfs_log_mount(
                }
        }
 
+       error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj,
+                              "log");
+       if (error)
+               goto out_destroy_ail;
+
        /* Normal transactions can now occur */
        mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
 
@@ -947,6 +953,9 @@ xfs_log_unmount(
        xfs_log_quiesce(mp);
 
        xfs_trans_ail_destroy(mp);
+
+       xfs_sysfs_del(&mp->m_log->l_kobj);
+
        xlog_dealloc_log(mp->m_log);
 }
 
@@ -1313,7 +1322,7 @@ xlog_alloc_log(
        xlog_in_core_t          *iclog, *prev_iclog=NULL;
        xfs_buf_t               *bp;
        int                     i;
-       int                     error = ENOMEM;
+       int                     error = -ENOMEM;
        uint                    log2_size = 0;
 
        log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL);
@@ -1340,7 +1349,7 @@ xlog_alloc_log(
        xlog_grant_head_init(&log->l_reserve_head);
        xlog_grant_head_init(&log->l_write_head);
 
-       error = EFSCORRUPTED;
+       error = -EFSCORRUPTED;
        if (xfs_sb_version_hassector(&mp->m_sb)) {
                log2_size = mp->m_sb.sb_logsectlog;
                if (log2_size < BBSHIFT) {
@@ -1369,8 +1378,14 @@ xlog_alloc_log(
 
        xlog_get_iclog_buffer_size(mp, log);
 
-       error = ENOMEM;
-       bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
+       /*
+        * Use a NULL block for the extra log buffer used during splits so that
+        * it will trigger errors if we ever try to do IO on it without first
+        * having set it up properly.
+        */
+       error = -ENOMEM;
+       bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
+                          BTOBB(log->l_iclog_size), 0);
        if (!bp)
                goto out_free_log;
 
@@ -1463,7 +1478,7 @@ out_free_iclog:
 out_free_log:
        kmem_free(log);
 out:
-       return ERR_PTR(-error);
+       return ERR_PTR(error);
 }      /* xlog_alloc_log */
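
Aside on the xfs_buf_alloc() change above: the split-processing buffer was
previously created at disk address 0, a valid location, so unintended IO on
it could reach real blocks.  XFS_BUF_DADDR_NULL is the all-ones "no disk
address" sentinel from xfs_buf.h, so such IO now fails loudly instead:

	#define XFS_BUF_DADDR_NULL	((xfs_daddr_t)(-1LL))
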
 
 
@@ -1661,7 +1676,7 @@ xlog_bdstrat(
 
        xfs_buf_lock(bp);
        if (iclog->ic_state & XLOG_STATE_IOERROR) {
-               xfs_buf_ioerror(bp, EIO);
+               xfs_buf_ioerror(bp, -EIO);
                xfs_buf_stale(bp);
                xfs_buf_ioend(bp, 0);
                /*
@@ -2360,7 +2375,7 @@ xlog_write(
 
                        ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
                        if (!ophdr)
-                               return XFS_ERROR(EIO);
+                               return -EIO;
 
                        xlog_write_adv_cnt(&ptr, &len, &log_offset,
                                           sizeof(struct xlog_op_header));
@@ -2859,7 +2874,7 @@ restart:
        spin_lock(&log->l_icloglock);
        if (XLOG_FORCED_SHUTDOWN(log)) {
                spin_unlock(&log->l_icloglock);
-               return XFS_ERROR(EIO);
+               return -EIO;
        }
 
        iclog = log->l_iclog;
@@ -3047,7 +3062,7 @@ xlog_state_release_iclog(
        int             sync = 0;       /* do we sync? */
 
        if (iclog->ic_state & XLOG_STATE_IOERROR)
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
        if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
@@ -3055,7 +3070,7 @@ xlog_state_release_iclog(
 
        if (iclog->ic_state & XLOG_STATE_IOERROR) {
                spin_unlock(&log->l_icloglock);
-               return XFS_ERROR(EIO);
+               return -EIO;
        }
        ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
               iclog->ic_state == XLOG_STATE_WANT_SYNC);
@@ -3172,7 +3187,7 @@ _xfs_log_force(
        iclog = log->l_iclog;
        if (iclog->ic_state & XLOG_STATE_IOERROR) {
                spin_unlock(&log->l_icloglock);
-               return XFS_ERROR(EIO);
+               return -EIO;
        }
 
        /* If the head iclog is not active nor dirty, we just attach
@@ -3210,7 +3225,7 @@ _xfs_log_force(
                                spin_unlock(&log->l_icloglock);
 
                                if (xlog_state_release_iclog(log, iclog))
-                                       return XFS_ERROR(EIO);
+                                       return -EIO;
 
                                if (log_flushed)
                                        *log_flushed = 1;
@@ -3246,7 +3261,7 @@ maybe_sleep:
                 */
                if (iclog->ic_state & XLOG_STATE_IOERROR) {
                        spin_unlock(&log->l_icloglock);
-                       return XFS_ERROR(EIO);
+                       return -EIO;
                }
                XFS_STATS_INC(xs_log_force_sleep);
                xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
@@ -3256,7 +3271,7 @@ maybe_sleep:
                 * and the memory read should be atomic.
                 */
                if (iclog->ic_state & XLOG_STATE_IOERROR)
-                       return XFS_ERROR(EIO);
+                       return -EIO;
                if (log_flushed)
                        *log_flushed = 1;
        } else {
@@ -3324,7 +3339,7 @@ try_again:
        iclog = log->l_iclog;
        if (iclog->ic_state & XLOG_STATE_IOERROR) {
                spin_unlock(&log->l_icloglock);
-               return XFS_ERROR(EIO);
+               return -EIO;
        }
 
        do {
@@ -3375,7 +3390,7 @@ try_again:
                        xlog_state_switch_iclogs(log, iclog, 0);
                        spin_unlock(&log->l_icloglock);
                        if (xlog_state_release_iclog(log, iclog))
-                               return XFS_ERROR(EIO);
+                               return -EIO;
                        if (log_flushed)
                                *log_flushed = 1;
                        spin_lock(&log->l_icloglock);
@@ -3390,7 +3405,7 @@ try_again:
                         */
                        if (iclog->ic_state & XLOG_STATE_IOERROR) {
                                spin_unlock(&log->l_icloglock);
-                               return XFS_ERROR(EIO);
+                               return -EIO;
                        }
                        XFS_STATS_INC(xs_log_force_sleep);
                        xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
@@ -3400,7 +3415,7 @@ try_again:
                         * and the memory read should be atomic.
                         */
                        if (iclog->ic_state & XLOG_STATE_IOERROR)
-                               return XFS_ERROR(EIO);
+                               return -EIO;
 
                        if (log_flushed)
                                *log_flushed = 1;
index b3425b34e3d5782d95252c71acadfffdbd24ad4c..f6b79e5325dd4426da54b940fd1bea0ca170a6e8 100644 (file)
@@ -78,8 +78,6 @@ xlog_cil_init_post_recovery(
 {
        log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
        log->l_cilp->xc_ctx->sequence = 1;
-       log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
-                                                               log->l_curr_block);
 }
 
 /*
@@ -634,7 +632,7 @@ out_abort_free_ticket:
        xfs_log_ticket_put(tic);
 out_abort:
        xlog_cil_committed(ctx, XFS_LI_ABORTED);
-       return XFS_ERROR(EIO);
+       return -EIO;
 }
 
 static void
@@ -928,12 +926,12 @@ xlog_cil_init(
 
        cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
        if (!cil)
-               return ENOMEM;
+               return -ENOMEM;
 
        ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
        if (!ctx) {
                kmem_free(cil);
-               return ENOMEM;
+               return -ENOMEM;
        }
 
        INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h
deleted file mode 100644 (file)
index f0969c7..0000000
+++ /dev/null
@@ -1,679 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef        __XFS_LOG_FORMAT_H__
-#define __XFS_LOG_FORMAT_H__
-
-struct xfs_mount;
-struct xfs_trans_res;
-
-/*
- * On-disk Log Format definitions.
- *
- * This file contains all the on-disk format definitions used within the log. It
- * includes the physical log structure itself, as well as all the log item
- * format structures that are written into the log and intepreted by log
- * recovery. We start with the physical log format definitions, and then work
- * through all the log items definitions and everything they encode into the
- * log.
- */
-typedef __uint32_t xlog_tid_t;
-
-#define XLOG_MIN_ICLOGS                2
-#define XLOG_MAX_ICLOGS                8
-#define XLOG_HEADER_MAGIC_NUM  0xFEEDbabe      /* Invalid cycle number */
-#define XLOG_VERSION_1         1
-#define XLOG_VERSION_2         2               /* Large IClogs, Log sunit */
-#define XLOG_VERSION_OKBITS    (XLOG_VERSION_1 | XLOG_VERSION_2)
-#define XLOG_MIN_RECORD_BSIZE  (16*1024)       /* eventually 32k */
-#define XLOG_BIG_RECORD_BSIZE  (32*1024)       /* 32k buffers */
-#define XLOG_MAX_RECORD_BSIZE  (256*1024)
-#define XLOG_HEADER_CYCLE_SIZE (32*1024)       /* cycle data in header */
-#define XLOG_MIN_RECORD_BSHIFT 14              /* 16384 == 1 << 14 */
-#define XLOG_BIG_RECORD_BSHIFT 15              /* 32k == 1 << 15 */
-#define XLOG_MAX_RECORD_BSHIFT 18              /* 256k == 1 << 18 */
-#define XLOG_BTOLSUNIT(log, b)  (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
-                                 (log)->l_mp->m_sb.sb_logsunit)
-#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
-
-#define XLOG_HEADER_SIZE       512
-
-/* Minimum number of transactions that must fit in the log (defined by mkfs) */
-#define XFS_MIN_LOG_FACTOR     3
-
-#define XLOG_REC_SHIFT(log) \
-       BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
-        XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
-#define XLOG_TOTAL_REC_SHIFT(log) \
-       BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
-        XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
-
-/* get lsn fields */
-#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
-#define BLOCK_LSN(lsn) ((uint)(lsn))
-
-/* this is used in a spot where we might otherwise double-endian-flip */
-#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
-
-static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
-{
-       return ((xfs_lsn_t)cycle << 32) | block;
-}
-
-static inline uint xlog_get_cycle(char *ptr)
-{
-       if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
-               return be32_to_cpu(*((__be32 *)ptr + 1));
-       else
-               return be32_to_cpu(*(__be32 *)ptr);
-}
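
These helpers treat an LSN as a packed pair: the cycle number lives in the high 32 bits and the basic-block number in the low 32 bits, so an ordinary 64-bit comparison orders LSNs by cycle first, then block. A self-contained round trip of the same idea (illustrative names, not the kernel macros):

#include <assert.h>
#include <stdint.h>

/* lsn = (cycle << 32) | block: cycle in the high word, block in the low. */
static uint64_t lsn_pack(uint32_t cycle, uint32_t block)
{
        return ((uint64_t)cycle << 32) | block;
}

static uint32_t lsn_cycle(uint64_t lsn) { return (uint32_t)(lsn >> 32); }
static uint32_t lsn_block(uint64_t lsn) { return (uint32_t)lsn; }

int main(void)
{
        uint64_t lsn = lsn_pack(7, 4096);

        assert(lsn_cycle(lsn) == 7);
        assert(lsn_block(lsn) == 4096);
        /* A later cycle always compares greater, whatever the block numbers. */
        assert(lsn_pack(8, 0) > lsn_pack(7, 0xffffffffu));
        return 0;
}
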
-
-/* Log Clients */
-#define XFS_TRANSACTION                0x69
-#define XFS_VOLUME             0x2
-#define XFS_LOG                        0xaa
-
-#define XLOG_UNMOUNT_TYPE      0x556e  /* Un for Unmount */
-
-/* Region types for iovec's i_type */
-#define XLOG_REG_TYPE_BFORMAT          1
-#define XLOG_REG_TYPE_BCHUNK           2
-#define XLOG_REG_TYPE_EFI_FORMAT       3
-#define XLOG_REG_TYPE_EFD_FORMAT       4
-#define XLOG_REG_TYPE_IFORMAT          5
-#define XLOG_REG_TYPE_ICORE            6
-#define XLOG_REG_TYPE_IEXT             7
-#define XLOG_REG_TYPE_IBROOT           8
-#define XLOG_REG_TYPE_ILOCAL           9
-#define XLOG_REG_TYPE_IATTR_EXT                10
-#define XLOG_REG_TYPE_IATTR_BROOT      11
-#define XLOG_REG_TYPE_IATTR_LOCAL      12
-#define XLOG_REG_TYPE_QFORMAT          13
-#define XLOG_REG_TYPE_DQUOT            14
-#define XLOG_REG_TYPE_QUOTAOFF         15
-#define XLOG_REG_TYPE_LRHEADER         16
-#define XLOG_REG_TYPE_UNMOUNT          17
-#define XLOG_REG_TYPE_COMMIT           18
-#define XLOG_REG_TYPE_TRANSHDR         19
-#define XLOG_REG_TYPE_ICREATE          20
-#define XLOG_REG_TYPE_MAX              20
-
-/*
- * Flags to log operation header
- *
- * The first write of a new transaction is preceded by a start
- * record, XLOG_START_TRANS.  Once a transaction is committed, a commit
- * record, XLOG_COMMIT_TRANS, is written.  If a single region cannot fit into
- * the remainder of the current active in-core log, it is split up into
- * multiple regions.  Each partial region is marked with
- * XLOG_CONTINUE_TRANS until the last one, which gets marked with
- * XLOG_END_TRANS.
- */
-#define XLOG_START_TRANS       0x01    /* Start a new transaction */
-#define XLOG_COMMIT_TRANS      0x02    /* Commit this transaction */
-#define XLOG_CONTINUE_TRANS    0x04    /* Cont this trans into new region */
-#define XLOG_WAS_CONT_TRANS    0x08    /* Trans was continued from prev region */
-#define XLOG_END_TRANS         0x10    /* End a continued transaction */
-#define XLOG_UNMOUNT_TRANS     0x20    /* Unmount a filesystem transaction */
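
During recovery these flags drive a simple per-operation state machine: start, zero or more continued regions, then commit. The toy classifier below reuses the constants defined above; the decode logic itself is illustrative, not lifted from the kernel's recovery loop:

#include <stdio.h>

#define XLOG_START_TRANS        0x01
#define XLOG_COMMIT_TRANS       0x02
#define XLOG_CONTINUE_TRANS     0x04
#define XLOG_WAS_CONT_TRANS     0x08
#define XLOG_END_TRANS          0x10

/* Describe one op header's role in the split-region protocol. */
static const char *op_state(unsigned char flags)
{
        if (flags & XLOG_START_TRANS)
                return "start record";
        if (flags & XLOG_COMMIT_TRANS)
                return "commit record";
        if (flags & XLOG_CONTINUE_TRANS)
                return "partial region, more to come";
        if (flags & XLOG_END_TRANS)
                return "final piece of a continued region";
        return "complete region";
}

int main(void)
{
        unsigned char ops[] = { XLOG_START_TRANS, XLOG_CONTINUE_TRANS,
                                XLOG_END_TRANS, XLOG_COMMIT_TRANS };

        for (unsigned int i = 0; i < sizeof(ops); i++)
                printf("op %u: %s\n", i, op_state(ops[i]));
        return 0;
}
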
-
-
-typedef struct xlog_op_header {
-       __be32     oh_tid;      /* transaction id of operation  :  4 b */
-       __be32     oh_len;      /* bytes in data region         :  4 b */
-       __u8       oh_clientid; /* who sent me this             :  1 b */
-       __u8       oh_flags;    /*                              :  1 b */
-       __u16      oh_res2;     /* 32 bit align                 :  2 b */
-} xlog_op_header_t;
-
-/* valid values for h_fmt */
-#define XLOG_FMT_UNKNOWN  0
-#define XLOG_FMT_LINUX_LE 1
-#define XLOG_FMT_LINUX_BE 2
-#define XLOG_FMT_IRIX_BE  3
-
-/* our fmt */
-#ifdef XFS_NATIVE_HOST
-#define XLOG_FMT XLOG_FMT_LINUX_BE
-#else
-#define XLOG_FMT XLOG_FMT_LINUX_LE
-#endif
-
-typedef struct xlog_rec_header {
-       __be32    h_magicno;    /* log record (LR) identifier           :  4 */
-       __be32    h_cycle;      /* write cycle of log                   :  4 */
-       __be32    h_version;    /* LR version                           :  4 */
-       __be32    h_len;        /* len in bytes; should be 64-bit aligned: 4 */
-       __be64    h_lsn;        /* lsn of this LR                       :  8 */
-       __be64    h_tail_lsn;   /* lsn of 1st LR w/ buffers not committed: 8 */
-       __le32    h_crc;        /* crc of log record                    :  4 */
-       __be32    h_prev_block; /* block number to previous LR          :  4 */
-       __be32    h_num_logops; /* number of log operations in this LR  :  4 */
-       __be32    h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
-       /* new fields */
-       __be32    h_fmt;        /* format of log record                 :  4 */
-       uuid_t    h_fs_uuid;    /* uuid of FS                           : 16 */
-       __be32    h_size;       /* iclog size                           :  4 */
-} xlog_rec_header_t;
-
-typedef struct xlog_rec_ext_header {
-       __be32    xh_cycle;     /* write cycle of log                   : 4 */
-       __be32    xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /*    : 256 */
-} xlog_rec_ext_header_t;
-
-/*
- * Quite misnamed, because this union lays out the actual on-disk log buffer.
- */
-typedef union xlog_in_core2 {
-       xlog_rec_header_t       hic_header;
-       xlog_rec_ext_header_t   hic_xheader;
-       char                    hic_sector[XLOG_HEADER_SIZE];
-} xlog_in_core_2_t;
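
h_cycle_data and xh_cycle_data exist because the log writer stamps the first 32-bit word of every 512-byte basic block of a record with the current cycle number, letting head/tail search classify blocks by cycle alone; the displaced words are stashed in the header and put back after a read. A userspace sketch of that save/stamp/restore round trip, simplified to one flat buffer with no extended headers:

#include <assert.h>
#include <stdint.h>

#define BBSIZE  512
#define NBLKS   4
#define BBWORDS (BBSIZE / 4)

/* Save the first word of each basic block, then overwrite it with the cycle. */
static void stamp_cycle(uint32_t *blocks, uint32_t *saved, uint32_t cycle)
{
        for (int i = 0; i < NBLKS; i++) {
                saved[i] = blocks[i * BBWORDS];
                blocks[i * BBWORDS] = cycle;
        }
}

/* Undo the stamping using the words kept in the record header. */
static void restore_cycle(uint32_t *blocks, const uint32_t *saved)
{
        for (int i = 0; i < NBLKS; i++)
                blocks[i * BBWORDS] = saved[i];
}

int main(void)
{
        static uint32_t buf[NBLKS * BBWORDS];
        uint32_t saved[NBLKS];

        for (int i = 0; i < NBLKS; i++)
                buf[i * BBWORDS] = 0xdead0000u | i;

        stamp_cycle(buf, saved, 42);
        for (int i = 0; i < NBLKS; i++)
                assert(buf[i * BBWORDS] == 42); /* classifiable by cycle */

        restore_cycle(buf, saved);
        for (int i = 0; i < NBLKS; i++)
                assert(buf[i * BBWORDS] == (0xdead0000u | i));
        return 0;
}
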
-
-/* not an on-disk structure, but needed by log recovery in userspace */
-typedef struct xfs_log_iovec {
-       void            *i_addr;        /* beginning address of region */
-       int             i_len;          /* length in bytes of region */
-       uint            i_type;         /* type of region */
-} xfs_log_iovec_t;
-
-
-/*
- * Transaction Header definitions.
- *
- * This is the structure written in the log at the head of every transaction. It
- * identifies the type and id of the transaction, and contains the number of
- * items logged by the transaction so we know how many to expect during
- * recovery.
- *
- * Do not change the below structure without redoing the code in
- * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
- */
-typedef struct xfs_trans_header {
-       uint            th_magic;               /* magic number */
-       uint            th_type;                /* transaction type */
-       __int32_t       th_tid;                 /* transaction id (unused) */
-       uint            th_num_items;           /* num items logged by trans */
-} xfs_trans_header_t;
-
-#define        XFS_TRANS_HEADER_MAGIC  0x5452414e      /* TRAN */
-
-/*
- * Log item types.
- */
-#define        XFS_LI_EFI              0x1236
-#define        XFS_LI_EFD              0x1237
-#define        XFS_LI_IUNLINK          0x1238
-#define        XFS_LI_INODE            0x123b  /* aligned ino chunks, var-size ibufs */
-#define        XFS_LI_BUF              0x123c  /* v2 bufs, variable sized inode bufs */
-#define        XFS_LI_DQUOT            0x123d
-#define        XFS_LI_QUOTAOFF         0x123e
-#define        XFS_LI_ICREATE          0x123f
-
-#define XFS_LI_TYPE_DESC \
-       { XFS_LI_EFI,           "XFS_LI_EFI" }, \
-       { XFS_LI_EFD,           "XFS_LI_EFD" }, \
-       { XFS_LI_IUNLINK,       "XFS_LI_IUNLINK" }, \
-       { XFS_LI_INODE,         "XFS_LI_INODE" }, \
-       { XFS_LI_BUF,           "XFS_LI_BUF" }, \
-       { XFS_LI_DQUOT,         "XFS_LI_DQUOT" }, \
-       { XFS_LI_QUOTAOFF,      "XFS_LI_QUOTAOFF" }, \
-       { XFS_LI_ICREATE,       "XFS_LI_ICREATE" }
-
-/*
- * Inode Log Item Format definitions.
- *
- * This is the structure used to lay out an inode log item in the
- * log.  The size of the inline data/extents/b-tree root to be logged
- * (if any) is indicated in the ilf_dsize field.  Changes to this structure
- * must be appended to the end.
- */
-typedef struct xfs_inode_log_format {
-       __uint16_t              ilf_type;       /* inode log item type */
-       __uint16_t              ilf_size;       /* size of this item */
-       __uint32_t              ilf_fields;     /* flags for fields logged */
-       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
-       __uint16_t              ilf_dsize;      /* size of data/ext/root */
-       __uint64_t              ilf_ino;        /* inode number */
-       union {
-               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
-               uuid_t          ilfu_uuid;      /* mount point value */
-       } ilf_u;
-       __int64_t               ilf_blkno;      /* blkno of inode buffer */
-       __int32_t               ilf_len;        /* len of inode buffer */
-       __int32_t               ilf_boffset;    /* off of inode in buffer */
-} xfs_inode_log_format_t;
-
-typedef struct xfs_inode_log_format_32 {
-       __uint16_t              ilf_type;       /* inode log item type */
-       __uint16_t              ilf_size;       /* size of this item */
-       __uint32_t              ilf_fields;     /* flags for fields logged */
-       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
-       __uint16_t              ilf_dsize;      /* size of data/ext/root */
-       __uint64_t              ilf_ino;        /* inode number */
-       union {
-               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
-               uuid_t          ilfu_uuid;      /* mount point value */
-       } ilf_u;
-       __int64_t               ilf_blkno;      /* blkno of inode buffer */
-       __int32_t               ilf_len;        /* len of inode buffer */
-       __int32_t               ilf_boffset;    /* off of inode in buffer */
-} __attribute__((packed)) xfs_inode_log_format_32_t;
-
-typedef struct xfs_inode_log_format_64 {
-       __uint16_t              ilf_type;       /* inode log item type */
-       __uint16_t              ilf_size;       /* size of this item */
-       __uint32_t              ilf_fields;     /* flags for fields logged */
-       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
-       __uint16_t              ilf_dsize;      /* size of data/ext/root */
-       __uint32_t              ilf_pad;        /* pad for 64 bit boundary */
-       __uint64_t              ilf_ino;        /* inode number */
-       union {
-               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
-               uuid_t          ilfu_uuid;      /* mount point value */
-       } ilf_u;
-       __int64_t               ilf_blkno;      /* blkno of inode buffer */
-       __int32_t               ilf_len;        /* len of inode buffer */
-       __int32_t               ilf_boffset;    /* off of inode in buffer */
-} xfs_inode_log_format_64_t;
-
-/*
- * Flags for xfs_trans_log_inode flags field.
- */
-#define        XFS_ILOG_CORE   0x001   /* log standard inode fields */
-#define        XFS_ILOG_DDATA  0x002   /* log i_df.if_data */
-#define        XFS_ILOG_DEXT   0x004   /* log i_df.if_extents */
-#define        XFS_ILOG_DBROOT 0x008   /* log i_df.i_broot */
-#define        XFS_ILOG_DEV    0x010   /* log the dev field */
-#define        XFS_ILOG_UUID   0x020   /* log the uuid field */
-#define        XFS_ILOG_ADATA  0x040   /* log i_af.if_data */
-#define        XFS_ILOG_AEXT   0x080   /* log i_af.if_extents */
-#define        XFS_ILOG_ABROOT 0x100   /* log i_af.i_broot */
-#define XFS_ILOG_DOWNER        0x200   /* change the data fork owner on replay */
-#define XFS_ILOG_AOWNER        0x400   /* change the attr fork owner on replay */
-
-
-/*
- * The timestamps are dirty, but not necessarily anything else in the inode
- * core.  Unlike the other fields above, this one must never make it to disk
- * in the ilf_fields of the inode_log_format, but is purely stored in memory in
- * ili_fields in the inode_log_item.
- */
-#define XFS_ILOG_TIMESTAMP     0x4000
-
-#define        XFS_ILOG_NONCORE        (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
-                                XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
-                                XFS_ILOG_UUID | XFS_ILOG_ADATA | \
-                                XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
-                                XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
-
-#define        XFS_ILOG_DFORK          (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
-                                XFS_ILOG_DBROOT)
-
-#define        XFS_ILOG_AFORK          (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
-                                XFS_ILOG_ABROOT)
-
-#define        XFS_ILOG_ALL            (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
-                                XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
-                                XFS_ILOG_DEV | XFS_ILOG_UUID | \
-                                XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
-                                XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \
-                                XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
-
-static inline int xfs_ilog_fbroot(int w)
-{
-       return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
-}
-
-static inline int xfs_ilog_fext(int w)
-{
-       return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
-}
-
-static inline int xfs_ilog_fdata(int w)
-{
-       return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
-}
-
-/*
- * Incore version of the on-disk inode core structures. We log this directly
- * into the journal in host CPU format (for better or worse), and as such it
- * directly mirrors the xfs_dinode structure, as it must contain all the same
- * information.
- */
-typedef struct xfs_ictimestamp {
-       __int32_t       t_sec;          /* timestamp seconds */
-       __int32_t       t_nsec;         /* timestamp nanoseconds */
-} xfs_ictimestamp_t;
-
-/*
- * NOTE:  This structure must be kept identical to struct xfs_dinode
- *       in xfs_dinode.h except for the endianness annotations.
- */
-typedef struct xfs_icdinode {
-       __uint16_t      di_magic;       /* inode magic # = XFS_DINODE_MAGIC */
-       __uint16_t      di_mode;        /* mode and type of file */
-       __int8_t        di_version;     /* inode version */
-       __int8_t        di_format;      /* format of di_c data */
-       __uint16_t      di_onlink;      /* old number of links to file */
-       __uint32_t      di_uid;         /* owner's user id */
-       __uint32_t      di_gid;         /* owner's group id */
-       __uint32_t      di_nlink;       /* number of links to file */
-       __uint16_t      di_projid_lo;   /* lower part of owner's project id */
-       __uint16_t      di_projid_hi;   /* higher part of owner's project id */
-       __uint8_t       di_pad[6];      /* unused, zeroed space */
-       __uint16_t      di_flushiter;   /* incremented on flush */
-       xfs_ictimestamp_t di_atime;     /* time last accessed */
-       xfs_ictimestamp_t di_mtime;     /* time last modified */
-       xfs_ictimestamp_t di_ctime;     /* time created/inode modified */
-       xfs_fsize_t     di_size;        /* number of bytes in file */
-       xfs_drfsbno_t   di_nblocks;     /* # of direct & btree blocks used */
-       xfs_extlen_t    di_extsize;     /* basic/minimum extent size for file */
-       xfs_extnum_t    di_nextents;    /* number of extents in data fork */
-       xfs_aextnum_t   di_anextents;   /* number of extents in attribute fork*/
-       __uint8_t       di_forkoff;     /* attr fork offs, <<3 for 64b align */
-       __int8_t        di_aformat;     /* format of attr fork's data */
-       __uint32_t      di_dmevmask;    /* DMIG event mask */
-       __uint16_t      di_dmstate;     /* DMIG state info */
-       __uint16_t      di_flags;       /* random flags, XFS_DIFLAG_... */
-       __uint32_t      di_gen;         /* generation number */
-
-       /* di_next_unlinked is the only non-core field in the old dinode */
-       xfs_agino_t     di_next_unlinked;/* agi unlinked list ptr */
-
-       /* start of the extended dinode, writable fields */
-       __uint32_t      di_crc;         /* CRC of the inode */
-       __uint64_t      di_changecount; /* number of attribute changes */
-       xfs_lsn_t       di_lsn;         /* flush sequence */
-       __uint64_t      di_flags2;      /* more random flags */
-       __uint8_t       di_pad2[16];    /* more padding for future expansion */
-
-       /* fields only written to during inode creation */
-       xfs_ictimestamp_t di_crtime;    /* time created */
-       xfs_ino_t       di_ino;         /* inode number */
-       uuid_t          di_uuid;        /* UUID of the filesystem */
-
-       /* structure must be padded to 64 bit alignment */
-} xfs_icdinode_t;
-
-static inline uint xfs_icdinode_size(int version)
-{
-       if (version == 3)
-               return sizeof(struct xfs_icdinode);
-       return offsetof(struct xfs_icdinode, di_next_unlinked);
-}
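
The offsetof() trick above sizes pre-v3 inodes up to, but not including, di_next_unlinked, the first non-core field. The same pattern in miniature, on a hypothetical versioned record:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct record {
        uint32_t core_a;        /* present in all versions */
        uint32_t core_b;
        uint64_t v3_crc;        /* first field that only v3 carries */
};

/* Logged size depends on the on-disk version of the record. */
static size_t record_size(int version)
{
        if (version == 3)
                return sizeof(struct record);
        return offsetof(struct record, v3_crc);
}

int main(void)
{
        printf("v2: %zu bytes, v3: %zu bytes\n", record_size(2), record_size(3));
        return 0;
}
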
-
-/*
- * Buffer Log Format definitions
- *
- * These are the physical dirty bitmap definitions for the log format structure.
- */
-#define        XFS_BLF_CHUNK           128
-#define        XFS_BLF_SHIFT           7
-#define        BIT_TO_WORD_SHIFT       5
-#define        NBWORD                  (NBBY * sizeof(unsigned int))
-
-/*
- * This flag indicates that the buffer contains on disk inodes
- * and requires special recovery handling.
- */
-#define        XFS_BLF_INODE_BUF       (1<<0)
-
-/*
- * This flag indicates that the buffer should not be replayed
- * during recovery because its blocks are being freed.
- */
-#define        XFS_BLF_CANCEL          (1<<1)
-
-/*
- * This flag indicates that the buffer contains on disk
- * user or group dquots and may require special recovery handling.
- */
-#define        XFS_BLF_UDQUOT_BUF      (1<<2)
-#define XFS_BLF_PDQUOT_BUF     (1<<3)
-#define        XFS_BLF_GDQUOT_BUF      (1<<4)
-
-/*
- * This is the structure used to lay out a buf log item in the
- * log.  The data map describes which 128 byte chunks of the buffer
- * have been logged.
- */
-#define XFS_BLF_DATAMAP_SIZE   ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
-
-typedef struct xfs_buf_log_format {
-       unsigned short  blf_type;       /* buf log item type indicator */
-       unsigned short  blf_size;       /* size of this item */
-       ushort          blf_flags;      /* misc state */
-       ushort          blf_len;        /* number of blocks in this buf */
-       __int64_t       blf_blkno;      /* starting blkno of this buf */
-       unsigned int    blf_map_size;   /* used size of data bitmap in words */
-       unsigned int    blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
-} xfs_buf_log_format_t;
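
blf_data_map holds one bit per XFS_BLF_CHUNK (128-byte) chunk of the buffer, and only chunks with their bit set were logged and get replayed. A small sketch of mapping a byte range onto such a bitmap (the helper name is illustrative):

#include <assert.h>
#include <stdio.h>

#define BLF_CHUNK       128
#define BLF_SHIFT       7                       /* 128 == 1 << 7 */
#define NBWORD          (8 * sizeof(unsigned int))

/* Set one bit for every 128-byte chunk touched by [off, off + len). */
static void mark_dirty(unsigned int *map, unsigned int off, unsigned int len)
{
        unsigned int first = off >> BLF_SHIFT;
        unsigned int last = (off + len - 1) >> BLF_SHIFT;

        for (unsigned int bit = first; bit <= last; bit++)
                map[bit / NBWORD] |= 1u << (bit % NBWORD);
}

int main(void)
{
        unsigned int map[2] = { 0, 0 };

        mark_dirty(map, 100, 200);      /* bytes 100..299: chunks 0, 1, 2 */
        assert(map[0] == 0x7);
        printf("dirty map word 0: 0x%x\n", map[0]);
        return 0;
}
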
-
-/*
- * All buffers now need to tell recovery where the magic number
- * is so that it can verify and calculate the CRCs on the buffer correctly
- * once the changes have been replayed into the buffer.
- *
- * The type value is held in the upper 5 bits of the blf_flags field, which is
- * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
- */
-#define XFS_BLFT_BITS  5
-#define XFS_BLFT_SHIFT 11
-#define XFS_BLFT_MASK  (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
-
-enum xfs_blft {
-       XFS_BLFT_UNKNOWN_BUF = 0,
-       XFS_BLFT_UDQUOT_BUF,
-       XFS_BLFT_PDQUOT_BUF,
-       XFS_BLFT_GDQUOT_BUF,
-       XFS_BLFT_BTREE_BUF,
-       XFS_BLFT_AGF_BUF,
-       XFS_BLFT_AGFL_BUF,
-       XFS_BLFT_AGI_BUF,
-       XFS_BLFT_DINO_BUF,
-       XFS_BLFT_SYMLINK_BUF,
-       XFS_BLFT_DIR_BLOCK_BUF,
-       XFS_BLFT_DIR_DATA_BUF,
-       XFS_BLFT_DIR_FREE_BUF,
-       XFS_BLFT_DIR_LEAF1_BUF,
-       XFS_BLFT_DIR_LEAFN_BUF,
-       XFS_BLFT_DA_NODE_BUF,
-       XFS_BLFT_ATTR_LEAF_BUF,
-       XFS_BLFT_ATTR_RMT_BUF,
-       XFS_BLFT_SB_BUF,
-       XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
-};
-
-static inline void
-xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
-{
-       ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
-       blf->blf_flags &= ~XFS_BLFT_MASK;
-       blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
-}
-
-static inline __uint16_t
-xfs_blft_from_flags(struct xfs_buf_log_format *blf)
-{
-       return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
-}
-
-/*
- * EFI/EFD log format definitions
- */
-typedef struct xfs_extent {
-       xfs_dfsbno_t    ext_start;
-       xfs_extlen_t    ext_len;
-} xfs_extent_t;
-
-/*
- * Since an xfs_extent_t has a 64-bit start and a 32-bit length,
- * its alignment differs between 32-bit and 64-bit kernels.
- * So we provide the different variants for use by a
- * conversion routine.
- */
-typedef struct xfs_extent_32 {
-       __uint64_t      ext_start;
-       __uint32_t      ext_len;
-} __attribute__((packed)) xfs_extent_32_t;
-
-typedef struct xfs_extent_64 {
-       __uint64_t      ext_start;
-       __uint32_t      ext_len;
-       __uint32_t      ext_pad;
-} xfs_extent_64_t;
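
Concretely: a {64-bit, 32-bit} pair occupies 16 bytes on LP64 ABIs but typically 12 on 32-bit ABIs, where 64-bit members get only 4-byte alignment, so the log needs an explicitly packed form and an explicitly padded form to decode extents written by either kind of kernel. A quick size check (the stated sizes assume a typical x86-64 host):

#include <stdint.h>
#include <stdio.h>

struct extent_native {          /* natural layout: 16 bytes on x86-64 */
        uint64_t start;
        uint32_t len;
};

struct extent_32 {              /* packed: 12 bytes, matching 32-bit kernels */
        uint64_t start;
        uint32_t len;
} __attribute__((packed));

struct extent_64 {              /* explicitly padded: 16 bytes everywhere */
        uint64_t start;
        uint32_t len;
        uint32_t pad;
};

int main(void)
{
        printf("native %zu, packed %zu, padded %zu\n",
               sizeof(struct extent_native), sizeof(struct extent_32),
               sizeof(struct extent_64));
        return 0;
}
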
-
-/*
- * This is the structure used to lay out an efi log item in the
- * log.  The efi_extents field is a variable size array whose
- * size is given by efi_nextents.
- */
-typedef struct xfs_efi_log_format {
-       __uint16_t              efi_type;       /* efi log item type */
-       __uint16_t              efi_size;       /* size of this item */
-       __uint32_t              efi_nextents;   /* # extents to free */
-       __uint64_t              efi_id;         /* efi identifier */
-       xfs_extent_t            efi_extents[1]; /* array of extents to free */
-} xfs_efi_log_format_t;
-
-typedef struct xfs_efi_log_format_32 {
-       __uint16_t              efi_type;       /* efi log item type */
-       __uint16_t              efi_size;       /* size of this item */
-       __uint32_t              efi_nextents;   /* # extents to free */
-       __uint64_t              efi_id;         /* efi identifier */
-       xfs_extent_32_t         efi_extents[1]; /* array of extents to free */
-} __attribute__((packed)) xfs_efi_log_format_32_t;
-
-typedef struct xfs_efi_log_format_64 {
-       __uint16_t              efi_type;       /* efi log item type */
-       __uint16_t              efi_size;       /* size of this item */
-       __uint32_t              efi_nextents;   /* # extents to free */
-       __uint64_t              efi_id;         /* efi identifier */
-       xfs_extent_64_t         efi_extents[1]; /* array of extents to free */
-} xfs_efi_log_format_64_t;
-
-/*
- * This is the structure used to lay out an efd log item in the
- * log.  The efd_extents array is a variable size array whose
- * size is given by efd_nextents.
- */
-typedef struct xfs_efd_log_format {
-       __uint16_t              efd_type;       /* efd log item type */
-       __uint16_t              efd_size;       /* size of this item */
-       __uint32_t              efd_nextents;   /* # of extents freed */
-       __uint64_t              efd_efi_id;     /* id of corresponding efi */
-       xfs_extent_t            efd_extents[1]; /* array of extents freed */
-} xfs_efd_log_format_t;
-
-typedef struct xfs_efd_log_format_32 {
-       __uint16_t              efd_type;       /* efd log item type */
-       __uint16_t              efd_size;       /* size of this item */
-       __uint32_t              efd_nextents;   /* # of extents freed */
-       __uint64_t              efd_efi_id;     /* id of corresponding efi */
-       xfs_extent_32_t         efd_extents[1]; /* array of extents freed */
-} __attribute__((packed)) xfs_efd_log_format_32_t;
-
-typedef struct xfs_efd_log_format_64 {
-       __uint16_t              efd_type;       /* efd log item type */
-       __uint16_t              efd_size;       /* size of this item */
-       __uint32_t              efd_nextents;   /* # of extents freed */
-       __uint64_t              efd_efi_id;     /* id of corresponding efi */
-       xfs_extent_64_t         efd_extents[1]; /* array of extents freed */
-} xfs_efd_log_format_64_t;
-
-/*
- * Dquot Log format definitions.
- *
- * The first two fields must be the type and size fitting into
- * 32 bits: the log recovery code assumes that.
- */
-typedef struct xfs_dq_logformat {
-       __uint16_t              qlf_type;      /* dquot log item type */
-       __uint16_t              qlf_size;      /* size of this item */
-       xfs_dqid_t              qlf_id;        /* usr/grp/proj id : 32 bits */
-       __int64_t               qlf_blkno;     /* blkno of dquot buffer */
-       __int32_t               qlf_len;       /* len of dquot buffer */
-       __uint32_t              qlf_boffset;   /* off of dquot in buffer */
-} xfs_dq_logformat_t;
-
-/*
- * log format struct for QUOTAOFF records.
- * The first two fields must be the type and size fitting into
- * 32 bits: the log recovery code assumes that.
- * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
- * to the first and ensures that the first logitem is taken out of the AIL
- * only when the last one is securely committed.
- */
-typedef struct xfs_qoff_logformat {
-       unsigned short          qf_type;        /* quotaoff log item type */
-       unsigned short          qf_size;        /* size of this item */
-       unsigned int            qf_flags;       /* USR and/or GRP */
-       char                    qf_pad[12];     /* padding for future */
-} xfs_qoff_logformat_t;
-
-/*
- * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
- */
-#define XFS_UQUOTA_ACCT        0x0001  /* user quota accounting ON */
-#define XFS_UQUOTA_ENFD        0x0002  /* user quota limits enforced */
-#define XFS_UQUOTA_CHKD        0x0004  /* quotacheck run on usr quotas */
-#define XFS_PQUOTA_ACCT        0x0008  /* project quota accounting ON */
-#define XFS_OQUOTA_ENFD        0x0010  /* other (grp/prj) quota limits enforced */
-#define XFS_OQUOTA_CHKD        0x0020  /* quotacheck run on other (grp/prj) quotas */
-#define XFS_GQUOTA_ACCT        0x0040  /* group quota accounting ON */
-
-/*
- * Conversion to and from the combined OQUOTA flag (if necessary)
- * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
- */
-#define XFS_GQUOTA_ENFD        0x0080  /* group quota limits enforced */
-#define XFS_GQUOTA_CHKD        0x0100  /* quotacheck run on group quotas */
-#define XFS_PQUOTA_ENFD        0x0200  /* project quota limits enforced */
-#define XFS_PQUOTA_CHKD        0x0400  /* quotacheck run on project quotas */
-
-#define XFS_ALL_QUOTA_ACCT     \
-               (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
-#define XFS_ALL_QUOTA_ENFD     \
-               (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
-#define XFS_ALL_QUOTA_CHKD     \
-               (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
-
-#define XFS_MOUNT_QUOTA_ALL    (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
-                                XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
-                                XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
-                                XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
-                                XFS_PQUOTA_CHKD)
-
-/*
- * Inode create log item structure
- *
- * Log recovery assumes the first two entries are the type and size, and that
- * they fit in 32 bits. They are also in host order (ugh), so they have to be
- * 32-bit aligned for decoding to be done correctly.
- */
-struct xfs_icreate_log {
-       __uint16_t      icl_type;       /* type of log format structure */
-       __uint16_t      icl_size;       /* size of log format structure */
-       __be32          icl_ag;         /* ag being allocated in */
-       __be32          icl_agbno;      /* start block of inode range */
-       __be32          icl_count;      /* number of inodes to initialise */
-       __be32          icl_isize;      /* size of inodes */
-       __be32          icl_length;     /* length of extent to initialise */
-       __be32          icl_gen;        /* inode generation number to use */
-};
-
-#endif /* __XFS_LOG_FORMAT_H__ */
index 9bc403a9e54f300f570e6e200574c36fb46db9b1..db7cbdeb2b42c766914682862f3179866393111b 100644 (file)
@@ -405,6 +405,8 @@ struct xlog {
        struct xlog_grant_head  l_reserve_head;
        struct xlog_grant_head  l_write_head;
 
+       struct xfs_kobj         l_kobj;
+
        /* The following fields are used for debugging; need to hold icloglock */
 #ifdef DEBUG
        char                    *l_iclog_bak[XLOG_MAX_ICLOGS];
index 981af0f6504b1e5b5e171ea3ea207e8759913b2c..1fd5787add9924d7ac726329f2ee88c13a90422d 100644 (file)
@@ -179,7 +179,7 @@ xlog_bread_noalign(
                xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
                        nbblks);
                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
-               return EFSCORRUPTED;
+               return -EFSCORRUPTED;
        }
 
        blk_no = round_down(blk_no, log->l_sectBBsize);
@@ -194,7 +194,7 @@ xlog_bread_noalign(
        bp->b_error = 0;
 
        if (XFS_FORCED_SHUTDOWN(log->l_mp))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        xfs_buf_iorequest(bp);
        error = xfs_buf_iowait(bp);
@@ -268,7 +268,7 @@ xlog_bwrite(
                xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
                        nbblks);
                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
-               return EFSCORRUPTED;
+               return -EFSCORRUPTED;
        }
 
        blk_no = round_down(blk_no, log->l_sectBBsize);
@@ -330,14 +330,14 @@ xlog_header_check_recover(
                xlog_header_check_dump(mp, head);
                XFS_ERROR_REPORT("xlog_header_check_recover(1)",
                                 XFS_ERRLEVEL_HIGH, mp);
-               return XFS_ERROR(EFSCORRUPTED);
+               return -EFSCORRUPTED;
        } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
                xfs_warn(mp,
        "dirty log entry has mismatched uuid - can't recover");
                xlog_header_check_dump(mp, head);
                XFS_ERROR_REPORT("xlog_header_check_recover(2)",
                                 XFS_ERRLEVEL_HIGH, mp);
-               return XFS_ERROR(EFSCORRUPTED);
+               return -EFSCORRUPTED;
        }
        return 0;
 }
@@ -364,7 +364,7 @@ xlog_header_check_mount(
                xlog_header_check_dump(mp, head);
                XFS_ERROR_REPORT("xlog_header_check_mount",
                                 XFS_ERRLEVEL_HIGH, mp);
-               return XFS_ERROR(EFSCORRUPTED);
+               return -EFSCORRUPTED;
        }
        return 0;
 }
@@ -462,7 +462,7 @@ xlog_find_verify_cycle(
        while (!(bp = xlog_get_bp(log, bufblks))) {
                bufblks >>= 1;
                if (bufblks < log->l_sectBBsize)
-                       return ENOMEM;
+                       return -ENOMEM;
        }
 
        for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
@@ -524,7 +524,7 @@ xlog_find_verify_log_record(
 
        if (!(bp = xlog_get_bp(log, num_blks))) {
                if (!(bp = xlog_get_bp(log, 1)))
-                       return ENOMEM;
+                       return -ENOMEM;
                smallmem = 1;
        } else {
                error = xlog_bread(log, start_blk, num_blks, bp, &offset);
@@ -539,7 +539,7 @@ xlog_find_verify_log_record(
                        xfs_warn(log->l_mp,
                "Log inconsistent (didn't find previous header)");
                        ASSERT(0);
-                       error = XFS_ERROR(EIO);
+                       error = -EIO;
                        goto out;
                }
 
@@ -564,7 +564,7 @@ xlog_find_verify_log_record(
         * will be called again for the end of the physical log.
         */
        if (i == -1) {
-               error = -1;
+               error = 1;
                goto out;
        }
 
@@ -628,7 +628,12 @@ xlog_find_head(
        int             error, log_bbnum = log->l_logBBsize;
 
        /* Is the end of the log device zeroed? */
-       if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
+       error = xlog_find_zeroed(log, &first_blk);
+       if (error < 0) {
+               xfs_warn(log->l_mp, "empty log check failed");
+               return error;
+       }
+       if (error == 1) {
                *return_head_blk = first_blk;
 
                /* Is the whole lot zeroed? */
@@ -641,15 +646,12 @@ xlog_find_head(
                }
 
                return 0;
-       } else if (error) {
-               xfs_warn(log->l_mp, "empty log check failed");
-               return error;
        }
 
        first_blk = 0;                  /* get cycle # of 1st block */
        bp = xlog_get_bp(log, 1);
        if (!bp)
-               return ENOMEM;
+               return -ENOMEM;
 
        error = xlog_bread(log, 0, 1, bp, &offset);
        if (error)
@@ -818,29 +820,29 @@ validate_head:
                start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
 
                /* start ptr at last block ptr before head_blk */
-               if ((error = xlog_find_verify_log_record(log, start_blk,
-                                                       &head_blk, 0)) == -1) {
-                       error = XFS_ERROR(EIO);
-                       goto bp_err;
-               } else if (error)
+               error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
+               if (error == 1)
+                       error = -EIO;
+               if (error)
                        goto bp_err;
        } else {
                start_blk = 0;
                ASSERT(head_blk <= INT_MAX);
-               if ((error = xlog_find_verify_log_record(log, start_blk,
-                                                       &head_blk, 0)) == -1) {
+               error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
+               if (error < 0)
+                       goto bp_err;
+               if (error == 1) {
                        /* We hit the beginning of the log during our search */
                        start_blk = log_bbnum - (num_scan_bblks - head_blk);
                        new_blk = log_bbnum;
                        ASSERT(start_blk <= INT_MAX &&
                                (xfs_daddr_t) log_bbnum-start_blk >= 0);
                        ASSERT(head_blk <= INT_MAX);
-                       if ((error = xlog_find_verify_log_record(log,
-                                                       start_blk, &new_blk,
-                                                       (int)head_blk)) == -1) {
-                               error = XFS_ERROR(EIO);
-                               goto bp_err;
-                       } else if (error)
+                       error = xlog_find_verify_log_record(log, start_blk,
+                                                       &new_blk, (int)head_blk);
+                       if (error == 1)
+                               error = -EIO;
+                       if (error)
                                goto bp_err;
                        if (new_blk != log_bbnum)
                                head_blk = new_blk;
@@ -911,7 +913,7 @@ xlog_find_tail(
 
        bp = xlog_get_bp(log, 1);
        if (!bp)
-               return ENOMEM;
+               return -ENOMEM;
        if (*head_blk == 0) {                           /* special case */
                error = xlog_bread(log, 0, 1, bp, &offset);
                if (error)
@@ -961,7 +963,7 @@ xlog_find_tail(
                xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
                xlog_put_bp(bp);
                ASSERT(0);
-               return XFS_ERROR(EIO);
+               return -EIO;
        }
 
        /* find blk_no of tail of log */
@@ -1092,8 +1094,8 @@ done:
  *
  * Return:
  *     0  => the log is completely written to
- *     -1 => use *blk_no as the first block of the log
- *     >0 => error has occurred
+ *     1  => use *blk_no as the first block of the log
+ *     <0 => error has occurred
  */
 STATIC int
 xlog_find_zeroed(
@@ -1112,7 +1114,7 @@ xlog_find_zeroed(
        /* check totally zeroed log */
        bp = xlog_get_bp(log, 1);
        if (!bp)
-               return ENOMEM;
+               return -ENOMEM;
        error = xlog_bread(log, 0, 1, bp, &offset);
        if (error)
                goto bp_err;
@@ -1121,7 +1123,7 @@ xlog_find_zeroed(
        if (first_cycle == 0) {         /* completely zeroed log */
                *blk_no = 0;
                xlog_put_bp(bp);
-               return -1;
+               return 1;
        }
 
        /* check partially zeroed log */
@@ -1141,7 +1143,7 @@ xlog_find_zeroed(
                 */
                xfs_warn(log->l_mp,
                        "Log inconsistent or not a log (last==0, first!=1)");
-               error = XFS_ERROR(EINVAL);
+               error = -EINVAL;
                goto bp_err;
        }
 
@@ -1179,19 +1181,18 @@ xlog_find_zeroed(
         * Potentially backup over partial log record write.  We don't need
         * to search the end of the log because we know it is zero.
         */
-       if ((error = xlog_find_verify_log_record(log, start_blk,
-                               &last_blk, 0)) == -1) {
-           error = XFS_ERROR(EIO);
-           goto bp_err;
-       } else if (error)
-           goto bp_err;
+       error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
+       if (error == 1)
+               error = -EIO;
+       if (error)
+               goto bp_err;
 
        *blk_no = last_blk;
 bp_err:
        xlog_put_bp(bp);
        if (error)
                return error;
-       return -1;
+       return 1;
 }
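
The hunks above retire the old overloaded convention, where -1 meant "found the split point" and positive values meant failure, in favor of 0 / 1 / negative errno; that is what lets callers such as xlog_find_head() get by with two plain comparisons. A stand-alone sketch of the tri-state pattern (find_zeroed() is a hypothetical stand-in, not the kernel function):

#include <errno.h>
#include <stdio.h>

/*
 * Tri-state return: 0 = log fully written, 1 = answer stored in *blk_no,
 * negative errno = hard failure.
 */
static int find_zeroed(int *blk_no, int simulate)
{
        if (simulate < 0)
                return -EIO;
        if (simulate > 0) {
                *blk_no = simulate;
                return 1;
        }
        return 0;
}

int main(void)
{
        int blk = -1;
        int error = find_zeroed(&blk, 37);

        if (error < 0) {
                fprintf(stderr, "hard error %d\n", error);
                return 1;
        }
        if (error == 1)
                printf("log zeroed from block %d\n", blk);
        else
                printf("log fully written\n");
        return 0;
}
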
 
 /*
@@ -1251,7 +1252,7 @@ xlog_write_log_records(
        while (!(bp = xlog_get_bp(log, bufblks))) {
                bufblks >>= 1;
                if (bufblks < sectbb)
-                       return ENOMEM;
+                       return -ENOMEM;
        }
 
        /* We may need to do a read at the start to fill in part of
@@ -1354,7 +1355,7 @@ xlog_clear_stale_blocks(
                if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
                        XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
                                         XFS_ERRLEVEL_LOW, log->l_mp);
-                       return XFS_ERROR(EFSCORRUPTED);
+                       return -EFSCORRUPTED;
                }
                tail_distance = tail_block + (log->l_logBBsize - head_block);
        } else {
@@ -1366,7 +1367,7 @@ xlog_clear_stale_blocks(
                if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
                        XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
                                         XFS_ERRLEVEL_LOW, log->l_mp);
-                       return XFS_ERROR(EFSCORRUPTED);
+                       return -EFSCORRUPTED;
                }
                tail_distance = tail_block - head_block;
        }
@@ -1551,7 +1552,7 @@ xlog_recover_add_to_trans(
                        xfs_warn(log->l_mp, "%s: bad header magic number",
                                __func__);
                        ASSERT(0);
-                       return XFS_ERROR(EIO);
+                       return -EIO;
                }
                if (len == sizeof(xfs_trans_header_t))
                        xlog_recover_add_item(&trans->r_itemq);
@@ -1581,7 +1582,7 @@ xlog_recover_add_to_trans(
                                  in_f->ilf_size);
                        ASSERT(0);
                        kmem_free(ptr);
-                       return XFS_ERROR(EIO);
+                       return -EIO;
                }
 
                item->ri_total = in_f->ilf_size;
@@ -1702,7 +1703,7 @@ xlog_recover_reorder_trans(
                         */
                        if (!list_empty(&sort_list))
                                list_splice_init(&sort_list, &trans->r_itemq);
-                       error = XFS_ERROR(EIO);
+                       error = -EIO;
                        goto out;
                }
        }
@@ -1943,7 +1944,7 @@ xlog_recover_do_inode_buffer(
                                item, bp);
                        XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
                                         XFS_ERRLEVEL_LOW, mp);
-                       return XFS_ERROR(EFSCORRUPTED);
+                       return -EFSCORRUPTED;
                }
 
                buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
@@ -2125,6 +2126,17 @@ xlog_recover_validate_buf_type(
        __uint16_t              magic16;
        __uint16_t              magicda;
 
+       /*
+        * We can only do post-recovery validation on items on CRC-enabled
+        * filesystems, as we need to know when the buffer was written to be
+        * able to determine if we should have replayed the item. If we replay
+        * old metadata over a newer buffer, then it will enter a temporarily
+        * inconsistent state resulting in verification failures. Hence for now
+        * just avoid the verification stage for non-CRC filesystems.
+        */
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
        magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
        magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
        magicda = be16_to_cpu(info->magic);
@@ -2162,8 +2174,6 @@ xlog_recover_validate_buf_type(
                bp->b_ops = &xfs_agf_buf_ops;
                break;
        case XFS_BLFT_AGFL_BUF:
-               if (!xfs_sb_version_hascrc(&mp->m_sb))
-                       break;
                if (magic32 != XFS_AGFL_MAGIC) {
                        xfs_warn(mp, "Bad AGFL block magic!");
                        ASSERT(0);
@@ -2196,10 +2206,6 @@ xlog_recover_validate_buf_type(
 #endif
                break;
        case XFS_BLFT_DINO_BUF:
-               /*
-                * we get here with inode allocation buffers, not buffers that
-                * track unlinked list changes.
-                */
                if (magic16 != XFS_DINODE_MAGIC) {
                        xfs_warn(mp, "Bad INODE block magic!");
                        ASSERT(0);
@@ -2279,8 +2285,6 @@ xlog_recover_validate_buf_type(
                bp->b_ops = &xfs_attr3_leaf_buf_ops;
                break;
        case XFS_BLFT_ATTR_RMT_BUF:
-               if (!xfs_sb_version_hascrc(&mp->m_sb))
-                       break;
                if (magic32 != XFS_ATTR3_RMT_MAGIC) {
                        xfs_warn(mp, "Bad attr remote magic!");
                        ASSERT(0);
@@ -2387,16 +2391,7 @@ xlog_recover_do_reg_buffer(
        /* Shouldn't be any more regions */
        ASSERT(i == item->ri_total);
 
-       /*
-        * We can only do post-recovery validation on items on CRC-enabled
-        * filesystems, as we need to know when the buffer was written to be
-        * able to determine if we should have replayed the item. If we replay
-        * old metadata over a newer buffer, then it will enter a temporarily
-        * inconsistent state resulting in verification failures. Hence for now
-        * just avoid the verification stage for non-CRC filesystems.
-        */
-       if (xfs_sb_version_hascrc(&mp->m_sb))
-               xlog_recover_validate_buf_type(mp, bp, buf_f);
+       xlog_recover_validate_buf_type(mp, bp, buf_f);
 }
 
 /*
@@ -2404,8 +2399,11 @@ xlog_recover_do_reg_buffer(
  * Simple algorithm: if we have found a QUOTAOFF log item of the same type
  * (ie. USR or GRP), then just toss this buffer away; don't recover it.
  * Else, treat it as a regular buffer and do recovery.
+ *
+ * Return false if the buffer was tossed and true if we recovered it, so that
+ * the caller knows whether the buffer needs writing.
  */
-STATIC void
+STATIC bool
 xlog_recover_do_dquot_buffer(
        struct xfs_mount                *mp,
        struct xlog                     *log,
@@ -2420,9 +2418,8 @@ xlog_recover_do_dquot_buffer(
        /*
         * Filesystems are required to send in quota flags at mount time.
         */
-       if (mp->m_qflags == 0) {
-               return;
-       }
+       if (!mp->m_qflags)
+               return false;
 
        type = 0;
        if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
@@ -2435,9 +2432,10 @@ xlog_recover_do_dquot_buffer(
         * This type of quotas was turned off, so ignore this buffer
         */
        if (log->l_quotaoffs_flag & type)
-               return;
+               return false;
 
        xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+       return true;
 }
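
Returning bool lets the caller skip the delayed write when the buffer was tossed rather than recovered; the later hunk in xlog_recover_buffer_pass2() uses the return value for exactly that. A compact sketch of the contract's shape (names and parameters are illustrative):

#include <stdbool.h>
#include <stdio.h>

/* Return true only if the buffer was actually modified and needs writing. */
static bool recover_dquot_buffer(bool quotas_enabled, bool quotaoff_logged)
{
        if (!quotas_enabled)
                return false;   /* nothing to recover */
        if (quotaoff_logged)
                return false;   /* toss: this quota type was turned off */
        /* ... replay the logged regions into the buffer here ... */
        return true;
}

int main(void)
{
        bool dirty = recover_dquot_buffer(true, false);

        if (dirty)
                printf("queue buffer for delayed write\n");
        else
                printf("release buffer without writing\n");
        return 0;
}
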
 
 /*
@@ -2496,7 +2494,7 @@ xlog_recover_buffer_pass2(
        bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
                          buf_flags, NULL);
        if (!bp)
-               return XFS_ERROR(ENOMEM);
+               return -ENOMEM;
        error = bp->b_error;
        if (error) {
                xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
@@ -2504,23 +2502,44 @@ xlog_recover_buffer_pass2(
        }
 
        /*
-        * recover the buffer only if we get an LSN from it and it's less than
+        * Recover the buffer only if we get an LSN from it and it's less than
         * the lsn of the transaction we are replaying.
+        *
+        * Note that we have to be extremely careful of readahead here.
+        * Readahead does not attach verifiers to the buffers, so if we don't
+        * actually do any replay after readahead (because the LSN we found in
+        * the buffer is more recent than the current transaction) then we need
+        * to attach the verifier directly. Failing to do so means that future
+        * recovery actions (e.g. EFI and unlinked list recovery) can operate
+        * on the buffers without the verifier attached, which can leave
+        * blocks on disk with the correct content but a stale CRC.
+        *
+        * It is safe to assume these clean buffers are currently up to date.
+        * If the buffer is dirtied by a later transaction being replayed, then
+        * the verifier will be reset to match whatever recovery turns that
+        * buffer into.
         */
        lsn = xlog_recover_get_buf_lsn(mp, bp);
-       if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
+       if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+               xlog_recover_validate_buf_type(mp, bp, buf_f);
                goto out_release;
+       }
 
        if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
                error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
+               if (error)
+                       goto out_release;
        } else if (buf_f->blf_flags &
                  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
-               xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
+               bool    dirty;
+
+               dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
+               if (!dirty)
+                       goto out_release;
        } else {
                xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
        }
-       if (error)
-               goto out_release;
 
        /*
         * Perform delayed write on the buffer.  Asynchronous writes will be
@@ -2598,7 +2617,7 @@ xfs_recover_inode_owner_change(
 
        ip = xfs_inode_alloc(mp, in_f->ilf_ino);
        if (!ip)
-               return ENOMEM;
+               return -ENOMEM;
 
        /* instantiate the inode */
        xfs_dinode_from_disk(&ip->i_d, dip);
@@ -2676,7 +2695,7 @@ xlog_recover_inode_pass2(
        bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
                          &xfs_inode_buf_ops);
        if (!bp) {
-               error = ENOMEM;
+               error = -ENOMEM;
                goto error;
        }
        error = bp->b_error;
@@ -2697,7 +2716,7 @@ xlog_recover_inode_pass2(
                        __func__, dip, bp, in_f->ilf_ino);
                XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
                                 XFS_ERRLEVEL_LOW, mp);
-               error = EFSCORRUPTED;
+               error = -EFSCORRUPTED;
                goto out_release;
        }
        dicp = item->ri_buf[1].i_addr;
@@ -2707,7 +2726,7 @@ xlog_recover_inode_pass2(
                        __func__, item, in_f->ilf_ino);
                XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
                                 XFS_ERRLEVEL_LOW, mp);
-               error = EFSCORRUPTED;
+               error = -EFSCORRUPTED;
                goto out_release;
        }
 
@@ -2764,7 +2783,7 @@ xlog_recover_inode_pass2(
                "%s: Bad regular inode log record, rec ptr 0x%p, "
                "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
                                __func__, item, dip, bp, in_f->ilf_ino);
-                       error = EFSCORRUPTED;
+                       error = -EFSCORRUPTED;
                        goto out_release;
                }
        } else if (unlikely(S_ISDIR(dicp->di_mode))) {
@@ -2777,7 +2796,7 @@ xlog_recover_inode_pass2(
                "%s: Bad dir inode log record, rec ptr 0x%p, "
                "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
                                __func__, item, dip, bp, in_f->ilf_ino);
-                       error = EFSCORRUPTED;
+                       error = -EFSCORRUPTED;
                        goto out_release;
                }
        }
@@ -2790,7 +2809,7 @@ xlog_recover_inode_pass2(
                        __func__, item, dip, bp, in_f->ilf_ino,
                        dicp->di_nextents + dicp->di_anextents,
                        dicp->di_nblocks);
-               error = EFSCORRUPTED;
+               error = -EFSCORRUPTED;
                goto out_release;
        }
        if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
@@ -2800,7 +2819,7 @@ xlog_recover_inode_pass2(
        "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
        "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
                        item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
-               error = EFSCORRUPTED;
+               error = -EFSCORRUPTED;
                goto out_release;
        }
        isize = xfs_icdinode_size(dicp->di_version);
@@ -2810,7 +2829,7 @@ xlog_recover_inode_pass2(
                xfs_alert(mp,
                        "%s: Bad inode log record length %d, rec ptr 0x%p",
                        __func__, item->ri_buf[1].i_len, item);
-               error = EFSCORRUPTED;
+               error = -EFSCORRUPTED;
                goto out_release;
        }
 
@@ -2898,7 +2917,7 @@ xlog_recover_inode_pass2(
                default:
                        xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
                        ASSERT(0);
-                       error = EIO;
+                       error = -EIO;
                        goto out_release;
                }
        }
@@ -2919,7 +2938,7 @@ out_release:
 error:
        if (need_free)
                kmem_free(in_f);
-       return XFS_ERROR(error);
+       return error;
 }
 
 /*
@@ -2946,7 +2965,7 @@ xlog_recover_quotaoff_pass1(
        if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
                log->l_quotaoffs_flag |= XFS_DQ_GROUP;
 
-       return (0);
+       return 0;
 }
 
 /*
@@ -2971,17 +2990,17 @@ xlog_recover_dquot_pass2(
         * Filesystems are required to send in quota flags at mount time.
         */
        if (mp->m_qflags == 0)
-               return (0);
+               return 0;
 
        recddq = item->ri_buf[1].i_addr;
        if (recddq == NULL) {
                xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
-               return XFS_ERROR(EIO);
+               return -EIO;
        }
        if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
                xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
                        item->ri_buf[1].i_len, __func__);
-               return XFS_ERROR(EIO);
+               return -EIO;
        }
 
        /*
@@ -2990,7 +3009,7 @@ xlog_recover_dquot_pass2(
        type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
        ASSERT(type);
        if (log->l_quotaoffs_flag & type)
-               return (0);
+               return 0;
 
        /*
         * At this point we know that quota was _not_ turned off.
@@ -3007,30 +3026,25 @@ xlog_recover_dquot_pass2(
        error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
                           "xlog_recover_dquot_pass2 (log copy)");
        if (error)
-               return XFS_ERROR(EIO);
+               return -EIO;
        ASSERT(dq_f->qlf_len == 1);
 
+       /*
+        * At this point we are assuming that the dquots have been allocated
+        * and hence the buffer has valid dquots stamped in it. It should,
+        * therefore, pass verifier validation. If the dquot is bad, then we'll
+        * return an error here, so we don't need to specifically check
+        * the dquot in the buffer after the verifier has run.
+        */
        error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
                                   XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
-                                  NULL);
+                                  &xfs_dquot_buf_ops);
        if (error)
                return error;
 
        ASSERT(bp);
        ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
 
-       /*
-        * At least the magic num portion should be on disk because this
-        * was among a chunk of dquots created earlier, and we did some
-        * minimal initialization then.
-        */
-       error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
-                          "xlog_recover_dquot_pass2");
-       if (error) {
-               xfs_buf_relse(bp);
-               return XFS_ERROR(EIO);
-       }
-
        /*
         * If the dquot has an LSN in it, recover the dquot only if it's less
         * than the lsn of the transaction we are replaying.
@@ -3178,38 +3192,38 @@ xlog_recover_do_icreate_pass2(
        icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
        if (icl->icl_type != XFS_LI_ICREATE) {
                xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
-               return EINVAL;
+               return -EINVAL;
        }
 
        if (icl->icl_size != 1) {
                xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
-               return EINVAL;
+               return -EINVAL;
        }
 
        agno = be32_to_cpu(icl->icl_ag);
        if (agno >= mp->m_sb.sb_agcount) {
                xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
-               return EINVAL;
+               return -EINVAL;
        }
        agbno = be32_to_cpu(icl->icl_agbno);
        if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
                xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
-               return EINVAL;
+               return -EINVAL;
        }
        isize = be32_to_cpu(icl->icl_isize);
        if (isize != mp->m_sb.sb_inodesize) {
                xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
-               return EINVAL;
+               return -EINVAL;
        }
        count = be32_to_cpu(icl->icl_count);
        if (!count) {
                xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
-               return EINVAL;
+               return -EINVAL;
        }
        length = be32_to_cpu(icl->icl_length);
        if (!length || length >= mp->m_sb.sb_agblocks) {
                xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
-               return EINVAL;
+               return -EINVAL;
        }
 
        /* existing allocation is fixed value */
@@ -3218,7 +3232,7 @@ xlog_recover_do_icreate_pass2(
        if (count != mp->m_ialloc_inos ||
             length != mp->m_ialloc_blks) {
                xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
-               return EINVAL;
+               return -EINVAL;
        }
 
        /*
@@ -3389,7 +3403,7 @@ xlog_recover_commit_pass1(
                xfs_warn(log->l_mp, "%s: invalid item type (%d)",
                        __func__, ITEM_TYPE(item));
                ASSERT(0);
-               return XFS_ERROR(EIO);
+               return -EIO;
        }
 }
 
@@ -3425,7 +3439,7 @@ xlog_recover_commit_pass2(
                xfs_warn(log->l_mp, "%s: invalid item type (%d)",
                        __func__, ITEM_TYPE(item));
                ASSERT(0);
-               return XFS_ERROR(EIO);
+               return -EIO;
        }
 }
 
@@ -3560,7 +3574,7 @@ xlog_recover_process_data(
 
        /* check the log format matches our own - else we can't recover */
        if (xlog_header_check_recover(log->l_mp, rhead))
-               return (XFS_ERROR(EIO));
+               return -EIO;
 
        while ((dp < lp) && num_logops) {
                ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
@@ -3571,7 +3585,7 @@ xlog_recover_process_data(
                        xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
                                        __func__, ohead->oh_clientid);
                        ASSERT(0);
-                       return (XFS_ERROR(EIO));
+                       return -EIO;
                }
                tid = be32_to_cpu(ohead->oh_tid);
                hash = XLOG_RHASH(tid);
@@ -3585,7 +3599,7 @@ xlog_recover_process_data(
                                xfs_warn(log->l_mp, "%s: bad length 0x%x",
                                        __func__, be32_to_cpu(ohead->oh_len));
                                WARN_ON(1);
-                               return (XFS_ERROR(EIO));
+                               return -EIO;
                        }
                        flags = ohead->oh_flags & ~XLOG_END_TRANS;
                        if (flags & XLOG_WAS_CONT_TRANS)
@@ -3607,7 +3621,7 @@ xlog_recover_process_data(
                                xfs_warn(log->l_mp, "%s: bad transaction",
                                        __func__);
                                ASSERT(0);
-                               error = XFS_ERROR(EIO);
+                               error = -EIO;
                                break;
                        case 0:
                        case XLOG_CONTINUE_TRANS:
@@ -3618,7 +3632,7 @@ xlog_recover_process_data(
                                xfs_warn(log->l_mp, "%s: bad flag 0x%x",
                                        __func__, flags);
                                ASSERT(0);
-                               error = XFS_ERROR(EIO);
+                               error = -EIO;
                                break;
                        }
                        if (error) {
@@ -3669,7 +3683,7 @@ xlog_recover_process_efi(
                         */
                        set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
                        xfs_efi_release(efip, efip->efi_format.efi_nextents);
-                       return XFS_ERROR(EIO);
+                       return -EIO;
                }
        }
 
@@ -3969,7 +3983,7 @@ xlog_unpack_data_crc(
                 * CRC protection by punting an error back up the stack.
                 */
                if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
-                       return EFSCORRUPTED;
+                       return -EFSCORRUPTED;
        }
 
        return 0;
@@ -4018,14 +4032,14 @@ xlog_valid_rec_header(
        if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
                XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
                                XFS_ERRLEVEL_LOW, log->l_mp);
-               return XFS_ERROR(EFSCORRUPTED);
+               return -EFSCORRUPTED;
        }
        if (unlikely(
            (!rhead->h_version ||
            (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
                xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
                        __func__, be32_to_cpu(rhead->h_version));
-               return XFS_ERROR(EIO);
+               return -EIO;
        }
 
        /* LR body must have data or it wouldn't have been written */
@@ -4033,12 +4047,12 @@ xlog_valid_rec_header(
        if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
                XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
                                XFS_ERRLEVEL_LOW, log->l_mp);
-               return XFS_ERROR(EFSCORRUPTED);
+               return -EFSCORRUPTED;
        }
        if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
                XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
                                XFS_ERRLEVEL_LOW, log->l_mp);
-               return XFS_ERROR(EFSCORRUPTED);
+               return -EFSCORRUPTED;
        }
        return 0;
 }
@@ -4081,7 +4095,7 @@ xlog_do_recovery_pass(
                 */
                hbp = xlog_get_bp(log, 1);
                if (!hbp)
-                       return ENOMEM;
+                       return -ENOMEM;
 
                error = xlog_bread(log, tail_blk, 1, hbp, &offset);
                if (error)
@@ -4110,11 +4124,11 @@ xlog_do_recovery_pass(
        }
 
        if (!hbp)
-               return ENOMEM;
+               return -ENOMEM;
        dbp = xlog_get_bp(log, BTOBB(h_size));
        if (!dbp) {
                xlog_put_bp(hbp);
-               return ENOMEM;
+               return -ENOMEM;
        }
 
        memset(rhash, 0, sizeof(rhash));
@@ -4388,7 +4402,7 @@ xlog_do_recover(
         * If IO errors happened during recovery, bail out.
         */
        if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
-               return (EIO);
+               return -EIO;
        }
 
        /*
@@ -4415,7 +4429,7 @@ xlog_do_recover(
 
        if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
                xfs_buf_relse(bp);
-               return XFS_ERROR(EIO);
+               return -EIO;
        }
 
        xfs_buf_iorequest(bp);
@@ -4492,7 +4506,7 @@ xlog_recover(
 "Please recover the log on a kernel that supports the unknown features.",
                                (log->l_mp->m_sb.sb_features_log_incompat &
                                        XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
-                       return EINVAL;
+                       return -EINVAL;
                }
 
                xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
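
The hunks above all make the same change: XFS log recovery stops returning positive errno values wrapped in the XFS_ERROR() tagging macro and returns plain negative errnos, matching the convention used everywhere else in the kernel. A minimal standalone sketch of the before/after pattern (validate_old/validate_new are hypothetical helpers, not XFS functions):

    #include <errno.h>

    /* Hypothetical validators illustrating the convention change only. */
    static int validate_old(int value)
    {
            if (value < 0)
                    return EINVAL;          /* old XFS style: positive errno */
            return 0;
    }

    static int validate_new(int value)
    {
            if (value < 0)
                    return -EINVAL;         /* new style: negative errno */
            return 0;
    }

    int main(void)
    {
            return (validate_old(-1) == EINVAL &&
                    validate_new(-1) == -EINVAL) ? 0 : 1;
    }

Dropping the wrapper also removes a sign flip at every boundary where XFS meets generic kernel code; the quotaops hunks further below show callers deleting their compensating negations.
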
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
deleted file mode 100644 (file)
index 1c55ccb..0000000
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef        __XFS_LOG_RECOVER_H__
-#define __XFS_LOG_RECOVER_H__
-
-/*
- * Macros, structures, prototypes for internal log manager use.
- */
-
-#define XLOG_RHASH_BITS  4
-#define XLOG_RHASH_SIZE        16
-#define XLOG_RHASH_SHIFT 2
-#define XLOG_RHASH(tid)        \
-       ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
-
-#define XLOG_MAX_REGIONS_IN_ITEM   (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
-
-
-/*
- * item headers are in ri_buf[0].  Additional buffers follow.
- */
-typedef struct xlog_recover_item {
-       struct list_head        ri_list;
-       int                     ri_type;
-       int                     ri_cnt; /* count of regions found */
-       int                     ri_total;       /* total regions */
-       xfs_log_iovec_t         *ri_buf;        /* ptr to regions buffer */
-} xlog_recover_item_t;
-
-struct xlog_tid;
-typedef struct xlog_recover {
-       struct hlist_node       r_list;
-       xlog_tid_t              r_log_tid;      /* log's transaction id */
-       xfs_trans_header_t      r_theader;      /* trans header for partial */
-       int                     r_state;        /* not needed */
-       xfs_lsn_t               r_lsn;          /* xact lsn */
-       struct list_head        r_itemq;        /* q for items */
-} xlog_recover_t;
-
-#define ITEM_TYPE(i)   (*(ushort *)(i)->ri_buf[0].i_addr)
-
-/*
- * This is the number of entries in the l_buf_cancel_table used during
- * recovery.
- */
-#define        XLOG_BC_TABLE_SIZE      64
-
-#define        XLOG_RECOVER_PASS1      1
-#define        XLOG_RECOVER_PASS2      2
-
-#endif /* __XFS_LOG_RECOVER_H__ */
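
The deleted header shows how recovery batches in-flight transactions: XLOG_RHASH() maps a transaction id onto one of XLOG_RHASH_SIZE hash buckets by discarding the low XLOG_RHASH_SHIFT bits and masking. A small sketch of the same computation in portable C (uint32_t stands in for the kernel's __uint32_t):

    #include <stdint.h>
    #include <stdio.h>

    #define XLOG_RHASH_SIZE  16
    #define XLOG_RHASH_SHIFT 2
    #define XLOG_RHASH(tid) \
            ((((uint32_t)(tid)) >> XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE - 1))

    int main(void)
    {
            /* Ids differing only in the low XLOG_RHASH_SHIFT bits share a bucket. */
            printf("%u %u %u\n",
                   XLOG_RHASH(0x04), XLOG_RHASH(0x05), XLOG_RHASH(0x08));
            return 0;       /* prints: 1 1 2 */
    }
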
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c
deleted file mode 100644 (file)
index ee7e0e8..0000000
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2013 Jie Liu.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_ag.h"
-#include "xfs_sb.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_trans_space.h"
-#include "xfs_inode.h"
-#include "xfs_da_btree.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_bmap_btree.h"
-
-/*
- * Calculate the maximum length in bytes that would be required for a local
- * attribute value, as large attributes stored out of line are not logged.
- */
-STATIC int
-xfs_log_calc_max_attrsetm_res(
-       struct xfs_mount        *mp)
-{
-       int                     size;
-       int                     nblks;
-
-       size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) -
-              MAXNAMELEN - 1;
-       nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
-       nblks += XFS_B_TO_FSB(mp, size);
-       nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK);
-
-       return  M_RES(mp)->tr_attrsetm.tr_logres +
-               M_RES(mp)->tr_attrsetrt.tr_logres * nblks;
-}
-
-/*
- * Iterate over the log space reservation table and return the largest
- * reservation, based on the values pre-calculated at mount time.
- */
-STATIC void
-xfs_log_get_max_trans_res(
-       struct xfs_mount        *mp,
-       struct xfs_trans_res    *max_resp)
-{
-       struct xfs_trans_res    *resp;
-       struct xfs_trans_res    *end_resp;
-       int                     log_space = 0;
-       int                     attr_space;
-
-       attr_space = xfs_log_calc_max_attrsetm_res(mp);
-
-       resp = (struct xfs_trans_res *)M_RES(mp);
-       end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
-       for (; resp < end_resp; resp++) {
-               int             tmp = resp->tr_logcount > 1 ?
-                                     resp->tr_logres * resp->tr_logcount :
-                                     resp->tr_logres;
-               if (log_space < tmp) {
-                       log_space = tmp;
-                       *max_resp = *resp;              /* struct copy */
-               }
-       }
-
-       if (attr_space > log_space) {
-               *max_resp = M_RES(mp)->tr_attrsetm;     /* struct copy */
-               max_resp->tr_logres = attr_space;
-       }
-}
-
-/*
- * Calculate the minimum valid log size for the given superblock configuration.
- * Used to calculate the minimum log size at mkfs time, and to determine if
- * the log is large enough or not at mount time. Returns the minimum size in
- * filesystem block size units.
- */
-int
-xfs_log_calc_minimum_size(
-       struct xfs_mount        *mp)
-{
-       struct xfs_trans_res    tres = {0};
-       int                     max_logres;
-       int                     min_logblks = 0;
-       int                     lsunit = 0;
-
-       xfs_log_get_max_trans_res(mp, &tres);
-
-       max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres);
-       if (tres.tr_logcount > 1)
-               max_logres *= tres.tr_logcount;
-
-       if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
-               lsunit = BTOBB(mp->m_sb.sb_logsunit);
-
-       /*
-        * Two factors should be taken into account for calculating the minimum
-        * log space.
-        * 1) The fundamental limitation is that no single transaction can be
-        *    larger than half size of the log.
-        *
-        *    From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR
-        *    define, which is set to 3. That means we can definitely fit
-        *    2 maximally sized transactions in the log. We'll use this same
-        *    value here.
-        *
-        * 2) If the lsunit option is specified, a transaction requires 2 LSU
-        *    for the reservation because there are two log writes that can
-        *    require padding - the transaction data and the commit record which
-        *    are written separately and both can require padding to the LSU.
-        *    Consider that we can have an active CIL reservation holding 2*LSU,
-        *    but the CIL is not over a push threshold. In this case, if we
-        *    don't have enough log space for at least one new transaction,
-        *    which includes another 2*LSU in the reservation, we will
-        *    deadlock in the log space grant procedure, i.e.
-        *    xlog_grant_head_wait().
-        *
-        *    Hence the log size needs to be able to contain two maximally sized
-        *    and padded transactions, which is (2 * (2 * LSU + maxlres)).
-        *
-        * Also, the log size should be a multiple of the log stripe unit, round
-        * it up to lsunit boundary if lsunit is specified.
-        */
-       if (lsunit) {
-               min_logblks = roundup_64(BTOBB(max_logres), lsunit) +
-                             2 * lsunit;
-       } else
-               min_logblks = BTOBB(max_logres) + 2 * BBSIZE;
-       min_logblks *= XFS_MIN_LOG_FACTOR;
-
-       return XFS_BB_TO_FSB(mp, min_logblks);
-}
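
Before it was deleted here, xfs_log_calc_minimum_size() sized the log as XFS_MIN_LOG_FACTOR times one maximally sized, padded transaction: with a log stripe unit it computes roundup(max_logres, lsunit) + 2 * lsunit, then multiplies by the factor of 3. A standalone sketch of that arithmetic with assumed example inputs (10000 basic blocks of reservation, a 64-block stripe unit):

    #include <stdio.h>

    #define XFS_MIN_LOG_FACTOR 3    /* same factor the deleted comment cites */

    /* Round x up to the next multiple of unit (unit > 0). */
    static long long roundup64(long long x, long long unit)
    {
            return ((x + unit - 1) / unit) * unit;
    }

    int main(void)
    {
            /* Assumed inputs, for illustration only. */
            long long max_logres_bb = 10000, lsunit = 64;
            long long min_logblks =
                    (roundup64(max_logres_bb, lsunit) + 2 * lsunit) *
                    XFS_MIN_LOG_FACTOR;

            printf("minimum log size: %lld basic blocks\n", min_logblks);
            return 0;       /* prints 30528 for these inputs */
    }
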
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 3507cd0ec4004e37f65ec490e9fb4ea9e6e314f0..fbf0384a466fa25fba334862b0c1762be68ab6d4 100644 (file)
@@ -42,6 +42,7 @@
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_dinode.h"
+#include "xfs_sysfs.h"
 
 
 #ifdef HAVE_PERCPU_SB
@@ -60,6 +61,8 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
 static uuid_t *xfs_uuid_table;
 
+extern struct kset *xfs_kset;
+
 /*
  * See if the UUID is unique among mounted XFS filesystems.
  * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
@@ -76,7 +79,7 @@ xfs_uuid_mount(
 
        if (uuid_is_nil(uuid)) {
                xfs_warn(mp, "Filesystem has nil UUID - can't mount");
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        }
 
        mutex_lock(&xfs_uuid_table_mutex);
@@ -104,7 +107,7 @@ xfs_uuid_mount(
  out_duplicate:
        mutex_unlock(&xfs_uuid_table_mutex);
        xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
-       return XFS_ERROR(EINVAL);
+       return -EINVAL;
 }
 
 STATIC void
@@ -173,13 +176,9 @@ xfs_sb_validate_fsb_count(
        ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
        ASSERT(sbp->sb_blocklog >= BBSHIFT);
 
-#if XFS_BIG_BLKNOS     /* Limited by ULONG_MAX of page cache index */
+       /* Limited by ULONG_MAX of page cache index */
        if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
-               return EFBIG;
-#else                  /* Limited by UINT_MAX of sectors */
-       if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
-               return EFBIG;
-#endif
+               return -EFBIG;
        return 0;
 }
 
@@ -250,9 +249,9 @@ xfs_initialize_perag(
                mp->m_flags &= ~XFS_MOUNT_32BITINODES;
 
        if (mp->m_flags & XFS_MOUNT_32BITINODES)
-               index = xfs_set_inode32(mp);
+               index = xfs_set_inode32(mp, agcount);
        else
-               index = xfs_set_inode64(mp);
+               index = xfs_set_inode64(mp, agcount);
 
        if (maxagi)
                *maxagi = index;
@@ -308,15 +307,15 @@ reread:
        if (!bp) {
                if (loud)
                        xfs_warn(mp, "SB buffer read failed");
-               return EIO;
+               return -EIO;
        }
        if (bp->b_error) {
                error = bp->b_error;
                if (loud)
                        xfs_warn(mp, "SB validate failed with error %d.", error);
                /* bad CRC means corrupted metadata */
-               if (error == EFSBADCRC)
-                       error = EFSCORRUPTED;
+               if (error == -EFSBADCRC)
+                       error = -EFSCORRUPTED;
                goto release_buf;
        }
 
@@ -324,7 +323,6 @@ reread:
         * Initialize the mount structure from the superblock.
         */
        xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
-       xfs_sb_quota_from_disk(sbp);
 
        /*
         * If we haven't validated the superblock, do so now before we try
@@ -333,7 +331,7 @@ reread:
        if (sbp->sb_magicnum != XFS_SB_MAGIC) {
                if (loud)
                        xfs_warn(mp, "Invalid superblock magic number");
-               error = EINVAL;
+               error = -EINVAL;
                goto release_buf;
        }
 
@@ -344,7 +342,7 @@ reread:
                if (loud)
                        xfs_warn(mp, "device supports %u byte sectors (not %u)",
                                sector_size, sbp->sb_sectsize);
-               error = ENOSYS;
+               error = -ENOSYS;
                goto release_buf;
        }
 
@@ -392,7 +390,7 @@ xfs_update_alignment(xfs_mount_t *mp)
                        xfs_warn(mp,
                "alignment check failed: sunit/swidth vs. blocksize(%d)",
                                sbp->sb_blocksize);
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
                } else {
                        /*
                         * Convert the stripe unit and width to FSBs.
@@ -402,14 +400,14 @@ xfs_update_alignment(xfs_mount_t *mp)
                                xfs_warn(mp,
                        "alignment check failed: sunit/swidth vs. agsize(%d)",
                                         sbp->sb_agblocks);
-                               return XFS_ERROR(EINVAL);
+                               return -EINVAL;
                        } else if (mp->m_dalign) {
                                mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
                        } else {
                                xfs_warn(mp,
                        "alignment check failed: sunit(%d) less than bsize(%d)",
                                         mp->m_dalign, sbp->sb_blocksize);
-                               return XFS_ERROR(EINVAL);
+                               return -EINVAL;
                        }
                }
 
@@ -429,7 +427,7 @@ xfs_update_alignment(xfs_mount_t *mp)
                } else {
                        xfs_warn(mp,
        "cannot change alignment: superblock does not support data alignment");
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
                }
        } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
                    xfs_sb_version_hasdalign(&mp->m_sb)) {
@@ -556,14 +554,14 @@ xfs_check_sizes(xfs_mount_t *mp)
        d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
        if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
                xfs_warn(mp, "filesystem size mismatch detected");
-               return XFS_ERROR(EFBIG);
+               return -EFBIG;
        }
        bp = xfs_buf_read_uncached(mp->m_ddev_targp,
                                        d - XFS_FSS_TO_BB(mp, 1),
                                        XFS_FSS_TO_BB(mp, 1), 0, NULL);
        if (!bp) {
                xfs_warn(mp, "last sector read failed");
-               return EIO;
+               return -EIO;
        }
        xfs_buf_relse(bp);
 
@@ -571,14 +569,14 @@ xfs_check_sizes(xfs_mount_t *mp)
                d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
                if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
                        xfs_warn(mp, "log size mismatch detected");
-                       return XFS_ERROR(EFBIG);
+                       return -EFBIG;
                }
                bp = xfs_buf_read_uncached(mp->m_logdev_targp,
                                        d - XFS_FSB_TO_BB(mp, 1),
                                        XFS_FSB_TO_BB(mp, 1), 0, NULL);
                if (!bp) {
                        xfs_warn(mp, "log device read failed");
-                       return EIO;
+                       return -EIO;
                }
                xfs_buf_relse(bp);
        }
@@ -731,10 +729,15 @@ xfs_mountfs(
 
        xfs_set_maxicount(mp);
 
-       error = xfs_uuid_mount(mp);
+       mp->m_kobj.kobject.kset = xfs_kset;
+       error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
        if (error)
                goto out;
 
+       error = xfs_uuid_mount(mp);
+       if (error)
+               goto out_remove_sysfs;
+
        /*
         * Set the minimum read and write sizes
         */
@@ -816,7 +819,7 @@ xfs_mountfs(
        if (!sbp->sb_logblocks) {
                xfs_warn(mp, "no log defined");
                XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
-               error = XFS_ERROR(EFSCORRUPTED);
+               error = -EFSCORRUPTED;
                goto out_free_perag;
        }
 
@@ -855,7 +858,7 @@ xfs_mountfs(
             !mp->m_sb.sb_inprogress) {
                error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
                if (error)
-                       goto out_fail_wait;
+                       goto out_log_dealloc;
        }
 
        /*
@@ -876,7 +879,7 @@ xfs_mountfs(
                xfs_iunlock(rip, XFS_ILOCK_EXCL);
                XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
                                 mp);
-               error = XFS_ERROR(EFSCORRUPTED);
+               error = -EFSCORRUPTED;
                goto out_rele_rip;
        }
        mp->m_rootip = rip;     /* save it */
@@ -927,7 +930,7 @@ xfs_mountfs(
                        xfs_notice(mp, "resetting quota flags");
                        error = xfs_mount_reset_sbqflags(mp);
                        if (error)
-                               return error;
+                               goto out_rtunmount;
                }
        }
 
@@ -989,6 +992,8 @@ xfs_mountfs(
        xfs_da_unmount(mp);
  out_remove_uuid:
        xfs_uuid_unmount(mp);
+ out_remove_sysfs:
+       xfs_sysfs_del(&mp->m_kobj);
  out:
        return error;
 }
@@ -1071,6 +1076,8 @@ xfs_unmountfs(
        xfs_errortag_clearall(mp, 0);
 #endif
        xfs_free_perag(mp);
+
+       xfs_sysfs_del(&mp->m_kobj);
 }
 
 int
@@ -1152,7 +1159,7 @@ xfs_mod_incore_sb_unlocked(
                lcounter += delta;
                if (lcounter < 0) {
                        ASSERT(0);
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
                }
                mp->m_sb.sb_icount = lcounter;
                return 0;
@@ -1161,7 +1168,7 @@ xfs_mod_incore_sb_unlocked(
                lcounter += delta;
                if (lcounter < 0) {
                        ASSERT(0);
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
                }
                mp->m_sb.sb_ifree = lcounter;
                return 0;
@@ -1191,7 +1198,7 @@ xfs_mod_incore_sb_unlocked(
                         * blocks if were allowed to.
                         */
                        if (!rsvd)
-                               return XFS_ERROR(ENOSPC);
+                               return -ENOSPC;
 
                        lcounter = (long long)mp->m_resblks_avail + delta;
                        if (lcounter >= 0) {
@@ -1202,7 +1209,7 @@ xfs_mod_incore_sb_unlocked(
                                "Filesystem \"%s\": reserve blocks depleted! "
                                "Consider increasing reserve pool size.",
                                mp->m_fsname);
-                       return XFS_ERROR(ENOSPC);
+                       return -ENOSPC;
                }
 
                mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
@@ -1211,7 +1218,7 @@ xfs_mod_incore_sb_unlocked(
                lcounter = (long long)mp->m_sb.sb_frextents;
                lcounter += delta;
                if (lcounter < 0) {
-                       return XFS_ERROR(ENOSPC);
+                       return -ENOSPC;
                }
                mp->m_sb.sb_frextents = lcounter;
                return 0;
@@ -1220,7 +1227,7 @@ xfs_mod_incore_sb_unlocked(
                lcounter += delta;
                if (lcounter < 0) {
                        ASSERT(0);
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
                }
                mp->m_sb.sb_dblocks = lcounter;
                return 0;
@@ -1229,7 +1236,7 @@ xfs_mod_incore_sb_unlocked(
                scounter += delta;
                if (scounter < 0) {
                        ASSERT(0);
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
                }
                mp->m_sb.sb_agcount = scounter;
                return 0;
@@ -1238,7 +1245,7 @@ xfs_mod_incore_sb_unlocked(
                scounter += delta;
                if (scounter < 0) {
                        ASSERT(0);
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
                }
                mp->m_sb.sb_imax_pct = scounter;
                return 0;
@@ -1247,7 +1254,7 @@ xfs_mod_incore_sb_unlocked(
                scounter += delta;
                if (scounter < 0) {
                        ASSERT(0);
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
                }
                mp->m_sb.sb_rextsize = scounter;
                return 0;
@@ -1256,7 +1263,7 @@ xfs_mod_incore_sb_unlocked(
                scounter += delta;
                if (scounter < 0) {
                        ASSERT(0);
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
                }
                mp->m_sb.sb_rbmblocks = scounter;
                return 0;
@@ -1265,7 +1272,7 @@ xfs_mod_incore_sb_unlocked(
                lcounter += delta;
                if (lcounter < 0) {
                        ASSERT(0);
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
                }
                mp->m_sb.sb_rblocks = lcounter;
                return 0;
@@ -1274,7 +1281,7 @@ xfs_mod_incore_sb_unlocked(
                lcounter += delta;
                if (lcounter < 0) {
                        ASSERT(0);
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
                }
                mp->m_sb.sb_rextents = lcounter;
                return 0;
@@ -1283,13 +1290,13 @@ xfs_mod_incore_sb_unlocked(
                scounter += delta;
                if (scounter < 0) {
                        ASSERT(0);
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
                }
                mp->m_sb.sb_rextslog = scounter;
                return 0;
        default:
                ASSERT(0);
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        }
 }
 
@@ -1452,7 +1459,7 @@ xfs_dev_is_read_only(
            (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
                xfs_notice(mp, "%s required on read-only device.", message);
                xfs_notice(mp, "write access unavailable, cannot proceed.");
-               return EROFS;
+               return -EROFS;
        }
        return 0;
 }
@@ -1995,7 +2002,7 @@ slow_path:
         * (e.g. lots of space just got freed). After that
         * we are done.
         */
-       if (ret != ENOSPC)
+       if (ret != -ENOSPC)
                xfs_icsb_balance_counter(mp, field, 0);
        xfs_icsb_unlock(mp);
        return ret;
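
The xfs_mountfs() hunks above register a per-mount sysfs kobject (xfs_sysfs_init) before the UUID table entry and add a matching out_remove_sysfs unwind label, keeping teardown in strict reverse order of setup. A minimal sketch of that goto-unwind idiom with hypothetical setup/teardown stand-ins:

    #include <errno.h>

    /* Hypothetical pairs standing in for xfs_sysfs_init()/xfs_sysfs_del()
     * and xfs_uuid_mount()/xfs_uuid_unmount(). */
    static int  setup_sysfs(void)    { return 0; }
    static void teardown_sysfs(void) { }
    static int  setup_uuid(void)     { return -EINVAL; /* force error path */ }

    static int mount_like(void)
    {
            int error;

            error = setup_sysfs();
            if (error)
                    goto out;
            error = setup_uuid();
            if (error)
                    goto out_remove_sysfs;  /* unwind in reverse order */
            return 0;

    out_remove_sysfs:
            teardown_sysfs();
    out:
            return error;
    }

    int main(void)
    {
            return mount_like() == -EINVAL ? 0 : 1;
    }
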
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7295a0b7c343ced6c39291e3495e9ccce6967ad0..b0447c86e7e24e5e21c7b8a3ff28103d89583382 100644 (file)
@@ -166,6 +166,7 @@ typedef struct xfs_mount {
                                                   on the next remount,rw */
        int64_t                 m_low_space[XFS_LOWSP_MAX];
                                                /* low free space thresholds */
+       struct xfs_kobj         m_kobj;
 
        struct workqueue_struct *m_data_workqueue;
        struct workqueue_struct *m_unwritten_workqueue;
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index f99b4933dc226267935493cacd0ba103a38cef30..1eb6f3df698c8bb4eac11b5c03aa23790cbc65c4 100644 (file)
@@ -337,20 +337,20 @@ xfs_mru_cache_create(
                *mrup = NULL;
 
        if (!mrup || !grp_count || !lifetime_ms || !free_func)
-               return EINVAL;
+               return -EINVAL;
 
        if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
-               return EINVAL;
+               return -EINVAL;
 
        if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP)))
-               return ENOMEM;
+               return -ENOMEM;
 
        /* An extra list is needed to avoid reaping up to a grp_time early. */
        mru->grp_count = grp_count + 1;
        mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
 
        if (!mru->lists) {
-               err = ENOMEM;
+               err = -ENOMEM;
                goto exit;
        }
 
@@ -434,16 +434,16 @@ xfs_mru_cache_insert(
 
        ASSERT(mru && mru->lists);
        if (!mru || !mru->lists)
-               return EINVAL;
+               return -EINVAL;
 
        if (radix_tree_preload(GFP_KERNEL))
-               return ENOMEM;
+               return -ENOMEM;
 
        INIT_LIST_HEAD(&elem->list_node);
        elem->key = key;
 
        spin_lock(&mru->lock);
-       error = -radix_tree_insert(&mru->store, key, elem);
+       error = radix_tree_insert(&mru->store, key, elem);
        radix_tree_preload_end();
        if (!error)
                _xfs_mru_cache_list_insert(mru, elem);
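
The insert hunk above drops a sign flip: radix_tree_insert() already returns 0 or a negative errno, which the old code negated to fit XFS's positive-errno convention. With the conversion, the value propagates unchanged, as in this tiny sketch (fake_insert is a hypothetical stand-in):

    #include <errno.h>

    /* Hypothetical stand-in for a kernel API that returns 0 or a
     * negative errno such as -ENOMEM or -EEXIST. */
    static int fake_insert(int fail) { return fail ? -EEXIST : 0; }

    int main(void)
    {
            int error = fake_insert(1);     /* was: error = -fake_insert(1); */
            return error == -EEXIST ? 0 : 1;
    }
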
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 6d26759c779aa42b01c22f868ec55effde134b4b..10232102b4a6ffac8a8a92f627ba0cb0caea64ab 100644 (file)
@@ -98,18 +98,18 @@ restart:
                        next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
 
                        error = execute(batch[i], data);
-                       if (error == EAGAIN) {
+                       if (error == -EAGAIN) {
                                skipped++;
                                continue;
                        }
-                       if (error && last_error != EFSCORRUPTED)
+                       if (error && last_error != -EFSCORRUPTED)
                                last_error = error;
                }
 
                mutex_unlock(&qi->qi_tree_lock);
 
                /* bail out if the filesystem is corrupted.  */
-               if (last_error == EFSCORRUPTED) {
+               if (last_error == -EFSCORRUPTED) {
                        skipped = 0;
                        break;
                }
@@ -138,7 +138,7 @@ xfs_qm_dqpurge(
        xfs_dqlock(dqp);
        if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
                xfs_dqunlock(dqp);
-               return EAGAIN;
+               return -EAGAIN;
        }
 
        dqp->dq_flags |= XFS_DQ_FREEING;
@@ -221,100 +221,6 @@ xfs_qm_unmount(
        }
 }
 
-
-/*
- * This is called from xfs_mountfs to start quotas and initialize all
- * necessary data structures like quotainfo.  This is also responsible for
- * running a quotacheck as necessary.  We are guaranteed that the superblock
- * is consistently read in at this point.
- *
- * If we fail here, the mount will continue with quota turned off. We don't
- * need to indicate success or failure at all.
- */
-void
-xfs_qm_mount_quotas(
-       xfs_mount_t     *mp)
-{
-       int             error = 0;
-       uint            sbf;
-
-       /*
-        * If quotas on realtime volumes are not supported, we disable
-        * quotas immediately.
-        */
-       if (mp->m_sb.sb_rextents) {
-               xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
-               mp->m_qflags = 0;
-               goto write_changes;
-       }
-
-       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
-       /*
-        * Allocate the quotainfo structure inside the mount struct, and
-        * create quotainode(s), and change/rev superblock if necessary.
-        */
-       error = xfs_qm_init_quotainfo(mp);
-       if (error) {
-               /*
-                * We must turn off quotas.
-                */
-               ASSERT(mp->m_quotainfo == NULL);
-               mp->m_qflags = 0;
-               goto write_changes;
-       }
-       /*
-        * If any of the quotas are not consistent, do a quotacheck.
-        */
-       if (XFS_QM_NEED_QUOTACHECK(mp)) {
-               error = xfs_qm_quotacheck(mp);
-               if (error) {
-                       /* Quotacheck failed and disabled quotas. */
-                       return;
-               }
-       }
-       /*
-        * If one type of quota is off, then it will lose its
-        * quotachecked status, since we won't be doing accounting for
-        * that type anymore.
-        */
-       if (!XFS_IS_UQUOTA_ON(mp))
-               mp->m_qflags &= ~XFS_UQUOTA_CHKD;
-       if (!XFS_IS_GQUOTA_ON(mp))
-               mp->m_qflags &= ~XFS_GQUOTA_CHKD;
-       if (!XFS_IS_PQUOTA_ON(mp))
-               mp->m_qflags &= ~XFS_PQUOTA_CHKD;
-
- write_changes:
-       /*
-        * We actually don't have to acquire the m_sb_lock at all.
-        * This can only be called from mount, and that's single threaded. XXX
-        */
-       spin_lock(&mp->m_sb_lock);
-       sbf = mp->m_sb.sb_qflags;
-       mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
-       spin_unlock(&mp->m_sb_lock);
-
-       if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
-               if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
-                       /*
-                        * We could only have been turning quotas off.
-                        * We aren't in very good shape actually because
-                        * the incore structures are convinced that quotas are
-                        * off, but the on-disk superblock doesn't know that!
-                        */
-                       ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
-                       xfs_alert(mp, "%s: Superblock update failed!",
-                               __func__);
-               }
-       }
-
-       if (error) {
-               xfs_warn(mp, "Failed to initialize disk quotas.");
-               return;
-       }
-}
-
 /*
  * Called from the vfsops layer.
  */
@@ -671,7 +577,7 @@ xfs_qm_init_quotainfo(
 
        qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
 
-       error = -list_lru_init(&qinf->qi_lru);
+       error = list_lru_init(&qinf->qi_lru);
        if (error)
                goto out_free_qinf;
 
@@ -995,7 +901,7 @@ xfs_qm_dqiter_bufs(
                 * will leave a trace in the log indicating corruption has
                 * been detected.
                 */
-               if (error == EFSCORRUPTED) {
+               if (error == -EFSCORRUPTED) {
                        error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
                                      XFS_FSB_TO_DADDR(mp, bno),
                                      mp->m_quotainfo->qi_dqchunklen, 0, &bp,
@@ -1005,6 +911,12 @@ xfs_qm_dqiter_bufs(
                if (error)
                        break;
 
+               /*
+                * A corrupt buffer might not have a verifier attached, so
+                * make sure we have the correct one attached before writeback
+                * occurs.
+                */
+               bp->b_ops = &xfs_dquot_buf_ops;
                xfs_qm_reset_dqcounts(mp, bp, firstid, type);
                xfs_buf_delwri_queue(bp, buffer_list);
                xfs_buf_relse(bp);
@@ -1090,7 +1002,7 @@ xfs_qm_dqiterate(
                                        xfs_buf_readahead(mp->m_ddev_targp,
                                               XFS_FSB_TO_DADDR(mp, rablkno),
                                               mp->m_quotainfo->qi_dqchunklen,
-                                              NULL);
+                                              &xfs_dquot_buf_ops);
                                        rablkno++;
                                }
                        }
@@ -1138,8 +1050,8 @@ xfs_qm_quotacheck_dqadjust(
                /*
                 * Shouldn't be able to turn off quotas here.
                 */
-               ASSERT(error != ESRCH);
-               ASSERT(error != ENOENT);
+               ASSERT(error != -ESRCH);
+               ASSERT(error != -ENOENT);
                return error;
        }
 
@@ -1226,7 +1138,7 @@ xfs_qm_dqusage_adjust(
         */
        if (xfs_is_quota_inode(&mp->m_sb, ino)) {
                *res = BULKSTAT_RV_NOTHING;
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        }
 
        /*
@@ -1330,7 +1242,7 @@ out_unlock:
  * Walk thru all the filesystem inodes and construct a consistent view
  * of the disk quota world. If the quotacheck fails, disable quotas.
  */
-int
+STATIC int
 xfs_qm_quotacheck(
        xfs_mount_t     *mp)
 {
@@ -1463,7 +1375,100 @@ xfs_qm_quotacheck(
                }
        } else
                xfs_notice(mp, "Quotacheck: Done.");
-       return (error);
+       return error;
+}
+
+/*
+ * This is called from xfs_mountfs to start quotas and initialize all
+ * necessary data structures like quotainfo.  This is also responsible for
+ * running a quotacheck as necessary.  We are guaranteed that the superblock
+ * is consistently read in at this point.
+ *
+ * If we fail here, the mount will continue with quota turned off. We don't
+ * need to indicate success or failure at all.
+ */
+void
+xfs_qm_mount_quotas(
+       struct xfs_mount        *mp)
+{
+       int                     error = 0;
+       uint                    sbf;
+
+       /*
+        * If quotas on realtime volumes are not supported, we disable
+        * quotas immediately.
+        */
+       if (mp->m_sb.sb_rextents) {
+               xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
+               mp->m_qflags = 0;
+               goto write_changes;
+       }
+
+       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+       /*
+        * Allocate the quotainfo structure inside the mount struct, and
+        * create quotainode(s), and change/rev superblock if necessary.
+        */
+       error = xfs_qm_init_quotainfo(mp);
+       if (error) {
+               /*
+                * We must turn off quotas.
+                */
+               ASSERT(mp->m_quotainfo == NULL);
+               mp->m_qflags = 0;
+               goto write_changes;
+       }
+       /*
+        * If any of the quotas are not consistent, do a quotacheck.
+        */
+       if (XFS_QM_NEED_QUOTACHECK(mp)) {
+               error = xfs_qm_quotacheck(mp);
+               if (error) {
+                       /* Quotacheck failed and disabled quotas. */
+                       return;
+               }
+       }
+       /*
+        * If one type of quota is off, then it will lose its
+        * quotachecked status, since we won't be doing accounting for
+        * that type anymore.
+        */
+       if (!XFS_IS_UQUOTA_ON(mp))
+               mp->m_qflags &= ~XFS_UQUOTA_CHKD;
+       if (!XFS_IS_GQUOTA_ON(mp))
+               mp->m_qflags &= ~XFS_GQUOTA_CHKD;
+       if (!XFS_IS_PQUOTA_ON(mp))
+               mp->m_qflags &= ~XFS_PQUOTA_CHKD;
+
+ write_changes:
+       /*
+        * We actually don't have to acquire the m_sb_lock at all.
+        * This can only be called from mount, and that's single threaded. XXX
+        */
+       spin_lock(&mp->m_sb_lock);
+       sbf = mp->m_sb.sb_qflags;
+       mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
+       spin_unlock(&mp->m_sb_lock);
+
+       if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
+               if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
+                       /*
+                        * We could only have been turning quotas off.
+                        * We aren't in very good shape actually because
+                        * the incore structures are convinced that quotas are
+                        * off, but the on-disk superblock doesn't know that!
+                        */
+                       ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
+                       xfs_alert(mp, "%s: Superblock update failed!",
+                               __func__);
+               }
+       }
+
+       if (error) {
+               xfs_warn(mp, "Failed to initialize disk quotas.");
+               return;
+       }
 }
 
 /*
@@ -1493,7 +1498,7 @@ xfs_qm_init_quotainos(
                        error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
                                             0, 0, &uip);
                        if (error)
-                               return XFS_ERROR(error);
+                               return error;
                }
                if (XFS_IS_GQUOTA_ON(mp) &&
                    mp->m_sb.sb_gquotino != NULLFSINO) {
@@ -1563,7 +1568,7 @@ error_rele:
                IRELE(gip);
        if (pip)
                IRELE(pip);
-       return XFS_ERROR(error);
+       return error;
 }
 
 STATIC void
@@ -1679,7 +1684,7 @@ xfs_qm_vop_dqalloc(
                                                 XFS_QMOPT_DOWARN,
                                                 &uq);
                        if (error) {
-                               ASSERT(error != ENOENT);
+                               ASSERT(error != -ENOENT);
                                return error;
                        }
                        /*
@@ -1706,7 +1711,7 @@ xfs_qm_vop_dqalloc(
                                                 XFS_QMOPT_DOWARN,
                                                 &gq);
                        if (error) {
-                               ASSERT(error != ENOENT);
+                               ASSERT(error != -ENOENT);
                                goto error_rele;
                        }
                        xfs_dqunlock(gq);
@@ -1726,7 +1731,7 @@ xfs_qm_vop_dqalloc(
                                                 XFS_QMOPT_DOWARN,
                                                 &pq);
                        if (error) {
-                               ASSERT(error != ENOENT);
+                               ASSERT(error != -ENOENT);
                                goto error_rele;
                        }
                        xfs_dqunlock(pq);
@@ -1895,7 +1900,7 @@ xfs_qm_vop_chown_reserve(
                                -((xfs_qcnt_t)delblks), 0, blkflags);
        }
 
-       return (0);
+       return 0;
 }
 
 int
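
The large delete/re-add pair above moves xfs_qm_mount_quotas() below xfs_qm_quotacheck(), which this patch makes STATIC (XFS's wrapper around static); the hunk in the header below drops the corresponding extern prototype. With the prototype gone, the definition has to precede its caller, as this sketch shows:

    /* A file-local function without a forward declaration must be defined
     * before its first use in the translation unit, hence the block move. */
    static int quotacheck(void)             /* definition first... */
    {
            return 0;
    }

    static void mount_quotas(void)          /* ...its caller follows */
    {
            (void)quotacheck();
    }

    int main(void)
    {
            mount_quotas();
            return 0;
    }
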
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 797fd4636273657992a1124bff8e5865fe36ba28..3a07a937e232a7e51bf089f981a664f782f5a75f 100644 (file)
@@ -157,7 +157,6 @@ struct xfs_dquot_acct {
 #define XFS_QM_RTBWARNLIMIT    5
 
 extern void            xfs_qm_destroy_quotainfo(struct xfs_mount *);
-extern int             xfs_qm_quotacheck(struct xfs_mount *);
 extern int             xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
 
 /* dquot stuff */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index e9be63abd8d29f9003521ce5efa0b2aeaae2f467..2c61e61b0205eef889af4fe88c2a58359e010649 100644 (file)
@@ -117,7 +117,7 @@ xfs_qm_newmount(
                        (uquotaondisk ? " usrquota" : ""),
                        (gquotaondisk ? " grpquota" : ""),
                        (pquotaondisk ? " prjquota" : ""));
-               return XFS_ERROR(EPERM);
+               return -EPERM;
        }
 
        if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index bbc813caba4c88b28df1e5710146aa1095629500..80f2d77d929a87797b67269880b1c04907b9f402 100644 (file)
@@ -64,10 +64,10 @@ xfs_qm_scall_quotaoff(
        /*
         * No file system can have quotas enabled on disk but not in core.
         * Note that quota utilities (like quotaoff) _expect_
-        * errno == EEXIST here.
+        * errno == -EEXIST here.
         */
        if ((mp->m_qflags & flags) == 0)
-               return XFS_ERROR(EEXIST);
+               return -EEXIST;
        error = 0;
 
        flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
@@ -94,7 +94,7 @@ xfs_qm_scall_quotaoff(
 
                /* XXX what to do if error ? Revert back to old vals incore ? */
                error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
-               return (error);
+               return error;
        }
 
        dqtype = 0;
@@ -198,7 +198,7 @@ xfs_qm_scall_quotaoff(
        if (mp->m_qflags == 0) {
                mutex_unlock(&q->qi_quotaofflock);
                xfs_qm_destroy_quotainfo(mp);
-               return (0);
+               return 0;
        }
 
        /*
@@ -278,13 +278,13 @@ xfs_qm_scall_trunc_qfiles(
        xfs_mount_t     *mp,
        uint            flags)
 {
-       int             error = EINVAL;
+       int             error = -EINVAL;
 
        if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 ||
            (flags & ~XFS_DQ_ALLTYPES)) {
                xfs_debug(mp, "%s: flags=%x m_qflags=%x",
                        __func__, flags, mp->m_qflags);
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        }
 
        if (flags & XFS_DQ_USER) {
@@ -328,7 +328,7 @@ xfs_qm_scall_quotaon(
        if (flags == 0) {
                xfs_debug(mp, "%s: zero flags, m_qflags=%x",
                        __func__, mp->m_qflags);
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        }
 
        /* No fs can turn on quotas with a delayed effect */
@@ -351,13 +351,13 @@ xfs_qm_scall_quotaon(
                xfs_debug(mp,
                        "%s: Can't enforce without acct, flags=%x sbflags=%x",
                        __func__, flags, mp->m_sb.sb_qflags);
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        }
        /*
         * If everything's up to-date incore, then don't waste time.
         */
        if ((mp->m_qflags & flags) == flags)
-               return XFS_ERROR(EEXIST);
+               return -EEXIST;
 
        /*
         * Change sb_qflags on disk but not incore mp->qflags
@@ -372,11 +372,11 @@ xfs_qm_scall_quotaon(
         * There's nothing to change if it's the same.
         */
        if ((qf & flags) == flags && sbflags == 0)
-               return XFS_ERROR(EEXIST);
+               return -EEXIST;
        sbflags |= XFS_SB_QFLAGS;
 
        if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
-               return (error);
+               return error;
        /*
         * If we aren't trying to switch on quota enforcement, we are done.
         */
@@ -387,10 +387,10 @@ xfs_qm_scall_quotaon(
             ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
             (mp->m_qflags & XFS_GQUOTA_ACCT)) ||
            (flags & XFS_ALL_QUOTA_ENFD) == 0)
-               return (0);
+               return 0;
 
        if (! XFS_IS_QUOTA_RUNNING(mp))
-               return XFS_ERROR(ESRCH);
+               return -ESRCH;
 
        /*
         * Switch on quota enforcement in core.
@@ -399,7 +399,7 @@ xfs_qm_scall_quotaon(
        mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
        mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
 
-       return (0);
+       return 0;
 }
 
 
@@ -426,7 +426,7 @@ xfs_qm_scall_getqstat(
        if (!xfs_sb_version_hasquota(&mp->m_sb)) {
                out->qs_uquota.qfs_ino = NULLFSINO;
                out->qs_gquota.qfs_ino = NULLFSINO;
-               return (0);
+               return 0;
        }
 
        out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
@@ -514,7 +514,7 @@ xfs_qm_scall_getqstatv(
                out->qs_uquota.qfs_ino = NULLFSINO;
                out->qs_gquota.qfs_ino = NULLFSINO;
                out->qs_pquota.qfs_ino = NULLFSINO;
-               return (0);
+               return 0;
        }
 
        out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
@@ -595,7 +595,7 @@ xfs_qm_scall_setqlim(
        xfs_qcnt_t              hard, soft;
 
        if (newlim->d_fieldmask & ~XFS_DQ_MASK)
-               return EINVAL;
+               return -EINVAL;
        if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
                return 0;
 
@@ -615,7 +615,7 @@ xfs_qm_scall_setqlim(
         */
        error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp);
        if (error) {
-               ASSERT(error != ENOENT);
+               ASSERT(error != -ENOENT);
                goto out_unlock;
        }
        xfs_dqunlock(dqp);
@@ -758,7 +758,7 @@ xfs_qm_log_quotaoff_end(
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
        if (error) {
                xfs_trans_cancel(tp, 0);
-               return (error);
+               return error;
        }
 
        qoffi = xfs_trans_get_qoff_item(tp, startqoff,
@@ -772,7 +772,7 @@ xfs_qm_log_quotaoff_end(
         */
        xfs_trans_set_sync(tp);
        error = xfs_trans_commit(tp, 0);
-       return (error);
+       return error;
 }
 
 
@@ -822,7 +822,7 @@ error0:
                spin_unlock(&mp->m_sb_lock);
        }
        *qoffstartp = qoffi;
-       return (error);
+       return error;
 }
 
 
@@ -850,7 +850,7 @@ xfs_qm_scall_getquota(
         * our utility programs are concerned.
         */
        if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
-               error = XFS_ERROR(ENOENT);
+               error = -ENOENT;
                goto out_put;
        }
 
@@ -953,7 +953,7 @@ xfs_qm_export_flags(
                uflags |= FS_QUOTA_GDQ_ENFD;
        if (flags & XFS_PQUOTA_ENFD)
                uflags |= FS_QUOTA_PDQ_ENFD;
-       return (uflags);
+       return uflags;
 }
 
 
diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/xfs_quota_defs.h
deleted file mode 100644 (file)
index 137e209..0000000
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_QUOTA_DEFS_H__
-#define __XFS_QUOTA_DEFS_H__
-
-/*
- * Quota definitions shared between user and kernel source trees.
- */
-
-/*
- * Even though users may not have quota limits occupying all 64-bits,
- * they may need 64-bit accounting. Hence, 64-bit quota-counters,
- * and quota-limits. This is a waste in the common case, but hey ...
- */
-typedef __uint64_t     xfs_qcnt_t;
-typedef __uint16_t     xfs_qwarncnt_t;
-
-/*
- * flags for q_flags field in the dquot.
- */
-#define XFS_DQ_USER            0x0001          /* a user quota */
-#define XFS_DQ_PROJ            0x0002          /* project quota */
-#define XFS_DQ_GROUP           0x0004          /* a group quota */
-#define XFS_DQ_DIRTY           0x0008          /* dquot is dirty */
-#define XFS_DQ_FREEING         0x0010          /* dquot is being torn down */
-
-#define XFS_DQ_ALLTYPES                (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
-
-#define XFS_DQ_FLAGS \
-       { XFS_DQ_USER,          "USER" }, \
-       { XFS_DQ_PROJ,          "PROJ" }, \
-       { XFS_DQ_GROUP,         "GROUP" }, \
-       { XFS_DQ_DIRTY,         "DIRTY" }, \
-       { XFS_DQ_FREEING,       "FREEING" }
-
-/*
- * We have the possibility of all three quota types being active at once, and
- * hence free space modification requires modification of all three current
- * dquots in a single transaction. For this case we need to have a reservation
- * of at least 3 dquots.
- *
- * However, a chmod operation can change both UID and GID in a single
- * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
- * modified. Hence for this case we need to reserve space for at least 4 dquots.
- *
- * And in the worst case, there's a rename operation that can be modifying up to
- * 4 inodes with dquots attached to them. In reality, the only inodes that can
- * have their dquots modified are the source and destination directory inodes
- * due to directory name creation and removal. That can require space allocation
- * and/or freeing on both directory inodes, and hence all three dquots on each
- * inode can be modified. And if the directories are world writeable, all the
- * dquots can be unique and so 6 dquots can be modified....
- *
- * And, of course, we also need to take into account the dquot log format item
- * used to describe each dquot.
- */
-#define XFS_DQUOT_LOGRES(mp)   \
-       ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
-
-#define XFS_IS_QUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
-#define XFS_IS_UQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_UQUOTA_ACCT)
-#define XFS_IS_PQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_PQUOTA_ACCT)
-#define XFS_IS_GQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_GQUOTA_ACCT)
-#define XFS_IS_UQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_UQUOTA_ENFD)
-#define XFS_IS_GQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_GQUOTA_ENFD)
-#define XFS_IS_PQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_PQUOTA_ENFD)
-
-/*
- * Incore only flags for quotaoff - these bits get cleared when quota(s)
- * are in the process of getting turned off. These flags are in m_qflags but
- * never in sb_qflags.
- */
-#define XFS_UQUOTA_ACTIVE      0x1000  /* uquotas are being turned off */
-#define XFS_GQUOTA_ACTIVE      0x2000  /* gquotas are being turned off */
-#define XFS_PQUOTA_ACTIVE      0x4000  /* pquotas are being turned off */
-#define XFS_ALL_QUOTA_ACTIVE   \
-       (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
-
-/*
- * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
- * quota will not be switched off as long as that inode lock is held.
- */
-#define XFS_IS_QUOTA_ON(mp)    ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
-                                                  XFS_GQUOTA_ACTIVE | \
-                                                  XFS_PQUOTA_ACTIVE))
-#define XFS_IS_OQUOTA_ON(mp)   ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
-                                                  XFS_PQUOTA_ACTIVE))
-#define XFS_IS_UQUOTA_ON(mp)   ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
-#define XFS_IS_GQUOTA_ON(mp)   ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
-#define XFS_IS_PQUOTA_ON(mp)   ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
-
-/*
- * Flags to tell various functions what to do. Not all of these are meaningful
- * to a single function. None of these XFS_QMOPT_* flags are meant to have
- * persistent values (ie. their values can and will change between versions)
- */
-#define XFS_QMOPT_DQALLOC      0x0000002 /* alloc dquot ondisk if needed */
-#define XFS_QMOPT_UQUOTA       0x0000004 /* user dquot requested */
-#define XFS_QMOPT_PQUOTA       0x0000008 /* project dquot requested */
-#define XFS_QMOPT_FORCE_RES    0x0000010 /* ignore quota limits */
-#define XFS_QMOPT_SBVERSION    0x0000040 /* change superblock version num */
-#define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
-#define XFS_QMOPT_DQREPAIR     0x0001000 /* repair dquot if damaged */
-#define XFS_QMOPT_GQUOTA       0x0002000 /* group dquot requested */
-#define XFS_QMOPT_ENOSPC       0x0004000 /* enospc instead of edquot (prj) */
-
-/*
- * flags to xfs_trans_mod_dquot to indicate which field needs to be
- * modified.
- */
-#define XFS_QMOPT_RES_REGBLKS  0x0010000
-#define XFS_QMOPT_RES_RTBLKS   0x0020000
-#define XFS_QMOPT_BCOUNT       0x0040000
-#define XFS_QMOPT_ICOUNT       0x0080000
-#define XFS_QMOPT_RTBCOUNT     0x0100000
-#define XFS_QMOPT_DELBCOUNT    0x0200000
-#define XFS_QMOPT_DELRTBCOUNT  0x0400000
-#define XFS_QMOPT_RES_INOS     0x0800000
-
-/*
- * flags for dqalloc.
- */
-#define XFS_QMOPT_INHERIT      0x1000000
-
-/*
- * flags to xfs_trans_mod_dquot.
- */
-#define XFS_TRANS_DQ_RES_BLKS  XFS_QMOPT_RES_REGBLKS
-#define XFS_TRANS_DQ_RES_RTBLKS        XFS_QMOPT_RES_RTBLKS
-#define XFS_TRANS_DQ_RES_INOS  XFS_QMOPT_RES_INOS
-#define XFS_TRANS_DQ_BCOUNT    XFS_QMOPT_BCOUNT
-#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
-#define XFS_TRANS_DQ_ICOUNT    XFS_QMOPT_ICOUNT
-#define XFS_TRANS_DQ_RTBCOUNT  XFS_QMOPT_RTBCOUNT
-#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
-
-
-#define XFS_QMOPT_QUOTALL      \
-               (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
-#define XFS_QMOPT_RESBLK_MASK  (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
-
-extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
-                      xfs_dqid_t id, uint type, uint flags, char *str);
-extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
-
-#endif /* __XFS_QUOTA_H__ */
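
The comment in the deleted header derives the worst case of six modified dquots (a rename touching two world-writable directories, three quota types each); XFS_DQUOT_LOGRES reserves one log-format header plus one on-disk dquot for each of the six. A quick sketch of that arithmetic with assumed struct sizes:

    #include <stdio.h>

    int main(void)
    {
            /* Sizes assumed for illustration; the real values are
             * sizeof(struct xfs_dq_logformat) and
             * sizeof(struct xfs_disk_dquot). */
            unsigned int logformat_size = 48, disk_dquot_size = 104;
            unsigned int ndquots = 6;   /* worst case from the comment above */

            printf("dquot log reservation: %u bytes\n",
                   (logformat_size + disk_dquot_size) * ndquots);
            return 0;
    }
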
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 2ad1b9822e92f5f712b5cbeab615bd04c939d22d..b238027df98735d60ff451ce7d5d3440bd12ce0e 100644 (file)
@@ -51,7 +51,7 @@ xfs_fs_get_xstate(
 
        if (!XFS_IS_QUOTA_RUNNING(mp))
                return -ENOSYS;
-       return -xfs_qm_scall_getqstat(mp, fqs);
+       return xfs_qm_scall_getqstat(mp, fqs);
 }
 
 STATIC int
@@ -63,7 +63,7 @@ xfs_fs_get_xstatev(
 
        if (!XFS_IS_QUOTA_RUNNING(mp))
                return -ENOSYS;
-       return -xfs_qm_scall_getqstatv(mp, fqs);
+       return xfs_qm_scall_getqstatv(mp, fqs);
 }
 
 STATIC int
@@ -95,11 +95,11 @@ xfs_fs_set_xstate(
 
        switch (op) {
        case Q_XQUOTAON:
-               return -xfs_qm_scall_quotaon(mp, flags);
+               return xfs_qm_scall_quotaon(mp, flags);
        case Q_XQUOTAOFF:
                if (!XFS_IS_QUOTA_ON(mp))
                        return -EINVAL;
-               return -xfs_qm_scall_quotaoff(mp, flags);
+               return xfs_qm_scall_quotaoff(mp, flags);
        }
 
        return -EINVAL;
@@ -112,7 +112,7 @@ xfs_fs_rm_xquota(
 {
        struct xfs_mount        *mp = XFS_M(sb);
        unsigned int            flags = 0;
-       
+
        if (sb->s_flags & MS_RDONLY)
                return -EROFS;
 
@@ -123,11 +123,11 @@ xfs_fs_rm_xquota(
                flags |= XFS_DQ_USER;
        if (uflags & FS_GROUP_QUOTA)
                flags |= XFS_DQ_GROUP;
-       if (uflags & FS_USER_QUOTA)
+       if (uflags & FS_PROJ_QUOTA)
                flags |= XFS_DQ_PROJ;
 
-       return -xfs_qm_scall_trunc_qfiles(mp, flags);
-}      
+       return xfs_qm_scall_trunc_qfiles(mp, flags);
+}
 
 STATIC int
 xfs_fs_get_dqblk(
@@ -142,7 +142,7 @@ xfs_fs_get_dqblk(
        if (!XFS_IS_QUOTA_ON(mp))
                return -ESRCH;
 
-       return -xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
+       return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
                                      xfs_quota_type(qid.type), fdq);
 }
 
@@ -161,7 +161,7 @@ xfs_fs_set_dqblk(
        if (!XFS_IS_QUOTA_ON(mp))
                return -ESRCH;
 
-       return -xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
+       return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
                                     xfs_quota_type(qid.type), fdq);
 }
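
The hunks above all drop a leading negation: the xfs_qm_scall_*() helpers now return negative errno values themselves, so the quotactl callers pass the result straight through instead of flipping the sign. A standalone sketch of the two conventions (the helper/caller names here are hypothetical):

/* errno_convention.c - old vs. new error return convention */
#include <errno.h>
#include <stdio.h>
#include <string.h>

static int helper_old_style(void) { return EINVAL; }   /* positive errno */
static int helper_new_style(void) { return -EINVAL; }  /* negative errno */

/* old convention: the caller negates on the way out */
static int caller_old_style(void) { return -helper_old_style(); }
/* new convention: the negative value passes straight through */
static int caller_new_style(void) { return helper_new_style(); }

int main(void)
{
        int e_old = caller_old_style();
        int e_new = caller_new_style();

        /* both conventions end up with the same negative errno */
        printf("old %d, new %d (%s)\n", e_old, e_new, strerror(-e_new));
        return 0;
}
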
 
index ec5ca65c62116e62d509b987c92c983e79d3cbb6..909e143b87ae66124472c8f8e08fe1a9aaa4737e 100644 (file)
@@ -863,7 +863,7 @@ xfs_growfs_rt_alloc(
                                        XFS_BMAPI_METADATA, &firstblock,
                                        resblks, &map, &nmap, &flist);
                if (!error && nmap < 1)
-                       error = XFS_ERROR(ENOSPC);
+                       error = -ENOSPC;
                if (error)
                        goto error_cancel;
                /*
@@ -903,7 +903,7 @@ xfs_growfs_rt_alloc(
                        bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
                                mp->m_bsize, 0);
                        if (bp == NULL) {
-                               error = XFS_ERROR(EIO);
+                               error = -EIO;
 error_cancel:
                                xfs_trans_cancel(tp, cancelflags);
                                goto error;
@@ -944,9 +944,9 @@ xfs_growfs_rt(
        xfs_buf_t       *bp;            /* temporary buffer */
        int             error;          /* error return value */
        xfs_mount_t     *nmp;           /* new (fake) mount structure */
-       xfs_drfsbno_t   nrblocks;       /* new number of realtime blocks */
+       xfs_rfsblock_t  nrblocks;       /* new number of realtime blocks */
        xfs_extlen_t    nrbmblocks;     /* new number of rt bitmap blocks */
-       xfs_drtbno_t    nrextents;      /* new number of realtime extents */
+       xfs_rtblock_t   nrextents;      /* new number of realtime extents */
        uint8_t         nrextslog;      /* new log2 of sb_rextents */
        xfs_extlen_t    nrsumblocks;    /* new number of summary blocks */
        uint            nrsumlevels;    /* new rt summary levels */
@@ -962,11 +962,11 @@ xfs_growfs_rt(
         * Initial error checking.
         */
        if (!capable(CAP_SYS_ADMIN))
-               return XFS_ERROR(EPERM);
+               return -EPERM;
        if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
            (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
            (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks)))
                return error;
        /*
@@ -976,7 +976,7 @@ xfs_growfs_rt(
                                XFS_FSB_TO_BB(mp, nrblocks - 1),
                                XFS_FSB_TO_BB(mp, 1), 0, NULL);
        if (!bp)
-               return EIO;
+               return -EIO;
        if (bp->b_error) {
                error = bp->b_error;
                xfs_buf_relse(bp);
@@ -1001,7 +1001,7 @@ xfs_growfs_rt(
         * since we'll log basically the whole summary file at once.
         */
        if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        /*
         * Get the old block counts for bitmap and summary inodes.
         * These can't change since other growfs callers are locked out.
@@ -1208,7 +1208,7 @@ xfs_rtallocate_extent(
                                len, &sumbp, &sb, prod, &r);
                break;
        default:
-               error = EIO;
+               error = -EIO;
                ASSERT(0);
        }
        if (error)
@@ -1247,7 +1247,7 @@ xfs_rtmount_init(
        if (mp->m_rtdev_targp == NULL) {
                xfs_warn(mp,
        "Filesystem has a realtime volume, use rtdev=device option");
-               return XFS_ERROR(ENODEV);
+               return -ENODEV;
        }
        mp->m_rsumlevels = sbp->sb_rextslog + 1;
        mp->m_rsumsize =
@@ -1263,7 +1263,7 @@ xfs_rtmount_init(
                xfs_warn(mp, "realtime mount -- %llu != %llu",
                        (unsigned long long) XFS_BB_TO_FSB(mp, d),
                        (unsigned long long) mp->m_sb.sb_rblocks);
-               return XFS_ERROR(EFBIG);
+               return -EFBIG;
        }
        bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
                                        d - XFS_FSB_TO_BB(mp, 1),
@@ -1272,7 +1272,7 @@ xfs_rtmount_init(
                xfs_warn(mp, "realtime device size check failed");
                if (bp)
                        xfs_buf_relse(bp);
-               return EIO;
+               return -EIO;
        }
        xfs_buf_relse(bp);
        return 0;
index 752b63d103003288d48c463571cc59279f8c531d..c642795324af649c1c80666e5bdc7e0739551e1d 100644 (file)
@@ -132,7 +132,7 @@ xfs_rtmount_init(
                return 0;
 
        xfs_warn(mp, "Not built with CONFIG_XFS_RT");
-       return ENOSYS;
+       return -ENOSYS;
 }
 # define xfs_rtmount_inodes(m)  (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
 # define xfs_rtunmount_inodes(m)
diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/xfs_rtbitmap.c
deleted file mode 100644 (file)
index f4dd697..0000000
+++ /dev/null
@@ -1,973 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
-#include "xfs_trans.h"
-#include "xfs_trans_space.h"
-#include "xfs_trace.h"
-#include "xfs_buf.h"
-#include "xfs_icache.h"
-#include "xfs_dinode.h"
-#include "xfs_rtalloc.h"
-
-
-/*
- * Realtime allocator bitmap functions shared with userspace.
- */
-
-/*
- * Get a buffer for the bitmap or summary file block specified.
- * The buffer is returned read and locked.
- */
-int
-xfs_rtbuf_get(
-       xfs_mount_t     *mp,            /* file system mount structure */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_rtblock_t   block,          /* block number in bitmap or summary */
-       int             issum,          /* is summary not bitmap */
-       xfs_buf_t       **bpp)          /* output: buffer for the block */
-{
-       xfs_buf_t       *bp;            /* block buffer, result */
-       xfs_inode_t     *ip;            /* bitmap or summary inode */
-       xfs_bmbt_irec_t map;
-       int             nmap = 1;
-       int             error;          /* error value */
-
-       ip = issum ? mp->m_rsumip : mp->m_rbmip;
-
-       error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
-       if (error)
-               return error;
-
-       ASSERT(map.br_startblock != NULLFSBLOCK);
-       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-                                  XFS_FSB_TO_DADDR(mp, map.br_startblock),
-                                  mp->m_bsize, 0, &bp, NULL);
-       if (error)
-               return error;
-       *bpp = bp;
-       return 0;
-}
-
-/*
- * Searching backward from start to limit, find the first block whose
- * allocated/free state is different from start's.
- */
-int
-xfs_rtfind_back(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_rtblock_t   start,          /* starting block to look at */
-       xfs_rtblock_t   limit,          /* last block to look at */
-       xfs_rtblock_t   *rtblock)       /* out: start block found */
-{
-       xfs_rtword_t    *b;             /* current word in buffer */
-       int             bit;            /* bit number in the word */
-       xfs_rtblock_t   block;          /* bitmap block number */
-       xfs_buf_t       *bp;            /* buf for the block */
-       xfs_rtword_t    *bufp;          /* starting word in buffer */
-       int             error;          /* error value */
-       xfs_rtblock_t   firstbit;       /* first useful bit in the word */
-       xfs_rtblock_t   i;              /* current bit number rel. to start */
-       xfs_rtblock_t   len;            /* length of inspected area */
-       xfs_rtword_t    mask;           /* mask of relevant bits for value */
-       xfs_rtword_t    want;           /* mask for "good" values */
-       xfs_rtword_t    wdiff;          /* difference from wanted value */
-       int             word;           /* word number in the buffer */
-
-       /*
-        * Compute and read in starting bitmap block for starting block.
-        */
-       block = XFS_BITTOBLOCK(mp, start);
-       error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
-       if (error) {
-               return error;
-       }
-       bufp = bp->b_addr;
-       /*
-        * Get the first word's index & point to it.
-        */
-       word = XFS_BITTOWORD(mp, start);
-       b = &bufp[word];
-       bit = (int)(start & (XFS_NBWORD - 1));
-       len = start - limit + 1;
-       /*
-        * Compute match value, based on the bit at start: if 1 (free)
-        * then all-ones, else all-zeroes.
-        */
-       want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
-       /*
-        * If the starting position is not word-aligned, deal with the
-        * partial word.
-        */
-       if (bit < XFS_NBWORD - 1) {
-               /*
-                * Calculate first (leftmost) bit number to look at,
-                * and mask for all the relevant bits in this word.
-                */
-               firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
-               mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
-                       firstbit;
-               /*
-                * Calculate the difference between the value there
-                * and what we're looking for.
-                */
-               if ((wdiff = (*b ^ want) & mask)) {
-                       /*
-                        * Different.  Mark where we are and return.
-                        */
-                       xfs_trans_brelse(tp, bp);
-                       i = bit - XFS_RTHIBIT(wdiff);
-                       *rtblock = start - i + 1;
-                       return 0;
-               }
-               i = bit - firstbit + 1;
-               /*
-                * Go on to previous block if that's where the previous word is
-                * and we need the previous word.
-                */
-               if (--word == -1 && i < len) {
-                       /*
-                        * If done with this block, get the previous one.
-                        */
-                       xfs_trans_brelse(tp, bp);
-                       error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
-                       if (error) {
-                               return error;
-                       }
-                       bufp = bp->b_addr;
-                       word = XFS_BLOCKWMASK(mp);
-                       b = &bufp[word];
-               } else {
-                       /*
-                        * Go on to the previous word in the buffer.
-                        */
-                       b--;
-               }
-       } else {
-               /*
-                * Starting on a word boundary, no partial word.
-                */
-               i = 0;
-       }
-       /*
-        * Loop over whole words in buffers.  When we use up one buffer
-        * we move on to the previous one.
-        */
-       while (len - i >= XFS_NBWORD) {
-               /*
-                * Compute difference between actual and desired value.
-                */
-               if ((wdiff = *b ^ want)) {
-                       /*
-                        * Different, mark where we are and return.
-                        */
-                       xfs_trans_brelse(tp, bp);
-                       i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
-                       *rtblock = start - i + 1;
-                       return 0;
-               }
-               i += XFS_NBWORD;
-               /*
-                * Go on to previous block if that's where the previous word is
-                * and we need the previous word.
-                */
-               if (--word == -1 && i < len) {
-                       /*
-                        * If done with this block, get the previous one.
-                        */
-                       xfs_trans_brelse(tp, bp);
-                       error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
-                       if (error) {
-                               return error;
-                       }
-                       bufp = bp->b_addr;
-                       word = XFS_BLOCKWMASK(mp);
-                       b = &bufp[word];
-               } else {
-                       /*
-                        * Go on to the previous word in the buffer.
-                        */
-                       b--;
-               }
-       }
-       /*
-        * If not ending on a word boundary, deal with the last
-        * (partial) word.
-        */
-       if (len - i) {
-               /*
-                * Calculate first (leftmost) bit number to look at,
-                * and mask for all the relevant bits in this word.
-                */
-               firstbit = XFS_NBWORD - (len - i);
-               mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
-               /*
-                * Compute difference between actual and desired value.
-                */
-               if ((wdiff = (*b ^ want) & mask)) {
-                       /*
-                        * Different, mark where we are and return.
-                        */
-                       xfs_trans_brelse(tp, bp);
-                       i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
-                       *rtblock = start - i + 1;
-                       return 0;
-               } else
-                       i = len;
-       }
-       /*
-        * No match, return that we scanned the whole area.
-        */
-       xfs_trans_brelse(tp, bp);
-       *rtblock = start - i + 1;
-       return 0;
-}
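
The function works a word at a time: XOR each bitmap word against an all-ones or all-zeroes "want" pattern, mask off the bits outside the search range, and take the highest set bit of the difference. A self-contained userspace model of the same backward scan over a flat uint32_t array (the buffered block handling is elided, and hibit() uses a GCC-style builtin in place of XFS_RTHIBIT):

/* rtfind_back_demo.c - simplified model of the backward bit scan */
#include <stdint.h>
#include <stdio.h>

static int hibit(uint32_t w)    /* index of the highest set bit */
{
        return 31 - __builtin_clz(w);
}

/*
 * Scan backward from bit 'start' down to bit 'limit' (inclusive) for the
 * first bit whose state differs from the bit at 'start'.  Returns that
 * bit's number, or limit - 1 if the whole range matches.
 */
static long find_back(const uint32_t *map, long start, long limit)
{
        uint32_t want = (map[start / 32] >> (start % 32)) & 1 ? ~0u : 0u;
        long word = start / 32;
        int bit = start % 32;
        /* relevant bits of the first word: positions 0..bit */
        uint32_t mask = bit == 31 ? ~0u : (1u << (bit + 1)) - 1;

        for (;;) {
                uint32_t wdiff = (map[word] ^ want) & mask;
                long base = word * 32;

                if (wdiff) {
                        long found = base + hibit(wdiff);

                        /* a difference below 'limit' doesn't count */
                        return found >= limit ? found : limit - 1;
                }
                if (base <= limit)      /* limit's word already scanned */
                        return limit - 1;
                word--;
                mask = ~0u;             /* whole words from here down */
        }
}

int main(void)
{
        uint32_t map[2] = { 0xffff0000, 0xffffffff };   /* bits 0-15 clear */

        /* bit 40 is set; the first clear bit going down is bit 15 */
        printf("first differing bit: %ld\n", find_back(map, 40, 0));
        return 0;
}
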
-
-/*
- * Searching forward from start to limit, find the first block whose
- * allocated/free state is different from start's.
- */
-int
-xfs_rtfind_forw(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_rtblock_t   start,          /* starting block to look at */
-       xfs_rtblock_t   limit,          /* last block to look at */
-       xfs_rtblock_t   *rtblock)       /* out: start block found */
-{
-       xfs_rtword_t    *b;             /* current word in buffer */
-       int             bit;            /* bit number in the word */
-       xfs_rtblock_t   block;          /* bitmap block number */
-       xfs_buf_t       *bp;            /* buf for the block */
-       xfs_rtword_t    *bufp;          /* starting word in buffer */
-       int             error;          /* error value */
-       xfs_rtblock_t   i;              /* current bit number rel. to start */
-       xfs_rtblock_t   lastbit;        /* last useful bit in the word */
-       xfs_rtblock_t   len;            /* length of inspected area */
-       xfs_rtword_t    mask;           /* mask of relevant bits for value */
-       xfs_rtword_t    want;           /* mask for "good" values */
-       xfs_rtword_t    wdiff;          /* difference from wanted value */
-       int             word;           /* word number in the buffer */
-
-       /*
-        * Compute and read in starting bitmap block for starting block.
-        */
-       block = XFS_BITTOBLOCK(mp, start);
-       error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
-       if (error) {
-               return error;
-       }
-       bufp = bp->b_addr;
-       /*
-        * Get the first word's index & point to it.
-        */
-       word = XFS_BITTOWORD(mp, start);
-       b = &bufp[word];
-       bit = (int)(start & (XFS_NBWORD - 1));
-       len = limit - start + 1;
-       /*
-        * Compute match value, based on the bit at start: if 1 (free)
-        * then all-ones, else all-zeroes.
-        */
-       want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
-       /*
-        * If the starting position is not word-aligned, deal with the
-        * partial word.
-        */
-       if (bit) {
-               /*
-                * Calculate last (rightmost) bit number to look at,
-                * and mask for all the relevant bits in this word.
-                */
-               lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
-               mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
-               /*
-                * Calculate the difference between the value there
-                * and what we're looking for.
-                */
-               if ((wdiff = (*b ^ want) & mask)) {
-                       /*
-                        * Different.  Mark where we are and return.
-                        */
-                       xfs_trans_brelse(tp, bp);
-                       i = XFS_RTLOBIT(wdiff) - bit;
-                       *rtblock = start + i - 1;
-                       return 0;
-               }
-               i = lastbit - bit;
-               /*
-                * Go on to next block if that's where the next word is
-                * and we need the next word.
-                */
-               if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-                       /*
-                        * If done with this block, get the next one.
-                        */
-                       xfs_trans_brelse(tp, bp);
-                       error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-                       if (error) {
-                               return error;
-                       }
-                       b = bufp = bp->b_addr;
-                       word = 0;
-               } else {
-                       /*
-                        * Go on to the next word in the buffer.
-                        */
-                       b++;
-               }
-       } else {
-               /*
-                * Starting on a word boundary, no partial word.
-                */
-               i = 0;
-       }
-       /*
-        * Loop over whole words in buffers.  When we use up one buffer
-        * we move on to the next one.
-        */
-       while (len - i >= XFS_NBWORD) {
-               /*
-                * Compute difference between actual and desired value.
-                */
-               if ((wdiff = *b ^ want)) {
-                       /*
-                        * Different, mark where we are and return.
-                        */
-                       xfs_trans_brelse(tp, bp);
-                       i += XFS_RTLOBIT(wdiff);
-                       *rtblock = start + i - 1;
-                       return 0;
-               }
-               i += XFS_NBWORD;
-               /*
-                * Go on to next block if that's where the next word is
-                * and we need the next word.
-                */
-               if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-                       /*
-                        * If done with this block, get the next one.
-                        */
-                       xfs_trans_brelse(tp, bp);
-                       error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-                       if (error) {
-                               return error;
-                       }
-                       b = bufp = bp->b_addr;
-                       word = 0;
-               } else {
-                       /*
-                        * Go on to the next word in the buffer.
-                        */
-                       b++;
-               }
-       }
-       /*
-        * If not ending on a word boundary, deal with the last
-        * (partial) word.
-        */
-       if ((lastbit = len - i)) {
-               /*
-                * Calculate mask for all the relevant bits in this word.
-                */
-               mask = ((xfs_rtword_t)1 << lastbit) - 1;
-               /*
-                * Compute difference between actual and desired value.
-                */
-               if ((wdiff = (*b ^ want) & mask)) {
-                       /*
-                        * Different, mark where we are and return.
-                        */
-                       xfs_trans_brelse(tp, bp);
-                       i += XFS_RTLOBIT(wdiff);
-                       *rtblock = start + i - 1;
-                       return 0;
-               } else
-                       i = len;
-       }
-       /*
-        * No match, return that we scanned the whole area.
-        */
-       xfs_trans_brelse(tp, bp);
-       *rtblock = start + i - 1;
-       return 0;
-}
-
-/*
- * Read and modify the summary information for a given extent size,
- * bitmap block combination.
- * Keeps track of a current summary block, so we don't keep reading
- * it from the buffer cache.
- */
-int
-xfs_rtmodify_summary(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       int             log,            /* log2 of extent size */
-       xfs_rtblock_t   bbno,           /* bitmap block number */
-       int             delta,          /* change to make to summary info */
-       xfs_buf_t       **rbpp,         /* in/out: summary block buffer */
-       xfs_fsblock_t   *rsb)           /* in/out: summary block number */
-{
-       xfs_buf_t       *bp;            /* buffer for the summary block */
-       int             error;          /* error value */
-       xfs_fsblock_t   sb;             /* summary fsblock */
-       int             so;             /* index into the summary file */
-       xfs_suminfo_t   *sp;            /* pointer to returned data */
-
-       /*
-        * Compute entry number in the summary file.
-        */
-       so = XFS_SUMOFFS(mp, log, bbno);
-       /*
-        * Compute the block number in the summary file.
-        */
-       sb = XFS_SUMOFFSTOBLOCK(mp, so);
-       /*
-        * If we have an old buffer, and the block number matches, use that.
-        */
-       if (rbpp && *rbpp && *rsb == sb)
-               bp = *rbpp;
-       /*
-        * Otherwise we have to get the buffer.
-        */
-       else {
-               /*
-                * If there was an old one, get rid of it first.
-                */
-               if (rbpp && *rbpp)
-                       xfs_trans_brelse(tp, *rbpp);
-               error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
-               if (error) {
-                       return error;
-               }
-               /*
-                * Remember this buffer and block for the next call.
-                */
-               if (rbpp) {
-                       *rbpp = bp;
-                       *rsb = sb;
-               }
-       }
-       /*
-        * Point to the summary information, modify and log it.
-        */
-       sp = XFS_SUMPTR(mp, bp, so);
-       *sp += delta;
-       xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
-               (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
-       return 0;
-}
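
The summary file is effectively a two-dimensional array of counters indexed by (log2 of extent size, bitmap block number). A sketch of the row-major index arithmetic that XFS_SUMOFFS/XFS_SUMOFFSTOBLOCK appear to perform here; the exact macro bodies are not shown in this hunk, so treat the layout as an assumption:

/* suminfo_demo.c - assumed row-major indexing of the rt summary file */
#include <stdint.h>
#include <stdio.h>

typedef int32_t suminfo_t;

/* one row of counters per log2 extent size, one column per bitmap block */
static int sumoffs(int log, int bbno, int rbmblocks)
{
        return log * rbmblocks + bbno;
}

/* which block of the summary file a given entry lands in */
static int sumoffs_to_block(int so, int blocksize)
{
        return (int)(so * sizeof(suminfo_t) / blocksize);
}

int main(void)
{
        int rbmblocks = 100, blocksize = 4096;
        int so = sumoffs(3, 42, rbmblocks); /* extents ~2^3, bitmap block 42 */

        printf("entry %d -> summary block %d, slot %zu\n",
               so, sumoffs_to_block(so, blocksize),
               so * sizeof(suminfo_t) % blocksize / sizeof(suminfo_t));
        return 0;
}
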
-
-/*
- * Set the given range of bitmap bits to the given value.
- * Do whatever I/O and logging is required.
- */
-int
-xfs_rtmodify_range(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_rtblock_t   start,          /* starting block to modify */
-       xfs_extlen_t    len,            /* length of extent to modify */
-       int             val)            /* 1 for free, 0 for allocated */
-{
-       xfs_rtword_t    *b;             /* current word in buffer */
-       int             bit;            /* bit number in the word */
-       xfs_rtblock_t   block;          /* bitmap block number */
-       xfs_buf_t       *bp;            /* buf for the block */
-       xfs_rtword_t    *bufp;          /* starting word in buffer */
-       int             error;          /* error value */
-       xfs_rtword_t    *first;         /* first used word in the buffer */
-       int             i;              /* current bit number rel. to start */
-       int             lastbit;        /* last useful bit in word */
-       xfs_rtword_t    mask;           /* mask of relevant bits for value */
-       int             word;           /* word number in the buffer */
-
-       /*
-        * Compute starting bitmap block number.
-        */
-       block = XFS_BITTOBLOCK(mp, start);
-       /*
-        * Read the bitmap block, and point to its data.
-        */
-       error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
-       if (error) {
-               return error;
-       }
-       bufp = bp->b_addr;
-       /*
-        * Compute the starting word's address, and starting bit.
-        */
-       word = XFS_BITTOWORD(mp, start);
-       first = b = &bufp[word];
-       bit = (int)(start & (XFS_NBWORD - 1));
-       /*
-        * 0 (allocated) => all zeroes; 1 (free) => all ones.
-        */
-       val = -val;
-       /*
-        * If not starting on a word boundary, deal with the first
-        * (partial) word.
-        */
-       if (bit) {
-               /*
-                * Compute first bit not changed and mask of relevant bits.
-                */
-               lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
-               mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
-               /*
-                * Set/clear the active bits.
-                */
-               if (val)
-                       *b |= mask;
-               else
-                       *b &= ~mask;
-               i = lastbit - bit;
-               /*
-                * Go on to the next block if that's where the next word is
-                * and we need the next word.
-                */
-               if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-                       /*
-                        * Log the changed part of this block.
-                        * Get the next one.
-                        */
-                       xfs_trans_log_buf(tp, bp,
-                               (uint)((char *)first - (char *)bufp),
-                               (uint)((char *)b - (char *)bufp));
-                       error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-                       if (error) {
-                               return error;
-                       }
-                       first = b = bufp = bp->b_addr;
-                       word = 0;
-               } else {
-                       /*
-                        * Go on to the next word in the buffer
-                        */
-                       b++;
-               }
-       } else {
-               /*
-                * Starting on a word boundary, no partial word.
-                */
-               i = 0;
-       }
-       /*
-        * Loop over whole words in buffers.  When we use up one buffer
-        * we move on to the next one.
-        */
-       while (len - i >= XFS_NBWORD) {
-               /*
-                * Set the word value correctly.
-                */
-               *b = val;
-               i += XFS_NBWORD;
-               /*
-                * Go on to the next block if that's where the next word is
-                * and we need the next word.
-                */
-               if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-                       /*
-                        * Log the changed part of this block.
-                        * Get the next one.
-                        */
-                       xfs_trans_log_buf(tp, bp,
-                               (uint)((char *)first - (char *)bufp),
-                               (uint)((char *)b - (char *)bufp));
-                       error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-                       if (error) {
-                               return error;
-                       }
-                       first = b = bufp = bp->b_addr;
-                       word = 0;
-               } else {
-                       /*
-                        * Go on to the next word in the buffer
-                        */
-                       b++;
-               }
-       }
-       /*
-        * If not ending on a word boundary, deal with the last
-        * (partial) word.
-        */
-       if ((lastbit = len - i)) {
-               /*
-                * Compute a mask of relevant bits.
-                */
-               bit = 0;
-               mask = ((xfs_rtword_t)1 << lastbit) - 1;
-               /*
-                * Set/clear the active bits.
-                */
-               if (val)
-                       *b |= mask;
-               else
-                       *b &= ~mask;
-               b++;
-       }
-       /*
-        * Log any remaining changed bytes.
-        */
-       if (b > first)
-               xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
-                       (uint)((char *)b - (char *)bufp - 1));
-       return 0;
-}
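
The same three-phase structure shows up in all of these bitmap routines: a partial leading word, a run of whole words written with a single store each (the "*b = val" in the loop above), and a partial trailing word. This sketch models it on a flat array, trading the original's mask arithmetic in the partial words for simple per-bit loops:

/* modify_range_demo.c - set or clear a bit range across word boundaries */
#include <stdint.h>
#include <stdio.h>

static void modify_range(uint32_t *map, long start, long len, int val)
{
        long i = start, end = start + len;

        /* phase 1: partial leading word, bit by bit */
        for (; i < end && i % 32; i++) {
                if (val)
                        map[i / 32] |= 1u << i % 32;
                else
                        map[i / 32] &= ~(1u << i % 32);
        }
        /* phase 2: whole words, one store each */
        for (; end - i >= 32; i += 32)
                map[i / 32] = val ? ~0u : 0u;
        /* phase 3: partial trailing word, bit by bit */
        for (; i < end; i++) {
                if (val)
                        map[i / 32] |= 1u << i % 32;
                else
                        map[i / 32] &= ~(1u << i % 32);
        }
}

int main(void)
{
        uint32_t map[3] = { 0 };

        modify_range(map, 20, 50, 1);   /* set bits 20..69 */
        printf("%08x %08x %08x\n", map[0], map[1], map[2]);
        return 0;
}
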
-
-/*
- * Mark an extent specified by start and len freed.
- * Updates all the summary information as well as the bitmap.
- */
-int
-xfs_rtfree_range(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_rtblock_t   start,          /* starting block to free */
-       xfs_extlen_t    len,            /* length to free */
-       xfs_buf_t       **rbpp,         /* in/out: summary block buffer */
-       xfs_fsblock_t   *rsb)           /* in/out: summary block number */
-{
-       xfs_rtblock_t   end;            /* end of the freed extent */
-       int             error;          /* error value */
-       xfs_rtblock_t   postblock;      /* first block freed > end */
-       xfs_rtblock_t   preblock;       /* first block freed < start */
-
-       end = start + len - 1;
-       /*
-        * Modify the bitmap to mark this extent freed.
-        */
-       error = xfs_rtmodify_range(mp, tp, start, len, 1);
-       if (error) {
-               return error;
-       }
-       /*
-        * Assume we're freeing out of the middle of an allocated extent.
-        * We need to find the beginning and end of the extent so we can
-        * properly update the summary.
-        */
-       error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
-       if (error) {
-               return error;
-       }
-       /*
-        * Find the next allocated block (end of allocated extent).
-        */
-       error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
-               &postblock);
-       if (error)
-               return error;
-       /*
-        * If there are blocks not being freed at the front of the
-        * old extent, add summary data for them to be allocated.
-        */
-       if (preblock < start) {
-               error = xfs_rtmodify_summary(mp, tp,
-                       XFS_RTBLOCKLOG(start - preblock),
-                       XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
-               if (error) {
-                       return error;
-               }
-       }
-       /*
-        * If there are blocks not being freed at the end of the
-        * old extent, add summary data for them to be allocated.
-        */
-       if (postblock > end) {
-               error = xfs_rtmodify_summary(mp, tp,
-                       XFS_RTBLOCKLOG(postblock - end),
-                       XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
-               if (error) {
-                       return error;
-               }
-       }
-       /*
-        * Increment the summary information corresponding to the entire
-        * (new) free extent.
-        */
-       error = xfs_rtmodify_summary(mp, tp,
-               XFS_RTBLOCKLOG(postblock + 1 - preblock),
-               XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
-       return error;
-}
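
Concretely: freeing blocks 10..19 when blocks 5..9 and 20..24 are already free must retire the summary counts for the two flanking runs and add one count for the merged run 5..24. A small program working through that arithmetic, with rtblocklog() standing in for XFS_RTBLOCKLOG (floor log2 of an extent length):

/* coalesce_demo.c - the summary bookkeeping of a mid-extent free */
#include <stdio.h>

static int rtblocklog(long len)         /* floor(log2(len)) */
{
        int log = 0;

        while ((1L << (log + 1)) <= len)
                log++;
        return log;
}

int main(void)
{
        long start = 10, end = 19;      /* extent being freed */
        long preblock = 5;              /* free run already starts here */
        long postblock = 24;            /* free run already ends here */

        /* the flanking free runs stop being standalone extents */
        if (preblock < start)
                printf("summary -1 at size log2=%d (front run of %ld)\n",
                       rtblocklog(start - preblock), start - preblock);
        if (postblock > end)
                printf("summary -1 at size log2=%d (back run of %ld)\n",
                       rtblocklog(postblock - end), postblock - end);
        /* one merged free extent [preblock, postblock] appears instead */
        printf("summary +1 at size log2=%d (merged run of %ld)\n",
               rtblocklog(postblock + 1 - preblock),
               postblock + 1 - preblock);
        return 0;
}
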
-
-/*
- * Check that the given range is either all allocated (val = 0) or
- * all free (val = 1).
- */
-int
-xfs_rtcheck_range(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_rtblock_t   start,          /* starting block number of extent */
-       xfs_extlen_t    len,            /* length of extent */
-       int             val,            /* 1 for free, 0 for allocated */
-       xfs_rtblock_t   *new,           /* out: first block not matching */
-       int             *stat)          /* out: 1 for matches, 0 for not */
-{
-       xfs_rtword_t    *b;             /* current word in buffer */
-       int             bit;            /* bit number in the word */
-       xfs_rtblock_t   block;          /* bitmap block number */
-       xfs_buf_t       *bp;            /* buf for the block */
-       xfs_rtword_t    *bufp;          /* starting word in buffer */
-       int             error;          /* error value */
-       xfs_rtblock_t   i;              /* current bit number rel. to start */
-       xfs_rtblock_t   lastbit;        /* last useful bit in word */
-       xfs_rtword_t    mask;           /* mask of relevant bits for value */
-       xfs_rtword_t    wdiff;          /* difference from wanted value */
-       int             word;           /* word number in the buffer */
-
-       /*
-        * Compute starting bitmap block number
-        */
-       block = XFS_BITTOBLOCK(mp, start);
-       /*
-        * Read the bitmap block.
-        */
-       error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
-       if (error) {
-               return error;
-       }
-       bufp = bp->b_addr;
-       /*
-        * Compute the starting word's address, and starting bit.
-        */
-       word = XFS_BITTOWORD(mp, start);
-       b = &bufp[word];
-       bit = (int)(start & (XFS_NBWORD - 1));
-       /*
-        * 0 (allocated) => all zeroes; 1 (free) => all ones.
-        */
-       val = -val;
-       /*
-        * If not starting on a word boundary, deal with the first
-        * (partial) word.
-        */
-       if (bit) {
-               /*
-                * Compute first bit not examined.
-                */
-               lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
-               /*
-                * Mask of relevant bits.
-                */
-               mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
-               /*
-                * Compute difference between actual and desired value.
-                */
-               if ((wdiff = (*b ^ val) & mask)) {
-                       /*
-                        * Different, compute first wrong bit and return.
-                        */
-                       xfs_trans_brelse(tp, bp);
-                       i = XFS_RTLOBIT(wdiff) - bit;
-                       *new = start + i;
-                       *stat = 0;
-                       return 0;
-               }
-               i = lastbit - bit;
-               /*
-                * Go on to next block if that's where the next word is
-                * and we need the next word.
-                */
-               if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-                       /*
-                        * If done with this block, get the next one.
-                        */
-                       xfs_trans_brelse(tp, bp);
-                       error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-                       if (error) {
-                               return error;
-                       }
-                       b = bufp = bp->b_addr;
-                       word = 0;
-               } else {
-                       /*
-                        * Go on to the next word in the buffer.
-                        */
-                       b++;
-               }
-       } else {
-               /*
-                * Starting on a word boundary, no partial word.
-                */
-               i = 0;
-       }
-       /*
-        * Loop over whole words in buffers.  When we use up one buffer
-        * we move on to the next one.
-        */
-       while (len - i >= XFS_NBWORD) {
-               /*
-                * Compute difference between actual and desired value.
-                */
-               if ((wdiff = *b ^ val)) {
-                       /*
-                        * Different, compute first wrong bit and return.
-                        */
-                       xfs_trans_brelse(tp, bp);
-                       i += XFS_RTLOBIT(wdiff);
-                       *new = start + i;
-                       *stat = 0;
-                       return 0;
-               }
-               i += XFS_NBWORD;
-               /*
-                * Go on to next block if that's where the next word is
-                * and we need the next word.
-                */
-               if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-                       /*
-                        * If done with this block, get the next one.
-                        */
-                       xfs_trans_brelse(tp, bp);
-                       error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-                       if (error) {
-                               return error;
-                       }
-                       b = bufp = bp->b_addr;
-                       word = 0;
-               } else {
-                       /*
-                        * Go on to the next word in the buffer.
-                        */
-                       b++;
-               }
-       }
-       /*
-        * If not ending on a word boundary, deal with the last
-        * (partial) word.
-        */
-       if ((lastbit = len - i)) {
-               /*
-                * Mask of relevant bits.
-                */
-               mask = ((xfs_rtword_t)1 << lastbit) - 1;
-               /*
-                * Compute difference between actual and desired value.
-                */
-               if ((wdiff = (*b ^ val) & mask)) {
-                       /*
-                        * Different, compute first wrong bit and return.
-                        */
-                       xfs_trans_brelse(tp, bp);
-                       i += XFS_RTLOBIT(wdiff);
-                       *new = start + i;
-                       *stat = 0;
-                       return 0;
-               } else
-                       i = len;
-       }
-       /*
-        * Successful, return.
-        */
-       xfs_trans_brelse(tp, bp);
-       *new = start + i;
-       *stat = 1;
-       return 0;
-}
-
-#ifdef DEBUG
-/*
- * Check that the given extent (block range) is allocated already.
- */
-STATIC int                             /* error */
-xfs_rtcheck_alloc_range(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_rtblock_t   bno,            /* starting block number of extent */
-       xfs_extlen_t    len)            /* length of extent */
-{
-       xfs_rtblock_t   new;            /* dummy for xfs_rtcheck_range */
-       int             stat;
-       int             error;
-
-       error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat);
-       if (error)
-               return error;
-       ASSERT(stat);
-       return 0;
-}
-#else
-#define xfs_rtcheck_alloc_range(m,t,b,l)       (0)
-#endif
-/*
- * Free an extent in the realtime subvolume.  Length is expressed in
- * realtime extents, as is the block number.
- */
-int                                    /* error */
-xfs_rtfree_extent(
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_rtblock_t   bno,            /* starting block number to free */
-       xfs_extlen_t    len)            /* length of extent freed */
-{
-       int             error;          /* error value */
-       xfs_mount_t     *mp;            /* file system mount structure */
-       xfs_fsblock_t   sb;             /* summary file block number */
-       xfs_buf_t       *sumbp = NULL;  /* summary file block buffer */
-
-       mp = tp->t_mountp;
-
-       ASSERT(mp->m_rbmip->i_itemp != NULL);
-       ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
-
-       error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
-       if (error)
-               return error;
-
-       /*
-        * Free the range of realtime blocks.
-        */
-       error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
-       if (error) {
-               return error;
-       }
-       /*
-        * Mark more blocks free in the superblock.
-        */
-       xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
-       /*
-        * If we've now freed all the blocks, reset the file sequence
-        * number to 0.
-        */
-       if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
-           mp->m_sb.sb_rextents) {
-               if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
-                       mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
-               *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
-               xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
-       }
-       return 0;
-}
-
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
deleted file mode 100644 (file)
index 7703fa6..0000000
+++ /dev/null
@@ -1,836 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_ialloc.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
-#include "xfs_dinode.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-
-/*
- * Physical superblock buffer manipulations. Shared with libxfs in userspace.
- */
-
-static const struct {
-       short offset;
-       short type;     /* 0 = integer
-                        * 1 = binary / string (no translation)
-                        */
-} xfs_sb_info[] = {
-       { offsetof(xfs_sb_t, sb_magicnum),      0 },
-       { offsetof(xfs_sb_t, sb_blocksize),     0 },
-       { offsetof(xfs_sb_t, sb_dblocks),       0 },
-       { offsetof(xfs_sb_t, sb_rblocks),       0 },
-       { offsetof(xfs_sb_t, sb_rextents),      0 },
-       { offsetof(xfs_sb_t, sb_uuid),          1 },
-       { offsetof(xfs_sb_t, sb_logstart),      0 },
-       { offsetof(xfs_sb_t, sb_rootino),       0 },
-       { offsetof(xfs_sb_t, sb_rbmino),        0 },
-       { offsetof(xfs_sb_t, sb_rsumino),       0 },
-       { offsetof(xfs_sb_t, sb_rextsize),      0 },
-       { offsetof(xfs_sb_t, sb_agblocks),      0 },
-       { offsetof(xfs_sb_t, sb_agcount),       0 },
-       { offsetof(xfs_sb_t, sb_rbmblocks),     0 },
-       { offsetof(xfs_sb_t, sb_logblocks),     0 },
-       { offsetof(xfs_sb_t, sb_versionnum),    0 },
-       { offsetof(xfs_sb_t, sb_sectsize),      0 },
-       { offsetof(xfs_sb_t, sb_inodesize),     0 },
-       { offsetof(xfs_sb_t, sb_inopblock),     0 },
-       { offsetof(xfs_sb_t, sb_fname[0]),      1 },
-       { offsetof(xfs_sb_t, sb_blocklog),      0 },
-       { offsetof(xfs_sb_t, sb_sectlog),       0 },
-       { offsetof(xfs_sb_t, sb_inodelog),      0 },
-       { offsetof(xfs_sb_t, sb_inopblog),      0 },
-       { offsetof(xfs_sb_t, sb_agblklog),      0 },
-       { offsetof(xfs_sb_t, sb_rextslog),      0 },
-       { offsetof(xfs_sb_t, sb_inprogress),    0 },
-       { offsetof(xfs_sb_t, sb_imax_pct),      0 },
-       { offsetof(xfs_sb_t, sb_icount),        0 },
-       { offsetof(xfs_sb_t, sb_ifree),         0 },
-       { offsetof(xfs_sb_t, sb_fdblocks),      0 },
-       { offsetof(xfs_sb_t, sb_frextents),     0 },
-       { offsetof(xfs_sb_t, sb_uquotino),      0 },
-       { offsetof(xfs_sb_t, sb_gquotino),      0 },
-       { offsetof(xfs_sb_t, sb_qflags),        0 },
-       { offsetof(xfs_sb_t, sb_flags),         0 },
-       { offsetof(xfs_sb_t, sb_shared_vn),     0 },
-       { offsetof(xfs_sb_t, sb_inoalignmt),    0 },
-       { offsetof(xfs_sb_t, sb_unit),          0 },
-       { offsetof(xfs_sb_t, sb_width),         0 },
-       { offsetof(xfs_sb_t, sb_dirblklog),     0 },
-       { offsetof(xfs_sb_t, sb_logsectlog),    0 },
-       { offsetof(xfs_sb_t, sb_logsectsize),   0 },
-       { offsetof(xfs_sb_t, sb_logsunit),      0 },
-       { offsetof(xfs_sb_t, sb_features2),     0 },
-       { offsetof(xfs_sb_t, sb_bad_features2), 0 },
-       { offsetof(xfs_sb_t, sb_features_compat),       0 },
-       { offsetof(xfs_sb_t, sb_features_ro_compat),    0 },
-       { offsetof(xfs_sb_t, sb_features_incompat),     0 },
-       { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
-       { offsetof(xfs_sb_t, sb_crc),           0 },
-       { offsetof(xfs_sb_t, sb_pad),           0 },
-       { offsetof(xfs_sb_t, sb_pquotino),      0 },
-       { offsetof(xfs_sb_t, sb_lsn),           0 },
-       { sizeof(xfs_sb_t),                     0 }
-};
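
The sentinel entry holding sizeof(xfs_sb_t) is what makes this table self-describing: a field's size is the distance from its offset to the next entry's. The same pattern on a toy struct:

/* offset_table_demo.c - offset-table description of a struct */
#include <stddef.h>
#include <stdio.h>

struct toy_sb {
        unsigned int  magic;
        unsigned long blocks;
        char          label[12];
};

static const struct {
        short offset;
        short type;     /* 0 = integer, 1 = binary / string */
} toy_sb_info[] = {
        { offsetof(struct toy_sb, magic),  0 },
        { offsetof(struct toy_sb, blocks), 0 },
        { offsetof(struct toy_sb, label),  1 },
        { sizeof(struct toy_sb),           0 },         /* sentinel */
};

int main(void)
{
        size_t i;
        size_t nfields = sizeof(toy_sb_info) / sizeof(toy_sb_info[0]) - 1;

        for (i = 0; i < nfields; i++)
                printf("field %zu: offset %hd, size %d (%s)\n", i,
                       toy_sb_info[i].offset,
                       toy_sb_info[i + 1].offset - toy_sb_info[i].offset,
                       toy_sb_info[i].type ? "binary/string" : "integer");
        return 0;
}
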
-
-/*
- * Reference counting access wrappers to the perag structures.
- * Because we never free per-ag structures, the only thing we
- * have to protect against changes is the tree structure itself.
- */
-struct xfs_perag *
-xfs_perag_get(
-       struct xfs_mount        *mp,
-       xfs_agnumber_t          agno)
-{
-       struct xfs_perag        *pag;
-       int                     ref = 0;
-
-       rcu_read_lock();
-       pag = radix_tree_lookup(&mp->m_perag_tree, agno);
-       if (pag) {
-               ASSERT(atomic_read(&pag->pag_ref) >= 0);
-               ref = atomic_inc_return(&pag->pag_ref);
-       }
-       rcu_read_unlock();
-       trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
-       return pag;
-}
-
-/*
- * search from @first to find the next perag with the given tag set.
- */
-struct xfs_perag *
-xfs_perag_get_tag(
-       struct xfs_mount        *mp,
-       xfs_agnumber_t          first,
-       int                     tag)
-{
-       struct xfs_perag        *pag;
-       int                     found;
-       int                     ref;
-
-       rcu_read_lock();
-       found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
-                                       (void **)&pag, first, 1, tag);
-       if (found <= 0) {
-               rcu_read_unlock();
-               return NULL;
-       }
-       ref = atomic_inc_return(&pag->pag_ref);
-       rcu_read_unlock();
-       trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
-       return pag;
-}
-
-void
-xfs_perag_put(
-       struct xfs_perag        *pag)
-{
-       int     ref;
-
-       ASSERT(atomic_read(&pag->pag_ref) > 0);
-       ref = atomic_dec_return(&pag->pag_ref);
-       trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
-}
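
A simplified model of this get/put pairing, with a fixed array standing in for the radix tree and C11 atomics for the kernel's atomic_t; the RCU protection around the lookup is deliberately elided:

/* perag_ref_demo.c - reference-counted get/put, greatly simplified */
#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

struct perag {
        unsigned int agno;
        atomic_int   ref;
};

static struct perag groups[4];

static struct perag *perag_get(unsigned int agno)
{
        struct perag *pag = &groups[agno];

        assert(atomic_load(&pag->ref) >= 0);
        atomic_fetch_add(&pag->ref, 1);         /* pin while in use */
        return pag;
}

static void perag_put(struct perag *pag)
{
        assert(atomic_load(&pag->ref) > 0);
        atomic_fetch_sub(&pag->ref, 1);
}

int main(void)
{
        unsigned int i;
        struct perag *pag;

        for (i = 0; i < 4; i++)
                groups[i].agno = i;

        pag = perag_get(2);
        printf("ag %u pinned, ref now %d\n", pag->agno,
               atomic_load(&pag->ref));
        perag_put(pag);
        return 0;
}
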
-
-/*
- * Check the validity of the SB found.
- */
-STATIC int
-xfs_mount_validate_sb(
-       xfs_mount_t     *mp,
-       xfs_sb_t        *sbp,
-       bool            check_inprogress,
-       bool            check_version)
-{
-
-       /*
-        * If the log device and data device have the
-        * same device number, the log is internal.
-        * Consequently, the sb_logstart should be non-zero.  If
-        * we have a zero sb_logstart in this case, we may be trying to mount
-        * a volume filesystem in a non-volume manner.
-        */
-       if (sbp->sb_magicnum != XFS_SB_MAGIC) {
-               xfs_warn(mp, "bad magic number");
-               return XFS_ERROR(EWRONGFS);
-       }
-
-
-       if (!xfs_sb_good_version(sbp)) {
-               xfs_warn(mp, "bad version");
-               return XFS_ERROR(EWRONGFS);
-       }
-
-       /*
-        * Version 5 superblock feature mask validation. Reject combinations the
-        * kernel cannot support up front before checking anything else. For
-        * write validation, we don't need to check feature masks.
-        */
-       if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
-               if (xfs_sb_has_compat_feature(sbp,
-                                       XFS_SB_FEAT_COMPAT_UNKNOWN)) {
-                       xfs_warn(mp,
-"Superblock has unknown compatible features (0x%x) enabled.\n"
-"Using a more recent kernel is recommended.",
-                               (sbp->sb_features_compat &
-                                               XFS_SB_FEAT_COMPAT_UNKNOWN));
-               }
-
-               if (xfs_sb_has_ro_compat_feature(sbp,
-                                       XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
-                       xfs_alert(mp,
-"Superblock has unknown read-only compatible features (0x%x) enabled.",
-                               (sbp->sb_features_ro_compat &
-                                               XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
-                       if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-                               xfs_warn(mp,
-"Attempted to mount read-only compatible filesystem read-write.\n"
-"Filesystem can only be safely mounted read only.");
-                               return XFS_ERROR(EINVAL);
-                       }
-               }
-               if (xfs_sb_has_incompat_feature(sbp,
-                                       XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
-                       xfs_warn(mp,
-"Superblock has unknown incompatible features (0x%x) enabled.\n"
-"Filesystem can not be safely mounted by this kernel.",
-                               (sbp->sb_features_incompat &
-                                               XFS_SB_FEAT_INCOMPAT_UNKNOWN));
-                       return XFS_ERROR(EINVAL);
-               }
-       }
-
-       if (xfs_sb_version_has_pquotino(sbp)) {
-               if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
-                       xfs_notice(mp,
-                          "Version 5 superblock has XFS_OQUOTA bits.");
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
-       } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
-                               XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
-                       xfs_notice(mp,
-"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.");
-                       return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       if (unlikely(
-           sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
-               xfs_warn(mp,
-               "filesystem is marked as having an external log; "
-               "specify logdev on the mount command line.");
-               return XFS_ERROR(EINVAL);
-       }
-
-       if (unlikely(
-           sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
-               xfs_warn(mp,
-               "filesystem is marked as having an internal log; "
-               "do not specify logdev on the mount command line.");
-               return XFS_ERROR(EINVAL);
-       }
-
-       /*
-        * More sanity checking.  Most of these were stolen directly from
-        * xfs_repair.
-        */
-       if (unlikely(
-           sbp->sb_agcount <= 0                                        ||
-           sbp->sb_sectsize < XFS_MIN_SECTORSIZE                       ||
-           sbp->sb_sectsize > XFS_MAX_SECTORSIZE                       ||
-           sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG                    ||
-           sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG                    ||
-           sbp->sb_sectsize != (1 << sbp->sb_sectlog)                  ||
-           sbp->sb_blocksize < XFS_MIN_BLOCKSIZE                       ||
-           sbp->sb_blocksize > XFS_MAX_BLOCKSIZE                       ||
-           sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG                    ||
-           sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG                    ||
-           sbp->sb_blocksize != (1 << sbp->sb_blocklog)                ||
-           sbp->sb_inodesize < XFS_DINODE_MIN_SIZE                     ||
-           sbp->sb_inodesize > XFS_DINODE_MAX_SIZE                     ||
-           sbp->sb_inodelog < XFS_DINODE_MIN_LOG                       ||
-           sbp->sb_inodelog > XFS_DINODE_MAX_LOG                       ||
-           sbp->sb_inodesize != (1 << sbp->sb_inodelog)                ||
-           sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
-           (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)   ||
-           (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)  ||
-           (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)  ||
-           (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */)    ||
-           sbp->sb_dblocks == 0                                        ||
-           sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp)                      ||
-           sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp)                      ||
-           sbp->sb_shared_vn != 0)) {
-               xfs_notice(mp, "SB sanity check failed");
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       /*
-        * Until this is fixed only page-sized or smaller data blocks work.
-        */
-       if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
-               xfs_warn(mp,
-               "File system with blocksize %d bytes. "
-               "Only pagesize (%ld) or less will currently work.",
-                               sbp->sb_blocksize, PAGE_SIZE);
-               return XFS_ERROR(ENOSYS);
-       }
-
-       /*
-        * Currently only a few inode sizes are supported.
-        */
-       switch (sbp->sb_inodesize) {
-       case 256:
-       case 512:
-       case 1024:
-       case 2048:
-               break;
-       default:
-               xfs_warn(mp, "inode size of %d bytes not supported",
-                               sbp->sb_inodesize);
-               return XFS_ERROR(ENOSYS);
-       }
-
-       if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
-           xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
-               xfs_warn(mp,
-               "file system too large to be mounted on this system.");
-               return XFS_ERROR(EFBIG);
-       }
-
-       if (check_inprogress && sbp->sb_inprogress) {
-               xfs_warn(mp, "Offline file system operation in progress!");
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-       return 0;
-}
-
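The wall of range checks above repeatedly enforces a single invariant: each size field must be a power of two whose exponent is stored in the companion log field (for example, sb_blocksize == 1 << sb_blocklog). A minimal standalone sketch of that invariant follows; the helper name and bounds are illustrative, not part of the deleted file:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Sketch of the size/log2 invariant enforced above: the size must
     * lie within its limits, be an exact power of two, and agree with
     * the stored log2 exponent.
     */
    static bool size_matches_log(uint32_t size, uint8_t log2_field,
                                 uint32_t min, uint32_t max)
    {
            return size >= min && size <= max && size == (1u << log2_field);
    }

    int main(void)
    {
            /* 4096-byte blocks with a log field of 12 pass the check. */
            printf("%d\n", size_matches_log(4096, 12, 512, 65536));
            /* A corrupted log value (13) fails it. */
            printf("%d\n", size_matches_log(4096, 13, 512, 65536));
            return 0;
    }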
-void
-xfs_sb_quota_from_disk(struct xfs_sb *sbp)
-{
-       /*
-        * Older mkfs doesn't initialize quota inodes to NULLFSINO. This
-        * leaves two different in-core values marking a quota inode
-        * invalid: 0 and NULLFSINO. Normalize them to the single value
-        * NULLFSINO.
-        *
-        * Note that this change affects only the in-core values. These
-        * values are not written back to disk unless any quota information
-        * is written to the disk. Even in that case, sb_pquotino field is
-        * not written to disk unless the superblock supports pquotino.
-        */
-       if (sbp->sb_uquotino == 0)
-               sbp->sb_uquotino = NULLFSINO;
-       if (sbp->sb_gquotino == 0)
-               sbp->sb_gquotino = NULLFSINO;
-       if (sbp->sb_pquotino == 0)
-               sbp->sb_pquotino = NULLFSINO;
-
-       /*
-        * We only need to do these manipulations if we are working
-        * with an older version of the on-disk superblock.
-        */
-       if (xfs_sb_version_has_pquotino(sbp))
-               return;
-
-       if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
-               sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
-                                       XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
-       if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
-               sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
-                                       XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
-       sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
-
-       if (sbp->sb_qflags & XFS_PQUOTA_ACCT)  {
-               /*
-                * In older versions of the superblock, the on-disk superblock
-                * only has sb_gquotino, while the in-core superblock has both
-                * sb_gquotino and sb_pquotino. Only one of them is supported
-                * at any point in time, so if PQUOTA is set in the on-disk
-                * superblock, copy sb_gquotino over to sb_pquotino.
-                */
-               sbp->sb_pquotino = sbp->sb_gquotino;
-               sbp->sb_gquotino = NULLFSINO;
-       }
-}
-
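For readers following the quota normalization just shown, here is a compact userspace sketch of the same two steps, using stand-in flag values rather than the real XFS constants from xfs_quota.h:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative stand-ins; the real XFS flag values differ. */
    #define OQUOTA_ENFD  0x0001
    #define PQUOTA_ACCT  0x0002
    #define PQUOTA_ENFD  0x0004
    #define GQUOTA_ENFD  0x0008
    #define NULLINO      ((uint64_t)-1)

    /* Mirror of the normalization above: 0 and NULLINO both mean "no
     * quota inode", so collapse them to the single value NULLINO, then
     * rewrite the shared OQUOTA enforcement bit as a project- or
     * group-specific bit depending on which accounting is active. */
    static void quota_from_disk(uint64_t *ino, uint16_t *qflags)
    {
            if (*ino == 0)
                    *ino = NULLINO;
            if (*qflags & OQUOTA_ENFD) {
                    *qflags |= (*qflags & PQUOTA_ACCT) ? PQUOTA_ENFD
                                                       : GQUOTA_ENFD;
                    *qflags &= ~OQUOTA_ENFD;
            }
    }

    int main(void)
    {
            uint64_t ino = 0;
            uint16_t qflags = OQUOTA_ENFD | PQUOTA_ACCT;

            quota_from_disk(&ino, &qflags);
            printf("ino=%llx qflags=%x\n", (unsigned long long)ino, qflags);
            return 0;
    }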
-void
-xfs_sb_from_disk(
-       struct xfs_sb   *to,
-       xfs_dsb_t       *from)
-{
-       to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
-       to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
-       to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
-       to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
-       to->sb_rextents = be64_to_cpu(from->sb_rextents);
-       memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
-       to->sb_logstart = be64_to_cpu(from->sb_logstart);
-       to->sb_rootino = be64_to_cpu(from->sb_rootino);
-       to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
-       to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
-       to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
-       to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
-       to->sb_agcount = be32_to_cpu(from->sb_agcount);
-       to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
-       to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
-       to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
-       to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
-       to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
-       to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
-       memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
-       to->sb_blocklog = from->sb_blocklog;
-       to->sb_sectlog = from->sb_sectlog;
-       to->sb_inodelog = from->sb_inodelog;
-       to->sb_inopblog = from->sb_inopblog;
-       to->sb_agblklog = from->sb_agblklog;
-       to->sb_rextslog = from->sb_rextslog;
-       to->sb_inprogress = from->sb_inprogress;
-       to->sb_imax_pct = from->sb_imax_pct;
-       to->sb_icount = be64_to_cpu(from->sb_icount);
-       to->sb_ifree = be64_to_cpu(from->sb_ifree);
-       to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
-       to->sb_frextents = be64_to_cpu(from->sb_frextents);
-       to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
-       to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
-       to->sb_qflags = be16_to_cpu(from->sb_qflags);
-       to->sb_flags = from->sb_flags;
-       to->sb_shared_vn = from->sb_shared_vn;
-       to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
-       to->sb_unit = be32_to_cpu(from->sb_unit);
-       to->sb_width = be32_to_cpu(from->sb_width);
-       to->sb_dirblklog = from->sb_dirblklog;
-       to->sb_logsectlog = from->sb_logsectlog;
-       to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
-       to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
-       to->sb_features2 = be32_to_cpu(from->sb_features2);
-       to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
-       to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
-       to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
-       to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
-       to->sb_features_log_incompat =
-                               be32_to_cpu(from->sb_features_log_incompat);
-       to->sb_pad = 0;
-       to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
-       to->sb_lsn = be64_to_cpu(from->sb_lsn);
-}
-
-static inline void
-xfs_sb_quota_to_disk(
-       xfs_dsb_t       *to,
-       xfs_sb_t        *from,
-       __int64_t       *fields)
-{
-       __uint16_t      qflags = from->sb_qflags;
-
-       /*
-        * We only need to do these manipulations if we are working
-        * with an older version of the on-disk superblock.
-        */
-       if (xfs_sb_version_has_pquotino(from))
-               return;
-
-       if (*fields & XFS_SB_QFLAGS) {
-               /*
-        * The in-core version of sb_qflags does not have
-                * XFS_OQUOTA_* flags, whereas the on-disk version
-                * does.  So, convert incore XFS_{PG}QUOTA_* flags
-                * to on-disk XFS_OQUOTA_* flags.
-                */
-               qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
-                               XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
-
-               if (from->sb_qflags &
-                               (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
-                       qflags |= XFS_OQUOTA_ENFD;
-               if (from->sb_qflags &
-                               (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
-                       qflags |= XFS_OQUOTA_CHKD;
-               to->sb_qflags = cpu_to_be16(qflags);
-               *fields &= ~XFS_SB_QFLAGS;
-       }
-
-       /*
-        * GQUOTINO and PQUOTINO cannot be used together in versions of
-        * superblock that do not have pquotino. from->sb_flags tells us which
-        * quota is active and should be copied to disk. If neither are active,
-        * make sure we write NULLFSINO to the sb_gquotino field as a quota
-        * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature
-        * bit is set.
-        *
-        * Note that we don't need to handle the sb_uquotino or sb_pquotino here
-        * as they do not require any translation. Hence the main sb field loop
-        * will write them appropriately from the in-core superblock.
-        */
-       if ((*fields & XFS_SB_GQUOTINO) &&
-                               (from->sb_qflags & XFS_GQUOTA_ACCT))
-               to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
-       else if ((*fields & XFS_SB_PQUOTINO) &&
-                               (from->sb_qflags & XFS_PQUOTA_ACCT))
-               to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
-       else {
-               /*
-                * We can't rely on just the fields being logged to tell us
-                * that it is safe to write NULLFSINO - we should only do that
-                * if quotas are not actually enabled. Hence only write
-                * NULLFSINO if both in-core quota inodes are NULL.
-                */
-               if (from->sb_gquotino == NULLFSINO &&
-                   from->sb_pquotino == NULLFSINO)
-                       to->sb_gquotino = cpu_to_be64(NULLFSINO);
-       }
-
-       *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO);
-}
-
-/*
- * Copy the in-core superblock to the on-disk one.
- *
- * The fields argument is a mask of the superblock fields to copy.
- */
-void
-xfs_sb_to_disk(
-       xfs_dsb_t       *to,
-       xfs_sb_t        *from,
-       __int64_t       fields)
-{
-       xfs_caddr_t     to_ptr = (xfs_caddr_t)to;
-       xfs_caddr_t     from_ptr = (xfs_caddr_t)from;
-       xfs_sb_field_t  f;
-       int             first;
-       int             size;
-
-       ASSERT(fields);
-       if (!fields)
-               return;
-
-       xfs_sb_quota_to_disk(to, from, &fields);
-       while (fields) {
-               f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
-               first = xfs_sb_info[f].offset;
-               size = xfs_sb_info[f + 1].offset - first;
-
-               ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
-
-               if (size == 1 || xfs_sb_info[f].type == 1) {
-                       memcpy(to_ptr + first, from_ptr + first, size);
-               } else {
-                       switch (size) {
-                       case 2:
-                               *(__be16 *)(to_ptr + first) =
-                                     cpu_to_be16(*(__u16 *)(from_ptr + first));
-                               break;
-                       case 4:
-                               *(__be32 *)(to_ptr + first) =
-                                     cpu_to_be32(*(__u32 *)(from_ptr + first));
-                               break;
-                       case 8:
-                               *(__be64 *)(to_ptr + first) =
-                                     cpu_to_be64(*(__u64 *)(from_ptr + first));
-                               break;
-                       default:
-                               ASSERT(0);
-                       }
-               }
-
-               fields &= ~(1LL << f);
-       }
-}
-
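The conversion loop in xfs_sb_to_disk() walks the field mask one set bit at a time, clearing each bit as it goes. A small sketch of that walk, with GCC's __builtin_ctzll standing in for xfs_lowbit64():

    #include <stdint.h>
    #include <stdio.h>

    /* Sketch of the field-mask walk above: strip the lowest set bit on
     * each pass and use its index to drive the per-field conversion. */
    int main(void)
    {
            uint64_t fields = (1ULL << 3) | (1ULL << 17) | (1ULL << 40);

            while (fields) {
                    int f = __builtin_ctzll(fields);

                    printf("converting field %d\n", f);
                    fields &= ~(1ULL << f);
            }
            return 0;
    }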
-static int
-xfs_sb_verify(
-       struct xfs_buf  *bp,
-       bool            check_version)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       struct xfs_sb   sb;
-
-       xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
-
-       /*
-        * Only check the in-progress field for the primary superblock as
-        * mkfs.xfs doesn't clear it from secondary superblocks.
-        */
-       return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
-                                    check_version);
-}
-
-/*
- * If the superblock has the CRC feature bit set or the CRC field is non-zero,
- * check that the CRC is valid.  We check that the CRC field is non-zero
- * because a single bit error could clear the feature bit and unused parts of
- * the superblock are supposed to be zero. Hence a non-zero CRC field indicates
- * that we've potentially lost a feature bit and should check it anyway.
- *
- * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the
- * last field in V4 secondary superblocks.  So for secondary superblocks,
- * we are more forgiving, and ignore CRC failures if the primary doesn't
- * indicate that the fs version is V5.
- */
-static void
-xfs_sb_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       struct xfs_dsb  *dsb = XFS_BUF_TO_SBP(bp);
-       int             error;
-
-       /*
-        * open code the version check to avoid needing to convert the entire
-        * superblock from disk order just to check the version number
-        */
-       if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
-           (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
-                                               XFS_SB_VERSION_5) ||
-            dsb->sb_crc != 0)) {
-
-               if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) {
-                       /* Only fail bad secondaries on a known V5 filesystem */
-                       if (bp->b_bn == XFS_SB_DADDR ||
-                           xfs_sb_version_hascrc(&mp->m_sb)) {
-                               error = EFSBADCRC;
-                               goto out_error;
-                       }
-               }
-       }
-       error = xfs_sb_verify(bp, true);
-
-out_error:
-       if (error) {
-               xfs_buf_ioerror(bp, error);
-               if (error == EFSCORRUPTED || error == EFSBADCRC)
-                       xfs_verifier_error(bp);
-       }
-}
-
-/*
- * We may be probed for a filesystem match, so we may not want to emit
- * messages when the superblock buffer is not actually an XFS superblock.
- * If we find an XFS superblock, then run a normal, noisy mount because we are
- * really going to mount it and want to know about errors.
- */
-static void
-xfs_sb_quiet_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_dsb  *dsb = XFS_BUF_TO_SBP(bp);
-
-       if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
-               /* XFS filesystem, verify noisily! */
-               xfs_sb_read_verify(bp);
-               return;
-       }
-       /* quietly fail */
-       xfs_buf_ioerror(bp, EWRONGFS);
-}
-
-static void
-xfs_sb_write_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-       int                     error;
-
-       error = xfs_sb_verify(bp, false);
-       if (error) {
-               xfs_buf_ioerror(bp, error);
-               xfs_verifier_error(bp);
-               return;
-       }
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (bip)
-               XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-       xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
-}
-
-const struct xfs_buf_ops xfs_sb_buf_ops = {
-       .verify_read = xfs_sb_read_verify,
-       .verify_write = xfs_sb_write_verify,
-};
-
-const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
-       .verify_read = xfs_sb_quiet_read_verify,
-       .verify_write = xfs_sb_write_verify,
-};
-
-/*
- * xfs_sb_mount_common
- *
- * Mount initialization code establishing various mount
- * fields from the superblock associated with the given
- * mount structure.
- */
-void
-xfs_sb_mount_common(
-       struct xfs_mount *mp,
-       struct xfs_sb   *sbp)
-{
-       mp->m_agfrotor = mp->m_agirotor = 0;
-       spin_lock_init(&mp->m_agirotor_lock);
-       mp->m_maxagi = mp->m_sb.sb_agcount;
-       mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
-       mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
-       mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
-       mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
-       mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
-       mp->m_blockmask = sbp->sb_blocksize - 1;
-       mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
-       mp->m_blockwmask = mp->m_blockwsize - 1;
-
-       mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
-       mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
-       mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
-       mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
-
-       mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
-       mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
-       mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
-       mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
-
-       mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
-       mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
-       mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
-       mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
-
-       mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
-       mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
-                                       sbp->sb_inopblock);
-       mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
-}
-
-/*
- * xfs_initialize_perag_data
- *
- * Read in each per-ag structure so we can count up the number of
- * allocated inodes, free inodes and used filesystem blocks as this
- * information is no longer persistent in the superblock. Once we have
- * this information, write it into the in-core superblock structure.
- */
-int
-xfs_initialize_perag_data(
-       struct xfs_mount *mp,
-       xfs_agnumber_t  agcount)
-{
-       xfs_agnumber_t  index;
-       xfs_perag_t     *pag;
-       xfs_sb_t        *sbp = &mp->m_sb;
-       uint64_t        ifree = 0;
-       uint64_t        ialloc = 0;
-       uint64_t        bfree = 0;
-       uint64_t        bfreelst = 0;
-       uint64_t        btree = 0;
-       int             error;
-
-       for (index = 0; index < agcount; index++) {
-               /*
-                * read the agf, then the agi. This gets us
-                * all the information we need and populates the
-                * per-ag structures for us.
-                */
-               error = xfs_alloc_pagf_init(mp, NULL, index, 0);
-               if (error)
-                       return error;
-
-               error = xfs_ialloc_pagi_init(mp, NULL, index);
-               if (error)
-                       return error;
-               pag = xfs_perag_get(mp, index);
-               ifree += pag->pagi_freecount;
-               ialloc += pag->pagi_count;
-               bfree += pag->pagf_freeblks;
-               bfreelst += pag->pagf_flcount;
-               btree += pag->pagf_btreeblks;
-               xfs_perag_put(pag);
-       }
-       /*
-        * Overwrite incore superblock counters with just-read data
-        */
-       spin_lock(&mp->m_sb_lock);
-       sbp->sb_ifree = ifree;
-       sbp->sb_icount = ialloc;
-       sbp->sb_fdblocks = bfree + bfreelst + btree;
-       spin_unlock(&mp->m_sb_lock);
-
-       /* Fixup the per-cpu counters as well. */
-       xfs_icsb_reinit_counters(mp);
-
-       return 0;
-}
-
-/*
- * xfs_mod_sb() can be used to copy arbitrary changes to the
- * in-core superblock into the superblock buffer to be logged.
- * It does not provide the higher level of locking that is
- * needed to protect the in-core superblock from concurrent
- * access.
- */
-void
-xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
-{
-       xfs_buf_t       *bp;
-       int             first;
-       int             last;
-       xfs_mount_t     *mp;
-       xfs_sb_field_t  f;
-
-       ASSERT(fields);
-       if (!fields)
-               return;
-       mp = tp->t_mountp;
-       bp = xfs_trans_getsb(tp, mp, 0);
-       first = sizeof(xfs_sb_t);
-       last = 0;
-
-       /* translate/copy */
-
-       xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
-
-       /* find modified range */
-       f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
-       ASSERT((1LL << f) & XFS_SB_MOD_BITS);
-       last = xfs_sb_info[f + 1].offset - 1;
-
-       f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
-       ASSERT((1LL << f) & XFS_SB_MOD_BITS);
-       first = xfs_sb_info[f].offset;
-
-       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
-       xfs_trans_log_buf(tp, bp, first, last);
-}
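xfs_mod_sb() turns the field bitmask into one contiguous byte range to log: the lowest set bit gives the first modified offset, the highest gives the field whose end marks the last modified byte. A sketch of that range computation, with an illustrative offsets[] table in place of the real xfs_sb_info[]:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative field-offset table; entry f+1 marks where field f
     * ends, exactly as xfs_sb_info[] is used above. */
    static const int offsets[] = { 0, 4, 8, 16, 24, 32 };

    int main(void)
    {
            uint64_t fields = (1ULL << 1) | (1ULL << 3); /* fields 1 and 3 */
            int low = __builtin_ctzll(fields);
            int high = 63 - __builtin_clzll(fields);
            int first = offsets[low];
            int last = offsets[high + 1] - 1;

            printf("log bytes %d..%d\n", first, last);
            return 0;
    }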
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
deleted file mode 100644 (file)
index c43c2d6..0000000
+++ /dev/null
@@ -1,621 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SB_H__
-#define        __XFS_SB_H__
-
-/*
- * Super block
- * Fits into a sector-sized buffer at address 0 of each allocation group.
- * Only the first of these is ever updated except during growfs.
- */
-
-struct xfs_buf;
-struct xfs_mount;
-struct xfs_trans;
-
-#define        XFS_SB_MAGIC            0x58465342      /* 'XFSB' */
-#define        XFS_SB_VERSION_1        1               /* 5.3, 6.0.1, 6.1 */
-#define        XFS_SB_VERSION_2        2               /* 6.2 - attributes */
-#define        XFS_SB_VERSION_3        3               /* 6.2 - new inode version */
-#define        XFS_SB_VERSION_4        4               /* 6.2+ - bitmask version */
-#define        XFS_SB_VERSION_5        5               /* CRC enabled filesystem */
-#define        XFS_SB_VERSION_NUMBITS          0x000f
-#define        XFS_SB_VERSION_ALLFBITS         0xfff0
-#define        XFS_SB_VERSION_ATTRBIT          0x0010
-#define        XFS_SB_VERSION_NLINKBIT         0x0020
-#define        XFS_SB_VERSION_QUOTABIT         0x0040
-#define        XFS_SB_VERSION_ALIGNBIT         0x0080
-#define        XFS_SB_VERSION_DALIGNBIT        0x0100
-#define        XFS_SB_VERSION_SHAREDBIT        0x0200
-#define XFS_SB_VERSION_LOGV2BIT                0x0400
-#define XFS_SB_VERSION_SECTORBIT       0x0800
-#define        XFS_SB_VERSION_EXTFLGBIT        0x1000
-#define        XFS_SB_VERSION_DIRV2BIT         0x2000
-#define        XFS_SB_VERSION_BORGBIT          0x4000  /* ASCII only case-insens. */
-#define        XFS_SB_VERSION_MOREBITSBIT      0x8000
-
-/*
- * The supported feature bit list is just all bits in the versionnum field,
- * because we've used them all up and understand them all. Except, of course,
- * for the shared superblock bit, whose purpose nobody knows, so it is
- * unsupported.
- */
-#define        XFS_SB_VERSION_OKBITS           \
-       ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
-               ~XFS_SB_VERSION_SHAREDBIT)
-
-/*
- * There are two words to hold XFS "feature" bits: the original
- * word, sb_versionnum, and sb_features2.  Whenever a bit is set in
- * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set.
- *
- * These defines represent bits in sb_features2.
- */
-#define XFS_SB_VERSION2_RESERVED1BIT   0x00000001
-#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002      /* Superblk counters */
-#define XFS_SB_VERSION2_RESERVED4BIT   0x00000004
-#define XFS_SB_VERSION2_ATTR2BIT       0x00000008      /* Inline attr rework */
-#define XFS_SB_VERSION2_PARENTBIT      0x00000010      /* parent pointers */
-#define XFS_SB_VERSION2_PROJID32BIT    0x00000080      /* 32 bit project id */
-#define XFS_SB_VERSION2_CRCBIT         0x00000100      /* metadata CRCs */
-#define XFS_SB_VERSION2_FTYPE          0x00000200      /* inode type in dir */
-
-#define        XFS_SB_VERSION2_OKBITS          \
-       (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
-        XFS_SB_VERSION2_ATTR2BIT       | \
-        XFS_SB_VERSION2_PROJID32BIT    | \
-        XFS_SB_VERSION2_FTYPE)
-
-/*
- * Superblock - in core version.  Must match the ondisk version below.
- * Must be padded to 64 bit alignment.
- */
-typedef struct xfs_sb {
-       __uint32_t      sb_magicnum;    /* magic number == XFS_SB_MAGIC */
-       __uint32_t      sb_blocksize;   /* logical block size, bytes */
-       xfs_drfsbno_t   sb_dblocks;     /* number of data blocks */
-       xfs_drfsbno_t   sb_rblocks;     /* number of realtime blocks */
-       xfs_drtbno_t    sb_rextents;    /* number of realtime extents */
-       uuid_t          sb_uuid;        /* file system unique id */
-       xfs_dfsbno_t    sb_logstart;    /* starting block of log if internal */
-       xfs_ino_t       sb_rootino;     /* root inode number */
-       xfs_ino_t       sb_rbmino;      /* bitmap inode for realtime extents */
-       xfs_ino_t       sb_rsumino;     /* summary inode for rt bitmap */
-       xfs_agblock_t   sb_rextsize;    /* realtime extent size, blocks */
-       xfs_agblock_t   sb_agblocks;    /* size of an allocation group */
-       xfs_agnumber_t  sb_agcount;     /* number of allocation groups */
-       xfs_extlen_t    sb_rbmblocks;   /* number of rt bitmap blocks */
-       xfs_extlen_t    sb_logblocks;   /* number of log blocks */
-       __uint16_t      sb_versionnum;  /* header version == XFS_SB_VERSION */
-       __uint16_t      sb_sectsize;    /* volume sector size, bytes */
-       __uint16_t      sb_inodesize;   /* inode size, bytes */
-       __uint16_t      sb_inopblock;   /* inodes per block */
-       char            sb_fname[12];   /* file system name */
-       __uint8_t       sb_blocklog;    /* log2 of sb_blocksize */
-       __uint8_t       sb_sectlog;     /* log2 of sb_sectsize */
-       __uint8_t       sb_inodelog;    /* log2 of sb_inodesize */
-       __uint8_t       sb_inopblog;    /* log2 of sb_inopblock */
-       __uint8_t       sb_agblklog;    /* log2 of sb_agblocks (rounded up) */
-       __uint8_t       sb_rextslog;    /* log2 of sb_rextents */
-       __uint8_t       sb_inprogress;  /* mkfs is in progress, don't mount */
-       __uint8_t       sb_imax_pct;    /* max % of fs for inode space */
-                                       /* statistics */
-       /*
-        * These fields must remain contiguous.  If you really
-        * want to change their layout, make sure you fix the
-        * code in xfs_trans_apply_sb_deltas().
-        */
-       __uint64_t      sb_icount;      /* allocated inodes */
-       __uint64_t      sb_ifree;       /* free inodes */
-       __uint64_t      sb_fdblocks;    /* free data blocks */
-       __uint64_t      sb_frextents;   /* free realtime extents */
-       /*
-        * End contiguous fields.
-        */
-       xfs_ino_t       sb_uquotino;    /* user quota inode */
-       xfs_ino_t       sb_gquotino;    /* group quota inode */
-       __uint16_t      sb_qflags;      /* quota flags */
-       __uint8_t       sb_flags;       /* misc. flags */
-       __uint8_t       sb_shared_vn;   /* shared version number */
-       xfs_extlen_t    sb_inoalignmt;  /* inode chunk alignment, fsblocks */
-       __uint32_t      sb_unit;        /* stripe or raid unit */
-       __uint32_t      sb_width;       /* stripe or raid width */
-       __uint8_t       sb_dirblklog;   /* log2 of dir block size (fsbs) */
-       __uint8_t       sb_logsectlog;  /* log2 of the log sector size */
-       __uint16_t      sb_logsectsize; /* sector size for the log, bytes */
-       __uint32_t      sb_logsunit;    /* stripe unit size for the log */
-       __uint32_t      sb_features2;   /* additional feature bits */
-
-       /*
-        * bad features2 field as a result of failing to pad the sb
-        * structure to 64 bits. Some machines will be using this field
-        * for features2 bits. Easiest just to mark it bad and not use
-        * it for anything else.
-        */
-       __uint32_t      sb_bad_features2;
-
-       /* version 5 superblock fields start here */
-
-       /* feature masks */
-       __uint32_t      sb_features_compat;
-       __uint32_t      sb_features_ro_compat;
-       __uint32_t      sb_features_incompat;
-       __uint32_t      sb_features_log_incompat;
-
-       __uint32_t      sb_crc;         /* superblock crc */
-       __uint32_t      sb_pad;
-
-       xfs_ino_t       sb_pquotino;    /* project quota inode */
-       xfs_lsn_t       sb_lsn;         /* last write sequence */
-
-       /* must be padded to 64 bit alignment */
-} xfs_sb_t;
-
-#define XFS_SB_CRC_OFF         offsetof(struct xfs_sb, sb_crc)
-
-/*
- * Superblock - on disk version.  Must match the in core version above.
- * Must be padded to 64 bit alignment.
- */
-typedef struct xfs_dsb {
-       __be32          sb_magicnum;    /* magic number == XFS_SB_MAGIC */
-       __be32          sb_blocksize;   /* logical block size, bytes */
-       __be64          sb_dblocks;     /* number of data blocks */
-       __be64          sb_rblocks;     /* number of realtime blocks */
-       __be64          sb_rextents;    /* number of realtime extents */
-       uuid_t          sb_uuid;        /* file system unique id */
-       __be64          sb_logstart;    /* starting block of log if internal */
-       __be64          sb_rootino;     /* root inode number */
-       __be64          sb_rbmino;      /* bitmap inode for realtime extents */
-       __be64          sb_rsumino;     /* summary inode for rt bitmap */
-       __be32          sb_rextsize;    /* realtime extent size, blocks */
-       __be32          sb_agblocks;    /* size of an allocation group */
-       __be32          sb_agcount;     /* number of allocation groups */
-       __be32          sb_rbmblocks;   /* number of rt bitmap blocks */
-       __be32          sb_logblocks;   /* number of log blocks */
-       __be16          sb_versionnum;  /* header version == XFS_SB_VERSION */
-       __be16          sb_sectsize;    /* volume sector size, bytes */
-       __be16          sb_inodesize;   /* inode size, bytes */
-       __be16          sb_inopblock;   /* inodes per block */
-       char            sb_fname[12];   /* file system name */
-       __u8            sb_blocklog;    /* log2 of sb_blocksize */
-       __u8            sb_sectlog;     /* log2 of sb_sectsize */
-       __u8            sb_inodelog;    /* log2 of sb_inodesize */
-       __u8            sb_inopblog;    /* log2 of sb_inopblock */
-       __u8            sb_agblklog;    /* log2 of sb_agblocks (rounded up) */
-       __u8            sb_rextslog;    /* log2 of sb_rextents */
-       __u8            sb_inprogress;  /* mkfs is in progress, don't mount */
-       __u8            sb_imax_pct;    /* max % of fs for inode space */
-                                       /* statistics */
-       /*
-        * These fields must remain contiguous.  If you really
-        * want to change their layout, make sure you fix the
-        * code in xfs_trans_apply_sb_deltas().
-        */
-       __be64          sb_icount;      /* allocated inodes */
-       __be64          sb_ifree;       /* free inodes */
-       __be64          sb_fdblocks;    /* free data blocks */
-       __be64          sb_frextents;   /* free realtime extents */
-       /*
-        * End contiguous fields.
-        */
-       __be64          sb_uquotino;    /* user quota inode */
-       __be64          sb_gquotino;    /* group quota inode */
-       __be16          sb_qflags;      /* quota flags */
-       __u8            sb_flags;       /* misc. flags */
-       __u8            sb_shared_vn;   /* shared version number */
-       __be32          sb_inoalignmt;  /* inode chunk alignment, fsblocks */
-       __be32          sb_unit;        /* stripe or raid unit */
-       __be32          sb_width;       /* stripe or raid width */
-       __u8            sb_dirblklog;   /* log2 of dir block size (fsbs) */
-       __u8            sb_logsectlog;  /* log2 of the log sector size */
-       __be16          sb_logsectsize; /* sector size for the log, bytes */
-       __be32          sb_logsunit;    /* stripe unit size for the log */
-       __be32          sb_features2;   /* additional feature bits */
-       /*
-        * bad features2 field as a result of failing to pad the sb
-        * structure to 64 bits. Some machines will be using this field
-        * for features2 bits. Easiest just to mark it bad and not use
-        * it for anything else.
-        */
-       __be32          sb_bad_features2;
-
-       /* version 5 superblock fields start here */
-
-       /* feature masks */
-       __be32          sb_features_compat;
-       __be32          sb_features_ro_compat;
-       __be32          sb_features_incompat;
-       __be32          sb_features_log_incompat;
-
-       __le32          sb_crc;         /* superblock crc */
-       __be32          sb_pad;
-
-       __be64          sb_pquotino;    /* project quota inode */
-       __be64          sb_lsn;         /* last write sequence */
-
-       /* must be padded to 64 bit alignment */
-} xfs_dsb_t;
-
-/*
- * Sequence number values for the fields.
- */
-typedef enum {
-       XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
-       XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
-       XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
-       XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
-       XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
-       XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
-       XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
-       XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
-       XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
-       XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
-       XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
-       XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
-       XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
-       XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
-       XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
-       XFS_SBS_PQUOTINO, XFS_SBS_LSN,
-       XFS_SBS_FIELDCOUNT
-} xfs_sb_field_t;
-
-/*
- * Mask values, defined based on the xfs_sb_field_t values.
- * Only define the ones we're using.
- */
-#define        XFS_SB_MVAL(x)          (1LL << XFS_SBS_ ## x)
-#define        XFS_SB_UUID             XFS_SB_MVAL(UUID)
-#define        XFS_SB_FNAME            XFS_SB_MVAL(FNAME)
-#define        XFS_SB_ROOTINO          XFS_SB_MVAL(ROOTINO)
-#define        XFS_SB_RBMINO           XFS_SB_MVAL(RBMINO)
-#define        XFS_SB_RSUMINO          XFS_SB_MVAL(RSUMINO)
-#define        XFS_SB_VERSIONNUM       XFS_SB_MVAL(VERSIONNUM)
-#define XFS_SB_UQUOTINO                XFS_SB_MVAL(UQUOTINO)
-#define XFS_SB_GQUOTINO                XFS_SB_MVAL(GQUOTINO)
-#define XFS_SB_QFLAGS          XFS_SB_MVAL(QFLAGS)
-#define XFS_SB_SHARED_VN       XFS_SB_MVAL(SHARED_VN)
-#define XFS_SB_UNIT            XFS_SB_MVAL(UNIT)
-#define XFS_SB_WIDTH           XFS_SB_MVAL(WIDTH)
-#define XFS_SB_ICOUNT          XFS_SB_MVAL(ICOUNT)
-#define XFS_SB_IFREE           XFS_SB_MVAL(IFREE)
-#define XFS_SB_FDBLOCKS                XFS_SB_MVAL(FDBLOCKS)
-#define XFS_SB_FEATURES2       XFS_SB_MVAL(FEATURES2)
-#define XFS_SB_BAD_FEATURES2   XFS_SB_MVAL(BAD_FEATURES2)
-#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
-#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
-#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
-#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
-#define XFS_SB_CRC             XFS_SB_MVAL(CRC)
-#define XFS_SB_PQUOTINO                XFS_SB_MVAL(PQUOTINO)
-#define        XFS_SB_NUM_BITS         ((int)XFS_SBS_FIELDCOUNT)
-#define        XFS_SB_ALL_BITS         ((1LL << XFS_SB_NUM_BITS) - 1)
-#define        XFS_SB_MOD_BITS         \
-       (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
-        XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
-        XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
-        XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
-        XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \
-        XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \
-        XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO)
-
-
-/*
- * Misc. Flags - warning - these will be cleared by xfs_repair unless
- * a feature bit is set when the flag is used.
- */
-#define XFS_SBF_NOFLAGS                0x00    /* no flags set */
-#define XFS_SBF_READONLY       0x01    /* only read-only mounts allowed */
-
-/*
- * define max. shared version we can interoperate with
- */
-#define XFS_SB_MAX_SHARED_VN   0
-
-#define        XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
-
-/*
- * The first XFS version we support is a v4 superblock with V2 directories.
- */
-static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
-{
-       if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
-               return false;
-
-       /* check for unknown features in the fs */
-       if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
-           ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
-            (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
-               return false;
-
-       return true;
-}
-
-static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
-{
-       if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
-               return true;
-       if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
-               return xfs_sb_good_v4_features(sbp);
-       return false;
-}
-
-/*
- * Detect a mismatched features2 field.  Older kernels read/wrote
- * this into the wrong slot, so to be safe we keep them in sync.
- */
-static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
-{
-       return sbp->sb_bad_features2 != sbp->sb_features2;
-}
-
-static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
-{
-       return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT);
-}
-
-static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
-{
-       sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
-}
-
-static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
-{
-       return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
-}
-
-static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
-{
-       sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
-}
-
-static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
-{
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
-               (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
-}
-
-static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
-{
-       return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
-}
-
-static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
-{
-       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
-              (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
-}
-
-static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
-{
-       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
-              (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
-}
-
-static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
-{
-       return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
-}
-
-static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
-{
-       return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
-}
-
-static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
-{
-       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
-              (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
-}
-
-/*
- * sb_features2 bit version macros.
- */
-static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
-{
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
-              (xfs_sb_version_hasmorebits(sbp) &&
-               (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
-}
-
-static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
-{
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
-              (xfs_sb_version_hasmorebits(sbp) &&
-               (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
-}
-
-static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
-{
-       sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
-       sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
-       sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
-}
-
-static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
-{
-       sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
-       sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
-       if (!sbp->sb_features2)
-               sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
-}
-
-static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
-{
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
-              (xfs_sb_version_hasmorebits(sbp) &&
-               (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
-}
-
-static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
-{
-       sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
-       sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
-       sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
-}
-
-/*
- * Extended v5 superblock feature masks. These are to be used for new v5
- * superblock features only.
- *
- * Compat features are new features that old kernels will not notice or affect
- * and so can mount read-write without issues.
- *
- * RO-Compat (read only) are features that old kernels can read but will break
- * if they write. Hence only read-only mounts of such filesystems are allowed on
- * kernels that don't support the feature bit.
- *
- * InCompat features are features which old kernels will not understand and so
- * must not mount.
- *
- * Log-InCompat features are for changes to log formats or new transactions
- * that can't be replayed on older kernels. The fields are set when the
- * incompatible features are used and a clean unmount clears the fields.
- */
-#define XFS_SB_FEAT_COMPAT_ALL 0
-#define XFS_SB_FEAT_COMPAT_UNKNOWN     ~XFS_SB_FEAT_COMPAT_ALL
-static inline bool
-xfs_sb_has_compat_feature(
-       struct xfs_sb   *sbp,
-       __uint32_t      feature)
-{
-       return (sbp->sb_features_compat & feature) != 0;
-}
-
-#define XFS_SB_FEAT_RO_COMPAT_FINOBT   (1 << 0)                /* free inode btree */
-#define XFS_SB_FEAT_RO_COMPAT_ALL \
-               (XFS_SB_FEAT_RO_COMPAT_FINOBT)
-#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN  ~XFS_SB_FEAT_RO_COMPAT_ALL
-static inline bool
-xfs_sb_has_ro_compat_feature(
-       struct xfs_sb   *sbp,
-       __uint32_t      feature)
-{
-       return (sbp->sb_features_ro_compat & feature) != 0;
-}
-
-#define XFS_SB_FEAT_INCOMPAT_FTYPE     (1 << 0)        /* filetype in dirent */
-#define XFS_SB_FEAT_INCOMPAT_ALL \
-               (XFS_SB_FEAT_INCOMPAT_FTYPE)
-
-#define XFS_SB_FEAT_INCOMPAT_UNKNOWN   ~XFS_SB_FEAT_INCOMPAT_ALL
-static inline bool
-xfs_sb_has_incompat_feature(
-       struct xfs_sb   *sbp,
-       __uint32_t      feature)
-{
-       return (sbp->sb_features_incompat & feature) != 0;
-}
-
-#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
-#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN       ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
-static inline bool
-xfs_sb_has_incompat_log_feature(
-       struct xfs_sb   *sbp,
-       __uint32_t      feature)
-{
-       return (sbp->sb_features_log_incompat & feature) != 0;
-}
-
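The comment block above defines the compat / ro-compat / incompat classes, and the helpers just shown test individual bits. The mount-time policy those classes imply is conventionally something like the following sketch; the mask values and helper name are illustrative, not the real XFS mount path:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define RO_COMPAT_KNOWN   0x1u          /* e.g. the finobt bit */
    #define INCOMPAT_KNOWN    0x1u          /* e.g. the ftype bit */

    /* Unknown incompat bits make the fs unmountable; unknown ro-compat
     * bits force a read-only mount; unknown compat bits are ignored. */
    static int check_features(uint32_t ro_compat, uint32_t incompat,
                              bool readonly)
    {
            if (incompat & ~INCOMPAT_KNOWN)
                    return -1;              /* refuse the mount */
            if (!readonly && (ro_compat & ~RO_COMPAT_KNOWN))
                    return -1;              /* refuse read-write mount */
            return 0;
    }

    int main(void)
    {
            printf("%d\n", check_features(0x2, 0x1, false)); /* unknown ro bit */
            printf("%d\n", check_features(0x2, 0x1, true));  /* ok read-only */
            return 0;
    }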
-/*
- * V5 superblock specific feature checks
- */
-static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
-{
-       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
-}
-
-static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
-{
-       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
-}
-
-static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
-{
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
-               xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
-              (xfs_sb_version_hasmorebits(sbp) &&
-                (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
-}
-
-static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
-{
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
-               (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
-}
-
-/*
- * end of superblock version macros
- */
-
-static inline bool
-xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
-{
-       return (ino == sbp->sb_uquotino ||
-               ino == sbp->sb_gquotino ||
-               ino == sbp->sb_pquotino);
-}
-
-#define XFS_SB_DADDR           ((xfs_daddr_t)0) /* daddr in filesystem/ag */
-#define        XFS_SB_BLOCK(mp)        XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
-#define XFS_BUF_TO_SBP(bp)     ((xfs_dsb_t *)((bp)->b_addr))
-
-#define        XFS_HDR_BLOCK(mp,d)     ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
-#define        XFS_DADDR_TO_FSB(mp,d)  XFS_AGB_TO_FSB(mp, \
-                       xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
-#define        XFS_FSB_TO_DADDR(mp,fsbno)      XFS_AGB_TO_DADDR(mp, \
-                       XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
-
-/*
- * File system sector to basic block conversions.
- */
-#define XFS_FSS_TO_BB(mp,sec)  ((sec) << (mp)->m_sectbb_log)
-
-/*
- * File system block to basic block conversions.
- */
-#define        XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
-#define        XFS_BB_TO_FSB(mp,bb)    \
-       (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
-#define        XFS_BB_TO_FSBT(mp,bb)   ((bb) >> (mp)->m_blkbb_log)
-
-/*
- * File system block to byte conversions.
- */
-#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
-#define XFS_B_TO_FSB(mp,b)     \
-       ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
-#define XFS_B_TO_FSBT(mp,b)    (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
-#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
-
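As a worked instance of the shift arithmetic above, assuming the common case of 4096-byte filesystem blocks and 512-byte basic blocks (so m_blkbb_log == sb_blocklog - BBSHIFT == 12 - 9 == 3):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            int blkbb_log = 3;
            uint64_t fsbno = 10;

            /* FSB_TO_BB: 10 fs blocks == 80 basic blocks */
            printf("%llu\n", (unsigned long long)(fsbno << blkbb_log));
            /* BB_TO_FSB rounds up: 81 basic blocks need 11 fs blocks */
            printf("%llu\n", (unsigned long long)
                   ((81 + (1 << blkbb_log) - 1) >> blkbb_log));
            return 0;
    }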
-/*
- * perag get/put wrappers for ref counting
- */
-extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t);
-extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
-                                          int tag);
-extern void    xfs_perag_put(struct xfs_perag *pag);
-extern int     xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
-
-extern void    xfs_sb_calc_crc(struct xfs_buf  *);
-extern void    xfs_mod_sb(struct xfs_trans *, __int64_t);
-extern void    xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *);
-extern void    xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
-extern void    xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
-extern void    xfs_sb_quota_from_disk(struct xfs_sb *sbp);
-
-#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/xfs_shared.h
deleted file mode 100644 (file)
index 82404da..0000000
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SHARED_H__
-#define __XFS_SHARED_H__
-
-/*
- * Definitions shared between kernel and userspace that don't fit into any other
- * header file that is shared with userspace.
- */
-struct xfs_ifork;
-struct xfs_buf;
-struct xfs_buf_ops;
-struct xfs_mount;
-struct xfs_trans;
-struct xfs_inode;
-
-/*
- * Buffer verifier operations are widely used, including userspace tools
- */
-extern const struct xfs_buf_ops xfs_agf_buf_ops;
-extern const struct xfs_buf_ops xfs_agi_buf_ops;
-extern const struct xfs_buf_ops xfs_agfl_buf_ops;
-extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
-extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
-extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
-extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
-extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
-extern const struct xfs_buf_ops xfs_dquot_buf_ops;
-extern const struct xfs_buf_ops xfs_symlink_buf_ops;
-extern const struct xfs_buf_ops xfs_inobt_buf_ops;
-extern const struct xfs_buf_ops xfs_inode_buf_ops;
-extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
-extern const struct xfs_buf_ops xfs_sb_buf_ops;
-extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
-
-/*
- * Transaction types.  Used to distinguish types of buffers. These never reach
- * the log.
- */
-#define XFS_TRANS_SETATTR_NOT_SIZE     1
-#define XFS_TRANS_SETATTR_SIZE         2
-#define XFS_TRANS_INACTIVE             3
-#define XFS_TRANS_CREATE               4
-#define XFS_TRANS_CREATE_TRUNC         5
-#define XFS_TRANS_TRUNCATE_FILE                6
-#define XFS_TRANS_REMOVE               7
-#define XFS_TRANS_LINK                 8
-#define XFS_TRANS_RENAME               9
-#define XFS_TRANS_MKDIR                        10
-#define XFS_TRANS_RMDIR                        11
-#define XFS_TRANS_SYMLINK              12
-#define XFS_TRANS_SET_DMATTRS          13
-#define XFS_TRANS_GROWFS               14
-#define XFS_TRANS_STRAT_WRITE          15
-#define XFS_TRANS_DIOSTRAT             16
-/* 17 was XFS_TRANS_WRITE_SYNC */
-#define        XFS_TRANS_WRITEID               18
-#define        XFS_TRANS_ADDAFORK              19
-#define        XFS_TRANS_ATTRINVAL             20
-#define        XFS_TRANS_ATRUNCATE             21
-#define        XFS_TRANS_ATTR_SET              22
-#define        XFS_TRANS_ATTR_RM               23
-#define        XFS_TRANS_ATTR_FLAG             24
-#define        XFS_TRANS_CLEAR_AGI_BUCKET      25
-#define XFS_TRANS_QM_SBCHANGE          26
-/*
- * Dummy entries since we use the transaction type to index into the
- * trans_type[] in xlog_recover_print_trans_head()
- */
-#define XFS_TRANS_DUMMY1               27
-#define XFS_TRANS_DUMMY2               28
-#define XFS_TRANS_QM_QUOTAOFF          29
-#define XFS_TRANS_QM_DQALLOC           30
-#define XFS_TRANS_QM_SETQLIM           31
-#define XFS_TRANS_QM_DQCLUSTER         32
-#define XFS_TRANS_QM_QINOCREATE                33
-#define XFS_TRANS_QM_QUOTAOFF_END      34
-#define XFS_TRANS_SB_UNIT              35
-#define XFS_TRANS_FSYNC_TS             36
-#define        XFS_TRANS_GROWFSRT_ALLOC        37
-#define        XFS_TRANS_GROWFSRT_ZERO         38
-#define        XFS_TRANS_GROWFSRT_FREE         39
-#define        XFS_TRANS_SWAPEXT               40
-#define        XFS_TRANS_SB_COUNT              41
-#define        XFS_TRANS_CHECKPOINT            42
-#define        XFS_TRANS_ICREATE               43
-#define        XFS_TRANS_CREATE_TMPFILE        44
-#define        XFS_TRANS_TYPE_MAX              44
-/* new transaction types need to be reflected in xfs_logprint(8) */
-
-#define XFS_TRANS_TYPES \
-       { XFS_TRANS_SETATTR_NOT_SIZE,   "SETATTR_NOT_SIZE" }, \
-       { XFS_TRANS_SETATTR_SIZE,       "SETATTR_SIZE" }, \
-       { XFS_TRANS_INACTIVE,           "INACTIVE" }, \
-       { XFS_TRANS_CREATE,             "CREATE" }, \
-       { XFS_TRANS_CREATE_TMPFILE,     "CREATE_TMPFILE" }, \
-       { XFS_TRANS_CREATE_TRUNC,       "CREATE_TRUNC" }, \
-       { XFS_TRANS_TRUNCATE_FILE,      "TRUNCATE_FILE" }, \
-       { XFS_TRANS_REMOVE,             "REMOVE" }, \
-       { XFS_TRANS_LINK,               "LINK" }, \
-       { XFS_TRANS_RENAME,             "RENAME" }, \
-       { XFS_TRANS_MKDIR,              "MKDIR" }, \
-       { XFS_TRANS_RMDIR,              "RMDIR" }, \
-       { XFS_TRANS_SYMLINK,            "SYMLINK" }, \
-       { XFS_TRANS_SET_DMATTRS,        "SET_DMATTRS" }, \
-       { XFS_TRANS_GROWFS,             "GROWFS" }, \
-       { XFS_TRANS_STRAT_WRITE,        "STRAT_WRITE" }, \
-       { XFS_TRANS_DIOSTRAT,           "DIOSTRAT" }, \
-       { XFS_TRANS_WRITEID,            "WRITEID" }, \
-       { XFS_TRANS_ADDAFORK,           "ADDAFORK" }, \
-       { XFS_TRANS_ATTRINVAL,          "ATTRINVAL" }, \
-       { XFS_TRANS_ATRUNCATE,          "ATRUNCATE" }, \
-       { XFS_TRANS_ATTR_SET,           "ATTR_SET" }, \
-       { XFS_TRANS_ATTR_RM,            "ATTR_RM" }, \
-       { XFS_TRANS_ATTR_FLAG,          "ATTR_FLAG" }, \
-       { XFS_TRANS_CLEAR_AGI_BUCKET,   "CLEAR_AGI_BUCKET" }, \
-       { XFS_TRANS_QM_SBCHANGE,        "QM_SBCHANGE" }, \
-       { XFS_TRANS_QM_QUOTAOFF,        "QM_QUOTAOFF" }, \
-       { XFS_TRANS_QM_DQALLOC,         "QM_DQALLOC" }, \
-       { XFS_TRANS_QM_SETQLIM,         "QM_SETQLIM" }, \
-       { XFS_TRANS_QM_DQCLUSTER,       "QM_DQCLUSTER" }, \
-       { XFS_TRANS_QM_QINOCREATE,      "QM_QINOCREATE" }, \
-       { XFS_TRANS_QM_QUOTAOFF_END,    "QM_QOFF_END" }, \
-       { XFS_TRANS_SB_UNIT,            "SB_UNIT" }, \
-       { XFS_TRANS_FSYNC_TS,           "FSYNC_TS" }, \
-       { XFS_TRANS_GROWFSRT_ALLOC,     "GROWFSRT_ALLOC" }, \
-       { XFS_TRANS_GROWFSRT_ZERO,      "GROWFSRT_ZERO" }, \
-       { XFS_TRANS_GROWFSRT_FREE,      "GROWFSRT_FREE" }, \
-       { XFS_TRANS_SWAPEXT,            "SWAPEXT" }, \
-       { XFS_TRANS_SB_COUNT,           "SB_COUNT" }, \
-       { XFS_TRANS_CHECKPOINT,         "CHECKPOINT" }, \
-       { XFS_TRANS_DUMMY1,             "DUMMY1" }, \
-       { XFS_TRANS_DUMMY2,             "DUMMY2" }, \
-       { XLOG_UNMOUNT_REC_TYPE,        "UNMOUNT" }
-
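These { value, "name" } pairs are in the shape that table-driven decoders consume (the kernel's tracepoint __print_symbolic helper takes exactly this form). A userspace sketch of the same lookup, with a shortened table:

    #include <stdio.h>

    /* Userspace analogue of the { value, "name" } pairs above: a
     * table-driven decode of a transaction type to its printable name. */
    struct trans_name { int type; const char *name; };

    static const struct trans_name names[] = {
            { 4,  "CREATE" },
            { 14, "GROWFS" },
            { 40, "SWAPEXT" },
    };

    static const char *trans_name(int type)
    {
            for (unsigned i = 0; i < sizeof(names) / sizeof(names[0]); i++)
                    if (names[i].type == type)
                            return names[i].name;
            return "UNKNOWN";
    }

    int main(void)
    {
            printf("%s\n", trans_name(14)); /* GROWFS */
            return 0;
    }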
-/*
- * This structure is used to track log items associated with
- * a transaction.  It points to the log item and keeps some
- * flags to track the state of the log item.  It also tracks
- * the amount of space needed to log the item it describes
- * once we get to commit processing (see xfs_trans_commit()).
- */
-struct xfs_log_item_desc {
-       struct xfs_log_item     *lid_item;
-       struct list_head        lid_trans;
-       unsigned char           lid_flags;
-};
-
-#define XFS_LID_DIRTY          0x1
-
-/* log size calculation functions */
-int    xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
-int    xfs_log_calc_minimum_size(struct xfs_mount *);
-
-
-/*
- * Values for t_flags.
- */
-#define        XFS_TRANS_DIRTY         0x01    /* something needs to be logged */
-#define        XFS_TRANS_SB_DIRTY      0x02    /* superblock is modified */
-#define        XFS_TRANS_PERM_LOG_RES  0x04    /* xact took a permanent log res */
-#define        XFS_TRANS_SYNC          0x08    /* make commit synchronous */
-#define XFS_TRANS_DQ_DIRTY     0x10    /* at least one dquot in trx dirty */
-#define XFS_TRANS_RESERVE      0x20    /* OK to use reserved data blocks */
-#define XFS_TRANS_FREEZE_PROT  0x40    /* Transaction has elevated writer
-                                          count in superblock */
-/*
- * Values for call flags parameter.
- */
-#define        XFS_TRANS_RELEASE_LOG_RES       0x4
-#define        XFS_TRANS_ABORT                 0x8
-
-/*
- * Field values for xfs_trans_mod_sb.
- */
-#define        XFS_TRANS_SB_ICOUNT             0x00000001
-#define        XFS_TRANS_SB_IFREE              0x00000002
-#define        XFS_TRANS_SB_FDBLOCKS           0x00000004
-#define        XFS_TRANS_SB_RES_FDBLOCKS       0x00000008
-#define        XFS_TRANS_SB_FREXTENTS          0x00000010
-#define        XFS_TRANS_SB_RES_FREXTENTS      0x00000020
-#define        XFS_TRANS_SB_DBLOCKS            0x00000040
-#define        XFS_TRANS_SB_AGCOUNT            0x00000080
-#define        XFS_TRANS_SB_IMAXPCT            0x00000100
-#define        XFS_TRANS_SB_REXTSIZE           0x00000200
-#define        XFS_TRANS_SB_RBMBLOCKS          0x00000400
-#define        XFS_TRANS_SB_RBLOCKS            0x00000800
-#define        XFS_TRANS_SB_REXTENTS           0x00001000
-#define        XFS_TRANS_SB_REXTSLOG           0x00002000
-
-/*
- * Here we centralize the specification of XFS meta-data buffer reference count
- * values.  This determines how hard the buffer cache tries to hold onto the
- * buffer.
- */
-#define        XFS_AGF_REF             4
-#define        XFS_AGI_REF             4
-#define        XFS_AGFL_REF            3
-#define        XFS_INO_BTREE_REF       3
-#define        XFS_ALLOC_BTREE_REF     2
-#define        XFS_BMAP_BTREE_REF      2
-#define        XFS_DIR_BTREE_REF       2
-#define        XFS_INO_REF             2
-#define        XFS_ATTR_BTREE_REF      1
-#define        XFS_DQUOT_REF           1
-
-/*
- * Flags for xfs_trans_ichgtime().
- */
-#define        XFS_ICHGTIME_MOD        0x1     /* data fork modification timestamp */
-#define        XFS_ICHGTIME_CHG        0x2     /* inode field change timestamp */
-#define        XFS_ICHGTIME_CREATE     0x4     /* inode create timestamp */
-
-
-/*
- * Symlink decoding/encoding functions
- */
-int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
-int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
-                       uint32_t size, struct xfs_buf *bp);
-bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
-                       uint32_t size, struct xfs_buf *bp);
-void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
-                                struct xfs_inode *ip, struct xfs_ifork *ifp);
-
-#endif /* __XFS_SHARED_H__ */
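
The reference count table removed above tunes how hard the buffer cache holds on to each class of metadata buffer (AG headers hardest, attr btree and dquot buffers least). As a minimal sketch of how such a value is consumed, assuming the historical xfs_buf_set_ref() helper, which stores the hint in the buffer's LRU reference field:

        /* Hint that AGF buffers should survive several LRU scan passes. */
        xfs_buf_set_ref(bp, XFS_AGF_REF);
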
index 8f0333b3f7a011a25960c3ace80b57ebcaf58a19..b194652033cd11c8530ae762d01972fa455a23aa 100644 (file)
@@ -61,6 +61,7 @@
 static const struct super_operations xfs_super_operations;
 static kmem_zone_t *xfs_ioend_zone;
 mempool_t *xfs_ioend_pool;
+struct kset *xfs_kset;
 
 #define MNTOPT_LOGBUFS "logbufs"       /* number of XFS log buffers */
 #define MNTOPT_LOGBSIZE        "logbsize"      /* size of XFS log buffers */
@@ -185,7 +186,7 @@ xfs_parseargs(
         */
        mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
        if (!mp->m_fsname)
-               return ENOMEM;
+               return -ENOMEM;
        mp->m_fsname_len = strlen(mp->m_fsname) + 1;
 
        /*
@@ -204,9 +205,6 @@ xfs_parseargs(
         */
        mp->m_flags |= XFS_MOUNT_BARRIER;
        mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
-#if !XFS_BIG_INUMS
-       mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
-#endif
 
        /*
         * These can be overridden by the mount option parsing.
@@ -227,57 +225,57 @@ xfs_parseargs(
                        if (!value || !*value) {
                                xfs_warn(mp, "%s option requires an argument",
                                        this_char);
-                               return EINVAL;
+                               return -EINVAL;
                        }
                        if (kstrtoint(value, 10, &mp->m_logbufs))
-                               return EINVAL;
+                               return -EINVAL;
                } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
                        if (!value || !*value) {
                                xfs_warn(mp, "%s option requires an argument",
                                        this_char);
-                               return EINVAL;
+                               return -EINVAL;
                        }
                        if (suffix_kstrtoint(value, 10, &mp->m_logbsize))
-                               return EINVAL;
+                               return -EINVAL;
                } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
                        if (!value || !*value) {
                                xfs_warn(mp, "%s option requires an argument",
                                        this_char);
-                               return EINVAL;
+                               return -EINVAL;
                        }
                        mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
                        if (!mp->m_logname)
-                               return ENOMEM;
+                               return -ENOMEM;
                } else if (!strcmp(this_char, MNTOPT_MTPT)) {
                        xfs_warn(mp, "%s option not allowed on this system",
                                this_char);
-                       return EINVAL;
+                       return -EINVAL;
                } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
                        if (!value || !*value) {
                                xfs_warn(mp, "%s option requires an argument",
                                        this_char);
-                               return EINVAL;
+                               return -EINVAL;
                        }
                        mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
                        if (!mp->m_rtname)
-                               return ENOMEM;
+                               return -ENOMEM;
                } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
                        if (!value || !*value) {
                                xfs_warn(mp, "%s option requires an argument",
                                        this_char);
-                               return EINVAL;
+                               return -EINVAL;
                        }
                        if (kstrtoint(value, 10, &iosize))
-                               return EINVAL;
+                               return -EINVAL;
                        iosizelog = ffs(iosize) - 1;
                } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
                        if (!value || !*value) {
                                xfs_warn(mp, "%s option requires an argument",
                                        this_char);
-                               return EINVAL;
+                               return -EINVAL;
                        }
                        if (suffix_kstrtoint(value, 10, &iosize))
-                               return EINVAL;
+                               return -EINVAL;
                        iosizelog = ffs(iosize) - 1;
                } else if (!strcmp(this_char, MNTOPT_GRPID) ||
                           !strcmp(this_char, MNTOPT_BSDGROUPS)) {
@@ -297,27 +295,22 @@ xfs_parseargs(
                        if (!value || !*value) {
                                xfs_warn(mp, "%s option requires an argument",
                                        this_char);
-                               return EINVAL;
+                               return -EINVAL;
                        }
                        if (kstrtoint(value, 10, &dsunit))
-                               return EINVAL;
+                               return -EINVAL;
                } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
                        if (!value || !*value) {
                                xfs_warn(mp, "%s option requires an argument",
                                        this_char);
-                               return EINVAL;
+                               return -EINVAL;
                        }
                        if (kstrtoint(value, 10, &dswidth))
-                               return EINVAL;
+                               return -EINVAL;
                } else if (!strcmp(this_char, MNTOPT_32BITINODE)) {
                        mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
                } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
                        mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
-#if !XFS_BIG_INUMS
-                       xfs_warn(mp, "%s option not allowed on this system",
-                               this_char);
-                       return EINVAL;
-#endif
                } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
                        mp->m_flags |= XFS_MOUNT_NOUUID;
                } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
@@ -390,7 +383,7 @@ xfs_parseargs(
        "irixsgid is now a sysctl(2) variable, option is deprecated.");
                } else {
                        xfs_warn(mp, "unknown mount option [%s].", this_char);
-                       return EINVAL;
+                       return -EINVAL;
                }
        }
 
@@ -400,32 +393,32 @@ xfs_parseargs(
        if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
            !(mp->m_flags & XFS_MOUNT_RDONLY)) {
                xfs_warn(mp, "no-recovery mounts must be read-only.");
-               return EINVAL;
+               return -EINVAL;
        }
 
        if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
                xfs_warn(mp,
        "sunit and swidth options incompatible with the noalign option");
-               return EINVAL;
+               return -EINVAL;
        }
 
 #ifndef CONFIG_XFS_QUOTA
        if (XFS_IS_QUOTA_RUNNING(mp)) {
                xfs_warn(mp, "quota support not available in this kernel.");
-               return EINVAL;
+               return -EINVAL;
        }
 #endif
 
        if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
                xfs_warn(mp, "sunit and swidth must be specified together");
-               return EINVAL;
+               return -EINVAL;
        }
 
        if (dsunit && (dswidth % dsunit != 0)) {
                xfs_warn(mp,
        "stripe width (%d) must be a multiple of the stripe unit (%d)",
                        dswidth, dsunit);
-               return EINVAL;
+               return -EINVAL;
        }
 
 done:
@@ -446,7 +439,7 @@ done:
             mp->m_logbufs > XLOG_MAX_ICLOGS)) {
                xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
                        mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        }
        if (mp->m_logbsize != -1 &&
            mp->m_logbsize !=  0 &&
@@ -456,7 +449,7 @@ done:
                xfs_warn(mp,
                        "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
                        mp->m_logbsize);
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        }
 
        if (iosizelog) {
@@ -465,7 +458,7 @@ done:
                        xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
                                iosizelog, XFS_MIN_IO_LOG,
                                XFS_MAX_IO_LOG);
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
                }
 
                mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
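
In the option-parsing hunks above, logbsize and allocsize go through suffix_kstrtoint() rather than plain kstrtoint() because those options traditionally accept k/m/g size suffixes. A minimal sketch of that behaviour, assuming a single trailing suffix character and the usual <linux/ctype.h>/<linux/slab.h> helpers (the real function lives earlier in xfs_super.c):

        /* Sketch: "32k" -> 32768; a bare integer falls through to kstrtoint().
         * Callers have already rejected empty strings. */
        static int suffix_kstrtoint(const char *s, unsigned int base, int *res)
        {
                int last, shift = 0, ret;
                char *value = kstrdup(s, GFP_KERNEL);

                if (!value)
                        return -ENOMEM;
                last = strlen(value) - 1;
                switch (tolower(value[last])) {
                case 'g': shift += 10;  /* fall through */
                case 'm': shift += 10;  /* fall through */
                case 'k': shift += 10;
                        value[last] = '\0';
                        break;
                }
                ret = kstrtoint(value, base, res);
                if (!ret)
                        *res <<= shift;
                kfree(value);
                return ret;
        }
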
@@ -597,15 +590,20 @@ xfs_max_file_offset(
        return (((__uint64_t)pagefactor) << bitshift) - 1;
 }
 
+/*
+ * xfs_set_inode32() and xfs_set_inode64() are passed an agcount
+ * because in the growfs case, mp->m_sb.sb_agcount has not yet been
+ * updated to the potentially higher AG count.
+ */
 xfs_agnumber_t
-xfs_set_inode32(struct xfs_mount *mp)
+xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount)
 {
        xfs_agnumber_t  index = 0;
        xfs_agnumber_t  maxagi = 0;
        xfs_sb_t        *sbp = &mp->m_sb;
        xfs_agnumber_t  max_metadata;
-       xfs_agino_t     agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks -1, 0);
-       xfs_ino_t       ino = XFS_AGINO_TO_INO(mp, sbp->sb_agcount -1, agino);
+       xfs_agino_t     agino;
+       xfs_ino_t       ino;
        xfs_perag_t     *pag;
 
        /* Calculate how much should be reserved for inodes to meet
@@ -620,10 +618,12 @@ xfs_set_inode32(struct xfs_mount *mp)
                do_div(icount, sbp->sb_agblocks);
                max_metadata = icount;
        } else {
-               max_metadata = sbp->sb_agcount;
+               max_metadata = agcount;
        }
 
-       for (index = 0; index < sbp->sb_agcount; index++) {
+       agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+
+       for (index = 0; index < agcount; index++) {
                ino = XFS_AGINO_TO_INO(mp, index, agino);
 
                if (ino > XFS_MAXINUMBER_32) {
@@ -648,11 +648,11 @@ xfs_set_inode32(struct xfs_mount *mp)
 }
 
 xfs_agnumber_t
-xfs_set_inode64(struct xfs_mount *mp)
+xfs_set_inode64(struct xfs_mount *mp, xfs_agnumber_t agcount)
 {
        xfs_agnumber_t index = 0;
 
-       for (index = 0; index < mp->m_sb.sb_agcount; index++) {
+       for (index = 0; index < agcount; index++) {
                struct xfs_perag        *pag;
 
                pag = xfs_perag_get(mp, index);
@@ -686,7 +686,7 @@ xfs_blkdev_get(
                xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
        }
 
-       return -error;
+       return error;
 }
 
 STATIC void
@@ -756,7 +756,7 @@ xfs_open_devices(
                if (rtdev == ddev || rtdev == logdev) {
                        xfs_warn(mp,
        "Cannot mount filesystem with identical rtdev and ddev/logdev.");
-                       error = EINVAL;
+                       error = -EINVAL;
                        goto out_close_rtdev;
                }
        }
@@ -764,7 +764,7 @@ xfs_open_devices(
        /*
         * Setup xfs_mount buffer target pointers
         */
-       error = ENOMEM;
+       error = -ENOMEM;
        mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
        if (!mp->m_ddev_targp)
                goto out_close_rtdev;
@@ -1188,6 +1188,7 @@ xfs_fs_remount(
        char                    *options)
 {
        struct xfs_mount        *mp = XFS_M(sb);
+       xfs_sb_t                *sbp = &mp->m_sb;
        substring_t             args[MAX_OPT_ARGS];
        char                    *p;
        int                     error;
@@ -1208,10 +1209,10 @@ xfs_fs_remount(
                        mp->m_flags &= ~XFS_MOUNT_BARRIER;
                        break;
                case Opt_inode64:
-                       mp->m_maxagi = xfs_set_inode64(mp);
+                       mp->m_maxagi = xfs_set_inode64(mp, sbp->sb_agcount);
                        break;
                case Opt_inode32:
-                       mp->m_maxagi = xfs_set_inode32(mp);
+                       mp->m_maxagi = xfs_set_inode32(mp, sbp->sb_agcount);
                        break;
                default:
                        /*
@@ -1295,7 +1296,7 @@ xfs_fs_freeze(
 
        xfs_save_resvblks(mp);
        xfs_quiesce_attr(mp);
-       return -xfs_fs_log_dummy(mp);
+       return xfs_fs_log_dummy(mp);
 }
 
 STATIC int
@@ -1314,7 +1315,7 @@ xfs_fs_show_options(
        struct seq_file         *m,
        struct dentry           *root)
 {
-       return -xfs_showargs(XFS_M(root->d_sb), m);
+       return xfs_showargs(XFS_M(root->d_sb), m);
 }
 
 /*
@@ -1336,14 +1337,14 @@ xfs_finish_flags(
                           mp->m_logbsize < mp->m_sb.sb_logsunit) {
                        xfs_warn(mp,
                "logbuf size must be greater than or equal to log stripe size");
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
                }
        } else {
                /* Fail a mount if the logbuf is larger than 32K */
                if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
                        xfs_warn(mp,
                "logbuf size for version 1 logs must be 16K or 32K");
-                       return XFS_ERROR(EINVAL);
+                       return -EINVAL;
                }
        }
 
@@ -1355,7 +1356,7 @@ xfs_finish_flags(
                xfs_warn(mp,
 "Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
                        MNTOPT_NOATTR2, MNTOPT_ATTR2);
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        }
 
        /*
@@ -1372,7 +1373,7 @@ xfs_finish_flags(
        if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
                xfs_warn(mp,
                        "cannot mount a read-only filesystem as read-write");
-               return XFS_ERROR(EROFS);
+               return -EROFS;
        }
 
        if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
@@ -1380,7 +1381,7 @@ xfs_finish_flags(
            !xfs_sb_version_has_pquotino(&mp->m_sb)) {
                xfs_warn(mp,
                  "Super block does not support project and group quota together");
-               return XFS_ERROR(EINVAL);
+               return -EINVAL;
        }
 
        return 0;
@@ -1394,7 +1395,7 @@ xfs_fs_fill_super(
 {
        struct inode            *root;
        struct xfs_mount        *mp = NULL;
-       int                     flags = 0, error = ENOMEM;
+       int                     flags = 0, error = -ENOMEM;
 
        mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
        if (!mp)
@@ -1428,11 +1429,11 @@ xfs_fs_fill_super(
        if (error)
                goto out_free_fsname;
 
-       error = -xfs_init_mount_workqueues(mp);
+       error = xfs_init_mount_workqueues(mp);
        if (error)
                goto out_close_devices;
 
-       error = -xfs_icsb_init_counters(mp);
+       error = xfs_icsb_init_counters(mp);
        if (error)
                goto out_destroy_workqueues;
 
@@ -1474,12 +1475,12 @@ xfs_fs_fill_super(
 
        root = igrab(VFS_I(mp->m_rootip));
        if (!root) {
-               error = ENOENT;
+               error = -ENOENT;
                goto out_unmount;
        }
        sb->s_root = d_make_root(root);
        if (!sb->s_root) {
-               error = ENOMEM;
+               error = -ENOMEM;
                goto out_unmount;
        }
 
@@ -1499,7 +1500,7 @@ out_destroy_workqueues:
        xfs_free_fsname(mp);
        kfree(mp);
  out:
-       return -error;
+       return error;
 
  out_unmount:
        xfs_filestream_unmount(mp);
@@ -1761,9 +1762,15 @@ init_xfs_fs(void)
        if (error)
                goto out_cleanup_procfs;
 
+       xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
+       if (!xfs_kset) {
+               error = -ENOMEM;
+               goto out_sysctl_unregister;
+       }
+
        error = xfs_qm_init();
        if (error)
-               goto out_sysctl_unregister;
+               goto out_kset_unregister;
 
        error = register_filesystem(&xfs_fs_type);
        if (error)
@@ -1772,6 +1779,8 @@ init_xfs_fs(void)
 
  out_qm_exit:
        xfs_qm_exit();
+ out_kset_unregister:
+       kset_unregister(xfs_kset);
  out_sysctl_unregister:
        xfs_sysctl_unregister();
  out_cleanup_procfs:
@@ -1793,6 +1802,7 @@ exit_xfs_fs(void)
 {
        xfs_qm_exit();
        unregister_filesystem(&xfs_fs_type);
+       kset_unregister(xfs_kset);
        xfs_sysctl_unregister();
        xfs_cleanup_procfs();
        xfs_buf_terminate();
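
Most of the churn in this file is the switch from XFS's legacy positive internal error codes to the kernel-standard negative form. Once every internal routine returns -EINVAL, -ENOMEM and friends directly, the sign flips that previously sat at the VFS boundary ("return -error;" in xfs_fs_fill_super(), "error = -xfs_init_mount_workqueues(mp);" and the like) become plain pass-throughs, as the hunks above show. A minimal sketch of the convention (xfs_example_check is a hypothetical function):

        /* Kernel convention: return 0 on success, a negative errno on failure. */
        static int xfs_example_check(const char *value, int *out)
        {
                if (!value || !*value)
                        return -EINVAL;         /* propagated unchanged to the VFS */
                return kstrtoint(value, 10, out);       /* already 0 or -errno */
        }
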
index bbe3d15a7904d6185ab440c3b2a12533322e72a7..2b830c2f322e36ec4ce58753c4db96c2b6842a58 100644 (file)
@@ -44,16 +44,6 @@ extern void xfs_qm_exit(void);
 # define XFS_REALTIME_STRING
 #endif
 
-#if XFS_BIG_BLKNOS
-# if XFS_BIG_INUMS
-#  define XFS_BIGFS_STRING     "large block/inode numbers, "
-# else
-#  define XFS_BIGFS_STRING     "large block numbers, "
-# endif
-#else
-# define XFS_BIGFS_STRING
-#endif
-
 #ifdef DEBUG
 # define XFS_DBG_STRING                "debug"
 #else
@@ -64,7 +54,6 @@ extern void xfs_qm_exit(void);
 #define XFS_BUILD_OPTIONS      XFS_ACL_STRING \
                                XFS_SECURITY_STRING \
                                XFS_REALTIME_STRING \
-                               XFS_BIGFS_STRING \
                                XFS_DBG_STRING /* DBG must be last */
 
 struct xfs_inode;
@@ -76,8 +65,8 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
 
 extern void xfs_flush_inodes(struct xfs_mount *mp);
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
-extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
-extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
+extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount);
+extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount);
 
 extern const struct export_operations xfs_export_operations;
 extern const struct xattr_handler *xfs_xattr_handlers[];
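
The added agcount parameter decouples the inode32/inode64 policy calculation from mp->m_sb.sb_agcount, so growfs can evaluate the policy against the AG count the filesystem is about to have rather than the one recorded in the in-core superblock. A hedged sketch of the two call styles (the growfs call site itself is outside these hunks, and new_agcount is a hypothetical name):

        /* Mount and remount: the in-core superblock AG count is current. */
        mp->m_maxagi = xfs_set_inode32(mp, mp->m_sb.sb_agcount);

        /* growfs: pass the larger AG count before sb_agcount is updated. */
        mp->m_maxagi = xfs_set_inode32(mp, new_agcount);
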
index d69363c833e1bae24b56d17ee22748fc2cf01244..6a944a2cd36fbf97717ea31f56e82c101caef3a9 100644 (file)
@@ -76,15 +76,15 @@ xfs_readlink_bmap(
                bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
                                  &xfs_symlink_buf_ops);
                if (!bp)
-                       return XFS_ERROR(ENOMEM);
+                       return -ENOMEM;
                error = bp->b_error;
                if (error) {
                        xfs_buf_ioerror_alert(bp, __func__);
                        xfs_buf_relse(bp);
 
                        /* bad CRC means corrupted metadata */
-                       if (error == EFSBADCRC)
-                               error = EFSCORRUPTED;
+                       if (error == -EFSBADCRC)
+                               error = -EFSCORRUPTED;
                        goto out;
                }
                byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
@@ -95,7 +95,7 @@ xfs_readlink_bmap(
                if (xfs_sb_version_hascrc(&mp->m_sb)) {
                        if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
                                                        byte_cnt, bp)) {
-                               error = EFSCORRUPTED;
+                               error = -EFSCORRUPTED;
                                xfs_alert(mp,
 "symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
                                        offset, byte_cnt, ip->i_ino);
@@ -135,7 +135,7 @@ xfs_readlink(
        trace_xfs_readlink(ip);
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        xfs_ilock(ip, XFS_ILOCK_SHARED);
 
@@ -148,7 +148,7 @@ xfs_readlink(
                         __func__, (unsigned long long) ip->i_ino,
                         (long long) pathlen);
                ASSERT(0);
-               error = XFS_ERROR(EFSCORRUPTED);
+               error = -EFSCORRUPTED;
                goto out;
        }
 
@@ -203,14 +203,14 @@ xfs_symlink(
        trace_xfs_symlink(dp, link_name);
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        /*
         * Check component lengths of the target path name.
         */
        pathlen = strlen(target_path);
        if (pathlen >= MAXPATHLEN)      /* total string too long */
-               return XFS_ERROR(ENAMETOOLONG);
+               return -ENAMETOOLONG;
 
        udqp = gdqp = NULL;
        prid = xfs_get_initial_prid(dp);
@@ -238,7 +238,7 @@ xfs_symlink(
                fs_blocks = xfs_symlink_blocks(mp, pathlen);
        resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
-       if (error == ENOSPC && fs_blocks == 0) {
+       if (error == -ENOSPC && fs_blocks == 0) {
                resblks = 0;
                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
        }
@@ -254,7 +254,7 @@ xfs_symlink(
         * Check whether the directory allows new symlinks or not.
         */
        if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
-               error = XFS_ERROR(EPERM);
+               error = -EPERM;
                goto error_return;
        }
 
@@ -284,7 +284,7 @@ xfs_symlink(
        error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
                               prid, resblks > 0, &ip, NULL);
        if (error) {
-               if (error == ENOSPC)
+               if (error == -ENOSPC)
                        goto error_return;
                goto error1;
        }
@@ -348,7 +348,7 @@ xfs_symlink(
                        bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
                                               BTOBB(byte_cnt), 0);
                        if (!bp) {
-                               error = ENOMEM;
+                               error = -ENOMEM;
                                goto error2;
                        }
                        bp->b_ops = &xfs_symlink_buf_ops;
@@ -489,7 +489,7 @@ xfs_inactive_symlink_rmt(
                        XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
                        XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
                if (!bp) {
-                       error = ENOMEM;
+                       error = -ENOMEM;
                        goto error_bmap_cancel;
                }
                xfs_trans_binval(tp, bp);
@@ -562,7 +562,7 @@ xfs_inactive_symlink(
        trace_xfs_inactive_symlink(ip);
 
        if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
+               return -EIO;
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
 
@@ -580,7 +580,7 @@ xfs_inactive_symlink(
                         __func__, (unsigned long long)ip->i_ino, pathlen);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                ASSERT(0);
-               return XFS_ERROR(EFSCORRUPTED);
+               return -EFSCORRUPTED;
        }
 
        if (ip->i_df.if_flags & XFS_IFINLINE) {
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c
deleted file mode 100644 (file)
index 23c2f25..0000000
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * Copyright (c) 2012-2013 Red Hat, Inc.
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_shared.h"
-#include "xfs_trans_resv.h"
-#include "xfs_ag.h"
-#include "xfs_sb.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_symlink.h"
-#include "xfs_cksum.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
-
-
-/*
- * Each contiguous block has a header, so it is not just a simple pathlen
- * to FSB conversion.
- */
-int
-xfs_symlink_blocks(
-       struct xfs_mount *mp,
-       int             pathlen)
-{
-       int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
-
-       return (pathlen + buflen - 1) / buflen;
-}
-
-int
-xfs_symlink_hdr_set(
-       struct xfs_mount        *mp,
-       xfs_ino_t               ino,
-       uint32_t                offset,
-       uint32_t                size,
-       struct xfs_buf          *bp)
-{
-       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return 0;
-
-       dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
-       dsl->sl_offset = cpu_to_be32(offset);
-       dsl->sl_bytes = cpu_to_be32(size);
-       uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
-       dsl->sl_owner = cpu_to_be64(ino);
-       dsl->sl_blkno = cpu_to_be64(bp->b_bn);
-       bp->b_ops = &xfs_symlink_buf_ops;
-
-       return sizeof(struct xfs_dsymlink_hdr);
-}
-
-/*
- * Checking of the symlink header is split into two parts: the verifier does
- * CRC, location and bounds checking, while the unpacking function checks the
- * path parameters and owner.
- */
-bool
-xfs_symlink_hdr_ok(
-       xfs_ino_t               ino,
-       uint32_t                offset,
-       uint32_t                size,
-       struct xfs_buf          *bp)
-{
-       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
-
-       if (offset != be32_to_cpu(dsl->sl_offset))
-               return false;
-       if (size != be32_to_cpu(dsl->sl_bytes))
-               return false;
-       if (ino != be64_to_cpu(dsl->sl_owner))
-               return false;
-
-       /* ok */
-       return true;
-}
-
-static bool
-xfs_symlink_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return false;
-       if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
-               return false;
-       if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
-               return false;
-       if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
-               return false;
-       if (be32_to_cpu(dsl->sl_offset) +
-                               be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
-               return false;
-       if (dsl->sl_owner == 0)
-               return false;
-
-       return true;
-}
-
-static void
-xfs_symlink_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-
-       /* no verification of non-crc buffers */
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
-               xfs_buf_ioerror(bp, EFSBADCRC);
-       else if (!xfs_symlink_verify(bp))
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-       if (bp->b_error)
-               xfs_verifier_error(bp);
-}
-
-static void
-xfs_symlink_write_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-
-       /* no verification of non-crc buffers */
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (!xfs_symlink_verify(bp)) {
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               xfs_verifier_error(bp);
-               return;
-       }
-
-       if (bip) {
-               struct xfs_dsymlink_hdr *dsl = bp->b_addr;
-               dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-       }
-       xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF);
-}
-
-const struct xfs_buf_ops xfs_symlink_buf_ops = {
-       .verify_read = xfs_symlink_read_verify,
-       .verify_write = xfs_symlink_write_verify,
-};
-
-void
-xfs_symlink_local_to_remote(
-       struct xfs_trans        *tp,
-       struct xfs_buf          *bp,
-       struct xfs_inode        *ip,
-       struct xfs_ifork        *ifp)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       char                    *buf;
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb)) {
-               bp->b_ops = NULL;
-               memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
-               return;
-       }
-
-       /*
-        * As this symlink fits in an inode literal area, it must also fit in
-        * the smallest buffer the filesystem supports.
-        */
-       ASSERT(BBTOB(bp->b_length) >=
-                       ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
-
-       bp->b_ops = &xfs_symlink_buf_ops;
-
-       buf = bp->b_addr;
-       buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
-       memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
-}
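
The removed xfs_symlink_blocks() is a ceiling division whose divisor is the per-block payload rather than the raw block size, because on a v5 (CRC) filesystem each contiguous symlink block carries a struct xfs_dsymlink_hdr. A worked example, assuming the 56-byte header size of this era:

        /* 512-byte blocks, CRC enabled: payload = 512 - 56 = 456 bytes/block. */
        int buflen  = 512 - 56;                         /* XFS_SYMLINK_BUF_SPACE() */
        int pathlen = 1023;                             /* longest target: MAXPATHLEN - 1 */
        int blocks  = (pathlen + buflen - 1) / buflen;  /* (1023 + 455) / 456 = 3 */
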
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
new file mode 100644 (file)
index 0000000..9835139
--- /dev/null
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_sysfs.h"
+#include "xfs_log_format.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+
+struct xfs_sysfs_attr {
+       struct attribute attr;
+       ssize_t (*show)(char *buf, void *data);
+       ssize_t (*store)(const char *buf, size_t count, void *data);
+};
+
+static inline struct xfs_sysfs_attr *
+to_attr(struct attribute *attr)
+{
+       return container_of(attr, struct xfs_sysfs_attr, attr);
+}
+
+#define XFS_SYSFS_ATTR_RW(name) \
+       static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name)
+#define XFS_SYSFS_ATTR_RO(name) \
+       static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name)
+
+#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
+
+/*
+ * xfs_mount kobject. This currently has no attributes and thus no need for show
+ * and store helpers. The mp kobject serves as the per-mount parent object that
+ * is identified by the fsname under sysfs.
+ */
+
+struct kobj_type xfs_mp_ktype = {
+       .release = xfs_sysfs_release,
+};
+
+/* xlog */
+
+STATIC ssize_t
+log_head_lsn_show(
+       char    *buf,
+       void    *data)
+{
+       struct xlog *log = data;
+       int cycle;
+       int block;
+
+       spin_lock(&log->l_icloglock);
+       cycle = log->l_curr_cycle;
+       block = log->l_curr_block;
+       spin_unlock(&log->l_icloglock);
+
+       return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+}
+XFS_SYSFS_ATTR_RO(log_head_lsn);
+
+STATIC ssize_t
+log_tail_lsn_show(
+       char    *buf,
+       void    *data)
+{
+       struct xlog *log = data;
+       int cycle;
+       int block;
+
+       xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
+       return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+}
+XFS_SYSFS_ATTR_RO(log_tail_lsn);
+
+STATIC ssize_t
+reserve_grant_head_show(
+       char    *buf,
+       void    *data)
+{
+       struct xlog *log = data;
+       int cycle;
+       int bytes;
+
+       xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
+       return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+}
+XFS_SYSFS_ATTR_RO(reserve_grant_head);
+
+STATIC ssize_t
+write_grant_head_show(
+       char    *buf,
+       void    *data)
+{
+       struct xlog *log = data;
+       int cycle;
+       int bytes;
+
+       xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
+       return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+}
+XFS_SYSFS_ATTR_RO(write_grant_head);
+
+static struct attribute *xfs_log_attrs[] = {
+       ATTR_LIST(log_head_lsn),
+       ATTR_LIST(log_tail_lsn),
+       ATTR_LIST(reserve_grant_head),
+       ATTR_LIST(write_grant_head),
+       NULL,
+};
+
+static inline struct xlog *
+to_xlog(struct kobject *kobject)
+{
+       struct xfs_kobj *kobj = to_kobj(kobject);
+       return container_of(kobj, struct xlog, l_kobj);
+}
+
+STATIC ssize_t
+xfs_log_show(
+       struct kobject          *kobject,
+       struct attribute        *attr,
+       char                    *buf)
+{
+       struct xlog *log = to_xlog(kobject);
+       struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+       return xfs_attr->show ? xfs_attr->show(buf, log) : 0;
+}
+
+STATIC ssize_t
+xfs_log_store(
+       struct kobject          *kobject,
+       struct attribute        *attr,
+       const char              *buf,
+       size_t                  count)
+{
+       struct xlog *log = to_xlog(kobject);
+       struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+       return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0;
+}
+
+static struct sysfs_ops xfs_log_ops = {
+       .show = xfs_log_show,
+       .store = xfs_log_store,
+};
+
+struct kobj_type xfs_log_ktype = {
+       .release = xfs_sysfs_release,
+       .sysfs_ops = &xfs_log_ops,
+       .default_attrs = xfs_log_attrs,
+};
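
Every sysfs read or write on a log attribute funnels through xfs_log_show()/xfs_log_store(), which recover the struct xlog from the raw kobject via two container_of() steps and then dispatch to the per-attribute callback. Registration of the log kobject is not part of this file; a hedged sketch of what that call presumably looks like, using the xfs_sysfs_init() helper defined in xfs_sysfs.h below and assuming the per-mount kobject is named m_kobj:

        /* Hypothetical call site: creates /sys/fs/xfs/<fsname>/log. */
        error = xfs_sysfs_init(&log->l_kobj, &xfs_log_ktype,
                               &mp->m_kobj, "log");
        if (error)
                goto out_free_log;      /* hypothetical error label */
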
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
new file mode 100644 (file)
index 0000000..54a2091
--- /dev/null
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef __XFS_SYSFS_H__
+#define __XFS_SYSFS_H__
+
+extern struct kobj_type xfs_mp_ktype;  /* xfs_mount */
+extern struct kobj_type xfs_log_ktype; /* xlog */
+
+static inline struct xfs_kobj *
+to_kobj(struct kobject *kobject)
+{
+       return container_of(kobject, struct xfs_kobj, kobject);
+}
+
+static inline void
+xfs_sysfs_release(struct kobject *kobject)
+{
+       struct xfs_kobj *kobj = to_kobj(kobject);
+       complete(&kobj->complete);
+}
+
+static inline int
+xfs_sysfs_init(
+       struct xfs_kobj         *kobj,
+       struct kobj_type        *ktype,
+       struct xfs_kobj         *parent_kobj,
+       const char              *name)
+{
+       init_completion(&kobj->complete);
+       return kobject_init_and_add(&kobj->kobject, ktype,
+                                   &parent_kobj->kobject, "%s", name);
+}
+
+static inline void
+xfs_sysfs_del(
+       struct xfs_kobj *kobj)
+{
+       kobject_del(&kobj->kobject);
+       kobject_put(&kobj->kobject);
+       wait_for_completion(&kobj->complete);
+}
+
+#endif /* __XFS_SYSFS_H__ */
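
xfs_sysfs_del() pairs kobject_del()/kobject_put() with wait_for_completion() because dropping the last reference only schedules the release; the completion, signalled from xfs_sysfs_release(), guarantees that the structure embedding the kobject is not torn down while sysfs still holds a reference. A minimal lifecycle sketch under that assumption (parent and the name are placeholders):

        struct xfs_kobj kobj;   /* embedded in the owning structure */
        int error;

        error = xfs_sysfs_init(&kobj, &xfs_log_ktype, parent, "log");
        if (error)
                return error;
        /* ... attribute files are live here ... */
        xfs_sysfs_del(&kobj);   /* returns only after ->release has run */
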
index d03932564ccb3a3e75efc3e39b3991ba6f27d8c9..30e8e34109553d3aee7673382f8b2ab2e44c8446 100644 (file)
@@ -190,7 +190,7 @@ xfs_trans_reserve(
                                          -((int64_t)blocks), rsvd);
                if (error != 0) {
                        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-                       return (XFS_ERROR(ENOSPC));
+                       return -ENOSPC;
                }
                tp->t_blk_res += blocks;
        }
@@ -241,7 +241,7 @@ xfs_trans_reserve(
                error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS,
                                          -((int64_t)rtextents), rsvd);
                if (error) {
-                       error = XFS_ERROR(ENOSPC);
+                       error = -ENOSPC;
                        goto undo_log;
                }
                tp->t_rtx_res += rtextents;
@@ -874,7 +874,7 @@ xfs_trans_commit(
                goto out_unreserve;
 
        if (XFS_FORCED_SHUTDOWN(mp)) {
-               error = XFS_ERROR(EIO);
+               error = -EIO;
                goto out_unreserve;
        }
 
@@ -917,7 +917,7 @@ out_unreserve:
        if (tp->t_ticket) {
                commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
                if (commit_lsn == -1 && !error)
-                       error = XFS_ERROR(EIO);
+                       error = -EIO;
        }
        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
        xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
@@ -1024,7 +1024,7 @@ xfs_trans_roll(
         */
        error = xfs_trans_commit(trans, 0);
        if (error)
-               return (error);
+               return error;
 
        trans = *tpp;
 
index cb0f3a84cc68452155b36c6b4a754edd450aa619..859482f53b5a87540e54b2cddb7097c53caa7447 100644 (file)
@@ -762,7 +762,7 @@ xfs_trans_ail_init(
 
        ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
        if (!ailp)
-               return ENOMEM;
+               return -ENOMEM;
 
        ailp->xa_mount = mp;
        INIT_LIST_HEAD(&ailp->xa_ail);
@@ -781,7 +781,7 @@ xfs_trans_ail_init(
 
 out_free_ailp:
        kmem_free(ailp);
-       return ENOMEM;
+       return -ENOMEM;
 }
 
 void
index b8eef0549f3f9a39cc68d06cf7a9cbcb04af469f..96c898e7ac9a7af67f0ab5a6147c7ba9f01cf4b9 100644 (file)
@@ -166,7 +166,7 @@ xfs_trans_get_buf_map(
                ASSERT(atomic_read(&bip->bli_refcount) > 0);
                bip->bli_recur++;
                trace_xfs_trans_get_buf_recur(bip);
-               return (bp);
+               return bp;
        }
 
        bp = xfs_buf_get_map(target, map, nmaps, flags);
@@ -178,7 +178,7 @@ xfs_trans_get_buf_map(
 
        _xfs_trans_bjoin(tp, bp, 1);
        trace_xfs_trans_get_buf(bp->b_fspriv);
-       return (bp);
+       return bp;
 }
 
 /*
@@ -201,9 +201,8 @@ xfs_trans_getsb(xfs_trans_t *tp,
         * Default to just trying to lock the superblock buffer
         * if tp is NULL.
         */
-       if (tp == NULL) {
-               return (xfs_getsb(mp, flags));
-       }
+       if (tp == NULL)
+               return xfs_getsb(mp, flags);
 
        /*
         * If the superblock buffer already has this transaction
@@ -218,7 +217,7 @@ xfs_trans_getsb(xfs_trans_t *tp,
                ASSERT(atomic_read(&bip->bli_refcount) > 0);
                bip->bli_recur++;
                trace_xfs_trans_getsb_recur(bip);
-               return (bp);
+               return bp;
        }
 
        bp = xfs_getsb(mp, flags);
@@ -227,7 +226,7 @@ xfs_trans_getsb(xfs_trans_t *tp,
 
        _xfs_trans_bjoin(tp, bp, 1);
        trace_xfs_trans_getsb(bp->b_fspriv);
-       return (bp);
+       return bp;
 }
 
 #ifdef DEBUG
@@ -267,7 +266,7 @@ xfs_trans_read_buf_map(
                bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
                if (!bp)
                        return (flags & XBF_TRYLOCK) ?
-                                       EAGAIN : XFS_ERROR(ENOMEM);
+                                       -EAGAIN : -ENOMEM;
 
                if (bp->b_error) {
                        error = bp->b_error;
@@ -277,8 +276,8 @@ xfs_trans_read_buf_map(
                        xfs_buf_relse(bp);
 
                        /* bad CRC means corrupted metadata */
-                       if (error == EFSBADCRC)
-                               error = EFSCORRUPTED;
+                       if (error == -EFSBADCRC)
+                               error = -EFSCORRUPTED;
                        return error;
                }
 #ifdef DEBUG
@@ -287,7 +286,7 @@ xfs_trans_read_buf_map(
                                if (((xfs_req_num++) % xfs_error_mod) == 0) {
                                        xfs_buf_relse(bp);
                                        xfs_debug(mp, "Returning error!");
-                                       return XFS_ERROR(EIO);
+                                       return -EIO;
                                }
                        }
                }
@@ -343,8 +342,8 @@ xfs_trans_read_buf_map(
                                        xfs_force_shutdown(tp->t_mountp,
                                                        SHUTDOWN_META_IO_ERROR);
                                /* bad CRC means corrupted metadata */
-                               if (error == EFSBADCRC)
-                                       error = EFSCORRUPTED;
+                               if (error == -EFSBADCRC)
+                                       error = -EFSCORRUPTED;
                                return error;
                        }
                }
@@ -355,7 +354,7 @@ xfs_trans_read_buf_map(
                if (XFS_FORCED_SHUTDOWN(mp)) {
                        trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
                        *bpp = NULL;
-                       return XFS_ERROR(EIO);
+                       return -EIO;
                }
 
 
@@ -372,7 +371,7 @@ xfs_trans_read_buf_map(
        if (bp == NULL) {
                *bpp = NULL;
                return (flags & XBF_TRYLOCK) ?
-                                       0 : XFS_ERROR(ENOMEM);
+                                       0 : -ENOMEM;
        }
        if (bp->b_error) {
                error = bp->b_error;
@@ -384,8 +383,8 @@ xfs_trans_read_buf_map(
                xfs_buf_relse(bp);
 
                /* bad CRC means corrupted metadata */
-               if (error == EFSBADCRC)
-                       error = EFSCORRUPTED;
+               if (error == -EFSBADCRC)
+                       error = -EFSCORRUPTED;
                return error;
        }
 #ifdef DEBUG
@@ -396,7 +395,7 @@ xfs_trans_read_buf_map(
                                                   SHUTDOWN_META_IO_ERROR);
                                xfs_buf_relse(bp);
                                xfs_debug(mp, "Returning trans error!");
-                               return XFS_ERROR(EIO);
+                               return -EIO;
                        }
                }
        }
@@ -414,7 +413,7 @@ shutdown_abort:
        trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
        xfs_buf_relse(bp);
        *bpp = NULL;
-       return XFS_ERROR(EIO);
+       return -EIO;
 }
 
 /*
index 41172861e857bf8c487f7adf3037aad0214b1681..846e061c2e9870efebfd98ecb9f7e6de6be7a200 100644 (file)
@@ -722,8 +722,8 @@ xfs_trans_dqresv(
 error_return:
        xfs_dqunlock(dqp);
        if (flags & XFS_QMOPT_ENOSPC)
-               return ENOSPC;
-       return EDQUOT;
+               return -ENOSPC;
+       return -EDQUOT;
 }
 
 
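
The hunk above also encodes a quota-reporting subtlety: user and group quota failures surface as -EDQUOT, while callers that pass XFS_QMOPT_ENOSPC (traditionally project quotas, which emulate a directory-tree size limit) get -ENOSPC instead, so applications see a full filesystem rather than an exceeded quota. In isolation:

        /* Project quota over-limit looks like a full filesystem to userspace. */
        if (flags & XFS_QMOPT_ENOSPC)
                return -ENOSPC;
        return -EDQUOT;         /* user/group quota exceeded */
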
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
deleted file mode 100644 (file)
index f2bda7c..0000000
+++ /dev/null
@@ -1,894 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * Copyright (C) 2010 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_quota.h"
-#include "xfs_trans.h"
-#include "xfs_qm.h"
-#include "xfs_trans_space.h"
-#include "xfs_trace.h"
-
-/*
- * A buffer has a format structure overhead in the log in addition
- * to the data, so we need to take this into account when reserving
- * space in a transaction for a buffer.  Round the space required up
- * to a multiple of 128 bytes so that we don't change the historical
- * reservation that has been used for this overhead.
- */
-STATIC uint
-xfs_buf_log_overhead(void)
-{
-       return round_up(sizeof(struct xlog_op_header) +
-                       sizeof(struct xfs_buf_log_format), 128);
-}
-
-/*
- * Calculate out transaction log reservation per item in bytes.
- *
- * The nbufs argument is used to indicate the number of items that
- * will be changed in a transaction.  size is used to tell how many
- * bytes should be reserved per item.
- */
-STATIC uint
-xfs_calc_buf_res(
-       uint            nbufs,
-       uint            size)
-{
-       return nbufs * (size + xfs_buf_log_overhead());
-}
-
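
Worked numbers for the two helpers above, assuming (as held in this era) that sizeof(struct xlog_op_header) plus sizeof(struct xfs_buf_log_format) comes to roughly 100 bytes, so the per-buffer overhead rounds up to 128:

        /* Two 4096-byte buffers logged in one transaction: */
        uint overhead = 128;                    /* round_up(~100, 128) */
        uint res = xfs_calc_buf_res(2, 4096);   /* 2 * (4096 + 128) = 8448 bytes */
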
-/*
- * Logging inodes is really tricksy. They are logged in memory format,
- * which means that what we write into the log doesn't directly translate into
- * the amount of space they use on disk.
- *
- * Case in point - btree format forks in memory format use more space than the
- * on-disk format. In memory, the buffer contains a normal btree block header so
- * the btree code can treat it as though it is just another generic buffer.
- * However, when we write it to the inode fork, we don't write all of this
- * header as it isn't needed. e.g. the root is only ever in the inode, so
- * there's no need for sibling pointers which would waste 16 bytes of space.
- *
- * Hence when we have an inode with a maximally sized btree format fork, then
- * amount of information we actually log is greater than the size of the inode
- * on disk. Hence we need an inode reservation function that calculates all this
- * correctly. So, we log:
- *
- * - 4 log op headers for object
- *     - for the ilf, the inode core and 2 forks
- * - inode log format object
- * - the inode core
- * - two inode forks containing bmap btree root blocks.
- *     - the btree data contained by both forks will fit into the inode size,
- *       hence when combined with the inode core above, we have a total of the
- *       actual inode size.
- *     - the BMBT headers need to be accounted separately, as they are
- *       additional to the records and pointers that fit inside the inode
- *       forks.
- */
-STATIC uint
-xfs_calc_inode_res(
-       struct xfs_mount        *mp,
-       uint                    ninodes)
-{
-       return ninodes *
-               (4 * sizeof(struct xlog_op_header) +
-                sizeof(struct xfs_inode_log_format) +
-                mp->m_sb.sb_inodesize +
-                2 * XFS_BMBT_BLOCK_LEN(mp));
-}
-
-/*
- * The free inode btree is a conditional feature and the log reservation
- * requirements differ slightly from that of the traditional inode allocation
- * btree. The finobt tracks records for inode chunks with at least one free
- * inode. A record can be removed from the tree for an inode allocation
- * or free and thus the finobt reservation is unconditional across:
- *
- *     - inode allocation
- *     - inode free
- *     - inode chunk allocation
- *
- * The 'modify' param includes the record modification scenario. The 'alloc'
- * param includes the reservation for free space btree modifications made on
- * behalf of finobt modifications; this is required only for transactions that
- * do not already account for free space btree modifications.
- *
- * the free inode btree: max depth * block size
- * the allocation btrees: 2 trees * (max depth - 1) * block size
- * the free inode btree entry: block size
- */
-STATIC uint
-xfs_calc_finobt_res(
-       struct xfs_mount        *mp,
-       int                     alloc,
-       int                     modify)
-{
-       uint res;
-
-       if (!xfs_sb_version_hasfinobt(&mp->m_sb))
-               return 0;
-
-       res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1));
-       if (alloc)
-               res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), 
-                                       XFS_FSB_TO_B(mp, 1));
-       if (modify)
-               res += (uint)XFS_FSB_TO_B(mp, 1);
-
-       return res;
-}
-
-/*
- * Various log reservation values.
- *
- * These are based on the size of the file system block because that is what
- * most transactions manipulate.  Each adds in an additional 128 bytes per
- * item logged to try to account for the overhead of the transaction mechanism.
- *
- * Note:  Most of the reservations underestimate the number of allocation
- * groups into which they could free extents in the xfs_bmap_finish() call.
- * This is because the number in the worst case is quite high and quite
- * unusual.  In order to fix this we need to change xfs_bmap_finish() to free
- * extents in only a single AG at a time.  This will require changes to the
- * EFI code as well, however, so that the EFI for the extents not freed is
- * logged again in each transaction.  See SGI PV #261917.
- *
- * Reservation functions here avoid a huge stack in xfs_trans_init due to
- * register overflow from temporaries in the calculations.
- */
-
-
-/*
- * In a write transaction we can allocate a maximum of 2
- * extents.  This gives:
- *    the inode getting the new extents: inode size
- *    the inode's bmap btree: max depth * block size
- *    the agfs of the ags from which the extents are allocated: 2 * sector
- *    the superblock free block counter: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- * And the bmap_finish transaction can free bmap blocks in a join:
- *    the agfs of the ags containing the blocks: 2 * sector size
- *    the agfls of the ags containing the blocks: 2 * sector size
- *    the super block free block counter: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_write_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_inode_res(mp, 1) +
-                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
-                                     XFS_FSB_TO_B(mp, 1)) +
-                    xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-                                     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * In truncating a file we free up to two extents at once.  We can modify:
- *    the inode being truncated: inode size
- *    the inode's bmap btree: (max depth + 1) * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- *    the agf for each of the ags: 4 * sector size
- *    the agfl for each of the ags: 4 * sector size
- *    the super block to reflect the freed blocks: sector size
- *    worst case split in allocation btrees per extent assuming 4 extents:
- *             4 exts * 2 trees * (2 * max depth - 1) * block size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_itruncate_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_inode_res(mp, 1) +
-                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
-                                     XFS_FSB_TO_B(mp, 1)) +
-                   xfs_calc_buf_res(5, 0) +
-                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                    XFS_FSB_TO_B(mp, 1)) +
-                   xfs_calc_buf_res(2 + mp->m_ialloc_blks +
-                                    mp->m_in_maxlevels, 0)));
-}
-
-/*
- * In renaming a files we can modify:
- *    the four inodes involved: 4 * inode size
- *    the two directory btrees: 2 * (max depth + v2) * dir block size
- *    the two directory bmap btrees: 2 * max depth * block size
- * And the bmap_finish transaction can free dir and bmap blocks (two sets
- *     of bmap blocks) giving:
- *    the agf for the ags in which the blocks live: 3 * sector size
- *    the agfl for the ags in which the blocks live: 3 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_rename_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_inode_res(mp, 4) +
-                    xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
-                                     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * For removing an inode from unlinked list at first, we can modify:
- *    the agi hash list and counters: sector size
- *    the on disk inode before ours in the agi hash list: inode cluster size
- */
-STATIC uint
-xfs_calc_iunlink_remove_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-              max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
-}
-
-/*
- * For creating a link to an inode:
- *    the parent directory inode: inode size
- *    the linked inode: inode size
- *    the directory btree could split: (max depth + v2) * dir block size
- *    the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free some bmap blocks giving:
- *    the agf for the ag in which the blocks live: sector size
- *    the agfl for the ag in which the blocks live: sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_link_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               xfs_calc_iunlink_remove_reservation(mp) +
-               MAX((xfs_calc_inode_res(mp, 2) +
-                    xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * For adding an inode to unlinked list we can modify:
- *    the agi hash list: sector size
- *    the unlinked inode: inode size
- */
-STATIC uint
-xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-               xfs_calc_inode_res(mp, 1);
-}
-
-/*
- * For removing a directory entry we can modify:
- *    the parent directory inode: inode size
- *    the removed inode: inode size
- *    the directory btree could join: (max depth + v2) * dir block size
- *    the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free the dir and bmap blocks giving:
- *    the agf for the ag in which the blocks live: 2 * sector size
- *    the agfl for the ag in which the blocks live: 2 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_remove_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               xfs_calc_iunlink_add_reservation(mp) +
-               MAX((xfs_calc_inode_res(mp, 1) +
-                    xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-                                     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * For create, break it into the two cases that the transaction
- * covers: the modify case - allocation done by modifying the state of
- * existing inodes - and the allocation case.
- */
-
-/*
- * For create we can modify:
- *    the parent directory inode: inode size
- *    the new inode: inode size
- *    the inode btree entry: block size
- *    the superblock for the nlink flag: sector size
- *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode's bmap btree: (max depth + v2) * block size
- *    the finobt (record modification and allocation btrees)
- */
-STATIC uint
-xfs_calc_create_resv_modify(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_inode_res(mp, 2) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-               (uint)XFS_FSB_TO_B(mp, 1) +
-               xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_finobt_res(mp, 1, 1);
-}
-
-/*
- * For create we can allocate some inodes giving:
- *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
- *    the superblock for the nlink flag: sector size
- *    the inode blocks allocated: mp->m_ialloc_blks * blocksize
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_create_resv_alloc(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-               mp->m_sb.sb_sectsize +
-               xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-STATIC uint
-__xfs_calc_create_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX(xfs_calc_create_resv_alloc(mp),
-                   xfs_calc_create_resv_modify(mp));
-}
-
-/*
- * For icreate we can allocate some inodes giving:
- *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
- *    the superblock for the nlink flag: sector size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- *    the finobt (record insertion)
- */
-STATIC uint
-xfs_calc_icreate_resv_alloc(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-               mp->m_sb.sb_sectsize +
-               xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_finobt_res(mp, 0, 0);
-}
-
-STATIC uint
-xfs_calc_icreate_reservation(xfs_mount_t *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX(xfs_calc_icreate_resv_alloc(mp),
-                   xfs_calc_create_resv_modify(mp));
-}
-
-STATIC uint
-xfs_calc_create_reservation(
-       struct xfs_mount        *mp)
-{
-       if (xfs_sb_version_hascrc(&mp->m_sb))
-               return xfs_calc_icreate_reservation(mp);
-       return __xfs_calc_create_reservation(mp);
-
-}
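
A toy illustration of the dispatch just above: a single superblock feature bit selects which formula backs the create reservation. Only the selection shape mirrors the code; the byte counts below are invented placeholders.

#include <stdbool.h>
#include <stdio.h>

static unsigned int icreate_resv(void) { return 190000; }  /* assumed v5 value */
static unsigned int create_resv(void)  { return 210000; }  /* assumed v4 value */

/* mirrors xfs_calc_create_reservation(): CRC-enabled (v5) filesystems
 * use the icreate variant, older filesystems the original formula */
static unsigned int calc_create_reservation(bool has_crc)
{
        return has_crc ? icreate_resv() : create_resv();
}

int main(void)
{
        printf("v5 fs: %u bytes\n", calc_create_reservation(true));
        printf("v4 fs: %u bytes\n", calc_create_reservation(false));
        return 0;
}
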
-
-STATIC uint
-xfs_calc_create_tmpfile_reservation(
-       struct xfs_mount        *mp)
-{
-       uint    res = XFS_DQUOT_LOGRES(mp);
-
-       if (xfs_sb_version_hascrc(&mp->m_sb))
-               res += xfs_calc_icreate_resv_alloc(mp);
-       else
-               res += xfs_calc_create_resv_alloc(mp);
-
-       return res + xfs_calc_iunlink_add_reservation(mp);
-}
-
-/*
- * Making a new directory is the same as creating a new file.
- */
-STATIC uint
-xfs_calc_mkdir_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_create_reservation(mp);
-}
-
-
-/*
- * Making a new symlink is the same as creating a new file, but
- * with the added blocks for remote symlink data which can be up to 1kB in
- * length (MAXPATHLEN).
- */
-STATIC uint
-xfs_calc_symlink_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_create_reservation(mp) +
-              xfs_calc_buf_res(1, MAXPATHLEN);
-}
-
-/*
- * In freeing an inode we can modify:
- *    the inode being freed: inode size
- *    the super block free inode counter: sector size
- *    the agi hash list and counters: sector size
- *    the inode btree entry: block size
- *    the on disk inode before ours in the agi hash list: inode cluster size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- *    the finobt (record insertion, removal or modification)
- */
-STATIC uint
-xfs_calc_ifree_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               xfs_calc_inode_res(mp, 1) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_iunlink_remove_reservation(mp) +
-               xfs_calc_buf_res(1, 0) +
-               xfs_calc_buf_res(2 + mp->m_ialloc_blks +
-                                mp->m_in_maxlevels, 0) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_finobt_res(mp, 0, 1);
-}
-
-/*
- * When only changing the inode we log the inode and possibly the superblock.
- * We also add a bit of slop for the transaction stuff.
- */
-STATIC uint
-xfs_calc_ichange_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               xfs_calc_inode_res(mp, 1) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-
-}
-
-/*
- * Growing the data section of the filesystem.
- *     superblock
- *     agi and agf
- *     allocation btrees
- */
-STATIC uint
-xfs_calc_growdata_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Growing the rt section of the filesystem.
- * In the first set of transactions (ALLOC) we allocate space to the
- * bitmap or summary files.
- *     superblock: sector size
- *     agf of the ag from which the extent is allocated: sector size
- *     bmap btree for bitmap/summary inode: max depth * blocksize
- *     bitmap/summary inode: inode size
- *     allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
- */
-STATIC uint
-xfs_calc_growrtalloc_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
-                                XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_inode_res(mp, 1) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Growing the rt section of the filesystem.
- * In the second set of transactions (ZERO) we zero the new metadata blocks.
- *     one bitmap/summary block: blocksize
- */
-STATIC uint
-xfs_calc_growrtzero_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
-}
-
-/*
- * Growing the rt section of the filesystem.
- * In the third set of transactions (FREE) we update metadata without
- * allocating any new blocks.
- *     superblock: sector size
- *     bitmap inode: inode size
- *     summary inode: inode size
- *     one bitmap block: blocksize
- *     summary blocks: new summary size
- */
-STATIC uint
-xfs_calc_growrtfree_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-               xfs_calc_inode_res(mp, 2) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
-               xfs_calc_buf_res(1, mp->m_rsumsize);
-}
-
-/*
- * Logging the inode modification timestamp on a synchronous write.
- *     inode
- */
-STATIC uint
-xfs_calc_swrite_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_inode_res(mp, 1);
-}
-
-/*
- * Logging the inode mode bits when writing a setuid/setgid file
- *     inode
- */
-STATIC uint
-xfs_calc_writeid_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_inode_res(mp, 1);
-}
-
-/*
- * Converting the inode from non-attributed to attributed.
- *     the inode being converted: inode size
- *     agf block and superblock (for block allocation)
- *     the new block (directory sized)
- *     bmap blocks for the new directory block
- *     allocation btrees
- */
-STATIC uint
-xfs_calc_addafork_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               xfs_calc_inode_res(mp, 1) +
-               xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
-               xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
-                                XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Removing the attribute fork of a file
- *    the inode being truncated: inode size
- *    the inode's bmap btree: max depth * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- *    the agf for each of the ags: 4 * sector size
- *    the agfl for each of the ags: 4 * sector size
- *    the super block to reflect the freed blocks: sector size
- *    worst case split in allocation btrees per extent assuming 4 extents:
- *             4 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_attrinval_reservation(
-       struct xfs_mount        *mp)
-{
-       return MAX((xfs_calc_inode_res(mp, 1) +
-                   xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
-                                    XFS_FSB_TO_B(mp, 1))),
-                  (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
-                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
-                                    XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * Setting an attribute at mount time.
- *     the inode getting the attribute
- *     the superblock for allocations
- *     the agfs extents are allocated from
- *     the attribute btree * max depth
- *     the inode allocation btree
- * Since attribute transaction space is dependent on the size of the attribute,
- * the calculation is done partially at mount time and partially at runtime (see
- * below).
- */
-STATIC uint
-xfs_calc_attrsetm_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               xfs_calc_inode_res(mp, 1) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Setting an attribute at runtime, transaction space unit per block.
- *     the superblock for allocations: sector size
- *     the inode bmap btree could join or split: max depth * block size
- * Since the runtime attribute transaction space is dependent on the total
- * blocks needed for the 1st bmap, here we calculate the space unit for
- * one block so that the caller can figure out the total space according
- * to the attribute extent length in blocks by:
- *     ext * M_RES(mp)->tr_attrsetrt.tr_logres
- */
-STATIC uint
-xfs_calc_attrsetrt_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
-                                XFS_FSB_TO_B(mp, 1));
-}
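
Since the comment above leaves the final multiplication to the caller, here is a hedged sketch of that step. The per-block unit is computed from assumed geometry (512-byte sectors, 4 KiB blocks, attr-fork bmap max depth of 5, 128-byte buffer overhead), not from a real M_RES(mp) table.

#include <stdio.h>

#define SECTSZ          512u
#define BLKSZ          4096u
#define ATTR_MAXLEVELS    5u
#define BUF_OVERHEAD    128u

static unsigned int buf_res(unsigned int nbufs, unsigned int size)
{
        return nbufs * (size + BUF_OVERHEAD);
}

int main(void)
{
        /* per-block unit, mirroring xfs_calc_attrsetrt_reservation() */
        unsigned int unit = buf_res(1, SECTSZ) +
                            buf_res(ATTR_MAXLEVELS, BLKSZ);
        unsigned int ext = 3;   /* attribute extent length in fs blocks */

        /* total runtime space: ext * M_RES(mp)->tr_attrsetrt.tr_logres */
        printf("unit %u bytes, total %u bytes\n", unit, ext * unit);
        return 0;
}
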
-
-/*
- * Removing an attribute.
- *    the inode: inode size
- *    the attribute btree could join: max depth * block size
- *    the inode bmap btree could join or split: max depth * block size
- * And the bmap_finish transaction can free the attr blocks freed giving:
- *    the agf for the ag in which the blocks live: 2 * sector size
- *    the agfl for the ag in which the blocks live: 2 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_attrrm_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_inode_res(mp, 1) +
-                    xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
-                                     XFS_FSB_TO_B(mp, 1)) +
-                    (uint)XFS_FSB_TO_B(mp,
-                                       XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
-                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
-                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-                                     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * Clearing a bad agino number in an agi hash bucket.
- */
-STATIC uint
-xfs_calc_clear_agi_bucket_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
- * Clearing the quotaflags in the superblock.
- *     the super block for changing quota flags: sector size
- */
-STATIC uint
-xfs_calc_qm_sbchange_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
- * Adjusting quota limits.
- *    the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
- */
-STATIC uint
-xfs_calc_qm_setqlim_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
-}
-
-/*
- * Allocating quota on disk if needed.
- *     the write transaction log space for quota file extent allocation
- *     the unit of quota allocation: one system block size
- */
-STATIC uint
-xfs_calc_qm_dqalloc_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_write_reservation(mp) +
-               xfs_calc_buf_res(1,
-                       XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
-}
-
-/*
- * Turning off quotas.
- *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
- *    the superblock for the quota flags: sector size
- */
-STATIC uint
-xfs_calc_qm_quotaoff_reservation(
-       struct xfs_mount        *mp)
-{
-       return sizeof(struct xfs_qoff_logitem) * 2 +
-               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
- * End of turning off quotas.
- *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
- */
-STATIC uint
-xfs_calc_qm_quotaoff_end_reservation(
-       struct xfs_mount        *mp)
-{
-       return sizeof(struct xfs_qoff_logitem) * 2;
-}
-
-/*
- * Syncing the incore super block changes to disk.
- *     the super block to reflect the changes: sector size
- */
-STATIC uint
-xfs_calc_sb_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-void
-xfs_trans_resv_calc(
-       struct xfs_mount        *mp,
-       struct xfs_trans_resv   *resp)
-{
-       /*
-        * The following transactions are logged in physical format and
-        * require a permanent reservation of log space.
-        */
-       resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
-       resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
-       resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
-       resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
-       resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
-       resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT;
-       resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       resp->tr_link.tr_logres = xfs_calc_link_reservation(mp);
-       resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT;
-       resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp);
-       resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT;
-       resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp);
-       resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT;
-       resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       resp->tr_create.tr_logres = xfs_calc_create_reservation(mp);
-       resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
-       resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       resp->tr_create_tmpfile.tr_logres =
-                       xfs_calc_create_tmpfile_reservation(mp);
-       resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
-       resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
-       resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
-       resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp);
-       resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT;
-       resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp);
-       resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT;
-       resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp);
-       resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT;
-       resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp);
-       resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT;
-       resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp);
-       resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT;
-       resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp);
-       resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
-       resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
-       resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
-       resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-       /*
-        * The following transactions are logged in logical format with
-        * a default log count.
-        */
-       resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
-       resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
-       resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
-       resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
-       resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp);
-       resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
-       resp->tr_qm_equotaoff.tr_logres =
-               xfs_calc_qm_quotaoff_end_reservation(mp);
-       resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
-       resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
-       resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
-       /* The following transactions are logged in logical format */
-       resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
-       resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
-       resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
-       resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
-       resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
-       resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
-       resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
-       resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
-}
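
The function above fills the table once, at mount time, and everything else just reads it. A minimal userspace mock of that calc-once/consume-later pattern, with struct fields mirroring xfs_trans_res and all numeric values invented:

#include <stdio.h>

#define PERM_LOG_RES   0x1      /* stand-in for XFS_TRANS_PERM_LOG_RES */

struct trans_res {
        unsigned int    tr_logres;      /* log space per ticket, in bytes */
        int             tr_logcount;    /* log operations per ticket */
        int             tr_logflags;    /* permanent reservation or not */
};

struct trans_resv {
        struct trans_res tr_write;
        struct trans_res tr_sb;
};

static void resv_calc(struct trans_resv *resp)
{
        /* permanent, physically logged transaction */
        resp->tr_write.tr_logres = 232576;      /* assumed */
        resp->tr_write.tr_logcount = 2;         /* cf. XFS_WRITE_LOG_COUNT */
        resp->tr_write.tr_logflags |= PERM_LOG_RES;

        /* one-shot, logically logged transaction */
        resp->tr_sb.tr_logres = 640;            /* assumed */
        resp->tr_sb.tr_logcount = 1;            /* cf. XFS_DEFAULT_LOG_COUNT */
}

int main(void)
{
        struct trans_resv resv = { { 0, 0, 0 }, { 0, 0, 0 } };

        resv_calc(&resv);
        printf("write: %u bytes x %d ops%s\n",
               resv.tr_write.tr_logres, resv.tr_write.tr_logcount,
               (resv.tr_write.tr_logflags & PERM_LOG_RES) ? " (perm)" : "");
        return 0;
}
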
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h
deleted file mode 100644 (file)
index 1097d14..0000000
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef        __XFS_TRANS_RESV_H__
-#define        __XFS_TRANS_RESV_H__
-
-struct xfs_mount;
-
-/*
- * structure for maintaining pre-calculated transaction reservations.
- */
-struct xfs_trans_res {
-       uint    tr_logres;      /* log space unit in bytes per log ticket */
-       int     tr_logcount;    /* number of log operations per log ticket */
-       int     tr_logflags;    /* log flags, currently only used for indicating
-                                * a reservation request is permanent or not */
-};
-
-struct xfs_trans_resv {
-       struct xfs_trans_res    tr_write;       /* extent alloc trans */
-       struct xfs_trans_res    tr_itruncate;   /* truncate trans */
-       struct xfs_trans_res    tr_rename;      /* rename trans */
-       struct xfs_trans_res    tr_link;        /* link trans */
-       struct xfs_trans_res    tr_remove;      /* unlink trans */
-       struct xfs_trans_res    tr_symlink;     /* symlink trans */
-       struct xfs_trans_res    tr_create;      /* create trans */
-       struct xfs_trans_res    tr_create_tmpfile; /* create O_TMPFILE trans */
-       struct xfs_trans_res    tr_mkdir;       /* mkdir trans */
-       struct xfs_trans_res    tr_ifree;       /* inode free trans */
-       struct xfs_trans_res    tr_ichange;     /* inode update trans */
-       struct xfs_trans_res    tr_growdata;    /* fs data section grow trans */
-       struct xfs_trans_res    tr_addafork;    /* add inode attr fork trans */
-       struct xfs_trans_res    tr_writeid;     /* write setuid/setgid file */
-       struct xfs_trans_res    tr_attrinval;   /* attr fork buffer
-                                                * invalidation */
-       struct xfs_trans_res    tr_attrsetm;    /* set/create an attribute at
-                                                * mount time */
-       struct xfs_trans_res    tr_attrsetrt;   /* set/create an attribute at
-                                                * runtime */
-       struct xfs_trans_res    tr_attrrm;      /* remove an attribute */
-       struct xfs_trans_res    tr_clearagi;    /* clear agi unlinked bucket */
-       struct xfs_trans_res    tr_growrtalloc; /* grow realtime allocations */
-       struct xfs_trans_res    tr_growrtzero;  /* grow realtime zeroing */
-       struct xfs_trans_res    tr_growrtfree;  /* grow realtime freeing */
-       struct xfs_trans_res    tr_qm_sbchange; /* change quota flags */
-       struct xfs_trans_res    tr_qm_setqlim;  /* adjust quota limits */
-       struct xfs_trans_res    tr_qm_dqalloc;  /* allocate quota on disk */
-       struct xfs_trans_res    tr_qm_quotaoff; /* turn quota off */
-       struct xfs_trans_res    tr_qm_equotaoff;/* end of turn quota off */
-       struct xfs_trans_res    tr_sb;          /* modify superblock */
-       struct xfs_trans_res    tr_fsyncts;     /* update timestamps on fsync */
-};
-
-/* shorthand way of accessing reservation structure */
-#define M_RES(mp)      (&(mp)->m_resv)
-
-/*
- * Per-extent log reservation for the allocation btree changes
- * involved in freeing or allocating an extent.
- * 2 trees * (2 blocks/level * max depth - 1) * block size
- */
-#define        XFS_ALLOCFREE_LOG_RES(mp,nx) \
-       ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
-#define        XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
-       ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
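
A worked expansion of the count macro, under the assumption that XFS_AG_MAXLEVELS(mp) is 5: freeing or allocating nx = 2 extents reserves for 2 * (2 * (2*5 - 1)) = 36 btree buffers.

#include <stdio.h>

#define AG_MAXLEVELS 5  /* assumed XFS_AG_MAXLEVELS(mp) */
#define ALLOCFREE_LOG_COUNT(nx) ((nx) * (2 * (2 * AG_MAXLEVELS - 1)))

int main(void)
{
        printf("%d buffers\n", ALLOCFREE_LOG_COUNT(2));  /* 36 */
        return 0;
}
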
-
-/*
- * Per-directory log reservation for any directory change.
- * dir blocks: (1 btree block per level + data block + free block) * dblock size
- * bmap btree: (levels + 2) * max depth * block size
- * v2 directory blocks can be fragmented below the dirblksize down to the fsb
- * size, so account for that in the DAENTER macros.
- */
-#define        XFS_DIROP_LOG_RES(mp)   \
-       (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
-        (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
-#define        XFS_DIROP_LOG_COUNT(mp) \
-       (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
-        XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
-
-/*
- * Various log count values.
- */
-#define        XFS_DEFAULT_LOG_COUNT           1
-#define        XFS_DEFAULT_PERM_LOG_COUNT      2
-#define        XFS_ITRUNCATE_LOG_COUNT         2
-#define XFS_INACTIVE_LOG_COUNT         2
-#define        XFS_CREATE_LOG_COUNT            2
-#define        XFS_CREATE_TMPFILE_LOG_COUNT    2
-#define        XFS_MKDIR_LOG_COUNT             3
-#define        XFS_SYMLINK_LOG_COUNT           3
-#define        XFS_REMOVE_LOG_COUNT            2
-#define        XFS_LINK_LOG_COUNT              2
-#define        XFS_RENAME_LOG_COUNT            2
-#define        XFS_WRITE_LOG_COUNT             2
-#define        XFS_ADDAFORK_LOG_COUNT          2
-#define        XFS_ATTRINVAL_LOG_COUNT         1
-#define        XFS_ATTRSET_LOG_COUNT           3
-#define        XFS_ATTRRM_LOG_COUNT            3
-
-void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
-
-#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
deleted file mode 100644 (file)
index bf9c457..0000000
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_TRANS_SPACE_H__
-#define __XFS_TRANS_SPACE_H__
-
-/*
- * Components of space reservations.
- */
-#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)    \
-               (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
-#define        XFS_EXTENTADD_SPACE_RES(mp,w)   (XFS_BM_MAXLEVELS(mp,w) - 1)
-#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\
-       (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
-         XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
-         XFS_EXTENTADD_SPACE_RES(mp,w))
-#define        XFS_DAENTER_1B(mp,w)    \
-       ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
-#define        XFS_DAENTER_DBS(mp,w)   \
-       (XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0))
-#define        XFS_DAENTER_BLOCKS(mp,w)        \
-       (XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w))
-#define        XFS_DAENTER_BMAP1B(mp,w)        \
-       XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w)
-#define        XFS_DAENTER_BMAPS(mp,w)         \
-       (XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w))
-#define        XFS_DAENTER_SPACE_RES(mp,w)     \
-       (XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w))
-#define        XFS_DAREMOVE_SPACE_RES(mp,w)    XFS_DAENTER_BMAPS(mp,w)
-#define        XFS_DIRENTER_MAX_SPLIT(mp,nl)   1
-#define        XFS_DIRENTER_SPACE_RES(mp,nl)   \
-       (XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \
-        XFS_DIRENTER_MAX_SPLIT(mp,nl))
-#define        XFS_DIRREMOVE_SPACE_RES(mp)     \
-       XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
-#define        XFS_IALLOC_SPACE_RES(mp)        \
-       ((mp)->m_ialloc_blks + \
-        ((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \
-         ((mp)->m_in_maxlevels - 1)))
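
The NEXTENTADD macro above is a ceiling division in disguise: it rounds the new-extent count b up in chunks of XFS_MAX_CONTIG_EXTENTS_PER_BLOCK and charges XFS_EXTENTADD_SPACE_RES blocks per chunk. A sketch with invented record limits (340 max and 170 min records per block, bmap max depth of 9):

#include <stdio.h>

#define MAX_CONTIG_PER_BLOCK  (340u - 170u)  /* assumed mxr[0] - mnr[0] */
#define EXTENTADD_SPACE_RES   (9u - 1u)      /* assumed BM_MAXLEVELS - 1 */
#define NEXTENTADD_SPACE_RES(b) \
        (((b) + MAX_CONTIG_PER_BLOCK - 1) / MAX_CONTIG_PER_BLOCK * \
         EXTENTADD_SPACE_RES)

int main(void)
{
        /* 1000 new extents -> ceil(1000/170) = 6 chunks -> 48 blocks */
        printf("%u blocks\n", NEXTENTADD_SPACE_RES(1000u));
        return 0;
}
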
-
-/*
- * Space reservation values for various transactions.
- */
-#define        XFS_ADDAFORK_SPACE_RES(mp)      \
-       ((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
-#define        XFS_ATTRRM_SPACE_RES(mp)        \
-       XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
-/* This macro is not used - see inline code in xfs_attr_set */
-#define        XFS_ATTRSET_SPACE_RES(mp, v)    \
-       (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v))
-#define        XFS_CREATE_SPACE_RES(mp,nl)     \
-       (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
-#define        XFS_DIOSTRAT_SPACE_RES(mp, v)   \
-       (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
-#define        XFS_GROWFS_SPACE_RES(mp)        \
-       (2 * XFS_AG_MAXLEVELS(mp))
-#define        XFS_GROWFSRT_SPACE_RES(mp,b)    \
-       ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
-#define        XFS_LINK_SPACE_RES(mp,nl)       \
-       XFS_DIRENTER_SPACE_RES(mp,nl)
-#define        XFS_MKDIR_SPACE_RES(mp,nl)      \
-       (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
-#define        XFS_QM_DQALLOC_SPACE_RES(mp)    \
-       (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \
-        XFS_DQUOT_CLUSTER_SIZE_FSB)
-#define        XFS_QM_QINOCREATE_SPACE_RES(mp) \
-       XFS_IALLOC_SPACE_RES(mp)
-#define        XFS_REMOVE_SPACE_RES(mp)        \
-       XFS_DIRREMOVE_SPACE_RES(mp)
-#define        XFS_RENAME_SPACE_RES(mp,nl)     \
-       (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
-#define        XFS_SYMLINK_SPACE_RES(mp,nl,b)  \
-       (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
-#define XFS_IFREE_SPACE_RES(mp)                \
-       (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0)
-
-
-#endif /* __XFS_TRANS_SPACE_H__ */
index 65c6e6650b1a5e86f586d7bbbc7b352758dd842b..b79dc66b2ecd4afb89f924cb046ac5b37f1ff8d8 100644 (file)
@@ -38,43 +38,18 @@ typedef     __int32_t       xfs_tid_t;      /* transaction identifier */
 typedef        __uint32_t      xfs_dablk_t;    /* dir/attr block number (in file) */
 typedef        __uint32_t      xfs_dahash_t;   /* dir/attr hash value */
 
-/*
- * These types are 64 bits on disk but are either 32 or 64 bits in memory.
- * Disk based types:
- */
-typedef __uint64_t     xfs_dfsbno_t;   /* blockno in filesystem (agno|agbno) */
-typedef __uint64_t     xfs_drfsbno_t;  /* blockno in filesystem (raw) */
-typedef        __uint64_t      xfs_drtbno_t;   /* extent (block) in realtime area */
-typedef        __uint64_t      xfs_dfiloff_t;  /* block number in a file */
-typedef        __uint64_t      xfs_dfilblks_t; /* number of blocks in a file */
-
-/*
- * Memory based types are conditional.
- */
-#if XFS_BIG_BLKNOS
 typedef        __uint64_t      xfs_fsblock_t;  /* blockno in filesystem (agno|agbno) */
 typedef __uint64_t     xfs_rfsblock_t; /* blockno in filesystem (raw) */
 typedef __uint64_t     xfs_rtblock_t;  /* extent (block) in realtime area */
-typedef        __int64_t       xfs_srtblock_t; /* signed version of xfs_rtblock_t */
-#else
-typedef        __uint32_t      xfs_fsblock_t;  /* blockno in filesystem (agno|agbno) */
-typedef __uint32_t     xfs_rfsblock_t; /* blockno in filesystem (raw) */
-typedef __uint32_t     xfs_rtblock_t;  /* extent (block) in realtime area */
-typedef        __int32_t       xfs_srtblock_t; /* signed version of xfs_rtblock_t */
-#endif
 typedef __uint64_t     xfs_fileoff_t;  /* block number in a file */
-typedef __int64_t      xfs_sfiloff_t;  /* signed block number in a file */
 typedef __uint64_t     xfs_filblks_t;  /* number of blocks in a file */
 
+typedef        __int64_t       xfs_srtblock_t; /* signed version of xfs_rtblock_t */
+typedef __int64_t      xfs_sfiloff_t;  /* signed block number in a file */
 
 /*
  * Null values for the types.
  */
-#define        NULLDFSBNO      ((xfs_dfsbno_t)-1)
-#define        NULLDRFSBNO     ((xfs_drfsbno_t)-1)
-#define        NULLDRTBNO      ((xfs_drtbno_t)-1)
-#define        NULLDFILOFF     ((xfs_dfiloff_t)-1)
-
 #define        NULLFSBLOCK     ((xfs_fsblock_t)-1)
 #define        NULLRFSBLOCK    ((xfs_rfsblock_t)-1)
 #define        NULLRTBLOCK     ((xfs_rtblock_t)-1)
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
deleted file mode 100644 (file)
index e8a7738..0000000
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_VNODE_H__
-#define __XFS_VNODE_H__
-
-#include "xfs_fs.h"
-
-struct file;
-struct xfs_inode;
-struct attrlist_cursor_kern;
-
-/*
- * Flags for read/write calls - same values as IRIX
- */
-#define IO_ISDIRECT    0x00004         /* bypass page cache */
-#define IO_INVIS       0x00020         /* don't update inode timestamps */
-
-#define XFS_IO_FLAGS \
-       { IO_ISDIRECT,  "DIRECT" }, \
-       { IO_INVIS,     "INVIS"}
-
-/*
- * Some useful predicates.
- */
-#define VN_MAPPED(vp)  mapping_mapped(vp->i_mapping)
-#define VN_CACHED(vp)  (vp->i_mapping->nrpages)
-#define VN_DIRTY(vp)   mapping_tagged(vp->i_mapping, \
-                                       PAGECACHE_TAG_DIRTY)
-
-
-#endif /* __XFS_VNODE_H__ */
index 78ed92a46fdd3323c9bada9a35257f839285c630..93455b99804155d11c748239fde20700eea76c05 100644 (file)
@@ -49,7 +49,7 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
                value = NULL;
        }
 
-       error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
+       error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
        if (error)
                return error;
        return asize;
@@ -71,8 +71,8 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
                xflags |= ATTR_REPLACE;
 
        if (!value)
-               return -xfs_attr_remove(ip, (unsigned char *)name, xflags);
-       return -xfs_attr_set(ip, (unsigned char *)name,
+               return xfs_attr_remove(ip, (unsigned char *)name, xflags);
+       return xfs_attr_set(ip, (unsigned char *)name,
                                (void *)value, size, xflags);
 }
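
The two hunks above drop the sign-flipping at the xattr boundary. A tiny demonstration of why, with invented helper names standing in for the converted XFS internals: once a helper returns negative errnos itself, negating it again at the boundary would flip errors back to positive values.

#include <errno.h>
#include <stdio.h>

static int attr_get_old(void) { return EIO; }   /* positive internal errno */
static int attr_get_new(void) { return -EIO; }  /* negative internal errno */

int main(void)
{
        printf("old style at boundary: %d\n", -attr_get_old());  /* -5 */
        printf("new style at boundary: %d\n", attr_get_new());   /* -5 */
        return 0;
}
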
 
diff --git a/include/dt-bindings/dma/nbpfaxi.h b/include/dt-bindings/dma/nbpfaxi.h
new file mode 100644 (file)
index 0000000..c1a5b9e
--- /dev/null
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2013-2014 Renesas Electronics Europe Ltd.
+ * Author: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef DT_BINDINGS_NBPFAXI_H
+#define DT_BINDINGS_NBPFAXI_H
+
+/**
+ * Use "#dma-cells = <2>;" with the second integer defining slave DMA flags:
+ */
+#define NBPF_SLAVE_RQ_HIGH     1
+#define NBPF_SLAVE_RQ_LOW      2
+#define NBPF_SLAVE_RQ_LEVEL    4
+
+#endif
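
For completeness, a hedged sketch of how a client might build the second #dma-cells integer from these flags; whether a particular controller accepts a given combination is hardware-specific and assumed here.

#include <stdio.h>

#define NBPF_SLAVE_RQ_HIGH   1
#define NBPF_SLAVE_RQ_LOW    2
#define NBPF_SLAVE_RQ_LEVEL  4

int main(void)
{
        /* level-triggered, active-high DMA request line (assumed valid) */
        unsigned int cell = NBPF_SLAVE_RQ_HIGH | NBPF_SLAVE_RQ_LEVEL;

        printf("second dma cell: %u\n", cell);  /* 5 */
        return 0;
}
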
index 4a5b7cb56079121152b780d7a75e54f43de4f344..dccc2d4fe7de690b30589334bf615d0e814590b5 100644 (file)
@@ -24,14 +24,10 @@ struct super_block;
 struct pacct_struct;
 struct pid_namespace;
 extern int acct_parm[]; /* for sysctl */
-extern void acct_auto_close_mnt(struct vfsmount *m);
-extern void acct_auto_close(struct super_block *sb);
 extern void acct_collect(long exitcode, int group_dead);
 extern void acct_process(void);
 extern void acct_exit_ns(struct pid_namespace *);
 #else
-#define acct_auto_close_mnt(x) do { } while (0)
-#define acct_auto_close(x)     do { } while (0)
 #define acct_collect(x,y)      do { } while (0)
 #define acct_process()         do { } while (0)
 #define acct_exit_ns(ns)       do { } while (0)
index d2633ee099d975836b1fc457cd58e323bfcc8784..b39e5000ff589e6dfdc556c0b80390005fc5718b 100644 (file)
@@ -308,6 +308,7 @@ struct bio_integrity_payload {
 
        unsigned short          bip_slab;       /* slab the bip came from */
        unsigned short          bip_vcnt;       /* # of integrity bio_vecs */
+       unsigned short          bip_max_vcnt;   /* integrity bio_vec slots */
        unsigned                bip_owns_buf:1; /* should free bip_buf */
 
        struct work_struct      bip_work;       /* I/O completion */
index 8699bcf5f0999db98a8f2a2917c284d950a12d75..518b46555b80968c3d29df956f677763fe292c51 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/bsg.h>
 #include <linux/smp.h>
 #include <linux/rcupdate.h>
+#include <linux/percpu-refcount.h>
 
 #include <asm/scatterlist.h>
 
@@ -470,6 +471,7 @@ struct request_queue {
        struct mutex            sysfs_lock;
 
        int                     bypass_depth;
+       int                     mq_freeze_depth;
 
 #if defined(CONFIG_BLK_DEV_BSG)
        bsg_job_fn              *bsg_job_fn;
@@ -483,7 +485,7 @@ struct request_queue {
 #endif
        struct rcu_head         rcu_head;
        wait_queue_head_t       mq_freeze_wq;
-       struct percpu_counter   mq_usage_counter;
+       struct percpu_ref       mq_usage_counter;
        struct list_head        all_q_node;
 
        struct blk_mq_tag_set   *tag_set;
index d21f2dba07314c48dce2414c4be23d2191180c81..40ae58e3e9db67d5adbfac4c6207ee4af8b1bebc 100644 (file)
@@ -285,19 +285,9 @@ extern void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
 
 extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
                                     bool can_fail);
-extern void ceph_msg_kfree(struct ceph_msg *m);
 
-
-static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
-{
-       kref_get(&msg->kref);
-       return msg;
-}
-extern void ceph_msg_last_put(struct kref *kref);
-static inline void ceph_msg_put(struct ceph_msg *msg)
-{
-       kref_put(&msg->kref, ceph_msg_last_put);
-}
+extern struct ceph_msg *ceph_msg_get(struct ceph_msg *msg);
+extern void ceph_msg_put(struct ceph_msg *msg);
 
 extern void ceph_msg_dump(struct ceph_msg *msg);
 
index 94ec69672164c9dd84b41c1ab3a7c995a761c1fc..03aeb27fcc69d74484de4db06550abd05b891905 100644 (file)
@@ -117,7 +117,7 @@ struct ceph_osd_request {
        struct list_head r_req_lru_item;
        struct list_head r_osd_item;
        struct list_head r_linger_item;
-       struct list_head r_linger_osd;
+       struct list_head r_linger_osd_item;
        struct ceph_osd *r_osd;
        struct ceph_pg   r_pgid;
        int              r_pg_osds[CEPH_PG_MAX_SIZE];
@@ -325,22 +325,14 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 
 extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
                                         struct ceph_osd_request *req);
-extern void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
-                                               struct ceph_osd_request *req);
-
-static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
-{
-       kref_get(&req->r_kref);
-}
-extern void ceph_osdc_release_request(struct kref *kref);
-static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
-{
-       kref_put(&req->r_kref, ceph_osdc_release_request);
-}
+
+extern void ceph_osdc_get_request(struct ceph_osd_request *req);
+extern void ceph_osdc_put_request(struct ceph_osd_request *req);
 
 extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
                                   struct ceph_osd_request *req,
                                   bool nofail);
+extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
 extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
                                  struct ceph_osd_request *req);
 extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
index 3c7ec327ebd2eecc2fb63ac4f742ca0d75c56ec8..e4ae2ad48d072efcd78e3905c3f1e212130c88cc 100644 (file)
@@ -249,6 +249,7 @@ extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
 extern struct dentry *d_find_any_alias(struct inode *inode);
 extern struct dentry * d_obtain_alias(struct inode *);
+extern struct dentry * d_obtain_root(struct inode *);
 extern void shrink_dcache_sb(struct super_block *);
 extern void shrink_dcache_parent(struct dentry *);
 extern void shrink_dcache_for_umount(struct super_block *);
index 3d1c2aa515308e9484365a3bb2f1712f9adeccbb..1f9e642c66adf8a6b0f620582dcb5598fca89f88 100644 (file)
@@ -37,7 +37,6 @@
  */
 typedef s32 dma_cookie_t;
 #define DMA_MIN_COOKIE 1
-#define DMA_MAX_COOKIE INT_MAX
 
 static inline int dma_submit_error(dma_cookie_t cookie)
 {
@@ -671,7 +670,7 @@ struct dma_device {
        struct dma_async_tx_descriptor *(*device_prep_dma_cyclic)(
                struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
                size_t period_len, enum dma_transfer_direction direction,
-               unsigned long flags, void *context);
+               unsigned long flags);
        struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
                struct dma_chan *chan, struct dma_interleaved_template *xt,
                unsigned long flags);
@@ -746,7 +745,7 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_cyclic(
                unsigned long flags)
 {
        return chan->device->device_prep_dma_cyclic(chan, buf_addr, buf_len,
-                                               period_len, dir, flags, NULL);
+                                               period_len, dir, flags);
 }
 
 static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma(
index 3dbe9bd57a094b9b63175de44d6f9fd1022c322b..debb70d4054757e44e291064fd73d362e738b68b 100644 (file)
@@ -52,7 +52,7 @@
 #endif
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.4.3"
+#define REL_VERSION "8.4.5"
 #define API_VERSION 1
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 101
@@ -245,7 +245,7 @@ enum drbd_disk_state {
        D_DISKLESS,
        D_ATTACHING,      /* In the process of reading the meta-data */
        D_FAILED,         /* Becomes D_DISKLESS as soon as we told it the peer */
-                       /* when >= D_FAILED it is legal to access mdev->bc */
+                         /* when >= D_FAILED it is legal to access mdev->ldev */
        D_NEGOTIATING,    /* Late attaching state, we need to talk to the peer */
        D_INCONSISTENT,
        D_OUTDATED,
index 4193f5f2636c011686cb105c2bba428fc88be2c5..7b131ed8f9c6696cfb1ec8b470c0d77c95dff07e 100644 (file)
@@ -171,6 +171,10 @@ GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf,
        __flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT,      tentative)
        __flg_field_def(29,     DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF)
        /* 9: __u32_field_def(30,       DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */
+       /* 9: __str_field_def(31,     DRBD_GENLA_F_MANDATORY, name, SHARED_SECRET_MAX) */
+       /* 9: __u32_field(32,         DRBD_F_REQUIRED | DRBD_F_INVARIANT,     peer_node_id) */
+       __flg_field_def(33, 0 /* OPTIONAL */,   csums_after_crash_only, DRBD_CSUMS_AFTER_CRASH_ONLY_DEF)
+       __u32_field_def(34, 0 /* OPTIONAL */, sock_check_timeo, DRBD_SOCKET_CHECK_TIMEO_DEF)
 )
 
 GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms,
index 17e50bb00521362f0840eae9e79da52e2b34d139..8ac8c5d9a3ad08482bee1bcd0e5d4c57603436a7 100644 (file)
 #define DRBD_ALLOW_TWO_PRIMARIES_DEF   0
 #define DRBD_ALWAYS_ASBP_DEF   0
 #define DRBD_USE_RLE_DEF       1
+#define DRBD_CSUMS_AFTER_CRASH_ONLY_DEF 0
 
 #define DRBD_AL_STRIPES_MIN     1
 #define DRBD_AL_STRIPES_MAX     1024
 #define DRBD_AL_STRIPE_SIZE_MAX   16777216
 #define DRBD_AL_STRIPE_SIZE_DEF   32
 #define DRBD_AL_STRIPE_SIZE_SCALE 'k' /* kilobytes */
+
+#define DRBD_SOCKET_CHECK_TIMEO_MIN 0
+#define DRBD_SOCKET_CHECK_TIMEO_MAX DRBD_PING_TIMEO_MAX
+#define DRBD_SOCKET_CHECK_TIMEO_DEF 0
+#define DRBD_SOCKET_CHECK_TIMEO_SCALE '1'
 #endif
index f0890e4a7c25755c9531cb5903aa8e084043d2c8..94187721ad412c6c8da4b44e946d24ecd240beb0 100644 (file)
@@ -1275,6 +1275,7 @@ struct super_block {
 
        /* AIO completions deferred from interrupt context */
        struct workqueue_struct *s_dio_done_wq;
+       struct hlist_head s_pins;
 
        /*
         * Keep the lru lists last in the structure so they always sit on their
@@ -2360,6 +2361,7 @@ extern int do_pipe_flags(int *, int);
 
 extern int kernel_read(struct file *, loff_t, char *, unsigned long);
 extern ssize_t kernel_write(struct file *, const char *, size_t, loff_t);
+extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
 extern struct file * open_exec(const char *);
  
 /* fs/dcache.c -- generic fs support functions */
diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h
new file mode 100644 (file)
index 0000000..f66525e
--- /dev/null
@@ -0,0 +1,17 @@
+#include <linux/fs.h>
+
+struct fs_pin {
+       atomic_long_t           count;
+       union {
+               struct {
+                       struct hlist_node       s_list;
+                       struct hlist_node       m_list;
+               };
+               struct rcu_head rcu;
+       };
+       void (*kill)(struct fs_pin *);
+};
+
+void pin_put(struct fs_pin *);
+void pin_remove(struct fs_pin *);
+void pin_insert(struct fs_pin *, struct vfsmount *);
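
fs_pin.h arrives with no kerneldoc, so here is a hedged userspace sketch of the embed-and-kill pattern the structure suggests: an owner embeds the pin and points ->kill at a routine that recovers the owner with container_of(). The acct-flavoured naming is only a guess at the converted user (the acct.h hunk below removes acct_auto_close); the real API also involves the count, the s_list/m_list hooks and pin_insert()/pin_remove(), which this toy omits.

#include <stddef.h>
#include <stdio.h>

struct fs_pin {
        void (*kill)(struct fs_pin *);
};

struct acct_struct {
        int             active;
        struct fs_pin   pin;
};

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

static void acct_pin_kill(struct fs_pin *pin)
{
        struct acct_struct *acct = container_of(pin, struct acct_struct, pin);

        acct->active = 0;       /* shut down before the mount goes away */
        printf("pin killed, active=%d\n", acct->active);
}

int main(void)
{
        struct acct_struct acct = {
                .active = 1,
                .pin = { .kill = acct_pin_kill },
        };

        acct.pin.kill(&acct.pin);       /* what umount-time teardown would do */
        return 0;
}
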
index 4967916fe4ac8c6732930d768a2e60fe1bec35c7..d69f0577a319d6875ee62352717f45bbfe447a23 100644 (file)
@@ -187,7 +187,6 @@ vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio)
 }
 
 extern bool vlan_do_receive(struct sk_buff **skb);
-extern struct sk_buff *vlan_untag(struct sk_buff *skb);
 
 extern int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid);
 extern void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid);
@@ -241,11 +240,6 @@ static inline bool vlan_do_receive(struct sk_buff **skb)
        return false;
 }
 
-static inline struct sk_buff *vlan_untag(struct sk_buff *skb)
-{
-       return skb;
-}
-
 static inline int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid)
 {
        return 0;
index babaea93bca646d302ae19435e468e9ba68b0042..29ce014ab42139cef63da0c6bc78b3d1ca9dd060 100644 (file)
@@ -213,6 +213,8 @@ struct dw_mci_dma_ops {
 #define DW_MCI_QUIRK_HIGHSPEED                 BIT(2)
 /* Unreliable card detection */
 #define DW_MCI_QUIRK_BROKEN_CARD_DETECTION     BIT(3)
+/* No write protect */
+#define DW_MCI_QUIRK_NO_WRITE_PROTECT          BIT(4)
 
 /* Slot level quirks */
 /* This slot has no write protect */
index 08abe9941884ebc40667ee246881b2b2bfd66f70..09ebe57d5ce9b4a810d82b7900dea772b99abf02 100644 (file)
@@ -104,9 +104,6 @@ struct sdhci_host {
 
        const struct sdhci_ops *ops;    /* Low level hw interface */
 
-       struct regulator *vmmc;         /* Power regulator (vmmc) */
-       struct regulator *vqmmc;        /* Signaling regulator (vccq) */
-
        /* Internal data */
        struct mmc_host *mmc;   /* MMC structure */
        u64 dma_mask;           /* custom DMA mask */
index b0c1e6574e7fc7f5ff204fb39a12cfe1cb751268..9262e4bf0cc3408588adc819abc7ea9be69b88dd 100644 (file)
@@ -69,6 +69,7 @@ struct vfsmount {
 };
 
 struct file; /* forward dec */
+struct path;
 
 extern int mnt_want_write(struct vfsmount *mnt);
 extern int mnt_want_write_file(struct file *file);
@@ -77,8 +78,7 @@ extern void mnt_drop_write(struct vfsmount *mnt);
 extern void mnt_drop_write_file(struct file *file);
 extern void mntput(struct vfsmount *mnt);
 extern struct vfsmount *mntget(struct vfsmount *mnt);
-extern void mnt_pin(struct vfsmount *mnt);
-extern void mnt_unpin(struct vfsmount *mnt);
+extern struct vfsmount *mnt_clone_internal(struct path *path);
 extern int __mnt_is_readonly(struct vfsmount *mnt);
 
 struct file_system_type;
index e30f6059ecd642b44c0cc599344c0421b713958f..5180a7ededecf2797b4cb4a23c1a223f48bbe3cf 100644 (file)
@@ -52,6 +52,7 @@ struct nfs_access_entry {
        unsigned long           jiffies;
        struct rpc_cred *       cred;
        int                     mask;
+       struct rcu_head         rcu_head;
 };
 
 struct nfs_lockowner {
@@ -352,6 +353,7 @@ extern int nfs_release(struct inode *, struct file *);
 extern int nfs_attribute_timeout(struct inode *inode);
 extern int nfs_attribute_cache_expired(struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
+extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode);
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
 extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
 extern int nfs_setattr(struct dentry *, struct iattr *);
index 1150ea41b626723b67720320a23f0810a2ec2be4..922be2e050f5c938b561daf4b5533d01f1d1d0bc 100644 (file)
@@ -45,6 +45,7 @@ struct nfs_client {
        struct sockaddr_storage cl_addr;        /* server identifier */
        size_t                  cl_addrlen;
        char *                  cl_hostname;    /* hostname of server */
+       char *                  cl_acceptor;    /* GSSAPI acceptor name */
        struct list_head        cl_share_link;  /* link in global client list */
        struct list_head        cl_superblocks; /* List of nfs_server structs */
 
index 7d9096d95d4aa5f2d276f1e05383a596acd6d4c6..6ad2bbcad4050c12105778c3011b5196fcbf4b9e 100644 (file)
@@ -26,7 +26,7 @@ enum {
        PG_MAPPED,              /* page private set for buffered io */
        PG_CLEAN,               /* write succeeded */
        PG_COMMIT_TO_DS,        /* used by pnfs layouts */
-       PG_INODE_REF,           /* extra ref held by inode (head req only) */
+       PG_INODE_REF,           /* extra ref held by inode when in writeback */
        PG_HEADLOCK,            /* page group lock of wb_head */
        PG_TEARDOWN,            /* page group sync for destroy */
        PG_UNLOCKPAGE,          /* page group sync bit in read path */
@@ -62,12 +62,13 @@ struct nfs_pageio_ops {
 
 struct nfs_rw_ops {
        const fmode_t rw_mode;
-       struct nfs_rw_header *(*rw_alloc_header)(void);
-       void (*rw_free_header)(struct nfs_rw_header *);
-       void (*rw_release)(struct nfs_pgio_data *);
-       int  (*rw_done)(struct rpc_task *, struct nfs_pgio_data *, struct inode *);
-       void (*rw_result)(struct rpc_task *, struct nfs_pgio_data *);
-       void (*rw_initiate)(struct nfs_pgio_data *, struct rpc_message *,
+       struct nfs_pgio_header *(*rw_alloc_header)(void);
+       void (*rw_free_header)(struct nfs_pgio_header *);
+       void (*rw_release)(struct nfs_pgio_header *);
+       int  (*rw_done)(struct rpc_task *, struct nfs_pgio_header *,
+                       struct inode *);
+       void (*rw_result)(struct rpc_task *, struct nfs_pgio_header *);
+       void (*rw_initiate)(struct nfs_pgio_header *, struct rpc_message *,
                            struct rpc_task_setup *, int);
 };
 
@@ -111,6 +112,8 @@ extern      void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
                             int how);
 extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
                                   struct nfs_page *);
+extern  int nfs_pageio_resend(struct nfs_pageio_descriptor *,
+                             struct nfs_pgio_header *);
 extern void nfs_pageio_complete(struct nfs_pageio_descriptor *desc);
 extern void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *, pgoff_t);
 extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
@@ -119,7 +122,7 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 extern  int nfs_wait_on_request(struct nfs_page *);
 extern void nfs_unlock_request(struct nfs_page *req);
 extern void nfs_unlock_and_release_request(struct nfs_page *);
-extern void nfs_page_group_lock(struct nfs_page *);
+extern int nfs_page_group_lock(struct nfs_page *, bool);
 extern void nfs_page_group_unlock(struct nfs_page *);
 extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
 
index 9a1396e70310f92c40e6a0fe563b86b34f5e6836..0040629894dfa42084161124edb2f494df5be4ea 100644 (file)
@@ -993,6 +993,7 @@ struct nfs4_setclientid {
        unsigned int                    sc_uaddr_len;
        char                            sc_uaddr[RPCBIND_MAXUADDRLEN + 1];
        u32                             sc_cb_ident;
+       struct rpc_cred                 *sc_cred;
 };
 
 struct nfs4_setclientid_res {
@@ -1253,18 +1254,12 @@ enum {
        NFS_IOHDR_ERROR = 0,
        NFS_IOHDR_EOF,
        NFS_IOHDR_REDO,
-       NFS_IOHDR_NEED_COMMIT,
-       NFS_IOHDR_NEED_RESCHED,
 };
 
-struct nfs_pgio_data;
-
 struct nfs_pgio_header {
        struct inode            *inode;
        struct rpc_cred         *cred;
        struct list_head        pages;
-       struct nfs_pgio_data    *data;
-       atomic_t                refcnt;
        struct nfs_page         *req;
        struct nfs_writeverf    verf;           /* Used for writes */
        struct pnfs_layout_segment *lseg;
@@ -1281,28 +1276,22 @@ struct nfs_pgio_header {
        int                     error;          /* merge with pnfs_error */
        unsigned long           good_bytes;     /* boundary of good data */
        unsigned long           flags;
-};
 
-struct nfs_pgio_data {
-       struct nfs_pgio_header  *header;
+       /*
+        * rpc data
+        */
        struct rpc_task         task;
        struct nfs_fattr        fattr;
-       struct nfs_writeverf    verf;           /* Used for writes */
        struct nfs_pgio_args    args;           /* argument struct */
        struct nfs_pgio_res     res;            /* result struct */
        unsigned long           timestamp;      /* For lease renewal */
-       int (*pgio_done_cb) (struct rpc_task *task, struct nfs_pgio_data *data);
+       int (*pgio_done_cb)(struct rpc_task *, struct nfs_pgio_header *);
        __u64                   mds_offset;     /* Filelayout dense stripe */
-       struct nfs_page_array   pages;
+       struct nfs_page_array   page_array;
        struct nfs_client       *ds_clp;        /* pNFS data server */
        int                     ds_idx;         /* ds index if ds_clp is set */
 };
 
-struct nfs_rw_header {
-       struct nfs_pgio_header  header;
-       struct nfs_pgio_data    rpc_data;
-};
-
 struct nfs_mds_commit_info {
        atomic_t rpcs_out;
        unsigned long           ncommit;
@@ -1432,11 +1421,12 @@ struct nfs_rpc_ops {
                             struct nfs_pathconf *);
        int     (*set_capabilities)(struct nfs_server *, struct nfs_fh *);
        int     (*decode_dirent)(struct xdr_stream *, struct nfs_entry *, int);
-       int     (*pgio_rpc_prepare)(struct rpc_task *, struct nfs_pgio_data *);
-       void    (*read_setup)   (struct nfs_pgio_data *, struct rpc_message *);
-       int     (*read_done)  (struct rpc_task *, struct nfs_pgio_data *);
-       void    (*write_setup)  (struct nfs_pgio_data *, struct rpc_message *);
-       int     (*write_done)  (struct rpc_task *, struct nfs_pgio_data *);
+       int     (*pgio_rpc_prepare)(struct rpc_task *,
+                                   struct nfs_pgio_header *);
+       void    (*read_setup)(struct nfs_pgio_header *, struct rpc_message *);
+       int     (*read_done)(struct rpc_task *, struct nfs_pgio_header *);
+       void    (*write_setup)(struct nfs_pgio_header *, struct rpc_message *);
+       int     (*write_done)(struct rpc_task *, struct nfs_pgio_header *);
        void    (*commit_setup) (struct nfs_commit_data *, struct rpc_message *);
        void    (*commit_rpc_prepare)(struct rpc_task *, struct nfs_commit_data *);
        int     (*commit_done) (struct rpc_task *, struct nfs_commit_data *);
index 196b34c1ef4e1b3bdd54cb851110715c584b43df..6c4363b8ddc3ddba1a75d9f2396f453ae92ab5e9 100644 (file)
@@ -74,8 +74,6 @@ struct of_phandle_args {
        uint32_t args[MAX_PHANDLE_ARGS];
 };
 
-extern int of_node_add(struct device_node *node);
-
 /* initialize a node */
 extern struct kobj_type of_node_ktype;
 static inline void of_node_init(struct device_node *node)
@@ -113,6 +111,7 @@ static inline void of_node_put(struct device_node *node) { }
 extern struct device_node *of_allnodes;
 extern struct device_node *of_chosen;
 extern struct device_node *of_aliases;
+extern struct device_node *of_stdout;
 extern raw_spinlock_t devtree_lock;
 
 static inline bool of_have_populated_dt(void)
@@ -204,6 +203,7 @@ static inline unsigned long of_read_ulong(const __be32 *cell, int size)
 #define OF_DYNAMIC     1 /* node and properties were allocated via kmalloc */
 #define OF_DETACHED    2 /* node has been detached from the device tree */
 #define OF_POPULATED   3 /* device already created for the node */
+#define OF_POPULATED_BUS       4 /* of_platform_populate recursed to children of this node */
 
 #define OF_IS_DYNAMIC(x) test_bit(OF_DYNAMIC, &x->_flags)
 #define OF_MARK_DYNAMIC(x) set_bit(OF_DYNAMIC, &x->_flags)
@@ -322,6 +322,7 @@ extern int of_update_property(struct device_node *np, struct property *newprop);
 struct of_prop_reconfig {
        struct device_node      *dn;
        struct property         *prop;
+       struct property         *old_prop;
 };
 
 extern int of_reconfig_notifier_register(struct notifier_block *);
@@ -352,7 +353,7 @@ const __be32 *of_prop_next_u32(struct property *prop, const __be32 *cur,
  */
 const char *of_prop_next_string(struct property *prop, const char *cur);
 
-int of_device_is_stdout_path(struct device_node *dn);
+bool of_console_check(struct device_node *dn, char *name, int index);
 
 #else /* CONFIG_OF */
 
@@ -564,9 +565,9 @@ static inline int of_machine_is_compatible(const char *compat)
        return 0;
 }
 
-static inline int of_device_is_stdout_path(struct device_node *dn)
+static inline bool of_console_check(const struct device_node *dn, const char *name, int index)
 {
-       return 0;
+       return false;
 }
 
 static inline const __be32 *of_prop_next_u32(struct property *prop,
@@ -786,4 +787,80 @@ typedef void (*of_init_fn_1)(struct device_node *);
 #define OF_DECLARE_2(table, name, compat, fn) \
                _OF_DECLARE(table, name, compat, fn, of_init_fn_2)
 
+/**
+ * struct of_changeset_entry   - Holds a changeset entry
+ *
+ * @node:      list_head for the log list
+ * @action:    notifier action
+ * @np:                pointer to the device node affected
+ * @prop:      pointer to the property affected
+ * @old_prop:  hold a pointer to the original property
+ *
+ * Every modification of the device tree during a changeset
+ * is held in a list of of_changeset_entry structures.
+ * That way we can recover from a partial application, or we can
+ * revert the changeset.
+ */
+struct of_changeset_entry {
+       struct list_head node;
+       unsigned long action;
+       struct device_node *np;
+       struct property *prop;
+       struct property *old_prop;
+};
+
+/**
+ * struct of_changeset - changeset tracker structure
+ *
+ * @entries:   list_head for the changeset entries
+ *
+ * changesets are a convenient way to apply bulk changes to the
+ * live tree. In case of an error, changes are rolled back.
+ * Changesets live on after initial application, and if not
+ * destroyed after use, they can be reverted in a single call.
+ */
+struct of_changeset {
+       struct list_head entries;
+};
+
+#ifdef CONFIG_OF_DYNAMIC
+extern void of_changeset_init(struct of_changeset *ocs);
+extern void of_changeset_destroy(struct of_changeset *ocs);
+extern int of_changeset_apply(struct of_changeset *ocs);
+extern int of_changeset_revert(struct of_changeset *ocs);
+extern int of_changeset_action(struct of_changeset *ocs,
+               unsigned long action, struct device_node *np,
+               struct property *prop);
+
+static inline int of_changeset_attach_node(struct of_changeset *ocs,
+               struct device_node *np)
+{
+       return of_changeset_action(ocs, OF_RECONFIG_ATTACH_NODE, np, NULL);
+}
+
+static inline int of_changeset_detach_node(struct of_changeset *ocs,
+               struct device_node *np)
+{
+       return of_changeset_action(ocs, OF_RECONFIG_DETACH_NODE, np, NULL);
+}
+
+static inline int of_changeset_add_property(struct of_changeset *ocs,
+               struct device_node *np, struct property *prop)
+{
+       return of_changeset_action(ocs, OF_RECONFIG_ADD_PROPERTY, np, prop);
+}
+
+static inline int of_changeset_remove_property(struct of_changeset *ocs,
+               struct device_node *np, struct property *prop)
+{
+       return of_changeset_action(ocs, OF_RECONFIG_REMOVE_PROPERTY, np, prop);
+}
+
+static inline int of_changeset_update_property(struct of_changeset *ocs,
+               struct device_node *np, struct property *prop)
+{
+       return of_changeset_action(ocs, OF_RECONFIG_UPDATE_PROPERTY, np, prop);
+}
+#endif
+
 #endif /* _LINUX_OF_H */
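
The changeset helpers above imply a simple consumer pattern: log a set of actions, then apply them as one unit. A minimal sketch, assuming CONFIG_OF_DYNAMIC, with any required locking elided and the example_* name purely illustrative:

        static int example_attach(struct device_node *np)
        {
                struct of_changeset ocs;
                int err;

                of_changeset_init(&ocs);        /* start with an empty entry list */
                err = of_changeset_attach_node(&ocs, np);
                if (!err)
                        err = of_changeset_apply(&ocs); /* rolled back on error */
                of_changeset_destroy(&ocs);     /* free the logged entries */
                return err;
        }
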
index ae36298ba076eb5187b1fff7bffc96f8c0b6a28a..56bc026c143f2eceb3324bc20d3786ab9a3e96af 100644 (file)
@@ -41,6 +41,8 @@ extern struct dma_chan *of_dma_request_slave_channel(struct device_node *np,
                                                     const char *name);
 extern struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec,
                struct of_dma *ofdma);
+extern struct dma_chan *of_dma_xlate_by_chan_id(struct of_phandle_args *dma_spec,
+               struct of_dma *ofdma);
 #else
 static inline int of_dma_controller_register(struct device_node *np,
                struct dma_chan *(*of_dma_xlate)
@@ -66,6 +68,8 @@ static inline struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_s
        return NULL;
 }
 
+#define of_dma_xlate_by_chan_id NULL
+
 #endif
 
 #endif /* __LINUX_OF_DMA_H */
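
The new translator is designed to be handed to of_dma_controller_register() so that a single-cell dma-spec maps directly onto a channel id; the #define-to-NULL stub keeps such callers compilable without CONFIG_OF_DMA. A sketch under the assumption (not shown in this hunk) that the registration data pointer is the controller's struct dma_device:

        /* in a controller driver's probe; ddev is the driver's dma_device */
        static int example_register(struct platform_device *pdev,
                                    struct dma_device *ddev)
        {
                return of_dma_controller_register(pdev->dev.of_node,
                                                  of_dma_xlate_by_chan_id, ddev);
        }
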
index d96e1badbee05b31538df9c34b1c298a1cc2ca1e..c2b0627a23175380b3b1e5b3278c1e5663491469 100644 (file)
@@ -72,7 +72,7 @@ extern int of_platform_populate(struct device_node *root,
                                const struct of_device_id *matches,
                                const struct of_dev_auxdata *lookup,
                                struct device *parent);
-extern int of_platform_depopulate(struct device *parent);
+extern void of_platform_depopulate(struct device *parent);
 #else
 static inline int of_platform_populate(struct device_node *root,
                                        const struct of_device_id *matches,
@@ -81,10 +81,7 @@ static inline int of_platform_populate(struct device_node *root,
 {
        return -ENODEV;
 }
-static inline int of_platform_depopulate(struct device *parent)
-{
-       return -ENODEV;
-}
+static inline void of_platform_depopulate(struct device *parent) { }
 #endif
 
 #endif /* _LINUX_OF_PLATFORM_H */
index 4669ddfdd5af5b10edf5fa92ea9bde133bf0b4bf..5b5efae091350a839e5b27ff5c5b5b34406e9f29 100644 (file)
@@ -8,6 +8,7 @@ struct reserved_mem_ops;
 struct reserved_mem {
        const char                      *name;
        unsigned long                   fdt_node;
+       unsigned long                   phandle;
        const struct reserved_mem_ops   *ops;
        phys_addr_t                     base;
        phys_addr_t                     size;
@@ -27,10 +28,16 @@ typedef int (*reservedmem_of_init_fn)(struct reserved_mem *rmem);
        _OF_DECLARE(reservedmem, name, compat, init, reservedmem_of_init_fn)
 
 #ifdef CONFIG_OF_RESERVED_MEM
+void of_reserved_mem_device_init(struct device *dev);
+void of_reserved_mem_device_release(struct device *dev);
+
 void fdt_init_reserved_mem(void);
 void fdt_reserved_mem_save_node(unsigned long node, const char *uname,
                               phys_addr_t base, phys_addr_t size);
 #else
+static inline void of_reserved_mem_device_init(struct device *dev) { }
+static inline void of_reserved_mem_device_release(struct device *pdev) { }
+
 static inline void fdt_init_reserved_mem(void) { }
 static inline void fdt_reserved_mem_save_node(unsigned long node,
                const char *uname, phys_addr_t base, phys_addr_t size) { }
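
The new device hooks pair naturally around probe and remove, and the !CONFIG_OF_RESERVED_MEM stubs let callers stay unconditional. A minimal sketch (example_* names are hypothetical):

        static int example_probe(struct platform_device *pdev)
        {
                /* attach the node's reserved-memory region, if any */
                of_reserved_mem_device_init(&pdev->dev);
                return 0;
        }

        static int example_remove(struct platform_device *pdev)
        {
                of_reserved_mem_device_release(&pdev->dev);
                return 0;
        }
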
index d05542aafa3e50454c21948b0ab6b3d271922304..6a1357d3187130bfa12005b591f8973ce1a03b16 100644 (file)
@@ -40,6 +40,7 @@ enum sdma_peripheral_type {
        IMX_DMATYPE_ASRC,       /* ASRC */
        IMX_DMATYPE_ESAI,       /* ESAI */
        IMX_DMATYPE_SSI_DUAL,   /* SSI Dual FIFO */
+       IMX_DMATYPE_ASRC_SP,    /* Shared ASRC */
 };
 
 enum imx_dma_prio {
index eb8d5627d080dd5b90c4a1d6ea18faaba7233285..bdb2710e2aab0332c9cbdd6fcbe350e320ca0df7 100644 (file)
@@ -150,6 +150,8 @@ void edma_clear_event(unsigned channel);
 void edma_pause(unsigned channel);
 void edma_resume(unsigned channel);
 
+void edma_assign_channel_eventq(unsigned channel, enum dma_event_q eventq_no);
+
 struct edma_rsv_info {
 
        const s16       (*rsv_chans)[2];
index 2bf1b30cb5dcf9fe13d667a9c5da5dff2760260c..51e70cf25cbcb3476999372baeb605f0e96261dd 100644 (file)
@@ -28,6 +28,7 @@
  */
 #define OMAP_HSMMC_SUPPORTS_DUAL_VOLT          BIT(0)
 #define OMAP_HSMMC_BROKEN_MULTIBLOCK_READ      BIT(1)
+#define OMAP_HSMMC_SWAKEUP_MISSING             BIT(2)
 
 struct mmc_card;
 
index 0f3c5d38da1fb299b3ad03da4469635c375bf0ae..80d345a3524cc52b8915b79d2e8ec916131a143c 100644 (file)
@@ -390,7 +390,6 @@ struct quota_info {
        unsigned int flags;                     /* Flags for diskquotas on this device */
        struct mutex dqio_mutex;                /* lock device while I/O in progress */
        struct mutex dqonoff_mutex;             /* Serialize quotaon & quotaoff */
-       struct rw_semaphore dqptr_sem;          /* serialize ops using quota_info struct, pointers from inode to dquots */
        struct inode *files[MAXQUOTAS];         /* inodes of quotafiles */
        struct mem_dqinfo info[MAXQUOTAS];      /* Information for each quota type */
        const struct quota_format_ops *ops[MAXQUOTAS];  /* Operations for each type */
index b7b43b82231e0ae7fed55cae10c302ba4a04c105..56b97eed28a4ea819477065c8f6eda654dc32f55 100644 (file)
@@ -95,19 +95,21 @@ struct sh_dmae_pdata {
 };
 
 /* DMAOR definitions */
-#define DMAOR_AE       0x00000004
+#define DMAOR_AE       0x00000004      /* Address Error Flag */
 #define DMAOR_NMIF     0x00000002
-#define DMAOR_DME      0x00000001
+#define DMAOR_DME      0x00000001      /* DMA Master Enable */
 
 /* Definitions for the SuperH DMAC */
-#define DM_INC 0x00004000
-#define DM_DEC 0x00008000
-#define DM_FIX 0x0000c000
-#define SM_INC 0x00001000
-#define SM_DEC 0x00002000
-#define SM_FIX 0x00003000
-#define CHCR_DE        0x00000001
-#define CHCR_TE        0x00000002
-#define CHCR_IE        0x00000004
+#define DM_INC 0x00004000      /* Destination addresses are incremented */
+#define DM_DEC 0x00008000      /* Destination addresses are decremented */
+#define DM_FIX 0x0000c000      /* Destination address is fixed */
+#define SM_INC 0x00001000      /* Source addresses are incremented */
+#define SM_DEC 0x00002000      /* Source addresses are decremented */
+#define SM_FIX 0x00003000      /* Source address is fixed */
+#define RS_AUTO        0x00000400      /* Auto Request */
+#define RS_ERS 0x00000800      /* DMA extended resource selector */
+#define CHCR_DE        0x00000001      /* DMA Enable */
+#define CHCR_TE        0x00000002      /* Transfer End Flag */
+#define CHCR_IE        0x00000004      /* Interrupt Enable */
 
 #endif
index 11c270551d25dde53babf59f522a754b278f6dbf..abde271c18ae30989e6675708e70a6c9bb656300 100644 (file)
@@ -2555,6 +2555,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
 void skb_scrub_packet(struct sk_buff *skb, bool xnet);
 unsigned int skb_gso_transport_seglen(const struct sk_buff *skb);
 struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
+struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
 
 struct skb_checksum_ops {
        __wsum (*update)(const void *mem, int len, __wsum wsum);
index 790be1472792a3fc49fcf81edd7d7e9c2ab08128..8e030075fe7906bbf46db5296e0abaf12c7c6131 100644 (file)
@@ -103,6 +103,7 @@ struct rpc_auth_create_args {
 
 /* Flags for rpcauth_lookupcred() */
 #define RPCAUTH_LOOKUP_NEW             0x01    /* Accept an uninitialised cred */
+#define RPCAUTH_LOOKUP_RCU             0x02    /* lock-less lookup */
 
 /*
  * Client authentication ops
@@ -140,6 +141,7 @@ struct rpc_credops {
                                                void *, __be32 *, void *);
        int                     (*crkey_timeout)(struct rpc_cred *);
        bool                    (*crkey_to_expire)(struct rpc_cred *);
+       char *                  (*crstringify_acceptor)(struct rpc_cred *);
 };
 
 extern const struct rpc_authops        authunix_ops;
@@ -153,6 +155,7 @@ void                        rpc_destroy_generic_auth(void);
 void                   rpc_destroy_authunix(void);
 
 struct rpc_cred *      rpc_lookup_cred(void);
+struct rpc_cred *      rpc_lookup_cred_nonblock(void);
 struct rpc_cred *      rpc_lookup_machine_cred(const char *service_name);
 int                    rpcauth_register(const struct rpc_authops *);
 int                    rpcauth_unregister(const struct rpc_authops *);
@@ -182,6 +185,7 @@ void                        rpcauth_clear_credcache(struct rpc_cred_cache *);
 int                    rpcauth_key_timeout_notify(struct rpc_auth *,
                                                struct rpc_cred *);
 bool                   rpcauth_cred_key_to_expire(struct rpc_cred *);
+char *                 rpcauth_stringify_acceptor(struct rpc_cred *);
 
 static inline
 struct rpc_cred *      get_rpccred(struct rpc_cred *cred)
index f1cfd4c85cd047c4b2fadd367eeb819aabc57d29..36eebc451b416878db871f6ea1577d783d4a6296 100644 (file)
@@ -69,8 +69,9 @@ struct gss_cl_ctx {
        enum rpc_gss_proc       gc_proc;
        u32                     gc_seq;
        spinlock_t              gc_seq_lock;
-       struct gss_ctx __rcu    *gc_gss_ctx;
+       struct gss_ctx          *gc_gss_ctx;
        struct xdr_netobj       gc_wire_ctx;
+       struct xdr_netobj       gc_acceptor;
        u32                     gc_win;
        unsigned long           gc_expiry;
        struct rcu_head         gc_rcu;
index 5af2931cf58d07daf6d16d2a4deb5956f8811794..df02a41884874f68dfb2aded42a2d38658eaff06 100644 (file)
@@ -81,7 +81,7 @@ struct gss_krb5_enctype {
                       struct xdr_netobj *in,
                       struct xdr_netobj *out); /* complete key generation */
        u32 (*encrypt_v2) (struct krb5_ctx *kctx, u32 offset,
-                          struct xdr_buf *buf, int ec,
+                          struct xdr_buf *buf,
                           struct page **pages); /* v2 encryption function */
        u32 (*decrypt_v2) (struct krb5_ctx *kctx, u32 offset,
                           struct xdr_buf *buf, u32 *headskip,
@@ -310,7 +310,7 @@ gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e,
 
 u32
 gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
-                    struct xdr_buf *buf, int ec,
+                    struct xdr_buf *buf,
                     struct page **pages);
 
 u32
index c2f04e1ae15973a4e15e55d0fb9f34d086628ef7..64a0a0a97b2396492352f99fa1a1a04f225ef06a 100644 (file)
@@ -62,8 +62,6 @@
 #define RPCRDMA_INLINE_PAD_THRESH  (512)/* payload threshold to pad (bytes) */
 
 /* memory registration strategies */
-#define RPCRDMA_PERSISTENT_REGISTRATION (1)
-
 enum rpcrdma_memreg {
        RPCRDMA_BOUNCEBUFFERS = 0,
        RPCRDMA_REGISTER,
index f7e11c7ea7d9691b045e84a2274cf614307e4e97..0305cde21a74d0bd37cf8ce707d8033632182cd6 100644 (file)
@@ -158,6 +158,42 @@ struct thermal_attr {
        char name[THERMAL_NAME_LENGTH];
 };
 
+/**
+ * struct thermal_zone_device - structure for a thermal zone
+ * @id:                unique id number for each thermal zone
+ * @type:      the thermal zone device type
+ * @device:    &struct device for this thermal zone
+ * @trip_temp_attrs:   attributes for trip points for sysfs: trip temperature
+ * @trip_type_attrs:   attributes for trip points for sysfs: trip type
+ * @trip_hyst_attrs:   attributes for trip points for sysfs: trip hysteresis
+ * @devdata:   private pointer for device private data
+ * @trips:     number of trip points the thermal zone supports
+ * @passive_delay:     number of milliseconds to wait between polls when
+ *                     performing passive cooling.  Currently only used by the
+ *                     step-wise governor.
+ * @polling_delay:     number of milliseconds to wait between polls when
+ *                     checking whether trip points have been crossed (0 for
+ *                     interrupt driven systems)
+ * @temperature:       current temperature.  This is only for core code;
+ *                     drivers should use thermal_zone_get_temp() to get the
+ *                     current temperature.
+ * @last_temperature:  previous temperature read
+ * @emul_temperature:  emulated temperature when using CONFIG_THERMAL_EMULATION
+ * @passive:           1 if you've crossed a passive trip point, 0 otherwise.
+ *                     Currently only used by the step-wise governor.
+ * @forced_passive:    If > 0, temperature at which to switch on all ACPI
+ *                     processor cooling devices.  Currently only used by the
+ *                     step-wise governor.
+ * @ops:       operations this &thermal_zone_device supports
+ * @tzp:       thermal zone parameters
+ * @governor:  pointer to the governor for this thermal zone
+ * @thermal_instances: list of &struct thermal_instance of this thermal zone
+ * @idr:       &struct idr to generate unique id for this zone's cooling
+ *             devices
+ * @lock:      lock to protect thermal_instances list
+ * @node:      node in thermal_tz_list (in thermal_core.c)
+ * @poll_queue:        delayed work for polling
+ */
 struct thermal_zone_device {
        int id;
        char type[THERMAL_NAME_LENGTH];
@@ -179,12 +215,18 @@ struct thermal_zone_device {
        struct thermal_governor *governor;
        struct list_head thermal_instances;
        struct idr idr;
-       struct mutex lock; /* protect thermal_instances list */
+       struct mutex lock;
        struct list_head node;
        struct delayed_work poll_queue;
 };
 
-/* Structure that holds thermal governor information */
+/**
+ * struct thermal_governor - structure that holds thermal governor information
+ * @name:      name of the governor
+ * @throttle:  callback called for every trip point even if temperature is
+ *             below the trip point temperature
+ * @governor_list:     node in thermal_governor_list (in thermal_core.c)
+ */
 struct thermal_governor {
        char name[THERMAL_NAME_LENGTH];
        int (*throttle)(struct thermal_zone_device *tz, int trip);
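
Given the documented contract that .throttle runs for every trip point on every update, whether or not the trip temperature has been crossed, a skeletal governor reduces to the sketch below (registration lives in thermal_core.c and is assumed, not shown in this hunk):

        static int example_throttle(struct thermal_zone_device *tz, int trip)
        {
                /* called for every trip; decide here whether to act at all */
                return 0;
        }

        static struct thermal_governor example_governor = {
                .name           = "example",
                .throttle       = example_throttle,
                /* .governor_list is linked by the core at registration */
        };
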
index 09a7cffc224e2d8293d747255db856427e105925..48d64e6ab29279a5d46e57114b8272b7a2069c4f 100644 (file)
@@ -84,7 +84,7 @@ unsigned long iov_iter_alignment(const struct iov_iter *i);
 void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov,
                        unsigned long nr_segs, size_t count);
 ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
-                       size_t maxsize, size_t *start);
+                       unsigned maxpages, size_t *start);
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
                        size_t maxsize, size_t *start);
 int iov_iter_npages(const struct iov_iter *i, int maxpages);
index 25a0fbd4b998f7572b8d8d705faf761b5c20b5fe..d3204115f15d21dd7ef3d879df2393884795b037 100644 (file)
@@ -98,16 +98,16 @@ extern int vfio_external_user_iommu_id(struct vfio_group *group);
 extern long vfio_external_check_extension(struct vfio_group *group,
                                          unsigned long arg);
 
+struct pci_dev;
 #ifdef CONFIG_EEH
-extern int vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
+extern void vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
 extern void vfio_spapr_pci_eeh_release(struct pci_dev *pdev);
 extern long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
                                       unsigned int cmd,
                                       unsigned long arg);
 #else
-static inline int vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
+static inline void vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
 {
-       return 0;
 }
 
 static inline void vfio_spapr_pci_eeh_release(struct pci_dev *pdev)
index 9859355a7cf9944a0fd27726b602733b2eea243b..750e5db7c6bff10b04e7d5ee8853d244cb7fc610 100644 (file)
@@ -86,7 +86,9 @@ typedef struct sg_io_hdr
 #define SG_FLAG_MMAP_IO 4       /* request memory mapped IO */
 #define SG_FLAG_NO_DXFER 0x10000 /* no transfer of kernel buffers to/from */
                                /* user space (debug indirect IO) */
-#define SG_FLAG_Q_AT_TAIL 0x10  /* default is Q_AT_HEAD */
+/* defaults: for sg driver: Q_AT_HEAD; for block layer: Q_AT_TAIL */
+#define SG_FLAG_Q_AT_TAIL 0x10
+#define SG_FLAG_Q_AT_HEAD 0x20
 
 /* following 'info' values are "or"-ed together */
 #define SG_INFO_OK_MASK 0x1
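
With the head/tail pair now explicit, a v3 caller selects the queue position directly on its sg_io_hdr; a minimal sketch (the rest of the SG_IO setup is elided):

        static void example_queue_at_tail(struct sg_io_hdr *hdr)
        {
                /* the sg driver defaults to Q_AT_HEAD, the block layer to Q_AT_TAIL */
                hdr->flags |= SG_FLAG_Q_AT_TAIL;
        }
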
index c9c3c044b32f060b749c63381b956fceabc64f89..981acf74b14f1fdbf00fc2c7bd915c82a9b37c3c 100644 (file)
@@ -148,11 +148,13 @@ TRACE_EVENT(bcache_read,
 );
 
 TRACE_EVENT(bcache_write,
-       TP_PROTO(struct bio *bio, bool writeback, bool bypass),
-       TP_ARGS(bio, writeback, bypass),
+       TP_PROTO(struct cache_set *c, u64 inode, struct bio *bio,
+               bool writeback, bool bypass),
+       TP_ARGS(c, inode, bio, writeback, bypass),
 
        TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
+               __array(char,           uuid,   16              )
+               __field(u64,            inode                   )
                __field(sector_t,       sector                  )
                __field(unsigned int,   nr_sector               )
                __array(char,           rwbs,   6               )
@@ -161,7 +163,8 @@ TRACE_EVENT(bcache_write,
        ),
 
        TP_fast_assign(
-               __entry->dev            = bio->bi_bdev->bd_dev;
+               memcpy(__entry->uuid, c->sb.set_uuid, 16);
+               __entry->inode          = inode;
                __entry->sector         = bio->bi_iter.bi_sector;
                __entry->nr_sector      = bio->bi_iter.bi_size >> 9;
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size);
@@ -169,8 +172,8 @@ TRACE_EVENT(bcache_write,
                __entry->bypass = bypass;
        ),
 
-       TP_printk("%d,%d  %s %llu + %u hit %u bypass %u",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
+       TP_printk("%pU inode %llu  %s %llu + %u hit %u bypass %u",
+                 __entry->uuid, __entry->inode,
                  __entry->rwbs, (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->writeback, __entry->bypass)
 );
@@ -258,9 +261,9 @@ DEFINE_EVENT(btree_node, bcache_btree_node_alloc,
        TP_ARGS(b)
 );
 
-DEFINE_EVENT(btree_node, bcache_btree_node_alloc_fail,
-       TP_PROTO(struct btree *b),
-       TP_ARGS(b)
+DEFINE_EVENT(cache_set, bcache_btree_node_alloc_fail,
+       TP_PROTO(struct cache_set *c),
+       TP_ARGS(c)
 );
 
 DEFINE_EVENT(btree_node, bcache_btree_node_free,
index 7a12e1c0f371d0ce9ede8cc524b40851f2c8b30f..02986cf8b6f12c41f651f896f75b5cabe5f36925 100644 (file)
 #define BSG_SUB_PROTOCOL_SCSI_TRANSPORT        2
 
 /*
- * For flags member below
- * sg.h sg_io_hdr also has bits defined for it's flags member. However
- * none of these bits are implemented/used by bsg. The bits below are
- * allocated to not conflict with sg.h ones anyway.
+ * For flag constants below:
+ * sg.h sg_io_hdr also has bits defined for its flags member. These
+ * two flag values (0x10 and 0x20) have the same meaning in sg.h. For
+ * bsg the BSG_FLAG_Q_AT_HEAD flag is ignored since it is the default.
  */
-#define BSG_FLAG_Q_AT_TAIL 0x10 /* default, == 0 at this bit, is Q_AT_HEAD */
+#define BSG_FLAG_Q_AT_TAIL 0x10 /* default is Q_AT_HEAD */
+#define BSG_FLAG_Q_AT_HEAD 0x20
 
 struct sg_io_v4 {
        __s32 guard;            /* [i] 'Q' to differentiate from v3 */
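
The same two bit values steer queueing for the v4 interface; a sketch of requesting tail queueing on a struct sg_io_v4 (the flags field is assumed from the full header, which this hunk truncates):

        static void example_v4_queue_at_tail(struct sg_io_v4 *io)
        {
                io->guard = 'Q';        /* marks the request as v4 */
                /* Q_AT_HEAD is bsg's default, so only the tail flag changes behaviour */
                io->flags |= BSG_FLAG_Q_AT_TAIL;
        }
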
index 6d8e61c48563a41a56379643585d610cff7f096a..9ad67b2675847b237bab3ab0117a3fa449f68fd4 100644 (file)
@@ -40,6 +40,7 @@
 #define VIRTIO_BLK_F_WCE       9       /* Writeback mode enabled after reset */
 #define VIRTIO_BLK_F_TOPOLOGY  10      /* Topology information is available */
 #define VIRTIO_BLK_F_CONFIG_WCE        11      /* Writeback mode available in config */
+#define VIRTIO_BLK_F_MQ                12      /* support more than one vq */
 
 #ifndef __KERNEL__
 /* Old (deprecated) name for VIRTIO_BLK_F_WCE. */
@@ -77,6 +78,10 @@ struct virtio_blk_config {
 
        /* writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) */
        __u8 wce;
+       __u8 unused;
+
+       /* number of vqs, only available when VIRTIO_BLK_F_MQ is set */
+       __u16 num_queues;
 } __attribute__((packed));
 
 /*
index 51793520566fade30ec645d0cdd74d31f4c369ed..b4c667d22e7930968dfd65f7f6b5f6924f52f5bd 100644 (file)
@@ -59,6 +59,7 @@
 #include <asm/div64.h>
 #include <linux/blkdev.h> /* sector_div */
 #include <linux/pid_namespace.h>
+#include <linux/fs_pin.h>
 
 /*
  * These constants control the amount of freespace that suspend and
@@ -75,172 +76,190 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(struct bsd_acct_struct *acct,
-               struct pid_namespace *ns, struct file *);
+static void do_acct_process(struct bsd_acct_struct *acct);
 
-/*
- * This structure is used so that all the data protected by lock
- * can be placed in the same cache line as the lock.  This primes
- * the cache line to have the data after getting the lock.
- */
 struct bsd_acct_struct {
+       struct fs_pin           pin;
+       struct mutex            lock;
        int                     active;
        unsigned long           needcheck;
        struct file             *file;
        struct pid_namespace    *ns;
-       struct list_head        list;
+       struct work_struct      work;
+       struct completion       done;
 };
 
-static DEFINE_SPINLOCK(acct_lock);
-static LIST_HEAD(acct_list);
-
 /*
  * Check the amount of free space and suspend/resume accordingly.
  */
-static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
+static int check_free_space(struct bsd_acct_struct *acct)
 {
        struct kstatfs sbuf;
-       int res;
-       int act;
-       u64 resume;
-       u64 suspend;
-
-       spin_lock(&acct_lock);
-       res = acct->active;
-       if (!file || time_is_before_jiffies(acct->needcheck))
+
+       if (time_is_before_jiffies(acct->needcheck))
                goto out;
-       spin_unlock(&acct_lock);
 
        /* May block */
-       if (vfs_statfs(&file->f_path, &sbuf))
-               return res;
-       suspend = sbuf.f_blocks * SUSPEND;
-       resume = sbuf.f_blocks * RESUME;
-
-       do_div(suspend, 100);
-       do_div(resume, 100);
-
-       if (sbuf.f_bavail <= suspend)
-               act = -1;
-       else if (sbuf.f_bavail >= resume)
-               act = 1;
-       else
-               act = 0;
-
-       /*
-        * If some joker switched acct->file under us we'ld better be
-        * silent and _not_ touch anything.
-        */
-       spin_lock(&acct_lock);
-       if (file != acct->file) {
-               if (act)
-                       res = act > 0;
+       if (vfs_statfs(&acct->file->f_path, &sbuf))
                goto out;
-       }
 
        if (acct->active) {
-               if (act < 0) {
+               u64 suspend = sbuf.f_blocks * SUSPEND;
+               do_div(suspend, 100);
+               if (sbuf.f_bavail <= suspend) {
                        acct->active = 0;
                        pr_info("Process accounting paused\n");
                }
        } else {
-               if (act > 0) {
+               u64 resume = sbuf.f_blocks * RESUME;
+               do_div(resume, 100);
+               if (sbuf.f_bavail >= resume) {
                        acct->active = 1;
                        pr_info("Process accounting resumed\n");
                }
        }
 
        acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
-       res = acct->active;
 out:
-       spin_unlock(&acct_lock);
+       return acct->active;
+}
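
A worked example of the thresholds above, assuming the defaults from acct_parm[] (RESUME=4, SUSPEND=2): on a filesystem with f_blocks = 1000, suspend = 1000 * 2 / 100 = 20 and resume = 1000 * 4 / 100 = 40, so accounting pauses once f_bavail falls to 20 blocks and only resumes after it climbs back above 40, giving the on/off decision some hysteresis.
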
+
+static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
+{
+       struct bsd_acct_struct *res;
+again:
+       smp_rmb();
+       rcu_read_lock();
+       res = ACCESS_ONCE(ns->bacct);
+       if (!res) {
+               rcu_read_unlock();
+               return NULL;
+       }
+       if (!atomic_long_inc_not_zero(&res->pin.count)) {
+               rcu_read_unlock();
+               cpu_relax();
+               goto again;
+       }
+       rcu_read_unlock();
+       mutex_lock(&res->lock);
+       if (!res->ns) {
+               mutex_unlock(&res->lock);
+               pin_put(&res->pin);
+               goto again;
+       }
        return res;
 }
 
-/*
- * Close the old accounting file (if currently open) and then replace
- * it with file (if non-NULL).
- *
- * NOTE: acct_lock MUST be held on entry and exit.
- */
-static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
-               struct pid_namespace *ns)
+static void close_work(struct work_struct *work)
 {
-       struct file *old_acct = NULL;
-       struct pid_namespace *old_ns = NULL;
-
-       if (acct->file) {
-               old_acct = acct->file;
-               old_ns = acct->ns;
-               acct->active = 0;
-               acct->file = NULL;
+       struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
+       struct file *file = acct->file;
+       if (file->f_op->flush)
+               file->f_op->flush(file, NULL);
+       __fput_sync(file);
+       complete(&acct->done);
+}
+
+static void acct_kill(struct bsd_acct_struct *acct,
+                     struct bsd_acct_struct *new)
+{
+       if (acct) {
+               struct pid_namespace *ns = acct->ns;
+               do_acct_process(acct);
+               INIT_WORK(&acct->work, close_work);
+               init_completion(&acct->done);
+               schedule_work(&acct->work);
+               wait_for_completion(&acct->done);
+               pin_remove(&acct->pin);
+               ns->bacct = new;
                acct->ns = NULL;
-               list_del(&acct->list);
-       }
-       if (file) {
-               acct->file = file;
-               acct->ns = ns;
-               acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
-               acct->active = 1;
-               list_add(&acct->list, &acct_list);
+               atomic_long_dec(&acct->pin.count);
+               mutex_unlock(&acct->lock);
+               pin_put(&acct->pin);
        }
-       if (old_acct) {
-               mnt_unpin(old_acct->f_path.mnt);
-               spin_unlock(&acct_lock);
-               do_acct_process(acct, old_ns, old_acct);
-               filp_close(old_acct, NULL);
-               spin_lock(&acct_lock);
+}
+
+static void acct_pin_kill(struct fs_pin *pin)
+{
+       struct bsd_acct_struct *acct;
+       acct = container_of(pin, struct bsd_acct_struct, pin);
+       mutex_lock(&acct->lock);
+       if (!acct->ns) {
+               mutex_unlock(&acct->lock);
+               pin_put(pin);
+               acct = NULL;
        }
+       acct_kill(acct, NULL);
 }
 
 static int acct_on(struct filename *pathname)
 {
        struct file *file;
-       struct vfsmount *mnt;
-       struct pid_namespace *ns;
-       struct bsd_acct_struct *acct = NULL;
+       struct vfsmount *mnt, *internal;
+       struct pid_namespace *ns = task_active_pid_ns(current);
+       struct bsd_acct_struct *acct, *old;
+       int err;
+
+       acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+       if (!acct)
+               return -ENOMEM;
 
        /* Difference from BSD - they don't do O_APPEND */
        file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
-       if (IS_ERR(file))
+       if (IS_ERR(file)) {
+               kfree(acct);
                return PTR_ERR(file);
+       }
 
        if (!S_ISREG(file_inode(file)->i_mode)) {
+               kfree(acct);
                filp_close(file, NULL);
                return -EACCES;
        }
 
        if (!file->f_op->write) {
+               kfree(acct);
                filp_close(file, NULL);
                return -EIO;
        }
-
-       ns = task_active_pid_ns(current);
-       if (ns->bacct == NULL) {
-               acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
-               if (acct == NULL) {
-                       filp_close(file, NULL);
-                       return -ENOMEM;
-               }
+       internal = mnt_clone_internal(&file->f_path);
+       if (IS_ERR(internal)) {
+               kfree(acct);
+               filp_close(file, NULL);
+               return PTR_ERR(internal);
        }
-
-       spin_lock(&acct_lock);
-       if (ns->bacct == NULL) {
-               ns->bacct = acct;
-               acct = NULL;
+       err = mnt_want_write(internal);
+       if (err) {
+               mntput(internal);
+               kfree(acct);
+               filp_close(file, NULL);
+               return err;
        }
-
        mnt = file->f_path.mnt;
-       mnt_pin(mnt);
-       acct_file_reopen(ns->bacct, file, ns);
-       spin_unlock(&acct_lock);
-
-       mntput(mnt); /* it's pinned, now give up active reference */
-       kfree(acct);
-
+       file->f_path.mnt = internal;
+
+       atomic_long_set(&acct->pin.count, 1);
+       acct->pin.kill = acct_pin_kill;
+       acct->file = file;
+       acct->needcheck = jiffies;
+       acct->ns = ns;
+       mutex_init(&acct->lock);
+       mutex_lock_nested(&acct->lock, 1);      /* nobody has seen it yet */
+       pin_insert(&acct->pin, mnt);
+
+       old = acct_get(ns);
+       if (old)
+               acct_kill(old, acct);
+       else
+               ns->bacct = acct;
+       mutex_unlock(&acct->lock);
+       mnt_drop_write(mnt);
+       mntput(mnt);
        return 0;
 }
 
+static DEFINE_MUTEX(acct_on_mutex);
+
 /**
  * sys_acct - enable/disable process accounting
  * @name: file name for accounting records or NULL to shutdown accounting
@@ -264,78 +283,20 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 
                if (IS_ERR(tmp))
                        return PTR_ERR(tmp);
+               mutex_lock(&acct_on_mutex);
                error = acct_on(tmp);
+               mutex_unlock(&acct_on_mutex);
                putname(tmp);
        } else {
-               struct bsd_acct_struct *acct;
-
-               acct = task_active_pid_ns(current)->bacct;
-               if (acct == NULL)
-                       return 0;
-
-               spin_lock(&acct_lock);
-               acct_file_reopen(acct, NULL, NULL);
-               spin_unlock(&acct_lock);
+               acct_kill(acct_get(task_active_pid_ns(current)), NULL);
        }
 
        return error;
 }
 
-/**
- * acct_auto_close - turn off a filesystem's accounting if it is on
- * @m: vfsmount being shut down
- *
- * If the accounting is turned on for a file in the subtree pointed to
- * to by m, turn accounting off.  Done when m is about to die.
- */
-void acct_auto_close_mnt(struct vfsmount *m)
-{
-       struct bsd_acct_struct *acct;
-
-       spin_lock(&acct_lock);
-restart:
-       list_for_each_entry(acct, &acct_list, list)
-               if (acct->file && acct->file->f_path.mnt == m) {
-                       acct_file_reopen(acct, NULL, NULL);
-                       goto restart;
-               }
-       spin_unlock(&acct_lock);
-}
-
-/**
- * acct_auto_close - turn off a filesystem's accounting if it is on
- * @sb: super block for the filesystem
- *
- * If the accounting is turned on for a file in the filesystem pointed
- * to by sb, turn accounting off.
- */
-void acct_auto_close(struct super_block *sb)
-{
-       struct bsd_acct_struct *acct;
-
-       spin_lock(&acct_lock);
-restart:
-       list_for_each_entry(acct, &acct_list, list)
-               if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
-                       acct_file_reopen(acct, NULL, NULL);
-                       goto restart;
-               }
-       spin_unlock(&acct_lock);
-}
-
 void acct_exit_ns(struct pid_namespace *ns)
 {
-       struct bsd_acct_struct *acct = ns->bacct;
-
-       if (acct == NULL)
-               return;
-
-       spin_lock(&acct_lock);
-       if (acct->file != NULL)
-               acct_file_reopen(acct, NULL, NULL);
-       spin_unlock(&acct_lock);
-
-       kfree(acct);
+       acct_kill(acct_get(ns), NULL);
 }
 
 /*
@@ -450,38 +411,20 @@ static u32 encode_float(u64 value)
  *  do_exit() or when switching to a different output file.
  */
 
-/*
- *  do_acct_process does all actual work. Caller holds the reference to file.
- */
-static void do_acct_process(struct bsd_acct_struct *acct,
-               struct pid_namespace *ns, struct file *file)
+static void fill_ac(acct_t *ac)
 {
        struct pacct_struct *pacct = &current->signal->pacct;
-       acct_t ac;
-       mm_segment_t fs;
-       unsigned long flim;
        u64 elapsed, run_time;
        struct tty_struct *tty;
-       const struct cred *orig_cred;
-
-       /* Perform file operations on behalf of whoever enabled accounting */
-       orig_cred = override_creds(file->f_cred);
-
-       /*
-        * First check to see if there is enough free_space to continue
-        * the process accounting system.
-        */
-       if (!check_free_space(acct, file))
-               goto out;
 
        /*
         * Fill the accounting struct with the needed info as recorded
         * by the different kernel functions.
         */
-       memset(&ac, 0, sizeof(acct_t));
+       memset(ac, 0, sizeof(acct_t));
 
-       ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
-       strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
+       ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
+       strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
 
        /* calculate run_time in nsec*/
        run_time = ktime_get_ns();
@@ -489,9 +432,9 @@ static void do_acct_process(struct bsd_acct_struct *acct,
        /* convert nsec -> AHZ */
        elapsed = nsec_to_AHZ(run_time);
 #if ACCT_VERSION == 3
-       ac.ac_etime = encode_float(elapsed);
+       ac->ac_etime = encode_float(elapsed);
 #else
-       ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
+       ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
                                (unsigned long) elapsed : (unsigned long) -1l);
 #endif
 #if ACCT_VERSION == 1 || ACCT_VERSION == 2
@@ -499,18 +442,58 @@ static void do_acct_process(struct bsd_acct_struct *acct,
                /* new enlarged etime field */
                comp2_t etime = encode_comp2_t(elapsed);
 
-               ac.ac_etime_hi = etime >> 16;
-               ac.ac_etime_lo = (u16) etime;
+               ac->ac_etime_hi = etime >> 16;
+               ac->ac_etime_lo = (u16) etime;
        }
 #endif
        do_div(elapsed, AHZ);
-       ac.ac_btime = get_seconds() - elapsed;
+       ac->ac_btime = get_seconds() - elapsed;
+#if ACCT_VERSION==2
+       ac->ac_ahz = AHZ;
+#endif
+
+       spin_lock_irq(&current->sighand->siglock);
+       tty = current->signal->tty;     /* Safe as we hold the siglock */
+       ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
+       ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+       ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+       ac->ac_flag = pacct->ac_flag;
+       ac->ac_mem = encode_comp_t(pacct->ac_mem);
+       ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
+       ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
+       ac->ac_exitcode = pacct->ac_exitcode;
+       spin_unlock_irq(&current->sighand->siglock);
+}
+/*
+ *  do_acct_process does all actual work. Caller holds the reference to file.
+ */
+static void do_acct_process(struct bsd_acct_struct *acct)
+{
+       acct_t ac;
+       unsigned long flim;
+       const struct cred *orig_cred;
+       struct pid_namespace *ns = acct->ns;
+       struct file *file = acct->file;
+
+       /*
+        * Accounting records are not subject to resource limits.
+        */
+       flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+       /* Perform file operations on behalf of whoever enabled accounting */
+       orig_cred = override_creds(file->f_cred);
+
+       /*
+        * First check to see if there is enough free_space to continue
+        * the process accounting system.
+        */
+       if (!check_free_space(acct))
+               goto out;
+
+       fill_ac(&ac);
        /* we really need to bite the bullet and change layout */
        ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
        ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
-#if ACCT_VERSION == 2
-       ac.ac_ahz = AHZ;
-#endif
 #if ACCT_VERSION == 1 || ACCT_VERSION == 2
        /* backward-compatible 16 bit fields */
        ac.ac_uid16 = ac.ac_uid;
@@ -522,45 +505,18 @@ static void do_acct_process(struct bsd_acct_struct *acct,
        ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
        rcu_read_unlock();
 #endif
-
-       spin_lock_irq(&current->sighand->siglock);
-       tty = current->signal->tty;     /* Safe as we hold the siglock */
-       ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
-       ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
-       ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
-       ac.ac_flag = pacct->ac_flag;
-       ac.ac_mem = encode_comp_t(pacct->ac_mem);
-       ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
-       ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
-       ac.ac_exitcode = pacct->ac_exitcode;
-       spin_unlock_irq(&current->sighand->siglock);
-       ac.ac_io = encode_comp_t(0 /* current->io_usage */);    /* %% */
-       ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
-       ac.ac_swaps = encode_comp_t(0);
-
        /*
         * Get freeze protection. If the fs is frozen, just skip the write
         * as we could deadlock the system otherwise.
         */
-       if (!file_start_write_trylock(file))
-               goto out;
-       /*
-        * Kernel segment override to datasegment and write it
-        * to the accounting file.
-        */
-       fs = get_fs();
-       set_fs(KERNEL_DS);
-       /*
-        * Accounting records are not subject to resource limits.
-        */
-       flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
-       file->f_op->write(file, (char *)&ac,
-                              sizeof(acct_t), &file->f_pos);
-       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
-       set_fs(fs);
-       file_end_write(file);
+       if (file_start_write_trylock(file)) {
+               /* it's been opened O_APPEND, so position is irrelevant */
+               loff_t pos = 0;
+               __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
+               file_end_write(file);
+       }
 out:
+       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
        revert_creds(orig_cred);
 }
 
@@ -609,34 +565,20 @@ void acct_collect(long exitcode, int group_dead)
        spin_unlock_irq(&current->sighand->siglock);
 }
 
-static void acct_process_in_ns(struct pid_namespace *ns)
+static void slow_acct_process(struct pid_namespace *ns)
 {
-       struct file *file = NULL;
-       struct bsd_acct_struct *acct;
-
-       acct = ns->bacct;
-       /*
-        * accelerate the common fastpath:
-        */
-       if (!acct || !acct->file)
-               return;
-
-       spin_lock(&acct_lock);
-       file = acct->file;
-       if (unlikely(!file)) {
-               spin_unlock(&acct_lock);
-               return;
+       for ( ; ns; ns = ns->parent) {
+               struct bsd_acct_struct *acct = acct_get(ns);
+               if (acct) {
+                       do_acct_process(acct);
+                       mutex_unlock(&acct->lock);
+                       pin_put(&acct->pin);
+               }
        }
-       get_file(file);
-       spin_unlock(&acct_lock);
-
-       do_acct_process(acct, ns, file);
-       fput(file);
 }
 
 /**
- * acct_process - now just a wrapper around acct_process_in_ns,
- * which in turn is a wrapper around do_acct_process.
+ * acct_process
  *
  * handles process accounting for an exiting task
  */
@@ -649,6 +591,10 @@ void acct_process(void)
         * alive and holds its namespace, which in turn holds
         * its parent.
         */
-       for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
-               acct_process_in_ns(ns);
+       for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
+               if (ns->bacct)
+                       break;
+       }
+       if (unlikely(ns))
+               slow_acct_process(ns);
 }
index 4a83ecd03650157d47ca3a68b8c6273f9dff73c8..852c81e3ba9a55bf17e2e4c3fe037c39040cc0f3 100644 (file)
@@ -169,7 +169,7 @@ out_fail:
        return NULL;
 }
 
-void lc_free_by_index(struct lru_cache *lc, unsigned i)
+static void lc_free_by_index(struct lru_cache *lc, unsigned i)
 {
        void *p = lc->lc_element[i];
        WARN_ON(!p);
@@ -643,9 +643,10 @@ void lc_set(struct lru_cache *lc, unsigned int enr, int index)
  * lc_dump - Dump a complete LRU cache to seq in textual form.
  * @lc: the lru cache to operate on
  * @seq: the &struct seq_file pointer to seq_printf into
- * @utext: user supplied "heading" or other info
+ * @utext: user supplied additional "heading" or other info
  * @detail: function pointer the user may provide to dump further details
- * of the object the lc_element is embedded in.
+ * of the object the lc_element is embedded in. May be NULL.
+ * Note: a leading space ' ' and trailing newline '\n' are implied.
  */
 void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
             void (*detail) (struct seq_file *, struct lc_element *))
@@ -654,16 +655,18 @@ void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext
        struct lc_element *e;
        int i;
 
-       seq_printf(seq, "\tnn: lc_number refcnt %s\n ", utext);
+       seq_printf(seq, "\tnn: lc_number (new nr) refcnt %s\n ", utext);
        for (i = 0; i < nr_elements; i++) {
                e = lc_element_by_index(lc, i);
-               if (e->lc_number == LC_FREE) {
-                       seq_printf(seq, "\t%2d: FREE\n", i);
-               } else {
-                       seq_printf(seq, "\t%2d: %4u %4u    ", i,
-                                  e->lc_number, e->refcnt);
+               if (e->lc_number != e->lc_new_number)
+                       seq_printf(seq, "\t%5d: %6d %8d %6d ",
+                               i, e->lc_number, e->lc_new_number, e->refcnt);
+               else
+                       seq_printf(seq, "\t%5d: %6d %-8s %6d ",
+                               i, e->lc_number, "-\"-", e->refcnt);
+               if (detail)
                        detail(seq, e);
-               }
+               seq_putc(seq, '\n');
        }
 }
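
The reworked dump delegates per-object output to the optional @detail callback and now emits the surrounding whitespace itself; a sketch of a caller (example_detail is hypothetical):

        static void example_detail(struct seq_file *seq, struct lc_element *e)
        {
                /* emit only the payload: the core adds the leading space
                 * and the trailing newline */
                seq_printf(seq, "index=%u", e->lc_index);
        }

        /* in a seq_file show handler:
         *      lc_seq_dump_details(seq, lc, "extra-heading", example_detail);
         */
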
 
index f501b56ec2c6e2c3d3c662669afbffc9017797b5..90effcdf948d6c463afa817e277e7eae94b657a6 100644 (file)
@@ -2602,7 +2602,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
                 * that this differs from normal direct-io semantics, which
                 * will return -EFOO even if some bytes were written.
                 */
-               if (unlikely(status < 0) && !written) {
+               if (unlikely(status < 0)) {
                        err = status;
                        goto out;
                }
index 7b5dbd1517b5594b05d5590cae29c3eb3a1dada2..ab88dc0ea1d36a3a7e971f096e8e10c4a1646c93 100644 (file)
@@ -310,7 +310,7 @@ void iov_iter_init(struct iov_iter *i, int direction,
 EXPORT_SYMBOL(iov_iter_init);
 
 static ssize_t get_pages_iovec(struct iov_iter *i,
-                  struct page **pages, size_t maxsize,
+                  struct page **pages, unsigned maxpages,
                   size_t *start)
 {
        size_t offset = i->iov_offset;
@@ -323,10 +323,10 @@ static ssize_t get_pages_iovec(struct iov_iter *i,
        len = iov->iov_len - offset;
        if (len > i->count)
                len = i->count;
-       if (len > maxsize)
-               len = maxsize;
        addr = (unsigned long)iov->iov_base + offset;
        len += *start = addr & (PAGE_SIZE - 1);
+       if (len > maxpages * PAGE_SIZE)
+               len = maxpages * PAGE_SIZE;
        addr &= ~(PAGE_SIZE - 1);
        n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
        res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
@@ -588,15 +588,14 @@ static unsigned long alignment_bvec(const struct iov_iter *i)
 }
 
 static ssize_t get_pages_bvec(struct iov_iter *i,
-                  struct page **pages, size_t maxsize,
+                  struct page **pages, unsigned maxpages,
                   size_t *start)
 {
        const struct bio_vec *bvec = i->bvec;
        size_t len = bvec->bv_len - i->iov_offset;
        if (len > i->count)
                len = i->count;
-       if (len > maxsize)
-               len = maxsize;
+       /* can't be more than PAGE_SIZE */
        *start = bvec->bv_offset + i->iov_offset;
 
        get_page(*pages = bvec->bv_page);
@@ -712,13 +711,13 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
 EXPORT_SYMBOL(iov_iter_alignment);
 
 ssize_t iov_iter_get_pages(struct iov_iter *i,
-                  struct page **pages, size_t maxsize,
+                  struct page **pages, unsigned maxpages,
                   size_t *start)
 {
        if (i->type & ITER_BVEC)
-               return get_pages_bvec(i, pages, maxsize, start);
+               return get_pages_bvec(i, pages, maxpages, start);
        else
-               return get_pages_iovec(i, pages, maxsize, start);
+               return get_pages_iovec(i, pages, maxpages, start);
 }
 EXPORT_SYMBOL(iov_iter_get_pages);
 
index a42add14331c02171f5f3fbb30ed4e82b8d9fad3..0e5fb225007c519a27b680673160011bd74dd445 100644 (file)
@@ -2323,17 +2323,45 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
        return shmem_unlink(dir, dentry);
 }
 
+static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
+{
+       bool old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
+       bool new_is_dir = S_ISDIR(new_dentry->d_inode->i_mode);
+
+       if (old_dir != new_dir && old_is_dir != new_is_dir) {
+               if (old_is_dir) {
+                       drop_nlink(old_dir);
+                       inc_nlink(new_dir);
+               } else {
+                       drop_nlink(new_dir);
+                       inc_nlink(old_dir);
+               }
+       }
+       old_dir->i_ctime = old_dir->i_mtime =
+       new_dir->i_ctime = new_dir->i_mtime =
+       old_dentry->d_inode->i_ctime =
+       new_dentry->d_inode->i_ctime = CURRENT_TIME;
+
+       return 0;
+}
+
 /*
  * The VFS layer already does all the dentry stuff for rename,
  * we just have to decrement the usage count for the target if
  * it exists so that the VFS layer correctly free's it when it
  * gets overwritten.
  */
-static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
+static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
 {
        struct inode *inode = old_dentry->d_inode;
        int they_are_dirs = S_ISDIR(inode->i_mode);
 
+       if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+               return -EINVAL;
+
+       if (flags & RENAME_EXCHANGE)
+               return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
+
        if (!simple_empty(new_dentry))
                return -ENOTEMPTY;
 
@@ -3087,7 +3115,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
        .mkdir          = shmem_mkdir,
        .rmdir          = shmem_rmdir,
        .mknod          = shmem_mknod,
-       .rename         = shmem_rename,
+       .rename2        = shmem_rename2,
        .tmpfile        = shmem_tmpfile,
 #endif
 #ifdef CONFIG_TMPFS_XATTR
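
Switching shmem to ->rename2 exposes the new flags on tmpfs; the atomic swap implemented by shmem_exchange() is reached from userspace via renameat2(2). A sketch, assuming headers new enough to carry __NR_renameat2 and RENAME_EXCHANGE (glibc of this era has no wrapper):

        #include <fcntl.h>
        #include <unistd.h>
        #include <sys/syscall.h>
        #include <linux/fs.h>   /* RENAME_EXCHANGE */

        int main(void)
        {
                /* atomically swap two tmpfs entries in place */
                return syscall(__NR_renameat2, AT_FDCWD, "/tmp/a",
                               AT_FDCWD, "/tmp/b", RENAME_EXCHANGE);
        }
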
index 028a5c6d1f61383a27736df107b5b7c72e70b186..e4a02ef551020ebb222a34ae1188430613ab2710 100644 (file)
@@ -1,5 +1,5 @@
 config 6LOWPAN
-       bool "6LoWPAN Support"
+       tristate "6LoWPAN Support"
        depends on IPV6
        ---help---
          This enables IPv6 over Low power Wireless Personal Area Network -
index 75d427763992b1b4bb47c782ead2d2300ffb61c3..90cc2bdd406444df8c122066e0b5f23c33c8ed37 100644 (file)
@@ -112,59 +112,6 @@ __be16 vlan_dev_vlan_proto(const struct net_device *dev)
 }
 EXPORT_SYMBOL(vlan_dev_vlan_proto);
 
-static struct sk_buff *vlan_reorder_header(struct sk_buff *skb)
-{
-       if (skb_cow(skb, skb_headroom(skb)) < 0) {
-               kfree_skb(skb);
-               return NULL;
-       }
-
-       memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN);
-       skb->mac_header += VLAN_HLEN;
-       return skb;
-}
-
-struct sk_buff *vlan_untag(struct sk_buff *skb)
-{
-       struct vlan_hdr *vhdr;
-       u16 vlan_tci;
-
-       if (unlikely(vlan_tx_tag_present(skb))) {
-               /* vlan_tci is already set-up so leave this for another time */
-               return skb;
-       }
-
-       skb = skb_share_check(skb, GFP_ATOMIC);
-       if (unlikely(!skb))
-               goto err_free;
-
-       if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
-               goto err_free;
-
-       vhdr = (struct vlan_hdr *) skb->data;
-       vlan_tci = ntohs(vhdr->h_vlan_TCI);
-       __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
-
-       skb_pull_rcsum(skb, VLAN_HLEN);
-       vlan_set_encap_proto(skb, vhdr);
-
-       skb = vlan_reorder_header(skb);
-       if (unlikely(!skb))
-               goto err_free;
-
-       skb_reset_network_header(skb);
-       skb_reset_transport_header(skb);
-       skb_reset_mac_len(skb);
-
-       return skb;
-
-err_free:
-       kfree_skb(skb);
-       return NULL;
-}
-EXPORT_SYMBOL(vlan_untag);
-
-
 /*
  * vlan info and vid list
  */
index 96b66fd30f964021243b747db246d3db3574bc88..ab6bb2af1d45d51a77b93a062b8a5b59cc69b1c0 100644 (file)
@@ -20,7 +20,6 @@
 #include "originator.h"
 #include "hard-interface.h"
 #include "translation-table.h"
-#include "multicast.h"
 
 /**
  * batadv_mcast_mla_softif_get - get softif multicast listeners
index febb0f87fa37a1840d8ad470fb2491348da53bd3..e1bcd653899b4ed0a7ca8714817f9a9350eb414d 100644 (file)
@@ -181,7 +181,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
         */
        if (unlikely(!vlan_tx_tag_present(skb) &&
                     skb->protocol == proto)) {
-               skb = vlan_untag(skb);
+               skb = skb_vlan_untag(skb);
                if (unlikely(!skb))
                        return false;
        }
index 1059ed3bc2557d597cb0548962a888e544d74a67..6d69631b9f4d2bf5c667d1f76e4f5bc384cdf61c 100644 (file)
@@ -327,10 +327,7 @@ find_inlist_lock_noload(struct list_head *head, const char *name, int *error,
                char name[EBT_FUNCTION_MAXNAMELEN];
        } *e;
 
-       *error = mutex_lock_interruptible(mutex);
-       if (*error != 0)
-               return NULL;
-
+       mutex_lock(mutex);
        list_for_each_entry(e, head, list) {
                if (strcmp(e->name, name) == 0)
                        return e;
@@ -1203,10 +1200,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table)
 
        table->private = newinfo;
        rwlock_init(&table->lock);
-       ret = mutex_lock_interruptible(&ebt_mutex);
-       if (ret != 0)
-               goto free_chainstack;
-
+       mutex_lock(&ebt_mutex);
        list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) {
                if (strcmp(t->name, table->name) == 0) {
                        ret = -EEXIST;
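The same conversion recurs below in netfilter's afinfo/hook registration, nf_sockopt, the IPVS control paths, and x_tables: mutex_lock_interruptible() becomes a plain mutex_lock(), since failing a registration or lookup with -EINTR/-ERESTARTSYS gives callers nothing useful to do. The shape of the change, reduced to a sketch with illustrative names:

	#include <linux/list.h>
	#include <linux/mutex.h>

	static DEFINE_MUTEX(my_mutex);
	static LIST_HEAD(my_list);

	static int my_register(struct list_head *entry)
	{
		/* was: if (mutex_lock_interruptible(&my_mutex)) return -EINTR; */
		mutex_lock(&my_mutex);
		list_add(entry, &my_list);
		mutex_unlock(&my_mutex);
		return 0;	/* registration can no longer fail here */
	}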
index 1948d592aa54c7a1831df546702904898cd68da4..b2f571dd933dde47dd8887c392b0f13c7bfef101 100644 (file)
@@ -174,6 +174,7 @@ static struct lock_class_key socket_class;
 #define SKIP_BUF_SIZE  1024
 
 static void queue_con(struct ceph_connection *con);
+static void cancel_con(struct ceph_connection *con);
 static void con_work(struct work_struct *);
 static void con_fault(struct ceph_connection *con);
 
@@ -680,7 +681,7 @@ void ceph_con_close(struct ceph_connection *con)
 
        reset_connection(con);
        con->peer_global_seq = 0;
-       cancel_delayed_work(&con->work);
+       cancel_con(con);
        con_close_socket(con);
        mutex_unlock(&con->mutex);
 }
@@ -900,7 +901,7 @@ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor,
        BUG_ON(page_count > (int)USHRT_MAX);
        cursor->page_count = (unsigned short)page_count;
        BUG_ON(length > SIZE_MAX - cursor->page_offset);
-       cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE;
+       cursor->last_piece = cursor->page_offset + cursor->resid <= PAGE_SIZE;
 }
 
 static struct page *
@@ -2667,19 +2668,16 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
 {
        if (!con->ops->get(con)) {
                dout("%s %p ref count 0\n", __func__, con);
-
                return -ENOENT;
        }
 
        if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
                dout("%s %p - already queued\n", __func__, con);
                con->ops->put(con);
-
                return -EBUSY;
        }
 
        dout("%s %p %lu\n", __func__, con, delay);
-
        return 0;
 }
 
@@ -2688,6 +2686,14 @@ static void queue_con(struct ceph_connection *con)
        (void) queue_con_delay(con, 0);
 }
 
+static void cancel_con(struct ceph_connection *con)
+{
+       if (cancel_delayed_work(&con->work)) {
+               dout("%s %p\n", __func__, con);
+               con->ops->put(con);
+       }
+}
+
 static bool con_sock_closed(struct ceph_connection *con)
 {
        if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED))
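cancel_con() is the counterpart of queue_con_delay(): queueing a connection takes a reference via con->ops->get(), so a cancel that actually removes queued work must drop exactly one. The pattern in isolation, with hypothetical conn/conn_release names:

	#include <linux/kref.h>
	#include <linux/slab.h>
	#include <linux/workqueue.h>

	struct conn {
		struct kref kref;
		struct delayed_work work;
	};

	static void conn_release(struct kref *kref)
	{
		kfree(container_of(kref, struct conn, kref));
	}

	static void conn_cancel(struct conn *c)
	{
		/* cancel_delayed_work() returns true only if the work was
		 * still queued - exactly when a queue-time ref is held */
		if (cancel_delayed_work(&c->work))
			kref_put(&c->kref, conn_release);
	}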
@@ -3269,24 +3275,21 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
 /*
  * Free a generically kmalloc'd message.
  */
-void ceph_msg_kfree(struct ceph_msg *m)
+static void ceph_msg_free(struct ceph_msg *m)
 {
-       dout("msg_kfree %p\n", m);
+       dout("%s %p\n", __func__, m);
        ceph_kvfree(m->front.iov_base);
        kmem_cache_free(ceph_msg_cache, m);
 }
 
-/*
- * Drop a msg ref.  Destroy as needed.
- */
-void ceph_msg_last_put(struct kref *kref)
+static void ceph_msg_release(struct kref *kref)
 {
        struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
        LIST_HEAD(data);
        struct list_head *links;
        struct list_head *next;
 
-       dout("ceph_msg_put last one on %p\n", m);
+       dout("%s %p\n", __func__, m);
        WARN_ON(!list_empty(&m->list_head));
 
        /* drop middle, data, if any */
@@ -3308,9 +3311,25 @@ void ceph_msg_last_put(struct kref *kref)
        if (m->pool)
                ceph_msgpool_put(m->pool, m);
        else
-               ceph_msg_kfree(m);
+               ceph_msg_free(m);
+}
+
+struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
+{
+       dout("%s %p (was %d)\n", __func__, msg,
+            atomic_read(&msg->kref.refcount));
+       kref_get(&msg->kref);
+       return msg;
+}
+EXPORT_SYMBOL(ceph_msg_get);
+
+void ceph_msg_put(struct ceph_msg *msg)
+{
+       dout("%s %p (was %d)\n", __func__, msg,
+            atomic_read(&msg->kref.refcount));
+       kref_put(&msg->kref, ceph_msg_release);
 }
-EXPORT_SYMBOL(ceph_msg_last_put);
+EXPORT_SYMBOL(ceph_msg_put);
 
 void ceph_msg_dump(struct ceph_msg *msg)
 {
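ceph_msg_get()/ceph_msg_put() take the kref manipulation out of line, so the release function becomes static and both directions can be traced with dout(). The accessor shape in miniature (illustrative msg type, not the ceph structs):

	#include <linux/kref.h>
	#include <linux/slab.h>

	struct msg {
		struct kref kref;
	};

	static void msg_release(struct kref *kref)
	{
		/* only reachable via msg_put(), so it can stay static */
		kfree(container_of(kref, struct msg, kref));
	}

	struct msg *msg_get(struct msg *m)
	{
		kref_get(&m->kref);
		return m;	/* returning m lets callers chain the get */
	}

	void msg_put(struct msg *m)
	{
		kref_put(&m->kref, msg_release);
	}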
index 05be0c1816958b0d0db6b2c319631d41f273e3d0..30f6faf3584fb529ffdb5b6f9fae5041acf55166 100644 (file)
@@ -297,12 +297,21 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
 /*
  * requests
  */
-void ceph_osdc_release_request(struct kref *kref)
+static void ceph_osdc_release_request(struct kref *kref)
 {
-       struct ceph_osd_request *req;
+       struct ceph_osd_request *req = container_of(kref,
+                                           struct ceph_osd_request, r_kref);
        unsigned int which;
 
-       req = container_of(kref, struct ceph_osd_request, r_kref);
+       dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
+            req->r_request, req->r_reply);
+       WARN_ON(!RB_EMPTY_NODE(&req->r_node));
+       WARN_ON(!list_empty(&req->r_req_lru_item));
+       WARN_ON(!list_empty(&req->r_osd_item));
+       WARN_ON(!list_empty(&req->r_linger_item));
+       WARN_ON(!list_empty(&req->r_linger_osd_item));
+       WARN_ON(req->r_osd);
+
        if (req->r_request)
                ceph_msg_put(req->r_request);
        if (req->r_reply) {
@@ -320,7 +329,22 @@ void ceph_osdc_release_request(struct kref *kref)
                kmem_cache_free(ceph_osd_request_cache, req);
 
 }
-EXPORT_SYMBOL(ceph_osdc_release_request);
+
+void ceph_osdc_get_request(struct ceph_osd_request *req)
+{
+       dout("%s %p (was %d)\n", __func__, req,
+            atomic_read(&req->r_kref.refcount));
+       kref_get(&req->r_kref);
+}
+EXPORT_SYMBOL(ceph_osdc_get_request);
+
+void ceph_osdc_put_request(struct ceph_osd_request *req)
+{
+       dout("%s %p (was %d)\n", __func__, req,
+            atomic_read(&req->r_kref.refcount));
+       kref_put(&req->r_kref, ceph_osdc_release_request);
+}
+EXPORT_SYMBOL(ceph_osdc_put_request);
 
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
                                               struct ceph_snap_context *snapc,
@@ -364,7 +388,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
        RB_CLEAR_NODE(&req->r_node);
        INIT_LIST_HEAD(&req->r_unsafe_item);
        INIT_LIST_HEAD(&req->r_linger_item);
-       INIT_LIST_HEAD(&req->r_linger_osd);
+       INIT_LIST_HEAD(&req->r_linger_osd_item);
        INIT_LIST_HEAD(&req->r_req_lru_item);
        INIT_LIST_HEAD(&req->r_osd_item);
 
@@ -916,7 +940,7 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
         * list at the end to keep things in tid order.
         */
        list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
-                                r_linger_osd) {
+                                r_linger_osd_item) {
                /*
                 * reregister request prior to unregistering linger so
                 * that r_osd is preserved.
@@ -1008,6 +1032,8 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 {
        dout("__remove_osd %p\n", osd);
        BUG_ON(!list_empty(&osd->o_requests));
+       BUG_ON(!list_empty(&osd->o_linger_requests));
+
        rb_erase(&osd->o_node, &osdc->osds);
        list_del_init(&osd->o_osd_lru);
        ceph_con_close(&osd->o_con);
@@ -1029,12 +1055,23 @@ static void remove_all_osds(struct ceph_osd_client *osdc)
 static void __move_osd_to_lru(struct ceph_osd_client *osdc,
                              struct ceph_osd *osd)
 {
-       dout("__move_osd_to_lru %p\n", osd);
+       dout("%s %p\n", __func__, osd);
        BUG_ON(!list_empty(&osd->o_osd_lru));
+
        list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
        osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
 }
 
+static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
+                                 struct ceph_osd *osd)
+{
+       dout("%s %p\n", __func__, osd);
+
+       if (list_empty(&osd->o_requests) &&
+           list_empty(&osd->o_linger_requests))
+               __move_osd_to_lru(osdc, osd);
+}
+
 static void __remove_osd_from_lru(struct ceph_osd *osd)
 {
        dout("__remove_osd_from_lru %p\n", osd);
@@ -1175,6 +1212,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 
        dout("__unregister_request %p tid %lld\n", req, req->r_tid);
        rb_erase(&req->r_node, &osdc->requests);
+       RB_CLEAR_NODE(&req->r_node);
        osdc->num_requests--;
 
        if (req->r_osd) {
@@ -1182,12 +1220,8 @@ static void __unregister_request(struct ceph_osd_client *osdc,
                ceph_msg_revoke(req->r_request);
 
                list_del_init(&req->r_osd_item);
-               if (list_empty(&req->r_osd->o_requests) &&
-                   list_empty(&req->r_osd->o_linger_requests)) {
-                       dout("moving osd to %p lru\n", req->r_osd);
-                       __move_osd_to_lru(osdc, req->r_osd);
-               }
-               if (list_empty(&req->r_linger_item))
+               maybe_move_osd_to_lru(osdc, req->r_osd);
+               if (list_empty(&req->r_linger_osd_item))
                        req->r_osd = NULL;
        }
 
@@ -1214,45 +1248,39 @@ static void __cancel_request(struct ceph_osd_request *req)
 static void __register_linger_request(struct ceph_osd_client *osdc,
                                    struct ceph_osd_request *req)
 {
-       dout("__register_linger_request %p\n", req);
+       dout("%s %p tid %llu\n", __func__, req, req->r_tid);
+       WARN_ON(!req->r_linger);
+
        ceph_osdc_get_request(req);
        list_add_tail(&req->r_linger_item, &osdc->req_linger);
        if (req->r_osd)
-               list_add_tail(&req->r_linger_osd,
+               list_add_tail(&req->r_linger_osd_item,
                              &req->r_osd->o_linger_requests);
 }
 
 static void __unregister_linger_request(struct ceph_osd_client *osdc,
                                        struct ceph_osd_request *req)
 {
-       dout("__unregister_linger_request %p\n", req);
+       WARN_ON(!req->r_linger);
+
+       if (list_empty(&req->r_linger_item)) {
+               dout("%s %p tid %llu not registered\n", __func__, req,
+                    req->r_tid);
+               return;
+       }
+
+       dout("%s %p tid %llu\n", __func__, req, req->r_tid);
        list_del_init(&req->r_linger_item);
-       if (req->r_osd) {
-               list_del_init(&req->r_linger_osd);
 
-               if (list_empty(&req->r_osd->o_requests) &&
-                   list_empty(&req->r_osd->o_linger_requests)) {
-                       dout("moving osd to %p lru\n", req->r_osd);
-                       __move_osd_to_lru(osdc, req->r_osd);
-               }
+       if (req->r_osd) {
+               list_del_init(&req->r_linger_osd_item);
+               maybe_move_osd_to_lru(osdc, req->r_osd);
                if (list_empty(&req->r_osd_item))
                        req->r_osd = NULL;
        }
        ceph_osdc_put_request(req);
 }
 
-void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
-                                        struct ceph_osd_request *req)
-{
-       mutex_lock(&osdc->request_mutex);
-       if (req->r_linger) {
-               req->r_linger = 0;
-               __unregister_linger_request(osdc, req);
-       }
-       mutex_unlock(&osdc->request_mutex);
-}
-EXPORT_SYMBOL(ceph_osdc_unregister_linger_request);
-
 void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
                                  struct ceph_osd_request *req)
 {
@@ -2429,6 +2457,25 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
 }
 EXPORT_SYMBOL(ceph_osdc_start_request);
 
+/*
+ * Unregister a registered request.  The request is not completed (i.e.
+ * no callbacks or wakeups) - higher layers are supposed to know what
+ * they are canceling.
+ */
+void ceph_osdc_cancel_request(struct ceph_osd_request *req)
+{
+       struct ceph_osd_client *osdc = req->r_osdc;
+
+       mutex_lock(&osdc->request_mutex);
+       if (req->r_linger)
+               __unregister_linger_request(osdc, req);
+       __unregister_request(osdc, req);
+       mutex_unlock(&osdc->request_mutex);
+
+       dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid);
+}
+EXPORT_SYMBOL(ceph_osdc_cancel_request);
+
 /*
  * wait for a request to complete
  */
@@ -2437,18 +2484,18 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 {
        int rc;
 
+       dout("%s %p tid %llu\n", __func__, req, req->r_tid);
+
        rc = wait_for_completion_interruptible(&req->r_completion);
        if (rc < 0) {
-               mutex_lock(&osdc->request_mutex);
-               __cancel_request(req);
-               __unregister_request(osdc, req);
-               mutex_unlock(&osdc->request_mutex);
+               dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
+               ceph_osdc_cancel_request(req);
                complete_request(req);
-               dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
                return rc;
        }
 
-       dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
+       dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid,
+            req->r_result);
        return req->r_result;
 }
 EXPORT_SYMBOL(ceph_osdc_wait_request);
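ceph_osdc_cancel_request() gives higher layers one entry point that unhooks both the linger and regular registrations without running completion callbacks; ceph_osdc_wait_request() above is its first user. A sketch of the intended calling pattern (caller-side, illustrative error handling only):

	#include <linux/ceph/osd_client.h>

	static int submit_and_wait(struct ceph_osd_client *osdc,
				   struct ceph_osd_request *req)
	{
		int ret;

		ret = ceph_osdc_start_request(osdc, req, false);
		if (ret)
			return ret;

		ret = wait_for_completion_interruptible(&req->r_completion);
		if (ret < 0) {
			/* no callbacks fire; the caller owns the cleanup */
			ceph_osdc_cancel_request(req);
			return ret;
		}
		return req->r_result;
	}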
index 1c15b189c52b24e5f2d63172299e555387c2b3d9..b65a5051361f2dea31a0fac078b3dd656e126cc8 100644 (file)
@@ -3602,7 +3602,7 @@ another_round:
 
        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
-               skb = vlan_untag(skb);
+               skb = skb_vlan_untag(skb);
                if (unlikely(!skb))
                        goto unlock;
        }
index 8d39071f32d76a41d10c5b48e06f4cf59dca1ee5..f0493e3b7471099f0245b0ea0c4b52de4a03034f 100644 (file)
@@ -804,7 +804,8 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
                        (nla_total_size(sizeof(struct ifla_vf_mac)) +
                         nla_total_size(sizeof(struct ifla_vf_vlan)) +
                         nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
-                        nla_total_size(sizeof(struct ifla_vf_rate)));
+                        nla_total_size(sizeof(struct ifla_vf_rate)) +
+                        nla_total_size(sizeof(struct ifla_vf_link_state)));
                return size;
        } else
                return 0;
index 224506a6fa80369b0061d40760f3d2c74765c2cd..163b673f9e62d212230abd1c9b848c35ba923a0d 100644 (file)
@@ -62,6 +62,7 @@
 #include <linux/scatterlist.h>
 #include <linux/errqueue.h>
 #include <linux/prefetch.h>
+#include <linux/if_vlan.h>
 
 #include <net/protocol.h>
 #include <net/dst.h>
@@ -3973,3 +3974,55 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
        return shinfo->gso_size;
 }
 EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
+
+static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
+{
+       if (skb_cow(skb, skb_headroom(skb)) < 0) {
+               kfree_skb(skb);
+               return NULL;
+       }
+
+       memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN);
+       skb->mac_header += VLAN_HLEN;
+       return skb;
+}
+
+struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
+{
+       struct vlan_hdr *vhdr;
+       u16 vlan_tci;
+
+       if (unlikely(vlan_tx_tag_present(skb))) {
+               /* vlan_tci is already set-up so leave this for another time */
+               return skb;
+       }
+
+       skb = skb_share_check(skb, GFP_ATOMIC);
+       if (unlikely(!skb))
+               goto err_free;
+
+       if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
+               goto err_free;
+
+       vhdr = (struct vlan_hdr *)skb->data;
+       vlan_tci = ntohs(vhdr->h_vlan_TCI);
+       __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
+
+       skb_pull_rcsum(skb, VLAN_HLEN);
+       vlan_set_encap_proto(skb, vhdr);
+
+       skb = skb_reorder_vlan_header(skb);
+       if (unlikely(!skb))
+               goto err_free;
+
+       skb_reset_network_header(skb);
+       skb_reset_transport_header(skb);
+       skb_reset_mac_len(skb);
+
+       return skb;
+
+err_free:
+       kfree_skb(skb);
+       return NULL;
+}
+EXPORT_SYMBOL(skb_vlan_untag);
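With the helper renamed and moved into net/core/skbuff.c, callers outside the 8021q module keep the same contract seen in the bridge and dev.c hunks above: the skb is consumed on failure, so a NULL return means hands off. Condensed call-site sketch:

	#include <linux/if_ether.h>
	#include <linux/if_vlan.h>
	#include <linux/skbuff.h>

	static int rx_one(struct sk_buff *skb)
	{
		if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
		    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
			skb = skb_vlan_untag(skb);
			if (unlikely(!skb))
				return -ENOMEM;	/* skb already freed */
		}
		/* ... continue with the untagged skb ... */
		return 0;
	}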
index 190199851c9abbd0017f8e6c7134283505f6ceff..eaa4b000c7b443898be7c5ce36f4da4eb20158a6 100644 (file)
@@ -1798,8 +1798,6 @@ local_input:
 no_route:
        RT_CACHE_STAT_INC(in_no_route);
        res.type = RTN_UNREACHABLE;
-       if (err == -ESRCH)
-               err = -ENETUNREACH;
        goto local_input;
 
        /*
index 1fbab0cdd302bdafe199d434fc10f2f6401ef6a8..a93c97f106d4a5022cd0e1196ee70ecdfd8c2873 100644 (file)
@@ -35,11 +35,7 @@ EXPORT_SYMBOL_GPL(nf_ipv6_ops);
 
 int nf_register_afinfo(const struct nf_afinfo *afinfo)
 {
-       int err;
-
-       err = mutex_lock_interruptible(&afinfo_mutex);
-       if (err < 0)
-               return err;
+       mutex_lock(&afinfo_mutex);
        RCU_INIT_POINTER(nf_afinfo[afinfo->family], afinfo);
        mutex_unlock(&afinfo_mutex);
        return 0;
@@ -68,11 +64,8 @@ static DEFINE_MUTEX(nf_hook_mutex);
 int nf_register_hook(struct nf_hook_ops *reg)
 {
        struct nf_hook_ops *elem;
-       int err;
 
-       err = mutex_lock_interruptible(&nf_hook_mutex);
-       if (err < 0)
-               return err;
+       mutex_lock(&nf_hook_mutex);
        list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) {
                if (reg->priority < elem->priority)
                        break;
index 8416307fdd1d431e5f306efdedaf8682c36fa62b..fd3f444a4f964428ae7290ba337384cf08d6028a 100644 (file)
@@ -2271,10 +2271,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
            cmd == IP_VS_SO_SET_STOPDAEMON) {
                struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
 
-               if (mutex_lock_interruptible(&ipvs->sync_mutex)) {
-                       ret = -ERESTARTSYS;
-                       goto out_dec;
-               }
+               mutex_lock(&ipvs->sync_mutex);
                if (cmd == IP_VS_SO_SET_STARTDAEMON)
                        ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
                                                dm->syncid);
@@ -2284,11 +2281,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
                goto out_dec;
        }
 
-       if (mutex_lock_interruptible(&__ip_vs_mutex)) {
-               ret = -ERESTARTSYS;
-               goto out_dec;
-       }
-
+       mutex_lock(&__ip_vs_mutex);
        if (cmd == IP_VS_SO_SET_FLUSH) {
                /* Flush the virtual service */
                ret = ip_vs_flush(net, false);
@@ -2573,9 +2566,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
                struct ip_vs_daemon_user d[2];
 
                memset(&d, 0, sizeof(d));
-               if (mutex_lock_interruptible(&ipvs->sync_mutex))
-                       return -ERESTARTSYS;
-
+               mutex_lock(&ipvs->sync_mutex);
                if (ipvs->sync_state & IP_VS_STATE_MASTER) {
                        d[0].state = IP_VS_STATE_MASTER;
                        strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
@@ -2594,9 +2585,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
                return ret;
        }
 
-       if (mutex_lock_interruptible(&__ip_vs_mutex))
-               return -ERESTARTSYS;
-
+       mutex_lock(&__ip_vs_mutex);
        switch (cmd) {
        case IP_VS_SO_GET_VERSION:
        {
index f042ae521557b340be5252aa41ccdc67f5202a29..c68c1e58b3628930495c5fe5a24f00f15adc1537 100644 (file)
@@ -26,9 +26,7 @@ int nf_register_sockopt(struct nf_sockopt_ops *reg)
        struct nf_sockopt_ops *ops;
        int ret = 0;
 
-       if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0)
-               return -EINTR;
-
+       mutex_lock(&nf_sockopt_mutex);
        list_for_each_entry(ops, &nf_sockopts, list) {
                if (ops->pf == reg->pf
                    && (overlap(ops->set_optmin, ops->set_optmax,
@@ -65,9 +63,7 @@ static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf,
 {
        struct nf_sockopt_ops *ops;
 
-       if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0)
-               return ERR_PTR(-EINTR);
-
+       mutex_lock(&nf_sockopt_mutex);
        list_for_each_entry(ops, &nf_sockopts, list) {
                if (ops->pf == pf) {
                        if (!try_module_get(ops->owner))
index b8035c2d6667de98da7d31403b4c51a9b964ccee..deeb95fb702833ac9d2dc9ec39d57778e31a570c 100644 (file)
@@ -899,6 +899,9 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
 static void nft_chain_stats_replace(struct nft_base_chain *chain,
                                    struct nft_stats __percpu *newstats)
 {
+       if (newstats == NULL)
+               return;
+
        if (chain->stats) {
                struct nft_stats __percpu *oldstats =
                                nft_dereference(chain->stats);
@@ -3134,16 +3137,13 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
                goto err2;
 
        trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
-       if (trans == NULL)
+       if (trans == NULL) {
+               err = -ENOMEM;
                goto err2;
+       }
 
        nft_trans_elem(trans) = elem;
        list_add_tail(&trans->list, &ctx->net->nft.commit_list);
-
-       nft_data_uninit(&elem.key, NFT_DATA_VALUE);
-       if (set->flags & NFT_SET_MAP)
-               nft_data_uninit(&elem.data, set->dtype);
-
        return 0;
 err2:
        nft_data_uninit(&elem.key, desc.type);
@@ -3310,7 +3310,7 @@ static int nf_tables_commit(struct sk_buff *skb)
 {
        struct net *net = sock_net(skb->sk);
        struct nft_trans *trans, *next;
-       struct nft_set *set;
+       struct nft_trans_elem *te;
 
        /* Bump generation counter, invalidate any dump in progress */
        while (++net->nft.base_seq == 0);
@@ -3396,13 +3396,17 @@ static int nf_tables_commit(struct sk_buff *skb)
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_DELSETELEM:
-                       nf_tables_setelem_notify(&trans->ctx,
-                                                nft_trans_elem_set(trans),
-                                                &nft_trans_elem(trans),
+                       te = (struct nft_trans_elem *)trans->data;
+                       nf_tables_setelem_notify(&trans->ctx, te->set,
+                                                &te->elem,
                                                 NFT_MSG_DELSETELEM, 0);
-                       set = nft_trans_elem_set(trans);
-                       set->ops->get(set, &nft_trans_elem(trans));
-                       set->ops->remove(set, &nft_trans_elem(trans));
+                       te->set->ops->get(te->set, &te->elem);
+                       te->set->ops->remove(te->set, &te->elem);
+                       nft_data_uninit(&te->elem.key, NFT_DATA_VALUE);
+                       if (te->elem.flags & NFT_SET_MAP) {
+                               nft_data_uninit(&te->elem.data,
+                                               te->set->dtype);
+                       }
                        nft_trans_destroy(trans);
                        break;
                }
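Releasing the element's key/data in the commit phase rather than in nft_del_setelem() keeps them alive until the notification and the ->remove() call have used them, and until an abort can no longer roll the deletion back. The two-phase shape as a self-contained sketch (illustrative names, not nf_tables APIs):

	#include <stdlib.h>

	struct elem { char *key; };
	struct del_req { struct elem *e; struct del_req *next; };

	static struct del_req *pending;

	static int request_delete(struct elem *e)
	{
		struct del_req *r = malloc(sizeof(*r));

		if (!r)
			return -1;
		r->e = e;
		r->next = pending;
		pending = r;
		return 0;		/* key must stay allocated here */
	}

	static void commit_all(void (*notify)(struct elem *))
	{
		while (pending) {
			struct del_req *r = pending;

			pending = r->next;
			notify(r->e);		/* still needs key */
			free(r->e->key);	/* release only at commit */
			free(r);
		}
	}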
index 47b978bc310039626232525ee08849df56e909ca..272ae4d6fdf4f1dcb27eb2bc6e68414a25b63363 100644 (file)
@@ -71,18 +71,14 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
 static const unsigned int xt_jumpstack_multiplier = 2;
 
 /* Registration hooks for targets. */
-int
-xt_register_target(struct xt_target *target)
+int xt_register_target(struct xt_target *target)
 {
        u_int8_t af = target->family;
-       int ret;
 
-       ret = mutex_lock_interruptible(&xt[af].mutex);
-       if (ret != 0)
-               return ret;
+       mutex_lock(&xt[af].mutex);
        list_add(&target->list, &xt[af].target);
        mutex_unlock(&xt[af].mutex);
-       return ret;
+       return 0;
 }
 EXPORT_SYMBOL(xt_register_target);
 
@@ -125,20 +121,14 @@ xt_unregister_targets(struct xt_target *target, unsigned int n)
 }
 EXPORT_SYMBOL(xt_unregister_targets);
 
-int
-xt_register_match(struct xt_match *match)
+int xt_register_match(struct xt_match *match)
 {
        u_int8_t af = match->family;
-       int ret;
-
-       ret = mutex_lock_interruptible(&xt[af].mutex);
-       if (ret != 0)
-               return ret;
 
+       mutex_lock(&xt[af].mutex);
        list_add(&match->list, &xt[af].match);
        mutex_unlock(&xt[af].mutex);
-
-       return ret;
+       return 0;
 }
 EXPORT_SYMBOL(xt_register_match);
 
@@ -194,9 +184,7 @@ struct xt_match *xt_find_match(u8 af, const char *name, u8 revision)
        struct xt_match *m;
        int err = -ENOENT;
 
-       if (mutex_lock_interruptible(&xt[af].mutex) != 0)
-               return ERR_PTR(-EINTR);
-
+       mutex_lock(&xt[af].mutex);
        list_for_each_entry(m, &xt[af].match, list) {
                if (strcmp(m->name, name) == 0) {
                        if (m->revision == revision) {
@@ -239,9 +227,7 @@ struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
        struct xt_target *t;
        int err = -ENOENT;
 
-       if (mutex_lock_interruptible(&xt[af].mutex) != 0)
-               return ERR_PTR(-EINTR);
-
+       mutex_lock(&xt[af].mutex);
        list_for_each_entry(t, &xt[af].target, list) {
                if (strcmp(t->name, name) == 0) {
                        if (t->revision == revision) {
@@ -323,10 +309,7 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target,
 {
        int have_rev, best = -1;
 
-       if (mutex_lock_interruptible(&xt[af].mutex) != 0) {
-               *err = -EINTR;
-               return 1;
-       }
+       mutex_lock(&xt[af].mutex);
        if (target == 1)
                have_rev = target_revfn(af, name, revision, &best);
        else
@@ -732,9 +715,7 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 {
        struct xt_table *t;
 
-       if (mutex_lock_interruptible(&xt[af].mutex) != 0)
-               return ERR_PTR(-EINTR);
-
+       mutex_lock(&xt[af].mutex);
        list_for_each_entry(t, &net->xt.tables[af], list)
                if (strcmp(t->name, name) == 0 && try_module_get(t->me))
                        return t;
@@ -883,10 +864,7 @@ struct xt_table *xt_register_table(struct net *net,
                goto out;
        }
 
-       ret = mutex_lock_interruptible(&xt[table->af].mutex);
-       if (ret != 0)
-               goto out_free;
-
+       mutex_lock(&xt[table->af].mutex);
        /* Don't autoload: we'd eat our tail... */
        list_for_each_entry(t, &net->xt.tables[table->af], list) {
                if (strcmp(t->name, table->name) == 0) {
@@ -911,9 +889,8 @@ struct xt_table *xt_register_table(struct net *net,
        mutex_unlock(&xt[table->af].mutex);
        return table;
 
- unlock:
+unlock:
        mutex_unlock(&xt[table->af].mutex);
-out_free:
        kfree(table);
 out:
        return ERR_PTR(ret);
index a324b4b34c909094f1b1aac14e986e3777bccd5d..2e152e5f218660e94ce2bed0d45925d9676e9bec 100644 (file)
@@ -213,7 +213,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
                nskb->protocol = htons((u16) sk->sk_protocol);
                nskb->pkt_type = netlink_is_kernel(sk) ?
                                 PACKET_KERNEL : PACKET_USER;
-
+               skb_reset_network_header(nskb);
                ret = dev_queue_xmit(nskb);
                if (unlikely(ret > 0))
                        ret = net_xmit_errno(ret);
index 7ad3f029baae50f86b87f36721dedf9ebf11e54a..7228ec3faf19cdc02a685caa04ccbf04ddf8005a 100644 (file)
@@ -47,8 +47,6 @@
 #include <linux/openvswitch.h>
 #include <linux/rculist.h>
 #include <linux/dmi.h>
-#include <linux/genetlink.h>
-#include <net/genetlink.h>
 #include <net/genetlink.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
index a622ad64acd8686a9baa3ad11ffea8ea91bb6731..2e0a6f92e563d7942b3bf7fa17b43e3f44dc9355 100644 (file)
@@ -176,7 +176,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
        len = (buf + buflen) - delim - 1;
        p = kstrndup(delim + 1, len, GFP_KERNEL);
        if (p) {
-               unsigned long scope_id = 0;
+               u32 scope_id = 0;
                struct net_device *dev;
 
                dev = dev_get_by_name(net, p);
@@ -184,7 +184,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
                        scope_id = dev->ifindex;
                        dev_put(dev);
                } else {
-                       if (strict_strtoul(p, 10, &scope_id) == 0) {
+                       if (kstrtou32(p, 10, &scope_id) == 0) {
                                kfree(p);
                                return 0;
                        }
@@ -304,7 +304,7 @@ char *rpc_sockaddr2uaddr(const struct sockaddr *sap, gfp_t gfp_flags)
  * @sap: buffer into which to plant socket address
  * @salen: size of buffer
  *
- * @uaddr does not have to be '\0'-terminated, but strict_strtoul() and
+ * @uaddr does not have to be '\0'-terminated, but kstrtou8() and
  * rpc_pton() require proper string termination to be successful.
  *
  * Returns the size of the socket address if successful; otherwise
@@ -315,7 +315,7 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr,
                          const size_t salen)
 {
        char *c, buf[RPCBIND_MAXUADDRLEN + sizeof('\0')];
-       unsigned long portlo, porthi;
+       u8 portlo, porthi;
        unsigned short port;
 
        if (uaddr_len > RPCBIND_MAXUADDRLEN)
@@ -327,18 +327,14 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr,
        c = strrchr(buf, '.');
        if (unlikely(c == NULL))
                return 0;
-       if (unlikely(strict_strtoul(c + 1, 10, &portlo) != 0))
-               return 0;
-       if (unlikely(portlo > 255))
+       if (unlikely(kstrtou8(c + 1, 10, &portlo) != 0))
                return 0;
 
        *c = '\0';
        c = strrchr(buf, '.');
        if (unlikely(c == NULL))
                return 0;
-       if (unlikely(strict_strtoul(c + 1, 10, &porthi) != 0))
-               return 0;
-       if (unlikely(porthi > 255))
+       if (unlikely(kstrtou8(c + 1, 10, &porthi) != 0))
                return 0;
 
        port = (unsigned short)((porthi << 8) | portlo);
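Parsing the two trailing universal-address octets with kstrtou8() folds the explicit "> 255" range checks into the parse: anything that does not fit a u8 fails with -ERANGE. The same idea in isolation:

	#include <linux/kernel.h>

	/* Sketch: parse the "hi.lo" port pair of an RPC universal address.
	 * kstrtou8() already rejects values above 255, so no manual check. */
	static int parse_port(const char *hi, const char *lo,
			      unsigned short *port)
	{
		u8 porthi, portlo;

		if (kstrtou8(hi, 10, &porthi) || kstrtou8(lo, 10, &portlo))
			return -EINVAL;
		*port = (porthi << 8) | portlo;
		return 0;
	}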
index f773667174200cbabb92c7fc02adfb5d57e3e61c..383eb919ac0be3ed1348528d59f0bd637cb65bf5 100644 (file)
@@ -48,7 +48,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
 
        if (!val)
                goto out_inval;
-       ret = strict_strtoul(val, 0, &num);
+       ret = kstrtoul(val, 0, &num);
        if (ret == -EINVAL)
                goto out_inval;
        nbits = fls(num);
@@ -80,6 +80,10 @@ static struct kernel_param_ops param_ops_hashtbl_sz = {
 module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644);
 MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size");
 
+static unsigned long auth_max_cred_cachesize = ULONG_MAX;
+module_param(auth_max_cred_cachesize, ulong, 0644);
+MODULE_PARM_DESC(auth_max_cred_cachesize, "RPC credential maximum total cache size");
+
 static u32
 pseudoflavor_to_flavor(u32 flavor) {
        if (flavor > RPC_AUTH_MAXFLAVOR)
@@ -363,6 +367,15 @@ rpcauth_cred_key_to_expire(struct rpc_cred *cred)
 }
 EXPORT_SYMBOL_GPL(rpcauth_cred_key_to_expire);
 
+char *
+rpcauth_stringify_acceptor(struct rpc_cred *cred)
+{
+       if (!cred->cr_ops->crstringify_acceptor)
+               return NULL;
+       return cred->cr_ops->crstringify_acceptor(cred);
+}
+EXPORT_SYMBOL_GPL(rpcauth_stringify_acceptor);
+
 /*
  * Destroy a list of credentials
  */
@@ -472,6 +485,20 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
        return freed;
 }
 
+static unsigned long
+rpcauth_cache_do_shrink(int nr_to_scan)
+{
+       LIST_HEAD(free);
+       unsigned long freed;
+
+       spin_lock(&rpc_credcache_lock);
+       freed = rpcauth_prune_expired(&free, nr_to_scan);
+       spin_unlock(&rpc_credcache_lock);
+       rpcauth_destroy_credlist(&free);
+
+       return freed;
+}
+
 /*
  * Run memory cache shrinker.
  */
@@ -479,9 +506,6 @@ static unsigned long
 rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 
 {
-       LIST_HEAD(free);
-       unsigned long freed;
-
        if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
                return SHRINK_STOP;
 
@@ -489,12 +513,7 @@ rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
        if (list_empty(&cred_unused))
                return SHRINK_STOP;
 
-       spin_lock(&rpc_credcache_lock);
-       freed = rpcauth_prune_expired(&free, sc->nr_to_scan);
-       spin_unlock(&rpc_credcache_lock);
-       rpcauth_destroy_credlist(&free);
-
-       return freed;
+       return rpcauth_cache_do_shrink(sc->nr_to_scan);
 }
 
 static unsigned long
@@ -504,6 +523,21 @@ rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
        return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
 }
 
+static void
+rpcauth_cache_enforce_limit(void)
+{
+       unsigned long diff;
+       unsigned int nr_to_scan;
+
+       if (number_cred_unused <= auth_max_cred_cachesize)
+               return;
+       diff = number_cred_unused - auth_max_cred_cachesize;
+       nr_to_scan = 100;
+       if (diff < nr_to_scan)
+               nr_to_scan = diff;
+       rpcauth_cache_do_shrink(nr_to_scan);
+}
+
 /*
  * Look up a process' credentials in the authentication cache
  */
@@ -523,6 +557,12 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
        hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
                if (!entry->cr_ops->crmatch(acred, entry, flags))
                        continue;
+               if (flags & RPCAUTH_LOOKUP_RCU) {
+                       if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) &&
+                           !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags))
+                               cred = entry;
+                       break;
+               }
                spin_lock(&cache->lock);
                if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) {
                        spin_unlock(&cache->lock);
@@ -537,6 +577,9 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
        if (cred != NULL)
                goto found;
 
+       if (flags & RPCAUTH_LOOKUP_RCU)
+               return ERR_PTR(-ECHILD);
+
        new = auth->au_ops->crcreate(auth, acred, flags);
        if (IS_ERR(new)) {
                cred = new;
@@ -557,6 +600,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
        } else
                list_add_tail(&new->cr_lru, &free);
        spin_unlock(&cache->lock);
+       rpcauth_cache_enforce_limit();
 found:
        if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) &&
            cred->cr_ops->cr_init != NULL &&
@@ -586,10 +630,8 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags)
        memset(&acred, 0, sizeof(acred));
        acred.uid = cred->fsuid;
        acred.gid = cred->fsgid;
-       acred.group_info = get_group_info(((struct cred *)cred)->group_info);
-
+       acred.group_info = cred->group_info;
        ret = auth->au_ops->lookup_cred(auth, &acred, flags);
-       put_group_info(acred.group_info);
        return ret;
 }
 EXPORT_SYMBOL_GPL(rpcauth_lookupcred);
index ed04869b2d4f4f097ea85e8fe899c2ea262a0aca..6f6b829c9e8ee2bab63ba5f30d03d3c39dc5ba52 100644 (file)
@@ -38,6 +38,12 @@ struct rpc_cred *rpc_lookup_cred(void)
 }
 EXPORT_SYMBOL_GPL(rpc_lookup_cred);
 
+struct rpc_cred *rpc_lookup_cred_nonblock(void)
+{
+       return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_cred_nonblock);
+
 /*
  * Public call interface for looking up machine creds.
  */
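rpc_lookup_cred_nonblock() pairs with the RPCAUTH_LOOKUP_RCU handling added to rpcauth_lookup_credcache() above: a cache hit comes back without sleeping or taking locks, and a miss surfaces as -ECHILD so the caller can retry from blocking context. Illustrative fallback pattern:

	#include <linux/err.h>
	#include <linux/sunrpc/auth.h>

	static struct rpc_cred *lookup_cred_any(void)
	{
		struct rpc_cred *cred = rpc_lookup_cred_nonblock();

		/* -ECHILD: the RCU walk could not answer; retry sleeping */
		if (IS_ERR(cred) && PTR_ERR(cred) == -ECHILD)
			cred = rpc_lookup_cred();
		return cred;
	}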
index b6e440baccc3733f7b8963ed7ab6fddc72fd0c4c..afb292cd797decf08561492925d87d34d09e485b 100644 (file)
@@ -183,8 +183,9 @@ gss_cred_get_ctx(struct rpc_cred *cred)
        struct gss_cl_ctx *ctx = NULL;
 
        rcu_read_lock();
-       if (gss_cred->gc_ctx)
-               ctx = gss_get_ctx(gss_cred->gc_ctx);
+       ctx = rcu_dereference(gss_cred->gc_ctx);
+       if (ctx)
+               gss_get_ctx(ctx);
        rcu_read_unlock();
        return ctx;
 }
@@ -262,9 +263,22 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct
                p = ERR_PTR(ret);
                goto err;
        }
-       dprintk("RPC:       %s Success. gc_expiry %lu now %lu timeout %u\n",
-               __func__, ctx->gc_expiry, now, timeout);
-       return q;
+
+       /* is there any trailing data? */
+       if (q == end) {
+               p = q;
+               goto done;
+       }
+
+       /* pull in acceptor name (if there is one) */
+       p = simple_get_netobj(q, end, &ctx->gc_acceptor);
+       if (IS_ERR(p))
+               goto err;
+done:
+       dprintk("RPC:       %s Success. gc_expiry %lu now %lu timeout %u acceptor %.*s\n",
+               __func__, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len,
+               ctx->gc_acceptor.data);
+       return p;
 err:
        dprintk("RPC:       %s returns error %ld\n", __func__, -PTR_ERR(p));
        return p;
@@ -1194,13 +1208,13 @@ gss_destroying_context(struct rpc_cred *cred)
 {
        struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
        struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
+       struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
        struct rpc_task *task;
 
-       if (gss_cred->gc_ctx == NULL ||
-           test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
+       if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
                return 0;
 
-       gss_cred->gc_ctx->gc_proc = RPC_GSS_PROC_DESTROY;
+       ctx->gc_proc = RPC_GSS_PROC_DESTROY;
        cred->cr_ops = &gss_nullops;
 
        /* Take a reference to ensure the cred will be destroyed either
@@ -1225,6 +1239,7 @@ gss_do_free_ctx(struct gss_cl_ctx *ctx)
 
        gss_delete_sec_context(&ctx->gc_gss_ctx);
        kfree(ctx->gc_wire_ctx.data);
+       kfree(ctx->gc_acceptor.data);
        kfree(ctx);
 }
 
@@ -1260,7 +1275,7 @@ gss_destroy_nullcred(struct rpc_cred *cred)
 {
        struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
        struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
-       struct gss_cl_ctx *ctx = gss_cred->gc_ctx;
+       struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
 
        RCU_INIT_POINTER(gss_cred->gc_ctx, NULL);
        call_rcu(&cred->cr_rcu, gss_free_cred_callback);
@@ -1332,6 +1347,36 @@ gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred)
        return err;
 }
 
+static char *
+gss_stringify_acceptor(struct rpc_cred *cred)
+{
+       char *string = NULL;
+       struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
+       struct gss_cl_ctx *ctx;
+       struct xdr_netobj *acceptor;
+
+       rcu_read_lock();
+       ctx = rcu_dereference(gss_cred->gc_ctx);
+       if (!ctx)
+               goto out;
+
+       acceptor = &ctx->gc_acceptor;
+
+       /* no point if there's no string */
+       if (!acceptor->len)
+               goto out;
+
+       string = kmalloc(acceptor->len + 1, GFP_KERNEL);
+       if (!string)
+               goto out;
+
+       memcpy(string, acceptor->data, acceptor->len);
+       string[acceptor->len] = '\0';
+out:
+       rcu_read_unlock();
+       return string;
+}
+
 /*
  * Returns -EACCES if GSS context is NULL or will expire within the
  * timeout (milliseconds)
@@ -1340,15 +1385,16 @@ static int
 gss_key_timeout(struct rpc_cred *rc)
 {
        struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
+       struct gss_cl_ctx *ctx;
        unsigned long now = jiffies;
        unsigned long expire;
 
-       if (gss_cred->gc_ctx == NULL)
-               return -EACCES;
-
-       expire = gss_cred->gc_ctx->gc_expiry - (gss_key_expire_timeo * HZ);
-
-       if (time_after(now, expire))
+       rcu_read_lock();
+       ctx = rcu_dereference(gss_cred->gc_ctx);
+       if (ctx)
+               expire = ctx->gc_expiry - (gss_key_expire_timeo * HZ);
+       rcu_read_unlock();
+       if (!ctx || time_after(now, expire))
                return -EACCES;
        return 0;
 }
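All readers of gc_ctx now snapshot it with rcu_dereference() under rcu_read_lock() (or rcu_dereference_protected() where exclusive access is already guaranteed) instead of loading the pointer directly. The reader pattern in miniature, with illustrative types:

	#include <linux/jiffies.h>
	#include <linux/rcupdate.h>

	struct ctx_like {
		unsigned long expiry;
	};

	struct cred_like {
		struct ctx_like __rcu *ctx;
	};

	static bool ctx_expired(struct cred_like *c)
	{
		struct ctx_like *ctx;
		bool expired = true;

		rcu_read_lock();
		ctx = rcu_dereference(c->ctx);	/* snapshot once */
		if (ctx)
			expired = time_after(jiffies, ctx->expiry);
		rcu_read_unlock();
		return expired;
	}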
@@ -1357,13 +1403,19 @@ static int
 gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
 {
        struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
+       struct gss_cl_ctx *ctx;
        int ret;
 
        if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags))
                goto out;
        /* Don't match with creds that have expired. */
-       if (time_after(jiffies, gss_cred->gc_ctx->gc_expiry))
+       rcu_read_lock();
+       ctx = rcu_dereference(gss_cred->gc_ctx);
+       if (!ctx || time_after(jiffies, ctx->gc_expiry)) {
+               rcu_read_unlock();
                return 0;
+       }
+       rcu_read_unlock();
        if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags))
                return 0;
 out:
@@ -1909,29 +1961,31 @@ static const struct rpc_authops authgss_ops = {
 };
 
 static const struct rpc_credops gss_credops = {
-       .cr_name        = "AUTH_GSS",
-       .crdestroy      = gss_destroy_cred,
-       .cr_init        = gss_cred_init,
-       .crbind         = rpcauth_generic_bind_cred,
-       .crmatch        = gss_match,
-       .crmarshal      = gss_marshal,
-       .crrefresh      = gss_refresh,
-       .crvalidate     = gss_validate,
-       .crwrap_req     = gss_wrap_req,
-       .crunwrap_resp  = gss_unwrap_resp,
-       .crkey_timeout  = gss_key_timeout,
+       .cr_name                = "AUTH_GSS",
+       .crdestroy              = gss_destroy_cred,
+       .cr_init                = gss_cred_init,
+       .crbind                 = rpcauth_generic_bind_cred,
+       .crmatch                = gss_match,
+       .crmarshal              = gss_marshal,
+       .crrefresh              = gss_refresh,
+       .crvalidate             = gss_validate,
+       .crwrap_req             = gss_wrap_req,
+       .crunwrap_resp          = gss_unwrap_resp,
+       .crkey_timeout          = gss_key_timeout,
+       .crstringify_acceptor   = gss_stringify_acceptor,
 };
 
 static const struct rpc_credops gss_nullops = {
-       .cr_name        = "AUTH_GSS",
-       .crdestroy      = gss_destroy_nullcred,
-       .crbind         = rpcauth_generic_bind_cred,
-       .crmatch        = gss_match,
-       .crmarshal      = gss_marshal,
-       .crrefresh      = gss_refresh_null,
-       .crvalidate     = gss_validate,
-       .crwrap_req     = gss_wrap_req,
-       .crunwrap_resp  = gss_unwrap_resp,
+       .cr_name                = "AUTH_GSS",
+       .crdestroy              = gss_destroy_nullcred,
+       .crbind                 = rpcauth_generic_bind_cred,
+       .crmatch                = gss_match,
+       .crmarshal              = gss_marshal,
+       .crrefresh              = gss_refresh_null,
+       .crvalidate             = gss_validate,
+       .crwrap_req             = gss_wrap_req,
+       .crunwrap_resp          = gss_unwrap_resp,
+       .crstringify_acceptor   = gss_stringify_acceptor,
 };
 
 static const struct rpc_pipe_ops gss_upcall_ops_v0 = {
index 0f43e894bc0a47e913ca5999afc69d392cc6e6ad..f5ed9f6ece0699cbc89208f278554962f9409912 100644 (file)
@@ -641,7 +641,7 @@ out:
 
 u32
 gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
-                    struct xdr_buf *buf, int ec, struct page **pages)
+                    struct xdr_buf *buf, struct page **pages)
 {
        u32 err;
        struct xdr_netobj hmac;
@@ -684,13 +684,8 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
                ecptr = buf->tail[0].iov_base;
        }
 
-       memset(ecptr, 'X', ec);
-       buf->tail[0].iov_len += ec;
-       buf->len += ec;
-
        /* copy plaintext gss token header after filler (if any) */
-       memcpy(ecptr + ec, buf->head[0].iov_base + offset,
-                                               GSS_KRB5_TOK_HDR_LEN);
+       memcpy(ecptr, buf->head[0].iov_base + offset, GSS_KRB5_TOK_HDR_LEN);
        buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN;
        buf->len += GSS_KRB5_TOK_HDR_LEN;
 
index 62ae3273186cdd94545d26742ae7a2ece246a685..42768e5c3994e3d4570bdea259ab6c7f658f76c0 100644 (file)
 
 DEFINE_SPINLOCK(krb5_seq_lock);
 
-static char *
+static void *
 setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token)
 {
-       __be16 *ptr, *krb5_hdr;
+       u16 *ptr;
+       void *krb5_hdr;
        int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
 
        token->len = g_token_size(&ctx->mech_used, body_size);
 
-       ptr = (__be16 *)token->data;
+       ptr = (u16 *)token->data;
        g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr);
 
        /* ptr now at start of header described in rfc 1964, section 1.2.1: */
        krb5_hdr = ptr;
        *ptr++ = KG_TOK_MIC_MSG;
-       *ptr++ = cpu_to_le16(ctx->gk5e->signalg);
+       /*
+        * signalg is stored as if it were converted from LE to host endian, even
+        * though it's an opaque pair of bytes according to the RFC.
+        */
+       *ptr++ = (__force u16)cpu_to_le16(ctx->gk5e->signalg);
        *ptr++ = SEAL_ALG_NONE;
-       *ptr++ = 0xffff;
+       *ptr = 0xffff;
 
-       return (char *)krb5_hdr;
+       return krb5_hdr;
 }
 
 static void *
 setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
 {
-       __be16 *ptr, *krb5_hdr;
+       u16 *ptr;
+       void *krb5_hdr;
        u8 *p, flags = 0x00;
 
        if ((ctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0)
@@ -104,15 +110,15 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
 
        /* Per rfc 4121, sec 4.2.6.1, there is no header,
         * just start the token */
-       krb5_hdr = ptr = (__be16 *)token->data;
+       krb5_hdr = ptr = (u16 *)token->data;
 
        *ptr++ = KG2_TOK_MIC;
        p = (u8 *)ptr;
        *p++ = flags;
        *p++ = 0xff;
-       ptr = (__be16 *)p;
-       *ptr++ = 0xffff;
+       ptr = (u16 *)p;
        *ptr++ = 0xffff;
+       *ptr = 0xffff;
 
        token->len = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
        return krb5_hdr;
@@ -181,7 +187,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
        spin_lock(&krb5_seq_lock);
        seq_send = ctx->seq_send64++;
        spin_unlock(&krb5_seq_lock);
-       *((u64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send);
+       *((__be64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send);
 
        if (ctx->initiate) {
                cksumkey = ctx->initiator_sign;
index 42560e55d9789e946f5c02e73a0ed4b6c179409d..4b614c604fe09afd8a7ef03c635662d2117d05db 100644 (file)
@@ -201,9 +201,15 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
 
        msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength;
 
-       *(__be16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg);
-       memset(ptr + 4, 0xff, 4);
-       *(__be16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg);
+       /*
+        * signalg and sealalg are stored as if they were converted from LE
+        * to host endian, even though they're opaque pairs of bytes according
+        * to the RFC.
+        */
+       *(__le16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg);
+       *(__le16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg);
+       ptr[6] = 0xff;
+       ptr[7] = 0xff;
 
        gss_krb5_make_confounder(msg_start, conflen);
 
@@ -438,7 +444,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
        u8              *ptr, *plainhdr;
        s32             now;
        u8              flags = 0x00;
-       __be16          *be16ptr, ec = 0;
+       __be16          *be16ptr;
        __be64          *be64ptr;
        u32             err;
 
@@ -468,16 +474,16 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
        be16ptr = (__be16 *)ptr;
 
        blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc);
-       *be16ptr++ = cpu_to_be16(ec);
+       *be16ptr++ = 0;
        /* "inner" token header always uses 0 for RRC */
-       *be16ptr++ = cpu_to_be16(0);
+       *be16ptr++ = 0;
 
        be64ptr = (__be64 *)be16ptr;
        spin_lock(&krb5_seq_lock);
        *be64ptr = cpu_to_be64(kctx->seq_send64++);
        spin_unlock(&krb5_seq_lock);
 
-       err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, ec, pages);
+       err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages);
        if (err)
                return err;
 
index f0ebe07978a236e66744bc2dfe332a92bfb85d05..712c123e04e9ec43464581115b7bfb4cf42e514b 100644 (file)
@@ -35,6 +35,8 @@ nul_destroy(struct rpc_auth *auth)
 static struct rpc_cred *
 nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
+       if (flags & RPCAUTH_LOOKUP_RCU)
+               return &null_cred;
        return get_rpccred(&null_cred);
 }
 
index 2e6ab10734f6869af45a422bad6253e9de5cd580..488ddeed9363db71e1b5a80df4863a464345cb20 100644 (file)
@@ -1746,6 +1746,7 @@ call_bind_status(struct rpc_task *task)
        case -EHOSTDOWN:
        case -EHOSTUNREACH:
        case -ENETUNREACH:
+       case -ENOBUFS:
        case -EPIPE:
                dprintk("RPC: %5u remote rpcbind unreachable: %d\n",
                                task->tk_pid, task->tk_status);
@@ -1812,6 +1813,8 @@ call_connect_status(struct rpc_task *task)
        case -ECONNABORTED:
        case -ENETUNREACH:
        case -EHOSTUNREACH:
+       case -ENOBUFS:
+       case -EPIPE:
                if (RPC_IS_SOFTCONN(task))
                        break;
                /* retry with existing socket, after a delay */
@@ -1918,6 +1921,7 @@ call_transmit_status(struct rpc_task *task)
        case -ECONNRESET:
        case -ECONNABORTED:
        case -ENOTCONN:
+       case -ENOBUFS:
        case -EPIPE:
                rpc_task_force_reencode(task);
        }
@@ -2034,6 +2038,7 @@ call_status(struct rpc_task *task)
        case -ECONNRESET:
        case -ECONNABORTED:
                rpc_force_rebind(clnt);
+       case -ENOBUFS:
                rpc_delay(task, 3*HZ);
        case -EPIPE:
        case -ENOTCONN:
index b185548985622c0c23b1e0dc01acb783ee81246a..2d12b76b5a64f958e4fa8f45ffaeea00a5d857a9 100644 (file)
@@ -195,7 +195,7 @@ static struct inode *
 rpc_alloc_inode(struct super_block *sb)
 {
        struct rpc_inode *rpci;
-       rpci = (struct rpc_inode *)kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL);
+       rpci = kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL);
        if (!rpci)
                return NULL;
        return &rpci->vfs_inode;
index 51c63165073c08044bf94b081e3cd1f698de4e82..56e4e150e80ee8931e4f15e0fe4f5d9527f07b1f 100644 (file)
@@ -744,6 +744,7 @@ static void xprt_connect_status(struct rpc_task *task)
        case -ECONNABORTED:
        case -ENETUNREACH:
        case -EHOSTUNREACH:
+       case -EPIPE:
        case -EAGAIN:
                dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid);
                break;
index 693966d3f33ba12c1220538ff58f632ae762562d..6166c985fe24850b94f4cccb56dfceee6bbd575d 100644 (file)
 # define RPCDBG_FACILITY       RPCDBG_TRANS
 #endif
 
-enum rpcrdma_chunktype {
-       rpcrdma_noch = 0,
-       rpcrdma_readch,
-       rpcrdma_areadch,
-       rpcrdma_writech,
-       rpcrdma_replych
-};
-
 #ifdef RPC_DEBUG
 static const char transfertypes[][12] = {
        "pure inline",  /* no chunks */
@@ -279,12 +271,36 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
        return (unsigned char *)iptr - (unsigned char *)headerp;
 
 out:
-       for (pos = 0; nchunks--;)
-               pos += rpcrdma_deregister_external(
-                               &req->rl_segments[pos], r_xprt);
+       if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) {
+               for (pos = 0; nchunks--;)
+                       pos += rpcrdma_deregister_external(
+                                       &req->rl_segments[pos], r_xprt);
+       }
        return n;
 }
 
+/*
+ * Marshal chunks. This routine returns the header length
+ * consumed by marshaling.
+ *
+ * Returns positive RPC/RDMA header size, or negative errno.
+ */
+
+ssize_t
+rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
+{
+       struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+       struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base;
+
+       if (req->rl_rtype != rpcrdma_noch)
+               result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
+                                              headerp, req->rl_rtype);
+       else if (req->rl_wtype != rpcrdma_noch)
+               result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
+                                              headerp, req->rl_wtype);
+       return result;
+}
+
 /*
  * Copy write data inline.
  * This function is used for "small" requests. Data which is passed
@@ -377,7 +393,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
        char *base;
        size_t rpclen, padlen;
        ssize_t hdrlen;
-       enum rpcrdma_chunktype rtype, wtype;
        struct rpcrdma_msg *headerp;
 
        /*
@@ -415,13 +430,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         * into pages; otherwise use reply chunks.
         */
        if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
-               wtype = rpcrdma_noch;
+               req->rl_wtype = rpcrdma_noch;
        else if (rqst->rq_rcv_buf.page_len == 0)
-               wtype = rpcrdma_replych;
+               req->rl_wtype = rpcrdma_replych;
        else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
-               wtype = rpcrdma_writech;
+               req->rl_wtype = rpcrdma_writech;
        else
-               wtype = rpcrdma_replych;
+               req->rl_wtype = rpcrdma_replych;
 
        /*
         * Chunks needed for arguments?
@@ -438,16 +453,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         * TBD check NFSv4 setacl
         */
        if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
-               rtype = rpcrdma_noch;
+               req->rl_rtype = rpcrdma_noch;
        else if (rqst->rq_snd_buf.page_len == 0)
-               rtype = rpcrdma_areadch;
+               req->rl_rtype = rpcrdma_areadch;
        else
-               rtype = rpcrdma_readch;
+               req->rl_rtype = rpcrdma_readch;
 
        /* The following simplification is not true forever */
-       if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
-               wtype = rpcrdma_noch;
-       if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
+       if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych)
+               req->rl_wtype = rpcrdma_noch;
+       if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) {
                dprintk("RPC:       %s: cannot marshal multiple chunk lists\n",
                        __func__);
                return -EIO;
@@ -461,7 +476,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         * When padding is in use and applies to the transfer, insert
         * it and change the message type.
         */
-       if (rtype == rpcrdma_noch) {
+       if (req->rl_rtype == rpcrdma_noch) {
 
                padlen = rpcrdma_inline_pullup(rqst,
                                                RPCRDMA_INLINE_PAD_VALUE(rqst));
@@ -476,7 +491,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
                        headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
                        headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
                        hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
-                       if (wtype != rpcrdma_noch) {
+                       if (req->rl_wtype != rpcrdma_noch) {
                                dprintk("RPC:       %s: invalid chunk list\n",
                                        __func__);
                                return -EIO;
@@ -497,30 +512,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
                         * on receive. Therefore, we request a reply chunk
                         * for non-writes wherever feasible and efficient.
                         */
-                       if (wtype == rpcrdma_noch)
-                               wtype = rpcrdma_replych;
+                       if (req->rl_wtype == rpcrdma_noch)
+                               req->rl_wtype = rpcrdma_replych;
                }
        }
 
-       /*
-        * Marshal chunks. This routine will return the header length
-        * consumed by marshaling.
-        */
-       if (rtype != rpcrdma_noch) {
-               hdrlen = rpcrdma_create_chunks(rqst,
-                                       &rqst->rq_snd_buf, headerp, rtype);
-               wtype = rtype;  /* simplify dprintk */
-
-       } else if (wtype != rpcrdma_noch) {
-               hdrlen = rpcrdma_create_chunks(rqst,
-                                       &rqst->rq_rcv_buf, headerp, wtype);
-       }
+       hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen);
        if (hdrlen < 0)
                return hdrlen;
 
        dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd"
                " headerp 0x%p base 0x%p lkey 0x%x\n",
-               __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+               __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen,
                headerp, base, req->rl_iov.lkey);
 
        /*
index 66f91f0d071a9bbdec3e440aaa09c94df57ffbe0..2faac49405633000f7aa528085a7ecf7994015fd 100644 (file)
@@ -296,7 +296,6 @@ xprt_setup_rdma(struct xprt_create *args)
 
        xprt->resvport = 0;             /* privileged port not needed */
        xprt->tsh_size = 0;             /* RPC-RDMA handles framing */
-       xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE;
        xprt->ops = &xprt_rdma_procs;
 
        /*
@@ -382,6 +381,9 @@ xprt_setup_rdma(struct xprt_create *args)
        new_ep->rep_xprt = xprt;
 
        xprt_rdma_format_addresses(xprt);
+       xprt->max_payload = rpcrdma_max_payload(new_xprt);
+       dprintk("RPC:       %s: transport data payload maximum: %zu bytes\n",
+               __func__, xprt->max_payload);
 
        if (!try_module_get(THIS_MODULE))
                goto out4;
@@ -412,7 +414,7 @@ xprt_rdma_close(struct rpc_xprt *xprt)
        if (r_xprt->rx_ep.rep_connected > 0)
                xprt->reestablish_timeout = 0;
        xprt_disconnect_done(xprt);
-       (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+       rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
 }
 
 static void
@@ -595,13 +597,14 @@ xprt_rdma_send_request(struct rpc_task *task)
        struct rpc_xprt *xprt = rqst->rq_xprt;
        struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
-       int rc;
+       int rc = 0;
 
-       if (req->rl_niovs == 0) {
+       if (req->rl_niovs == 0)
                rc = rpcrdma_marshal_req(rqst);
-               if (rc < 0)
-                       goto failed_marshal;
-       }
+       else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+               rc = rpcrdma_marshal_chunks(rqst, 0);
+       if (rc < 0)
+               goto failed_marshal;
 
        if (req->rl_reply == NULL)              /* e.g. reconnection */
                rpcrdma_recv_buffer_get(req);
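
Caching rl_rtype and rl_wtype in struct rpcrdma_req is what makes this branch possible: a request with rl_niovs != 0 has already been marshaled once, so on an FRMR transport a retransmit needs only fresh chunk lists (the rkeys minted before the reconnect are no longer valid), not a full re-marshal. Paraphrasing the hunk above:

        /* Paraphrase of the send-path decision, not the verbatim function. */
        if (req->rl_niovs == 0)                         /* first transmission */
                rc = rpcrdma_marshal_req(rqst);         /* header and chunks */
        else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
                rc = rpcrdma_marshal_chunks(rqst, 0);   /* refresh rkeys only */
        if (rc < 0)
                goto failed_marshal;
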
index 13dbd1c389ff07b02c6fa362ebbbf5fd5b6662d7..61c41298b4ea7b09b727548bb1a00d71b60d50b9 100644 (file)
@@ -61,6 +61,8 @@
 # define RPCDBG_FACILITY       RPCDBG_TRANS
 #endif
 
+static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
+
 /*
  * internal functions
  */
@@ -103,17 +105,6 @@ rpcrdma_run_tasklet(unsigned long data)
 
 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
 
-static inline void
-rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
-       list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
-       spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
-       tasklet_schedule(&rpcrdma_tasklet_g);
-}
-
 static void
 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
 {
@@ -153,12 +144,7 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc)
        if (wc->wr_id == 0ULL)
                return;
        if (wc->status != IB_WC_SUCCESS)
-               return;
-
-       if (wc->opcode == IB_WC_FAST_REG_MR)
-               frmr->r.frmr.state = FRMR_IS_VALID;
-       else if (wc->opcode == IB_WC_LOCAL_INV)
-               frmr->r.frmr.state = FRMR_IS_INVALID;
+               frmr->r.frmr.fr_state = FRMR_IS_STALE;
 }
 
 static int
@@ -217,7 +203,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
 }
 
 static void
-rpcrdma_recvcq_process_wc(struct ib_wc *wc)
+rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
 {
        struct rpcrdma_rep *rep =
                        (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
@@ -248,28 +234,38 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc)
        }
 
 out_schedule:
-       rpcrdma_schedule_tasklet(rep);
+       list_add_tail(&rep->rr_list, sched_list);
 }
 
 static int
 rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
 {
+       struct list_head sched_list;
        struct ib_wc *wcs;
        int budget, count, rc;
+       unsigned long flags;
 
+       INIT_LIST_HEAD(&sched_list);
        budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
        do {
                wcs = ep->rep_recv_wcs;
 
                rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
                if (rc <= 0)
-                       return rc;
+                       goto out_schedule;
 
                count = rc;
                while (count-- > 0)
-                       rpcrdma_recvcq_process_wc(wcs++);
+                       rpcrdma_recvcq_process_wc(wcs++, &sched_list);
        } while (rc == RPCRDMA_POLLSIZE && --budget);
-       return 0;
+       rc = 0;
+
+out_schedule:
+       spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+       list_splice_tail(&sched_list, &rpcrdma_tasklets_g);
+       spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+       tasklet_schedule(&rpcrdma_tasklet_g);
+       return rc;
 }
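
Receive completions are now collected on an on-stack list and spliced onto the global tasklet list in a single step, so rpcrdma_tk_lock_g is taken once per poll instead of once per completion. The underlying pattern, reduced to its essentials:

        /* Batched handoff: collect locally, publish under one lock hold. */
        LIST_HEAD(sched_list);
        unsigned long flags;

        /* ... each completed rep is list_add_tail()ed to sched_list ... */

        spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
        list_splice_tail(&sched_list, &rpcrdma_tasklets_g);
        spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
        tasklet_schedule(&rpcrdma_tasklet_g);
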
 
 /*
@@ -310,6 +306,13 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
        rpcrdma_recvcq_poll(cq, ep);
 }
 
+static void
+rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
+{
+       rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep);
+       rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep);
+}
+
 #ifdef RPC_DEBUG
 static const char * const conn[] = {
        "address resolved",
@@ -323,8 +326,16 @@ static const char * const conn[] = {
        "rejected",
        "established",
        "disconnected",
-       "device removal"
+       "device removal",
+       "multicast join",
+       "multicast error",
+       "address change",
+       "timewait exit",
 };
+
+#define CONNECTION_MSG(status)                                         \
+       ((status) < ARRAY_SIZE(conn) ?                                  \
+               conn[(status)] : "unrecognized connection error")
 #endif
 
 static int
@@ -382,23 +393,18 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
                connstate = -ENODEV;
 connected:
-               dprintk("RPC:       %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
-                       __func__,
-                       (event->event <= 11) ? conn[event->event] :
-                                               "unknown connection error",
-                       &addr->sin_addr.s_addr,
-                       ntohs(addr->sin_port),
-                       ep, event->event);
                atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
                dprintk("RPC:       %s: %sconnected\n",
                                        __func__, connstate > 0 ? "" : "dis");
                ep->rep_connected = connstate;
                ep->rep_func(ep);
                wake_up_all(&ep->rep_connect_wait);
-               break;
+               /*FALLTHROUGH*/
        default:
-               dprintk("RPC:       %s: unexpected CM event %d\n",
-                       __func__, event->event);
+               dprintk("RPC:       %s: %pI4:%u (ep 0x%p): %s\n",
+                       __func__, &addr->sin_addr.s_addr,
+                       ntohs(addr->sin_port), ep,
+                       CONNECTION_MSG(event->event));
                break;
        }
 
@@ -558,12 +564,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                if (!ia->ri_id->device->alloc_fmr) {
                        dprintk("RPC:       %s: MTHCAFMR registration "
                                "not supported by HCA\n", __func__);
-#if RPCRDMA_PERSISTENT_REGISTRATION
                        memreg = RPCRDMA_ALLPHYSICAL;
-#else
-                       rc = -ENOMEM;
-                       goto out2;
-#endif
                }
        }
 
@@ -578,20 +579,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
        switch (memreg) {
        case RPCRDMA_FRMR:
                break;
-#if RPCRDMA_PERSISTENT_REGISTRATION
        case RPCRDMA_ALLPHYSICAL:
                mem_priv = IB_ACCESS_LOCAL_WRITE |
                                IB_ACCESS_REMOTE_WRITE |
                                IB_ACCESS_REMOTE_READ;
                goto register_setup;
-#endif
        case RPCRDMA_MTHCAFMR:
                if (ia->ri_have_dma_lkey)
                        break;
                mem_priv = IB_ACCESS_LOCAL_WRITE;
-#if RPCRDMA_PERSISTENT_REGISTRATION
        register_setup:
-#endif
                ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
                if (IS_ERR(ia->ri_bind_mem)) {
                        printk(KERN_ALERT "%s: ib_get_dma_mr for "
@@ -613,6 +610,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
        /* Else will do memory reg/dereg for each chunk */
        ia->ri_memreg_strategy = memreg;
 
+       rwlock_init(&ia->ri_qplock);
        return 0;
 out2:
        rdma_destroy_id(ia->ri_id);
@@ -826,10 +824,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
        cancel_delayed_work_sync(&ep->rep_connect_worker);
 
        if (ia->ri_id->qp) {
-               rc = rpcrdma_ep_disconnect(ep, ia);
-               if (rc)
-                       dprintk("RPC:       %s: rpcrdma_ep_disconnect"
-                               " returned %i\n", __func__, rc);
+               rpcrdma_ep_disconnect(ep, ia);
                rdma_destroy_qp(ia->ri_id);
                ia->ri_id->qp = NULL;
        }
@@ -859,7 +854,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 int
 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
-       struct rdma_cm_id *id;
+       struct rdma_cm_id *id, *old;
        int rc = 0;
        int retry_count = 0;
 
@@ -867,13 +862,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
                struct rpcrdma_xprt *xprt;
 retry:
                dprintk("RPC:       %s: reconnecting...\n", __func__);
-               rc = rpcrdma_ep_disconnect(ep, ia);
-               if (rc && rc != -ENOTCONN)
-                       dprintk("RPC:       %s: rpcrdma_ep_disconnect"
-                               " status %i\n", __func__, rc);
 
-               rpcrdma_clean_cq(ep->rep_attr.recv_cq);
-               rpcrdma_clean_cq(ep->rep_attr.send_cq);
+               rpcrdma_ep_disconnect(ep, ia);
+               rpcrdma_flush_cqs(ep);
+
+               if (ia->ri_memreg_strategy == RPCRDMA_FRMR)
+                       rpcrdma_reset_frmrs(ia);
 
                xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
                id = rpcrdma_create_id(xprt, ia,
@@ -905,9 +899,14 @@ retry:
                        rc = -ENETUNREACH;
                        goto out;
                }
-               rdma_destroy_qp(ia->ri_id);
-               rdma_destroy_id(ia->ri_id);
+
+               write_lock(&ia->ri_qplock);
+               old = ia->ri_id;
                ia->ri_id = id;
+               write_unlock(&ia->ri_qplock);
+
+               rdma_destroy_qp(old);
+               rdma_destroy_id(old);
        } else {
                dprintk("RPC:       %s: connecting...\n", __func__);
                rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
@@ -974,13 +973,12 @@ out:
  * This call is not reentrant, and must not be made in parallel
  * on the same endpoint.
  */
-int
+void
 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
        int rc;
 
-       rpcrdma_clean_cq(ep->rep_attr.recv_cq);
-       rpcrdma_clean_cq(ep->rep_attr.send_cq);
+       rpcrdma_flush_cqs(ep);
        rc = rdma_disconnect(ia->ri_id);
        if (!rc) {
                /* returns without wait if not connected */
@@ -992,12 +990,93 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
                dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
                ep->rep_connected = rc;
        }
+}
+
+static int
+rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
+{
+       int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
+       struct ib_fmr_attr fmr_attr = {
+               .max_pages      = RPCRDMA_MAX_DATA_SEGS,
+               .max_maps       = 1,
+               .page_shift     = PAGE_SHIFT
+       };
+       struct rpcrdma_mw *r;
+       int i, rc;
+
+       i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+       dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);
+
+       while (i--) {
+               r = kzalloc(sizeof(*r), GFP_KERNEL);
+               if (r == NULL)
+                       return -ENOMEM;
+
+               r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
+               if (IS_ERR(r->r.fmr)) {
+                       rc = PTR_ERR(r->r.fmr);
+                       dprintk("RPC:       %s: ib_alloc_fmr failed %i\n",
+                               __func__, rc);
+                       goto out_free;
+               }
+
+               list_add(&r->mw_list, &buf->rb_mws);
+               list_add(&r->mw_all, &buf->rb_all);
+       }
+       return 0;
+
+out_free:
+       kfree(r);
+       return rc;
+}
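
This constructor and the FRMR one below both size the pool as (rb_max_requests + 1) * RPCRDMA_MAX_SEGS: a full set of segments for every outstanding request, plus one spare request's worth. With example values (assumptions for illustration; the real constants live in xprt_rdma.h):

        /* Illustrative sizing arithmetic only; values are assumed. */
        int max_requests = 32;                  /* assumed credit limit */
        int max_segs = 8;                       /* assumed RPCRDMA_MAX_SEGS */
        int nmws = (max_requests + 1) * max_segs;       /* 33 * 8 = 264 MWs */

Note also fmr_attr.max_maps = 1: an FMR may be mapped only once before ib_unmap_fmr() is required, which is why deregistration unmaps each FMR before it cycles back onto rb_mws.
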
+
+static int
+rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
+{
+       struct rpcrdma_frmr *f;
+       struct rpcrdma_mw *r;
+       int i, rc;
+
+       i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+       dprintk("RPC:       %s: initializing %d FRMRs\n", __func__, i);
+
+       while (i--) {
+               r = kzalloc(sizeof(*r), GFP_KERNEL);
+               if (r == NULL)
+                       return -ENOMEM;
+               f = &r->r.frmr;
+
+               f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+                                               ia->ri_max_frmr_depth);
+               if (IS_ERR(f->fr_mr)) {
+                       rc = PTR_ERR(f->fr_mr);
+                       dprintk("RPC:       %s: ib_alloc_fast_reg_mr "
+                               "failed %i\n", __func__, rc);
+                       goto out_free;
+               }
+
+               f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
+                                                       ia->ri_max_frmr_depth);
+               if (IS_ERR(f->fr_pgl)) {
+                       rc = PTR_ERR(f->fr_pgl);
+                       dprintk("RPC:       %s: ib_alloc_fast_reg_page_list "
+                               "failed %i\n", __func__, rc);
+
+                       ib_dereg_mr(f->fr_mr);
+                       goto out_free;
+               }
+
+               list_add(&r->mw_list, &buf->rb_mws);
+               list_add(&r->mw_all, &buf->rb_all);
+       }
+
+       return 0;
+
+out_free:
+       kfree(r);
        return rc;
 }
 
-/*
- * Initialize buffer memory
- */
 int
 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
        struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
@@ -1005,7 +1084,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
        char *p;
        size_t len, rlen, wlen;
        int i, rc;
-       struct rpcrdma_mw *r;
 
        buf->rb_max_requests = cdata->max_requests;
        spin_lock_init(&buf->rb_lock);
@@ -1016,28 +1094,12 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
         *   2.  arrays of struct rpcrdma_req to fill in pointers
         *   3.  array of struct rpcrdma_rep for replies
         *   4.  padding, if any
-        *   5.  mw's, fmr's or frmr's, if any
         * Send/recv buffers in req/rep need to be registered
         */
-
        len = buf->rb_max_requests *
                (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
        len += cdata->padding;
-       switch (ia->ri_memreg_strategy) {
-       case RPCRDMA_FRMR:
-               len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
-                               sizeof(struct rpcrdma_mw);
-               break;
-       case RPCRDMA_MTHCAFMR:
-               /* TBD we are perhaps overallocating here */
-               len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
-                               sizeof(struct rpcrdma_mw);
-               break;
-       default:
-               break;
-       }
 
-       /* allocate 1, 4 and 5 in one shot */
        p = kzalloc(len, GFP_KERNEL);
        if (p == NULL) {
                dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
@@ -1064,51 +1126,17 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
        p += cdata->padding;
 
        INIT_LIST_HEAD(&buf->rb_mws);
-       r = (struct rpcrdma_mw *)p;
+       INIT_LIST_HEAD(&buf->rb_all);
        switch (ia->ri_memreg_strategy) {
        case RPCRDMA_FRMR:
-               for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
-                       r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
-                                               ia->ri_max_frmr_depth);
-                       if (IS_ERR(r->r.frmr.fr_mr)) {
-                               rc = PTR_ERR(r->r.frmr.fr_mr);
-                               dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
-                                       " failed %i\n", __func__, rc);
-                               goto out;
-                       }
-                       r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
-                                               ia->ri_id->device,
-                                               ia->ri_max_frmr_depth);
-                       if (IS_ERR(r->r.frmr.fr_pgl)) {
-                               rc = PTR_ERR(r->r.frmr.fr_pgl);
-                               dprintk("RPC:       %s: "
-                                       "ib_alloc_fast_reg_page_list "
-                                       "failed %i\n", __func__, rc);
-
-                               ib_dereg_mr(r->r.frmr.fr_mr);
-                               goto out;
-                       }
-                       list_add(&r->mw_list, &buf->rb_mws);
-                       ++r;
-               }
+               rc = rpcrdma_init_frmrs(ia, buf);
+               if (rc)
+                       goto out;
                break;
        case RPCRDMA_MTHCAFMR:
-               /* TBD we are perhaps overallocating here */
-               for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
-                       static struct ib_fmr_attr fa =
-                               { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
-                       r->r.fmr = ib_alloc_fmr(ia->ri_pd,
-                               IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
-                               &fa);
-                       if (IS_ERR(r->r.fmr)) {
-                               rc = PTR_ERR(r->r.fmr);
-                               dprintk("RPC:       %s: ib_alloc_fmr"
-                                       " failed %i\n", __func__, rc);
-                               goto out;
-                       }
-                       list_add(&r->mw_list, &buf->rb_mws);
-                       ++r;
-               }
+               rc = rpcrdma_init_fmrs(ia, buf);
+               if (rc)
+                       goto out;
                break;
        default:
                break;
@@ -1176,24 +1204,57 @@ out:
        return rc;
 }
 
-/*
- * Unregister and destroy buffer memory. Need to deal with
- * partial initialization, so it's callable from failed create.
- * Must be called before destroying endpoint, as registrations
- * reference it.
- */
+static void
+rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
+{
+       struct rpcrdma_mw *r;
+       int rc;
+
+       while (!list_empty(&buf->rb_all)) {
+               r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+               list_del(&r->mw_all);
+               list_del(&r->mw_list);
+
+               rc = ib_dealloc_fmr(r->r.fmr);
+               if (rc)
+                       dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
+                               __func__, rc);
+
+               kfree(r);
+       }
+}
+
+static void
+rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
+{
+       struct rpcrdma_mw *r;
+       int rc;
+
+       while (!list_empty(&buf->rb_all)) {
+               r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+               list_del(&r->mw_all);
+               list_del(&r->mw_list);
+
+               rc = ib_dereg_mr(r->r.frmr.fr_mr);
+               if (rc)
+                       dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
+                               __func__, rc);
+               ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+
+               kfree(r);
+       }
+}
+
 void
 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
-       int rc, i;
        struct rpcrdma_ia *ia = rdmab_to_ia(buf);
-       struct rpcrdma_mw *r;
+       int i;
 
        /* clean up in reverse order from create
         *   1.  recv mr memory (mr free, then kfree)
         *   2.  send mr memory (mr free, then kfree)
-        *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
-        *   4.  arrays
+        *   3.  MWs
         */
        dprintk("RPC:       %s: entering\n", __func__);
 
@@ -1212,34 +1273,217 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
                }
        }
 
+       switch (ia->ri_memreg_strategy) {
+       case RPCRDMA_FRMR:
+               rpcrdma_destroy_frmrs(buf);
+               break;
+       case RPCRDMA_MTHCAFMR:
+               rpcrdma_destroy_fmrs(buf);
+               break;
+       default:
+               break;
+       }
+
+       kfree(buf->rb_pool);
+}
+
+/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
+ * an unusable state. Find FRMRs in this state and deregister and
+ * re-register each one. FRMRs that are VALID and attached to an
+ * rpcrdma_req are also torn down.
+ *
+ * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_frmr_external().
+ */
+static void
+rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
+{
+       struct rpcrdma_xprt *r_xprt =
+                               container_of(ia, struct rpcrdma_xprt, rx_ia);
+       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+       struct list_head *pos;
+       struct rpcrdma_mw *r;
+       int rc;
+
+       list_for_each(pos, &buf->rb_all) {
+               r = list_entry(pos, struct rpcrdma_mw, mw_all);
+
+               if (r->r.frmr.fr_state == FRMR_IS_INVALID)
+                       continue;
+
+               rc = ib_dereg_mr(r->r.frmr.fr_mr);
+               if (rc)
+                       dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
+                               __func__, rc);
+               ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+
+               r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+                                       ia->ri_max_frmr_depth);
+               if (IS_ERR(r->r.frmr.fr_mr)) {
+                       rc = PTR_ERR(r->r.frmr.fr_mr);
+                       dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
+                               " failed %i\n", __func__, rc);
+                       continue;
+               }
+               r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
+                                       ia->ri_id->device,
+                                       ia->ri_max_frmr_depth);
+               if (IS_ERR(r->r.frmr.fr_pgl)) {
+                       rc = PTR_ERR(r->r.frmr.fr_pgl);
+                       dprintk("RPC:       %s: "
+                               "ib_alloc_fast_reg_page_list "
+                               "failed %i\n", __func__, rc);
+
+                       ib_dereg_mr(r->r.frmr.fr_mr);
+                       continue;
+               }
+               r->r.frmr.fr_state = FRMR_IS_INVALID;
+       }
+}
+
+/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
+ * some req segments uninitialized.
+ */
+static void
+rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
+{
+       if (*mw) {
+               list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
+               *mw = NULL;
+       }
+}
+
+/* Cycle MWs back in reverse order, and "spin" them.
+ * This delays and scrambles reuse as much as possible.
+ */
+static void
+rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+       struct rpcrdma_mr_seg *seg = req->rl_segments;
+       struct rpcrdma_mr_seg *seg1 = seg;
+       int i;
+
+       for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
+               rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf);
+       rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf);
+}
+
+static void
+rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+       buf->rb_send_bufs[--buf->rb_send_index] = req;
+       req->rl_niovs = 0;
+       if (req->rl_reply) {
+               buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
+               req->rl_reply->rr_func = NULL;
+               req->rl_reply = NULL;
+       }
+}
+
+/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
+ * Redo only the ib_post_send().
+ */
+static void
+rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
+{
+       struct rpcrdma_xprt *r_xprt =
+                               container_of(ia, struct rpcrdma_xprt, rx_ia);
+       struct ib_send_wr invalidate_wr, *bad_wr;
+       int rc;
+
+       dprintk("RPC:       %s: FRMR %p is stale\n", __func__, r);
+
+       /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
+       r->r.frmr.fr_state = FRMR_IS_INVALID;
+
+       memset(&invalidate_wr, 0, sizeof(invalidate_wr));
+       invalidate_wr.wr_id = (unsigned long)(void *)r;
+       invalidate_wr.opcode = IB_WR_LOCAL_INV;
+       invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
+       DECR_CQCOUNT(&r_xprt->rx_ep);
+
+       dprintk("RPC:       %s: frmr %p invalidating rkey %08x\n",
+               __func__, r, r->r.frmr.fr_mr->rkey);
+
+       read_lock(&ia->ri_qplock);
+       rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+       read_unlock(&ia->ri_qplock);
+       if (rc) {
+               /* Force rpcrdma_buffer_get() to retry */
+               r->r.frmr.fr_state = FRMR_IS_STALE;
+               dprintk("RPC:       %s: ib_post_send failed, %i\n",
+                       __func__, rc);
+       }
+}
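
The new ri_qplock closes a race between posting paths and connection recovery: work-request posters take the lock shared around ib_post_send() and DMA unmapping, while the connect worker takes it exclusively just long enough to swap in the new cm_id. The two sides, consolidated from the hunks in this patch:

        /* Reader (any posting path): ri_id cannot be torn down under us. */
        read_lock(&ia->ri_qplock);
        rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
        read_unlock(&ia->ri_qplock);

        /* Writer (connect worker): publish the new id, destroy the old. */
        write_lock(&ia->ri_qplock);
        old = ia->ri_id;
        ia->ri_id = id;
        write_unlock(&ia->ri_qplock);
        rdma_destroy_qp(old);
        rdma_destroy_id(old);
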
+
+static void
+rpcrdma_retry_flushed_linv(struct list_head *stale,
+                          struct rpcrdma_buffer *buf)
+{
+       struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+       struct list_head *pos;
+       struct rpcrdma_mw *r;
+       unsigned long flags;
+
+       list_for_each(pos, stale) {
+               r = list_entry(pos, struct rpcrdma_mw, mw_list);
+               rpcrdma_retry_local_inv(r, ia);
+       }
+
+       spin_lock_irqsave(&buf->rb_lock, flags);
+       list_splice_tail(stale, &buf->rb_mws);
+       spin_unlock_irqrestore(&buf->rb_lock, flags);
+}
+
+static struct rpcrdma_req *
+rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
+                        struct list_head *stale)
+{
+       struct rpcrdma_mw *r;
+       int i;
+
+       i = RPCRDMA_MAX_SEGS - 1;
        while (!list_empty(&buf->rb_mws)) {
                r = list_entry(buf->rb_mws.next,
-                       struct rpcrdma_mw, mw_list);
+                              struct rpcrdma_mw, mw_list);
                list_del(&r->mw_list);
-               switch (ia->ri_memreg_strategy) {
-               case RPCRDMA_FRMR:
-                       rc = ib_dereg_mr(r->r.frmr.fr_mr);
-                       if (rc)
-                               dprintk("RPC:       %s:"
-                                       " ib_dereg_mr"
-                                       " failed %i\n",
-                                       __func__, rc);
-                       ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
-                       break;
-               case RPCRDMA_MTHCAFMR:
-                       rc = ib_dealloc_fmr(r->r.fmr);
-                       if (rc)
-                               dprintk("RPC:       %s:"
-                                       " ib_dealloc_fmr"
-                                       " failed %i\n",
-                                       __func__, rc);
-                       break;
-               default:
-                       break;
+               if (r->r.frmr.fr_state == FRMR_IS_STALE) {
+                       list_add(&r->mw_list, stale);
+                       continue;
                }
+               req->rl_segments[i].mr_chunk.rl_mw = r;
+               if (unlikely(i-- == 0))
+                       return req;     /* Success */
        }
 
-       kfree(buf->rb_pool);
+       /* Not enough entries on rb_mws for this req */
+       rpcrdma_buffer_put_sendbuf(req, buf);
+       rpcrdma_buffer_put_mrs(req, buf);
+       return NULL;
+}
+
+static struct rpcrdma_req *
+rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+       struct rpcrdma_mw *r;
+       int i;
+
+       i = RPCRDMA_MAX_SEGS - 1;
+       while (!list_empty(&buf->rb_mws)) {
+               r = list_entry(buf->rb_mws.next,
+                              struct rpcrdma_mw, mw_list);
+               list_del(&r->mw_list);
+               req->rl_segments[i].mr_chunk.rl_mw = r;
+               if (unlikely(i-- == 0))
+                       return req;     /* Success */
+       }
+
+       /* Not enough entries on rb_mws for this req */
+       rpcrdma_buffer_put_sendbuf(req, buf);
+       rpcrdma_buffer_put_mrs(req, buf);
+       return NULL;
 }
 
 /*
@@ -1254,10 +1498,10 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 struct rpcrdma_req *
 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 {
+       struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
+       struct list_head stale;
        struct rpcrdma_req *req;
        unsigned long flags;
-       int i;
-       struct rpcrdma_mw *r;
 
        spin_lock_irqsave(&buffers->rb_lock, flags);
        if (buffers->rb_send_index == buffers->rb_max_requests) {
@@ -1277,16 +1521,21 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
                buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
        }
        buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
-       if (!list_empty(&buffers->rb_mws)) {
-               i = RPCRDMA_MAX_SEGS - 1;
-               do {
-                       r = list_entry(buffers->rb_mws.next,
-                                       struct rpcrdma_mw, mw_list);
-                       list_del(&r->mw_list);
-                       req->rl_segments[i].mr_chunk.rl_mw = r;
-               } while (--i >= 0);
+
+       INIT_LIST_HEAD(&stale);
+       switch (ia->ri_memreg_strategy) {
+       case RPCRDMA_FRMR:
+               req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
+               break;
+       case RPCRDMA_MTHCAFMR:
+               req = rpcrdma_buffer_get_fmrs(req, buffers);
+               break;
+       default:
+               break;
        }
        spin_unlock_irqrestore(&buffers->rb_lock, flags);
+       if (!list_empty(&stale))
+               rpcrdma_retry_flushed_linv(&stale, buffers);
        return req;
 }
 
@@ -1299,34 +1548,14 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
 {
        struct rpcrdma_buffer *buffers = req->rl_buffer;
        struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
-       int i;
        unsigned long flags;
 
        spin_lock_irqsave(&buffers->rb_lock, flags);
-       buffers->rb_send_bufs[--buffers->rb_send_index] = req;
-       req->rl_niovs = 0;
-       if (req->rl_reply) {
-               buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
-               req->rl_reply->rr_func = NULL;
-               req->rl_reply = NULL;
-       }
+       rpcrdma_buffer_put_sendbuf(req, buffers);
        switch (ia->ri_memreg_strategy) {
        case RPCRDMA_FRMR:
        case RPCRDMA_MTHCAFMR:
-               /*
-                * Cycle mw's back in reverse order, and "spin" them.
-                * This delays and scrambles reuse as much as possible.
-                */
-               i = 1;
-               do {
-                       struct rpcrdma_mw **mw;
-                       mw = &req->rl_segments[i].mr_chunk.rl_mw;
-                       list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
-                       *mw = NULL;
-               } while (++i < RPCRDMA_MAX_SEGS);
-               list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
-                                       &buffers->rb_mws);
-               req->rl_segments[0].mr_chunk.rl_mw = NULL;
+               rpcrdma_buffer_put_mrs(req, buffers);
                break;
        default:
                break;
@@ -1388,6 +1617,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
         */
        iov->addr = ib_dma_map_single(ia->ri_id->device,
                        va, len, DMA_BIDIRECTIONAL);
+       if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
+               return -ENOMEM;
+
        iov->length = len;
 
        if (ia->ri_have_dma_lkey) {
@@ -1483,8 +1715,10 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
                        struct rpcrdma_xprt *r_xprt)
 {
        struct rpcrdma_mr_seg *seg1 = seg;
-       struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
-
+       struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw;
+       struct rpcrdma_frmr *frmr = &mw->r.frmr;
+       struct ib_mr *mr = frmr->fr_mr;
+       struct ib_send_wr fastreg_wr, *bad_wr;
        u8 key;
        int len, pageoff;
        int i, rc;
@@ -1502,8 +1736,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
                rpcrdma_map_one(ia, seg, writing);
                pa = seg->mr_dma;
                for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
-                       seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->
-                               page_list[page_no++] = pa;
+                       frmr->fr_pgl->page_list[page_no++] = pa;
                        pa += PAGE_SIZE;
                }
                len += seg->mr_len;
@@ -1515,65 +1748,51 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
                        break;
        }
        dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
-               __func__, seg1->mr_chunk.rl_mw, i);
-
-       if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
-               dprintk("RPC:       %s: frmr %x left valid, posting invalidate.\n",
-                       __func__,
-                       seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
-               /* Invalidate before using. */
-               memset(&invalidate_wr, 0, sizeof invalidate_wr);
-               invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
-               invalidate_wr.next = &frmr_wr;
-               invalidate_wr.opcode = IB_WR_LOCAL_INV;
-               invalidate_wr.send_flags = IB_SEND_SIGNALED;
-               invalidate_wr.ex.invalidate_rkey =
-                       seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
-               DECR_CQCOUNT(&r_xprt->rx_ep);
-               post_wr = &invalidate_wr;
-       } else
-               post_wr = &frmr_wr;
-
-       /* Prepare FRMR WR */
-       memset(&frmr_wr, 0, sizeof frmr_wr);
-       frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
-       frmr_wr.opcode = IB_WR_FAST_REG_MR;
-       frmr_wr.send_flags = IB_SEND_SIGNALED;
-       frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
-       frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
-       frmr_wr.wr.fast_reg.page_list_len = page_no;
-       frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
-       frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
-       if (frmr_wr.wr.fast_reg.length < len) {
-               while (seg1->mr_nsegs--)
-                       rpcrdma_unmap_one(ia, seg++);
-               return -EIO;
+               __func__, mw, i);
+
+       frmr->fr_state = FRMR_IS_VALID;
+
+       memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+       fastreg_wr.wr_id = (unsigned long)(void *)mw;
+       fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+       fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
+       fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
+       fastreg_wr.wr.fast_reg.page_list_len = page_no;
+       fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+       fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
+       if (fastreg_wr.wr.fast_reg.length < len) {
+               rc = -EIO;
+               goto out_err;
        }
 
        /* Bump the key */
-       key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
-       ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
+       key = (u8)(mr->rkey & 0x000000FF);
+       ib_update_fast_reg_key(mr, ++key);
 
-       frmr_wr.wr.fast_reg.access_flags = (writing ?
+       fastreg_wr.wr.fast_reg.access_flags = (writing ?
                                IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
                                IB_ACCESS_REMOTE_READ);
-       frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+       fastreg_wr.wr.fast_reg.rkey = mr->rkey;
        DECR_CQCOUNT(&r_xprt->rx_ep);
 
-       rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
-
+       rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
        if (rc) {
                dprintk("RPC:       %s: failed ib_post_send for register,"
                        " status %i\n", __func__, rc);
-               while (i--)
-                       rpcrdma_unmap_one(ia, --seg);
+               ib_update_fast_reg_key(mr, --key);
+               goto out_err;
        } else {
-               seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+               seg1->mr_rkey = mr->rkey;
                seg1->mr_base = seg1->mr_dma + pageoff;
                seg1->mr_nsegs = i;
                seg1->mr_len = len;
        }
        *nsegs = i;
+       return 0;
+out_err:
+       frmr->fr_state = FRMR_IS_INVALID;
+       while (i--)
+               rpcrdma_unmap_one(ia, --seg);
        return rc;
 }
 
@@ -1585,20 +1804,25 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
        struct ib_send_wr invalidate_wr, *bad_wr;
        int rc;
 
-       while (seg1->mr_nsegs--)
-               rpcrdma_unmap_one(ia, seg++);
+       seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
 
        memset(&invalidate_wr, 0, sizeof invalidate_wr);
        invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
        invalidate_wr.opcode = IB_WR_LOCAL_INV;
-       invalidate_wr.send_flags = IB_SEND_SIGNALED;
        invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
        DECR_CQCOUNT(&r_xprt->rx_ep);
 
+       read_lock(&ia->ri_qplock);
+       while (seg1->mr_nsegs--)
+               rpcrdma_unmap_one(ia, seg++);
        rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
-       if (rc)
+       read_unlock(&ia->ri_qplock);
+       if (rc) {
+               /* Force rpcrdma_buffer_get() to retry */
+               seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
                dprintk("RPC:       %s: failed ib_post_send for invalidate,"
                        " status %i\n", __func__, rc);
+       }
        return rc;
 }
 
@@ -1656,8 +1880,10 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
 
        list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
        rc = ib_unmap_fmr(&l);
+       read_lock(&ia->ri_qplock);
        while (seg1->mr_nsegs--)
                rpcrdma_unmap_one(ia, seg++);
+       read_unlock(&ia->ri_qplock);
        if (rc)
                dprintk("RPC:       %s: failed ib_unmap_fmr,"
                        " status %i\n", __func__, rc);
@@ -1673,7 +1899,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
 
        switch (ia->ri_memreg_strategy) {
 
-#if RPCRDMA_PERSISTENT_REGISTRATION
        case RPCRDMA_ALLPHYSICAL:
                rpcrdma_map_one(ia, seg, writing);
                seg->mr_rkey = ia->ri_bind_mem->rkey;
@@ -1681,7 +1906,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
                seg->mr_nsegs = 1;
                nsegs = 1;
                break;
-#endif
 
        /* Registration using frmr registration */
        case RPCRDMA_FRMR:
@@ -1711,11 +1935,11 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
 
        switch (ia->ri_memreg_strategy) {
 
-#if RPCRDMA_PERSISTENT_REGISTRATION
        case RPCRDMA_ALLPHYSICAL:
+               read_lock(&ia->ri_qplock);
                rpcrdma_unmap_one(ia, seg);
+               read_unlock(&ia->ri_qplock);
                break;
-#endif
 
        case RPCRDMA_FRMR:
                rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
@@ -1809,3 +2033,44 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
                        rc);
        return rc;
 }
+
+/* Physical mapping means one Read/Write list entry per page.
+ * All list entries must fit within an inline buffer.
+ *
+ * NB: The server must return a Write list for NFS READ,
+ *     which has the same constraint. Factor in the inline
+ *     rsize as well.
+ */
+static size_t
+rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+       struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+       unsigned int inline_size, pages;
+
+       inline_size = min_t(unsigned int,
+                           cdata->inline_wsize, cdata->inline_rsize);
+       inline_size -= RPCRDMA_HDRLEN_MIN;
+       pages = inline_size / sizeof(struct rpcrdma_segment);
+       return pages << PAGE_SHIFT;
+}
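
A worked example of that arithmetic, using assumed numbers (the real values depend on the mount parameters and the sizes defined in the protocol headers): with 1024-byte inline buffers, a 28-byte minimal RPC-over-RDMA header, and 16-byte Read/Write list entries on 4 KiB pages:

        /* Assumed values for illustration only. */
        unsigned int inline_size = 1024 - 28;   /* 996 bytes for list entries */
        unsigned int pages = 996 / 16;          /* 62 entries */
        size_t max_payload = 62UL << 12;        /* 253952-byte payload ceiling */
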
+
+static size_t
+rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+       return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
+}
+
+size_t
+rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+       size_t result;
+
+       switch (r_xprt->rx_ia.ri_memreg_strategy) {
+       case RPCRDMA_ALLPHYSICAL:
+               result = rpcrdma_physical_max_payload(r_xprt);
+               break;
+       default:
+               result = rpcrdma_mr_max_payload(r_xprt);
+       }
+       return result;
+}
index 89e7cd479705da640519cfe790fae5fed3f43e63..c419498b8f468a3a1c14bdb1a8f7f1200d32160c 100644 (file)
@@ -59,6 +59,7 @@
  * Interface Adapter -- one per transport instance
  */
 struct rpcrdma_ia {
+       rwlock_t                ri_qplock;
        struct rdma_cm_id       *ri_id;
        struct ib_pd            *ri_pd;
        struct ib_mr            *ri_bind_mem;
@@ -98,6 +99,14 @@ struct rpcrdma_ep {
 #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
 #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
 
+enum rpcrdma_chunktype {
+       rpcrdma_noch = 0,
+       rpcrdma_readch,
+       rpcrdma_areadch,
+       rpcrdma_writech,
+       rpcrdma_replych
+};
+
 /*
  * struct rpcrdma_rep -- this structure encapsulates state required to recv
  * and complete a reply, asynchronously. It needs several pieces of
@@ -136,6 +145,40 @@ struct rpcrdma_rep {
        char    rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
 };
 
+/*
+ * struct rpcrdma_mw - external memory region metadata
+ *
+ * An external memory region is any buffer or page that is registered
+ * on the fly (i.e., not pre-registered).
+ *
+ * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During
+ * call_allocate, rpcrdma_buffer_get() assigns one to each segment in
+ * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep
+ * track of registration metadata while each RPC is pending.
+ * rpcrdma_deregister_external() uses this metadata to unmap and
+ * release these resources when an RPC is complete.
+ */
+enum rpcrdma_frmr_state {
+       FRMR_IS_INVALID,        /* ready to be used */
+       FRMR_IS_VALID,          /* in use */
+       FRMR_IS_STALE,          /* failed completion */
+};
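
These three states form a small lifecycle, consolidated from the transitions scattered through this patch:

        /*
         * FRMR_IS_INVALID --(FAST_REG_MR posted)--------> FRMR_IS_VALID
         * FRMR_IS_VALID   --(LOCAL_INV posted)----------> FRMR_IS_INVALID
         * either state    --(flushed/failed completion)-> FRMR_IS_STALE
         *
         * Stale FRMRs are skipped by rpcrdma_buffer_get_frmrs() and are
         * recovered either by a retried LOCAL_INV or by
         * rpcrdma_reset_frmrs() in the connect worker.
         */
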
+
+struct rpcrdma_frmr {
+       struct ib_fast_reg_page_list    *fr_pgl;
+       struct ib_mr                    *fr_mr;
+       enum rpcrdma_frmr_state         fr_state;
+};
+
+struct rpcrdma_mw {
+       union {
+               struct ib_fmr           *fmr;
+               struct rpcrdma_frmr     frmr;
+       } r;
+       struct list_head        mw_list;
+       struct list_head        mw_all;
+};
+
 /*
  * struct rpcrdma_req -- structure central to the request/reply sequence.
  *
@@ -163,17 +206,7 @@ struct rpcrdma_rep {
 struct rpcrdma_mr_seg {                /* chunk descriptors */
        union {                         /* chunk memory handles */
                struct ib_mr    *rl_mr;         /* if registered directly */
-               struct rpcrdma_mw {             /* if registered from region */
-                       union {
-                               struct ib_fmr   *fmr;
-                               struct {
-                                       struct ib_fast_reg_page_list *fr_pgl;
-                                       struct ib_mr *fr_mr;
-                                       enum { FRMR_IS_INVALID, FRMR_IS_VALID  } state;
-                               } frmr;
-                       } r;
-                       struct list_head mw_list;
-               } *rl_mw;
+               struct rpcrdma_mw *rl_mw;       /* if registered from region */
        } mr_chunk;
        u64             mr_base;        /* registration result */
        u32             mr_rkey;        /* registration result */
@@ -191,6 +224,7 @@ struct rpcrdma_req {
        unsigned int    rl_niovs;       /* 0, 2 or 4 */
        unsigned int    rl_nchunks;     /* non-zero if chunks */
        unsigned int    rl_connect_cookie;      /* retry detection */
+       enum rpcrdma_chunktype  rl_rtype, rl_wtype;
        struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
        struct rpcrdma_rep      *rl_reply;/* holder for reply buffer */
        struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
@@ -214,6 +248,7 @@ struct rpcrdma_buffer {
        atomic_t        rb_credits;     /* most recent server credits */
        int             rb_max_requests;/* client max requests */
        struct list_head rb_mws;        /* optional memory windows/fmrs/frmrs */
+       struct list_head rb_all;
        int             rb_send_index;
        struct rpcrdma_req      **rb_send_bufs;
        int             rb_recv_index;
@@ -306,7 +341,7 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
                                struct rpcrdma_create_data_internal *);
 void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
 int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
-int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
 
 int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
                                struct rpcrdma_req *);
@@ -346,7 +381,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
 /*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
  */
+ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t);
 int rpcrdma_marshal_req(struct rpc_rqst *);
+size_t rpcrdma_max_payload(struct rpcrdma_xprt *);
 
 /* Temporary NFS request map cache. Created in svc_rdma.c  */
 extern struct kmem_cache *svc_rdma_map_cachep;
index be8bbd5d65ec6914f6178816556ce31ee7ef0883..43cd89eacfab2caa79a4db7fd38448f330e12373 100644 (file)
@@ -594,6 +594,7 @@ static int xs_local_send_request(struct rpc_task *task)
        }
 
        switch (status) {
+       case -ENOBUFS:
        case -EAGAIN:
                status = xs_nospace(task);
                break;
@@ -661,6 +662,7 @@ static int xs_udp_send_request(struct rpc_task *task)
                dprintk("RPC:       sendmsg returned unrecognized error %d\n",
                        -status);
        case -ENETUNREACH:
+       case -ENOBUFS:
        case -EPIPE:
        case -ECONNREFUSED:
                /* When the server has died, an ICMP port unreachable message
@@ -758,6 +760,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
                status = -ENOTCONN;
                /* Should we call xs_close() here? */
                break;
+       case -ENOBUFS:
        case -EAGAIN:
                status = xs_nospace(task);
                break;
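
-ENOBUFS from the socket layer signals transient exhaustion of kernel socket buffers. The stream transports (AF_LOCAL and TCP) now treat it like -EAGAIN and park the task in xs_nospace() until write space is available; the UDP path simply drops the datagram and lets the retransmit timeout recover, and the connect paths below retry with the existing socket after a delay. The stream-side shape:

        /* Same pattern in the AF_LOCAL and TCP send paths. */
        switch (status) {
        case -ENOBUFS:          /* transient: no socket buffers */
        case -EAGAIN:           /* transient: send queue full */
                status = xs_nospace(task);
                break;
        /* ... hard errors handled by the remaining cases ... */
        }
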
@@ -1946,6 +1949,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
                dprintk("RPC:       xprt %p connected to %s\n",
                                xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
                xprt_set_connected(xprt);
+       case -ENOBUFS:
                break;
        case -ENOENT:
                dprintk("RPC:       xprt %p: socket %s does not exist\n",
@@ -2281,6 +2285,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
        case -ECONNREFUSED:
        case -ECONNRESET:
        case -ENETUNREACH:
+       case -ENOBUFS:
                /* retry with existing socket, after a delay */
                goto out;
        }
@@ -3054,12 +3059,12 @@ static int param_set_uint_minmax(const char *val,
                const struct kernel_param *kp,
                unsigned int min, unsigned int max)
 {
-       unsigned long num;
+       unsigned int num;
        int ret;
 
        if (!val)
                return -EINVAL;
-       ret = strict_strtoul(val, 0, &num);
+       ret = kstrtouint(val, 0, &num);
        if (ret == -EINVAL || num < min || num > max)
                return -EINVAL;
        *((unsigned int *)kp->arg) = num;
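
kstrtouint() replaces the long-deprecated strict_strtoul() and parses directly into an unsigned int, eliminating the intermediate unsigned long. One subtlety: kstrtouint() returns -ERANGE, not -EINVAL, on overflow, and the test above checks only for -EINVAL, so a fully defensive version would reject any nonzero return:

        /* Defensive sketch: reject every kstrtouint() failure, incl. -ERANGE. */
        unsigned int num;
        int ret = kstrtouint(val, 0, &num);     /* base 0: accepts 0x/0 prefixes */

        if (ret || num < min || num > max)
                return -EINVAL;
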
index 7a43c0c38316c7879815cf42190cd64f51376373..8a431bcb056cecc5414ba30e93c238fce7288df9 100644 (file)
@@ -992,9 +992,9 @@ static int snd_pmac_detect(struct snd_pmac *chip)
                return -ENODEV;
 
        if (!sound) {
-               sound = of_find_node_by_name(NULL, "sound");
-               while (sound && sound->parent != chip->node)
-                       sound = of_find_node_by_name(sound, "sound");
+               for_each_node_by_name(sound, "sound")
+                       if (sound->parent == chip->node)
+                               break;
        }
        if (! sound) {
                of_node_put(chip->node);
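
for_each_node_by_name() wraps the of_find_node_by_name() cursor idiom: each step drops the reference held on the previous node and returns the next with its refcount raised, so breaking out of the loop leaves the caller holding exactly one reference on the match (or NULL if the walk completed). Typical shape, with a hypothetical predicate:

        /* node_matches() is a hypothetical predicate for illustration. */
        struct device_node *np;

        for_each_node_by_name(np, "sound") {
                if (node_matches(np))
                        break;          /* still holding a reference on np */
        }
        /* np is NULL if nothing matched; otherwise of_node_put(np) when done */
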