Merge branch 'akpm' (patches from Andrew)
author     Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 5 Sep 2015 21:27:38 +0000 (14:27 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 5 Sep 2015 21:27:38 +0000 (14:27 -0700)
Merge patch-bomb from Andrew Morton:

 - a few misc things

 - Andy's "ambient capabilities"

 - fs/notify updates

 - the ocfs2 queue

 - kernel/watchdog.c updates and feature work.

 - some of MM.  Includes Andrea's userfaultfd feature.

[ Hadn't noticed that userfaultfd was 'default y' when applying the
  patches, so that got fixed in this merge instead.  We do _not_ mark
  new features that nobody uses yet 'default y'   - Linus ]

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (118 commits)
  mm/hugetlb.c: make vma_has_reserves() return bool
  mm/madvise.c: make madvise_behavior_valid() return bool
  mm/memory.c: make tlb_next_batch() return bool
  mm/dmapool.c: change is_page_busy() return from int to bool
  mm: remove struct node_active_region
  mremap: simplify the "overlap" check in mremap_to()
  mremap: don't do unnecessary checks if new_len == old_len
  mremap: don't do mm_populate(new_addr) on failure
  mm: move ->mremap() from file_operations to vm_operations_struct
  mremap: don't leak new_vma if f_op->mremap() fails
  mm/hugetlb.c: make vma_shareable() return bool
  mm: make GUP handle pfn mapping unless FOLL_GET is requested
  mm: fix status code which move_pages() returns for zero page
  mm: memcontrol: bring back the VM_BUG_ON() in mem_cgroup_swapout()
  genalloc: add support of multiple gen_pools per device
  genalloc: add name arg to gen_pool_get() and devm_gen_pool_create()
  mm/memblock: WARN_ON when nid differs from overlap region
  Documentation/features/vm: add feature description and arch support status for batched TLB flush after unmap
  mm: defer flush of writable TLB entries
  mm: send one IPI per CPU to TLB flush all entries after unmapping pages
  ...

143 files changed:
Documentation/features/vm/TLB/arch-support.txt [new file with mode: 0644]
Documentation/ioctl/ioctl-number.txt
Documentation/vm/userfaultfd.txt [new file with mode: 0644]
arch/arm/mach-at91/pm.c
arch/arm/mach-imx/pm-imx5.c
arch/arm/mach-imx/pm-imx6.c
arch/arm/mach-socfpga/pm.c
arch/sh/mm/init.c
arch/sh/mm/numa.c
arch/x86/Kconfig
arch/x86/entry/syscalls/syscall_32.tbl
arch/x86/entry/syscalls/syscall_64.tbl
arch/x86/include/asm/tlbflush.h
arch/x86/kernel/cpu/perf_event_intel.c
arch/x86/mm/tlb.c
drivers/base/node.c
drivers/media/platform/coda/coda-common.c
drivers/misc/sram.c
drivers/video/console/Kconfig
fs/Makefile
fs/aio.c
fs/ceph/super.c
fs/cifs/cifsfs.c
fs/ext4/super.c
fs/gfs2/super.c
fs/hfs/super.c
fs/hfsplus/options.c
fs/hostfs/hostfs_kern.c
fs/notify/dnotify/dnotify.c
fs/notify/fanotify/fanotify_user.c
fs/notify/fdinfo.c
fs/notify/fsnotify.c
fs/notify/fsnotify.h
fs/notify/inode_mark.c
fs/notify/mark.c
fs/notify/vfsmount_mark.c
fs/ntfs/super.c
fs/ocfs2/acl.c
fs/ocfs2/alloc.c
fs/ocfs2/aops.c
fs/ocfs2/buffer_head_io.c
fs/ocfs2/cluster/heartbeat.c
fs/ocfs2/dir.c
fs/ocfs2/dlm/dlmdomain.c
fs/ocfs2/dlm/dlmmaster.c
fs/ocfs2/dlm/dlmrecovery.c
fs/ocfs2/dlm/dlmthread.c
fs/ocfs2/dlmglue.c
fs/ocfs2/extent_map.c
fs/ocfs2/file.c
fs/ocfs2/inode.c
fs/ocfs2/inode.h
fs/ocfs2/journal.c
fs/ocfs2/localalloc.c
fs/ocfs2/move_extents.c
fs/ocfs2/namei.c
fs/ocfs2/ocfs2.h
fs/ocfs2/quota_local.c
fs/ocfs2/refcounttree.c
fs/ocfs2/suballoc.c
fs/ocfs2/super.c
fs/ocfs2/super.h
fs/ocfs2/xattr.c
fs/overlayfs/super.c
fs/proc/array.c
fs/proc/task_mmu.c
fs/reiserfs/super.c
fs/userfaultfd.c [new file with mode: 0644]
fs/xfs/xfs_super.c
include/linux/cred.h
include/linux/fs.h
include/linux/fsnotify_backend.h
include/linux/genalloc.h
include/linux/kthread.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmzone.h
include/linux/nmi.h
include/linux/rmap.h
include/linux/sched.h
include/linux/seq_file.h
include/linux/slab.h
include/linux/smpboot.h
include/linux/syscalls.h
include/linux/userfaultfd_k.h [new file with mode: 0644]
include/linux/wait.h
include/linux/watchdog.h
include/trace/events/tlb.h
include/uapi/linux/Kbuild
include/uapi/linux/prctl.h
include/uapi/linux/securebits.h
include/uapi/linux/userfaultfd.h [new file with mode: 0644]
init/Kconfig
kernel/cgroup.c
kernel/fork.c
kernel/kthread.c
kernel/sched/wait.c
kernel/smpboot.c
kernel/sys_ni.c
kernel/user_namespace.c
kernel/watchdog.c
lib/genalloc.c
mm/Makefile
mm/dmapool.c
mm/gup.c
mm/huge_memory.c
mm/hugetlb.c
mm/internal.h
mm/madvise.c
mm/memblock.c
mm/memcontrol.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/migrate.c
mm/mlock.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c
mm/rmap.c
mm/slab.c
mm/slab.h
mm/slab_common.c
mm/slob.c
mm/slub.c
mm/userfaultfd.c [new file with mode: 0644]
mm/vmscan.c
net/ceph/ceph_common.c
net/sunrpc/sched.c
scripts/Lindent
scripts/decode_stacktrace.sh
scripts/kernel-doc
scripts/spelling.txt
security/commoncap.c
security/keys/process_keys.c
security/selinux/hooks.c
tools/testing/selftests/capabilities/.gitignore [new file with mode: 0644]
tools/testing/selftests/capabilities/Makefile [new file with mode: 0644]
tools/testing/selftests/capabilities/test_execve.c [new file with mode: 0644]
tools/testing/selftests/capabilities/validate_cap.c [new file with mode: 0644]
tools/testing/selftests/vm/Makefile
tools/testing/selftests/vm/run_vmtests
tools/testing/selftests/vm/userfaultfd.c [new file with mode: 0644]

diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt
new file mode 100644 (file)
index 0000000..261b92e
--- /dev/null
@@ -0,0 +1,40 @@
+#
+# Feature name:          batch-unmap-tlb-flush
+#         Kconfig:       ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+#         description:   arch supports deferral of TLB flush until multiple pages are unmapped
+#
+    -----------------------
+    |         arch |status|
+    -----------------------
+    |       alpha: | TODO |
+    |         arc: | TODO |
+    |         arm: | TODO |
+    |       arm64: | TODO |
+    |       avr32: |  ..  |
+    |    blackfin: | TODO |
+    |         c6x: |  ..  |
+    |        cris: |  ..  |
+    |         frv: |  ..  |
+    |       h8300: |  ..  |
+    |     hexagon: | TODO |
+    |        ia64: | TODO |
+    |        m32r: | TODO |
+    |        m68k: |  ..  |
+    |       metag: | TODO |
+    |  microblaze: |  ..  |
+    |        mips: | TODO |
+    |     mn10300: | TODO |
+    |       nios2: |  ..  |
+    |    openrisc: |  ..  |
+    |      parisc: | TODO |
+    |     powerpc: | TODO |
+    |        s390: | TODO |
+    |       score: |  ..  |
+    |          sh: | TODO |
+    |       sparc: | TODO |
+    |        tile: | TODO |
+    |          um: |  ..  |
+    |   unicore32: |  ..  |
+    |         x86: |  ok  |
+    |      xtensa: | TODO |
+    -----------------------
index 64df08db46577e4525cc20c37f8ddfbc4b4c4217..39ac6546d4a42f5a97f027647aa1fb1147ec6fd2 100644 (file)
@@ -303,6 +303,7 @@ Code  Seq#(hex)     Include File            Comments
 0xA3   80-8F   Port ACL                in development:
                                        <mailto:tlewis@mindspring.com>
 0xA3   90-9F   linux/dtlk.h
+0xAA   00-3F   linux/uapi/linux/userfaultfd.h
 0xAB   00-1F   linux/nbd.h
 0xAC   00-1F   linux/raw.h
 0xAD   00      Netfilter device        in development:
diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt
new file mode 100644 (file)
index 0000000..70a3c94
--- /dev/null
@@ -0,0 +1,144 @@
+= Userfaultfd =
+
+== Objective ==
+
+Userfaults allow the implementation of on-demand paging from userland
+and more generally they allow userland to take control of various
+memory page faults, something otherwise only the kernel code could do.
+
+For example, userfaults allow a proper and more optimal implementation
+of the PROT_NONE+SIGSEGV trick.
+
+== Design ==
+
+Userfaults are delivered and resolved through the userfaultfd syscall.
+
+The userfaultfd (aside from registering and unregistering virtual
+memory ranges) provides two primary functionalities:
+
+1) read/POLLIN protocol to notify a userland thread of the faults
+   happening
+
+2) various UFFDIO_* ioctls that can manage the virtual memory regions
+   registered in the userfaultfd, allowing userland to efficiently
+   resolve the userfaults it receives via 1) or to manage the virtual
+   memory in the background
+
+The real advantage of userfaults compared to regular virtual memory
+management with mremap/mprotect is that userfault operations never
+involve heavyweight structures like vmas (in fact the userfaultfd
+runtime load never takes the mmap_sem for writing).
+
+Vmas are not suitable for page- (or hugepage) granular fault tracking
+when dealing with virtual address spaces that could span
+Terabytes. Too many vmas would be needed for that.
+
+The userfaultfd, once opened by invoking the syscall, can also be
+passed over unix domain sockets to a manager process, so the same
+manager process could handle the userfaults of a multitude of
+different processes without them being aware of what is going on
+(unless, of course, they later try to use the userfaultfd themselves
+on the same region the manager is already tracking, which is a corner
+case that would currently return -EBUSY).
+
+== API ==
+
+When first opened, the userfaultfd must be enabled by invoking the
+UFFDIO_API ioctl with uffdio_api.api set to UFFD_API (or a later API
+version); this specifies the read/POLLIN protocol userland intends to
+speak on the UFFD and the uffdio_api.features userland requires. If
+successful (i.e. if the requested uffdio_api.api is also spoken by the
+running kernel and the requested features are going to be enabled),
+the UFFDIO_API ioctl will return in uffdio_api.features and
+uffdio_api.ioctls two 64bit bitmasks of, respectively, all the
+available features of the read(2) protocol and the generic ioctls
+available.
+
+Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should
+be invoked (if present in the returned uffdio_api.ioctls bitmask) to
+register a memory range in the userfaultfd by setting the
+uffdio_register structure accordingly. The uffdio_register.mode
+bitmask will specify to the kernel which kind of faults to track for
+the range (UFFDIO_REGISTER_MODE_MISSING would track missing
+pages). The UFFDIO_REGISTER ioctl will return the
+uffdio_register.ioctls bitmask of ioctls that are suitable to resolve
+userfaults on the range registered. Not all ioctls will necessarily be
+supported for all memory types depending on the underlying virtual
+memory backend (anonymous memory vs tmpfs vs real filebacked
+mappings).
+
+Userland can use the uffdio_register.ioctls to manage the virtual
+address space in the background (to add or potentially also remove
+memory from the userfaultfd registered range). This means a userfault
+could trigger just before userland maps the user-faulted page in the
+background.
+
+The primary ioctl to resolve userfaults is UFFDIO_COPY. It atomically
+copies a page into the userfault registered range and wakes up the
+blocked userfaults (unless uffdio_copy.mode &
+UFFDIO_COPY_MODE_DONTWAKE is set). Other ioctls work similarly to
+UFFDIO_COPY. They're atomic in the sense that nothing can see a
+half-copied page, since the access will keep userfaulting until the
+copy has finished.
+
+== QEMU/KVM ==
+
+QEMU/KVM is using the userfaultfd syscall to implement postcopy live
+migration. Postcopy live migration is one form of memory
+externalization consisting of a virtual machine running with part or
+all of its memory residing on a different node in the cloud. The
+userfaultfd abstraction is generic enough that not a single line of
+KVM kernel code had to be modified in order to add postcopy live
+migration to QEMU.
+
+Guest async page faults, FOLL_NOWAIT and all other GUP features work
+just fine in combination with userfaults. Userfaults trigger async
+page faults in the guest scheduler so those guest processes that
+aren't waiting for userfaults (i.e. network bound) can keep running in
+the guest vcpus.
+
+It is generally beneficial to run one pass of precopy live migration
+just before starting postcopy live migration, in order to avoid
+generating userfaults for readonly guest regions.
+
+The implementation of postcopy live migration currently uses one
+single bidirectional socket but in the future two different sockets
+will be used (to reduce the latency of the userfaults to the minimum
+possible without having to decrease /proc/sys/net/ipv4/tcp_wmem).
+
+The QEMU in the source node writes all pages that it knows are missing
+in the destination node into the socket, and the migration thread of
+the QEMU running in the destination node runs UFFDIO_COPY|ZEROPAGE
+ioctls on the userfaultfd in order to map the received pages into the
+guest (UFFDIO_ZEROPAGE is used if the source page was a zero page).
+
+A different postcopy thread in the destination node listens with
+poll() to the userfaultfd in parallel. When a POLLIN event is
+generated after a userfault triggers, the postcopy thread read()s from
+the userfaultfd and receives the fault address (or -EAGAIN in case the
+userfault was already resolved and woken by a UFFDIO_COPY|ZEROPAGE run
+by the parallel QEMU migration thread).
+
+After the QEMU postcopy thread (running in the destination node) gets
+the userfault address, it writes the information about the missing page
+into the socket. The QEMU source node receives the information and
+roughly "seeks" to that page address and continues sending all
+remaining missing pages from that new page offset. Soon after that
+(just the time to flush the tcp_wmem queue through the network) the
+migration thread in the QEMU running in the destination node will
+receive the page that triggered the userfault and it'll map it as
+usual with the UFFDIO_COPY|ZEROPAGE (without actually knowing if it
+was spontaneously sent by the source or if it was an urgent page
+requested through a userfault).
+
+By the time the userfaults start, the QEMU in the destination node
+doesn't need to keep any per-page state bitmap relative to the live
+migration around and a single per-page bitmap has to be maintained in
+the QEMU running in the source node to know which pages are still
+missing in the destination node. The bitmap in the source node is
+checked to find which missing pages to send in round robin, and we seek
+over it when receiving incoming userfaults. After sending each page the
+bitmap is of course updated accordingly. The bitmap is also useful to
+avoid sending the same page twice (in case the userfault is read by the
+postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration
+thread).
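
Not part of the merged patches: the following is a minimal, self-contained
userland sketch of the register/poll/read/UFFDIO_COPY flow described in the
documentation above. It assumes the uAPI introduced by this series in
include/uapi/linux/userfaultfd.h and a libc/kernel-headers combination that
exposes __NR_userfaultfd; the toucher() helper and the 0xaa fill pattern are
purely illustrative.

/*
 * Illustrative only: register an anonymous mapping with a userfaultfd,
 * let a second thread fault on it, and resolve the fault with
 * UFFDIO_COPY following the read/POLLIN protocol described above.
 */
#include <fcntl.h>
#include <poll.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static char *area;

static void *toucher(void *arg)
{
        /* First access to the still-missing page: blocks in the kernel
         * until the monitor resolves the userfault with UFFDIO_COPY. */
        return (void *)(long)area[0];
}

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
        struct uffdio_api api = { .api = UFFD_API };
        struct uffdio_register reg = { .mode = UFFDIO_REGISTER_MODE_MISSING };
        struct pollfd pfd = { .fd = uffd, .events = POLLIN };
        struct uffd_msg msg;
        pthread_t thr;
        char *src;

        area = mmap(NULL, page, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        src = mmap(NULL, page, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        memset(src, 0xaa, page);        /* contents to inject on fault */

        if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api))
                return 1;

        reg.range.start = (unsigned long)area;
        reg.range.len = page;
        if (ioctl(uffd, UFFDIO_REGISTER, &reg))
                return 1;

        pthread_create(&thr, NULL, toucher, NULL);

        /* read/POLLIN protocol: wait for the fault notification... */
        poll(&pfd, 1, -1);
        if (read(uffd, &msg, sizeof(msg)) == sizeof(msg) &&
            msg.event == UFFD_EVENT_PAGEFAULT) {
                /* ...and resolve it atomically.  Leaving
                 * UFFDIO_COPY_MODE_DONTWAKE clear also wakes the toucher. */
                struct uffdio_copy copy = {
                        .dst = msg.arg.pagefault.address &
                               ~((unsigned long)page - 1),
                        .src = (unsigned long)src,
                        .len = page,
                };
                ioctl(uffd, UFFDIO_COPY, &copy);
        }

        pthread_join(thr, NULL);
        printf("faulted byte resolved to 0x%02x\n", (unsigned char)area[0]);
        return 0;
}

Build with gcc -pthread; on success the program prints the 0xaa byte injected
by UFFDIO_COPY instead of blocking on the missing page.
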
index 265ffeb2037ec327a731ae81d9812e40c97cc6d8..80e277cfcc8b6965d954473d877b66f91cdec52c 100644 (file)
@@ -369,7 +369,7 @@ static void __init at91_pm_sram_init(void)
                return;
        }
 
-       sram_pool = gen_pool_get(&pdev->dev);
+       sram_pool = gen_pool_get(&pdev->dev, NULL);
        if (!sram_pool) {
                pr_warn("%s: sram pool unavailable!\n", __func__);
                return;
index 1885676c23c08238d3ebb193de6239daf84a3e89..532d4b08276dc84c149b525e6fcc0a85d30a543b 100644 (file)
@@ -297,7 +297,7 @@ static int __init imx_suspend_alloc_ocram(
                goto put_node;
        }
 
-       ocram_pool = gen_pool_get(&pdev->dev);
+       ocram_pool = gen_pool_get(&pdev->dev, NULL);
        if (!ocram_pool) {
                pr_warn("%s: ocram pool unavailable!\n", __func__);
                ret = -ENODEV;
index 93ecf559d06d64215a6de790e79624e14f59f4f5..8ff8fc0b261ccd7a6d6912b4478e60b15606c097 100644 (file)
@@ -451,7 +451,7 @@ static int __init imx6q_suspend_init(const struct imx6_pm_socdata *socdata)
                goto put_node;
        }
 
-       ocram_pool = gen_pool_get(&pdev->dev);
+       ocram_pool = gen_pool_get(&pdev->dev, NULL);
        if (!ocram_pool) {
                pr_warn("%s: ocram pool unavailable!\n", __func__);
                ret = -ENODEV;
index 6a4199f2bffb8b36248d0993bf995e82cc1bc991..c378ab0c24317ccfd273e2a08a06e3f698e89b23 100644 (file)
@@ -56,7 +56,7 @@ static int socfpga_setup_ocram_self_refresh(void)
                goto put_node;
        }
 
-       ocram_pool = gen_pool_get(&pdev->dev);
+       ocram_pool = gen_pool_get(&pdev->dev, NULL);
        if (!ocram_pool) {
                pr_warn("%s: ocram pool unavailable!\n", __func__);
                ret = -ENODEV;
index 2790b6a64157f79663fe5232afe9f857e6d81cb7..17f486233db03c4d6d14d4c283b2699a73b60caa 100644 (file)
@@ -488,7 +488,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 int arch_add_memory(int nid, u64 start, u64 size)
 {
        pg_data_t *pgdat;
-       unsigned long start_pfn = start >> PAGE_SHIFT;
+       unsigned long start_pfn = PFN_DOWN(start);
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;
 
@@ -517,7 +517,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #ifdef CONFIG_MEMORY_HOTREMOVE
 int arch_remove_memory(u64 start, u64 size)
 {
-       unsigned long start_pfn = start >> PAGE_SHIFT;
+       unsigned long start_pfn = PFN_DOWN(start);
        unsigned long nr_pages = size >> PAGE_SHIFT;
        struct zone *zone;
        int ret;
index bce52ba66206f6a3cd5c958239ec0cd8ba98d7aa..05713d190247c6842907a35c8f72b856ab1ce2dd 100644 (file)
@@ -33,8 +33,8 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
        /* Don't allow bogus node assignment */
        BUG_ON(nid >= MAX_NUMNODES || nid <= 0);
 
-       start_pfn = start >> PAGE_SHIFT;
-       end_pfn = end >> PAGE_SHIFT;
+       start_pfn = PFN_DOWN(start);
+       end_pfn = PFN_DOWN(end);
 
        pmb_bolt_mapping((unsigned long)__va(start), start, end - start,
                         PAGE_KERNEL);
index 48f7433dac6f5ac4e76b6a8841b3db0d45c5f19d..117e2f373e50d40cee118dcab42a0e86a617b2ee 100644 (file)
@@ -41,6 +41,7 @@ config X86
        select ARCH_USE_CMPXCHG_LOCKREF         if X86_64
        select ARCH_USE_QUEUED_RWLOCKS
        select ARCH_USE_QUEUED_SPINLOCKS
+       select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
        select ARCH_WANTS_DYNAMIC_TASK_STRUCT
        select ARCH_WANT_FRAME_POINTERS
        select ARCH_WANT_IPC_PARSE_VERSION      if X86_32
index 25e3cf1cd8fdeb8cf8726b9b985ec42fa9f3a14e..477bfa6db370783294e858210a8064b295da3082 100644 (file)
 371    i386    recvfrom                sys_recvfrom                    compat_sys_recvfrom
 372    i386    recvmsg                 sys_recvmsg                     compat_sys_recvmsg
 373    i386    shutdown                sys_shutdown
+374    i386    userfaultfd             sys_userfaultfd
index 9ef32d5f1b19e67ed10b69c67be5f53806c19ffa..81c490634db994ba810984f8f6dab052a54c8139 100644 (file)
 320    common  kexec_file_load         sys_kexec_file_load
 321    common  bpf                     sys_bpf
 322    64      execveat                stub_execveat
+323    common  userfaultfd             sys_userfaultfd
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
index cd791948b286a13a7c5cf35e71662cb8066d697a..6df2029405a3ae55df8b9718dd320b55dde5c1ad 100644 (file)
@@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void)
 
 #endif /* SMP */
 
+/* Not inlined due to inc_irq_stat not being defined yet */
+#define flush_tlb_local() {            \
+       inc_irq_stat(irq_tlb_count);    \
+       local_flush_tlb();              \
+}
+
 #ifndef CONFIG_PARAVIRT
 #define flush_tlb_others(mask, mm, start, end) \
        native_flush_tlb_others(mask, mm, start, end)
index 3f124d553c5aaa2fef8baf4f03f609a65bb5f191..cd9b6d0b10bf408d04956e45c1a2d77bd3f99b07 100644 (file)
@@ -12,7 +12,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/export.h>
-#include <linux/watchdog.h>
+#include <linux/nmi.h>
 
 #include <asm/cpufeature.h>
 #include <asm/hardirq.h>
@@ -3627,7 +3627,10 @@ static __init int fixup_ht_bug(void)
                return 0;
        }
 
-       watchdog_nmi_disable_all();
+       if (lockup_detector_suspend() != 0) {
+               pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
+               return 0;
+       }
 
        x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
 
@@ -3635,7 +3638,7 @@ static __init int fixup_ht_bug(void)
        x86_pmu.commit_scheduling = NULL;
        x86_pmu.stop_scheduling = NULL;
 
-       watchdog_nmi_enable_all();
+       lockup_detector_resume();
 
        get_online_cpus();
 
index 90b924acd9822ffdd9409b9fd97325ea7954b97a..8ddb5d0d66fb6f6353e735760489a47a0b98dea7 100644 (file)
@@ -140,6 +140,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
        info.flush_end = end;
 
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+       trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start);
        if (is_uv_system()) {
                unsigned int cpu;
 
index 31df474d72f4a275ba0c87fb79a26c538fc661ca..560751bad2947105a0d06e7650bf906c3e4f0de7 100644 (file)
@@ -392,6 +392,16 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid)
        for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
                int page_nid;
 
+               /*
+                * memory block could have several absent sections from start.
+                * skip pfn range from absent section
+                */
+               if (!pfn_present(pfn)) {
+                       pfn = round_down(pfn + PAGES_PER_SECTION,
+                                        PAGES_PER_SECTION) - 1;
+                       continue;
+               }
+
                page_nid = get_nid_for_pfn(pfn);
                if (page_nid < 0)
                        continue;
index 58f65486de332cd893d6175ad98ca405c999a221..284ac4c934ba9c1d43e212f23afe0a24574f760b 100644 (file)
@@ -2157,7 +2157,7 @@ static int coda_probe(struct platform_device *pdev)
        /* Get IRAM pool from device tree or platform data */
        pool = of_gen_pool_get(np, "iram", 0);
        if (!pool && pdata)
-               pool = gen_pool_get(pdata->iram_dev);
+               pool = gen_pool_get(pdata->iram_dev, NULL);
        if (!pool) {
                dev_err(&pdev->dev, "iram pool not available\n");
                return -ENOMEM;
index 15c33cc34a802fd23f93846da023eeff85261ce4..431e1dd528bcb8dd0d662eec74bdc865d663dd4c 100644 (file)
@@ -186,10 +186,10 @@ static int sram_probe(struct platform_device *pdev)
        if (IS_ERR(sram->virt_base))
                return PTR_ERR(sram->virt_base);
 
-       sram->pool = devm_gen_pool_create(sram->dev,
-                                         ilog2(SRAM_GRANULARITY), -1);
-       if (!sram->pool)
-               return -ENOMEM;
+       sram->pool = devm_gen_pool_create(sram->dev, ilog2(SRAM_GRANULARITY),
+                                         NUMA_NO_NODE, NULL);
+       if (IS_ERR(sram->pool))
+               return PTR_ERR(sram->pool);
 
        ret = sram_reserve_regions(sram, res);
        if (ret)
index ba97efc3bf707db82924300b68b684fe4f21f243..071280643db75f3f48dafc2dcbe6a385d7ac53b8 100644 (file)
@@ -9,7 +9,7 @@ config VGA_CONSOLE
        depends on !4xx && !8xx && !SPARC && !M68K && !PARISC && !FRV && \
                !SUPERH && !BLACKFIN && !AVR32 && !MN10300 && !CRIS && \
                (!ARM || ARCH_FOOTBRIDGE || ARCH_INTEGRATOR || ARCH_NETWINDER) && \
-               !ARM64
+               !ARM64 && !ARC
        default y
        help
          Saying Y here will allow you to use Linux in text mode through a
index 09e051fefc5b4ed5b6759c9e191c2cca4cd496e7..f79cf4043e60d9c854adfce1634164e1a0fef53f 100644 (file)
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES)     += anon_inodes.o
 obj-$(CONFIG_SIGNALFD)         += signalfd.o
 obj-$(CONFIG_TIMERFD)          += timerfd.o
 obj-$(CONFIG_EVENTFD)          += eventfd.o
+obj-$(CONFIG_USERFAULTFD)      += userfaultfd.o
 obj-$(CONFIG_AIO)               += aio.o
 obj-$(CONFIG_FS_DAX)           += dax.o
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
index 480440f4701fb8c546d9e39c640295cad4224b46..155f84253f331a4d9d13ffac3d1ea70322b09b11 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -308,15 +308,9 @@ static void aio_free_ring(struct kioctx *ctx)
        }
 }
 
-static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
-{
-       vma->vm_flags |= VM_DONTEXPAND;
-       vma->vm_ops = &generic_file_vm_ops;
-       return 0;
-}
-
-static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_mremap(struct vm_area_struct *vma)
 {
+       struct file *file = vma->vm_file;
        struct mm_struct *mm = vma->vm_mm;
        struct kioctx_table *table;
        int i, res = -EINVAL;
@@ -342,9 +336,24 @@ static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
        return res;
 }
 
+static const struct vm_operations_struct aio_ring_vm_ops = {
+       .mremap         = aio_ring_mremap,
+#if IS_ENABLED(CONFIG_MMU)
+       .fault          = filemap_fault,
+       .map_pages      = filemap_map_pages,
+       .page_mkwrite   = filemap_page_mkwrite,
+#endif
+};
+
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       vma->vm_flags |= VM_DONTEXPAND;
+       vma->vm_ops = &aio_ring_vm_ops;
+       return 0;
+}
+
 static const struct file_operations aio_ring_fops = {
        .mmap = aio_ring_mmap,
-       .mremap = aio_ring_remap,
 };
 
 #if IS_ENABLED(CONFIG_MIGRATION)
index d1c833c321b92eff48d9f35bf7171ef0ac59e7bf..7b6bfcbf801cac7bf5c54f4543809c1bb6c76d87 100644 (file)
@@ -479,7 +479,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
        if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
                seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
        if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
-               seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+               seq_show_option(m, "snapdirname", fsopt->snapdir_name);
 
        return 0;
 }
index 0a9fb6b53126a7c95715a862bfb3b067f443fc1a..6a1119e87fbb6fb636e4d76e814574402b7dc139 100644 (file)
@@ -394,17 +394,17 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
        struct sockaddr *srcaddr;
        srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
 
-       seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
+       seq_show_option(s, "vers", tcon->ses->server->vals->version_string);
        cifs_show_security(s, tcon->ses);
        cifs_show_cache_flavor(s, cifs_sb);
 
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
                seq_puts(s, ",multiuser");
        else if (tcon->ses->user_name)
-               seq_printf(s, ",username=%s", tcon->ses->user_name);
+               seq_show_option(s, "username", tcon->ses->user_name);
 
        if (tcon->ses->domainName)
-               seq_printf(s, ",domain=%s", tcon->ses->domainName);
+               seq_show_option(s, "domain", tcon->ses->domainName);
 
        if (srcaddr->sa_family != AF_UNSPEC) {
                struct sockaddr_in *saddr4;
index ee3878262a495cfa57d20c4fbafac7b73b2e743a..a63c7b0a10cfca3b3075f4dc14435add1bcec91b 100644 (file)
@@ -1776,10 +1776,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
        }
 
        if (sbi->s_qf_names[USRQUOTA])
-               seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+               seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]);
 
        if (sbi->s_qf_names[GRPQUOTA])
-               seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+               seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]);
 #endif
 }
 
index 2982445947e174a5bd0f7e6ebaa3292c1302c7ac..894fb01a91dab74be395685ecc5ab08f58533d37 100644 (file)
@@ -1334,11 +1334,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
        if (is_ancestor(root, sdp->sd_master_dir))
                seq_puts(s, ",meta");
        if (args->ar_lockproto[0])
-               seq_printf(s, ",lockproto=%s", args->ar_lockproto);
+               seq_show_option(s, "lockproto", args->ar_lockproto);
        if (args->ar_locktable[0])
-               seq_printf(s, ",locktable=%s", args->ar_locktable);
+               seq_show_option(s, "locktable", args->ar_locktable);
        if (args->ar_hostdata[0])
-               seq_printf(s, ",hostdata=%s", args->ar_hostdata);
+               seq_show_option(s, "hostdata", args->ar_hostdata);
        if (args->ar_spectator)
                seq_puts(s, ",spectator");
        if (args->ar_localflocks)
index 55c03b9e90708e1230210c271779350ca3c72cc7..4574fdd3d4219f86aa779f0012cee42252a7ea10 100644 (file)
@@ -136,9 +136,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root)
        struct hfs_sb_info *sbi = HFS_SB(root->d_sb);
 
        if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f))
-               seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator);
+               seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4);
        if (sbi->s_type != cpu_to_be32(0x3f3f3f3f))
-               seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type);
+               seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4);
        seq_printf(seq, ",uid=%u,gid=%u",
                        from_kuid_munged(&init_user_ns, sbi->s_uid),
                        from_kgid_munged(&init_user_ns, sbi->s_gid));
index c90b72ee676d8a022dd47b8577be6f6a40967425..bb806e58c9770ec5491235bb8ac5fcdcd2e5574b 100644 (file)
@@ -218,9 +218,9 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
        struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb);
 
        if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
-               seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
+               seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4);
        if (sbi->type != HFSPLUS_DEF_CR_TYPE)
-               seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
+               seq_show_option_n(seq, "type", (char *)&sbi->type, 4);
        seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
                        from_kuid_munged(&init_user_ns, sbi->uid),
                        from_kgid_munged(&init_user_ns, sbi->gid));
index 059597b23f677b0959d8264b83cf4c4a2cec34b7..2ac99db3750ef7b2d2bf3e9ea9e90e69320a0d83 100644 (file)
@@ -260,7 +260,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
        size_t offset = strlen(root_ino) + 1;
 
        if (strlen(root_path) > offset)
-               seq_printf(seq, ",%s", root_path + offset);
+               seq_show_option(seq, root_path + offset, NULL);
 
        if (append)
                seq_puts(seq, ",append");
index 44523f4a608414187d24aa86204bf6d4b92c77f9..6faaf710e563ee184e20204f80c63c2157cbb186 100644 (file)
@@ -154,6 +154,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
        struct dnotify_struct *dn;
        struct dnotify_struct **prev;
        struct inode *inode;
+       bool free = false;
 
        inode = file_inode(filp);
        if (!S_ISDIR(inode->i_mode))
@@ -182,11 +183,15 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 
        /* nothing else could have found us thanks to the dnotify_groups
           mark_mutex */
-       if (dn_mark->dn == NULL)
-               fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
+       if (dn_mark->dn == NULL) {
+               fsnotify_detach_mark(fsn_mark);
+               free = true;
+       }
 
        mutex_unlock(&dnotify_group->mark_mutex);
 
+       if (free)
+               fsnotify_free_mark(fsn_mark);
        fsnotify_put_mark(fsn_mark);
 }
 
@@ -362,9 +367,10 @@ out:
        spin_unlock(&fsn_mark->lock);
 
        if (destroy)
-               fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
-
+               fsnotify_detach_mark(fsn_mark);
        mutex_unlock(&dnotify_group->mark_mutex);
+       if (destroy)
+               fsnotify_free_mark(fsn_mark);
        fsnotify_put_mark(fsn_mark);
 out_err:
        if (new_fsn_mark)
index cf275500a6658eb13cc5990ea508571b55592197..8e8e6bcd1d43d266346bac16dbb12ff8c893bae2 100644 (file)
@@ -529,8 +529,10 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
        removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
                                                 &destroy_mark);
        if (destroy_mark)
-               fsnotify_destroy_mark_locked(fsn_mark, group);
+               fsnotify_detach_mark(fsn_mark);
        mutex_unlock(&group->mark_mutex);
+       if (destroy_mark)
+               fsnotify_free_mark(fsn_mark);
 
        fsnotify_put_mark(fsn_mark);
        if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -557,8 +559,10 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
        removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
                                                 &destroy_mark);
        if (destroy_mark)
-               fsnotify_destroy_mark_locked(fsn_mark, group);
+               fsnotify_detach_mark(fsn_mark);
        mutex_unlock(&group->mark_mutex);
+       if (destroy_mark)
+               fsnotify_free_mark(fsn_mark);
 
        /* matches the fsnotify_find_inode_mark() */
        fsnotify_put_mark(fsn_mark);
index 58b7cdb63da9f2e6e9fc4fa095ceb9e762b4ffb9..6b6f0d472ae816e3cd726796caccf346b86bc461 100644 (file)
@@ -76,7 +76,8 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
        struct inotify_inode_mark *inode_mark;
        struct inode *inode;
 
-       if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE)))
+       if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) ||
+           !(mark->flags & FSNOTIFY_MARK_FLAG_INODE))
                return;
 
        inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
index dd3fb0b17be7cc5d914275c6e83da49e7b8e3f48..db39de2dd4cbc8b0e4e962e5a874a6e5b5777bc7 100644 (file)
@@ -26,7 +26,6 @@
 
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
-#include "../mount.h"
 
 /*
  * Clear all of the marks on an inode when it is being evicted from core
@@ -204,6 +203,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
        else
                mnt = NULL;
 
+       /*
+        * Optimization: srcu_read_lock() has a memory barrier which can
+        * be expensive.  It protects walking the *_fsnotify_marks lists.
+        * However, if we do not walk the lists, we do not have to do
+        * SRCU because we have no references to any objects and do not
+        * need SRCU to keep them "alive".
+        */
+       if (hlist_empty(&to_tell->i_fsnotify_marks) &&
+           (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks)))
+               return 0;
        /*
         * if this is a modify event we may need to clear the ignored masks
         * otherwise return if neither the inode nor the vfsmount care about
index 13a00be516d250bc1328697060c8f73ddb688827..b44c68a857e7760743fa74aa0383258b0b6f8e4a 100644 (file)
@@ -6,6 +6,8 @@
 #include <linux/srcu.h>
 #include <linux/types.h>
 
+#include "../mount.h"
+
 /* destroy all events sitting in this groups notification queue */
 extern void fsnotify_flush_notify(struct fsnotify_group *group);
 
@@ -38,15 +40,22 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
 /* inode specific destruction of a mark */
 extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
-/* Destroy all marks in the given list */
-extern void fsnotify_destroy_marks(struct list_head *to_free);
 /* Find mark belonging to given group in the list of marks */
 extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
                                                struct fsnotify_group *group);
-/* run the list of all marks associated with inode and flag them to be freed */
-extern void fsnotify_clear_marks_by_inode(struct inode *inode);
-/* run the list of all marks associated with vfsmount and flag them to be freed */
-extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt);
+/* Destroy all marks in the given list protected by 'lock' */
+extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock);
+/* run the list of all marks associated with inode and destroy them */
+static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
+{
+       fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock);
+}
+/* run the list of all marks associated with vfsmount and destroy them */
+static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
+{
+       fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
+                              &mnt->mnt_root->d_lock);
+}
 /*
  * update the dentry->d_flags of all of inode's children to indicate if inode cares
  * about events that happen to its children.
index 3daf513ee99e6ccf21ce01bfed71f85a21b5717e..474a3ce1b5e104ccf5c73f6acbfb1e044e147d1d 100644 (file)
@@ -64,26 +64,6 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
        spin_unlock(&inode->i_lock);
 }
 
-/*
- * Given an inode, destroy all of the marks associated with that inode.
- */
-void fsnotify_clear_marks_by_inode(struct inode *inode)
-{
-       struct fsnotify_mark *mark;
-       struct hlist_node *n;
-       LIST_HEAD(free_list);
-
-       spin_lock(&inode->i_lock);
-       hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) {
-               list_add(&mark->free_list, &free_list);
-               hlist_del_init_rcu(&mark->obj_list);
-               fsnotify_get_mark(mark);
-       }
-       spin_unlock(&inode->i_lock);
-
-       fsnotify_destroy_marks(&free_list);
-}
-
 /*
  * Given a group clear all of the inode marks associated with that group.
  */
index 39ddcaf0918f145fb3f2cb916d27aa1b866a220e..fc0df4442f7b45bb7c93983669a20848431a11c3 100644 (file)
@@ -122,26 +122,27 @@ u32 fsnotify_recalc_mask(struct hlist_head *head)
 }
 
 /*
- * Any time a mark is getting freed we end up here.
- * The caller had better be holding a reference to this mark so we don't actually
- * do the final put under the mark->lock
+ * Remove mark from inode / vfsmount list, group list, drop inode reference
+ * if we got one.
+ *
+ * Must be called with group->mark_mutex held.
  */
-void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
-                                 struct fsnotify_group *group)
+void fsnotify_detach_mark(struct fsnotify_mark *mark)
 {
        struct inode *inode = NULL;
+       struct fsnotify_group *group = mark->group;
 
        BUG_ON(!mutex_is_locked(&group->mark_mutex));
 
        spin_lock(&mark->lock);
 
        /* something else already called this function on this mark */
-       if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
+       if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
                spin_unlock(&mark->lock);
                return;
        }
 
-       mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+       mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
 
        if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
                inode = mark->inode;
@@ -150,6 +151,12 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
                fsnotify_destroy_vfsmount_mark(mark);
        else
                BUG();
+       /*
+        * Note that we didn't update flags telling whether inode cares about
+        * what's happening with children. We update these flags from
+        * __fsnotify_parent() lazily when next event happens on one of our
+        * children.
+        */
 
        list_del_init(&mark->g_list);
 
@@ -157,18 +164,32 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
 
        if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
                iput(inode);
-       /* release lock temporarily */
-       mutex_unlock(&group->mark_mutex);
+
+       atomic_dec(&group->num_marks);
+}
+
+/*
+ * Free fsnotify mark. The freeing is actually happening from a kthread which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+       struct fsnotify_group *group = mark->group;
+
+       spin_lock(&mark->lock);
+       /* something else already called this function on this mark */
+       if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
+               spin_unlock(&mark->lock);
+               return;
+       }
+       mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+       spin_unlock(&mark->lock);
 
        spin_lock(&destroy_lock);
        list_add(&mark->g_list, &destroy_list);
        spin_unlock(&destroy_lock);
        wake_up(&destroy_waitq);
-       /*
-        * We don't necessarily have a ref on mark from caller so the above destroy
-        * may have actually freed it, unless this group provides a 'freeing_mark'
-        * function which must be holding a reference.
-        */
 
        /*
         * Some groups like to know that marks are being freed.  This is a
@@ -177,50 +198,45 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
         */
        if (group->ops->freeing_mark)
                group->ops->freeing_mark(mark, group);
-
-       /*
-        * __fsnotify_update_child_dentry_flags(inode);
-        *
-        * I really want to call that, but we can't, we have no idea if the inode
-        * still exists the second we drop the mark->lock.
-        *
-        * The next time an event arrive to this inode from one of it's children
-        * __fsnotify_parent will see that the inode doesn't care about it's
-        * children and will update all of these flags then.  So really this
-        * is just a lazy update (and could be a perf win...)
-        */
-
-       atomic_dec(&group->num_marks);
-
-       mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
 }
 
 void fsnotify_destroy_mark(struct fsnotify_mark *mark,
                           struct fsnotify_group *group)
 {
        mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
-       fsnotify_destroy_mark_locked(mark, group);
+       fsnotify_detach_mark(mark);
        mutex_unlock(&group->mark_mutex);
+       fsnotify_free_mark(mark);
 }
 
-/*
- * Destroy all marks in the given list. The marks must be already detached from
- * the original inode / vfsmount.
- */
-void fsnotify_destroy_marks(struct list_head *to_free)
+void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
 {
-       struct fsnotify_mark *mark, *lmark;
-       struct fsnotify_group *group;
-
-       list_for_each_entry_safe(mark, lmark, to_free, free_list) {
-               spin_lock(&mark->lock);
-               fsnotify_get_group(mark->group);
-               group = mark->group;
-               spin_unlock(&mark->lock);
+       struct fsnotify_mark *mark;
 
-               fsnotify_destroy_mark(mark, group);
+       while (1) {
+               /*
+                * We have to be careful since we can race with e.g.
+                * fsnotify_clear_marks_by_group() and once we drop 'lock',
+                * mark can get removed from the obj_list and destroyed. But
+                * we are holding mark reference so mark cannot be freed and
+                * calling fsnotify_destroy_mark() more than once is fine.
+                */
+               spin_lock(lock);
+               if (hlist_empty(head)) {
+                       spin_unlock(lock);
+                       break;
+               }
+               mark = hlist_entry(head->first, struct fsnotify_mark, obj_list);
+               /*
+                * We don't update i_fsnotify_mask / mnt_fsnotify_mask here
+                * since inode / mount is going away anyway. So just remove
+                * mark from the list.
+                */
+               hlist_del_init_rcu(&mark->obj_list);
+               fsnotify_get_mark(mark);
+               spin_unlock(lock);
+               fsnotify_destroy_mark(mark, mark->group);
                fsnotify_put_mark(mark);
-               fsnotify_put_group(group);
        }
 }
 
@@ -332,7 +348,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
         * inode->i_lock
         */
        spin_lock(&mark->lock);
-       mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
+       mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
 
        fsnotify_get_group(group);
        mark->group = group;
@@ -438,8 +454,9 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
                }
                mark = list_first_entry(&to_free, struct fsnotify_mark, g_list);
                fsnotify_get_mark(mark);
-               fsnotify_destroy_mark_locked(mark, group);
+               fsnotify_detach_mark(mark);
                mutex_unlock(&group->mark_mutex);
+               fsnotify_free_mark(mark);
                fsnotify_put_mark(mark);
        }
 }
index 326b148e623cdf26d30935a985ab398033cb0950..a8fcab68faef1cdc826103d095b5b387895dfb20 100644 (file)
 
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
-#include "../mount.h"
-
-void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
-{
-       struct fsnotify_mark *mark;
-       struct hlist_node *n;
-       struct mount *m = real_mount(mnt);
-       LIST_HEAD(free_list);
-
-       spin_lock(&mnt->mnt_root->d_lock);
-       hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) {
-               list_add(&mark->free_list, &free_list);
-               hlist_del_init_rcu(&mark->obj_list);
-               fsnotify_get_mark(mark);
-       }
-       spin_unlock(&mnt->mnt_root->d_lock);
-
-       fsnotify_destroy_marks(&free_list);
-}
 
 void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
 {
index c1128bcbeb5edb011035e7790bf85f77318f9a7b..d1a853585b539012a26cd55aca22307fa759ce87 100644 (file)
@@ -2204,17 +2204,12 @@ get_ctx_vol_failed:
        return true;
 #ifdef NTFS_RW
 iput_usnjrnl_err_out:
-       if (vol->usnjrnl_j_ino)
-               iput(vol->usnjrnl_j_ino);
-       if (vol->usnjrnl_max_ino)
-               iput(vol->usnjrnl_max_ino);
-       if (vol->usnjrnl_ino)
-               iput(vol->usnjrnl_ino);
+       iput(vol->usnjrnl_j_ino);
+       iput(vol->usnjrnl_max_ino);
+       iput(vol->usnjrnl_ino);
 iput_quota_err_out:
-       if (vol->quota_q_ino)
-               iput(vol->quota_q_ino);
-       if (vol->quota_ino)
-               iput(vol->quota_ino);
+       iput(vol->quota_q_ino);
+       iput(vol->quota_ino);
        iput(vol->extend_ino);
 #endif /* NTFS_RW */
 iput_sec_err_out:
@@ -2223,8 +2218,7 @@ iput_root_err_out:
        iput(vol->root_ino);
 iput_logfile_err_out:
 #ifdef NTFS_RW
-       if (vol->logfile_ino)
-               iput(vol->logfile_ino);
+       iput(vol->logfile_ino);
 iput_vol_err_out:
 #endif /* NTFS_RW */
        iput(vol->vol_ino);
@@ -2254,8 +2248,7 @@ iput_mftbmp_err_out:
        iput(vol->mftbmp_ino);
 iput_mirr_err_out:
 #ifdef NTFS_RW
-       if (vol->mftmirr_ino)
-               iput(vol->mftmirr_ino);
+       iput(vol->mftmirr_ino);
 #endif /* NTFS_RW */
        return false;
 }
index c58a1bcfda0fdfa83f2a42169eb308580c759def..0cdf497c91efbb915512aceed2bf58acaa37fa1d 100644 (file)
@@ -284,7 +284,19 @@ int ocfs2_set_acl(handle_t *handle,
 
 int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
-       return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+       struct buffer_head *bh = NULL;
+       int status = 0;
+
+       status = ocfs2_inode_lock(inode, &bh, 1);
+       if (status < 0) {
+               if (status != -ENOENT)
+                       mlog_errno(status);
+               return status;
+       }
+       status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
+       ocfs2_inode_unlock(inode, 1);
+       brelse(bh);
+       return status;
 }
 
 struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
@@ -292,19 +304,21 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
        struct ocfs2_super *osb;
        struct buffer_head *di_bh = NULL;
        struct posix_acl *acl;
-       int ret = -EAGAIN;
+       int ret;
 
        osb = OCFS2_SB(inode->i_sb);
        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
                return NULL;
-
-       ret = ocfs2_read_inode_block(inode, &di_bh);
-       if (ret < 0)
+       ret = ocfs2_inode_lock(inode, &di_bh, 0);
+       if (ret < 0) {
+               if (ret != -ENOENT)
+                       mlog_errno(ret);
                return ERR_PTR(ret);
+       }
 
        acl = ocfs2_get_acl_nolock(inode, type, di_bh);
 
+       ocfs2_inode_unlock(inode, 0);
        brelse(di_bh);
-
        return acl;
 }
index 5997c00a1515a6f7ec4d33a96f8eed549ea57bc2..86181d6526dc55de22b8e118660fba86546ae340 100644 (file)
@@ -908,32 +908,30 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
         */
 
        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-               ocfs2_error(sb,
-                           "Extent block #%llu has bad signature %.*s",
-                           (unsigned long long)bh->b_blocknr, 7,
-                           eb->h_signature);
-               return -EINVAL;
+               rc = ocfs2_error(sb,
+                                "Extent block #%llu has bad signature %.*s\n",
+                                (unsigned long long)bh->b_blocknr, 7,
+                                eb->h_signature);
+               goto bail;
        }
 
        if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
-               ocfs2_error(sb,
-                           "Extent block #%llu has an invalid h_blkno "
-                           "of %llu",
-                           (unsigned long long)bh->b_blocknr,
-                           (unsigned long long)le64_to_cpu(eb->h_blkno));
-               return -EINVAL;
+               rc = ocfs2_error(sb,
+                                "Extent block #%llu has an invalid h_blkno of %llu\n",
+                                (unsigned long long)bh->b_blocknr,
+                                (unsigned long long)le64_to_cpu(eb->h_blkno));
+               goto bail;
        }
 
        if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
-               ocfs2_error(sb,
-                           "Extent block #%llu has an invalid "
-                           "h_fs_generation of #%u",
-                           (unsigned long long)bh->b_blocknr,
-                           le32_to_cpu(eb->h_fs_generation));
-               return -EINVAL;
+               rc = ocfs2_error(sb,
+                                "Extent block #%llu has an invalid h_fs_generation of #%u\n",
+                                (unsigned long long)bh->b_blocknr,
+                                le32_to_cpu(eb->h_fs_generation));
+               goto bail;
        }
-
-       return 0;
+bail:
+       return rc;
 }
 
 int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
@@ -1446,8 +1444,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
        while(le16_to_cpu(el->l_tree_depth) > 1) {
                if (le16_to_cpu(el->l_next_free_rec) == 0) {
                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
-                                   "Owner %llu has empty "
-                                   "extent list (next_free_rec == 0)",
+                                   "Owner %llu has empty extent list (next_free_rec == 0)\n",
                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
                        status = -EIO;
                        goto bail;
@@ -1456,9 +1453,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
                blkno = le64_to_cpu(el->l_recs[i].e_blkno);
                if (!blkno) {
                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
-                                   "Owner %llu has extent "
-                                   "list where extent # %d has no physical "
-                                   "block start",
+                                   "Owner %llu has extent list where extent # %d has no physical block start\n",
                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
                        status = -EIO;
                        goto bail;
@@ -1788,8 +1783,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
        while (el->l_tree_depth) {
                if (le16_to_cpu(el->l_next_free_rec) == 0) {
                        ocfs2_error(ocfs2_metadata_cache_get_super(ci),
-                                   "Owner %llu has empty extent list at "
-                                   "depth %u\n",
+                                   "Owner %llu has empty extent list at depth %u\n",
                                    (unsigned long long)ocfs2_metadata_cache_owner(ci),
                                    le16_to_cpu(el->l_tree_depth));
                        ret = -EROFS;
@@ -1814,8 +1808,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
                blkno = le64_to_cpu(el->l_recs[i].e_blkno);
                if (blkno == 0) {
                        ocfs2_error(ocfs2_metadata_cache_get_super(ci),
-                                   "Owner %llu has bad blkno in extent list "
-                                   "at depth %u (index %d)\n",
+                                   "Owner %llu has bad blkno in extent list at depth %u (index %d)\n",
                                    (unsigned long long)ocfs2_metadata_cache_owner(ci),
                                    le16_to_cpu(el->l_tree_depth), i);
                        ret = -EROFS;
@@ -1836,8 +1829,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
                if (le16_to_cpu(el->l_next_free_rec) >
                    le16_to_cpu(el->l_count)) {
                        ocfs2_error(ocfs2_metadata_cache_get_super(ci),
-                                   "Owner %llu has bad count in extent list "
-                                   "at block %llu (next free=%u, count=%u)\n",
+                                   "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n",
                                    (unsigned long long)ocfs2_metadata_cache_owner(ci),
                                    (unsigned long long)bh->b_blocknr,
                                    le16_to_cpu(el->l_next_free_rec),
@@ -2116,8 +2108,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
 
        if (left_el->l_next_free_rec != left_el->l_count) {
                ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
-                           "Inode %llu has non-full interior leaf node %llu"
-                           "(next free = %u)",
+                           "Inode %llu has non-full interior leaf node %llu (next free = %u)\n",
                            (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
                            (unsigned long long)left_leaf_bh->b_blocknr,
                            le16_to_cpu(left_el->l_next_free_rec));
@@ -2256,8 +2247,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
                 * If we got here, we never found a valid node where
                 * the tree indicated one should be.
                 */
-               ocfs2_error(sb,
-                           "Invalid extent tree at extent block %llu\n",
+               ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
                            (unsigned long long)blkno);
                ret = -EROFS;
                goto out;
@@ -2872,8 +2862,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
                 * If we got here, we never found a valid node where
                 * the tree indicated one should be.
                 */
-               ocfs2_error(sb,
-                           "Invalid extent tree at extent block %llu\n",
+               ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
                            (unsigned long long)blkno);
                ret = -EROFS;
                goto out;
@@ -3131,6 +3120,30 @@ out:
        return ret;
 }
 
+static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb,
+                               struct ocfs2_extent_tree *et,
+                               struct ocfs2_path *path,
+                               struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+       handle_t *handle;
+       int ret;
+       int credits = path->p_tree_depth * 2 + 1;
+
+       handle = ocfs2_start_trans(osb, credits);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               mlog_errno(ret);
+               return ret;
+       }
+
+       ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc);
+       if (ret)
+               mlog_errno(ret);
+
+       ocfs2_commit_trans(osb, handle);
+       return ret;
+}
+
 /*
  * Left rotation of btree records.
  *
@@ -3200,7 +3213,7 @@ rightmost_no_delete:
                if (le16_to_cpu(el->l_next_free_rec) == 0) {
                        ret = -EIO;
                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
-                                   "Owner %llu has empty extent block at %llu",
+                                   "Owner %llu has empty extent block at %llu\n",
                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
                                    (unsigned long long)le64_to_cpu(eb->h_blkno));
                        goto out;
@@ -3930,7 +3943,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
                next_free = le16_to_cpu(el->l_next_free_rec);
                if (next_free == 0) {
                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
-                                   "Owner %llu has a bad extent list",
+                                   "Owner %llu has a bad extent list\n",
                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
                        ret = -EIO;
                        return;
@@ -4355,10 +4368,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
                                bh = path_leaf_bh(left_path);
                                eb = (struct ocfs2_extent_block *)bh->b_data;
                                ocfs2_error(sb,
-                                           "Extent block #%llu has an "
-                                           "invalid l_next_free_rec of "
-                                           "%d.  It should have "
-                                           "matched the l_count of %d",
+                                           "Extent block #%llu has an invalid l_next_free_rec of %d.  It should have matched the l_count of %d\n",
                                            (unsigned long long)le64_to_cpu(eb->h_blkno),
                                            le16_to_cpu(new_el->l_next_free_rec),
                                            le16_to_cpu(new_el->l_count));
@@ -4413,8 +4423,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
                                bh = path_leaf_bh(right_path);
                                eb = (struct ocfs2_extent_block *)bh->b_data;
                                ocfs2_error(sb,
-                                           "Extent block #%llu has an "
-                                           "invalid l_next_free_rec of %d",
+                                           "Extent block #%llu has an invalid l_next_free_rec of %d\n",
                                            (unsigned long long)le64_to_cpu(eb->h_blkno),
                                            le16_to_cpu(new_el->l_next_free_rec));
                                status = -EINVAL;
@@ -4970,10 +4979,9 @@ leftright:
                split_index = ocfs2_search_extent_list(el, cpos);
                if (split_index == -1) {
                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
-                                       "Owner %llu has an extent at cpos %u "
-                                       "which can no longer be found.\n",
-                                       (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
-                                       cpos);
+                                   "Owner %llu has an extent at cpos %u which can no longer be found\n",
+                                   (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+                                   cpos);
                        ret = -EROFS;
                        goto out;
                }
@@ -5158,10 +5166,9 @@ int ocfs2_change_extent_flag(handle_t *handle,
        index = ocfs2_search_extent_list(el, cpos);
        if (index == -1) {
                ocfs2_error(sb,
-                           "Owner %llu has an extent at cpos %u which can no "
-                           "longer be found.\n",
-                            (unsigned long long)
-                            ocfs2_metadata_cache_owner(et->et_ci), cpos);
+                           "Owner %llu has an extent at cpos %u which can no longer be found\n",
+                           (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+                           cpos);
                ret = -EROFS;
                goto out;
        }
@@ -5228,9 +5235,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
                cpos, len, phys);
 
        if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
-               ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
-                           "that are being written to, but the feature bit "
-                           "is not set in the super block.",
+               ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n",
                            (unsigned long long)OCFS2_I(inode)->ip_blkno);
                ret = -EROFS;
                goto out;
@@ -5514,8 +5519,7 @@ int ocfs2_remove_extent(handle_t *handle,
        index = ocfs2_search_extent_list(el, cpos);
        if (index == -1) {
                ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
-                           "Owner %llu has an extent at cpos %u which can no "
-                           "longer be found.\n",
+                           "Owner %llu has an extent at cpos %u which can no longer be found\n",
                            (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
                            cpos);
                ret = -EROFS;
@@ -5580,7 +5584,7 @@ int ocfs2_remove_extent(handle_t *handle,
                index = ocfs2_search_extent_list(el, cpos);
                if (index == -1) {
                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
-                                   "Owner %llu: split at cpos %u lost record.",
+                                   "Owner %llu: split at cpos %u lost record\n",
                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
                                    cpos);
                        ret = -EROFS;
@@ -5596,8 +5600,7 @@ int ocfs2_remove_extent(handle_t *handle,
                        ocfs2_rec_clusters(el, rec);
                if (rec_range != trunc_range) {
                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
-                                   "Owner %llu: error after split at cpos %u"
-                                   "trunc len %u, existing record is (%u,%u)",
+                                   "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n",
                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
                                    cpos, len, le32_to_cpu(rec->e_cpos),
                                    ocfs2_rec_clusters(el, rec));
@@ -6175,7 +6178,7 @@ bail:
                iput(tl_inode);
        brelse(tl_bh);
 
-       if (status < 0 && (*tl_copy)) {
+       if (status < 0) {
                kfree(*tl_copy);
                *tl_copy = NULL;
                mlog_errno(status);
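The hunk above drops the *tl_copy guard: kfree(NULL) is defined to be a no-op, so freeing unconditionally on error is safe and the failure is always logged. A minimal userspace sketch of the same idiom, assuming nothing beyond the fact that free() gives the identical guarantee (names here are illustrative, not ocfs2 code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-in for the *tl_copy cleanup: freeing a maybe-NULL
 * pointer is safe, so no "!= NULL" guard is needed before free()/kfree(). */
static void cleanup_on_error(char **copy, int status)
{
	if (status < 0) {
		free(*copy);    /* free(NULL) is a no-op, just like kfree(NULL) */
		*copy = NULL;
		fprintf(stderr, "status %d\n", status);
	}
}

int main(void)
{
	char *copy = NULL;              /* the allocation may never have happened */
	cleanup_on_error(&copy, -5);    /* error before allocation: still safe    */

	copy = strdup("truncate log copy");
	cleanup_on_error(&copy, -5);    /* error after allocation: freed and NULLed */
	return 0;
}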
@@ -7108,15 +7111,23 @@ start:
                 * to check it up here before changing the tree.
                */
                if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
-                       ocfs2_error(inode->i_sb, "Inode %lu has an empty "
+                       mlog(ML_ERROR, "Inode %lu has an empty "
                                    "extent record, depth %u\n", inode->i_ino,
                                    le16_to_cpu(root_el->l_tree_depth));
-                       status = -EROFS;
-                       goto bail;
+                       status = ocfs2_remove_rightmost_empty_extent(osb,
+                                       &et, path, &dealloc);
+                       if (status) {
+                               mlog_errno(status);
+                               goto bail;
+                       }
+
+                       ocfs2_reinit_path(path, 1);
+                       goto start;
+               } else {
+                       trunc_cpos = le32_to_cpu(rec->e_cpos);
+                       trunc_len = 0;
+                       blkno = 0;
                }
-               trunc_cpos = le32_to_cpu(rec->e_cpos);
-               trunc_len = 0;
-               blkno = 0;
        } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
                /*
                 * Truncate entire record.
@@ -7204,8 +7215,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
            !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
            !ocfs2_supports_inline_data(osb)) {
                ocfs2_error(inode->i_sb,
-                           "Inline data flags for inode %llu don't agree! "
-                           "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
+                           "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
                            le16_to_cpu(di->i_dyn_features),
                            OCFS2_I(inode)->ip_dyn_features,
index 0f5fd9db8194ef5d135f1896f6e2645a5f059cd8..64b11d90eca688fbce4e81b0e503c0e9ffd44197 100644 (file)
@@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
        if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
-               ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
+               ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n",
                            (unsigned long long)OCFS2_I(inode)->ip_blkno);
                return -EROFS;
        }
@@ -237,7 +237,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
        if (size > PAGE_CACHE_SIZE ||
            size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
                ocfs2_error(inode->i_sb,
-                           "Inode %llu has with inline data has bad size: %Lu",
+                           "Inode %llu with inline data has bad size: %Lu\n",
                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
                            (unsigned long long)size);
                return -EROFS;
@@ -533,10 +533,14 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 
        inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
 
+       down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
        /* This figures out the size of the next contiguous block, and
         * our logical offset */
        ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
                                          &contig_blocks, &ext_flags);
+       up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
        if (ret) {
                mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
                     (unsigned long long)iblock);
@@ -557,6 +561,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 
                alloc_locked = 1;
 
+               down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
                /* fill hole, allocate blocks can't be larger than the size
                 * of the hole */
                clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
@@ -569,6 +575,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
                ret = ocfs2_extend_allocation(inode, cpos,
                                clusters_to_alloc, 0);
                if (ret < 0) {
+                       up_write(&OCFS2_I(inode)->ip_alloc_sem);
                        mlog_errno(ret);
                        goto bail;
                }
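The hunks in this function bracket the extent-map work with ip_alloc_sem: a read lock around the plain block lookup, and a write lock held across the hole-filling allocation and the re-lookup, dropped on every exit path. A rough userspace analogue with a pthread rwlock, offered only as a sketch of the locking shape (lookup_block, get_block and the block values are made up, not ocfs2 API):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t alloc_sem = PTHREAD_RWLOCK_INITIALIZER;
static int mapped_block = -1;          /* -1 plays the role of a hole */

static int lookup_block(int lblock) { (void)lblock; return mapped_block; }

static int get_block(int lblock, int fill_hole)
{
	int pblock;

	pthread_rwlock_rdlock(&alloc_sem);     /* read lock for the lookup   */
	pblock = lookup_block(lblock);
	pthread_rwlock_unlock(&alloc_sem);

	if (pblock >= 0 || !fill_hole)
		return pblock;

	pthread_rwlock_wrlock(&alloc_sem);     /* write lock to allocate...  */
	mapped_block = 42;                     /* ...fill the hole...        */
	pblock = lookup_block(lblock);         /* ...and look it up again    */
	pthread_rwlock_unlock(&alloc_sem);     /* dropped on every exit path */

	return pblock;
}

int main(void)
{
	printf("block: %d\n", get_block(0, 1));
	return 0;
}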
@@ -576,11 +583,13 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
                ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
                                &contig_blocks, &ext_flags);
                if (ret < 0) {
+                       up_write(&OCFS2_I(inode)->ip_alloc_sem);
                        mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
                                        (unsigned long long)iblock);
                        ret = -EIO;
                        goto bail;
                }
+               up_write(&OCFS2_I(inode)->ip_alloc_sem);
        }
 
        /*
@@ -627,10 +636,13 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
                mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
        }
 
-       ocfs2_iocb_clear_rw_locked(iocb);
+       /* Let the rw unlock be done later to protect append direct io writes */

+       if (offset + bytes <= i_size_read(inode)) {
+               ocfs2_iocb_clear_rw_locked(iocb);
 
-       level = ocfs2_iocb_rw_locked_level(iocb);
-       ocfs2_rw_unlock(inode, level);
+               level = ocfs2_iocb_rw_locked_level(iocb);
+               ocfs2_rw_unlock(inode, level);
+       }
 }
 
 static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -832,12 +844,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
 
                /* zeroing out the previously allocated cluster tail
                 * that but not zeroed */
-               if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+               if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+                       down_read(&OCFS2_I(inode)->ip_alloc_sem);
                        ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
                                        zero_len_tail, cluster_align_tail);
-               else
+                       up_read(&OCFS2_I(inode)->ip_alloc_sem);
+               } else {
+                       down_write(&OCFS2_I(inode)->ip_alloc_sem);
                        ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
                                        offset);
+                       up_write(&OCFS2_I(inode)->ip_alloc_sem);
+               }
                if (ret < 0) {
                        mlog_errno(ret);
                        ocfs2_inode_unlock(inode, 1);
@@ -857,7 +874,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
        written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
                                       offset, ocfs2_direct_IO_get_blocks,
                                       ocfs2_dio_end_io, NULL, 0);
-       if (unlikely(written < 0)) {
+       /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
+       if ((written < 0) && (written != -EIOCBQUEUED)) {
                loff_t i_size = i_size_read(inode);
 
                if (offset + count > i_size) {
@@ -876,12 +894,14 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
 
                                        ocfs2_inode_unlock(inode, 1);
                                        brelse(di_bh);
+                                       di_bh = NULL;
                                        goto clean_orphan;
                                }
                        }
 
                        ocfs2_inode_unlock(inode, 1);
                        brelse(di_bh);
+                       di_bh = NULL;
 
                        ret = jbd2_journal_force_commit(journal);
                        if (ret < 0)
@@ -936,10 +956,12 @@ clean_orphan:
                if (tmp_ret < 0) {
                        ret = tmp_ret;
                        mlog_errno(ret);
+                       brelse(di_bh);
                        goto out;
                }
 
                ocfs2_inode_unlock(inode, 1);
+               brelse(di_bh);
 
                tmp_ret = jbd2_journal_force_commit(journal);
                if (tmp_ret < 0) {
@@ -2185,10 +2207,7 @@ try_again:
                if (ret)
                        goto out_commit;
        }
-       /*
-        * We don't want this to fail in ocfs2_write_end(), so do it
-        * here.
-        */
+
        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
@@ -2345,7 +2364,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
                           loff_t pos, unsigned len, unsigned copied,
                           struct page *page, void *fsdata)
 {
-       int i;
+       int i, ret;
        unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
        struct inode *inode = mapping->host;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2354,6 +2373,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
        handle_t *handle = wc->w_handle;
        struct page *tmppage;
 
+       ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
+                       OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               copied = ret;
+               mlog_errno(ret);
+               goto out;
+       }
+
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
                ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
                goto out_write_size;
@@ -2409,6 +2436,7 @@ out_write_size:
        ocfs2_update_inode_fsync_trans(handle, inode, 1);
        ocfs2_journal_dirty(handle, wc->w_di_bh);
 
+out:
        /* unlock pages before dealloc since it needs acquiring j_trans_barrier
         * lock, or it will cause a deadlock since journal commit threads holds
         * this lock and will ask for the page lock when flushing the data.
index 1edcb141f63930919a62d4a4e827c48954113cf1..fe50ded1b4ce763e33579ab54a982e64ed06003e 100644 (file)
@@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
                bh = bhs[i];
 
                if (!(flags & OCFS2_BH_READAHEAD)) {
+                       if (status) {
+                               /* Clear the rest of the buffers on error */
+                               put_bh(bh);
+                               bhs[i] = NULL;
+                               continue;
+                       }
                        /* We know this can't have changed as we hold the
                         * owner sem. Avoid doing any work on the bh if the
                         * journal has it. */
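The branch added above releases and NULLs the remaining buffer heads once a read error has been seen, instead of handing half-initialised buffers back to the caller. A small userspace sketch of the same cleanup pattern, with plain malloc'd buffers standing in for buffer heads and put_bh():

#include <stdio.h>
#include <stdlib.h>

#define NR 4

int main(void)
{
	void *bufs[NR];
	int i, status = 0;

	for (i = 0; i < NR; i++)
		bufs[i] = malloc(16);

	for (i = 0; i < NR; i++) {
		if (i == 2)
			status = -5;            /* pretend an I/O error (-EIO) */
		if (status) {
			free(bufs[i]);          /* drop every later entry...   */
			bufs[i] = NULL;         /* ...and NULL it for the caller */
			continue;
		}
		/* normal per-buffer processing would go here */
	}

	for (i = 0; i < NR; i++)
		printf("bufs[%d] %s\n", i, bufs[i] ? "kept" : "cleared");
	for (i = 0; i < NR; i++)
		free(bufs[i]);
	return 0;
}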
index 140de3c93d2e31a0e468b8dd670cb5ffc5fcc686..fa15debcc02be1c8fe1d3419d10dd7c5ab0ff968 100644 (file)
@@ -36,7 +36,7 @@
 #include <linux/debugfs.h>
 #include <linux/slab.h>
 #include <linux/bitmap.h>
-
+#include <linux/ktime.h>
 #include "heartbeat.h"
 #include "tcp.h"
 #include "nodemanager.h"
@@ -1060,37 +1060,6 @@ bail:
        return ret;
 }
 
-/* Subtract b from a, storing the result in a. a *must* have a larger
- * value than b. */
-static void o2hb_tv_subtract(struct timeval *a,
-                            struct timeval *b)
-{
-       /* just return 0 when a is after b */
-       if (a->tv_sec < b->tv_sec ||
-           (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
-               a->tv_sec = 0;
-               a->tv_usec = 0;
-               return;
-       }
-
-       a->tv_sec -= b->tv_sec;
-       a->tv_usec -= b->tv_usec;
-       while ( a->tv_usec < 0 ) {
-               a->tv_sec--;
-               a->tv_usec += 1000000;
-       }
-}
-
-static unsigned int o2hb_elapsed_msecs(struct timeval *start,
-                                      struct timeval *end)
-{
-       struct timeval res = *end;
-
-       o2hb_tv_subtract(&res, start);
-
-       return res.tv_sec * 1000 + res.tv_usec / 1000;
-}
-
 /*
  * we ride the region ref that the region dir holds.  before the region
  * dir is removed and drops it ref it will wait to tear down this
@@ -1101,7 +1070,7 @@ static int o2hb_thread(void *data)
        int i, ret;
        struct o2hb_region *reg = data;
        struct o2hb_bio_wait_ctxt write_wc;
-       struct timeval before_hb, after_hb;
+       ktime_t before_hb, after_hb;
        unsigned int elapsed_msec;
 
        mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
@@ -1118,18 +1087,18 @@ static int o2hb_thread(void *data)
                 * hr_timeout_ms between disk writes. On busy systems
                 * this should result in a heartbeat which is less
                 * likely to time itself out. */
-               do_gettimeofday(&before_hb);
+               before_hb = ktime_get_real();
 
                ret = o2hb_do_disk_heartbeat(reg);
 
-               do_gettimeofday(&after_hb);
-               elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
+               after_hb = ktime_get_real();
+
+               elapsed_msec = (unsigned int)
+                               ktime_ms_delta(after_hb, before_hb);
 
                mlog(ML_HEARTBEAT,
-                    "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n",
-                    before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
-                    after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
-                    elapsed_msec, ret);
+                    "start = %lld, end = %lld, msec = %u, ret = %d\n",
+                    before_hb.tv64, after_hb.tv64, elapsed_msec, ret);
 
                if (!kthread_should_stop() &&
                    elapsed_msec < reg->hr_timeout_ms) {
@@ -1619,17 +1588,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
        struct o2hb_disk_slot *slot;
 
        reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
-       if (reg->hr_tmp_block == NULL) {
-               mlog_errno(-ENOMEM);
+       if (reg->hr_tmp_block == NULL)
                return -ENOMEM;
-       }
 
        reg->hr_slots = kcalloc(reg->hr_blocks,
                                sizeof(struct o2hb_disk_slot), GFP_KERNEL);
-       if (reg->hr_slots == NULL) {
-               mlog_errno(-ENOMEM);
+       if (reg->hr_slots == NULL)
                return -ENOMEM;
-       }
 
        for(i = 0; i < reg->hr_blocks; i++) {
                slot = &reg->hr_slots[i];
@@ -1645,17 +1610,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
 
        reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
                                    GFP_KERNEL);
-       if (!reg->hr_slot_data) {
-               mlog_errno(-ENOMEM);
+       if (!reg->hr_slot_data)
                return -ENOMEM;
-       }
 
        for(i = 0; i < reg->hr_num_pages; i++) {
                page = alloc_page(GFP_KERNEL);
-               if (!page) {
-                       mlog_errno(-ENOMEM);
+               if (!page)
                        return -ENOMEM;
-               }
 
                reg->hr_slot_data[i] = page;
 
@@ -1687,10 +1648,8 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
        struct o2hb_disk_heartbeat_block *hb_block;
 
        ret = o2hb_read_slots(reg, reg->hr_blocks);
-       if (ret) {
-               mlog_errno(ret);
+       if (ret)
                goto out;
-       }
 
        /* We only want to get an idea of the values initially in each
         * slot, so we do no verification - o2hb_check_slot will
index 02878a83f0b4e88655114ff78b9883e0494432a6..ffecf89c8c1cd23d532a9155660a6609e9d38551 100644 (file)
@@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
 
        trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
        if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
-               rc = -EINVAL;
-               ocfs2_error(dir->i_sb,
-                           "Invalid dirblock #%llu: "
-                           "signature = %.*s\n",
-                           (unsigned long long)bh->b_blocknr, 7,
-                           trailer->db_signature);
+               rc = ocfs2_error(dir->i_sb,
+                                "Invalid dirblock #%llu: signature = %.*s\n",
+                                (unsigned long long)bh->b_blocknr, 7,
+                                trailer->db_signature);
                goto out;
        }
        if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
-               rc = -EINVAL;
-               ocfs2_error(dir->i_sb,
-                           "Directory block #%llu has an invalid "
-                           "db_blkno of %llu",
-                           (unsigned long long)bh->b_blocknr,
-                           (unsigned long long)le64_to_cpu(trailer->db_blkno));
+               rc = ocfs2_error(dir->i_sb,
+                                "Directory block #%llu has an invalid db_blkno of %llu\n",
+                                (unsigned long long)bh->b_blocknr,
+                                (unsigned long long)le64_to_cpu(trailer->db_blkno));
                goto out;
        }
        if (le64_to_cpu(trailer->db_parent_dinode) !=
            OCFS2_I(dir)->ip_blkno) {
-               rc = -EINVAL;
-               ocfs2_error(dir->i_sb,
-                           "Directory block #%llu on dinode "
-                           "#%llu has an invalid parent_dinode "
-                           "of %llu",
-                           (unsigned long long)bh->b_blocknr,
-                           (unsigned long long)OCFS2_I(dir)->ip_blkno,
-                           (unsigned long long)le64_to_cpu(trailer->db_blkno));
+               rc = ocfs2_error(dir->i_sb,
+                                "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n",
+                                (unsigned long long)bh->b_blocknr,
+                                (unsigned long long)OCFS2_I(dir)->ip_blkno,
+                                (unsigned long long)le64_to_cpu(trailer->db_blkno));
                goto out;
        }
 out:
@@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb,
        }
 
        if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
-               ocfs2_error(sb,
-                           "Dir Index Root # %llu has bad signature %.*s",
-                           (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
-                           7, dx_root->dr_signature);
-               return -EINVAL;
+               ret = ocfs2_error(sb,
+                                 "Dir Index Root # %llu has bad signature %.*s\n",
+                                 (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
+                                 7, dx_root->dr_signature);
        }
 
-       return 0;
+       return ret;
 }
 
 static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
@@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb,
        }
 
        if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
-               ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
-                           7, dx_leaf->dl_signature);
-               return -EROFS;
+               ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n",
+                                 7, dx_leaf->dl_signature);
        }
 
-       return 0;
+       return ret;
 }
 
 static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
@@ -812,11 +803,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
                el = &eb->h_list;
 
                if (el->l_tree_depth) {
-                       ocfs2_error(inode->i_sb,
-                                   "Inode %lu has non zero tree depth in "
-                                   "btree tree block %llu\n", inode->i_ino,
-                                   (unsigned long long)eb_bh->b_blocknr);
-                       ret = -EROFS;
+                       ret = ocfs2_error(inode->i_sb,
+                                         "Inode %lu has non zero tree depth in btree tree block %llu\n",
+                                         inode->i_ino,
+                                         (unsigned long long)eb_bh->b_blocknr);
                        goto out;
                }
        }
@@ -832,11 +822,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
        }
 
        if (!found) {
-               ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
-                           "record (%u, %u, 0) in btree", inode->i_ino,
-                           le32_to_cpu(rec->e_cpos),
-                           ocfs2_rec_clusters(el, rec));
-               ret = -EROFS;
+               ret = ocfs2_error(inode->i_sb,
+                                 "Inode %lu has bad extent record (%u, %u, 0) in btree\n",
+                                 inode->i_ino,
+                                 le32_to_cpu(rec->e_cpos),
+                                 ocfs2_rec_clusters(el, rec));
                goto out;
        }
 
index 7df88a6dd6260ce3741aa27aa28facfa73c1cba7..6918f30d02cd7631634804389ce0f3deade524e2 100644 (file)
@@ -1465,39 +1465,46 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
        if (status == -ENOPROTOOPT) {
                status = 0;
                *response = JOIN_OK_NO_MAP;
-       } else if (packet.code == JOIN_DISALLOW ||
-                  packet.code == JOIN_OK_NO_MAP) {
-               *response = packet.code;
-       } else if (packet.code == JOIN_PROTOCOL_MISMATCH) {
-               mlog(ML_NOTICE,
-                    "This node requested DLM locking protocol %u.%u and "
-                    "filesystem locking protocol %u.%u.  At least one of "
-                    "the protocol versions on node %d is not compatible, "
-                    "disconnecting\n",
-                    dlm->dlm_locking_proto.pv_major,
-                    dlm->dlm_locking_proto.pv_minor,
-                    dlm->fs_locking_proto.pv_major,
-                    dlm->fs_locking_proto.pv_minor,
-                    node);
-               status = -EPROTO;
-               *response = packet.code;
-       } else if (packet.code == JOIN_OK) {
-               *response = packet.code;
-               /* Use the same locking protocol as the remote node */
-               dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
-               dlm->fs_locking_proto.pv_minor = packet.fs_minor;
-               mlog(0,
-                    "Node %d responds JOIN_OK with DLM locking protocol "
-                    "%u.%u and fs locking protocol %u.%u\n",
-                    node,
-                    dlm->dlm_locking_proto.pv_major,
-                    dlm->dlm_locking_proto.pv_minor,
-                    dlm->fs_locking_proto.pv_major,
-                    dlm->fs_locking_proto.pv_minor);
        } else {
-               status = -EINVAL;
-               mlog(ML_ERROR, "invalid response %d from node %u\n",
-                    packet.code, node);
+               *response = packet.code;
+               switch (packet.code) {
+               case JOIN_DISALLOW:
+               case JOIN_OK_NO_MAP:
+                       break;
+               case JOIN_PROTOCOL_MISMATCH:
+                       mlog(ML_NOTICE,
+                            "This node requested DLM locking protocol %u.%u and "
+                            "filesystem locking protocol %u.%u.  At least one of "
+                            "the protocol versions on node %d is not compatible, "
+                            "disconnecting\n",
+                            dlm->dlm_locking_proto.pv_major,
+                            dlm->dlm_locking_proto.pv_minor,
+                            dlm->fs_locking_proto.pv_major,
+                            dlm->fs_locking_proto.pv_minor,
+                            node);
+                       status = -EPROTO;
+                       break;
+               case JOIN_OK:
+                       /* Use the same locking protocol as the remote node */
+                       dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
+                       dlm->fs_locking_proto.pv_minor = packet.fs_minor;
+                       mlog(0,
+                            "Node %d responds JOIN_OK with DLM locking protocol "
+                            "%u.%u and fs locking protocol %u.%u\n",
+                            node,
+                            dlm->dlm_locking_proto.pv_major,
+                            dlm->dlm_locking_proto.pv_minor,
+                            dlm->fs_locking_proto.pv_major,
+                            dlm->fs_locking_proto.pv_minor);
+                       break;
+               default:
+                       status = -EINVAL;
+                       mlog(ML_ERROR, "invalid response %d from node %u\n",
+                            packet.code, node);
+                       /* Reset response to JOIN_DISALLOW */
+                       *response = JOIN_DISALLOW;
+                       break;
+               }
        }
 
        mlog(0, "status %d, node %d response is %d\n", status, node,
@@ -1725,12 +1732,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
 
        o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
                            dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
+       o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
+                           dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
+
        status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
        if (status)
                goto bail;
 
-       o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
-                           dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
        status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
        if (status)
                goto bail;
@@ -1845,8 +1853,6 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
                                        sizeof(struct dlm_exit_domain),
                                        dlm_begin_exit_domain_handler,
                                        dlm, NULL, &dlm->dlm_domain_handlers);
-       if (status)
-               goto bail;
 
 bail:
        if (status)
index fdf4b41d0609a00e591afeed69ea7cc2ab9d5254..46b8b2bbc95ae7c1ddd776d093215f3557f76cc7 100644 (file)
@@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref)
        mlog(0, "destroying lockres %.*s\n", res->lockname.len,
             res->lockname.name);
 
-       spin_lock(&dlm->track_lock);
-       if (!list_empty(&res->tracking))
-               list_del_init(&res->tracking);
-       else {
-               mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
-                    res->lockname.len, res->lockname.name);
-               dlm_print_one_lock_resource(res);
-       }
-       spin_unlock(&dlm->track_lock);
-
        atomic_dec(&dlm->res_cur_count);
 
        if (!hlist_unhashed(&res->hash_node) ||
@@ -795,8 +785,18 @@ lookup:
                dlm_lockres_grab_inflight_ref(dlm, tmpres);
 
                spin_unlock(&tmpres->spinlock);
-               if (res)
+               if (res) {
+                       spin_lock(&dlm->track_lock);
+                       if (!list_empty(&res->tracking))
+                               list_del_init(&res->tracking);
+                       else
+                               mlog(ML_ERROR, "Resource %.*s not "
+                                               "on the Tracking list\n",
+                                               res->lockname.len,
+                                               res->lockname.name);
+                       spin_unlock(&dlm->track_lock);
                        dlm_lockres_put(res);
+               }
                res = tmpres;
                goto leave;
        }
index ce12e0b1a31f180371e6ac010cac3e37fdfbad09..d0e436dc64371713af953cb475ad530e3cd69557 100644 (file)
@@ -1776,7 +1776,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
                                     struct dlm_migratable_lockres *mres)
 {
        struct dlm_migratable_lock *ml;
-       struct list_head *queue, *iter;
+       struct list_head *queue;
        struct list_head *tmpq = NULL;
        struct dlm_lock *newlock = NULL;
        struct dlm_lockstatus *lksb = NULL;
@@ -1821,9 +1821,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
                        spin_lock(&res->spinlock);
                        for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
                                tmpq = dlm_list_idx_to_ptr(res, j);
-                               list_for_each(iter, tmpq) {
-                                       lock = list_entry(iter,
-                                                 struct dlm_lock, list);
+                               list_for_each_entry(lock, tmpq, list) {
                                        if (lock->ml.cookie == ml->cookie)
                                                break;
                                        lock = NULL;
index 69aac6f088ada71b3dde009f21f2165a7e5b72f5..2e5e6d5fffe8d8b0458498fcf89c5ee845e20848 100644 (file)
@@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
 
        __dlm_unhash_lockres(dlm, res);
 
+       spin_lock(&dlm->track_lock);
+       if (!list_empty(&res->tracking))
+               list_del_init(&res->tracking);
+       else {
+               mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
+                               res->lockname.len, res->lockname.name);
+               __dlm_print_one_lock_resource(res);
+       }
+       spin_unlock(&dlm->track_lock);
+
        /* lockres is not in the hash now.  drop the flag and wake up
         * any processes waiting in dlm_get_lock_resource. */
        if (!master) {
index 23157e40dd740204bc10f9eaeb55ec08f2f0dfb4..1c91103c13339aca31b51f7b45de4a8beb05e3bb 100644 (file)
@@ -3035,8 +3035,6 @@ local:
        ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
 
        osb->cconn = conn;
-
-       status = 0;
 bail:
        if (status < 0) {
                ocfs2_dlm_shutdown_debug(osb);
index 767370b656ca67af7ba8d2ed81783752c83742d2..e4719e0a3f9993b46a0158c78e38c08f3f1158fd 100644 (file)
@@ -305,8 +305,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
 
        if (el->l_tree_depth) {
                ocfs2_error(inode->i_sb,
-                           "Inode %lu has non zero tree depth in "
-                           "leaf block %llu\n", inode->i_ino,
+                           "Inode %lu has non zero tree depth in leaf block %llu\n",
+                           inode->i_ino,
                            (unsigned long long)eb_bh->b_blocknr);
                ret = -EROFS;
                goto out;
@@ -441,8 +441,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
 
                if (el->l_tree_depth) {
                        ocfs2_error(inode->i_sb,
-                                   "Inode %lu has non zero tree depth in "
-                                   "leaf block %llu\n", inode->i_ino,
+                                   "Inode %lu has non zero tree depth in leaf block %llu\n",
+                                   inode->i_ino,
                                    (unsigned long long)eb_bh->b_blocknr);
                        ret = -EROFS;
                        goto out;
@@ -475,8 +475,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
        BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
 
        if (!rec->e_blkno) {
-               ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
-                           "record (%u, %u, 0)", inode->i_ino,
+               ocfs2_error(inode->i_sb,
+                           "Inode %lu has bad extent record (%u, %u, 0)\n",
+                           inode->i_ino,
                            le32_to_cpu(rec->e_cpos),
                            ocfs2_rec_clusters(el, rec));
                ret = -EROFS;
@@ -564,8 +565,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
 
                if (el->l_tree_depth) {
                        ocfs2_error(inode->i_sb,
-                                   "Inode %lu has non zero tree depth in "
-                                   "xattr leaf block %llu\n", inode->i_ino,
+                                   "Inode %lu has non zero tree depth in xattr leaf block %llu\n",
+                                   inode->i_ino,
                                    (unsigned long long)eb_bh->b_blocknr);
                        ret = -EROFS;
                        goto out;
@@ -582,8 +583,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
                BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
 
                if (!rec->e_blkno) {
-                       ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
-                                   "record (%u, %u, 0) in xattr", inode->i_ino,
+                       ocfs2_error(inode->i_sb,
+                                   "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
+                                   inode->i_ino,
                                    le32_to_cpu(rec->e_cpos),
                                    ocfs2_rec_clusters(el, rec));
                        ret = -EROFS;
index 7210583b472f52d054f89802a893dfd45ce203ac..0e5b4515f92e7a875a6f396db0d5648c9c157ad1 100644 (file)
@@ -1130,6 +1130,7 @@ out:
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 {
        int status = 0, size_change;
+       int inode_locked = 0;
        struct inode *inode = d_inode(dentry);
        struct super_block *sb = inode->i_sb;
        struct ocfs2_super *osb = OCFS2_SB(sb);
@@ -1178,6 +1179,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
                        mlog_errno(status);
                goto bail_unlock_rw;
        }
+       inode_locked = 1;
 
        if (size_change) {
                status = inode_newsize_ok(inode, attr->ia_size);
@@ -1258,7 +1260,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 bail_commit:
        ocfs2_commit_trans(osb, handle);
 bail_unlock:
-       ocfs2_inode_unlock(inode, 1);
+       if (status) {
+               ocfs2_inode_unlock(inode, 1);
+               inode_locked = 0;
+       }
 bail_unlock_rw:
        if (size_change)
                ocfs2_rw_unlock(inode, 1);
@@ -1274,6 +1279,8 @@ bail:
                if (status < 0)
                        mlog_errno(status);
        }
+       if (inode_locked)
+               ocfs2_inode_unlock(inode, 1);
 
        return status;
 }
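The setattr hunks above track whether the cluster inode lock is still held with an inode_locked flag, so every exit path drops it exactly once. A small sketch of that bookkeeping pattern with a pthread mutex, purely illustrative and not the ocfs2 locking API:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;

static int do_setattr(int fail_early, int fail_late)
{
	int status = 0, inode_locked = 0;

	pthread_mutex_lock(&inode_lock);
	inode_locked = 1;                       /* remember that we hold it */

	if (fail_early) {
		status = -13;                   /* -EACCES-style failure */
		pthread_mutex_unlock(&inode_lock);
		inode_locked = 0;               /* ...and that we dropped it */
		goto bail;
	}

	if (fail_late)
		status = -5;                    /* lock stays held; bail drops it */

bail:
	if (inode_locked)
		pthread_mutex_unlock(&inode_lock);
	return status;
}

int main(void)
{
	printf("%d %d %d\n", do_setattr(0, 0), do_setattr(1, 0), do_setattr(0, 1));
	return 0;
}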
@@ -2262,8 +2269,6 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
        ssize_t written = 0;
        ssize_t ret;
        size_t count = iov_iter_count(from), orig_count;
-       loff_t old_size;
-       u32 old_clusters;
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2271,6 +2276,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
                               OCFS2_MOUNT_COHERENCY_BUFFERED);
        int unaligned_dio = 0;
        int dropped_dio = 0;
+       int append_write = ((iocb->ki_pos + count) >=
+                       i_size_read(inode) ? 1 : 0);
 
        trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
                (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2290,8 +2297,9 @@ relock:
        /*
         * Concurrent O_DIRECT writes are allowed with
         * mount_option "coherency=buffered".
+        * For append write, we must take rw EX.
         */
-       rw_level = (!direct_io || full_coherency);
+       rw_level = (!direct_io || full_coherency || append_write);
 
        ret = ocfs2_rw_lock(inode, rw_level);
        if (ret < 0) {
@@ -2364,13 +2372,6 @@ relock:
                ocfs2_iocb_set_unaligned_aio(iocb);
        }
 
-       /*
-        * To later detect whether a journal commit for sync writes is
-        * necessary, we sample i_size, and cluster count here.
-        */
-       old_size = i_size_read(inode);
-       old_clusters = OCFS2_I(inode)->ip_clusters;
-
        /* communicate with ocfs2_dio_end_io */
        ocfs2_iocb_set_rw_locked(iocb, rw_level);
 
@@ -2378,6 +2379,20 @@ relock:
        /* buffered aio wouldn't have proper lock coverage today */
        BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
 
+       /*
+        * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
+        * function pointer which is called when o_direct io completes so that
+        * it can unlock our rw lock.
+        * Unfortunately there are error cases which call end_io and others
+        * that don't.  so we don't have to unlock the rw_lock if either an
+        * async dio is going to do it in the future or an end_io after an
+        * error has already done it.
+        */
+       if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
+               rw_level = -1;
+               unaligned_dio = 0;
+       }
+
        if (unlikely(written <= 0))
                goto no_sync;
 
@@ -2402,21 +2417,7 @@ relock:
        }
 
 no_sync:
-       /*
-        * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
-        * function pointer which is called when o_direct io completes so that
-        * it can unlock our rw lock.
-        * Unfortunately there are error cases which call end_io and others
-        * that don't.  so we don't have to unlock the rw_lock if either an
-        * async dio is going to do it in the future or an end_io after an
-        * error has already done it.
-        */
-       if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
-               rw_level = -1;
-               unaligned_dio = 0;
-       }
-
-       if (unaligned_dio) {
+       if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
                ocfs2_iocb_clear_unaligned_aio(iocb);
                mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
        }
index b254416dc8d92d0fc1c66c0b2787e313de7712b5..8f87e05ee25d3824524c7f6e040a5f43d87c723d 100644 (file)
@@ -971,6 +971,7 @@ static void ocfs2_delete_inode(struct inode *inode)
        int wipe, status;
        sigset_t oldset;
        struct buffer_head *di_bh = NULL;
+       struct ocfs2_dinode *di = NULL;
 
        trace_ocfs2_delete_inode(inode->i_ino,
                                 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1025,6 +1026,14 @@ static void ocfs2_delete_inode(struct inode *inode)
                goto bail_unlock_nfs_sync;
        }
 
+       di = (struct ocfs2_dinode *)di_bh->b_data;
+       /* Skip inode deletion and wait for the dio orphan entry to be
+        * recovered first */
+       if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
+               ocfs2_cleanup_delete_inode(inode, 0);
+               goto bail_unlock_inode;
+       }
+
        /* Query the cluster. This will be the final decision made
         * before we go ahead and wipe the inode. */
        status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
@@ -1191,17 +1200,19 @@ void ocfs2_evict_inode(struct inode *inode)
 int ocfs2_drop_inode(struct inode *inode)
 {
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-       int res;
 
        trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
                                inode->i_nlink, oi->ip_flags);
 
-       if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
-               res = 1;
-       else
-               res = generic_drop_inode(inode);
+       assert_spin_locked(&inode->i_lock);
+       inode->i_state |= I_WILL_FREE;
+       spin_unlock(&inode->i_lock);
+       write_inode_now(inode, 1);
+       spin_lock(&inode->i_lock);
+       WARN_ON(inode->i_state & I_NEW);
+       inode->i_state &= ~I_WILL_FREE;
 
-       return res;
+       return 1;
 }
 
 /*
@@ -1350,32 +1361,32 @@ int ocfs2_validate_inode_block(struct super_block *sb,
        rc = -EINVAL;
 
        if (!OCFS2_IS_VALID_DINODE(di)) {
-               ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
-                           (unsigned long long)bh->b_blocknr, 7,
-                           di->i_signature);
+               rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+                                (unsigned long long)bh->b_blocknr, 7,
+                                di->i_signature);
                goto bail;
        }
 
        if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
-               ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
-                           (unsigned long long)bh->b_blocknr,
-                           (unsigned long long)le64_to_cpu(di->i_blkno));
+               rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+                                (unsigned long long)bh->b_blocknr,
+                                (unsigned long long)le64_to_cpu(di->i_blkno));
                goto bail;
        }
 
        if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
-               ocfs2_error(sb,
-                           "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
-                           (unsigned long long)bh->b_blocknr);
+               rc = ocfs2_error(sb,
+                                "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+                                (unsigned long long)bh->b_blocknr);
                goto bail;
        }
 
        if (le32_to_cpu(di->i_fs_generation) !=
            OCFS2_SB(sb)->fs_generation) {
-               ocfs2_error(sb,
-                           "Invalid dinode #%llu: fs_generation is %u\n",
-                           (unsigned long long)bh->b_blocknr,
-                           le32_to_cpu(di->i_fs_generation));
+               rc = ocfs2_error(sb,
+                                "Invalid dinode #%llu: fs_generation is %u\n",
+                                (unsigned long long)bh->b_blocknr,
+                                le32_to_cpu(di->i_fs_generation));
                goto bail;
        }
 
index 5e86b247c821ce8434dfa5c9d5fe59798a8cde86..ca3431ee7f2493fb999cdb6ced2908bdb87fbe17 100644 (file)
@@ -81,8 +81,6 @@ struct ocfs2_inode_info
        tid_t i_sync_tid;
        tid_t i_datasync_tid;
 
-       wait_queue_head_t append_dio_wq;
-
        struct dquot *i_dquot[MAXQUOTAS];
 };
 
index 7c099f7032fdbcc6ce61a53eb7cad23c3a1bac29..ff82b28462a65c85cf6bff850856a77859ad987e 100644 (file)
@@ -374,7 +374,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
                mlog_errno(PTR_ERR(handle));
 
                if (is_journal_aborted(journal)) {
-                       ocfs2_abort(osb->sb, "Detected aborted journal");
+                       ocfs2_abort(osb->sb, "Detected aborted journal\n");
                        handle = ERR_PTR(-EROFS);
                }
        } else {
@@ -668,7 +668,23 @@ static int __ocfs2_journal_access(handle_t *handle,
                mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
                mlog(ML_ERROR, "b_blocknr=%llu\n",
                     (unsigned long long)bh->b_blocknr);
-               BUG();
+
+               lock_buffer(bh);
+               /*
+                * A previous attempt to write this buffer head failed.
+                * Nothing we can do but to retry the write and hope for
+                * the best.
+                */
+               if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) {
+                       clear_buffer_write_io_error(bh);
+                       set_buffer_uptodate(bh);
+               }
+
+               if (!buffer_uptodate(bh)) {
+                       unlock_buffer(bh);
+                       return -EIO;
+               }
+               unlock_buffer(bh);
        }
 
        /* Set the current transaction information on the ci so
@@ -2170,6 +2186,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                iter = oi->ip_next_orphan;
                oi->ip_next_orphan = NULL;
 
+               mutex_lock(&inode->i_mutex);
                ret = ocfs2_rw_lock(inode, 1);
                if (ret < 0) {
                        mlog_errno(ret);
@@ -2193,7 +2210,9 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                         * ocfs2_delete_inode. */
                        oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
                        spin_unlock(&oi->ip_lock);
-               } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
+               }
+
+               if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
                                (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
                        ret = ocfs2_truncate_file(inode, di_bh,
                                        i_size_read(inode));
@@ -2206,17 +2225,16 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                        ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
                        if (ret)
                                mlog_errno(ret);
-
-                       wake_up(&OCFS2_I(inode)->append_dio_wq);
                } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
 unlock_inode:
                ocfs2_inode_unlock(inode, 1);
+               brelse(di_bh);
+               di_bh = NULL;
 unlock_rw:
                ocfs2_rw_unlock(inode, 1);
 next:
+               mutex_unlock(&inode->i_mutex);
                iput(inode);
-               brelse(di_bh);
-               di_bh = NULL;
                inode = iter;
        }
 
index 857bbbcd39f3b6bbd7c8bc37ef463faa6c2cd77a..0a4457fb0711b7c9f65cc3da788d482342d0a69d 100644 (file)
@@ -665,8 +665,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 #ifdef CONFIG_OCFS2_DEBUG_FS
        if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
            ocfs2_local_alloc_count_bits(alloc)) {
-               ocfs2_error(osb->sb, "local alloc inode %llu says it has "
-                           "%u used bits, but a count shows %u",
+               ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
                            (unsigned long long)le64_to_cpu(alloc->i_blkno),
                            le32_to_cpu(alloc->id1.bitmap1.i_used),
                            ocfs2_local_alloc_count_bits(alloc));
index 56a768d06aa6fd5beb689356349828691abea47d..124471d26a73f4fe79738e2b89662d82a7ad0561 100644 (file)
@@ -99,11 +99,9 @@ static int __ocfs2_move_extent(handle_t *handle,
 
        index = ocfs2_search_extent_list(el, cpos);
        if (index == -1) {
-               ocfs2_error(inode->i_sb,
-                           "Inode %llu has an extent at cpos %u which can no "
-                           "longer be found.\n",
-                           (unsigned long long)ino, cpos);
-               ret = -EROFS;
+               ret = ocfs2_error(inode->i_sb,
+                                 "Inode %llu has an extent at cpos %u which can no longer be found\n",
+                                 (unsigned long long)ino, cpos);
                goto out;
        }
 
index 948681e37cfdb3a863906e9f61cf3bac12b2b6e5..b7dfac226b1e2dc517877c9402832abf4cba3530 100644 (file)
@@ -1035,11 +1035,6 @@ leave:
        if (handle)
                ocfs2_commit_trans(osb, handle);
 
-       if (child_locked)
-               ocfs2_inode_unlock(inode, 1);
-
-       ocfs2_inode_unlock(dir, 1);
-
        if (orphan_dir) {
                /* This was locked for us in ocfs2_prepare_orphan_dir() */
                ocfs2_inode_unlock(orphan_dir, 1);
@@ -1047,6 +1042,11 @@ leave:
                iput(orphan_dir);
        }
 
+       if (child_locked)
+               ocfs2_inode_unlock(inode, 1);
+
+       ocfs2_inode_unlock(dir, 1);
+
        brelse(fe_bh);
        brelse(parent_node_bh);
 
@@ -1309,6 +1309,11 @@ static int ocfs2_rename(struct inode *old_dir,
        }
        parents_locked = 1;
 
+       if (!new_dir->i_nlink) {
+               status = -EACCES;
+               goto bail;
+       }
+
        /* make sure both dirs have bhs
         * get an extra ref on old_dir_bh if old==new */
        if (!new_dir_bh) {
@@ -1569,12 +1574,25 @@ static int ocfs2_rename(struct inode *old_dir,
        status = ocfs2_find_entry(old_dentry->d_name.name,
                                  old_dentry->d_name.len, old_dir,
                                  &old_entry_lookup);
-       if (status)
+       if (status) {
+               if (!is_journal_aborted(osb->journal->j_journal)) {
+                       ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
+                                       "is not deleted.",
+                                       new_dentry->d_name.len, new_dentry->d_name.name,
+                                       old_dentry->d_name.len, old_dentry->d_name.name);
+               }
                goto bail;
+       }
 
        status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
        if (status < 0) {
                mlog_errno(status);
+               if (!is_journal_aborted(osb->journal->j_journal)) {
+                       ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
+                                       "is not deleted.",
+                                       new_dentry->d_name.len, new_dentry->d_name.name,
+                                       old_dentry->d_name.len, old_dentry->d_name.name);
+               }
                goto bail;
        }
 
@@ -1633,21 +1651,9 @@ static int ocfs2_rename(struct inode *old_dir,
        ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
        status = 0;
 bail:
-       if (rename_lock)
-               ocfs2_rename_unlock(osb);
-
        if (handle)
                ocfs2_commit_trans(osb, handle);
 
-       if (parents_locked)
-               ocfs2_double_unlock(old_dir, new_dir);
-
-       if (old_child_locked)
-               ocfs2_inode_unlock(old_inode, 1);
-
-       if (new_child_locked)
-               ocfs2_inode_unlock(new_inode, 1);
-
        if (orphan_dir) {
                /* This was locked for us in ocfs2_prepare_orphan_dir() */
                ocfs2_inode_unlock(orphan_dir, 1);
@@ -1655,6 +1661,18 @@ bail:
                iput(orphan_dir);
        }
 
+       if (new_child_locked)
+               ocfs2_inode_unlock(new_inode, 1);
+
+       if (old_child_locked)
+               ocfs2_inode_unlock(old_inode, 1);
+
+       if (parents_locked)
+               ocfs2_double_unlock(old_dir, new_dir);
+
+       if (rename_lock)
+               ocfs2_rename_unlock(osb);
+
        if (new_inode)
                sync_mapping_buffers(old_inode->i_mapping);
 
@@ -2601,27 +2619,6 @@ leave:
        return status;
 }
 
-static int ocfs2_dio_orphan_recovered(struct inode *inode)
-{
-       int ret;
-       struct buffer_head *di_bh = NULL;
-       struct ocfs2_dinode *di = NULL;
-
-       ret = ocfs2_inode_lock(inode, &di_bh, 1);
-       if (ret < 0) {
-               mlog_errno(ret);
-               return 0;
-       }
-
-       di = (struct ocfs2_dinode *) di_bh->b_data;
-       ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL));
-       ocfs2_inode_unlock(inode, 1);
-       brelse(di_bh);
-
-       return ret;
-}
-
-#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000
 int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
        struct inode *inode)
 {
@@ -2633,7 +2630,6 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
        handle_t *handle = NULL;
        struct ocfs2_dinode *di = NULL;
 
-restart:
        status = ocfs2_inode_lock(inode, &di_bh, 1);
        if (status < 0) {
                mlog_errno(status);
@@ -2643,15 +2639,21 @@ restart:
        di = (struct ocfs2_dinode *) di_bh->b_data;
        /*
         * Another append dio crashed?
-        * If so, wait for recovery first.
+        * If so, manually recover it first.
         */
        if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
-               ocfs2_inode_unlock(inode, 1);
-               brelse(di_bh);
-               wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq,
-                               ocfs2_dio_orphan_recovered(inode),
-                               msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL));
-               goto restart;
+               status = ocfs2_truncate_file(inode, di_bh, i_size_read(inode));
+               if (status < 0) {
+                       if (status != -ENOSPC)
+                               mlog_errno(status);
+                       goto bail_unlock_inode;
+               }
+
+               status = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail_unlock_inode;
+               }
        }
 
        status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
index 690ddc60189b5270246f971c4e21a63bfb0a0254..7a0126267847664e7a61d803278a2a15457ae37c 100644 (file)
@@ -286,6 +286,8 @@ enum ocfs2_mount_options
        OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
 
        OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15,  /* Journal Async Commit */
+       OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
+       OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
 };
 
 #define OCFS2_OSB_SOFT_RO      0x0001
index bb07004df72a36c6e7a297096201526853142bbe..8a54fd8a4fa57a76f7e0389ec8d875108c59e3a1 100644 (file)
@@ -138,8 +138,7 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
 
        if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
                ocfs2_error(inode->i_sb,
-                           "Quota file %llu is probably corrupted! Requested "
-                           "to read block %Lu but file has size only %Lu\n",
+                           "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n",
                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
                            (unsigned long long)v_block,
                            (unsigned long long)i_size_read(inode));
index 7dc818b87cd82ecb9b8f0b35de7dca55b541b5ee..e5d57cd325052a80eb4bccf63398e84019333995 100644 (file)
@@ -102,32 +102,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb,
 
 
        if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
-               ocfs2_error(sb,
-                           "Refcount block #%llu has bad signature %.*s",
-                           (unsigned long long)bh->b_blocknr, 7,
-                           rb->rf_signature);
-               return -EINVAL;
+               rc = ocfs2_error(sb,
+                                "Refcount block #%llu has bad signature %.*s\n",
+                                (unsigned long long)bh->b_blocknr, 7,
+                                rb->rf_signature);
+               goto out;
        }
 
        if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
-               ocfs2_error(sb,
-                           "Refcount block #%llu has an invalid rf_blkno "
-                           "of %llu",
-                           (unsigned long long)bh->b_blocknr,
-                           (unsigned long long)le64_to_cpu(rb->rf_blkno));
-               return -EINVAL;
+               rc = ocfs2_error(sb,
+                                "Refcount block #%llu has an invalid rf_blkno of %llu\n",
+                                (unsigned long long)bh->b_blocknr,
+                                (unsigned long long)le64_to_cpu(rb->rf_blkno));
+               goto out;
        }
 
        if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
-               ocfs2_error(sb,
-                           "Refcount block #%llu has an invalid "
-                           "rf_fs_generation of #%u",
-                           (unsigned long long)bh->b_blocknr,
-                           le32_to_cpu(rb->rf_fs_generation));
-               return -EINVAL;
+               rc = ocfs2_error(sb,
+                                "Refcount block #%llu has an invalid rf_fs_generation of #%u\n",
+                                (unsigned long long)bh->b_blocknr,
+                                le32_to_cpu(rb->rf_fs_generation));
+               goto out;
        }
-
-       return 0;
+out:
+       return rc;
 }
 
 static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
@@ -1102,12 +1100,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
                el = &eb->h_list;
 
                if (el->l_tree_depth) {
-                       ocfs2_error(sb,
-                       "refcount tree %llu has non zero tree "
-                       "depth in leaf btree tree block %llu\n",
-                       (unsigned long long)ocfs2_metadata_cache_owner(ci),
-                       (unsigned long long)eb_bh->b_blocknr);
-                       ret = -EROFS;
+                       ret = ocfs2_error(sb,
+                                         "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n",
+                                         (unsigned long long)ocfs2_metadata_cache_owner(ci),
+                                         (unsigned long long)eb_bh->b_blocknr);
                        goto out;
                }
        }
@@ -2359,10 +2355,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode,
                                           cpos, len, phys);
 
        if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
-               ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
-                           "tree, but the feature bit is not set in the "
-                           "super block.", inode->i_ino);
-               ret = -EROFS;
+               ret = ocfs2_error(inode->i_sb, "Inode %lu wants to use refcount tree, but the feature bit is not set in the super block\n",
+                                 inode->i_ino);
                goto out;
        }
 
@@ -2545,10 +2539,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
        u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
 
        if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
-               ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
-                           "tree, but the feature bit is not set in the "
-                           "super block.", inode->i_ino);
-               ret = -EROFS;
+               ret = ocfs2_error(inode->i_sb, "Inode %lu wants to use refcount tree, but the feature bit is not set in the super block\n",
+                                 inode->i_ino);
                goto out;
        }
 
@@ -2672,11 +2664,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
                el = &eb->h_list;
 
                if (el->l_tree_depth) {
-                       ocfs2_error(inode->i_sb,
-                                   "Inode %lu has non zero tree depth in "
-                                   "leaf block %llu\n", inode->i_ino,
-                                   (unsigned long long)eb_bh->b_blocknr);
-                       ret = -EROFS;
+                       ret = ocfs2_error(inode->i_sb,
+                                         "Inode %lu has non zero tree depth in leaf block %llu\n",
+                                         inode->i_ino,
+                                         (unsigned long long)eb_bh->b_blocknr);
                        goto out;
                }
        }
@@ -3106,11 +3097,9 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
 
        index = ocfs2_search_extent_list(el, cpos);
        if (index == -1) {
-               ocfs2_error(sb,
-                           "Inode %llu has an extent at cpos %u which can no "
-                           "longer be found.\n",
-                           (unsigned long long)ino, cpos);
-               ret = -EROFS;
+               ret = ocfs2_error(sb,
+                                 "Inode %llu has an extent at cpos %u which can no longer be found\n",
+                                 (unsigned long long)ino, cpos);
                goto out;
        }
 
@@ -3376,10 +3365,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
        if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
-               ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
-                           "tree, but the feature bit is not set in the "
-                           "super block.", inode->i_ino);
-               return -EROFS;
+               return ocfs2_error(inode->i_sb, "Inode %lu wants to use refcount tree, but the feature bit is not set in the super block\n",
+                                  inode->i_ino);
        }
 
        ocfs2_init_dealloc_ctxt(&context->dealloc);
index 4479029630bb37bb8a6e4880e94ab8633d967df6..d83d2602cf2b0aa8c7eee8a83f159ed54ccdb0c1 100644 (file)
@@ -149,10 +149,8 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
        brelse(ac->ac_bh);
        ac->ac_bh = NULL;
        ac->ac_resv = NULL;
-       if (ac->ac_find_loc_priv) {
-               kfree(ac->ac_find_loc_priv);
-               ac->ac_find_loc_priv = NULL;
-       }
+       kfree(ac->ac_find_loc_priv);
+       ac->ac_find_loc_priv = NULL;
 }
 
 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -167,12 +165,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 }
 
 #define do_error(fmt, ...)                                             \
-       do{                                                             \
-               if (resize)                                     \
-                       mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);        \
-               else                                                    \
-                       ocfs2_error(sb, fmt, ##__VA_ARGS__);            \
-       } while (0)
+do {                                                                   \
+       if (resize)                                                     \
+               mlog(ML_ERROR, fmt, ##__VA_ARGS__);                     \
+       else                                                            \
+               return ocfs2_error(sb, fmt, ##__VA_ARGS__);             \
+} while (0)
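
With the return folded into do_error() itself, the non-resize path now propagates whatever errno ocfs2_error() selected, which is why each validator check below drops its explicit "return -EINVAL". Roughly, the first check in ocfs2_validate_gd_self() expands to:

	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		if (resize)
			mlog(ML_ERROR, "Group descriptor #%llu has bad signature %.*s\n",
			     (unsigned long long)bh->b_blocknr, 7, gd->bg_signature);
		else
			return ocfs2_error(sb, "Group descriptor #%llu has bad signature %.*s\n",
					   (unsigned long long)bh->b_blocknr, 7, gd->bg_signature);
	}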
 
 static int ocfs2_validate_gd_self(struct super_block *sb,
                                  struct buffer_head *bh,
@@ -181,44 +179,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 
        if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-               do_error("Group descriptor #%llu has bad signature %.*s",
+               do_error("Group descriptor #%llu has bad signature %.*s\n",
                         (unsigned long long)bh->b_blocknr, 7,
                         gd->bg_signature);
-               return -EINVAL;
        }
 
        if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
-               do_error("Group descriptor #%llu has an invalid bg_blkno "
-                        "of %llu",
+               do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
                         (unsigned long long)bh->b_blocknr,
                         (unsigned long long)le64_to_cpu(gd->bg_blkno));
-               return -EINVAL;
        }
 
        if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
-               do_error("Group descriptor #%llu has an invalid "
-                        "fs_generation of #%u",
+               do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
                         (unsigned long long)bh->b_blocknr,
                         le32_to_cpu(gd->bg_generation));
-               return -EINVAL;
        }
 
        if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
-               do_error("Group descriptor #%llu has bit count %u but "
-                        "claims that %u are free",
+               do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
                         (unsigned long long)bh->b_blocknr,
                         le16_to_cpu(gd->bg_bits),
                         le16_to_cpu(gd->bg_free_bits_count));
-               return -EINVAL;
        }
 
        if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
-               do_error("Group descriptor #%llu has bit count %u but "
-                        "max bitmap bits of %u",
+               do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
                         (unsigned long long)bh->b_blocknr,
                         le16_to_cpu(gd->bg_bits),
                         8 * le16_to_cpu(gd->bg_size));
-               return -EINVAL;
        }
 
        return 0;
@@ -233,20 +222,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 
        if (di->i_blkno != gd->bg_parent_dinode) {
-               do_error("Group descriptor #%llu has bad parent "
-                        "pointer (%llu, expected %llu)",
+               do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
                         (unsigned long long)bh->b_blocknr,
                         (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
                         (unsigned long long)le64_to_cpu(di->i_blkno));
-               return -EINVAL;
        }
 
        max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
        if (le16_to_cpu(gd->bg_bits) > max_bits) {
-               do_error("Group descriptor #%llu has bit count of %u",
+               do_error("Group descriptor #%llu has bit count of %u\n",
                         (unsigned long long)bh->b_blocknr,
                         le16_to_cpu(gd->bg_bits));
-               return -EINVAL;
        }
 
        /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
@@ -254,10 +240,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
             le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
            ((le16_to_cpu(gd->bg_chain) ==
             le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
-               do_error("Group descriptor #%llu has bad chain %u",
+               do_error("Group descriptor #%llu has bad chain %u\n",
                         (unsigned long long)bh->b_blocknr,
                         le16_to_cpu(gd->bg_chain));
-               return -EINVAL;
        }
 
        return 0;
@@ -384,11 +369,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
        struct super_block * sb = alloc_inode->i_sb;
 
        if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
-               ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
-                           "b_blocknr (%llu)",
-                           (unsigned long long)group_blkno,
-                           (unsigned long long) bg_bh->b_blocknr);
-               status = -EIO;
+               status = ocfs2_error(alloc_inode->i_sb,
+                                    "group block (%llu) != b_blocknr (%llu)\n",
+                                    (unsigned long long)group_blkno,
+                                    (unsigned long long) bg_bh->b_blocknr);
                goto bail;
        }
 
@@ -834,9 +818,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 
        if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
-               ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
-                           (unsigned long long)le64_to_cpu(fe->i_blkno));
-               status = -EIO;
+               status = ocfs2_error(alloc_inode->i_sb,
+                                    "Invalid chain allocator %llu\n",
+                                    (unsigned long long)le64_to_cpu(fe->i_blkno));
                goto bail;
        }
 
@@ -1370,12 +1354,11 @@ int ocfs2_block_group_set_bits(handle_t *handle,
 
        le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
        if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
-               ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
-                           " count %u but claims %u are freed. num_bits %d",
-                           (unsigned long long)le64_to_cpu(bg->bg_blkno),
-                           le16_to_cpu(bg->bg_bits),
-                           le16_to_cpu(bg->bg_free_bits_count), num_bits);
-               return -EROFS;
+               return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
+                                  (unsigned long long)le64_to_cpu(bg->bg_blkno),
+                                  le16_to_cpu(bg->bg_bits),
+                                  le16_to_cpu(bg->bg_free_bits_count),
+                                  num_bits);
        }
        while(num_bits--)
                ocfs2_set_bit(bit_off++, bitmap);
@@ -1905,13 +1888,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 
        if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
            le32_to_cpu(fe->id1.bitmap1.i_total)) {
-               ocfs2_error(ac->ac_inode->i_sb,
-                           "Chain allocator dinode %llu has %u used "
-                           "bits but only %u total.",
-                           (unsigned long long)le64_to_cpu(fe->i_blkno),
-                           le32_to_cpu(fe->id1.bitmap1.i_used),
-                           le32_to_cpu(fe->id1.bitmap1.i_total));
-               status = -EIO;
+               status = ocfs2_error(ac->ac_inode->i_sb,
+                                    "Chain allocator dinode %llu has %u used bits but only %u total\n",
+                                    (unsigned long long)le64_to_cpu(fe->i_blkno),
+                                    le32_to_cpu(fe->id1.bitmap1.i_used),
+                                    le32_to_cpu(fe->id1.bitmap1.i_total));
                goto bail;
        }
 
@@ -2429,12 +2410,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
        }
        le16_add_cpu(&bg->bg_free_bits_count, num_bits);
        if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
-               ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
-                           " count %u but claims %u are freed. num_bits %d",
-                           (unsigned long long)le64_to_cpu(bg->bg_blkno),
-                           le16_to_cpu(bg->bg_bits),
-                           le16_to_cpu(bg->bg_free_bits_count), num_bits);
-               return -EROFS;
+               return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
+                                  (unsigned long long)le64_to_cpu(bg->bg_blkno),
+                                  le16_to_cpu(bg->bg_bits),
+                                  le16_to_cpu(bg->bg_free_bits_count),
+                                  num_bits);
        }
 
        if (undo_fn)
index 403c5660b30644a5c6f564ccaf795af4749e659a..2de4c8a9340c267a16381faacbd0ce66c18d123f 100644 (file)
@@ -192,6 +192,7 @@ enum {
        Opt_resv_level,
        Opt_dir_resv_level,
        Opt_journal_async_commit,
+       Opt_err_cont,
        Opt_err,
 };
 
@@ -224,6 +225,7 @@ static const match_table_t tokens = {
        {Opt_resv_level, "resv_level=%u"},
        {Opt_dir_resv_level, "dir_resv_level=%u"},
        {Opt_journal_async_commit, "journal_async_commit"},
+       {Opt_err_cont, "errors=continue"},
        {Opt_err, NULL}
 };
 
@@ -1330,10 +1332,19 @@ static int ocfs2_parse_options(struct super_block *sb,
                        mopt->mount_opt |= OCFS2_MOUNT_NOINTR;
                        break;
                case Opt_err_panic:
+                       mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
+                       mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
                        mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
                        break;
                case Opt_err_ro:
+                       mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
                        mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+                       mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS;
+                       break;
+               case Opt_err_cont:
+                       mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
+                       mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+                       mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT;
                        break;
                case Opt_data_ordered:
                        mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
@@ -1530,6 +1541,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
 
        if (opts & OCFS2_MOUNT_ERRORS_PANIC)
                seq_printf(s, ",errors=panic");
+       else if (opts & OCFS2_MOUNT_ERRORS_CONT)
+               seq_printf(s, ",errors=continue");
        else
                seq_printf(s, ",errors=remount-ro");
 
@@ -1550,8 +1563,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
                seq_printf(s, ",localflocks,");
 
        if (osb->osb_cluster_stack[0])
-               seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
-                          osb->osb_cluster_stack);
+               seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack,
+                                 OCFS2_STACK_LABEL_LEN);
        if (opts & OCFS2_MOUNT_USRQUOTA)
                seq_printf(s, ",usrquota");
        if (opts & OCFS2_MOUNT_GRPQUOTA)
@@ -1746,8 +1759,6 @@ static void ocfs2_inode_init_once(void *data)
        ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
        ocfs2_lock_res_init_once(&oi->ip_open_lockres);
 
-       init_waitqueue_head(&oi->append_dio_wq);
-
        ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
                                  &ocfs2_inode_caching_ops);
 
@@ -2541,31 +2552,43 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
        memset(osb, 0, sizeof(struct ocfs2_super));
 }
 
-/* Put OCFS2 into a readonly state, or (if the user specifies it),
- * panic(). We do not support continue-on-error operation. */
-static void ocfs2_handle_error(struct super_block *sb)
+/* Depending on the mount option passed, perform one of the following:
+ * Put OCFS2 into a readonly state (default)
+ * Return EIO so that only the process errs
+ * Fix the error as if fsck.ocfs2 -y
+ * panic
+ */
+static int ocfs2_handle_error(struct super_block *sb)
 {
        struct ocfs2_super *osb = OCFS2_SB(sb);
-
-       if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
-               panic("OCFS2: (device %s): panic forced after error\n",
-                     sb->s_id);
+       int rv = 0;
 
        ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
+       pr_crit("On-disk corruption discovered. "
+               "Please run fsck.ocfs2 once the filesystem is unmounted.\n");
 
-       if (sb->s_flags & MS_RDONLY &&
-           (ocfs2_is_soft_readonly(osb) ||
-            ocfs2_is_hard_readonly(osb)))
-               return;
-
-       printk(KERN_CRIT "File system is now read-only due to the potential "
-              "of on-disk corruption. Please run fsck.ocfs2 once the file "
-              "system is unmounted.\n");
-       sb->s_flags |= MS_RDONLY;
-       ocfs2_set_ro_flag(osb, 0);
+       if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) {
+               panic("OCFS2: (device %s): panic forced after error\n",
+                     sb->s_id);
+       } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) {
+               pr_crit("OCFS2: Returning error to the calling process.\n");
+               rv = -EIO;
+       } else { /* default option */
+               rv = -EROFS;
+               if (sb->s_flags & MS_RDONLY &&
+                               (ocfs2_is_soft_readonly(osb) ||
+                                ocfs2_is_hard_readonly(osb)))
+                       return rv;
+
+               pr_crit("OCFS2: File system is now read-only.\n");
+               sb->s_flags |= MS_RDONLY;
+               ocfs2_set_ro_flag(osb, 0);
+       }
+
+       return rv;
 }
 
-void __ocfs2_error(struct super_block *sb, const char *function,
+int __ocfs2_error(struct super_block *sb, const char *function,
                  const char *fmt, ...)
 {
        struct va_format vaf;
@@ -2577,12 +2600,12 @@ void __ocfs2_error(struct super_block *sb, const char *function,
 
        /* Not using mlog here because we want to show the actual
         * function the error came from. */
-       printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n",
+       printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV",
               sb->s_id, function, &vaf);
 
        va_end(args);
 
-       ocfs2_handle_error(sb);
+       return ocfs2_handle_error(sb);
 }
 
 /* Handle critical errors. This is intentionally more drastic than
@@ -2599,7 +2622,7 @@ void __ocfs2_abort(struct super_block *sb, const char *function,
        vaf.fmt = fmt;
        vaf.va = &args;
 
-       printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n",
+       printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV",
               sb->s_id, function, &vaf);
 
        va_end(args);
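
With ocfs2_handle_error() reworked as above, the errno that comes back from ocfs2_error() now depends on the mount mode rather than being fixed at each call site. A short sketch of what a caller observes (status, blkno and the bail label are illustrative):

	/* errors=panic          -> panic(), never returns
	 * errors=continue (new) -> OCFS2_OSB_ERROR_FS is set and -EIO is returned,
	 *                          so only the calling process sees the failure
	 * errors=remount-ro     -> the default: MS_RDONLY is set and -EROFS is returned
	 */
	status = ocfs2_error(osb->sb, "Corrupt group descriptor %llu\n",
			     (unsigned long long)blkno);
	if (status)
		goto bail;	/* -EIO or -EROFS, per the option parsed in ocfs2_parse_options() */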
index 74ff74cf78fe9202a110ff353e05163b3ce779d3..b477d0b1c7b6ce4caaf06a388f9044983e424a2b 100644 (file)
@@ -32,16 +32,18 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
                                  int node_num);
 
 __printf(3, 4)
-void __ocfs2_error(struct super_block *sb, const char *function,
+int __ocfs2_error(struct super_block *sb, const char *function,
                   const char *fmt, ...);
 
-#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
+#define ocfs2_error(sb, fmt, ...)                                      \
+       __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
 
 __printf(3, 4)
 void __ocfs2_abort(struct super_block *sb, const char *function,
                   const char *fmt, ...);
 
-#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
+#define ocfs2_abort(sb, fmt, ...)                                      \
+       __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
 
 /*
  * Void signal blockers, because in-kernel sigprocmask() only fails
index 889f3796a0d732638ce8d76fe9bf0b0c484eb7e0..ebfdea78659b1bda5a93674837e8f818c2503d44 100644 (file)
@@ -499,30 +499,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb,
         */
 
        if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-               ocfs2_error(sb,
-                           "Extended attribute block #%llu has bad "
-                           "signature %.*s",
-                           (unsigned long long)bh->b_blocknr, 7,
-                           xb->xb_signature);
-               return -EINVAL;
+               return ocfs2_error(sb,
+                                  "Extended attribute block #%llu has bad signature %.*s\n",
+                                  (unsigned long long)bh->b_blocknr, 7,
+                                  xb->xb_signature);
        }
 
        if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
-               ocfs2_error(sb,
-                           "Extended attribute block #%llu has an "
-                           "invalid xb_blkno of %llu",
-                           (unsigned long long)bh->b_blocknr,
-                           (unsigned long long)le64_to_cpu(xb->xb_blkno));
-               return -EINVAL;
+               return ocfs2_error(sb,
+                                  "Extended attribute block #%llu has an invalid xb_blkno of %llu\n",
+                                  (unsigned long long)bh->b_blocknr,
+                                  (unsigned long long)le64_to_cpu(xb->xb_blkno));
        }
 
        if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
-               ocfs2_error(sb,
-                           "Extended attribute block #%llu has an invalid "
-                           "xb_fs_generation of #%u",
-                           (unsigned long long)bh->b_blocknr,
-                           le32_to_cpu(xb->xb_fs_generation));
-               return -EINVAL;
+               return ocfs2_error(sb,
+                                  "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n",
+                                  (unsigned long long)bh->b_blocknr,
+                                  le32_to_cpu(xb->xb_fs_generation));
        }
 
        return 0;
@@ -3694,11 +3688,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
                el = &eb->h_list;
 
                if (el->l_tree_depth) {
-                       ocfs2_error(inode->i_sb,
-                                   "Inode %lu has non zero tree depth in "
-                                   "xattr tree block %llu\n", inode->i_ino,
-                                   (unsigned long long)eb_bh->b_blocknr);
-                       ret = -EROFS;
+                       ret = ocfs2_error(inode->i_sb,
+                                         "Inode %lu has non zero tree depth in xattr tree block %llu\n",
+                                         inode->i_ino,
+                                         (unsigned long long)eb_bh->b_blocknr);
                        goto out;
                }
        }
@@ -3713,11 +3706,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
        }
 
        if (!e_blkno) {
-               ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
-                           "record (%u, %u, 0) in xattr", inode->i_ino,
-                           le32_to_cpu(rec->e_cpos),
-                           ocfs2_rec_clusters(el, rec));
-               ret = -EROFS;
+               ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
+                                 inode->i_ino,
+                                 le32_to_cpu(rec->e_cpos),
+                                 ocfs2_rec_clusters(el, rec));
                goto out;
        }
 
@@ -7334,6 +7326,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
        const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
 
+       if (!capable(CAP_SYS_ADMIN))
+               return 0;
+
        if (list && total_len <= list_size) {
                memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
                memcpy(list + prefix_len, name, name_len);
index 7466ff339c667ea63ead6bf04f18d5662ef3d142..79073d68b475d71b0f87902550b3eeef945885b5 100644 (file)
@@ -588,10 +588,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
        struct super_block *sb = dentry->d_sb;
        struct ovl_fs *ufs = sb->s_fs_info;
 
-       seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
+       seq_show_option(m, "lowerdir", ufs->config.lowerdir);
        if (ufs->config.upperdir) {
-               seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
-               seq_printf(m, ",workdir=%s", ufs->config.workdir);
+               seq_show_option(m, "upperdir", ufs->config.upperdir);
+               seq_show_option(m, "workdir", ufs->config.workdir);
        }
        return 0;
 }
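
The show_options conversions here and below all address the same problem: the option value is a user-influenced string (an overlay directory, a journal device, a quota file name), and seq_show_option(), introduced alongside these hunks, escapes it before it lands in /proc/mounts so an embedded comma or newline cannot forge extra options. A sketch of the pattern, with example_fs_info and its fields purely illustrative:

static int example_show_options(struct seq_file *m, struct dentry *root)
{
	struct example_fs_info *info = root->d_sb->s_fs_info;

	/* emits ",datadir=<escaped value>" instead of pasting the raw string */
	seq_show_option(m, "datadir", info->datadir);
	/* fixed-size, possibly unterminated labels use the _n variant */
	seq_show_option_n(m, "label", info->label, sizeof(info->label));
	return 0;
}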
index ce065cf3104fb5ddd042a6c1344936d0ef0e8184..f60f0121e3319ec616d0a0151d3e4df146637c88 100644 (file)
@@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header,
 static inline void task_cap(struct seq_file *m, struct task_struct *p)
 {
        const struct cred *cred;
-       kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset;
+       kernel_cap_t cap_inheritable, cap_permitted, cap_effective,
+                       cap_bset, cap_ambient;
 
        rcu_read_lock();
        cred = __task_cred(p);
@@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
        cap_permitted   = cred->cap_permitted;
        cap_effective   = cred->cap_effective;
        cap_bset        = cred->cap_bset;
+       cap_ambient     = cred->cap_ambient;
        rcu_read_unlock();
 
        render_cap_t(m, "CapInh:\t", &cap_inheritable);
        render_cap_t(m, "CapPrm:\t", &cap_permitted);
        render_cap_t(m, "CapEff:\t", &cap_effective);
        render_cap_t(m, "CapBnd:\t", &cap_bset);
+       render_cap_t(m, "CapAmb:\t", &cap_ambient);
 }
 
 static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
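
CapAmb adds the ambient capability set to /proc/<pid>/status next to the existing sets. A minimal userspace sketch of exercising it, assuming the PR_CAP_AMBIENT prctl() interface that the ambient-capabilities patches in this series introduce (the fallback #defines are only for libc headers that predate it, and the capability must already be in the permitted and inheritable sets):

#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <linux/capability.h>

#ifndef PR_CAP_AMBIENT
#define PR_CAP_AMBIENT		47
#define PR_CAP_AMBIENT_RAISE	2
#endif

int main(void)
{
	/* raise CAP_NET_BIND_SERVICE into the ambient set so it survives
	 * execve() of an unprivileged binary */
	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
		  CAP_NET_BIND_SERVICE, 0, 0)) {
		perror("PR_CAP_AMBIENT_RAISE");
		return 1;
	}

	/* the CapAmb line printed below should now have the corresponding bit set */
	execlp("cat", "cat", "/proc/self/status", (char *)NULL);
	return 1;
}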
index ca1e091881d44fe5797924d5a34daca41f8166e9..3b4d8255e8068dccaa99b158b9d8daab193de656 100644 (file)
@@ -597,6 +597,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
                [ilog2(VM_HUGEPAGE)]    = "hg",
                [ilog2(VM_NOHUGEPAGE)]  = "nh",
                [ilog2(VM_MERGEABLE)]   = "mg",
+               [ilog2(VM_UFFD_MISSING)]= "um",
+               [ilog2(VM_UFFD_WP)]     = "uw",
        };
        size_t i;
 
index 0e4cf728126f2b164b6f0b2f878d4ab26ad4dbb8..4a62fe8cc3bff619516fbe15d62a8cfaa1aaa581 100644 (file)
@@ -714,18 +714,20 @@ static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
                seq_puts(seq, ",acl");
 
        if (REISERFS_SB(s)->s_jdev)
-               seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev);
+               seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev);
 
        if (journal->j_max_commit_age != journal->j_default_max_commit_age)
                seq_printf(seq, ",commit=%d", journal->j_max_commit_age);
 
 #ifdef CONFIG_QUOTA
        if (REISERFS_SB(s)->s_qf_names[USRQUOTA])
-               seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]);
+               seq_show_option(seq, "usrjquota",
+                               REISERFS_SB(s)->s_qf_names[USRQUOTA]);
        else if (opts & (1 << REISERFS_USRQUOTA))
                seq_puts(seq, ",usrquota");
        if (REISERFS_SB(s)->s_qf_names[GRPQUOTA])
-               seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
+               seq_show_option(seq, "grpjquota",
+                               REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
        else if (opts & (1 << REISERFS_GRPQUOTA))
                seq_puts(seq, ",grpquota");
        if (REISERFS_SB(s)->s_jquota_fmt) {
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
new file mode 100644 (file)
index 0000000..634e676
--- /dev/null
@@ -0,0 +1,1330 @@
+/*
+ *  fs/userfaultfd.c
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *  Copyright (C) 2008-2009 Red Hat, Inc.
+ *  Copyright (C) 2015  Red Hat, Inc.
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ *
+ *  Some part derived from fs/eventfd.c (anon inode setup) and
+ *  mm/ksm.c (mm hashing).
+ */
+
+#include <linux/hashtable.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/file.h>
+#include <linux/bug.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/mempolicy.h>
+#include <linux/ioctl.h>
+#include <linux/security.h>
+
+static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
+
+enum userfaultfd_state {
+       UFFD_STATE_WAIT_API,
+       UFFD_STATE_RUNNING,
+};
+
+/*
+ * Start with fault_pending_wqh and fault_wqh so they're more likely
+ * to be in the same cacheline.
+ */
+struct userfaultfd_ctx {
+       /* waitqueue head for the pending (i.e. not read) userfaults */
+       wait_queue_head_t fault_pending_wqh;
+       /* waitqueue head for the userfaults */
+       wait_queue_head_t fault_wqh;
+       /* waitqueue head for the pseudo fd to wakeup poll/read */
+       wait_queue_head_t fd_wqh;
+       /* a refile sequence protected by fault_pending_wqh lock */
+       struct seqcount refile_seq;
+       /* pseudo fd refcounting */
+       atomic_t refcount;
+       /* userfaultfd syscall flags */
+       unsigned int flags;
+       /* state machine */
+       enum userfaultfd_state state;
+       /* released */
+       bool released;
+       /* mm with one or more vmas attached to this userfaultfd_ctx */
+       struct mm_struct *mm;
+};
+
+struct userfaultfd_wait_queue {
+       struct uffd_msg msg;
+       wait_queue_t wq;
+       struct userfaultfd_ctx *ctx;
+};
+
+struct userfaultfd_wake_range {
+       unsigned long start;
+       unsigned long len;
+};
+
+static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
+                                    int wake_flags, void *key)
+{
+       struct userfaultfd_wake_range *range = key;
+       int ret;
+       struct userfaultfd_wait_queue *uwq;
+       unsigned long start, len;
+
+       uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+       ret = 0;
+       /* len == 0 means wake all */
+       start = range->start;
+       len = range->len;
+       if (len && (start > uwq->msg.arg.pagefault.address ||
+                   start + len <= uwq->msg.arg.pagefault.address))
+               goto out;
+       ret = wake_up_state(wq->private, mode);
+       if (ret)
+               /*
+                * Wake only once, autoremove behavior.
+                *
+                * After the effect of list_del_init is visible to the
+                * other CPUs, the waitqueue may disappear from under
+                * us, see the !list_empty_careful() in
+                * handle_userfault(). try_to_wake_up() has an
+                * implicit smp_mb__before_spinlock, and the
+                * wq->private is read before calling the extern
+                * function "wake_up_state" (which in turn calls
+                * try_to_wake_up). While the spin_lock;spin_unlock;
+                * wouldn't be enough, the smp_mb__before_spinlock is
+                * enough to avoid an explicit smp_mb() here.
+                */
+               list_del_init(&wq->task_list);
+out:
+       return ret;
+}
+
+/**
+ * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
+ * context.
+ * @ctx: [in] Pointer to the userfaultfd context.
+ *
+ * The refcount must already be non-zero when this is called; otherwise it BUG()s.
+ */
+static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
+{
+       if (!atomic_inc_not_zero(&ctx->refcount))
+               BUG();
+}
+
+/**
+ * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
+ * context.
+ * @ctx: [in] Pointer to userfaultfd context.
+ *
+ * The userfaultfd context reference must have been previously acquired either
+ * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
+ */
+static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
+{
+       if (atomic_dec_and_test(&ctx->refcount)) {
+               VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
+               VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
+               VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
+               VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
+               VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
+               VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
+               mmput(ctx->mm);
+               kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+       }
+}
+
+static inline void msg_init(struct uffd_msg *msg)
+{
+       BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
+       /*
+        * Must use memset to zero out the padding or kernel data is
+        * leaked to userland.
+        */
+       memset(msg, 0, sizeof(struct uffd_msg));
+}
+
+static inline struct uffd_msg userfault_msg(unsigned long address,
+                                           unsigned int flags,
+                                           unsigned long reason)
+{
+       struct uffd_msg msg;
+       msg_init(&msg);
+       msg.event = UFFD_EVENT_PAGEFAULT;
+       msg.arg.pagefault.address = address;
+       if (flags & FAULT_FLAG_WRITE)
+               /*
+                * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the
+                * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
+                * was not set in a UFFD_EVENT_PAGEFAULT, it means it
+                * was a read fault, otherwise if set it means it's
+                * a write fault.
+                */
+               msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
+       if (reason & VM_UFFD_WP)
+               /*
+                * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
+                * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
+                * not set in a UFFD_EVENT_PAGEFAULT, it means it was
+                * a missing fault, otherwise if set it means it's a
+                * write protect fault.
+                */
+               msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+       return msg;
+}
+
+/*
+ * Verify the pagetables are still not ok after having registered into
+ * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
+ * userfault that has already been resolved, if userfaultfd_read and
+ * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
+ * threads.
+ */
+static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
+                                        unsigned long address,
+                                        unsigned long flags,
+                                        unsigned long reason)
+{
+       struct mm_struct *mm = ctx->mm;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd, _pmd;
+       pte_t *pte;
+       bool ret = true;
+
+       VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+       pgd = pgd_offset(mm, address);
+       if (!pgd_present(*pgd))
+               goto out;
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               goto out;
+       pmd = pmd_offset(pud, address);
+       /*
+        * READ_ONCE must function as a barrier with narrower scope
+        * and it must be equivalent to:
+        *      _pmd = *pmd; barrier();
+        *
+        * This is to deal with the instability (as in
+        * pmd_trans_unstable) of the pmd.
+        */
+       _pmd = READ_ONCE(*pmd);
+       if (!pmd_present(_pmd))
+               goto out;
+
+       ret = false;
+       if (pmd_trans_huge(_pmd))
+               goto out;
+
+       /*
+        * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
+        * and use the standard pte_offset_map() instead of parsing _pmd.
+        */
+       pte = pte_offset_map(pmd, address);
+       /*
+        * Lockless access: we're in a wait_event so it's ok if it
+        * changes under us.
+        */
+       if (pte_none(*pte))
+               ret = true;
+       pte_unmap(pte);
+
+out:
+       return ret;
+}
+
+/*
+ * The locking rules involved in returning VM_FAULT_RETRY depending on
+ * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
+ * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
+ * recommendation in __lock_page_or_retry is not an understatement.
+ *
+ * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
+ * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
+ * not set.
+ *
+ * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
+ * set, VM_FAULT_RETRY can still be returned if and only if there are
+ * fatal_signal_pending()s, and the mmap_sem must be released before
+ * returning it.
+ */
+int handle_userfault(struct vm_area_struct *vma, unsigned long address,
+                    unsigned int flags, unsigned long reason)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       struct userfaultfd_ctx *ctx;
+       struct userfaultfd_wait_queue uwq;
+       int ret;
+       bool must_wait, return_to_userland;
+
+       BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+       ret = VM_FAULT_SIGBUS;
+       ctx = vma->vm_userfaultfd_ctx.ctx;
+       if (!ctx)
+               goto out;
+
+       BUG_ON(ctx->mm != mm);
+
+       VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
+       VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
+
+       /*
+        * If it's already released don't get it. This avoids looping
+        * in __get_user_pages if userfaultfd_release waits on the
+        * caller of handle_userfault to release the mmap_sem.
+        */
+       if (unlikely(ACCESS_ONCE(ctx->released)))
+               goto out;
+
+       /*
+        * Check that we can return VM_FAULT_RETRY.
+        *
+        * NOTE: it should become possible to return VM_FAULT_RETRY
+        * even if FAULT_FLAG_TRIED is set without leading to gup()
+        * -EBUSY failures, if the userfaultfd is to be extended for
+        * VM_UFFD_WP tracking and we intend to arm the userfault
+        * without first stopping userland access to the memory. For
+        * VM_UFFD_MISSING userfaults this is enough for now.
+        */
+       if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) {
+               /*
+                * Validate the invariant that nowait must allow retry
+                * to be sure not to return SIGBUS erroneously on
+                * nowait invocations.
+                */
+               BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT);
+#ifdef CONFIG_DEBUG_VM
+               if (printk_ratelimit()) {
+                       printk(KERN_WARNING
+                              "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags);
+                       dump_stack();
+               }
+#endif
+               goto out;
+       }
+
+       /*
+        * Handle nowait, not much to do other than tell it to retry
+        * and wait.
+        */
+       ret = VM_FAULT_RETRY;
+       if (flags & FAULT_FLAG_RETRY_NOWAIT)
+               goto out;
+
+       /* take the reference before dropping the mmap_sem */
+       userfaultfd_ctx_get(ctx);
+
+       init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
+       uwq.wq.private = current;
+       uwq.msg = userfault_msg(address, flags, reason);
+       uwq.ctx = ctx;
+
+       return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
+               (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
+
+       spin_lock(&ctx->fault_pending_wqh.lock);
+       /*
+        * After the __add_wait_queue the uwq is visible to userland
+        * through poll/read().
+        */
+       __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
+       /*
+        * The smp_mb() after __set_current_state prevents the reads
+        * following the spin_unlock to happen before the list_add in
+        * __add_wait_queue.
+        */
+       set_current_state(return_to_userland ? TASK_INTERRUPTIBLE :
+                         TASK_KILLABLE);
+       spin_unlock(&ctx->fault_pending_wqh.lock);
+
+       must_wait = userfaultfd_must_wait(ctx, address, flags, reason);
+       up_read(&mm->mmap_sem);
+
+       if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
+                  (return_to_userland ? !signal_pending(current) :
+                   !fatal_signal_pending(current)))) {
+               wake_up_poll(&ctx->fd_wqh, POLLIN);
+               schedule();
+               ret |= VM_FAULT_MAJOR;
+       }
+
+       __set_current_state(TASK_RUNNING);
+
+       if (return_to_userland) {
+               if (signal_pending(current) &&
+                   !fatal_signal_pending(current)) {
+                       /*
+                        * If we got a SIGSTOP or SIGCONT and this is
+                        * a normal userland page fault, just let
+                        * userland return so the signal will be
+                        * handled and gdb debugging works.  The page
+                        * fault code immediately after we return from
+                        * this function is going to release the
+                        * mmap_sem and it's not depending on it
+                        * (unlike gup would if we were not to return
+                        * VM_FAULT_RETRY).
+                        *
+                        * If a fatal signal is pending we still take
+                        * the streamlined VM_FAULT_RETRY failure path
+                        * and there's no need to retake the mmap_sem
+                        * in such case.
+                        */
+                       down_read(&mm->mmap_sem);
+                       ret = 0;
+               }
+       }
+
+       /*
+        * Here we race with the list_del; list_add in
+        * userfaultfd_ctx_read(), however because we don't ever run
+        * list_del_init() to refile across the two lists, the prev
+        * and next pointers will never point to self. list_add also
+        * would never let either of the two pointers point to
+        * self. So list_empty_careful won't risk seeing both pointers
+        * pointing to self at any time during the list refile. The
+        * only case where list_del_init() is called is the full
+        * removal in the wake function and there we don't re-list_add
+        * and it's fine not to block on the spinlock. The uwq on this
+        * kernel stack can be released after the list_del_init.
+        */
+       if (!list_empty_careful(&uwq.wq.task_list)) {
+               spin_lock(&ctx->fault_pending_wqh.lock);
+               /*
+                * No need of list_del_init(), the uwq on the stack
+                * will be freed shortly anyway.
+                */
+               list_del(&uwq.wq.task_list);
+               spin_unlock(&ctx->fault_pending_wqh.lock);
+       }
+
+       /*
+        * ctx may go away after this if the userfault pseudo fd is
+        * already released.
+        */
+       userfaultfd_ctx_put(ctx);
+
+out:
+       return ret;
+}
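
handle_userfault() above is the kernel half of the interface: the faulting task queues a uffd_msg on fault_pending_wqh and sleeps until userland resolves the missing page. For orientation, a minimal userspace sketch of the other half, assuming the uAPI added alongside this file (<linux/userfaultfd.h> with UFFDIO_API, UFFDIO_REGISTER and UFFDIO_COPY) and a raw syscall() since libc has no wrapper; error handling is omitted, and in practice the read()/UFFDIO_COPY side runs in a different thread than the one that faults:

#include <fcntl.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);

	/* O_NONBLOCK is mandatory: userfaultfd_poll() below returns POLLERR without it */
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	struct uffdio_api api = { .api = UFFD_API };
	ioctl(uffd, UFFDIO_API, &api);		/* handshake: UFFD_STATE_WAIT_API -> RUNNING */

	char *area = mmap(NULL, page, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = page },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,	/* arms VM_UFFD_MISSING on the vma */
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	/* another thread would touch *area here and block in handle_userfault() */

	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
	poll(&pfd, 1, -1);			/* POLLIN once a fault is pending */

	struct uffd_msg msg;
	read(uffd, &msg, sizeof(msg));		/* msg.event == UFFD_EVENT_PAGEFAULT */

	char *src = mmap(NULL, page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);	/* a page of zeroes to install */
	struct uffdio_copy copy = {
		.dst = msg.arg.pagefault.address & ~(page - 1),
		.src = (unsigned long)src,
		.len = page,
	};
	ioctl(uffd, UFFDIO_COPY, &copy);	/* resolves the fault and wakes the sleeping task */

	return 0;
}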
+
+static int userfaultfd_release(struct inode *inode, struct file *file)
+{
+       struct userfaultfd_ctx *ctx = file->private_data;
+       struct mm_struct *mm = ctx->mm;
+       struct vm_area_struct *vma, *prev;
+       /* len == 0 means wake all */
+       struct userfaultfd_wake_range range = { .len = 0, };
+       unsigned long new_flags;
+
+       ACCESS_ONCE(ctx->released) = true;
+
+       /*
+        * Flush page faults out of all CPUs. NOTE: all page faults
+        * must be retried without returning VM_FAULT_SIGBUS if
+        * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
+        * changes while handle_userfault released the mmap_sem. So
+        * it's critical that released is set to true (above), before
+        * taking the mmap_sem for writing.
+        */
+       down_write(&mm->mmap_sem);
+       prev = NULL;
+       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+               cond_resched();
+               BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
+                      !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+               if (vma->vm_userfaultfd_ctx.ctx != ctx) {
+                       prev = vma;
+                       continue;
+               }
+               new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+               prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
+                                new_flags, vma->anon_vma,
+                                vma->vm_file, vma->vm_pgoff,
+                                vma_policy(vma),
+                                NULL_VM_UFFD_CTX);
+               if (prev)
+                       vma = prev;
+               else
+                       prev = vma;
+               vma->vm_flags = new_flags;
+               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+       }
+       up_write(&mm->mmap_sem);
+
+       /*
+        * After no new page faults can wait on this fault_*wqh, flush
+        * the last page faults that may have been already waiting on
+        * the fault_*wqh.
+        */
+       spin_lock(&ctx->fault_pending_wqh.lock);
+       __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range);
+       __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range);
+       spin_unlock(&ctx->fault_pending_wqh.lock);
+
+       wake_up_poll(&ctx->fd_wqh, POLLHUP);
+       userfaultfd_ctx_put(ctx);
+       return 0;
+}
+
+/* fault_pending_wqh.lock must be held by the caller */
+static inline struct userfaultfd_wait_queue *find_userfault(
+       struct userfaultfd_ctx *ctx)
+{
+       wait_queue_t *wq;
+       struct userfaultfd_wait_queue *uwq;
+
+       VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock));
+
+       uwq = NULL;
+       if (!waitqueue_active(&ctx->fault_pending_wqh))
+               goto out;
+       /* walk in reverse to provide FIFO behavior to read userfaults */
+       wq = list_last_entry(&ctx->fault_pending_wqh.task_list,
+                            typeof(*wq), task_list);
+       uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+out:
+       return uwq;
+}
+
+static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
+{
+       struct userfaultfd_ctx *ctx = file->private_data;
+       unsigned int ret;
+
+       poll_wait(file, &ctx->fd_wqh, wait);
+
+       switch (ctx->state) {
+       case UFFD_STATE_WAIT_API:
+               return POLLERR;
+       case UFFD_STATE_RUNNING:
+               /*
+                * poll() never guarantees that read won't block.
+                * userfaults can be woken before they're read().
+                */
+               if (unlikely(!(file->f_flags & O_NONBLOCK)))
+                       return POLLERR;
+               /*
+                * Lockless check to see if there are pending faults.
+                * __pollwait()'s last action is the add_wait_queue(),
+                * but the spin_unlock would allow the
+                * waitqueue_active() to pass above the actual list_add
+                * inside the add_wait_queue() critical section. So use
+                * a full memory barrier to serialize the list_add
+                * write of add_wait_queue() with the waitqueue_active
+                * read below.
+                */
+               ret = 0;
+               smp_mb();
+               if (waitqueue_active(&ctx->fault_pending_wqh))
+                       ret = POLLIN;
+               return ret;
+       default:
+               BUG();
+       }
+}
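The poll handler above reports POLLIN only while at least one fault sits on fault_pending_wqh, and it insists the descriptor was opened with O_NONBLOCK. A minimal userspace sketch of the intended event loop; uffd is assumed to be a registered userfaultfd and service_fault() is a hypothetical helper:

    /* assumes: <poll.h>, <unistd.h>, <linux/userfaultfd.h>; int uffd created with UFFD_NONBLOCK */
    struct pollfd pfd = { .fd = uffd, .events = POLLIN };

    for (;;) {
            if (poll(&pfd, 1, -1) < 1)      /* wait until a userfault is queued */
                    continue;
            struct uffd_msg msg;
            /* drain everything pending; read() fails with EAGAIN once empty */
            while (read(uffd, &msg, sizeof(msg)) == sizeof(msg)) {
                    if (msg.event == UFFD_EVENT_PAGEFAULT)
                            service_fault(uffd, msg.arg.pagefault.address);
            }
    }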
+
+static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
+                                   struct uffd_msg *msg)
+{
+       ssize_t ret;
+       DECLARE_WAITQUEUE(wait, current);
+       struct userfaultfd_wait_queue *uwq;
+
+       /* always take the fd_wqh lock before the fault_pending_wqh lock */
+       spin_lock(&ctx->fd_wqh.lock);
+       __add_wait_queue(&ctx->fd_wqh, &wait);
+       for (;;) {
+               set_current_state(TASK_INTERRUPTIBLE);
+               spin_lock(&ctx->fault_pending_wqh.lock);
+               uwq = find_userfault(ctx);
+               if (uwq) {
+                       /*
+                        * Use a seqcount to repeat the lockless check
+                        * in wake_userfault() to avoid missing
+                        * wakeups because during the refile both
+                        * waitqueues could become empty if this is the
+                        * only userfault.
+                        */
+                       write_seqcount_begin(&ctx->refile_seq);
+
+                       /*
+                        * The fault_pending_wqh.lock prevents the uwq
+                        * from disappearing from under us.
+                        *
+                        * Refile this userfault from
+                        * fault_pending_wqh to fault_wqh, it's not
+                        * pending anymore after we read it.
+                        *
+                        * Use list_del() by hand (as
+                        * userfaultfd_wake_function also uses
+                        * list_del_init() by hand) to be sure nobody
+                        * changes __remove_wait_queue() to use
+                        * list_del_init() in turn breaking the
+                        * !list_empty_careful() check in
+                        * handle_userfault(). The uwq->wq.task_list
+                        * must never be empty at any time during the
+                        * refile, or the waitqueue could disappear
+                        * from under us. The "wait_queue_head_t"
+                        * parameter of __remove_wait_queue() is unused
+                        * anyway.
+                        */
+                       list_del(&uwq->wq.task_list);
+                       __add_wait_queue(&ctx->fault_wqh, &uwq->wq);
+
+                       write_seqcount_end(&ctx->refile_seq);
+
+                       /* careful to always initialize msg if ret == 0 */
+                       *msg = uwq->msg;
+                       spin_unlock(&ctx->fault_pending_wqh.lock);
+                       ret = 0;
+                       break;
+               }
+               spin_unlock(&ctx->fault_pending_wqh.lock);
+               if (signal_pending(current)) {
+                       ret = -ERESTARTSYS;
+                       break;
+               }
+               if (no_wait) {
+                       ret = -EAGAIN;
+                       break;
+               }
+               spin_unlock(&ctx->fd_wqh.lock);
+               schedule();
+               spin_lock(&ctx->fd_wqh.lock);
+       }
+       __remove_wait_queue(&ctx->fd_wqh, &wait);
+       __set_current_state(TASK_RUNNING);
+       spin_unlock(&ctx->fd_wqh.lock);
+
+       return ret;
+}
+
+static ssize_t userfaultfd_read(struct file *file, char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+       struct userfaultfd_ctx *ctx = file->private_data;
+       ssize_t _ret, ret = 0;
+       struct uffd_msg msg;
+       int no_wait = file->f_flags & O_NONBLOCK;
+
+       if (ctx->state == UFFD_STATE_WAIT_API)
+               return -EINVAL;
+
+       for (;;) {
+               if (count < sizeof(msg))
+                       return ret ? ret : -EINVAL;
+               _ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
+               if (_ret < 0)
+                       return ret ? ret : _ret;
+               if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
+                       return ret ? ret : -EFAULT;
+               ret += sizeof(msg);
+               buf += sizeof(msg);
+               count -= sizeof(msg);
+               /*
+                * Allow reading more than one fault at a time, but only
+                * block while waiting for the very first one.
+                */
+               no_wait = O_NONBLOCK;
+       }
+}
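userfaultfd_read() packs as many struct uffd_msg entries as fit into the user buffer and only blocks (when O_NONBLOCK is clear) while waiting for the first one. A hedged sketch of reading a batch in one call, with uffd and the headers assumed as in the previous sketch:

    struct uffd_msg msgs[16];
    ssize_t n = read(uffd, msgs, sizeof(msgs));   /* count must be >= sizeof(*msgs) */

    if (n > 0) {
            for (size_t i = 0; i < (size_t)n / sizeof(msgs[0]); i++)
                    printf("pagefault at 0x%llx flags 0x%llx\n",
                           (unsigned long long)msgs[i].arg.pagefault.address,
                           (unsigned long long)msgs[i].arg.pagefault.flags);
    }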
+
+static void __wake_userfault(struct userfaultfd_ctx *ctx,
+                            struct userfaultfd_wake_range *range)
+{
+       unsigned long start, end;
+
+       start = range->start;
+       end = range->start + range->len;
+
+       spin_lock(&ctx->fault_pending_wqh.lock);
+       /* wake all in the range and autoremove */
+       if (waitqueue_active(&ctx->fault_pending_wqh))
+               __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0,
+                                    range);
+       if (waitqueue_active(&ctx->fault_wqh))
+               __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range);
+       spin_unlock(&ctx->fault_pending_wqh.lock);
+}
+
+static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
+                                          struct userfaultfd_wake_range *range)
+{
+       unsigned seq;
+       bool need_wakeup;
+
+       /*
+        * To be sure waitqueue_active() is not reordered by the CPU
+        * before the pagetable update, use an explicit SMP memory
+        * barrier here. PT lock release or up_read(mmap_sem) still
+        * have release semantics that can allow the
+        * waitqueue_active() to be reordered before the pte update.
+        */
+       smp_mb();
+
+       /*
+        * Use waitqueue_active because it's very common to change
+        * the address space atomically even when there are no
+        * userfaults yet. So we take the spinlock only when we're
+        * sure we have userfaults to wake.
+        */
+       do {
+               seq = read_seqcount_begin(&ctx->refile_seq);
+               need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
+                       waitqueue_active(&ctx->fault_wqh);
+               cond_resched();
+       } while (read_seqcount_retry(&ctx->refile_seq, seq));
+       if (need_wakeup)
+               __wake_userfault(ctx, range);
+}
+
+static __always_inline int validate_range(struct mm_struct *mm,
+                                         __u64 start, __u64 len)
+{
+       __u64 task_size = mm->task_size;
+
+       if (start & ~PAGE_MASK)
+               return -EINVAL;
+       if (len & ~PAGE_MASK)
+               return -EINVAL;
+       if (!len)
+               return -EINVAL;
+       if (start < mmap_min_addr)
+               return -EINVAL;
+       if (start >= task_size)
+               return -EINVAL;
+       if (len > task_size - start)
+               return -EINVAL;
+       return 0;
+}
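validate_range() only accepts non-empty, page-aligned ranges that lie between mmap_min_addr and the task size. A small userspace sketch of rounding an arbitrary buffer to something the ioctls will accept; addr/len and the use of sysconf() are assumptions of the sketch:

    /* assumes: <unistd.h>, <stdint.h>; void *addr and size_t len describe the region */
    uint64_t page_size = sysconf(_SC_PAGESIZE);
    struct uffdio_range range;

    range.start = (uint64_t)(uintptr_t)addr & ~(page_size - 1);        /* align down */
    range.len   = (((uint64_t)(uintptr_t)addr + len + page_size - 1) &
                   ~(page_size - 1)) - range.start;                    /* align up */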
+
+static int userfaultfd_register(struct userfaultfd_ctx *ctx,
+                               unsigned long arg)
+{
+       struct mm_struct *mm = ctx->mm;
+       struct vm_area_struct *vma, *prev, *cur;
+       int ret;
+       struct uffdio_register uffdio_register;
+       struct uffdio_register __user *user_uffdio_register;
+       unsigned long vm_flags, new_flags;
+       bool found;
+       unsigned long start, end, vma_end;
+
+       user_uffdio_register = (struct uffdio_register __user *) arg;
+
+       ret = -EFAULT;
+       if (copy_from_user(&uffdio_register, user_uffdio_register,
+                          sizeof(uffdio_register)-sizeof(__u64)))
+               goto out;
+
+       ret = -EINVAL;
+       if (!uffdio_register.mode)
+               goto out;
+       if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
+                                    UFFDIO_REGISTER_MODE_WP))
+               goto out;
+       vm_flags = 0;
+       if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
+               vm_flags |= VM_UFFD_MISSING;
+       if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+               vm_flags |= VM_UFFD_WP;
+               /*
+                * FIXME: remove the below error constraint by
+                * implementing the wprotect tracking mode.
+                */
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = validate_range(mm, uffdio_register.range.start,
+                            uffdio_register.range.len);
+       if (ret)
+               goto out;
+
+       start = uffdio_register.range.start;
+       end = start + uffdio_register.range.len;
+
+       down_write(&mm->mmap_sem);
+       vma = find_vma_prev(mm, start, &prev);
+
+       ret = -ENOMEM;
+       if (!vma)
+               goto out_unlock;
+
+       /* check that there's at least one vma in the range */
+       ret = -EINVAL;
+       if (vma->vm_start >= end)
+               goto out_unlock;
+
+       /*
+        * Search for incompatible vmas.
+        *
+        * FIXME: this shall be relaxed later so that it doesn't fail
+        * on tmpfs backed vmas (in addition to the current allowance
+        * on anonymous vmas).
+        */
+       found = false;
+       for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+               cond_resched();
+
+               BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+                      !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+
+               /* check for incompatible vmas */
+               ret = -EINVAL;
+               if (cur->vm_ops)
+                       goto out_unlock;
+
+               /*
+                * Check that this vma isn't already owned by a
+                * different userfaultfd. We can't allow more than one
+                * userfaultfd to own a single vma simultaneously or we
+                * wouldn't know which one to deliver the userfaults to.
+                */
+               ret = -EBUSY;
+               if (cur->vm_userfaultfd_ctx.ctx &&
+                   cur->vm_userfaultfd_ctx.ctx != ctx)
+                       goto out_unlock;
+
+               found = true;
+       }
+       BUG_ON(!found);
+
+       if (vma->vm_start < start)
+               prev = vma;
+
+       ret = 0;
+       do {
+               cond_resched();
+
+               BUG_ON(vma->vm_ops);
+               BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
+                      vma->vm_userfaultfd_ctx.ctx != ctx);
+
+               /*
+                * Nothing to do: this vma is already registered into this
+                * userfaultfd and with the right tracking mode too.
+                */
+               if (vma->vm_userfaultfd_ctx.ctx == ctx &&
+                   (vma->vm_flags & vm_flags) == vm_flags)
+                       goto skip;
+
+               if (vma->vm_start > start)
+                       start = vma->vm_start;
+               vma_end = min(end, vma->vm_end);
+
+               new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
+               prev = vma_merge(mm, prev, start, vma_end, new_flags,
+                                vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+                                vma_policy(vma),
+                                ((struct vm_userfaultfd_ctx){ ctx }));
+               if (prev) {
+                       vma = prev;
+                       goto next;
+               }
+               if (vma->vm_start < start) {
+                       ret = split_vma(mm, vma, start, 1);
+                       if (ret)
+                               break;
+               }
+               if (vma->vm_end > end) {
+                       ret = split_vma(mm, vma, end, 0);
+                       if (ret)
+                               break;
+               }
+       next:
+               /*
+                * In the vma_merge() successful mprotect-like case 8:
+                * the next vma was merged into the current one and
+                * the current one has not been updated yet.
+                */
+               vma->vm_flags = new_flags;
+               vma->vm_userfaultfd_ctx.ctx = ctx;
+
+       skip:
+               prev = vma;
+               start = vma->vm_end;
+               vma = vma->vm_next;
+       } while (vma && vma->vm_start < end);
+out_unlock:
+       up_write(&mm->mmap_sem);
+       if (!ret) {
+               /*
+                * Now that we scanned all vmas we can already tell
+                * userland which ioctl methods are guaranteed to
+                * succeed on this range.
+                */
+               if (put_user(UFFD_API_RANGE_IOCTLS,
+                            &user_uffdio_register->ioctls))
+                       ret = -EFAULT;
+       }
+out:
+       return ret;
+}
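The register path walks every vma in the range, refuses vmas with vm_ops (only anonymous memory for now), splits or merges vmas as needed and reports the usable range ioctls back through uffdio_register.ioctls. A hedged userspace sketch of registering an anonymous mapping for missing-page tracking; area/area_len are assumptions of this sketch:

    /* assumes: <sys/ioctl.h>, <stdio.h>, <stdint.h>, <linux/userfaultfd.h>;
     * int uffd after the UFFDIO_API handshake; area is a page-aligned anon mmap() */
    struct uffdio_register reg = {
            .range.start = (uint64_t)(uintptr_t)area,
            .range.len   = area_len,
            .mode        = UFFDIO_REGISTER_MODE_MISSING,   /* _MODE_WP is still -EINVAL */
    };

    if (ioctl(uffd, UFFDIO_REGISTER, &reg) == -1)
            perror("UFFDIO_REGISTER");
    else if (!(reg.ioctls & ((uint64_t)1 << _UFFDIO_COPY)))
            fprintf(stderr, "UFFDIO_COPY not available on this range\n");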
+
+static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
+                                 unsigned long arg)
+{
+       struct mm_struct *mm = ctx->mm;
+       struct vm_area_struct *vma, *prev, *cur;
+       int ret;
+       struct uffdio_range uffdio_unregister;
+       unsigned long new_flags;
+       bool found;
+       unsigned long start, end, vma_end;
+       const void __user *buf = (void __user *)arg;
+
+       ret = -EFAULT;
+       if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
+               goto out;
+
+       ret = validate_range(mm, uffdio_unregister.start,
+                            uffdio_unregister.len);
+       if (ret)
+               goto out;
+
+       start = uffdio_unregister.start;
+       end = start + uffdio_unregister.len;
+
+       down_write(&mm->mmap_sem);
+       vma = find_vma_prev(mm, start, &prev);
+
+       ret = -ENOMEM;
+       if (!vma)
+               goto out_unlock;
+
+       /* check that there's at least one vma in the range */
+       ret = -EINVAL;
+       if (vma->vm_start >= end)
+               goto out_unlock;
+
+       /*
+        * Search for incompatible vmas.
+        *
+        * FIXME: this shall be relaxed later so that it doesn't fail
+        * on tmpfs backed vmas (in addition to the current allowance
+        * on anonymous vmas).
+        */
+       found = false;
+       ret = -EINVAL;
+       for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+               cond_resched();
+
+               BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+                      !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+
+               /*
+                * Check for incompatible vmas. Not strictly required
+                * here, as incompatible vmas cannot have a
+                * userfaultfd_ctx registered on them, but this
+                * provides stricter behavior to catch
+                * unregistration errors.
+                */
+               if (cur->vm_ops)
+                       goto out_unlock;
+
+               found = true;
+       }
+       BUG_ON(!found);
+
+       if (vma->vm_start < start)
+               prev = vma;
+
+       ret = 0;
+       do {
+               cond_resched();
+
+               BUG_ON(vma->vm_ops);
+
+               /*
+                * Nothing to do: this vma is not registered with any
+                * userfaultfd, so there is nothing to unregister here.
+                */
+               if (!vma->vm_userfaultfd_ctx.ctx)
+                       goto skip;
+
+               if (vma->vm_start > start)
+                       start = vma->vm_start;
+               vma_end = min(end, vma->vm_end);
+
+               new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+               prev = vma_merge(mm, prev, start, vma_end, new_flags,
+                                vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+                                vma_policy(vma),
+                                NULL_VM_UFFD_CTX);
+               if (prev) {
+                       vma = prev;
+                       goto next;
+               }
+               if (vma->vm_start < start) {
+                       ret = split_vma(mm, vma, start, 1);
+                       if (ret)
+                               break;
+               }
+               if (vma->vm_end > end) {
+                       ret = split_vma(mm, vma, end, 0);
+                       if (ret)
+                               break;
+               }
+       next:
+               /*
+                * In the vma_merge() successful mprotect-like case 8:
+                * the next vma was merged into the current one and
+                * the current one has not been updated yet.
+                */
+               vma->vm_flags = new_flags;
+               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+       skip:
+               prev = vma;
+               start = vma->vm_end;
+               vma = vma->vm_next;
+       } while (vma && vma->vm_start < end);
+out_unlock:
+       up_write(&mm->mmap_sem);
+out:
+       return ret;
+}
+
+/*
+ * userfaultfd_wake may be used in combination with the
+ * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
+ */
+static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
+                           unsigned long arg)
+{
+       int ret;
+       struct uffdio_range uffdio_wake;
+       struct userfaultfd_wake_range range;
+       const void __user *buf = (void __user *)arg;
+
+       ret = -EFAULT;
+       if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
+               goto out;
+
+       ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
+       if (ret)
+               goto out;
+
+       range.start = uffdio_wake.start;
+       range.len = uffdio_wake.len;
+
+       /*
+        * len == 0 means wake all and we don't want to wake all here,
+        * so check it again to be sure.
+        */
+       VM_BUG_ON(!range.len);
+
+       wake_userfault(ctx, &range);
+       ret = 0;
+
+out:
+       return ret;
+}
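As the comment above notes, UFFDIO_WAKE pairs with the *_MODE_DONTWAKE flags so a fault-handling thread can resolve many faults and then wake all the waiters with a single ioctl. A hedged sketch of that batching pattern; the variables are assumptions of the sketch:

    /* assumes: int uffd, uint64_t dst_base, char *src_buf, long page_size, int nr_pages */
    for (int i = 0; i < nr_pages; i++) {
            struct uffdio_copy copy = {
                    .dst  = dst_base + (uint64_t)i * page_size,
                    .src  = (uint64_t)(uintptr_t)(src_buf + (size_t)i * page_size),
                    .len  = page_size,
                    .mode = UFFDIO_COPY_MODE_DONTWAKE,      /* defer the wakeups */
            };
            if (ioctl(uffd, UFFDIO_COPY, &copy) == -1)
                    break;                                  /* error handling elided */
    }

    struct uffdio_range wake = {
            .start = dst_base,
            .len   = (uint64_t)nr_pages * page_size,
    };
    ioctl(uffd, UFFDIO_WAKE, &wake);            /* one wakeup for the whole range */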
+
+static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
+                           unsigned long arg)
+{
+       __s64 ret;
+       struct uffdio_copy uffdio_copy;
+       struct uffdio_copy __user *user_uffdio_copy;
+       struct userfaultfd_wake_range range;
+
+       user_uffdio_copy = (struct uffdio_copy __user *) arg;
+
+       ret = -EFAULT;
+       if (copy_from_user(&uffdio_copy, user_uffdio_copy,
+                          /* don't copy "copy" last field */
+                          sizeof(uffdio_copy)-sizeof(__s64)))
+               goto out;
+
+       ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
+       if (ret)
+               goto out;
+       /*
+        * double check for wraparound just in case. copy_from_user()
+        * will later check that uffdio_copy.src + uffdio_copy.len
+        * fits in the userland range.
+        */
+       ret = -EINVAL;
+       if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
+               goto out;
+       if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
+               goto out;
+
+       ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+                          uffdio_copy.len);
+       if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
+               return -EFAULT;
+       if (ret < 0)
+               goto out;
+       BUG_ON(!ret);
+       /* len == 0 would wake all */
+       range.len = ret;
+       if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
+               range.start = uffdio_copy.dst;
+               wake_userfault(ctx, &range);
+       }
+       ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
+out:
+       return ret;
+}
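userfaultfd_copy() is the usual way to resolve a missing-page fault: it atomically populates the destination page and, unless DONTWAKE was requested, wakes the faulting threads; the kernel writes the number of bytes copied (or a negative error) into uffdio_copy.copy. A minimal sketch, with fault_addr, page_buf and page_size as assumptions:

    struct uffdio_copy copy = {
            .dst  = fault_addr & ~((uint64_t)page_size - 1),    /* page with the fault */
            .src  = (uint64_t)(uintptr_t)page_buf,              /* prepared page contents */
            .len  = page_size,
            .mode = 0,                                          /* wake the waiters now */
    };

    if (ioctl(uffd, UFFDIO_COPY, &copy))
            fprintf(stderr, "UFFDIO_COPY: copy field = %lld\n", (long long)copy.copy);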
+
+static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
+                               unsigned long arg)
+{
+       __s64 ret;
+       struct uffdio_zeropage uffdio_zeropage;
+       struct uffdio_zeropage __user *user_uffdio_zeropage;
+       struct userfaultfd_wake_range range;
+
+       user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
+
+       ret = -EFAULT;
+       if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
+                          /* don't copy "zeropage" last field */
+                          sizeof(uffdio_zeropage)-sizeof(__s64)))
+               goto out;
+
+       ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
+                            uffdio_zeropage.range.len);
+       if (ret)
+               goto out;
+       ret = -EINVAL;
+       if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
+               goto out;
+
+       ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
+                            uffdio_zeropage.range.len);
+       if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
+               return -EFAULT;
+       if (ret < 0)
+               goto out;
+       /* len == 0 would wake all */
+       BUG_ON(!ret);
+       range.len = ret;
+       if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
+               range.start = uffdio_zeropage.range.start;
+               wake_userfault(ctx, &range);
+       }
+       ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
+out:
+       return ret;
+}
+
+/*
+ * userland asks for a certain API version and we return which bits
+ * and ioctl commands are implemented in this kernel for such API
+ * version or -EINVAL if unknown.
+ */
+static int userfaultfd_api(struct userfaultfd_ctx *ctx,
+                          unsigned long arg)
+{
+       struct uffdio_api uffdio_api;
+       void __user *buf = (void __user *)arg;
+       int ret;
+
+       ret = -EINVAL;
+       if (ctx->state != UFFD_STATE_WAIT_API)
+               goto out;
+       ret = -EFAULT;
+       if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
+               goto out;
+       if (uffdio_api.api != UFFD_API || uffdio_api.features) {
+               memset(&uffdio_api, 0, sizeof(uffdio_api));
+               if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+                       goto out;
+               ret = -EINVAL;
+               goto out;
+       }
+       uffdio_api.features = UFFD_API_FEATURES;
+       uffdio_api.ioctls = UFFD_API_IOCTLS;
+       ret = -EFAULT;
+       if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+               goto out;
+       ctx->state = UFFD_STATE_RUNNING;
+       ret = 0;
+out:
+       return ret;
+}
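Every descriptor starts in UFFD_STATE_WAIT_API: the first ioctl must be UFFDIO_API, which validates the requested api/features and reports back the supported feature and ioctl bitmasks. A sketch of the handshake from userspace:

    /* assumes: <sys/ioctl.h>, <stdio.h>, <linux/userfaultfd.h>; int uffd just created */
    struct uffdio_api api = { .api = UFFD_API, .features = 0 };

    if (ioctl(uffd, UFFDIO_API, &api) == -1) {
            perror("UFFDIO_API");       /* unknown api/features: the struct was zeroed */
    } else {
            printf("features 0x%llx ioctls 0x%llx\n",
                   (unsigned long long)api.features,
                   (unsigned long long)api.ioctls);
    }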
+
+static long userfaultfd_ioctl(struct file *file, unsigned cmd,
+                             unsigned long arg)
+{
+       int ret = -EINVAL;
+       struct userfaultfd_ctx *ctx = file->private_data;
+
+       if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
+               return -EINVAL;
+
+       switch(cmd) {
+       case UFFDIO_API:
+               ret = userfaultfd_api(ctx, arg);
+               break;
+       case UFFDIO_REGISTER:
+               ret = userfaultfd_register(ctx, arg);
+               break;
+       case UFFDIO_UNREGISTER:
+               ret = userfaultfd_unregister(ctx, arg);
+               break;
+       case UFFDIO_WAKE:
+               ret = userfaultfd_wake(ctx, arg);
+               break;
+       case UFFDIO_COPY:
+               ret = userfaultfd_copy(ctx, arg);
+               break;
+       case UFFDIO_ZEROPAGE:
+               ret = userfaultfd_zeropage(ctx, arg);
+               break;
+       }
+       return ret;
+}
+
+#ifdef CONFIG_PROC_FS
+static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+       struct userfaultfd_ctx *ctx = f->private_data;
+       wait_queue_t *wq;
+       struct userfaultfd_wait_queue *uwq;
+       unsigned long pending = 0, total = 0;
+
+       spin_lock(&ctx->fault_pending_wqh.lock);
+       list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) {
+               uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+               pending++;
+               total++;
+       }
+       list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
+               uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+               total++;
+       }
+       spin_unlock(&ctx->fault_pending_wqh.lock);
+
+       /*
+        * If more protocols are added, they will all be shown
+        * separated by a space. Like this:
+        *      protocols: aa:... bb:...
+        */
+       seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
+                  pending, total, UFFD_API, UFFD_API_FEATURES,
+                  UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
+}
+#endif
+
+static const struct file_operations userfaultfd_fops = {
+#ifdef CONFIG_PROC_FS
+       .show_fdinfo    = userfaultfd_show_fdinfo,
+#endif
+       .release        = userfaultfd_release,
+       .poll           = userfaultfd_poll,
+       .read           = userfaultfd_read,
+       .unlocked_ioctl = userfaultfd_ioctl,
+       .compat_ioctl   = userfaultfd_ioctl,
+       .llseek         = noop_llseek,
+};
+
+static void init_once_userfaultfd_ctx(void *mem)
+{
+       struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
+
+       init_waitqueue_head(&ctx->fault_pending_wqh);
+       init_waitqueue_head(&ctx->fault_wqh);
+       init_waitqueue_head(&ctx->fd_wqh);
+       seqcount_init(&ctx->refile_seq);
+}
+
+/**
+ * userfaultfd_file_create - Creates a userfaultfd file pointer.
+ * @flags: Flags for the userfaultfd file.
+ *
+ * This function creates a userfaultfd file pointer, without installing
+ * it into the fd table. This is useful when the userfaultfd file is
+ * used during the initialization of data structures that require
+ * extra setup after the userfaultfd creation. So the userfaultfd
+ * creation is split into the file pointer creation phase, and the
+ * file descriptor installation phase.  In this way races with
+ * userspace closing the newly installed file descriptor can be
+ * avoided.  Returns a userfaultfd file pointer, or a proper error
+ * pointer.
+ */
+static struct file *userfaultfd_file_create(int flags)
+{
+       struct file *file;
+       struct userfaultfd_ctx *ctx;
+
+       BUG_ON(!current->mm);
+
+       /* Check the UFFD_* constants for consistency.  */
+       BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
+       BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
+
+       file = ERR_PTR(-EINVAL);
+       if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
+               goto out;
+
+       file = ERR_PTR(-ENOMEM);
+       ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
+       if (!ctx)
+               goto out;
+
+       atomic_set(&ctx->refcount, 1);
+       ctx->flags = flags;
+       ctx->state = UFFD_STATE_WAIT_API;
+       ctx->released = false;
+       ctx->mm = current->mm;
+       /* prevent the mm struct from being freed */
+       atomic_inc(&ctx->mm->mm_users);
+
+       file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
+                                 O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
+       if (IS_ERR(file))
+               kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+out:
+       return file;
+}
+
+SYSCALL_DEFINE1(userfaultfd, int, flags)
+{
+       int fd, error;
+       struct file *file;
+
+       error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
+       if (error < 0)
+               return error;
+       fd = error;
+
+       file = userfaultfd_file_create(flags);
+       if (IS_ERR(file)) {
+               error = PTR_ERR(file);
+               goto err_put_unused_fd;
+       }
+       fd_install(fd, file);
+
+       return fd;
+
+err_put_unused_fd:
+       put_unused_fd(fd);
+
+       return error;
+}
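There is no glibc wrapper at this point, so userspace creates the descriptor with a raw syscall; the flags are deliberately the O_* values, as the BUILD_BUG_ONs in userfaultfd_file_create() enforce. A sketch, assuming __NR_userfaultfd comes from the syscall table updates in this same series:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    static int create_uffd(void)
    {
            int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

            if (uffd == -1)
                    perror("userfaultfd");
            return uffd;
    }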
+
+static int __init userfaultfd_init(void)
+{
+       userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
+                                               sizeof(struct userfaultfd_ctx),
+                                               0,
+                                               SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+                                               init_once_userfaultfd_ctx);
+       return 0;
+}
+__initcall(userfaultfd_init);
index 1fb16562c159947ac27adae43f6abb4f195e1cfa..bbd9b1f10ffb2d9a19995ca0f7f30ed500128a4e 100644 (file)
@@ -511,9 +511,9 @@ xfs_showargs(
                seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
 
        if (mp->m_logname)
-               seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname);
+               seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname);
        if (mp->m_rtname)
-               seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname);
+               seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname);
 
        if (mp->m_dalign > 0)
                seq_printf(m, "," MNTOPT_SUNIT "=%d",
index 8b6c083e68a7338c4b5eee9938cc73f40c7faa19..8d70e1361ecd0ee01b4f11380623ada6a7751888 100644 (file)
@@ -137,6 +137,7 @@ struct cred {
        kernel_cap_t    cap_permitted;  /* caps we're permitted */
        kernel_cap_t    cap_effective;  /* caps we can actually use */
        kernel_cap_t    cap_bset;       /* capability bounding set */
+       kernel_cap_t    cap_ambient;    /* Ambient capability set */
 #ifdef CONFIG_KEYS
        unsigned char   jit_keyring;    /* default keyring to attach requested
                                         * keys to */
@@ -212,6 +213,13 @@ static inline void validate_process_creds(void)
 }
 #endif
 
+static inline bool cap_ambient_invariant_ok(const struct cred *cred)
+{
+       return cap_issubset(cred->cap_ambient,
+                           cap_intersect(cred->cap_permitted,
+                                         cred->cap_inheritable));
+}
+
 /**
  * get_new_cred - Get a reference on a new set of credentials
  * @cred: The new credentials to reference
index fbd780c33c5fb3caf422b8127644e5b58207d795..864203c10dbcb3972a88cec15a1db52be674b439 100644 (file)
@@ -1612,7 +1612,6 @@ struct file_operations {
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
        long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
        int (*mmap) (struct file *, struct vm_area_struct *);
-       int (*mremap)(struct file *, struct vm_area_struct *);
        int (*open) (struct inode *, struct file *);
        int (*flush) (struct file *, fl_owner_t id);
        int (*release) (struct inode *, struct file *);
index 65a517dd32f7ad0e23ce5ed69129be657d3b3fdf..e0727d77feafb92747f8fc5fd040fc508b3b5d80 100644 (file)
@@ -195,40 +195,49 @@ struct fsnotify_group {
 #define FSNOTIFY_EVENT_INODE   2
 
 /*
- * a mark is simply an object attached to an in core inode which allows an
+ * A mark is simply an object attached to an in core inode which allows an
  * fsnotify listener to indicate they are either no longer interested in events
  * of a type matching mask or only interested in those events.
  *
- * these are flushed when an inode is evicted from core and may be flushed
- * when the inode is modified (as seen by fsnotify_access).  Some fsnotify users
- * (such as dnotify) will flush these when the open fd is closed and not at
- * inode eviction or modification.
+ * These are flushed when an inode is evicted from core and may be flushed
+ * when the inode is modified (as seen by fsnotify_access).  Some fsnotify
+ * users (such as dnotify) will flush these when the open fd is closed and not
+ * at inode eviction or modification.
+ *
+ * Text in brackets shows the lock(s) protecting modifications of a
+ * particular entry. obj_lock means either inode->i_lock or
+ * mnt->mnt_root->d_lock depending on the mark type.
  */
 struct fsnotify_mark {
-       __u32 mask;                     /* mask this mark is for */
-       /* we hold ref for each i_list and g_list.  also one ref for each 'thing'
+       /* Mask this mark is for [mark->lock, group->mark_mutex] */
+       __u32 mask;
+       /* We hold one for presence in g_list. Also one ref for each 'thing'
         * in kernel that found and may be using this mark. */
-       atomic_t refcnt;                /* active things looking at this mark */
-       struct fsnotify_group *group;   /* group this mark is for */
-       struct list_head g_list;        /* list of marks by group->i_fsnotify_marks
-                                        * Also reused for queueing mark into
-                                        * destroy_list when it's waiting for
-                                        * the end of SRCU period before it can
-                                        * be freed */
-       spinlock_t lock;                /* protect group and inode */
-       struct hlist_node obj_list;     /* list of marks for inode / vfsmount */
-       struct list_head free_list;     /* tmp list used when freeing this mark */
-       union {
+       atomic_t refcnt;
+       /* Group this mark is for. Set on mark creation, stable until last ref
+        * is dropped */
+       struct fsnotify_group *group;
+       /* List of marks by group->i_fsnotify_marks. Also reused for queueing
+        * mark into destroy_list when it's waiting for the end of SRCU period
+        * before it can be freed. [group->mark_mutex] */
+       struct list_head g_list;
+       /* Protects inode / mnt pointers, flags, masks */
+       spinlock_t lock;
+       /* List of marks for inode / vfsmount [obj_lock] */
+       struct hlist_node obj_list;
+       union { /* Object pointer [mark->lock, group->mark_mutex] */
                struct inode *inode;    /* inode this mark is associated with */
                struct vfsmount *mnt;   /* vfsmount this mark is associated with */
        };
-       __u32 ignored_mask;             /* events types to ignore */
+       /* Events types to ignore [mark->lock, group->mark_mutex] */
+       __u32 ignored_mask;
 #define FSNOTIFY_MARK_FLAG_INODE               0x01
 #define FSNOTIFY_MARK_FLAG_VFSMOUNT            0x02
 #define FSNOTIFY_MARK_FLAG_OBJECT_PINNED       0x04
 #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08
 #define FSNOTIFY_MARK_FLAG_ALIVE               0x10
-       unsigned int flags;             /* vfsmount or inode mark? */
+#define FSNOTIFY_MARK_FLAG_ATTACHED            0x20
+       unsigned int flags;             /* flags [mark->lock] */
        void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
 };
 
@@ -345,8 +354,10 @@ extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct fsnotify_
 /* given a group and a mark, flag mark to be freed when all references are dropped */
 extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
                                  struct fsnotify_group *group);
-extern void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
-                                        struct fsnotify_group *group);
+/* detach mark from inode / mount list, group list, drop inode reference */
+extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
+/* free mark */
+extern void fsnotify_free_mark(struct fsnotify_mark *mark);
 /* run all the marks in a group, and clear all of the vfsmount marks */
 extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group);
 /* run all the marks in a group, and clear all of the inode marks */
index 5383bb1394a1a75abc932c3a0fece2fc58f8a076..7ff168d06967c9e2544faa50e4e0d995c27bf4ff 100644 (file)
@@ -59,6 +59,8 @@ struct gen_pool {
 
        genpool_algo_t algo;            /* allocation function */
        void *data;
+
+       const char *name;
 };
 
 /*
@@ -118,8 +120,8 @@ extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size,
                unsigned long start, unsigned int nr, void *data);
 
 extern struct gen_pool *devm_gen_pool_create(struct device *dev,
-               int min_alloc_order, int nid);
-extern struct gen_pool *gen_pool_get(struct device *dev);
+               int min_alloc_order, int nid, const char *name);
+extern struct gen_pool *gen_pool_get(struct device *dev, const char *name);
 
 bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
                        size_t size);
index 869b21dcf503a8220b7ed4628bb642b10867c83e..e691b6a23f72230bf50652b0ebf40a3fe5ef54c9 100644 (file)
@@ -11,7 +11,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
                                           const char namefmt[], ...);
 
 #define kthread_create(threadfn, data, namefmt, arg...) \
-       kthread_create_on_node(threadfn, data, -1, namefmt, ##arg)
+       kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
 
 
 struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
index bf6f117fcf4d80cb7de6147e86c6ba19fa13febd..8b257c43855bbc32c04698a184055bb3be56bd80 100644 (file)
@@ -124,8 +124,10 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_MAYSHARE    0x00000080
 
 #define VM_GROWSDOWN   0x00000100      /* general info on the segment */
+#define VM_UFFD_MISSING        0x00000200      /* missing pages tracking */
 #define VM_PFNMAP      0x00000400      /* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE   0x00000800      /* ETXTBSY on write attempts.. */
+#define VM_UFFD_WP     0x00001000      /* wrprotect pages tracking */
 
 #define VM_LOCKED      0x00002000
 #define VM_IO           0x00004000     /* Memory mapped I/O or similar */
@@ -245,6 +247,7 @@ struct vm_fault {
 struct vm_operations_struct {
        void (*open)(struct vm_area_struct * area);
        void (*close)(struct vm_area_struct * area);
+       int (*mremap)(struct vm_area_struct * area);
        int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
        void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
 
@@ -1833,7 +1836,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 extern struct vm_area_struct *vma_merge(struct mm_struct *,
        struct vm_area_struct *prev, unsigned long addr, unsigned long end,
        unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-       struct mempolicy *);
+       struct mempolicy *, struct vm_userfaultfd_ctx);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int split_vma(struct mm_struct *,
        struct vm_area_struct *, unsigned long addr, int new_below);
index 15549578d55998e5497c5da58a50fa1531e132d8..c8d0a73d64c455f95ba9ed3a0d855bcbc4d839ce 100644 (file)
@@ -256,6 +256,16 @@ struct vm_region {
                                                * this region */
 };
 
+#ifdef CONFIG_USERFAULTFD
+#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
+struct vm_userfaultfd_ctx {
+       struct userfaultfd_ctx *ctx;
+};
+#else /* CONFIG_USERFAULTFD */
+#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
+struct vm_userfaultfd_ctx {};
+#endif /* CONFIG_USERFAULTFD */
+
 /*
  * This struct defines a memory VMM memory area. There is one of these
  * per VM-area/task.  A VM area is any part of the process virtual memory
@@ -322,6 +332,7 @@ struct vm_area_struct {
 #ifdef CONFIG_NUMA
        struct mempolicy *vm_policy;    /* NUMA policy for the VMA */
 #endif
+       struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 };
 
 struct core_thread {
@@ -543,6 +554,7 @@ enum tlb_flush_reason {
        TLB_REMOTE_SHOOTDOWN,
        TLB_LOCAL_SHOOTDOWN,
        TLB_LOCAL_MM_SHOOTDOWN,
+       TLB_REMOTE_SEND_IPI,
        NR_TLB_FLUSH_REASONS,
 };
 
index 754c25966a0a7828901deaf2c69dc1d5243736f4..ac00e2050943b4352feeb0f196ff5b5b48a3204a 100644 (file)
@@ -690,14 +690,6 @@ struct zonelist {
 #endif
 };
 
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-struct node_active_region {
-       unsigned long start_pfn;
-       unsigned long end_pfn;
-       int nid;
-};
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
 #ifndef CONFIG_DISCONTIGMEM
 /* The array of struct pages - for discontigmem use pgdat->lmem_map */
 extern struct page *mem_map;
index f94da0e65dea90e1fa0950420a66c14f3dcb472a..a91adf6e02f2a7c47117093eea682c06651034b8 100644 (file)
@@ -27,9 +27,7 @@ static inline void touch_nmi_watchdog(void)
 #if defined(CONFIG_HARDLOCKUP_DETECTOR)
 extern void hardlockup_detector_disable(void);
 #else
-static inline void hardlockup_detector_disable(void)
-{
-}
+static inline void hardlockup_detector_disable(void) {}
 #endif
 
 /*
@@ -80,6 +78,17 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
                                void __user *, size_t *, loff_t *);
 extern int proc_watchdog_cpumask(struct ctl_table *, int,
                                 void __user *, size_t *, loff_t *);
+extern int lockup_detector_suspend(void);
+extern void lockup_detector_resume(void);
+#else
+static inline int lockup_detector_suspend(void)
+{
+       return 0;
+}
+
+static inline void lockup_detector_resume(void)
+{
+}
 #endif
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
index c89c53a113a8d59c6a427139d220b9d75d6de7d7..29446aeef36e553aa361774d39c0852517c87405 100644 (file)
@@ -89,6 +89,9 @@ enum ttu_flags {
        TTU_IGNORE_MLOCK = (1 << 8),    /* ignore mlock */
        TTU_IGNORE_ACCESS = (1 << 9),   /* don't age */
        TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
+       TTU_BATCH_FLUSH = (1 << 11),    /* Batch TLB flushes where possible
+                                        * and caller guarantees they will
+                                        * do a final flush if necessary */
 };
 
 #ifdef CONFIG_MMU
index 119823decc4631eb26842df9fd7a9a1e63709577..a4ab9daa387c0bbcaca1923620ceb2ed74bfd84e 100644 (file)
@@ -1344,6 +1344,25 @@ enum perf_event_task_context {
        perf_nr_task_contexts,
 };
 
+/* Track pages that require TLB flushes */
+struct tlbflush_unmap_batch {
+       /*
+        * Each bit set is a CPU that potentially has a TLB entry for one of
+        * the PFNs being flushed. See set_tlb_ubc_flush_pending().
+        */
+       struct cpumask cpumask;
+
+       /* True if any bit in cpumask is set */
+       bool flush_required;
+
+       /*
+        * If true then the PTE was dirty when unmapped. The entry must be
+        * flushed before IO is initiated or a stale TLB entry potentially
+        * allows an update without redirtying the page.
+        */
+       bool writable;
+};
+
 struct task_struct {
        volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
        void *stack;
@@ -1700,6 +1719,10 @@ struct task_struct {
        unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+       struct tlbflush_unmap_batch tlb_ubc;
+#endif
+
        struct rcu_head rcu;
 
        /*
index 912a7c482649e63bc3232ddd5a88461c20e01e49..d4c7271382cb310edc3d2bf4ffd5ef997e5bef87 100644 (file)
@@ -149,6 +149,41 @@ static inline struct user_namespace *seq_user_ns(struct seq_file *seq)
 #endif
 }
 
+/**
+ * seq_show_options - display mount options with appropriate escapes.
+ * @m: the seq_file handle
+ * @name: the mount option name
+ * @value: the mount option name's value, can be NULL
+ */
+static inline void seq_show_option(struct seq_file *m, const char *name,
+                                  const char *value)
+{
+       seq_putc(m, ',');
+       seq_escape(m, name, ",= \t\n\\");
+       if (value) {
+               seq_putc(m, '=');
+               seq_escape(m, value, ", \t\n\\");
+       }
+}
+
+/**
+ * seq_show_option_n - display mount options with appropriate escapes
+ *                    where @value must be a specific length.
+ * @m: the seq_file handle
+ * @name: the mount option name
+ * @value: the mount option name's value, cannot be NULL
+ * @length: the length of @value to display
+ *
+ * This is a macro since this uses "length" to define the size of the
+ * stack buffer.
+ */
+#define seq_show_option_n(m, name, value, length) {    \
+       char val_buf[length + 1];                       \
+       strncpy(val_buf, value, length);                \
+       val_buf[length] = '\0';                         \
+       seq_show_option(m, name, val_buf);              \
+}
+
 #define SEQ_START_TOKEN ((void *)1)
 /*
  * Helpers for iteration over list_head-s in seq_files
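seq_show_option()/seq_show_option_n() exist so ->show_options() implementations escape option values instead of printing them raw (the xfs hunk earlier is one of the in-tree conversions). A hedged sketch of how a filesystem might use it; examplefs and its sb_info are hypothetical:

    static int examplefs_show_options(struct seq_file *m, struct dentry *root)
    {
            struct examplefs_sb_info *sbi = root->d_sb->s_fs_info; /* hypothetical */

            if (sbi->snapshot_name)
                    /* emits ",snapshot=<escaped value>" */
                    seq_show_option(m, "snapshot", sbi->snapshot_name);
            return 0;
    }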
index a99f0e5243e1e0d30feff453515ae40cc7247334..7e37d448ed910854e1876682ac1038cebee93cd1 100644 (file)
@@ -290,6 +290,16 @@ void *__kmalloc(size_t size, gfp_t flags);
 void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
 void kmem_cache_free(struct kmem_cache *, void *);
 
+/*
+ * Bulk allocation and freeing operations. These are accelerated in an
+ * allocator-specific way to avoid taking locks repeatedly or building
+ * metadata structures unnecessarily.
+ *
+ * Note that interrupts must be enabled when calling these functions.
+ */
+void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
+bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+
 #ifdef CONFIG_NUMA
 void *__kmalloc_node(size_t size, gfp_t flags, int node);
 void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
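The bulk interfaces declared above amortize the per-object locking cost; in this version kmem_cache_alloc_bulk() returns a bool and, per the comment, must be called with interrupts enabled. A hedged kernel-side sketch; the cache and caller are hypothetical:

    static int example_bulk_use(struct kmem_cache *my_cache)   /* hypothetical */
    {
            void *objs[16];

            if (!kmem_cache_alloc_bulk(my_cache, GFP_KERNEL, ARRAY_SIZE(objs), objs))
                    return -ENOMEM;
            /* ... initialize and use all 16 objects ... */
            kmem_cache_free_bulk(my_cache, ARRAY_SIZE(objs), objs);
            return 0;
    }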
index da3c593f9845b09702453e4db9096806ff7271b7..e6109a6cd8f65eb779163d1a084a6e0256a11db0 100644 (file)
@@ -48,7 +48,16 @@ struct smp_hotplug_thread {
        const char                      *thread_comm;
 };
 
-int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
+int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
+                                          const struct cpumask *cpumask);
+
+static inline int
+smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+       return smpboot_register_percpu_thread_cpumask(plug_thread,
+                                                     cpu_possible_mask);
+}
+
 void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
 int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
                                         const struct cpumask *);
index b45c45b8c829fa6eb438a7ba5c62e98f5bf82943..08001317aee7376babb542aa9bb15936d38ef533 100644 (file)
@@ -810,6 +810,7 @@ asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
 asmlinkage long sys_eventfd(unsigned int count);
 asmlinkage long sys_eventfd2(unsigned int count, int flags);
 asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
+asmlinkage long sys_userfaultfd(int flags);
 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
 asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
new file mode 100644 (file)
index 0000000..587480a
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ *  include/linux/userfaultfd_k.h
+ *
+ *  Copyright (C) 2015  Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_K_H
+#define _LINUX_USERFAULTFD_K_H
+
+#ifdef CONFIG_USERFAULTFD
+
+#include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */
+
+#include <linux/fcntl.h>
+
+/*
+ * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
+ * new flags, since they might collide with O_* ones. We want
+ * to re-use O_* flags that couldn't possibly have a meaning
+ * from userfaultfd, in order to leave a free define-space for
+ * shared O_* flags.
+ */
+#define UFFD_CLOEXEC O_CLOEXEC
+#define UFFD_NONBLOCK O_NONBLOCK
+
+#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
+#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
+
+extern int handle_userfault(struct vm_area_struct *vma, unsigned long address,
+                           unsigned int flags, unsigned long reason);
+
+extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
+                           unsigned long src_start, unsigned long len);
+extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
+                             unsigned long dst_start,
+                             unsigned long len);
+
+/* mm helpers */
+static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
+                                       struct vm_userfaultfd_ctx vm_ctx)
+{
+       return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
+}
+
+static inline bool userfaultfd_missing(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & VM_UFFD_MISSING;
+}
+
+static inline bool userfaultfd_armed(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
+}
+
+#else /* CONFIG_USERFAULTFD */
+
+/* mm helpers */
+static inline int handle_userfault(struct vm_area_struct *vma,
+                                  unsigned long address,
+                                  unsigned int flags,
+                                  unsigned long reason)
+{
+       return VM_FAULT_SIGBUS;
+}
+
+static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
+                                       struct vm_userfaultfd_ctx vm_ctx)
+{
+       return true;
+}
+
+static inline bool userfaultfd_missing(struct vm_area_struct *vma)
+{
+       return false;
+}
+
+static inline bool userfaultfd_armed(struct vm_area_struct *vma)
+{
+       return false;
+}
+
+#endif /* CONFIG_USERFAULTFD */
+
+#endif /* _LINUX_USERFAULTFD_K_H */
index 1e1bf9f963a947fc686125d0a2809ad63b8a13ed..d3d077228d4c155ad4b08a6668dc725679996372 100644 (file)
@@ -147,7 +147,8 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
 
 typedef int wait_bit_action_f(struct wait_bit_key *);
 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
+                         void *key);
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
@@ -179,7 +180,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 #define wake_up_poll(x, m)                                             \
        __wake_up(x, TASK_NORMAL, 1, (void *) (m))
 #define wake_up_locked_poll(x, m)                                      \
-       __wake_up_locked_key((x), TASK_NORMAL, (void *) (m))
+       __wake_up_locked_key((x), TASK_NORMAL, 1, (void *) (m))
 #define wake_up_interruptible_poll(x, m)                               \
        __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
 #define wake_up_interruptible_sync_poll(x, m)                          \
index f47feada5b42c99da7d17aefb9b0e24fc5a06e1d..d74a0e907b9e761472662ec17c4e167448028983 100644 (file)
@@ -140,12 +140,4 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd,
 extern int watchdog_register_device(struct watchdog_device *);
 extern void watchdog_unregister_device(struct watchdog_device *);
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-void watchdog_nmi_disable_all(void);
-void watchdog_nmi_enable_all(void);
-#else
-static inline void watchdog_nmi_disable_all(void) {}
-static inline void watchdog_nmi_enable_all(void) {}
-#endif
-
 #endif  /* ifndef _LINUX_WATCHDOG_H */
index 4250f364a6caad06a436636bf2a33a05aa442dbf..bc8815f45f3b384e411738664ff0d3fa1b99bc0b 100644 (file)
@@ -11,7 +11,8 @@
        EM(  TLB_FLUSH_ON_TASK_SWITCH,  "flush on task switch" )        \
        EM(  TLB_REMOTE_SHOOTDOWN,      "remote shootdown" )            \
        EM(  TLB_LOCAL_SHOOTDOWN,       "local shootdown" )             \
-       EMe( TLB_LOCAL_MM_SHOOTDOWN,    "local mm shootdown" )
+       EM(  TLB_LOCAL_MM_SHOOTDOWN,    "local mm shootdown" )          \
+       EMe( TLB_REMOTE_SEND_IPI,       "remote ipi send" )
 
 /*
  * First define the enums in TLB_FLUSH_REASON to be exported to userspace
index aafb9937b162b47ce047e6becb180b7cd6d3c447..70ff1d9abf0ddab0055d64b1d9ec70b8f488fd9f 100644 (file)
@@ -456,3 +456,4 @@ header-y += xfrm.h
 header-y += xilinx-v4l2-controls.h
 header-y += zorro.h
 header-y += zorro_ids.h
+header-y += userfaultfd.h
index 31891d9535e2a4ede364627a805d6d346fae8b9c..a8d0759a9e400c5d472fe37d13a924bc9e9777a6 100644 (file)
@@ -190,4 +190,11 @@ struct prctl_mm_map {
 # define PR_FP_MODE_FR         (1 << 0)        /* 64b FP registers */
 # define PR_FP_MODE_FRE                (1 << 1)        /* 32b compatibility */
 
+/* Control the ambient capability set */
+#define PR_CAP_AMBIENT                 47
+# define PR_CAP_AMBIENT_IS_SET         1
+# define PR_CAP_AMBIENT_RAISE          2
+# define PR_CAP_AMBIENT_LOWER          3
+# define PR_CAP_AMBIENT_CLEAR_ALL      4
+
 #endif /* _LINUX_PRCTL_H */
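PR_CAP_AMBIENT is the userspace knob for the ambient set: a capability can be raised into it only if it is already in both the permitted and inheritable sets (the cap_ambient_invariant_ok() check earlier) and SECBIT_NO_CAP_AMBIENT_RAISE is not set. A hedged sketch:

    #include <stdio.h>
    #include <sys/prctl.h>
    #include <linux/capability.h>

    static void keep_net_bind(void)
    {
            /* assumes CAP_NET_BIND_SERVICE is already permitted and inheritable,
             * so it survives execve() of a non-privileged binary afterwards */
            if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
                      CAP_NET_BIND_SERVICE, 0, 0) == -1)
                    perror("PR_CAP_AMBIENT_RAISE");
    }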
index 985aac9e6bf888214aa8943359112c596037957a..35ac35cef2170b0a760efd06bbc8b4e1225d5fb8 100644 (file)
 #define SECBIT_KEEP_CAPS       (issecure_mask(SECURE_KEEP_CAPS))
 #define SECBIT_KEEP_CAPS_LOCKED (issecure_mask(SECURE_KEEP_CAPS_LOCKED))
 
+/* When set, a process cannot add new capabilities to its ambient set. */
+#define SECURE_NO_CAP_AMBIENT_RAISE            6
+#define SECURE_NO_CAP_AMBIENT_RAISE_LOCKED     7  /* make bit-6 immutable */
+
+#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
+#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED \
+                       (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE_LOCKED))
+
 #define SECURE_ALL_BITS                (issecure_mask(SECURE_NOROOT) | \
                                 issecure_mask(SECURE_NO_SETUID_FIXUP) | \
-                                issecure_mask(SECURE_KEEP_CAPS))
+                                issecure_mask(SECURE_KEEP_CAPS) | \
+                                issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
 #define SECURE_ALL_LOCKS       (SECURE_ALL_BITS << 1)
 
 #endif /* _UAPI_LINUX_SECUREBITS_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
new file mode 100644 (file)
index 0000000..df0e09b
--- /dev/null
@@ -0,0 +1,169 @@
+/*
+ *  include/linux/userfaultfd.h
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *  Copyright (C) 2015  Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_H
+#define _LINUX_USERFAULTFD_H
+
+#include <linux/types.h>
+
+#include <linux/compiler.h>
+
+#define UFFD_API ((__u64)0xAA)
+/*
+ * After implementing the respective features it will become:
+ * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
+ *                           UFFD_FEATURE_EVENT_FORK)
+ */
+#define UFFD_API_FEATURES (0)
+#define UFFD_API_IOCTLS                                \
+       ((__u64)1 << _UFFDIO_REGISTER |         \
+        (__u64)1 << _UFFDIO_UNREGISTER |       \
+        (__u64)1 << _UFFDIO_API)
+#define UFFD_API_RANGE_IOCTLS                  \
+       ((__u64)1 << _UFFDIO_WAKE |             \
+        (__u64)1 << _UFFDIO_COPY |             \
+        (__u64)1 << _UFFDIO_ZEROPAGE)
+
+/*
+ * Valid ioctl command number range with this API is from 0x00 to
+ * 0x3F.  UFFDIO_API is the fixed number, everything else can be
+ * changed by implementing a different UFFD_API. If sticking to the
+ * same UFFD_API more ioctls can be added and userland will be aware of
+ * which ioctls the running kernel implements through the ioctl command
+ * bitmask written by the UFFDIO_API.
+ */
+#define _UFFDIO_REGISTER               (0x00)
+#define _UFFDIO_UNREGISTER             (0x01)
+#define _UFFDIO_WAKE                   (0x02)
+#define _UFFDIO_COPY                   (0x03)
+#define _UFFDIO_ZEROPAGE               (0x04)
+#define _UFFDIO_API                    (0x3F)
+
+/* userfaultfd ioctl ids */
+#define UFFDIO 0xAA
+#define UFFDIO_API             _IOWR(UFFDIO, _UFFDIO_API,      \
+                                     struct uffdio_api)
+#define UFFDIO_REGISTER                _IOWR(UFFDIO, _UFFDIO_REGISTER, \
+                                     struct uffdio_register)
+#define UFFDIO_UNREGISTER      _IOR(UFFDIO, _UFFDIO_UNREGISTER,        \
+                                    struct uffdio_range)
+#define UFFDIO_WAKE            _IOR(UFFDIO, _UFFDIO_WAKE,      \
+                                    struct uffdio_range)
+#define UFFDIO_COPY            _IOWR(UFFDIO, _UFFDIO_COPY,     \
+                                     struct uffdio_copy)
+#define UFFDIO_ZEROPAGE                _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \
+                                     struct uffdio_zeropage)
+
+/* read() structure */
+struct uffd_msg {
+       __u8    event;
+
+       __u8    reserved1;
+       __u16   reserved2;
+       __u32   reserved3;
+
+       union {
+               struct {
+                       __u64   flags;
+                       __u64   address;
+               } pagefault;
+
+               struct {
+                       /* unused reserved fields */
+                       __u64   reserved1;
+                       __u64   reserved2;
+                       __u64   reserved3;
+               } reserved;
+       } arg;
+} __packed;
+
+/*
+ * Start at 0x12 rather than 0 to be stricter about catching bugs.
+ */
+#define UFFD_EVENT_PAGEFAULT   0x12
+#if 0 /* not available yet */
+#define UFFD_EVENT_FORK                0x13
+#endif
+
+/* flags for UFFD_EVENT_PAGEFAULT */
+#define UFFD_PAGEFAULT_FLAG_WRITE      (1<<0)  /* If this was a write fault */
+#define UFFD_PAGEFAULT_FLAG_WP         (1<<1)  /* If reason is VM_UFFD_WP */
+
+struct uffdio_api {
+       /* userland asks for an API number and the features to enable */
+       __u64 api;
+       /*
+        * Kernel answers below with all the available features for
+        * the API; this notifies userland of which events and/or
+        * which flags for each event are enabled in the current
+        * kernel.
+        *
+        * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
+        * are to be considered implicitly always enabled in all kernels as
+        * long as the uffdio_api.api requested matches UFFD_API.
+        */
+#if 0 /* not available yet */
+#define UFFD_FEATURE_PAGEFAULT_FLAG_WP         (1<<0)
+#define UFFD_FEATURE_EVENT_FORK                        (1<<1)
+#endif
+       __u64 features;
+
+       __u64 ioctls;
+};
+
+struct uffdio_range {
+       __u64 start;
+       __u64 len;
+};
+
+struct uffdio_register {
+       struct uffdio_range range;
+#define UFFDIO_REGISTER_MODE_MISSING   ((__u64)1<<0)
+#define UFFDIO_REGISTER_MODE_WP                ((__u64)1<<1)
+       __u64 mode;
+
+       /*
+        * The kernel answers which ioctl commands are available for the
+        * range; keep this at the end, as the last 8 bytes aren't read.
+        */
+       __u64 ioctls;
+};
+
+struct uffdio_copy {
+       __u64 dst;
+       __u64 src;
+       __u64 len;
+       /*
+        * A write-protection flag will be added later to allow mapping
+        * pages write-protected on the fly. Such a flag will only be
+        * available if the write-protection ioctls are implemented for
+        * the range, according to uffdio_register.ioctls.
+        */
+#define UFFDIO_COPY_MODE_DONTWAKE              ((__u64)1<<0)
+       __u64 mode;
+
+       /*
+        * "copy" is written by the ioctl and must be at the end: the
+        * copy_from_user will not read the last 8 bytes.
+        */
+       __s64 copy;
+};
+
+struct uffdio_zeropage {
+       struct uffdio_range range;
+#define UFFDIO_ZEROPAGE_MODE_DONTWAKE          ((__u64)1<<0)
+       __u64 mode;
+
+       /*
+        * "zeropage" is written by the ioctl and must be at the end:
+        * the copy_from_user will not read the last 8 bytes.
+        */
+       __s64 zeropage;
+};
+
+#endif /* _LINUX_USERFAULTFD_H */
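
For reference, a sketch of the userland handshake against the API above: open
the file descriptor through the new userfaultfd() syscall, negotiate UFFD_API,
register a range for missing-page tracking, then read uffd_msg events.
resolve_fault() is a made-up placeholder for whatever the monitor does with a
fault (typically a UFFDIO_COPY or UFFDIO_ZEROPAGE):

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <linux/userfaultfd.h>

    static int monitor_range(void *area, unsigned long len)
    {
            struct uffdio_api api = { .api = UFFD_API };
            struct uffdio_register reg;
            struct uffd_msg msg;
            int uffd;

            uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
            if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api))
                    return -1;

            reg.range.start = (unsigned long) area;
            reg.range.len = len;
            reg.mode = UFFDIO_REGISTER_MODE_MISSING;
            if (ioctl(uffd, UFFDIO_REGISTER, &reg))
                    return -1;

            /* Each read() returns one struct uffd_msg event. */
            while (read(uffd, &msg, sizeof(msg)) == sizeof(msg)) {
                    if (msg.event == UFFD_EVENT_PAGEFAULT)
                            resolve_fault(uffd, msg.arg.pagefault.address);
            }
            return 0;
    }
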
index bb9b4dd55889f0605b07ddfc73cc105d72b18908..2c0e50ef554a39c1a4ebb20bb4494968271733e1 100644 (file)
@@ -882,6 +882,16 @@ config GENERIC_SCHED_CLOCK
 config ARCH_SUPPORTS_NUMA_BALANCING
        bool
 
+#
+# For architectures that prefer to flush all TLBs after a number of pages
+# are unmapped instead of sending one IPI per page to flush. The architecture
+# must provide guarantees on what happens if a clean TLB cache entry is
+# written after the unmap. Details are in mm/rmap.c near the check for
+# should_defer_flush. The architecture should also consider if the full flush
+# and the refill costs are offset by the savings of sending fewer IPIs.
+config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+       bool
+
 #
 # For architectures that know their GCC __int128 support is sound
 #
@@ -1576,6 +1586,14 @@ config ADVISE_SYSCALLS
          applications use these syscalls, you can disable this option to save
          space.
 
+config USERFAULTFD
+       bool "Enable userfaultfd() system call"
+       select ANON_INODES
+       depends on MMU
+       help
+         Enable the userfaultfd() system call that allows userland to
+         intercept and handle page faults.
+
 config PCI_QUIRKS
        default y
        bool "Enable PCI quirk workarounds" if EXPERT
index f3f5cd5e2c0d9ccd8b954e9191cd9169d53c32d7..a8538e4437842d9cc85027acc516ed9a680d06cf 100644 (file)
@@ -1342,7 +1342,7 @@ static int cgroup_show_options(struct seq_file *seq,
        if (root != &cgrp_dfl_root)
                for_each_subsys(ss, ssid)
                        if (root->subsys_mask & (1 << ssid))
-                               seq_printf(seq, ",%s", ss->legacy_name);
+                               seq_show_option(seq, ss->name, NULL);
        if (root->flags & CGRP_ROOT_NOPREFIX)
                seq_puts(seq, ",noprefix");
        if (root->flags & CGRP_ROOT_XATTR)
@@ -1350,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq,
 
        spin_lock(&release_agent_path_lock);
        if (strlen(root->release_agent_path))
-               seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+               seq_show_option(seq, "release_agent",
+                               root->release_agent_path);
        spin_unlock(&release_agent_path_lock);
 
        if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
                seq_puts(seq, ",clone_children");
        if (strlen(root->name))
-               seq_printf(seq, ",name=%s", root->name);
+               seq_show_option(seq, "name", root->name);
        return 0;
 }
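
The seq_printf() calls above are switched to seq_show_option(), which (per
this series) emits ",name=value" with the value escaped, so mount option
output cannot be confused by embedded separators. A hedged sketch of the same
pattern in a hypothetical filesystem's ->show_options handler ("foofs" names
are made up):

    static int foofs_show_options(struct seq_file *m, struct dentry *root)
    {
            struct foofs_sb_info *sbi = root->d_sb->s_fs_info;

            if (sbi->label)
                    seq_show_option(m, "label", sbi->label); /* value escaped */
            if (sbi->noatime)
                    seq_puts(m, ",noatime");                 /* no value part */
            return 0;
    }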
 
index 03aa2e6de7a4e90696c003792641d2c3a150cd02..7d5f0f118a6348f81f08f10dd7dbb499f89dd243 100644 (file)
@@ -454,8 +454,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                tmp->vm_mm = mm;
                if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
-               tmp->vm_flags &= ~VM_LOCKED;
+               tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
                tmp->vm_next = tmp->vm_prev = NULL;
+               tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file_inode(file);
index 490924cc9e7c8252c447e802f142d0a3a150865d..9ff173dca1aef5e09fd640dc6757fee99c7a956c 100644 (file)
@@ -248,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create)
  * kthread_create_on_node - create a kthread.
  * @threadfn: the function to run until signal_pending(current).
  * @data: data ptr for @threadfn.
- * @node: memory node number.
+ * @node: task and thread structures for the thread are allocated on this node
  * @namefmt: printf-style name for the thread.
  *
  * Description: This helper function creates and names a kernel
  * thread.  The thread will be stopped: use wake_up_process() to start
- * it.  See also kthread_run().
+ * it.  See also kthread_run().  The new thread has SCHED_NORMAL policy and
+ * is affine to all CPUs.
  *
  * If thread is going to be bound on a particular cpu, give its node
- * in @node, to get NUMA affinity for kthread stack, or else give -1.
+ * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
  * When woken, the thread will run @threadfn() with @data as its
  * argument. @threadfn() can either call do_exit() directly if it is a
  * standalone thread for which no one will call kthread_stop(), or
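
A brief usage sketch for the documentation above (the helper names and the
later binding are assumptions, not part of this hunk):

    /* Create a worker that will later be bound to @cpu, so its stack is
     * allocated on that CPU's memory node; it starts stopped. */
    tsk = kthread_create_on_node(worker_fn, data, cpu_to_node(cpu),
                                 "worker/%u", cpu);
    if (!IS_ERR(tsk)) {
            kthread_bind(tsk, cpu);
            wake_up_process(tsk);
    }
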
index 052e02672d12428ce1e9e1f7266c7cd754ace5af..272d9322bc5dfb6b82e650950e16ab89b572f442 100644 (file)
@@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
+                         void *key)
 {
-       __wake_up_common(q, mode, 1, 0, key);
+       __wake_up_common(q, mode, nr, 0, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
@@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
        if (!list_empty(&wait->task_list))
                list_del_init(&wait->task_list);
        else if (waitqueue_active(q))
-               __wake_up_locked_key(q, mode, key);
+               __wake_up_locked_key(q, mode, 1, key);
        spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(abort_exclusive_wait);
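
With the new 'nr' argument, a caller that already holds the waitqueue lock can
bound how many exclusive waiters it wakes. A hedged sketch (q, key and flags
are assumed to exist in the caller):

    spin_lock_irqsave(&q->lock, flags);
    if (waitqueue_active(q))
            __wake_up_locked_key(q, TASK_NORMAL, 1, key);
    spin_unlock_irqrestore(&q->lock, flags);
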
index 7c434c39f02a250f4721475910e881b43b603313..a818cbc73e147382488cb0ea5bb5c490c6a46e15 100644 (file)
@@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data)
                if (kthread_should_stop()) {
                        __set_current_state(TASK_RUNNING);
                        preempt_enable();
-                       if (ht->cleanup)
+                       /* cleanup must mirror setup */
+                       if (ht->cleanup && td->status != HP_THREAD_NONE)
                                ht->cleanup(td->cpu, cpu_online(td->cpu));
                        kfree(td);
                        return 0;
@@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
 {
        unsigned int cpu;
 
-       /* Unpark any threads that were voluntarily parked. */
-       for_each_cpu_not(cpu, ht->cpumask) {
-               if (cpu_online(cpu)) {
-                       struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
-                       if (tsk)
-                               kthread_unpark(tsk);
-               }
-       }
-
        /* We need to destroy also the parked threads of offline cpus */
        for_each_possible_cpu(cpu) {
                struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
 }
 
 /**
- * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
+ *                                         to hotplug
  * @plug_thread:       Hotplug thread descriptor
+ * @cpumask:           The cpumask where threads run
  *
  * Creates and starts the threads on all online cpus.
  */
-int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
+                                          const struct cpumask *cpumask)
 {
        unsigned int cpu;
        int ret = 0;
 
        if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
                return -ENOMEM;
-       cpumask_copy(plug_thread->cpumask, cpu_possible_mask);
+       cpumask_copy(plug_thread->cpumask, cpumask);
 
        get_online_cpus();
        mutex_lock(&smpboot_threads_lock);
@@ -301,9 +296,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
                ret = __smpboot_create_thread(plug_thread, cpu);
                if (ret) {
                        smpboot_destroy_threads(plug_thread);
+                       free_cpumask_var(plug_thread->cpumask);
                        goto out;
                }
-               smpboot_unpark_thread(plug_thread, cpu);
+               if (cpumask_test_cpu(cpu, cpumask))
+                       smpboot_unpark_thread(plug_thread, cpu);
        }
        list_add(&plug_thread->list, &hotplug_threads);
 out:
@@ -311,7 +308,7 @@ out:
        put_online_cpus();
        return ret;
 }
-EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
 
 /**
  * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
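
A minimal registration sketch for the new cpumask variant, following the shape
of the watchdog_threads descriptor later in this series ("mythread" names and
allowed_mask are made up; struct smp_hotplug_thread fields are assumed to be
as in this kernel):

    static DEFINE_PER_CPU(struct task_struct *, mythread_task);

    static struct smp_hotplug_thread mythread_threads = {
            .store                  = &mythread_task,
            .thread_should_run      = mythread_should_run,
            .thread_fn              = mythread_fn,
            .thread_comm            = "mythread/%u",
    };

    /* Threads are created on all online CPUs but only unparked on CPUs in
     * the given mask; the rest stay parked. */
    ret = smpboot_register_percpu_thread_cpumask(&mythread_threads,
                                                 allowed_mask);
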
index ca7d84f438f1ed5ab79f41e95baee0fc1ac1926d..03c3875d995898b2af39ac593379085cb250c242 100644 (file)
@@ -219,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
 cond_syscall(sys_memfd_create);
+cond_syscall(sys_userfaultfd);
 
 /* performance counters: */
 cond_syscall(sys_perf_event_open);
index f65a0a06a8c067726b76a37418c80919cf59c4d7..88fefa68c5164c88e5ec2487c942b15e3914666b 100644 (file)
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
        cred->cap_inheritable = CAP_EMPTY_SET;
        cred->cap_permitted = CAP_FULL_SET;
        cred->cap_effective = CAP_FULL_SET;
+       cred->cap_ambient = CAP_EMPTY_SET;
        cred->cap_bset = CAP_FULL_SET;
 #ifdef CONFIG_KEYS
        key_put(cred->request_key_auth);
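
The new cap_ambient set is what lets a capability survive execve() of an
unprivileged binary. A userspace sketch (assumes headers new enough to carry
PR_CAP_AMBIENT from this series):

    #include <stdio.h>
    #include <sys/prctl.h>
    #include <linux/capability.h>

    static int keep_bind_cap_across_exec(void)
    {
            /* The capability must already be in the permitted and the
             * inheritable set, and SECBIT_NO_CAP_AMBIENT_RAISE unset. */
            if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
                      CAP_NET_BIND_SERVICE, 0, 0)) {
                    perror("PR_CAP_AMBIENT_RAISE");
                    return -1;
            }
            /* execve() of a helper now keeps CAP_NET_BIND_SERVICE. */
            return 0;
    }
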
index a6ffa43f299301dd750e9be092975df0d5e83786..64ed1c37bd1fdc6c2874f797e987b8bd4c4c5308 100644 (file)
@@ -24,6 +24,7 @@
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
 #include <linux/perf_event.h>
+#include <linux/kthread.h>
 
 /*
  * The run state of the lockup detectors is controlled by the content of the
@@ -66,7 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
 #define for_each_watchdog_cpu(cpu) \
        for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
 
+/*
+ * The 'watchdog_running' variable is set to 1 when the watchdog threads
+ * are registered/started and is set to 0 when the watchdog threads are
+ * unregistered/stopped, so it indicates whether the threads exist.
+ */
 static int __read_mostly watchdog_running;
+/*
+ * If a subsystem has a need to deactivate the watchdog temporarily, it
+ * can use the suspend/resume interface to achieve this. The content of
+ * the 'watchdog_suspended' variable reflects this state. Existing threads
+ * are parked/unparked by the lockup_detector_{suspend|resume} functions
+ * (see comment blocks pertaining to those functions for further details).
+ *
+ * 'watchdog_suspended' also prevents threads from being registered/started
+ * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
+ * of 'watchdog_running' cannot change while the watchdog is deactivated
+ * temporarily (see related code in 'proc' handlers).
+ */
+static int __read_mostly watchdog_suspended;
+
 static u64 __read_mostly sample_period;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -613,46 +633,9 @@ static void watchdog_nmi_disable(unsigned int cpu)
        }
 }
 
-void watchdog_nmi_enable_all(void)
-{
-       int cpu;
-
-       mutex_lock(&watchdog_proc_mutex);
-
-       if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
-               goto unlock;
-
-       get_online_cpus();
-       for_each_watchdog_cpu(cpu)
-               watchdog_nmi_enable(cpu);
-       put_online_cpus();
-
-unlock:
-       mutex_unlock(&watchdog_proc_mutex);
-}
-
-void watchdog_nmi_disable_all(void)
-{
-       int cpu;
-
-       mutex_lock(&watchdog_proc_mutex);
-
-       if (!watchdog_running)
-               goto unlock;
-
-       get_online_cpus();
-       for_each_watchdog_cpu(cpu)
-               watchdog_nmi_disable(cpu);
-       put_online_cpus();
-
-unlock:
-       mutex_unlock(&watchdog_proc_mutex);
-}
 #else
 static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
 static void watchdog_nmi_disable(unsigned int cpu) { return; }
-void watchdog_nmi_enable_all(void) {}
-void watchdog_nmi_disable_all(void) {}
 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 static struct smp_hotplug_thread watchdog_threads = {
@@ -666,46 +649,89 @@ static struct smp_hotplug_thread watchdog_threads = {
        .unpark                 = watchdog_enable,
 };
 
-static void restart_watchdog_hrtimer(void *info)
+/*
+ * park all watchdog threads that are specified in 'watchdog_cpumask'
+ */
+static int watchdog_park_threads(void)
 {
-       struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
-       int ret;
+       int cpu, ret = 0;
 
+       get_online_cpus();
+       for_each_watchdog_cpu(cpu) {
+               ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
+               if (ret)
+                       break;
+       }
+       if (ret) {
+               for_each_watchdog_cpu(cpu)
+                       kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+       }
+       put_online_cpus();
+
+       return ret;
+}
+
+/*
+ * unpark all watchdog threads that are specified in 'watchdog_cpumask'
+ */
+static void watchdog_unpark_threads(void)
+{
+       int cpu;
+
+       get_online_cpus();
+       for_each_watchdog_cpu(cpu)
+               kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+       put_online_cpus();
+}
+
+/*
+ * Suspend the hard and soft lockup detector by parking the watchdog threads.
+ */
+int lockup_detector_suspend(void)
+{
+       int ret = 0;
+
+       mutex_lock(&watchdog_proc_mutex);
        /*
-        * No need to cancel and restart hrtimer if it is currently executing
-        * because it will reprogram itself with the new period now.
-        * We should never see it unqueued here because we are running per-cpu
-        * with interrupts disabled.
+        * Multiple suspend requests can be active in parallel (counted by
+        * the 'watchdog_suspended' variable). If the watchdog threads are
+        * running, the first caller takes care that they will be parked.
+        * The state of 'watchdog_running' cannot change while a suspend
+        * request is active (see related code in 'proc' handlers).
         */
-       ret = hrtimer_try_to_cancel(hrtimer);
-       if (ret == 1)
-               hrtimer_start(hrtimer, ns_to_ktime(sample_period),
-                               HRTIMER_MODE_REL_PINNED);
+       if (watchdog_running && !watchdog_suspended)
+               ret = watchdog_park_threads();
+
+       if (ret == 0)
+               watchdog_suspended++;
+
+       mutex_unlock(&watchdog_proc_mutex);
+
+       return ret;
 }
 
-static void update_watchdog(int cpu)
+/*
+ * Resume the hard and soft lockup detector by unparking the watchdog threads.
+ */
+void lockup_detector_resume(void)
 {
+       mutex_lock(&watchdog_proc_mutex);
+
+       watchdog_suspended--;
        /*
-        * Make sure that perf event counter will adopt to a new
-        * sampling period. Updating the sampling period directly would
-        * be much nicer but we do not have an API for that now so
-        * let's use a big hammer.
-        * Hrtimer will adopt the new period on the next tick but this
-        * might be late already so we have to restart the timer as well.
+        * The watchdog threads are unparked if they were previously running
+        * and if there is no more active suspend request.
         */
-       watchdog_nmi_disable(cpu);
-       smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1);
-       watchdog_nmi_enable(cpu);
+       if (watchdog_running && !watchdog_suspended)
+               watchdog_unpark_threads();
+
+       mutex_unlock(&watchdog_proc_mutex);
 }
 
 static void update_watchdog_all_cpus(void)
 {
-       int cpu;
-
-       get_online_cpus();
-       for_each_watchdog_cpu(cpu)
-               update_watchdog(cpu);
-       put_online_cpus();
+       watchdog_park_threads();
+       watchdog_unpark_threads();
 }
 
 static int watchdog_enable_all_cpus(void)
@@ -713,15 +739,12 @@ static int watchdog_enable_all_cpus(void)
        int err = 0;
 
        if (!watchdog_running) {
-               err = smpboot_register_percpu_thread(&watchdog_threads);
+               err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
+                                                            &watchdog_cpumask);
                if (err)
                        pr_err("Failed to create watchdog threads, disabled\n");
-               else {
-                       if (smpboot_update_cpumask_percpu_thread(
-                                   &watchdog_threads, &watchdog_cpumask))
-                               pr_err("Failed to set cpumask for watchdog threads\n");
+               else
                        watchdog_running = 1;
-               }
        } else {
                /*
                 * Enable/disable the lockup detectors or
@@ -787,6 +810,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
 
        mutex_lock(&watchdog_proc_mutex);
 
+       if (watchdog_suspended) {
+               /* no parameter changes allowed while watchdog is suspended */
+               err = -EAGAIN;
+               goto out;
+       }
+
        /*
         * If the parameter is being read return the state of the corresponding
         * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
@@ -872,6 +901,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
 
        mutex_lock(&watchdog_proc_mutex);
 
+       if (watchdog_suspended) {
+               /* no parameter changes allowed while watchdog is suspended */
+               err = -EAGAIN;
+               goto out;
+       }
+
        old = ACCESS_ONCE(watchdog_thresh);
        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
@@ -903,6 +938,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
        int err;
 
        mutex_lock(&watchdog_proc_mutex);
+
+       if (watchdog_suspended) {
+               /* no parameter changes allowed while watchdog is suspended */
+               err = -EAGAIN;
+               goto out;
+       }
+
        err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
        if (!err && write) {
                /* Remove impossible cpus to keep sysctl output cleaner. */
@@ -920,6 +962,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
                                pr_err("cpumask update failed\n");
                }
        }
+out:
        mutex_unlock(&watchdog_proc_mutex);
        return err;
 }
@@ -932,10 +975,8 @@ void __init lockup_detector_init(void)
 
 #ifdef CONFIG_NO_HZ_FULL
        if (tick_nohz_full_enabled()) {
-               if (!cpumask_empty(tick_nohz_full_mask))
-                       pr_info("Disabling watchdog on nohz_full cores by default\n");
-               cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
-                              tick_nohz_full_mask);
+               pr_info("Disabling watchdog on nohz_full cores by default\n");
+               cpumask_copy(&watchdog_cpumask, housekeeping_mask);
        } else
                cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
 #else
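
A usage sketch for the new suspend/resume interface defined above
(do_sensitive_work() is a made-up placeholder for a section that must not be
interrupted by the hard/soft lockup detectors):

    if (!lockup_detector_suspend()) {
            do_sensitive_work();
            lockup_detector_resume();
    }
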
index daf0afb6d979e1074cdcee38cd69d2de3a94e162..116a166b096f06eb64ed288e4d8b694648f854b8 100644 (file)
@@ -160,6 +160,7 @@ struct gen_pool *gen_pool_create(int min_alloc_order, int nid)
                pool->min_alloc_order = min_alloc_order;
                pool->algo = gen_pool_first_fit;
                pool->data = NULL;
+               pool->name = NULL;
        }
        return pool;
 }
@@ -252,8 +253,8 @@ void gen_pool_destroy(struct gen_pool *pool)
 
                kfree(chunk);
        }
+       kfree_const(pool->name);
        kfree(pool);
-       return;
 }
 EXPORT_SYMBOL(gen_pool_destroy);
 
@@ -570,53 +571,88 @@ static void devm_gen_pool_release(struct device *dev, void *res)
        gen_pool_destroy(*(struct gen_pool **)res);
 }
 
+static int devm_gen_pool_match(struct device *dev, void *res, void *data)
+{
+       struct gen_pool **p = res;
+
+       /* NULL data matches only a pool without an assigned name */
+       if (!data && !(*p)->name)
+               return 1;
+
+       if (!data || !(*p)->name)
+               return 0;
+
+       return !strcmp((*p)->name, data);
+}
+
+/**
+ * gen_pool_get - Obtain the gen_pool (if any) for a device
+ * @dev: device to retrieve the gen_pool from
+ * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device
+ *
+ * Returns the gen_pool for the device if one is present, or NULL.
+ */
+struct gen_pool *gen_pool_get(struct device *dev, const char *name)
+{
+       struct gen_pool **p;
+
+       p = devres_find(dev, devm_gen_pool_release, devm_gen_pool_match,
+                       (void *)name);
+       if (!p)
+               return NULL;
+       return *p;
+}
+EXPORT_SYMBOL_GPL(gen_pool_get);
+
 /**
  * devm_gen_pool_create - managed gen_pool_create
  * @dev: device that provides the gen_pool
  * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents
- * @nid: node id of the node the pool structure should be allocated on, or -1
+ * @nid: node selector for allocated gen_pool, %NUMA_NO_NODE for all nodes
+ * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device
  *
  * Create a new special memory pool that can be used to manage special purpose
  * memory not managed by the regular kmalloc/kfree interface. The pool will be
  * automatically destroyed by the device management code.
  */
 struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order,
-               int nid)
+                                     int nid, const char *name)
 {
        struct gen_pool **ptr, *pool;
+       const char *pool_name = NULL;
+
+       /* Check that genpool to be created is uniquely addressed on device */
+       if (gen_pool_get(dev, name))
+               return ERR_PTR(-EINVAL);
+
+       if (name) {
+               pool_name = kstrdup_const(name, GFP_KERNEL);
+               if (!pool_name)
+                       return ERR_PTR(-ENOMEM);
+       }
 
        ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL);
        if (!ptr)
-               return NULL;
+               goto free_pool_name;
 
        pool = gen_pool_create(min_alloc_order, nid);
-       if (pool) {
-               *ptr = pool;
-               devres_add(dev, ptr);
-       } else {
-               devres_free(ptr);
-       }
+       if (!pool)
+               goto free_devres;
+
+       *ptr = pool;
+       pool->name = pool_name;
+       devres_add(dev, ptr);
 
        return pool;
-}
-EXPORT_SYMBOL(devm_gen_pool_create);
 
-/**
- * gen_pool_get - Obtain the gen_pool (if any) for a device
- * @dev: device to retrieve the gen_pool from
- *
- * Returns the gen_pool for the device if one is present, or NULL.
- */
-struct gen_pool *gen_pool_get(struct device *dev)
-{
-       struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL,
-                                       NULL);
+free_devres:
+       devres_free(ptr);
+free_pool_name:
+       kfree_const(pool_name);
 
-       if (!p)
-               return NULL;
-       return *p;
+       return ERR_PTR(-ENOMEM);
 }
-EXPORT_SYMBOL_GPL(gen_pool_get);
+EXPORT_SYMBOL(devm_gen_pool_create);
 
 #ifdef CONFIG_OF
 /**
@@ -633,16 +669,30 @@ struct gen_pool *of_gen_pool_get(struct device_node *np,
        const char *propname, int index)
 {
        struct platform_device *pdev;
-       struct device_node *np_pool;
+       struct device_node *np_pool, *parent;
+       const char *name = NULL;
+       struct gen_pool *pool = NULL;
 
        np_pool = of_parse_phandle(np, propname, index);
        if (!np_pool)
                return NULL;
+
        pdev = of_find_device_by_node(np_pool);
+       if (!pdev) {
+               /* Check if named gen_pool is created by parent node device */
+               parent = of_get_parent(np_pool);
+               pdev = of_find_device_by_node(parent);
+               of_node_put(parent);
+
+               of_property_read_string(np_pool, "label", &name);
+               if (!name)
+                       name = np_pool->name;
+       }
+       if (pdev)
+               pool = gen_pool_get(&pdev->dev, name);
        of_node_put(np_pool);
-       if (!pdev)
-               return NULL;
-       return gen_pool_get(&pdev->dev);
+
+       return pool;
 }
 EXPORT_SYMBOL_GPL(of_gen_pool_get);
 #endif /* CONFIG_OF */
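
A short sketch of the new named-pool flow: a driver creates several managed
pools on one device and looks them up by name later (the device, allocation
order and the "iram"/"dram" labels are made up):

    struct gen_pool *iram, *dram;

    iram = devm_gen_pool_create(dev, ilog2(64), NUMA_NO_NODE, "iram");
    dram = devm_gen_pool_create(dev, ilog2(64), NUMA_NO_NODE, "dram");
    if (IS_ERR(iram) || IS_ERR(dram))
            return -ENOMEM;

    /* ... later, anywhere the device is known ... */
    iram = gen_pool_get(dev, "iram");       /* NULL if not created yet */
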
index 98c4eaeabdcb4592d0b15fd71f3f520aa1bd3d35..b424d5e5b6ff5b1dec8f95fdd089451dba4dd19c 100644 (file)
@@ -78,3 +78,4 @@ obj-$(CONFIG_CMA)     += cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
index fd5fe4342e9320de634db2e92d9757dd1677f5be..59d10d16f0a5d906b18d98d1962f454075c828fe 100644 (file)
@@ -242,7 +242,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
        return page;
 }
 
-static inline int is_page_busy(struct dma_page *page)
+static inline bool is_page_busy(struct dma_page *page)
 {
        return page->in_use != 0;
 }
index 6297f6bccfb1e42bedde7d69479eefbd52317b23..a798293fc6486bac215ecb58ed071263a5f775f0 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -12,7 +12,9 @@
 #include <linux/sched.h>
 #include <linux/rwsem.h>
 #include <linux/hugetlb.h>
+
 #include <asm/pgtable.h>
+#include <asm/tlbflush.h>
 
 #include "internal.h"
 
@@ -32,6 +34,30 @@ static struct page *no_page_table(struct vm_area_struct *vma,
        return NULL;
 }
 
+static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
+               pte_t *pte, unsigned int flags)
+{
+       /* No page to get reference */
+       if (flags & FOLL_GET)
+               return -EFAULT;
+
+       if (flags & FOLL_TOUCH) {
+               pte_t entry = *pte;
+
+               if (flags & FOLL_WRITE)
+                       entry = pte_mkdirty(entry);
+               entry = pte_mkyoung(entry);
+
+               if (!pte_same(*pte, entry)) {
+                       set_pte_at(vma->vm_mm, address, pte, entry);
+                       update_mmu_cache(vma, address, pte);
+               }
+       }
+
+       /* Proper page table entry exists, but no corresponding struct page */
+       return -EEXIST;
+}
+
 static struct page *follow_page_pte(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd, unsigned int flags)
 {
@@ -73,10 +99,21 @@ retry:
 
        page = vm_normal_page(vma, address, pte);
        if (unlikely(!page)) {
-               if ((flags & FOLL_DUMP) ||
-                   !is_zero_pfn(pte_pfn(pte)))
-                       goto bad_page;
-               page = pte_page(pte);
+               if (flags & FOLL_DUMP) {
+                       /* Avoid special (like zero) pages in core dumps */
+                       page = ERR_PTR(-EFAULT);
+                       goto out;
+               }
+
+               if (is_zero_pfn(pte_pfn(pte))) {
+                       page = pte_page(pte);
+               } else {
+                       int ret;
+
+                       ret = follow_pfn_pte(vma, address, ptep, flags);
+                       page = ERR_PTR(ret);
+                       goto out;
+               }
        }
 
        if (flags & FOLL_GET)
@@ -114,12 +151,9 @@ retry:
                        unlock_page(page);
                }
        }
+out:
        pte_unmap_unlock(ptep, ptl);
        return page;
-bad_page:
-       pte_unmap_unlock(ptep, ptl);
-       return ERR_PTR(-EFAULT);
-
 no_page:
        pte_unmap_unlock(ptep, ptl);
        if (!pte_none(pte))
@@ -489,9 +523,15 @@ retry:
                                goto next_page;
                        }
                        BUG();
-               }
-               if (IS_ERR(page))
+               } else if (PTR_ERR(page) == -EEXIST) {
+                       /*
+                        * Proper page table entry exists, but no corresponding
+                        * struct page.
+                        */
+                       goto next_page;
+               } else if (IS_ERR(page)) {
                        return i ? i : PTR_ERR(page);
+               }
                if (pages) {
                        pages[i] = page;
                        flush_anon_page(vma, page, start);
index 097c7a4bfbd9f13f4845acae80d73aa7b0e66fb2..279a818a39b13d76e574bf8f330c7c925b8e3a67 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/pagemap.h>
 #include <linux/migrate.h>
 #include <linux/hashtable.h>
+#include <linux/userfaultfd_k.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -716,21 +717,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
-                                       unsigned long haddr, pmd_t *pmd,
-                                       struct page *page, gfp_t gfp)
+                                       unsigned long address, pmd_t *pmd,
+                                       struct page *page, gfp_t gfp,
+                                       unsigned int flags)
 {
        struct mem_cgroup *memcg;
        pgtable_t pgtable;
        spinlock_t *ptl;
+       unsigned long haddr = address & HPAGE_PMD_MASK;
 
        VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-       if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
-               return VM_FAULT_OOM;
+       if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+               put_page(page);
+               count_vm_event(THP_FAULT_FALLBACK);
+               return VM_FAULT_FALLBACK;
+       }
 
        pgtable = pte_alloc_one(mm, haddr);
        if (unlikely(!pgtable)) {
                mem_cgroup_cancel_charge(page, memcg);
+               put_page(page);
                return VM_FAULT_OOM;
        }
 
@@ -750,6 +757,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                pte_free(mm, pgtable);
        } else {
                pmd_t entry;
+
+               /* Deliver the page fault to userland */
+               if (userfaultfd_missing(vma)) {
+                       int ret;
+
+                       spin_unlock(ptl);
+                       mem_cgroup_cancel_charge(page, memcg);
+                       put_page(page);
+                       pte_free(mm, pgtable);
+                       ret = handle_userfault(vma, address, flags,
+                                              VM_UFFD_MISSING);
+                       VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+                       return ret;
+               }
+
                entry = mk_huge_pmd(page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                page_add_new_anon_rmap(page, vma, haddr);
@@ -760,6 +782,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
                atomic_long_inc(&mm->nr_ptes);
                spin_unlock(ptl);
+               count_vm_event(THP_FAULT_ALLOC);
        }
 
        return 0;
@@ -771,19 +794,16 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
 }
 
 /* Caller must hold page table lock. */
-static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
                struct page *zero_page)
 {
        pmd_t entry;
-       if (!pmd_none(*pmd))
-               return false;
        entry = mk_pmd(zero_page, vma->vm_page_prot);
        entry = pmd_mkhuge(entry);
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, haddr, pmd, entry);
        atomic_long_inc(&mm->nr_ptes);
-       return true;
 }
 
 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -806,6 +826,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                pgtable_t pgtable;
                struct page *zero_page;
                bool set;
+               int ret;
                pgtable = pte_alloc_one(mm, haddr);
                if (unlikely(!pgtable))
                        return VM_FAULT_OOM;
@@ -816,14 +837,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        return VM_FAULT_FALLBACK;
                }
                ptl = pmd_lock(mm, pmd);
-               set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
-                               zero_page);
-               spin_unlock(ptl);
+               ret = 0;
+               set = false;
+               if (pmd_none(*pmd)) {
+                       if (userfaultfd_missing(vma)) {
+                               spin_unlock(ptl);
+                               ret = handle_userfault(vma, address, flags,
+                                                      VM_UFFD_MISSING);
+                               VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+                       } else {
+                               set_huge_zero_page(pgtable, mm, vma,
+                                                  haddr, pmd,
+                                                  zero_page);
+                               spin_unlock(ptl);
+                               set = true;
+                       }
+               } else
+                       spin_unlock(ptl);
                if (!set) {
                        pte_free(mm, pgtable);
                        put_huge_zero_page();
                }
-               return 0;
+               return ret;
        }
        gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
        page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
@@ -831,14 +866,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
        }
-       if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
-               put_page(page);
-               count_vm_event(THP_FAULT_FALLBACK);
-               return VM_FAULT_FALLBACK;
-       }
-
-       count_vm_event(THP_FAULT_ALLOC);
-       return 0;
+       return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
+                                           flags);
 }
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -873,16 +902,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         */
        if (is_huge_zero_pmd(pmd)) {
                struct page *zero_page;
-               bool set;
                /*
                 * get_huge_zero_page() will never allocate a new page here,
                 * since we already have a zero page to copy. It just takes a
                 * reference.
                 */
                zero_page = get_huge_zero_page();
-               set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+               set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
                                zero_page);
-               BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
                ret = 0;
                goto out_unlock;
        }
@@ -2133,7 +2160,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
             _pte++, address += PAGE_SIZE) {
                pte_t pteval = *_pte;
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
-                       if (++none_or_zero <= khugepaged_max_ptes_none)
+                       if (!userfaultfd_armed(vma) &&
+                           ++none_or_zero <= khugepaged_max_ptes_none)
                                continue;
                        else
                                goto out;
@@ -2586,7 +2614,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
             _pte++, _address += PAGE_SIZE) {
                pte_t pteval = *_pte;
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
-                       if (++none_or_zero <= khugepaged_max_ptes_none)
+                       if (!userfaultfd_armed(vma) &&
+                           ++none_or_zero <= khugepaged_max_ptes_none)
                                continue;
                        else
                                goto out_unmap;
index a8c3087089d8a8627c66a03602ecb6154a238ee8..51ae41d0fbc0d8ba3556e2f272c90442d90b2ab2 100644 (file)
@@ -616,7 +616,7 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 }
 
 /* Returns true if the VMA has associated reserve pages */
-static int vma_has_reserves(struct vm_area_struct *vma, long chg)
+static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
 {
        if (vma->vm_flags & VM_NORESERVE) {
                /*
@@ -629,23 +629,23 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg)
                 * properly, so add work-around here.
                 */
                if (vma->vm_flags & VM_MAYSHARE && chg == 0)
-                       return 1;
+                       return true;
                else
-                       return 0;
+                       return false;
        }
 
        /* Shared mappings always use reserves */
        if (vma->vm_flags & VM_MAYSHARE)
-               return 1;
+               return true;
 
        /*
         * Only the process that called mmap() has reserves for
         * private mappings.
         */
        if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
-               return 1;
+               return true;
 
-       return 0;
+       return false;
 }
 
 static void enqueue_huge_page(struct hstate *h, struct page *page)
@@ -3779,7 +3779,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
        return saddr;
 }
 
-static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
 {
        unsigned long base = addr & PUD_MASK;
        unsigned long end = base + PUD_SIZE;
@@ -3789,8 +3789,8 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
         */
        if (vma->vm_flags & VM_MAYSHARE &&
            vma->vm_start <= base && end <= vma->vm_end)
-               return 1;
-       return 0;
+               return true;
+       return false;
 }
 
 /*
index 36b23f1e2ca62612e6e1d1b2b9d74c3cd7e87db7..1195dd2d6a2b94214e9ebb11e8c2037cd0e0cea6 100644 (file)
@@ -426,4 +426,19 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 #define ALLOC_CMA              0x80 /* allow allocations from CMA areas */
 #define ALLOC_FAIR             0x100 /* fair zone allocation */
 
+enum ttu_flags;
+struct tlbflush_unmap_batch;
+
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+void try_to_unmap_flush(void);
+void try_to_unmap_flush_dirty(void);
+#else
+static inline void try_to_unmap_flush(void)
+{
+}
+static inline void try_to_unmap_flush_dirty(void)
+{
+}
+
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 #endif /* __MM_INTERNAL_H */
index 64bb8a22110c23b7989256b02c7d4851b89aab3f..ce3a4222c7e7ae4558b201704a0ee5d55518fcfc 100644 (file)
@@ -103,7 +103,8 @@ static long madvise_behavior(struct vm_area_struct *vma,
 
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
-                               vma->vm_file, pgoff, vma_policy(vma));
+                         vma->vm_file, pgoff, vma_policy(vma),
+                         vma->vm_userfaultfd_ctx);
        if (*prev) {
                vma = *prev;
                goto success;
@@ -385,7 +386,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
        }
 }
 
-static int
+static bool
 madvise_behavior_valid(int behavior)
 {
        switch (behavior) {
@@ -407,10 +408,10 @@ madvise_behavior_valid(int behavior)
 #endif
        case MADV_DONTDUMP:
        case MADV_DODUMP:
-               return 1;
+               return true;
 
        default:
-               return 0;
+               return false;
        }
 }
 
index 87108e77e476a326d69ec748f4e300d713fbd192..95ce68c6da8adc0b8d4c925027a5b04b824ab381 100644 (file)
@@ -566,6 +566,9 @@ repeat:
                 * area, insert that portion.
                 */
                if (rbase > base) {
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+                       WARN_ON(nid != memblock_get_region_node(rgn));
+#endif
                        nr_new++;
                        if (insert)
                                memblock_insert_region(type, i++, base,
index acb93c554f6e8456dc9312734162317d1adea54d..1af057575ce9e65c862dfc61f574c46d42b7cd5f 100644 (file)
@@ -5965,7 +5965,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
        if (!mem_cgroup_is_root(memcg))
                page_counter_uncharge(&memcg->memory, 1);
 
-       /* Caller disabled preemption with mapping->tree_lock */
+       /*
+        * Interrupts should be disabled here because the caller holds the
+        * mapping->tree_lock, which is taken with interrupts off. It is
+        * important to have interrupts disabled here because it is the
+        * only synchronisation we have for updating the per-CPU variables.
+        */
+       VM_BUG_ON(!irqs_disabled());
        mem_cgroup_charge_statistics(memcg, page, -1);
        memcg_check_events(memcg, page);
 }
index 388dcf9aa283c83ee78dcf22503de253e812a27d..bb04d8f2f86c415c24a52dadd71623a2316454d1 100644 (file)
@@ -61,6 +61,7 @@
 #include <linux/string.h>
 #include <linux/dma-debug.h>
 #include <linux/debugfs.h>
+#include <linux/userfaultfd_k.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -180,22 +181,22 @@ static void check_sync_rss_stat(struct task_struct *task)
 
 #ifdef HAVE_GENERIC_MMU_GATHER
 
-static int tlb_next_batch(struct mmu_gather *tlb)
+static bool tlb_next_batch(struct mmu_gather *tlb)
 {
        struct mmu_gather_batch *batch;
 
        batch = tlb->active;
        if (batch->next) {
                tlb->active = batch->next;
-               return 1;
+               return true;
        }
 
        if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
-               return 0;
+               return false;
 
        batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
        if (!batch)
-               return 0;
+               return false;
 
        tlb->batch_count++;
        batch->next = NULL;
@@ -205,7 +206,7 @@ static int tlb_next_batch(struct mmu_gather *tlb)
        tlb->active->next = batch;
        tlb->active = batch;
 
-       return 1;
+       return true;
 }
 
 /* tlb_gather_mmu
@@ -2685,6 +2686,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
                if (!pte_none(*page_table))
                        goto unlock;
+               /* Deliver the page fault to userland, check inside PT lock */
+               if (userfaultfd_missing(vma)) {
+                       pte_unmap_unlock(page_table, ptl);
+                       return handle_userfault(vma, address, flags,
+                                               VM_UFFD_MISSING);
+               }
                goto setpte;
        }
 
@@ -2713,6 +2720,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
        if (!pte_none(*page_table))
                goto release;
 
+       /* Deliver the page fault to userland, check inside PT lock */
+       if (userfaultfd_missing(vma)) {
+               pte_unmap_unlock(page_table, ptl);
+               mem_cgroup_cancel_charge(page, memcg);
+               page_cache_release(page);
+               return handle_userfault(vma, address, flags,
+                                       VM_UFFD_MISSING);
+       }
+
        inc_mm_counter_fast(mm, MM_ANONPAGES);
        page_add_new_anon_rmap(page, vma, address);
        mem_cgroup_commit_charge(page, memcg, false);
index 6da82bcb0a8b66b7326c1a021a7eac3b476cd85e..8fd97dac538a46c4a5768273060631c7ac10a80e 100644 (file)
@@ -1248,6 +1248,14 @@ int __ref add_memory(int nid, u64 start, u64 size)
 
        mem_hotplug_begin();
 
+       /*
+        * Add new range to memblock so that when hotadd_new_pgdat() is called
+        * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
+        * this new range and calculate total pages correctly.  The range will
+        * be removed at hot-remove time.
+        */
+       memblock_add_node(start, size, nid);
+
        new_node = !node_online(nid);
        if (new_node) {
                pgdat = hotadd_new_pgdat(nid, start);
@@ -1277,7 +1285,6 @@ int __ref add_memory(int nid, u64 start, u64 size)
 
        /* create new memmap entry */
        firmware_map_add_hotplug(start, start + size, "System RAM");
-       memblock_add_node(start, size, nid);
 
        goto out;
 
@@ -1286,6 +1293,7 @@ error:
        if (new_pgdat)
                rollback_node_hotadd(nid, pgdat);
        release_memory_resource(res);
+       memblock_remove(start, size);
 
 out:
        mem_hotplug_done();
index 99d4c1d0b8583dc453ef992582074ef015f1fb49..a7f1e0d1d6b8fed5d1e3d9bd1380c9cd9b1447e8 100644 (file)
@@ -722,8 +722,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
                pgoff = vma->vm_pgoff +
                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
-                                 vma->anon_vma, vma->vm_file, pgoff,
-                                 new_pol);
+                                vma->anon_vma, vma->vm_file, pgoff,
+                                new_pol, vma->vm_userfaultfd_ctx);
                if (prev) {
                        vma = prev;
                        next = vma->vm_next;
index eb4267107d1fee9fa2a55e4076c014500e3b1edb..5c08cab5419e771d4b2d04762ae3d876cf8080eb 100644 (file)
@@ -1226,7 +1226,9 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
                if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
                        goto set_status;
 
-               page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
+               /* FOLL_DUMP to ignore special (like zero) pages */
+               page = follow_page(vma, pp->addr,
+                               FOLL_GET | FOLL_SPLIT | FOLL_DUMP);
 
                err = PTR_ERR(page);
                if (IS_ERR(page))
@@ -1236,10 +1238,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
                if (!page)
                        goto set_status;
 
-               /* Use PageReserved to check for zero page */
-               if (PageReserved(page))
-                       goto put_and_set;
-
                pp->page = page;
                err = page_to_nid(page);
 
@@ -1396,18 +1394,14 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
                if (!vma || addr < vma->vm_start)
                        goto set_status;
 
-               page = follow_page(vma, addr, 0);
+               /* FOLL_DUMP to ignore special (like zero) pages */
+               page = follow_page(vma, addr, FOLL_DUMP);
 
                err = PTR_ERR(page);
                if (IS_ERR(page))
                        goto set_status;
 
-               err = -ENOENT;
-               /* Use PageReserved to check for zero page */
-               if (!page || PageReserved(page))
-                       goto set_status;
-
-               err = page_to_nid(page);
+               err = page ? page_to_nid(page) : -ENOENT;
 set_status:
                *status = err;
 
index 6fd2cf15e8687d32114f8e81516aed0a52997147..25936680064fd433d4cf693c88c3195bdab4bb98 100644 (file)
@@ -510,7 +510,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
-                         vma->vm_file, pgoff, vma_policy(vma));
+                         vma->vm_file, pgoff, vma_policy(vma),
+                         vma->vm_userfaultfd_ctx);
        if (*prev) {
                vma = *prev;
                goto success;
index f126923ce683c6969a9ce2498bcd481d90d39b8e..82db4fc0a9d34040bf7fa6750eddf7b508fb0e1a 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -41,6 +41,7 @@
 #include <linux/notifier.h>
 #include <linux/memory.h>
 #include <linux/printk.h>
+#include <linux/userfaultfd_k.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -919,7 +920,8 @@ again:                      remove_next = 1 + (end > next->vm_end);
  * per-vma resources, so we don't attempt to merge those.
  */
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
-                       struct file *file, unsigned long vm_flags)
+                               struct file *file, unsigned long vm_flags,
+                               struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
 {
        /*
         * VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -935,6 +937,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
                return 0;
        if (vma->vm_ops && vma->vm_ops->close)
                return 0;
+       if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
+               return 0;
        return 1;
 }
 
@@ -965,9 +969,11 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
  */
 static int
 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
-       struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+                    struct anon_vma *anon_vma, struct file *file,
+                    pgoff_t vm_pgoff,
+                    struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
 {
-       if (is_mergeable_vma(vma, file, vm_flags) &&
+       if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
            is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
                if (vma->vm_pgoff == vm_pgoff)
                        return 1;
@@ -984,9 +990,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
  */
 static int
 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
-       struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+                   struct anon_vma *anon_vma, struct file *file,
+                   pgoff_t vm_pgoff,
+                   struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
 {
-       if (is_mergeable_vma(vma, file, vm_flags) &&
+       if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
            is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
                pgoff_t vm_pglen;
                vm_pglen = vma_pages(vma);
@@ -1029,7 +1037,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                        struct vm_area_struct *prev, unsigned long addr,
                        unsigned long end, unsigned long vm_flags,
                        struct anon_vma *anon_vma, struct file *file,
-                       pgoff_t pgoff, struct mempolicy *policy)
+                       pgoff_t pgoff, struct mempolicy *policy,
+                       struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
 {
        pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
        struct vm_area_struct *area, *next;
@@ -1056,14 +1065,17 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
        if (prev && prev->vm_end == addr &&
                        mpol_equal(vma_policy(prev), policy) &&
                        can_vma_merge_after(prev, vm_flags,
-                                               anon_vma, file, pgoff)) {
+                                           anon_vma, file, pgoff,
+                                           vm_userfaultfd_ctx)) {
                /*
                 * OK, it can.  Can we now merge in the successor as well?
                 */
                if (next && end == next->vm_start &&
                                mpol_equal(policy, vma_policy(next)) &&
                                can_vma_merge_before(next, vm_flags,
-                                       anon_vma, file, pgoff+pglen) &&
+                                                    anon_vma, file,
+                                                    pgoff+pglen,
+                                                    vm_userfaultfd_ctx) &&
                                is_mergeable_anon_vma(prev->anon_vma,
                                                      next->anon_vma, NULL)) {
                                                        /* cases 1, 6 */
@@ -1084,7 +1096,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
        if (next && end == next->vm_start &&
                        mpol_equal(policy, vma_policy(next)) &&
                        can_vma_merge_before(next, vm_flags,
-                                       anon_vma, file, pgoff+pglen)) {
+                                            anon_vma, file, pgoff+pglen,
+                                            vm_userfaultfd_ctx)) {
                if (prev && addr < prev->vm_end)        /* case 4 */
                        err = vma_adjust(prev, prev->vm_start,
                                addr, prev->vm_pgoff, NULL);
@@ -1570,8 +1583,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
        /*
         * Can we just expand an old mapping?
         */
-       vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
-                       NULL);
+       vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
+                       NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
        if (vma)
                goto out;
 
@@ -2757,7 +2770,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 
        /* Can we just expand an old private anonymous mapping? */
        vma = vma_merge(mm, prev, addr, addr + len, flags,
-                                       NULL, NULL, pgoff, NULL);
+                       NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
        if (vma)
                goto out;
 
@@ -2913,7 +2926,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
        if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
                return NULL;    /* should never get here */
        new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
-                       vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+                           vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+                           vma->vm_userfaultfd_ctx);
        if (new_vma) {
                /*
                 * Source vma may have been merged into new_vma
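For illustration (not part of the patch): the hunks above thread a struct vm_userfaultfd_ctx through vma_merge() and the can_vma_merge_*() helpers so that vmas with different userfaultfd registrations are never merged. A minimal sketch of the check is_mergeable_vma() is assumed to gain; the helper body is outside the context shown here, so the field access below is an assumption based on how the argument is used above.

	static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
				struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
	{
		/*
		 * Only treat two ranges as mergeable when they are tracked by
		 * the same userfaultfd context (or neither is tracked at all).
		 */
		return vma->vm_userfaultfd_ctx.ctx == vm_userfaultfd_ctx.ctx;
	}

Callers such as mmap_region() and do_brk() pass NULL_VM_UFFD_CTX, so plain anonymous and file mappings keep merging exactly as before.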
index e7d6f1171ecb6ec453e62edd88cc258b31cfe9f3..ef5be8eaab001792b469fac1bd5b43cb139d1b0b 100644 (file)
@@ -292,7 +292,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
         */
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *pprev = vma_merge(mm, *pprev, start, end, newflags,
-                       vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+                          vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+                          vma->vm_userfaultfd_ctx);
        if (*pprev) {
                vma = *pprev;
                goto success;
index a7c93eceb1c8d1ce59235d47ec7e49df4efefb86..5a71cce8c6ea8cd679dad306bd3ee655ab370a47 100644 (file)
@@ -276,6 +276,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
                                     need_rmap_locks);
        if (moved_len < old_len) {
+               err = -ENOMEM;
+       } else if (vma->vm_ops && vma->vm_ops->mremap) {
+               err = vma->vm_ops->mremap(new_vma);
+       }
+
+       if (unlikely(err)) {
                /*
                 * On error, move entries back from new area to old,
                 * which will succeed since page tables still there,
@@ -286,16 +292,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                vma = new_vma;
                old_len = new_len;
                old_addr = new_addr;
-               new_addr = -ENOMEM;
+               new_addr = err;
        } else {
-               if (vma->vm_file && vma->vm_file->f_op->mremap) {
-                       err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
-                       if (err < 0) {
-                               move_page_tables(new_vma, new_addr, vma,
-                                                old_addr, moved_len, true);
-                               return err;
-                       }
-               }
                arch_remap(mm, old_addr, old_addr + old_len,
                           new_addr, new_addr + new_len);
        }
@@ -348,6 +346,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = find_vma(mm, addr);
+       unsigned long pgoff;
 
        if (!vma || vma->vm_start > addr)
                return ERR_PTR(-EFAULT);
@@ -359,17 +358,17 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
        if (old_len > vma->vm_end - addr)
                return ERR_PTR(-EFAULT);
 
+       if (new_len == old_len)
+               return vma;
+
        /* Need to be careful about a growing mapping */
-       if (new_len > old_len) {
-               unsigned long pgoff;
-
-               if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
-                       return ERR_PTR(-EFAULT);
-               pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
-               pgoff += vma->vm_pgoff;
-               if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
-                       return ERR_PTR(-EINVAL);
-       }
+       pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+       pgoff += vma->vm_pgoff;
+       if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
+               return ERR_PTR(-EINVAL);
+
+       if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
+               return ERR_PTR(-EFAULT);
 
        if (vma->vm_flags & VM_LOCKED) {
                unsigned long locked, lock_limit;
@@ -408,13 +407,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
        if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
                goto out;
 
-       /* Check if the location we're moving into overlaps the
-        * old location at all, and fail if it does.
-        */
-       if ((new_addr <= addr) && (new_addr+new_len) > addr)
-               goto out;
-
-       if ((addr <= new_addr) && (addr+old_len) > new_addr)
+       /* Ensure the old/new locations do not overlap */
+       if (addr + old_len > new_addr && new_addr + new_len > addr)
                goto out;
 
        ret = do_munmap(mm, new_addr, new_len);
@@ -580,8 +574,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
        }
 out:
-       if (ret & ~PAGE_MASK)
+       if (ret & ~PAGE_MASK) {
                vm_unacct_memory(charged);
+               locked = 0;
+       }
        up_write(&current->mm->mmap_sem);
        if (locked && new_len > old_len)
                mm_populate(new_addr + old_len, new_len - old_len);
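For illustration (not part of the patch): the single predicate that replaces the two removed checks above is the standard interval-overlap test for the half-open ranges [addr, addr+old_len) and [new_addr, new_addr+new_len); the ranges intersect exactly when each one starts before the other ends. A standalone sketch of the same check:

	/* Hypothetical helper mirroring the simplified check in mremap_to(). */
	static bool ranges_overlap(unsigned long addr, unsigned long old_len,
				   unsigned long new_addr, unsigned long new_len)
	{
		return addr + old_len > new_addr && new_addr + new_len > addr;
	}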
index 171b68768df1478355bcddd5e30c2edd616ba05b..0db38e7d0a72b20ce63a6653ba24934ac3ce7825 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -62,6 +62,8 @@
 
 #include <asm/tlbflush.h>
 
+#include <trace/events/tlb.h>
+
 #include "internal.h"
 
 static struct kmem_cache *anon_vma_cachep;
@@ -583,6 +585,107 @@ vma_address(struct page *page, struct vm_area_struct *vma)
        return address;
 }
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void percpu_flush_tlb_batch_pages(void *data)
+{
+       /*
+        * All TLB entries are flushed on the assumption that it is
+        * cheaper to flush all TLBs and let them be refilled than
+        * flushing individual PFNs. Note that we do not track mm's
+        * to flush as that might simply be multiple full TLB flushes
+        * for no gain.
+        */
+       count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+       flush_tlb_local();
+}
+
+/*
+ * Flush TLB entries for recently unmapped pages from remote CPUs. If a PTE
+ * was dirty when it was unmapped, it is important that it be flushed before
+ * any IO is initiated on the page, to prevent lost writes. Similarly, it
+ * must be flushed before freeing to prevent data leakage.
+ */
+void try_to_unmap_flush(void)
+{
+       struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+       int cpu;
+
+       if (!tlb_ubc->flush_required)
+               return;
+
+       cpu = get_cpu();
+
+       trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL);
+
+       if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
+               percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);
+
+       if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) {
+               smp_call_function_many(&tlb_ubc->cpumask,
+                       percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);
+       }
+       cpumask_clear(&tlb_ubc->cpumask);
+       tlb_ubc->flush_required = false;
+       tlb_ubc->writable = false;
+       put_cpu();
+}
+
+/* Flush iff there are potentially writable TLB entries that can race with IO */
+void try_to_unmap_flush_dirty(void)
+{
+       struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+       if (tlb_ubc->writable)
+               try_to_unmap_flush();
+}
+
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+               struct page *page, bool writable)
+{
+       struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+       cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+       tlb_ubc->flush_required = true;
+
+       /*
+        * If the PTE was dirty then it's best to assume it's writable. The
+        * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
+        * before the page is queued for IO.
+        */
+       if (writable)
+               tlb_ubc->writable = true;
+}
+
+/*
+ * Returns true if the TLB flush should be deferred to the end of a batch of
+ * unmap operations to reduce IPIs.
+ */
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+       bool should_defer = false;
+
+       if (!(flags & TTU_BATCH_FLUSH))
+               return false;
+
+       /* If remote CPUs need to be flushed then defer the flush */
+       if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+               should_defer = true;
+       put_cpu();
+
+       return should_defer;
+}
+#else
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+               struct page *page, bool writable)
+{
+}
+
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+       return false;
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
 /*
  * At what user virtual address is page expected in vma?
  * Caller should check the page is actually part of the vma.
@@ -1220,7 +1323,20 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
        /* Nuke the page table entry. */
        flush_cache_page(vma, address, page_to_pfn(page));
-       pteval = ptep_clear_flush(vma, address, pte);
+       if (should_defer_flush(mm, flags)) {
+               /*
+                * We clear the PTE but do not flush so potentially a remote
+                * CPU could still be writing to the page. If the entry was
+                * previously clean then the architecture must guarantee that
+                * a clear->dirty transition on a cached TLB entry is written
+                * through and traps if the PTE is unmapped.
+                */
+               pteval = ptep_get_and_clear(mm, address, pte);
+
+               set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval));
+       } else {
+               pteval = ptep_clear_flush(vma, address, pte);
+       }
 
        /* Move the dirty bit to the physical page now the pte is gone. */
        if (pte_dirty(pteval))
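For illustration (not part of the patch shown here): the helpers above operate on current->tlb_ubc, a per-task batch introduced elsewhere in this series. A sketch of the structure they assume, with field names taken from the accesses above (the real definition may differ):

	struct tlbflush_unmap_batch {
		/* CPUs that may still hold TLB entries for the unmapped pages */
		struct cpumask cpumask;
		/* True if a deferred flush is pending for this task */
		bool flush_required;
		/* True if any deferred PTE was dirty, i.e. potentially writable */
		bool writable;
	};

try_to_unmap_flush() then sends at most one IPI per CPU in the mask rather than one per unmapped page, which is the point of the batching.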
index bbd0b47dc6a97eecea7650ce6b351e88d5a17295..60c936938b8486b1763c8f9477b479a5d4a54dc4 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3416,6 +3416,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 }
 EXPORT_SYMBOL(kmem_cache_alloc);
 
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+       __kmem_cache_free_bulk(s, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+                                                               void **p)
+{
+       return __kmem_cache_alloc_bulk(s, flags, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
 #ifdef CONFIG_TRACING
 void *
 kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
index 8da63e4e470f21b935e12f7dc5a47199cca704fe..a3a967d7d7c27f1fec35acdfb3e07eb6f33f6fc3 100644 (file)
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -163,6 +163,15 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
 ssize_t slabinfo_write(struct file *file, const char __user *buffer,
                       size_t count, loff_t *ppos);
 
+/*
+ * Generic implementation of bulk operations
+ * These are useful for situations in which the allocator cannot
+ * perform optimizations. In that case segments of the objects listed
+ * may be allocated or freed using these operations.
+ */
+void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
+bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+
 #ifdef CONFIG_MEMCG_KMEM
 /*
  * Iterate over all memcg caches of the given root cache. The caller must hold
@@ -321,7 +330,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
                return cachep;
 
        pr_err("%s: Wrong slab cache. %s but object is from %s\n",
-              __func__, cachep->name, s->name);
+              __func__, s->name, cachep->name);
        WARN_ON_ONCE(1);
        return s;
 }
index 86831105a09f44ffae37c074a6e5587c5b7056ce..c26829fe4e37ea0b38ce6c26fef84a7dcb528eee 100644 (file)
@@ -104,6 +104,29 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
 }
 #endif
 
+void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
+{
+       size_t i;
+
+       for (i = 0; i < nr; i++)
+               kmem_cache_free(s, p[i]);
+}
+
+bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
+                                                               void **p)
+{
+       size_t i;
+
+       for (i = 0; i < nr; i++) {
+               void *x = p[i] = kmem_cache_alloc(s, flags);
+               if (!x) {
+                       __kmem_cache_free_bulk(s, i, p);
+                       return false;
+               }
+       }
+       return true;
+}
+
 #ifdef CONFIG_MEMCG_KMEM
 void slab_init_memcg_params(struct kmem_cache *s)
 {
index 4765f65019c733a558b2cb6e5063db50e8dcf5ca..165bbd3cd60626e0aa8b0c98ba18c2d327af6117 100644 (file)
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -611,6 +611,19 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+       __kmem_cache_free_bulk(s, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+                                                               void **p)
+{
+       return __kmem_cache_alloc_bulk(s, flags, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
 int __kmem_cache_shutdown(struct kmem_cache *c)
 {
        /* No way to check for remaining objects */
index f68c0e50f3c083abe295a1dcd60668321a8f9232..084184e706c63184124bcf874cfe6702e0343950 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1306,6 +1306,17 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
        kasan_slab_free(s, x);
 }
 
+static void setup_object(struct kmem_cache *s, struct page *page,
+                               void *object)
+{
+       setup_object_debug(s, page, object);
+       if (unlikely(s->ctor)) {
+               kasan_unpoison_object_data(s, object);
+               s->ctor(object);
+               kasan_poison_object_data(s, object);
+       }
+}
+
 /*
  * Slab allocation and freeing
  */
@@ -1336,6 +1347,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
        struct page *page;
        struct kmem_cache_order_objects oo = s->oo;
        gfp_t alloc_gfp;
+       void *start, *p;
+       int idx, order;
 
        flags &= gfp_allowed_mask;
 
@@ -1349,6 +1362,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
         * so we fall-back to the minimum order allocation.
         */
        alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
+       if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min))
+               alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT;
 
        page = alloc_slab_page(s, alloc_gfp, node, oo);
        if (unlikely(!page)) {
@@ -1359,13 +1374,13 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
                 * Try a lower order alloc if possible
                 */
                page = alloc_slab_page(s, alloc_gfp, node, oo);
-
-               if (page)
-                       stat(s, ORDER_FALLBACK);
+               if (unlikely(!page))
+                       goto out;
+               stat(s, ORDER_FALLBACK);
        }
 
-       if (kmemcheck_enabled && page
-               && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
+       if (kmemcheck_enabled &&
+           !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
                int pages = 1 << oo_order(oo);
 
                kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
@@ -1380,51 +1395,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
                        kmemcheck_mark_unallocated_pages(page, pages);
        }
 
-       if (flags & __GFP_WAIT)
-               local_irq_disable();
-       if (!page)
-               return NULL;
-
        page->objects = oo_objects(oo);
-       mod_zone_page_state(page_zone(page),
-               (s->flags & SLAB_RECLAIM_ACCOUNT) ?
-               NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
-               1 << oo_order(oo));
-
-       return page;
-}
-
-static void setup_object(struct kmem_cache *s, struct page *page,
-                               void *object)
-{
-       setup_object_debug(s, page, object);
-       if (unlikely(s->ctor)) {
-               kasan_unpoison_object_data(s, object);
-               s->ctor(object);
-               kasan_poison_object_data(s, object);
-       }
-}
-
-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
-{
-       struct page *page;
-       void *start;
-       void *p;
-       int order;
-       int idx;
-
-       if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
-               pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
-               BUG();
-       }
-
-       page = allocate_slab(s,
-               flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
-       if (!page)
-               goto out;
 
        order = compound_order(page);
-       inc_slabs_node(s, page_to_nid(page), page->objects);
        page->slab_cache = s;
        __SetPageSlab(page);
        if (page_is_pfmemalloc(page))
@@ -1448,10 +1421,34 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
        page->freelist = start;
        page->inuse = page->objects;
        page->frozen = 1;
+
 out:
+       if (flags & __GFP_WAIT)
+               local_irq_disable();
+       if (!page)
+               return NULL;
+
+       mod_zone_page_state(page_zone(page),
+               (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+               NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
+               1 << oo_order(oo));
+
+       inc_slabs_node(s, page_to_nid(page), page->objects);
+
        return page;
 }
 
+static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+{
+       if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
+               pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
+               BUG();
+       }
+
+       return allocate_slab(s,
+               flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
+}
+
 static void __free_slab(struct kmem_cache *s, struct page *page)
 {
        int order = compound_order(page);
@@ -2712,7 +2709,7 @@ redo:
         * Determine the currently cpus per cpu slab.
         * The cpu may change afterward. However that does not matter since
         * data is retrieved via this pointer. If we are on the same cpu
-        * during the cmpxchg then the free will succedd.
+        * during the cmpxchg then the free will succeed.
         */
        do {
                tid = this_cpu_read(s->cpu_slab->tid);
@@ -2750,6 +2747,113 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
+/* Note that interrupts must be enabled when calling this function. */
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+       struct kmem_cache_cpu *c;
+       struct page *page;
+       int i;
+
+       local_irq_disable();
+       c = this_cpu_ptr(s->cpu_slab);
+
+       for (i = 0; i < size; i++) {
+               void *object = p[i];
+
+               BUG_ON(!object);
+               /* kmem cache debug support */
+               s = cache_from_obj(s, object);
+               if (unlikely(!s))
+                       goto exit;
+               slab_free_hook(s, object);
+
+               page = virt_to_head_page(object);
+
+               if (c->page == page) {
+                       /* Fastpath: local CPU free */
+                       set_freepointer(s, object, c->freelist);
+                       c->freelist = object;
+               } else {
+                       c->tid = next_tid(c->tid);
+                       local_irq_enable();
+                       /* Slowpath: overhead locked cmpxchg_double_slab */
+                       __slab_free(s, page, object, _RET_IP_);
+                       local_irq_disable();
+                       c = this_cpu_ptr(s->cpu_slab);
+               }
+       }
+exit:
+       c->tid = next_tid(c->tid);
+       local_irq_enable();
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+/* Note that interrupts must be enabled when calling this function. */
+bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+                          void **p)
+{
+       struct kmem_cache_cpu *c;
+       int i;
+
+       /*
+        * Drain objects in the per cpu slab, while disabling local
+        * IRQs, which protects against PREEMPT and interrupt
+        * handlers invoking the normal fastpath.
+        */
+       local_irq_disable();
+       c = this_cpu_ptr(s->cpu_slab);
+
+       for (i = 0; i < size; i++) {
+               void *object = c->freelist;
+
+               if (unlikely(!object)) {
+                       local_irq_enable();
+                       /*
+                        * Invoking the slow path likely has the side effect
+                        * of re-populating the per-CPU c->freelist.
+                        */
+                       p[i] = __slab_alloc(s, flags, NUMA_NO_NODE,
+                                           _RET_IP_, c);
+                       if (unlikely(!p[i])) {
+                               __kmem_cache_free_bulk(s, i, p);
+                               return false;
+                       }
+                       local_irq_disable();
+                       c = this_cpu_ptr(s->cpu_slab);
+                       continue; /* goto for-loop */
+               }
+
+               /* kmem_cache debug support */
+               s = slab_pre_alloc_hook(s, flags);
+               if (unlikely(!s)) {
+                       __kmem_cache_free_bulk(s, i, p);
+                       c->tid = next_tid(c->tid);
+                       local_irq_enable();
+                       return false;
+               }
+
+               c->freelist = get_freepointer(s, object);
+               p[i] = object;
+
+               /* kmem_cache debug support */
+               slab_post_alloc_hook(s, flags, object);
+       }
+       c->tid = next_tid(c->tid);
+       local_irq_enable();
+
+       /* Clear memory outside IRQ disabled fastpath loop */
+       if (unlikely(flags & __GFP_ZERO)) {
+               int j;
+
+               for (j = 0; j < i; j++)
+                       memset(p[j], 0, s->object_size);
+       }
+
+       return true;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
+
 /*
  * Object placement in a slab is made very easy because we always start at
  * offset 0. If we tune the size of the object to the alignment then we can
@@ -5181,7 +5285,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
        s->kobj.kset = cache_kset(s);
        err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
        if (err)
-               goto out_put_kobj;
+               goto out;
 
        err = sysfs_create_group(&s->kobj, &slab_attr_group);
        if (err)
@@ -5208,8 +5312,6 @@ out:
        return err;
 out_del_kobj:
        kobject_del(&s->kobj);
-out_put_kobj:
-       kobject_put(&s->kobj);
        goto out;
 }
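For illustration (not part of the patch): a caller sketch for the bulk API added above. kmem_cache_alloc_bulk() returns false on failure and, as the implementations above show, frees anything it already allocated, so the caller only has to check the return value. Interrupts must be enabled, per the comments on the SLUB fast paths. The cache and object count here are hypothetical.

	static int demo_bulk_usage(struct kmem_cache *cache)
	{
		void *objs[16];
		size_t nr = ARRAY_SIZE(objs);

		if (!kmem_cache_alloc_bulk(cache, GFP_KERNEL, nr, objs))
			return -ENOMEM;

		/* ... use the objects ... */

		kmem_cache_free_bulk(cache, nr, objs);
		return 0;
	}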
 
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
new file mode 100644 (file)
index 0000000..77fee93
--- /dev/null
@@ -0,0 +1,308 @@
+/*
+ *  mm/userfaultfd.c
+ *
+ *  Copyright (C) 2015  Red Hat, Inc.
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/mmu_notifier.h>
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+static int mcopy_atomic_pte(struct mm_struct *dst_mm,
+                           pmd_t *dst_pmd,
+                           struct vm_area_struct *dst_vma,
+                           unsigned long dst_addr,
+                           unsigned long src_addr,
+                           struct page **pagep)
+{
+       struct mem_cgroup *memcg;
+       pte_t _dst_pte, *dst_pte;
+       spinlock_t *ptl;
+       void *page_kaddr;
+       int ret;
+       struct page *page;
+
+       if (!*pagep) {
+               ret = -ENOMEM;
+               page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
+               if (!page)
+                       goto out;
+
+               page_kaddr = kmap_atomic(page);
+               ret = copy_from_user(page_kaddr,
+                                    (const void __user *) src_addr,
+                                    PAGE_SIZE);
+               kunmap_atomic(page_kaddr);
+
+               /* fallback to copy_from_user outside mmap_sem */
+               if (unlikely(ret)) {
+                       ret = -EFAULT;
+                       *pagep = page;
+                       /* don't free the page */
+                       goto out;
+               }
+       } else {
+               page = *pagep;
+               *pagep = NULL;
+       }
+
+       /*
+        * The memory barrier inside __SetPageUptodate makes sure that
+        * preceding stores to the page contents become visible before
+        * the set_pte_at() write.
+        */
+       __SetPageUptodate(page);
+
+       ret = -ENOMEM;
+       if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
+               goto out_release;
+
+       _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+       if (dst_vma->vm_flags & VM_WRITE)
+               _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+
+       ret = -EEXIST;
+       dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+       if (!pte_none(*dst_pte))
+               goto out_release_uncharge_unlock;
+
+       inc_mm_counter(dst_mm, MM_ANONPAGES);
+       page_add_new_anon_rmap(page, dst_vma, dst_addr);
+       mem_cgroup_commit_charge(page, memcg, false);
+       lru_cache_add_active_or_unevictable(page, dst_vma);
+
+       set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+       /* No need to invalidate - it was non-present before */
+       update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+       pte_unmap_unlock(dst_pte, ptl);
+       ret = 0;
+out:
+       return ret;
+out_release_uncharge_unlock:
+       pte_unmap_unlock(dst_pte, ptl);
+       mem_cgroup_cancel_charge(page, memcg);
+out_release:
+       page_cache_release(page);
+       goto out;
+}
+
+static int mfill_zeropage_pte(struct mm_struct *dst_mm,
+                             pmd_t *dst_pmd,
+                             struct vm_area_struct *dst_vma,
+                             unsigned long dst_addr)
+{
+       pte_t _dst_pte, *dst_pte;
+       spinlock_t *ptl;
+       int ret;
+
+       _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
+                                        dst_vma->vm_page_prot));
+       ret = -EEXIST;
+       dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+       if (!pte_none(*dst_pte))
+               goto out_unlock;
+       set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+       /* No need to invalidate - it was non-present before */
+       update_mmu_cache(dst_vma, dst_addr, dst_pte);
+       ret = 0;
+out_unlock:
+       pte_unmap_unlock(dst_pte, ptl);
+       return ret;
+}
+
+static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd = NULL;
+
+       pgd = pgd_offset(mm, address);
+       pud = pud_alloc(mm, pgd, address);
+       if (pud)
+               /*
+                * Note that pmd_alloc() does not necessarily allocate:
+                * *pmd may already be established, and it may even be a
+                * trans_huge_pmd.
+                */
+               pmd = pmd_alloc(mm, pud, address);
+       return pmd;
+}
+
+static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
+                                             unsigned long dst_start,
+                                             unsigned long src_start,
+                                             unsigned long len,
+                                             bool zeropage)
+{
+       struct vm_area_struct *dst_vma;
+       ssize_t err;
+       pmd_t *dst_pmd;
+       unsigned long src_addr, dst_addr;
+       long copied;
+       struct page *page;
+
+       /*
+        * Sanitize the command parameters:
+        */
+       BUG_ON(dst_start & ~PAGE_MASK);
+       BUG_ON(len & ~PAGE_MASK);
+
+       /* Does the address range wrap, or is the span zero-sized? */
+       BUG_ON(src_start + len <= src_start);
+       BUG_ON(dst_start + len <= dst_start);
+
+       src_addr = src_start;
+       dst_addr = dst_start;
+       copied = 0;
+       page = NULL;
+retry:
+       down_read(&dst_mm->mmap_sem);
+
+       /*
+        * Make sure the vma is not shared and that the dst range is
+        * both valid and fully within a single existing vma.
+        */
+       err = -EINVAL;
+       dst_vma = find_vma(dst_mm, dst_start);
+       if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+               goto out_unlock;
+       if (dst_start < dst_vma->vm_start ||
+           dst_start + len > dst_vma->vm_end)
+               goto out_unlock;
+
+       /*
+        * Be strict and only allow __mcopy_atomic on userfaultfd
+        * registered ranges to prevent userland errors going
+        * unnoticed. As far as the VM consistency is concerned, it
+        * would be perfectly safe to remove this check, but there's
+        * no useful usage for __mcopy_atomic outside of userfaultfd
+        * registered ranges. This is after all why these are ioctls
+        * belonging to the userfaultfd and not syscalls.
+        */
+       if (!dst_vma->vm_userfaultfd_ctx.ctx)
+               goto out_unlock;
+
+       /*
+        * FIXME: only allow copying on anonymous vmas, tmpfs should
+        * be added.
+        */
+       if (dst_vma->vm_ops)
+               goto out_unlock;
+
+       /*
+        * Ensure the dst_vma has an anon_vma or this page
+        * would get a NULL anon_vma when moved into the
+        * dst_vma.
+        */
+       err = -ENOMEM;
+       if (unlikely(anon_vma_prepare(dst_vma)))
+               goto out_unlock;
+
+       while (src_addr < src_start + len) {
+               pmd_t dst_pmdval;
+
+               BUG_ON(dst_addr >= dst_start + len);
+
+               dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
+               if (unlikely(!dst_pmd)) {
+                       err = -ENOMEM;
+                       break;
+               }
+
+               dst_pmdval = pmd_read_atomic(dst_pmd);
+               /*
+                * If the dst_pmd is mapped as THP don't
+                * override it and just be strict.
+                */
+               if (unlikely(pmd_trans_huge(dst_pmdval))) {
+                       err = -EEXIST;
+                       break;
+               }
+               if (unlikely(pmd_none(dst_pmdval)) &&
+                   unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd,
+                                        dst_addr))) {
+                       err = -ENOMEM;
+                       break;
+               }
+               /* If a huge pmd materialized from under us, fail */
+               if (unlikely(pmd_trans_huge(*dst_pmd))) {
+                       err = -EFAULT;
+                       break;
+               }
+
+               BUG_ON(pmd_none(*dst_pmd));
+               BUG_ON(pmd_trans_huge(*dst_pmd));
+
+               if (!zeropage)
+                       err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
+                                              dst_addr, src_addr, &page);
+               else
+                       err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
+                                                dst_addr);
+
+               cond_resched();
+
+               if (unlikely(err == -EFAULT)) {
+                       void *page_kaddr;
+
+                       up_read(&dst_mm->mmap_sem);
+                       BUG_ON(!page);
+
+                       page_kaddr = kmap(page);
+                       err = copy_from_user(page_kaddr,
+                                            (const void __user *) src_addr,
+                                            PAGE_SIZE);
+                       kunmap(page);
+                       if (unlikely(err)) {
+                               err = -EFAULT;
+                               goto out;
+                       }
+                       goto retry;
+               } else
+                       BUG_ON(page);
+
+               if (!err) {
+                       dst_addr += PAGE_SIZE;
+                       src_addr += PAGE_SIZE;
+                       copied += PAGE_SIZE;
+
+                       if (fatal_signal_pending(current))
+                               err = -EINTR;
+               }
+               if (err)
+                       break;
+       }
+
+out_unlock:
+       up_read(&dst_mm->mmap_sem);
+out:
+       if (page)
+               page_cache_release(page);
+       BUG_ON(copied < 0);
+       BUG_ON(err > 0);
+       BUG_ON(!copied && !err);
+       return copied ? copied : err;
+}
+
+ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
+                    unsigned long src_start, unsigned long len)
+{
+       return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
+}
+
+ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
+                      unsigned long len)
+{
+       return __mcopy_atomic(dst_mm, start, 0, len, true);
+}
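For illustration (not part of the patch): mcopy_atomic() and mfill_zeropage() are the backends for the userfaultfd UFFDIO_COPY and UFFDIO_ZEROPAGE ioctls. A hedged userspace sketch of resolving a fault with UFFDIO_COPY follows; it assumes `uffd` was obtained from the new userfaultfd() syscall and that the destination range was registered with UFFDIO_REGISTER beforehand.

	#include <linux/userfaultfd.h>
	#include <sys/ioctl.h>

	static int resolve_fault(int uffd, unsigned long dst_addr,
				 void *src_page, unsigned long page_size)
	{
		struct uffdio_copy copy = {
			.dst = dst_addr,
			.src = (unsigned long)src_page,
			.len = page_size,
			.mode = 0,
		};

		/* On success, copy.copy reports the number of bytes copied. */
		return ioctl(uffd, UFFDIO_COPY, &copy);
	}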
index 8286938c70ded6b82d4268174c92669a90eeb674..b1139039122a05389019aa569b82dcde442f25d7 100644 (file)
@@ -1057,7 +1057,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page) && mapping) {
-                       switch (try_to_unmap(page, ttu_flags)) {
+                       switch (try_to_unmap(page,
+                                       ttu_flags|TTU_BATCH_FLUSH)) {
                        case SWAP_FAIL:
                                goto activate_locked;
                        case SWAP_AGAIN:
@@ -1097,7 +1098,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        if (!sc->may_writepage)
                                goto keep_locked;
 
-                       /* Page is dirty, try to write it out here */
+                       /*
+                        * Page is dirty. Flush the TLB if a writable entry
+                        * potentially exists to avoid CPU writes after IO
+                        * starts, then write the page out here.
+                        */
+                       try_to_unmap_flush_dirty();
                        switch (pageout(page, mapping, sc)) {
                        case PAGE_KEEP:
                                goto keep_locked;
@@ -1208,6 +1214,7 @@ keep:
        }
 
        mem_cgroup_uncharge_list(&free_pages);
+       try_to_unmap_flush();
        free_hot_cold_page_list(&free_pages, true);
 
        list_splice(&ret_pages, page_list);
@@ -2151,6 +2158,23 @@ out:
        }
 }
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void init_tlb_ubc(void)
+{
+       /*
+        * This deliberately does not clear the cpumask as it's expensive
+        * and unnecessary. If there happens to be stale data in the mask
+        * then the first SWAP_CLUSTER_MAX pages will send an unnecessary
+        * IPI, after which the mask is cleared.
+        */
+       current->tlb_ubc.flush_required = false;
+}
+#else
+static inline void init_tlb_ubc(void)
+{
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
@@ -2185,6 +2209,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
        scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
                         sc->priority == DEF_PRIORITY);
 
+       init_tlb_ubc();
+
        blk_start_plug(&plug);
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                        nr[LRU_INACTIVE_FILE]) {
index f30329f726418bdc2e82bb5aced4c3634e215850..69a4d30a9ccf44900961e0691d942acfb4262201 100644 (file)
@@ -517,8 +517,11 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
        struct ceph_options *opt = client->options;
        size_t pos = m->count;
 
-       if (opt->name)
-               seq_printf(m, "name=%s,", opt->name);
+       if (opt->name) {
+               seq_puts(m, "name=");
+               seq_escape(m, opt->name, ", \t\n\\");
+               seq_putc(m, ',');
+       }
        if (opt->key)
                seq_puts(m, "secret=<hidden>,");
 
index 337ca851a350cc412532ff4d3c96d87a8605e785..b140c092d226edd7d207f077e511d828cc08a615 100644 (file)
@@ -297,7 +297,7 @@ static int rpc_complete_task(struct rpc_task *task)
        clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
        ret = atomic_dec_and_test(&task->tk_count);
        if (waitqueue_active(wq))
-               __wake_up_locked_key(wq, TASK_NORMAL, &k);
+               __wake_up_locked_key(wq, TASK_NORMAL, 1, &k);
        spin_unlock_irqrestore(&wq->lock, flags);
        return ret;
 }
index 9c4b3e2b7098a9d2af14c50614189c9a28cd689b..6d889de4e70b683a958dca75db40bee42f616fb9 100755 (executable)
@@ -1,6 +1,9 @@
 #!/bin/sh
 PARAM="-npro -kr -i8 -ts8 -sob -l80 -ss -ncs -cp1"
 RES=`indent --version`
+if [ "$RES" = "" ]; then
+       exit 1
+fi
 V1=`echo $RES | cut -d' ' -f3 | cut -d'.' -f1`
 V2=`echo $RES | cut -d' ' -f3 | cut -d'.' -f2`
 V3=`echo $RES | cut -d' ' -f3 | cut -d'.' -f3`
index 515c4c00e957cae89febefc38533415490ae88c8..00d6d53c2681dac3ce2f736baff7cbc177825b0e 100755 (executable)
@@ -14,11 +14,14 @@ declare -A cache
 
 parse_symbol() {
        # The structure of symbol at this point is:
-       #   [name]+[offset]/[total length]
+       #   ([name]+[offset]/[total length])
        #
        # For example:
        #   do_basic_setup+0x9c/0xbf
 
+       # Remove the enclosing parentheses
+       symbol=${symbol#\(}
+       symbol=${symbol%\)}
 
        # Strip the symbol name so that we could look it up
        local name=${symbol%+*}
index a7bf5f68aacb2f80417f31c6b6663d3d76a149f6..9a08fb5c1af640f1f65e5ae6a28cb8b95e6f0ad0 100755 (executable)
@@ -469,7 +469,7 @@ sub dump_section {
     } else {
 #      print STDERR "other section '$name' = '$contents'\n";
        if (defined($sections{$name}) && ($sections{$name} ne "")) {
-               print STDERR "Error(${file}:$.): duplicate section name '$name'\n";
+               print STDERR "${file}:$.: error: duplicate section name '$name'\n";
                ++$errors;
        }
        $sections{$name} = $contents;
@@ -1820,7 +1820,7 @@ sub dump_struct($$) {
                           });
     }
     else {
-       print STDERR "Error(${file}:$.): Cannot parse struct or union!\n";
+       print STDERR "${file}:$.: error: Cannot parse struct or union!\n";
        ++$errors;
     }
 }
@@ -1841,7 +1841,7 @@ sub dump_enum($$) {
            push @parameterlist, $arg;
            if (!$parameterdescs{$arg}) {
                $parameterdescs{$arg} = $undescribed;
-               print STDERR "Warning(${file}:$.): Enum value '$arg' ".
+               print STDERR "${file}:$.: warning: Enum value '$arg' ".
                    "not described in enum '$declaration_name'\n";
            }
 
@@ -1859,7 +1859,7 @@ sub dump_enum($$) {
                           });
     }
     else {
-       print STDERR "Error(${file}:$.): Cannot parse enum!\n";
+       print STDERR "${file}:$.: error: Cannot parse enum!\n";
        ++$errors;
     }
 }
@@ -1887,7 +1887,7 @@ sub dump_typedef($$) {
                           });
     }
     else {
-       print STDERR "Error(${file}:$.): Cannot parse typedef!\n";
+       print STDERR "${file}:$.: error: Cannot parse typedef!\n";
        ++$errors;
     }
 }
@@ -2019,11 +2019,11 @@ sub push_parameter($$$) {
            $parameterdescs{$param_name} = $undescribed;
 
            if (($type eq 'function') || ($type eq 'enum')) {
-               print STDERR "Warning(${file}:$.): Function parameter ".
+               print STDERR "${file}:$.: warning: Function parameter ".
                    "or member '$param' not " .
                    "described in '$declaration_name'\n";
            }
-           print STDERR "Warning(${file}:$.):" .
+           print STDERR "${file}:$.: warning:" .
                         " No description found for parameter '$param'\n";
            ++$warnings;
        }
@@ -2074,14 +2074,14 @@ sub check_sections($$$$$$) {
                }
                if ($err) {
                        if ($decl_type eq "function") {
-                               print STDERR "Warning(${file}:$.): " .
+                               print STDERR "${file}:$.: warning: " .
                                        "Excess function parameter " .
                                        "'$sects[$sx]' " .
                                        "description in '$decl_name'\n";
                                ++$warnings;
                        } else {
                                if ($nested !~ m/\Q$sects[$sx]\E/) {
-                                   print STDERR "Warning(${file}:$.): " .
+                                   print STDERR "${file}:$.: warning: " .
                                        "Excess struct/union/enum/typedef member " .
                                        "'$sects[$sx]' " .
                                        "description in '$decl_name'\n";
@@ -2107,7 +2107,7 @@ sub check_return_section {
 
         if (!defined($sections{$section_return}) ||
             $sections{$section_return} eq "") {
-                print STDERR "Warning(${file}:$.): " .
+                print STDERR "${file}:$.: warning: " .
                         "No description found for return value of " .
                         "'$declaration_name'\n";
                 ++$warnings;
@@ -2186,7 +2186,7 @@ sub dump_function($$) {
 
        create_parameterlist($args, ',', $file);
     } else {
-       print STDERR "Warning(${file}:$.): cannot understand function prototype: '$prototype'\n";
+       print STDERR "${file}:$.: warning: cannot understand function prototype: '$prototype'\n";
        return;
     }
 
@@ -2251,7 +2251,7 @@ sub tracepoint_munge($) {
                $tracepointargs = $1;
        }
        if (($tracepointname eq 0) || ($tracepointargs eq 0)) {
-               print STDERR "Warning(${file}:$.): Unrecognized tracepoint format: \n".
+               print STDERR "${file}:$.: warning: Unrecognized tracepoint format: \n".
                             "$prototype\n";
        } else {
                $prototype = "static inline void trace_$tracepointname($tracepointargs)";
@@ -2450,7 +2450,7 @@ sub process_file($) {
                }
 
                if (($declaration_purpose eq "") && $verbose) {
-                       print STDERR "Warning(${file}:$.): missing initial short description on line:\n";
+                       print STDERR "${file}:$.: warning: missing initial short description on line:\n";
                        print STDERR $_;
                        ++$warnings;
                }
@@ -2468,10 +2468,10 @@ sub process_file($) {
                }
 
                if ($verbose) {
-                   print STDERR "Info(${file}:$.): Scanning doc for $identifier\n";
+                   print STDERR "${file}:$.: info: Scanning doc for $identifier\n";
                }
            } else {
-               print STDERR "Warning(${file}:$.): Cannot understand $_ on line $.",
+               print STDERR "${file}:$.: warning: Cannot understand $_ on line $.",
                " - I thought it was a doc line\n";
                ++$warnings;
                $state = 0;
@@ -2483,7 +2483,7 @@ sub process_file($) {
 
                if (($contents ne "") && ($contents ne "\n")) {
                    if (!$in_doc_sect && $verbose) {
-                       print STDERR "Warning(${file}:$.): contents before sections\n";
+                       print STDERR "${file}:$.: warning: contents before sections\n";
                        ++$warnings;
                    }
                    dump_section($file, $section, xml_escape($contents));
@@ -2509,7 +2509,7 @@ sub process_file($) {
                }
                # look for doc_com + <text> + doc_end:
                if ($_ =~ m'\s*\*\s*[a-zA-Z_0-9:\.]+\*/') {
-                   print STDERR "Warning(${file}:$.): suspicious ending line: $_";
+                   print STDERR "${file}:$.: warning: suspicious ending line: $_";
                    ++$warnings;
                }
 
@@ -2539,7 +2539,7 @@ sub process_file($) {
                }
            } else {
                # i dont know - bad line?  ignore.
-               print STDERR "Warning(${file}:$.): bad line: $_";
+               print STDERR "${file}:$.: warning: bad line: $_";
                ++$warnings;
            }
        } elsif ($state == 5) { # scanning for split parameters
@@ -2631,7 +2631,7 @@ sub process_file($) {
        }
     }
     if ($initial_section_counter == $section_counter) {
-       print STDERR "Warning(${file}): no structured comments found\n";
+       print STDERR "${file}:1: warning: no structured comments found\n";
        if (($function_only == 1) && ($show_not_found == 1)) {
            print STDERR "    Was looking for '$_'.\n" for keys %function_table;
        }
index bb8e4d0a19119dd1bd8b118c22a74eccc8337502..946caf3bd694ea4f41c92d8b0e51ac996f8b98e7 100644 (file)
@@ -32,6 +32,7 @@ accoring||according
 accout||account
 accquire||acquire
 accquired||acquired
+accross||across
 acessable||accessible
 acess||access
 achitecture||architecture
@@ -100,8 +101,10 @@ appropiate||appropriate
 appropriatly||appropriately
 approriate||appropriate
 approriately||appropriately
+apropriate||appropriate
 aquainted||acquainted
 aquired||acquired
+aquisition||acquisition
 arbitary||arbitrary
 architechture||architecture
 arguement||argument
@@ -111,6 +114,8 @@ arne't||aren't
 arraival||arrival
 artifical||artificial
 artillary||artillery
+asign||assign
+assertation||assertion
 assiged||assigned
 assigment||assignment
 assigments||assignments
@@ -136,6 +141,7 @@ automatize||automate
 automatized||automated
 automatizes||automates
 autonymous||autonomous
+auxillary||auxiliary
 auxilliary||auxiliary
 avaiable||available
 avaible||available
@@ -187,6 +193,7 @@ capatibilities||capabilities
 carefuly||carefully
 cariage||carriage
 catagory||category
+cehck||check
 challange||challenge
 challanges||challenges
 chanell||channel
@@ -199,6 +206,8 @@ charactor||character
 charater||character
 charaters||characters
 charcter||character
+chcek||check
+chck||check
 checksuming||checksumming
 childern||children
 childs||children
@@ -231,6 +240,8 @@ compatability||compatibility
 compatable||compatible
 compatibiliy||compatibility
 compatibilty||compatibility
+compatiblity||compatibility
+competion||completion
 compilant||compliant
 compleatly||completely
 completly||completely
@@ -291,6 +302,7 @@ defferred||deferred
 definate||definite
 definately||definitely
 defintion||definition
+defintions||definitions
 defualt||default
 defult||default
 deivce||device
@@ -306,6 +318,7 @@ depreacted||deprecated
 depreacte||deprecate
 desactivate||deactivate
 desciptors||descriptors
+descripton||description
 descrition||description
 descritptor||descriptor
 desctiptor||descriptor
@@ -327,6 +340,7 @@ devided||divided
 deviece||device
 diable||disable
 dictionnary||dictionary
+didnt||didn't
 diferent||different
 differrence||difference
 difinition||definition
@@ -344,6 +358,7 @@ docuentation||documentation
 documantation||documentation
 documentaion||documentation
 documment||document
+doesnt||doesn't
 dorp||drop
 dosen||doesn
 downlad||download
@@ -450,11 +465,13 @@ grahical||graphical
 grahpical||graphical
 grapic||graphic
 guage||gauge
+guarenteed||guaranteed
 guarentee||guarantee
 halfs||halves
 hander||handler
 handfull||handful
 hanled||handled
+happend||happened
 harware||hardware
 heirarchically||hierarchically
 helpfull||helpful
@@ -512,6 +529,7 @@ initialzed||initialized
 initilization||initialization
 initilize||initialize
 inofficial||unofficial
+insititute||institute
 instal||install
 inteface||interface
 integreated||integrated
@@ -546,6 +564,7 @@ invididual||individual
 invokation||invocation
 invokations||invocations
 irrelevent||irrelevant
+isnt||isn't
 isssue||issue
 itslef||itself
 jave||java
@@ -558,6 +577,7 @@ langauage||language
 langauge||language
 langugage||language
 lauch||launch
+layed||laid
 leightweight||lightweight
 lengh||length
 lenght||length
@@ -714,6 +734,7 @@ preceeding||preceding
 preceed||precede
 precendence||precedence
 precission||precision
+preemptable||preemptible
 prefered||preferred
 prefferably||preferably
 premption||preemption
@@ -744,6 +765,7 @@ programers||programmers
 programm||program
 programms||programs
 progresss||progress
+promiscous||promiscuous
 promps||prompts
 pronnounced||pronounced
 prononciation||pronunciation
@@ -817,6 +839,7 @@ reseting||resetting
 resizeable||resizable
 resouces||resources
 resoures||resources
+responce||response
 ressizes||resizes
 ressource||resource
 ressources||resources
@@ -869,6 +892,7 @@ setts||sets
 settting||setting
 shotdown||shutdown
 shoud||should
+shouldnt||shouldn't
 shoule||should
 shrinked||shrunk
 siginificantly||significantly
@@ -913,9 +937,11 @@ straming||streaming
 struc||struct
 structres||structures
 stuct||struct
+stucture||structure
 sturcture||structure
 subdirectoires||subdirectories
 suble||subtle
+substract||subtract
 succesfully||successfully
 succesful||successful
 successfull||successful
@@ -987,6 +1013,7 @@ unexpectd||unexpected
 unexpeted||unexpected
 unfortunatelly||unfortunately
 unifiy||unify
+unintialized||uninitialized
 unknonw||unknown
 unknow||unknown
 unkown||unknown
@@ -1027,7 +1054,9 @@ virtiual||virtual
 visiters||visitors
 vitual||virtual
 wating||waiting
+wether||whether
 whataver||whatever
+whcih||which
 whenver||whenever
 wheter||whether
 whe||when
index d103f5a4043d3ec6b780dacb8ca496c28ebf9c8a..1832cf701c3d6d44d90adeb278bd04bca489d274 100644 (file)
@@ -267,6 +267,16 @@ int cap_capset(struct cred *new,
        new->cap_effective   = *effective;
        new->cap_inheritable = *inheritable;
        new->cap_permitted   = *permitted;
+
+       /*
+        * Mask off ambient bits that are no longer both permitted and
+        * inheritable.
+        */
+       new->cap_ambient = cap_intersect(new->cap_ambient,
+                                        cap_intersect(*permitted,
+                                                      *inheritable));
+       if (WARN_ON(!cap_ambient_invariant_ok(new)))
+               return -EINVAL;
        return 0;
 }
 
@@ -347,6 +357,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
 
                /*
                 * pP' = (X & fP) | (pI & fI)
+                * The addition of pA' is handled later.
                 */
                new->cap_permitted.cap[i] =
                        (new->cap_bset.cap[i] & permitted) |
@@ -474,10 +485,13 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
 {
        const struct cred *old = current_cred();
        struct cred *new = bprm->cred;
-       bool effective, has_cap = false;
+       bool effective, has_cap = false, is_setid;
        int ret;
        kuid_t root_uid;
 
+       if (WARN_ON(!cap_ambient_invariant_ok(old)))
+               return -EPERM;
+
        effective = false;
        ret = get_file_caps(bprm, &effective, &has_cap);
        if (ret < 0)
@@ -522,8 +536,9 @@ skip:
         *
         * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
         */
-       if ((!uid_eq(new->euid, old->uid) ||
-            !gid_eq(new->egid, old->gid) ||
+       is_setid = !uid_eq(new->euid, old->uid) || !gid_eq(new->egid, old->gid);
+
+       if ((is_setid ||
             !cap_issubset(new->cap_permitted, old->cap_permitted)) &&
            bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
                /* downgrade; they get no more than they had, and maybe less */
@@ -539,10 +554,28 @@ skip:
        new->suid = new->fsuid = new->euid;
        new->sgid = new->fsgid = new->egid;
 
+       /* File caps or setid cancels ambient. */
+       if (has_cap || is_setid)
+               cap_clear(new->cap_ambient);
+
+       /*
+        * Now that we've computed pA', update pP' to give:
+        *   pP' = (X & fP) | (pI & fI) | pA'
+        */
+       new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);
+
+       /*
+        * Set pE' = (fE ? pP' : pA').  Because pA' is zero if fE is set,
+        * this is the same as pE' = (fE ? pP' : 0) | pA'.
+        */
        if (effective)
                new->cap_effective = new->cap_permitted;
        else
-               cap_clear(new->cap_effective);
+               new->cap_effective = new->cap_ambient;
+
+       if (WARN_ON(!cap_ambient_invariant_ok(new)))
+               return -EPERM;
+
        bprm->cap_effective = effective;
 
        /*
@@ -557,7 +590,7 @@ skip:
         * Number 1 above might fail if you don't have a full bset, but I think
         * that is interesting information to audit.
         */
-       if (!cap_isclear(new->cap_effective)) {
+       if (!cap_issubset(new->cap_effective, new->cap_ambient)) {
                if (!cap_issubset(CAP_FULL_SET, new->cap_effective) ||
                    !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) ||
                    issecure(SECURE_NOROOT)) {
@@ -568,6 +601,10 @@ skip:
        }
 
        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
+
+       if (WARN_ON(!cap_ambient_invariant_ok(new)))
+               return -EPERM;
+
        return 0;
 }
 
@@ -589,7 +626,7 @@ int cap_bprm_secureexec(struct linux_binprm *bprm)
        if (!uid_eq(cred->uid, root_uid)) {
                if (bprm->cap_effective)
                        return 1;
-               if (!cap_isclear(cred->cap_permitted))
+               if (!cap_issubset(cred->cap_permitted, cred->cap_ambient))
                        return 1;
        }
 
@@ -691,10 +728,18 @@ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
             uid_eq(old->suid, root_uid)) &&
            (!uid_eq(new->uid, root_uid) &&
             !uid_eq(new->euid, root_uid) &&
-            !uid_eq(new->suid, root_uid)) &&
-           !issecure(SECURE_KEEP_CAPS)) {
-               cap_clear(new->cap_permitted);
-               cap_clear(new->cap_effective);
+            !uid_eq(new->suid, root_uid))) {
+               if (!issecure(SECURE_KEEP_CAPS)) {
+                       cap_clear(new->cap_permitted);
+                       cap_clear(new->cap_effective);
+               }
+
+               /*
+                * Pre-ambient programs expect setresuid to nonroot followed
+                * by exec to drop capabilities.  We should make sure that
+                * this remains the case.
+                */
+               cap_clear(new->cap_ambient);
        }
        if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
                cap_clear(new->cap_effective);
@@ -924,6 +969,44 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
                return commit_creds(new);
 
+       case PR_CAP_AMBIENT:
+               if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
+                       if (arg3 | arg4 | arg5)
+                               return -EINVAL;
+
+                       new = prepare_creds();
+                       if (!new)
+                               return -ENOMEM;
+                       cap_clear(new->cap_ambient);
+                       return commit_creds(new);
+               }
+
+               if (((!cap_valid(arg3)) | arg4 | arg5))
+                       return -EINVAL;
+
+               if (arg2 == PR_CAP_AMBIENT_IS_SET) {
+                       return !!cap_raised(current_cred()->cap_ambient, arg3);
+               } else if (arg2 != PR_CAP_AMBIENT_RAISE &&
+                          arg2 != PR_CAP_AMBIENT_LOWER) {
+                       return -EINVAL;
+               } else {
+                       if (arg2 == PR_CAP_AMBIENT_RAISE &&
+                           (!cap_raised(current_cred()->cap_permitted, arg3) ||
+                            !cap_raised(current_cred()->cap_inheritable,
+                                        arg3) ||
+                            issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
+                               return -EPERM;
+
+                       new = prepare_creds();
+                       if (!new)
+                               return -ENOMEM;
+                       if (arg2 == PR_CAP_AMBIENT_RAISE)
+                               cap_raise(new->cap_ambient, arg3);
+                       else
+                               cap_lower(new->cap_ambient, arg3);
+                       return commit_creds(new);
+               }
+
        default:
                /* No functionality available - continue with default */
                return -ENOSYS;
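
For reference, a minimal userspace sketch (not part of this patch) of the prctl interface the hunk above adds. It assumes CAP_NET_BIND_SERVICE is already present in the caller's permitted and inheritable sets (for example, granted by a capability-aware launcher); otherwise the RAISE step fails with EPERM, exactly as the selftest further below checks.

#include <linux/capability.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

#ifndef PR_CAP_AMBIENT
#define PR_CAP_AMBIENT			47
# define PR_CAP_AMBIENT_IS_SET		1
# define PR_CAP_AMBIENT_RAISE		2
#endif

int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <program> [args...]\n", argv[0]);
		return 1;
	}

	/*
	 * Assumes CAP_NET_BIND_SERVICE is already in pP and pI;
	 * otherwise the kernel returns EPERM here.
	 */
	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
		  CAP_NET_BIND_SERVICE, 0, 0) != 0) {
		perror("PR_CAP_AMBIENT_RAISE");
		return 1;
	}

	/* The raised bit lives in pA and survives execve of a plain binary. */
	execvp(argv[1], argv + 1);
	perror("execvp");
	return 1;
}

Because the bit is carried in pA rather than pP, it is preserved across execve of a non-setid, non-file-caps binary, which is the point of the feature; setid or file capabilities on the target still cancel it, as the hunk above shows.
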
index bd536cb221e237c6cab654b4d4d927daf3f27340..43b4cddbf2b39ebd838e3849a1e59e9c7c22c945 100644 (file)
@@ -848,6 +848,7 @@ void key_change_session_keyring(struct callback_head *twork)
        new->cap_inheritable    = old->cap_inheritable;
        new->cap_permitted      = old->cap_permitted;
        new->cap_effective      = old->cap_effective;
+       new->cap_ambient        = old->cap_ambient;
        new->cap_bset           = old->cap_bset;
 
        new->jit_keyring        = old->jit_keyring;
index 564079c5c49dce530f56fd0626827d81c0ec75d4..cdf4c589a3914bbc7d315b1e55fb1264f4be0309 100644 (file)
@@ -1100,7 +1100,7 @@ static void selinux_write_opts(struct seq_file *m,
                seq_puts(m, prefix);
                if (has_comma)
                        seq_putc(m, '\"');
-               seq_puts(m, opts->mnt_opts[i]);
+               seq_escape(m, opts->mnt_opts[i], "\"\n\\");
                if (has_comma)
                        seq_putc(m, '\"');
        }
diff --git a/tools/testing/selftests/capabilities/.gitignore b/tools/testing/selftests/capabilities/.gitignore
new file mode 100644 (file)
index 0000000..b732dd0
--- /dev/null
@@ -0,0 +1,2 @@
+test_execve
+validate_cap
diff --git a/tools/testing/selftests/capabilities/Makefile b/tools/testing/selftests/capabilities/Makefile
new file mode 100644 (file)
index 0000000..8c8f0c1
--- /dev/null
@@ -0,0 +1,18 @@
+all:
+
+include ../lib.mk
+
+.PHONY: all clean
+
+TARGETS := validate_cap test_execve
+TEST_PROGS := test_execve
+
+CFLAGS := -O2 -g -std=gnu99 -Wall
+
+all: $(TARGETS)
+
+clean:
+       $(RM) $(TARGETS)
+
+$(TARGETS): %: %.c
+       $(CC) -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lcap-ng
diff --git a/tools/testing/selftests/capabilities/test_execve.c b/tools/testing/selftests/capabilities/test_execve.c
new file mode 100644 (file)
index 0000000..10a21a9
--- /dev/null
@@ -0,0 +1,427 @@
+#define _GNU_SOURCE
+
+#include <cap-ng.h>
+#include <err.h>
+#include <linux/capability.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <sched.h>
+#include <sys/mount.h>
+#include <limits.h>
+#include <libgen.h>
+#include <malloc.h>
+#include <sys/wait.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+
+#ifndef PR_CAP_AMBIENT
+#define PR_CAP_AMBIENT                 47
+# define PR_CAP_AMBIENT_IS_SET         1
+# define PR_CAP_AMBIENT_RAISE          2
+# define PR_CAP_AMBIENT_LOWER          3
+# define PR_CAP_AMBIENT_CLEAR_ALL      4
+#endif
+
+static int nerrs;
+
+static void vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap)
+{
+       char buf[4096];
+       int fd;
+       ssize_t written;
+       int buf_len;
+
+       buf_len = vsnprintf(buf, sizeof(buf), fmt, ap);
+       if (buf_len < 0) {
+               err(1, "vsnprintf failed");
+       }
+       if (buf_len >= sizeof(buf)) {
+               errx(1, "vsnprintf output truncated");
+       }
+
+       fd = open(filename, O_WRONLY);
+       if (fd < 0) {
+               if ((errno == ENOENT) && enoent_ok)
+                       return;
+               err(1, "open of %s failed", filename);
+       }
+       written = write(fd, buf, buf_len);
+       if (written != buf_len) {
+               if (written >= 0) {
+                       errx(1, "short write to %s", filename);
+               } else {
+                       err(1, "write to %s failed", filename);
+               }
+       }
+       if (close(fd) != 0) {
+               err(1, "close of %s failed", filename);
+       }
+}
+
+static void maybe_write_file(char *filename, char *fmt, ...)
+{
+       va_list ap;
+
+       va_start(ap, fmt);
+       vmaybe_write_file(true, filename, fmt, ap);
+       va_end(ap);
+}
+
+static void write_file(char *filename, char *fmt, ...)
+{
+       va_list ap;
+
+       va_start(ap, fmt);
+       vmaybe_write_file(false, filename, fmt, ap);
+       va_end(ap);
+}
+
+static bool create_and_enter_ns(uid_t inner_uid)
+{
+       uid_t outer_uid;
+       gid_t outer_gid;
+       int i;
+       bool have_outer_privilege;
+
+       outer_uid = getuid();
+       outer_gid = getgid();
+
+       /*
+        * TODO: If we're already root, we could skip creating the userns.
+        */
+
+       if (unshare(CLONE_NEWNS) == 0) {
+               printf("[NOTE]\tUsing global UIDs for tests\n");
+               if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) != 0)
+                       err(1, "PR_SET_KEEPCAPS");
+               if (setresuid(inner_uid, inner_uid, -1) != 0)
+                       err(1, "setresuid");
+
+               // Re-enable effective caps
+               capng_get_caps_process();
+               for (i = 0; i < CAP_LAST_CAP; i++)
+                       if (capng_have_capability(CAPNG_PERMITTED, i))
+                               capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, i);
+               if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+                       err(1, "capng_apply");
+
+               have_outer_privilege = true;
+       } else if (unshare(CLONE_NEWUSER | CLONE_NEWNS) == 0) {
+               printf("[NOTE]\tUsing a user namespace for tests\n");
+               maybe_write_file("/proc/self/setgroups", "deny");
+               write_file("/proc/self/uid_map", "%d %d 1", inner_uid, outer_uid);
+               write_file("/proc/self/gid_map", "0 %d 1", outer_gid);
+
+               have_outer_privilege = false;
+       } else {
+               errx(1, "must be root or be able to create a userns");
+       }
+
+       if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0)
+               err(1, "remount everything private");
+
+       return have_outer_privilege;
+}
+
+static void chdir_to_tmpfs(void)
+{
+       char cwd[PATH_MAX];
+       if (getcwd(cwd, sizeof(cwd)) != cwd)
+               err(1, "getcwd");
+
+       if (mount("private_tmp", ".", "tmpfs", 0, "mode=0777") != 0)
+               err(1, "mount private tmpfs");
+
+       if (chdir(cwd) != 0)
+               err(1, "chdir to private tmpfs");
+
+       if (umount2(".", MNT_DETACH) != 0)
+               err(1, "detach private tmpfs");
+}
+
+static void copy_fromat_to(int fromfd, const char *fromname, const char *toname)
+{
+       int from = openat(fromfd, fromname, O_RDONLY);
+       if (from == -1)
+               err(1, "open copy source");
+
+       int to = open(toname, O_CREAT | O_WRONLY | O_EXCL, 0700);
+       if (to == -1)
+               err(1, "open copy destination");
+
+       while (true) {
+               char buf[4096];
+               ssize_t sz = read(from, buf, sizeof(buf));
+               if (sz == 0)
+                       break;
+               if (sz < 0)
+                       err(1, "read");
+
+               if (write(to, buf, sz) != sz)
+                       err(1, "write");        /* no short writes on tmpfs */
+       }
+
+       close(from);
+       close(to);
+}
+
+static bool fork_wait(void)
+{
+       pid_t child = fork();
+       if (child == 0) {
+               nerrs = 0;
+               return true;
+       } else if (child > 0) {
+               int status;
+               if (waitpid(child, &status, 0) != child ||
+                   !WIFEXITED(status)) {
+                       printf("[FAIL]\tChild died\n");
+                       nerrs++;
+               } else if (WEXITSTATUS(status) != 0) {
+                       printf("[FAIL]\tChild failed\n");
+                       nerrs++;
+               } else {
+                       printf("[OK]\tChild succeeded\n");
+               }
+
+               return false;
+       } else {
+               err(1, "fork");
+       }
+}
+
+static void exec_other_validate_cap(const char *name,
+                                   bool eff, bool perm, bool inh, bool ambient)
+{
+       execl(name, name, (eff ? "1" : "0"),
+             (perm ? "1" : "0"), (inh ? "1" : "0"), (ambient ? "1" : "0"),
+             NULL);
+       err(1, "execl");
+}
+
+static void exec_validate_cap(bool eff, bool perm, bool inh, bool ambient)
+{
+       exec_other_validate_cap("./validate_cap", eff, perm, inh, ambient);
+}
+
+static int do_tests(int uid, const char *our_path)
+{
+       bool have_outer_privilege = create_and_enter_ns(uid);
+
+       int ourpath_fd = open(our_path, O_RDONLY | O_DIRECTORY);
+       if (ourpath_fd == -1)
+               err(1, "open '%s'", our_path);
+
+       chdir_to_tmpfs();
+
+       copy_fromat_to(ourpath_fd, "validate_cap", "validate_cap");
+
+       if (have_outer_privilege) {
+               uid_t gid = getegid();
+
+               copy_fromat_to(ourpath_fd, "validate_cap",
+                              "validate_cap_suidroot");
+               if (chown("validate_cap_suidroot", 0, -1) != 0)
+                       err(1, "chown");
+               if (chmod("validate_cap_suidroot", S_ISUID | 0700) != 0)
+                       err(1, "chmod");
+
+               copy_fromat_to(ourpath_fd, "validate_cap",
+                              "validate_cap_suidnonroot");
+               if (chown("validate_cap_suidnonroot", uid + 1, -1) != 0)
+                       err(1, "chown");
+               if (chmod("validate_cap_suidnonroot", S_ISUID | 0700) != 0)
+                       err(1, "chmod");
+
+               copy_fromat_to(ourpath_fd, "validate_cap",
+                              "validate_cap_sgidroot");
+               if (chown("validate_cap_sgidroot", -1, 0) != 0)
+                       err(1, "chown");
+               if (chmod("validate_cap_sgidroot", S_ISGID | 0710) != 0)
+                       err(1, "chmod");
+
+               copy_fromat_to(ourpath_fd, "validate_cap",
+                              "validate_cap_sgidnonroot");
+               if (chown("validate_cap_sgidnonroot", -1, gid + 1) != 0)
+                       err(1, "chown");
+               if (chmod("validate_cap_sgidnonroot", S_ISGID | 0710) != 0)
+                       err(1, "chmod");
+       }
+
+       capng_get_caps_process();
+
+       /* Make sure that the inheritable bit (i) starts out clear */
+       capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
+       if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+               err(1, "capng_apply");
+
+       if (uid == 0) {
+               printf("[RUN]\tRoot => ep\n");
+               if (fork_wait())
+                       exec_validate_cap(true, true, false, false);
+       } else {
+               printf("[RUN]\tNon-root => no caps\n");
+               if (fork_wait())
+                       exec_validate_cap(false, false, false, false);
+       }
+
+       printf("[OK]\tCheck cap_ambient manipulation rules\n");
+
+       /* We should not be able to add ambient caps yet. */
+       if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != -1 || errno != EPERM) {
+               if (errno == EINVAL)
+                       printf("[FAIL]\tPR_CAP_AMBIENT_RAISE isn't supported\n");
+               else
+                       printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed eith EPERM on a non-inheritable cap\n");
+               return 1;
+       }
+       printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-inheritable cap\n");
+
+       capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_RAW);
+       capng_update(CAPNG_DROP, CAPNG_PERMITTED, CAP_NET_RAW);
+       capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, CAP_NET_RAW);
+       if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+               err(1, "capng_apply");
+       if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_RAW, 0, 0, 0) != -1 || errno != EPERM) {
+               printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed on a non-permitted cap\n");
+               return 1;
+       }
+       printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-permitted cap\n");
+
+       capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
+       if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+               err(1, "capng_apply");
+       if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) {
+               printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have succeeded\n");
+               return 1;
+       }
+       printf("[OK]\tPR_CAP_AMBIENT_RAISE worked\n");
+
+       if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 1) {
+               printf("[FAIL]\tPR_CAP_AMBIENT_IS_SET is broken\n");
+               return 1;
+       }
+
+       if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0, 0) != 0)
+               err(1, "PR_CAP_AMBIENT_CLEAR_ALL");
+
+       if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) {
+               printf("[FAIL]\tPR_CAP_AMBIENT_CLEAR_ALL didn't work\n");
+               return 1;
+       }
+
+       if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0)
+               err(1, "PR_CAP_AMBIENT_RAISE");
+
+       capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
+       if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+               err(1, "capng_apply");
+
+       if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) {
+               printf("[FAIL]\tDropping I should have dropped A\n");
+               return 1;
+       }
+
+       printf("[OK]\tBasic manipulation appears to work\n");
+
+       capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
+       if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+               err(1, "capng_apply");
+       if (uid == 0) {
+               printf("[RUN]\tRoot +i => eip\n");
+               if (fork_wait())
+                       exec_validate_cap(true, true, true, false);
+       } else {
+               printf("[RUN]\tNon-root +i => i\n");
+               if (fork_wait())
+                       exec_validate_cap(false, false, true, false);
+       }
+
+       if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0)
+               err(1, "PR_CAP_AMBIENT_RAISE");
+
+       printf("[RUN]\tUID %d +ia => eipa\n", uid);
+       if (fork_wait())
+               exec_validate_cap(true, true, true, true);
+
+       /* The remaining tests need real privilege */
+
+       if (!have_outer_privilege) {
+               printf("[SKIP]\tSUID/SGID tests (needs privilege)\n");
+               goto done;
+       }
+
+       if (uid == 0) {
+               printf("[RUN]\tRoot +ia, suidroot => eipa\n");
+               if (fork_wait())
+                       exec_other_validate_cap("./validate_cap_suidroot",
+                                               true, true, true, true);
+
+               printf("[RUN]\tRoot +ia, suidnonroot => ip\n");
+               if (fork_wait())
+                       exec_other_validate_cap("./validate_cap_suidnonroot",
+                                               false, true, true, false);
+
+               printf("[RUN]\tRoot +ia, sgidroot => eipa\n");
+               if (fork_wait())
+                       exec_other_validate_cap("./validate_cap_sgidroot",
+                                               true, true, true, true);
+
+               if (fork_wait()) {
+                       printf("[RUN]\tRoot, gid != 0, +ia, sgidroot => eip\n");
+                       if (setresgid(1, 1, 1) != 0)
+                               err(1, "setresgid");
+                       exec_other_validate_cap("./validate_cap_sgidroot",
+                                               true, true, true, false);
+               }
+
+               printf("[RUN]\tRoot +ia, sgidnonroot => eip\n");
+               if (fork_wait())
+                       exec_other_validate_cap("./validate_cap_sgidnonroot",
+                                               true, true, true, false);
+       } else {
+               printf("[RUN]\tNon-root +ia, sgidnonroot => i\n");
+               exec_other_validate_cap("./validate_cap_sgidnonroot",
+                                               false, false, true, false);
+
+               if (fork_wait()) {
+                       printf("[RUN]\tNon-root +ia, sgidroot => i\n");
+                       if (setresgid(1, 1, 1) != 0)
+                               err(1, "setresgid");
+                       exec_other_validate_cap("./validate_cap_sgidroot",
+                                               false, false, true, false);
+               }
+       }
+
+done:
+       return nerrs ? 1 : 0;
+}
+
+int main(int argc, char **argv)
+{
+       char *tmp1, *tmp2, *our_path;
+
+       /* Find our path */
+       tmp1 = strdup(argv[0]);
+       if (!tmp1)
+               err(1, "strdup");
+       tmp2 = dirname(tmp1);
+       our_path = strdup(tmp2);
+       if (!our_path)
+               err(1, "strdup");
+       free(tmp1);
+
+       if (fork_wait()) {
+               printf("[RUN]\t+++ Tests with uid == 0 +++\n");
+               return do_tests(0, our_path);
+       }
+
+       if (fork_wait()) {
+               printf("[RUN]\t+++ Tests with uid != 0 +++\n");
+               return do_tests(1, our_path);
+       }
+
+       return nerrs ? 1 : 0;
+}
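
To make the expected results above concrete, here is a toy model (an illustration only: plain 64-bit masks instead of the kernel's kernel_cap_t, and "has file caps" approximated as any of fP/fI/fE being non-empty) of the execve transition rules the commoncap.c changes implement and this test exercises:

#include <stdint.h>
#include <stdio.h>

struct caps { uint64_t pP, pI, pE, pA, X; };

/*
 * Transition rules from the patch:
 *   pA' = (file caps or setid) ? 0 : pA
 *   pP' = (X & fP) | (pI & fI) | pA'
 *   pE' = fE ? pP' : pA'
 */
static void transition(struct caps *c, uint64_t fP, uint64_t fI, int fE,
		       int is_setid)
{
	int has_fcaps = fP || fI || fE;

	if (has_fcaps || is_setid)
		c->pA = 0;		/* file caps or setid cancel ambient */
	c->pP = (c->X & fP) | (c->pI & fI) | c->pA;
	c->pE = fE ? c->pP : c->pA;
}

int main(void)
{
	/* CAP_NET_BIND_SERVICE is bit 10 */
	struct caps c = { .pI = 1 << 10, .pA = 1 << 10, .X = ~0ULL };

	transition(&c, 0, 0, 0, 0);	/* exec of a plain, non-setid binary */
	printf("pP'=%#llx pE'=%#llx\n",
	       (unsigned long long)c.pP, (unsigned long long)c.pE);
	return 0;
}

With pI = pA = CAP_NET_BIND_SERVICE and an ordinary target binary, both pP' and pE' end up containing the bit, matching the "+ia => eipa" cases in test_execve.c; making the binary setid or giving it file caps zeroes pA' and the bit is lost again.
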
diff --git a/tools/testing/selftests/capabilities/validate_cap.c b/tools/testing/selftests/capabilities/validate_cap.c
new file mode 100644 (file)
index 0000000..dd3c45f
--- /dev/null
@@ -0,0 +1,73 @@
+#include <cap-ng.h>
+#include <err.h>
+#include <linux/capability.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/prctl.h>
+#include <sys/auxv.h>
+
+#ifndef PR_CAP_AMBIENT
+#define PR_CAP_AMBIENT                 47
+# define PR_CAP_AMBIENT_IS_SET         1
+# define PR_CAP_AMBIENT_RAISE          2
+# define PR_CAP_AMBIENT_LOWER          3
+# define PR_CAP_AMBIENT_CLEAR_ALL      4
+#endif
+
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 19)
+# define HAVE_GETAUXVAL
+#endif
+
+static bool bool_arg(char **argv, int i)
+{
+       if (!strcmp(argv[i], "0"))
+               return false;
+       else if (!strcmp(argv[i], "1"))
+               return true;
+       else
+               errx(1, "wrong argv[%d]", i);
+}
+
+int main(int argc, char **argv)
+{
+       const char *atsec = "";
+
+       /*
+        * Be careful just in case a setgid or setcapped copy of this
+        * helper gets out.
+        */
+
+       if (argc != 5)
+               errx(1, "wrong argc");
+
+#ifdef HAVE_GETAUXVAL
+       if (getauxval(AT_SECURE))
+               atsec = " (AT_SECURE is set)";
+       else
+               atsec = " (AT_SECURE is not set)";
+#endif
+
+       capng_get_caps_process();
+
+       if (capng_have_capability(CAPNG_EFFECTIVE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 1)) {
+               printf("[FAIL]\tWrong effective state%s\n", atsec);
+               return 1;
+       }
+       if (capng_have_capability(CAPNG_PERMITTED, CAP_NET_BIND_SERVICE) != bool_arg(argv, 2)) {
+               printf("[FAIL]\tWrong permitted state%s\n", atsec);
+               return 1;
+       }
+       if (capng_have_capability(CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 3)) {
+               printf("[FAIL]\tWrong inheritable state%s\n", atsec);
+               return 1;
+       }
+
+       if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != bool_arg(argv, 4)) {
+               printf("[FAIL]\tWrong ambient state%s\n", atsec);
+               return 1;
+       }
+
+       printf("[OK]\tCapabilities after execve were correct\n");
+       return 0;
+}
index 231b9a031f6ad9e22be6cd1f8a49d6dc8029d56c..0d6854744b37307fe446e350219b6678fe215a49 100644 (file)
@@ -8,10 +8,13 @@ BINARIES += hugetlbfstest
 BINARIES += map_hugetlb
 BINARIES += thuge-gen
 BINARIES += transhuge-stress
+BINARIES += userfaultfd
 
 all: $(BINARIES)
 %: %.c
        $(CC) $(CFLAGS) -o $@ $^ -lrt
+userfaultfd: userfaultfd.c
+       $(CC) $(CFLAGS) -O2 -o $@ $^ -lpthread
 
 TEST_PROGS := run_vmtests
 TEST_FILES := $(BINARIES)
index 49ece11ff7fdc5a5dfd5f4f30b82dc8a76a5d8c9..831adeb5fc552b3c889510c24308d744c734c267 100755 (executable)
@@ -86,6 +86,17 @@ else
        echo "[PASS]"
 fi
 
+echo "--------------------"
+echo "running userfaultfd"
+echo "--------------------"
+./userfaultfd 128 32
+if [ $? -ne 0 ]; then
+       echo "[FAIL]"
+       exitcode=1
+else
+       echo "[PASS]"
+fi
+
 #cleanup
 umount $mnt
 rm -rf $mnt
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
new file mode 100644 (file)
index 0000000..0c0b839
--- /dev/null
@@ -0,0 +1,636 @@
+/*
+ * Stress userfaultfd syscall.
+ *
+ *  Copyright (C) 2015  Red Hat, Inc.
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ *
+ * This test allocates two virtual areas and bounces the physical
+ * memory across the two virtual areas (from area_src to area_dst)
+ * using userfaultfd.
+ *
+ * There are three threads running per CPU:
+ *
+ * 1) one per-CPU thread takes a per-page pthread_mutex in a random
+ *    page of the area_dst (while the physical page may still be in
+ *    area_src), and increments a per-page counter in the same page,
+ *    and checks its value against a verification region.
+ *
+ * 2) another per-CPU thread handles the userfaults generated by
+ *    thread 1 above. userfaultfd blocking reads or poll() modes are
+ *    exercised interleaved.
+ *
+ * 3) one last per-CPU thread transfers the memory in the background
+ *    at maximum bandwidth (if not already transferred by thread
+ *    2). Each CPU thread takes care of transferring a portion of the
+ *    area.
+ *
+ * When all threads of type 3 completed the transfer, one bounce is
+ * complete. area_src and area_dst are then swapped. All threads are
+ * respawned and so the bounce is immediately restarted in the
+ * opposite direction.
+ *
+ * The per-CPU threads of type 1 above, by triggering userfaults inside
+ * pthread_mutex_lock, also verify the atomicity of the memory
+ * transfer (UFFDIO_COPY).
+ *
+ * The program takes two parameters: the amount of physical memory in
+ * megabytes (MiB) of the area and the number of bounces to execute.
+ *
+ * # 100MiB 99999 bounces
+ * ./userfaultfd 100 99999
+ *
+ * # 1GiB 99 bounces
+ * ./userfaultfd 1000 99
+ *
+ * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers
+ * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <signal.h>
+#include <poll.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <pthread.h>
+#include "../../../../include/uapi/linux/userfaultfd.h"
+
+#ifdef __x86_64__
+#define __NR_userfaultfd 323
+#elif defined(__i386__)
+#define __NR_userfaultfd 359
+#elif defined(__powerpc__)
+#define __NR_userfaultfd 364
+#else
+#error "missing __NR_userfaultfd definition"
+#endif
+
+static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
+
+#define BOUNCE_RANDOM          (1<<0)
+#define BOUNCE_RACINGFAULTS    (1<<1)
+#define BOUNCE_VERIFY          (1<<2)
+#define BOUNCE_POLL            (1<<3)
+static int bounces;
+
+static unsigned long long *count_verify;
+static int uffd, finished, *pipefd;
+static char *area_src, *area_dst;
+static char *zeropage;
+pthread_attr_t attr;
+
+/* pthread_mutex_t starts at page offset 0 */
+#define area_mutex(___area, ___nr)                                     \
+       ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
+/*
+ * count is placed in the page after pthread_mutex_t naturally aligned
+ * to avoid non alignment faults on non-x86 archs.
+ */
+#define area_count(___area, ___nr)                                     \
+       ((volatile unsigned long long *) ((unsigned long)               \
+                                ((___area) + (___nr)*page_size +       \
+                                 sizeof(pthread_mutex_t) +             \
+                                 sizeof(unsigned long long) - 1) &     \
+                                ~(unsigned long)(sizeof(unsigned long long) \
+                                                 -  1)))
+
+static int my_bcmp(char *str1, char *str2, size_t n)
+{
+       unsigned long i;
+       for (i = 0; i < n; i++)
+               if (str1[i] != str2[i])
+                       return 1;
+       return 0;
+}
+
+static void *locking_thread(void *arg)
+{
+       unsigned long cpu = (unsigned long) arg;
+       struct random_data rand;
+       unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
+       int32_t rand_nr;
+       unsigned long long count;
+       char randstate[64];
+       unsigned int seed;
+       time_t start;
+
+       if (bounces & BOUNCE_RANDOM) {
+               seed = (unsigned int) time(NULL) - bounces;
+               if (!(bounces & BOUNCE_RACINGFAULTS))
+                       seed += cpu;
+               bzero(&rand, sizeof(rand));
+               bzero(&randstate, sizeof(randstate));
+               if (initstate_r(seed, randstate, sizeof(randstate), &rand))
+                       fprintf(stderr, "srandom_r error\n"), exit(1);
+       } else {
+               page_nr = -bounces;
+               if (!(bounces & BOUNCE_RACINGFAULTS))
+                       page_nr += cpu * nr_pages_per_cpu;
+       }
+
+       while (!finished) {
+               if (bounces & BOUNCE_RANDOM) {
+                       if (random_r(&rand, &rand_nr))
+                               fprintf(stderr, "random_r 1 error\n"), exit(1);
+                       page_nr = rand_nr;
+                       if (sizeof(page_nr) > sizeof(rand_nr)) {
+                               if (random_r(&rand, &rand_nr))
+                                       fprintf(stderr, "random_r 2 error\n"), exit(1);
+                               page_nr |= ((unsigned long) rand_nr) << 32;
+                       }
+               } else
+                       page_nr += 1;
+               page_nr %= nr_pages;
+
+               start = time(NULL);
+               if (bounces & BOUNCE_VERIFY) {
+                       count = *area_count(area_dst, page_nr);
+                       if (!count)
+                               fprintf(stderr,
+                                       "page_nr %lu wrong count %Lu %Lu\n",
+                                       page_nr, count,
+                                       count_verify[page_nr]), exit(1);
+
+
+                       /*
+                        * We can't use bcmp (or memcmp) because that
+                        * returns 0 erroneously if the memory is
+                        * changing under it (even if the end of the
+                        * page is never changing and always
+                        * different).
+                        */
+#if 1
+                       if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
+                                    page_size))
+                               fprintf(stderr,
+                                       "my_bcmp page_nr %lu wrong count %Lu %Lu\n",
+                                       page_nr, count,
+                                       count_verify[page_nr]), exit(1);
+#else
+                       unsigned long loops;
+
+                       loops = 0;
+                       /* uncomment the below line to test with mutex */
+                       /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
+                       while (!bcmp(area_dst + page_nr * page_size, zeropage,
+                                    page_size)) {
+                               loops += 1;
+                               if (loops > 10)
+                                       break;
+                       }
+                       /* uncomment below line to test with mutex */
+                       /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
+                       if (loops) {
+                               fprintf(stderr,
+                                       "page_nr %lu all zero thread %lu %p %lu\n",
+                                       page_nr, cpu, area_dst + page_nr * page_size,
+                                       loops);
+                               if (loops > 10)
+                                       exit(1);
+                       }
+#endif
+               }
+
+               pthread_mutex_lock(area_mutex(area_dst, page_nr));
+               count = *area_count(area_dst, page_nr);
+               if (count != count_verify[page_nr]) {
+                       fprintf(stderr,
+                               "page_nr %lu memory corruption %Lu %Lu\n",
+                               page_nr, count,
+                               count_verify[page_nr]), exit(1);
+               }
+               count++;
+               *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
+               pthread_mutex_unlock(area_mutex(area_dst, page_nr));
+
+               if (time(NULL) - start > 1)
+                       fprintf(stderr,
+                               "userfault too slow %ld "
+                               "possible false positive with overcommit\n",
+                               time(NULL) - start);
+       }
+
+       return NULL;
+}
+
+static int copy_page(unsigned long offset)
+{
+       struct uffdio_copy uffdio_copy;
+
+       if (offset >= nr_pages * page_size)
+               fprintf(stderr, "unexpected offset %lu\n",
+                       offset), exit(1);
+       uffdio_copy.dst = (unsigned long) area_dst + offset;
+       uffdio_copy.src = (unsigned long) area_src + offset;
+       uffdio_copy.len = page_size;
+       uffdio_copy.mode = 0;
+       uffdio_copy.copy = 0;
+       if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy)) {
+               /* real retval in uffdio_copy.copy */
+               if (uffdio_copy.copy != -EEXIST)
+                       fprintf(stderr, "UFFDIO_COPY error %Ld\n",
+                               uffdio_copy.copy), exit(1);
+       } else if (uffdio_copy.copy != page_size) {
+               fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
+                       uffdio_copy.copy), exit(1);
+       } else
+               return 1;
+       return 0;
+}
+
+static void *uffd_poll_thread(void *arg)
+{
+       unsigned long cpu = (unsigned long) arg;
+       struct pollfd pollfd[2];
+       struct uffd_msg msg;
+       int ret;
+       unsigned long offset;
+       char tmp_chr;
+       unsigned long userfaults = 0;
+
+       pollfd[0].fd = uffd;
+       pollfd[0].events = POLLIN;
+       pollfd[1].fd = pipefd[cpu*2];
+       pollfd[1].events = POLLIN;
+
+       for (;;) {
+               ret = poll(pollfd, 2, -1);
+               if (!ret)
+                       fprintf(stderr, "poll error %d\n", ret), exit(1);
+               if (ret < 0)
+                       perror("poll"), exit(1);
+               if (pollfd[1].revents & POLLIN) {
+                       if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
+                               fprintf(stderr, "read pipefd error\n"),
+                                       exit(1);
+                       break;
+               }
+               if (!(pollfd[0].revents & POLLIN))
+                       fprintf(stderr, "pollfd[0].revents %d\n",
+                               pollfd[0].revents), exit(1);
+               ret = read(uffd, &msg, sizeof(msg));
+               if (ret < 0) {
+                       if (errno == EAGAIN)
+                               continue;
+                       perror("nonblocking read error"), exit(1);
+               }
+               if (msg.event != UFFD_EVENT_PAGEFAULT)
+                       fprintf(stderr, "unexpected msg event %u\n",
+                               msg.event), exit(1);
+               if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+                       fprintf(stderr, "unexpected write fault\n"), exit(1);
+               offset = (char *)msg.arg.pagefault.address - area_dst;
+               offset &= ~(page_size-1);
+               if (copy_page(offset))
+                       userfaults++;
+       }
+       return (void *)userfaults;
+}
+
+pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void *uffd_read_thread(void *arg)
+{
+       unsigned long *this_cpu_userfaults;
+       struct uffd_msg msg;
+       unsigned long offset;
+       int ret;
+
+       this_cpu_userfaults = (unsigned long *) arg;
+       *this_cpu_userfaults = 0;
+
+       pthread_mutex_unlock(&uffd_read_mutex);
+       /* from here cancellation is ok */
+
+       for (;;) {
+               ret = read(uffd, &msg, sizeof(msg));
+               if (ret != sizeof(msg)) {
+                       if (ret < 0)
+                               perror("blocking read error"), exit(1);
+                       else
+                               fprintf(stderr, "short read\n"), exit(1);
+               }
+               if (msg.event != UFFD_EVENT_PAGEFAULT)
+                       fprintf(stderr, "unexpected msg event %u\n",
+                               msg.event), exit(1);
+               if (bounces & BOUNCE_VERIFY &&
+                   msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+                       fprintf(stderr, "unexpected write fault\n"), exit(1);
+               offset = (char *)msg.arg.pagefault.address - area_dst;
+               offset &= ~(page_size-1);
+               if (copy_page(offset))
+                       (*this_cpu_userfaults)++;
+       }
+       return (void *)NULL;
+}
+
+static void *background_thread(void *arg)
+{
+       unsigned long cpu = (unsigned long) arg;
+       unsigned long page_nr;
+
+       for (page_nr = cpu * nr_pages_per_cpu;
+            page_nr < (cpu+1) * nr_pages_per_cpu;
+            page_nr++)
+               copy_page(page_nr * page_size);
+
+       return NULL;
+}
+
+static int stress(unsigned long *userfaults)
+{
+       unsigned long cpu;
+       pthread_t locking_threads[nr_cpus];
+       pthread_t uffd_threads[nr_cpus];
+       pthread_t background_threads[nr_cpus];
+       void **_userfaults = (void **) userfaults;
+
+       finished = 0;
+       for (cpu = 0; cpu < nr_cpus; cpu++) {
+               if (pthread_create(&locking_threads[cpu], &attr,
+                                  locking_thread, (void *)cpu))
+                       return 1;
+               if (bounces & BOUNCE_POLL) {
+                       if (pthread_create(&uffd_threads[cpu], &attr,
+                                          uffd_poll_thread, (void *)cpu))
+                               return 1;
+               } else {
+                       if (pthread_create(&uffd_threads[cpu], &attr,
+                                          uffd_read_thread,
+                                          &_userfaults[cpu]))
+                               return 1;
+                       pthread_mutex_lock(&uffd_read_mutex);
+               }
+               if (pthread_create(&background_threads[cpu], &attr,
+                                  background_thread, (void *)cpu))
+                       return 1;
+       }
+       for (cpu = 0; cpu < nr_cpus; cpu++)
+               if (pthread_join(background_threads[cpu], NULL))
+                       return 1;
+
+       /*
+        * Be strict and immediately zap area_src, the whole area has
+        * been transferred already by the background threads. The
+        * area_src could then be faulted in, in a racy way, by still
+        * running uffdio_threads reading zeropages after we zapped
+        * area_src (but they're guaranteed to get -EEXIST from
+        * UFFDIO_COPY without writing zero pages into area_dst
+        * because the background threads already completed).
+        */
+       if (madvise(area_src, nr_pages * page_size, MADV_DONTNEED)) {
+               perror("madvise");
+               return 1;
+       }
+
+       for (cpu = 0; cpu < nr_cpus; cpu++) {
+               char c;
+               if (bounces & BOUNCE_POLL) {
+                       if (write(pipefd[cpu*2+1], &c, 1) != 1) {
+                               fprintf(stderr, "pipefd write error\n");
+                               return 1;
+                       }
+                       if (pthread_join(uffd_threads[cpu], &_userfaults[cpu]))
+                               return 1;
+               } else {
+                       if (pthread_cancel(uffd_threads[cpu]))
+                               return 1;
+                       if (pthread_join(uffd_threads[cpu], NULL))
+                               return 1;
+               }
+       }
+
+       finished = 1;
+       for (cpu = 0; cpu < nr_cpus; cpu++)
+               if (pthread_join(locking_threads[cpu], NULL))
+                       return 1;
+
+       return 0;
+}
+
+static int userfaultfd_stress(void)
+{
+       void *area;
+       char *tmp_area;
+       unsigned long nr;
+       struct uffdio_register uffdio_register;
+       struct uffdio_api uffdio_api;
+       unsigned long cpu;
+       int uffd_flags;
+       unsigned long userfaults[nr_cpus];
+
+       if (posix_memalign(&area, page_size, nr_pages * page_size)) {
+               fprintf(stderr, "out of memory\n");
+               return 1;
+       }
+       area_src = area;
+       if (posix_memalign(&area, page_size, nr_pages * page_size)) {
+               fprintf(stderr, "out of memory\n");
+               return 1;
+       }
+       area_dst = area;
+
+       uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+       if (uffd < 0) {
+               fprintf(stderr,
+                       "userfaultfd syscall not available in this kernel\n");
+               return 1;
+       }
+       uffd_flags = fcntl(uffd, F_GETFD, NULL);
+
+       uffdio_api.api = UFFD_API;
+       uffdio_api.features = 0;
+       if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
+               fprintf(stderr, "UFFDIO_API\n");
+               return 1;
+       }
+       if (uffdio_api.api != UFFD_API) {
+               fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
+               return 1;
+       }
+
+       count_verify = malloc(nr_pages * sizeof(unsigned long long));
+       if (!count_verify) {
+               perror("count_verify");
+               return 1;
+       }
+
+       for (nr = 0; nr < nr_pages; nr++) {
+               *area_mutex(area_src, nr) = (pthread_mutex_t)
+                       PTHREAD_MUTEX_INITIALIZER;
+               count_verify[nr] = *area_count(area_src, nr) = 1;
+       }
+
+       pipefd = malloc(sizeof(int) * nr_cpus * 2);
+       if (!pipefd) {
+               perror("pipefd");
+               return 1;
+       }
+       for (cpu = 0; cpu < nr_cpus; cpu++) {
+               if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
+                       perror("pipe");
+                       return 1;
+               }
+       }
+
+       if (posix_memalign(&area, page_size, page_size)) {
+               fprintf(stderr, "out of memory\n");
+               return 1;
+       }
+       zeropage = area;
+       bzero(zeropage, page_size);
+
+       pthread_mutex_lock(&uffd_read_mutex);
+
+       pthread_attr_init(&attr);
+       pthread_attr_setstacksize(&attr, 16*1024*1024);
+
+       while (bounces--) {
+               unsigned long expected_ioctls;
+
+               printf("bounces: %d, mode:", bounces);
+               if (bounces & BOUNCE_RANDOM)
+                       printf(" rnd");
+               if (bounces & BOUNCE_RACINGFAULTS)
+                       printf(" racing");
+               if (bounces & BOUNCE_VERIFY)
+                       printf(" ver");
+               if (bounces & BOUNCE_POLL)
+                       printf(" poll");
+               printf(", ");
+               fflush(stdout);
+
+               if (bounces & BOUNCE_POLL)
+                       fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+               else
+                       fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
+
+               /* register */
+               uffdio_register.range.start = (unsigned long) area_dst;
+               uffdio_register.range.len = nr_pages * page_size;
+               uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+               if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+                       fprintf(stderr, "register failure\n");
+                       return 1;
+               }
+               expected_ioctls = (1 << _UFFDIO_WAKE) |
+                                 (1 << _UFFDIO_COPY) |
+                                 (1 << _UFFDIO_ZEROPAGE);
+               if ((uffdio_register.ioctls & expected_ioctls) !=
+                   expected_ioctls) {
+                       fprintf(stderr,
+                               "unexpected missing ioctl for anon memory\n");
+                       return 1;
+               }
+
+               /*
+                * The madvise done previously isn't enough: some
+                * uffd_thread could have read userfaults (one of
+                * those already resolved by the background thread)
+                * and it may be in the process of calling
+                * UFFDIO_COPY. UFFDIO_COPY will read the zapped
+                * area_src and it would map a zero page in it (of
+                * course such a UFFDIO_COPY is perfectly safe as it'd
+                * return -EEXIST). The problem comes at the next
+                * bounce though: that racing UFFDIO_COPY would
+                * generate zeropages in the area_src, so invalidating
+                * the previous MADV_DONTNEED. Without this additional
+                * MADV_DONTNEED those zeropages leftovers in the
+                * area_src would lead to -EEXIST failure during the
+                * next bounce, effectively leaving a zeropage in the
+                * area_dst.
+                *
+                * Try commenting out this madvise to see the memory
+                * corruption being caught pretty quickly.
+                *
+                * khugepaged is also only inhibited from collapsing THP
+                * after MADV_DONTNEED once the UFFDIO_REGISTER is done, so
+                * the MADV_DONTNEED must be issued here as well.
+                */
+               if (madvise(area_dst, nr_pages * page_size, MADV_DONTNEED)) {
+                       perror("madvise 2");
+                       return 1;
+               }
+
+               /* bounce pass */
+               if (stress(userfaults))
+                       return 1;
+
+               /* unregister */
+               if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
+                       fprintf(stderr, "register failure\n");
+                       return 1;
+               }
+
+               /* verification */
+               if (bounces & BOUNCE_VERIFY) {
+                       for (nr = 0; nr < nr_pages; nr++) {
+                               if (my_bcmp(area_dst,
+                                           area_dst + nr * page_size,
+                                           sizeof(pthread_mutex_t))) {
+                                       fprintf(stderr,
+                                               "error mutex 2 %lu\n",
+                                               nr);
+                                       bounces = 0;
+                               }
+                               if (*area_count(area_dst, nr) != count_verify[nr]) {
+                                       fprintf(stderr,
+                                               "error area_count %Lu %Lu %lu\n",
+                                               *area_count(area_src, nr),
+                                               count_verify[nr],
+                                               nr);
+                                       bounces = 0;
+                               }
+                       }
+               }
+
+               /* prepare next bounce */
+               tmp_area = area_src;
+               area_src = area_dst;
+               area_dst = tmp_area;
+
+               printf("userfaults:");
+               for (cpu = 0; cpu < nr_cpus; cpu++)
+                       printf(" %lu", userfaults[cpu]);
+               printf("\n");
+       }
+
+       return 0;
+}
+
+int main(int argc, char **argv)
+{
+       if (argc < 3)
+               fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+       nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+       page_size = sysconf(_SC_PAGE_SIZE);
+       if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) >
+           page_size)
+               fprintf(stderr, "Impossible to run this test\n"), exit(2);
+       nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
+               nr_cpus;
+       if (!nr_pages_per_cpu) {
+               fprintf(stderr, "invalid MiB\n");
+               fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+       }
+       bounces = atoi(argv[2]);
+       if (bounces <= 0) {
+               fprintf(stderr, "invalid bounces\n");
+               fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+       }
+       nr_pages = nr_pages_per_cpu * nr_cpus;
+       printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
+              nr_pages, nr_pages_per_cpu);
+       return userfaultfd_stress();
+}
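
The stress test above exercises the full API; for orientation, here is a minimal sketch (not part of this patch) of the basic userfaultfd flow: open the fd, handshake with UFFDIO_API, register an anonymous range for missing-page faults, and resolve each fault from a handler thread with UFFDIO_COPY. It assumes __NR_userfaultfd and <linux/userfaultfd.h> are provided by sufficiently new installed headers (the selftest includes the in-tree header instead).

#include <errno.h>
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static long page_size;
static int uffd;

static void *fault_handler(void *arg)
{
	char *filler = arg;	/* page used to resolve every fault */
	struct uffd_msg msg;

	for (;;) {
		struct uffdio_copy copy;

		if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
			continue;	/* interrupted or short read: retry */
		if (msg.event != UFFD_EVENT_PAGEFAULT)
			continue;

		copy.dst = msg.arg.pagefault.address & ~((__u64)page_size - 1);
		copy.src = (unsigned long)filler;
		copy.len = page_size;
		copy.mode = 0;
		copy.copy = 0;
		if (ioctl(uffd, UFFDIO_COPY, &copy) && copy.copy != -EEXIST)
			exit(1);	/* unexpected copy failure */
	}
	return NULL;
}

int main(void)
{
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg;
	char *area, *filler;
	pthread_t thr;

	page_size = sysconf(_SC_PAGE_SIZE);

	uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api))
		return 1;	/* no userfaultfd support in this kernel */

	area = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	filler = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (area == MAP_FAILED || filler == MAP_FAILED)
		return 1;
	memset(filler, 0x5a, page_size);

	reg.range.start = (unsigned long)area;
	reg.range.len = page_size;
	reg.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
		return 1;

	if (pthread_create(&thr, NULL, fault_handler, filler))
		return 1;

	/* First touch blocks, wakes the handler, and sees the filler data. */
	printf("area[0] = %#x\n", area[0]);
	return 0;
}

A UFFDIO_COPY result of -EEXIST just means the page was already mapped (for example by a racing copy); the stress test above tolerates it for the same reason.
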