firefly-linux-kernel-4.4.55.git: kernel/workqueue.c
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee8e29a2320c7c76d67df8f4816af52cd9da5f68..2c2f971f3e759df3c812d749740047f05864931b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
  *
  * This is the generic async execution mechanism.  Work items are
  * executed in process context.  The worker pool is shared and
- * automatically managed.  There is one worker pool for each CPU and
- * one extra for works which are better served by workers which are
- * not bound to any specific CPU.
+ * automatically managed.  There are two worker pools for each CPU (one for
+ * normal work items and the other for high priority ones) and some extra
+ * pools for workqueues which are not bound to any specific CPU - the
+ * number of these backing pools is dynamic.
  *
  * Please read Documentation/workqueue.txt for details.
  */
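For orientation, here is a minimal sketch (illustration only; the module and all names in it are invented and not part of this file) of how the pools described above are reached from the API: schedule_work() lands on the per-CPU pools via system_wq, while a WQ_UNBOUND workqueue is backed by the dynamically managed unbound pools.

#include <linux/module.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static void example_fn(struct work_struct *work)
{
        pr_info("example work ran on CPU %d\n", raw_smp_processor_id());
}
static DECLARE_WORK(example_work, example_fn);

static struct workqueue_struct *example_wq;

static int __init example_init(void)
{
        /* backed by one of the dynamically managed unbound pools */
        example_wq = alloc_workqueue("example_unbound", WQ_UNBOUND, 0);
        if (!example_wq)
                return -ENOMEM;
        queue_work(example_wq, &example_work);
        return 0;
}

static void __exit example_exit(void)
{
        destroy_workqueue(example_wq);  /* drains pending work, then frees the wq */
}
module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");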
@@ -64,15 +65,12 @@ enum {
         * be executing on any CPU.  The pool behaves as an unbound one.
         *
         * Note that DISASSOCIATED should be flipped only while holding
-        * manager_mutex to avoid changing binding state while
-        * create_worker() is in progress.
+        * attach_mutex to avoid changing binding state while
+        * worker_attach_to_pool() is in progress.
         */
-       POOL_MANAGE_WORKERS     = 1 << 0,       /* need to manage workers */
        POOL_DISASSOCIATED      = 1 << 2,       /* cpu can't serve workers */
-       POOL_FREEZING           = 1 << 3,       /* freeze in progress */
 
        /* worker flags */
-       WORKER_STARTED          = 1 << 0,       /* started */
        WORKER_DIE              = 1 << 1,       /* die die die */
        WORKER_IDLE             = 1 << 2,       /* is idle */
        WORKER_PREP             = 1 << 3,       /* preparing to run works */
@@ -99,10 +97,10 @@ enum {
 
        /*
         * Rescue workers are used only on emergencies and shared by
-        * all cpus.  Give -20.
+        * all cpus.  Give MIN_NICE.
         */
-       RESCUER_NICE_LEVEL      = -20,
-       HIGHPRI_NICE_LEVEL      = -20,
+       RESCUER_NICE_LEVEL      = MIN_NICE,
+       HIGHPRI_NICE_LEVEL      = MIN_NICE,
 
        WQ_NAME_LEN             = 24,
 };
@@ -123,13 +121,17 @@ enum {
  *    cpu or grabbing pool->lock is enough for read access.  If
  *    POOL_DISASSOCIATED is set, it's identical to L.
  *
- * MG: pool->manager_mutex and pool->lock protected.  Writes require both
- *     locks.  Reads can happen under either lock.
+ * A: pool->attach_mutex protected.
  *
  * PL: wq_pool_mutex protected.
  *
  * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
  *
+ * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
+ *
+ * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
+ *      sched-RCU for reads.
+ *
  * WQ: wq->mutex protected.
  *
  * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
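A hypothetical helper (not in this file) sketching the sched-RCU read side that the PR/WR/PWR annotations above allow; readers either hold the named mutex or stay inside a sched-RCU section for the duration of the access.

/* illustrative only: reading a PWR field such as numa_pwq_tbl[] */
static struct pool_workqueue *peek_numa_pwq(struct workqueue_struct *wq, int node)
{
        struct pool_workqueue *pwq;

        rcu_read_lock_sched();          /* instead of wq->mutex / wq_pool_mutex */
        pwq = rcu_dereference_sched(wq->numa_pwq_tbl[node]);
        rcu_read_unlock_sched();

        /* pwq may be released once the read section ends; callers needing it
         * longer must hold one of the mutexes or take their own reference */
        return pwq;
}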
@@ -162,8 +164,12 @@ struct worker_pool {
 
        /* see manage_workers() for details on the two manager mutexes */
        struct mutex            manager_arb;    /* manager arbitration */
-       struct mutex            manager_mutex;  /* manager exclusion */
-       struct idr              worker_idr;     /* MG: worker IDs and iteration */
+       struct worker           *manager;       /* L: purely informational */
+       struct mutex            attach_mutex;   /* attach/detach exclusion */
+       struct list_head        workers;        /* A: attached workers */
+       struct completion       *detach_completion; /* all workers detached */
+
+       struct ida              worker_ida;     /* worker IDs for task name */
 
        struct workqueue_attrs  *attrs;         /* I: worker attributes */
        struct hlist_node       hash_node;      /* PL: unbound_pool_hash node */
@@ -230,7 +236,7 @@ struct wq_device;
  */
 struct workqueue_struct {
        struct list_head        pwqs;           /* WR: all pwqs of this wq */
-       struct list_head        list;           /* PL: list of all workqueues */
+       struct list_head        list;           /* PR: list of all workqueues */
 
        struct mutex            mutex;          /* protects this wq */
        int                     work_color;     /* WQ: current work color */
@@ -246,8 +252,8 @@ struct workqueue_struct {
        int                     nr_drainers;    /* WQ: drain in progress */
        int                     saved_max_active; /* WQ: saved pwq max_active */
 
-       struct workqueue_attrs  *unbound_attrs; /* WQ: only for unbound wqs */
-       struct pool_workqueue   *dfl_pwq;       /* WQ: only for unbound wqs */
+       struct workqueue_attrs  *unbound_attrs; /* PW: only for unbound wqs */
+       struct pool_workqueue   *dfl_pwq;       /* PW: only for unbound wqs */
 
 #ifdef CONFIG_SYSFS
        struct wq_device        *wq_dev;        /* I: for sysfs interface */
@@ -257,21 +263,31 @@ struct workqueue_struct {
 #endif
        char                    name[WQ_NAME_LEN]; /* I: workqueue name */
 
+       /*
+        * Destruction of workqueue_struct is sched-RCU protected to allow
+        * walking the workqueues list without grabbing wq_pool_mutex.
+        * This is used to dump all workqueues from sysrq.
+        */
+       struct rcu_head         rcu;
+
        /* hot fields used during command issue, aligned to cacheline */
        unsigned int            flags ____cacheline_aligned; /* WQ: WQ_* flags */
        struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
-       struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
+       struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
 };
 
 static struct kmem_cache *pwq_cache;
 
-static int wq_numa_tbl_len;            /* highest possible NUMA node id + 1 */
 static cpumask_var_t *wq_numa_possible_cpumask;
                                        /* possible CPUs of each node */
 
 static bool wq_disable_numa;
 module_param_named(disable_numa, wq_disable_numa, bool, 0444);
 
+/* see the comment above the definition of WQ_POWER_EFFICIENT */
+static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
+module_param_named(power_efficient, wq_power_efficient, bool, 0444);
+
 static bool wq_numa_enabled;           /* unbound NUMA affinity enabled */
 
 /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
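A hedged usage sketch for the power_efficient knob added above (handler and names are invented): non-latency-critical work can target system_power_efficient_wq, which behaves like a regular system wq unless WQ_POWER_EFFICIENT workqueues are switched to unbound via this parameter or CONFIG_WQ_POWER_EFFICIENT_DEFAULT.

#include <linux/workqueue.h>

static void housekeeping_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(housekeeping_work, housekeeping_fn);

static void housekeeping_fn(struct work_struct *work)
{
        /* ... non-latency-critical housekeeping ... */

        /* re-arm on the power-efficient system wq; with power_efficient=Y
         * it is unbound and the scheduler picks an energy-friendly CPU */
        queue_delayed_work(system_power_efficient_wq, &housekeeping_work,
                           10 * HZ);
}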
@@ -280,9 +296,11 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
 static DEFINE_MUTEX(wq_pool_mutex);    /* protects pools and workqueues list */
 static DEFINE_SPINLOCK(wq_mayday_lock);        /* protects wq->maydays list */
 
-static LIST_HEAD(workqueues);          /* PL: list of all workqueues */
+static LIST_HEAD(workqueues);          /* PR: list of all workqueues */
 static bool workqueue_freezing;                /* PL: have wqs started freezing? */
 
+static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
+
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
                                     cpu_worker_pools);
@@ -295,6 +313,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
 /* I: attributes used when instantiating standard unbound pools on demand */
 static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
 
+/* I: attributes used when instantiating ordered pools on demand */
+static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
+
 struct workqueue_struct *system_wq __read_mostly;
 EXPORT_SYMBOL(system_wq);
 struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -305,33 +326,32 @@ struct workqueue_struct *system_unbound_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_unbound_wq);
 struct workqueue_struct *system_freezable_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_freezable_wq);
+struct workqueue_struct *system_power_efficient_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_power_efficient_wq);
+struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
 
 static int worker_thread(void *__worker);
-static void copy_workqueue_attrs(struct workqueue_attrs *to,
-                                const struct workqueue_attrs *from);
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/workqueue.h>
 
 #define assert_rcu_or_pool_mutex()                                     \
-       rcu_lockdep_assert(rcu_read_lock_sched_held() ||                \
-                          lockdep_is_held(&wq_pool_mutex),             \
-                          "sched RCU or wq_pool_mutex should be held")
+       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
+                        !lockdep_is_held(&wq_pool_mutex),              \
+                        "sched RCU or wq_pool_mutex should be held")
 
 #define assert_rcu_or_wq_mutex(wq)                                     \
-       rcu_lockdep_assert(rcu_read_lock_sched_held() ||                \
-                          lockdep_is_held(&wq->mutex),                 \
-                          "sched RCU or wq->mutex should be held")
+       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
+                        !lockdep_is_held(&wq->mutex),                  \
+                        "sched RCU or wq->mutex should be held")
 
-#ifdef CONFIG_LOCKDEP
-#define assert_manager_or_pool_lock(pool)                              \
-       WARN_ONCE(debug_locks &&                                        \
-                 !lockdep_is_held(&(pool)->manager_mutex) &&           \
-                 !lockdep_is_held(&(pool)->lock),                      \
-                 "pool->manager_mutex or ->lock should be held")
-#else
-#define assert_manager_or_pool_lock(pool)      do { } while (0)
-#endif
+#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
+       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
+                        !lockdep_is_held(&wq->mutex) &&                \
+                        !lockdep_is_held(&wq_pool_mutex),              \
+                        "sched RCU, wq->mutex or wq_pool_mutex should be held")
 
 #define for_each_cpu_worker_pool(pool, cpu)                            \
        for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
@@ -358,17 +378,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
 /**
  * for_each_pool_worker - iterate through all workers of a worker_pool
  * @worker: iteration cursor
- * @wi: integer used for iteration
  * @pool: worker_pool to iterate workers of
  *
- * This must be called with either @pool->manager_mutex or ->lock held.
+ * This must be called with @pool->attach_mutex held.
  *
  * The if/else clause exists only for the lockdep assertion and can be
  * ignored.
  */
-#define for_each_pool_worker(worker, wi, pool)                         \
-       idr_for_each_entry(&(pool)->worker_idr, (worker), (wi))         \
-               if (({ assert_manager_or_pool_lock((pool)); false; })) { } \
+#define for_each_pool_worker(worker, pool)                             \
+       list_for_each_entry((worker), &(pool)->workers, node)           \
+               if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
                else
 
 /**
@@ -499,19 +518,33 @@ void destroy_work_on_stack(struct work_struct *work)
 }
 EXPORT_SYMBOL_GPL(destroy_work_on_stack);
 
+void destroy_delayed_work_on_stack(struct delayed_work *work)
+{
+       destroy_timer_on_stack(&work->timer);
+       debug_object_free(&work->work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);
+
 #else
 static inline void debug_work_activate(struct work_struct *work) { }
 static inline void debug_work_deactivate(struct work_struct *work) { }
 #endif
 
-/* allocate ID and assign it to @pool */
+/**
+ * worker_pool_assign_id - allocate ID and assign it to @pool
+ * @pool: the pool pointer of interest
+ *
+ * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
+ * successfully, -errno on failure.
+ */
 static int worker_pool_assign_id(struct worker_pool *pool)
 {
        int ret;
 
        lockdep_assert_held(&wq_pool_mutex);
 
-       ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
+       ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
+                       GFP_KERNEL);
        if (ret >= 0) {
                pool->id = ret;
                return 0;
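The destroy_delayed_work_on_stack() helper introduced in the hunk above pairs with on-stack delayed work; a minimal sketch of the lifecycle it completes (handler name is hypothetical):

static void example_handler(struct work_struct *work);   /* hypothetical */

static void wait_a_bit(void)
{
        struct delayed_work dwork;

        INIT_DELAYED_WORK_ONSTACK(&dwork, example_handler);
        schedule_delayed_work(&dwork, msecs_to_jiffies(50));
        flush_delayed_work(&dwork);             /* must finish before the frame goes away */
        destroy_delayed_work_on_stack(&dwork);  /* tears down timer + work debugobjects */
}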
@@ -524,14 +557,27 @@ static int worker_pool_assign_id(struct worker_pool *pool)
  * @wq: the target workqueue
  * @node: the node ID
  *
- * This must be called either with pwq_lock held or sched RCU read locked.
+ * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
+ * read locked.
  * If the pwq needs to be used beyond the locking in effect, the caller is
  * responsible for guaranteeing that the pwq stays online.
+ *
+ * Return: The unbound pool_workqueue for @node.
  */
 static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
                                                  int node)
 {
-       assert_rcu_or_wq_mutex(wq);
+       assert_rcu_or_wq_mutex_or_pool_mutex(wq);
+
+       /*
+        * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
+        * delayed item is pending.  The plan is to keep CPU -> NODE
+        * mapping valid and stable across CPU on/offlines.  Once that
+        * happens, this workaround can be removed.
+        */
+       if (unlikely(node == NUMA_NO_NODE))
+               return wq->dfl_pwq;
+
        return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
 }
 
@@ -603,6 +649,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
         */
        smp_wmb();
        set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
+       /*
+        * The following mb guarantees that previous clear of a PENDING bit
+        * will not be reordered with any speculative LOADS or STORES from
+        * work->current_func, which is executed afterwards.  This possible
+        * reordering can lead to a missed execution on an attempt to queue
+        * the same @work.  E.g. consider this case:
+        *
+        *   CPU#0                         CPU#1
+        *   ----------------------------  --------------------------------
+        *
+        * 1  STORE event_indicated
+        * 2  queue_work_on() {
+        * 3    test_and_set_bit(PENDING)
+        * 4 }                             set_..._and_clear_pending() {
+        * 5                                 set_work_data() # clear bit
+        * 6                                 smp_mb()
+        * 7                               work->current_func() {
+        * 8                                  LOAD event_indicated
+        *                                 }
+        *
+        * Without an explicit full barrier speculative LOAD on line 8 can
+        * be executed before CPU#0 does STORE on line 1.  If that happens,
+        * CPU#0 observes the PENDING bit is still set and new execution of
+        * a @work is not queued in a hope, that CPU#1 will eventually
+        * finish the queued @work.  Meanwhile CPU#1 does not see
+        * event_indicated is set, because speculative LOAD was executed
+        * before actual STORE.
+        */
+       smp_mb();
 }
 
 static void clear_work_data(struct work_struct *work)
@@ -625,8 +700,6 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
  * get_work_pool - return the worker_pool a given work was associated with
  * @work: the work item of interest
  *
- * Return the worker_pool @work was last associated with.  %NULL if none.
- *
  * Pools are created and destroyed under wq_pool_mutex, and allows read
  * access under sched-RCU read lock.  As such, this function should be
  * called under wq_pool_mutex or with preemption disabled.
@@ -635,6 +708,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
  * mentioned locking is in effect.  If the returned pool needs to be used
  * beyond the critical section, the caller is responsible for ensuring the
  * returned pool is and stays online.
+ *
+ * Return: The worker_pool @work was last associated with.  %NULL if none.
  */
 static struct worker_pool *get_work_pool(struct work_struct *work)
 {
@@ -658,7 +733,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
  * get_work_pool_id - return the worker pool ID a given work is associated with
  * @work: the work item of interest
  *
- * Return the worker_pool ID @work was last associated with.
+ * Return: The worker_pool ID @work was last associated with.
  * %WORK_OFFQ_POOL_NONE if none.
  */
 static int get_work_pool_id(struct work_struct *work)
@@ -730,13 +805,6 @@ static bool need_to_create_worker(struct worker_pool *pool)
        return need_more_worker(pool) && !may_start_working(pool);
 }
 
-/* Do I need to be the manager? */
-static bool need_to_manage_workers(struct worker_pool *pool)
-{
-       return need_to_create_worker(pool) ||
-               (pool->flags & POOL_MANAGE_WORKERS);
-}
-
 /* Do we have too many workers and should some go away? */
 static bool too_many_workers(struct worker_pool *pool)
 {
@@ -744,13 +812,6 @@ static bool too_many_workers(struct worker_pool *pool)
        int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
        int nr_busy = pool->nr_workers - nr_idle;
 
-       /*
-        * nr_idle and idle_list may disagree if idle rebinding is in
-        * progress.  Never return %true if idle_list is empty.
-        */
-       if (list_empty(&pool->idle_list))
-               return false;
-
        return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
 }
 
@@ -758,8 +819,8 @@ static bool too_many_workers(struct worker_pool *pool)
  * Wake up functions.
  */
 
-/* Return the first worker.  Safe with preemption disabled */
-static struct worker *first_worker(struct worker_pool *pool)
+/* Return the first idle worker.  Safe with preemption disabled */
+static struct worker *first_idle_worker(struct worker_pool *pool)
 {
        if (unlikely(list_empty(&pool->idle_list)))
                return NULL;
@@ -778,7 +839,7 @@ static struct worker *first_worker(struct worker_pool *pool)
  */
 static void wake_up_worker(struct worker_pool *pool)
 {
-       struct worker *worker = first_worker(pool);
+       struct worker *worker = first_idle_worker(pool);
 
        if (likely(worker))
                wake_up_process(worker->task);
@@ -817,7 +878,7 @@ void wq_worker_waking_up(struct task_struct *task, int cpu)
  * CONTEXT:
  * spin_lock_irq(rq->lock)
  *
- * RETURNS:
+ * Return:
  * Worker task on @cpu to wake up, %NULL if none.
  */
 struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
@@ -836,7 +897,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
        pool = worker->pool;
 
        /* this can only happen on the local cpu */
-       if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
+       if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
                return NULL;
 
        /*
@@ -852,7 +913,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
         */
        if (atomic_dec_and_test(&pool->nr_running) &&
            !list_empty(&pool->worklist))
-               to_wakeup = first_worker(pool);
+               to_wakeup = first_idle_worker(pool);
        return to_wakeup ? to_wakeup->task : NULL;
 }
 
@@ -860,35 +921,22 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
  * worker_set_flags - set worker flags and adjust nr_running accordingly
  * @worker: self
  * @flags: flags to set
- * @wakeup: wakeup an idle worker if necessary
  *
- * Set @flags in @worker->flags and adjust nr_running accordingly.  If
- * nr_running becomes zero and @wakeup is %true, an idle worker is
- * woken up.
+ * Set @flags in @worker->flags and adjust nr_running accordingly.
  *
  * CONTEXT:
  * spin_lock_irq(pool->lock)
  */
-static inline void worker_set_flags(struct worker *worker, unsigned int flags,
-                                   bool wakeup)
+static inline void worker_set_flags(struct worker *worker, unsigned int flags)
 {
        struct worker_pool *pool = worker->pool;
 
        WARN_ON_ONCE(worker->task != current);
 
-       /*
-        * If transitioning into NOT_RUNNING, adjust nr_running and
-        * wake up an idle worker as necessary if requested by
-        * @wakeup.
-        */
+       /* If transitioning into NOT_RUNNING, adjust nr_running. */
        if ((flags & WORKER_NOT_RUNNING) &&
            !(worker->flags & WORKER_NOT_RUNNING)) {
-               if (wakeup) {
-                       if (atomic_dec_and_test(&pool->nr_running) &&
-                           !list_empty(&pool->worklist))
-                               wake_up_worker(pool);
-               } else
-                       atomic_dec(&pool->nr_running);
+               atomic_dec(&pool->nr_running);
        }
 
        worker->flags |= flags;
@@ -952,8 +1000,8 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
  * CONTEXT:
  * spin_lock_irq(pool->lock).
  *
- * RETURNS:
- * Pointer to worker which is executing @work if found, NULL
+ * Return:
+ * Pointer to worker which is executing @work if found, %NULL
  * otherwise.
  */
 static struct worker *find_worker_executing_work(struct worker_pool *pool,
@@ -974,7 +1022,7 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
  * move_linked_works - move linked works to a list
  * @work: start of series of works to be scheduled
  * @head: target list to append @work to
- * @nextp: out paramter for nested worklist walking
+ * @nextp: out parameter for nested worklist walking
  *
  * Schedule linked works starting from @work to @head.  Work series to
  * be scheduled starts at @work and includes any consecutive work with
@@ -1141,14 +1189,16 @@ out_put:
  * @flags: place to store irq state
  *
  * Try to grab PENDING bit of @work.  This function can handle @work in any
- * stable state - idle, on timer or on worklist.  Return values are
+ * stable state - idle, on timer or on worklist.
  *
+ * Return:
  *  1          if @work was pending and we successfully stole PENDING
  *  0          if @work was idle and we claimed PENDING
  *  -EAGAIN    if PENDING couldn't be grabbed at the moment, safe to busy-retry
  *  -ENOENT    if someone else is canceling @work, this state may persist
  *             for arbitrarily long
  *
+ * Note:
  * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
  * interrupted while holding PENDING and @work off queue, irq must be
  * disabled on entry.  This, combined with delayed_work->timer being
@@ -1216,7 +1266,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
                        pwq_activate_delayed_work(work);
 
                list_del_init(&work->entry);
-               pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
+               pwq_dec_nr_in_flight(pwq, get_work_color(work));
 
                /* work->data points to pwq iff queued, point to pool */
                set_work_pool_and_keep_pending(work, pool->id);
@@ -1302,7 +1352,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
 
        debug_work_activate(work);
 
-       /* if dying, only works from the same workqueue are allowed */
+       /* if draining, only works from the same workqueue are allowed */
        if (unlikely(wq->flags & __WQ_DRAINING) &&
            WARN_ON_ONCE(!is_chained_work(wq)))
                return;
@@ -1390,10 +1440,10 @@ retry:
  * @wq: workqueue to use
  * @work: work to queue
  *
- * Returns %false if @work was already on a queue, %true otherwise.
- *
  * We queue the work to a specific CPU, the caller must ensure it
  * can't go away.
+ *
+ * Return: %false if @work was already on a queue, %true otherwise.
  */
 bool queue_work_on(int cpu, struct workqueue_struct *wq,
                   struct work_struct *work)
@@ -1463,7 +1513,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
  * @dwork: work to queue
  * @delay: number of jiffies to wait before queueing
  *
- * Returns %false if @work was already on a queue, %true otherwise.  If
+ * Return: %false if @work was already on a queue, %true otherwise.  If
  * @delay is zero and @dwork is idle, it will be scheduled for immediate
  * execution.
  */
@@ -1499,7 +1549,7 @@ EXPORT_SYMBOL(queue_delayed_work_on);
  * zero, @work is guaranteed to be scheduled immediately regardless of its
  * current state.
  *
- * Returns %false if @dwork was idle and queued, %true if @dwork was
+ * Return: %false if @dwork was idle and queued, %true if @dwork was
  * pending and its timer was modified.
  *
  * This function is safe to call from any context including IRQ handler.
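As a usage sketch for the return semantics documented above (illustrative only; the helper name is invented):

/* (re)arm a poll; returns what mod_delayed_work() reported */
static bool rearm_poll(struct delayed_work *poll_work, unsigned long delay)
{
        /* false: @poll_work was idle and is now queued after @delay
         * true:  it was already pending, only its timer was pushed out */
        return mod_delayed_work(system_wq, poll_work, delay);
}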
@@ -1544,7 +1594,7 @@ static void worker_enter_idle(struct worker *worker)
                         (worker->hentry.next || worker->hentry.pprev)))
                return;
 
-       /* can't use worker_set_flags(), also called from start_worker() */
+       /* can't use worker_set_flags(), also called from create_worker() */
        worker->flags |= WORKER_IDLE;
        pool->nr_idle++;
        worker->last_active = jiffies;
@@ -1586,96 +1636,91 @@ static void worker_leave_idle(struct worker *worker)
        list_del_init(&worker->entry);
 }
 
-/**
- * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
- * @pool: target worker_pool
- *
- * Bind %current to the cpu of @pool if it is associated and lock @pool.
- *
- * Works which are scheduled while the cpu is online must at least be
- * scheduled to a worker which is bound to the cpu so that if they are
- * flushed from cpu callbacks while cpu is going down, they are
- * guaranteed to execute on the cpu.
- *
- * This function is to be used by unbound workers and rescuers to bind
- * themselves to the target cpu and may race with cpu going down or
- * coming online.  kthread_bind() can't be used because it may put the
- * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
- * verbatim as it's best effort and blocking and pool may be
- * [dis]associated in the meantime.
- *
- * This function tries set_cpus_allowed() and locks pool and verifies the
- * binding against %POOL_DISASSOCIATED which is set during
- * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
- * enters idle state or fetches works without dropping lock, it can
- * guarantee the scheduling requirement described in the first paragraph.
- *
- * CONTEXT:
- * Might sleep.  Called without any lock but returns with pool->lock
- * held.
- *
- * RETURNS:
- * %true if the associated pool is online (@worker is successfully
- * bound), %false if offline.
- */
-static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
-__acquires(&pool->lock)
-{
-       while (true) {
-               /*
-                * The following call may fail, succeed or succeed
-                * without actually migrating the task to the cpu if
-                * it races with cpu hotunplug operation.  Verify
-                * against POOL_DISASSOCIATED.
-                */
-               if (!(pool->flags & POOL_DISASSOCIATED))
-                       set_cpus_allowed_ptr(current, pool->attrs->cpumask);
-
-               spin_lock_irq(&pool->lock);
-               if (pool->flags & POOL_DISASSOCIATED)
-                       return false;
-               if (task_cpu(current) == pool->cpu &&
-                   cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask))
-                       return true;
-               spin_unlock_irq(&pool->lock);
-
-               /*
-                * We've raced with CPU hot[un]plug.  Give it a breather
-                * and retry migration.  cond_resched() is required here;
-                * otherwise, we might deadlock against cpu_stop trying to
-                * bring down the CPU on non-preemptive kernel.
-                */
-               cpu_relax();
-               cond_resched();
-       }
-}
-
-static struct worker *alloc_worker(void)
+static struct worker *alloc_worker(int node)
 {
        struct worker *worker;
 
-       worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+       worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
        if (worker) {
                INIT_LIST_HEAD(&worker->entry);
                INIT_LIST_HEAD(&worker->scheduled);
+               INIT_LIST_HEAD(&worker->node);
                /* on creation a worker is in !idle && prep state */
                worker->flags = WORKER_PREP;
        }
        return worker;
 }
 
+/**
+ * worker_attach_to_pool() - attach a worker to a pool
+ * @worker: worker to be attached
+ * @pool: the target pool
+ *
+ * Attach @worker to @pool.  Once attached, the %WORKER_UNBOUND flag and
+ * cpu-binding of @worker are kept coordinated with the pool across
+ * cpu-[un]hotplugs.
+ */
+static void worker_attach_to_pool(struct worker *worker,
+                                  struct worker_pool *pool)
+{
+       mutex_lock(&pool->attach_mutex);
+
+       /*
+        * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
+        * online CPUs.  It'll be re-applied when any of the CPUs come up.
+        */
+       set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+
+       /*
+        * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
+        * stable across this function.  See the comments above the
+        * flag definition for details.
+        */
+       if (pool->flags & POOL_DISASSOCIATED)
+               worker->flags |= WORKER_UNBOUND;
+
+       list_add_tail(&worker->node, &pool->workers);
+
+       mutex_unlock(&pool->attach_mutex);
+}
+
+/**
+ * worker_detach_from_pool() - detach a worker from its pool
+ * @worker: worker which is attached to its pool
+ * @pool: the pool @worker is attached to
+ *
+ * Undo the attaching which had been done in worker_attach_to_pool().  The
+ * caller worker shouldn't access the pool after detaching unless it has
+ * another reference to the pool.
+ */
+static void worker_detach_from_pool(struct worker *worker,
+                                   struct worker_pool *pool)
+{
+       struct completion *detach_completion = NULL;
+
+       mutex_lock(&pool->attach_mutex);
+       list_del(&worker->node);
+       if (list_empty(&pool->workers))
+               detach_completion = pool->detach_completion;
+       mutex_unlock(&pool->attach_mutex);
+
+       /* clear leftover flags without pool->lock after it is detached */
+       worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
+
+       if (detach_completion)
+               complete(detach_completion);
+}
+
 /**
  * create_worker - create a new workqueue worker
  * @pool: pool the new worker will belong to
  *
- * Create a new worker which is bound to @pool.  The returned worker
- * can be started by calling start_worker() or destroyed using
- * destroy_worker().
+ * Create and start a new worker which is attached to @pool.
  *
  * CONTEXT:
  * Might sleep.  Does GFP_KERNEL allocations.
  *
- * RETURNS:
+ * Return:
  * Pointer to the newly created worker.
  */
 static struct worker *create_worker(struct worker_pool *pool)
@@ -1684,23 +1729,12 @@ static struct worker *create_worker(struct worker_pool *pool)
        int id = -1;
        char id_buf[16];
 
-       lockdep_assert_held(&pool->manager_mutex);
-
-       /*
-        * ID is needed to determine kthread name.  Allocate ID first
-        * without installing the pointer.
-        */
-       idr_preload(GFP_KERNEL);
-       spin_lock_irq(&pool->lock);
-
-       id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);
-
-       spin_unlock_irq(&pool->lock);
-       idr_preload_end();
+       /* ID is needed to determine kthread name */
+       id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
        if (id < 0)
                goto fail;
 
-       worker = alloc_worker();
+       worker = alloc_worker(pool->node);
        if (!worker)
                goto fail;
 
@@ -1718,119 +1752,56 @@ static struct worker *create_worker(struct worker_pool *pool)
        if (IS_ERR(worker->task))
                goto fail;
 
-       /*
-        * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
-        * online CPUs.  It'll be re-applied when any of the CPUs come up.
-        */
        set_user_nice(worker->task, pool->attrs->nice);
-       set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
-
-       /* prevent userland from meddling with cpumask of workqueue workers */
-       worker->task->flags |= PF_NO_SETAFFINITY;
+       kthread_bind_mask(worker->task, pool->attrs->cpumask);
 
-       /*
-        * The caller is responsible for ensuring %POOL_DISASSOCIATED
-        * remains stable across this function.  See the comments above the
-        * flag definition for details.
-        */
-       if (pool->flags & POOL_DISASSOCIATED)
-               worker->flags |= WORKER_UNBOUND;
+       /* successful, attach the worker to the pool */
+       worker_attach_to_pool(worker, pool);
 
-       /* successful, commit the pointer to idr */
+       /* start the newly created worker */
        spin_lock_irq(&pool->lock);
-       idr_replace(&pool->worker_idr, worker, worker->id);
+       worker->pool->nr_workers++;
+       worker_enter_idle(worker);
+       wake_up_process(worker->task);
        spin_unlock_irq(&pool->lock);
 
        return worker;
 
 fail:
-       if (id >= 0) {
-               spin_lock_irq(&pool->lock);
-               idr_remove(&pool->worker_idr, id);
-               spin_unlock_irq(&pool->lock);
-       }
+       if (id >= 0)
+               ida_simple_remove(&pool->worker_ida, id);
        kfree(worker);
        return NULL;
 }
 
-/**
- * start_worker - start a newly created worker
- * @worker: worker to start
- *
- * Make the pool aware of @worker and start it.
- *
- * CONTEXT:
- * spin_lock_irq(pool->lock).
- */
-static void start_worker(struct worker *worker)
-{
-       worker->flags |= WORKER_STARTED;
-       worker->pool->nr_workers++;
-       worker_enter_idle(worker);
-       wake_up_process(worker->task);
-}
-
-/**
- * create_and_start_worker - create and start a worker for a pool
- * @pool: the target pool
- *
- * Grab the managership of @pool and create and start a new worker for it.
- */
-static int create_and_start_worker(struct worker_pool *pool)
-{
-       struct worker *worker;
-
-       mutex_lock(&pool->manager_mutex);
-
-       worker = create_worker(pool);
-       if (worker) {
-               spin_lock_irq(&pool->lock);
-               start_worker(worker);
-               spin_unlock_irq(&pool->lock);
-       }
-
-       mutex_unlock(&pool->manager_mutex);
-
-       return worker ? 0 : -ENOMEM;
-}
-
 /**
  * destroy_worker - destroy a workqueue worker
  * @worker: worker to be destroyed
  *
- * Destroy @worker and adjust @pool stats accordingly.
+ * Destroy @worker and adjust @pool stats accordingly.  The worker should
+ * be idle.
  *
  * CONTEXT:
- * spin_lock_irq(pool->lock) which is released and regrabbed.
+ * spin_lock_irq(pool->lock).
  */
 static void destroy_worker(struct worker *worker)
 {
        struct worker_pool *pool = worker->pool;
 
-       lockdep_assert_held(&pool->manager_mutex);
        lockdep_assert_held(&pool->lock);
 
        /* sanity check frenzy */
        if (WARN_ON(worker->current_work) ||
-           WARN_ON(!list_empty(&worker->scheduled)))
+           WARN_ON(!list_empty(&worker->scheduled)) ||
+           WARN_ON(!(worker->flags & WORKER_IDLE)))
                return;
 
-       if (worker->flags & WORKER_STARTED)
-               pool->nr_workers--;
-       if (worker->flags & WORKER_IDLE)
-               pool->nr_idle--;
+       pool->nr_workers--;
+       pool->nr_idle--;
 
        list_del_init(&worker->entry);
        worker->flags |= WORKER_DIE;
-
-       idr_remove(&pool->worker_idr, worker->id);
-
-       spin_unlock_irq(&pool->lock);
-
-       kthread_stop(worker->task);
-       kfree(worker);
-
-       spin_lock_irq(&pool->lock);
+       wake_up_process(worker->task);
 }
 
 static void idle_worker_timeout(unsigned long __pool)
@@ -1839,7 +1810,7 @@ static void idle_worker_timeout(unsigned long __pool)
 
        spin_lock_irq(&pool->lock);
 
-       if (too_many_workers(pool)) {
+       while (too_many_workers(pool)) {
                struct worker *worker;
                unsigned long expires;
 
@@ -1847,13 +1818,12 @@ static void idle_worker_timeout(unsigned long __pool)
                worker = list_entry(pool->idle_list.prev, struct worker, entry);
                expires = worker->last_active + IDLE_WORKER_TIMEOUT;
 
-               if (time_before(jiffies, expires))
+               if (time_before(jiffies, expires)) {
                        mod_timer(&pool->idle_timer, expires);
-               else {
-                       /* it's been idle for too long, wake up manager */
-                       pool->flags |= POOL_MANAGE_WORKERS;
-                       wake_up_worker(pool);
+                       break;
                }
+
+               destroy_worker(worker);
        }
 
        spin_unlock_irq(&pool->lock);
@@ -1871,6 +1841,12 @@ static void send_mayday(struct work_struct *work)
 
        /* mayday mayday mayday */
        if (list_empty(&pwq->mayday_node)) {
+               /*
+                * If @pwq is for an unbound wq, its base ref may be put at
+                * any time due to an attribute change.  Pin @pwq until the
+                * rescuer is done with it.
+                */
+               get_pwq(pwq);
                list_add_tail(&pwq->mayday_node, &wq->maydays);
                wake_up_process(wq->rescuer->task);
        }
@@ -1881,8 +1857,8 @@ static void pool_mayday_timeout(unsigned long __pool)
        struct worker_pool *pool = (void *)__pool;
        struct work_struct *work;
 
-       spin_lock_irq(&wq_mayday_lock);         /* for wq->maydays */
-       spin_lock(&pool->lock);
+       spin_lock_irq(&pool->lock);
+       spin_lock(&wq_mayday_lock);             /* for wq->maydays */
 
        if (need_to_create_worker(pool)) {
                /*
@@ -1895,8 +1871,8 @@ static void pool_mayday_timeout(unsigned long __pool)
                        send_mayday(work);
        }
 
-       spin_unlock(&pool->lock);
-       spin_unlock_irq(&wq_mayday_lock);
+       spin_unlock(&wq_mayday_lock);
+       spin_unlock_irq(&pool->lock);
 
        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
 }
@@ -1918,17 +1894,11 @@ static void pool_mayday_timeout(unsigned long __pool)
  * spin_lock_irq(pool->lock) which may be released and regrabbed
  * multiple times.  Does GFP_KERNEL allocations.  Called only from
  * manager.
- *
- * RETURNS:
- * %false if no action was taken and pool->lock stayed locked, %true
- * otherwise.
  */
-static bool maybe_create_worker(struct worker_pool *pool)
+static void maybe_create_worker(struct worker_pool *pool)
 __releases(&pool->lock)
 __acquires(&pool->lock)
 {
-       if (!need_to_create_worker(pool))
-               return false;
 restart:
        spin_unlock_irq(&pool->lock);
 
@@ -1936,23 +1906,10 @@ restart:
        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
 
        while (true) {
-               struct worker *worker;
-
-               worker = create_worker(pool);
-               if (worker) {
-                       del_timer_sync(&pool->mayday_timer);
-                       spin_lock_irq(&pool->lock);
-                       start_worker(worker);
-                       if (WARN_ON_ONCE(need_to_create_worker(pool)))
-                               goto restart;
-                       return true;
-               }
-
-               if (!need_to_create_worker(pool))
+               if (create_worker(pool) || !need_to_create_worker(pool))
                        break;
 
-               __set_current_state(TASK_INTERRUPTIBLE);
-               schedule_timeout(CREATE_COOLDOWN);
+               schedule_timeout_interruptible(CREATE_COOLDOWN);
 
                if (!need_to_create_worker(pool))
                        break;
@@ -1960,47 +1917,13 @@ restart:
 
        del_timer_sync(&pool->mayday_timer);
        spin_lock_irq(&pool->lock);
+       /*
+        * This is necessary even after a new worker was just successfully
+        * created as @pool->lock was dropped and the new worker might have
+        * already become busy.
+        */
        if (need_to_create_worker(pool))
                goto restart;
-       return true;
-}
-
-/**
- * maybe_destroy_worker - destroy workers which have been idle for a while
- * @pool: pool to destroy workers for
- *
- * Destroy @pool workers which have been idle for longer than
- * IDLE_WORKER_TIMEOUT.
- *
- * LOCKING:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
- * multiple times.  Called only from manager.
- *
- * RETURNS:
- * %false if no action was taken and pool->lock stayed locked, %true
- * otherwise.
- */
-static bool maybe_destroy_workers(struct worker_pool *pool)
-{
-       bool ret = false;
-
-       while (too_many_workers(pool)) {
-               struct worker *worker;
-               unsigned long expires;
-
-               worker = list_entry(pool->idle_list.prev, struct worker, entry);
-               expires = worker->last_active + IDLE_WORKER_TIMEOUT;
-
-               if (time_before(jiffies, expires)) {
-                       mod_timer(&pool->idle_timer, expires);
-                       break;
-               }
-
-               destroy_worker(worker);
-               ret = true;
-       }
-
-       return ret;
 }
 
 /**
@@ -2019,18 +1942,17 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
  * spin_lock_irq(pool->lock) which may be released and regrabbed
  * multiple times.  Does GFP_KERNEL allocations.
  *
- * RETURNS:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
- * multiple times.  Does GFP_KERNEL allocations.
+ * Return:
+ * %false if the pool doesn't need management and the caller can safely
+ * start processing works, %true if management function was performed and
+ * the conditions that the caller verified before calling the function may
+ * no longer be true.
  */
 static bool manage_workers(struct worker *worker)
 {
        struct worker_pool *pool = worker->pool;
-       bool ret = false;
 
        /*
-        * Managership is governed by two mutexes - manager_arb and
-        * manager_mutex.  manager_arb handles arbitration of manager role.
         * Anyone who successfully grabs manager_arb wins the arbitration
         * and becomes the manager.  mutex_trylock() on pool->manager_arb
         * failure while holding pool->lock reliably indicates that someone
@@ -2039,42 +1961,16 @@ static bool manage_workers(struct worker *worker)
         * grabbing manager_arb is responsible for actually performing
         * manager duties.  If manager_arb is grabbed and released without
         * actual management, the pool may stall indefinitely.
-        *
-        * manager_mutex is used for exclusion of actual management
-        * operations.  The holder of manager_mutex can be sure that none
-        * of management operations, including creation and destruction of
-        * workers, won't take place until the mutex is released.  Because
-        * manager_mutex doesn't interfere with manager role arbitration,
-        * it is guaranteed that the pool's management, while may be
-        * delayed, won't be disturbed by someone else grabbing
-        * manager_mutex.
         */
        if (!mutex_trylock(&pool->manager_arb))
-               return ret;
-
-       /*
-        * With manager arbitration won, manager_mutex would be free in
-        * most cases.  trylock first without dropping @pool->lock.
-        */
-       if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
-               spin_unlock_irq(&pool->lock);
-               mutex_lock(&pool->manager_mutex);
-               spin_lock_irq(&pool->lock);
-               ret = true;
-       }
-
-       pool->flags &= ~POOL_MANAGE_WORKERS;
+               return false;
+       pool->manager = worker;
 
-       /*
-        * Destroy and then create so that may_start_working() is true
-        * on return.
-        */
-       ret |= maybe_destroy_workers(pool);
-       ret |= maybe_create_worker(pool);
+       maybe_create_worker(pool);
 
-       mutex_unlock(&pool->manager_mutex);
+       pool->manager = NULL;
        mutex_unlock(&pool->manager_arb);
-       return ret;
+       return true;
 }
 
 /**
@@ -2112,13 +2008,8 @@ __acquires(&pool->lock)
 
        lockdep_copy_map(&lockdep_map, &work->lockdep_map);
 #endif
-       /*
-        * Ensure we're on the correct CPU.  DISASSOCIATED test is
-        * necessary to avoid spurious warnings from rescuers servicing the
-        * unbound or a disassociated pool.
-        */
-       WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
-                    !(pool->flags & POOL_DISASSOCIATED) &&
+       /* ensure we're on the correct CPU */
+       WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
                     raw_smp_processor_id() != pool->cpu);
 
        /*
@@ -2144,17 +2035,22 @@ __acquires(&pool->lock)
        list_del_init(&work->entry);
 
        /*
-        * CPU intensive works don't participate in concurrency
-        * management.  They're the scheduler's responsibility.
+        * CPU intensive works don't participate in concurrency management.
+        * They're the scheduler's responsibility.  This takes @worker out
+        * of concurrency management and the next code block will chain
+        * execution of the pending work items.
         */
        if (unlikely(cpu_intensive))
-               worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
+               worker_set_flags(worker, WORKER_CPU_INTENSIVE);
 
        /*
-        * Unbound pool isn't concurrency managed and work items should be
-        * executed ASAP.  Wake up another worker if necessary.
+        * Wake up another worker if necessary.  The condition is always
+        * false for normal per-cpu workers since nr_running would always
+        * be >= 1 at this point.  This is used to chain execution of the
+        * pending work items for WORKER_NOT_RUNNING workers such as the
+        * UNBOUND and CPU_INTENSIVE ones.
         */
-       if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
+       if (need_more_worker(pool))
                wake_up_worker(pool);
 
        /*
@@ -2188,6 +2084,16 @@ __acquires(&pool->lock)
                dump_stack();
        }
 
+       /*
+        * The following prevents a kworker from hogging CPU on !PREEMPT
+        * kernels, where a requeueing work item waiting for something to
+        * happen could deadlock with stop_machine as such a work item could
+        * indefinitely requeue itself while all other CPUs are trapped in
+        * stop_machine. At the same time, report a quiescent RCU state so
+        * the same condition doesn't freeze RCU.
+        */
+       cond_resched_rcu_qs();
+
        spin_lock_irq(&pool->lock);
 
        /* clear cpu intensive status */
@@ -2233,6 +2139,8 @@ static void process_scheduled_works(struct worker *worker)
  * work items regardless of their specific target workqueue.  The only
  * exception is work items which belong to workqueues with a rescuer which
  * will be explained in rescuer_thread().
+ *
+ * Return: 0
  */
 static int worker_thread(void *__worker)
 {
@@ -2249,6 +2157,11 @@ woke_up:
                spin_unlock_irq(&pool->lock);
                WARN_ON_ONCE(!list_empty(&worker->entry));
                worker->task->flags &= ~PF_WQ_WORKER;
+
+               set_task_comm(worker->task, "kworker/dying");
+               ida_simple_remove(&pool->worker_ida, worker->id);
+               worker_detach_from_pool(worker, pool);
+               kfree(worker);
                return 0;
        }
 
@@ -2294,11 +2207,8 @@ recheck:
                }
        } while (keep_working(pool));
 
-       worker_set_flags(worker, WORKER_PREP, false);
+       worker_set_flags(worker, WORKER_PREP);
 sleep:
-       if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
-               goto recheck;
-
        /*
         * pool->lock is held and there's no work to process and no need to
         * manage, sleep.  Workers are woken up only while holding
@@ -2331,12 +2241,15 @@ sleep:
  * those works so that forward progress can be guaranteed.
  *
  * This should happen rarely.
+ *
+ * Return: 0
  */
 static int rescuer_thread(void *__rescuer)
 {
        struct worker *rescuer = __rescuer;
        struct workqueue_struct *wq = rescuer->rescue_wq;
        struct list_head *scheduled = &rescuer->scheduled;
+       bool should_stop;
 
        set_user_nice(current, RESCUER_NICE_LEVEL);
 
@@ -2348,11 +2261,15 @@ static int rescuer_thread(void *__rescuer)
 repeat:
        set_current_state(TASK_INTERRUPTIBLE);
 
-       if (kthread_should_stop()) {
-               __set_current_state(TASK_RUNNING);
-               rescuer->task->flags &= ~PF_WQ_WORKER;
-               return 0;
-       }
+       /*
+        * By the time the rescuer is requested to stop, the workqueue
+        * shouldn't have any work pending, but @wq->maydays may still have
+        * pwq(s) queued.  This can happen when non-rescuer workers consume
+        * all the work items before the rescuer gets to them.  Go through
+        * @wq->maydays processing before acting on should_stop so that the
+        * list is always empty on exit.
+        */
+       should_stop = kthread_should_stop();
 
        /* see whether any pwq is asking for help */
        spin_lock_irq(&wq_mayday_lock);
@@ -2368,36 +2285,70 @@ repeat:
 
                spin_unlock_irq(&wq_mayday_lock);
 
-               /* migrate to the target cpu if possible */
-               worker_maybe_bind_and_lock(pool);
+               worker_attach_to_pool(rescuer, pool);
+
+               spin_lock_irq(&pool->lock);
                rescuer->pool = pool;
 
                /*
                 * Slurp in all works issued via this workqueue and
                 * process'em.
                 */
-               WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
+               WARN_ON_ONCE(!list_empty(scheduled));
                list_for_each_entry_safe(work, n, &pool->worklist, entry)
                        if (get_work_pwq(work) == pwq)
                                move_linked_works(work, scheduled, &n);
 
-               process_scheduled_works(rescuer);
+               if (!list_empty(scheduled)) {
+                       process_scheduled_works(rescuer);
+
+                       /*
+                        * The above execution of rescued work items could
+                        * have created more to rescue through
+                        * pwq_activate_first_delayed() or chained
+                        * queueing.  Let's put @pwq back on mayday list so
+                        * that such back-to-back work items, which may be
+                        * being used to relieve memory pressure, don't
+                        * incur MAYDAY_INTERVAL delay in between.
+                        */
+                       if (need_to_create_worker(pool)) {
+                               spin_lock(&wq_mayday_lock);
+                               get_pwq(pwq);
+                               list_move_tail(&pwq->mayday_node, &wq->maydays);
+                               spin_unlock(&wq_mayday_lock);
+                       }
+               }
 
                /*
-                * Leave this pool.  If keep_working() is %true, notify a
+                * Put the reference grabbed by send_mayday().  @pool won't
+                * go away while we're still attached to it.
+                */
+               put_pwq(pwq);
+
+               /*
+                * Leave this pool.  If need_more_worker() is %true, notify a
                 * regular worker; otherwise, we end up with 0 concurrency
                 * and stalling the execution.
                 */
-               if (keep_working(pool))
+               if (need_more_worker(pool))
                        wake_up_worker(pool);
 
                rescuer->pool = NULL;
-               spin_unlock(&pool->lock);
-               spin_lock(&wq_mayday_lock);
+               spin_unlock_irq(&pool->lock);
+
+               worker_detach_from_pool(rescuer, pool);
+
+               spin_lock_irq(&wq_mayday_lock);
        }
 
        spin_unlock_irq(&wq_mayday_lock);
 
+       if (should_stop) {
+               __set_current_state(TASK_RUNNING);
+               rescuer->task->flags &= ~PF_WQ_WORKER;
+               return 0;
+       }
+
        /* rescuers should never participate in concurrency management */
        WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
        schedule();
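The rescuer path above only exists for workqueues created with WQ_MEM_RECLAIM; a hedged sketch of such a creation (all names are hypothetical):

static struct workqueue_struct *reclaim_wq;     /* hypothetical */

static int reclaim_wq_init(void)
{
        /* WQ_MEM_RECLAIM allocates a rescuer so work items on the memory
         * reclaim path keep making forward progress even when no new
         * kworkers can be created */
        reclaim_wq = alloc_workqueue("example_reclaim", WQ_MEM_RECLAIM, 0);
        return reclaim_wq ? 0 : -ENOMEM;
}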
@@ -2407,6 +2358,7 @@ repeat:
 struct wq_barrier {
        struct work_struct      work;
        struct completion       done;
+       struct task_struct      *task;  /* purely informational */
 };
 
 static void wq_barrier_func(struct work_struct *work)
@@ -2455,6 +2407,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
        INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
        __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
        init_completion(&barr->done);
+       barr->task = current;
 
        /*
         * If @target is currently being executed, schedule the
@@ -2503,7 +2456,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
  * CONTEXT:
  * mutex_lock(wq->mutex).
  *
- * RETURNS:
+ * Return:
  * %true if @flush_color >= 0 and there's something to flush.  %false
  * otherwise.
  */
@@ -2698,7 +2651,7 @@ void flush_workqueue(struct workqueue_struct *wq)
 out_unlock:
        mutex_unlock(&wq->mutex);
 }
-EXPORT_SYMBOL_GPL(flush_workqueue);
+EXPORT_SYMBOL(flush_workqueue);
 
 /**
  * drain_workqueue - drain a workqueue
@@ -2707,7 +2660,7 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
  * Wait until the workqueue becomes empty.  While draining is in progress,
  * only chain queueing is allowed.  IOW, only currently pending or running
  * work items on @wq can queue further work items on it.  @wq is flushed
- * repeatedly until it becomes empty.  The number of flushing is detemined
+ * repeatedly until it becomes empty.  The number of flushes is determined
  * by the depth of chaining and should be relatively short.  Whine if it
  * takes too long.
  */
@@ -2811,7 +2764,7 @@ already_gone:
  * Wait until @work has finished execution.  @work is guaranteed to be idle
  * on return if it hasn't been requeued since flush started.
  *
- * RETURNS:
+ * Return:
  * %true if flush_work() waited for the work to finish execution,
  * %false if it was already idle.
  */
@@ -2832,19 +2785,57 @@ bool flush_work(struct work_struct *work)
 }
 EXPORT_SYMBOL_GPL(flush_work);
 
+struct cwt_wait {
+       wait_queue_t            wait;
+       struct work_struct      *work;
+};
+
+static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+       struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
+
+       if (cwait->work != key)
+               return 0;
+       return autoremove_wake_function(wait, mode, sync, key);
+}
+
 static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 {
+       static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
        unsigned long flags;
        int ret;
 
        do {
                ret = try_to_grab_pending(work, is_dwork, &flags);
                /*
-                * If someone else is canceling, wait for the same event it
-                * would be waiting for before retrying.
+                * If someone else is already canceling, wait for it to
+                * finish.  flush_work() doesn't work for PREEMPT_NONE
+                * because we may get scheduled between @work's completion
+                * and the other canceling task resuming and clearing
+                * CANCELING - flush_work() will return false immediately
+                * as @work is no longer busy, try_to_grab_pending() will
+                * return -ENOENT as @work is still being canceled and the
+                * other canceling task won't be able to clear CANCELING as
+                * we're hogging the CPU.
+                *
+                * Let's wait for completion using a waitqueue.  As this
+                * may lead to the thundering herd problem, use a custom
+                * wake function which matches @work along with exclusive
+                * wait and wakeup.
                 */
-               if (unlikely(ret == -ENOENT))
-                       flush_work(work);
+               if (unlikely(ret == -ENOENT)) {
+                       struct cwt_wait cwait;
+
+                       init_wait(&cwait.wait);
+                       cwait.wait.func = cwt_wakefn;
+                       cwait.work = work;
+
+                       prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
+                                                 TASK_UNINTERRUPTIBLE);
+                       if (work_is_canceling(work))
+                               schedule();
+                       finish_wait(&cancel_waitq, &cwait.wait);
+               }
        } while (unlikely(ret < 0));
 
        /* tell other tasks trying to grab @work to back off */
@@ -2853,6 +2844,16 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 
        flush_work(work);
        clear_work_data(work);
+
+       /*
+        * Paired with prepare_to_wait() above so that either
+        * waitqueue_active() is visible here or !work_is_canceling() is
+        * visible there.
+        */
+       smp_mb();
+       if (waitqueue_active(&cancel_waitq))
+               __wake_up(&cancel_waitq, TASK_NORMAL, 1, work);
+
        return ret;
 }
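
A caller-side sketch of what __cancel_work_timer() guarantees, using a hypothetical self-requeueing work item (the example_* names are made up): cancel_work_sync() terminates even though the callback re-arms itself, and a concurrent canceller now sleeps on cancel_waitq as described in the comment above.

#include <linux/workqueue.h>

static void example_poll_fn(struct work_struct *work)
{
        /* ... one round of polling ... */
        queue_work(system_wq, work);            /* re-arm */
}
static DECLARE_WORK(example_poll, example_poll_fn);

static void example_stop(void)
{
        /*
         * Per the cancel_work_sync() contract this is safe even though
         * the callback requeues itself: requeueing fails while the work
         * is marked CANCELING, and any concurrent canceller waits on
         * cancel_waitq instead of hogging the CPU.
         */
        cancel_work_sync(&example_poll);
}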
 
@@ -2871,7 +2872,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
  * The caller must ensure that the workqueue on which @work was last
  * queued can't be destroyed before this function returns.
  *
- * RETURNS:
+ * Return:
  * %true if @work was pending, %false otherwise.
  */
 bool cancel_work_sync(struct work_struct *work)
@@ -2888,7 +2889,7 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
  * immediate execution.  Like flush_work(), this function only
  * considers the last queueing instance of @dwork.
  *
- * RETURNS:
+ * Return:
  * %true if flush_work() waited for the work to finish execution,
  * %false if it was already idle.
  */
@@ -2906,11 +2907,15 @@ EXPORT_SYMBOL(flush_delayed_work);
  * cancel_delayed_work - cancel a delayed work
  * @dwork: delayed_work to cancel
  *
- * Kill off a pending delayed_work.  Returns %true if @dwork was pending
- * and canceled; %false if wasn't pending.  Note that the work callback
- * function may still be running on return, unless it returns %true and the
- * work doesn't re-arm itself.  Explicitly flush or use
- * cancel_delayed_work_sync() to wait on it.
+ * Kill off a pending delayed_work.
+ *
+ * Return: %true if @dwork was pending and canceled; %false if it wasn't
+ * pending.
+ *
+ * Note:
+ * The work callback function may still be running on return, unless
+ * it returns %true and the work doesn't re-arm itself.  Explicitly flush or
+ * use cancel_delayed_work_sync() to wait on it.
  *
  * This function is safe to call from any context including IRQ handler.
  */
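
A sketch contrasting cancel_delayed_work() with its _sync variant for a hypothetical self re-arming timeout handler (the example_* names are not from this file).

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void example_timeout_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(example_timeout, example_timeout_fn);

static void example_timeout_fn(struct work_struct *work)
{
        /* ... periodic processing ... */
        schedule_delayed_work(&example_timeout, HZ);    /* re-arm in 1s */
}

static void example_teardown(void)
{
        /*
         * cancel_delayed_work() may return while the callback is still
         * running (and re-arming); use the _sync variant when the
         * callback must have finished, e.g. before freeing its data.
         */
        cancel_delayed_work_sync(&example_timeout);
}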
@@ -2939,7 +2944,7 @@ EXPORT_SYMBOL(cancel_delayed_work);
  *
  * This is cancel_work_sync() for delayed works.
  *
- * RETURNS:
+ * Return:
  * %true if @dwork was pending, %false otherwise.
  */
 bool cancel_delayed_work_sync(struct delayed_work *dwork)
@@ -2956,7 +2961,7 @@ EXPORT_SYMBOL(cancel_delayed_work_sync);
  * system workqueue and blocks until all CPUs have completed.
  * schedule_on_each_cpu() is very slow.
  *
- * RETURNS:
+ * Return:
  * 0 on success, -errno on failure.
  */
 int schedule_on_each_cpu(work_func_t func)
@@ -2985,36 +2990,6 @@ int schedule_on_each_cpu(work_func_t func)
        return 0;
 }
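
A minimal sketch of schedule_on_each_cpu() as documented above; the example_* names are hypothetical.

#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/printk.h>

static void example_percpu_fn(struct work_struct *work)
{
        /* runs once on each online CPU, in process context */
        pr_info("example: running on CPU %d\n", raw_smp_processor_id());
}

static int example_run_on_all_cpus(void)
{
        return schedule_on_each_cpu(example_percpu_fn); /* 0 or -errno */
}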
 
-/**
- * flush_scheduled_work - ensure that any scheduled work has run to completion.
- *
- * Forces execution of the kernel-global workqueue and blocks until its
- * completion.
- *
- * Think twice before calling this function!  It's very easy to get into
- * trouble if you don't take great care.  Either of the following situations
- * will lead to deadlock:
- *
- *     One of the work items currently on the workqueue needs to acquire
- *     a lock held by your code or its caller.
- *
- *     Your code is running in the context of a work routine.
- *
- * They will be detected by lockdep when they occur, but the first might not
- * occur very often.  It depends on what work items are on the workqueue and
- * what locks they need, which you have no control over.
- *
- * In most situations flushing the entire workqueue is overkill; you merely
- * need to know that a particular work item isn't queued and isn't running.
- * In such cases you should use cancel_delayed_work_sync() or
- * cancel_work_sync() instead.
- */
-void flush_scheduled_work(void)
-{
-       flush_workqueue(system_wq);
-}
-EXPORT_SYMBOL(flush_scheduled_work);
-
 /**
  * execute_in_process_context - reliably execute the routine with user context
  * @fn:                the function to execute
@@ -3024,7 +2999,7 @@ EXPORT_SYMBOL(flush_scheduled_work);
  * Executes the function immediately if process context is available,
  * otherwise schedules the function for delayed execution.
  *
- * Returns:    0 - function was executed
+ * Return:   0 - function was executed
  *             1 - function was scheduled for execution
  */
 int execute_in_process_context(work_func_t fn, struct execute_work *ew)
@@ -3041,628 +3016,351 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
 }
 EXPORT_SYMBOL_GPL(execute_in_process_context);
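
A usage sketch for execute_in_process_context(): a release path that may be called from atomic context defers the sleeping part when necessary.  The example_* names are hypothetical.

#include <linux/workqueue.h>

static struct execute_work example_ew;

static void example_cleanup(struct work_struct *work)
{
        /* may sleep - always ends up running in process context */
}

static void example_release(void)
{
        int deferred = execute_in_process_context(example_cleanup, &example_ew);

        /* 0: ran inline, 1: queued for later execution */
        (void)deferred;
}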
 
-#ifdef CONFIG_SYSFS
-/*
- * Workqueues with WQ_SYSFS flag set is visible to userland via
- * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
- * following attributes.
- *
- *  per_cpu    RO bool : whether the workqueue is per-cpu or unbound
- *  max_active RW int  : maximum number of in-flight work items
- *
- * Unbound workqueues have the following extra attributes.
+/**
+ * free_workqueue_attrs - free a workqueue_attrs
+ * @attrs: workqueue_attrs to free
  *
- *  id         RO int  : the associated pool ID
- *  nice       RW int  : nice value of the workers
- *  cpumask    RW mask : bitmask of allowed CPUs for the workers
+ * Undo alloc_workqueue_attrs().
  */
-struct wq_device {
-       struct workqueue_struct         *wq;
-       struct device                   dev;
-};
-
-static struct workqueue_struct *dev_to_wq(struct device *dev)
+void free_workqueue_attrs(struct workqueue_attrs *attrs)
 {
-       struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
-       return wq_dev->wq;
+       if (attrs) {
+               free_cpumask_var(attrs->cpumask);
+               kfree(attrs);
+       }
 }
 
-static ssize_t wq_per_cpu_show(struct device *dev,
-                              struct device_attribute *attr, char *buf)
+/**
+ * alloc_workqueue_attrs - allocate a workqueue_attrs
+ * @gfp_mask: allocation mask to use
+ *
+ * Allocate a new workqueue_attrs, initialize with default settings and
+ * return it.
+ *
+ * Return: The allocated new workqueue_attrs on success. %NULL on failure.
+ */
+struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
+       struct workqueue_attrs *attrs;
 
-       return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
+       attrs = kzalloc(sizeof(*attrs), gfp_mask);
+       if (!attrs)
+               goto fail;
+       if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
+               goto fail;
+
+       cpumask_copy(attrs->cpumask, cpu_possible_mask);
+       return attrs;
+fail:
+       free_workqueue_attrs(attrs);
+       return NULL;
 }
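
A sketch of building and handing back a workqueue_attrs, assuming an in-kernel caller; example_make_attrs() and its nice value are hypothetical.

#include <linux/workqueue.h>
#include <linux/cpumask.h>
#include <linux/gfp.h>

static struct workqueue_attrs *example_make_attrs(const struct cpumask *mask)
{
        struct workqueue_attrs *attrs;

        attrs = alloc_workqueue_attrs(GFP_KERNEL);
        if (!attrs)
                return NULL;

        attrs->nice = 10;                       /* below default priority */
        cpumask_copy(attrs->cpumask, mask);
        return attrs;   /* caller releases with free_workqueue_attrs() */
}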
 
-static ssize_t wq_max_active_show(struct device *dev,
-                                 struct device_attribute *attr, char *buf)
+static void copy_workqueue_attrs(struct workqueue_attrs *to,
+                                const struct workqueue_attrs *from)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
+       to->nice = from->nice;
+       cpumask_copy(to->cpumask, from->cpumask);
+       /*
+        * Unlike hash and equality test, this function doesn't ignore
+        * ->no_numa as it is used for both pool and wq attrs.  Instead,
+        * get_unbound_pool() explicitly clears ->no_numa after copying.
+        */
+       to->no_numa = from->no_numa;
 }
 
-static ssize_t wq_max_active_store(struct device *dev,
-                                  struct device_attribute *attr,
-                                  const char *buf, size_t count)
+/* hash value of the content of @attr */
+static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       int val;
-
-       if (sscanf(buf, "%d", &val) != 1 || val <= 0)
-               return -EINVAL;
+       u32 hash = 0;
 
-       workqueue_set_max_active(wq, val);
-       return count;
+       hash = jhash_1word(attrs->nice, hash);
+       hash = jhash(cpumask_bits(attrs->cpumask),
+                    BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
+       return hash;
 }
 
-static struct device_attribute wq_sysfs_attrs[] = {
-       __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL),
-       __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store),
-       __ATTR_NULL,
-};
-
-static ssize_t wq_pool_ids_show(struct device *dev,
-                               struct device_attribute *attr, char *buf)
+/* content equality test */
+static bool wqattrs_equal(const struct workqueue_attrs *a,
+                         const struct workqueue_attrs *b)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       const char *delim = "";
-       int node, written = 0;
-
-       rcu_read_lock_sched();
-       for_each_node(node) {
-               written += scnprintf(buf + written, PAGE_SIZE - written,
-                                    "%s%d:%d", delim, node,
-                                    unbound_pwq_by_node(wq, node)->pool->id);
-               delim = " ";
-       }
-       written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
-       rcu_read_unlock_sched();
-
-       return written;
+       if (a->nice != b->nice)
+               return false;
+       if (!cpumask_equal(a->cpumask, b->cpumask))
+               return false;
+       return true;
 }
 
-static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
-                           char *buf)
+/**
+ * init_worker_pool - initialize a newly zalloc'd worker_pool
+ * @pool: worker_pool to initialize
+ *
+ * Initialize a newly zalloc'd @pool.  It also allocates @pool->attrs.
+ *
+ * Return: 0 on success, -errno on failure.  Even on failure, all fields
+ * inside @pool proper are initialized and put_unbound_pool() can be called
+ * on @pool safely to release it.
+ */
+static int init_worker_pool(struct worker_pool *pool)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       int written;
+       spin_lock_init(&pool->lock);
+       pool->id = -1;
+       pool->cpu = -1;
+       pool->node = NUMA_NO_NODE;
+       pool->flags |= POOL_DISASSOCIATED;
+       INIT_LIST_HEAD(&pool->worklist);
+       INIT_LIST_HEAD(&pool->idle_list);
+       hash_init(pool->busy_hash);
 
-       mutex_lock(&wq->mutex);
-       written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
-       mutex_unlock(&wq->mutex);
+       init_timer_deferrable(&pool->idle_timer);
+       pool->idle_timer.function = idle_worker_timeout;
+       pool->idle_timer.data = (unsigned long)pool;
 
-       return written;
-}
+       setup_timer(&pool->mayday_timer, pool_mayday_timeout,
+                   (unsigned long)pool);
 
-/* prepare workqueue_attrs for sysfs store operations */
-static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
-{
-       struct workqueue_attrs *attrs;
+       mutex_init(&pool->manager_arb);
+       mutex_init(&pool->attach_mutex);
+       INIT_LIST_HEAD(&pool->workers);
 
-       attrs = alloc_workqueue_attrs(GFP_KERNEL);
-       if (!attrs)
-               return NULL;
+       ida_init(&pool->worker_ida);
+       INIT_HLIST_NODE(&pool->hash_node);
+       pool->refcnt = 1;
 
-       mutex_lock(&wq->mutex);
-       copy_workqueue_attrs(attrs, wq->unbound_attrs);
-       mutex_unlock(&wq->mutex);
-       return attrs;
+       /* shouldn't fail above this point */
+       pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
+       if (!pool->attrs)
+               return -ENOMEM;
+       return 0;
 }
 
-static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
-                            const char *buf, size_t count)
+static void rcu_free_wq(struct rcu_head *rcu)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       struct workqueue_attrs *attrs;
-       int ret;
+       struct workqueue_struct *wq =
+               container_of(rcu, struct workqueue_struct, rcu);
 
-       attrs = wq_sysfs_prep_attrs(wq);
-       if (!attrs)
-               return -ENOMEM;
-
-       if (sscanf(buf, "%d", &attrs->nice) == 1 &&
-           attrs->nice >= -20 && attrs->nice <= 19)
-               ret = apply_workqueue_attrs(wq, attrs);
+       if (!(wq->flags & WQ_UNBOUND))
+               free_percpu(wq->cpu_pwqs);
        else
-               ret = -EINVAL;
+               free_workqueue_attrs(wq->unbound_attrs);
 
-       free_workqueue_attrs(attrs);
-       return ret ?: count;
+       kfree(wq->rescuer);
+       kfree(wq);
 }
 
-static ssize_t wq_cpumask_show(struct device *dev,
-                              struct device_attribute *attr, char *buf)
+static void rcu_free_pool(struct rcu_head *rcu)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       int written;
-
-       mutex_lock(&wq->mutex);
-       written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask);
-       mutex_unlock(&wq->mutex);
+       struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
 
-       written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
-       return written;
+       ida_destroy(&pool->worker_ida);
+       free_workqueue_attrs(pool->attrs);
+       kfree(pool);
 }
 
-static ssize_t wq_cpumask_store(struct device *dev,
-                               struct device_attribute *attr,
-                               const char *buf, size_t count)
+/**
+ * put_unbound_pool - put a worker_pool
+ * @pool: worker_pool to put
+ *
+ * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
+ * safe manner.  get_unbound_pool() calls this function on its failure path
+ * and this function should be able to release pools which went through,
+ * successfully or not, init_worker_pool().
+ *
+ * Should be called with wq_pool_mutex held.
+ */
+static void put_unbound_pool(struct worker_pool *pool)
 {
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       struct workqueue_attrs *attrs;
-       int ret;
-
-       attrs = wq_sysfs_prep_attrs(wq);
-       if (!attrs)
-               return -ENOMEM;
+       DECLARE_COMPLETION_ONSTACK(detach_completion);
+       struct worker *worker;
 
-       ret = cpumask_parse(buf, attrs->cpumask);
-       if (!ret)
-               ret = apply_workqueue_attrs(wq, attrs);
+       lockdep_assert_held(&wq_pool_mutex);
 
-       free_workqueue_attrs(attrs);
-       return ret ?: count;
-}
+       if (--pool->refcnt)
+               return;
 
-static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
-                           char *buf)
-{
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       int written;
-
-       mutex_lock(&wq->mutex);
-       written = scnprintf(buf, PAGE_SIZE, "%d\n",
-                           !wq->unbound_attrs->no_numa);
-       mutex_unlock(&wq->mutex);
-
-       return written;
-}
-
-static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
-                            const char *buf, size_t count)
-{
-       struct workqueue_struct *wq = dev_to_wq(dev);
-       struct workqueue_attrs *attrs;
-       int v, ret;
+       /* sanity checks */
+       if (WARN_ON(!(pool->cpu < 0)) ||
+           WARN_ON(!list_empty(&pool->worklist)))
+               return;
 
-       attrs = wq_sysfs_prep_attrs(wq);
-       if (!attrs)
-               return -ENOMEM;
+       /* release id and unhash */
+       if (pool->id >= 0)
+               idr_remove(&worker_pool_idr, pool->id);
+       hash_del(&pool->hash_node);
 
-       ret = -EINVAL;
-       if (sscanf(buf, "%d", &v) == 1) {
-               attrs->no_numa = !v;
-               ret = apply_workqueue_attrs(wq, attrs);
-       }
+       /*
+        * Become the manager and destroy all workers.  Grabbing
+        * manager_arb prevents @pool's workers from blocking on
+        * attach_mutex.
+        */
+       mutex_lock(&pool->manager_arb);
 
-       free_workqueue_attrs(attrs);
-       return ret ?: count;
-}
+       spin_lock_irq(&pool->lock);
+       while ((worker = first_idle_worker(pool)))
+               destroy_worker(worker);
+       WARN_ON(pool->nr_workers || pool->nr_idle);
+       spin_unlock_irq(&pool->lock);
 
-static struct device_attribute wq_sysfs_unbound_attrs[] = {
-       __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
-       __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
-       __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
-       __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
-       __ATTR_NULL,
-};
+       mutex_lock(&pool->attach_mutex);
+       if (!list_empty(&pool->workers))
+               pool->detach_completion = &detach_completion;
+       mutex_unlock(&pool->attach_mutex);
 
-static struct bus_type wq_subsys = {
-       .name                           = "workqueue",
-       .dev_attrs                      = wq_sysfs_attrs,
-};
+       if (pool->detach_completion)
+               wait_for_completion(pool->detach_completion);
 
-static int __init wq_sysfs_init(void)
-{
-       return subsys_virtual_register(&wq_subsys, NULL);
-}
-core_initcall(wq_sysfs_init);
+       mutex_unlock(&pool->manager_arb);
 
-static void wq_device_release(struct device *dev)
-{
-       struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+       /* shut down the timers */
+       del_timer_sync(&pool->idle_timer);
+       del_timer_sync(&pool->mayday_timer);
 
-       kfree(wq_dev);
+       /* sched-RCU protected to allow dereferences from get_work_pool() */
+       call_rcu_sched(&pool->rcu, rcu_free_pool);
 }
 
 /**
- * workqueue_sysfs_register - make a workqueue visible in sysfs
- * @wq: the workqueue to register
+ * get_unbound_pool - get a worker_pool with the specified attributes
+ * @attrs: the attributes of the worker_pool to get
  *
- * Expose @wq in sysfs under /sys/bus/workqueue/devices.
- * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
- * which is the preferred method.
+ * Obtain a worker_pool which has the same attributes as @attrs, bump the
+ * reference count and return it.  If there already is a matching
+ * worker_pool, it will be used; otherwise, this function attempts to
+ * create a new one.
  *
- * Workqueue user should use this function directly iff it wants to apply
- * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
- * apply_workqueue_attrs() may race against userland updating the
- * attributes.
+ * Should be called with wq_pool_mutex held.
  *
- * Returns 0 on success, -errno on failure.
+ * Return: On success, a worker_pool with the same attributes as @attrs.
+ * On failure, %NULL.
  */
-int workqueue_sysfs_register(struct workqueue_struct *wq)
+static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
 {
-       struct wq_device *wq_dev;
-       int ret;
+       u32 hash = wqattrs_hash(attrs);
+       struct worker_pool *pool;
+       int node;
+       int target_node = NUMA_NO_NODE;
 
-       /*
-        * Adjusting max_active or creating new pwqs by applyting
-        * attributes breaks ordering guarantee.  Disallow exposing ordered
-        * workqueues.
-        */
-       if (WARN_ON(wq->flags & __WQ_ORDERED))
-               return -EINVAL;
+       lockdep_assert_held(&wq_pool_mutex);
 
-       wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
-       if (!wq_dev)
-               return -ENOMEM;
+       /* do we already have a matching pool? */
+       hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
+               if (wqattrs_equal(pool->attrs, attrs)) {
+                       pool->refcnt++;
+                       return pool;
+               }
+       }
 
-       wq_dev->wq = wq;
-       wq_dev->dev.bus = &wq_subsys;
-       wq_dev->dev.init_name = wq->name;
-       wq_dev->dev.release = wq_device_release;
+       /* if cpumask is contained inside a NUMA node, we belong to that node */
+       if (wq_numa_enabled) {
+               for_each_node(node) {
+                       if (cpumask_subset(attrs->cpumask,
+                                          wq_numa_possible_cpumask[node])) {
+                               target_node = node;
+                               break;
+                       }
+               }
+       }
+
+       /* nope, create a new one */
+       pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
+       if (!pool || init_worker_pool(pool) < 0)
+               goto fail;
+
+       lockdep_set_subclass(&pool->lock, 1);   /* see put_pwq() */
+       copy_workqueue_attrs(pool->attrs, attrs);
+       pool->node = target_node;
 
        /*
-        * unbound_attrs are created separately.  Suppress uevent until
-        * everything is ready.
+        * no_numa isn't a worker_pool attribute, always clear it.  See
+        * 'struct workqueue_attrs' comments for detail.
         */
-       dev_set_uevent_suppress(&wq_dev->dev, true);
+       pool->attrs->no_numa = false;
 
-       ret = device_register(&wq_dev->dev);
-       if (ret) {
-               kfree(wq_dev);
-               wq->wq_dev = NULL;
-               return ret;
-       }
+       if (worker_pool_assign_id(pool) < 0)
+               goto fail;
 
-       if (wq->flags & WQ_UNBOUND) {
-               struct device_attribute *attr;
+       /* create and start the initial worker */
+       if (!create_worker(pool))
+               goto fail;
 
-               for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
-                       ret = device_create_file(&wq_dev->dev, attr);
-                       if (ret) {
-                               device_unregister(&wq_dev->dev);
-                               wq->wq_dev = NULL;
-                               return ret;
-                       }
-               }
-       }
+       /* install */
+       hash_add(unbound_pool_hash, &pool->hash_node, hash);
 
-       kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
-       return 0;
+       return pool;
+fail:
+       if (pool)
+               put_unbound_pool(pool);
+       return NULL;
 }
 
-/**
- * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
- * @wq: the workqueue to unregister
- *
- * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
+static void rcu_free_pwq(struct rcu_head *rcu)
+{
+       kmem_cache_free(pwq_cache,
+                       container_of(rcu, struct pool_workqueue, rcu));
+}
+
+/*
+ * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
+ * and needs to be destroyed.
  */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
+static void pwq_unbound_release_workfn(struct work_struct *work)
 {
-       struct wq_device *wq_dev = wq->wq_dev;
+       struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
+                                                 unbound_release_work);
+       struct workqueue_struct *wq = pwq->wq;
+       struct worker_pool *pool = pwq->pool;
+       bool is_last;
 
-       if (!wq->wq_dev)
+       if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
                return;
 
-       wq->wq_dev = NULL;
-       device_unregister(&wq_dev->dev);
-}
-#else  /* CONFIG_SYSFS */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq)    { }
-#endif /* CONFIG_SYSFS */
+       mutex_lock(&wq->mutex);
+       list_del_rcu(&pwq->pwqs_node);
+       is_last = list_empty(&wq->pwqs);
+       mutex_unlock(&wq->mutex);
 
-/**
- * free_workqueue_attrs - free a workqueue_attrs
- * @attrs: workqueue_attrs to free
- *
- * Undo alloc_workqueue_attrs().
- */
-void free_workqueue_attrs(struct workqueue_attrs *attrs)
-{
-       if (attrs) {
-               free_cpumask_var(attrs->cpumask);
-               kfree(attrs);
-       }
+       mutex_lock(&wq_pool_mutex);
+       put_unbound_pool(pool);
+       mutex_unlock(&wq_pool_mutex);
+
+       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
+
+       /*
+        * If we're the last pwq going away, @wq is already dead and no one
+        * is gonna access it anymore.  Schedule RCU free.
+        */
+       if (is_last)
+               call_rcu_sched(&wq->rcu, rcu_free_wq);
 }
 
 /**
- * alloc_workqueue_attrs - allocate a workqueue_attrs
- * @gfp_mask: allocation mask to use
+ * pwq_adjust_max_active - update a pwq's max_active to the current setting
+ * @pwq: target pool_workqueue
  *
- * Allocate a new workqueue_attrs, initialize with default settings and
- * return it.  Returns NULL on failure.
+ * If @pwq isn't freezing, set @pwq->max_active to the associated
+ * workqueue's saved_max_active and activate delayed work items
+ * accordingly.  If @pwq is freezing, clear @pwq->max_active to zero.
  */
-struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
+static void pwq_adjust_max_active(struct pool_workqueue *pwq)
 {
-       struct workqueue_attrs *attrs;
-
-       attrs = kzalloc(sizeof(*attrs), gfp_mask);
-       if (!attrs)
-               goto fail;
-       if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
-               goto fail;
+       struct workqueue_struct *wq = pwq->wq;
+       bool freezable = wq->flags & WQ_FREEZABLE;
 
-       cpumask_copy(attrs->cpumask, cpu_possible_mask);
-       return attrs;
-fail:
-       free_workqueue_attrs(attrs);
-       return NULL;
-}
+       /* for @wq->saved_max_active */
+       lockdep_assert_held(&wq->mutex);
 
-static void copy_workqueue_attrs(struct workqueue_attrs *to,
-                                const struct workqueue_attrs *from)
-{
-       to->nice = from->nice;
-       cpumask_copy(to->cpumask, from->cpumask);
-}
+       /* fast exit for non-freezable wqs */
+       if (!freezable && pwq->max_active == wq->saved_max_active)
+               return;
 
-/* hash value of the content of @attr */
-static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
-{
-       u32 hash = 0;
-
-       hash = jhash_1word(attrs->nice, hash);
-       hash = jhash(cpumask_bits(attrs->cpumask),
-                    BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
-       return hash;
-}
-
-/* content equality test */
-static bool wqattrs_equal(const struct workqueue_attrs *a,
-                         const struct workqueue_attrs *b)
-{
-       if (a->nice != b->nice)
-               return false;
-       if (!cpumask_equal(a->cpumask, b->cpumask))
-               return false;
-       return true;
-}
-
-/**
- * init_worker_pool - initialize a newly zalloc'd worker_pool
- * @pool: worker_pool to initialize
- *
- * Initiailize a newly zalloc'd @pool.  It also allocates @pool->attrs.
- * Returns 0 on success, -errno on failure.  Even on failure, all fields
- * inside @pool proper are initialized and put_unbound_pool() can be called
- * on @pool safely to release it.
- */
-static int init_worker_pool(struct worker_pool *pool)
-{
-       spin_lock_init(&pool->lock);
-       pool->id = -1;
-       pool->cpu = -1;
-       pool->node = NUMA_NO_NODE;
-       pool->flags |= POOL_DISASSOCIATED;
-       INIT_LIST_HEAD(&pool->worklist);
-       INIT_LIST_HEAD(&pool->idle_list);
-       hash_init(pool->busy_hash);
-
-       init_timer_deferrable(&pool->idle_timer);
-       pool->idle_timer.function = idle_worker_timeout;
-       pool->idle_timer.data = (unsigned long)pool;
-
-       setup_timer(&pool->mayday_timer, pool_mayday_timeout,
-                   (unsigned long)pool);
-
-       mutex_init(&pool->manager_arb);
-       mutex_init(&pool->manager_mutex);
-       idr_init(&pool->worker_idr);
-
-       INIT_HLIST_NODE(&pool->hash_node);
-       pool->refcnt = 1;
-
-       /* shouldn't fail above this point */
-       pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
-       if (!pool->attrs)
-               return -ENOMEM;
-       return 0;
-}
-
-static void rcu_free_pool(struct rcu_head *rcu)
-{
-       struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
-
-       idr_destroy(&pool->worker_idr);
-       free_workqueue_attrs(pool->attrs);
-       kfree(pool);
-}
-
-/**
- * put_unbound_pool - put a worker_pool
- * @pool: worker_pool to put
- *
- * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
- * safe manner.  get_unbound_pool() calls this function on its failure path
- * and this function should be able to release pools which went through,
- * successfully or not, init_worker_pool().
- *
- * Should be called with wq_pool_mutex held.
- */
-static void put_unbound_pool(struct worker_pool *pool)
-{
-       struct worker *worker;
-
-       lockdep_assert_held(&wq_pool_mutex);
-
-       if (--pool->refcnt)
-               return;
-
-       /* sanity checks */
-       if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) ||
-           WARN_ON(!list_empty(&pool->worklist)))
-               return;
-
-       /* release id and unhash */
-       if (pool->id >= 0)
-               idr_remove(&worker_pool_idr, pool->id);
-       hash_del(&pool->hash_node);
-
-       /*
-        * Become the manager and destroy all workers.  Grabbing
-        * manager_arb prevents @pool's workers from blocking on
-        * manager_mutex.
-        */
-       mutex_lock(&pool->manager_arb);
-       mutex_lock(&pool->manager_mutex);
-       spin_lock_irq(&pool->lock);
-
-       while ((worker = first_worker(pool)))
-               destroy_worker(worker);
-       WARN_ON(pool->nr_workers || pool->nr_idle);
-
-       spin_unlock_irq(&pool->lock);
-       mutex_unlock(&pool->manager_mutex);
-       mutex_unlock(&pool->manager_arb);
-
-       /* shut down the timers */
-       del_timer_sync(&pool->idle_timer);
-       del_timer_sync(&pool->mayday_timer);
-
-       /* sched-RCU protected to allow dereferences from get_work_pool() */
-       call_rcu_sched(&pool->rcu, rcu_free_pool);
-}
-
-/**
- * get_unbound_pool - get a worker_pool with the specified attributes
- * @attrs: the attributes of the worker_pool to get
- *
- * Obtain a worker_pool which has the same attributes as @attrs, bump the
- * reference count and return it.  If there already is a matching
- * worker_pool, it will be used; otherwise, this function attempts to
- * create a new one.  On failure, returns NULL.
- *
- * Should be called with wq_pool_mutex held.
- */
-static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
-{
-       u32 hash = wqattrs_hash(attrs);
-       struct worker_pool *pool;
-       int node;
-
-       lockdep_assert_held(&wq_pool_mutex);
-
-       /* do we already have a matching pool? */
-       hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
-               if (wqattrs_equal(pool->attrs, attrs)) {
-                       pool->refcnt++;
-                       goto out_unlock;
-               }
-       }
-
-       /* nope, create a new one */
-       pool = kzalloc(sizeof(*pool), GFP_KERNEL);
-       if (!pool || init_worker_pool(pool) < 0)
-               goto fail;
-
-       if (workqueue_freezing)
-               pool->flags |= POOL_FREEZING;
-
-       lockdep_set_subclass(&pool->lock, 1);   /* see put_pwq() */
-       copy_workqueue_attrs(pool->attrs, attrs);
-
-       /* if cpumask is contained inside a NUMA node, we belong to that node */
-       if (wq_numa_enabled) {
-               for_each_node(node) {
-                       if (cpumask_subset(pool->attrs->cpumask,
-                                          wq_numa_possible_cpumask[node])) {
-                               pool->node = node;
-                               break;
-                       }
-               }
-       }
-
-       if (worker_pool_assign_id(pool) < 0)
-               goto fail;
-
-       /* create and start the initial worker */
-       if (create_and_start_worker(pool) < 0)
-               goto fail;
-
-       /* install */
-       hash_add(unbound_pool_hash, &pool->hash_node, hash);
-out_unlock:
-       return pool;
-fail:
-       if (pool)
-               put_unbound_pool(pool);
-       return NULL;
-}
-
-static void rcu_free_pwq(struct rcu_head *rcu)
-{
-       kmem_cache_free(pwq_cache,
-                       container_of(rcu, struct pool_workqueue, rcu));
-}
-
-/*
- * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
- * and needs to be destroyed.
- */
-static void pwq_unbound_release_workfn(struct work_struct *work)
-{
-       struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
-                                                 unbound_release_work);
-       struct workqueue_struct *wq = pwq->wq;
-       struct worker_pool *pool = pwq->pool;
-       bool is_last;
-
-       if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
-               return;
-
-       /*
-        * Unlink @pwq.  Synchronization against wq->mutex isn't strictly
-        * necessary on release but do it anyway.  It's easier to verify
-        * and consistent with the linking path.
-        */
-       mutex_lock(&wq->mutex);
-       list_del_rcu(&pwq->pwqs_node);
-       is_last = list_empty(&wq->pwqs);
-       mutex_unlock(&wq->mutex);
-
-       mutex_lock(&wq_pool_mutex);
-       put_unbound_pool(pool);
-       mutex_unlock(&wq_pool_mutex);
-
-       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
+       spin_lock_irq(&pwq->pool->lock);
 
        /*
-        * If we're the last pwq going away, @wq is already dead and no one
-        * is gonna access it anymore.  Free it.
+        * During [un]freezing, the caller is responsible for ensuring that
+        * this function is called at least once after @workqueue_freezing
+        * is updated and visible.
         */
-       if (is_last) {
-               free_workqueue_attrs(wq->unbound_attrs);
-               kfree(wq);
-       }
-}
-
-/**
- * pwq_adjust_max_active - update a pwq's max_active to the current setting
- * @pwq: target pool_workqueue
- *
- * If @pwq isn't freezing, set @pwq->max_active to the associated
- * workqueue's saved_max_active and activate delayed work items
- * accordingly.  If @pwq is freezing, clear @pwq->max_active to zero.
- */
-static void pwq_adjust_max_active(struct pool_workqueue *pwq)
-{
-       struct workqueue_struct *wq = pwq->wq;
-       bool freezable = wq->flags & WQ_FREEZABLE;
-
-       /* for @wq->saved_max_active */
-       lockdep_assert_held(&wq->mutex);
-
-       /* fast exit for non-freezable wqs */
-       if (!freezable && pwq->max_active == wq->saved_max_active)
-               return;
-
-       spin_lock_irq(&pwq->pool->lock);
-
-       if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) {
+       if (!freezable || !workqueue_freezing) {
                pwq->max_active = wq->saved_max_active;
 
                while (!list_empty(&pwq->delayed_works) &&
@@ -3710,10 +3408,7 @@ static void link_pwq(struct pool_workqueue *pwq)
        if (!list_empty(&pwq->pwqs_node))
                return;
 
-       /*
-        * Set the matching work_color.  This is synchronized with
-        * wq->mutex to avoid confusing flush_workqueue().
-        */
+       /* set the matching work_color */
        pwq->work_color = wq->work_color;
 
        /* sync max_active to the current setting */
@@ -3746,29 +3441,16 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
        return pwq;
 }
 
-/* undo alloc_unbound_pwq(), used only in the error path */
-static void free_unbound_pwq(struct pool_workqueue *pwq)
-{
-       lockdep_assert_held(&wq_pool_mutex);
-
-       if (pwq) {
-               put_unbound_pool(pwq->pool);
-               kmem_cache_free(pwq_cache, pwq);
-       }
-}
-
 /**
- * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
- * @attrs: the wq_attrs of interest
+ * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
+ * @attrs: the wq_attrs of the default pwq of the target workqueue
  * @node: the target NUMA node
  * @cpu_going_down: if >= 0, the CPU to consider as offline
  * @cpumask: outarg, the resulting cpumask
  *
  * Calculate the cpumask a workqueue with @attrs should use on @node.  If
  * @cpu_going_down is >= 0, that cpu is considered offline during
- * calculation.  The result is stored in @cpumask.  This function returns
- * %true if the resulting @cpumask is different from @attrs->cpumask,
- * %false if equal.
+ * calculation.  The result is stored in @cpumask.
  *
  * If NUMA affinity is not enabled, @attrs->cpumask is always used.  If
  * enabled and @node has online CPUs requested by @attrs, the returned
@@ -3777,6 +3459,9 @@ static void free_unbound_pwq(struct pool_workqueue *pwq)
  *
  * The caller is responsible for ensuring that the cpumask of @node stays
  * stable.
+ *
+ * Return: %true if the resulting @cpumask is different from @attrs->cpumask,
+ * %false if equal.
  */
 static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
                                 int cpu_going_down, cpumask_t *cpumask)
@@ -3808,6 +3493,7 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
 {
        struct pool_workqueue *old_pwq;
 
+       lockdep_assert_held(&wq_pool_mutex);
        lockdep_assert_held(&wq->mutex);
 
        /* link_pwq() can handle duplicate calls */
@@ -3818,45 +3504,59 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
        return old_pwq;
 }
 
-/**
- * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
- * @wq: the target workqueue
- * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
- *
- * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
- * machines, this function maps a separate pwq to each NUMA node with
- * possibles CPUs in @attrs->cpumask so that work items are affine to the
- * NUMA node it was issued on.  Older pwqs are released as in-flight work
- * items finish.  Note that a work item which repeatedly requeues itself
- * back-to-back will stay on its current pwq.
- *
- * Performs GFP_KERNEL allocations.  Returns 0 on success and -errno on
- * failure.
- */
-int apply_workqueue_attrs(struct workqueue_struct *wq,
-                         const struct workqueue_attrs *attrs)
+/* context to store the prepared attrs & pwqs before applying */
+struct apply_wqattrs_ctx {
+       struct workqueue_struct *wq;            /* target workqueue */
+       struct workqueue_attrs  *attrs;         /* attrs to apply */
+       struct list_head        list;           /* queued for batching commit */
+       struct pool_workqueue   *dfl_pwq;
+       struct pool_workqueue   *pwq_tbl[];
+};
+
+/* free the resources after success or abort */
+static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
+{
+       if (ctx) {
+               int node;
+
+               for_each_node(node)
+                       put_pwq_unlocked(ctx->pwq_tbl[node]);
+               put_pwq_unlocked(ctx->dfl_pwq);
+
+               free_workqueue_attrs(ctx->attrs);
+
+               kfree(ctx);
+       }
+}
+
+/* allocate the attrs and pwqs for later installation */
+static struct apply_wqattrs_ctx *
+apply_wqattrs_prepare(struct workqueue_struct *wq,
+                     const struct workqueue_attrs *attrs)
 {
+       struct apply_wqattrs_ctx *ctx;
        struct workqueue_attrs *new_attrs, *tmp_attrs;
-       struct pool_workqueue **pwq_tbl, *dfl_pwq;
-       int node, ret;
+       int node;
 
-       /* only unbound workqueues can change attributes */
-       if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
-               return -EINVAL;
+       lockdep_assert_held(&wq_pool_mutex);
 
-       /* creating multiple pwqs breaks ordering guarantee */
-       if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
-               return -EINVAL;
+       ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]),
+                     GFP_KERNEL);
 
-       pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
        new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
        tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
-       if (!pwq_tbl || !new_attrs || !tmp_attrs)
-               goto enomem;
+       if (!ctx || !new_attrs || !tmp_attrs)
+               goto out_free;
 
-       /* make a copy of @attrs and sanitize it */
+       /*
+        * Calculate the attrs of the default pwq.
+        * If the user configured cpumask doesn't overlap with the
+        * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
+        * wq_unbound_cpumask, we fall back to the wq_unbound_cpumask.
        copy_workqueue_attrs(new_attrs, attrs);
-       cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+       cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
+       if (unlikely(cpumask_empty(new_attrs->cpumask)))
+               cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
 
        /*
         * We may create multiple pwqs with differing cpumasks.  Make a
@@ -3865,76 +3565,130 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
         */
        copy_workqueue_attrs(tmp_attrs, new_attrs);
 
-       /*
-        * CPUs should stay stable across pwq creations and installations.
-        * Pin CPUs, determine the target cpumask for each node and create
-        * pwqs accordingly.
-        */
-       get_online_cpus();
-
-       mutex_lock(&wq_pool_mutex);
-
        /*
         * If something goes wrong during CPU up/down, we'll fall back to
         * the default pwq covering whole @attrs->cpumask.  Always create
         * it even if we don't use it immediately.
         */
-       dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
-       if (!dfl_pwq)
-               goto enomem_pwq;
+       ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+       if (!ctx->dfl_pwq)
+               goto out_free;
 
        for_each_node(node) {
-               if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
-                       pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
-                       if (!pwq_tbl[node])
-                               goto enomem_pwq;
+               if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
+                       ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
+                       if (!ctx->pwq_tbl[node])
+                               goto out_free;
                } else {
-                       dfl_pwq->refcnt++;
-                       pwq_tbl[node] = dfl_pwq;
+                       ctx->dfl_pwq->refcnt++;
+                       ctx->pwq_tbl[node] = ctx->dfl_pwq;
                }
        }
 
-       mutex_unlock(&wq_pool_mutex);
+       /* save the user-configured attrs and sanitize them. */
+       copy_workqueue_attrs(new_attrs, attrs);
+       cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+       ctx->attrs = new_attrs;
+
+       ctx->wq = wq;
+       free_workqueue_attrs(tmp_attrs);
+       return ctx;
+
+out_free:
+       free_workqueue_attrs(tmp_attrs);
+       free_workqueue_attrs(new_attrs);
+       apply_wqattrs_cleanup(ctx);
+       return NULL;
+}
+
+/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
+static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
+{
+       int node;
 
        /* all pwqs have been created successfully, let's install'em */
-       mutex_lock(&wq->mutex);
+       mutex_lock(&ctx->wq->mutex);
 
-       copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
+       copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
 
        /* save the previous pwq and install the new one */
        for_each_node(node)
-               pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
+               ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
+                                                         ctx->pwq_tbl[node]);
 
        /* @dfl_pwq might not have been used, ensure it's linked */
-       link_pwq(dfl_pwq);
-       swap(wq->dfl_pwq, dfl_pwq);
+       link_pwq(ctx->dfl_pwq);
+       swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
 
-       mutex_unlock(&wq->mutex);
+       mutex_unlock(&ctx->wq->mutex);
+}
 
-       /* put the old pwqs */
-       for_each_node(node)
-               put_pwq_unlocked(pwq_tbl[node]);
-       put_pwq_unlocked(dfl_pwq);
+static void apply_wqattrs_lock(void)
+{
+       /* CPUs should stay stable across pwq creations and installations */
+       get_online_cpus();
+       mutex_lock(&wq_pool_mutex);
+}
 
+static void apply_wqattrs_unlock(void)
+{
+       mutex_unlock(&wq_pool_mutex);
        put_online_cpus();
-       ret = 0;
-       /* fall through */
-out_free:
-       free_workqueue_attrs(tmp_attrs);
-       free_workqueue_attrs(new_attrs);
-       kfree(pwq_tbl);
+}
+
+static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
+                                       const struct workqueue_attrs *attrs)
+{
+       struct apply_wqattrs_ctx *ctx;
+       int ret = -ENOMEM;
+
+       /* only unbound workqueues can change attributes */
+       if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
+               return -EINVAL;
+
+       /* creating multiple pwqs breaks ordering guarantee */
+       if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
+               return -EINVAL;
+
+       ctx = apply_wqattrs_prepare(wq, attrs);
+
+       /* the ctx has been prepared successfully, let's commit it */
+       if (ctx) {
+               apply_wqattrs_commit(ctx);
+               ret = 0;
+       }
+
+       apply_wqattrs_cleanup(ctx);
+
        return ret;
+}
 
-enomem_pwq:
-       free_unbound_pwq(dfl_pwq);
-       for_each_node(node)
-               if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
-                       free_unbound_pwq(pwq_tbl[node]);
-       mutex_unlock(&wq_pool_mutex);
-       put_online_cpus();
-enomem:
-       ret = -ENOMEM;
-       goto out_free;
+/**
+ * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
+ * @wq: the target workqueue
+ * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
+ *
+ * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
+ * machines, this function maps a separate pwq to each NUMA node with
+ * possible CPUs in @attrs->cpumask so that work items are affine to the
+ * NUMA node they were issued on.  Older pwqs are released as in-flight work
+ * items finish.  Note that a work item which repeatedly requeues itself
+ * back-to-back will stay on its current pwq.
+ *
+ * Performs GFP_KERNEL allocations.
+ *
+ * Return: 0 on success and -errno on failure.
+ */
+int apply_workqueue_attrs(struct workqueue_struct *wq,
+                         const struct workqueue_attrs *attrs)
+{
+       int ret;
+
+       apply_wqattrs_lock();
+       ret = apply_workqueue_attrs_locked(wq, attrs);
+       apply_wqattrs_unlock();
+
+       return ret;
 }
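
A caller-side sketch of the apply path above: build attrs, apply them to an unbound workqueue, and release the attrs.  example_confine_unbound_wq() and its parameters are hypothetical, and @wq is assumed to have been created with WQ_UNBOUND.

#include <linux/workqueue.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>

static int example_confine_unbound_wq(struct workqueue_struct *wq,
                                      const struct cpumask *allowed)
{
        struct workqueue_attrs *attrs;
        int ret;

        attrs = alloc_workqueue_attrs(GFP_KERNEL);
        if (!attrs)
                return -ENOMEM;

        attrs->nice = 0;
        cpumask_copy(attrs->cpumask, allowed);
        ret = apply_workqueue_attrs(wq, attrs); /* 0 or -errno */
        free_workqueue_attrs(attrs);
        return ret;
}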
 
 /**
@@ -3970,7 +3724,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
 
        lockdep_assert_held(&wq_pool_mutex);
 
-       if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
+       if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) ||
+           wq->unbound_attrs->no_numa)
                return;
 
        /*
@@ -3981,51 +3736,37 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
        target_attrs = wq_update_unbound_numa_attrs_buf;
        cpumask = target_attrs->cpumask;
 
-       mutex_lock(&wq->mutex);
-       if (wq->unbound_attrs->no_numa)
-               goto out_unlock;
-
        copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
        pwq = unbound_pwq_by_node(wq, node);
 
        /*
         * Let's determine what needs to be done.  If the target cpumask is
-        * different from wq's, we need to compare it to @pwq's and create
-        * a new one if they don't match.  If the target cpumask equals
-        * wq's, the default pwq should be used.  If @pwq is already the
-        * default one, nothing to do; otherwise, install the default one.
+        * different from the default pwq's, we need to compare it to @pwq's
+        * and create a new one if they don't match.  If the target cpumask
+        * equals the default pwq's, the default pwq should be used.
         */
-       if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+       if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) {
                if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
-                       goto out_unlock;
+                       return;
        } else {
-               if (pwq == wq->dfl_pwq)
-                       goto out_unlock;
-               else
-                       goto use_dfl_pwq;
+               goto use_dfl_pwq;
        }
 
-       mutex_unlock(&wq->mutex);
-
        /* create a new pwq */
        pwq = alloc_unbound_pwq(wq, target_attrs);
        if (!pwq) {
-               pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
-                          wq->name);
-               goto out_unlock;
+               pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
+                       wq->name);
+               goto use_dfl_pwq;
        }
 
-       /*
-        * Install the new pwq.  As this function is called only from CPU
-        * hotplug callbacks and applying a new attrs is wrapped with
-        * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
-        * inbetween.
-        */
+       /* Install the new pwq. */
        mutex_lock(&wq->mutex);
        old_pwq = numa_pwq_tbl_install(wq, node, pwq);
        goto out_unlock;
 
 use_dfl_pwq:
+       mutex_lock(&wq->mutex);
        spin_lock_irq(&wq->dfl_pwq->pool->lock);
        get_pwq(wq->dfl_pwq);
        spin_unlock_irq(&wq->dfl_pwq->pool->lock);
@@ -4038,7 +3779,7 @@ out_unlock:
 static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 {
        bool highpri = wq->flags & WQ_HIGHPRI;
-       int cpu;
+       int cpu, ret;
 
        if (!(wq->flags & WQ_UNBOUND)) {
                wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
@@ -4058,6 +3799,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
                        mutex_unlock(&wq->mutex);
                }
                return 0;
+       } else if (wq->flags & __WQ_ORDERED) {
+               ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
+               /* there should only be a single pwq for the ordering guarantee */
+               WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
+                             wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
+                    "ordering guarantee broken for workqueue %s\n", wq->name);
+               return ret;
        } else {
                return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
        }
@@ -4086,9 +3834,13 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
        struct workqueue_struct *wq;
        struct pool_workqueue *pwq;
 
+       /* see the comment above the definition of WQ_POWER_EFFICIENT */
+       if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
+               flags |= WQ_UNBOUND;
+
        /* allocate wq and format name */
        if (flags & WQ_UNBOUND)
-               tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
+               tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
 
        wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
        if (!wq)
@@ -4130,7 +3882,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
        if (flags & WQ_MEM_RECLAIM) {
                struct worker *rescuer;
 
-               rescuer = alloc_worker();
+               rescuer = alloc_worker(NUMA_NO_NODE);
                if (!rescuer)
                        goto err_destroy;
 
@@ -4143,7 +3895,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
                }
 
                wq->rescuer = rescuer;
-               rescuer->task->flags |= PF_NO_SETAFFINITY;
+               kthread_bind_mask(rescuer->task, cpu_possible_mask);
                wake_up_process(rescuer->task);
        }
 
@@ -4162,7 +3914,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
                pwq_adjust_max_active(pwq);
        mutex_unlock(&wq->mutex);
 
-       list_add(&wq->list, &workqueues);
+       list_add_tail_rcu(&wq->list, &workqueues);
 
        mutex_unlock(&wq_pool_mutex);
 
@@ -4218,24 +3970,20 @@ void destroy_workqueue(struct workqueue_struct *wq)
         * flushing is complete in case freeze races us.
         */
        mutex_lock(&wq_pool_mutex);
-       list_del_init(&wq->list);
+       list_del_rcu(&wq->list);
        mutex_unlock(&wq_pool_mutex);
 
        workqueue_sysfs_unregister(wq);
 
-       if (wq->rescuer) {
+       if (wq->rescuer)
                kthread_stop(wq->rescuer->task);
-               kfree(wq->rescuer);
-               wq->rescuer = NULL;
-       }
 
        if (!(wq->flags & WQ_UNBOUND)) {
                /*
                 * The base ref is never dropped on per-cpu pwqs.  Directly
-                * free the pwqs and wq.
+                * schedule RCU free.
                 */
-               free_percpu(wq->cpu_pwqs);
-               kfree(wq);
+               call_rcu_sched(&wq->rcu, rcu_free_wq);
        } else {
                /*
                 * We're the sole accessor of @wq at this point.  Directly
@@ -4295,6 +4043,8 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
  *
  * Determine whether %current is a workqueue rescuer.  Can be used from
  * work functions to determine whether it's being run off the rescuer task.
+ *
+ * Return: %true if %current is a workqueue rescuer. %false otherwise.
  */
 bool current_is_workqueue_rescuer(void)
 {
@@ -4318,7 +4068,7 @@ bool current_is_workqueue_rescuer(void)
  * workqueue being congested on one CPU doesn't mean the workqueue is also
  * contested on other CPUs / NUMA nodes.
  *
- * RETURNS:
+ * Return:
  * %true if congested, %false otherwise.
  */
 bool workqueue_congested(int cpu, struct workqueue_struct *wq)
@@ -4351,7 +4101,7 @@ EXPORT_SYMBOL_GPL(workqueue_congested);
  * synchronization around this function and the test result is
  * unreliable and only useful as advisory hints or for debugging.
  *
- * RETURNS:
+ * Return:
  * OR'd bitmask of WORK_BUSY_* bits.
  */
 unsigned int work_busy(struct work_struct *work)
@@ -4454,6 +4204,166 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
        }
 }
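
An advisory-only sketch using the work_busy() and workqueue_congested() return values documented above; the results may already be stale by the time they are acted on.  example_report() is hypothetical.

#include <linux/workqueue.h>
#include <linux/printk.h>

static void example_report(struct workqueue_struct *wq,
                           struct work_struct *work)
{
        unsigned int busy = work_busy(work);

        if (busy & WORK_BUSY_PENDING)
                pr_info("example: work is pending\n");
        if (busy & WORK_BUSY_RUNNING)
                pr_info("example: work is running\n");

        /* with WORK_CPU_UNBOUND the test is done against the local CPU/node */
        if (workqueue_congested(WORK_CPU_UNBOUND, wq))
                pr_info("example: workqueue is congested here\n");
}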
 
+static void pr_cont_pool_info(struct worker_pool *pool)
+{
+       pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
+       if (pool->node != NUMA_NO_NODE)
+               pr_cont(" node=%d", pool->node);
+       pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work)
+{
+       if (work->func == wq_barrier_func) {
+               struct wq_barrier *barr;
+
+               barr = container_of(work, struct wq_barrier, work);
+
+               pr_cont("%s BAR(%d)", comma ? "," : "",
+                       task_pid_nr(barr->task));
+       } else {
+               pr_cont("%s %pf", comma ? "," : "", work->func);
+       }
+}
+
+static void show_pwq(struct pool_workqueue *pwq)
+{
+       struct worker_pool *pool = pwq->pool;
+       struct work_struct *work;
+       struct worker *worker;
+       bool has_in_flight = false, has_pending = false;
+       int bkt;
+
+       pr_info("  pwq %d:", pool->id);
+       pr_cont_pool_info(pool);
+
+       pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+               !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
+
+       hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+               if (worker->current_pwq == pwq) {
+                       has_in_flight = true;
+                       break;
+               }
+       }
+       if (has_in_flight) {
+               bool comma = false;
+
+               pr_info("    in-flight:");
+               hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+                       if (worker->current_pwq != pwq)
+                               continue;
+
+                       pr_cont("%s %d%s:%pf", comma ? "," : "",
+                               task_pid_nr(worker->task),
+                               worker == pwq->wq->rescuer ? "(RESCUER)" : "",
+                               worker->current_func);
+                       list_for_each_entry(work, &worker->scheduled, entry)
+                               pr_cont_work(false, work);
+                       comma = true;
+               }
+               pr_cont("\n");
+       }
+
+       list_for_each_entry(work, &pool->worklist, entry) {
+               if (get_work_pwq(work) == pwq) {
+                       has_pending = true;
+                       break;
+               }
+       }
+       if (has_pending) {
+               bool comma = false;
+
+               pr_info("    pending:");
+               list_for_each_entry(work, &pool->worklist, entry) {
+                       if (get_work_pwq(work) != pwq)
+                               continue;
+
+                       pr_cont_work(comma, work);
+                       comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+               }
+               pr_cont("\n");
+       }
+
+       if (!list_empty(&pwq->delayed_works)) {
+               bool comma = false;
+
+               pr_info("    delayed:");
+               list_for_each_entry(work, &pwq->delayed_works, entry) {
+                       pr_cont_work(comma, work);
+                       comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+               }
+               pr_cont("\n");
+       }
+}
+
+/**
+ * show_workqueue_state - dump workqueue state
+ *
+ * Called from a sysrq handler and prints out all busy workqueues and
+ * pools.
+ */
+void show_workqueue_state(void)
+{
+       struct workqueue_struct *wq;
+       struct worker_pool *pool;
+       unsigned long flags;
+       int pi;
+
+       rcu_read_lock_sched();
+
+       pr_info("Showing busy workqueues and worker pools:\n");
+
+       list_for_each_entry_rcu(wq, &workqueues, list) {
+               struct pool_workqueue *pwq;
+               bool idle = true;
+
+               for_each_pwq(pwq, wq) {
+                       if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
+                               idle = false;
+                               break;
+                       }
+               }
+               if (idle)
+                       continue;
+
+               pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+
+               for_each_pwq(pwq, wq) {
+                       spin_lock_irqsave(&pwq->pool->lock, flags);
+                       if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+                               show_pwq(pwq);
+                       spin_unlock_irqrestore(&pwq->pool->lock, flags);
+               }
+       }
+
+       for_each_pool(pool, pi) {
+               struct worker *worker;
+               bool first = true;
+
+               spin_lock_irqsave(&pool->lock, flags);
+               if (pool->nr_workers == pool->nr_idle)
+                       goto next_pool;
+
+               pr_info("pool %d:", pool->id);
+               pr_cont_pool_info(pool);
+               pr_cont(" workers=%d", pool->nr_workers);
+               if (pool->manager)
+                       pr_cont(" manager: %d",
+                               task_pid_nr(pool->manager->task));
+               list_for_each_entry(worker, &pool->idle_list, entry) {
+                       pr_cont(" %s%d", first ? "idle: " : "",
+                               task_pid_nr(worker->task));
+                       first = false;
+               }
+               pr_cont("\n");
+       next_pool:
+               spin_unlock_irqrestore(&pool->lock, flags);
+       }
+
+       rcu_read_unlock_sched();
+}
+
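A hedged sketch of the sysrq wiring that the show_workqueue_state() comment above alludes to: the key choice, the op names and the assumption that the function is declared in <linux/workqueue.h> by the accompanying header change are illustrative, not part of this patch.

#include <linux/sysrq.h>
#include <linux/workqueue.h>

/* Hypothetical sysrq hook: dump busy workqueues and worker pools on demand. */
static void sysrq_handle_show_wq(int key)
{
	show_workqueue_state();
}

static struct sysrq_key_op sysrq_show_wq_op = {
	.handler	= sysrq_handle_show_wq,
	.help_msg	= "show-workqueue-state(w)",
	.action_msg	= "Show workqueue state",
	.enable_mask	= SYSRQ_ENABLE_DUMP,
};

/* e.g. from an __init path: register_sysrq_key('w', &sysrq_show_wq_op); */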
 /*
  * CPU hotplug.
  *
@@ -4474,28 +4384,25 @@ static void wq_unbind_fn(struct work_struct *work)
        int cpu = smp_processor_id();
        struct worker_pool *pool;
        struct worker *worker;
-       int wi;
 
        for_each_cpu_worker_pool(pool, cpu) {
-               WARN_ON_ONCE(cpu != smp_processor_id());
-
-               mutex_lock(&pool->manager_mutex);
+               mutex_lock(&pool->attach_mutex);
                spin_lock_irq(&pool->lock);
 
                /*
-                * We've blocked all manager operations.  Make all workers
+                * We've blocked all attach/detach operations. Make all workers
                 * unbound and set DISASSOCIATED.  Before this, all workers
                 * except for the ones which are still executing works from
                 * before the last CPU down must be on the cpu.  After
                 * this, they may become diasporas.
                 */
-               for_each_pool_worker(worker, wi, pool)
+               for_each_pool_worker(worker, pool)
                        worker->flags |= WORKER_UNBOUND;
 
                pool->flags |= POOL_DISASSOCIATED;
 
                spin_unlock_irq(&pool->lock);
-               mutex_unlock(&pool->manager_mutex);
+               mutex_unlock(&pool->attach_mutex);
 
                /*
                 * Call schedule() so that we cross rq->lock and thus can
@@ -4535,24 +4442,35 @@ static void wq_unbind_fn(struct work_struct *work)
 static void rebind_workers(struct worker_pool *pool)
 {
        struct worker *worker;
-       int wi;
 
-       lockdep_assert_held(&pool->manager_mutex);
+       lockdep_assert_held(&pool->attach_mutex);
 
        /*
         * Restore CPU affinity of all workers.  As all idle workers should
         * be on the run-queue of the associated CPU before any local
-        * wake-ups for concurrency management happen, restore CPU affinty
+        * wake-ups for concurrency management happen, restore CPU affinity
         * of all workers first and then clear UNBOUND.  As we're called
         * from CPU_ONLINE, the following shouldn't fail.
         */
-       for_each_pool_worker(worker, wi, pool)
+       for_each_pool_worker(worker, pool)
                WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
                                                  pool->attrs->cpumask) < 0);
 
        spin_lock_irq(&pool->lock);
 
-       for_each_pool_worker(worker, wi, pool) {
+       /*
+        * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
+        * w/o preceding DOWN_PREPARE.  Work around it.  CPU hotplug is
+        * being reworked and this can go away in time.
+        */
+       if (!(pool->flags & POOL_DISASSOCIATED)) {
+               spin_unlock_irq(&pool->lock);
+               return;
+       }
+
+       pool->flags &= ~POOL_DISASSOCIATED;
+
+       for_each_pool_worker(worker, pool) {
                unsigned int worker_flags = worker->flags;
 
                /*
@@ -4604,9 +4522,8 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
 {
        static cpumask_t cpumask;
        struct worker *worker;
-       int wi;
 
-       lockdep_assert_held(&pool->manager_mutex);
+       lockdep_assert_held(&pool->attach_mutex);
 
        /* is @cpu allowed for @pool? */
        if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
@@ -4618,7 +4535,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
                return;
 
        /* as we're called from CPU_ONLINE, the following shouldn't fail */
-       for_each_pool_worker(worker, wi, pool)
+       for_each_pool_worker(worker, pool)
                WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
                                                  pool->attrs->cpumask) < 0);
 }
@@ -4627,7 +4544,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
  * Workqueues should be brought up before normal priority CPU notifiers.
  * This will be registered as a high priority CPU notifier.
  */
-static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
+static int workqueue_cpu_up_callback(struct notifier_block *nfb,
                                               unsigned long action,
                                               void *hcpu)
 {
@@ -4641,7 +4558,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
                for_each_cpu_worker_pool(pool, cpu) {
                        if (pool->nr_workers)
                                continue;
-                       if (create_and_start_worker(pool) < 0)
+                       if (!create_worker(pool))
                                return NOTIFY_BAD;
                }
                break;
@@ -4651,19 +4568,14 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
                mutex_lock(&wq_pool_mutex);
 
                for_each_pool(pool, pi) {
-                       mutex_lock(&pool->manager_mutex);
-
-                       if (pool->cpu == cpu) {
-                               spin_lock_irq(&pool->lock);
-                               pool->flags &= ~POOL_DISASSOCIATED;
-                               spin_unlock_irq(&pool->lock);
+                       mutex_lock(&pool->attach_mutex);
 
+                       if (pool->cpu == cpu)
                                rebind_workers(pool);
-                       } else if (pool->cpu < 0) {
+                       else if (pool->cpu < 0)
                                restore_unbound_workers_cpumask(pool, cpu);
-                       }
 
-                       mutex_unlock(&pool->manager_mutex);
+                       mutex_unlock(&pool->attach_mutex);
                }
 
                /* update NUMA affinity of unbound workqueues */
@@ -4676,215 +4588,640 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
        return NOTIFY_OK;
 }
 
-/*
- * Workqueues should be brought down after normal priority CPU notifiers.
- * This will be registered as low priority CPU notifier.
- */
-static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
-                                                unsigned long action,
-                                                void *hcpu)
+/*
+ * Workqueues should be brought down after normal priority CPU notifiers.
+ * This will be registered as low priority CPU notifier.
+ */
+static int workqueue_cpu_down_callback(struct notifier_block *nfb,
+                                                unsigned long action,
+                                                void *hcpu)
+{
+       int cpu = (unsigned long)hcpu;
+       struct work_struct unbind_work;
+       struct workqueue_struct *wq;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_DOWN_PREPARE:
+               /* unbinding per-cpu workers should happen on the local CPU */
+               INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
+               queue_work_on(cpu, system_highpri_wq, &unbind_work);
+
+               /* update NUMA affinity of unbound workqueues */
+               mutex_lock(&wq_pool_mutex);
+               list_for_each_entry(wq, &workqueues, list)
+                       wq_update_unbound_numa(wq, cpu, false);
+               mutex_unlock(&wq_pool_mutex);
+
+               /* wait for per-cpu unbinding to finish */
+               flush_work(&unbind_work);
+               destroy_work_on_stack(&unbind_work);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+#ifdef CONFIG_SMP
+
+struct work_for_cpu {
+       struct work_struct work;
+       long (*fn)(void *);
+       void *arg;
+       long ret;
+};
+
+static void work_for_cpu_fn(struct work_struct *work)
+{
+       struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
+
+       wfc->ret = wfc->fn(wfc->arg);
+}
+
+/**
+ * work_on_cpu - run a function in user context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn: the function to run
+ * @arg: the function arg
+ *
+ * It is up to the caller to ensure that the cpu doesn't go offline.
+ * The caller must not hold any locks which would prevent @fn from completing.
+ *
+ * Return: The value @fn returns.
+ */
+long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
+{
+       struct work_for_cpu wfc = { .fn = fn, .arg = arg };
+
+       INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+       schedule_work_on(cpu, &wfc.work);
+       flush_work(&wfc.work);
+       destroy_work_on_stack(&wfc.work);
+       return wfc.ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu);
+#endif /* CONFIG_SMP */
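A minimal sketch of a work_on_cpu() caller, assuming the caller pins the CPU with get_online_cpus(); the probe function and error value are illustrative.

#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

/* Runs in a bound worker's process context on the requested CPU. */
static long read_cpu_id(void *arg)
{
	return (long)raw_smp_processor_id();
}

static int probe_cpu(int cpu)
{
	long seen;

	get_online_cpus();			/* keep @cpu online across the call */
	seen = work_on_cpu(cpu, read_cpu_id, NULL);
	put_online_cpus();

	return seen == cpu ? 0 : -EIO;
}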
+
+#ifdef CONFIG_FREEZER
+
+/**
+ * freeze_workqueues_begin - begin freezing workqueues
+ *
+ * Start freezing workqueues.  After this function returns, all freezable
+ * workqueues will queue new works to their delayed_works list instead of
+ * pool->worklist.
+ *
+ * CONTEXT:
+ * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
+ */
+void freeze_workqueues_begin(void)
+{
+       struct workqueue_struct *wq;
+       struct pool_workqueue *pwq;
+
+       mutex_lock(&wq_pool_mutex);
+
+       WARN_ON_ONCE(workqueue_freezing);
+       workqueue_freezing = true;
+
+       list_for_each_entry(wq, &workqueues, list) {
+               mutex_lock(&wq->mutex);
+               for_each_pwq(pwq, wq)
+                       pwq_adjust_max_active(pwq);
+               mutex_unlock(&wq->mutex);
+       }
+
+       mutex_unlock(&wq_pool_mutex);
+}
+
+/**
+ * freeze_workqueues_busy - are freezable workqueues still busy?
+ *
+ * Check whether freezing is complete.  This function must be called
+ * between freeze_workqueues_begin() and thaw_workqueues().
+ *
+ * CONTEXT:
+ * Grabs and releases wq_pool_mutex.
+ *
+ * Return:
+ * %true if some freezable workqueues are still busy.  %false if freezing
+ * is complete.
+ */
+bool freeze_workqueues_busy(void)
+{
+       bool busy = false;
+       struct workqueue_struct *wq;
+       struct pool_workqueue *pwq;
+
+       mutex_lock(&wq_pool_mutex);
+
+       WARN_ON_ONCE(!workqueue_freezing);
+
+       list_for_each_entry(wq, &workqueues, list) {
+               if (!(wq->flags & WQ_FREEZABLE))
+                       continue;
+               /*
+                * nr_active is monotonically decreasing.  It's safe
+                * to peek without lock.
+                */
+               rcu_read_lock_sched();
+               for_each_pwq(pwq, wq) {
+                       WARN_ON_ONCE(pwq->nr_active < 0);
+                       if (pwq->nr_active) {
+                               busy = true;
+                               rcu_read_unlock_sched();
+                               goto out_unlock;
+                       }
+               }
+               rcu_read_unlock_sched();
+       }
+out_unlock:
+       mutex_unlock(&wq_pool_mutex);
+       return busy;
+}
+
+/**
+ * thaw_workqueues - thaw workqueues
+ *
+ * Thaw workqueues.  Normal queueing is restored and all collected
+ * frozen works are transferred to their respective pool worklists.
+ *
+ * CONTEXT:
+ * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
+ */
+void thaw_workqueues(void)
+{
+       struct workqueue_struct *wq;
+       struct pool_workqueue *pwq;
+
+       mutex_lock(&wq_pool_mutex);
+
+       if (!workqueue_freezing)
+               goto out_unlock;
+
+       workqueue_freezing = false;
+
+       /* restore max_active and repopulate worklist */
+       list_for_each_entry(wq, &workqueues, list) {
+               mutex_lock(&wq->mutex);
+               for_each_pwq(pwq, wq)
+                       pwq_adjust_max_active(pwq);
+               mutex_unlock(&wq->mutex);
+       }
+
+out_unlock:
+       mutex_unlock(&wq_pool_mutex);
+}
+#endif /* CONFIG_FREEZER */
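A simplified sketch of how a freezer-side caller might sequence the three entry points above; the 20s timeout and the polling interval are illustrative assumptions, not taken from this patch.

#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

/* Hypothetical suspend-path helper. */
static int freeze_kernel_workqueues(void)
{
	unsigned long end = jiffies + msecs_to_jiffies(20000);

	freeze_workqueues_begin();		/* new work lands in delayed_works */

	while (freeze_workqueues_busy()) {	/* in-flight items still draining */
		if (time_after(jiffies, end)) {
			thaw_workqueues();	/* give up and restore queueing */
			return -EBUSY;
		}
		msleep(10);
	}
	return 0;				/* caller thaws later via thaw_workqueues() */
}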
+
+static int workqueue_apply_unbound_cpumask(void)
+{
+       LIST_HEAD(ctxs);
+       int ret = 0;
+       struct workqueue_struct *wq;
+       struct apply_wqattrs_ctx *ctx, *n;
+
+       lockdep_assert_held(&wq_pool_mutex);
+
+       list_for_each_entry(wq, &workqueues, list) {
+               if (!(wq->flags & WQ_UNBOUND))
+                       continue;
+               /* creating multiple pwqs breaks ordering guarantee */
+               if (wq->flags & __WQ_ORDERED)
+                       continue;
+
+               ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
+               if (!ctx) {
+                       ret = -ENOMEM;
+                       break;
+               }
+
+               list_add_tail(&ctx->list, &ctxs);
+       }
+
+       list_for_each_entry_safe(ctx, n, &ctxs, list) {
+               if (!ret)
+                       apply_wqattrs_commit(ctx);
+               apply_wqattrs_cleanup(ctx);
+       }
+
+       return ret;
+}
+
+/**
+ *  workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
+ *  @cpumask: the cpumask to set
+ *
+ *  The low-level workqueues cpumask is a global cpumask that limits
+ *  the affinity of all unbound workqueues.  This function checks @cpumask,
+ *  applies it to all unbound workqueues and updates all of their pwqs.
+ *
+ *  Return:    0       - Success
+ *             -EINVAL - Invalid @cpumask
+ *             -ENOMEM - Failed to allocate memory for attrs or pwqs.
+ */
+int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+{
+       int ret = -EINVAL;
+       cpumask_var_t saved_cpumask;
+
+       if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
+               return -ENOMEM;
+
+       cpumask_and(cpumask, cpumask, cpu_possible_mask);
+       if (!cpumask_empty(cpumask)) {
+               apply_wqattrs_lock();
+
+               /* save the old wq_unbound_cpumask. */
+               cpumask_copy(saved_cpumask, wq_unbound_cpumask);
+
+               /* update wq_unbound_cpumask at first and apply it to wqs. */
+               cpumask_copy(wq_unbound_cpumask, cpumask);
+               ret = workqueue_apply_unbound_cpumask();
+
+               /* restore the wq_unbound_cpumask when failed. */
+               if (ret < 0)
+                       cpumask_copy(wq_unbound_cpumask, saved_cpumask);
+
+               apply_wqattrs_unlock();
+       }
+
+       free_cpumask_var(saved_cpumask);
+       return ret;
+}
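An in-kernel caller of workqueue_set_unbound_cpumask() would hand over a mask as below; confining unbound workqueues to CPUs 0-3 is purely an illustrative assumption (the usual entry point is the sysfs "cpumask" attribute defined further down).

/* Hypothetical: confine all unbound workqueues to CPUs 0-3. */
static int restrict_unbound_wqs(void)
{
	cpumask_var_t mask;
	int ret;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_set_cpu(0, mask);
	cpumask_set_cpu(1, mask);
	cpumask_set_cpu(2, mask);
	cpumask_set_cpu(3, mask);

	ret = workqueue_set_unbound_cpumask(mask);	/* -EINVAL if the mask ends up empty */
	free_cpumask_var(mask);
	return ret;
}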
+
+#ifdef CONFIG_SYSFS
+/*
+ * Workqueues with the WQ_SYSFS flag set are visible to userland via
+ * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
+ * following attributes.
+ *
+ *  per_cpu    RO bool : whether the workqueue is per-cpu or unbound
+ *  max_active RW int  : maximum number of in-flight work items
+ *
+ * Unbound workqueues have the following extra attributes.
+ *
+ *  id         RO int  : the associated pool ID
+ *  nice       RW int  : nice value of the workers
+ *  cpumask    RW mask : bitmask of allowed CPUs for the workers
+ */
+struct wq_device {
+       struct workqueue_struct         *wq;
+       struct device                   dev;
+};
+
+static struct workqueue_struct *dev_to_wq(struct device *dev)
+{
+       struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+       return wq_dev->wq;
+}
+
+static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
+                           char *buf)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
+}
+static DEVICE_ATTR_RO(per_cpu);
+
+static ssize_t max_active_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
+}
+
+static ssize_t max_active_store(struct device *dev,
+                               struct device_attribute *attr, const char *buf,
+                               size_t count)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       int val;
+
+       if (sscanf(buf, "%d", &val) != 1 || val <= 0)
+               return -EINVAL;
+
+       workqueue_set_max_active(wq, val);
+       return count;
+}
+static DEVICE_ATTR_RW(max_active);
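max_active_store() above is a thin wrapper around workqueue_set_max_active(); an in-kernel user can adjust concurrency the same way. The workqueue pointer and the value 16 are illustrative.

/* Hypothetical: widen the concurrency limit of a driver workqueue at runtime. */
static void my_wq_boost(struct workqueue_struct *wq)
{
	workqueue_set_max_active(wq, 16);	/* clamped internally to [1, WQ_MAX_ACTIVE] */
}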
+
+static struct attribute *wq_sysfs_attrs[] = {
+       &dev_attr_per_cpu.attr,
+       &dev_attr_max_active.attr,
+       NULL,
+};
+ATTRIBUTE_GROUPS(wq_sysfs);
+
+static ssize_t wq_pool_ids_show(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       const char *delim = "";
+       int node, written = 0;
+
+       rcu_read_lock_sched();
+       for_each_node(node) {
+               written += scnprintf(buf + written, PAGE_SIZE - written,
+                                    "%s%d:%d", delim, node,
+                                    unbound_pwq_by_node(wq, node)->pool->id);
+               delim = " ";
+       }
+       written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
+       rcu_read_unlock_sched();
+
+       return written;
+}
+
+static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
+                           char *buf)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       int written;
+
+       mutex_lock(&wq->mutex);
+       written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
+       mutex_unlock(&wq->mutex);
+
+       return written;
+}
+
+/* prepare workqueue_attrs for sysfs store operations */
+static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
+{
+       struct workqueue_attrs *attrs;
+
+       lockdep_assert_held(&wq_pool_mutex);
+
+       attrs = alloc_workqueue_attrs(GFP_KERNEL);
+       if (!attrs)
+               return NULL;
+
+       copy_workqueue_attrs(attrs, wq->unbound_attrs);
+       return attrs;
+}
+
+static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
+                            const char *buf, size_t count)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       struct workqueue_attrs *attrs;
+       int ret = -ENOMEM;
+
+       apply_wqattrs_lock();
+
+       attrs = wq_sysfs_prep_attrs(wq);
+       if (!attrs)
+               goto out_unlock;
+
+       if (sscanf(buf, "%d", &attrs->nice) == 1 &&
+           attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
+               ret = apply_workqueue_attrs_locked(wq, attrs);
+       else
+               ret = -EINVAL;
+
+out_unlock:
+       apply_wqattrs_unlock();
+       free_workqueue_attrs(attrs);
+       return ret ?: count;
+}
+
+static ssize_t wq_cpumask_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       int written;
+
+       mutex_lock(&wq->mutex);
+       written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+                           cpumask_pr_args(wq->unbound_attrs->cpumask));
+       mutex_unlock(&wq->mutex);
+       return written;
+}
+
+static ssize_t wq_cpumask_store(struct device *dev,
+                               struct device_attribute *attr,
+                               const char *buf, size_t count)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       struct workqueue_attrs *attrs;
+       int ret = -ENOMEM;
+
+       apply_wqattrs_lock();
+
+       attrs = wq_sysfs_prep_attrs(wq);
+       if (!attrs)
+               goto out_unlock;
+
+       ret = cpumask_parse(buf, attrs->cpumask);
+       if (!ret)
+               ret = apply_workqueue_attrs_locked(wq, attrs);
+
+out_unlock:
+       apply_wqattrs_unlock();
+       free_workqueue_attrs(attrs);
+       return ret ?: count;
+}
+
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+                           char *buf)
+{
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       int written;
+
+       mutex_lock(&wq->mutex);
+       written = scnprintf(buf, PAGE_SIZE, "%d\n",
+                           !wq->unbound_attrs->no_numa);
+       mutex_unlock(&wq->mutex);
+
+       return written;
+}
+
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+                            const char *buf, size_t count)
 {
-       int cpu = (unsigned long)hcpu;
-       struct work_struct unbind_work;
-       struct workqueue_struct *wq;
+       struct workqueue_struct *wq = dev_to_wq(dev);
+       struct workqueue_attrs *attrs;
+       int v, ret = -ENOMEM;
 
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_DOWN_PREPARE:
-               /* unbinding per-cpu workers should happen on the local CPU */
-               INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
-               queue_work_on(cpu, system_highpri_wq, &unbind_work);
+       apply_wqattrs_lock();
 
-               /* update NUMA affinity of unbound workqueues */
-               mutex_lock(&wq_pool_mutex);
-               list_for_each_entry(wq, &workqueues, list)
-                       wq_update_unbound_numa(wq, cpu, false);
-               mutex_unlock(&wq_pool_mutex);
+       attrs = wq_sysfs_prep_attrs(wq);
+       if (!attrs)
+               goto out_unlock;
 
-               /* wait for per-cpu unbinding to finish */
-               flush_work(&unbind_work);
-               break;
+       ret = -EINVAL;
+       if (sscanf(buf, "%d", &v) == 1) {
+               attrs->no_numa = !v;
+               ret = apply_workqueue_attrs_locked(wq, attrs);
        }
-       return NOTIFY_OK;
+
+out_unlock:
+       apply_wqattrs_unlock();
+       free_workqueue_attrs(attrs);
+       return ret ?: count;
 }
 
-#ifdef CONFIG_SMP
+static struct device_attribute wq_sysfs_unbound_attrs[] = {
+       __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
+       __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
+       __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+       __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+       __ATTR_NULL,
+};
 
-struct work_for_cpu {
-       struct work_struct work;
-       long (*fn)(void *);
-       void *arg;
-       long ret;
+static struct bus_type wq_subsys = {
+       .name                           = "workqueue",
+       .dev_groups                     = wq_sysfs_groups,
 };
 
-static void work_for_cpu_fn(struct work_struct *work)
+static ssize_t wq_unbound_cpumask_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
 {
-       struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
+       int written;
 
-       wfc->ret = wfc->fn(wfc->arg);
+       mutex_lock(&wq_pool_mutex);
+       written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+                           cpumask_pr_args(wq_unbound_cpumask));
+       mutex_unlock(&wq_pool_mutex);
+
+       return written;
 }
 
-/**
- * work_on_cpu - run a function in user context on a particular cpu
- * @cpu: the cpu to run on
- * @fn: the function to run
- * @arg: the function arg
- *
- * This will return the value @fn returns.
- * It is up to the caller to ensure that the cpu doesn't go offline.
- * The caller must not hold any locks which would prevent @fn from completing.
- */
-long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
+static ssize_t wq_unbound_cpumask_store(struct device *dev,
+               struct device_attribute *attr, const char *buf, size_t count)
 {
-       struct work_for_cpu wfc = { .fn = fn, .arg = arg };
+       cpumask_var_t cpumask;
+       int ret;
 
-       INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
-       schedule_work_on(cpu, &wfc.work);
-       flush_work(&wfc.work);
-       return wfc.ret;
+       if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+               return -ENOMEM;
+
+       ret = cpumask_parse(buf, cpumask);
+       if (!ret)
+               ret = workqueue_set_unbound_cpumask(cpumask);
+
+       free_cpumask_var(cpumask);
+       return ret ? ret : count;
 }
-EXPORT_SYMBOL_GPL(work_on_cpu);
-#endif /* CONFIG_SMP */
 
-#ifdef CONFIG_FREEZER
+static struct device_attribute wq_sysfs_cpumask_attr =
+       __ATTR(cpumask, 0644, wq_unbound_cpumask_show,
+              wq_unbound_cpumask_store);
 
-/**
- * freeze_workqueues_begin - begin freezing workqueues
- *
- * Start freezing workqueues.  After this function returns, all freezable
- * workqueues will queue new works to their delayed_works list instead of
- * pool->worklist.
- *
- * CONTEXT:
- * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
- */
-void freeze_workqueues_begin(void)
+static int __init wq_sysfs_init(void)
 {
-       struct worker_pool *pool;
-       struct workqueue_struct *wq;
-       struct pool_workqueue *pwq;
-       int pi;
+       int err;
 
-       mutex_lock(&wq_pool_mutex);
-
-       WARN_ON_ONCE(workqueue_freezing);
-       workqueue_freezing = true;
+       err = subsys_virtual_register(&wq_subsys, NULL);
+       if (err)
+               return err;
 
-       /* set FREEZING */
-       for_each_pool(pool, pi) {
-               spin_lock_irq(&pool->lock);
-               WARN_ON_ONCE(pool->flags & POOL_FREEZING);
-               pool->flags |= POOL_FREEZING;
-               spin_unlock_irq(&pool->lock);
-       }
+       return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
+}
+core_initcall(wq_sysfs_init);
 
-       list_for_each_entry(wq, &workqueues, list) {
-               mutex_lock(&wq->mutex);
-               for_each_pwq(pwq, wq)
-                       pwq_adjust_max_active(pwq);
-               mutex_unlock(&wq->mutex);
-       }
+static void wq_device_release(struct device *dev)
+{
+       struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
 
-       mutex_unlock(&wq_pool_mutex);
+       kfree(wq_dev);
 }
 
 /**
- * freeze_workqueues_busy - are freezable workqueues still busy?
+ * workqueue_sysfs_register - make a workqueue visible in sysfs
+ * @wq: the workqueue to register
  *
- * Check whether freezing is complete.  This function must be called
- * between freeze_workqueues_begin() and thaw_workqueues().
+ * Expose @wq in sysfs under /sys/bus/workqueue/devices.
+ * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set,
+ * which is the preferred method.
  *
- * CONTEXT:
- * Grabs and releases wq_pool_mutex.
+ * A workqueue user should use this function directly iff it wants to apply
+ * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
+ * apply_workqueue_attrs() may race against userland updating the
+ * attributes.
  *
- * RETURNS:
- * %true if some freezable workqueues are still busy.  %false if freezing
- * is complete.
+ * Return: 0 on success, -errno on failure.
  */
-bool freeze_workqueues_busy(void)
+int workqueue_sysfs_register(struct workqueue_struct *wq)
 {
-       bool busy = false;
-       struct workqueue_struct *wq;
-       struct pool_workqueue *pwq;
+       struct wq_device *wq_dev;
+       int ret;
 
-       mutex_lock(&wq_pool_mutex);
+       /*
+        * Adjusting max_active or creating new pwqs by applying
+        * attributes breaks ordering guarantee.  Disallow exposing ordered
+        * workqueues.
+        */
+       if (WARN_ON(wq->flags & __WQ_ORDERED))
+               return -EINVAL;
 
-       WARN_ON_ONCE(!workqueue_freezing);
+       wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
+       if (!wq_dev)
+               return -ENOMEM;
 
-       list_for_each_entry(wq, &workqueues, list) {
-               if (!(wq->flags & WQ_FREEZABLE))
-                       continue;
-               /*
-                * nr_active is monotonically decreasing.  It's safe
-                * to peek without lock.
-                */
-               rcu_read_lock_sched();
-               for_each_pwq(pwq, wq) {
-                       WARN_ON_ONCE(pwq->nr_active < 0);
-                       if (pwq->nr_active) {
-                               busy = true;
-                               rcu_read_unlock_sched();
-                               goto out_unlock;
+       wq_dev->wq = wq;
+       wq_dev->dev.bus = &wq_subsys;
+       wq_dev->dev.init_name = wq->name;
+       wq_dev->dev.release = wq_device_release;
+
+       /*
+        * unbound_attrs are created separately.  Suppress uevent until
+        * everything is ready.
+        */
+       dev_set_uevent_suppress(&wq_dev->dev, true);
+
+       ret = device_register(&wq_dev->dev);
+       if (ret) {
+               kfree(wq_dev);
+               wq->wq_dev = NULL;
+               return ret;
+       }
+
+       if (wq->flags & WQ_UNBOUND) {
+               struct device_attribute *attr;
+
+               for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
+                       ret = device_create_file(&wq_dev->dev, attr);
+                       if (ret) {
+                               device_unregister(&wq_dev->dev);
+                               wq->wq_dev = NULL;
+                               return ret;
                        }
                }
-               rcu_read_unlock_sched();
        }
-out_unlock:
-       mutex_unlock(&wq_pool_mutex);
-       return busy;
+
+       dev_set_uevent_suppress(&wq_dev->dev, false);
+       kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
+       return 0;
 }
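To illustrate the "apply attrs first, then register" usage the comment above recommends, a hypothetical driver-side setup might look like this; the workqueue name, nice value and error handling are assumptions for the sketch.

/* Hypothetical: publish an unbound workqueue in sysfs only after tuning its attrs. */
static struct workqueue_struct *my_wq;

static int my_wq_setup(void)
{
	struct workqueue_attrs *attrs;
	int ret = -ENOMEM;

	my_wq = alloc_workqueue("my_unbound_wq", WQ_UNBOUND, 0);	/* no WQ_SYSFS yet */
	if (!my_wq)
		return -ENOMEM;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!attrs)
		goto err;

	attrs->nice = 10;
	ret = apply_workqueue_attrs(my_wq, attrs);
	free_workqueue_attrs(attrs);
	if (ret)
		goto err;

	ret = workqueue_sysfs_register(my_wq);	/* now visible under /sys/bus/workqueue/devices */
	if (ret)
		goto err;
	return 0;
err:
	destroy_workqueue(my_wq);
	return ret;
}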
 
 /**
- * thaw_workqueues - thaw workqueues
- *
- * Thaw workqueues.  Normal queueing is restored and all collected
- * frozen works are transferred to their respective pool worklists.
+ * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
+ * @wq: the workqueue to unregister
  *
- * CONTEXT:
- * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
+ * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
  */
-void thaw_workqueues(void)
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
 {
-       struct workqueue_struct *wq;
-       struct pool_workqueue *pwq;
-       struct worker_pool *pool;
-       int pi;
-
-       mutex_lock(&wq_pool_mutex);
-
-       if (!workqueue_freezing)
-               goto out_unlock;
-
-       /* clear FREEZING */
-       for_each_pool(pool, pi) {
-               spin_lock_irq(&pool->lock);
-               WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
-               pool->flags &= ~POOL_FREEZING;
-               spin_unlock_irq(&pool->lock);
-       }
+       struct wq_device *wq_dev = wq->wq_dev;
 
-       /* restore max_active and repopulate worklist */
-       list_for_each_entry(wq, &workqueues, list) {
-               mutex_lock(&wq->mutex);
-               for_each_pwq(pwq, wq)
-                       pwq_adjust_max_active(pwq);
-               mutex_unlock(&wq->mutex);
-       }
+       if (!wq->wq_dev)
+               return;
 
-       workqueue_freezing = false;
-out_unlock:
-       mutex_unlock(&wq_pool_mutex);
+       wq->wq_dev = NULL;
+       device_unregister(&wq_dev->dev);
 }
-#endif /* CONFIG_FREEZER */
+#else  /* CONFIG_SYSFS */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)    { }
+#endif /* CONFIG_SYSFS */
 
 static void __init wq_numa_init(void)
 {
        cpumask_var_t *tbl;
        int node, cpu;
 
-       /* determine NUMA pwq table len - highest node id + 1 */
-       for_each_node(node)
-               wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
-
        if (num_possible_nodes() <= 1)
                return;
 
@@ -4901,11 +5238,11 @@ static void __init wq_numa_init(void)
         * available.  Build one from cpu_to_node() which should have been
         * fully initialized by now.
         */
-       tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL);
+       tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
        BUG_ON(!tbl);
 
        for_each_node(node)
-               BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
+               BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
                                node_online(node) ? node : NUMA_NO_NODE));
 
        for_each_possible_cpu(cpu) {
@@ -4927,12 +5264,11 @@ static int __init init_workqueues(void)
        int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
        int i, cpu;
 
-       /* make sure we have enough bits for OFFQ pool ID */
-       BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
-                    WORK_CPU_END * NR_STD_WORKER_POOLS);
-
        WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
+       BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
+       cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+
        pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
        cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
@@ -4965,17 +5301,27 @@ static int __init init_workqueues(void)
 
                for_each_cpu_worker_pool(pool, cpu) {
                        pool->flags &= ~POOL_DISASSOCIATED;
-                       BUG_ON(create_and_start_worker(pool) < 0);
+                       BUG_ON(!create_worker(pool));
                }
        }
 
-       /* create default unbound wq attrs */
+       /* create default unbound and ordered wq attrs */
        for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
                struct workqueue_attrs *attrs;
 
                BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
                attrs->nice = std_nice[i];
                unbound_std_wq_attrs[i] = attrs;
+
+               /*
+                * An ordered wq should have only one pwq as ordering is
+                * guaranteed by max_active which is enforced by pwqs.
+                * Turn off NUMA so that dfl_pwq is used for all nodes.
+                */
+               BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
+               attrs->nice = std_nice[i];
+               attrs->no_numa = true;
+               ordered_wq_attrs[i] = attrs;
        }
 
        system_wq = alloc_workqueue("events", 0, 0);
@@ -4985,8 +5331,15 @@ static int __init init_workqueues(void)
                                            WQ_UNBOUND_MAX_ACTIVE);
        system_freezable_wq = alloc_workqueue("events_freezable",
                                              WQ_FREEZABLE, 0);
+       system_power_efficient_wq = alloc_workqueue("events_power_efficient",
+                                             WQ_POWER_EFFICIENT, 0);
+       system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
+                                             WQ_FREEZABLE | WQ_POWER_EFFICIENT,
+                                             0);
        BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
-              !system_unbound_wq || !system_freezable_wq);
+              !system_unbound_wq || !system_freezable_wq ||
+              !system_power_efficient_wq ||
+              !system_freezable_power_efficient_wq);
        return 0;
 }
 early_initcall(init_workqueues);
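As a usage note for the two system workqueues introduced above: work with no CPU-locality requirement can simply be queued to them, e.g. the hypothetical housekeeping job below (the extern declarations are assumed to come from the matching linux/workqueue.h change).

#include <linux/workqueue.h>

/* Hypothetical user of the power-efficient system workqueue. */
static void housekeeping_fn(struct work_struct *work)
{
	/* periodic cleanup with no CPU-locality requirement */
}
static DECLARE_DELAYED_WORK(housekeeping_work, housekeeping_fn);

static void kick_housekeeping(void)
{
	queue_delayed_work(system_power_efficient_wq, &housekeeping_work, HZ);
}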