f43e2aa354a6bd4ccb1a4f89ea5641ae3606785f
[firefly-linux-kernel-4.4.55.git] / drivers / block / drbd / drbd_main.c
1 /*
2    drbd.c
3
4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10    Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11    from Logicworks, Inc. for making SDP replication support possible.
12
13    drbd is free software; you can redistribute it and/or modify
14    it under the terms of the GNU General Public License as published by
15    the Free Software Foundation; either version 2, or (at your option)
16    any later version.
17
18    drbd is distributed in the hope that it will be useful,
19    but WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21    GNU General Public License for more details.
22
23    You should have received a copy of the GNU General Public License
24    along with drbd; see the file COPYING.  If not, write to
25    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27  */
28
29 #include <linux/module.h>
30 #include <linux/drbd.h>
31 #include <asm/uaccess.h>
32 #include <asm/types.h>
33 #include <net/sock.h>
34 #include <linux/ctype.h>
35 #include <linux/mutex.h>
36 #include <linux/fs.h>
37 #include <linux/file.h>
38 #include <linux/proc_fs.h>
39 #include <linux/init.h>
40 #include <linux/mm.h>
41 #include <linux/memcontrol.h>
42 #include <linux/mm_inline.h>
43 #include <linux/slab.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/notifier.h>
47 #include <linux/kthread.h>
48
49 #define __KERNEL_SYSCALLS__
50 #include <linux/unistd.h>
51 #include <linux/vmalloc.h>
52
53 #include <linux/drbd_limits.h>
54 #include "drbd_int.h"
55 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57 #include "drbd_vli.h"
58
59 struct after_state_chg_work {   /* one deferred state-change notification */
60         struct drbd_work w;             /* embedded work item */
61         union drbd_state os;            /* presumably the old state - confirm against the code that fills this in */
62         union drbd_state ns;            /* presumably the new state - confirm against the code that fills this in */
63         enum chg_state_flags flags;     /* flags of the state change request */
64         struct completion *done;        /* NOTE(review): looks optional (callers here pass NULL or &done to _drbd_set_state) - confirm */
65 };
66
67 static DEFINE_MUTEX(drbd_main_mutex);
68 int drbdd_init(struct drbd_thread *);
69 int drbd_worker(struct drbd_thread *);
70 int drbd_asender(struct drbd_thread *);
71
72 int drbd_init(void);
73 static int drbd_open(struct block_device *bdev, fmode_t mode);
74 static int drbd_release(struct gendisk *gd, fmode_t mode);
75 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77                            union drbd_state ns, enum chg_state_flags flags);
78 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79 static void md_sync_timer_fn(unsigned long data);
80 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
82
83 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
84               "Lars Ellenberg <lars@linbit.com>");
85 MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
86 MODULE_VERSION(REL_VERSION);
87 MODULE_LICENSE("GPL");
88 MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
89 MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
90
91 #include <linux/moduleparam.h>
92 /* allow_open_on_secondary */
93 MODULE_PARM_DESC(allow_oos, "DONT USE!");
94 /* thanks to these macros, if compiled into the kernel (not-module),
95  * this becomes the boot parameter drbd.minor_count */
96 module_param(minor_count, uint, 0444);
97 module_param(disable_sendpage, bool, 0644);
98 module_param(allow_oos, bool, 0);
99 module_param(cn_idx, uint, 0444);
100 module_param(proc_details, int, 0644);
101
102 #ifdef CONFIG_DRBD_FAULT_INJECTION
103 int enable_faults;
104 int fault_rate;
105 static int fault_count;
106 int fault_devs;
107 /* bitmap of enabled faults */
108 module_param(enable_faults, int, 0664);
109 /* fault rate % value - applies to all enabled faults */
110 module_param(fault_rate, int, 0664);
111 /* count of faults inserted */
112 module_param(fault_count, int, 0664);
113 /* bitmap of devices to insert faults on */
114 module_param(fault_devs, int, 0644);
115 #endif
116
117 /* module parameter, defined */
118 unsigned int minor_count = 32;
119 int disable_sendpage;
120 int allow_oos;
121 unsigned int cn_idx = CN_IDX_DRBD;
122 int proc_details;       /* Detail level in proc drbd*/
123
124 /* Module parameter for setting the user mode helper program
125  * to run. Default is /sbin/drbdadm */
126 char usermode_helper[80] = "/sbin/drbdadm";
127
128 module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
129
130 /* in 2.6.x, our device mapping and config info contains our virtual gendisks
131  * as member "struct gendisk *vdisk;"
132  */
133 struct drbd_conf **minor_table;
134
135 struct kmem_cache *drbd_request_cache;
136 struct kmem_cache *drbd_ee_cache;       /* epoch entries */
137 struct kmem_cache *drbd_bm_ext_cache;   /* bitmap extents */
138 struct kmem_cache *drbd_al_ext_cache;   /* activity log extents */
139 mempool_t *drbd_request_mempool;
140 mempool_t *drbd_ee_mempool;
141
142 /* I do not use a standard mempool, because:
143    1) I want to hand out the pre-allocated objects first.
144    2) I want to be able to interrupt sleeping allocation with a signal.
145    Note: This is a single linked list, the next pointer is the private
146          member of struct page.
147  */
148 struct page *drbd_pp_pool;
149 spinlock_t   drbd_pp_lock;
150 int          drbd_pp_vacant;
151 wait_queue_head_t drbd_pp_wait;
152
153 DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
154
155 static const struct block_device_operations drbd_ops = {
156         .owner =   THIS_MODULE,
157         .open =    drbd_open,
158         .release = drbd_release,
159 };
160
/* Number of elements in array A.  A must be a real array, not a pointer.
 * The argument is parenthesized before indexing so that expression
 * arguments (e.g. a dereferenced cast) are indexed as a whole. */
#define ARRY_SIZE(A) (sizeof(A)/sizeof((A)[0]))
162
163 #ifdef __CHECKER__
164 /* When checking with sparse, and this is an inline function, sparse will
165    give tons of false positives. When this is a real functions sparse works.
166  */
167 int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
168 {
169         int io_allowed;
170
171         atomic_inc(&mdev->local_cnt);
172         io_allowed = (mdev->state.disk >= mins);
173         if (!io_allowed) {
174                 if (atomic_dec_and_test(&mdev->local_cnt))
175                         wake_up(&mdev->misc_wait);
176         }
177         return io_allowed;
178 }
179
180 #endif
181
182 /**
183  * DOC: The transfer log
184  *
185  * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
186  * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
187  * of the list. There is always at least one &struct drbd_tl_epoch object.
188  *
189  * Each &struct drbd_tl_epoch has a circular double linked list of requests
190  * attached.
191  */
192 static int tl_init(struct drbd_conf *mdev)
193 {
194         struct drbd_tl_epoch *b;
195
196         /* during device minor initialization, we may well use GFP_KERNEL */
197         b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
198         if (!b)
199                 return 0;
200         INIT_LIST_HEAD(&b->requests);
201         INIT_LIST_HEAD(&b->w.list);
202         b->next = NULL;
203         b->br_number = 4711;
204         b->n_writes = 0;
205         b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
206
207         mdev->oldest_tle = b;
208         mdev->newest_tle = b;
209         INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
210
211         mdev->tl_hash = NULL;
212         mdev->tl_hash_s = 0;
213
214         return 1;
215 }
216
217 static void tl_cleanup(struct drbd_conf *mdev)
218 {
219         D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
220         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
221         kfree(mdev->oldest_tle);
222         mdev->oldest_tle = NULL;
223         kfree(mdev->unused_spare_tle);
224         mdev->unused_spare_tle = NULL;
225         kfree(mdev->tl_hash);
226         mdev->tl_hash = NULL;
227         mdev->tl_hash_s = 0;
228 }
229
230 /**
231  * _tl_add_barrier() - Adds a barrier to the transfer log
232  * @mdev:       DRBD device.
233  * @new:        Barrier to be added before the current head of the TL.
234  *
235  * The caller must hold the req_lock.
236  */
237 void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
238 {
239         struct drbd_tl_epoch *newest_before;
240
241         INIT_LIST_HEAD(&new->requests);
242         INIT_LIST_HEAD(&new->w.list);
243         new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
244         new->next = NULL;
245         new->n_writes = 0;
246
247         newest_before = mdev->newest_tle;
248         /* never send a barrier number == 0, because that is special-cased
249          * when using TCQ for our write ordering code */
250         new->br_number = (newest_before->br_number+1) ?: 1;
251         if (mdev->newest_tle != new) {
252                 mdev->newest_tle->next = new;
253                 mdev->newest_tle = new;
254         }
255 }
256
257 /**
258  * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
259  * @mdev:       DRBD device.
260  * @barrier_nr: Expected identifier of the DRBD write barrier packet.
261  * @set_size:   Expected number of requests before that barrier.
262  *
263  * In case the passed barrier_nr or set_size does not match the oldest
264  * &struct drbd_tl_epoch objects this function will cause a termination
265  * of the connection.
266  */
267 void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
268                        unsigned int set_size)
269 {
270         struct drbd_tl_epoch *b, *nob; /* next old barrier */
271         struct list_head *le, *tle;
272         struct drbd_request *r;
273
274         spin_lock_irq(&mdev->req_lock);
275
276         b = mdev->oldest_tle;
277
278         /* first some paranoia code */
279         if (b == NULL) {
280                 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
281                         barrier_nr);
282                 goto bail;
283         }
284         if (b->br_number != barrier_nr) {
285                 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
286                         barrier_nr, b->br_number);
287                 goto bail;
288         }
289         if (b->n_writes != set_size) {
290                 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
291                         barrier_nr, set_size, b->n_writes);
292                 goto bail;
293         }
294
295         /* Clean up list of requests processed during current epoch */
296         list_for_each_safe(le, tle, &b->requests) {
297                 r = list_entry(le, struct drbd_request, tl_requests);
298                 _req_mod(r, barrier_acked);
299         }
300         /* There could be requests on the list waiting for completion
301            of the write to the local disk. To avoid corruptions of
302            slab's data structures we have to remove the lists head.
303
304            Also there could have been a barrier ack out of sequence, overtaking
305            the write acks - which would be a bug and violating write ordering.
306            To not deadlock in case we lose connection while such requests are
307            still pending, we need some way to find them for the
308            _req_mode(connection_lost_while_pending).
309
310            These have been list_move'd to the out_of_sequence_requests list in
311            _req_mod(, barrier_acked) above.
312            */
313         list_del_init(&b->requests);
314
315         nob = b->next;
316         if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
317                 _tl_add_barrier(mdev, b);
318                 if (nob)
319                         mdev->oldest_tle = nob;
320                 /* if nob == NULL b was the only barrier, and becomes the new
321                    barrier. Therefore mdev->oldest_tle points already to b */
322         } else {
323                 D_ASSERT(nob != NULL);
324                 mdev->oldest_tle = nob;
325                 kfree(b);
326         }
327
328         spin_unlock_irq(&mdev->req_lock);
329         dec_ap_pending(mdev);
330
331         return;
332
333 bail:
334         spin_unlock_irq(&mdev->req_lock);
335         drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
336 }
337
338 /**
339  * _tl_restart() - Walks the transfer log, and applies an action to all requests
340  * @mdev:       DRBD device.
341  * @what:       The action/event to perform with all request objects
342  *
343  * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
344  * restart_frozen_disk_io.
345  */
346 static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
347 {
348         struct drbd_tl_epoch *b, *tmp, **pn;
349         struct list_head *le, *tle, carry_reads;
350         struct drbd_request *req;
351         int rv, n_writes, n_reads;
352
353         b = mdev->oldest_tle;
354         pn = &mdev->oldest_tle;
355         while (b) {
356                 n_writes = 0;
357                 n_reads = 0;
358                 INIT_LIST_HEAD(&carry_reads);
359                 list_for_each_safe(le, tle, &b->requests) {
360                         req = list_entry(le, struct drbd_request, tl_requests);
361                         rv = _req_mod(req, what);
362
363                         n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
364                         n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
365                 }
366                 tmp = b->next;
367
368                 if (n_writes) {
369                         if (what == resend) {
370                                 b->n_writes = n_writes;
371                                 if (b->w.cb == NULL) {
372                                         b->w.cb = w_send_barrier;
373                                         inc_ap_pending(mdev);
374                                         set_bit(CREATE_BARRIER, &mdev->flags);
375                                 }
376
377                                 drbd_queue_work(&mdev->data.work, &b->w);
378                         }
379                         pn = &b->next;
380                 } else {
381                         if (n_reads)
382                                 list_add(&carry_reads, &b->requests);
383                         /* there could still be requests on that ring list,
384                          * in case local io is still pending */
385                         list_del(&b->requests);
386
387                         /* dec_ap_pending corresponding to queue_barrier.
388                          * the newest barrier may not have been queued yet,
389                          * in which case w.cb is still NULL. */
390                         if (b->w.cb != NULL)
391                                 dec_ap_pending(mdev);
392
393                         if (b == mdev->newest_tle) {
394                                 /* recycle, but reinit! */
395                                 D_ASSERT(tmp == NULL);
396                                 INIT_LIST_HEAD(&b->requests);
397                                 list_splice(&carry_reads, &b->requests);
398                                 INIT_LIST_HEAD(&b->w.list);
399                                 b->w.cb = NULL;
400                                 b->br_number = net_random();
401                                 b->n_writes = 0;
402
403                                 *pn = b;
404                                 break;
405                         }
406                         *pn = tmp;
407                         kfree(b);
408                 }
409                 b = tmp;
410                 list_splice(&carry_reads, &b->requests);
411         }
412 }
413
414
415 /**
416  * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
417  * @mdev:       DRBD device.
418  *
419  * This is called after the connection to the peer was lost. The storage covered
420  * by the requests on the transfer gets marked as our of sync. Called from the
421  * receiver thread and the worker thread.
422  */
423 void tl_clear(struct drbd_conf *mdev)
424 {
425         struct list_head *le, *tle;
426         struct drbd_request *r;
427
428         spin_lock_irq(&mdev->req_lock);
429
430         _tl_restart(mdev, connection_lost_while_pending);
431
432         /* we expect this list to be empty. */
433         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
434
435         /* but just in case, clean it up anyways! */
436         list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
437                 r = list_entry(le, struct drbd_request, tl_requests);
438                 /* It would be nice to complete outside of spinlock.
439                  * But this is easier for now. */
440                 _req_mod(r, connection_lost_while_pending);
441         }
442
443         /* ensure bit indicating barrier is required is clear */
444         clear_bit(CREATE_BARRIER, &mdev->flags);
445
446         memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
447
448         spin_unlock_irq(&mdev->req_lock);
449 }
450
451 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
452 {
453         spin_lock_irq(&mdev->req_lock);
454         _tl_restart(mdev, what);
455         spin_unlock_irq(&mdev->req_lock);
456 }
457
458 /**
459  * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
460  * @mdev:       DRBD device.
461  * @os:         old (current) state.
462  * @ns:         new (wanted) state.
463  */
464 static int cl_wide_st_chg(struct drbd_conf *mdev,
465                           union drbd_state os, union drbd_state ns)
466 {
467         return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
468                  ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
469                   (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
470                   (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
471                   (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
472                 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
473                 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
474 }
475
476 int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
477                       union drbd_state mask, union drbd_state val)
478 {
479         unsigned long flags;
480         union drbd_state os, ns;
481         int rv;
482
483         spin_lock_irqsave(&mdev->req_lock, flags);
484         os = mdev->state;
485         ns.i = (os.i & ~mask.i) | val.i;
486         rv = _drbd_set_state(mdev, ns, f, NULL);
487         ns = mdev->state;
488         spin_unlock_irqrestore(&mdev->req_lock, flags);
489
490         return rv;
491 }
492
493 /**
494  * drbd_force_state() - Impose a change which happens outside our control on our state
495  * @mdev:       DRBD device.
496  * @mask:       mask of state bits to change.
497  * @val:        value of new state bits.
498  */
499 void drbd_force_state(struct drbd_conf *mdev,
500         union drbd_state mask, union drbd_state val)
501 {
502         drbd_change_state(mdev, CS_HARD, mask, val);
503 }
504
505 static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
506 static int is_valid_state_transition(struct drbd_conf *,
507                                      union drbd_state, union drbd_state);
508 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
509                                        union drbd_state ns, const char **warn_sync_abort);
510 int drbd_send_state_req(struct drbd_conf *,
511                         union drbd_state, union drbd_state);
512
513 static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
514                                     union drbd_state mask, union drbd_state val)
515 {
516         union drbd_state os, ns;
517         unsigned long flags;
518         int rv;
519
520         if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
521                 return SS_CW_SUCCESS;
522
523         if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
524                 return SS_CW_FAILED_BY_PEER;
525
526         rv = 0;
527         spin_lock_irqsave(&mdev->req_lock, flags);
528         os = mdev->state;
529         ns.i = (os.i & ~mask.i) | val.i;
530         ns = sanitize_state(mdev, os, ns, NULL);
531
532         if (!cl_wide_st_chg(mdev, os, ns))
533                 rv = SS_CW_NO_NEED;
534         if (!rv) {
535                 rv = is_valid_state(mdev, ns);
536                 if (rv == SS_SUCCESS) {
537                         rv = is_valid_state_transition(mdev, ns, os);
538                         if (rv == SS_SUCCESS)
539                                 rv = 0; /* cont waiting, otherwise fail. */
540                 }
541         }
542         spin_unlock_irqrestore(&mdev->req_lock, flags);
543
544         return rv;
545 }
546
547 /**
548  * drbd_req_state() - Perform an eventually cluster wide state change
549  * @mdev:       DRBD device.
550  * @mask:       mask of state bits to change.
551  * @val:        value of new state bits.
552  * @f:          flags
553  *
554  * Should not be called directly, use drbd_request_state() or
555  * _drbd_request_state().
556  */
557 static int drbd_req_state(struct drbd_conf *mdev,
558                           union drbd_state mask, union drbd_state val,
559                           enum chg_state_flags f)
560 {
561         struct completion done;
562         unsigned long flags;
563         union drbd_state os, ns;
564         int rv;
565
566         init_completion(&done);
567
568         if (f & CS_SERIALIZE)
569                 mutex_lock(&mdev->state_mutex);
570
571         spin_lock_irqsave(&mdev->req_lock, flags);
572         os = mdev->state;
573         ns.i = (os.i & ~mask.i) | val.i;
574         ns = sanitize_state(mdev, os, ns, NULL);
575
576         if (cl_wide_st_chg(mdev, os, ns)) {
577                 rv = is_valid_state(mdev, ns);
578                 if (rv == SS_SUCCESS)
579                         rv = is_valid_state_transition(mdev, ns, os);
580                 spin_unlock_irqrestore(&mdev->req_lock, flags);
581
582                 if (rv < SS_SUCCESS) {
583                         if (f & CS_VERBOSE)
584                                 print_st_err(mdev, os, ns, rv);
585                         goto abort;
586                 }
587
588                 drbd_state_lock(mdev);
589                 if (!drbd_send_state_req(mdev, mask, val)) {
590                         drbd_state_unlock(mdev);
591                         rv = SS_CW_FAILED_BY_PEER;
592                         if (f & CS_VERBOSE)
593                                 print_st_err(mdev, os, ns, rv);
594                         goto abort;
595                 }
596
597                 wait_event(mdev->state_wait,
598                         (rv = _req_st_cond(mdev, mask, val)));
599
600                 if (rv < SS_SUCCESS) {
601                         drbd_state_unlock(mdev);
602                         if (f & CS_VERBOSE)
603                                 print_st_err(mdev, os, ns, rv);
604                         goto abort;
605                 }
606                 spin_lock_irqsave(&mdev->req_lock, flags);
607                 os = mdev->state;
608                 ns.i = (os.i & ~mask.i) | val.i;
609                 rv = _drbd_set_state(mdev, ns, f, &done);
610                 drbd_state_unlock(mdev);
611         } else {
612                 rv = _drbd_set_state(mdev, ns, f, &done);
613         }
614
615         spin_unlock_irqrestore(&mdev->req_lock, flags);
616
617         if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
618                 D_ASSERT(current != mdev->worker.task);
619                 wait_for_completion(&done);
620         }
621
622 abort:
623         if (f & CS_SERIALIZE)
624                 mutex_unlock(&mdev->state_mutex);
625
626         return rv;
627 }
628
629 /**
630  * _drbd_request_state() - Request a state change (with flags)
631  * @mdev:       DRBD device.
632  * @mask:       mask of state bits to change.
633  * @val:        value of new state bits.
634  * @f:          flags
635  *
636  * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
637  * flag, or when logging of failed state change requests is not desired.
638  */
639 int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
640                         union drbd_state val,   enum chg_state_flags f)
641 {
642         int rv;
643
644         wait_event(mdev->state_wait,
645                    (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
646
647         return rv;
648 }
649
650 static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
651 {
652         dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
653             name,
654             drbd_conn_str(ns.conn),
655             drbd_role_str(ns.role),
656             drbd_role_str(ns.peer),
657             drbd_disk_str(ns.disk),
658             drbd_disk_str(ns.pdsk),
659             is_susp(ns) ? 's' : 'r',
660             ns.aftr_isp ? 'a' : '-',
661             ns.peer_isp ? 'p' : '-',
662             ns.user_isp ? 'u' : '-'
663             );
664 }
665
666 void print_st_err(struct drbd_conf *mdev,
667         union drbd_state os, union drbd_state ns, int err)
668 {
669         if (err == SS_IN_TRANSIENT_STATE)
670                 return;
671         dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
672         print_st(mdev, " state", os);
673         print_st(mdev, "wanted", ns);
674 }
675
676
677 /**
678  * is_valid_state() - Returns an SS_ error code if ns is not valid
679  * @mdev:       DRBD device.
680  * @ns:         State to consider.
681  */
682 static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
683 {
684         /* See drbd_state_sw_errors in drbd_strings.c */
685
686         enum drbd_fencing_p fp;
687         int rv = SS_SUCCESS;
688
689         fp = FP_DONT_CARE;
690         if (get_ldev(mdev)) {
691                 fp = mdev->ldev->dc.fencing;
692                 put_ldev(mdev);
693         }
694
695         if (get_net_conf(mdev)) {
696                 if (!mdev->net_conf->two_primaries &&
697                     ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
698                         rv = SS_TWO_PRIMARIES;
699                 put_net_conf(mdev);
700         }
701
702         if (rv <= 0)
703                 /* already found a reason to abort */;
704         else if (ns.role == R_SECONDARY && mdev->open_cnt)
705                 rv = SS_DEVICE_IN_USE;
706
707         else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
708                 rv = SS_NO_UP_TO_DATE_DISK;
709
710         else if (fp >= FP_RESOURCE &&
711                  ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
712                 rv = SS_PRIMARY_NOP;
713
714         else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
715                 rv = SS_NO_UP_TO_DATE_DISK;
716
717         else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
718                 rv = SS_NO_LOCAL_DISK;
719
720         else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
721                 rv = SS_NO_REMOTE_DISK;
722
723         else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
724                 rv = SS_NO_UP_TO_DATE_DISK;
725
726         else if ((ns.conn == C_CONNECTED ||
727                   ns.conn == C_WF_BITMAP_S ||
728                   ns.conn == C_SYNC_SOURCE ||
729                   ns.conn == C_PAUSED_SYNC_S) &&
730                   ns.disk == D_OUTDATED)
731                 rv = SS_CONNECTED_OUTDATES;
732
733         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
734                  (mdev->sync_conf.verify_alg[0] == 0))
735                 rv = SS_NO_VERIFY_ALG;
736
737         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
738                   mdev->agreed_pro_version < 88)
739                 rv = SS_NOT_SUPPORTED;
740
741         return rv;
742 }
743
744 /**
745  * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
746  * @mdev:       DRBD device.
747  * @ns:         new state.
748  * @os:         old state.
749  */
750 static int is_valid_state_transition(struct drbd_conf *mdev,
751                                      union drbd_state ns, union drbd_state os)
752 {
753         int rv = SS_SUCCESS;
754
755         if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
756             os.conn > C_CONNECTED)
757                 rv = SS_RESYNC_RUNNING;
758
759         if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
760                 rv = SS_ALREADY_STANDALONE;
761
762         if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
763                 rv = SS_IS_DISKLESS;
764
765         if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
766                 rv = SS_NO_NET_CONFIG;
767
768         if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
769                 rv = SS_LOWER_THAN_OUTDATED;
770
771         if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
772                 rv = SS_IN_TRANSIENT_STATE;
773
774         if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
775                 rv = SS_IN_TRANSIENT_STATE;
776
777         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
778                 rv = SS_NEED_CONNECTION;
779
780         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
781             ns.conn != os.conn && os.conn > C_CONNECTED)
782                 rv = SS_RESYNC_RUNNING;
783
784         if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
785             os.conn < C_CONNECTED)
786                 rv = SS_NEED_CONNECTION;
787
788         if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
789             && os.conn < C_WF_REPORT_PARAMS)
790                 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
791
792         return rv;
793 }
794
795 /**
796  * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
797  * @mdev:       DRBD device.
798  * @os:         old state.
799  * @ns:         new state.
800  * @warn_sync_abort:
801  *
802  * When we lose connection, we have to set the state of the peer's disk (pdsk)
803  * to D_UNKNOWN. This rule and many more along those lines are in this function.
804  */
805 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
806                                        union drbd_state ns, const char **warn_sync_abort)
807 {
808         enum drbd_fencing_p fp;
809         enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
810
811         fp = FP_DONT_CARE;
812         if (get_ldev(mdev)) {
813                 fp = mdev->ldev->dc.fencing;
814                 put_ldev(mdev);
815         }
816
817         /* Disallow Network errors to configure a device's network part */
818         if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
819             os.conn <= C_DISCONNECTING)
820                 ns.conn = os.conn;
821
822         /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
823          * If you try to go into some Sync* state, that shall fail (elsewhere). */
824         if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
825             ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
826                 ns.conn = os.conn;
827
828         /* we cannot fail (again) if we already detached */
829         if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
830                 ns.disk = D_DISKLESS;
831
832         /* if we are only D_ATTACHING yet,
833          * we can (and should) go directly to D_DISKLESS. */
834         if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
835                 ns.disk = D_DISKLESS;
836
837         /* After C_DISCONNECTING only C_STANDALONE may follow */
838         if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
839                 ns.conn = os.conn;
840
841         if (ns.conn < C_CONNECTED) {
842                 ns.peer_isp = 0;
843                 ns.peer = R_UNKNOWN;
844                 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
845                         ns.pdsk = D_UNKNOWN;
846         }
847
848         /* Clear the aftr_isp when becoming unconfigured */
849         if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
850                 ns.aftr_isp = 0;
851
852         /* Abort resync if a disk fails/detaches */
853         if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
854             (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
855                 if (warn_sync_abort)
856                         *warn_sync_abort =
857                                 os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
858                                 "Online-verify" : "Resync";
859                 ns.conn = C_CONNECTED;
860         }
861
862         /* Connection breaks down before we finished "Negotiating" */
863         if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
864             get_ldev_if_state(mdev, D_NEGOTIATING)) {
865                 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
866                         ns.disk = mdev->new_state_tmp.disk;
867                         ns.pdsk = mdev->new_state_tmp.pdsk;
868                 } else {
869                         dev_alert(DEV, "Connection lost while negotiating, no data!\n");
870                         ns.disk = D_DISKLESS;
871                         ns.pdsk = D_UNKNOWN;
872                 }
873                 put_ldev(mdev);
874         }
875
876         /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
877         if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
878                 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
879                         ns.disk = D_UP_TO_DATE;
880                 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
881                         ns.pdsk = D_UP_TO_DATE;
882         }
883
884         /* Implications of the connection stat on the disk states */
885         disk_min = D_DISKLESS;
886         disk_max = D_UP_TO_DATE;
887         pdsk_min = D_INCONSISTENT;
888         pdsk_max = D_UNKNOWN;
889         switch ((enum drbd_conns)ns.conn) {
890         case C_WF_BITMAP_T:
891         case C_PAUSED_SYNC_T:
892         case C_STARTING_SYNC_T:
893         case C_WF_SYNC_UUID:
894         case C_BEHIND:
895                 disk_min = D_INCONSISTENT;
896                 disk_max = D_OUTDATED;
897                 pdsk_min = D_UP_TO_DATE;
898                 pdsk_max = D_UP_TO_DATE;
899                 break;
900         case C_VERIFY_S:
901         case C_VERIFY_T:
902                 disk_min = D_UP_TO_DATE;
903                 disk_max = D_UP_TO_DATE;
904                 pdsk_min = D_UP_TO_DATE;
905                 pdsk_max = D_UP_TO_DATE;
906                 break;
907         case C_CONNECTED:
908                 disk_min = D_DISKLESS;
909                 disk_max = D_UP_TO_DATE;
910                 pdsk_min = D_DISKLESS;
911                 pdsk_max = D_UP_TO_DATE;
912                 break;
913         case C_WF_BITMAP_S:
914         case C_PAUSED_SYNC_S:
915         case C_STARTING_SYNC_S:
916         case C_AHEAD:
917                 disk_min = D_UP_TO_DATE;
918                 disk_max = D_UP_TO_DATE;
919                 pdsk_min = D_INCONSISTENT;
920                 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
921                 break;
922         case C_SYNC_TARGET:
923                 disk_min = D_INCONSISTENT;
924                 disk_max = D_INCONSISTENT;
925                 pdsk_min = D_UP_TO_DATE;
926                 pdsk_max = D_UP_TO_DATE;
927                 break;
928         case C_SYNC_SOURCE:
929                 disk_min = D_UP_TO_DATE;
930                 disk_max = D_UP_TO_DATE;
931                 pdsk_min = D_INCONSISTENT;
932                 pdsk_max = D_INCONSISTENT;
933                 break;
934         case C_STANDALONE:
935         case C_DISCONNECTING:
936         case C_UNCONNECTED:
937         case C_TIMEOUT:
938         case C_BROKEN_PIPE:
939         case C_NETWORK_FAILURE:
940         case C_PROTOCOL_ERROR:
941         case C_TEAR_DOWN:
942         case C_WF_CONNECTION:
943         case C_WF_REPORT_PARAMS:
944         case C_MASK:
945                 break;
946         }
947         if (ns.disk > disk_max)
948                 ns.disk = disk_max;
949
950         if (ns.disk < disk_min) {
951                 dev_warn(DEV, "Implicitly set disk from %s to %s\n",
952                          drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
953                 ns.disk = disk_min;
954         }
955         if (ns.pdsk > pdsk_max)
956                 ns.pdsk = pdsk_max;
957
958         if (ns.pdsk < pdsk_min) {
959                 dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
960                          drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
961                 ns.pdsk = pdsk_min;
962         }
963
964         if (fp == FP_STONITH &&
965             (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
966             !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
967                 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
968
969         if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
970             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
971             !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
972                 ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
973
974         if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
975                 if (ns.conn == C_SYNC_SOURCE)
976                         ns.conn = C_PAUSED_SYNC_S;
977                 if (ns.conn == C_SYNC_TARGET)
978                         ns.conn = C_PAUSED_SYNC_T;
979         } else {
980                 if (ns.conn == C_PAUSED_SYNC_S)
981                         ns.conn = C_SYNC_SOURCE;
982                 if (ns.conn == C_PAUSED_SYNC_T)
983                         ns.conn = C_SYNC_TARGET;
984         }
985
986         return ns;
987 }
988
989 /* helper for __drbd_set_state */
990 static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
991 {
992         if (mdev->agreed_pro_version < 90)
993                 mdev->ov_start_sector = 0;
994         mdev->rs_total = drbd_bm_bits(mdev);
995         mdev->ov_position = 0;
996         if (cs == C_VERIFY_T) {
997                 /* starting online verify from an arbitrary position
998                  * does not fit well into the existing protocol.
999                  * on C_VERIFY_T, we initialize ov_left and friends
1000                  * implicitly in receive_DataRequest once the
1001                  * first P_OV_REQUEST is received */
1002                 mdev->ov_start_sector = ~(sector_t)0;
1003         } else {
1004                 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
1005                 if (bit >= mdev->rs_total) {
1006                         mdev->ov_start_sector =
1007                                 BM_BIT_TO_SECT(mdev->rs_total - 1);
1008                         mdev->rs_total = 1;
1009                 } else
1010                         mdev->rs_total -= bit;
1011                 mdev->ov_position = mdev->ov_start_sector;
1012         }
1013         mdev->ov_left = mdev->rs_total;
1014 }
1015
1016 static void drbd_resume_al(struct drbd_conf *mdev)
1017 {
1018         if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1019                 dev_info(DEV, "Resumed AL updates\n");
1020 }
1021
1022 /**
1023  * __drbd_set_state() - Set a new DRBD state
1024  * @mdev:       DRBD device.
1025  * @ns:         new state.
1026  * @flags:      Flags
1027  * @done:       Optional completion, that will get completed after the after_state_ch() finished
1028  *
1029  * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1030  */
1031 int __drbd_set_state(struct drbd_conf *mdev,
1032                     union drbd_state ns, enum chg_state_flags flags,
1033                     struct completion *done)
1034 {
1035         union drbd_state os;
1036         int rv = SS_SUCCESS;
1037         const char *warn_sync_abort = NULL;
1038         struct after_state_chg_work *ascw;
1039
1040         os = mdev->state;
1041
1042         ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
1043
1044         if (ns.i == os.i)
1045                 return SS_NOTHING_TO_DO;
1046
1047         if (!(flags & CS_HARD)) {
1048                 /*  pre-state-change checks ; only look at ns  */
1049                 /* See drbd_state_sw_errors in drbd_strings.c */
1050
1051                 rv = is_valid_state(mdev, ns);
1052                 if (rv < SS_SUCCESS) {
1053                         /* If the old state was illegal as well, then let
1054                            this happen...*/
1055
1056                         if (is_valid_state(mdev, os) == rv)
1057                                 rv = is_valid_state_transition(mdev, ns, os);
1058                 } else
1059                         rv = is_valid_state_transition(mdev, ns, os);
1060         }
1061
1062         if (rv < SS_SUCCESS) {
1063                 if (flags & CS_VERBOSE)
1064                         print_st_err(mdev, os, ns, rv);
1065                 return rv;
1066         }
1067
1068         if (warn_sync_abort)
1069                 dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
1070
1071         {
1072         char *pbp, pb[300];
1073         pbp = pb;
1074         *pbp = 0;
1075         if (ns.role != os.role)
1076                 pbp += sprintf(pbp, "role( %s -> %s ) ",
1077                                drbd_role_str(os.role),
1078                                drbd_role_str(ns.role));
1079         if (ns.peer != os.peer)
1080                 pbp += sprintf(pbp, "peer( %s -> %s ) ",
1081                                drbd_role_str(os.peer),
1082                                drbd_role_str(ns.peer));
1083         if (ns.conn != os.conn)
1084                 pbp += sprintf(pbp, "conn( %s -> %s ) ",
1085                                drbd_conn_str(os.conn),
1086                                drbd_conn_str(ns.conn));
1087         if (ns.disk != os.disk)
1088                 pbp += sprintf(pbp, "disk( %s -> %s ) ",
1089                                drbd_disk_str(os.disk),
1090                                drbd_disk_str(ns.disk));
1091         if (ns.pdsk != os.pdsk)
1092                 pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1093                                drbd_disk_str(os.pdsk),
1094                                drbd_disk_str(ns.pdsk));
1095         if (is_susp(ns) != is_susp(os))
1096                 pbp += sprintf(pbp, "susp( %d -> %d ) ",
1097                                is_susp(os),
1098                                is_susp(ns));
1099         if (ns.aftr_isp != os.aftr_isp)
1100                 pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1101                                os.aftr_isp,
1102                                ns.aftr_isp);
1103         if (ns.peer_isp != os.peer_isp)
1104                 pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1105                                os.peer_isp,
1106                                ns.peer_isp);
1107         if (ns.user_isp != os.user_isp)
1108                 pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1109                                os.user_isp,
1110                                ns.user_isp);
1111         dev_info(DEV, "%s\n", pb);
1112         }
1113
1114         /* solve the race between becoming unconfigured,
1115          * worker doing the cleanup, and
1116          * admin reconfiguring us:
1117          * on (re)configure, first set CONFIG_PENDING,
1118          * then wait for a potentially exiting worker,
1119          * start the worker, and schedule one no_op.
1120          * then proceed with configuration.
1121          */
1122         if (ns.disk == D_DISKLESS &&
1123             ns.conn == C_STANDALONE &&
1124             ns.role == R_SECONDARY &&
1125             !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1126                 set_bit(DEVICE_DYING, &mdev->flags);
1127
1128         /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1129          * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1130          * drbd_ldev_destroy() won't happen before our corresponding
1131          * after_state_ch works run, where we put_ldev again. */
1132         if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1133             (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1134                 atomic_inc(&mdev->local_cnt);
1135
1136         mdev->state = ns;
1137         wake_up(&mdev->misc_wait);
1138         wake_up(&mdev->state_wait);
1139
1140         /* aborted verify run. log the last position */
1141         if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1142             ns.conn < C_CONNECTED) {
1143                 mdev->ov_start_sector =
1144                         BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1145                 dev_info(DEV, "Online Verify reached sector %llu\n",
1146                         (unsigned long long)mdev->ov_start_sector);
1147         }
1148
1149         if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1150             (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
1151                 dev_info(DEV, "Syncer continues.\n");
1152                 mdev->rs_paused += (long)jiffies
1153                                   -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1154                 if (ns.conn == C_SYNC_TARGET)
1155                         mod_timer(&mdev->resync_timer, jiffies);
1156         }
1157
1158         if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
1159             (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1160                 dev_info(DEV, "Resync suspended\n");
1161                 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1162         }
1163
1164         if (os.conn == C_CONNECTED &&
1165             (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1166                 unsigned long now = jiffies;
1167                 int i;
1168
1169                 set_ov_position(mdev, ns.conn);
1170                 mdev->rs_start = now;
1171                 mdev->rs_last_events = 0;
1172                 mdev->rs_last_sect_ev = 0;
1173                 mdev->ov_last_oos_size = 0;
1174                 mdev->ov_last_oos_start = 0;
1175
1176                 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1177                         mdev->rs_mark_left[i] = mdev->ov_left;
1178                         mdev->rs_mark_time[i] = now;
1179                 }
1180
1181                 drbd_rs_controller_reset(mdev);
1182
1183                 if (ns.conn == C_VERIFY_S) {
1184                         dev_info(DEV, "Starting Online Verify from sector %llu\n",
1185                                         (unsigned long long)mdev->ov_position);
1186                         mod_timer(&mdev->resync_timer, jiffies);
1187                 }
1188         }
1189
1190         if (get_ldev(mdev)) {
1191                 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1192                                                  MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1193                                                  MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1194
1195                 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1196                         mdf |= MDF_CRASHED_PRIMARY;
1197                 if (mdev->state.role == R_PRIMARY ||
1198                     (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1199                         mdf |= MDF_PRIMARY_IND;
1200                 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1201                         mdf |= MDF_CONNECTED_IND;
1202                 if (mdev->state.disk > D_INCONSISTENT)
1203                         mdf |= MDF_CONSISTENT;
1204                 if (mdev->state.disk > D_OUTDATED)
1205                         mdf |= MDF_WAS_UP_TO_DATE;
1206                 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1207                         mdf |= MDF_PEER_OUT_DATED;
1208                 if (mdf != mdev->ldev->md.flags) {
1209                         mdev->ldev->md.flags = mdf;
1210                         drbd_md_mark_dirty(mdev);
1211                 }
1212                 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1213                         drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1214                 put_ldev(mdev);
1215         }
1216
1217         /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1218         if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1219             os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1220                 set_bit(CONSIDER_RESYNC, &mdev->flags);
1221
1222         /* Receiver should clean up itself */
1223         if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1224                 drbd_thread_stop_nowait(&mdev->receiver);
1225
1226         /* Now the receiver finished cleaning up itself, it should die */
1227         if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1228                 drbd_thread_stop_nowait(&mdev->receiver);
1229
1230         /* Upon network failure, we need to restart the receiver. */
1231         if (os.conn > C_TEAR_DOWN &&
1232             ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1233                 drbd_thread_restart_nowait(&mdev->receiver);
1234
1235         /* Resume AL writing if we get a connection */
1236         if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1237                 drbd_resume_al(mdev);
1238
1239         ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1240         if (ascw) {
1241                 ascw->os = os;
1242                 ascw->ns = ns;
1243                 ascw->flags = flags;
1244                 ascw->w.cb = w_after_state_ch;
1245                 ascw->done = done;
1246                 drbd_queue_work(&mdev->data.work, &ascw->w);
1247         } else {
1248                 dev_warn(DEV, "Could not kmalloc an ascw\n");
1249         }
1250
1251         return rv;
1252 }
1253
1254 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1255 {
1256         struct after_state_chg_work *ascw =
1257                 container_of(w, struct after_state_chg_work, w);
1258         after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1259         if (ascw->flags & CS_WAIT_COMPLETE) {
1260                 D_ASSERT(ascw->done != NULL);
1261                 complete(ascw->done);
1262         }
1263         kfree(ascw);
1264
1265         return 1;
1266 }
1267
1268 static void abw_start_sync(struct drbd_conf *mdev, int rv)
1269 {
1270         if (rv) {
1271                 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1272                 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1273                 return;
1274         }
1275
1276         switch (mdev->state.conn) {
1277         case C_STARTING_SYNC_T:
1278                 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1279                 break;
1280         case C_STARTING_SYNC_S:
1281                 drbd_start_resync(mdev, C_SYNC_SOURCE);
1282                 break;
1283         }
1284 }
1285
1286 /**
1287  * after_state_ch() - Perform after state change actions that may sleep
1288  * @mdev:       DRBD device.
1289  * @os:         old state, as recorded by __drbd_set_state().
1290  * @ns:         new state, as recorded by __drbd_set_state().
1291  * @flags:      Flags the state change was requested with (not referenced in the body).
      *
      * Called from w_after_state_ch(), i.e. from the work item that
      * __drbd_set_state() queues on mdev->data.work.  Compares @os and @ns and
      * performs the follow-up actions for each kind of transition: informing
      * userspace and the peer, calling _tl_restart() for suspended IO, queueing
      * bitmap I/O, finishing a detach, starting/stopping threads.  The extra
      * ldev reference taken in __drbd_set_state() for transitions to D_FAILED
      * or D_DISKLESS is dropped here with put_ldev().  Ends with drbd_md_sync().
      *
      * Note that @os/@ns are a snapshot; mdev->state may have moved on by the
      * time this runs, which is why some branches look at mdev->state instead.
1292  */
1293 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1294                            union drbd_state ns, enum chg_state_flags flags)
1295 {
1296         enum drbd_fencing_p fp;
1297         enum drbd_req_event what = nothing;
             /* nsm is used as an AND-mask: it starts with all bits set, the
              * suspend bits that shall be cleared are zeroed below, and it is
              * then ANDed with the current mdev->state before being applied. */
1298         union drbd_state nsm = (union drbd_state){ .i = -1 };
1299
1300         if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1301                 clear_bit(CRASHED_PRIMARY, &mdev->flags);
                     /* clears the bit with value 2 in the peer's UUID flags word;
                      * NOTE(review): presumably the peer's "crashed primary"
                      * indication -- confirm against the UI_FLAGS definition */
1302                 if (mdev->p_uuid)
1303                         mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1304         }
1305
             /* NOTE(review): fp is assigned here but never read again in this
              * function. */
1306         fp = FP_DONT_CARE;
1307         if (get_ldev(mdev)) {
1308                 fp = mdev->ldev->dc.fencing;
1309                 put_ldev(mdev);
1310         }
1311
1312         /* Inform userspace about the change... */
1313         drbd_bcast_state(mdev, ns);
1314
             /* Run the helper only when entering "primary without up-to-date
              * data on either side", not while remaining in that condition. */
1315         if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1316             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1317                 drbd_khelper(mdev, "pri-on-incon-degr");
1318
1319         /* Here we have the actions that are performed after a
1320            state change. This function might sleep */
1321
             /* (same value nsm was already initialized with above) */
1322         nsm.i = -1;
1323         if (ns.susp_nod) {
1324                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1325                         if (ns.conn == C_CONNECTED)
1326                                 what = resend, nsm.susp_nod = 0;
1327                         else /* ns.conn > C_CONNECTED */
                                     /* NOTE(review): "Resynd" in the message
                                      * looks like a typo for "Resync" */
1328                                 dev_err(DEV, "Unexpected Resynd going on!\n");
1329                 }
1330
1331                 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1332                         what = restart_frozen_disk_io, nsm.susp_nod = 0;
1333
1334         }
1335
1336         if (ns.susp_fen) {
1337                 /* case1: The outdate peer handler is successful: */
1338                 if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
1339                         tl_clear(mdev);
1340                         if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1341                                 drbd_uuid_new_current(mdev);
1342                                 clear_bit(NEW_CUR_UUID, &mdev->flags);
1343                         }
1344                         spin_lock_irq(&mdev->req_lock);
1345                         _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1346                         spin_unlock_irq(&mdev->req_lock);
1347                 }
1348                 /* case2: The connection was established again: */
1349                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1350                         clear_bit(NEW_CUR_UUID, &mdev->flags);
1351                         what = resend;
1352                         nsm.susp_fen = 0;
1353                 }
1354         }
1355
             /* One of the branches above asked for the transfer log to be
              * restarted: do that and clear the selected suspend bits, both
              * under req_lock. */
1356         if (what != nothing) {
1357                 spin_lock_irq(&mdev->req_lock);
1358                 _tl_restart(mdev, what);
1359                 nsm.i &= mdev->state.i;
1360                 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1361                 spin_unlock_irq(&mdev->req_lock);
1362         }
1363
1364         /* Do not change the order of the if above and the two below... */
1365         if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
1366                 drbd_send_uuids(mdev);
1367                 drbd_send_state(mdev);
1368         }
1369         if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1370                 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1371
1372         /* Lost contact to peer's copy of the data */
1373         if ((os.pdsk >= D_INCONSISTENT &&
1374              os.pdsk != D_UNKNOWN &&
1375              os.pdsk != D_OUTDATED)
1376         &&  (ns.pdsk < D_INCONSISTENT ||
1377              ns.pdsk == D_UNKNOWN ||
1378              ns.pdsk == D_OUTDATED)) {
1379                 if (get_ldev(mdev)) {
1380                         if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1381                             mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
                                     /* while IO is suspended only remember that a
                                      * new current UUID is wanted (NEW_CUR_UUID);
                                      * otherwise create and send it right away */
1382                                 if (is_susp(mdev->state)) {
1383                                         set_bit(NEW_CUR_UUID, &mdev->flags);
1384                                 } else {
1385                                         drbd_uuid_new_current(mdev);
1386                                         drbd_send_uuids(mdev);
1387                                 }
1388                         }
1389                         put_ldev(mdev);
1390                 }
1391         }
1392
             /* Peer has no usable disk (pdsk below D_INCONSISTENT) while we do
              * have a local one */
1393         if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1394                 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1395                         drbd_uuid_new_current(mdev);
1396                         drbd_send_uuids(mdev);
1397                 }
1398
1399                 /* D_DISKLESS Peer becomes secondary */
1400                 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1401                         drbd_al_to_on_disk_bm(mdev);
1402                 put_ldev(mdev);
1403         }
1404
1405         /* Last part of the attaching process ... */
1406         if (ns.conn >= C_CONNECTED &&
1407             os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1408                 drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
1409                 drbd_send_uuids(mdev);
1410                 drbd_send_state(mdev);
1411         }
1412
1413         /* We want to pause/continue resync, tell peer. */
1414         if (ns.conn >= C_CONNECTED &&
1415              ((os.aftr_isp != ns.aftr_isp) ||
1416               (os.user_isp != ns.user_isp)))
1417                 drbd_send_state(mdev);
1418
1419         /* In case one of the isp bits got set, suspend other devices. */
1420         if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1421             (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1422                 suspend_other_sg(mdev);
1423
1424         /* Make sure the peer gets informed about eventual state
1425            changes (ISP bits) while we were in WFReportParams. */
1426         if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1427                 drbd_send_state(mdev);
1428
             /* Tell the peer when we enter C_AHEAD */
1429         if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1430                 drbd_send_state(mdev);
1431
1432         /* We are in the process of starting a full sync... */
1433         if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1434             (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1435                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1436
1437         /* We are invalidating ourselves... */
1438         if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1439             os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1440                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1441
1442         /* first half of local IO error, failure to attach,
1443          * or administrative detach */
1444         if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1445                 enum drbd_io_error_p eh;
1446                 int was_io_error;
1447                 /* corresponding get_ldev was in __drbd_set_state, to serialize
1448                  * our cleanup here with the transition to D_DISKLESS,
1449                  * so it is safe to dereference ldev here. */
1450                 eh = mdev->ldev->dc.on_io_error;
1451                 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1452
1453                 /* current state still has to be D_FAILED,
1454                  * there is only one way out: to D_DISKLESS,
1455                  * and that may only happen after our put_ldev below. */
1456                 if (mdev->state.disk != D_FAILED)
1457                         dev_err(DEV,
1458                                 "ASSERT FAILED: disk is %s during detach\n",
1459                                 drbd_disk_str(mdev->state.disk));
1460
1461                 if (drbd_send_state(mdev))
1462                         dev_warn(DEV, "Notified peer that I am detaching my disk\n");
1463                 else
1464                         dev_err(DEV, "Sending state for detaching disk failed\n");
1465
1466                 drbd_rs_cancel_all(mdev);
1467
1468                 /* In case we want to get something to stable storage still,
1469                  * this may be the last chance.
1470                  * Following put_ldev may transition to D_DISKLESS. */
1471                 drbd_md_sync(mdev);
1472                 put_ldev(mdev);
1473
                     /* eh and was_io_error were sampled above, before the
                      * put_ldev(), so ldev is not touched here anymore */
1474                 if (was_io_error && eh == EP_CALL_HELPER)
1475                         drbd_khelper(mdev, "local-io-error");
1476         }
1477
1478         /* second half of local IO error, failure to attach,
1479          * or administrative detach,
1480          * after local_cnt references have reached zero again */
1481         if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1482                 /* We must still be diskless,
1483                  * re-attach has to be serialized with this! */
1484                 if (mdev->state.disk != D_DISKLESS)
1485                         dev_err(DEV,
1486                                 "ASSERT FAILED: disk is %s while going diskless\n",
1487                                 drbd_disk_str(mdev->state.disk));
1488
                     /* reset the resync bookkeeping */
1489                 mdev->rs_total = 0;
1490                 mdev->rs_failed = 0;
1491                 atomic_set(&mdev->rs_pending_cnt, 0);
1492
1493                 if (drbd_send_state(mdev))
1494                         dev_warn(DEV, "Notified peer that I'm now diskless.\n");
1495                 else
1496                         dev_err(DEV, "Sending state for being diskless failed\n");
1497                 /* corresponding get_ldev in __drbd_set_state
1498                  * this may finally trigger drbd_ldev_destroy. */
1499                 put_ldev(mdev);
1500         }
1501
1502         /* Disks got bigger while they were detached */
             /* (the RESYNC_AFTER_NEG flag is consumed even if we are not
              * C_CONNECTED and therefore do not start the resync) */
1503         if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1504             test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1505                 if (ns.conn == C_CONNECTED)
1506                         resync_after_online_grow(mdev);
1507         }
1508
1509         /* A resync finished or aborted, wake paused devices... */
1510         if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1511             (os.peer_isp && !ns.peer_isp) ||
1512             (os.user_isp && !ns.user_isp))
1513                 resume_next_sg(mdev);
1514
1515         /* sync target done with resync.  Explicitly notify peer, even though
1516          * it should (at least for non-empty resyncs) already know itself. */
1517         if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1518                 drbd_send_state(mdev);
1519
1520         /* free tl_hash if we got thawed and are C_STANDALONE */
1521         if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1522                 drbd_free_tl_hash(mdev);
1523
1524         /* Upon network connection, we need to start the receiver */
1525         if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1526                 drbd_thread_start(&mdev->receiver);
1527
1528         /* Terminate worker thread if we are unconfigured - it will be
1529            restarted as needed... */
1530         if (ns.disk == D_DISKLESS &&
1531             ns.conn == C_STANDALONE &&
1532             ns.role == R_SECONDARY) {
1533                 if (os.aftr_isp != ns.aftr_isp)
1534                         resume_next_sg(mdev);
1535                 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1536                 if (test_bit(DEVICE_DYING, &mdev->flags))
1537                         drbd_thread_stop_nowait(&mdev->worker);
1538         }
1539
             /* Finally sync the meta data to disk */
1540         drbd_md_sync(mdev);
1541 }
1542
1543
1544 static int drbd_thread_setup(void *arg)
1545 {
1546         struct drbd_thread *thi = (struct drbd_thread *) arg;
1547         struct drbd_conf *mdev = thi->mdev;
1548         unsigned long flags;
1549         int retval;
1550
1551 restart:
1552         retval = thi->function(thi);
1553
1554         spin_lock_irqsave(&thi->t_lock, flags);
1555
1556         /* if the receiver has been "Exiting", the last thing it did
1557          * was set the conn state to "StandAlone",
1558          * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1559          * and receiver thread will be "started".
1560          * drbd_thread_start needs to set "Restarting" in that case.
1561          * t_state check and assignment needs to be within the same spinlock,
1562          * so either thread_start sees Exiting, and can remap to Restarting,
1563          * or thread_start see None, and can proceed as normal.
1564          */
1565
1566         if (thi->t_state == Restarting) {
1567                 dev_info(DEV, "Restarting %s\n", current->comm);
1568                 thi->t_state = Running;
1569                 spin_unlock_irqrestore(&thi->t_lock, flags);
1570                 goto restart;
1571         }
1572
1573         thi->task = NULL;
1574         thi->t_state = None;
1575         smp_mb();
1576         complete(&thi->stop);
1577         spin_unlock_irqrestore(&thi->t_lock, flags);
1578
1579         dev_info(DEV, "Terminating %s\n", current->comm);
1580
1581         /* Release mod reference taken when thread was started */
1582         module_put(THIS_MODULE);
1583         return retval;
1584 }
1585
1586 static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1587                       int (*func) (struct drbd_thread *))
1588 {
1589         spin_lock_init(&thi->t_lock);
1590         thi->task    = NULL;
1591         thi->t_state = None;
1592         thi->function = func;
1593         thi->mdev = mdev;
1594 }
1595
1596 int drbd_thread_start(struct drbd_thread *thi)
1597 {
1598         struct drbd_conf *mdev = thi->mdev;
1599         struct task_struct *nt;
1600         unsigned long flags;
1601
1602         const char *me =
1603                 thi == &mdev->receiver ? "receiver" :
1604                 thi == &mdev->asender  ? "asender"  :
1605                 thi == &mdev->worker   ? "worker"   : "NONSENSE";
1606
1607         /* is used from state engine doing drbd_thread_stop_nowait,
1608          * while holding the req lock irqsave */
1609         spin_lock_irqsave(&thi->t_lock, flags);
1610
1611         switch (thi->t_state) {
1612         case None:
1613                 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1614                                 me, current->comm, current->pid);
1615
1616                 /* Get ref on module for thread - this is released when thread exits */
1617                 if (!try_module_get(THIS_MODULE)) {
1618                         dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1619                         spin_unlock_irqrestore(&thi->t_lock, flags);
1620                         return FALSE;
1621                 }
1622
1623                 init_completion(&thi->stop);
1624                 D_ASSERT(thi->task == NULL);
1625                 thi->reset_cpu_mask = 1;
1626                 thi->t_state = Running;
1627                 spin_unlock_irqrestore(&thi->t_lock, flags);
1628                 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1629
1630                 nt = kthread_create(drbd_thread_setup, (void *) thi,
1631                                     "drbd%d_%s", mdev_to_minor(mdev), me);
1632
1633                 if (IS_ERR(nt)) {
1634                         dev_err(DEV, "Couldn't start thread\n");
1635
1636                         module_put(THIS_MODULE);
1637                         return FALSE;
1638                 }
1639                 spin_lock_irqsave(&thi->t_lock, flags);
1640                 thi->task = nt;
1641                 thi->t_state = Running;
1642                 spin_unlock_irqrestore(&thi->t_lock, flags);
1643                 wake_up_process(nt);
1644                 break;
1645         case Exiting:
1646                 thi->t_state = Restarting;
1647                 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1648                                 me, current->comm, current->pid);
1649                 /* fall through */
1650         case Running:
1651         case Restarting:
1652         default:
1653                 spin_unlock_irqrestore(&thi->t_lock, flags);
1654                 break;
1655         }
1656
1657         return TRUE;
1658 }
1659
1660
1661 void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1662 {
1663         unsigned long flags;
1664
1665         enum drbd_thread_state ns = restart ? Restarting : Exiting;
1666
1667         /* may be called from state engine, holding the req lock irqsave */
1668         spin_lock_irqsave(&thi->t_lock, flags);
1669
1670         if (thi->t_state == None) {
1671                 spin_unlock_irqrestore(&thi->t_lock, flags);
1672                 if (restart)
1673                         drbd_thread_start(thi);
1674                 return;
1675         }
1676
1677         if (thi->t_state != ns) {
1678                 if (thi->task == NULL) {
1679                         spin_unlock_irqrestore(&thi->t_lock, flags);
1680                         return;
1681                 }
1682
1683                 thi->t_state = ns;
1684                 smp_mb();
1685                 init_completion(&thi->stop);
1686                 if (thi->task != current)
1687                         force_sig(DRBD_SIGKILL, thi->task);
1688
1689         }
1690
1691         spin_unlock_irqrestore(&thi->t_lock, flags);
1692
1693         if (wait)
1694                 wait_for_completion(&thi->stop);
1695 }
1696
1697 #ifdef CONFIG_SMP
1698 /**
1699  * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1700  * @mdev:       DRBD device.
1701  *
1702  * Forces all threads of a device onto the same CPU. This is beneficial for
1703  * DRBD's performance. May be overwritten by user's configuration.
1704  */
1705 void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1706 {
1707         int ord, cpu;
1708
1709         /* user override. */
1710         if (cpumask_weight(mdev->cpu_mask))
1711                 return;
1712
1713         ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1714         for_each_online_cpu(cpu) {
1715                 if (ord-- == 0) {
1716                         cpumask_set_cpu(cpu, mdev->cpu_mask);
1717                         return;
1718                 }
1719         }
1720         /* should not be reached */
1721         cpumask_setall(mdev->cpu_mask);
1722 }
1723
1724 /**
1725  * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1726  * @mdev:       DRBD device.
1727  *
1728  * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1729  * prematurely.
1730  */
1731 void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1732 {
1733         struct task_struct *p = current;
1734         struct drbd_thread *thi =
1735                 p == mdev->asender.task  ? &mdev->asender  :
1736                 p == mdev->receiver.task ? &mdev->receiver :
1737                 p == mdev->worker.task   ? &mdev->worker   :
1738                 NULL;
1739         ERR_IF(thi == NULL)
1740                 return;
1741         if (!thi->reset_cpu_mask)
1742                 return;
1743         thi->reset_cpu_mask = 0;
1744         set_cpus_allowed_ptr(p, mdev->cpu_mask);
1745 }
1746 #endif
1747
1748 /* the appropriate socket mutex must be held already */
1749 int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1750                           enum drbd_packets cmd, struct p_header80 *h,
1751                           size_t size, unsigned msg_flags)
1752 {
1753         int sent, ok;
1754
1755         ERR_IF(!h) return FALSE;
1756         ERR_IF(!size) return FALSE;
1757
1758         h->magic   = BE_DRBD_MAGIC;
1759         h->command = cpu_to_be16(cmd);
1760         h->length  = cpu_to_be16(size-sizeof(struct p_header80));
1761
1762         sent = drbd_send(mdev, sock, h, size, msg_flags);
1763
1764         ok = (sent == size);
1765         if (!ok)
1766                 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1767                     cmdname(cmd), (int)size, sent);
1768         return ok;
1769 }
1770
1771 /* don't pass the socket. we may only look at it
1772  * when we hold the appropriate socket mutex.
1773  */
1774 int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1775                   enum drbd_packets cmd, struct p_header80 *h, size_t size)
1776 {
1777         int ok = 0;
1778         struct socket *sock;
1779
1780         if (use_data_socket) {
1781                 mutex_lock(&mdev->data.mutex);
1782                 sock = mdev->data.socket;
1783         } else {
1784                 mutex_lock(&mdev->meta.mutex);
1785                 sock = mdev->meta.socket;
1786         }
1787
1788         /* drbd_disconnect() could have called drbd_free_sock()
1789          * while we were waiting in down()... */
1790         if (likely(sock != NULL))
1791                 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1792
1793         if (use_data_socket)
1794                 mutex_unlock(&mdev->data.mutex);
1795         else
1796                 mutex_unlock(&mdev->meta.mutex);
1797         return ok;
1798 }
1799
1800 int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1801                    size_t size)
1802 {
1803         struct p_header80 h;
1804         int ok;
1805
1806         h.magic   = BE_DRBD_MAGIC;
1807         h.command = cpu_to_be16(cmd);
1808         h.length  = cpu_to_be16(size);
1809
1810         if (!drbd_get_data_sock(mdev))
1811                 return 0;
1812
1813         ok = (sizeof(h) ==
1814                 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1815         ok = ok && (size ==
1816                 drbd_send(mdev, mdev->data.socket, data, size, 0));
1817
1818         drbd_put_data_sock(mdev);
1819
1820         return ok;
1821 }
1822
1823 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1824 {
1825         struct p_rs_param_95 *p;
1826         struct socket *sock;
1827         int size, rv;
1828         const int apv = mdev->agreed_pro_version;
1829
1830         size = apv <= 87 ? sizeof(struct p_rs_param)
1831                 : apv == 88 ? sizeof(struct p_rs_param)
1832                         + strlen(mdev->sync_conf.verify_alg) + 1
1833                 : apv <= 94 ? sizeof(struct p_rs_param_89)
1834                 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
1835
1836         /* used from admin command context and receiver/worker context.
1837          * to avoid kmalloc, grab the socket right here,
1838          * then use the pre-allocated sbuf there */
1839         mutex_lock(&mdev->data.mutex);
1840         sock = mdev->data.socket;
1841
1842         if (likely(sock != NULL)) {
1843                 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1844
1845                 p = &mdev->data.sbuf.rs_param_95;
1846
1847                 /* initialize verify_alg and csums_alg */
1848                 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1849
1850                 p->rate = cpu_to_be32(sc->rate);
1851                 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1852                 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1853                 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1854                 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
1855
1856                 if (apv >= 88)
1857                         strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1858                 if (apv >= 89)
1859                         strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1860
1861                 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1862         } else
1863                 rv = 0; /* not ok */
1864
1865         mutex_unlock(&mdev->data.mutex);
1866
1867         return rv;
1868 }
1869
1870 int drbd_send_protocol(struct drbd_conf *mdev)
1871 {
1872         struct p_protocol *p;
1873         int size, cf, rv;
1874
1875         size = sizeof(struct p_protocol);
1876
1877         if (mdev->agreed_pro_version >= 87)
1878                 size += strlen(mdev->net_conf->integrity_alg) + 1;
1879
1880         /* we must not recurse into our own queue,
1881          * as that is blocked during handshake */
1882         p = kmalloc(size, GFP_NOIO);
1883         if (p == NULL)
1884                 return 0;
1885
1886         p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
1887         p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
1888         p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
1889         p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
1890         p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1891
1892         cf = 0;
1893         if (mdev->net_conf->want_lose)
1894                 cf |= CF_WANT_LOSE;
1895         if (mdev->net_conf->dry_run) {
1896                 if (mdev->agreed_pro_version >= 92)
1897                         cf |= CF_DRY_RUN;
1898                 else {
1899                         dev_err(DEV, "--dry-run is not supported by peer");
1900                         kfree(p);
1901                         return 0;
1902                 }
1903         }
1904         p->conn_flags    = cpu_to_be32(cf);
1905
1906         if (mdev->agreed_pro_version >= 87)
1907                 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1908
1909         rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1910                            (struct p_header80 *)p, size);
1911         kfree(p);
1912         return rv;
1913 }
1914
1915 int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1916 {
1917         struct p_uuids p;
1918         int i;
1919
1920         if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1921                 return 1;
1922
1923         for (i = UI_CURRENT; i < UI_SIZE; i++)
1924                 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1925
1926         mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1927         p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1928         uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1929         uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1930         uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1931         p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1932
1933         put_ldev(mdev);
1934
1935         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1936                              (struct p_header80 *)&p, sizeof(p));
1937 }
1938
1939 int drbd_send_uuids(struct drbd_conf *mdev)
1940 {
1941         return _drbd_send_uuids(mdev, 0);
1942 }
1943
1944 int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1945 {
1946         return _drbd_send_uuids(mdev, 8);
1947 }
1948
1949
1950 int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1951 {
1952         struct p_rs_uuid p;
1953
1954         p.uuid = cpu_to_be64(val);
1955
1956         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1957                              (struct p_header80 *)&p, sizeof(p));
1958 }
1959
1960 int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
1961 {
1962         struct p_sizes p;
1963         sector_t d_size, u_size;
1964         int q_order_type;
1965         int ok;
1966
1967         if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1968                 D_ASSERT(mdev->ldev->backing_bdev);
1969                 d_size = drbd_get_max_capacity(mdev->ldev);
1970                 u_size = mdev->ldev->dc.disk_size;
1971                 q_order_type = drbd_queue_order_type(mdev);
1972                 put_ldev(mdev);
1973         } else {
1974                 d_size = 0;
1975                 u_size = 0;
1976                 q_order_type = QUEUE_ORDERED_NONE;
1977         }
1978
1979         p.d_size = cpu_to_be64(d_size);
1980         p.u_size = cpu_to_be64(u_size);
1981         p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1982         p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9);
1983         p.queue_order_type = cpu_to_be16(q_order_type);
1984         p.dds_flags = cpu_to_be16(flags);
1985
1986         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1987                            (struct p_header80 *)&p, sizeof(p));
1988         return ok;
1989 }
1990
1991 /**
1992  * drbd_send_state() - Sends the drbd state to the peer
1993  * @mdev:       DRBD device.
1994  */
1995 int drbd_send_state(struct drbd_conf *mdev)
1996 {
1997         struct socket *sock;
1998         struct p_state p;
1999         int ok = 0;
2000
2001         /* Grab state lock so we wont send state if we're in the middle
2002          * of a cluster wide state change on another thread */
2003         drbd_state_lock(mdev);
2004
2005         mutex_lock(&mdev->data.mutex);
2006
2007         p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2008         sock = mdev->data.socket;
2009
2010         if (likely(sock != NULL)) {
2011                 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2012                                     (struct p_header80 *)&p, sizeof(p), 0);
2013         }
2014
2015         mutex_unlock(&mdev->data.mutex);
2016
2017         drbd_state_unlock(mdev);
2018         return ok;
2019 }
2020
2021 int drbd_send_state_req(struct drbd_conf *mdev,
2022         union drbd_state mask, union drbd_state val)
2023 {
2024         struct p_req_state p;
2025
2026         p.mask    = cpu_to_be32(mask.i);
2027         p.val     = cpu_to_be32(val.i);
2028
2029         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
2030                              (struct p_header80 *)&p, sizeof(p));
2031 }
2032
2033 int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
2034 {
2035         struct p_req_state_reply p;
2036
2037         p.retcode    = cpu_to_be32(retcode);
2038
2039         return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
2040                              (struct p_header80 *)&p, sizeof(p));
2041 }
2042
/*
 * Encode as much of the bitmap as fits into one packet, starting at
 * c->bit_offset, as a sequence of run lengths written with
 * vli_encode_bits() into p->code.
 *
 * Returns the number of bytes used in p->code when the encoding was worth
 * it; in that case c->bit_offset and c->word_offset have been advanced.
 * Returns 0 when the feature may not be used, when there is nothing left to
 * encode, when the encoding would not be smaller than the plain bits
 * (c->bit_offset is rewound), or when vli_encode_bits() reports an error
 * other than -ENOBUFS.
 * Returns -1 when a zero run length is found after the first iteration.
 */
int fill_bitmap_rle_bits(struct drbd_conf *mdev,
        struct p_compressed_bm *p,
        struct bm_xfer_ctx *c)
{
        struct bitstream bs;
        unsigned long plain_bits;
        unsigned long tmp;
        unsigned long rl;
        unsigned len;
        unsigned toggle;
        int bits;

        /* may we use this feature? */
        if ((mdev->sync_conf.use_rle == 0) ||
                (mdev->agreed_pro_version < 90))
                        return 0;

        if (c->bit_offset >= c->bm_bits)
                return 0; /* nothing to do. */

        /* use at most this many bytes */
        bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
        memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
        /* plain bits covered in this code string */
        plain_bits = 0;

        /* p->encoding & 0x80 stores whether the first run length is set.
         * bit offset is implicit.
         * start with toggle == 2 to be able to tell the first iteration */
        toggle = 2;

        /* see how many plain bits we can stuff into one packet
         * using RLE and VLI. */
        do {
                /* toggle == 0: we are inside a run of set bits, look for the
                 * next zero bit; otherwise look for the next set bit */
                tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
                                    : _drbd_bm_find_next(mdev, c->bit_offset);
                /* nothing found: the run extends to the end of the bitmap */
                if (tmp == -1UL)
                        tmp = c->bm_bits;
                rl = tmp - c->bit_offset;

                if (toggle == 2) { /* first iteration */
                        if (rl == 0) {
                                /* the first checked bit was set,
                                 * store start value, */
                                DCBP_set_start(p, 1);
                                /* but skip encoding of zero run length */
                                toggle = !toggle; /* 2 -> 0 */
                                continue;
                        }
                        DCBP_set_start(p, 0);
                }

                /* paranoia: catch zero runlength.
                 * can only happen if bitmap is modified while we scan it. */
                if (rl == 0) {
                        dev_err(DEV, "unexpected zero runlength while encoding bitmap "
                            "t:%u bo:%lu\n", toggle, c->bit_offset);
                        return -1;
                }

                bits = vli_encode_bits(&bs, rl);
                if (bits == -ENOBUFS) /* buffer full */
                        break;
                if (bits <= 0) {
                        dev_err(DEV, "error while encoding bitmap: %d\n", bits);
                        return 0;
                }

                /* run encoded: switch run type and account for it */
                toggle = !toggle;
                plain_bits += rl;
                c->bit_offset = tmp;
        } while (c->bit_offset < c->bm_bits);

        /* bytes used so far; a partially filled last byte counts as one */
        len = bs.cur.b - p->code + !!bs.cur.bit;

        if (plain_bits < (len << 3)) {
                /* incompressible with this method.
                 * we need to rewind both word and bit position. */
                c->bit_offset -= plain_bits;
                bm_xfer_ctx_bit_to_word_offset(c);
                c->bit_offset = c->word_offset * BITS_PER_LONG;
                return 0;
        }

        /* RLE + VLI was able to compress it just fine.
         * update c->word_offset. */
        bm_xfer_ctx_bit_to_word_offset(c);

        /* store pad_bits */
        DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);

        return len;
}
2136
2137 enum { OK, FAILED, DONE }
2138 send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2139         struct p_header80 *h, struct bm_xfer_ctx *c)
2140 {
2141         struct p_compressed_bm *p = (void*)h;
2142         unsigned long num_words;
2143         int len;
2144         int ok;
2145
2146         len = fill_bitmap_rle_bits(mdev, p, c);
2147
2148         if (len < 0)
2149                 return FAILED;
2150
2151         if (len) {
2152                 DCBP_set_code(p, RLE_VLI_Bits);
2153                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2154                         sizeof(*p) + len, 0);
2155
2156                 c->packets[0]++;
2157                 c->bytes[0] += sizeof(*p) + len;
2158
2159                 if (c->bit_offset >= c->bm_bits)
2160                         len = 0; /* DONE */
2161         } else {
2162                 /* was not compressible.
2163                  * send a buffer full of plain text bits instead. */
2164                 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2165                 len = num_words * sizeof(long);
2166                 if (len)
2167                         drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2168                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
2169                                    h, sizeof(struct p_header80) + len, 0);
2170                 c->word_offset += num_words;
2171                 c->bit_offset = c->word_offset * BITS_PER_LONG;
2172
2173                 c->packets[1]++;
2174                 c->bytes[1] += sizeof(struct p_header80) + len;
2175
2176                 if (c->bit_offset > c->bm_bits)
2177                         c->bit_offset = c->bm_bits;
2178         }
2179         ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
2180
2181         if (ok == DONE)
2182                 INFO_bm_xfer_stats(mdev, "send", c);
2183         return ok;
2184 }
2185
2186 /* See the comment at receive_bitmap() */
2187 int _drbd_send_bitmap(struct drbd_conf *mdev)
2188 {
2189         struct bm_xfer_ctx c;
2190         struct p_header80 *p;
2191         int ret;
2192
2193         ERR_IF(!mdev->bitmap) return FALSE;
2194
2195         /* maybe we should use some per thread scratch page,
2196          * and allocate that during initial device creation? */
2197         p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2198         if (!p) {
2199                 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2200                 return FALSE;
2201         }
2202
2203         if (get_ldev(mdev)) {
2204                 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2205                         dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2206                         drbd_bm_set_all(mdev);
2207                         if (drbd_bm_write(mdev)) {
2208                                 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2209                                  * but otherwise process as per normal - need to tell other
2210                                  * side that a full resync is required! */
2211                                 dev_err(DEV, "Failed to write bitmap to disk!\n");
2212                         } else {
2213                                 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2214                                 drbd_md_sync(mdev);
2215                         }
2216                 }
2217                 put_ldev(mdev);
2218         }
2219
2220         c = (struct bm_xfer_ctx) {
2221                 .bm_bits = drbd_bm_bits(mdev),
2222                 .bm_words = drbd_bm_words(mdev),
2223         };
2224
2225         do {
2226                 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2227         } while (ret == OK);
2228
2229         free_page((unsigned long) p);
2230         return (ret == DONE);
2231 }
2232
/*
 * Send the bitmap while holding the data socket.
 * Returns 0 on success, 1 when _drbd_send_bitmap() failed and -1 when
 * drbd_get_data_sock() failed.
 */
int drbd_send_bitmap(struct drbd_conf *mdev)
{
        int sent;

        if (!drbd_get_data_sock(mdev))
                return -1;

        sent = _drbd_send_bitmap(mdev);
        drbd_put_data_sock(mdev);

        return !sent;
}
2243
2244 int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2245 {
2246         int ok;
2247         struct p_barrier_ack p;
2248
2249         p.barrier  = barrier_nr;
2250         p.set_size = cpu_to_be32(set_size);
2251
2252         if (mdev->state.conn < C_CONNECTED)
2253                 return FALSE;
2254         ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2255                         (struct p_header80 *)&p, sizeof(p));
2256         return ok;
2257 }
2258
2259 /**
2260  * _drbd_send_ack() - Sends an ack packet
2261  * @mdev:       DRBD device.
2262  * @cmd:        Packet command code.
2263  * @sector:     sector, needs to be in big endian byte order
2264  * @blksize:    size in byte, needs to be in big endian byte order
2265  * @block_id:   Id, big endian byte order
2266  */
2267 static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2268                           u64 sector,
2269                           u32 blksize,
2270                           u64 block_id)
2271 {
2272         int ok;
2273         struct p_block_ack p;
2274
2275         p.sector   = sector;
2276         p.block_id = block_id;
2277         p.blksize  = blksize;
2278         p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2279
2280         if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2281                 return FALSE;
2282         ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2283                                 (struct p_header80 *)&p, sizeof(p));
2284         return ok;
2285 }
2286
2287 /* dp->sector and dp->block_id already/still in network byte order,
2288  * data_size is payload size according to dp->head,
2289  * and may need to be corrected for digest size. */
2290 int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2291                      struct p_data *dp, int data_size)
2292 {
2293         data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2294                 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2295         return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2296                               dp->block_id);
2297 }
2298
2299 int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2300                      struct p_block_req *rp)
2301 {
2302         return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2303 }
2304
2305 /**
2306  * drbd_send_ack() - Sends an ack packet
2307  * @mdev:       DRBD device.
2308  * @cmd:        Packet command code.
2309  * @e:          Epoch entry.
2310  */
2311 int drbd_send_ack(struct drbd_conf *mdev,
2312         enum drbd_packets cmd, struct drbd_epoch_entry *e)
2313 {
2314         return _drbd_send_ack(mdev, cmd,
2315                               cpu_to_be64(e->sector),
2316                               cpu_to_be32(e->size),
2317                               e->block_id);
2318 }
2319
2320 /* This function misuses the block_id field to signal if the blocks
2321  * are is sync or not. */
2322 int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2323                      sector_t sector, int blksize, u64 block_id)
2324 {
2325         return _drbd_send_ack(mdev, cmd,
2326                               cpu_to_be64(sector),
2327                               cpu_to_be32(blksize),
2328                               cpu_to_be64(block_id));
2329 }
2330
2331 int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2332                        sector_t sector, int size, u64 block_id)
2333 {
2334         int ok;
2335         struct p_block_req p;
2336
2337         p.sector   = cpu_to_be64(sector);
2338         p.block_id = block_id;
2339         p.blksize  = cpu_to_be32(size);
2340
2341         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2342                                 (struct p_header80 *)&p, sizeof(p));
2343         return ok;
2344 }
2345
2346 int drbd_send_drequest_csum(struct drbd_conf *mdev,
2347                             sector_t sector, int size,
2348                             void *digest, int digest_size,
2349                             enum drbd_packets cmd)
2350 {
2351         int ok;
2352         struct p_block_req p;
2353
2354         p.sector   = cpu_to_be64(sector);
2355         p.block_id = BE_DRBD_MAGIC + 0xbeef;
2356         p.blksize  = cpu_to_be32(size);
2357
2358         p.head.magic   = BE_DRBD_MAGIC;
2359         p.head.command = cpu_to_be16(cmd);
2360         p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2361
2362         mutex_lock(&mdev->data.mutex);
2363
2364         ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2365         ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2366
2367         mutex_unlock(&mdev->data.mutex);
2368
2369         return ok;
2370 }
2371
2372 int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2373 {
2374         int ok;
2375         struct p_block_req p;
2376
2377         p.sector   = cpu_to_be64(sector);
2378         p.block_id = BE_DRBD_MAGIC + 0xbabe;
2379         p.blksize  = cpu_to_be32(size);
2380
2381         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2382                            (struct p_header80 *)&p, sizeof(p));
2383         return ok;
2384 }
2385
2386 /* called on sndtimeo
2387  * returns FALSE if we should retry,
2388  * TRUE if we think connection is dead
2389  */
2390 static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2391 {
2392         int drop_it;
2393         /* long elapsed = (long)(jiffies - mdev->last_received); */
2394
2395         drop_it =   mdev->meta.socket == sock
2396                 || !mdev->asender.task
2397                 || get_t_state(&mdev->asender) != Running
2398                 || mdev->state.conn < C_CONNECTED;
2399
2400         if (drop_it)
2401                 return TRUE;
2402
2403         drop_it = !--mdev->ko_count;
2404         if (!drop_it) {
2405                 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2406                        current->comm, current->pid, mdev->ko_count);
2407                 request_ping(mdev);
2408         }
2409
2410         return drop_it; /* && (mdev->state == R_PRIMARY) */;
2411 }
2412
2413 /* The idea of sendpage seems to be to put some kind of reference
2414  * to the page into the skb, and to hand it over to the NIC. In
2415  * this process get_page() gets called.
2416  *
2417  * As soon as the page was really sent over the network put_page()
2418  * gets called by some part of the network layer. [ NIC driver? ]
2419  *
2420  * [ get_page() / put_page() increment/decrement the count. If count
2421  *   reaches 0 the page will be freed. ]
2422  *
2423  * This works nicely with pages from FSs.
2424  * But this means that in protocol A we might signal IO completion too early!
2425  *
2426  * In order not to corrupt data during a resync we must make sure
2427  * that we do not reuse our own buffer pages (EEs) to early, therefore
2428  * we have the net_ee list.
2429  *
2430  * XFS seems to have problems, still, it submits pages with page_count == 0!
2431  * As a workaround, we disable sendpage on pages
2432  * with page_count == 0 or PageSlab.
2433  */
2434 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2435                    int offset, size_t size, unsigned msg_flags)
2436 {
2437         int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2438         kunmap(page);
2439         if (sent == size)
2440                 mdev->send_cnt += size>>9;
2441         return sent == size;
2442 }
2443
2444 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2445                     int offset, size_t size, unsigned msg_flags)
2446 {
2447         mm_segment_t oldfs = get_fs();
2448         int sent, ok;
2449         int len = size;
2450
2451         /* e.g. XFS meta- & log-data is in slab pages, which have a
2452          * page_count of 0 and/or have PageSlab() set.
2453          * we cannot use send_page for those, as that does get_page();
2454          * put_page(); and would cause either a VM_BUG directly, or
2455          * __page_cache_release a page that would actually still be referenced
2456          * by someone, leading to some obscure delayed Oops somewhere else. */
2457         if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2458                 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
2459
2460         msg_flags |= MSG_NOSIGNAL;
2461         drbd_update_congested(mdev);
2462         set_fs(KERNEL_DS);
2463         do {
2464                 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2465                                                         offset, len,
2466                                                         msg_flags);
2467                 if (sent == -EAGAIN) {
2468                         if (we_should_drop_the_connection(mdev,
2469                                                           mdev->data.socket))
2470                                 break;
2471                         else
2472                                 continue;
2473                 }
2474                 if (sent <= 0) {
2475                         dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2476                              __func__, (int)size, len, sent);
2477                         break;
2478                 }
2479                 len    -= sent;
2480                 offset += sent;
2481         } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2482         set_fs(oldfs);
2483         clear_bit(NET_CONGESTED, &mdev->flags);
2484
2485         ok = (len == 0);
2486         if (likely(ok))
2487                 mdev->send_cnt += size>>9;
2488         return ok;
2489 }
2490
2491 static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2492 {
2493         struct bio_vec *bvec;
2494         int i;
2495         /* hint all but last page with MSG_MORE */
2496         __bio_for_each_segment(bvec, bio, i, 0) {
2497                 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2498                                      bvec->bv_offset, bvec->bv_len,
2499                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2500                         return 0;
2501         }
2502         return 1;
2503 }
2504
2505 static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2506 {
2507         struct bio_vec *bvec;
2508         int i;
2509         /* hint all but last page with MSG_MORE */
2510         __bio_for_each_segment(bvec, bio, i, 0) {
2511                 if (!_drbd_send_page(mdev, bvec->bv_page,
2512                                      bvec->bv_offset, bvec->bv_len,
2513                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2514                         return 0;
2515         }
2516         return 1;
2517 }
2518
2519 static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2520 {
2521         struct page *page = e->pages;
2522         unsigned len = e->size;
2523         /* hint all but last page with MSG_MORE */
2524         page_chain_for_each(page) {
2525                 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2526                 if (!_drbd_send_page(mdev, page, 0, l,
2527                                 page_chain_next(page) ? MSG_MORE : 0))
2528                         return 0;
2529                 len -= l;
2530         }
2531         return 1;
2532 }
2533
2534 static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2535 {
2536         if (mdev->agreed_pro_version >= 95)
2537                 return  (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2538                         (bi_rw & REQ_FUA ? DP_FUA : 0) |
2539                         (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2540                         (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2541         else
2542                 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
2543 }
2544
2545 /* Used to send write requests
2546  * R_PRIMARY -> Peer    (P_DATA)
2547  */
2548 int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2549 {
2550         int ok = 1;
2551         struct p_data p;
2552         unsigned int dp_flags = 0;
2553         void *dgb;
2554         int dgs;
2555
2556         if (!drbd_get_data_sock(mdev))
2557                 return 0;
2558
2559         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2560                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2561
2562         if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2563                 p.head.h80.magic   = BE_DRBD_MAGIC;
2564                 p.head.h80.command = cpu_to_be16(P_DATA);
2565                 p.head.h80.length  =
2566                         cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2567         } else {
2568                 p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2569                 p.head.h95.command = cpu_to_be16(P_DATA);
2570                 p.head.h95.length  =
2571                         cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2572         }
2573
2574         p.sector   = cpu_to_be64(req->sector);
2575         p.block_id = (unsigned long)req;
2576         p.seq_num  = cpu_to_be32(req->seq_num =
2577                                  atomic_add_return(1, &mdev->packet_seq));
2578
2579         dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2580
2581         if (mdev->state.conn >= C_SYNC_SOURCE &&
2582             mdev->state.conn <= C_PAUSED_SYNC_T)
2583                 dp_flags |= DP_MAY_SET_IN_SYNC;
2584
2585         p.dp_flags = cpu_to_be32(dp_flags);
2586         set_bit(UNPLUG_REMOTE, &mdev->flags);
2587         ok = (sizeof(p) ==
2588                 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2589         if (ok && dgs) {
2590                 dgb = mdev->int_dig_out;
2591                 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2592                 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2593         }
2594         if (ok) {
2595                 /* For protocol A, we have to memcpy the payload into
2596                  * socket buffers, as we may complete right away
2597                  * as soon as we handed it over to tcp, at which point the data
2598                  * pages may become invalid.
2599                  *
2600                  * For data-integrity enabled, we copy it as well, so we can be
2601                  * sure that even if the bio pages may still be modified, it
2602                  * won't change the data on the wire, thus if the digest checks
2603                  * out ok after sending on this side, but does not fit on the
2604                  * receiving side, we sure have detected corruption elsewhere.
2605                  */
2606                 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
2607                         ok = _drbd_send_bio(mdev, req->master_bio);
2608                 else
2609                         ok = _drbd_send_zc_bio(mdev, req->master_bio);
2610
2611                 /* double check digest, sometimes buffers have been modified in flight. */
2612                 if (dgs > 0 && dgs <= 64) {
2613                         /* 64 byte, 512 bit, is the larges digest size
2614                          * currently supported in kernel crypto. */
2615                         unsigned char digest[64];
2616                         drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2617                         if (memcmp(mdev->int_dig_out, digest, dgs)) {
2618                                 dev_warn(DEV,
2619                                         "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2620                                         (unsigned long long)req->sector, req->size);
2621                         }
2622                 } /* else if (dgs > 64) {
2623                      ... Be noisy about digest too large ...
2624                 } */
2625         }
2626
2627         drbd_put_data_sock(mdev);
2628
2629         return ok;
2630 }
2631
2632 /* answer packet, used to send data back for read requests:
2633  *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
2634  *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
2635  */
2636 int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2637                     struct drbd_epoch_entry *e)
2638 {
2639         int ok;
2640         struct p_data p;
2641         void *dgb;
2642         int dgs;
2643
2644         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2645                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2646
2647         if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2648                 p.head.h80.magic   = BE_DRBD_MAGIC;
2649                 p.head.h80.command = cpu_to_be16(cmd);
2650                 p.head.h80.length  =
2651                         cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2652         } else {
2653                 p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2654                 p.head.h95.command = cpu_to_be16(cmd);
2655                 p.head.h95.length  =
2656                         cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2657         }
2658
2659         p.sector   = cpu_to_be64(e->sector);
2660         p.block_id = e->block_id;
2661         /* p.seq_num  = 0;    No sequence numbers here.. */
2662
2663         /* Only called by our kernel thread.
2664          * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2665          * in response to admin command or module unload.
2666          */
2667         if (!drbd_get_data_sock(mdev))
2668                 return 0;
2669
2670         ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2671         if (ok && dgs) {
2672                 dgb = mdev->int_dig_out;
2673                 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2674                 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2675         }
2676         if (ok)
2677                 ok = _drbd_send_zc_ee(mdev, e);
2678
2679         drbd_put_data_sock(mdev);
2680
2681         return ok;
2682 }
2683
2684 int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2685 {
2686         struct p_block_desc p;
2687
2688         p.sector  = cpu_to_be64(req->sector);
2689         p.blksize = cpu_to_be32(req->size);
2690
2691         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2692 }
2693
2694 /*
2695   drbd_send distinguishes two cases:
2696
2697   Packets sent via the data socket "sock"
2698   and packets sent via the meta data socket "msock"
2699
2700                     sock                      msock
2701   -----------------+-------------------------+------------------------------
2702   timeout           conf.timeout / 2          conf.timeout / 2
2703   timeout action    send a ping via msock     Abort communication
2704                                               and close all sockets
2705 */
2706
2707 /*
2708  * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2709  */
2710 int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2711               void *buf, size_t size, unsigned msg_flags)
2712 {
2713         struct kvec iov;
2714         struct msghdr msg;
2715         int rv, sent = 0;
2716
2717         if (!sock)
2718                 return -1000;
2719
2720         /* THINK  if (signal_pending) return ... ? */
2721
2722         iov.iov_base = buf;
2723         iov.iov_len  = size;
2724
2725         msg.msg_name       = NULL;
2726         msg.msg_namelen    = 0;
2727         msg.msg_control    = NULL;
2728         msg.msg_controllen = 0;
2729         msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
2730
2731         if (sock == mdev->data.socket) {
2732                 mdev->ko_count = mdev->net_conf->ko_count;
2733                 drbd_update_congested(mdev);
2734         }
2735         do {
2736                 /* STRANGE
2737                  * tcp_sendmsg does _not_ use its size parameter at all ?
2738                  *
2739                  * -EAGAIN on timeout, -EINTR on signal.
2740                  */
2741 /* THINK
2742  * do we need to block DRBD_SIG if sock == &meta.socket ??
2743  * otherwise wake_asender() might interrupt some send_*Ack !
2744  */
2745                 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2746                 if (rv == -EAGAIN) {
2747                         if (we_should_drop_the_connection(mdev, sock))
2748                                 break;
2749                         else
2750                                 continue;
2751                 }
2752                 D_ASSERT(rv != 0);
2753                 if (rv == -EINTR) {
2754                         flush_signals(current);
2755                         rv = 0;
2756                 }
2757                 if (rv < 0)
2758                         break;
2759                 sent += rv;
2760                 iov.iov_base += rv;
2761                 iov.iov_len  -= rv;
2762         } while (sent < size);
2763
2764         if (sock == mdev->data.socket)
2765                 clear_bit(NET_CONGESTED, &mdev->flags);
2766
2767         if (rv <= 0) {
2768                 if (rv != -EAGAIN) {
2769                         dev_err(DEV, "%s_sendmsg returned %d\n",
2770                             sock == mdev->meta.socket ? "msock" : "sock",
2771                             rv);
2772                         drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2773                 } else
2774                         drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2775         }
2776
2777         return sent;
2778 }
2779
2780 static int drbd_open(struct block_device *bdev, fmode_t mode)
2781 {
2782         struct drbd_conf *mdev = bdev->bd_disk->private_data;
2783         unsigned long flags;
2784         int rv = 0;
2785
2786         mutex_lock(&drbd_main_mutex);
2787         spin_lock_irqsave(&mdev->req_lock, flags);
2788         /* to have a stable mdev->state.role
2789          * and no race with updating open_cnt */
2790
2791         if (mdev->state.role != R_PRIMARY) {
2792                 if (mode & FMODE_WRITE)
2793                         rv = -EROFS;
2794                 else if (!allow_oos)
2795                         rv = -EMEDIUMTYPE;
2796         }
2797
2798         if (!rv)
2799                 mdev->open_cnt++;
2800         spin_unlock_irqrestore(&mdev->req_lock, flags);
2801         mutex_unlock(&drbd_main_mutex);
2802
2803         return rv;
2804 }
2805
2806 static int drbd_release(struct gendisk *gd, fmode_t mode)
2807 {
2808         struct drbd_conf *mdev = gd->private_data;
2809         mutex_lock(&drbd_main_mutex);
2810         mdev->open_cnt--;
2811         mutex_unlock(&drbd_main_mutex);
2812         return 0;
2813 }
2814
2815 static void drbd_set_defaults(struct drbd_conf *mdev)
2816 {
2817         /* This way we get a compile error when sync_conf grows,
2818            and we forgot to initialize it here */
2819         mdev->sync_conf = (struct syncer_conf) {
2820                 /* .rate = */           DRBD_RATE_DEF,
2821                 /* .after = */          DRBD_AFTER_DEF,
2822                 /* .al_extents = */     DRBD_AL_EXTENTS_DEF,
2823                 /* .verify_alg = */     {}, 0,
2824                 /* .cpu_mask = */       {}, 0,
2825                 /* .csums_alg = */      {}, 0,
2826                 /* .use_rle = */        0,
2827                 /* .on_no_data = */     DRBD_ON_NO_DATA_DEF,
2828                 /* .c_plan_ahead = */   DRBD_C_PLAN_AHEAD_DEF,
2829                 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2830                 /* .c_fill_target = */  DRBD_C_FILL_TARGET_DEF,
2831                 /* .c_max_rate = */     DRBD_C_MAX_RATE_DEF,
2832                 /* .c_min_rate = */     DRBD_C_MIN_RATE_DEF
2833         };
2834
2835         /* Have to use that way, because the layout differs between
2836            big endian and little endian */
2837         mdev->state = (union drbd_state) {
2838                 { .role = R_SECONDARY,
2839                   .peer = R_UNKNOWN,
2840                   .conn = C_STANDALONE,
2841                   .disk = D_DISKLESS,
2842                   .pdsk = D_UNKNOWN,
2843                   .susp = 0,
2844                   .susp_nod = 0,
2845                   .susp_fen = 0
2846                 } };
2847 }
2848
2849 void drbd_init_set_defaults(struct drbd_conf *mdev)
2850 {
2851         /* the memset(,0,) did most of this.
2852          * note: only assignments, no allocation in here */
2853
2854         drbd_set_defaults(mdev);
2855
2856         atomic_set(&mdev->ap_bio_cnt, 0);
2857         atomic_set(&mdev->ap_pending_cnt, 0);
2858         atomic_set(&mdev->rs_pending_cnt, 0);
2859         atomic_set(&mdev->unacked_cnt, 0);
2860         atomic_set(&mdev->local_cnt, 0);
2861         atomic_set(&mdev->net_cnt, 0);
2862         atomic_set(&mdev->packet_seq, 0);
2863         atomic_set(&mdev->pp_in_use, 0);
2864         atomic_set(&mdev->pp_in_use_by_net, 0);
2865         atomic_set(&mdev->rs_sect_in, 0);
2866         atomic_set(&mdev->rs_sect_ev, 0);
2867         atomic_set(&mdev->ap_in_flight, 0);
2868
2869         mutex_init(&mdev->md_io_mutex);
2870         mutex_init(&mdev->data.mutex);
2871         mutex_init(&mdev->meta.mutex);
2872         sema_init(&mdev->data.work.s, 0);
2873         sema_init(&mdev->meta.work.s, 0);
2874         mutex_init(&mdev->state_mutex);
2875
2876         spin_lock_init(&mdev->data.work.q_lock);
2877         spin_lock_init(&mdev->meta.work.q_lock);
2878
2879         spin_lock_init(&mdev->al_lock);
2880         spin_lock_init(&mdev->req_lock);
2881         spin_lock_init(&mdev->peer_seq_lock);
2882         spin_lock_init(&mdev->epoch_lock);
2883
2884         INIT_LIST_HEAD(&mdev->active_ee);
2885         INIT_LIST_HEAD(&mdev->sync_ee);
2886         INIT_LIST_HEAD(&mdev->done_ee);
2887         INIT_LIST_HEAD(&mdev->read_ee);
2888         INIT_LIST_HEAD(&mdev->net_ee);
2889         INIT_LIST_HEAD(&mdev->resync_reads);
2890         INIT_LIST_HEAD(&mdev->data.work.q);
2891         INIT_LIST_HEAD(&mdev->meta.work.q);
2892         INIT_LIST_HEAD(&mdev->resync_work.list);
2893         INIT_LIST_HEAD(&mdev->unplug_work.list);
2894         INIT_LIST_HEAD(&mdev->go_diskless.list);
2895         INIT_LIST_HEAD(&mdev->md_sync_work.list);
2896         INIT_LIST_HEAD(&mdev->start_resync_work.list);
2897         INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2898
2899         mdev->resync_work.cb  = w_resync_inactive;
2900         mdev->unplug_work.cb  = w_send_write_hint;
2901         mdev->go_diskless.cb  = w_go_diskless;
2902         mdev->md_sync_work.cb = w_md_sync;
2903         mdev->bm_io_work.w.cb = w_bitmap_io;
2904         init_timer(&mdev->resync_timer);
2905         init_timer(&mdev->md_sync_timer);
2906         mdev->resync_timer.function = resync_timer_fn;
2907         mdev->resync_timer.data = (unsigned long) mdev;
2908         mdev->md_sync_timer.function = md_sync_timer_fn;
2909         mdev->md_sync_timer.data = (unsigned long) mdev;
2910
2911         init_waitqueue_head(&mdev->misc_wait);
2912         init_waitqueue_head(&mdev->state_wait);
2913         init_waitqueue_head(&mdev->net_cnt_wait);
2914         init_waitqueue_head(&mdev->ee_wait);
2915         init_waitqueue_head(&mdev->al_wait);
2916         init_waitqueue_head(&mdev->seq_wait);
2917
2918         drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2919         drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2920         drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2921
2922         mdev->agreed_pro_version = PRO_VERSION_MAX;
2923         mdev->write_ordering = WO_bdev_flush;
2924         mdev->resync_wenr = LC_FREE;
2925 }
2926
2927 void drbd_mdev_cleanup(struct drbd_conf *mdev)
2928 {
2929         int i;
2930         if (mdev->receiver.t_state != None)
2931                 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2932                                 mdev->receiver.t_state);
2933
2934         /* no need to lock it, I'm the only thread alive */
2935         if (atomic_read(&mdev->current_epoch->epoch_size) !=  0)
2936                 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2937         mdev->al_writ_cnt  =
2938         mdev->bm_writ_cnt  =
2939         mdev->read_cnt     =
2940         mdev->recv_cnt     =
2941         mdev->send_cnt     =
2942         mdev->writ_cnt     =
2943         mdev->p_size       =
2944         mdev->rs_start     =
2945         mdev->rs_total     =
2946         mdev->rs_failed    = 0;
2947         mdev->rs_last_events = 0;
2948         mdev->rs_last_sect_ev = 0;
2949         for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2950                 mdev->rs_mark_left[i] = 0;
2951                 mdev->rs_mark_time[i] = 0;
2952         }
2953         D_ASSERT(mdev->net_conf == NULL);
2954
2955         drbd_set_my_capacity(mdev, 0);
2956         if (mdev->bitmap) {
2957                 /* maybe never allocated. */
2958                 drbd_bm_resize(mdev, 0, 1);
2959                 drbd_bm_cleanup(mdev);
2960         }
2961
2962         drbd_free_resources(mdev);
2963         clear_bit(AL_SUSPENDED, &mdev->flags);
2964
2965         /*
2966          * currently we drbd_init_ee only on module load, so
2967          * we may do drbd_release_ee only on module unload!
2968          */
2969         D_ASSERT(list_empty(&mdev->active_ee));
2970         D_ASSERT(list_empty(&mdev->sync_ee));
2971         D_ASSERT(list_empty(&mdev->done_ee));
2972         D_ASSERT(list_empty(&mdev->read_ee));
2973         D_ASSERT(list_empty(&mdev->net_ee));
2974         D_ASSERT(list_empty(&mdev->resync_reads));
2975         D_ASSERT(list_empty(&mdev->data.work.q));
2976         D_ASSERT(list_empty(&mdev->meta.work.q));
2977         D_ASSERT(list_empty(&mdev->resync_work.list));
2978         D_ASSERT(list_empty(&mdev->unplug_work.list));
2979         D_ASSERT(list_empty(&mdev->go_diskless.list));
2980 }
2981
2982
2983 static void drbd_destroy_mempools(void)
2984 {
2985         struct page *page;
2986
2987         while (drbd_pp_pool) {
2988                 page = drbd_pp_pool;
2989                 drbd_pp_pool = (struct page *)page_private(page);
2990                 __free_page(page);
2991                 drbd_pp_vacant--;
2992         }
2993
2994         /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2995
2996         if (drbd_ee_mempool)
2997                 mempool_destroy(drbd_ee_mempool);
2998         if (drbd_request_mempool)
2999                 mempool_destroy(drbd_request_mempool);
3000         if (drbd_ee_cache)
3001                 kmem_cache_destroy(drbd_ee_cache);
3002         if (drbd_request_cache)
3003                 kmem_cache_destroy(drbd_request_cache);
3004         if (drbd_bm_ext_cache)
3005                 kmem_cache_destroy(drbd_bm_ext_cache);
3006         if (drbd_al_ext_cache)
3007                 kmem_cache_destroy(drbd_al_ext_cache);
3008
3009         drbd_ee_mempool      = NULL;
3010         drbd_request_mempool = NULL;
3011         drbd_ee_cache        = NULL;
3012         drbd_request_cache   = NULL;
3013         drbd_bm_ext_cache    = NULL;
3014         drbd_al_ext_cache    = NULL;
3015
3016         return;
3017 }
3018
3019 static int drbd_create_mempools(void)
3020 {
3021         struct page *page;
3022         const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
3023         int i;
3024
3025         /* prepare our caches and mempools */
3026         drbd_request_mempool = NULL;
3027         drbd_ee_cache        = NULL;
3028         drbd_request_cache   = NULL;
3029         drbd_bm_ext_cache    = NULL;
3030         drbd_al_ext_cache    = NULL;
3031         drbd_pp_pool         = NULL;
3032
3033         /* caches */
3034         drbd_request_cache = kmem_cache_create(
3035                 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3036         if (drbd_request_cache == NULL)
3037                 goto Enomem;
3038
3039         drbd_ee_cache = kmem_cache_create(
3040                 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3041         if (drbd_ee_cache == NULL)
3042                 goto Enomem;
3043
3044         drbd_bm_ext_cache = kmem_cache_create(
3045                 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3046         if (drbd_bm_ext_cache == NULL)
3047                 goto Enomem;
3048
3049         drbd_al_ext_cache = kmem_cache_create(
3050                 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3051         if (drbd_al_ext_cache == NULL)
3052                 goto Enomem;
3053
3054         /* mempools */
3055         drbd_request_mempool = mempool_create(number,
3056                 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3057         if (drbd_request_mempool == NULL)
3058                 goto Enomem;
3059
3060         drbd_ee_mempool = mempool_create(number,
3061                 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
3062         if (drbd_ee_mempool == NULL)
3063                 goto Enomem;
3064
3065         /* drbd's page pool */
3066         spin_lock_init(&drbd_pp_lock);
3067
3068         for (i = 0; i < number; i++) {
3069                 page = alloc_page(GFP_HIGHUSER);
3070                 if (!page)
3071                         goto Enomem;
3072                 set_page_private(page, (unsigned long)drbd_pp_pool);
3073                 drbd_pp_pool = page;
3074         }
3075         drbd_pp_vacant = number;
3076
3077         return 0;
3078
3079 Enomem:
3080         drbd_destroy_mempools(); /* in case we allocated some */
3081         return -ENOMEM;
3082 }
3083
3084 static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3085         void *unused)
3086 {
3087         /* just so we have it.  you never know what interesting things we
3088          * might want to do here some day...
3089          */
3090
3091         return NOTIFY_DONE;
3092 }
3093
3094 static struct notifier_block drbd_notifier = {
3095         .notifier_call = drbd_notify_sys,
3096 };
3097
3098 static void drbd_release_ee_lists(struct drbd_conf *mdev)
3099 {
3100         int rr;
3101
3102         rr = drbd_release_ee(mdev, &mdev->active_ee);
3103         if (rr)
3104                 dev_err(DEV, "%d EEs in active list found!\n", rr);
3105
3106         rr = drbd_release_ee(mdev, &mdev->sync_ee);
3107         if (rr)
3108                 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3109
3110         rr = drbd_release_ee(mdev, &mdev->read_ee);
3111         if (rr)
3112                 dev_err(DEV, "%d EEs in read list found!\n", rr);
3113
3114         rr = drbd_release_ee(mdev, &mdev->done_ee);
3115         if (rr)
3116                 dev_err(DEV, "%d EEs in done list found!\n", rr);
3117
3118         rr = drbd_release_ee(mdev, &mdev->net_ee);
3119         if (rr)
3120                 dev_err(DEV, "%d EEs in net list found!\n", rr);
3121 }
3122
3123 /* caution. no locking.
3124  * currently only used from module cleanup code. */
3125 static void drbd_delete_device(unsigned int minor)
3126 {
3127         struct drbd_conf *mdev = minor_to_mdev(minor);
3128
3129         if (!mdev)
3130                 return;
3131
3132         /* paranoia asserts */
3133         if (mdev->open_cnt != 0)
3134                 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3135                                 __FILE__ , __LINE__);
3136
3137         ERR_IF (!list_empty(&mdev->data.work.q)) {
3138                 struct list_head *lp;
3139                 list_for_each(lp, &mdev->data.work.q) {
3140                         dev_err(DEV, "lp = %p\n", lp);
3141                 }
3142         };
3143         /* end paranoia asserts */
3144
3145         del_gendisk(mdev->vdisk);
3146
3147         /* cleanup stuff that may have been allocated during
3148          * device (re-)configuration or state changes */
3149
3150         if (mdev->this_bdev)
3151                 bdput(mdev->this_bdev);
3152
3153         drbd_free_resources(mdev);
3154
3155         drbd_release_ee_lists(mdev);
3156
3157         /* should be free'd on disconnect? */
3158         kfree(mdev->ee_hash);
3159         /*
3160         mdev->ee_hash_s = 0;
3161         mdev->ee_hash = NULL;
3162         */
3163
3164         lc_destroy(mdev->act_log);
3165         lc_destroy(mdev->resync);
3166
3167         kfree(mdev->p_uuid);
3168         /* mdev->p_uuid = NULL; */
3169
3170         kfree(mdev->int_dig_out);
3171         kfree(mdev->int_dig_in);
3172         kfree(mdev->int_dig_vv);
3173
3174         /* cleanup the rest that has been
3175          * allocated from drbd_new_device
3176          * and actually free the mdev itself */
3177         drbd_free_mdev(mdev);
3178 }
3179
3180 static void drbd_cleanup(void)
3181 {
3182         unsigned int i;
3183
3184         unregister_reboot_notifier(&drbd_notifier);
3185
3186         /* first remove proc,
3187          * drbdsetup uses it's presence to detect
3188          * whether DRBD is loaded.
3189          * If we would get stuck in proc removal,
3190          * but have netlink already deregistered,
3191          * some drbdsetup commands may wait forever
3192          * for an answer.
3193          */
3194         if (drbd_proc)
3195                 remove_proc_entry("drbd", NULL);
3196
3197         drbd_nl_cleanup();
3198
3199         if (minor_table) {
3200                 i = minor_count;
3201                 while (i--)
3202                         drbd_delete_device(i);
3203                 drbd_destroy_mempools();
3204         }
3205
3206         kfree(minor_table);
3207
3208         unregister_blkdev(DRBD_MAJOR, "drbd");
3209
3210         printk(KERN_INFO "drbd: module cleanup done.\n");
3211 }
3212
3213 /**
3214  * drbd_congested() - Callback for pdflush
3215  * @congested_data:     User data
3216  * @bdi_bits:           Bits pdflush is currently interested in
3217  *
3218  * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3219  */
3220 static int drbd_congested(void *congested_data, int bdi_bits)
3221 {
3222         struct drbd_conf *mdev = congested_data;
3223         struct request_queue *q;
3224         char reason = '-';
3225         int r = 0;
3226
3227         if (!__inc_ap_bio_cond(mdev)) {
3228                 /* DRBD has frozen IO */
3229                 r = bdi_bits;
3230                 reason = 'd';
3231                 goto out;
3232         }
3233
3234         if (get_ldev(mdev)) {
3235                 q = bdev_get_queue(mdev->ldev->backing_bdev);
3236                 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3237                 put_ldev(mdev);
3238                 if (r)
3239                         reason = 'b';
3240         }
3241
3242         if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3243                 r |= (1 << BDI_async_congested);
3244                 reason = reason == 'b' ? 'a' : 'n';
3245         }
3246
3247 out:
3248         mdev->congestion_reason = reason;
3249         return r;
3250 }
3251
3252 struct drbd_conf *drbd_new_device(unsigned int minor)
3253 {
3254         struct drbd_conf *mdev;
3255         struct gendisk *disk;
3256         struct request_queue *q;
3257
3258         /* GFP_KERNEL, we are outside of all write-out paths */
3259         mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3260         if (!mdev)
3261                 return NULL;
3262         if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3263                 goto out_no_cpumask;
3264
3265         mdev->minor = minor;
3266
3267         drbd_init_set_defaults(mdev);
3268
3269         q = blk_alloc_queue(GFP_KERNEL);
3270         if (!q)
3271                 goto out_no_q;
3272         mdev->rq_queue = q;
3273         q->queuedata   = mdev;
3274
3275         disk = alloc_disk(1);
3276         if (!disk)
3277                 goto out_no_disk;
3278         mdev->vdisk = disk;
3279
3280         set_disk_ro(disk, TRUE);
3281
3282         disk->queue = q;
3283         disk->major = DRBD_MAJOR;
3284         disk->first_minor = minor;
3285         disk->fops = &drbd_ops;
3286         sprintf(disk->disk_name, "drbd%d", minor);
3287         disk->private_data = mdev;
3288
3289         mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3290         /* we have no partitions. we contain only ourselves. */
3291         mdev->this_bdev->bd_contains = mdev->this_bdev;
3292
3293         q->backing_dev_info.congested_fn = drbd_congested;
3294         q->backing_dev_info.congested_data = mdev;
3295
3296         blk_queue_make_request(q, drbd_make_request);
3297         blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9);
3298         blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3299         blk_queue_merge_bvec(q, drbd_merge_bvec);
3300         q->queue_lock = &mdev->req_lock;
3301
3302         mdev->md_io_page = alloc_page(GFP_KERNEL);
3303         if (!mdev->md_io_page)
3304                 goto out_no_io_page;
3305
3306         if (drbd_bm_init(mdev))
3307                 goto out_no_bitmap;
3308         /* no need to lock access, we are still initializing this minor device. */
3309         if (!tl_init(mdev))
3310                 goto out_no_tl;
3311
3312         mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3313         if (!mdev->app_reads_hash)
3314                 goto out_no_app_reads;
3315
3316         mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3317         if (!mdev->current_epoch)
3318                 goto out_no_epoch;
3319
3320         INIT_LIST_HEAD(&mdev->current_epoch->list);
3321         mdev->epochs = 1;
3322
3323         return mdev;
3324
3325 /* out_whatever_else:
3326         kfree(mdev->current_epoch); */
3327 out_no_epoch:
3328         kfree(mdev->app_reads_hash);
3329 out_no_app_reads:
3330         tl_cleanup(mdev);
3331 out_no_tl:
3332         drbd_bm_cleanup(mdev);
3333 out_no_bitmap:
3334         __free_page(mdev->md_io_page);
3335 out_no_io_page:
3336         put_disk(disk);
3337 out_no_disk:
3338         blk_cleanup_queue(q);
3339 out_no_q:
3340         free_cpumask_var(mdev->cpu_mask);
3341 out_no_cpumask:
3342         kfree(mdev);
3343         return NULL;
3344 }
3345
3346 /* counterpart of drbd_new_device.
3347  * last part of drbd_delete_device. */
3348 void drbd_free_mdev(struct drbd_conf *mdev)
3349 {
3350         kfree(mdev->current_epoch);
3351         kfree(mdev->app_reads_hash);
3352         tl_cleanup(mdev);
3353         if (mdev->bitmap) /* should no longer be there. */
3354                 drbd_bm_cleanup(mdev);
3355         __free_page(mdev->md_io_page);
3356         put_disk(mdev->vdisk);
3357         blk_cleanup_queue(mdev->rq_queue);
3358         free_cpumask_var(mdev->cpu_mask);
3359         drbd_free_tl_hash(mdev);
3360         kfree(mdev);
3361 }
3362
3363
3364 int __init drbd_init(void)
3365 {
3366         int err;
3367
3368         if (sizeof(struct p_handshake) != 80) {
3369                 printk(KERN_ERR
3370                        "drbd: never change the size or layout "
3371                        "of the HandShake packet.\n");
3372                 return -EINVAL;
3373         }
3374
3375         if (1 > minor_count || minor_count > 255) {
3376                 printk(KERN_ERR
3377                         "drbd: invalid minor_count (%d)\n", minor_count);
3378 #ifdef MODULE
3379                 return -EINVAL;
3380 #else
3381                 minor_count = 8;
3382 #endif
3383         }
3384
3385         err = drbd_nl_init();
3386         if (err)
3387                 return err;
3388
3389         err = register_blkdev(DRBD_MAJOR, "drbd");
3390         if (err) {
3391                 printk(KERN_ERR
3392                        "drbd: unable to register block device major %d\n",
3393                        DRBD_MAJOR);
3394                 return err;
3395         }
3396
3397         register_reboot_notifier(&drbd_notifier);
3398
3399         /*
3400          * allocate all necessary structs
3401          */
3402         err = -ENOMEM;
3403
3404         init_waitqueue_head(&drbd_pp_wait);
3405
3406         drbd_proc = NULL; /* play safe for drbd_cleanup */
3407         minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3408                                 GFP_KERNEL);
3409         if (!minor_table)
3410                 goto Enomem;
3411
3412         err = drbd_create_mempools();
3413         if (err)
3414                 goto Enomem;
3415
3416         drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3417         if (!drbd_proc) {
3418                 printk(KERN_ERR "drbd: unable to register proc file\n");
3419                 goto Enomem;
3420         }
3421
3422         rwlock_init(&global_state_lock);
3423
3424         printk(KERN_INFO "drbd: initialized. "
3425                "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3426                API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3427         printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3428         printk(KERN_INFO "drbd: registered as block device major %d\n",
3429                 DRBD_MAJOR);
3430         printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3431
3432         return 0; /* Success! */
3433
3434 Enomem:
3435         drbd_cleanup();
3436         if (err == -ENOMEM)
3437                 /* currently always the case */
3438                 printk(KERN_ERR "drbd: ran out of memory\n");
3439         else
3440                 printk(KERN_ERR "drbd: initialization failure\n");
3441         return err;
3442 }
3443
3444 void drbd_free_bc(struct drbd_backing_dev *ldev)
3445 {
3446         if (ldev == NULL)
3447                 return;
3448
3449         blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3450         blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3451
3452         kfree(ldev);
3453 }
3454
3455 void drbd_free_sock(struct drbd_conf *mdev)
3456 {
3457         if (mdev->data.socket) {
3458                 mutex_lock(&mdev->data.mutex);
3459                 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3460                 sock_release(mdev->data.socket);
3461                 mdev->data.socket = NULL;
3462                 mutex_unlock(&mdev->data.mutex);
3463         }
3464         if (mdev->meta.socket) {
3465                 mutex_lock(&mdev->meta.mutex);
3466                 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3467                 sock_release(mdev->meta.socket);
3468                 mdev->meta.socket = NULL;
3469                 mutex_unlock(&mdev->meta.mutex);
3470         }
3471 }
3472
3473
3474 void drbd_free_resources(struct drbd_conf *mdev)
3475 {
3476         crypto_free_hash(mdev->csums_tfm);
3477         mdev->csums_tfm = NULL;
3478         crypto_free_hash(mdev->verify_tfm);
3479         mdev->verify_tfm = NULL;
3480         crypto_free_hash(mdev->cram_hmac_tfm);
3481         mdev->cram_hmac_tfm = NULL;
3482         crypto_free_hash(mdev->integrity_w_tfm);
3483         mdev->integrity_w_tfm = NULL;
3484         crypto_free_hash(mdev->integrity_r_tfm);
3485         mdev->integrity_r_tfm = NULL;
3486
3487         drbd_free_sock(mdev);
3488
3489         __no_warn(local,
3490                   drbd_free_bc(mdev->ldev);
3491                   mdev->ldev = NULL;);
3492 }
3493
3494 /* meta data management */
3495
3496 struct meta_data_on_disk {
3497         u64 la_size;           /* last agreed size. */
3498         u64 uuid[UI_SIZE];   /* UUIDs. */
3499         u64 device_uuid;
3500         u64 reserved_u64_1;
3501         u32 flags;             /* MDF */
3502         u32 magic;
3503         u32 md_size_sect;
3504         u32 al_offset;         /* offset to this block */
3505         u32 al_nr_extents;     /* important for restoring the AL */
3506               /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3507         u32 bm_offset;         /* offset to the bitmap, from here */
3508         u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
3509         u32 reserved_u32[4];
3510
3511 } __packed;
3512
3513 /**
3514  * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3515  * @mdev:       DRBD device.
3516  */
3517 void drbd_md_sync(struct drbd_conf *mdev)
3518 {
3519         struct meta_data_on_disk *buffer;
3520         sector_t sector;
3521         int i;
3522
3523         del_timer(&mdev->md_sync_timer);
3524         /* timer may be rearmed by drbd_md_mark_dirty() now. */
3525         if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3526                 return;
3527
3528         /* We use here D_FAILED and not D_ATTACHING because we try to write
3529          * metadata even if we detach due to a disk failure! */
3530         if (!get_ldev_if_state(mdev, D_FAILED))
3531                 return;
3532
3533         mutex_lock(&mdev->md_io_mutex);
3534         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3535         memset(buffer, 0, 512);
3536
3537         buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3538         for (i = UI_CURRENT; i < UI_SIZE; i++)
3539                 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3540         buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3541         buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3542
3543         buffer->md_size_sect  = cpu_to_be32(mdev->ldev->md.md_size_sect);
3544         buffer->al_offset     = cpu_to_be32(mdev->ldev->md.al_offset);
3545         buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3546         buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3547         buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3548
3549         buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3550
3551         D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3552         sector = mdev->ldev->md.md_offset;
3553
3554         if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3555                 /* this was a try anyways ... */
3556                 dev_err(DEV, "meta data update failed!\n");
3557                 drbd_chk_io_error(mdev, 1, TRUE);
3558         }
3559
3560         /* Update mdev->ldev->md.la_size_sect,
3561          * since we updated it on metadata. */
3562         mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3563
3564         mutex_unlock(&mdev->md_io_mutex);
3565         put_ldev(mdev);
3566 }
3567
3568 /**
3569  * drbd_md_read() - Reads in the meta data super block
3570  * @mdev:       DRBD device.
3571  * @bdev:       Device from which the meta data should be read in.
3572  *
3573  * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3574  * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3575  */
3576 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3577 {
3578         struct meta_data_on_disk *buffer;
3579         int i, rv = NO_ERROR;
3580
3581         if (!get_ldev_if_state(mdev, D_ATTACHING))
3582                 return ERR_IO_MD_DISK;
3583
3584         mutex_lock(&mdev->md_io_mutex);
3585         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3586
3587         if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3588                 /* NOTE: cant do normal error processing here as this is
3589                    called BEFORE disk is attached */
3590                 dev_err(DEV, "Error while reading metadata.\n");
3591                 rv = ERR_IO_MD_DISK;
3592                 goto err;
3593         }
3594
3595         if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3596                 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3597                 rv = ERR_MD_INVALID;
3598                 goto err;
3599         }
3600         if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3601                 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3602                     be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3603                 rv = ERR_MD_INVALID;
3604                 goto err;
3605         }
3606         if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3607                 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3608                     be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3609                 rv = ERR_MD_INVALID;
3610                 goto err;
3611         }
3612         if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3613                 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3614                     be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3615                 rv = ERR_MD_INVALID;
3616                 goto err;
3617         }
3618
3619         if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3620                 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3621                     be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3622                 rv = ERR_MD_INVALID;
3623                 goto err;
3624         }
3625
3626         bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3627         for (i = UI_CURRENT; i < UI_SIZE; i++)
3628                 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3629         bdev->md.flags = be32_to_cpu(buffer->flags);
3630         mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3631         bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3632
3633         if (mdev->sync_conf.al_extents < 7)
3634                 mdev->sync_conf.al_extents = 127;
3635
3636  err:
3637         mutex_unlock(&mdev->md_io_mutex);
3638         put_ldev(mdev);
3639
3640         return rv;
3641 }
3642
3643 static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
3644 {
3645         static char *uuid_str[UI_EXTENDED_SIZE] = {
3646                 [UI_CURRENT] = "CURRENT",
3647                 [UI_BITMAP] = "BITMAP",
3648                 [UI_HISTORY_START] = "HISTORY_START",
3649                 [UI_HISTORY_END] = "HISTORY_END",
3650                 [UI_SIZE] = "SIZE",
3651                 [UI_FLAGS] = "FLAGS",
3652         };
3653
3654         if (index >= UI_EXTENDED_SIZE) {
3655                 dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
3656                 return;
3657         }
3658
3659         dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
3660                  uuid_str[index],
3661                  (unsigned long long)mdev->ldev->md.uuid[index]);
3662 }
3663
3664
3665 /**
3666  * drbd_md_mark_dirty() - Mark meta data super block as dirty
3667  * @mdev:       DRBD device.
3668  *
3669  * Call this function if you change anything that should be written to
3670  * the meta-data super block. This function sets MD_DIRTY, and starts a
3671  * timer that ensures that within five seconds you have to call drbd_md_sync().
3672  */
3673 #ifdef DEBUG
3674 void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3675 {
3676         if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3677                 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3678                 mdev->last_md_mark_dirty.line = line;
3679                 mdev->last_md_mark_dirty.func = func;
3680         }
3681 }
3682 #else
3683 void drbd_md_mark_dirty(struct drbd_conf *mdev)
3684 {
3685         if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
3686                 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3687 }
3688 #endif
3689
3690 static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3691 {
3692         int i;
3693
3694         for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
3695                 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3696                 debug_drbd_uuid(mdev, i+1);
3697         }
3698 }
3699
3700 void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3701 {
3702         if (idx == UI_CURRENT) {
3703                 if (mdev->state.role == R_PRIMARY)
3704                         val |= 1;
3705                 else
3706                         val &= ~((u64)1);
3707
3708                 drbd_set_ed_uuid(mdev, val);
3709         }
3710
3711         mdev->ldev->md.uuid[idx] = val;
3712         debug_drbd_uuid(mdev, idx);
3713         drbd_md_mark_dirty(mdev);
3714 }
3715
3716
3717 void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3718 {
3719         if (mdev->ldev->md.uuid[idx]) {
3720                 drbd_uuid_move_history(mdev);
3721                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3722                 debug_drbd_uuid(mdev, UI_HISTORY_START);
3723         }
3724         _drbd_uuid_set(mdev, idx, val);
3725 }
3726
3727 /**
3728  * drbd_uuid_new_current() - Creates a new current UUID
3729  * @mdev:       DRBD device.
3730  *
3731  * Creates a new current UUID, and rotates the old current UUID into
3732  * the bitmap slot. Causes an incremental resync upon next connect.
3733  */
3734 void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3735 {
3736         u64 val;
3737
3738         dev_info(DEV, "Creating new current UUID\n");
3739         D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3740         mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3741         debug_drbd_uuid(mdev, UI_BITMAP);
3742
3743         get_random_bytes(&val, sizeof(u64));
3744         _drbd_uuid_set(mdev, UI_CURRENT, val);
3745         /* get it to stable storage _now_ */
3746         drbd_md_sync(mdev);
3747 }
3748
3749 void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3750 {
3751         if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3752                 return;
3753
3754         if (val == 0) {
3755                 drbd_uuid_move_history(mdev);
3756                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3757                 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3758                 debug_drbd_uuid(mdev, UI_HISTORY_START);
3759                 debug_drbd_uuid(mdev, UI_BITMAP);
3760         } else {
3761                 if (mdev->ldev->md.uuid[UI_BITMAP])
3762                         dev_warn(DEV, "bm UUID already set");
3763
3764                 mdev->ldev->md.uuid[UI_BITMAP] = val;
3765                 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3766
3767                 debug_drbd_uuid(mdev, UI_BITMAP);
3768         }
3769         drbd_md_mark_dirty(mdev);
3770 }
3771
3772 /**
3773  * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3774  * @mdev:       DRBD device.
3775  *
3776  * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3777  */
3778 int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3779 {
3780         int rv = -EIO;
3781
3782         if (get_ldev_if_state(mdev, D_ATTACHING)) {
3783                 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3784                 drbd_md_sync(mdev);
3785                 drbd_bm_set_all(mdev);
3786
3787                 rv = drbd_bm_write(mdev);
3788
3789                 if (!rv) {
3790                         drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3791                         drbd_md_sync(mdev);
3792                 }
3793
3794                 put_ldev(mdev);
3795         }
3796
3797         return rv;
3798 }
3799
3800 /**
3801  * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3802  * @mdev:       DRBD device.
3803  *
3804  * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3805  */
3806 int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3807 {
3808         int rv = -EIO;
3809
3810         drbd_resume_al(mdev);
3811         if (get_ldev_if_state(mdev, D_ATTACHING)) {
3812                 drbd_bm_clear_all(mdev);
3813                 rv = drbd_bm_write(mdev);
3814                 put_ldev(mdev);
3815         }
3816
3817         return rv;
3818 }
3819
3820 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3821 {
3822         struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3823         int rv;
3824
3825         D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3826
3827         drbd_bm_lock(mdev, work->why);
3828         rv = work->io_fn(mdev);
3829         drbd_bm_unlock(mdev);
3830
3831         clear_bit(BITMAP_IO, &mdev->flags);
3832         smp_mb__after_clear_bit();
3833         wake_up(&mdev->misc_wait);
3834
3835         if (work->done)
3836                 work->done(mdev, rv);
3837
3838         clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3839         work->why = NULL;
3840
3841         return 1;
3842 }
3843
3844 void drbd_ldev_destroy(struct drbd_conf *mdev)
3845 {
3846         lc_destroy(mdev->resync);
3847         mdev->resync = NULL;
3848         lc_destroy(mdev->act_log);
3849         mdev->act_log = NULL;
3850         __no_warn(local,
3851                 drbd_free_bc(mdev->ldev);
3852                 mdev->ldev = NULL;);
3853
3854         if (mdev->md_io_tmpp) {
3855                 __free_page(mdev->md_io_tmpp);
3856                 mdev->md_io_tmpp = NULL;
3857         }
3858         clear_bit(GO_DISKLESS, &mdev->flags);
3859 }
3860
3861 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3862 {
3863         D_ASSERT(mdev->state.disk == D_FAILED);
3864         /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3865          * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3866          * the protected members anymore, though, so once put_ldev reaches zero
3867          * again, it will be safe to free them. */
3868         drbd_force_state(mdev, NS(disk, D_DISKLESS));
3869         return 1;
3870 }
3871
3872 void drbd_go_diskless(struct drbd_conf *mdev)
3873 {
3874         D_ASSERT(mdev->state.disk == D_FAILED);
3875         if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3876                 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3877 }
3878
3879 /**
3880  * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3881  * @mdev:       DRBD device.
3882  * @io_fn:      IO callback to be called when bitmap IO is possible
3883  * @done:       callback to be called after the bitmap IO was performed
3884  * @why:        Descriptive text of the reason for doing the IO
3885  *
3886  * While IO on the bitmap happens we freeze application IO thus we ensure
3887  * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3888  * called from worker context. It MUST NOT be used while a previous such
3889  * work is still pending!
3890  */
3891 void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3892                           int (*io_fn)(struct drbd_conf *),
3893                           void (*done)(struct drbd_conf *, int),
3894                           char *why)
3895 {
3896         D_ASSERT(current == mdev->worker.task);
3897
3898         D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3899         D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3900         D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3901         if (mdev->bm_io_work.why)
3902                 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3903                         why, mdev->bm_io_work.why);
3904
3905         mdev->bm_io_work.io_fn = io_fn;
3906         mdev->bm_io_work.done = done;
3907         mdev->bm_io_work.why = why;
3908
3909         spin_lock_irq(&mdev->req_lock);
3910         set_bit(BITMAP_IO, &mdev->flags);
3911         if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3912                 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
3913                         drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3914         }
3915         spin_unlock_irq(&mdev->req_lock);
3916 }
3917
3918 /**
3919  * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
3920  * @mdev:       DRBD device.
3921  * @io_fn:      IO callback to be called when bitmap IO is possible
3922  * @why:        Descriptive text of the reason for doing the IO
3923  *
3924  * freezes application IO while that the actual IO operations runs. This
3925  * functions MAY NOT be called from worker context.
3926  */
3927 int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3928 {
3929         int rv;
3930
3931         D_ASSERT(current != mdev->worker.task);
3932
3933         drbd_suspend_io(mdev);
3934
3935         drbd_bm_lock(mdev, why);
3936         rv = io_fn(mdev);
3937         drbd_bm_unlock(mdev);
3938
3939         drbd_resume_io(mdev);
3940
3941         return rv;
3942 }
3943
3944 void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3945 {
3946         if ((mdev->ldev->md.flags & flag) != flag) {
3947                 drbd_md_mark_dirty(mdev);
3948                 mdev->ldev->md.flags |= flag;
3949         }
3950 }
3951
3952 void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3953 {
3954         if ((mdev->ldev->md.flags & flag) != 0) {
3955                 drbd_md_mark_dirty(mdev);
3956                 mdev->ldev->md.flags &= ~flag;
3957         }
3958 }
3959 int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3960 {
3961         return (bdev->md.flags & flag) != 0;
3962 }
3963
3964 static void md_sync_timer_fn(unsigned long data)
3965 {
3966         struct drbd_conf *mdev = (struct drbd_conf *) data;
3967
3968         drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3969 }
3970
3971 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3972 {
3973         dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3974 #ifdef DEBUG
3975         dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
3976                 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
3977 #endif
3978         drbd_md_sync(mdev);
3979         return 1;
3980 }
3981
3982 #ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault insertion support.  The random number generator is shamelessly
 * stolen from kernel/rcutorture.c. */

/* Generator state: the current value plus a countdown until the state is
 * next stirred with get_random_bytes(). */
struct fault_random_state {
	unsigned long state;
	unsigned long count;
};

#define FAULT_RANDOM_MULT	39916801	/* prime */
#define FAULT_RANDOM_ADD	479001701	/* prime */
#define FAULT_RANDOM_REFRESH	10000
3993
3994 /*
3995  * Crude but fast random-number generator.  Uses a linear congruential
3996  * generator, with occasional help from get_random_bytes().
3997  */
3998 static unsigned long
3999 _drbd_fault_random(struct fault_random_state *rsp)
4000 {
4001         long refresh;
4002
4003         if (!rsp->count--) {
4004                 get_random_bytes(&refresh, sizeof(refresh));
4005                 rsp->state += refresh;
4006                 rsp->count = FAULT_RANDOM_REFRESH;
4007         }
4008         rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4009         return swahw32(rsp->state);
4010 }
4011
4012 static char *
4013 _drbd_fault_str(unsigned int type) {
4014         static char *_faults[] = {
4015                 [DRBD_FAULT_MD_WR] = "Meta-data write",
4016                 [DRBD_FAULT_MD_RD] = "Meta-data read",
4017                 [DRBD_FAULT_RS_WR] = "Resync write",
4018                 [DRBD_FAULT_RS_RD] = "Resync read",
4019                 [DRBD_FAULT_DT_WR] = "Data write",
4020                 [DRBD_FAULT_DT_RD] = "Data read",
4021                 [DRBD_FAULT_DT_RA] = "Data read ahead",
4022                 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
4023                 [DRBD_FAULT_AL_EE] = "EE allocation",
4024                 [DRBD_FAULT_RECEIVE] = "receive data corruption",
4025         };
4026
4027         return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4028 }
4029
4030 unsigned int
4031 _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4032 {
4033         static struct fault_random_state rrs = {0, 0};
4034
4035         unsigned int ret = (
4036                 (fault_devs == 0 ||
4037                         ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4038                 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4039
4040         if (ret) {
4041                 fault_count++;
4042
4043                 if (__ratelimit(&drbd_ratelimit_state))
4044                         dev_warn(DEV, "***Simulating %s failure\n",
4045                                 _drbd_fault_str(type));
4046         }
4047
4048         return ret;
4049 }
4050 #endif
4051
/*
 * drbd_buildtag() - describe how this DRBD was built
 *
 * DRBD built from external sources has here a reference to the
 * git hash of the source code.  This variant returns
 * "srcversion: <srcversion>" when built with CONFIG_MODULES and THIS_MODULE
 * is set, and "built-in" otherwise.
 *
 * The string lives in a static buffer that is filled in on the first call;
 * the leading NUL byte of its initializer marks it as not yet filled in.
 */
const char *drbd_buildtag(void)
{
	static char buildtag[38] = "\0uilt-in";

	if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
		if (THIS_MODULE != NULL)
			/* bounded: never write past the end of buildtag */
			snprintf(buildtag, sizeof(buildtag),
				 "srcversion: %-24s", THIS_MODULE->srcversion);
		else
#endif
			buildtag[0] = 'b';
	}

	return buildtag;
}
4070
/* module entry and exit points */
module_init(drbd_init);
module_exit(drbd_cleanup);

/* symbols exported to other modules */
EXPORT_SYMBOL(drbd_conn_str);
EXPORT_SYMBOL(drbd_role_str);
EXPORT_SYMBOL(drbd_disk_str);
EXPORT_SYMBOL(drbd_set_st_err_str);