drivers/block/drbd/drbd_receiver.c

   1 /*
   2    drbd_receiver.c
   3
   4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   5
   6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
   9
  10    drbd is free software; you can redistribute it and/or modify
  11    it under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 2, or (at your option)
  13    any later version.
  14
  15    drbd is distributed in the hope that it will be useful,
  16    but WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18    GNU General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with drbd; see the file COPYING.  If not, write to
  22    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  23  */
  24
  25
  26 #include <linux/module.h>
  27
  28 #include <asm/uaccess.h>
  29 #include <net/sock.h>
  30
  31 #include <linux/drbd.h>
  32 #include <linux/fs.h>
  33 #include <linux/file.h>
  34 #include <linux/in.h>
  35 #include <linux/mm.h>
  36 #include <linux/memcontrol.h>
  37 #include <linux/mm_inline.h>
  38 #include <linux/slab.h>
  39 #include <linux/pkt_sched.h>
  40 #define __KERNEL_SYSCALLS__
  41 #include <linux/unistd.h>
  42 #include <linux/vmalloc.h>
  43 #include <linux/random.h>
  44 #include <linux/string.h>
  45 #include <linux/scatterlist.h>
  46 #include "drbd_int.h"
  47 #include "drbd_protocol.h"
  48 #include "drbd_req.h"
  49
  50 #include "drbd_vli.h"
  51
  52 struct packet_info {
  53         enum drbd_packet cmd;
  54         unsigned int size;
  55         unsigned int vnr;
  56         void *data;
  57 };
  58
  59 enum finish_epoch {
  60         FE_STILL_LIVE,
  61         FE_DESTROYED,
  62         FE_RECYCLED,
  63 };
  64
  65 static int drbd_do_features(struct drbd_connection *connection);
  66 static int drbd_do_auth(struct drbd_connection *connection);
  67 static int drbd_disconnected(struct drbd_peer_device *);
  68
  69 static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
  70 static int e_end_block(struct drbd_work *, int);
  71
  72
  73 #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
  74
  75 /*
  76  * some helper functions to deal with single linked page lists,
  77  * page->private being our "next" pointer.
  78  */
  79
  80 /* If at least n pages are linked at head, get n pages off.
  81  * Otherwise, don't modify head, and return NULL.
  82  * Locking is the responsibility of the caller.
  83  */
  84 static struct page *page_chain_del(struct page **head, int n)
  85 {
  86         struct page *page;
  87         struct page *tmp;
  88
  89         BUG_ON(!n);
  90         BUG_ON(!head);
  91
  92         page = *head;
  93
  94         if (!page)
  95                 return NULL;
  96
  97         while (page) {
  98                 tmp = page_chain_next(page);
  99                 if (--n == 0)
 100                         break; /* found sufficient pages */
 101                 if (tmp == NULL)
 102                         /* insufficient pages, don't use any of them. */
 103                         return NULL;
 104                 page = tmp;
 105         }
 106
 107         /* add end of list marker for the returned list */
 108         set_page_private(page, 0);
 109         /* actual return value, and adjustment of head */
 110         page = *head;
 111         *head = tmp;
 112         return page;
 113 }
 114
 115 /* may be used outside of locks to find the tail of a (usually short)
 116  * "private" page chain, before adding it back to a global chain head
 117  * with page_chain_add() under a spinlock. */
 118 static struct page *page_chain_tail(struct page *page, int *len)
 119 {
 120         struct page *tmp;
 121         int i = 1;
 122         while ((tmp = page_chain_next(page)))
 123                 ++i, page = tmp;
 124         if (len)
 125                 *len = i;
 126         return page;
 127 }
 128
 129 static int page_chain_free(struct page *page)
 130 {
 131         struct page *tmp;
 132         int i = 0;
 133         page_chain_for_each_safe(page, tmp) {
 134                 put_page(page);
 135                 ++i;
 136         }
 137         return i;
 138 }
 139
 140 static void page_chain_add(struct page **head,
 141                 struct page *chain_first, struct page *chain_last)
 142 {
 143 #if 1
 144         struct page *tmp;
 145         tmp = page_chain_tail(chain_first, NULL);
 146         BUG_ON(tmp != chain_last);
 147 #endif
 148
 149         /* add chain to head */
 150         set_page_private(chain_last, (unsigned long)*head);
 151         *head = chain_first;
 152 }
 153
 154 static struct page *__drbd_alloc_pages(struct drbd_device *device,
 155                                        unsigned int number)
 156 {
 157         struct page *page = NULL;
 158         struct page *tmp = NULL;
 159         unsigned int i = 0;
 160
 161         /* Yes, testing drbd_pp_vacant outside the lock is racy.
 162          * So what. It saves a spin_lock. */
 163         if (drbd_pp_vacant >= number) {
 164                 spin_lock(&drbd_pp_lock);
 165                 page = page_chain_del(&drbd_pp_pool, number);
 166                 if (page)
 167                         drbd_pp_vacant -= number;
 168                 spin_unlock(&drbd_pp_lock);
 169                 if (page)
 170                         return page;
 171         }
 172
 173         /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
 174          * "criss-cross" setup, that might cause write-out on some other DRBD,
 175          * which in turn might block on the other node at this very place.  */
 176         for (i = 0; i < number; i++) {
 177                 tmp = alloc_page(GFP_TRY);
 178                 if (!tmp)
 179                         break;
 180                 set_page_private(tmp, (unsigned long)page);
 181                 page = tmp;
 182         }
 183
 184         if (i == number)
 185                 return page;
 186
 187         /* Not enough pages immediately available this time.
 188          * No need to jump around here, drbd_alloc_pages will retry this
 189          * function "soon". */
 190         if (page) {
 191                 tmp = page_chain_tail(page, NULL);
 192                 spin_lock(&drbd_pp_lock);
 193                 page_chain_add(&drbd_pp_pool, page, tmp);
 194                 drbd_pp_vacant += i;
 195                 spin_unlock(&drbd_pp_lock);
 196         }
 197         return NULL;
 198 }
 199
 200 static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
 201                                            struct list_head *to_be_freed)
 202 {
 203         struct drbd_peer_request *peer_req, *tmp;
 204
 205         /* The EEs are always appended to the end of the list. Since
 206            they are sent in order over the wire, they have to finish
 207            in order. As soon as we see the first not finished we can
 208            stop to examine the list... */
 209
 210         list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
 211                 if (drbd_peer_req_has_active_page(peer_req))
 212                         break;
 213                 list_move(&peer_req->w.list, to_be_freed);
 214         }
 215 }
 216
 217 static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
 218 {
 219         LIST_HEAD(reclaimed);
 220         struct drbd_peer_request *peer_req, *t;
 221
 222         spin_lock_irq(&device->resource->req_lock);
 223         reclaim_finished_net_peer_reqs(device, &reclaimed);
 224         spin_unlock_irq(&device->resource->req_lock);
 225
 226         list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
 227                 drbd_free_net_peer_req(device, peer_req);
 228 }
 229
 230 /**
 231  * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
 232  * @device:     DRBD device.
 233  * @number:     number of pages requested
 234  * @retry:      whether to retry, if not enough pages are available right now
 235  *
 236  * Tries to allocate number pages, first from our own page pool, then from
 237  * the kernel, unless this allocation would exceed the max_buffers setting.
 238  * Possibly retry until DRBD frees sufficient pages somewhere else.
 239  *
 240  * Returns a page chain linked via page->private.
 241  */
 242 struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
 243                               bool retry)
 244 {
 245         struct drbd_device *device = peer_device->device;
 246         struct page *page = NULL;
 247         struct net_conf *nc;
 248         DEFINE_WAIT(wait);
 249         int mxb;
 250
 251         /* Yes, we may run up to @number over max_buffers. If we
 252          * follow it strictly, the admin will get it wrong anyways. */
 253         rcu_read_lock();
 254         nc = rcu_dereference(peer_device->connection->net_conf);
 255         mxb = nc ? nc->max_buffers : 1000000;
 256         rcu_read_unlock();
 257
 258         if (atomic_read(&device->pp_in_use) < mxb)
 259                 page = __drbd_alloc_pages(device, number);
 260
 261         while (page == NULL) {
 262                 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
 263
 264                 drbd_kick_lo_and_reclaim_net(device);
 265
 266                 if (atomic_read(&device->pp_in_use) < mxb) {
 267                         page = __drbd_alloc_pages(device, number);
 268                         if (page)
 269                                 break;
 270                 }
 271
 272                 if (!retry)
 273                         break;
 274
 275                 if (signal_pending(current)) {
 276                         drbd_warn(device, "drbd_alloc_pages interrupted!\n");
 277                         break;
 278                 }
 279
 280                 schedule();
 281         }
 282         finish_wait(&drbd_pp_wait, &wait);
 283
 284         if (page)
 285                 atomic_add(number, &device->pp_in_use);
 286         return page;
 287 }
 288
 289 /* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
 290  * Is also used from inside an other spin_lock_irq(&resource->req_lock);
 291  * Either links the page chain back to the global pool,
 292  * or returns all pages to the system. */
 293 static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
 294 {
 295         atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
 296         int i;
 297
 298         if (page == NULL)
 299                 return;
 300
 301         if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
 302                 i = page_chain_free(page);
 303         else {
 304                 struct page *tmp;
 305                 tmp = page_chain_tail(page, &i);
 306                 spin_lock(&drbd_pp_lock);
 307                 page_chain_add(&drbd_pp_pool, page, tmp);
 308                 drbd_pp_vacant += i;
 309                 spin_unlock(&drbd_pp_lock);
 310         }
 311         i = atomic_sub_return(i, a);
 312         if (i < 0)
 313                 drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
 314                         is_net ? "pp_in_use_by_net" : "pp_in_use", i);
 315         wake_up(&drbd_pp_wait);
 316 }
 317
 318 /*
 319 You need to hold the req_lock:
 320  _drbd_wait_ee_list_empty()
 321
 322 You must not have the req_lock:
 323  drbd_free_peer_req()
 324  drbd_alloc_peer_req()
 325  drbd_free_peer_reqs()
 326  drbd_ee_fix_bhs()
 327  drbd_finish_peer_reqs()
 328  drbd_clear_done_ee()
 329  drbd_wait_ee_list_empty()
 330 */
 331
 332 struct drbd_peer_request *
 333 drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 334                     unsigned int data_size, gfp_t gfp_mask) __must_hold(local)
 335 {
 336         struct drbd_device *device = peer_device->device;
 337         struct drbd_peer_request *peer_req;
 338         struct page *page = NULL;
 339         unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
 340
 341         if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
 342                 return NULL;
 343
 344         peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
 345         if (!peer_req) {
 346                 if (!(gfp_mask & __GFP_NOWARN))
 347                         drbd_err(device, "%s: allocation failed\n", __func__);
 348                 return NULL;
 349         }
 350
 351         if (data_size) {
 352                 page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT));
 353                 if (!page)
 354                         goto fail;
 355         }
 356
 357         drbd_clear_interval(&peer_req->i);
 358         peer_req->i.size = data_size;
 359         peer_req->i.sector = sector;
 360         peer_req->i.local = false;
 361         peer_req->i.waiting = false;
 362
 363         peer_req->epoch = NULL;
 364         peer_req->peer_device = peer_device;
 365         peer_req->pages = page;
 366         atomic_set(&peer_req->pending_bios, 0);
 367         peer_req->flags = 0;
 368         /*
 369          * The block_id is opaque to the receiver.  It is not endianness
 370          * converted, and sent back to the sender unchanged.
 371          */
 372         peer_req->block_id = id;
 373
 374         return peer_req;
 375
 376  fail:
 377         mempool_free(peer_req, drbd_ee_mempool);
 378         return NULL;
 379 }
 380
 381 void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
 382                        int is_net)
 383 {
 384         if (peer_req->flags & EE_HAS_DIGEST)
 385                 kfree(peer_req->digest);
 386         drbd_free_pages(device, peer_req->pages, is_net);
 387         D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
 388         D_ASSERT(device, drbd_interval_empty(&peer_req->i));
 389         mempool_free(peer_req, drbd_ee_mempool);
 390 }
 391
 392 int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
 393 {
 394         LIST_HEAD(work_list);
 395         struct drbd_peer_request *peer_req, *t;
 396         int count = 0;
 397         int is_net = list == &device->net_ee;
 398
 399         spin_lock_irq(&device->resource->req_lock);
 400         list_splice_init(list, &work_list);
 401         spin_unlock_irq(&device->resource->req_lock);
 402
 403         list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
 404                 __drbd_free_peer_req(device, peer_req, is_net);
 405                 count++;
 406         }
 407         return count;
 408 }
 409
 410 /*
 411  * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
 412  */
 413 static int drbd_finish_peer_reqs(struct drbd_device *device)
 414 {
 415         LIST_HEAD(work_list);
 416         LIST_HEAD(reclaimed);
 417         struct drbd_peer_request *peer_req, *t;
 418         int err = 0;
 419
 420         spin_lock_irq(&device->resource->req_lock);
 421         reclaim_finished_net_peer_reqs(device, &reclaimed);
 422         list_splice_init(&device->done_ee, &work_list);
 423         spin_unlock_irq(&device->resource->req_lock);
 424
 425         list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
 426                 drbd_free_net_peer_req(device, peer_req);
 427
 428         /* possible callbacks here:
 429          * e_end_block, and e_end_resync_block, e_send_superseded.
 430          * all ignore the last argument.
 431          */
 432         list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
 433                 int err2;
 434
 435                 /* list_del not necessary, next/prev members not touched */
 436                 err2 = peer_req->w.cb(&peer_req->w, !!err);
 437                 if (!err)
 438                         err = err2;
 439                 drbd_free_peer_req(device, peer_req);
 440         }
 441         wake_up(&device->ee_wait);
 442
 443         return err;
 444 }
 445
 446 static void _drbd_wait_ee_list_empty(struct drbd_device *device,
 447                                      struct list_head *head)
 448 {
 449         DEFINE_WAIT(wait);
 450
 451         /* avoids spin_lock/unlock
 452          * and calling prepare_to_wait in the fast path */
 453         while (!list_empty(head)) {
 454                 prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
 455                 spin_unlock_irq(&device->resource->req_lock);
 456                 io_schedule();
 457                 finish_wait(&device->ee_wait, &wait);
 458                 spin_lock_irq(&device->resource->req_lock);
 459         }
 460 }
 461
 462 static void drbd_wait_ee_list_empty(struct drbd_device *device,
 463                                     struct list_head *head)
 464 {
 465         spin_lock_irq(&device->resource->req_lock);
 466         _drbd_wait_ee_list_empty(device, head);
 467         spin_unlock_irq(&device->resource->req_lock);
 468 }
 469
 470 static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
 471 {
 472         struct kvec iov = {
 473                 .iov_base = buf,
 474                 .iov_len = size,
 475         };
 476         struct msghdr msg = {
 477                 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
 478         };
 479         return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags);
 480 }
 481
 482 static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
 483 {
 484         int rv;
 485
 486         rv = drbd_recv_short(connection->data.socket, buf, size, 0);
 487
 488         if (rv < 0) {
 489                 if (rv == -ECONNRESET)
 490                         drbd_info(connection, "sock was reset by peer\n");
 491                 else if (rv != -ERESTARTSYS)
 492                         drbd_err(connection, "sock_recvmsg returned %d\n", rv);
 493         } else if (rv == 0) {
 494                 if (test_bit(DISCONNECT_SENT, &connection->flags)) {
 495                         long t;
 496                         rcu_read_lock();
 497                         t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
 498                         rcu_read_unlock();
 499
 500                         t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t);
 501
 502                         if (t)
 503                                 goto out;
 504                 }
 505                 drbd_info(connection, "sock was shut down by peer\n");
 506         }
 507
 508         if (rv != size)
 509                 conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
 510
 511 out:
 512         return rv;
 513 }
 514
 515 static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size)
 516 {
 517         int err;
 518
 519         err = drbd_recv(connection, buf, size);
 520         if (err != size) {
 521                 if (err >= 0)
 522                         err = -EIO;
 523         } else
 524                 err = 0;
 525         return err;
 526 }
 527
 528 static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size)
 529 {
 530         int err;
 531
 532         err = drbd_recv_all(connection, buf, size);
 533         if (err && !signal_pending(current))
 534                 drbd_warn(connection, "short read (expected size %d)\n", (int)size);
 535         return err;
 536 }
 537
 538 /* quoting tcp(7):
 539  *   On individual connections, the socket buffer size must be set prior to the
 540  *   listen(2) or connect(2) calls in order to have it take effect.
 541  * This is our wrapper to do so.
 542  */
 543 static void drbd_setbufsize(struct socket *sock, unsigned int snd,
 544                 unsigned int rcv)
 545 {
 546         /* open coded SO_SNDBUF, SO_RCVBUF */
 547         if (snd) {
 548                 sock->sk->sk_sndbuf = snd;
 549                 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 550         }
 551         if (rcv) {
 552                 sock->sk->sk_rcvbuf = rcv;
 553                 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 554         }
 555 }
 556
 557 static struct socket *drbd_try_connect(struct drbd_connection *connection)
 558 {
 559         const char *what;
 560         struct socket *sock;
 561         struct sockaddr_in6 src_in6;
 562         struct sockaddr_in6 peer_in6;
 563         struct net_conf *nc;
 564         int err, peer_addr_len, my_addr_len;
 565         int sndbuf_size, rcvbuf_size, connect_int;
 566         int disconnect_on_error = 1;
 567
 568         rcu_read_lock();
 569         nc = rcu_dereference(connection->net_conf);
 570         if (!nc) {
 571                 rcu_read_unlock();
 572                 return NULL;
 573         }
 574         sndbuf_size = nc->sndbuf_size;
 575         rcvbuf_size = nc->rcvbuf_size;
 576         connect_int = nc->connect_int;
 577         rcu_read_unlock();
 578
 579         my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
 580         memcpy(&src_in6, &connection->my_addr, my_addr_len);
 581
 582         if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
 583                 src_in6.sin6_port = 0;
 584         else
 585                 ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
 586
 587         peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6));
 588         memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);
 589
 590         what = "sock_create_kern";
 591         err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family,
 592                                SOCK_STREAM, IPPROTO_TCP, &sock);
 593         if (err < 0) {
 594                 sock = NULL;
 595                 goto out;
 596         }
 597
 598         sock->sk->sk_rcvtimeo =
 599         sock->sk->sk_sndtimeo = connect_int * HZ;
 600         drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
 601
 602        /* explicitly bind to the configured IP as source IP
 603         *  for the outgoing connections.
 604         *  This is needed for multihomed hosts and to be
 605         *  able to use lo: interfaces for drbd.
 606         * Make sure to use 0 as port number, so linux selects
 607         *  a free one dynamically.
 608         */
 609         what = "bind before connect";
 610         err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
 611         if (err < 0)
 612                 goto out;
 613
 614         /* connect may fail, peer not yet available.
 615          * stay C_WF_CONNECTION, don't go Disconnecting! */
 616         disconnect_on_error = 0;
 617         what = "connect";
 618         err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
 619
 620 out:
 621         if (err < 0) {
 622                 if (sock) {
 623                         sock_release(sock);
 624                         sock = NULL;
 625                 }
 626                 switch (-err) {
 627                         /* timeout, busy, signal pending */
 628                 case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
 629                 case EINTR: case ERESTARTSYS:
 630                         /* peer not (yet) available, network problem */
 631                 case ECONNREFUSED: case ENETUNREACH:
 632                 case EHOSTDOWN:    case EHOSTUNREACH:
 633                         disconnect_on_error = 0;
 634                         break;
 635                 default:
 636                         drbd_err(connection, "%s failed, err = %d\n", what, err);
 637                 }
 638                 if (disconnect_on_error)
 639                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
 640         }
 641
 642         return sock;
 643 }
 644
 645 struct accept_wait_data {
 646         struct drbd_connection *connection;
 647         struct socket *s_listen;
 648         struct completion door_bell;
 649         void (*original_sk_state_change)(struct sock *sk);
 650
 651 };
 652
 653 static void drbd_incoming_connection(struct sock *sk)
 654 {
 655         struct accept_wait_data *ad = sk->sk_user_data;
 656         void (*state_change)(struct sock *sk);
 657
 658         state_change = ad->original_sk_state_change;
 659         if (sk->sk_state == TCP_ESTABLISHED)
 660                 complete(&ad->door_bell);
 661         state_change(sk);
 662 }
 663
 664 static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
 665 {
 666         int err, sndbuf_size, rcvbuf_size, my_addr_len;
 667         struct sockaddr_in6 my_addr;
 668         struct socket *s_listen;
 669         struct net_conf *nc;
 670         const char *what;
 671
 672         rcu_read_lock();
 673         nc = rcu_dereference(connection->net_conf);
 674         if (!nc) {
 675                 rcu_read_unlock();
 676                 return -EIO;
 677         }
 678         sndbuf_size = nc->sndbuf_size;
 679         rcvbuf_size = nc->rcvbuf_size;
 680         rcu_read_unlock();
 681
 682         my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
 683         memcpy(&my_addr, &connection->my_addr, my_addr_len);
 684
 685         what = "sock_create_kern";
 686         err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family,
 687                                SOCK_STREAM, IPPROTO_TCP, &s_listen);
 688         if (err) {
 689                 s_listen = NULL;
 690                 goto out;
 691         }
 692
 693         s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
 694         drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
 695
 696         what = "bind before listen";
 697         err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
 698         if (err < 0)
 699                 goto out;
 700
 701         ad->s_listen = s_listen;
 702         write_lock_bh(&s_listen->sk->sk_callback_lock);
 703         ad->original_sk_state_change = s_listen->sk->sk_state_change;
 704         s_listen->sk->sk_state_change = drbd_incoming_connection;
 705         s_listen->sk->sk_user_data = ad;
 706         write_unlock_bh(&s_listen->sk->sk_callback_lock);
 707
 708         what = "listen";
 709         err = s_listen->ops->listen(s_listen, 5);
 710         if (err < 0)
 711                 goto out;
 712
 713         return 0;
 714 out:
 715         if (s_listen)
 716                 sock_release(s_listen);
 717         if (err < 0) {
 718                 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
 719                         drbd_err(connection, "%s failed, err = %d\n", what, err);
 720                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
 721                 }
 722         }
 723
 724         return -EIO;
 725 }
 726
 727 static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
 728 {
 729         write_lock_bh(&sk->sk_callback_lock);
 730         sk->sk_state_change = ad->original_sk_state_change;
 731         sk->sk_user_data = NULL;
 732         write_unlock_bh(&sk->sk_callback_lock);
 733 }
 734
 735 static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
 736 {
 737         int timeo, connect_int, err = 0;
 738         struct socket *s_estab = NULL;
 739         struct net_conf *nc;
 740
 741         rcu_read_lock();
 742         nc = rcu_dereference(connection->net_conf);
 743         if (!nc) {
 744                 rcu_read_unlock();
 745                 return NULL;
 746         }
 747         connect_int = nc->connect_int;
 748         rcu_read_unlock();
 749
 750         timeo = connect_int * HZ;
 751         /* 28.5% random jitter */
 752         timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7;
 753
 754         err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
 755         if (err <= 0)
 756                 return NULL;
 757
 758         err = kernel_accept(ad->s_listen, &s_estab, 0);
 759         if (err < 0) {
 760                 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
 761                         drbd_err(connection, "accept failed, err = %d\n", err);
 762                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
 763                 }
 764         }
 765
 766         if (s_estab)
 767                 unregister_state_change(s_estab->sk, ad);
 768
 769         return s_estab;
 770 }
 771
 772 static int decode_header(struct drbd_connection *, void *, struct packet_info *);
 773
 774 static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
 775                              enum drbd_packet cmd)
 776 {
 777         if (!conn_prepare_command(connection, sock))
 778                 return -EIO;
 779         return conn_send_command(connection, sock, cmd, 0, NULL, 0);
 780 }
 781
 782 static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
 783 {
 784         unsigned int header_size = drbd_header_size(connection);
 785         struct packet_info pi;
 786         int err;
 787
 788         err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
 789         if (err != header_size) {
 790                 if (err >= 0)
 791                         err = -EIO;
 792                 return err;
 793         }
 794         err = decode_header(connection, connection->data.rbuf, &pi);
 795         if (err)
 796                 return err;
 797         return pi.cmd;
 798 }
 799
 800 /**
 801  * drbd_socket_okay() - Free the socket if its connection is not okay
 802  * @sock:       pointer to the pointer to the socket.
 803  */
 804 static int drbd_socket_okay(struct socket **sock)
 805 {
 806         int rr;
 807         char tb[4];
 808
 809         if (!*sock)
 810                 return false;
 811
 812         rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
 813
 814         if (rr > 0 || rr == -EAGAIN) {
 815                 return true;
 816         } else {
 817                 sock_release(*sock);
 818                 *sock = NULL;
 819                 return false;
 820         }
 821 }
 822 /* Gets called if a connection is established, or if a new minor gets created
 823    in a connection */
 824 int drbd_connected(struct drbd_peer_device *peer_device)
 825 {
 826         struct drbd_device *device = peer_device->device;
 827         int err;
 828
 829         atomic_set(&device->packet_seq, 0);
 830         device->peer_seq = 0;
 831
 832         device->state_mutex = peer_device->connection->agreed_pro_version < 100 ?
 833                 &peer_device->connection->cstate_mutex :
 834                 &device->own_state_mutex;
 835
 836         err = drbd_send_sync_param(peer_device);
 837         if (!err)
 838                 err = drbd_send_sizes(peer_device, 0, 0);
 839         if (!err)
 840                 err = drbd_send_uuids(peer_device);
 841         if (!err)
 842                 err = drbd_send_current_state(peer_device);
 843         clear_bit(USE_DEGR_WFC_T, &device->flags);
 844         clear_bit(RESIZE_PENDING, &device->flags);
 845         atomic_set(&device->ap_in_flight, 0);
 846         mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
 847         return err;
 848 }
 849
 850 /*
 851  * return values:
 852  *   1 yes, we have a valid connection
 853  *   0 oops, did not work out, please try again
 854  *  -1 peer talks different language,
 855  *     no point in trying again, please go standalone.
 856  *  -2 We do not have a network config...
 857  */
 858 static int conn_connect(struct drbd_connection *connection)
 859 {
 860         struct drbd_socket sock, msock;
 861         struct drbd_peer_device *peer_device;
 862         struct net_conf *nc;
 863         int vnr, timeout, h, ok;
 864         bool discard_my_data;
 865         enum drbd_state_rv rv;
 866         struct accept_wait_data ad = {
 867                 .connection = connection,
 868                 .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
 869         };
 870
 871         clear_bit(DISCONNECT_SENT, &connection->flags);
 872         if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
 873                 return -2;
 874
 875         mutex_init(&sock.mutex);
 876         sock.sbuf = connection->data.sbuf;
 877         sock.rbuf = connection->data.rbuf;
 878         sock.socket = NULL;
 879         mutex_init(&msock.mutex);
 880         msock.sbuf = connection->meta.sbuf;
 881         msock.rbuf = connection->meta.rbuf;
 882         msock.socket = NULL;
 883
 884         /* Assume that the peer only understands protocol 80 until we know better.  */
 885         connection->agreed_pro_version = 80;
 886
 887         if (prepare_listen_socket(connection, &ad))
 888                 return 0;
 889
 890         do {
 891                 struct socket *s;
 892
 893                 s = drbd_try_connect(connection);
 894                 if (s) {
 895                         if (!sock.socket) {
 896                                 sock.socket = s;
 897                                 send_first_packet(connection, &sock, P_INITIAL_DATA);
 898                         } else if (!msock.socket) {
 899                                 clear_bit(RESOLVE_CONFLICTS, &connection->flags);
 900                                 msock.socket = s;
 901                                 send_first_packet(connection, &msock, P_INITIAL_META);
 902                         } else {
 903                                 drbd_err(connection, "Logic error in conn_connect()\n");
 904                                 goto out_release_sockets;
 905                         }
 906                 }
 907
 908                 if (sock.socket && msock.socket) {
 909                         rcu_read_lock();
 910                         nc = rcu_dereference(connection->net_conf);
 911                         timeout = nc->ping_timeo * HZ / 10;
 912                         rcu_read_unlock();
 913                         schedule_timeout_interruptible(timeout);
 914                         ok = drbd_socket_okay(&sock.socket);
 915                         ok = drbd_socket_okay(&msock.socket) && ok;
 916                         if (ok)
 917                                 break;
 918                 }
 919
 920 retry:
 921                 s = drbd_wait_for_connect(connection, &ad);
 922                 if (s) {
 923                         int fp = receive_first_packet(connection, s);
 924                         drbd_socket_okay(&sock.socket);
 925                         drbd_socket_okay(&msock.socket);
 926                         switch (fp) {
 927                         case P_INITIAL_DATA:
 928                                 if (sock.socket) {
 929                                         drbd_warn(connection, "initial packet S crossed\n");
 930                                         sock_release(sock.socket);
 931                                         sock.socket = s;
 932                                         goto randomize;
 933                                 }
 934                                 sock.socket = s;
 935                                 break;
 936                         case P_INITIAL_META:
 937                                 set_bit(RESOLVE_CONFLICTS, &connection->flags);
 938                                 if (msock.socket) {
 939                                         drbd_warn(connection, "initial packet M crossed\n");
 940                                         sock_release(msock.socket);
 941                                         msock.socket = s;
 942                                         goto randomize;
 943                                 }
 944                                 msock.socket = s;
 945                                 break;
 946                         default:
 947                                 drbd_warn(connection, "Error receiving initial packet\n");
 948                                 sock_release(s);
 949 randomize:
 950                                 if (prandom_u32() & 1)
 951                                         goto retry;
 952                         }
 953                 }
 954
 955                 if (connection->cstate <= C_DISCONNECTING)
 956                         goto out_release_sockets;
 957                 if (signal_pending(current)) {
 958                         flush_signals(current);
 959                         smp_rmb();
 960                         if (get_t_state(&connection->receiver) == EXITING)
 961                                 goto out_release_sockets;
 962                 }
 963
 964                 ok = drbd_socket_okay(&sock.socket);
 965                 ok = drbd_socket_okay(&msock.socket) && ok;
 966         } while (!ok);
 967
 968         if (ad.s_listen)
 969                 sock_release(ad.s_listen);
 970
 971         sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
 972         msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
 973
 974         sock.socket->sk->sk_allocation = GFP_NOIO;
 975         msock.socket->sk->sk_allocation = GFP_NOIO;
 976
 977         sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
 978         msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
 979
 980         /* NOT YET ...
 981          * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10;
 982          * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
 983          * first set it to the P_CONNECTION_FEATURES timeout,
 984          * which we set to 4x the configured ping_timeout. */
 985         rcu_read_lock();
 986         nc = rcu_dereference(connection->net_conf);
 987
 988         sock.socket->sk->sk_sndtimeo =
 989         sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
 990
 991         msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
 992         timeout = nc->timeout * HZ / 10;
 993         discard_my_data = nc->discard_my_data;
 994         rcu_read_unlock();
 995
 996         msock.socket->sk->sk_sndtimeo = timeout;
 997
 998         /* we don't want delays.
 999          * we use TCP_CORK where appropriate, though */
1000         drbd_tcp_nodelay(sock.socket);
1001         drbd_tcp_nodelay(msock.socket);
1002
1003         connection->data.socket = sock.socket;
1004         connection->meta.socket = msock.socket;
1005         connection->last_received = jiffies;
1006
1007         h = drbd_do_features(connection);
1008         if (h <= 0)
1009                 return h;
1010
1011         if (connection->cram_hmac_tfm) {
1012                 /* drbd_request_state(device, NS(conn, WFAuth)); */
1013                 switch (drbd_do_auth(connection)) {
1014                 case -1:
1015                         drbd_err(connection, "Authentication of peer failed\n");
1016                         return -1;
1017                 case 0:
1018                         drbd_err(connection, "Authentication of peer failed, trying again.\n");
1019                         return 0;
1020                 }
1021         }
1022
1023         connection->data.socket->sk->sk_sndtimeo = timeout;
1024         connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1025
1026         if (drbd_send_protocol(connection) == -EOPNOTSUPP)
1027                 return -1;
1028
1029         set_bit(STATE_SENT, &connection->flags);
1030
1031         rcu_read_lock();
1032         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1033                 struct drbd_device *device = peer_device->device;
1034                 kref_get(&device->kref);
1035                 rcu_read_unlock();
1036
1037                 /* Prevent a race between resync-handshake and
1038                  * being promoted to Primary.
1039                  *
1040                  * Grab and release the state mutex, so we know that any current
1041                  * drbd_set_role() is finished, and any incoming drbd_set_role
1042                  * will see the STATE_SENT flag, and wait for it to be cleared.
1043                  */
1044                 mutex_lock(device->state_mutex);
1045                 mutex_unlock(device->state_mutex);
1046
1047                 if (discard_my_data)
1048                         set_bit(DISCARD_MY_DATA, &device->flags);
1049                 else
1050                         clear_bit(DISCARD_MY_DATA, &device->flags);
1051
1052                 drbd_connected(peer_device);
1053                 kref_put(&device->kref, drbd_destroy_device);
1054                 rcu_read_lock();
1055         }
1056         rcu_read_unlock();
1057
1058         rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
1059         if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) {
1060                 clear_bit(STATE_SENT, &connection->flags);
1061                 return 0;
1062         }
1063
1064         drbd_thread_start(&connection->asender);
1065
1066         mutex_lock(&connection->resource->conf_update);
1067         /* The discard_my_data flag is a single-shot modifier to the next
1068          * connection attempt, the handshake of which is now well underway.
1069          * No need for rcu style copying of the whole struct
1070          * just to clear a single value. */
1071         connection->net_conf->discard_my_data = 0;
1072         mutex_unlock(&connection->resource->conf_update);
1073
1074         return h;
1075
1076 out_release_sockets:
1077         if (ad.s_listen)
1078                 sock_release(ad.s_listen);
1079         if (sock.socket)
1080                 sock_release(sock.socket);
1081         if (msock.socket)
1082                 sock_release(msock.socket);
1083         return -1;
1084 }
1085
1086 static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi)
1087 {
1088         unsigned int header_size = drbd_header_size(connection);
1089
1090         if (header_size == sizeof(struct p_header100) &&
1091             *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
1092                 struct p_header100 *h = header;
1093                 if (h->pad != 0) {
1094                         drbd_err(connection, "Header padding is not zero\n");
1095                         return -EINVAL;
1096                 }
1097                 pi->vnr = be16_to_cpu(h->volume);
1098                 pi->cmd = be16_to_cpu(h->command);
1099                 pi->size = be32_to_cpu(h->length);
1100         } else if (header_size == sizeof(struct p_header95) &&
1101                    *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
1102                 struct p_header95 *h = header;
1103                 pi->cmd = be16_to_cpu(h->command);
1104                 pi->size = be32_to_cpu(h->length);
1105                 pi->vnr = 0;
1106         } else if (header_size == sizeof(struct p_header80) &&
1107                    *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
1108                 struct p_header80 *h = header;
1109                 pi->cmd = be16_to_cpu(h->command);
1110                 pi->size = be16_to_cpu(h->length);
1111                 pi->vnr = 0;
1112         } else {
1113                 drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n",
1114                          be32_to_cpu(*(__be32 *)header),
1115                          connection->agreed_pro_version);
1116                 return -EINVAL;
1117         }
1118         pi->data = header + header_size;
1119         return 0;
1120 }
1121
1122 static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi)
1123 {
1124         void *buffer = connection->data.rbuf;
1125         int err;
1126
1127         err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection));
1128         if (err)
1129                 return err;
1130
1131         err = decode_header(connection, buffer, pi);
1132         connection->last_received = jiffies;
1133
1134         return err;
1135 }
1136
1137 static void drbd_flush(struct drbd_connection *connection)
1138 {
1139         int rv;
1140         struct drbd_peer_device *peer_device;
1141         int vnr;
1142
1143         if (connection->write_ordering >= WO_bdev_flush) {
1144                 rcu_read_lock();
1145                 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1146                         struct drbd_device *device = peer_device->device;
1147
1148                         if (!get_ldev(device))
1149                                 continue;
1150                         kref_get(&device->kref);
1151                         rcu_read_unlock();
1152
1153                         rv = blkdev_issue_flush(device->ldev->backing_bdev,
1154                                         GFP_NOIO, NULL);
1155                         if (rv) {
1156                                 drbd_info(device, "local disk flush failed with status %d\n", rv);
1157                                 /* would rather check on EOPNOTSUPP, but that is not reliable.
1158                                  * don't try again for ANY return value != 0
1159                                  * if (rv == -EOPNOTSUPP) */
1160                                 drbd_bump_write_ordering(connection, WO_drain_io);
1161                         }
1162                         put_ldev(device);
1163                         kref_put(&device->kref, drbd_destroy_device);
1164
1165                         rcu_read_lock();
1166                         if (rv)
1167                                 break;
1168                 }
1169                 rcu_read_unlock();
1170         }
1171 }
1172
1173 /**
1174  * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
1175  * @device:     DRBD device.
1176  * @epoch:      Epoch object.
1177  * @ev:         Epoch event.
1178  */
1179 static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection,
1180                                                struct drbd_epoch *epoch,
1181                                                enum epoch_event ev)
1182 {
1183         int epoch_size;
1184         struct drbd_epoch *next_epoch;
1185         enum finish_epoch rv = FE_STILL_LIVE;
1186
1187         spin_lock(&connection->epoch_lock);
1188         do {
1189                 next_epoch = NULL;
1190
1191                 epoch_size = atomic_read(&epoch->epoch_size);
1192
1193                 switch (ev & ~EV_CLEANUP) {
1194                 case EV_PUT:
1195                         atomic_dec(&epoch->active);
1196                         break;
1197                 case EV_GOT_BARRIER_NR:
1198                         set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1199                         break;
1200                 case EV_BECAME_LAST:
1201                         /* nothing to do*/
1202                         break;
1203                 }
1204
1205                 if (epoch_size != 0 &&
1206                     atomic_read(&epoch->active) == 0 &&
1207                     (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
1208                         if (!(ev & EV_CLEANUP)) {
1209                                 spin_unlock(&connection->epoch_lock);
1210                                 drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size);
1211                                 spin_lock(&connection->epoch_lock);
1212                         }
1213 #if 0
1214                         /* FIXME: dec unacked on connection, once we have
1215                          * something to count pending connection packets in. */
1216                         if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
1217                                 dec_unacked(epoch->connection);
1218 #endif
1219
1220                         if (connection->current_epoch != epoch) {
1221                                 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1222                                 list_del(&epoch->list);
1223                                 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
1224                                 connection->epochs--;
1225                                 kfree(epoch);
1226
1227                                 if (rv == FE_STILL_LIVE)
1228                                         rv = FE_DESTROYED;
1229                         } else {
1230                                 epoch->flags = 0;
1231                                 atomic_set(&epoch->epoch_size, 0);
1232                                 /* atomic_set(&epoch->active, 0); is already zero */
1233                                 if (rv == FE_STILL_LIVE)
1234                                         rv = FE_RECYCLED;
1235                         }
1236                 }
1237
1238                 if (!next_epoch)
1239                         break;
1240
1241                 epoch = next_epoch;
1242         } while (1);
1243
1244         spin_unlock(&connection->epoch_lock);
1245
1246         return rv;
1247 }
1248
1249 /**
1250  * drbd_bump_write_ordering() - Fall back to an other write ordering method
1251  * @connection: DRBD connection.
1252  * @wo:         Write ordering method to try.
1253  */
1254 void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo)
1255 {
1256         struct disk_conf *dc;
1257         struct drbd_peer_device *peer_device;
1258         enum write_ordering_e pwo;
1259         int vnr;
1260         static char *write_ordering_str[] = {
1261                 [WO_none] = "none",
1262                 [WO_drain_io] = "drain",
1263                 [WO_bdev_flush] = "flush",
1264         };
1265
1266         pwo = connection->write_ordering;
1267         wo = min(pwo, wo);
1268         rcu_read_lock();
1269         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1270                 struct drbd_device *device = peer_device->device;
1271
1272                 if (!get_ldev_if_state(device, D_ATTACHING))
1273                         continue;
1274                 dc = rcu_dereference(device->ldev->disk_conf);
1275
1276                 if (wo == WO_bdev_flush && !dc->disk_flushes)
1277                         wo = WO_drain_io;
1278                 if (wo == WO_drain_io && !dc->disk_drain)
1279                         wo = WO_none;
1280                 put_ldev(device);
1281         }
1282         rcu_read_unlock();
1283         connection->write_ordering = wo;
1284         if (pwo != connection->write_ordering || wo == WO_bdev_flush)
1285                 drbd_info(connection, "Method to ensure write ordering: %s\n", write_ordering_str[connection->write_ordering]);
1286 }
1287
1288 /**
1289  * drbd_submit_peer_request()
1290  * @device:     DRBD device.
1291  * @peer_req:   peer request
1292  * @rw:         flag field, see bio->bi_rw
1293  *
1294  * May spread the pages to multiple bios,
1295  * depending on bio_add_page restrictions.
1296  *
1297  * Returns 0 if all bios have been submitted,
1298  * -ENOMEM if we could not allocate enough bios,
1299  * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
1300  *  single page to an empty bio (which should never happen and likely indicates
1301  *  that the lower level IO stack is in some way broken). This has been observed
1302  *  on certain Xen deployments.
1303  */
1304 /* TODO allocate from our own bio_set. */
1305 int drbd_submit_peer_request(struct drbd_device *device,
1306                              struct drbd_peer_request *peer_req,
1307                              const unsigned rw, const int fault_type)
1308 {
1309         struct bio *bios = NULL;
1310         struct bio *bio;
1311         struct page *page = peer_req->pages;
1312         sector_t sector = peer_req->i.sector;
1313         unsigned ds = peer_req->i.size;
1314         unsigned n_bios = 0;
1315         unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
1316         int err = -ENOMEM;
1317
1318         /* In most cases, we will only need one bio.  But in case the lower
1319          * level restrictions happen to be different at this offset on this
1320          * side than those of the sending peer, we may need to submit the
1321          * request in more than one bio.
1322          *
1323          * Plain bio_alloc is good enough here, this is no DRBD internally
1324          * generated bio, but a bio allocated on behalf of the peer.
1325          */
1326 next_bio:
1327         bio = bio_alloc(GFP_NOIO, nr_pages);
1328         if (!bio) {
1329                 drbd_err(device, "submit_ee: Allocation of a bio failed\n");
1330                 goto fail;
1331         }
1332         /* > peer_req->i.sector, unless this is the first bio */
1333         bio->bi_iter.bi_sector = sector;
1334         bio->bi_bdev = device->ldev->backing_bdev;
1335         bio->bi_rw = rw;
1336         bio->bi_private = peer_req;
1337         bio->bi_end_io = drbd_peer_request_endio;
1338
1339         bio->bi_next = bios;
1340         bios = bio;
1341         ++n_bios;
1342
1343         page_chain_for_each(page) {
1344                 unsigned len = min_t(unsigned, ds, PAGE_SIZE);
1345                 if (!bio_add_page(bio, page, len, 0)) {
1346                         /* A single page must always be possible!
1347                          * But in case it fails anyways,
1348                          * we deal with it, and complain (below). */
1349                         if (bio->bi_vcnt == 0) {
1350                                 drbd_err(device,
1351                                         "bio_add_page failed for len=%u, "
1352                                         "bi_vcnt=0 (bi_sector=%llu)\n",
1353                                         len, (uint64_t)bio->bi_iter.bi_sector);
1354                                 err = -ENOSPC;
1355                                 goto fail;
1356                         }
1357                         goto next_bio;
1358                 }
1359                 ds -= len;
1360                 sector += len >> 9;
1361                 --nr_pages;
1362         }
1363         D_ASSERT(device, page == NULL);
1364         D_ASSERT(device, ds == 0);
1365
1366         atomic_set(&peer_req->pending_bios, n_bios);
1367         do {
1368                 bio = bios;
1369                 bios = bios->bi_next;
1370                 bio->bi_next = NULL;
1371
1372                 drbd_generic_make_request(device, fault_type, bio);
1373         } while (bios);
1374         return 0;
1375
1376 fail:
1377         while (bios) {
1378                 bio = bios;
1379                 bios = bios->bi_next;
1380                 bio_put(bio);
1381         }
1382         return err;
1383 }
1384
1385 static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
1386                                              struct drbd_peer_request *peer_req)
1387 {
1388         struct drbd_interval *i = &peer_req->i;
1389
1390         drbd_remove_interval(&device->write_requests, i);
1391         drbd_clear_interval(i);
1392
1393         /* Wake up any processes waiting for this peer request to complete.  */
1394         if (i->waiting)
1395                 wake_up(&device->misc_wait);
1396 }
1397
1398 static void conn_wait_active_ee_empty(struct drbd_connection *connection)
1399 {
1400         struct drbd_peer_device *peer_device;
1401         int vnr;
1402
1403         rcu_read_lock();
1404         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1405                 struct drbd_device *device = peer_device->device;
1406
1407                 kref_get(&device->kref);
1408                 rcu_read_unlock();
1409                 drbd_wait_ee_list_empty(device, &device->active_ee);
1410                 kref_put(&device->kref, drbd_destroy_device);
1411                 rcu_read_lock();
1412         }
1413         rcu_read_unlock();
1414 }
1415
1416 static struct drbd_peer_device *
1417 conn_peer_device(struct drbd_connection *connection, int volume_number)
1418 {
1419         return idr_find(&connection->peer_devices, volume_number);
1420 }
1421
1422 static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
1423 {
1424         int rv;
1425         struct p_barrier *p = pi->data;
1426         struct drbd_epoch *epoch;
1427
1428         /* FIXME these are unacked on connection,
1429          * not a specific (peer)device.
1430          */
1431         connection->current_epoch->barrier_nr = p->barrier;
1432         connection->current_epoch->connection = connection;
1433         rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR);
1434
1435         /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1436          * the activity log, which means it would not be resynced in case the
1437          * R_PRIMARY crashes now.
1438          * Therefore we must send the barrier_ack after the barrier request was
1439          * completed. */
1440         switch (connection->write_ordering) {
1441         case WO_none:
1442                 if (rv == FE_RECYCLED)
1443                         return 0;
1444
1445                 /* receiver context, in the writeout path of the other node.
1446                  * avoid potential distributed deadlock */
1447                 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1448                 if (epoch)
1449                         break;
1450                 else
1451                         drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
1452                         /* Fall through */
1453
1454         case WO_bdev_flush:
1455         case WO_drain_io:
1456                 conn_wait_active_ee_empty(connection);
1457                 drbd_flush(connection);
1458
1459                 if (atomic_read(&connection->current_epoch->epoch_size)) {
1460                         epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1461                         if (epoch)
1462                                 break;
1463                 }
1464
1465                 return 0;
1466         default:
1467                 drbd_err(connection, "Strangeness in connection->write_ordering %d\n", connection->write_ordering);
1468                 return -EIO;
1469         }
1470
1471         epoch->flags = 0;
1472         atomic_set(&epoch->epoch_size, 0);
1473         atomic_set(&epoch->active, 0);
1474
1475         spin_lock(&connection->epoch_lock);
1476         if (atomic_read(&connection->current_epoch->epoch_size)) {
1477                 list_add(&epoch->list, &connection->current_epoch->list);
1478                 connection->current_epoch = epoch;
1479                 connection->epochs++;
1480         } else {
1481                 /* The current_epoch got recycled while we allocated this one... */
1482                 kfree(epoch);
1483         }
1484         spin_unlock(&connection->epoch_lock);
1485
1486         return 0;
1487 }
1488
1489 /* used from receive_RSDataReply (recv_resync_read)
1490  * and from receive_Data */
1491 static struct drbd_peer_request *
1492 read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1493               int data_size) __must_hold(local)
1494 {
1495         struct drbd_device *device = peer_device->device;
1496         const sector_t capacity = drbd_get_capacity(device->this_bdev);
1497         struct drbd_peer_request *peer_req;
1498         struct page *page;
1499         int dgs, ds, err;
1500         void *dig_in = peer_device->connection->int_dig_in;
1501         void *dig_vv = peer_device->connection->int_dig_vv;
1502         unsigned long *data;
1503
1504         dgs = 0;
1505         if (peer_device->connection->peer_integrity_tfm) {
1506                 dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
1507                 /*
1508                  * FIXME: Receive the incoming digest into the receive buffer
1509                  *        here, together with its struct p_data?
1510                  */
1511                 err = drbd_recv_all_warn(peer_device->connection, dig_in, dgs);
1512                 if (err)
1513                         return NULL;
1514                 data_size -= dgs;
1515         }
1516
1517         if (!expect(IS_ALIGNED(data_size, 512)))
1518                 return NULL;
1519         if (!expect(data_size <= DRBD_MAX_BIO_SIZE))
1520                 return NULL;
1521
1522         /* even though we trust out peer,
1523          * we sometimes have to double check. */
1524         if (sector + (data_size>>9) > capacity) {
1525                 drbd_err(device, "request from peer beyond end of local disk: "
1526                         "capacity: %llus < sector: %llus + size: %u\n",
1527                         (unsigned long long)capacity,
1528                         (unsigned long long)sector, data_size);
1529                 return NULL;
1530         }
1531
1532         /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1533          * "criss-cross" setup, that might cause write-out on some other DRBD,
1534          * which in turn might block on the other node at this very place.  */
1535         peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, GFP_NOIO);
1536         if (!peer_req)
1537                 return NULL;
1538
1539         if (!data_size)
1540                 return peer_req;
1541
1542         ds = data_size;
1543         page = peer_req->pages;
1544         page_chain_for_each(page) {
1545                 unsigned len = min_t(int, ds, PAGE_SIZE);
1546                 data = kmap(page);
1547                 err = drbd_recv_all_warn(peer_device->connection, data, len);
1548                 if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
1549                         drbd_err(device, "Fault injection: Corrupting data on receive\n");
1550                         data[0] = data[0] ^ (unsigned long)-1;
1551                 }
1552                 kunmap(page);
1553                 if (err) {
1554                         drbd_free_peer_req(device, peer_req);
1555                         return NULL;
1556                 }
1557                 ds -= len;
1558         }
1559
1560         if (dgs) {
1561                 drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv);
1562                 if (memcmp(dig_in, dig_vv, dgs)) {
1563                         drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
1564                                 (unsigned long long)sector, data_size);
1565                         drbd_free_peer_req(device, peer_req);
1566                         return NULL;
1567                 }
1568         }
1569         device->recv_cnt += data_size>>9;
1570         return peer_req;
1571 }
1572
1573 /* drbd_drain_block() just takes a data block
1574  * out of the socket input buffer, and discards it.
1575  */
1576 static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
1577 {
1578         struct page *page;
1579         int err = 0;
1580         void *data;
1581
1582         if (!data_size)
1583                 return 0;
1584
1585         page = drbd_alloc_pages(peer_device, 1, 1);
1586
1587         data = kmap(page);
1588         while (data_size) {
1589                 unsigned int len = min_t(int, data_size, PAGE_SIZE);
1590
1591                 err = drbd_recv_all_warn(peer_device->connection, data, len);
1592                 if (err)
1593                         break;
1594                 data_size -= len;
1595         }
1596         kunmap(page);
1597         drbd_free_pages(peer_device->device, page, 0);
1598         return err;
1599 }
1600
1601 static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req,
1602                            sector_t sector, int data_size)
1603 {
1604         struct bio_vec bvec;
1605         struct bvec_iter iter;
1606         struct bio *bio;
1607         int dgs, err, expect;
1608         void *dig_in = peer_device->connection->int_dig_in;
1609         void *dig_vv = peer_device->connection->int_dig_vv;
1610
1611         dgs = 0;
1612         if (peer_device->connection->peer_integrity_tfm) {
1613                 dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
1614                 err = drbd_recv_all_warn(peer_device->connection, dig_in, dgs);
1615                 if (err)
1616                         return err;
1617                 data_size -= dgs;
1618         }
1619
1620         /* optimistically update recv_cnt.  if receiving fails below,
1621          * we disconnect anyways, and counters will be reset. */
1622         peer_device->device->recv_cnt += data_size>>9;
1623
1624         bio = req->master_bio;
1625         D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);
1626
1627         bio_for_each_segment(bvec, bio, iter) {
1628                 void *mapped = kmap(bvec.bv_page) + bvec.bv_offset;
1629                 expect = min_t(int, data_size, bvec.bv_len);
1630                 err = drbd_recv_all_warn(peer_device->connection, mapped, expect);
1631                 kunmap(bvec.bv_page);
1632                 if (err)
1633                         return err;
1634                 data_size -= expect;
1635         }
1636
1637         if (dgs) {
1638                 drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv);
1639                 if (memcmp(dig_in, dig_vv, dgs)) {
1640                         drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n");
1641                         return -EINVAL;
1642                 }
1643         }
1644
1645         D_ASSERT(peer_device->device, data_size == 0);
1646         return 0;
1647 }
1648
1649 /*
1650  * e_end_resync_block() is called in asender context via
1651  * drbd_finish_peer_reqs().
1652  */
1653 static int e_end_resync_block(struct drbd_work *w, int unused)
1654 {
1655         struct drbd_peer_request *peer_req =
1656                 container_of(w, struct drbd_peer_request, w);
1657         struct drbd_peer_device *peer_device = peer_req->peer_device;
1658         struct drbd_device *device = peer_device->device;
1659         sector_t sector = peer_req->i.sector;
1660         int err;
1661
1662         D_ASSERT(device, drbd_interval_empty(&peer_req->i));
1663
1664         if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1665                 drbd_set_in_sync(device, sector, peer_req->i.size);
1666                 err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
1667         } else {
1668                 /* Record failure to sync */
1669                 drbd_rs_failed_io(device, sector, peer_req->i.size);
1670
1671                 err  = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
1672         }
1673         dec_unacked(device);
1674
1675         return err;
1676 }
1677
1678 static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
1679                             int data_size) __releases(local)
1680 {
1681         struct drbd_device *device = peer_device->device;
1682         struct drbd_peer_request *peer_req;
1683
1684         peer_req = read_in_block(peer_device, ID_SYNCER, sector, data_size);
1685         if (!peer_req)
1686                 goto fail;
1687
1688         dec_rs_pending(device);
1689
1690         inc_unacked(device);
1691         /* corresponding dec_unacked() in e_end_resync_block()
1692          * respective _drbd_clear_done_ee */
1693
1694         peer_req->w.cb = e_end_resync_block;
1695
1696         spin_lock_irq(&device->resource->req_lock);
1697         list_add(&peer_req->w.list, &device->sync_ee);
1698         spin_unlock_irq(&device->resource->req_lock);
1699
1700         atomic_add(data_size >> 9, &device->rs_sect_ev);
1701         if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
1702                 return 0;
1703
1704         /* don't care for the reason here */
1705         drbd_err(device, "submit failed, triggering re-connect\n");
1706         spin_lock_irq(&device->resource->req_lock);
1707         list_del(&peer_req->w.list);
1708         spin_unlock_irq(&device->resource->req_lock);
1709
1710         drbd_free_peer_req(device, peer_req);
1711 fail:
1712         put_ldev(device);
1713         return -EIO;
1714 }
1715
1716 static struct drbd_request *
1717 find_request(struct drbd_device *device, struct rb_root *root, u64 id,
1718              sector_t sector, bool missing_ok, const char *func)
1719 {
1720         struct drbd_request *req;
1721
1722         /* Request object according to our peer */
1723         req = (struct drbd_request *)(unsigned long)id;
1724         if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
1725                 return req;
1726         if (!missing_ok) {
1727                 drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
1728                         (unsigned long)id, (unsigned long long)sector);
1729         }
1730         return NULL;
1731 }
1732
1733 static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
1734 {
1735         struct drbd_peer_device *peer_device;
1736         struct drbd_device *device;
1737         struct drbd_request *req;
1738         sector_t sector;
1739         int err;
1740         struct p_data *p = pi->data;
1741
1742         peer_device = conn_peer_device(connection, pi->vnr);
1743         if (!peer_device)
1744                 return -EIO;
1745         device = peer_device->device;
1746
1747         sector = be64_to_cpu(p->sector);
1748
1749         spin_lock_irq(&device->resource->req_lock);
1750         req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__);
1751         spin_unlock_irq(&device->resource->req_lock);
1752         if (unlikely(!req))
1753                 return -EIO;
1754
1755         /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
1756          * special casing it there for the various failure cases.
1757          * still no race with drbd_fail_pending_reads */
1758         err = recv_dless_read(peer_device, req, sector, pi->size);
1759         if (!err)
1760                 req_mod(req, DATA_RECEIVED);
1761         /* else: nothing. handled from drbd_disconnect...
1762          * I don't think we may complete this just yet
1763          * in case we are "on-disconnect: freeze" */
1764
1765         return err;
1766 }
1767
1768 static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
1769 {
1770         struct drbd_peer_device *peer_device;
1771         struct drbd_device *device;
1772         sector_t sector;
1773         int err;
1774         struct p_data *p = pi->data;
1775
1776         peer_device = conn_peer_device(connection, pi->vnr);
1777         if (!peer_device)
1778                 return -EIO;
1779         device = peer_device->device;
1780
1781         sector = be64_to_cpu(p->sector);
1782         D_ASSERT(device, p->block_id == ID_SYNCER);
1783
1784         if (get_ldev(device)) {
1785                 /* data is submitted to disk within recv_resync_read.
1786                  * corresponding put_ldev done below on error,
1787                  * or in drbd_peer_request_endio. */
1788                 err = recv_resync_read(peer_device, sector, pi->size);
1789         } else {
1790                 if (__ratelimit(&drbd_ratelimit_state))
1791                         drbd_err(device, "Can not write resync data to local disk.\n");
1792
1793                 err = drbd_drain_block(peer_device, pi->size);
1794
1795                 drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
1796         }
1797
1798         atomic_add(pi->size >> 9, &device->rs_sect_in);
1799
1800         return err;
1801 }
1802
1803 static void restart_conflicting_writes(struct drbd_device *device,
1804                                        sector_t sector, int size)
1805 {
1806         struct drbd_interval *i;
1807         struct drbd_request *req;
1808
1809         drbd_for_each_overlap(i, &device->write_requests, sector, size) {
1810                 if (!i->local)
1811                         continue;
1812                 req = container_of(i, struct drbd_request, i);
1813                 if (req->rq_state & RQ_LOCAL_PENDING ||
1814                     !(req->rq_state & RQ_POSTPONED))
1815                         continue;
1816                 /* as it is RQ_POSTPONED, this will cause it to
1817                  * be queued on the retry workqueue. */
1818                 __req_mod(req, CONFLICT_RESOLVED, NULL);
1819         }
1820 }
1821
1822 /*
1823  * e_end_block() is called in asender context via drbd_finish_peer_reqs().
1824  */
1825 static int e_end_block(struct drbd_work *w, int cancel)
1826 {
1827         struct drbd_peer_request *peer_req =
1828                 container_of(w, struct drbd_peer_request, w);
1829         struct drbd_peer_device *peer_device = peer_req->peer_device;
1830         struct drbd_device *device = peer_device->device;
1831         sector_t sector = peer_req->i.sector;
1832         int err = 0, pcmd;
1833
1834         if (peer_req->flags & EE_SEND_WRITE_ACK) {
1835                 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1836                         pcmd = (device->state.conn >= C_SYNC_SOURCE &&
1837                                 device->state.conn <= C_PAUSED_SYNC_T &&
1838                                 peer_req->flags & EE_MAY_SET_IN_SYNC) ?
1839                                 P_RS_WRITE_ACK : P_WRITE_ACK;
1840                         err = drbd_send_ack(peer_device, pcmd, peer_req);
1841                         if (pcmd == P_RS_WRITE_ACK)
1842                                 drbd_set_in_sync(device, sector, peer_req->i.size);
1843                 } else {
1844                         err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
1845                         /* we expect it to be marked out of sync anyways...
1846                          * maybe assert this?  */
1847                 }
1848                 dec_unacked(device);
1849         }
1850         /* we delete from the conflict detection hash _after_ we sent out the
1851          * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
1852         if (peer_req->flags & EE_IN_INTERVAL_TREE) {
1853                 spin_lock_irq(&device->resource->req_lock);
1854                 D_ASSERT(device, !drbd_interval_empty(&peer_req->i));
1855                 drbd_remove_epoch_entry_interval(device, peer_req);
1856                 if (peer_req->flags & EE_RESTART_REQUESTS)
1857                         restart_conflicting_writes(device, sector, peer_req->i.size);
1858                 spin_unlock_irq(&device->resource->req_lock);
1859         } else
1860                 D_ASSERT(device, drbd_interval_empty(&peer_req->i));
1861
1862         drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
1863
1864         return err;
1865 }
1866
1867 static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
1868 {
1869         struct drbd_peer_request *peer_req =
1870                 container_of(w, struct drbd_peer_request, w);
1871         struct drbd_peer_device *peer_device = peer_req->peer_device;
1872         int err;
1873
1874         err = drbd_send_ack(peer_device, ack, peer_req);
1875         dec_unacked(peer_device->device);
1876
1877         return err;
1878 }
1879
1880 static int e_send_superseded(struct drbd_work *w, int unused)
1881 {
1882         return e_send_ack(w, P_SUPERSEDED);
1883 }
1884
1885 static int e_send_retry_write(struct drbd_work *w, int unused)
1886 {
1887         struct drbd_peer_request *peer_req =
1888                 container_of(w, struct drbd_peer_request, w);
1889         struct drbd_connection *connection = peer_req->peer_device->connection;
1890
1891         return e_send_ack(w, connection->agreed_pro_version >= 100 ?
1892                              P_RETRY_WRITE : P_SUPERSEDED);
1893 }
1894
1895 static bool seq_greater(u32 a, u32 b)
1896 {
1897         /*
1898          * We assume 32-bit wrap-around here.
1899          * For 24-bit wrap-around, we would have to shift:
1900          *  a <<= 8; b <<= 8;
1901          */
1902         return (s32)a - (s32)b > 0;
1903 }
1904
1905 static u32 seq_max(u32 a, u32 b)
1906 {
1907         return seq_greater(a, b) ? a : b;
1908 }
1909
1910 static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq)
1911 {
1912         struct drbd_device *device = peer_device->device;
1913         unsigned int newest_peer_seq;
1914
1915         if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) {
1916                 spin_lock(&device->peer_seq_lock);
1917                 newest_peer_seq = seq_max(device->peer_seq, peer_seq);
1918                 device->peer_seq = newest_peer_seq;
1919                 spin_unlock(&device->peer_seq_lock);
1920                 /* wake up only if we actually changed device->peer_seq */
1921                 if (peer_seq == newest_peer_seq)
1922                         wake_up(&device->seq_wait);
1923         }
1924 }
1925
1926 static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
1927 {
1928         return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
1929 }
1930
1931 /* maybe change sync_ee into interval trees as well? */
1932 static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
1933 {
1934         struct drbd_peer_request *rs_req;
1935         bool rv = 0;
1936
1937         spin_lock_irq(&device->resource->req_lock);
1938         list_for_each_entry(rs_req, &device->sync_ee, w.list) {
1939                 if (overlaps(peer_req->i.sector, peer_req->i.size,
1940                              rs_req->i.sector, rs_req->i.size)) {
1941                         rv = 1;
1942                         break;
1943                 }
1944         }
1945         spin_unlock_irq(&device->resource->req_lock);
1946
1947         return rv;
1948 }
1949
1950 /* Called from receive_Data.
1951  * Synchronize packets on sock with packets on msock.
1952  *
1953  * This is here so even when a P_DATA packet traveling via sock overtook an Ack
1954  * packet traveling on msock, they are still processed in the order they have
1955  * been sent.
1956  *
1957  * Note: we don't care for Ack packets overtaking P_DATA packets.
1958  *
1959  * In case packet_seq is larger than device->peer_seq number, there are
1960  * outstanding packets on the msock. We wait for them to arrive.
1961  * In case we are the logically next packet, we update device->peer_seq
1962  * ourselves. Correctly handles 32bit wrap around.
1963  *
1964  * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
1965  * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
1966  * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
1967  * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
1968  *
1969  * returns 0 if we may process the packet,
1970  * -ERESTARTSYS if we were interrupted (by disconnect signal). */
1971 static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq)
1972 {
1973         struct drbd_device *device = peer_device->device;
1974         DEFINE_WAIT(wait);
1975         long timeout;
1976         int ret = 0, tp;
1977
1978         if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
1979                 return 0;
1980
1981         spin_lock(&device->peer_seq_lock);
1982         for (;;) {
1983                 if (!seq_greater(peer_seq - 1, device->peer_seq)) {
1984                         device->peer_seq = seq_max(device->peer_seq, peer_seq);
1985                         break;
1986                 }
1987
1988                 if (signal_pending(current)) {
1989                         ret = -ERESTARTSYS;
1990                         break;
1991                 }
1992
1993                 rcu_read_lock();
1994                 tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries;
1995                 rcu_read_unlock();
1996
1997                 if (!tp)
1998                         break;
1999
2000                 /* Only need to wait if two_primaries is enabled */
2001                 prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE);
2002                 spin_unlock(&device->peer_seq_lock);
2003                 rcu_read_lock();
2004                 timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10;
2005                 rcu_read_unlock();
2006                 timeout = schedule_timeout(timeout);
2007                 spin_lock(&device->peer_seq_lock);
2008                 if (!timeout) {
2009                         ret = -ETIMEDOUT;
2010                         drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n");
2011                         break;
2012                 }
2013         }
2014         spin_unlock(&device->peer_seq_lock);
2015         finish_wait(&device->seq_wait, &wait);
2016         return ret;
2017 }
2018
2019 /* see also bio_flags_to_wire()
2020  * DRBD_REQ_*, because we need to semantically map the flags to data packet
2021  * flags and back. We may replicate to other kernel versions. */
2022 static unsigned long wire_flags_to_bio(u32 dpf)
2023 {
2024         return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
2025                 (dpf & DP_FUA ? REQ_FUA : 0) |
2026                 (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
2027                 (dpf & DP_DISCARD ? REQ_DISCARD : 0);
2028 }
2029
2030 static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
2031                                     unsigned int size)
2032 {
2033         struct drbd_interval *i;
2034
2035     repeat:
2036         drbd_for_each_overlap(i, &device->write_requests, sector, size) {
2037                 struct drbd_request *req;
2038                 struct bio_and_error m;
2039
2040                 if (!i->local)
2041                         continue;
2042                 req = container_of(i, struct drbd_request, i);
2043                 if (!(req->rq_state & RQ_POSTPONED))
2044                         continue;
2045                 req->rq_state &= ~RQ_POSTPONED;
2046                 __req_mod(req, NEG_ACKED, &m);
2047                 spin_unlock_irq(&device->resource->req_lock);
2048                 if (m.bio)
2049                         complete_master_bio(device, &m);
2050                 spin_lock_irq(&device->resource->req_lock);
2051                 goto repeat;
2052         }
2053 }
2054
2055 static int handle_write_conflicts(struct drbd_device *device,
2056                                   struct drbd_peer_request *peer_req)
2057 {
2058         struct drbd_connection *connection = peer_req->peer_device->connection;
2059         bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags);
2060         sector_t sector = peer_req->i.sector;
2061         const unsigned int size = peer_req->i.size;
2062         struct drbd_interval *i;
2063         bool equal;
2064         int err;
2065
2066         /*
2067          * Inserting the peer request into the write_requests tree will prevent
2068          * new conflicting local requests from being added.
2069          */
2070         drbd_insert_interval(&device->write_requests, &peer_req->i);
2071
2072     repeat:
2073         drbd_for_each_overlap(i, &device->write_requests, sector, size) {
2074                 if (i == &peer_req->i)
2075                         continue;
2076
2077                 if (!i->local) {
2078                         /*
2079                          * Our peer has sent a conflicting remote request; this
2080                          * should not happen in a two-node setup.  Wait for the
2081                          * earlier peer request to complete.
2082                          */
2083                         err = drbd_wait_misc(device, i);
2084                         if (err)
2085                                 goto out;
2086                         goto repeat;
2087                 }
2088
2089                 equal = i->sector == sector && i->size == size;
2090                 if (resolve_conflicts) {
2091                         /*
2092                          * If the peer request is fully contained within the
2093                          * overlapping request, it can be considered overwritten
2094                          * and thus superseded; otherwise, it will be retried
2095                          * once all overlapping requests have completed.
2096                          */
2097                         bool superseded = i->sector <= sector && i->sector +
2098                                        (i->size >> 9) >= sector + (size >> 9);
2099
2100                         if (!equal)
2101                                 drbd_alert(device, "Concurrent writes detected: "
2102                                                "local=%llus +%u, remote=%llus +%u, "
2103                                                "assuming %s came first\n",
2104                                           (unsigned long long)i->sector, i->size,
2105                                           (unsigned long long)sector, size,
2106                                           superseded ? "local" : "remote");
2107
2108                         inc_unacked(device);
2109                         peer_req->w.cb = superseded ? e_send_superseded :
2110                                                    e_send_retry_write;
2111                         list_add_tail(&peer_req->w.list, &device->done_ee);
2112                         wake_asender(connection);
2113
2114                         err = -ENOENT;
2115                         goto out;
2116                 } else {
2117                         struct drbd_request *req =
2118                                 container_of(i, struct drbd_request, i);
2119
2120                         if (!equal)
2121                                 drbd_alert(device, "Concurrent writes detected: "
2122                                                "local=%llus +%u, remote=%llus +%u\n",
2123                                           (unsigned long long)i->sector, i->size,
2124                                           (unsigned long long)sector, size);
2125
2126                         if (req->rq_state & RQ_LOCAL_PENDING ||
2127                             !(req->rq_state & RQ_POSTPONED)) {
2128                                 /*
2129                                  * Wait for the node with the discard flag to
2130                                  * decide if this request has been superseded
2131                                  * or needs to be retried.
2132                                  * Requests that have been superseded will
2133                                  * disappear from the write_requests tree.
2134                                  *
2135                                  * In addition, wait for the conflicting
2136                                  * request to finish locally before submitting
2137                                  * the conflicting peer request.
2138                                  */
2139                                 err = drbd_wait_misc(device, &req->i);
2140                                 if (err) {
2141                                         _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
2142                                         fail_postponed_requests(device, sector, size);
2143                                         goto out;
2144                                 }
2145                                 goto repeat;
2146                         }
2147                         /*
2148                          * Remember to restart the conflicting requests after
2149                          * the new peer request has completed.
2150                          */
2151                         peer_req->flags |= EE_RESTART_REQUESTS;
2152                 }
2153         }
2154         err = 0;
2155
2156     out:
2157         if (err)
2158                 drbd_remove_epoch_entry_interval(device, peer_req);
2159         return err;
2160 }
2161
2162 /* mirrored write */
2163 static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
2164 {
2165         struct drbd_peer_device *peer_device;
2166         struct drbd_device *device;
2167         sector_t sector;
2168         struct drbd_peer_request *peer_req;
2169         struct p_data *p = pi->data;
2170         u32 peer_seq = be32_to_cpu(p->seq_num);
2171         int rw = WRITE;
2172         u32 dp_flags;
2173         int err, tp;
2174
2175         peer_device = conn_peer_device(connection, pi->vnr);
2176         if (!peer_device)
2177                 return -EIO;
2178         device = peer_device->device;
2179
2180         if (!get_ldev(device)) {
2181                 int err2;
2182
2183                 err = wait_for_and_update_peer_seq(peer_device, peer_seq);
2184                 drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
2185                 atomic_inc(&connection->current_epoch->epoch_size);
2186                 err2 = drbd_drain_block(peer_device, pi->size);
2187                 if (!err)
2188                         err = err2;
2189                 return err;
2190         }
2191
2192         /*
2193          * Corresponding put_ldev done either below (on various errors), or in
2194          * drbd_peer_request_endio, if we successfully submit the data at the
2195          * end of this function.
2196          */
2197
2198         sector = be64_to_cpu(p->sector);
2199         peer_req = read_in_block(peer_device, p->block_id, sector, pi->size);
2200         if (!peer_req) {
2201                 put_ldev(device);
2202                 return -EIO;
2203         }
2204
2205         peer_req->w.cb = e_end_block;
2206
2207         dp_flags = be32_to_cpu(p->dp_flags);
2208         rw |= wire_flags_to_bio(dp_flags);
2209         if (peer_req->pages == NULL) {
2210                 D_ASSERT(device, peer_req->i.size == 0);
2211                 D_ASSERT(device, dp_flags & DP_FLUSH);
2212         }
2213
2214         if (dp_flags & DP_MAY_SET_IN_SYNC)
2215                 peer_req->flags |= EE_MAY_SET_IN_SYNC;
2216
2217         spin_lock(&connection->epoch_lock);
2218         peer_req->epoch = connection->current_epoch;
2219         atomic_inc(&peer_req->epoch->epoch_size);
2220         atomic_inc(&peer_req->epoch->active);
2221         spin_unlock(&connection->epoch_lock);
2222
2223         rcu_read_lock();
2224         tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
2225         rcu_read_unlock();
2226         if (tp) {
2227                 peer_req->flags |= EE_IN_INTERVAL_TREE;
2228                 err = wait_for_and_update_peer_seq(peer_device, peer_seq);
2229                 if (err)
2230                         goto out_interrupted;
2231                 spin_lock_irq(&device->resource->req_lock);
2232                 err = handle_write_conflicts(device, peer_req);
2233                 if (err) {
2234                         spin_unlock_irq(&device->resource->req_lock);
2235                         if (err == -ENOENT) {
2236                                 put_ldev(device);
2237                                 return 0;
2238                         }
2239                         goto out_interrupted;
2240                 }
2241         } else {
2242                 update_peer_seq(peer_device, peer_seq);
2243                 spin_lock_irq(&device->resource->req_lock);
2244         }
2245         list_add(&peer_req->w.list, &device->active_ee);
2246         spin_unlock_irq(&device->resource->req_lock);
2247
2248         if (device->state.conn == C_SYNC_TARGET)
2249                 wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
2250
2251         if (peer_device->connection->agreed_pro_version < 100) {
2252                 rcu_read_lock();
2253                 switch (rcu_dereference(peer_device->connection->net_conf)->wire_protocol) {
2254                 case DRBD_PROT_C:
2255                         dp_flags |= DP_SEND_WRITE_ACK;
2256                         break;
2257                 case DRBD_PROT_B:
2258                         dp_flags |= DP_SEND_RECEIVE_ACK;
2259                         break;
2260                 }
2261                 rcu_read_unlock();
2262         }
2263
2264         if (dp_flags & DP_SEND_WRITE_ACK) {
2265                 peer_req->flags |= EE_SEND_WRITE_ACK;
2266                 inc_unacked(device);
2267                 /* corresponding dec_unacked() in e_end_block()
2268                  * respective _drbd_clear_done_ee */
2269         }
2270
2271         if (dp_flags & DP_SEND_RECEIVE_ACK) {
2272                 /* I really don't like it that the receiver thread
2273                  * sends on the msock, but anyways */
2274                 drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
2275         }
2276
2277         if (device->state.pdsk < D_INCONSISTENT) {
2278                 /* In case we have the only disk of the cluster, */
2279                 drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
2280                 peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
2281                 peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
2282                 drbd_al_begin_io(device, &peer_req->i, true);
2283         }
2284
2285         err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR);
2286         if (!err)
2287                 return 0;
2288
2289         /* don't care for the reason here */
2290         drbd_err(device, "submit failed, triggering re-connect\n");
2291         spin_lock_irq(&device->resource->req_lock);
2292         list_del(&peer_req->w.list);
2293         drbd_remove_epoch_entry_interval(device, peer_req);
2294         spin_unlock_irq(&device->resource->req_lock);
2295         if (peer_req->flags & EE_CALL_AL_COMPLETE_IO)
2296                 drbd_al_complete_io(device, &peer_req->i);
2297
2298 out_interrupted:
2299         drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
2300         put_ldev(device);
2301         drbd_free_peer_req(device, peer_req);
2302         return err;
2303 }
2304
2305 /* We may throttle resync, if the lower device seems to be busy,
2306  * and current sync rate is above c_min_rate.
2307  *
2308  * To decide whether or not the lower device is busy, we use a scheme similar
2309  * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
2310  * (more than 64 sectors) of activity we cannot account for with our own resync
2311  * activity, it obviously is "busy".
2312  *
2313  * The current sync rate used here uses only the most recent two step marks,
2314  * to have a short time average so we can react faster.
2315  */
2316 int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
2317 {
2318         struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
2319         unsigned long db, dt, dbdt;
2320         struct lc_element *tmp;
2321         int curr_events;
2322         int throttle = 0;
2323         unsigned int c_min_rate;
2324
2325         rcu_read_lock();
2326         c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
2327         rcu_read_unlock();
2328
2329         /* feature disabled? */
2330         if (c_min_rate == 0)
2331                 return 0;
2332
2333         spin_lock_irq(&device->al_lock);
2334         tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
2335         if (tmp) {
2336                 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
2337                 if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
2338                         spin_unlock_irq(&device->al_lock);
2339                         return 0;
2340                 }
2341                 /* Do not slow down if app IO is already waiting for this extent */
2342         }
2343         spin_unlock_irq(&device->al_lock);
2344
2345         curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
2346                       (int)part_stat_read(&disk->part0, sectors[1]) -
2347                         atomic_read(&device->rs_sect_ev);
2348
2349         if (!device->rs_last_events || curr_events - device->rs_last_events > 64) {
2350                 unsigned long rs_left;
2351                 int i;
2352
2353                 device->rs_last_events = curr_events;
2354
2355                 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
2356                  * approx. */
2357                 i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
2358
2359                 if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
2360                         rs_left = device->ov_left;
2361                 else
2362                         rs_left = drbd_bm_total_weight(device) - device->rs_failed;
2363
2364                 dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ;
2365                 if (!dt)
2366                         dt++;
2367                 db = device->rs_mark_left[i] - rs_left;
2368                 dbdt = Bit2KB(db/dt);
2369
2370                 if (dbdt > c_min_rate)
2371                         throttle = 1;
2372         }
2373         return throttle;
2374 }
2375
2376
2377 static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
2378 {
2379         struct drbd_peer_device *peer_device;
2380         struct drbd_device *device;
2381         sector_t sector;
2382         sector_t capacity;
2383         struct drbd_peer_request *peer_req;
2384         struct digest_info *di = NULL;
2385         int size, verb;
2386         unsigned int fault_type;
2387         struct p_block_req *p = pi->data;
2388
2389         peer_device = conn_peer_device(connection, pi->vnr);
2390         if (!peer_device)
2391                 return -EIO;
2392         device = peer_device->device;
2393         capacity = drbd_get_capacity(device->this_bdev);
2394
2395         sector = be64_to_cpu(p->sector);
2396         size   = be32_to_cpu(p->blksize);
2397
2398         if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
2399                 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2400                                 (unsigned long long)sector, size);
2401                 return -EINVAL;
2402         }
2403         if (sector + (size>>9) > capacity) {
2404                 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2405                                 (unsigned long long)sector, size);
2406                 return -EINVAL;
2407         }
2408
2409         if (!get_ldev_if_state(device, D_UP_TO_DATE)) {
2410                 verb = 1;
2411                 switch (pi->cmd) {
2412                 case P_DATA_REQUEST:
2413                         drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
2414                         break;
2415                 case P_RS_DATA_REQUEST:
2416                 case P_CSUM_RS_REQUEST:
2417                 case P_OV_REQUEST:
2418                         drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p);
2419                         break;
2420                 case P_OV_REPLY:
2421                         verb = 0;
2422                         dec_rs_pending(device);
2423                         drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
2424                         break;
2425                 default:
2426                         BUG();
2427                 }
2428                 if (verb && __ratelimit(&drbd_ratelimit_state))
2429                         drbd_err(device, "Can not satisfy peer's read request, "
2430                             "no local data.\n");
2431
2432                 /* drain possibly payload */
2433                 return drbd_drain_block(peer_device, pi->size);
2434         }
2435
2436         /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2437          * "criss-cross" setup, that might cause write-out on some other DRBD,
2438          * which in turn might block on the other node at this very place.  */
2439         peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, GFP_NOIO);
2440         if (!peer_req) {
2441                 put_ldev(device);
2442                 return -ENOMEM;
2443         }
2444
2445         switch (pi->cmd) {
2446         case P_DATA_REQUEST:
2447                 peer_req->w.cb = w_e_end_data_req;
2448                 fault_type = DRBD_FAULT_DT_RD;
2449                 /* application IO, don't drbd_rs_begin_io */
2450                 goto submit;
2451
2452         case P_RS_DATA_REQUEST:
2453                 peer_req->w.cb = w_e_end_rsdata_req;
2454                 fault_type = DRBD_FAULT_RS_RD;
2455                 /* used in the sector offset progress display */
2456                 device->bm_resync_fo = BM_SECT_TO_BIT(sector);
2457                 break;
2458
2459         case P_OV_REPLY:
2460         case P_CSUM_RS_REQUEST:
2461                 fault_type = DRBD_FAULT_RS_RD;
2462                 di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
2463                 if (!di)
2464                         goto out_free_e;
2465
2466                 di->digest_size = pi->size;
2467                 di->digest = (((char *)di)+sizeof(struct digest_info));
2468
2469                 peer_req->digest = di;
2470                 peer_req->flags |= EE_HAS_DIGEST;
2471
2472                 if (drbd_recv_all(peer_device->connection, di->digest, pi->size))
2473                         goto out_free_e;
2474
2475                 if (pi->cmd == P_CSUM_RS_REQUEST) {
2476                         D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
2477                         peer_req->w.cb = w_e_end_csum_rs_req;
2478                         /* used in the sector offset progress display */
2479                         device->bm_resync_fo = BM_SECT_TO_BIT(sector);
2480                 } else if (pi->cmd == P_OV_REPLY) {
2481                         /* track progress, we may need to throttle */
2482                         atomic_add(size >> 9, &device->rs_sect_in);
2483                         peer_req->w.cb = w_e_end_ov_reply;
2484                         dec_rs_pending(device);
2485                         /* drbd_rs_begin_io done when we sent this request,
2486                          * but accounting still needs to be done. */
2487                         goto submit_for_resync;
2488                 }
2489                 break;
2490
2491         case P_OV_REQUEST:
2492                 if (device->ov_start_sector == ~(sector_t)0 &&
2493                     peer_device->connection->agreed_pro_version >= 90) {
2494                         unsigned long now = jiffies;
2495                         int i;
2496                         device->ov_start_sector = sector;
2497                         device->ov_position = sector;
2498                         device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector);
2499                         device->rs_total = device->ov_left;
2500                         for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2501                                 device->rs_mark_left[i] = device->ov_left;
2502                                 device->rs_mark_time[i] = now;
2503                         }
2504                         drbd_info(device, "Online Verify start sector: %llu\n",
2505                                         (unsigned long long)sector);
2506                 }
2507                 peer_req->w.cb = w_e_end_ov_req;
2508                 fault_type = DRBD_FAULT_RS_RD;
2509                 break;
2510
2511         default:
2512                 BUG();
2513         }
2514
2515         /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2516          * wrt the receiver, but it is not as straightforward as it may seem.
2517          * Various places in the resync start and stop logic assume resync
2518          * requests are processed in order, requeuing this on the worker thread
2519          * introduces a bunch of new code for synchronization between threads.
2520          *
2521          * Unlimited throttling before drbd_rs_begin_io may stall the resync
2522          * "forever", throttling after drbd_rs_begin_io will lock that extent
2523          * for application writes for the same time.  For now, just throttle
2524          * here, where the rest of the code expects the receiver to sleep for
2525          * a while, anyways.
2526          */
2527
2528         /* Throttle before drbd_rs_begin_io, as that locks out application IO;
2529          * this defers syncer requests for some time, before letting at least
2530          * on request through.  The resync controller on the receiving side
2531          * will adapt to the incoming rate accordingly.
2532          *
2533          * We cannot throttle here if remote is Primary/SyncTarget:
2534          * we would also throttle its application reads.
2535          * In that case, throttling is done on the SyncTarget only.
2536          */
2537         if (device->state.peer != R_PRIMARY && drbd_rs_should_slow_down(device, sector))
2538                 schedule_timeout_uninterruptible(HZ/10);
2539         if (drbd_rs_begin_io(device, sector))
2540                 goto out_free_e;
2541
2542 submit_for_resync:
2543         atomic_add(size >> 9, &device->rs_sect_ev);
2544
2545 submit:
2546         inc_unacked(device);
2547         spin_lock_irq(&device->resource->req_lock);
2548         list_add_tail(&peer_req->w.list, &device->read_ee);
2549         spin_unlock_irq(&device->resource->req_lock);
2550
2551         if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
2552                 return 0;
2553
2554         /* don't care for the reason here */
2555         drbd_err(device, "submit failed, triggering re-connect\n");
2556         spin_lock_irq(&device->resource->req_lock);
2557         list_del(&peer_req->w.list);
2558         spin_unlock_irq(&device->resource->req_lock);
2559         /* no drbd_rs_complete_io(), we are dropping the connection anyways */
2560
2561 out_free_e:
2562         put_ldev(device);
2563         drbd_free_peer_req(device, peer_req);
2564         return -EIO;
2565 }
2566
2567 /**
2568  * drbd_asb_recover_0p  -  Recover after split-brain with no remaining primaries
2569  */
2570 static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
2571 {
2572         struct drbd_device *device = peer_device->device;
2573         int self, peer, rv = -100;
2574         unsigned long ch_self, ch_peer;
2575         enum drbd_after_sb_p after_sb_0p;
2576
2577         self = device->ldev->md.uuid[UI_BITMAP] & 1;
2578         peer = device->p_uuid[UI_BITMAP] & 1;
2579
2580         ch_peer = device->p_uuid[UI_SIZE];
2581         ch_self = device->comm_bm_set;
2582
2583         rcu_read_lock();
2584         after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
2585         rcu_read_unlock();
2586         switch (after_sb_0p) {
2587         case ASB_CONSENSUS:
2588         case ASB_DISCARD_SECONDARY:
2589         case ASB_CALL_HELPER:
2590         case ASB_VIOLENTLY:
2591                 drbd_err(device, "Configuration error.\n");
2592                 break;
2593         case ASB_DISCONNECT:
2594                 break;
2595         case ASB_DISCARD_YOUNGER_PRI:
2596                 if (self == 0 && peer == 1) {
2597                         rv = -1;
2598                         break;
2599                 }
2600                 if (self == 1 && peer == 0) {
2601                         rv =  1;
2602                         break;
2603                 }
2604                 /* Else fall through to one of the other strategies... */
2605         case ASB_DISCARD_OLDER_PRI:
2606                 if (self == 0 && peer == 1) {
2607                         rv = 1;
2608                         break;
2609                 }
2610                 if (self == 1 && peer == 0) {
2611                         rv = -1;
2612                         break;
2613                 }
2614                 /* Else fall through to one of the other strategies... */
2615                 drbd_warn(device, "Discard younger/older primary did not find a decision\n"
2616                      "Using discard-least-changes instead\n");
2617         case ASB_DISCARD_ZERO_CHG:
2618                 if (ch_peer == 0 && ch_self == 0) {
2619                         rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
2620                                 ? -1 : 1;
2621                         break;
2622                 } else {
2623                         if (ch_peer == 0) { rv =  1; break; }
2624                         if (ch_self == 0) { rv = -1; break; }
2625                 }
2626                 if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
2627                         break;
2628         case ASB_DISCARD_LEAST_CHG:
2629                 if      (ch_self < ch_peer)
2630                         rv = -1;
2631                 else if (ch_self > ch_peer)
2632                         rv =  1;
2633                 else /* ( ch_self == ch_peer ) */
2634                      /* Well, then use something else. */
2635                         rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
2636                                 ? -1 : 1;
2637                 break;
2638         case ASB_DISCARD_LOCAL:
2639                 rv = -1;
2640                 break;
2641         case ASB_DISCARD_REMOTE:
2642                 rv =  1;
2643         }
2644
2645         return rv;
2646 }
2647
2648 /**
2649  * drbd_asb_recover_1p  -  Recover after split-brain with one remaining primary
2650  */
2651 static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local)
2652 {
2653         struct drbd_device *device = peer_device->device;
2654         int hg, rv = -100;
2655         enum drbd_after_sb_p after_sb_1p;
2656
2657         rcu_read_lock();
2658         after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p;
2659         rcu_read_unlock();
2660         switch (after_sb_1p) {
2661         case ASB_DISCARD_YOUNGER_PRI:
2662         case ASB_DISCARD_OLDER_PRI:
2663         case ASB_DISCARD_LEAST_CHG:
2664         case ASB_DISCARD_LOCAL:
2665         case ASB_DISCARD_REMOTE:
2666         case ASB_DISCARD_ZERO_CHG:
2667                 drbd_err(device, "Configuration error.\n");
2668                 break;
2669         case ASB_DISCONNECT:
2670                 break;
2671         case ASB_CONSENSUS:
2672                 hg = drbd_asb_recover_0p(peer_device);
2673                 if (hg == -1 && device->state.role == R_SECONDARY)
2674                         rv = hg;
2675                 if (hg == 1  && device->state.role == R_PRIMARY)
2676                         rv = hg;
2677                 break;
2678         case ASB_VIOLENTLY:
2679                 rv = drbd_asb_recover_0p(peer_device);
2680                 break;
2681         case ASB_DISCARD_SECONDARY:
2682                 return device->state.role == R_PRIMARY ? 1 : -1;
2683         case ASB_CALL_HELPER:
2684                 hg = drbd_asb_recover_0p(peer_device);
2685                 if (hg == -1 && device->state.role == R_PRIMARY) {
2686                         enum drbd_state_rv rv2;
2687
2688                          /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2689                           * we might be here in C_WF_REPORT_PARAMS which is transient.
2690                           * we do not need to wait for the after state change work either. */
2691                         rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
2692                         if (rv2 != SS_SUCCESS) {
2693                                 drbd_khelper(device, "pri-lost-after-sb");
2694                         } else {
2695                                 drbd_warn(device, "Successfully gave up primary role.\n");
2696                                 rv = hg;
2697                         }
2698                 } else
2699                         rv = hg;
2700         }
2701
2702         return rv;
2703 }
2704
2705 /**
2706  * drbd_asb_recover_2p  -  Recover after split-brain with two remaining primaries
2707  */
2708 static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local)
2709 {
2710         struct drbd_device *device = peer_device->device;
2711         int hg, rv = -100;
2712         enum drbd_after_sb_p after_sb_2p;
2713
2714         rcu_read_lock();
2715         after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p;
2716         rcu_read_unlock();
2717         switch (after_sb_2p) {
2718         case ASB_DISCARD_YOUNGER_PRI:
2719         case ASB_DISCARD_OLDER_PRI:
2720         case ASB_DISCARD_LEAST_CHG:
2721         case ASB_DISCARD_LOCAL:
2722         case ASB_DISCARD_REMOTE:
2723         case ASB_CONSENSUS:
2724         case ASB_DISCARD_SECONDARY:
2725         case ASB_DISCARD_ZERO_CHG:
2726                 drbd_err(device, "Configuration error.\n");
2727                 break;
2728         case ASB_VIOLENTLY:
2729                 rv = drbd_asb_recover_0p(peer_device);
2730                 break;
2731         case ASB_DISCONNECT:
2732                 break;
2733         case ASB_CALL_HELPER:
2734                 hg = drbd_asb_recover_0p(peer_device);
2735                 if (hg == -1) {
2736                         enum drbd_state_rv rv2;
2737
2738                          /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2739                           * we might be here in C_WF_REPORT_PARAMS which is transient.
2740                           * we do not need to wait for the after state change work either. */
2741                         rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
2742                         if (rv2 != SS_SUCCESS) {
2743                                 drbd_khelper(device, "pri-lost-after-sb");
2744                         } else {
2745                                 drbd_warn(device, "Successfully gave up primary role.\n");
2746                                 rv = hg;
2747                         }
2748                 } else
2749                         rv = hg;
2750         }
2751
2752         return rv;
2753 }
2754
2755 static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
2756                            u64 bits, u64 flags)
2757 {
2758         if (!uuid) {
2759                 drbd_info(device, "%s uuid info vanished while I was looking!\n", text);
2760                 return;
2761         }
2762         drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
2763              text,
2764              (unsigned long long)uuid[UI_CURRENT],
2765              (unsigned long long)uuid[UI_BITMAP],
2766              (unsigned long long)uuid[UI_HISTORY_START],
2767              (unsigned long long)uuid[UI_HISTORY_END],
2768              (unsigned long long)bits,
2769              (unsigned long long)flags);
2770 }
2771
2772 /*
2773   100   after split brain try auto recover
2774     2   C_SYNC_SOURCE set BitMap
2775     1   C_SYNC_SOURCE use BitMap
2776     0   no Sync
2777    -1   C_SYNC_TARGET use BitMap
2778    -2   C_SYNC_TARGET set BitMap
2779  -100   after split brain, disconnect
2780 -1000   unrelated data
2781 -1091   requires proto 91
2782 -1096   requires proto 96
2783  */
2784 static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_hold(local)
2785 {
2786         u64 self, peer;
2787         int i, j;
2788
2789         self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2790         peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
2791
2792         *rule_nr = 10;
2793         if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2794                 return 0;
2795
2796         *rule_nr = 20;
2797         if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2798              peer != UUID_JUST_CREATED)
2799                 return -2;
2800
2801         *rule_nr = 30;
2802         if (self != UUID_JUST_CREATED &&
2803             (peer == UUID_JUST_CREATED || peer == (u64)0))
2804                 return 2;
2805
2806         if (self == peer) {
2807                 int rct, dc; /* roles at crash time */
2808
2809                 if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2810
2811                         if (first_peer_device(device)->connection->agreed_pro_version < 91)
2812                                 return -1091;
2813
2814                         if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2815                             (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
2816                                 drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
2817                                 drbd_uuid_move_history(device);
2818                                 device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
2819                                 device->ldev->md.uuid[UI_BITMAP] = 0;
2820
2821                                 drbd_uuid_dump(device, "self", device->ldev->md.uuid,
2822                                                device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
2823                                 *rule_nr = 34;
2824                         } else {
2825                                 drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
2826                                 *rule_nr = 36;
2827                         }
2828
2829                         return 1;
2830                 }
2831
2832                 if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
2833
2834                         if (first_peer_device(device)->connection->agreed_pro_version < 91)
2835                                 return -1091;
2836
2837                         if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2838                             (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
2839                                 drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
2840
2841                                 device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
2842                                 device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
2843                                 device->p_uuid[UI_BITMAP] = 0UL;
2844
2845                                 drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
2846                                 *rule_nr = 35;
2847                         } else {
2848                                 drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
2849                                 *rule_nr = 37;
2850                         }
2851
2852                         return -1;
2853                 }
2854
2855                 /* Common power [off|failure] */
2856                 rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
2857                         (device->p_uuid[UI_FLAGS] & 2);
2858                 /* lowest bit is set when we were primary,
2859                  * next bit (weight 2) is set when peer was primary */
2860                 *rule_nr = 40;
2861
2862                 switch (rct) {
2863                 case 0: /* !self_pri && !peer_pri */ return 0;
2864                 case 1: /*  self_pri && !peer_pri */ return 1;
2865                 case 2: /* !self_pri &&  peer_pri */ return -1;
2866                 case 3: /*  self_pri &&  peer_pri */
2867                         dc = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags);
2868                         return dc ? -1 : 1;
2869                 }
2870         }
2871
2872         *rule_nr = 50;
2873         peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
2874         if (self == peer)
2875                 return -1;
2876
2877         *rule_nr = 51;
2878         peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
2879         if (self == peer) {
2880                 if (first_peer_device(device)->connection->agreed_pro_version < 96 ?
2881                     (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
2882                     (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
2883                     peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
2884                         /* The last P_SYNC_UUID did not get though. Undo the last start of
2885                            resync as sync source modifications of the peer's UUIDs. */
2886
2887                         if (first_peer_device(device)->connection->agreed_pro_version < 91)
2888                                 return -1091;
2889
2890                         device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
2891                         device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];
2892
2893                         drbd_info(device, "Lost last syncUUID packet, corrected:\n");
2894                         drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
2895
2896                         return -1;
2897                 }
2898         }
2899
2900         *rule_nr = 60;
2901         self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2902         for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2903                 peer = device->p_uuid[i] & ~((u64)1);
2904                 if (self == peer)
2905                         return -2;
2906         }
2907
2908         *rule_nr = 70;
2909         self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2910         peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
2911         if (self == peer)
2912                 return 1;
2913
2914         *rule_nr = 71;
2915         self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2916         if (self == peer) {
2917                 if (first_peer_device(device)->connection->agreed_pro_version < 96 ?
2918                     (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
2919                     (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
2920                     self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
2921                         /* The last P_SYNC_UUID did not get though. Undo the last start of
2922                            resync as sync source modifications of our UUIDs. */
2923
2924                         if (first_peer_device(device)->connection->agreed_pro_version < 91)
2925                                 return -1091;
2926
2927                         __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
2928                         __drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);
2929
2930                         drbd_info(device, "Last syncUUID did not get through, corrected:\n");
2931                         drbd_uuid_dump(device, "self", device->ldev->md.uuid,
2932                                        device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
2933
2934                         return 1;
2935                 }
2936         }
2937
2938
2939         *rule_nr = 80;
2940         peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
2941         for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2942                 self = device->ldev->md.uuid[i] & ~((u64)1);
2943                 if (self == peer)
2944                         return 2;
2945         }
2946
2947         *rule_nr = 90;
2948         self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2949         peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
2950         if (self == peer && self != ((u64)0))
2951                 return 100;
2952
2953         *rule_nr = 100;
2954         for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2955                 self = device->ldev->md.uuid[i] & ~((u64)1);
2956                 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
2957                         peer = device->p_uuid[j] & ~((u64)1);
2958                         if (self == peer)
2959                                 return -100;
2960                 }
2961         }
2962
2963         return -1000;
2964 }
2965
2966 /* drbd_sync_handshake() returns the new conn state on success, or
2967    CONN_MASK (-1) on failure.
2968  */
2969 static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
2970                                            enum drbd_role peer_role,
2971                                            enum drbd_disk_state peer_disk) __must_hold(local)
2972 {
2973         struct drbd_device *device = peer_device->device;
2974         enum drbd_conns rv = C_MASK;
2975         enum drbd_disk_state mydisk;
2976         struct net_conf *nc;
2977         int hg, rule_nr, rr_conflict, tentative;
2978
2979         mydisk = device->state.disk;
2980         if (mydisk == D_NEGOTIATING)
2981                 mydisk = device->new_state_tmp.disk;
2982
2983         drbd_info(device, "drbd_sync_handshake:\n");
2984
2985         spin_lock_irq(&device->ldev->md.uuid_lock);
2986         drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0);
2987         drbd_uuid_dump(device, "peer", device->p_uuid,
2988                        device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
2989
2990         hg = drbd_uuid_compare(device, &rule_nr);
2991         spin_unlock_irq(&device->ldev->md.uuid_lock);
2992
2993         drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
2994
2995         if (hg == -1000) {
2996                 drbd_alert(device, "Unrelated data, aborting!\n");
2997                 return C_MASK;
2998         }
2999         if (hg < -1000) {
3000                 drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
3001                 return C_MASK;
3002         }
3003
3004         if    ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
3005             (peer_disk == D_INCONSISTENT && mydisk    > D_INCONSISTENT)) {
3006                 int f = (hg == -100) || abs(hg) == 2;
3007                 hg = mydisk > D_INCONSISTENT ? 1 : -1;
3008                 if (f)
3009                         hg = hg*2;
3010                 drbd_info(device, "Becoming sync %s due to disk states.\n",
3011                      hg > 0 ? "source" : "target");
3012         }
3013
3014         if (abs(hg) == 100)
3015                 drbd_khelper(device, "initial-split-brain");
3016
3017         rcu_read_lock();
3018         nc = rcu_dereference(peer_device->connection->net_conf);
3019
3020         if (hg == 100 || (hg == -100 && nc->always_asbp)) {
3021                 int pcount = (device->state.role == R_PRIMARY)
3022                            + (peer_role == R_PRIMARY);
3023                 int forced = (hg == -100);
3024
3025                 switch (pcount) {
3026                 case 0:
3027                         hg = drbd_asb_recover_0p(peer_device);
3028                         break;
3029                 case 1:
3030                         hg = drbd_asb_recover_1p(peer_device);
3031                         break;
3032                 case 2:
3033                         hg = drbd_asb_recover_2p(peer_device);
3034                         break;
3035                 }
3036                 if (abs(hg) < 100) {
3037                         drbd_warn(device, "Split-Brain detected, %d primaries, "
3038                              "automatically solved. Sync from %s node\n",
3039                              pcount, (hg < 0) ? "peer" : "this");
3040                         if (forced) {
3041                                 drbd_warn(device, "Doing a full sync, since"
3042                                      " UUIDs where ambiguous.\n");
3043                                 hg = hg*2;
3044                         }
3045                 }
3046         }
3047
3048         if (hg == -100) {
3049                 if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1))
3050                         hg = -1;
3051                 if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1))
3052                         hg = 1;
3053
3054                 if (abs(hg) < 100)
3055                         drbd_warn(device, "Split-Brain detected, manually solved. "
3056                              "Sync from %s node\n",
3057                              (hg < 0) ? "peer" : "this");
3058         }
3059         rr_conflict = nc->rr_conflict;
3060         tentative = nc->tentative;
3061         rcu_read_unlock();
3062
3063         if (hg == -100) {
3064                 /* FIXME this log message is not correct if we end up here
3065                  * after an attempted attach on a diskless node.
3066                  * We just refuse to attach -- well, we drop the "connection"
3067                  * to that disk, in a way... */
3068                 drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n");
3069                 drbd_khelper(device, "split-brain");
3070                 return C_MASK;
3071         }
3072
3073         if (hg > 0 && mydisk <= D_INCONSISTENT) {
3074                 drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n");
3075                 return C_MASK;
3076         }
3077
3078         if (hg < 0 && /* by intention we do not use mydisk here. */
3079             device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) {
3080                 switch (rr_conflict) {
3081                 case ASB_CALL_HELPER:
3082                         drbd_khelper(device, "pri-lost");
3083                         /* fall through */
3084                 case ASB_DISCONNECT:
3085                         drbd_err(device, "I shall become SyncTarget, but I am primary!\n");
3086                         return C_MASK;
3087                 case ASB_VIOLENTLY:
3088                         drbd_warn(device, "Becoming SyncTarget, violating the stable-data"
3089                              "assumption\n");
3090                 }
3091         }
3092
3093         if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) {
3094                 if (hg == 0)
3095                         drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n");
3096                 else
3097                         drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.",
3098                                  drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
3099                                  abs(hg) >= 2 ? "full" : "bit-map based");
3100                 return C_MASK;
3101         }
3102
3103         if (abs(hg) >= 2) {
3104                 drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
3105                 if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
3106                                         BM_LOCKED_SET_ALLOWED))
3107                         return C_MASK;
3108         }
3109
3110         if (hg > 0) { /* become sync source. */
3111                 rv = C_WF_BITMAP_S;
3112         } else if (hg < 0) { /* become sync target */
3113                 rv = C_WF_BITMAP_T;
3114         } else {
3115                 rv = C_CONNECTED;
3116                 if (drbd_bm_total_weight(device)) {
3117                         drbd_info(device, "No resync, but %lu bits in bitmap!\n",
3118                              drbd_bm_total_weight(device));
3119                 }
3120         }
3121
3122         return rv;
3123 }
3124
3125 static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
3126 {
3127         /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
3128         if (peer == ASB_DISCARD_REMOTE)
3129                 return ASB_DISCARD_LOCAL;
3130
3131         /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
3132         if (peer == ASB_DISCARD_LOCAL)
3133                 return ASB_DISCARD_REMOTE;
3134
3135         /* everything else is valid if they are equal on both sides. */
3136         return peer;
3137 }
3138
3139 static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi)
3140 {
3141         struct p_protocol *p = pi->data;
3142         enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
3143         int p_proto, p_discard_my_data, p_two_primaries, cf;
3144         struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
3145         char integrity_alg[SHARED_SECRET_MAX] = "";
3146         struct crypto_hash *peer_integrity_tfm = NULL;
3147         void *int_dig_in = NULL, *int_dig_vv = NULL;
3148
3149         p_proto         = be32_to_cpu(p->protocol);
3150         p_after_sb_0p   = be32_to_cpu(p->after_sb_0p);
3151         p_after_sb_1p   = be32_to_cpu(p->after_sb_1p);
3152         p_after_sb_2p   = be32_to_cpu(p->after_sb_2p);
3153         p_two_primaries = be32_to_cpu(p->two_primaries);
3154         cf              = be32_to_cpu(p->conn_flags);
3155         p_discard_my_data = cf & CF_DISCARD_MY_DATA;
3156
3157         if (connection->agreed_pro_version >= 87) {
3158                 int err;
3159
3160                 if (pi->size > sizeof(integrity_alg))
3161                         return -EIO;
3162                 err = drbd_recv_all(connection, integrity_alg, pi->size);
3163                 if (err)
3164                         return err;
3165                 integrity_alg[SHARED_SECRET_MAX - 1] = 0;
3166         }
3167
3168         if (pi->cmd != P_PROTOCOL_UPDATE) {
3169                 clear_bit(CONN_DRY_RUN, &connection->flags);
3170
3171                 if (cf & CF_DRY_RUN)
3172                         set_bit(CONN_DRY_RUN, &connection->flags);
3173
3174                 rcu_read_lock();
3175                 nc = rcu_dereference(connection->net_conf);
3176
3177                 if (p_proto != nc->wire_protocol) {
3178                         drbd_err(connection, "incompatible %s settings\n", "protocol");
3179                         goto disconnect_rcu_unlock;
3180                 }
3181
3182                 if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
3183                         drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri");
3184                         goto disconnect_rcu_unlock;
3185                 }
3186
3187                 if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
3188                         drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri");
3189                         goto disconnect_rcu_unlock;
3190                 }
3191
3192                 if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
3193                         drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri");
3194                         goto disconnect_rcu_unlock;
3195                 }
3196
3197                 if (p_discard_my_data && nc->discard_my_data) {
3198                         drbd_err(connection, "incompatible %s settings\n", "discard-my-data");
3199                         goto disconnect_rcu_unlock;
3200                 }
3201
3202                 if (p_two_primaries != nc->two_primaries) {
3203                         drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries");
3204                         goto disconnect_rcu_unlock;
3205                 }
3206
3207                 if (strcmp(integrity_alg, nc->integrity_alg)) {
3208                         drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg");
3209                         goto disconnect_rcu_unlock;
3210                 }
3211
3212                 rcu_read_unlock();
3213         }
3214
3215         if (integrity_alg[0]) {
3216                 int hash_size;
3217
3218                 /*
3219                  * We can only change the peer data integrity algorithm
3220                  * here.  Changing our own data integrity algorithm
3221                  * requires that we send a P_PROTOCOL_UPDATE packet at
3222                  * the same time; otherwise, the peer has no way to
3223                  * tell between which packets the algorithm should
3224                  * change.
3225                  */
3226
3227                 peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
3228                 if (!peer_integrity_tfm) {
3229                         drbd_err(connection, "peer data-integrity-alg %s not supported\n",
3230                                  integrity_alg);
3231                         goto disconnect;
3232                 }
3233
3234                 hash_size = crypto_hash_digestsize(peer_integrity_tfm);
3235                 int_dig_in = kmalloc(hash_size, GFP_KERNEL);
3236                 int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
3237                 if (!(int_dig_in && int_dig_vv)) {
3238                         drbd_err(connection, "Allocation of buffers for data integrity checking failed\n");
3239                         goto disconnect;
3240                 }
3241         }
3242
3243         new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
3244         if (!new_net_conf) {
3245                 drbd_err(connection, "Allocation of new net_conf failed\n");
3246                 goto disconnect;
3247         }
3248
3249         mutex_lock(&connection->data.mutex);
3250         mutex_lock(&connection->resource->conf_update);
3251         old_net_conf = connection->net_conf;
3252         *new_net_conf = *old_net_conf;
3253
3254         new_net_conf->wire_protocol = p_proto;
3255         new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
3256         new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
3257         new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
3258         new_net_conf->two_primaries = p_two_primaries;
3259
3260         rcu_assign_pointer(connection->net_conf, new_net_conf);
3261         mutex_unlock(&connection->resource->conf_update);
3262         mutex_unlock(&connection->data.mutex);
3263
3264         crypto_free_hash(connection->peer_integrity_tfm);
3265         kfree(connection->int_dig_in);
3266         kfree(connection->int_dig_vv);
3267         connection->peer_integrity_tfm = peer_integrity_tfm;
3268         connection->int_dig_in = int_dig_in;
3269         connection->int_dig_vv = int_dig_vv;
3270
3271         if (strcmp(old_net_conf->integrity_alg, integrity_alg))
3272                 drbd_info(connection, "peer data-integrity-alg: %s\n",
3273                           integrity_alg[0] ? integrity_alg : "(none)");
3274
3275         synchronize_rcu();
3276         kfree(old_net_conf);
3277         return 0;
3278
3279 disconnect_rcu_unlock:
3280         rcu_read_unlock();
3281 disconnect:
3282         crypto_free_hash(peer_integrity_tfm);
3283         kfree(int_dig_in);
3284         kfree(int_dig_vv);
3285         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
3286         return -EIO;
3287 }
3288
3289 /* helper function
3290  * input: alg name, feature name
3291  * return: NULL (alg name was "")
3292  *         ERR_PTR(error) if something goes wrong
3293  *         or the crypto hash ptr, if it worked out ok. */
3294 static
3295 struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
3296                 const char *alg, const char *name)
3297 {
3298         struct crypto_hash *tfm;
3299
3300         if (!alg[0])
3301                 return NULL;
3302
3303         tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
3304         if (IS_ERR(tfm)) {
3305                 drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n",
3306                         alg, name, PTR_ERR(tfm));
3307                 return tfm;
3308         }
3309         return tfm;
3310 }
3311
3312 static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi)
3313 {
3314         void *buffer = connection->data.rbuf;
3315         int size = pi->size;
3316
3317         while (size) {
3318                 int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
3319                 s = drbd_recv(connection, buffer, s);
3320                 if (s <= 0) {
3321                         if (s < 0)
3322                                 return s;
3323                         break;
3324                 }
3325                 size -= s;
3326         }
3327         if (size)
3328                 return -EIO;
3329         return 0;
3330 }
3331
3332 /*
3333  * config_unknown_volume  -  device configuration command for unknown volume
3334  *
3335  * When a device is added to an existing connection, the node on which the
3336  * device is added first will send configuration commands to its peer but the
3337  * peer will not know about the device yet.  It will warn and ignore these
3338  * commands.  Once the device is added on the second node, the second node will
3339  * send the same device configuration commands, but in the other direction.
3340  *
3341  * (We can also end up here if drbd is misconfigured.)
3342  */
3343 static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
3344 {
3345         drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n",
3346                   cmdname(pi->cmd), pi->vnr);
3347         return ignore_remaining_packet(connection, pi);
3348 }
3349
3350 static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi)
3351 {
3352         struct drbd_peer_device *peer_device;
3353         struct drbd_device *device;
3354         struct p_rs_param_95 *p;
3355         unsigned int header_size, data_size, exp_max_sz;
3356         struct crypto_hash *verify_tfm = NULL;
3357         struct crypto_hash *csums_tfm = NULL;
3358         struct net_conf *old_net_conf, *new_net_conf = NULL;
3359         struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
3360         const int apv = connection->agreed_pro_version;
3361         struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
3362         int fifo_size = 0;
3363         int err;
3364
3365         peer_device = conn_peer_device(connection, pi->vnr);
3366         if (!peer_device)
3367                 return config_unknown_volume(connection, pi);
3368         device = peer_device->device;
3369
3370         exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
3371                     : apv == 88 ? sizeof(struct p_rs_param)
3372                                         + SHARED_SECRET_MAX
3373                     : apv <= 94 ? sizeof(struct p_rs_param_89)
3374                     : /* apv >= 95 */ sizeof(struct p_rs_param_95);
3375
3376         if (pi->size > exp_max_sz) {
3377                 drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n",
3378                     pi->size, exp_max_sz);
3379                 return -EIO;
3380         }
3381
3382         if (apv <= 88) {
3383                 header_size = sizeof(struct p_rs_param);
3384                 data_size = pi->size - header_size;
3385         } else if (apv <= 94) {
3386                 header_size = sizeof(struct p_rs_param_89);
3387                 data_size = pi->size - header_size;
3388                 D_ASSERT(device, data_size == 0);
3389         } else {
3390                 header_size = sizeof(struct p_rs_param_95);
3391                 data_size = pi->size - header_size;
3392                 D_ASSERT(device, data_size == 0);
3393         }
3394
3395         /* initialize verify_alg and csums_alg */
3396         p = pi->data;
3397         memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
3398
3399         err = drbd_recv_all(peer_device->connection, p, header_size);
3400         if (err)
3401                 return err;
3402
3403         mutex_lock(&connection->resource->conf_update);
3404         old_net_conf = peer_device->connection->net_conf;
3405         if (get_ldev(device)) {
3406                 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3407                 if (!new_disk_conf) {
3408                         put_ldev(device);
3409                         mutex_unlock(&connection->resource->conf_update);
3410                         drbd_err(device, "Allocation of new disk_conf failed\n");
3411                         return -ENOMEM;
3412                 }
3413
3414                 old_disk_conf = device->ldev->disk_conf;
3415                 *new_disk_conf = *old_disk_conf;
3416
3417                 new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
3418         }
3419
3420         if (apv >= 88) {
3421                 if (apv == 88) {
3422                         if (data_size > SHARED_SECRET_MAX || data_size == 0) {
3423                                 drbd_err(device, "verify-alg of wrong size, "
3424                                         "peer wants %u, accepting only up to %u byte\n",
3425                                         data_size, SHARED_SECRET_MAX);
3426                                 err = -EIO;
3427                                 goto reconnect;
3428                         }
3429
3430                         err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size);
3431                         if (err)
3432                                 goto reconnect;
3433                         /* we expect NUL terminated string */
3434                         /* but just in case someone tries to be evil */
3435                         D_ASSERT(device, p->verify_alg[data_size-1] == 0);
3436                         p->verify_alg[data_size-1] = 0;
3437
3438                 } else /* apv >= 89 */ {
3439                         /* we still expect NUL terminated strings */
3440                         /* but just in case someone tries to be evil */
3441                         D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0);
3442                         D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0);
3443                         p->verify_alg[SHARED_SECRET_MAX-1] = 0;
3444                         p->csums_alg[SHARED_SECRET_MAX-1] = 0;
3445                 }
3446
3447                 if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
3448                         if (device->state.conn == C_WF_REPORT_PARAMS) {
3449                                 drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
3450                                     old_net_conf->verify_alg, p->verify_alg);
3451                                 goto disconnect;
3452                         }
3453                         verify_tfm = drbd_crypto_alloc_digest_safe(device,
3454                                         p->verify_alg, "verify-alg");
3455                         if (IS_ERR(verify_tfm)) {
3456                                 verify_tfm = NULL;
3457                                 goto disconnect;
3458                         }
3459                 }
3460
3461                 if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
3462                         if (device->state.conn == C_WF_REPORT_PARAMS) {
3463                                 drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
3464                                     old_net_conf->csums_alg, p->csums_alg);
3465                                 goto disconnect;
3466                         }
3467                         csums_tfm = drbd_crypto_alloc_digest_safe(device,
3468                                         p->csums_alg, "csums-alg");
3469                         if (IS_ERR(csums_tfm)) {
3470                                 csums_tfm = NULL;
3471                                 goto disconnect;
3472                         }
3473                 }
3474
3475                 if (apv > 94 && new_disk_conf) {
3476                         new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
3477                         new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
3478                         new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
3479                         new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
3480
3481                         fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
3482                         if (fifo_size != device->rs_plan_s->size) {
3483                                 new_plan = fifo_alloc(fifo_size);
3484                                 if (!new_plan) {
3485                                         drbd_err(device, "kmalloc of fifo_buffer failed");
3486                                         put_ldev(device);
3487                                         goto disconnect;
3488                                 }
3489                         }
3490                 }
3491
3492                 if (verify_tfm || csums_tfm) {
3493                         new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
3494                         if (!new_net_conf) {
3495                                 drbd_err(device, "Allocation of new net_conf failed\n");
3496                                 goto disconnect;
3497                         }
3498
3499                         *new_net_conf = *old_net_conf;
3500
3501                         if (verify_tfm) {
3502                                 strcpy(new_net_conf->verify_alg, p->verify_alg);
3503                                 new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
3504                                 crypto_free_hash(peer_device->connection->verify_tfm);
3505                                 peer_device->connection->verify_tfm = verify_tfm;
3506                                 drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg);
3507                         }
3508                         if (csums_tfm) {
3509                                 strcpy(new_net_conf->csums_alg, p->csums_alg);
3510                                 new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
3511                                 crypto_free_hash(peer_device->connection->csums_tfm);
3512                                 peer_device->connection->csums_tfm = csums_tfm;
3513                                 drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg);
3514                         }
3515                         rcu_assign_pointer(connection->net_conf, new_net_conf);
3516                 }
3517         }
3518
3519         if (new_disk_conf) {
3520                 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
3521                 put_ldev(device);
3522         }
3523
3524         if (new_plan) {
3525                 old_plan = device->rs_plan_s;
3526                 rcu_assign_pointer(device->rs_plan_s, new_plan);
3527         }
3528
3529         mutex_unlock(&connection->resource->conf_update);
3530         synchronize_rcu();
3531         if (new_net_conf)
3532                 kfree(old_net_conf);
3533         kfree(old_disk_conf);
3534         kfree(old_plan);
3535
3536         return 0;
3537
3538 reconnect:
3539         if (new_disk_conf) {
3540                 put_ldev(device);
3541                 kfree(new_disk_conf);
3542         }
3543         mutex_unlock(&connection->resource->conf_update);
3544         return -EIO;
3545
3546 disconnect:
3547         kfree(new_plan);
3548         if (new_disk_conf) {
3549                 put_ldev(device);
3550                 kfree(new_disk_conf);
3551         }
3552         mutex_unlock(&connection->resource->conf_update);
3553         /* just for completeness: actually not needed,
3554          * as this is not reached if csums_tfm was ok. */
3555         crypto_free_hash(csums_tfm);
3556         /* but free the verify_tfm again, if csums_tfm did not work out */
3557         crypto_free_hash(verify_tfm);
3558         conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
3559         return -EIO;
3560 }
3561
3562 /* warn if the arguments differ by more than 12.5% */
3563 static void warn_if_differ_considerably(struct drbd_device *device,
3564         const char *s, sector_t a, sector_t b)
3565 {
3566         sector_t d;
3567         if (a == 0 || b == 0)
3568                 return;
3569         d = (a > b) ? (a - b) : (b - a);
3570         if (d > (a>>3) || d > (b>>3))
3571                 drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s,
3572                      (unsigned long long)a, (unsigned long long)b);
3573 }
3574
3575 static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
3576 {
3577         struct drbd_peer_device *peer_device;
3578         struct drbd_device *device;
3579         struct p_sizes *p = pi->data;
3580         enum determine_dev_size dd = DS_UNCHANGED;
3581         sector_t p_size, p_usize, my_usize;
3582         int ldsc = 0; /* local disk size changed */
3583         enum dds_flags ddsf;
3584
3585         peer_device = conn_peer_device(connection, pi->vnr);
3586         if (!peer_device)
3587                 return config_unknown_volume(connection, pi);
3588         device = peer_device->device;
3589
3590         p_size = be64_to_cpu(p->d_size);
3591         p_usize = be64_to_cpu(p->u_size);
3592
3593         /* just store the peer's disk size for now.
3594          * we still need to figure out whether we accept that. */
3595         device->p_size = p_size;
3596
3597         if (get_ldev(device)) {
3598                 rcu_read_lock();
3599                 my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
3600                 rcu_read_unlock();
3601
3602                 warn_if_differ_considerably(device, "lower level device sizes",
3603                            p_size, drbd_get_max_capacity(device->ldev));
3604                 warn_if_differ_considerably(device, "user requested size",
3605                                             p_usize, my_usize);
3606
3607                 /* if this is the first connect, or an otherwise expected
3608                  * param exchange, choose the minimum */
3609                 if (device->state.conn == C_WF_REPORT_PARAMS)
3610                         p_usize = min_not_zero(my_usize, p_usize);
3611
3612                 /* Never shrink a device with usable data during connect.
3613                    But allow online shrinking if we are connected. */
3614                 if (drbd_new_dev_size(device, device->ldev, p_usize, 0) <
3615                     drbd_get_capacity(device->this_bdev) &&
3616                     device->state.disk >= D_OUTDATED &&
3617                     device->state.conn < C_CONNECTED) {
3618                         drbd_err(device, "The peer's disk size is too small!\n");
3619                         conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
3620                         put_ldev(device);
3621                         return -EIO;
3622                 }
3623
3624                 if (my_usize != p_usize) {
3625                         struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
3626
3627                         new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3628                         if (!new_disk_conf) {
3629                                 drbd_err(device, "Allocation of new disk_conf failed\n");
3630                                 put_ldev(device);
3631                                 return -ENOMEM;
3632                         }
3633
3634                         mutex_lock(&connection->resource->conf_update);
3635                         old_disk_conf = device->ldev->disk_conf;
3636                         *new_disk_conf = *old_disk_conf;
3637                         new_disk_conf->disk_size = p_usize;
3638
3639                         rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
3640                         mutex_unlock(&connection->resource->conf_update);
3641                         synchronize_rcu();
3642                         kfree(old_disk_conf);
3643
3644                         drbd_info(device, "Peer sets u_size to %lu sectors\n",
3645                                  (unsigned long)my_usize);
3646                 }
3647
3648                 put_ldev(device);
3649         }
3650
3651         ddsf = be16_to_cpu(p->dds_flags);
3652         if (get_ldev(device)) {
3653                 dd = drbd_determine_dev_size(device, ddsf, NULL);
3654                 put_ldev(device);
3655                 if (dd == DS_ERROR)
3656                         return -EIO;
3657                 drbd_md_sync(device);
3658         } else {
3659                 /* I am diskless, need to accept the peer's size. */
3660                 drbd_set_my_capacity(device, p_size);
3661         }
3662
3663         device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
3664         drbd_reconsider_max_bio_size(device);
3665
3666         if (get_ldev(device)) {
3667                 if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
3668                         device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
3669                         ldsc = 1;
3670                 }
3671
3672                 put_ldev(device);
3673         }
3674
3675         if (device->state.conn > C_WF_REPORT_PARAMS) {
3676                 if (be64_to_cpu(p->c_size) !=
3677                     drbd_get_capacity(device->this_bdev) || ldsc) {
3678                         /* we have different sizes, probably peer
3679                          * needs to know my new size... */
3680                         drbd_send_sizes(peer_device, 0, ddsf);
3681                 }
3682                 if (test_and_clear_bit(RESIZE_PENDING, &device->flags) ||
3683                     (dd == DS_GREW && device->state.conn == C_CONNECTED)) {
3684                         if (device->state.pdsk >= D_INCONSISTENT &&
3685                             device->state.disk >= D_INCONSISTENT) {
3686                                 if (ddsf & DDSF_NO_RESYNC)
3687                                         drbd_info(device, "Resync of new storage suppressed with --assume-clean\n");
3688                                 else
3689                                         resync_after_online_grow(device);
3690                         } else
3691                                 set_bit(RESYNC_AFTER_NEG, &device->flags);
3692                 }
3693         }
3694
3695         return 0;
3696 }
3697
3698 static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
3699 {
3700         struct drbd_peer_device *peer_device;
3701         struct drbd_device *device;
3702         struct p_uuids *p = pi->data;
3703         u64 *p_uuid;
3704         int i, updated_uuids = 0;
3705
3706         peer_device = conn_peer_device(connection, pi->vnr);
3707         if (!peer_device)
3708                 return config_unknown_volume(connection, pi);
3709         device = peer_device->device;
3710
3711         p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
3712         if (!p_uuid) {
3713                 drbd_err(device, "kmalloc of p_uuid failed\n");
3714                 return false;
3715         }
3716
3717         for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
3718                 p_uuid[i] = be64_to_cpu(p->uuid[i]);
3719
3720         kfree(device->p_uuid);
3721         device->p_uuid = p_uuid;
3722
3723         if (device->state.conn < C_CONNECTED &&
3724             device->state.disk < D_INCONSISTENT &&
3725             device->state.role == R_PRIMARY &&
3726             (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
3727                 drbd_err(device, "Can only connect to data with current UUID=%016llX\n",
3728                     (unsigned long long)device->ed_uuid);
3729                 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
3730                 return -EIO;
3731         }
3732
3733         if (get_ldev(device)) {
3734                 int skip_initial_sync =
3735                         device->state.conn == C_CONNECTED &&
3736                         peer_device->connection->agreed_pro_version >= 90 &&
3737                         device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
3738                         (p_uuid[UI_FLAGS] & 8);
3739                 if (skip_initial_sync) {
3740                         drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
3741                         drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
3742                                         "clear_n_write from receive_uuids",
3743                                         BM_LOCKED_TEST_ALLOWED);
3744                         _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
3745                         _drbd_uuid_set(device, UI_BITMAP, 0);
3746                         _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3747                                         CS_VERBOSE, NULL);
3748                         drbd_md_sync(device);
3749                         updated_uuids = 1;
3750                 }
3751                 put_ldev(device);
3752         } else if (device->state.disk < D_INCONSISTENT &&
3753                    device->state.role == R_PRIMARY) {
3754                 /* I am a diskless primary, the peer just created a new current UUID
3755                    for me. */
3756                 updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
3757         }
3758
3759         /* Before we test for the disk state, we should wait until an eventually
3760            ongoing cluster wide state change is finished. That is important if
3761            we are primary and are detaching from our disk. We need to see the
3762            new disk state... */
3763         mutex_lock(device->state_mutex);
3764         mutex_unlock(device->state_mutex);
3765         if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT)
3766                 updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
3767
3768         if (updated_uuids)
3769                 drbd_print_uuids(device, "receiver updated UUIDs to");
3770
3771         return 0;
3772 }
3773
3774 /**
3775  * convert_state() - Converts the peer's view of the cluster state to our point of view
3776  * @ps:         The state as seen by the peer.
3777  */
3778 static union drbd_state convert_state(union drbd_state ps)
3779 {
3780         union drbd_state ms;
3781
3782         static enum drbd_conns c_tab[] = {
3783                 [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
3784                 [C_CONNECTED] = C_CONNECTED,
3785
3786                 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3787                 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3788                 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3789                 [C_VERIFY_S]       = C_VERIFY_T,
3790                 [C_MASK]   = C_MASK,
3791         };
3792
3793         ms.i = ps.i;
3794
3795         ms.conn = c_tab[ps.conn];
3796         ms.peer = ps.role;
3797         ms.role = ps.peer;
3798         ms.pdsk = ps.disk;
3799         ms.disk = ps.pdsk;
3800         ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3801
3802         return ms;
3803 }
3804
3805 static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
3806 {
3807         struct drbd_peer_device *peer_device;
3808         struct drbd_device *device;
3809         struct p_req_state *p = pi->data;
3810         union drbd_state mask, val;
3811         enum drbd_state_rv rv;
3812
3813         peer_device = conn_peer_device(connection, pi->vnr);
3814         if (!peer_device)
3815                 return -EIO;
3816         device = peer_device->device;
3817
3818         mask.i = be32_to_cpu(p->mask);
3819         val.i = be32_to_cpu(p->val);
3820
3821         if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) &&
3822             mutex_is_locked(device->state_mutex)) {
3823                 drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG);
3824                 return 0;
3825         }
3826
3827         mask = convert_state(mask);
3828         val = convert_state(val);
3829
3830         rv = drbd_change_state(device, CS_VERBOSE, mask, val);
3831         drbd_send_sr_reply(peer_device, rv);
3832
3833         drbd_md_sync(device);
3834
3835         return 0;
3836 }
3837
3838 static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi)
3839 {
3840         struct p_req_state *p = pi->data;
3841         union drbd_state mask, val;
3842         enum drbd_state_rv rv;
3843
3844         mask.i = be32_to_cpu(p->mask);
3845         val.i = be32_to_cpu(p->val);
3846
3847         if (test_bit(RESOLVE_CONFLICTS, &connection->flags) &&
3848             mutex_is_locked(&connection->cstate_mutex)) {
3849                 conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG);
3850                 return 0;
3851         }
3852
3853         mask = convert_state(mask);
3854         val = convert_state(val);
3855
3856         rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
3857         conn_send_sr_reply(connection, rv);
3858
3859         return 0;
3860 }
3861
3862 static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
3863 {
3864         struct drbd_peer_device *peer_device;
3865         struct drbd_device *device;
3866         struct p_state *p = pi->data;
3867         union drbd_state os, ns, peer_state;
3868         enum drbd_disk_state real_peer_disk;
3869         enum chg_state_flags cs_flags;
3870         int rv;
3871
3872         peer_device = conn_peer_device(connection, pi->vnr);
3873         if (!peer_device)
3874                 return config_unknown_volume(connection, pi);
3875         device = peer_device->device;
3876
3877         peer_state.i = be32_to_cpu(p->state);
3878
3879         real_peer_disk = peer_state.disk;
3880         if (peer_state.disk == D_NEGOTIATING) {
3881                 real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
3882                 drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
3883         }
3884
3885         spin_lock_irq(&device->resource->req_lock);
3886  retry:
3887         os = ns = drbd_read_state(device);
3888         spin_unlock_irq(&device->resource->req_lock);
3889
3890         /* If some other part of the code (asender thread, timeout)
3891          * already decided to close the connection again,
3892          * we must not "re-establish" it here. */
3893         if (os.conn <= C_TEAR_DOWN)
3894                 return -ECONNRESET;
3895
3896         /* If this is the "end of sync" confirmation, usually the peer disk
3897          * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
3898          * set) resync started in PausedSyncT, or if the timing of pause-/
3899          * unpause-sync events has been "just right", the peer disk may
3900          * transition from D_CONSISTENT to D_UP_TO_DATE as well.
3901          */
3902         if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
3903             real_peer_disk == D_UP_TO_DATE &&
3904             os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
3905                 /* If we are (becoming) SyncSource, but peer is still in sync
3906                  * preparation, ignore its uptodate-ness to avoid flapping, it
3907                  * will change to inconsistent once the peer reaches active
3908                  * syncing states.
3909                  * It may have changed syncer-paused flags, however, so we
3910                  * cannot ignore this completely. */
3911                 if (peer_state.conn > C_CONNECTED &&
3912                     peer_state.conn < C_SYNC_SOURCE)
3913                         real_peer_disk = D_INCONSISTENT;
3914
3915                 /* if peer_state changes to connected at the same time,
3916                  * it explicitly notifies us that it finished resync.
3917                  * Maybe we should finish it up, too? */
3918                 else if (os.conn >= C_SYNC_SOURCE &&
3919                          peer_state.conn == C_CONNECTED) {
3920                         if (drbd_bm_total_weight(device) <= device->rs_failed)
3921                                 drbd_resync_finished(device);
3922                         return 0;
3923                 }
3924         }
3925
3926         /* explicit verify finished notification, stop sector reached. */
3927         if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
3928             peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
3929                 ov_out_of_sync_print(device);
3930                 drbd_resync_finished(device);
3931                 return 0;
3932         }
3933
3934         /* peer says his disk is inconsistent, while we think it is uptodate,
3935          * and this happens while the peer still thinks we have a sync going on,
3936          * but we think we are already done with the sync.
3937          * We ignore this to avoid flapping pdsk.
3938          * This should not happen, if the peer is a recent version of drbd. */
3939         if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
3940             os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
3941                 real_peer_disk = D_UP_TO_DATE;
3942
3943         if (ns.conn == C_WF_REPORT_PARAMS)
3944                 ns.conn = C_CONNECTED;
3945
3946         if (peer_state.conn == C_AHEAD)
3947                 ns.conn = C_BEHIND;
3948
3949         if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3950             get_ldev_if_state(device, D_NEGOTIATING)) {
3951                 int cr; /* consider resync */
3952
3953                 /* if we established a new connection */
3954                 cr  = (os.conn < C_CONNECTED);
3955                 /* if we had an established connection
3956                  * and one of the nodes newly attaches a disk */
3957                 cr |= (os.conn == C_CONNECTED &&
3958                        (peer_state.disk == D_NEGOTIATING ||
3959                         os.disk == D_NEGOTIATING));
3960                 /* if we have both been inconsistent, and the peer has been
3961                  * forced to be UpToDate with --overwrite-data */
3962                 cr |= test_bit(CONSIDER_RESYNC, &device->flags);
3963                 /* if we had been plain connected, and the admin requested to
3964                  * start a sync by "invalidate" or "invalidate-remote" */
3965                 cr |= (os.conn == C_CONNECTED &&
3966                                 (peer_state.conn >= C_STARTING_SYNC_S &&
3967                                  peer_state.conn <= C_WF_BITMAP_T));
3968
3969                 if (cr)
3970                         ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk);
3971
3972                 put_ldev(device);
3973                 if (ns.conn == C_MASK) {
3974                         ns.conn = C_CONNECTED;
3975                         if (device->state.disk == D_NEGOTIATING) {
3976                                 drbd_force_state(device, NS(disk, D_FAILED));
3977                         } else if (peer_state.disk == D_NEGOTIATING) {
3978                                 drbd_err(device, "Disk attach process on the peer node was aborted.\n");
3979                                 peer_state.disk = D_DISKLESS;
3980                                 real_peer_disk = D_DISKLESS;
3981                         } else {
3982                                 if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags))
3983                                         return -EIO;
3984                                 D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS);
3985                                 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
3986                                 return -EIO;
3987                         }
3988                 }
3989         }
3990
3991         spin_lock_irq(&device->resource->req_lock);
3992         if (os.i != drbd_read_state(device).i)
3993                 goto retry;
3994         clear_bit(CONSIDER_RESYNC, &device->flags);
3995         ns.peer = peer_state.role;
3996         ns.pdsk = real_peer_disk;
3997         ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
3998         if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
3999                 ns.disk = device->new_state_tmp.disk;
4000         cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
4001         if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
4002             test_bit(NEW_CUR_UUID, &device->flags)) {
4003                 /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
4004                    for temporal network outages! */
4005                 spin_unlock_irq(&device->resource->req_lock);
4006                 drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
4007                 tl_clear(peer_device->connection);
4008                 drbd_uuid_new_current(device);
4009                 clear_bit(NEW_CUR_UUID, &device->flags);
4010                 conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
4011                 return -EIO;
4012         }
4013         rv = _drbd_set_state(device, ns, cs_flags, NULL);
4014         ns = drbd_read_state(device);
4015         spin_unlock_irq(&device->resource->req_lock);
4016
4017         if (rv < SS_SUCCESS) {
4018                 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4019                 return -EIO;
4020         }
4021
4022         if (os.conn > C_WF_REPORT_PARAMS) {
4023                 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
4024                     peer_state.disk != D_NEGOTIATING ) {
4025                         /* we want resync, peer has not yet decided to sync... */
4026                         /* Nowadays only used when forcing a node into primary role and
4027                            setting its disk to UpToDate with that */
4028                         drbd_send_uuids(peer_device);
4029                         drbd_send_current_state(peer_device);
4030                 }
4031         }
4032
4033         clear_bit(DISCARD_MY_DATA, &device->flags);
4034
4035         drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */
4036
4037         return 0;
4038 }
4039
4040 static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
4041 {
4042         struct drbd_peer_device *peer_device;
4043         struct drbd_device *device;
4044         struct p_rs_uuid *p = pi->data;
4045
4046         peer_device = conn_peer_device(connection, pi->vnr);
4047         if (!peer_device)
4048                 return -EIO;
4049         device = peer_device->device;
4050
4051         wait_event(device->misc_wait,
4052                    device->state.conn == C_WF_SYNC_UUID ||
4053                    device->state.conn == C_BEHIND ||
4054                    device->state.conn < C_CONNECTED ||
4055                    device->state.disk < D_NEGOTIATING);
4056
4057         /* D_ASSERT(device,  device->state.conn == C_WF_SYNC_UUID ); */
4058
4059         /* Here the _drbd_uuid_ functions are right, current should
4060            _not_ be rotated into the history */
4061         if (get_ldev_if_state(device, D_NEGOTIATING)) {
4062                 _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid));
4063                 _drbd_uuid_set(device, UI_BITMAP, 0UL);
4064
4065                 drbd_print_uuids(device, "updated sync uuid");
4066                 drbd_start_resync(device, C_SYNC_TARGET);
4067
4068                 put_ldev(device);
4069         } else
4070                 drbd_err(device, "Ignoring SyncUUID packet!\n");
4071
4072         return 0;
4073 }
4074
4075 /**
4076  * receive_bitmap_plain
4077  *
4078  * Return 0 when done, 1 when another iteration is needed, and a negative error
4079  * code upon failure.
4080  */
4081 static int
4082 receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
4083                      unsigned long *p, struct bm_xfer_ctx *c)
4084 {
4085         unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
4086                                  drbd_header_size(peer_device->connection);
4087         unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
4088                                        c->bm_words - c->word_offset);
4089         unsigned int want = num_words * sizeof(*p);
4090         int err;
4091
4092         if (want != size) {
4093                 drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size);
4094                 return -EIO;
4095         }
4096         if (want == 0)
4097                 return 0;
4098         err = drbd_recv_all(peer_device->connection, p, want);
4099         if (err)
4100                 return err;
4101
4102         drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p);
4103
4104         c->word_offset += num_words;
4105         c->bit_offset = c->word_offset * BITS_PER_LONG;
4106         if (c->bit_offset > c->bm_bits)
4107                 c->bit_offset = c->bm_bits;
4108
4109         return 1;
4110 }
4111
4112 static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
4113 {
4114         return (enum drbd_bitmap_code)(p->encoding & 0x0f);
4115 }
4116
4117 static int dcbp_get_start(struct p_compressed_bm *p)
4118 {
4119         return (p->encoding & 0x80) != 0;
4120 }
4121
4122 static int dcbp_get_pad_bits(struct p_compressed_bm *p)
4123 {
4124         return (p->encoding >> 4) & 0x7;
4125 }
4126
4127 /**
4128  * recv_bm_rle_bits
4129  *
4130  * Return 0 when done, 1 when another iteration is needed, and a negative error
4131  * code upon failure.
4132  */
4133 static int
4134 recv_bm_rle_bits(struct drbd_peer_device *peer_device,
4135                 struct p_compressed_bm *p,
4136                  struct bm_xfer_ctx *c,
4137                  unsigned int len)
4138 {
4139         struct bitstream bs;
4140         u64 look_ahead;
4141         u64 rl;
4142         u64 tmp;
4143         unsigned long s = c->bit_offset;
4144         unsigned long e;
4145         int toggle = dcbp_get_start(p);
4146         int have;
4147         int bits;
4148
4149         bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
4150
4151         bits = bitstream_get_bits(&bs, &look_ahead, 64);
4152         if (bits < 0)
4153                 return -EIO;
4154
4155         for (have = bits; have > 0; s += rl, toggle = !toggle) {
4156                 bits = vli_decode_bits(&rl, look_ahead);
4157                 if (bits <= 0)
4158                         return -EIO;
4159
4160                 if (toggle) {
4161                         e = s + rl -1;
4162                         if (e >= c->bm_bits) {
4163                                 drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
4164                                 return -EIO;
4165                         }
4166                         _drbd_bm_set_bits(peer_device->device, s, e);
4167                 }
4168
4169                 if (have < bits) {
4170                         drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
4171                                 have, bits, look_ahead,
4172                                 (unsigned int)(bs.cur.b - p->code),
4173                                 (unsigned int)bs.buf_len);
4174                         return -EIO;
4175                 }
4176                 /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
4177                 if (likely(bits < 64))
4178                         look_ahead >>= bits;
4179                 else
4180                         look_ahead = 0;
4181                 have -= bits;
4182
4183                 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
4184                 if (bits < 0)
4185                         return -EIO;
4186                 look_ahead |= tmp << have;
4187                 have += bits;
4188         }
4189
4190         c->bit_offset = s;
4191         bm_xfer_ctx_bit_to_word_offset(c);
4192
4193         return (s != c->bm_bits);
4194 }
4195
4196 /**
4197  * decode_bitmap_c
4198  *
4199  * Return 0 when done, 1 when another iteration is needed, and a negative error
4200  * code upon failure.
4201  */
4202 static int
4203 decode_bitmap_c(struct drbd_peer_device *peer_device,
4204                 struct p_compressed_bm *p,
4205                 struct bm_xfer_ctx *c,
4206                 unsigned int len)
4207 {
4208         if (dcbp_get_code(p) == RLE_VLI_Bits)
4209                 return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
4210
4211         /* other variants had been implemented for evaluation,
4212          * but have been dropped as this one turned out to be "best"
4213          * during all our tests. */
4214
4215         drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
4216         conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
4217         return -EIO;
4218 }
4219
4220 void INFO_bm_xfer_stats(struct drbd_device *device,
4221                 const char *direction, struct bm_xfer_ctx *c)
4222 {
4223         /* what would it take to transfer it "plaintext" */
4224         unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
4225         unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
4226         unsigned int plain =
4227                 header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
4228                 c->bm_words * sizeof(unsigned long);
4229         unsigned int total = c->bytes[0] + c->bytes[1];
4230         unsigned int r;
4231
4232         /* total can not be zero. but just in case: */
4233         if (total == 0)
4234                 return;
4235
4236         /* don't report if not compressed */
4237         if (total >= plain)
4238                 return;
4239
4240         /* total < plain. check for overflow, still */
4241         r = (total > UINT_MAX/1000) ? (total / (plain/1000))
4242                                     : (1000 * total / plain);
4243
4244         if (r > 1000)
4245                 r = 1000;
4246
4247         r = 1000 - r;
4248         drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
4249              "total %u; compression: %u.%u%%\n",
4250                         direction,
4251                         c->bytes[1], c->packets[1],
4252                         c->bytes[0], c->packets[0],
4253                         total, r/10, r % 10);
4254 }
4255
4256 /* Since we are processing the bitfield from lower addresses to higher,
4257    it does not matter if the process it in 32 bit chunks or 64 bit
4258    chunks as long as it is little endian. (Understand it as byte stream,
4259    beginning with the lowest byte...) If we would use big endian
4260    we would need to process it from the highest address to the lowest,
4261    in order to be agnostic to the 32 vs 64 bits issue.
4262
4263    returns 0 on failure, 1 if we successfully received it. */
4264 static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
4265 {
4266         struct drbd_peer_device *peer_device;
4267         struct drbd_device *device;
4268         struct bm_xfer_ctx c;
4269         int err;
4270
4271         peer_device = conn_peer_device(connection, pi->vnr);
4272         if (!peer_device)
4273                 return -EIO;
4274         device = peer_device->device;
4275
4276         drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED);
4277         /* you are supposed to send additional out-of-sync information
4278          * if you actually set bits during this phase */
4279
4280         c = (struct bm_xfer_ctx) {
4281                 .bm_bits = drbd_bm_bits(device),
4282                 .bm_words = drbd_bm_words(device),
4283         };
4284
4285         for(;;) {
4286                 if (pi->cmd == P_BITMAP)
4287                         err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c);
4288                 else if (pi->cmd == P_COMPRESSED_BITMAP) {
4289                         /* MAYBE: sanity check that we speak proto >= 90,
4290                          * and the feature is enabled! */
4291                         struct p_compressed_bm *p = pi->data;
4292
4293                         if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
4294                                 drbd_err(device, "ReportCBitmap packet too large\n");
4295                                 err = -EIO;
4296                                 goto out;
4297                         }
4298                         if (pi->size <= sizeof(*p)) {
4299                                 drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
4300                                 err = -EIO;
4301                                 goto out;
4302                         }
4303                         err = drbd_recv_all(peer_device->connection, p, pi->size);
4304                         if (err)
4305                                goto out;
4306                         err = decode_bitmap_c(peer_device, p, &c, pi->size);
4307                 } else {
4308                         drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
4309                         err = -EIO;
4310                         goto out;
4311                 }
4312
4313                 c.packets[pi->cmd == P_BITMAP]++;
4314                 c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
4315
4316                 if (err <= 0) {
4317                         if (err < 0)
4318                                 goto out;
4319                         break;
4320                 }
4321                 err = drbd_recv_header(peer_device->connection, pi);
4322                 if (err)
4323                         goto out;
4324         }
4325
4326         INFO_bm_xfer_stats(device, "receive", &c);
4327
4328         if (device->state.conn == C_WF_BITMAP_T) {
4329                 enum drbd_state_rv rv;
4330
4331                 err = drbd_send_bitmap(device);
4332                 if (err)
4333                         goto out;
4334                 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
4335                 rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
4336                 D_ASSERT(device, rv == SS_SUCCESS);
4337         } else if (device->state.conn != C_WF_BITMAP_S) {
4338                 /* admin may have requested C_DISCONNECTING,
4339                  * other threads may have noticed network errors */
4340                 drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n",
4341                     drbd_conn_str(device->state.conn));
4342         }
4343         err = 0;
4344
4345  out:
4346         drbd_bm_unlock(device);
4347         if (!err && device->state.conn == C_WF_BITMAP_S)
4348                 drbd_start_resync(device, C_SYNC_SOURCE);
4349         return err;
4350 }
4351
4352 static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
4353 {
4354         drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n",
4355                  pi->cmd, pi->size);
4356
4357         return ignore_remaining_packet(connection, pi);
4358 }
4359
4360 static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
4361 {
4362         /* Make sure we've acked all the TCP data associated
4363          * with the data requests being unplugged */
4364         drbd_tcp_quickack(connection->data.socket);
4365
4366         return 0;
4367 }
4368
4369 static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
4370 {
4371         struct drbd_peer_device *peer_device;
4372         struct drbd_device *device;
4373         struct p_block_desc *p = pi->data;
4374
4375         peer_device = conn_peer_device(connection, pi->vnr);
4376         if (!peer_device)
4377                 return -EIO;
4378         device = peer_device->device;
4379
4380         switch (device->state.conn) {
4381         case C_WF_SYNC_UUID:
4382         case C_WF_BITMAP_T:
4383         case C_BEHIND:
4384                         break;
4385         default:
4386                 drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
4387                                 drbd_conn_str(device->state.conn));
4388         }
4389
4390         drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
4391
4392         return 0;
4393 }
4394
4395 struct data_cmd {
4396         int expect_payload;
4397         size_t pkt_size;
4398         int (*fn)(struct drbd_connection *, struct packet_info *);
4399 };
4400
4401 static struct data_cmd drbd_cmd_handler[] = {
4402         [P_DATA]            = { 1, sizeof(struct p_data), receive_Data },
4403         [P_DATA_REPLY]      = { 1, sizeof(struct p_data), receive_DataReply },
4404         [P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply } ,
4405         [P_BARRIER]         = { 0, sizeof(struct p_barrier), receive_Barrier } ,
4406         [P_BITMAP]          = { 1, 0, receive_bitmap } ,
4407         [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
4408         [P_UNPLUG_REMOTE]   = { 0, 0, receive_UnplugRemote },
4409         [P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
4410         [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
4411         [P_SYNC_PARAM]      = { 1, 0, receive_SyncParam },
4412         [P_SYNC_PARAM89]    = { 1, 0, receive_SyncParam },
4413         [P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
4414         [P_UUIDS]           = { 0, sizeof(struct p_uuids), receive_uuids },
4415         [P_SIZES]           = { 0, sizeof(struct p_sizes), receive_sizes },
4416         [P_STATE]           = { 0, sizeof(struct p_state), receive_state },
4417         [P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
4418         [P_SYNC_UUID]       = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
4419         [P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
4420         [P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
4421         [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
4422         [P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
4423         [P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
4424         [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
4425         [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
4426 };
4427
4428 static void drbdd(struct drbd_connection *connection)
4429 {
4430         struct packet_info pi;
4431         size_t shs; /* sub header size */
4432         int err;
4433
4434         while (get_t_state(&connection->receiver) == RUNNING) {
4435                 struct data_cmd *cmd;
4436
4437                 drbd_thread_current_set_cpu(&connection->receiver);
4438                 if (drbd_recv_header(connection, &pi))
4439                         goto err_out;
4440
4441                 cmd = &drbd_cmd_handler[pi.cmd];
4442                 if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
4443                         drbd_err(connection, "Unexpected data packet %s (0x%04x)",
4444                                  cmdname(pi.cmd), pi.cmd);
4445                         goto err_out;
4446                 }
4447
4448                 shs = cmd->pkt_size;
4449                 if (pi.size > shs && !cmd->expect_payload) {
4450                         drbd_err(connection, "No payload expected %s l:%d\n",
4451                                  cmdname(pi.cmd), pi.size);
4452                         goto err_out;
4453                 }
4454
4455                 if (shs) {
4456                         err = drbd_recv_all_warn(connection, pi.data, shs);
4457                         if (err)
4458                                 goto err_out;
4459                         pi.size -= shs;
4460                 }
4461
4462                 err = cmd->fn(connection, &pi);
4463                 if (err) {
4464                         drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
4465                                  cmdname(pi.cmd), err, pi.size);
4466                         goto err_out;
4467                 }
4468         }
4469         return;
4470
4471     err_out:
4472         conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
4473 }
4474
4475 static void conn_disconnect(struct drbd_connection *connection)
4476 {
4477         struct drbd_peer_device *peer_device;
4478         enum drbd_conns oc;
4479         int vnr;
4480
4481         if (connection->cstate == C_STANDALONE)
4482                 return;
4483
4484         /* We are about to start the cleanup after connection loss.
4485          * Make sure drbd_make_request knows about that.
4486          * Usually we should be in some network failure state already,
4487          * but just in case we are not, we fix it up here.
4488          */
4489         conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
4490
4491         /* asender does not clean up anything. it must not interfere, either */
4492         drbd_thread_stop(&connection->asender);
4493         drbd_free_sock(connection);
4494
4495         rcu_read_lock();
4496         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
4497                 struct drbd_device *device = peer_device->device;
4498                 kref_get(&device->kref);
4499                 rcu_read_unlock();
4500                 drbd_disconnected(peer_device);
4501                 kref_put(&device->kref, drbd_destroy_device);
4502                 rcu_read_lock();
4503         }
4504         rcu_read_unlock();
4505
4506         if (!list_empty(&connection->current_epoch->list))
4507                 drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n");
4508         /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
4509         atomic_set(&connection->current_epoch->epoch_size, 0);
4510         connection->send.seen_any_write_yet = false;
4511
4512         drbd_info(connection, "Connection closed\n");
4513
4514         if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN)
4515                 conn_try_outdate_peer_async(connection);
4516
4517         spin_lock_irq(&connection->resource->req_lock);
4518         oc = connection->cstate;
4519         if (oc >= C_UNCONNECTED)
4520                 _conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
4521
4522         spin_unlock_irq(&connection->resource->req_lock);
4523
4524         if (oc == C_DISCONNECTING)
4525                 conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
4526 }
4527
4528 static int drbd_disconnected(struct drbd_peer_device *peer_device)
4529 {
4530         struct drbd_device *device = peer_device->device;
4531         unsigned int i;
4532
4533         /* wait for current activity to cease. */
4534         spin_lock_irq(&device->resource->req_lock);
4535         _drbd_wait_ee_list_empty(device, &device->active_ee);
4536         _drbd_wait_ee_list_empty(device, &device->sync_ee);
4537         _drbd_wait_ee_list_empty(device, &device->read_ee);
4538         spin_unlock_irq(&device->resource->req_lock);
4539
4540         /* We do not have data structures that would allow us to
4541          * get the rs_pending_cnt down to 0 again.
4542          *  * On C_SYNC_TARGET we do not have any data structures describing
4543          *    the pending RSDataRequest's we have sent.
4544          *  * On C_SYNC_SOURCE there is no data structure that tracks
4545          *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
4546          *  And no, it is not the sum of the reference counts in the
4547          *  resync_LRU. The resync_LRU tracks the whole operation including
4548          *  the disk-IO, while the rs_pending_cnt only tracks the blocks
4549          *  on the fly. */
4550         drbd_rs_cancel_all(device);
4551         device->rs_total = 0;
4552         device->rs_failed = 0;
4553         atomic_set(&device->rs_pending_cnt, 0);
4554         wake_up(&device->misc_wait);
4555
4556         del_timer_sync(&device->resync_timer);
4557         resync_timer_fn((unsigned long)device);
4558
4559         /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
4560          * w_make_resync_request etc. which may still be on the worker queue
4561          * to be "canceled" */
4562         drbd_flush_workqueue(&peer_device->connection->sender_work);
4563
4564         drbd_finish_peer_reqs(device);
4565
4566         /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
4567            might have issued a work again. The one before drbd_finish_peer_reqs() is
4568            necessary to reclain net_ee in drbd_finish_peer_reqs(). */
4569         drbd_flush_workqueue(&peer_device->connection->sender_work);
4570
4571         /* need to do it again, drbd_finish_peer_reqs() may have populated it
4572          * again via drbd_try_clear_on_disk_bm(). */
4573         drbd_rs_cancel_all(device);
4574
4575         kfree(device->p_uuid);
4576         device->p_uuid = NULL;
4577
4578         if (!drbd_suspended(device))
4579                 tl_clear(peer_device->connection);
4580
4581         drbd_md_sync(device);
4582
4583         /* serialize with bitmap writeout triggered by the state change,
4584          * if any. */
4585         wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
4586
4587         /* tcp_close and release of sendpage pages can be deferred.  I don't
4588          * want to use SO_LINGER, because apparently it can be deferred for
4589          * more than 20 seconds (longest time I checked).
4590          *
4591          * Actually we don't care for exactly when the network stack does its
4592          * put_page(), but release our reference on these pages right here.
4593          */
4594         i = drbd_free_peer_reqs(device, &device->net_ee);
4595         if (i)
4596                 drbd_info(device, "net_ee not empty, killed %u entries\n", i);
4597         i = atomic_read(&device->pp_in_use_by_net);
4598         if (i)
4599                 drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
4600         i = atomic_read(&device->pp_in_use);
4601         if (i)
4602                 drbd_info(device, "pp_in_use = %d, expected 0\n", i);
4603
4604         D_ASSERT(device, list_empty(&device->read_ee));
4605         D_ASSERT(device, list_empty(&device->active_ee));
4606         D_ASSERT(device, list_empty(&device->sync_ee));
4607         D_ASSERT(device, list_empty(&device->done_ee));
4608
4609         return 0;
4610 }
4611
4612 /*
4613  * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
4614  * we can agree on is stored in agreed_pro_version.
4615  *
4616  * feature flags and the reserved array should be enough room for future
4617  * enhancements of the handshake protocol, and possible plugins...
4618  *
4619  * for now, they are expected to be zero, but ignored.
4620  */
4621 static int drbd_send_features(struct drbd_connection *connection)
4622 {
4623         struct drbd_socket *sock;
4624         struct p_connection_features *p;
4625
4626         sock = &connection->data;
4627         p = conn_prepare_command(connection, sock);
4628         if (!p)
4629                 return -EIO;
4630         memset(p, 0, sizeof(*p));
4631         p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
4632         p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
4633         return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
4634 }
4635
4636 /*
4637  * return values:
4638  *   1 yes, we have a valid connection
4639  *   0 oops, did not work out, please try again
4640  *  -1 peer talks different language,
4641  *     no point in trying again, please go standalone.
4642  */
4643 static int drbd_do_features(struct drbd_connection *connection)
4644 {
4645         /* ASSERT current == connection->receiver ... */
4646         struct p_connection_features *p;
4647         const int expect = sizeof(struct p_connection_features);
4648         struct packet_info pi;
4649         int err;
4650
4651         err = drbd_send_features(connection);
4652         if (err)
4653                 return 0;
4654
4655         err = drbd_recv_header(connection, &pi);
4656         if (err)
4657                 return 0;
4658
4659         if (pi.cmd != P_CONNECTION_FEATURES) {
4660                 drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
4661                          cmdname(pi.cmd), pi.cmd);
4662                 return -1;
4663         }
4664
4665         if (pi.size != expect) {
4666                 drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n",
4667                      expect, pi.size);
4668                 return -1;
4669         }
4670
4671         p = pi.data;
4672         err = drbd_recv_all_warn(connection, p, expect);
4673         if (err)
4674                 return 0;
4675
4676         p->protocol_min = be32_to_cpu(p->protocol_min);
4677         p->protocol_max = be32_to_cpu(p->protocol_max);
4678         if (p->protocol_max == 0)
4679                 p->protocol_max = p->protocol_min;
4680
4681         if (PRO_VERSION_MAX < p->protocol_min ||
4682             PRO_VERSION_MIN > p->protocol_max)
4683                 goto incompat;
4684
4685         connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
4686
4687         drbd_info(connection, "Handshake successful: "
4688              "Agreed network protocol version %d\n", connection->agreed_pro_version);
4689
4690         return 1;
4691
4692  incompat:
4693         drbd_err(connection, "incompatible DRBD dialects: "
4694             "I support %d-%d, peer supports %d-%d\n",
4695             PRO_VERSION_MIN, PRO_VERSION_MAX,
4696             p->protocol_min, p->protocol_max);
4697         return -1;
4698 }
4699
4700 #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
4701 static int drbd_do_auth(struct drbd_connection *connection)
4702 {
4703         drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
4704         drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
4705         return -1;
4706 }
4707 #else
4708 #define CHALLENGE_LEN 64
4709
4710 /* Return value:
4711         1 - auth succeeded,
4712         0 - failed, try again (network error),
4713         -1 - auth failed, don't try again.
4714 */
4715
4716 static int drbd_do_auth(struct drbd_connection *connection)
4717 {
4718         struct drbd_socket *sock;
4719         char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
4720         struct scatterlist sg;
4721         char *response = NULL;
4722         char *right_response = NULL;
4723         char *peers_ch = NULL;
4724         unsigned int key_len;
4725         char secret[SHARED_SECRET_MAX]; /* 64 byte */
4726         unsigned int resp_size;
4727         struct hash_desc desc;
4728         struct packet_info pi;
4729         struct net_conf *nc;
4730         int err, rv;
4731
4732         /* FIXME: Put the challenge/response into the preallocated socket buffer.  */
4733
4734         rcu_read_lock();
4735         nc = rcu_dereference(connection->net_conf);
4736         key_len = strlen(nc->shared_secret);
4737         memcpy(secret, nc->shared_secret, key_len);
4738         rcu_read_unlock();
4739
4740         desc.tfm = connection->cram_hmac_tfm;
4741         desc.flags = 0;
4742
4743         rv = crypto_hash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
4744         if (rv) {
4745                 drbd_err(connection, "crypto_hash_setkey() failed with %d\n", rv);
4746                 rv = -1;
4747                 goto fail;
4748         }
4749
4750         get_random_bytes(my_challenge, CHALLENGE_LEN);
4751
4752         sock = &connection->data;
4753         if (!conn_prepare_command(connection, sock)) {
4754                 rv = 0;
4755                 goto fail;
4756         }
4757         rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0,
4758                                 my_challenge, CHALLENGE_LEN);
4759         if (!rv)
4760                 goto fail;
4761
4762         err = drbd_recv_header(connection, &pi);
4763         if (err) {
4764                 rv = 0;
4765                 goto fail;
4766         }
4767
4768         if (pi.cmd != P_AUTH_CHALLENGE) {
4769                 drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
4770                          cmdname(pi.cmd), pi.cmd);
4771                 rv = 0;
4772                 goto fail;
4773         }
4774
4775         if (pi.size > CHALLENGE_LEN * 2) {
4776                 drbd_err(connection, "expected AuthChallenge payload too big.\n");
4777                 rv = -1;
4778                 goto fail;
4779         }
4780
4781         peers_ch = kmalloc(pi.size, GFP_NOIO);
4782         if (peers_ch == NULL) {
4783                 drbd_err(connection, "kmalloc of peers_ch failed\n");
4784                 rv = -1;
4785                 goto fail;
4786         }
4787
4788         err = drbd_recv_all_warn(connection, peers_ch, pi.size);
4789         if (err) {
4790                 rv = 0;
4791                 goto fail;
4792         }
4793
4794         resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm);
4795         response = kmalloc(resp_size, GFP_NOIO);
4796         if (response == NULL) {
4797                 drbd_err(connection, "kmalloc of response failed\n");
4798                 rv = -1;
4799                 goto fail;
4800         }
4801
4802         sg_init_table(&sg, 1);
4803         sg_set_buf(&sg, peers_ch, pi.size);
4804
4805         rv = crypto_hash_digest(&desc, &sg, sg.length, response);
4806         if (rv) {
4807                 drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
4808                 rv = -1;
4809                 goto fail;
4810         }
4811
4812         if (!conn_prepare_command(connection, sock)) {
4813                 rv = 0;
4814                 goto fail;
4815         }
4816         rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0,
4817                                 response, resp_size);
4818         if (!rv)
4819                 goto fail;
4820
4821         err = drbd_recv_header(connection, &pi);
4822         if (err) {
4823                 rv = 0;
4824                 goto fail;
4825         }
4826
4827         if (pi.cmd != P_AUTH_RESPONSE) {
4828                 drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n",
4829                          cmdname(pi.cmd), pi.cmd);
4830                 rv = 0;
4831                 goto fail;
4832         }
4833
4834         if (pi.size != resp_size) {
4835                 drbd_err(connection, "expected AuthResponse payload of wrong size\n");
4836                 rv = 0;
4837                 goto fail;
4838         }
4839
4840         err = drbd_recv_all_warn(connection, response , resp_size);
4841         if (err) {
4842                 rv = 0;
4843                 goto fail;
4844         }
4845
4846         right_response = kmalloc(resp_size, GFP_NOIO);
4847         if (right_response == NULL) {
4848                 drbd_err(connection, "kmalloc of right_response failed\n");
4849                 rv = -1;
4850                 goto fail;
4851         }
4852
4853         sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
4854
4855         rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
4856         if (rv) {
4857                 drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
4858                 rv = -1;
4859                 goto fail;
4860         }
4861
4862         rv = !memcmp(response, right_response, resp_size);
4863
4864         if (rv)
4865                 drbd_info(connection, "Peer authenticated using %d bytes HMAC\n",
4866                      resp_size);
4867         else
4868                 rv = -1;
4869
4870  fail:
4871         kfree(peers_ch);
4872         kfree(response);
4873         kfree(right_response);
4874
4875         return rv;
4876 }
4877 #endif
4878
4879 int drbd_receiver(struct drbd_thread *thi)
4880 {
4881         struct drbd_connection *connection = thi->connection;
4882         int h;
4883
4884         drbd_info(connection, "receiver (re)started\n");
4885
4886         do {
4887                 h = conn_connect(connection);
4888                 if (h == 0) {
4889                         conn_disconnect(connection);
4890                         schedule_timeout_interruptible(HZ);
4891                 }
4892                 if (h == -1) {
4893                         drbd_warn(connection, "Discarding network configuration.\n");
4894                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
4895                 }
4896         } while (h == 0);
4897
4898         if (h > 0)
4899                 drbdd(connection);
4900
4901         conn_disconnect(connection);
4902
4903         drbd_info(connection, "receiver terminated\n");
4904         return 0;
4905 }
4906
4907 /* ********* acknowledge sender ******** */
4908
4909 static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
4910 {
4911         struct p_req_state_reply *p = pi->data;
4912         int retcode = be32_to_cpu(p->retcode);
4913
4914         if (retcode >= SS_SUCCESS) {
4915                 set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags);
4916         } else {
4917                 set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags);
4918                 drbd_err(connection, "Requested state change failed by peer: %s (%d)\n",
4919                          drbd_set_st_err_str(retcode), retcode);
4920         }
4921         wake_up(&connection->ping_wait);
4922
4923         return 0;
4924 }
4925
4926 static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
4927 {
4928         struct drbd_peer_device *peer_device;
4929         struct drbd_device *device;
4930         struct p_req_state_reply *p = pi->data;
4931         int retcode = be32_to_cpu(p->retcode);
4932
4933         peer_device = conn_peer_device(connection, pi->vnr);
4934         if (!peer_device)
4935                 return -EIO;
4936         device = peer_device->device;
4937
4938         if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) {
4939                 D_ASSERT(device, connection->agreed_pro_version < 100);
4940                 return got_conn_RqSReply(connection, pi);
4941         }
4942
4943         if (retcode >= SS_SUCCESS) {
4944                 set_bit(CL_ST_CHG_SUCCESS, &device->flags);
4945         } else {
4946                 set_bit(CL_ST_CHG_FAIL, &device->flags);
4947                 drbd_err(device, "Requested state change failed by peer: %s (%d)\n",
4948                         drbd_set_st_err_str(retcode), retcode);
4949         }
4950         wake_up(&device->state_wait);
4951
4952         return 0;
4953 }
4954
4955 static int got_Ping(struct drbd_connection *connection, struct packet_info *pi)
4956 {
4957         return drbd_send_ping_ack(connection);
4958
4959 }
4960
4961 static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi)
4962 {
4963         /* restore idle timeout */
4964         connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ;
4965         if (!test_and_set_bit(GOT_PING_ACK, &connection->flags))
4966                 wake_up(&connection->ping_wait);
4967
4968         return 0;
4969 }
4970
4971 static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi)
4972 {
4973         struct drbd_peer_device *peer_device;
4974         struct drbd_device *device;
4975         struct p_block_ack *p = pi->data;
4976         sector_t sector = be64_to_cpu(p->sector);
4977         int blksize = be32_to_cpu(p->blksize);
4978
4979         peer_device = conn_peer_device(connection, pi->vnr);
4980         if (!peer_device)
4981                 return -EIO;
4982         device = peer_device->device;
4983
4984         D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
4985
4986         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
4987
4988         if (get_ldev(device)) {
4989                 drbd_rs_complete_io(device, sector);
4990                 drbd_set_in_sync(device, sector, blksize);
4991                 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
4992                 device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
4993                 put_ldev(device);
4994         }
4995         dec_rs_pending(device);
4996         atomic_add(blksize >> 9, &device->rs_sect_in);
4997
4998         return 0;
4999 }
5000
5001 static int
5002 validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector,
5003                               struct rb_root *root, const char *func,
5004                               enum drbd_req_event what, bool missing_ok)
5005 {
5006         struct drbd_request *req;
5007         struct bio_and_error m;
5008
5009         spin_lock_irq(&device->resource->req_lock);
5010         req = find_request(device, root, id, sector, missing_ok, func);
5011         if (unlikely(!req)) {
5012                 spin_unlock_irq(&device->resource->req_lock);
5013                 return -EIO;
5014         }
5015         __req_mod(req, what, &m);
5016         spin_unlock_irq(&device->resource->req_lock);
5017
5018         if (m.bio)
5019                 complete_master_bio(device, &m);
5020         return 0;
5021 }
5022
5023 static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi)
5024 {
5025         struct drbd_peer_device *peer_device;
5026         struct drbd_device *device;
5027         struct p_block_ack *p = pi->data;
5028         sector_t sector = be64_to_cpu(p->sector);
5029         int blksize = be32_to_cpu(p->blksize);
5030         enum drbd_req_event what;
5031
5032         peer_device = conn_peer_device(connection, pi->vnr);
5033         if (!peer_device)
5034                 return -EIO;
5035         device = peer_device->device;
5036
5037         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5038
5039         if (p->block_id == ID_SYNCER) {
5040                 drbd_set_in_sync(device, sector, blksize);
5041                 dec_rs_pending(device);
5042                 return 0;
5043         }
5044         switch (pi->cmd) {
5045         case P_RS_WRITE_ACK:
5046                 what = WRITE_ACKED_BY_PEER_AND_SIS;
5047                 break;
5048         case P_WRITE_ACK:
5049                 what = WRITE_ACKED_BY_PEER;
5050                 break;
5051         case P_RECV_ACK:
5052                 what = RECV_ACKED_BY_PEER;
5053                 break;
5054         case P_SUPERSEDED:
5055                 what = CONFLICT_RESOLVED;
5056                 break;
5057         case P_RETRY_WRITE:
5058                 what = POSTPONE_WRITE;
5059                 break;
5060         default:
5061                 BUG();
5062         }
5063
5064         return validate_req_change_req_state(device, p->block_id, sector,
5065                                              &device->write_requests, __func__,
5066                                              what, false);
5067 }
5068
5069 static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
5070 {
5071         struct drbd_peer_device *peer_device;
5072         struct drbd_device *device;
5073         struct p_block_ack *p = pi->data;
5074         sector_t sector = be64_to_cpu(p->sector);
5075         int size = be32_to_cpu(p->blksize);
5076         int err;
5077
5078         peer_device = conn_peer_device(connection, pi->vnr);
5079         if (!peer_device)
5080                 return -EIO;
5081         device = peer_device->device;
5082
5083         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5084
5085         if (p->block_id == ID_SYNCER) {
5086                 dec_rs_pending(device);
5087                 drbd_rs_failed_io(device, sector, size);
5088                 return 0;
5089         }
5090
5091         err = validate_req_change_req_state(device, p->block_id, sector,
5092                                             &device->write_requests, __func__,
5093                                             NEG_ACKED, true);
5094         if (err) {
5095                 /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
5096                    The master bio might already be completed, therefore the
5097                    request is no longer in the collision hash. */
5098                 /* In Protocol B we might already have got a P_RECV_ACK
5099                    but then get a P_NEG_ACK afterwards. */
5100                 drbd_set_out_of_sync(device, sector, size);
5101         }
5102         return 0;
5103 }
5104
5105 static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
5106 {
5107         struct drbd_peer_device *peer_device;
5108         struct drbd_device *device;
5109         struct p_block_ack *p = pi->data;
5110         sector_t sector = be64_to_cpu(p->sector);
5111
5112         peer_device = conn_peer_device(connection, pi->vnr);
5113         if (!peer_device)
5114                 return -EIO;
5115         device = peer_device->device;
5116
5117         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5118
5119         drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
5120             (unsigned long long)sector, be32_to_cpu(p->blksize));
5121
5122         return validate_req_change_req_state(device, p->block_id, sector,
5123                                              &device->read_requests, __func__,
5124                                              NEG_ACKED, false);
5125 }
5126
5127 static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
5128 {
5129         struct drbd_peer_device *peer_device;
5130         struct drbd_device *device;
5131         sector_t sector;
5132         int size;
5133         struct p_block_ack *p = pi->data;
5134
5135         peer_device = conn_peer_device(connection, pi->vnr);
5136         if (!peer_device)
5137                 return -EIO;
5138         device = peer_device->device;
5139
5140         sector = be64_to_cpu(p->sector);
5141         size = be32_to_cpu(p->blksize);
5142
5143         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5144
5145         dec_rs_pending(device);
5146
5147         if (get_ldev_if_state(device, D_FAILED)) {
5148                 drbd_rs_complete_io(device, sector);
5149                 switch (pi->cmd) {
5150                 case P_NEG_RS_DREPLY:
5151                         drbd_rs_failed_io(device, sector, size);
5152                 case P_RS_CANCEL:
5153                         break;
5154                 default:
5155                         BUG();
5156                 }
5157                 put_ldev(device);
5158         }
5159
5160         return 0;
5161 }
5162
5163 static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
5164 {
5165         struct p_barrier_ack *p = pi->data;
5166         struct drbd_peer_device *peer_device;
5167         int vnr;
5168
5169         tl_release(connection, p->barrier, be32_to_cpu(p->set_size));
5170
5171         rcu_read_lock();
5172         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5173                 struct drbd_device *device = peer_device->device;
5174
5175                 if (device->state.conn == C_AHEAD &&
5176                     atomic_read(&device->ap_in_flight) == 0 &&
5177                     !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) {
5178                         device->start_resync_timer.expires = jiffies + HZ;
5179                         add_timer(&device->start_resync_timer);
5180                 }
5181         }
5182         rcu_read_unlock();
5183
5184         return 0;
5185 }
5186
5187 static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
5188 {
5189         struct drbd_peer_device *peer_device;
5190         struct drbd_device *device;
5191         struct p_block_ack *p = pi->data;
5192         struct drbd_device_work *dw;
5193         sector_t sector;
5194         int size;
5195
5196         peer_device = conn_peer_device(connection, pi->vnr);
5197         if (!peer_device)
5198                 return -EIO;
5199         device = peer_device->device;
5200
5201         sector = be64_to_cpu(p->sector);
5202         size = be32_to_cpu(p->blksize);
5203
5204         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5205
5206         if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
5207                 drbd_ov_out_of_sync_found(device, sector, size);
5208         else
5209                 ov_out_of_sync_print(device);
5210
5211         if (!get_ldev(device))
5212                 return 0;
5213
5214         drbd_rs_complete_io(device, sector);
5215         dec_rs_pending(device);
5216
5217         --device->ov_left;
5218
5219         /* let's advance progress step marks only for every other megabyte */
5220         if ((device->ov_left & 0x200) == 0x200)
5221                 drbd_advance_rs_marks(device, device->ov_left);
5222
5223         if (device->ov_left == 0) {
5224                 dw = kmalloc(sizeof(*dw), GFP_NOIO);
5225                 if (dw) {
5226                         dw->w.cb = w_ov_finished;
5227                         dw->device = device;
5228                         drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
5229                 } else {
5230                         drbd_err(device, "kmalloc(dw) failed.");
5231                         ov_out_of_sync_print(device);
5232                         drbd_resync_finished(device);
5233                 }
5234         }
5235         put_ldev(device);
5236         return 0;
5237 }
5238
5239 static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
5240 {
5241         return 0;
5242 }
5243
5244 static int connection_finish_peer_reqs(struct drbd_connection *connection)
5245 {
5246         struct drbd_peer_device *peer_device;
5247         int vnr, not_empty = 0;
5248
5249         do {
5250                 clear_bit(SIGNAL_ASENDER, &connection->flags);
5251                 flush_signals(current);
5252
5253                 rcu_read_lock();
5254                 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5255                         struct drbd_device *device = peer_device->device;
5256                         kref_get(&device->kref);
5257                         rcu_read_unlock();
5258                         if (drbd_finish_peer_reqs(device)) {
5259                                 kref_put(&device->kref, drbd_destroy_device);
5260                                 return 1;
5261                         }
5262                         kref_put(&device->kref, drbd_destroy_device);
5263                         rcu_read_lock();
5264                 }
5265                 set_bit(SIGNAL_ASENDER, &connection->flags);
5266
5267                 spin_lock_irq(&connection->resource->req_lock);
5268                 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5269                         struct drbd_device *device = peer_device->device;
5270                         not_empty = !list_empty(&device->done_ee);
5271                         if (not_empty)
5272                                 break;
5273                 }
5274                 spin_unlock_irq(&connection->resource->req_lock);
5275                 rcu_read_unlock();
5276         } while (not_empty);
5277
5278         return 0;
5279 }
5280
5281 struct asender_cmd {
5282         size_t pkt_size;
5283         int (*fn)(struct drbd_connection *connection, struct packet_info *);
5284 };
5285
5286 static struct asender_cmd asender_tbl[] = {
5287         [P_PING]            = { 0, got_Ping },
5288         [P_PING_ACK]        = { 0, got_PingAck },
5289         [P_RECV_ACK]        = { sizeof(struct p_block_ack), got_BlockAck },
5290         [P_WRITE_ACK]       = { sizeof(struct p_block_ack), got_BlockAck },
5291         [P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
5292         [P_SUPERSEDED]   = { sizeof(struct p_block_ack), got_BlockAck },
5293         [P_NEG_ACK]         = { sizeof(struct p_block_ack), got_NegAck },
5294         [P_NEG_DREPLY]      = { sizeof(struct p_block_ack), got_NegDReply },
5295         [P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply },
5296         [P_OV_RESULT]       = { sizeof(struct p_block_ack), got_OVResult },
5297         [P_BARRIER_ACK]     = { sizeof(struct p_barrier_ack), got_BarrierAck },
5298         [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
5299         [P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
5300         [P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
5301         [P_RS_CANCEL]       = { sizeof(struct p_block_ack), got_NegRSDReply },
5302         [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
5303         [P_RETRY_WRITE]     = { sizeof(struct p_block_ack), got_BlockAck },
5304 };
5305
5306 int drbd_asender(struct drbd_thread *thi)
5307 {
5308         struct drbd_connection *connection = thi->connection;
5309         struct asender_cmd *cmd = NULL;
5310         struct packet_info pi;
5311         int rv;
5312         void *buf    = connection->meta.rbuf;
5313         int received = 0;
5314         unsigned int header_size = drbd_header_size(connection);
5315         int expect   = header_size;
5316         bool ping_timeout_active = false;
5317         struct net_conf *nc;
5318         int ping_timeo, tcp_cork, ping_int;
5319         struct sched_param param = { .sched_priority = 2 };
5320
5321         rv = sched_setscheduler(current, SCHED_RR, &param);
5322         if (rv < 0)
5323                 drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv);
5324
5325         while (get_t_state(thi) == RUNNING) {
5326                 drbd_thread_current_set_cpu(thi);
5327
5328                 rcu_read_lock();
5329                 nc = rcu_dereference(connection->net_conf);
5330                 ping_timeo = nc->ping_timeo;
5331                 tcp_cork = nc->tcp_cork;
5332                 ping_int = nc->ping_int;
5333                 rcu_read_unlock();
5334
5335                 if (test_and_clear_bit(SEND_PING, &connection->flags)) {
5336                         if (drbd_send_ping(connection)) {
5337                                 drbd_err(connection, "drbd_send_ping has failed\n");
5338                                 goto reconnect;
5339                         }
5340                         connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10;
5341                         ping_timeout_active = true;
5342                 }
5343
5344                 /* TODO: conditionally cork; it may hurt latency if we cork without
5345                    much to send */
5346                 if (tcp_cork)
5347                         drbd_tcp_cork(connection->meta.socket);
5348                 if (connection_finish_peer_reqs(connection)) {
5349                         drbd_err(connection, "connection_finish_peer_reqs() failed\n");
5350                         goto reconnect;
5351                 }
5352                 /* but unconditionally uncork unless disabled */
5353                 if (tcp_cork)
5354                         drbd_tcp_uncork(connection->meta.socket);
5355
5356                 /* short circuit, recv_msg would return EINTR anyways. */
5357                 if (signal_pending(current))
5358                         continue;
5359
5360                 rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
5361                 clear_bit(SIGNAL_ASENDER, &connection->flags);
5362
5363                 flush_signals(current);
5364
5365                 /* Note:
5366                  * -EINTR        (on meta) we got a signal
5367                  * -EAGAIN       (on meta) rcvtimeo expired
5368                  * -ECONNRESET   other side closed the connection
5369                  * -ERESTARTSYS  (on data) we got a signal
5370                  * rv <  0       other than above: unexpected error!
5371                  * rv == expected: full header or command
5372                  * rv <  expected: "woken" by signal during receive
5373                  * rv == 0       : "connection shut down by peer"
5374                  */
5375                 if (likely(rv > 0)) {
5376                         received += rv;
5377                         buf      += rv;
5378                 } else if (rv == 0) {
5379                         if (test_bit(DISCONNECT_SENT, &connection->flags)) {
5380                                 long t;
5381                                 rcu_read_lock();
5382                                 t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
5383                                 rcu_read_unlock();
5384
5385                                 t = wait_event_timeout(connection->ping_wait,
5386                                                        connection->cstate < C_WF_REPORT_PARAMS,
5387                                                        t);
5388                                 if (t)
5389                                         break;
5390                         }
5391                         drbd_err(connection, "meta connection shut down by peer.\n");
5392                         goto reconnect;
5393                 } else if (rv == -EAGAIN) {
5394                         /* If the data socket received something meanwhile,
5395                          * that is good enough: peer is still alive. */
5396                         if (time_after(connection->last_received,
5397                                 jiffies - connection->meta.socket->sk->sk_rcvtimeo))
5398                                 continue;
5399                         if (ping_timeout_active) {
5400                                 drbd_err(connection, "PingAck did not arrive in time.\n");
5401                                 goto reconnect;
5402                         }
5403                         set_bit(SEND_PING, &connection->flags);
5404                         continue;
5405                 } else if (rv == -EINTR) {
5406                         continue;
5407                 } else {
5408                         drbd_err(connection, "sock_recvmsg returned %d\n", rv);
5409                         goto reconnect;
5410                 }
5411
5412                 if (received == expect && cmd == NULL) {
5413                         if (decode_header(connection, connection->meta.rbuf, &pi))
5414                                 goto reconnect;
5415                         cmd = &asender_tbl[pi.cmd];
5416                         if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) {
5417                                 drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
5418                                          cmdname(pi.cmd), pi.cmd);
5419                                 goto disconnect;
5420                         }
5421                         expect = header_size + cmd->pkt_size;
5422                         if (pi.size != expect - header_size) {
5423                                 drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
5424                                         pi.cmd, pi.size);
5425                                 goto reconnect;
5426                         }
5427                 }
5428                 if (received == expect) {
5429                         bool err;
5430
5431                         err = cmd->fn(connection, &pi);
5432                         if (err) {
5433                                 drbd_err(connection, "%pf failed\n", cmd->fn);
5434                                 goto reconnect;
5435                         }
5436
5437                         connection->last_received = jiffies;
5438
5439                         if (cmd == &asender_tbl[P_PING_ACK]) {
5440                                 /* restore idle timeout */
5441                                 connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
5442                                 ping_timeout_active = false;
5443                         }
5444
5445                         buf      = connection->meta.rbuf;
5446                         received = 0;
5447                         expect   = header_size;
5448                         cmd      = NULL;
5449                 }
5450         }
5451
5452         if (0) {
5453 reconnect:
5454                 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
5455                 conn_md_sync(connection);
5456         }
5457         if (0) {
5458 disconnect:
5459                 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
5460         }
5461         clear_bit(SIGNAL_ASENDER, &connection->flags);
5462
5463         drbd_info(connection, "asender terminated\n");
5464
5465         return 0;
5466 }