ARM64: DTS: Add rk3399-firefly uart4 device, node as /dev/ttyS1
[firefly-linux-kernel-4.4.55.git] / fs / ceph / mds_client.c
1 #include <linux/ceph/ceph_debug.h>
2
3 #include <linux/fs.h>
4 #include <linux/wait.h>
5 #include <linux/slab.h>
6 #include <linux/gfp.h>
7 #include <linux/sched.h>
8 #include <linux/debugfs.h>
9 #include <linux/seq_file.h>
10 #include <linux/utsname.h>
11 #include <linux/ratelimit.h>
12
13 #include "super.h"
14 #include "mds_client.h"
15
16 #include <linux/ceph/ceph_features.h>
17 #include <linux/ceph/messenger.h>
18 #include <linux/ceph/decode.h>
19 #include <linux/ceph/pagelist.h>
20 #include <linux/ceph/auth.h>
21 #include <linux/ceph/debugfs.h>
22
23 /*
24  * A cluster of MDS (metadata server) daemons is responsible for
25  * managing the file system namespace (the directory hierarchy and
26  * inodes) and for coordinating shared access to storage.  Metadata is
27  * partitioned hierarchically across a number of servers, and that
28  * partition varies over time as the cluster adjusts the distribution
29  * in order to balance load.
30  *
31  * The MDS client is primarily responsible for managing synchronous
32  * metadata requests for operations like open, unlink, and so forth.
33  * If there is a MDS failure, we find out about it when we (possibly
34  * request and) receive a new MDS map, and can resubmit affected
35  * requests.
36  *
37  * For the most part, though, we take advantage of a lossless
38  * communications channel to the MDS, and do not need to worry about
39  * timing out or resubmitting requests.
40  *
41  * We maintain a stateful "session" with each MDS we interact with.
42  * Within each session, we send periodic heartbeat messages to ensure
43  * any capabilities or leases we have been issued remain valid.  If
44  * the session times out and goes stale, our leases and capabilities
45  * are no longer valid.
46  */
47
/*
 * Bookkeeping bundle for a reconnect.  Nothing in this part of the file
 * reads or writes it, so the field notes below are hedged.
 */
struct ceph_reconnect_state {
	int nr_caps;			/* presumably a running cap count -- TODO confirm */
	struct ceph_pagelist *pagelist;	/* pagelist the state refers to (not owned here, as far as visible) */
	bool flock;			/* NOTE(review): looks like a file-lock toggle -- confirm */
};
53
54 static void __wake_requests(struct ceph_mds_client *mdsc,
55                             struct list_head *head);
56
57 static const struct ceph_connection_operations mds_con_ops;
58
59
60 /*
61  * mds reply parsing
62  */
63
64 /*
65  * parse individual inode info
66  */
67 static int parse_reply_info_in(void **p, void *end,
68                                struct ceph_mds_reply_info_in *info,
69                                u64 features)
70 {
71         int err = -EIO;
72
73         info->in = *p;
74         *p += sizeof(struct ceph_mds_reply_inode) +
75                 sizeof(*info->in->fragtree.splits) *
76                 le32_to_cpu(info->in->fragtree.nsplits);
77
78         ceph_decode_32_safe(p, end, info->symlink_len, bad);
79         ceph_decode_need(p, end, info->symlink_len, bad);
80         info->symlink = *p;
81         *p += info->symlink_len;
82
83         if (features & CEPH_FEATURE_DIRLAYOUTHASH)
84                 ceph_decode_copy_safe(p, end, &info->dir_layout,
85                                       sizeof(info->dir_layout), bad);
86         else
87                 memset(&info->dir_layout, 0, sizeof(info->dir_layout));
88
89         ceph_decode_32_safe(p, end, info->xattr_len, bad);
90         ceph_decode_need(p, end, info->xattr_len, bad);
91         info->xattr_data = *p;
92         *p += info->xattr_len;
93
94         if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
95                 ceph_decode_64_safe(p, end, info->inline_version, bad);
96                 ceph_decode_32_safe(p, end, info->inline_len, bad);
97                 ceph_decode_need(p, end, info->inline_len, bad);
98                 info->inline_data = *p;
99                 *p += info->inline_len;
100         } else
101                 info->inline_version = CEPH_INLINE_NONE;
102
103         return 0;
104 bad:
105         return err;
106 }
107
108 /*
109  * parse a normal reply, which may contain a (dir+)dentry and/or a
110  * target inode.
111  */
112 static int parse_reply_info_trace(void **p, void *end,
113                                   struct ceph_mds_reply_info_parsed *info,
114                                   u64 features)
115 {
116         int err;
117
118         if (info->head->is_dentry) {
119                 err = parse_reply_info_in(p, end, &info->diri, features);
120                 if (err < 0)
121                         goto out_bad;
122
123                 if (unlikely(*p + sizeof(*info->dirfrag) > end))
124                         goto bad;
125                 info->dirfrag = *p;
126                 *p += sizeof(*info->dirfrag) +
127                         sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
128                 if (unlikely(*p > end))
129                         goto bad;
130
131                 ceph_decode_32_safe(p, end, info->dname_len, bad);
132                 ceph_decode_need(p, end, info->dname_len, bad);
133                 info->dname = *p;
134                 *p += info->dname_len;
135                 info->dlease = *p;
136                 *p += sizeof(*info->dlease);
137         }
138
139         if (info->head->is_target) {
140                 err = parse_reply_info_in(p, end, &info->targeti, features);
141                 if (err < 0)
142                         goto out_bad;
143         }
144
145         if (unlikely(*p != end))
146                 goto bad;
147         return 0;
148
149 bad:
150         err = -EIO;
151 out_bad:
152         pr_err("problem parsing mds trace %d\n", err);
153         return err;
154 }
155
156 /*
157  * parse readdir results
158  */
159 static int parse_reply_info_dir(void **p, void *end,
160                                 struct ceph_mds_reply_info_parsed *info,
161                                 u64 features)
162 {
163         u32 num, i = 0;
164         int err;
165
166         info->dir_dir = *p;
167         if (*p + sizeof(*info->dir_dir) > end)
168                 goto bad;
169         *p += sizeof(*info->dir_dir) +
170                 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
171         if (*p > end)
172                 goto bad;
173
174         ceph_decode_need(p, end, sizeof(num) + 2, bad);
175         num = ceph_decode_32(p);
176         info->dir_end = ceph_decode_8(p);
177         info->dir_complete = ceph_decode_8(p);
178         if (num == 0)
179                 goto done;
180
181         BUG_ON(!info->dir_in);
182         info->dir_dname = (void *)(info->dir_in + num);
183         info->dir_dname_len = (void *)(info->dir_dname + num);
184         info->dir_dlease = (void *)(info->dir_dname_len + num);
185         if ((unsigned long)(info->dir_dlease + num) >
186             (unsigned long)info->dir_in + info->dir_buf_size) {
187                 pr_err("dir contents are larger than expected\n");
188                 WARN_ON(1);
189                 goto bad;
190         }
191
192         info->dir_nr = num;
193         while (num) {
194                 /* dentry */
195                 ceph_decode_need(p, end, sizeof(u32)*2, bad);
196                 info->dir_dname_len[i] = ceph_decode_32(p);
197                 ceph_decode_need(p, end, info->dir_dname_len[i], bad);
198                 info->dir_dname[i] = *p;
199                 *p += info->dir_dname_len[i];
200                 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
201                      info->dir_dname[i]);
202                 info->dir_dlease[i] = *p;
203                 *p += sizeof(struct ceph_mds_reply_lease);
204
205                 /* inode */
206                 err = parse_reply_info_in(p, end, &info->dir_in[i], features);
207                 if (err < 0)
208                         goto out_bad;
209                 i++;
210                 num--;
211         }
212
213 done:
214         if (*p != end)
215                 goto bad;
216         return 0;
217
218 bad:
219         err = -EIO;
220 out_bad:
221         pr_err("problem parsing dir contents %d\n", err);
222         return err;
223 }
224
225 /*
226  * parse fcntl F_GETLK results
227  */
228 static int parse_reply_info_filelock(void **p, void *end,
229                                      struct ceph_mds_reply_info_parsed *info,
230                                      u64 features)
231 {
232         if (*p + sizeof(*info->filelock_reply) > end)
233                 goto bad;
234
235         info->filelock_reply = *p;
236         *p += sizeof(*info->filelock_reply);
237
238         if (unlikely(*p != end))
239                 goto bad;
240         return 0;
241
242 bad:
243         return -EIO;
244 }
245
246 /*
247  * parse create results
248  */
249 static int parse_reply_info_create(void **p, void *end,
250                                   struct ceph_mds_reply_info_parsed *info,
251                                   u64 features)
252 {
253         if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
254                 if (*p == end) {
255                         info->has_create_ino = false;
256                 } else {
257                         info->has_create_ino = true;
258                         info->ino = ceph_decode_64(p);
259                 }
260         }
261
262         if (unlikely(*p != end))
263                 goto bad;
264         return 0;
265
266 bad:
267         return -EIO;
268 }
269
270 /*
271  * parse extra results
272  */
273 static int parse_reply_info_extra(void **p, void *end,
274                                   struct ceph_mds_reply_info_parsed *info,
275                                   u64 features)
276 {
277         u32 op = le32_to_cpu(info->head->op);
278
279         if (op == CEPH_MDS_OP_GETFILELOCK)
280                 return parse_reply_info_filelock(p, end, info, features);
281         else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
282                 return parse_reply_info_dir(p, end, info, features);
283         else if (op == CEPH_MDS_OP_CREATE)
284                 return parse_reply_info_create(p, end, info, features);
285         else
286                 return -EIO;
287 }
288
289 /*
290  * parse entire mds reply
291  */
292 static int parse_reply_info(struct ceph_msg *msg,
293                             struct ceph_mds_reply_info_parsed *info,
294                             u64 features)
295 {
296         void *p, *end;
297         u32 len;
298         int err;
299
300         info->head = msg->front.iov_base;
301         p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
302         end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
303
304         /* trace */
305         ceph_decode_32_safe(&p, end, len, bad);
306         if (len > 0) {
307                 ceph_decode_need(&p, end, len, bad);
308                 err = parse_reply_info_trace(&p, p+len, info, features);
309                 if (err < 0)
310                         goto out_bad;
311         }
312
313         /* extra */
314         ceph_decode_32_safe(&p, end, len, bad);
315         if (len > 0) {
316                 ceph_decode_need(&p, end, len, bad);
317                 err = parse_reply_info_extra(&p, p+len, info, features);
318                 if (err < 0)
319                         goto out_bad;
320         }
321
322         /* snap blob */
323         ceph_decode_32_safe(&p, end, len, bad);
324         info->snapblob_len = len;
325         info->snapblob = p;
326         p += len;
327
328         if (p != end)
329                 goto bad;
330         return 0;
331
332 bad:
333         err = -EIO;
334 out_bad:
335         pr_err("mds parse_reply err %d\n", err);
336         return err;
337 }
338
339 static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
340 {
341         if (!info->dir_in)
342                 return;
343         free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size));
344 }
345
346
347 /*
348  * sessions
349  */
350 const char *ceph_session_state_name(int s)
351 {
352         switch (s) {
353         case CEPH_MDS_SESSION_NEW: return "new";
354         case CEPH_MDS_SESSION_OPENING: return "opening";
355         case CEPH_MDS_SESSION_OPEN: return "open";
356         case CEPH_MDS_SESSION_HUNG: return "hung";
357         case CEPH_MDS_SESSION_CLOSING: return "closing";
358         case CEPH_MDS_SESSION_RESTARTING: return "restarting";
359         case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
360         default: return "???";
361         }
362 }
363
364 static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
365 {
366         if (atomic_inc_not_zero(&s->s_ref)) {
367                 dout("mdsc get_session %p %d -> %d\n", s,
368                      atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
369                 return s;
370         } else {
371                 dout("mdsc get_session %p 0 -- FAIL", s);
372                 return NULL;
373         }
374 }
375
376 void ceph_put_mds_session(struct ceph_mds_session *s)
377 {
378         dout("mdsc put_session %p %d -> %d\n", s,
379              atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
380         if (atomic_dec_and_test(&s->s_ref)) {
381                 if (s->s_auth.authorizer)
382                         ceph_auth_destroy_authorizer(
383                                 s->s_mdsc->fsc->client->monc.auth,
384                                 s->s_auth.authorizer);
385                 kfree(s);
386         }
387 }
388
389 /*
390  * called under mdsc->mutex
391  */
392 struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
393                                                    int mds)
394 {
395         struct ceph_mds_session *session;
396
397         if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
398                 return NULL;
399         session = mdsc->sessions[mds];
400         dout("lookup_mds_session %p %d\n", session,
401              atomic_read(&session->s_ref));
402         get_session(session);
403         return session;
404 }
405
406 static bool __have_session(struct ceph_mds_client *mdsc, int mds)
407 {
408         if (mds >= mdsc->max_sessions)
409                 return false;
410         return mdsc->sessions[mds];
411 }
412
413 static int __verify_registered_session(struct ceph_mds_client *mdsc,
414                                        struct ceph_mds_session *s)
415 {
416         if (s->s_mds >= mdsc->max_sessions ||
417             mdsc->sessions[s->s_mds] != s)
418                 return -ENOENT;
419         return 0;
420 }
421
422 /*
423  * create+register a new session for given mds.
424  * called under mdsc->mutex.
425  */
426 static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
427                                                  int mds)
428 {
429         struct ceph_mds_session *s;
430
431         if (mds >= mdsc->mdsmap->m_max_mds)
432                 return ERR_PTR(-EINVAL);
433
434         s = kzalloc(sizeof(*s), GFP_NOFS);
435         if (!s)
436                 return ERR_PTR(-ENOMEM);
437         s->s_mdsc = mdsc;
438         s->s_mds = mds;
439         s->s_state = CEPH_MDS_SESSION_NEW;
440         s->s_ttl = 0;
441         s->s_seq = 0;
442         mutex_init(&s->s_mutex);
443
444         ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
445
446         spin_lock_init(&s->s_gen_ttl_lock);
447         s->s_cap_gen = 0;
448         s->s_cap_ttl = jiffies - 1;
449
450         spin_lock_init(&s->s_cap_lock);
451         s->s_renew_requested = 0;
452         s->s_renew_seq = 0;
453         INIT_LIST_HEAD(&s->s_caps);
454         s->s_nr_caps = 0;
455         s->s_trim_caps = 0;
456         atomic_set(&s->s_ref, 1);
457         INIT_LIST_HEAD(&s->s_waiting);
458         INIT_LIST_HEAD(&s->s_unsafe);
459         s->s_num_cap_releases = 0;
460         s->s_cap_reconnect = 0;
461         s->s_cap_iterator = NULL;
462         INIT_LIST_HEAD(&s->s_cap_releases);
463         INIT_LIST_HEAD(&s->s_cap_flushing);
464         INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
465
466         dout("register_session mds%d\n", mds);
467         if (mds >= mdsc->max_sessions) {
468                 int newmax = 1 << get_count_order(mds+1);
469                 struct ceph_mds_session **sa;
470
471                 dout("register_session realloc to %d\n", newmax);
472                 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
473                 if (sa == NULL)
474                         goto fail_realloc;
475                 if (mdsc->sessions) {
476                         memcpy(sa, mdsc->sessions,
477                                mdsc->max_sessions * sizeof(void *));
478                         kfree(mdsc->sessions);
479                 }
480                 mdsc->sessions = sa;
481                 mdsc->max_sessions = newmax;
482         }
483         mdsc->sessions[mds] = s;
484         atomic_inc(&mdsc->num_sessions);
485         atomic_inc(&s->s_ref);  /* one ref to sessions[], one to caller */
486
487         ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
488                       ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
489
490         return s;
491
492 fail_realloc:
493         kfree(s);
494         return ERR_PTR(-ENOMEM);
495 }
496
497 /*
498  * called under mdsc->mutex
499  */
500 static void __unregister_session(struct ceph_mds_client *mdsc,
501                                struct ceph_mds_session *s)
502 {
503         dout("__unregister_session mds%d %p\n", s->s_mds, s);
504         BUG_ON(mdsc->sessions[s->s_mds] != s);
505         mdsc->sessions[s->s_mds] = NULL;
506         ceph_con_close(&s->s_con);
507         ceph_put_mds_session(s);
508         atomic_dec(&mdsc->num_sessions);
509 }
510
511 /*
512  * drop session refs in request.
513  *
514  * should be last request ref, or hold mdsc->mutex
515  */
516 static void put_request_session(struct ceph_mds_request *req)
517 {
518         if (req->r_session) {
519                 ceph_put_mds_session(req->r_session);
520                 req->r_session = NULL;
521         }
522 }
523
524 void ceph_mdsc_release_request(struct kref *kref)
525 {
526         struct ceph_mds_request *req = container_of(kref,
527                                                     struct ceph_mds_request,
528                                                     r_kref);
529         destroy_reply_info(&req->r_reply_info);
530         if (req->r_request)
531                 ceph_msg_put(req->r_request);
532         if (req->r_reply)
533                 ceph_msg_put(req->r_reply);
534         if (req->r_inode) {
535                 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
536                 iput(req->r_inode);
537         }
538         if (req->r_locked_dir)
539                 ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
540         iput(req->r_target_inode);
541         if (req->r_dentry)
542                 dput(req->r_dentry);
543         if (req->r_old_dentry)
544                 dput(req->r_old_dentry);
545         if (req->r_old_dentry_dir) {
546                 /*
547                  * track (and drop pins for) r_old_dentry_dir
548                  * separately, since r_old_dentry's d_parent may have
549                  * changed between the dir mutex being dropped and
550                  * this request being freed.
551                  */
552                 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
553                                   CEPH_CAP_PIN);
554                 iput(req->r_old_dentry_dir);
555         }
556         kfree(req->r_path1);
557         kfree(req->r_path2);
558         if (req->r_pagelist)
559                 ceph_pagelist_release(req->r_pagelist);
560         put_request_session(req);
561         ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
562         kfree(req);
563 }
564
565 /*
566  * lookup request, bump ref if found.
567  *
568  * called under mdsc->mutex.
569  */
570 static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
571                                              u64 tid)
572 {
573         struct ceph_mds_request *req;
574         struct rb_node *n = mdsc->request_tree.rb_node;
575
576         while (n) {
577                 req = rb_entry(n, struct ceph_mds_request, r_node);
578                 if (tid < req->r_tid)
579                         n = n->rb_left;
580                 else if (tid > req->r_tid)
581                         n = n->rb_right;
582                 else {
583                         ceph_mdsc_get_request(req);
584                         return req;
585                 }
586         }
587         return NULL;
588 }
589
590 static void __insert_request(struct ceph_mds_client *mdsc,
591                              struct ceph_mds_request *new)
592 {
593         struct rb_node **p = &mdsc->request_tree.rb_node;
594         struct rb_node *parent = NULL;
595         struct ceph_mds_request *req = NULL;
596
597         while (*p) {
598                 parent = *p;
599                 req = rb_entry(parent, struct ceph_mds_request, r_node);
600                 if (new->r_tid < req->r_tid)
601                         p = &(*p)->rb_left;
602                 else if (new->r_tid > req->r_tid)
603                         p = &(*p)->rb_right;
604                 else
605                         BUG();
606         }
607
608         rb_link_node(&new->r_node, parent, p);
609         rb_insert_color(&new->r_node, &mdsc->request_tree);
610 }
611
612 /*
613  * Register an in-flight request, and assign a tid.  Link to the
614  * directory we are modifying (if any).
615  *
616  * Called under mdsc->mutex.
617  */
618 static void __register_request(struct ceph_mds_client *mdsc,
619                                struct ceph_mds_request *req,
620                                struct inode *dir)
621 {
622         req->r_tid = ++mdsc->last_tid;
623         if (req->r_num_caps)
624                 ceph_reserve_caps(mdsc, &req->r_caps_reservation,
625                                   req->r_num_caps);
626         dout("__register_request %p tid %lld\n", req, req->r_tid);
627         ceph_mdsc_get_request(req);
628         __insert_request(mdsc, req);
629
630         req->r_uid = current_fsuid();
631         req->r_gid = current_fsgid();
632
633         if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
634                 mdsc->oldest_tid = req->r_tid;
635
636         if (dir) {
637                 ihold(dir);
638                 req->r_unsafe_dir = dir;
639         }
640 }
641
642 static void __unregister_request(struct ceph_mds_client *mdsc,
643                                  struct ceph_mds_request *req)
644 {
645         dout("__unregister_request %p tid %lld\n", req, req->r_tid);
646
647         /* Never leave an unregistered request on an unsafe list! */
648         list_del_init(&req->r_unsafe_item);
649
650         if (req->r_tid == mdsc->oldest_tid) {
651                 struct rb_node *p = rb_next(&req->r_node);
652                 mdsc->oldest_tid = 0;
653                 while (p) {
654                         struct ceph_mds_request *next_req =
655                                 rb_entry(p, struct ceph_mds_request, r_node);
656                         if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {
657                                 mdsc->oldest_tid = next_req->r_tid;
658                                 break;
659                         }
660                         p = rb_next(p);
661                 }
662         }
663
664         rb_erase(&req->r_node, &mdsc->request_tree);
665         RB_CLEAR_NODE(&req->r_node);
666
667         if (req->r_unsafe_dir && req->r_got_unsafe) {
668                 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
669                 spin_lock(&ci->i_unsafe_lock);
670                 list_del_init(&req->r_unsafe_dir_item);
671                 spin_unlock(&ci->i_unsafe_lock);
672         }
673         if (req->r_target_inode && req->r_got_unsafe) {
674                 struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
675                 spin_lock(&ci->i_unsafe_lock);
676                 list_del_init(&req->r_unsafe_target_item);
677                 spin_unlock(&ci->i_unsafe_lock);
678         }
679
680         if (req->r_unsafe_dir) {
681                 iput(req->r_unsafe_dir);
682                 req->r_unsafe_dir = NULL;
683         }
684
685         complete_all(&req->r_safe_completion);
686
687         ceph_mdsc_put_request(req);
688 }
689
690 /*
691  * Choose mds to send request to next.  If there is a hint set in the
692  * request (e.g., due to a prior forward hint from the mds), use that.
693  * Otherwise, consult frag tree and/or caps to identify the
694  * appropriate mds.  If all else fails, choose randomly.
695  *
696  * Called under mdsc->mutex.
697  */
698 static struct dentry *get_nonsnap_parent(struct dentry *dentry)
699 {
700         /*
701          * we don't need to worry about protecting the d_parent access
702          * here because we never renaming inside the snapped namespace
703          * except to resplice to another snapdir, and either the old or new
704          * result is a valid result.
705          */
706         while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
707                 dentry = dentry->d_parent;
708         return dentry;
709 }
710
711 static int __choose_mds(struct ceph_mds_client *mdsc,
712                         struct ceph_mds_request *req)
713 {
714         struct inode *inode;
715         struct ceph_inode_info *ci;
716         struct ceph_cap *cap;
717         int mode = req->r_direct_mode;
718         int mds = -1;
719         u32 hash = req->r_direct_hash;
720         bool is_hash = req->r_direct_is_hash;
721
722         /*
723          * is there a specific mds we should try?  ignore hint if we have
724          * no session and the mds is not up (active or recovering).
725          */
726         if (req->r_resend_mds >= 0 &&
727             (__have_session(mdsc, req->r_resend_mds) ||
728              ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
729                 dout("choose_mds using resend_mds mds%d\n",
730                      req->r_resend_mds);
731                 return req->r_resend_mds;
732         }
733
734         if (mode == USE_RANDOM_MDS)
735                 goto random;
736
737         inode = NULL;
738         if (req->r_inode) {
739                 inode = req->r_inode;
740         } else if (req->r_dentry) {
741                 /* ignore race with rename; old or new d_parent is okay */
742                 struct dentry *parent = req->r_dentry->d_parent;
743                 struct inode *dir = d_inode(parent);
744
745                 if (dir->i_sb != mdsc->fsc->sb) {
746                         /* not this fs! */
747                         inode = d_inode(req->r_dentry);
748                 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
749                         /* direct snapped/virtual snapdir requests
750                          * based on parent dir inode */
751                         struct dentry *dn = get_nonsnap_parent(parent);
752                         inode = d_inode(dn);
753                         dout("__choose_mds using nonsnap parent %p\n", inode);
754                 } else {
755                         /* dentry target */
756                         inode = d_inode(req->r_dentry);
757                         if (!inode || mode == USE_AUTH_MDS) {
758                                 /* dir + name */
759                                 inode = dir;
760                                 hash = ceph_dentry_hash(dir, req->r_dentry);
761                                 is_hash = true;
762                         }
763                 }
764         }
765
766         dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
767              (int)hash, mode);
768         if (!inode)
769                 goto random;
770         ci = ceph_inode(inode);
771
772         if (is_hash && S_ISDIR(inode->i_mode)) {
773                 struct ceph_inode_frag frag;
774                 int found;
775
776                 ceph_choose_frag(ci, hash, &frag, &found);
777                 if (found) {
778                         if (mode == USE_ANY_MDS && frag.ndist > 0) {
779                                 u8 r;
780
781                                 /* choose a random replica */
782                                 get_random_bytes(&r, 1);
783                                 r %= frag.ndist;
784                                 mds = frag.dist[r];
785                                 dout("choose_mds %p %llx.%llx "
786                                      "frag %u mds%d (%d/%d)\n",
787                                      inode, ceph_vinop(inode),
788                                      frag.frag, mds,
789                                      (int)r, frag.ndist);
790                                 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
791                                     CEPH_MDS_STATE_ACTIVE)
792                                         return mds;
793                         }
794
795                         /* since this file/dir wasn't known to be
796                          * replicated, then we want to look for the
797                          * authoritative mds. */
798                         mode = USE_AUTH_MDS;
799                         if (frag.mds >= 0) {
800                                 /* choose auth mds */
801                                 mds = frag.mds;
802                                 dout("choose_mds %p %llx.%llx "
803                                      "frag %u mds%d (auth)\n",
804                                      inode, ceph_vinop(inode), frag.frag, mds);
805                                 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
806                                     CEPH_MDS_STATE_ACTIVE)
807                                         return mds;
808                         }
809                 }
810         }
811
812         spin_lock(&ci->i_ceph_lock);
813         cap = NULL;
814         if (mode == USE_AUTH_MDS)
815                 cap = ci->i_auth_cap;
816         if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
817                 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
818         if (!cap) {
819                 spin_unlock(&ci->i_ceph_lock);
820                 goto random;
821         }
822         mds = cap->session->s_mds;
823         dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
824              inode, ceph_vinop(inode), mds,
825              cap == ci->i_auth_cap ? "auth " : "", cap);
826         spin_unlock(&ci->i_ceph_lock);
827         return mds;
828
829 random:
830         mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
831         dout("choose_mds chose random mds%d\n", mds);
832         return mds;
833 }
834
835
836 /*
837  * session messages
838  */
839 static struct ceph_msg *create_session_msg(u32 op, u64 seq)
840 {
841         struct ceph_msg *msg;
842         struct ceph_mds_session_head *h;
843
844         msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
845                            false);
846         if (!msg) {
847                 pr_err("create_session_msg ENOMEM creating msg\n");
848                 return NULL;
849         }
850         h = msg->front.iov_base;
851         h->op = cpu_to_le32(op);
852         h->seq = cpu_to_le64(seq);
853
854         return msg;
855 }
856
857 /*
858  * session message, specialization for CEPH_SESSION_REQUEST_OPEN
859  * to include additional client metadata fields.
860  */
861 static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq)
862 {
863         struct ceph_msg *msg;
864         struct ceph_mds_session_head *h;
865         int i = -1;
866         int metadata_bytes = 0;
867         int metadata_key_count = 0;
868         struct ceph_options *opt = mdsc->fsc->client->options;
869         void *p;
870
871         const char* metadata[][2] = {
872                 {"hostname", utsname()->nodename},
873                 {"kernel_version", utsname()->release},
874                 {"entity_id", opt->name ? opt->name : ""},
875                 {NULL, NULL}
876         };
877
878         /* Calculate serialized length of metadata */
879         metadata_bytes = 4;  /* map length */
880         for (i = 0; metadata[i][0] != NULL; ++i) {
881                 metadata_bytes += 8 + strlen(metadata[i][0]) +
882                         strlen(metadata[i][1]);
883                 metadata_key_count++;
884         }
885
886         /* Allocate the message */
887         msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + metadata_bytes,
888                            GFP_NOFS, false);
889         if (!msg) {
890                 pr_err("create_session_msg ENOMEM creating msg\n");
891                 return NULL;
892         }
893         h = msg->front.iov_base;
894         h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
895         h->seq = cpu_to_le64(seq);
896
897         /*
898          * Serialize client metadata into waiting buffer space, using
899          * the format that userspace expects for map<string, string>
900          *
901          * ClientSession messages with metadata are v2
902          */
903         msg->hdr.version = cpu_to_le16(2);
904         msg->hdr.compat_version = cpu_to_le16(1);
905
906         /* The write pointer, following the session_head structure */
907         p = msg->front.iov_base + sizeof(*h);
908
909         /* Number of entries in the map */
910         ceph_encode_32(&p, metadata_key_count);
911
912         /* Two length-prefixed strings for each entry in the map */
913         for (i = 0; metadata[i][0] != NULL; ++i) {
914                 size_t const key_len = strlen(metadata[i][0]);
915                 size_t const val_len = strlen(metadata[i][1]);
916
917                 ceph_encode_32(&p, key_len);
918                 memcpy(p, metadata[i][0], key_len);
919                 p += key_len;
920                 ceph_encode_32(&p, val_len);
921                 memcpy(p, metadata[i][1], val_len);
922                 p += val_len;
923         }
924
925         return msg;
926 }
927
928 /*
929  * send session open request.
930  *
931  * called under mdsc->mutex
932  */
933 static int __open_session(struct ceph_mds_client *mdsc,
934                           struct ceph_mds_session *session)
935 {
936         struct ceph_msg *msg;
937         int mstate;
938         int mds = session->s_mds;
939
940         /* wait for mds to go active? */
941         mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
942         dout("open_session to mds%d (%s)\n", mds,
943              ceph_mds_state_name(mstate));
944         session->s_state = CEPH_MDS_SESSION_OPENING;
945         session->s_renew_requested = jiffies;
946
947         /* send connect message */
948         msg = create_session_open_msg(mdsc, session->s_seq);
949         if (!msg)
950                 return -ENOMEM;
951         ceph_con_send(&session->s_con, msg);
952         return 0;
953 }
954
955 /*
956  * open sessions for any export targets for the given mds
957  *
958  * called under mdsc->mutex
959  */
960 static struct ceph_mds_session *
961 __open_export_target_session(struct ceph_mds_client *mdsc, int target)
962 {
963         struct ceph_mds_session *session;
964
965         session = __ceph_lookup_mds_session(mdsc, target);
966         if (!session) {
967                 session = register_session(mdsc, target);
968                 if (IS_ERR(session))
969                         return session;
970         }
971         if (session->s_state == CEPH_MDS_SESSION_NEW ||
972             session->s_state == CEPH_MDS_SESSION_CLOSING)
973                 __open_session(mdsc, session);
974
975         return session;
976 }
977
978 struct ceph_mds_session *
979 ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
980 {
981         struct ceph_mds_session *session;
982
983         dout("open_export_target_session to mds%d\n", target);
984
985         mutex_lock(&mdsc->mutex);
986         session = __open_export_target_session(mdsc, target);
987         mutex_unlock(&mdsc->mutex);
988
989         return session;
990 }
991
992 static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
993                                           struct ceph_mds_session *session)
994 {
995         struct ceph_mds_info *mi;
996         struct ceph_mds_session *ts;
997         int i, mds = session->s_mds;
998
999         if (mds >= mdsc->mdsmap->m_max_mds)
1000                 return;
1001
1002         mi = &mdsc->mdsmap->m_info[mds];
1003         dout("open_export_target_sessions for mds%d (%d targets)\n",
1004              session->s_mds, mi->num_export_targets);
1005
1006         for (i = 0; i < mi->num_export_targets; i++) {
1007                 ts = __open_export_target_session(mdsc, mi->export_targets[i]);
1008                 if (!IS_ERR(ts))
1009                         ceph_put_mds_session(ts);
1010         }
1011 }
1012
1013 void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
1014                                            struct ceph_mds_session *session)
1015 {
1016         mutex_lock(&mdsc->mutex);
1017         __open_export_target_sessions(mdsc, session);
1018         mutex_unlock(&mdsc->mutex);
1019 }
1020
1021 /*
1022  * session caps
1023  */
1024
1025 /* caller holds s_cap_lock, we drop it */
1026 static void cleanup_cap_releases(struct ceph_mds_client *mdsc,
1027                                  struct ceph_mds_session *session)
1028         __releases(session->s_cap_lock)
1029 {
1030         LIST_HEAD(tmp_list);
1031         list_splice_init(&session->s_cap_releases, &tmp_list);
1032         session->s_num_cap_releases = 0;
1033         spin_unlock(&session->s_cap_lock);
1034
1035         dout("cleanup_cap_releases mds%d\n", session->s_mds);
1036         while (!list_empty(&tmp_list)) {
1037                 struct ceph_cap *cap;
1038                 /* zero out the in-progress message */
1039                 cap = list_first_entry(&tmp_list,
1040                                         struct ceph_cap, session_caps);
1041                 list_del(&cap->session_caps);
1042                 ceph_put_cap(mdsc, cap);
1043         }
1044 }
1045
1046 static void cleanup_session_requests(struct ceph_mds_client *mdsc,
1047                                      struct ceph_mds_session *session)
1048 {
1049         struct ceph_mds_request *req;
1050         struct rb_node *p;
1051
1052         dout("cleanup_session_requests mds%d\n", session->s_mds);
1053         mutex_lock(&mdsc->mutex);
1054         while (!list_empty(&session->s_unsafe)) {
1055                 req = list_first_entry(&session->s_unsafe,
1056                                        struct ceph_mds_request, r_unsafe_item);
1057                 pr_warn_ratelimited(" dropping unsafe request %llu\n",
1058                                     req->r_tid);
1059                 __unregister_request(mdsc, req);
1060         }
1061         /* zero r_attempts, so kick_requests() will re-send requests */
1062         p = rb_first(&mdsc->request_tree);
1063         while (p) {
1064                 req = rb_entry(p, struct ceph_mds_request, r_node);
1065                 p = rb_next(p);
1066                 if (req->r_session &&
1067                     req->r_session->s_mds == session->s_mds)
1068                         req->r_attempts = 0;
1069         }
1070         mutex_unlock(&mdsc->mutex);
1071 }
1072
1073 /*
1074  * Helper to safely iterate over all caps associated with a session, with
1075  * special care taken to handle a racing __ceph_remove_cap().
1076  *
1077  * Caller must hold session s_mutex.
1078  */
1079 static int iterate_session_caps(struct ceph_mds_session *session,
1080                                  int (*cb)(struct inode *, struct ceph_cap *,
1081                                             void *), void *arg)
1082 {
1083         struct list_head *p;
1084         struct ceph_cap *cap;
1085         struct inode *inode, *last_inode = NULL;
1086         struct ceph_cap *old_cap = NULL;
1087         int ret;
1088
1089         dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
1090         spin_lock(&session->s_cap_lock);
1091         p = session->s_caps.next;
1092         while (p != &session->s_caps) {
1093                 cap = list_entry(p, struct ceph_cap, session_caps);
1094                 inode = igrab(&cap->ci->vfs_inode);
1095                 if (!inode) {
1096                         p = p->next;
1097                         continue;
1098                 }
1099                 session->s_cap_iterator = cap;
1100                 spin_unlock(&session->s_cap_lock);
1101
1102                 if (last_inode) {
1103                         iput(last_inode);
1104                         last_inode = NULL;
1105                 }
1106                 if (old_cap) {
1107                         ceph_put_cap(session->s_mdsc, old_cap);
1108                         old_cap = NULL;
1109                 }
1110
1111                 ret = cb(inode, cap, arg);
1112                 last_inode = inode;
1113
1114                 spin_lock(&session->s_cap_lock);
1115                 p = p->next;
1116                 if (cap->ci == NULL) {
1117                         dout("iterate_session_caps  finishing cap %p removal\n",
1118                              cap);
1119                         BUG_ON(cap->session != session);
1120                         cap->session = NULL;
1121                         list_del_init(&cap->session_caps);
1122                         session->s_nr_caps--;
1123                         if (cap->queue_release) {
1124                                 list_add_tail(&cap->session_caps,
1125                                               &session->s_cap_releases);
1126                                 session->s_num_cap_releases++;
1127                         } else {
1128                                 old_cap = cap;  /* put_cap it w/o locks held */
1129                         }
1130                 }
1131                 if (ret < 0)
1132                         goto out;
1133         }
1134         ret = 0;
1135 out:
1136         session->s_cap_iterator = NULL;
1137         spin_unlock(&session->s_cap_lock);
1138
1139         iput(last_inode);
1140         if (old_cap)
1141                 ceph_put_cap(session->s_mdsc, old_cap);
1142
1143         return ret;
1144 }
1145
1146 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1147                                   void *arg)
1148 {
1149         struct ceph_inode_info *ci = ceph_inode(inode);
1150         LIST_HEAD(to_remove);
1151         int drop = 0;
1152
1153         dout("removing cap %p, ci is %p, inode is %p\n",
1154              cap, ci, &ci->vfs_inode);
1155         spin_lock(&ci->i_ceph_lock);
1156         __ceph_remove_cap(cap, false);
1157         if (!ci->i_auth_cap) {
1158                 struct ceph_cap_flush *cf;
1159                 struct ceph_mds_client *mdsc =
1160                         ceph_sb_to_client(inode->i_sb)->mdsc;
1161
1162                 while (true) {
1163                         struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
1164                         if (!n)
1165                                 break;
1166                         cf = rb_entry(n, struct ceph_cap_flush, i_node);
1167                         rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
1168                         list_add(&cf->list, &to_remove);
1169                 }
1170
1171                 spin_lock(&mdsc->cap_dirty_lock);
1172
1173                 list_for_each_entry(cf, &to_remove, list)
1174                         rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
1175
1176                 if (!list_empty(&ci->i_dirty_item)) {
1177                         pr_warn_ratelimited(
1178                                 " dropping dirty %s state for %p %lld\n",
1179                                 ceph_cap_string(ci->i_dirty_caps),
1180                                 inode, ceph_ino(inode));
1181                         ci->i_dirty_caps = 0;
1182                         list_del_init(&ci->i_dirty_item);
1183                         drop = 1;
1184                 }
1185                 if (!list_empty(&ci->i_flushing_item)) {
1186                         pr_warn_ratelimited(
1187                                 " dropping dirty+flushing %s state for %p %lld\n",
1188                                 ceph_cap_string(ci->i_flushing_caps),
1189                                 inode, ceph_ino(inode));
1190                         ci->i_flushing_caps = 0;
1191                         list_del_init(&ci->i_flushing_item);
1192                         mdsc->num_cap_flushing--;
1193                         drop = 1;
1194                 }
1195                 spin_unlock(&mdsc->cap_dirty_lock);
1196
1197                 if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
1198                         list_add(&ci->i_prealloc_cap_flush->list, &to_remove);
1199                         ci->i_prealloc_cap_flush = NULL;
1200                 }
1201         }
1202         spin_unlock(&ci->i_ceph_lock);
1203         while (!list_empty(&to_remove)) {
1204                 struct ceph_cap_flush *cf;
1205                 cf = list_first_entry(&to_remove,
1206                                       struct ceph_cap_flush, list);
1207                 list_del(&cf->list);
1208                 ceph_free_cap_flush(cf);
1209         }
1210         while (drop--)
1211                 iput(inode);
1212         return 0;
1213 }
1214
1215 /*
1216  * caller must hold session s_mutex
1217  */
1218 static void remove_session_caps(struct ceph_mds_session *session)
1219 {
1220         dout("remove_session_caps on %p\n", session);
1221         iterate_session_caps(session, remove_session_caps_cb, NULL);
1222
1223         spin_lock(&session->s_cap_lock);
1224         if (session->s_nr_caps > 0) {
1225                 struct super_block *sb = session->s_mdsc->fsc->sb;
1226                 struct inode *inode;
1227                 struct ceph_cap *cap, *prev = NULL;
1228                 struct ceph_vino vino;
1229                 /*
1230                  * iterate_session_caps() skips inodes that are being
1231                  * deleted, we need to wait until deletions are complete.
1232                  * __wait_on_freeing_inode() is designed for the job,
1233                  * but it is not exported, so use lookup inode function
1234                  * to access it.
1235                  */
1236                 while (!list_empty(&session->s_caps)) {
1237                         cap = list_entry(session->s_caps.next,
1238                                          struct ceph_cap, session_caps);
1239                         if (cap == prev)
1240                                 break;
1241                         prev = cap;
1242                         vino = cap->ci->i_vino;
1243                         spin_unlock(&session->s_cap_lock);
1244
1245                         inode = ceph_find_inode(sb, vino);
1246                         iput(inode);
1247
1248                         spin_lock(&session->s_cap_lock);
1249                 }
1250         }
1251
1252         // drop cap expires and unlock s_cap_lock
1253         cleanup_cap_releases(session->s_mdsc, session);
1254
1255         BUG_ON(session->s_nr_caps > 0);
1256         BUG_ON(!list_empty(&session->s_cap_flushing));
1257 }
1258
1259 /*
1260  * wake up any threads waiting on this session's caps.  if the cap is
1261  * old (didn't get renewed on the client reconnect), remove it now.
1262  *
1263  * caller must hold s_mutex.
1264  */
1265 static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
1266                               void *arg)
1267 {
1268         struct ceph_inode_info *ci = ceph_inode(inode);
1269
1270         wake_up_all(&ci->i_cap_wq);
1271         if (arg) {
1272                 spin_lock(&ci->i_ceph_lock);
1273                 ci->i_wanted_max_size = 0;
1274                 ci->i_requested_max_size = 0;
1275                 spin_unlock(&ci->i_ceph_lock);
1276         }
1277         return 0;
1278 }
1279
1280 static void wake_up_session_caps(struct ceph_mds_session *session,
1281                                  int reconnect)
1282 {
1283         dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
1284         iterate_session_caps(session, wake_up_session_cb,
1285                              (void *)(unsigned long)reconnect);
1286 }
1287
1288 /*
1289  * Send periodic message to MDS renewing all currently held caps.  The
1290  * ack will reset the expiration for all caps from this session.
1291  *
1292  * caller holds s_mutex
1293  */
1294 static int send_renew_caps(struct ceph_mds_client *mdsc,
1295                            struct ceph_mds_session *session)
1296 {
1297         struct ceph_msg *msg;
1298         int state;
1299
1300         if (time_after_eq(jiffies, session->s_cap_ttl) &&
1301             time_after_eq(session->s_cap_ttl, session->s_renew_requested))
1302                 pr_info("mds%d caps stale\n", session->s_mds);
1303         session->s_renew_requested = jiffies;
1304
1305         /* do not try to renew caps until a recovering mds has reconnected
1306          * with its clients. */
1307         state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
1308         if (state < CEPH_MDS_STATE_RECONNECT) {
1309                 dout("send_renew_caps ignoring mds%d (%s)\n",
1310                      session->s_mds, ceph_mds_state_name(state));
1311                 return 0;
1312         }
1313
1314         dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
1315                 ceph_mds_state_name(state));
1316         msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
1317                                  ++session->s_renew_seq);
1318         if (!msg)
1319                 return -ENOMEM;
1320         ceph_con_send(&session->s_con, msg);
1321         return 0;
1322 }
1323
1324 static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
1325                              struct ceph_mds_session *session, u64 seq)
1326 {
1327         struct ceph_msg *msg;
1328
1329         dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
1330              session->s_mds, ceph_session_state_name(session->s_state), seq);
1331         msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
1332         if (!msg)
1333                 return -ENOMEM;
1334         ceph_con_send(&session->s_con, msg);
1335         return 0;
1336 }
1337
1338
1339 /*
1340  * Note new cap ttl, and any transition from stale -> not stale (fresh?).
1341  *
1342  * Called under session->s_mutex
1343  */
1344 static void renewed_caps(struct ceph_mds_client *mdsc,
1345                          struct ceph_mds_session *session, int is_renew)
1346 {
1347         int was_stale;
1348         int wake = 0;
1349
1350         spin_lock(&session->s_cap_lock);
1351         was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
1352
1353         session->s_cap_ttl = session->s_renew_requested +
1354                 mdsc->mdsmap->m_session_timeout*HZ;
1355
1356         if (was_stale) {
1357                 if (time_before(jiffies, session->s_cap_ttl)) {
1358                         pr_info("mds%d caps renewed\n", session->s_mds);
1359                         wake = 1;
1360                 } else {
1361                         pr_info("mds%d caps still stale\n", session->s_mds);
1362                 }
1363         }
1364         dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
1365              session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
1366              time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
1367         spin_unlock(&session->s_cap_lock);
1368
1369         if (wake)
1370                 wake_up_session_caps(session, 0);
1371 }
1372
1373 /*
1374  * send a session close request
1375  */
1376 static int request_close_session(struct ceph_mds_client *mdsc,
1377                                  struct ceph_mds_session *session)
1378 {
1379         struct ceph_msg *msg;
1380
1381         dout("request_close_session mds%d state %s seq %lld\n",
1382              session->s_mds, ceph_session_state_name(session->s_state),
1383              session->s_seq);
1384         msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
1385         if (!msg)
1386                 return -ENOMEM;
1387         ceph_con_send(&session->s_con, msg);
1388         return 0;
1389 }
1390
1391 /*
1392  * Called with s_mutex held.
1393  */
1394 static int __close_session(struct ceph_mds_client *mdsc,
1395                          struct ceph_mds_session *session)
1396 {
1397         if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
1398                 return 0;
1399         session->s_state = CEPH_MDS_SESSION_CLOSING;
1400         return request_close_session(mdsc, session);
1401 }
1402
1403 /*
1404  * Trim old(er) caps.
1405  *
1406  * Because we can't cache an inode without one or more caps, we do
1407  * this indirectly: if a cap is unused, we prune its aliases, at which
1408  * point the inode will hopefully get dropped to.
1409  *
1410  * Yes, this is a bit sloppy.  Our only real goal here is to respond to
1411  * memory pressure from the MDS, though, so it needn't be perfect.
1412  */
1413 static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1414 {
1415         struct ceph_mds_session *session = arg;
1416         struct ceph_inode_info *ci = ceph_inode(inode);
1417         int used, wanted, oissued, mine;
1418
1419         if (session->s_trim_caps <= 0)
1420                 return -1;
1421
1422         spin_lock(&ci->i_ceph_lock);
1423         mine = cap->issued | cap->implemented;
1424         used = __ceph_caps_used(ci);
1425         wanted = __ceph_caps_file_wanted(ci);
1426         oissued = __ceph_caps_issued_other(ci, cap);
1427
1428         dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
1429              inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
1430              ceph_cap_string(used), ceph_cap_string(wanted));
1431         if (cap == ci->i_auth_cap) {
1432                 if (ci->i_dirty_caps || ci->i_flushing_caps ||
1433                     !list_empty(&ci->i_cap_snaps))
1434                         goto out;
1435                 if ((used | wanted) & CEPH_CAP_ANY_WR)
1436                         goto out;
1437         }
1438         /* The inode has cached pages, but it's no longer used.
1439          * we can safely drop it */
1440         if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
1441             !(oissued & CEPH_CAP_FILE_CACHE)) {
1442           used = 0;
1443           oissued = 0;
1444         }
1445         if ((used | wanted) & ~oissued & mine)
1446                 goto out;   /* we need these caps */
1447
1448         session->s_trim_caps--;
1449         if (oissued) {
1450                 /* we aren't the only cap.. just remove us */
1451                 __ceph_remove_cap(cap, true);
1452         } else {
1453                 /* try dropping referring dentries */
1454                 spin_unlock(&ci->i_ceph_lock);
1455                 d_prune_aliases(inode);
1456                 dout("trim_caps_cb %p cap %p  pruned, count now %d\n",
1457                      inode, cap, atomic_read(&inode->i_count));
1458                 return 0;
1459         }
1460
1461 out:
1462         spin_unlock(&ci->i_ceph_lock);
1463         return 0;
1464 }
1465
1466 /*
1467  * Trim session cap count down to some max number.
1468  */
1469 static int trim_caps(struct ceph_mds_client *mdsc,
1470                      struct ceph_mds_session *session,
1471                      int max_caps)
1472 {
1473         int trim_caps = session->s_nr_caps - max_caps;
1474
1475         dout("trim_caps mds%d start: %d / %d, trim %d\n",
1476              session->s_mds, session->s_nr_caps, max_caps, trim_caps);
1477         if (trim_caps > 0) {
1478                 session->s_trim_caps = trim_caps;
1479                 iterate_session_caps(session, trim_caps_cb, session);
1480                 dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1481                      session->s_mds, session->s_nr_caps, max_caps,
1482                         trim_caps - session->s_trim_caps);
1483                 session->s_trim_caps = 0;
1484         }
1485
1486         ceph_send_cap_releases(mdsc, session);
1487         return 0;
1488 }
1489
1490 static int check_capsnap_flush(struct ceph_inode_info *ci,
1491                                u64 want_snap_seq)
1492 {
1493         int ret = 1;
1494         spin_lock(&ci->i_ceph_lock);
1495         if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
1496                 struct ceph_cap_snap *capsnap =
1497                         list_first_entry(&ci->i_cap_snaps,
1498                                          struct ceph_cap_snap, ci_item);
1499                 ret = capsnap->follows >= want_snap_seq;
1500         }
1501         spin_unlock(&ci->i_ceph_lock);
1502         return ret;
1503 }
1504
1505 static int check_caps_flush(struct ceph_mds_client *mdsc,
1506                             u64 want_flush_tid)
1507 {
1508         struct rb_node *n;
1509         struct ceph_cap_flush *cf;
1510         int ret = 1;
1511
1512         spin_lock(&mdsc->cap_dirty_lock);
1513         n = rb_first(&mdsc->cap_flush_tree);
1514         cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
1515         if (cf && cf->tid <= want_flush_tid) {
1516                 dout("check_caps_flush still flushing tid %llu <= %llu\n",
1517                      cf->tid, want_flush_tid);
1518                 ret = 0;
1519         }
1520         spin_unlock(&mdsc->cap_dirty_lock);
1521         return ret;
1522 }
1523
1524 /*
1525  * flush all dirty inode data to disk.
1526  *
1527  * returns true if we've flushed through want_flush_tid
1528  */
1529 static void wait_caps_flush(struct ceph_mds_client *mdsc,
1530                             u64 want_flush_tid, u64 want_snap_seq)
1531 {
1532         int mds;
1533
1534         dout("check_caps_flush want %llu snap want %llu\n",
1535              want_flush_tid, want_snap_seq);
1536         mutex_lock(&mdsc->mutex);
1537         for (mds = 0; mds < mdsc->max_sessions; ) {
1538                 struct ceph_mds_session *session = mdsc->sessions[mds];
1539                 struct inode *inode = NULL;
1540
1541                 if (!session) {
1542                         mds++;
1543                         continue;
1544                 }
1545                 get_session(session);
1546                 mutex_unlock(&mdsc->mutex);
1547
1548                 mutex_lock(&session->s_mutex);
1549                 if (!list_empty(&session->s_cap_snaps_flushing)) {
1550                         struct ceph_cap_snap *capsnap =
1551                                 list_first_entry(&session->s_cap_snaps_flushing,
1552                                                  struct ceph_cap_snap,
1553                                                  flushing_item);
1554                         struct ceph_inode_info *ci = capsnap->ci;
1555                         if (!check_capsnap_flush(ci, want_snap_seq)) {
1556                                 dout("check_cap_flush still flushing snap %p "
1557                                      "follows %lld <= %lld to mds%d\n",
1558                                      &ci->vfs_inode, capsnap->follows,
1559                                      want_snap_seq, mds);
1560                                 inode = igrab(&ci->vfs_inode);
1561                         }
1562                 }
1563                 mutex_unlock(&session->s_mutex);
1564                 ceph_put_mds_session(session);
1565
1566                 if (inode) {
1567                         wait_event(mdsc->cap_flushing_wq,
1568                                    check_capsnap_flush(ceph_inode(inode),
1569                                                        want_snap_seq));
1570                         iput(inode);
1571                 } else {
1572                         mds++;
1573                 }
1574
1575                 mutex_lock(&mdsc->mutex);
1576         }
1577         mutex_unlock(&mdsc->mutex);
1578
1579         wait_event(mdsc->cap_flushing_wq,
1580                    check_caps_flush(mdsc, want_flush_tid));
1581
1582         dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
1583 }
1584
1585 /*
1586  * called under s_mutex
1587  */
1588 void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
1589                             struct ceph_mds_session *session)
1590 {
1591         struct ceph_msg *msg = NULL;
1592         struct ceph_mds_cap_release *head;
1593         struct ceph_mds_cap_item *item;
1594         struct ceph_cap *cap;
1595         LIST_HEAD(tmp_list);
1596         int num_cap_releases;
1597
1598         spin_lock(&session->s_cap_lock);
1599 again:
1600         list_splice_init(&session->s_cap_releases, &tmp_list);
1601         num_cap_releases = session->s_num_cap_releases;
1602         session->s_num_cap_releases = 0;
1603         spin_unlock(&session->s_cap_lock);
1604
1605         while (!list_empty(&tmp_list)) {
1606                 if (!msg) {
1607                         msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
1608                                         PAGE_CACHE_SIZE, GFP_NOFS, false);
1609                         if (!msg)
1610                                 goto out_err;
1611                         head = msg->front.iov_base;
1612                         head->num = cpu_to_le32(0);
1613                         msg->front.iov_len = sizeof(*head);
1614                 }
1615                 cap = list_first_entry(&tmp_list, struct ceph_cap,
1616                                         session_caps);
1617                 list_del(&cap->session_caps);
1618                 num_cap_releases--;
1619
1620                 head = msg->front.iov_base;
1621                 le32_add_cpu(&head->num, 1);
1622                 item = msg->front.iov_base + msg->front.iov_len;
1623                 item->ino = cpu_to_le64(cap->cap_ino);
1624                 item->cap_id = cpu_to_le64(cap->cap_id);
1625                 item->migrate_seq = cpu_to_le32(cap->mseq);
1626                 item->seq = cpu_to_le32(cap->issue_seq);
1627                 msg->front.iov_len += sizeof(*item);
1628
1629                 ceph_put_cap(mdsc, cap);
1630
1631                 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1632                         msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1633                         dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1634                         ceph_con_send(&session->s_con, msg);
1635                         msg = NULL;
1636                 }
1637         }
1638
1639         BUG_ON(num_cap_releases != 0);
1640
1641         spin_lock(&session->s_cap_lock);
1642         if (!list_empty(&session->s_cap_releases))
1643                 goto again;
1644         spin_unlock(&session->s_cap_lock);
1645
1646         if (msg) {
1647                 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1648                 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1649                 ceph_con_send(&session->s_con, msg);
1650         }
1651         return;
1652 out_err:
1653         pr_err("send_cap_releases mds%d, failed to allocate message\n",
1654                 session->s_mds);
1655         spin_lock(&session->s_cap_lock);
1656         list_splice(&tmp_list, &session->s_cap_releases);
1657         session->s_num_cap_releases += num_cap_releases;
1658         spin_unlock(&session->s_cap_lock);
1659 }
1660
1661 /*
1662  * requests
1663  */
1664
1665 int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
1666                                     struct inode *dir)
1667 {
1668         struct ceph_inode_info *ci = ceph_inode(dir);
1669         struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1670         struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
1671         size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) +
1672                       sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
1673         int order, num_entries;
1674
1675         spin_lock(&ci->i_ceph_lock);
1676         num_entries = ci->i_files + ci->i_subdirs;
1677         spin_unlock(&ci->i_ceph_lock);
1678         num_entries = max(num_entries, 1);
1679         num_entries = min(num_entries, opt->max_readdir);
1680
1681         order = get_order(size * num_entries);
1682         while (order >= 0) {
1683                 rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL |
1684                                                         __GFP_NOWARN,
1685                                                         order);
1686                 if (rinfo->dir_in)
1687                         break;
1688                 order--;
1689         }
1690         if (!rinfo->dir_in)
1691                 return -ENOMEM;
1692
1693         num_entries = (PAGE_SIZE << order) / size;
1694         num_entries = min(num_entries, opt->max_readdir);
1695
1696         rinfo->dir_buf_size = PAGE_SIZE << order;
1697         req->r_num_caps = num_entries + 1;
1698         req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
1699         req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
1700         return 0;
1701 }
1702
1703 /*
1704  * Create an mds request.
1705  */
1706 struct ceph_mds_request *
1707 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1708 {
1709         struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
1710
1711         if (!req)
1712                 return ERR_PTR(-ENOMEM);
1713
1714         mutex_init(&req->r_fill_mutex);
1715         req->r_mdsc = mdsc;
1716         req->r_started = jiffies;
1717         req->r_resend_mds = -1;
1718         INIT_LIST_HEAD(&req->r_unsafe_dir_item);
1719         INIT_LIST_HEAD(&req->r_unsafe_target_item);
1720         req->r_fmode = -1;
1721         kref_init(&req->r_kref);
1722         INIT_LIST_HEAD(&req->r_wait);
1723         init_completion(&req->r_completion);
1724         init_completion(&req->r_safe_completion);
1725         INIT_LIST_HEAD(&req->r_unsafe_item);
1726
1727         req->r_stamp = CURRENT_TIME;
1728
1729         req->r_op = op;
1730         req->r_direct_mode = mode;
1731         return req;
1732 }
1733
1734 /*
1735  * return oldest (lowest) request, tid in request tree, 0 if none.
1736  *
1737  * called under mdsc->mutex.
1738  */
1739 static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
1740 {
1741         if (RB_EMPTY_ROOT(&mdsc->request_tree))
1742                 return NULL;
1743         return rb_entry(rb_first(&mdsc->request_tree),
1744                         struct ceph_mds_request, r_node);
1745 }
1746
1747 static inline  u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
1748 {
1749         return mdsc->oldest_tid;
1750 }
1751
1752 /*
1753  * Build a dentry's path.  Allocate on heap; caller must kfree.  Based
1754  * on build_path_from_dentry in fs/cifs/dir.c.
1755  *
1756  * If @stop_on_nosnap, generate path relative to the first non-snapped
1757  * inode.
1758  *
1759  * Encode hidden .snap dirs as a double /, i.e.
1760  *   foo/.snap/bar -> foo//bar
1761  */
1762 char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
1763                            int stop_on_nosnap)
1764 {
1765         struct dentry *temp;
1766         char *path;
1767         int len, pos;
1768         unsigned seq;
1769
1770         if (dentry == NULL)
1771                 return ERR_PTR(-EINVAL);
1772
1773 retry:
1774         len = 0;
1775         seq = read_seqbegin(&rename_lock);
1776         rcu_read_lock();
1777         for (temp = dentry; !IS_ROOT(temp);) {
1778                 struct inode *inode = d_inode(temp);
1779                 if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
1780                         len++;  /* slash only */
1781                 else if (stop_on_nosnap && inode &&
1782                          ceph_snap(inode) == CEPH_NOSNAP)
1783                         break;
1784                 else
1785                         len += 1 + temp->d_name.len;
1786                 temp = temp->d_parent;
1787         }
1788         rcu_read_unlock();
1789         if (len)
1790                 len--;  /* no leading '/' */
1791
1792         path = kmalloc(len+1, GFP_NOFS);
1793         if (path == NULL)
1794                 return ERR_PTR(-ENOMEM);
1795         pos = len;
1796         path[pos] = 0;  /* trailing null */
1797         rcu_read_lock();
1798         for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
1799                 struct inode *inode;
1800
1801                 spin_lock(&temp->d_lock);
1802                 inode = d_inode(temp);
1803                 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1804                         dout("build_path path+%d: %p SNAPDIR\n",
1805                              pos, temp);
1806                 } else if (stop_on_nosnap && inode &&
1807                            ceph_snap(inode) == CEPH_NOSNAP) {
1808                         spin_unlock(&temp->d_lock);
1809                         break;
1810                 } else {
1811                         pos -= temp->d_name.len;
1812                         if (pos < 0) {
1813                                 spin_unlock(&temp->d_lock);
1814                                 break;
1815                         }
1816                         strncpy(path + pos, temp->d_name.name,
1817                                 temp->d_name.len);
1818                 }
1819                 spin_unlock(&temp->d_lock);
1820                 if (pos)
1821                         path[--pos] = '/';
1822                 temp = temp->d_parent;
1823         }
1824         rcu_read_unlock();
1825         if (pos != 0 || read_seqretry(&rename_lock, seq)) {
1826                 pr_err("build_path did not end path lookup where "
1827                        "expected, namelen is %d, pos is %d\n", len, pos);
1828                 /* presumably this is only possible if racing with a
1829                    rename of one of the parent directories (we can not
1830                    lock the dentries above us to prevent this, but
1831                    retrying should be harmless) */
1832                 kfree(path);
1833                 goto retry;
1834         }
1835
1836         *base = ceph_ino(d_inode(temp));
1837         *plen = len;
1838         dout("build_path on %p %d built %llx '%.*s'\n",
1839              dentry, d_count(dentry), *base, len, path);
1840         return path;
1841 }
1842
1843 static int build_dentry_path(struct dentry *dentry,
1844                              const char **ppath, int *ppathlen, u64 *pino,
1845                              int *pfreepath)
1846 {
1847         char *path;
1848
1849         if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) {
1850                 *pino = ceph_ino(d_inode(dentry->d_parent));
1851                 *ppath = dentry->d_name.name;
1852                 *ppathlen = dentry->d_name.len;
1853                 return 0;
1854         }
1855         path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1856         if (IS_ERR(path))
1857                 return PTR_ERR(path);
1858         *ppath = path;
1859         *pfreepath = 1;
1860         return 0;
1861 }
1862
1863 static int build_inode_path(struct inode *inode,
1864                             const char **ppath, int *ppathlen, u64 *pino,
1865                             int *pfreepath)
1866 {
1867         struct dentry *dentry;
1868         char *path;
1869
1870         if (ceph_snap(inode) == CEPH_NOSNAP) {
1871                 *pino = ceph_ino(inode);
1872                 *ppathlen = 0;
1873                 return 0;
1874         }
1875         dentry = d_find_alias(inode);
1876         path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1877         dput(dentry);
1878         if (IS_ERR(path))
1879                 return PTR_ERR(path);
1880         *ppath = path;
1881         *pfreepath = 1;
1882         return 0;
1883 }
1884
1885 /*
1886  * request arguments may be specified via an inode *, a dentry *, or
1887  * an explicit ino+path.
1888  */
1889 static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1890                                   const char *rpath, u64 rino,
1891                                   const char **ppath, int *pathlen,
1892                                   u64 *ino, int *freepath)
1893 {
1894         int r = 0;
1895
1896         if (rinode) {
1897                 r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
1898                 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
1899                      ceph_snap(rinode));
1900         } else if (rdentry) {
1901                 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1902                 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1903                      *ppath);
1904         } else if (rpath || rino) {
1905                 *ino = rino;
1906                 *ppath = rpath;
1907                 *pathlen = rpath ? strlen(rpath) : 0;
1908                 dout(" path %.*s\n", *pathlen, rpath);
1909         }
1910
1911         return r;
1912 }
1913
1914 /*
1915  * called under mdsc->mutex
1916  */
1917 static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1918                                                struct ceph_mds_request *req,
1919                                                int mds, bool drop_cap_releases)
1920 {
1921         struct ceph_msg *msg;
1922         struct ceph_mds_request_head *head;
1923         const char *path1 = NULL;
1924         const char *path2 = NULL;
1925         u64 ino1 = 0, ino2 = 0;
1926         int pathlen1 = 0, pathlen2 = 0;
1927         int freepath1 = 0, freepath2 = 0;
1928         int len;
1929         u16 releases;
1930         void *p, *end;
1931         int ret;
1932
1933         ret = set_request_path_attr(req->r_inode, req->r_dentry,
1934                               req->r_path1, req->r_ino1.ino,
1935                               &path1, &pathlen1, &ino1, &freepath1);
1936         if (ret < 0) {
1937                 msg = ERR_PTR(ret);
1938                 goto out;
1939         }
1940
1941         ret = set_request_path_attr(NULL, req->r_old_dentry,
1942                               req->r_path2, req->r_ino2.ino,
1943                               &path2, &pathlen2, &ino2, &freepath2);
1944         if (ret < 0) {
1945                 msg = ERR_PTR(ret);
1946                 goto out_free1;
1947         }
1948
1949         len = sizeof(*head) +
1950                 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
1951                 sizeof(struct ceph_timespec);
1952
1953         /* calculate (max) length for cap releases */
1954         len += sizeof(struct ceph_mds_request_release) *
1955                 (!!req->r_inode_drop + !!req->r_dentry_drop +
1956                  !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
1957         if (req->r_dentry_drop)
1958                 len += req->r_dentry->d_name.len;
1959         if (req->r_old_dentry_drop)
1960                 len += req->r_old_dentry->d_name.len;
1961
1962         msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
1963         if (!msg) {
1964                 msg = ERR_PTR(-ENOMEM);
1965                 goto out_free2;
1966         }
1967
1968         msg->hdr.version = cpu_to_le16(2);
1969         msg->hdr.tid = cpu_to_le64(req->r_tid);
1970
1971         head = msg->front.iov_base;
1972         p = msg->front.iov_base + sizeof(*head);
1973         end = msg->front.iov_base + msg->front.iov_len;
1974
1975         head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1976         head->op = cpu_to_le32(req->r_op);
1977         head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
1978         head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
1979         head->args = req->r_args;
1980
1981         ceph_encode_filepath(&p, end, ino1, path1);
1982         ceph_encode_filepath(&p, end, ino2, path2);
1983
1984         /* make note of release offset, in case we need to replay */
1985         req->r_request_release_offset = p - msg->front.iov_base;
1986
1987         /* cap releases */
1988         releases = 0;
1989         if (req->r_inode_drop)
1990                 releases += ceph_encode_inode_release(&p,
1991                       req->r_inode ? req->r_inode : d_inode(req->r_dentry),
1992                       mds, req->r_inode_drop, req->r_inode_unless, 0);
1993         if (req->r_dentry_drop)
1994                 releases += ceph_encode_dentry_release(&p, req->r_dentry,
1995                        mds, req->r_dentry_drop, req->r_dentry_unless);
1996         if (req->r_old_dentry_drop)
1997                 releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
1998                        mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
1999         if (req->r_old_inode_drop)
2000                 releases += ceph_encode_inode_release(&p,
2001                       d_inode(req->r_old_dentry),
2002                       mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
2003
2004         if (drop_cap_releases) {
2005                 releases = 0;
2006                 p = msg->front.iov_base + req->r_request_release_offset;
2007         }
2008
2009         head->num_releases = cpu_to_le16(releases);
2010
2011         /* time stamp */
2012         {
2013                 struct ceph_timespec ts;
2014                 ceph_encode_timespec(&ts, &req->r_stamp);
2015                 ceph_encode_copy(&p, &ts, sizeof(ts));
2016         }
2017
2018         BUG_ON(p > end);
2019         msg->front.iov_len = p - msg->front.iov_base;
2020         msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
2021
2022         if (req->r_pagelist) {
2023                 struct ceph_pagelist *pagelist = req->r_pagelist;
2024                 atomic_inc(&pagelist->refcnt);
2025                 ceph_msg_data_add_pagelist(msg, pagelist);
2026                 msg->hdr.data_len = cpu_to_le32(pagelist->length);
2027         } else {
2028                 msg->hdr.data_len = 0;
2029         }
2030
2031         msg->hdr.data_off = cpu_to_le16(0);
2032
2033 out_free2:
2034         if (freepath2)
2035                 kfree((char *)path2);
2036 out_free1:
2037         if (freepath1)
2038                 kfree((char *)path1);
2039 out:
2040         return msg;
2041 }
2042
2043 /*
2044  * called under mdsc->mutex if error, under no mutex if
2045  * success.
2046  */
2047 static void complete_request(struct ceph_mds_client *mdsc,
2048                              struct ceph_mds_request *req)
2049 {
2050         if (req->r_callback)
2051                 req->r_callback(mdsc, req);
2052         else
2053                 complete_all(&req->r_completion);
2054 }
2055
2056 /*
2057  * called under mdsc->mutex
2058  */
2059 static int __prepare_send_request(struct ceph_mds_client *mdsc,
2060                                   struct ceph_mds_request *req,
2061                                   int mds, bool drop_cap_releases)
2062 {
2063         struct ceph_mds_request_head *rhead;
2064         struct ceph_msg *msg;
2065         int flags = 0;
2066
2067         req->r_attempts++;
2068         if (req->r_inode) {
2069                 struct ceph_cap *cap =
2070                         ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
2071
2072                 if (cap)
2073                         req->r_sent_on_mseq = cap->mseq;
2074                 else
2075                         req->r_sent_on_mseq = -1;
2076         }
2077         dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
2078              req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
2079
2080         if (req->r_got_unsafe) {
2081                 void *p;
2082                 /*
2083                  * Replay.  Do not regenerate message (and rebuild
2084                  * paths, etc.); just use the original message.
2085                  * Rebuilding paths will break for renames because
2086                  * d_move mangles the src name.
2087                  */
2088                 msg = req->r_request;
2089                 rhead = msg->front.iov_base;
2090
2091                 flags = le32_to_cpu(rhead->flags);
2092                 flags |= CEPH_MDS_FLAG_REPLAY;
2093                 rhead->flags = cpu_to_le32(flags);
2094
2095                 if (req->r_target_inode)
2096                         rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
2097
2098                 rhead->num_retry = req->r_attempts - 1;
2099
2100                 /* remove cap/dentry releases from message */
2101                 rhead->num_releases = 0;
2102
2103                 /* time stamp */
2104                 p = msg->front.iov_base + req->r_request_release_offset;
2105                 {
2106                         struct ceph_timespec ts;
2107                         ceph_encode_timespec(&ts, &req->r_stamp);
2108                         ceph_encode_copy(&p, &ts, sizeof(ts));
2109                 }
2110
2111                 msg->front.iov_len = p - msg->front.iov_base;
2112                 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
2113                 return 0;
2114         }
2115
2116         if (req->r_request) {
2117                 ceph_msg_put(req->r_request);
2118                 req->r_request = NULL;
2119         }
2120         msg = create_request_message(mdsc, req, mds, drop_cap_releases);
2121         if (IS_ERR(msg)) {
2122                 req->r_err = PTR_ERR(msg);
2123                 return PTR_ERR(msg);
2124         }
2125         req->r_request = msg;
2126
2127         rhead = msg->front.iov_base;
2128         rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
2129         if (req->r_got_unsafe)
2130                 flags |= CEPH_MDS_FLAG_REPLAY;
2131         if (req->r_locked_dir)
2132                 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
2133         rhead->flags = cpu_to_le32(flags);
2134         rhead->num_fwd = req->r_num_fwd;
2135         rhead->num_retry = req->r_attempts - 1;
2136         rhead->ino = 0;
2137
2138         dout(" r_locked_dir = %p\n", req->r_locked_dir);
2139         return 0;
2140 }
2141
2142 /*
2143  * send request, or put it on the appropriate wait list.
2144  */
2145 static int __do_request(struct ceph_mds_client *mdsc,
2146                         struct ceph_mds_request *req)
2147 {
2148         struct ceph_mds_session *session = NULL;
2149         int mds = -1;
2150         int err = 0;
2151
2152         if (req->r_err || req->r_got_result) {
2153                 if (req->r_aborted)
2154                         __unregister_request(mdsc, req);
2155                 goto out;
2156         }
2157
2158         if (req->r_timeout &&
2159             time_after_eq(jiffies, req->r_started + req->r_timeout)) {
2160                 dout("do_request timed out\n");
2161                 err = -EIO;
2162                 goto finish;
2163         }
2164         if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
2165                 dout("do_request forced umount\n");
2166                 err = -EIO;
2167                 goto finish;
2168         }
2169
2170         put_request_session(req);
2171
2172         mds = __choose_mds(mdsc, req);
2173         if (mds < 0 ||
2174             ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
2175                 dout("do_request no mds or not active, waiting for map\n");
2176                 list_add(&req->r_wait, &mdsc->waiting_for_map);
2177                 goto out;
2178         }
2179
2180         /* get, open session */
2181         session = __ceph_lookup_mds_session(mdsc, mds);
2182         if (!session) {
2183                 session = register_session(mdsc, mds);
2184                 if (IS_ERR(session)) {
2185                         err = PTR_ERR(session);
2186                         goto finish;
2187                 }
2188         }
2189         req->r_session = get_session(session);
2190
2191         dout("do_request mds%d session %p state %s\n", mds, session,
2192              ceph_session_state_name(session->s_state));
2193         if (session->s_state != CEPH_MDS_SESSION_OPEN &&
2194             session->s_state != CEPH_MDS_SESSION_HUNG) {
2195                 if (session->s_state == CEPH_MDS_SESSION_NEW ||
2196                     session->s_state == CEPH_MDS_SESSION_CLOSING)
2197                         __open_session(mdsc, session);
2198                 list_add(&req->r_wait, &session->s_waiting);
2199                 goto out_session;
2200         }
2201
2202         /* send request */
2203         req->r_resend_mds = -1;   /* forget any previous mds hint */
2204
2205         if (req->r_request_started == 0)   /* note request start time */
2206                 req->r_request_started = jiffies;
2207
2208         err = __prepare_send_request(mdsc, req, mds, false);
2209         if (!err) {
2210                 ceph_msg_get(req->r_request);
2211                 ceph_con_send(&session->s_con, req->r_request);
2212         }
2213
2214 out_session:
2215         ceph_put_mds_session(session);
2216 finish:
2217         if (err) {
2218                 dout("__do_request early error %d\n", err);
2219                 req->r_err = err;
2220                 complete_request(mdsc, req);
2221                 __unregister_request(mdsc, req);
2222         }
2223 out:
2224         return err;
2225 }
2226
2227 /*
2228  * called under mdsc->mutex
2229  */
2230 static void __wake_requests(struct ceph_mds_client *mdsc,
2231                             struct list_head *head)
2232 {
2233         struct ceph_mds_request *req;
2234         LIST_HEAD(tmp_list);
2235
2236         list_splice_init(head, &tmp_list);
2237
2238         while (!list_empty(&tmp_list)) {
2239                 req = list_entry(tmp_list.next,
2240                                  struct ceph_mds_request, r_wait);
2241                 list_del_init(&req->r_wait);
2242                 dout(" wake request %p tid %llu\n", req, req->r_tid);
2243                 __do_request(mdsc, req);
2244         }
2245 }
2246
2247 /*
2248  * Wake up threads with requests pending for @mds, so that they can
2249  * resubmit their requests to a possibly different mds.
2250  */
2251 static void kick_requests(struct ceph_mds_client *mdsc, int mds)
2252 {
2253         struct ceph_mds_request *req;
2254         struct rb_node *p = rb_first(&mdsc->request_tree);
2255
2256         dout("kick_requests mds%d\n", mds);
2257         while (p) {
2258                 req = rb_entry(p, struct ceph_mds_request, r_node);
2259                 p = rb_next(p);
2260                 if (req->r_got_unsafe)
2261                         continue;
2262                 if (req->r_attempts > 0)
2263                         continue; /* only new requests */
2264                 if (req->r_session &&
2265                     req->r_session->s_mds == mds) {
2266                         dout(" kicking tid %llu\n", req->r_tid);
2267                         list_del_init(&req->r_wait);
2268                         __do_request(mdsc, req);
2269                 }
2270         }
2271 }
2272
2273 void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
2274                               struct ceph_mds_request *req)
2275 {
2276         dout("submit_request on %p\n", req);
2277         mutex_lock(&mdsc->mutex);
2278         __register_request(mdsc, req, NULL);
2279         __do_request(mdsc, req);
2280         mutex_unlock(&mdsc->mutex);
2281 }
2282
2283 /*
2284  * Synchrously perform an mds request.  Take care of all of the
2285  * session setup, forwarding, retry details.
2286  */
2287 int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
2288                          struct inode *dir,
2289                          struct ceph_mds_request *req)
2290 {
2291         int err;
2292
2293         dout("do_request on %p\n", req);
2294
2295         /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
2296         if (req->r_inode)
2297                 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
2298         if (req->r_locked_dir)
2299                 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
2300         if (req->r_old_dentry_dir)
2301                 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
2302                                   CEPH_CAP_PIN);
2303
2304         /* issue */
2305         mutex_lock(&mdsc->mutex);
2306         __register_request(mdsc, req, dir);
2307         __do_request(mdsc, req);
2308
2309         if (req->r_err) {
2310                 err = req->r_err;
2311                 goto out;
2312         }
2313
2314         /* wait */
2315         mutex_unlock(&mdsc->mutex);
2316         dout("do_request waiting\n");
2317         if (!req->r_timeout && req->r_wait_for_completion) {
2318                 err = req->r_wait_for_completion(mdsc, req);
2319         } else {
2320                 long timeleft = wait_for_completion_killable_timeout(
2321                                         &req->r_completion,
2322                                         ceph_timeout_jiffies(req->r_timeout));
2323                 if (timeleft > 0)
2324                         err = 0;
2325                 else if (!timeleft)
2326                         err = -EIO;  /* timed out */
2327                 else
2328                         err = timeleft;  /* killed */
2329         }
2330         dout("do_request waited, got %d\n", err);
2331         mutex_lock(&mdsc->mutex);
2332
2333         /* only abort if we didn't race with a real reply */
2334         if (req->r_got_result) {
2335                 err = le32_to_cpu(req->r_reply_info.head->result);
2336         } else if (err < 0) {
2337                 dout("aborted request %lld with %d\n", req->r_tid, err);
2338
2339                 /*
2340                  * ensure we aren't running concurrently with
2341                  * ceph_fill_trace or ceph_readdir_prepopulate, which
2342                  * rely on locks (dir mutex) held by our caller.
2343                  */
2344                 mutex_lock(&req->r_fill_mutex);
2345                 req->r_err = err;
2346                 req->r_aborted = true;
2347                 mutex_unlock(&req->r_fill_mutex);
2348
2349                 if (req->r_locked_dir &&
2350                     (req->r_op & CEPH_MDS_OP_WRITE))
2351                         ceph_invalidate_dir_request(req);
2352         } else {
2353                 err = req->r_err;
2354         }
2355
2356 out:
2357         mutex_unlock(&mdsc->mutex);
2358         dout("do_request %p done, result %d\n", req, err);
2359         return err;
2360 }
2361
2362 /*
2363  * Invalidate dir's completeness, dentry lease state on an aborted MDS
2364  * namespace request.
2365  */
2366 void ceph_invalidate_dir_request(struct ceph_mds_request *req)
2367 {
2368         struct inode *inode = req->r_locked_dir;
2369
2370         dout("invalidate_dir_request %p (complete, lease(s))\n", inode);
2371
2372         ceph_dir_clear_complete(inode);
2373         if (req->r_dentry)
2374                 ceph_invalidate_dentry_lease(req->r_dentry);
2375         if (req->r_old_dentry)
2376                 ceph_invalidate_dentry_lease(req->r_old_dentry);
2377 }
2378
2379 /*
2380  * Handle mds reply.
2381  *
2382  * We take the session mutex and parse and process the reply immediately.
2383  * This preserves the logical ordering of replies, capabilities, etc., sent
2384  * by the MDS as they are applied to our local cache.
2385  */
2386 static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2387 {
2388         struct ceph_mds_client *mdsc = session->s_mdsc;
2389         struct ceph_mds_request *req;
2390         struct ceph_mds_reply_head *head = msg->front.iov_base;
2391         struct ceph_mds_reply_info_parsed *rinfo;  /* parsed reply info */
2392         struct ceph_snap_realm *realm;
2393         u64 tid;
2394         int err, result;
2395         int mds = session->s_mds;
2396
2397         if (msg->front.iov_len < sizeof(*head)) {
2398                 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
2399                 ceph_msg_dump(msg);
2400                 return;
2401         }
2402
2403         /* get request, session */
2404         tid = le64_to_cpu(msg->hdr.tid);
2405         mutex_lock(&mdsc->mutex);
2406         req = __lookup_request(mdsc, tid);
2407         if (!req) {
2408                 dout("handle_reply on unknown tid %llu\n", tid);
2409                 mutex_unlock(&mdsc->mutex);
2410                 return;
2411         }
2412         dout("handle_reply %p\n", req);
2413
2414         /* correct session? */
2415         if (req->r_session != session) {
2416                 pr_err("mdsc_handle_reply got %llu on session mds%d"
2417                        " not mds%d\n", tid, session->s_mds,
2418                        req->r_session ? req->r_session->s_mds : -1);
2419                 mutex_unlock(&mdsc->mutex);
2420                 goto out;
2421         }
2422
2423         /* dup? */
2424         if ((req->r_got_unsafe && !head->safe) ||
2425             (req->r_got_safe && head->safe)) {
2426                 pr_warn("got a dup %s reply on %llu from mds%d\n",
2427                            head->safe ? "safe" : "unsafe", tid, mds);
2428                 mutex_unlock(&mdsc->mutex);
2429                 goto out;
2430         }
2431         if (req->r_got_safe) {
2432                 pr_warn("got unsafe after safe on %llu from mds%d\n",
2433                            tid, mds);
2434                 mutex_unlock(&mdsc->mutex);
2435                 goto out;
2436         }
2437
2438         result = le32_to_cpu(head->result);
2439
2440         /*
2441          * Handle an ESTALE
2442          * if we're not talking to the authority, send to them
2443          * if the authority has changed while we weren't looking,
2444          * send to new authority
2445          * Otherwise we just have to return an ESTALE
2446          */
2447         if (result == -ESTALE) {
2448                 dout("got ESTALE on request %llu", req->r_tid);
2449                 req->r_resend_mds = -1;
2450                 if (req->r_direct_mode != USE_AUTH_MDS) {
2451                         dout("not using auth, setting for that now");
2452                         req->r_direct_mode = USE_AUTH_MDS;
2453                         __do_request(mdsc, req);
2454                         mutex_unlock(&mdsc->mutex);
2455                         goto out;
2456                 } else  {
2457                         int mds = __choose_mds(mdsc, req);
2458                         if (mds >= 0 && mds != req->r_session->s_mds) {
2459                                 dout("but auth changed, so resending");
2460                                 __do_request(mdsc, req);
2461                                 mutex_unlock(&mdsc->mutex);
2462                                 goto out;
2463                         }
2464                 }
2465                 dout("have to return ESTALE on request %llu", req->r_tid);
2466         }
2467
2468
2469         if (head->safe) {
2470                 req->r_got_safe = true;
2471                 __unregister_request(mdsc, req);
2472
2473                 if (req->r_got_unsafe) {
2474                         /*
2475                          * We already handled the unsafe response, now do the
2476                          * cleanup.  No need to examine the response; the MDS
2477                          * doesn't include any result info in the safe
2478                          * response.  And even if it did, there is nothing
2479                          * useful we could do with a revised return value.
2480                          */
2481                         dout("got safe reply %llu, mds%d\n", tid, mds);
2482
2483                         /* last unsafe request during umount? */
2484                         if (mdsc->stopping && !__get_oldest_req(mdsc))
2485                                 complete_all(&mdsc->safe_umount_waiters);
2486                         mutex_unlock(&mdsc->mutex);
2487                         goto out;
2488                 }
2489         } else {
2490                 req->r_got_unsafe = true;
2491                 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
2492                 if (req->r_unsafe_dir) {
2493                         struct ceph_inode_info *ci =
2494                                         ceph_inode(req->r_unsafe_dir);
2495                         spin_lock(&ci->i_unsafe_lock);
2496                         list_add_tail(&req->r_unsafe_dir_item,
2497                                       &ci->i_unsafe_dirops);
2498                         spin_unlock(&ci->i_unsafe_lock);
2499                 }
2500         }
2501
2502         dout("handle_reply tid %lld result %d\n", tid, result);
2503         rinfo = &req->r_reply_info;
2504         err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
2505         mutex_unlock(&mdsc->mutex);
2506
2507         mutex_lock(&session->s_mutex);
2508         if (err < 0) {
2509                 pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
2510                 ceph_msg_dump(msg);
2511                 goto out_err;
2512         }
2513
2514         /* snap trace */
2515         realm = NULL;
2516         if (rinfo->snapblob_len) {
2517                 down_write(&mdsc->snap_rwsem);
2518                 ceph_update_snap_trace(mdsc, rinfo->snapblob,
2519                                 rinfo->snapblob + rinfo->snapblob_len,
2520                                 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
2521                                 &realm);
2522                 downgrade_write(&mdsc->snap_rwsem);
2523         } else {
2524                 down_read(&mdsc->snap_rwsem);
2525         }
2526
2527         /* insert trace into our cache */
2528         mutex_lock(&req->r_fill_mutex);
2529         err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2530         if (err == 0) {
2531                 if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
2532                                     req->r_op == CEPH_MDS_OP_LSSNAP))
2533                         ceph_readdir_prepopulate(req, req->r_session);
2534                 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
2535         }
2536         mutex_unlock(&req->r_fill_mutex);
2537
2538         up_read(&mdsc->snap_rwsem);
2539         if (realm)
2540                 ceph_put_snap_realm(mdsc, realm);
2541
2542         if (err == 0 && req->r_got_unsafe && req->r_target_inode) {
2543                 struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
2544                 spin_lock(&ci->i_unsafe_lock);
2545                 list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
2546                 spin_unlock(&ci->i_unsafe_lock);
2547         }
2548 out_err:
2549         mutex_lock(&mdsc->mutex);
2550         if (!req->r_aborted) {
2551                 if (err) {
2552                         req->r_err = err;
2553                 } else {
2554                         req->r_reply =  ceph_msg_get(msg);
2555                         req->r_got_result = true;
2556                 }
2557         } else {
2558                 dout("reply arrived after request %lld was aborted\n", tid);
2559         }
2560         mutex_unlock(&mdsc->mutex);
2561
2562         mutex_unlock(&session->s_mutex);
2563
2564         /* kick calling process */
2565         complete_request(mdsc, req);
2566 out:
2567         ceph_mdsc_put_request(req);
2568         return;
2569 }
2570
2571
2572
2573 /*
2574  * handle mds notification that our request has been forwarded.
2575  */
2576 static void handle_forward(struct ceph_mds_client *mdsc,
2577                            struct ceph_mds_session *session,
2578                            struct ceph_msg *msg)
2579 {
2580         struct ceph_mds_request *req;
2581         u64 tid = le64_to_cpu(msg->hdr.tid);
2582         u32 next_mds;
2583         u32 fwd_seq;
2584         int err = -EINVAL;
2585         void *p = msg->front.iov_base;
2586         void *end = p + msg->front.iov_len;
2587
2588         ceph_decode_need(&p, end, 2*sizeof(u32), bad);
2589         next_mds = ceph_decode_32(&p);
2590         fwd_seq = ceph_decode_32(&p);
2591
2592         mutex_lock(&mdsc->mutex);
2593         req = __lookup_request(mdsc, tid);
2594         if (!req) {
2595                 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
2596                 goto out;  /* dup reply? */
2597         }
2598
2599         if (req->r_aborted) {
2600                 dout("forward tid %llu aborted, unregistering\n", tid);
2601                 __unregister_request(mdsc, req);
2602         } else if (fwd_seq <= req->r_num_fwd) {
2603                 dout("forward tid %llu to mds%d - old seq %d <= %d\n",
2604                      tid, next_mds, req->r_num_fwd, fwd_seq);
2605         } else {
2606                 /* resend. forward race not possible; mds would drop */
2607                 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
2608                 BUG_ON(req->r_err);
2609                 BUG_ON(req->r_got_result);
2610                 req->r_attempts = 0;
2611                 req->r_num_fwd = fwd_seq;
2612                 req->r_resend_mds = next_mds;
2613                 put_request_session(req);
2614                 __do_request(mdsc, req);
2615         }
2616         ceph_mdsc_put_request(req);
2617 out:
2618         mutex_unlock(&mdsc->mutex);
2619         return;
2620
2621 bad:
2622         pr_err("mdsc_handle_forward decode error err=%d\n", err);
2623 }
2624
2625 /*
2626  * handle a mds session control message
2627  */
2628 static void handle_session(struct ceph_mds_session *session,
2629                            struct ceph_msg *msg)
2630 {
2631         struct ceph_mds_client *mdsc = session->s_mdsc;
2632         u32 op;
2633         u64 seq;
2634         int mds = session->s_mds;
2635         struct ceph_mds_session_head *h = msg->front.iov_base;
2636         int wake = 0;
2637
2638         /* decode */
2639         if (msg->front.iov_len != sizeof(*h))
2640                 goto bad;
2641         op = le32_to_cpu(h->op);
2642         seq = le64_to_cpu(h->seq);
2643
2644         mutex_lock(&mdsc->mutex);
2645         if (op == CEPH_SESSION_CLOSE)
2646                 __unregister_session(mdsc, session);
2647         /* FIXME: this ttl calculation is generous */
2648         session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
2649         mutex_unlock(&mdsc->mutex);
2650
2651         mutex_lock(&session->s_mutex);
2652
2653         dout("handle_session mds%d %s %p state %s seq %llu\n",
2654              mds, ceph_session_op_name(op), session,
2655              ceph_session_state_name(session->s_state), seq);
2656
2657         if (session->s_state == CEPH_MDS_SESSION_HUNG) {
2658                 session->s_state = CEPH_MDS_SESSION_OPEN;
2659                 pr_info("mds%d came back\n", session->s_mds);
2660         }
2661
2662         switch (op) {
2663         case CEPH_SESSION_OPEN:
2664                 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2665                         pr_info("mds%d reconnect success\n", session->s_mds);
2666                 session->s_state = CEPH_MDS_SESSION_OPEN;
2667                 renewed_caps(mdsc, session, 0);
2668                 wake = 1;
2669                 if (mdsc->stopping)
2670                         __close_session(mdsc, session);
2671                 break;
2672
2673         case CEPH_SESSION_RENEWCAPS:
2674                 if (session->s_renew_seq == seq)
2675                         renewed_caps(mdsc, session, 1);
2676                 break;
2677
2678         case CEPH_SESSION_CLOSE:
2679                 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2680                         pr_info("mds%d reconnect denied\n", session->s_mds);
2681                 cleanup_session_requests(mdsc, session);
2682                 remove_session_caps(session);
2683                 wake = 2; /* for good measure */
2684                 wake_up_all(&mdsc->session_close_wq);
2685                 break;
2686
2687         case CEPH_SESSION_STALE:
2688                 pr_info("mds%d caps went stale, renewing\n",
2689                         session->s_mds);
2690                 spin_lock(&session->s_gen_ttl_lock);
2691                 session->s_cap_gen++;
2692                 session->s_cap_ttl = jiffies - 1;
2693                 spin_unlock(&session->s_gen_ttl_lock);
2694                 send_renew_caps(mdsc, session);
2695                 break;
2696
2697         case CEPH_SESSION_RECALL_STATE:
2698                 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2699                 break;
2700
2701         case CEPH_SESSION_FLUSHMSG:
2702                 send_flushmsg_ack(mdsc, session, seq);
2703                 break;
2704
2705         case CEPH_SESSION_FORCE_RO:
2706                 dout("force_session_readonly %p\n", session);
2707                 spin_lock(&session->s_cap_lock);
2708                 session->s_readonly = true;
2709                 spin_unlock(&session->s_cap_lock);
2710                 wake_up_session_caps(session, 0);
2711                 break;
2712
2713         default:
2714                 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2715                 WARN_ON(1);
2716         }
2717
2718         mutex_unlock(&session->s_mutex);
2719         if (wake) {
2720                 mutex_lock(&mdsc->mutex);
2721                 __wake_requests(mdsc, &session->s_waiting);
2722                 if (wake == 2)
2723                         kick_requests(mdsc, mds);
2724                 mutex_unlock(&mdsc->mutex);
2725         }
2726         return;
2727
2728 bad:
2729         pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2730                (int)msg->front.iov_len);
2731         ceph_msg_dump(msg);
2732         return;
2733 }
2734
2735
2736 /*
2737  * called under session->mutex.
2738  */
2739 static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2740                                    struct ceph_mds_session *session)
2741 {
2742         struct ceph_mds_request *req, *nreq;
2743         struct rb_node *p;
2744         int err;
2745
2746         dout("replay_unsafe_requests mds%d\n", session->s_mds);
2747
2748         mutex_lock(&mdsc->mutex);
2749         list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2750                 err = __prepare_send_request(mdsc, req, session->s_mds, true);
2751                 if (!err) {
2752                         ceph_msg_get(req->r_request);
2753                         ceph_con_send(&session->s_con, req->r_request);
2754                 }
2755         }
2756
2757         /*
2758          * also re-send old requests when MDS enters reconnect stage. So that MDS
2759          * can process completed request in clientreplay stage.
2760          */
2761         p = rb_first(&mdsc->request_tree);
2762         while (p) {
2763                 req = rb_entry(p, struct ceph_mds_request, r_node);
2764                 p = rb_next(p);
2765                 if (req->r_got_unsafe)
2766                         continue;
2767                 if (req->r_attempts == 0)
2768                         continue; /* only old requests */
2769                 if (req->r_session &&
2770                     req->r_session->s_mds == session->s_mds) {
2771                         err = __prepare_send_request(mdsc, req,
2772                                                      session->s_mds, true);
2773                         if (!err) {
2774                                 ceph_msg_get(req->r_request);
2775                                 ceph_con_send(&session->s_con, req->r_request);
2776                         }
2777                 }
2778         }
2779         mutex_unlock(&mdsc->mutex);
2780 }
2781
2782 /*
2783  * Encode information about a cap for a reconnect with the MDS.
2784  */
2785 static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2786                           void *arg)
2787 {
2788         union {
2789                 struct ceph_mds_cap_reconnect v2;
2790                 struct ceph_mds_cap_reconnect_v1 v1;
2791         } rec;
2792         size_t reclen;
2793         struct ceph_inode_info *ci;
2794         struct ceph_reconnect_state *recon_state = arg;
2795         struct ceph_pagelist *pagelist = recon_state->pagelist;
2796         char *path;
2797         int pathlen, err;
2798         u64 pathbase;
2799         struct dentry *dentry;
2800
2801         ci = cap->ci;
2802
2803         dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2804              inode, ceph_vinop(inode), cap, cap->cap_id,
2805              ceph_cap_string(cap->issued));
2806         err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2807         if (err)
2808                 return err;
2809
2810         dentry = d_find_alias(inode);
2811         if (dentry) {
2812                 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2813                 if (IS_ERR(path)) {
2814                         err = PTR_ERR(path);
2815                         goto out_dput;
2816                 }
2817         } else {
2818                 path = NULL;
2819                 pathlen = 0;
2820         }
2821         err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2822         if (err)
2823                 goto out_free;
2824
2825         spin_lock(&ci->i_ceph_lock);
2826         cap->seq = 0;        /* reset cap seq */
2827         cap->issue_seq = 0;  /* and issue_seq */
2828         cap->mseq = 0;       /* and migrate_seq */
2829         cap->cap_gen = cap->session->s_cap_gen;
2830
2831         if (recon_state->flock) {
2832                 rec.v2.cap_id = cpu_to_le64(cap->cap_id);
2833                 rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2834                 rec.v2.issued = cpu_to_le32(cap->issued);
2835                 rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2836                 rec.v2.pathbase = cpu_to_le64(pathbase);
2837                 rec.v2.flock_len = 0;
2838                 reclen = sizeof(rec.v2);
2839         } else {
2840                 rec.v1.cap_id = cpu_to_le64(cap->cap_id);
2841                 rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2842                 rec.v1.issued = cpu_to_le32(cap->issued);
2843                 rec.v1.size = cpu_to_le64(inode->i_size);
2844                 ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime);
2845                 ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
2846                 rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2847                 rec.v1.pathbase = cpu_to_le64(pathbase);
2848                 reclen = sizeof(rec.v1);
2849         }
2850         spin_unlock(&ci->i_ceph_lock);
2851
2852         if (recon_state->flock) {
2853                 int num_fcntl_locks, num_flock_locks;
2854                 struct ceph_filelock *flocks;
2855
2856 encode_again:
2857                 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
2858                 flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
2859                                  sizeof(struct ceph_filelock), GFP_NOFS);
2860                 if (!flocks) {
2861                         err = -ENOMEM;
2862                         goto out_free;
2863                 }
2864                 err = ceph_encode_locks_to_buffer(inode, flocks,
2865                                                   num_fcntl_locks,
2866                                                   num_flock_locks);
2867                 if (err) {
2868                         kfree(flocks);
2869                         if (err == -ENOSPC)
2870                                 goto encode_again;
2871                         goto out_free;
2872                 }
2873                 /*
2874                  * number of encoded locks is stable, so copy to pagelist
2875                  */
2876                 rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
2877                                     (num_fcntl_locks+num_flock_locks) *
2878                                     sizeof(struct ceph_filelock));
2879                 err = ceph_pagelist_append(pagelist, &rec, reclen);
2880                 if (!err)
2881                         err = ceph_locks_to_pagelist(flocks, pagelist,
2882                                                      num_fcntl_locks,
2883                                                      num_flock_locks);
2884                 kfree(flocks);
2885         } else {
2886                 err = ceph_pagelist_append(pagelist, &rec, reclen);
2887         }
2888
2889         recon_state->nr_caps++;
2890 out_free:
2891         kfree(path);
2892 out_dput:
2893         dput(dentry);
2894         return err;
2895 }
2896
2897
2898 /*
2899  * If an MDS fails and recovers, clients need to reconnect in order to
2900  * reestablish shared state.  This includes all caps issued through
2901  * this session _and_ the snap_realm hierarchy.  Because it's not
2902  * clear which snap realms the mds cares about, we send everything we
2903  * know about.. that ensures we'll then get any new info the
2904  * recovering MDS might have.
2905  *
2906  * This is a relatively heavyweight operation, but it's rare.
2907  *
2908  * called with mdsc->mutex held.
2909  */
2910 static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2911                                struct ceph_mds_session *session)
2912 {
2913         struct ceph_msg *reply;
2914         struct rb_node *p;
2915         int mds = session->s_mds;
2916         int err = -ENOMEM;
2917         int s_nr_caps;
2918         struct ceph_pagelist *pagelist;
2919         struct ceph_reconnect_state recon_state;
2920
2921         pr_info("mds%d reconnect start\n", mds);
2922
2923         pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2924         if (!pagelist)
2925                 goto fail_nopagelist;
2926         ceph_pagelist_init(pagelist);
2927
2928         reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
2929         if (!reply)
2930                 goto fail_nomsg;
2931
2932         mutex_lock(&session->s_mutex);
2933         session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2934         session->s_seq = 0;
2935
2936         dout("session %p state %s\n", session,
2937              ceph_session_state_name(session->s_state));
2938
2939         spin_lock(&session->s_gen_ttl_lock);
2940         session->s_cap_gen++;
2941         spin_unlock(&session->s_gen_ttl_lock);
2942
2943         spin_lock(&session->s_cap_lock);
2944         /* don't know if session is readonly */
2945         session->s_readonly = 0;
2946         /*
2947          * notify __ceph_remove_cap() that we are composing cap reconnect.
2948          * If a cap get released before being added to the cap reconnect,
2949          * __ceph_remove_cap() should skip queuing cap release.
2950          */
2951         session->s_cap_reconnect = 1;
2952         /* drop old cap expires; we're about to reestablish that state */
2953         cleanup_cap_releases(mdsc, session);
2954
2955         /* trim unused caps to reduce MDS's cache rejoin time */
2956         if (mdsc->fsc->sb->s_root)
2957                 shrink_dcache_parent(mdsc->fsc->sb->s_root);
2958
2959         ceph_con_close(&session->s_con);
2960         ceph_con_open(&session->s_con,
2961                       CEPH_ENTITY_TYPE_MDS, mds,
2962                       ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2963
2964         /* replay unsafe requests */
2965         replay_unsafe_requests(mdsc, session);
2966
2967         down_read(&mdsc->snap_rwsem);
2968
2969         /* traverse this session's caps */
2970         s_nr_caps = session->s_nr_caps;
2971         err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
2972         if (err)
2973                 goto fail;
2974
2975         recon_state.nr_caps = 0;
2976         recon_state.pagelist = pagelist;
2977         recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
2978         err = iterate_session_caps(session, encode_caps_cb, &recon_state);
2979         if (err < 0)
2980                 goto fail;
2981
2982         spin_lock(&session->s_cap_lock);
2983         session->s_cap_reconnect = 0;
2984         spin_unlock(&session->s_cap_lock);
2985
2986         /*
2987          * snaprealms.  we provide mds with the ino, seq (version), and
2988          * parent for all of our realms.  If the mds has any newer info,
2989          * it will tell us.
2990          */
2991         for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2992                 struct ceph_snap_realm *realm =
2993                         rb_entry(p, struct ceph_snap_realm, node);
2994                 struct ceph_mds_snaprealm_reconnect sr_rec;
2995
2996                 dout(" adding snap realm %llx seq %lld parent %llx\n",
2997                      realm->ino, realm->seq, realm->parent_ino);
2998                 sr_rec.ino = cpu_to_le64(realm->ino);
2999                 sr_rec.seq = cpu_to_le64(realm->seq);
3000                 sr_rec.parent = cpu_to_le64(realm->parent_ino);
3001                 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
3002                 if (err)
3003                         goto fail;
3004         }
3005
3006         if (recon_state.flock)
3007                 reply->hdr.version = cpu_to_le16(2);
3008
3009         /* raced with cap release? */
3010         if (s_nr_caps != recon_state.nr_caps) {
3011                 struct page *page = list_first_entry(&pagelist->head,
3012                                                      struct page, lru);
3013                 __le32 *addr = kmap_atomic(page);
3014                 *addr = cpu_to_le32(recon_state.nr_caps);
3015                 kunmap_atomic(addr);
3016         }
3017
3018         reply->hdr.data_len = cpu_to_le32(pagelist->length);
3019         ceph_msg_data_add_pagelist(reply, pagelist);
3020
3021         ceph_early_kick_flushing_caps(mdsc, session);
3022
3023         ceph_con_send(&session->s_con, reply);
3024
3025         mutex_unlock(&session->s_mutex);
3026
3027         mutex_lock(&mdsc->mutex);
3028         __wake_requests(mdsc, &session->s_waiting);
3029         mutex_unlock(&mdsc->mutex);
3030
3031         up_read(&mdsc->snap_rwsem);
3032         return;
3033
3034 fail:
3035         ceph_msg_put(reply);
3036         up_read(&mdsc->snap_rwsem);
3037         mutex_unlock(&session->s_mutex);
3038 fail_nomsg:
3039         ceph_pagelist_release(pagelist);
3040 fail_nopagelist:
3041         pr_err("error %d preparing reconnect for mds%d\n", err, mds);
3042         return;
3043 }
3044
3045
3046 /*
3047  * compare old and new mdsmaps, kicking requests
3048  * and closing out old connections as necessary
3049  *
3050  * called under mdsc->mutex.
3051  */
3052 static void check_new_map(struct ceph_mds_client *mdsc,
3053                           struct ceph_mdsmap *newmap,
3054                           struct ceph_mdsmap *oldmap)
3055 {
3056         int i;
3057         int oldstate, newstate;
3058         struct ceph_mds_session *s;
3059
3060         dout("check_new_map new %u old %u\n",
3061              newmap->m_epoch, oldmap->m_epoch);
3062
3063         for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
3064                 if (mdsc->sessions[i] == NULL)
3065                         continue;
3066                 s = mdsc->sessions[i];
3067                 oldstate = ceph_mdsmap_get_state(oldmap, i);
3068                 newstate = ceph_mdsmap_get_state(newmap, i);
3069
3070                 dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
3071                      i, ceph_mds_state_name(oldstate),
3072                      ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
3073                      ceph_mds_state_name(newstate),
3074                      ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
3075                      ceph_session_state_name(s->s_state));
3076
3077                 if (i >= newmap->m_max_mds ||
3078                     memcmp(ceph_mdsmap_get_addr(oldmap, i),
3079                            ceph_mdsmap_get_addr(newmap, i),
3080                            sizeof(struct ceph_entity_addr))) {
3081                         if (s->s_state == CEPH_MDS_SESSION_OPENING) {
3082                                 /* the session never opened, just close it
3083                                  * out now */
3084                                 __wake_requests(mdsc, &s->s_waiting);
3085                                 __unregister_session(mdsc, s);
3086                         } else {
3087                                 /* just close it */
3088                                 mutex_unlock(&mdsc->mutex);
3089                                 mutex_lock(&s->s_mutex);
3090                                 mutex_lock(&mdsc->mutex);
3091                                 ceph_con_close(&s->s_con);
3092                                 mutex_unlock(&s->s_mutex);
3093                                 s->s_state = CEPH_MDS_SESSION_RESTARTING;
3094                         }
3095                 } else if (oldstate == newstate) {
3096                         continue;  /* nothing new with this mds */
3097                 }
3098
3099                 /*
3100                  * send reconnect?
3101                  */
3102                 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
3103                     newstate >= CEPH_MDS_STATE_RECONNECT) {
3104                         mutex_unlock(&mdsc->mutex);
3105                         send_mds_reconnect(mdsc, s);
3106                         mutex_lock(&mdsc->mutex);
3107                 }
3108
3109                 /*
3110                  * kick request on any mds that has gone active.
3111                  */
3112                 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
3113                     newstate >= CEPH_MDS_STATE_ACTIVE) {
3114                         if (oldstate != CEPH_MDS_STATE_CREATING &&
3115                             oldstate != CEPH_MDS_STATE_STARTING)
3116                                 pr_info("mds%d recovery completed\n", s->s_mds);
3117                         kick_requests(mdsc, i);
3118                         ceph_kick_flushing_caps(mdsc, s);
3119                         wake_up_session_caps(s, 1);
3120                 }
3121         }
3122
3123         for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
3124                 s = mdsc->sessions[i];
3125                 if (!s)
3126                         continue;
3127                 if (!ceph_mdsmap_is_laggy(newmap, i))
3128                         continue;
3129                 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
3130                     s->s_state == CEPH_MDS_SESSION_HUNG ||
3131                     s->s_state == CEPH_MDS_SESSION_CLOSING) {
3132                         dout(" connecting to export targets of laggy mds%d\n",
3133                              i);
3134                         __open_export_target_sessions(mdsc, s);
3135                 }
3136         }
3137 }
3138
3139
3140
3141 /*
3142  * leases
3143  */
3144
3145 /*
3146  * caller must hold session s_mutex, dentry->d_lock
3147  */
3148 void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
3149 {
3150         struct ceph_dentry_info *di = ceph_dentry(dentry);
3151
3152         ceph_put_mds_session(di->lease_session);
3153         di->lease_session = NULL;
3154 }
3155
3156 static void handle_lease(struct ceph_mds_client *mdsc,
3157                          struct ceph_mds_session *session,
3158                          struct ceph_msg *msg)
3159 {
3160         struct super_block *sb = mdsc->fsc->sb;
3161         struct inode *inode;
3162         struct dentry *parent, *dentry;
3163         struct ceph_dentry_info *di;
3164         int mds = session->s_mds;
3165         struct ceph_mds_lease *h = msg->front.iov_base;
3166         u32 seq;
3167         struct ceph_vino vino;
3168         struct qstr dname;
3169         int release = 0;
3170
3171         dout("handle_lease from mds%d\n", mds);
3172
3173         /* decode */
3174         if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
3175                 goto bad;
3176         vino.ino = le64_to_cpu(h->ino);
3177         vino.snap = CEPH_NOSNAP;
3178         seq = le32_to_cpu(h->seq);
3179         dname.name = (void *)h + sizeof(*h) + sizeof(u32);
3180         dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
3181         if (dname.len != get_unaligned_le32(h+1))
3182                 goto bad;
3183
3184         /* lookup inode */
3185         inode = ceph_find_inode(sb, vino);
3186         dout("handle_lease %s, ino %llx %p %.*s\n",
3187              ceph_lease_op_name(h->action), vino.ino, inode,
3188              dname.len, dname.name);
3189
3190         mutex_lock(&session->s_mutex);
3191         session->s_seq++;
3192
3193         if (inode == NULL) {
3194                 dout("handle_lease no inode %llx\n", vino.ino);
3195                 goto release;
3196         }
3197
3198         /* dentry */
3199         parent = d_find_alias(inode);
3200         if (!parent) {
3201                 dout("no parent dentry on inode %p\n", inode);
3202                 WARN_ON(1);
3203                 goto release;  /* hrm... */
3204         }
3205         dname.hash = full_name_hash(dname.name, dname.len);
3206         dentry = d_lookup(parent, &dname);
3207         dput(parent);
3208         if (!dentry)
3209                 goto release;
3210
3211         spin_lock(&dentry->d_lock);
3212         di = ceph_dentry(dentry);
3213         switch (h->action) {
3214         case CEPH_MDS_LEASE_REVOKE:
3215                 if (di->lease_session == session) {
3216                         if (ceph_seq_cmp(di->lease_seq, seq) > 0)
3217                                 h->seq = cpu_to_le32(di->lease_seq);
3218                         __ceph_mdsc_drop_dentry_lease(dentry);
3219                 }
3220                 release = 1;
3221                 break;
3222
3223         case CEPH_MDS_LEASE_RENEW:
3224                 if (di->lease_session == session &&
3225                     di->lease_gen == session->s_cap_gen &&
3226                     di->lease_renew_from &&
3227                     di->lease_renew_after == 0) {
3228                         unsigned long duration =
3229                                 msecs_to_jiffies(le32_to_cpu(h->duration_ms));
3230
3231                         di->lease_seq = seq;
3232                         dentry->d_time = di->lease_renew_from + duration;
3233                         di->lease_renew_after = di->lease_renew_from +
3234                                 (duration >> 1);
3235                         di->lease_renew_from = 0;
3236                 }
3237                 break;
3238         }
3239         spin_unlock(&dentry->d_lock);
3240         dput(dentry);
3241
3242         if (!release)
3243                 goto out;
3244
3245 release:
3246         /* let's just reuse the same message */
3247         h->action = CEPH_MDS_LEASE_REVOKE_ACK;
3248         ceph_msg_get(msg);
3249         ceph_con_send(&session->s_con, msg);
3250
3251 out:
3252         iput(inode);
3253         mutex_unlock(&session->s_mutex);
3254         return;
3255
3256 bad:
3257         pr_err("corrupt lease message\n");
3258         ceph_msg_dump(msg);
3259 }
3260
3261 void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
3262                               struct inode *inode,
3263                               struct dentry *dentry, char action,
3264                               u32 seq)
3265 {
3266         struct ceph_msg *msg;
3267         struct ceph_mds_lease *lease;
3268         int len = sizeof(*lease) + sizeof(u32);
3269         int dnamelen = 0;
3270
3271         dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
3272              inode, dentry, ceph_lease_op_name(action), session->s_mds);
3273         dnamelen = dentry->d_name.len;
3274         len += dnamelen;
3275
3276         msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
3277         if (!msg)
3278                 return;
3279         lease = msg->front.iov_base;
3280         lease->action = action;
3281         lease->ino = cpu_to_le64(ceph_vino(inode).ino);
3282         lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
3283         lease->seq = cpu_to_le32(seq);
3284         put_unaligned_le32(dnamelen, lease + 1);
3285         memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
3286
3287         /*
3288          * if this is a preemptive lease RELEASE, no need to
3289          * flush request stream, since the actual request will
3290          * soon follow.
3291          */
3292         msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
3293
3294         ceph_con_send(&session->s_con, msg);
3295 }
3296
3297 /*
3298  * Preemptively release a lease we expect to invalidate anyway.
3299  * Pass @inode always, @dentry is optional.
3300  */
3301 void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
3302                              struct dentry *dentry)
3303 {
3304         struct ceph_dentry_info *di;
3305         struct ceph_mds_session *session;
3306         u32 seq;
3307
3308         BUG_ON(inode == NULL);
3309         BUG_ON(dentry == NULL);
3310
3311         /* is dentry lease valid? */
3312         spin_lock(&dentry->d_lock);
3313         di = ceph_dentry(dentry);
3314         if (!di || !di->lease_session ||
3315             di->lease_session->s_mds < 0 ||
3316             di->lease_gen != di->lease_session->s_cap_gen ||
3317             !time_before(jiffies, dentry->d_time)) {
3318                 dout("lease_release inode %p dentry %p -- "
3319                      "no lease\n",
3320                      inode, dentry);
3321                 spin_unlock(&dentry->d_lock);
3322                 return;
3323         }
3324
3325         /* we do have a lease on this dentry; note mds and seq */
3326         session = ceph_get_mds_session(di->lease_session);
3327         seq = di->lease_seq;
3328         __ceph_mdsc_drop_dentry_lease(dentry);
3329         spin_unlock(&dentry->d_lock);
3330
3331         dout("lease_release inode %p dentry %p to mds%d\n",
3332              inode, dentry, session->s_mds);
3333         ceph_mdsc_lease_send_msg(session, inode, dentry,
3334                                  CEPH_MDS_LEASE_RELEASE, seq);
3335         ceph_put_mds_session(session);
3336 }
3337
3338 /*
3339  * drop all leases (and dentry refs) in preparation for umount
3340  */
3341 static void drop_leases(struct ceph_mds_client *mdsc)
3342 {
3343         int i;
3344
3345         dout("drop_leases\n");
3346         mutex_lock(&mdsc->mutex);
3347         for (i = 0; i < mdsc->max_sessions; i++) {
3348                 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
3349                 if (!s)
3350                         continue;
3351                 mutex_unlock(&mdsc->mutex);
3352                 mutex_lock(&s->s_mutex);
3353                 mutex_unlock(&s->s_mutex);
3354                 ceph_put_mds_session(s);
3355                 mutex_lock(&mdsc->mutex);
3356         }
3357         mutex_unlock(&mdsc->mutex);
3358 }
3359
3360
3361
3362 /*
3363  * delayed work -- periodically trim expired leases, renew caps with mds
3364  */
3365 static void schedule_delayed(struct ceph_mds_client *mdsc)
3366 {
3367         int delay = 5;
3368         unsigned hz = round_jiffies_relative(HZ * delay);
3369         schedule_delayed_work(&mdsc->delayed_work, hz);
3370 }
3371
3372 static void delayed_work(struct work_struct *work)
3373 {
3374         int i;
3375         struct ceph_mds_client *mdsc =
3376                 container_of(work, struct ceph_mds_client, delayed_work.work);
3377         int renew_interval;
3378         int renew_caps;
3379
3380         dout("mdsc delayed_work\n");
3381         ceph_check_delayed_caps(mdsc);
3382
3383         mutex_lock(&mdsc->mutex);
3384         renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
3385         renew_caps = time_after_eq(jiffies, HZ*renew_interval +
3386                                    mdsc->last_renew_caps);
3387         if (renew_caps)
3388                 mdsc->last_renew_caps = jiffies;
3389
3390         for (i = 0; i < mdsc->max_sessions; i++) {
3391                 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
3392                 if (s == NULL)
3393                         continue;
3394                 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
3395                         dout("resending session close request for mds%d\n",
3396                              s->s_mds);
3397                         request_close_session(mdsc, s);
3398                         ceph_put_mds_session(s);
3399                         continue;
3400                 }
3401                 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
3402                         if (s->s_state == CEPH_MDS_SESSION_OPEN) {
3403                                 s->s_state = CEPH_MDS_SESSION_HUNG;
3404                                 pr_info("mds%d hung\n", s->s_mds);
3405                         }
3406                 }
3407                 if (s->s_state < CEPH_MDS_SESSION_OPEN) {
3408                         /* this mds is failed or recovering, just wait */
3409                         ceph_put_mds_session(s);
3410                         continue;
3411                 }
3412                 mutex_unlock(&mdsc->mutex);
3413
3414                 mutex_lock(&s->s_mutex);
3415                 if (renew_caps)
3416                         send_renew_caps(mdsc, s);
3417                 else
3418                         ceph_con_keepalive(&s->s_con);
3419                 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
3420                     s->s_state == CEPH_MDS_SESSION_HUNG)
3421                         ceph_send_cap_releases(mdsc, s);
3422                 mutex_unlock(&s->s_mutex);
3423                 ceph_put_mds_session(s);
3424
3425                 mutex_lock(&mdsc->mutex);
3426         }
3427         mutex_unlock(&mdsc->mutex);
3428
3429         schedule_delayed(mdsc);
3430 }
3431
3432 int ceph_mdsc_init(struct ceph_fs_client *fsc)
3433
3434 {
3435         struct ceph_mds_client *mdsc;
3436
3437         mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
3438         if (!mdsc)
3439                 return -ENOMEM;
3440         mdsc->fsc = fsc;
3441         fsc->mdsc = mdsc;
3442         mutex_init(&mdsc->mutex);
3443         mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
3444         if (mdsc->mdsmap == NULL) {
3445                 kfree(mdsc);
3446                 return -ENOMEM;
3447         }
3448
3449         init_completion(&mdsc->safe_umount_waiters);
3450         init_waitqueue_head(&mdsc->session_close_wq);
3451         INIT_LIST_HEAD(&mdsc->waiting_for_map);
3452         mdsc->sessions = NULL;
3453         atomic_set(&mdsc->num_sessions, 0);
3454         mdsc->max_sessions = 0;
3455         mdsc->stopping = 0;
3456         mdsc->last_snap_seq = 0;
3457         init_rwsem(&mdsc->snap_rwsem);
3458         mdsc->snap_realms = RB_ROOT;
3459         INIT_LIST_HEAD(&mdsc->snap_empty);
3460         spin_lock_init(&mdsc->snap_empty_lock);
3461         mdsc->last_tid = 0;
3462         mdsc->oldest_tid = 0;
3463         mdsc->request_tree = RB_ROOT;
3464         INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
3465         mdsc->last_renew_caps = jiffies;
3466         INIT_LIST_HEAD(&mdsc->cap_delay_list);
3467         spin_lock_init(&mdsc->cap_delay_lock);
3468         INIT_LIST_HEAD(&mdsc->snap_flush_list);
3469         spin_lock_init(&mdsc->snap_flush_lock);
3470         mdsc->last_cap_flush_tid = 1;
3471         mdsc->cap_flush_tree = RB_ROOT;
3472         INIT_LIST_HEAD(&mdsc->cap_dirty);
3473         INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
3474         mdsc->num_cap_flushing = 0;
3475         spin_lock_init(&mdsc->cap_dirty_lock);
3476         init_waitqueue_head(&mdsc->cap_flushing_wq);
3477         spin_lock_init(&mdsc->dentry_lru_lock);
3478         INIT_LIST_HEAD(&mdsc->dentry_lru);
3479
3480         ceph_caps_init(mdsc);
3481         ceph_adjust_min_caps(mdsc, fsc->min_caps);
3482
3483         init_rwsem(&mdsc->pool_perm_rwsem);
3484         mdsc->pool_perm_tree = RB_ROOT;
3485
3486         return 0;
3487 }
3488
3489 /*
3490  * Wait for safe replies on open mds requests.  If we time out, drop
3491  * all requests from the tree to avoid dangling dentry refs.
3492  */
3493 static void wait_requests(struct ceph_mds_client *mdsc)
3494 {
3495         struct ceph_options *opts = mdsc->fsc->client->options;
3496         struct ceph_mds_request *req;
3497
3498         mutex_lock(&mdsc->mutex);
3499         if (__get_oldest_req(mdsc)) {
3500                 mutex_unlock(&mdsc->mutex);
3501
3502                 dout("wait_requests waiting for requests\n");
3503                 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
3504                                     ceph_timeout_jiffies(opts->mount_timeout));
3505
3506                 /* tear down remaining requests */
3507                 mutex_lock(&mdsc->mutex);
3508                 while ((req = __get_oldest_req(mdsc))) {
3509                         dout("wait_requests timed out on tid %llu\n",
3510                              req->r_tid);
3511                         __unregister_request(mdsc, req);
3512                 }
3513         }
3514         mutex_unlock(&mdsc->mutex);
3515         dout("wait_requests done\n");
3516 }
3517
3518 /*
3519  * called before mount is ro, and before dentries are torn down.
3520  * (hmm, does this still race with new lookups?)
3521  */
3522 void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
3523 {
3524         dout("pre_umount\n");
3525         mdsc->stopping = 1;
3526
3527         drop_leases(mdsc);
3528         ceph_flush_dirty_caps(mdsc);
3529         wait_requests(mdsc);
3530
3531         /*
3532          * wait for reply handlers to drop their request refs and
3533          * their inode/dcache refs
3534          */
3535         ceph_msgr_flush();
3536 }
3537
3538 /*
3539  * wait for all write mds requests to flush.
3540  */
3541 static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
3542 {
3543         struct ceph_mds_request *req = NULL, *nextreq;
3544         struct rb_node *n;
3545
3546         mutex_lock(&mdsc->mutex);
3547         dout("wait_unsafe_requests want %lld\n", want_tid);
3548 restart:
3549         req = __get_oldest_req(mdsc);
3550         while (req && req->r_tid <= want_tid) {
3551                 /* find next request */
3552                 n = rb_next(&req->r_node);
3553                 if (n)
3554                         nextreq = rb_entry(n, struct ceph_mds_request, r_node);
3555                 else
3556                         nextreq = NULL;
3557                 if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
3558                     (req->r_op & CEPH_MDS_OP_WRITE)) {
3559                         /* write op */
3560                         ceph_mdsc_get_request(req);
3561                         if (nextreq)
3562                                 ceph_mdsc_get_request(nextreq);
3563                         mutex_unlock(&mdsc->mutex);
3564                         dout("wait_unsafe_requests  wait on %llu (want %llu)\n",
3565                              req->r_tid, want_tid);
3566                         wait_for_completion(&req->r_safe_completion);
3567                         mutex_lock(&mdsc->mutex);
3568                         ceph_mdsc_put_request(req);
3569                         if (!nextreq)
3570                                 break;  /* next dne before, so we're done! */
3571                         if (RB_EMPTY_NODE(&nextreq->r_node)) {
3572                                 /* next request was removed from tree */
3573                                 ceph_mdsc_put_request(nextreq);
3574                                 goto restart;
3575                         }
3576                         ceph_mdsc_put_request(nextreq);  /* won't go away */
3577                 }
3578                 req = nextreq;
3579         }
3580         mutex_unlock(&mdsc->mutex);
3581         dout("wait_unsafe_requests done\n");
3582 }
3583
3584 void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3585 {
3586         u64 want_tid, want_flush, want_snap;
3587
3588         if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
3589                 return;
3590
3591         dout("sync\n");
3592         mutex_lock(&mdsc->mutex);
3593         want_tid = mdsc->last_tid;
3594         mutex_unlock(&mdsc->mutex);
3595
3596         ceph_flush_dirty_caps(mdsc);
3597         spin_lock(&mdsc->cap_dirty_lock);
3598         want_flush = mdsc->last_cap_flush_tid;
3599         spin_unlock(&mdsc->cap_dirty_lock);
3600
3601         down_read(&mdsc->snap_rwsem);
3602         want_snap = mdsc->last_snap_seq;
3603         up_read(&mdsc->snap_rwsem);
3604
3605         dout("sync want tid %lld flush_seq %lld snap_seq %lld\n",
3606              want_tid, want_flush, want_snap);
3607
3608         wait_unsafe_requests(mdsc, want_tid);
3609         wait_caps_flush(mdsc, want_flush, want_snap);
3610 }
3611
3612 /*
3613  * true if all sessions are closed, or we force unmount
3614  */
3615 static bool done_closing_sessions(struct ceph_mds_client *mdsc)
3616 {
3617         if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
3618                 return true;
3619         return atomic_read(&mdsc->num_sessions) == 0;
3620 }
3621
3622 /*
3623  * called after sb is ro.
3624  */
3625 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3626 {
3627         struct ceph_options *opts = mdsc->fsc->client->options;
3628         struct ceph_mds_session *session;
3629         int i;
3630
3631         dout("close_sessions\n");
3632
3633         /* close sessions */
3634         mutex_lock(&mdsc->mutex);
3635         for (i = 0; i < mdsc->max_sessions; i++) {
3636                 session = __ceph_lookup_mds_session(mdsc, i);
3637                 if (!session)
3638                         continue;
3639                 mutex_unlock(&mdsc->mutex);
3640                 mutex_lock(&session->s_mutex);
3641                 __close_session(mdsc, session);
3642                 mutex_unlock(&session->s_mutex);
3643                 ceph_put_mds_session(session);
3644                 mutex_lock(&mdsc->mutex);
3645         }
3646         mutex_unlock(&mdsc->mutex);
3647
3648         dout("waiting for sessions to close\n");
3649         wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
3650                            ceph_timeout_jiffies(opts->mount_timeout));
3651
3652         /* tear down remaining sessions */
3653         mutex_lock(&mdsc->mutex);
3654         for (i = 0; i < mdsc->max_sessions; i++) {
3655                 if (mdsc->sessions[i]) {
3656                         session = get_session(mdsc->sessions[i]);
3657                         __unregister_session(mdsc, session);
3658                         mutex_unlock(&mdsc->mutex);
3659                         mutex_lock(&session->s_mutex);
3660                         remove_session_caps(session);
3661                         mutex_unlock(&session->s_mutex);
3662                         ceph_put_mds_session(session);
3663                         mutex_lock(&mdsc->mutex);
3664                 }
3665         }
3666         WARN_ON(!list_empty(&mdsc->cap_delay_list));
3667         mutex_unlock(&mdsc->mutex);
3668
3669         ceph_cleanup_empty_realms(mdsc);
3670
3671         cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
3672
3673         dout("stopped\n");
3674 }
3675
3676 void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
3677 {
3678         struct ceph_mds_session *session;
3679         int mds;
3680
3681         dout("force umount\n");
3682
3683         mutex_lock(&mdsc->mutex);
3684         for (mds = 0; mds < mdsc->max_sessions; mds++) {
3685                 session = __ceph_lookup_mds_session(mdsc, mds);
3686                 if (!session)
3687                         continue;
3688                 mutex_unlock(&mdsc->mutex);
3689                 mutex_lock(&session->s_mutex);
3690                 __close_session(mdsc, session);
3691                 if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
3692                         cleanup_session_requests(mdsc, session);
3693                         remove_session_caps(session);
3694                 }
3695                 mutex_unlock(&session->s_mutex);
3696                 ceph_put_mds_session(session);
3697                 mutex_lock(&mdsc->mutex);
3698                 kick_requests(mdsc, mds);
3699         }
3700         __wake_requests(mdsc, &mdsc->waiting_for_map);
3701         mutex_unlock(&mdsc->mutex);
3702 }
3703
3704 static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
3705 {
3706         dout("stop\n");
3707         cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
3708         if (mdsc->mdsmap)
3709                 ceph_mdsmap_destroy(mdsc->mdsmap);
3710         kfree(mdsc->sessions);
3711         ceph_caps_finalize(mdsc);
3712         ceph_pool_perm_destroy(mdsc);
3713 }
3714
3715 void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
3716 {
3717         struct ceph_mds_client *mdsc = fsc->mdsc;
3718
3719         dout("mdsc_destroy %p\n", mdsc);
3720         ceph_mdsc_stop(mdsc);
3721
3722         /* flush out any connection work with references to us */
3723         ceph_msgr_flush();
3724
3725         fsc->mdsc = NULL;
3726         kfree(mdsc);
3727         dout("mdsc_destroy %p done\n", mdsc);
3728 }
3729
3730
3731 /*
3732  * handle mds map update.
3733  */
3734 void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3735 {
3736         u32 epoch;
3737         u32 maplen;
3738         void *p = msg->front.iov_base;
3739         void *end = p + msg->front.iov_len;
3740         struct ceph_mdsmap *newmap, *oldmap;
3741         struct ceph_fsid fsid;
3742         int err = -EINVAL;
3743
3744         ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
3745         ceph_decode_copy(&p, &fsid, sizeof(fsid));
3746         if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
3747                 return;
3748         epoch = ceph_decode_32(&p);
3749         maplen = ceph_decode_32(&p);
3750         dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
3751
3752         /* do we need it? */
3753         ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
3754         mutex_lock(&mdsc->mutex);
3755         if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
3756                 dout("handle_map epoch %u <= our %u\n",
3757                      epoch, mdsc->mdsmap->m_epoch);
3758                 mutex_unlock(&mdsc->mutex);
3759                 return;
3760         }
3761
3762         newmap = ceph_mdsmap_decode(&p, end);
3763         if (IS_ERR(newmap)) {
3764                 err = PTR_ERR(newmap);
3765                 goto bad_unlock;
3766         }
3767
3768         /* swap into place */
3769         if (mdsc->mdsmap) {
3770                 oldmap = mdsc->mdsmap;
3771                 mdsc->mdsmap = newmap;
3772                 check_new_map(mdsc, newmap, oldmap);
3773                 ceph_mdsmap_destroy(oldmap);
3774         } else {
3775                 mdsc->mdsmap = newmap;  /* first mds map */
3776         }
3777         mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
3778
3779         __wake_requests(mdsc, &mdsc->waiting_for_map);
3780
3781         mutex_unlock(&mdsc->mutex);
3782         schedule_delayed(mdsc);
3783         return;
3784
3785 bad_unlock:
3786         mutex_unlock(&mdsc->mutex);
3787 bad:
3788         pr_err("error decoding mdsmap %d\n", err);
3789         return;
3790 }
3791
3792 static struct ceph_connection *con_get(struct ceph_connection *con)
3793 {
3794         struct ceph_mds_session *s = con->private;
3795
3796         if (get_session(s)) {
3797                 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
3798                 return con;
3799         }
3800         dout("mdsc con_get %p FAIL\n", s);
3801         return NULL;
3802 }
3803
3804 static void con_put(struct ceph_connection *con)
3805 {
3806         struct ceph_mds_session *s = con->private;
3807
3808         dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1);
3809         ceph_put_mds_session(s);
3810 }
3811
3812 /*
3813  * if the client is unresponsive for long enough, the mds will kill
3814  * the session entirely.
3815  */
3816 static void peer_reset(struct ceph_connection *con)
3817 {
3818         struct ceph_mds_session *s = con->private;
3819         struct ceph_mds_client *mdsc = s->s_mdsc;
3820
3821         pr_warn("mds%d closed our session\n", s->s_mds);
3822         send_mds_reconnect(mdsc, s);
3823 }
3824
3825 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
3826 {
3827         struct ceph_mds_session *s = con->private;
3828         struct ceph_mds_client *mdsc = s->s_mdsc;
3829         int type = le16_to_cpu(msg->hdr.type);
3830
3831         mutex_lock(&mdsc->mutex);
3832         if (__verify_registered_session(mdsc, s) < 0) {
3833                 mutex_unlock(&mdsc->mutex);
3834                 goto out;
3835         }
3836         mutex_unlock(&mdsc->mutex);
3837
3838         switch (type) {
3839         case CEPH_MSG_MDS_MAP:
3840                 ceph_mdsc_handle_map(mdsc, msg);
3841                 break;
3842         case CEPH_MSG_CLIENT_SESSION:
3843                 handle_session(s, msg);
3844                 break;
3845         case CEPH_MSG_CLIENT_REPLY:
3846                 handle_reply(s, msg);
3847                 break;
3848         case CEPH_MSG_CLIENT_REQUEST_FORWARD:
3849                 handle_forward(mdsc, s, msg);
3850                 break;
3851         case CEPH_MSG_CLIENT_CAPS:
3852                 ceph_handle_caps(s, msg);
3853                 break;
3854         case CEPH_MSG_CLIENT_SNAP:
3855                 ceph_handle_snap(mdsc, s, msg);
3856                 break;
3857         case CEPH_MSG_CLIENT_LEASE:
3858                 handle_lease(mdsc, s, msg);
3859                 break;
3860
3861         default:
3862                 pr_err("received unknown message type %d %s\n", type,
3863                        ceph_msg_type_name(type));
3864         }
3865 out:
3866         ceph_msg_put(msg);
3867 }
3868
3869 /*
3870  * authentication
3871  */
3872
3873 /*
3874  * Note: returned pointer is the address of a structure that's
3875  * managed separately.  Caller must *not* attempt to free it.
3876  */
3877 static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
3878                                         int *proto, int force_new)
3879 {
3880         struct ceph_mds_session *s = con->private;
3881         struct ceph_mds_client *mdsc = s->s_mdsc;
3882         struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3883         struct ceph_auth_handshake *auth = &s->s_auth;
3884
3885         if (force_new && auth->authorizer) {
3886                 ceph_auth_destroy_authorizer(ac, auth->authorizer);
3887                 auth->authorizer = NULL;
3888         }
3889         if (!auth->authorizer) {
3890                 int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
3891                                                       auth);
3892                 if (ret)
3893                         return ERR_PTR(ret);
3894         } else {
3895                 int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
3896                                                       auth);
3897                 if (ret)
3898                         return ERR_PTR(ret);
3899         }
3900         *proto = ac->protocol;
3901
3902         return auth;
3903 }
3904
3905
3906 static int verify_authorizer_reply(struct ceph_connection *con, int len)
3907 {
3908         struct ceph_mds_session *s = con->private;
3909         struct ceph_mds_client *mdsc = s->s_mdsc;
3910         struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3911
3912         return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len);
3913 }
3914
3915 static int invalidate_authorizer(struct ceph_connection *con)
3916 {
3917         struct ceph_mds_session *s = con->private;
3918         struct ceph_mds_client *mdsc = s->s_mdsc;
3919         struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3920
3921         ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3922
3923         return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
3924 }
3925
3926 static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
3927                                 struct ceph_msg_header *hdr, int *skip)
3928 {
3929         struct ceph_msg *msg;
3930         int type = (int) le16_to_cpu(hdr->type);
3931         int front_len = (int) le32_to_cpu(hdr->front_len);
3932
3933         if (con->in_msg)
3934                 return con->in_msg;
3935
3936         *skip = 0;
3937         msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
3938         if (!msg) {
3939                 pr_err("unable to allocate msg type %d len %d\n",
3940                        type, front_len);
3941                 return NULL;
3942         }
3943
3944         return msg;
3945 }
3946
3947 static int mds_sign_message(struct ceph_msg *msg)
3948 {
3949        struct ceph_mds_session *s = msg->con->private;
3950        struct ceph_auth_handshake *auth = &s->s_auth;
3951
3952        return ceph_auth_sign_message(auth, msg);
3953 }
3954
3955 static int mds_check_message_signature(struct ceph_msg *msg)
3956 {
3957        struct ceph_mds_session *s = msg->con->private;
3958        struct ceph_auth_handshake *auth = &s->s_auth;
3959
3960        return ceph_auth_check_message_signature(auth, msg);
3961 }
3962
3963 static const struct ceph_connection_operations mds_con_ops = {
3964         .get = con_get,
3965         .put = con_put,
3966         .dispatch = dispatch,
3967         .get_authorizer = get_authorizer,
3968         .verify_authorizer_reply = verify_authorizer_reply,
3969         .invalidate_authorizer = invalidate_authorizer,
3970         .peer_reset = peer_reset,
3971         .alloc_msg = mds_alloc_msg,
3972         .sign_message = mds_sign_message,
3973         .check_message_signature = mds_check_message_signature,
3974 };
3975
3976 /* eof */