4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate a per-open ll_file_data from the ll_file_data_slab cache
 * (allocation allows I/O via __GFP_IO) and clear its write-failure flag.
 * NOTE(review): this extraction is elided (fused line numbers skip 58-59);
 * the allocation-failure check and the return of @fd are not visible here.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
60 fd->fd_write_failed = false;
/* Return a ll_file_data to the ll_file_data_slab cache. */
64 static void ll_file_data_put(struct ll_file_data *fd)
67 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the current VFS inode attributes (mode, a/m/ctime, size, blocks,
 * flags), the I/O epoch, the MDS capability and the open handle @fh into
 * @op_data for a request to the MDS.  If LLIF_DATA_MODIFIED is set on the
 * inode, also set the MDS_DATA_MODIFIED bias so the MDS learns the data
 * changed.
 */
70 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
71 struct lustre_handle *fh)
73 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
74 op_data->op_attr.ia_mode = inode->i_mode;
75 op_data->op_attr.ia_atime = inode->i_atime;
76 op_data->op_attr.ia_mtime = inode->i_mtime;
77 op_data->op_attr.ia_ctime = inode->i_ctime;
78 op_data->op_attr.ia_size = i_size_read(inode);
79 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr; convert inode flags
 * to the on-wire ext-style flag representation. */
80 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
81 ll_inode_to_ext_flags(inode->i_flags);
82 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
84 op_data->op_handle = *fh;
85 op_data->op_capa1 = ll_mdscapa_get(inode);
87 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
88 op_data->op_bias |= MDS_DATA_MODIFIED;
92 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Prepare @op_data for an MDS close of open handle @och on @inode:
 * mark the timestamp attributes valid, add size/blocks when this was a
 * write open without Size-on-MDS support (or a non-regular file), close
 * the I/O epoch, and pack the inode attributes and the open handle.
 * NOTE(review): the extraction is elided here — the early-return for
 * non-write opens and the enclosing braces are not visible in this view.
 */
95 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
96 struct obd_client_handle *och)
100 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
101 ATTR_MTIME | ATTR_MTIME_SET |
102 ATTR_CTIME | ATTR_CTIME_SET;
104 if (!(och->och_flags & FMODE_WRITE))
107 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
108 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
110 ll_ioepoch_close(inode, op_data, &och, 0);
113 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
114 ll_prep_md_op_data(op_data, inode, NULL, NULL,
115 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS_CLOSE for open handle @och.  Prepares the close op_data,
 * issues md_close(), performs the Size-on-MDS update when the MDS asks
 * for it, clears LLIF_DATA_MODIFIED on success, destroys orphan OST
 * objects from the close reply, and either queues a DONE_WRITING (SOM,
 * epoch still open on a regular write-opened file) or invalidates the
 * handle cookie.
 * NOTE(review): the extraction is elided (fused numbering skips many
 * lines) — the declarations of rc/epoch_close/inode, several if/else
 * guards and the final RETURN are not visible in this view.
 */
119 static int ll_close_inode_openhandle(struct obd_export *md_exp,
121 struct obd_client_handle *och)
123 struct obd_export *exp = ll_i2mdexp(inode);
124 struct md_op_data *op_data;
125 struct ptlrpc_request *req = NULL;
126 struct obd_device *obd = class_exp2obd(exp);
133 * XXX: in case of LMV, is this correct to access
136 CERROR("Invalid MDC connection handle "LPX64"\n",
137 ll_i2mdexp(inode)->exp_handle.h_cookie);
141 OBD_ALLOC_PTR(op_data);
143 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
145 ll_prepare_close(inode, op_data, och);
146 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
147 rc = md_close(md_exp, op_data, och->och_mod, &req);
149 /* This close must have the epoch closed. */
150 LASSERT(epoch_close);
151 /* MDS has instructed us to obtain Size-on-MDS attribute from
152 * OSTs and send setattr to back to MDS. */
153 rc = ll_som_update(inode, op_data);
155 CERROR("inode %lu mdc Size-on-MDS update failed: "
156 "rc = %d\n", inode->i_ino, rc);
160 CERROR("inode %lu mdc close failed: rc = %d\n",
164 /* DATA_MODIFIED flag was successfully sent on close, cancel data
165 * modification flag. */
166 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
167 struct ll_inode_info *lli = ll_i2info(inode);
169 spin_lock(&lli->lli_lock);
170 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
171 spin_unlock(&lli->lli_lock);
174 ll_finish_md_op_data(op_data);
177 rc = ll_objects_destroy(req, inode);
179 CERROR("inode %lu ll_objects destroy: rc = %d\n",
186 if (exp_connect_som(exp) && !epoch_close &&
187 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
188 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
190 md_clear_open_replay_data(md_exp, och);
191 /* Free @och if it is not waiting for DONE_WRITING. */
192 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
195 if (req) /* This is close request */
196 ptlrpc_req_finished(req);
/*
 * Really close the MDS open handle of @inode that matches open mode
 * @flags (write / exec / read).  If other users still hold the handle
 * (och_usecount non-zero) the close is skipped; otherwise the handle is
 * taken under lli_och_mutex and closed via ll_close_inode_openhandle().
 * NOTE(review): the extraction is elided — the taking/clearing of *och_p
 * under the mutex and the final RETURN(rc) are not visible in this view.
 */
200 int ll_md_real_close(struct inode *inode, int flags)
202 struct ll_inode_info *lli = ll_i2info(inode);
203 struct obd_client_handle **och_p;
204 struct obd_client_handle *och;
/* Pick the handle slot and use count matching the open mode. */
209 if (flags & FMODE_WRITE) {
210 och_p = &lli->lli_mds_write_och;
211 och_usecount = &lli->lli_open_fd_write_count;
212 } else if (flags & FMODE_EXEC) {
213 och_p = &lli->lli_mds_exec_och;
214 och_usecount = &lli->lli_open_fd_exec_count;
216 LASSERT(flags & FMODE_READ);
217 och_p = &lli->lli_mds_read_och;
218 och_usecount = &lli->lli_open_fd_read_count;
221 mutex_lock(&lli->lli_och_mutex);
222 if (*och_usecount) { /* There are still users of this handle, so
224 mutex_unlock(&lli->lli_och_mutex);
229 mutex_unlock(&lli->lli_och_mutex);
231 if (och) { /* There might be a race and somebody have freed this och
233 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close path: drop a group lock if held, decrement
 * the open count for this fd's open mode, and if no cached OPEN DLM lock
 * matches (md_lock_match), do the real MDS close via ll_md_real_close().
 * Finally detach and free the ll_file_data and close the capability.
 * NOTE(review): elided extraction — the lockmode variable, several
 * closing braces and the final RETURN are not visible in this view.
 */
240 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
243 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
244 struct ll_inode_info *lli = ll_i2info(inode);
248 /* clear group lock, if present */
249 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
250 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
252 /* Let's see if we have good enough OPEN lock on the file and if
253 we can skip talking to MDS */
254 if (file->f_dentry->d_inode) { /* Can this ever be false? */
256 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
257 struct lustre_handle lockh;
258 struct inode *inode = file->f_dentry->d_inode;
259 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
261 mutex_lock(&lli->lli_och_mutex);
262 if (fd->fd_omode & FMODE_WRITE) {
264 LASSERT(lli->lli_open_fd_write_count);
265 lli->lli_open_fd_write_count--;
266 } else if (fd->fd_omode & FMODE_EXEC) {
268 LASSERT(lli->lli_open_fd_exec_count);
269 lli->lli_open_fd_exec_count--;
272 LASSERT(lli->lli_open_fd_read_count);
273 lli->lli_open_fd_read_count--;
275 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN lock -> must talk to the MDS to close. */
277 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
278 LDLM_IBITS, &policy, lockmode,
280 rc = ll_md_real_close(file->f_dentry->d_inode,
284 CERROR("Releasing a file %p with negative dentry %p. Name %s",
285 file, file->f_dentry, file->f_dentry->d_name.name);
288 LUSTRE_FPRIVATE(file) = NULL;
289 ll_file_data_put(fd);
290 ll_capa_close(inode);
295 /* While this returns an error code, fput() the caller does not, so we need
296 * to make every effort to clean up all of our state here. Also, applications
297 * rarely check close errors and even if an error is returned they will not
298 * re-try the close call.
/*
 * VFS ->release() for Lustre files.  Cleans up remote-ACL state for the
 * root inode, stops the statahead thread when this fd is the opendir
 * owner, clears async write errors on regular files, and closes the MDS
 * open handle through ll_md_close().  The root dentry short-circuits
 * with just the fd detach.
 * NOTE(review): elided extraction — the RETURN paths and several braces
 * are not visible in this view.
 */
300 int ll_file_release(struct inode *inode, struct file *file)
302 struct ll_file_data *fd;
303 struct ll_sb_info *sbi = ll_i2sbi(inode);
304 struct ll_inode_info *lli = ll_i2info(inode);
308 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
309 inode->i_generation, inode);
311 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping applies only to the root inode. */
312 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
313 inode == inode->i_sb->s_root->d_inode) {
314 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
317 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
318 fd->fd_flags &= ~LL_FILE_RMTACL;
319 rct_del(&sbi->ll_rct, current_pid());
320 et_search_free(&sbi->ll_et, current_pid());
325 if (inode->i_sb->s_root != file->f_dentry)
326 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
327 fd = LUSTRE_FPRIVATE(file);
330 /* The last ref on @file, maybe not the the owner pid of statahead.
331 * Different processes can open the same dir, "ll_opendir_key" means:
332 * it is me that should stop the statahead thread. */
333 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
334 lli->lli_opendir_pid != 0)
335 ll_stop_statahead(inode, lli->lli_opendir_key);
337 if (inode->i_sb->s_root == file->f_dentry) {
338 LUSTRE_FPRIVATE(file) = NULL;
339 ll_file_data_put(fd);
343 if (!S_ISDIR(inode->i_mode)) {
344 lov_read_and_clear_async_rc(lli->lli_clob);
345 lli->lli_async_rc = 0;
348 rc = ll_md_close(sbi->ll_md_exp, inode, file);
350 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
351 libcfs_debug_dumplog();
/*
 * Perform an intent-based open against the MDS for @file.  When @lmm is
 * NULL and @lmmsize is 0 this is a real open and requests an OPEN lock
 * (MDS_OPEN_LOCK); otherwise it is only setting stripe info.  On success
 * the reply updates the inode (ll_prep_inode) and lock data.  The -ESTALE
 * path releases the open handle quietly instead of flooding the log.
 * NOTE(review): elided extraction — rc declaration, IS_ERR(op_data)
 * check, several GOTO targets and the final RETURN are not visible.
 */
356 static int ll_intent_file_open(struct file *file, void *lmm,
357 int lmmsize, struct lookup_intent *itp)
359 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
360 struct dentry *parent = file->f_dentry->d_parent;
361 const char *name = file->f_dentry->d_name.name;
362 const int len = file->f_dentry->d_name.len;
363 struct md_op_data *op_data;
364 struct ptlrpc_request *req;
365 __u32 opc = LUSTRE_OPC_ANY;
372 /* Usually we come here only for NFSD, and we want open lock.
373 But we can also get here with pre 2.6.15 patchless kernels, and in
374 that case that lock is also ok */
375 /* We can also get here if there was cached open handle in revalidate_it
376 * but it disappeared while we were getting from there to ll_file_open.
377 * But this means this file was closed and immediatelly opened which
378 * makes a good candidate for using OPEN lock */
379 /* If lmmsize & lmm are not 0, we are just setting stripe info
380 * parameters. No need for the open lock */
381 if (lmm == NULL && lmmsize == 0) {
382 itp->it_flags |= MDS_OPEN_LOCK;
383 if (itp->it_flags & FMODE_WRITE)
384 opc = LUSTRE_OPC_CREATE;
387 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
388 file->f_dentry->d_inode, name, len,
391 RETURN(PTR_ERR(op_data));
/* We already know the FID, so open by FID rather than by name. */
393 itp->it_flags |= MDS_OPEN_BY_FID;
394 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
395 0 /*unused */, &req, ll_md_blocking_ast, 0);
396 ll_finish_md_op_data(op_data);
398 /* reason for keep own exit path - don`t flood log
399 * with messages with -ESTALE errors.
401 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
402 it_open_error(DISP_OPEN_OPEN, itp))
404 ll_release_openhandle(file->f_dentry, itp);
408 if (it_disposition(itp, DISP_LOOKUP_NEG))
409 GOTO(out, rc = -ENOENT);
411 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
412 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
413 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
417 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
418 if (!rc && itp->d.lustre.it_lock_mode)
419 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
/* Release the intent's request reference and any lock it pinned. */
423 ptlrpc_req_finished(itp->d.lustre.it_data);
424 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
425 ll_intent_drop_lock(itp);
431 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
432 * not believe attributes if a few ioepoch holders exist. Attributes for
433 * previous ioepoch if new one is opened are also skipped by MDS.
435 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that differs from the current one. */
437 if (ioepoch && lli->lli_ioepoch != ioepoch) {
438 lli->lli_ioepoch = ioepoch;
439 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
440 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill an obd_client_handle from the MDS open reply carried by intent
 * @it: copy the server file handle, set the magic, FID and open flags,
 * record the new I/O epoch, and register the open for replay.
 * Returns the result of md_set_open_replay_data().
 */
444 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
445 struct lookup_intent *it, struct obd_client_handle *och)
447 struct ptlrpc_request *req = it->d.lustre.it_data;
448 struct mdt_body *body;
452 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
453 LASSERT(body != NULL); /* reply already checked out */
455 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
456 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
457 och->och_fid = lli->lli_fid;
458 och->och_flags = it->it_flags;
459 ll_ioepoch_open(lli, body->ioepoch);
461 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish the client-local part of an open: when @och is supplied, fill
 * it from the intent reply (ll_och_fill) and log the epoch for write
 * opens with a server-provided size; then attach @fd as the file's
 * private data, initialize readahead state and record the open mode.
 * NOTE(review): elided extraction — the `if (och)` guard, error
 * handling after ll_och_fill and the final RETURN are not visible.
 */
464 int ll_local_open(struct file *file, struct lookup_intent *it,
465 struct ll_file_data *fd, struct obd_client_handle *och)
467 struct inode *inode = file->f_dentry->d_inode;
468 struct ll_inode_info *lli = ll_i2info(inode);
471 LASSERT(!LUSTRE_FPRIVATE(file));
476 struct ptlrpc_request *req = it->d.lustre.it_data;
477 struct mdt_body *body;
480 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
484 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
485 if ((it->it_flags & FMODE_WRITE) &&
486 (body->valid & OBD_MD_FLSIZE))
487 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
488 lli->lli_ioepoch, PFID(&lli->lli_fid));
491 LUSTRE_FPRIVATE(file) = fd;
492 ll_readahead_init(inode, &fd->fd_ras);
493 fd->fd_omode = it->it_flags;
497 /* Open a file, and (for the very first open) create objects on the OSTs at
498 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
499 * creation or open until ll_lov_setstripe() ioctl is called.
501 * If we already have the stripe MD locally then we don't request it in
502 * md_open(), by passing a lmm_size = 0.
504 * It is up to the application to ensure no other processes open this file
505 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
506 * used. We might be able to avoid races of that sort by getting lli_open_sem
507 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
508 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() for Lustre files.  Reuses a cached MDS open handle when
 * one exists for the same open mode, otherwise performs an intent open
 * (ll_intent_file_open) and records the new handle; registers this fd
 * as the statahead owner for directories; handles delayed OST object
 * creation for O_LOV_DELAY_CREATE.
 * NOTE(review): heavily elided extraction — many guards, labels
 * (out_och_free / out_openerr), increments of the handle use counts and
 * the RETURN paths are not visible in this view.
 */
510 int ll_file_open(struct inode *inode, struct file *file)
512 struct ll_inode_info *lli = ll_i2info(inode);
513 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
514 .it_flags = file->f_flags };
515 struct obd_client_handle **och_p = NULL;
516 __u64 *och_usecount = NULL;
517 struct ll_file_data *fd;
518 int rc = 0, opendir_set = 0;
521 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
522 inode->i_generation, inode, file->f_flags);
524 it = file->private_data; /* XXX: compat macro */
525 file->private_data = NULL; /* prevent ll_local_open assertion */
527 fd = ll_file_data_get();
529 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory becomes the statahead owner. */
532 if (S_ISDIR(inode->i_mode)) {
533 spin_lock(&lli->lli_sa_lock);
534 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
535 lli->lli_opendir_pid == 0) {
536 lli->lli_opendir_key = fd;
537 lli->lli_opendir_pid = current_pid();
540 spin_unlock(&lli->lli_sa_lock);
543 if (inode->i_sb->s_root == file->f_dentry) {
544 LUSTRE_FPRIVATE(file) = fd;
/* No intent from lookup: build one from f_flags. */
548 if (!it || !it->d.lustre.it_disposition) {
549 /* Convert f_flags into access mode. We cannot use file->f_mode,
550 * because everything but O_ACCMODE mask was stripped from
552 if ((oit.it_flags + 1) & O_ACCMODE)
554 if (file->f_flags & O_TRUNC)
555 oit.it_flags |= FMODE_WRITE;
557 /* kernel only call f_op->open in dentry_open. filp_open calls
558 * dentry_open after call to open_namei that checks permissions.
559 * Only nfsd_open call dentry_open directly without checking
560 * permissions and because of that this code below is safe. */
561 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
562 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
564 /* We do not want O_EXCL here, presumably we opened the file
565 * already? XXX - NFS implications? */
566 oit.it_flags &= ~O_EXCL;
568 /* bug20584, if "it_flags" contains O_CREAT, the file will be
569 * created if necessary, then "IT_CREAT" should be set to keep
570 * consistent with it */
571 if (oit.it_flags & O_CREAT)
572 oit.it_op |= IT_CREAT;
578 /* Let's see if we have file open on MDS already. */
579 if (it->it_flags & FMODE_WRITE) {
580 och_p = &lli->lli_mds_write_och;
581 och_usecount = &lli->lli_open_fd_write_count;
582 } else if (it->it_flags & FMODE_EXEC) {
583 och_p = &lli->lli_mds_exec_och;
584 och_usecount = &lli->lli_open_fd_exec_count;
586 och_p = &lli->lli_mds_read_och;
587 och_usecount = &lli->lli_open_fd_read_count;
590 mutex_lock(&lli->lli_och_mutex);
591 if (*och_p) { /* Open handle is present */
592 if (it_disposition(it, DISP_OPEN_OPEN)) {
593 /* Well, there's extra open request that we do not need,
594 let's close it somehow. This will decref request. */
595 rc = it_open_error(DISP_OPEN_OPEN, it);
597 mutex_unlock(&lli->lli_och_mutex);
598 GOTO(out_openerr, rc);
601 ll_release_openhandle(file->f_dentry, it);
605 rc = ll_local_open(file, it, fd, NULL);
608 mutex_unlock(&lli->lli_och_mutex);
609 GOTO(out_openerr, rc);
612 LASSERT(*och_usecount == 0);
613 if (!it->d.lustre.it_disposition) {
614 /* We cannot just request lock handle now, new ELC code
615 means that one of other OPEN locks for this file
616 could be cancelled, and since blocking ast handler
617 would attempt to grab och_mutex as well, that would
618 result in a deadlock */
619 mutex_unlock(&lli->lli_och_mutex);
620 it->it_create_mode |= M_CHECK_STALE;
621 rc = ll_intent_file_open(file, NULL, 0, it);
622 it->it_create_mode &= ~M_CHECK_STALE;
624 GOTO(out_openerr, rc);
628 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
630 GOTO(out_och_free, rc = -ENOMEM);
634 /* md_intent_lock() didn't get a request ref if there was an
635 * open error, so don't do cleanup on the request here
637 /* XXX (green): Should not we bail out on any error here, not
638 * just open error? */
639 rc = it_open_error(DISP_OPEN_OPEN, it);
641 GOTO(out_och_free, rc);
643 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
645 rc = ll_local_open(file, it, fd, *och_p);
647 GOTO(out_och_free, rc);
649 mutex_unlock(&lli->lli_och_mutex);
652 /* Must do this outside lli_och_mutex lock to prevent deadlock where
653 different kind of OPEN lock for this same inode gets cancelled
654 by ldlm_cancel_lru */
655 if (!S_ISREG(inode->i_mode))
656 GOTO(out_och_free, rc);
660 if (!lli->lli_has_smd) {
661 if (file->f_flags & O_LOV_DELAY_CREATE ||
662 !(file->f_mode & FMODE_WRITE)) {
663 CDEBUG(D_INODE, "object creation was delayed\n");
664 GOTO(out_och_free, rc);
667 file->f_flags &= ~O_LOV_DELAY_CREATE;
668 GOTO(out_och_free, rc);
/* Error cleanup: free the handle slot, stop statahead if we own it,
 * drop the fd and any intent open request reference. */
672 if (och_p && *och_p) {
673 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
674 *och_p = NULL; /* OBD_FREE writes some magic there */
677 mutex_unlock(&lli->lli_och_mutex);
680 if (opendir_set != 0)
681 ll_stop_statahead(inode, lli->lli_opendir_key);
683 ll_file_data_put(fd);
685 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
688 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
689 ptlrpc_req_finished(it->d.lustre.it_data);
690 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
696 /* Fills the obdo with the attributes for the lsm */
/*
 * Perform an async OST getattr for stripe metadata @lsm and wait for
 * completion, requesting id/type/size/blocks/times/epoch/dataversion.
 * @sync requests a server-side lock (OBD_FL_SRVLOCK).  On success the
 * o_valid mask is reduced to the attribute bits callers consume.
 * NOTE(review): elided extraction — rc declaration, oinfo.oi_oa/oi_md
 * setup, the sync guard and the final RETURN are not visible here.
 */
697 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
698 struct obd_capa *capa, struct obdo *obdo,
699 __u64 ioepoch, int sync)
701 struct ptlrpc_request_set *set;
702 struct obd_info oinfo = { { { 0 } } };
707 LASSERT(lsm != NULL);
711 oinfo.oi_oa->o_oi = lsm->lsm_oi;
712 oinfo.oi_oa->o_mode = S_IFREG;
713 oinfo.oi_oa->o_ioepoch = ioepoch;
714 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
715 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
716 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
717 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
718 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
719 OBD_MD_FLDATAVERSION;
720 oinfo.oi_capa = capa;
722 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
723 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
726 set = ptlrpc_prep_set();
728 CERROR("can't allocate ptlrpc set\n");
731 rc = obd_getattr_async(exp, &oinfo, set);
733 rc = ptlrpc_set_wait(set);
734 ptlrpc_set_destroy(set);
/* Keep only the attribute bits callers of this helper rely on. */
737 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
738 OBD_MD_FLATIME | OBD_MD_FLMTIME |
739 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
740 OBD_MD_FLDATAVERSION);
745 * Performs the getattr on the inode and updates its fields.
746 * If @sync != 0, perform the getattr under the server-side lock.
748 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
749 __u64 ioepoch, int sync)
751 struct obd_capa *capa = ll_mdscapa_get(inode);
752 struct lov_stripe_md *lsm;
756 lsm = ccc_inode_lsm_get(inode);
757 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
758 capa, obdo, ioepoch, sync);
/* On success, refresh the VFS inode from the returned obdo and log
 * the new size/blocks for debugging. */
761 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
763 obdo_refresh_inode(inode, obdo, obdo->o_valid);
764 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
765 " blksize %lu\n", POSTID(oi), i_size_read(inode),
766 (unsigned long long)inode->i_blocks,
767 (unsigned long)ll_inode_blksize(inode));
769 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (lli_lvb) with attributes obtained from
 * the OSTs via the cl_object layer, taking the most recent of each
 * timestamp, and update the inode's size and blocks under the inode
 * size lock.
 * NOTE(review): elided extraction — the lvb declaration, the rc == 0
 * guard around the merge and the final RETURN are not visible here.
 */
773 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
775 struct ll_inode_info *lli = ll_i2info(inode);
776 struct cl_object *obj = lli->lli_clob;
777 struct cl_attr *attr = ccc_env_thread_attr(env);
783 ll_inode_size_lock(inode);
784 /* merge timestamps the most recently obtained from mds with
785 timestamps obtained from osts */
786 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
787 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
788 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
789 inode_init_lvb(inode, &lvb);
791 cl_object_attr_lock(obj);
792 rc = cl_object_attr_get(env, obj, attr);
793 cl_object_attr_unlock(obj);
/* Take the newer of the MDS and OST timestamps. */
796 if (lvb.lvb_atime < attr->cat_atime)
797 lvb.lvb_atime = attr->cat_atime;
798 if (lvb.lvb_ctime < attr->cat_ctime)
799 lvb.lvb_ctime = attr->cat_ctime;
800 if (lvb.lvb_mtime < attr->cat_mtime)
801 lvb.lvb_mtime = attr->cat_mtime;
803 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
804 PFID(&lli->lli_fid), attr->cat_size);
805 cl_isize_write_nolock(inode, attr->cat_size);
807 inode->i_blocks = attr->cat_blocks;
809 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
810 LTIME_S(inode->i_atime) = lvb.lvb_atime;
811 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
813 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctls: fetch OST attributes for @lsm into a local
 * obdo and copy size/blocks/times into the caller's stat structure.
 * NOTE(review): elided extraction — the st parameter declaration, the
 * rc == 0 guard and the final RETURN are not visible in this view.
 */
818 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
821 struct obdo obdo = { 0 };
824 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
826 st->st_size = obdo.o_size;
827 st->st_blocks = obdo.o_blocks;
828 st->st_mtime = obdo.o_mtime;
829 st->st_atime = obdo.o_atime;
830 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io from the file's open flags: non-blocking mode,
 * append/sync semantics for writes, the target cl_object, and the lock
 * request policy (never for nolock files, mandatory for O_APPEND,
 * "maybe" otherwise).
 */
835 void ll_io_init(struct cl_io *io, const struct file *file, int write)
837 struct inode *inode = file->f_dentry->d_inode;
839 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
841 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
842 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
843 file->f_flags & O_DIRECT ||
846 io->ci_obj = ll_i2info(inode)->lli_clob;
847 io->ci_lockreq = CILR_MAYBE;
848 if (ll_file_nolock(file)) {
849 io->ci_lockreq = CILR_NEVER;
850 io->ci_no_srvlock = 1;
851 } else if (file->f_flags & O_APPEND) {
852 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common driver for all read/write paths (normal, sendfile, splice).
 * Sets up a cl_io for @iot at *@ppos/@count, dispatches on the IO
 * subtype to fill the vvp/ccc IO state, runs cl_io_loop(), advances
 * *ppos, restarts on short-lock conditions when nothing was transferred,
 * and tallies read/write byte stats.  Write failures (other than
 * -ERESTARTSYS) set fd->fd_write_failed.
 * NOTE(review): elided extraction — the return type, io/result
 * declarations, the restart loop structure and several braces are not
 * visible in this view.
 */
857 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
858 struct file *file, enum cl_io_type iot,
859 loff_t *ppos, size_t count)
861 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
862 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
868 io = ccc_env_thread_io(env);
869 ll_io_init(io, file, iot == CIT_WRITE);
871 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
872 struct vvp_io *vio = vvp_env_io(env);
873 struct ccc_io *cio = ccc_env_io(env);
874 int write_mutex_locked = 0;
876 cio->cui_fd = LUSTRE_FPRIVATE(file);
877 vio->cui_io_subtype = args->via_io_subtype;
879 switch (vio->cui_io_subtype) {
881 cio->cui_iov = args->u.normal.via_iov;
882 cio->cui_nrsegs = args->u.normal.via_nrsegs;
883 cio->cui_tot_nrsegs = cio->cui_nrsegs;
884 cio->cui_iocb = args->u.normal.via_iocb;
/* Serialize non-group-locked writes via lli_write_mutex; reads
 * only take lli_trunc_sem shared. */
885 if ((iot == CIT_WRITE) &&
886 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
887 if (mutex_lock_interruptible(&lli->
889 GOTO(out, result = -ERESTARTSYS);
890 write_mutex_locked = 1;
891 } else if (iot == CIT_READ) {
892 down_read(&lli->lli_trunc_sem);
896 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
897 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
900 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
901 vio->u.splice.cui_flags = args->u.splice.via_flags;
904 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
907 result = cl_io_loop(env, io);
908 if (write_mutex_locked)
909 mutex_unlock(&lli->lli_write_mutex);
910 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
911 up_read(&lli->lli_trunc_sem);
913 /* cl_io_rw_init() handled IO */
914 result = io->ci_result;
917 if (io->ci_nob > 0) {
919 *ppos = io->u.ci_wr.wr.crw_pos;
924 /* If any bit been read/written (result != 0), we just return
925 * short read/write instead of restart io. */
926 if (result == 0 && io->ci_need_restart) {
927 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
928 iot == CIT_READ ? "read" : "write",
929 file->f_dentry->d_name.name, *ppos, count);
930 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
934 if (iot == CIT_READ) {
936 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
937 LPROC_LL_READ_BYTES, result);
938 } else if (iot == CIT_WRITE) {
940 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
941 LPROC_LL_WRITE_BYTES, result);
942 fd->fd_write_failed = false;
943 } else if (result != -ERESTARTSYS) {
944 fd->fd_write_failed = true;
953 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array: reject negative lengths and cumulative
 * overflow, verify user-space readability via access_ok(), and compute
 * the total byte count into *count, truncating *nr_segs at the first
 * bad segment.
 * NOTE(review): elided extraction — cnt declaration, the EINVAL/EFAULT
 * returns and *count/*nr_segs assignments are not visible in this view.
 */
955 static int ll_file_get_iov_count(const struct iovec *iov,
956 unsigned long *nr_segs, size_t *count)
961 for (seg = 0; seg < *nr_segs; seg++) {
962 const struct iovec *iv = &iov[seg];
965 * If any segment has a negative length, or the cumulative
966 * length ever wraps negative then return -EINVAL.
969 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
971 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
976 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio ->read path: validate the iovec, acquire a cl_env, fill the
 * normal-IO vvp args and delegate to ll_file_io_generic() with
 * CIT_READ at iocb->ki_pos.
 * NOTE(review): elided extraction — the env/count/result declarations,
 * error return after iov validation and the final RETURN are not
 * visible in this view.
 */
983 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
984 unsigned long nr_segs, loff_t pos)
987 struct vvp_io_args *args;
993 result = ll_file_get_iov_count(iov, &nr_segs, &count);
997 env = cl_env_get(&refcheck);
999 RETURN(PTR_ERR(env));
1001 args = vvp_env_args(env, IO_NORMAL);
1002 args->u.normal.via_iov = (struct iovec *)iov;
1003 args->u.normal.via_nrsegs = nr_segs;
1004 args->u.normal.via_iocb = iocb;
1006 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1007 &iocb->ki_pos, count);
1008 cl_env_put(env, &refcheck);
/*
 * Synchronous ->read: build a single-segment iovec and a sync kiocb in
 * per-env scratch space, forward to ll_file_aio_read(), and propagate
 * the updated position back to *ppos.
 */
1012 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1016 struct iovec *local_iov;
1017 struct kiocb *kiocb;
1022 env = cl_env_get(&refcheck);
1024 RETURN(PTR_ERR(env));
1026 local_iov = &vvp_env_info(env)->vti_local_iov;
1027 kiocb = &vvp_env_info(env)->vti_kiocb;
1028 local_iov->iov_base = (void __user *)buf;
1029 local_iov->iov_len = count;
1030 init_sync_kiocb(kiocb, file);
1031 kiocb->ki_pos = *ppos;
1032 kiocb->ki_left = count;
1034 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1035 *ppos = kiocb->ki_pos;
1037 cl_env_put(env, &refcheck);
1042 * Write to a file (through the page cache).
/*
 * aio ->write path: mirror of ll_file_aio_read() — validate the iovec,
 * acquire a cl_env, fill the normal-IO args and delegate to
 * ll_file_io_generic() with CIT_WRITE at iocb->ki_pos.
 */
1044 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1045 unsigned long nr_segs, loff_t pos)
1048 struct vvp_io_args *args;
1054 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1058 env = cl_env_get(&refcheck);
1060 RETURN(PTR_ERR(env));
1062 args = vvp_env_args(env, IO_NORMAL);
1063 args->u.normal.via_iov = (struct iovec *)iov;
1064 args->u.normal.via_nrsegs = nr_segs;
1065 args->u.normal.via_iocb = iocb;
1067 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1068 &iocb->ki_pos, count);
1069 cl_env_put(env, &refcheck);
/*
 * Synchronous ->write: mirror of ll_file_read() — build a one-segment
 * iovec and sync kiocb, forward to ll_file_aio_write(), and copy the
 * resulting position back to *ppos.
 */
1073 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1077 struct iovec *local_iov;
1078 struct kiocb *kiocb;
1083 env = cl_env_get(&refcheck);
1085 RETURN(PTR_ERR(env));
1087 local_iov = &vvp_env_info(env)->vti_local_iov;
1088 kiocb = &vvp_env_info(env)->vti_kiocb;
1089 local_iov->iov_base = (void __user *)buf;
1090 local_iov->iov_len = count;
1091 init_sync_kiocb(kiocb, file);
1092 kiocb->ki_pos = *ppos;
1093 kiocb->ki_left = count;
1095 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1096 *ppos = kiocb->ki_pos;
1098 cl_env_put(env, &refcheck);
1105 * Send file content (through pagecache) somewhere with helper
/*
 * ->splice_read: acquire a cl_env, fill the splice-IO args (pipe +
 * flags) and run a CIT_READ through ll_file_io_generic().
 */
1107 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1108 struct pipe_inode_info *pipe, size_t count,
1112 struct vvp_io_args *args;
1117 env = cl_env_get(&refcheck);
1119 RETURN(PTR_ERR(env));
1121 args = vvp_env_args(env, IO_SPLICE);
1122 args->u.splice.via_pipe = pipe;
1123 args->u.splice.via_flags = flags;
1125 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1126 cl_env_put(env, &refcheck);
/*
 * Recreate a lost OST object for @inode on OST index @ost_idx with
 * object id @oi: clone the stripe metadata, build an obdo carrying
 * OBD_FL_RECREATE_OBJS and the parent FID, and call obd_create() under
 * the inode size lock.
 * NOTE(review): elided extraction — rc/lsm_size/ost_idx declarations,
 * obdo allocation (oa), several error checks and the final RETURN are
 * not visible in this view.
 */
1130 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1133 struct obd_export *exp = ll_i2dtexp(inode);
1134 struct obd_trans_info oti = { 0 };
1135 struct obdo *oa = NULL;
1138 struct lov_stripe_md *lsm = NULL, *lsm2;
1145 lsm = ccc_inode_lsm_get(inode);
1146 if (!lsm_has_objects(lsm))
1147 GOTO(out, rc = -ENOENT);
1149 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1150 (lsm->lsm_stripe_count));
1152 OBD_ALLOC_LARGE(lsm2, lsm_size);
1154 GOTO(out, rc = -ENOMEM);
1157 oa->o_nlink = ost_idx;
1158 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1159 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1160 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1161 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1162 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1163 memcpy(lsm2, lsm, lsm_size);
1164 ll_inode_size_lock(inode);
1165 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1166 ll_inode_size_unlock(inode);
1168 OBD_FREE_LARGE(lsm2, lsm_size);
1171 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: admin-only (CFS_CAP_SYS_ADMIN); copies a
 * ll_recreate_obj from userspace and recreates the object by id on the
 * requested OST index via ll_lov_recreate().
 */
1176 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1178 struct ll_recreate_obj ucreat;
1182 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1185 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1189 ostid_set_seq_mdt0(&oi);
1190 ostid_set_id(&oi, ucreat.lrc_id);
1191 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: admin-only; copies a lu_fid from
 * userspace, converts it to an ost_id, derives the OST index from the
 * FID sequence, and recreates the object via ll_lov_recreate().
 */
1194 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1201 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1204 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1207 fid_to_ostid(&fid, &oi);
/* The OST index is encoded in bits 16-31 of the FID sequence. */
1208 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1209 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Set striping information on @inode by performing an intent open that
 * carries the lov_user_md @lum.  Fails (without error escalation) when
 * stripe metadata already exists; on success the open handle obtained
 * purely for the setstripe is released again.
 * NOTE(review): elided extraction — rc declaration, the lsm != NULL
 * guard, GOTO targets and the final RETURN are not visible here.
 */
1212 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1213 int flags, struct lov_user_md *lum, int lum_size)
1215 struct lov_stripe_md *lsm = NULL;
1216 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1220 lsm = ccc_inode_lsm_get(inode);
1222 ccc_inode_lsm_put(inode, lsm);
1223 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1228 ll_inode_size_lock(inode);
1229 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1232 rc = oit.d.lustre.it_status;
1234 GOTO(out_req_free, rc);
1236 ll_release_openhandle(file->f_dentry, &oit);
1239 ll_inode_size_unlock(inode);
1240 ll_intent_release(&oit);
1241 ccc_inode_lsm_put(inode, lsm);
1244 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (stripe metadata) of @filename under directory
 * @inode from the MDS via md_getattr_name(): validates the LOV magic
 * (V1/V3 only) and, on a little-endian-from-MDS / big-endian-host
 * mismatch, byte-swaps the lov_user_md and, for regular files, its
 * object array before handing it to userspace.  Returns the lmm
 * pointer, its size, and the underlying request (caller must release).
 * NOTE(review): elided extraction — rc/lmmsize declarations, several
 * GOTO targets, the swab stripe_count argument and the assignment of
 * *lmmp/*request are not visible in this view.
 */
1248 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1249 struct lov_mds_md **lmmp, int *lmm_size,
1250 struct ptlrpc_request **request)
1252 struct ll_sb_info *sbi = ll_i2sbi(inode);
1253 struct mdt_body *body;
1254 struct lov_mds_md *lmm = NULL;
1255 struct ptlrpc_request *req = NULL;
1256 struct md_op_data *op_data;
1259 rc = ll_get_max_mdsize(sbi, &lmmsize);
1263 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1264 strlen(filename), lmmsize,
1265 LUSTRE_OPC_ANY, NULL);
1266 if (IS_ERR(op_data))
1267 RETURN(PTR_ERR(op_data));
1269 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1270 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1271 ll_finish_md_op_data(op_data);
1273 CDEBUG(D_INFO, "md_getattr_name failed "
1274 "on %s: rc %d\n", filename, rc);
1278 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1279 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1281 lmmsize = body->eadatasize;
1283 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1285 GOTO(out, rc = -ENODATA);
1288 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1289 LASSERT(lmm != NULL);
1291 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1292 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1293 GOTO(out, rc = -EPROTO);
1297 * This is coming from the MDS, so is probably in
1298 * little endian. We convert it to host endian before
1299 * passing it to userspace.
/* Swab only when host byte order differs from the wire format. */
1301 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1304 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1305 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1308 /* if function called for directory - we should
1309 * avoid swab not existent lsm objects */
1310 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1311 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1312 if (S_ISREG(body->mode))
1313 lustre_swab_lov_user_md_objects(
1314 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1316 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1317 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1318 if (S_ISREG(body->mode))
1319 lustre_swab_lov_user_md_objects(
1320 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1327 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA: set a raw striping EA (header plus object list) on
 * @inode.  Admin-only; the EA is copied from userspace into a kernel
 * buffer and handed to ll_lov_setstripe_ea_info().
 */
1332 static int ll_lov_setea(struct inode *inode, struct file *file,
/* MDS_OPEN_HAS_OBJS: the EA already carries its OST objects. */
1335 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1336 struct lov_user_md *lump;
/* Room for the header plus one lov_user_ost_data entry. */
1337 int lum_size = sizeof(struct lov_user_md) +
1338 sizeof(struct lov_user_ost_data);
1342 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1345 OBD_ALLOC_LARGE(lump, lum_size);
1349 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1350 OBD_FREE_LARGE(lump, lum_size);
1354 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1356 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE: copy a v1 or v3 lov_user_md from userspace,
 * apply it via ll_lov_setstripe_ea_info(), then refresh the layout and
 * return the resulting striping back to the caller's buffer.
 */
1360 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1363 struct lov_user_md_v3 lumv3;
/* v1 is a prefix of v3, so one buffer serves both layouts. */
1364 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1365 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1366 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1368 int flags = FMODE_WRITE;
1371 /* first try with v1 which is smaller than v3 */
1372 lum_size = sizeof(struct lov_user_md_v1);
1373 if (copy_from_user(lumv1, lumv1p, lum_size))
/* The magic says v3: re-copy the full, larger structure. */
1376 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1377 lum_size = sizeof(struct lov_user_md_v3);
1378 if (copy_from_user(&lumv3, lumv3p, lum_size))
1382 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1384 struct lov_stripe_md *lsm;
/* Zero the user-visible stripe count before refilling it below. */
1387 put_user(0, &lumv1p->lmm_stripe_count);
/* Pick up the layout the MDS just installed, then copy the actual
 * striping back out through the data export. */
1389 ll_layout_refresh(inode, &gen);
1390 lsm = ccc_inode_lsm_get(inode);
1391 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1392 0, lsm, (void *)arg);
1393 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE: copy the file's current striping out to the
 * user buffer at @arg via the data (OST) export.
 */
1398 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1400 struct lov_stripe_md *lsm;
1404 lsm = ccc_inode_lsm_get(inode);
1406 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1408 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK: take a group lock with group id @arg on the
 * file's data objects and record it in the per-fd state.  Returns
 * -EOPNOTSUPP when the file was opened with locking disabled.
 */
1412 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1414 struct ll_inode_info *lli = ll_i2info(inode);
1415 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1416 struct ccc_grouplock grouplock;
1420 if (ll_file_nolock(file))
1421 RETURN(-EOPNOTSUPP);
/* Only one group lock per file descriptor. */
1423 spin_lock(&lli->lli_lock);
1424 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1425 CWARN("group lock already existed with gid %lu\n",
1426 fd->fd_grouplock.cg_gid);
1427 spin_unlock(&lli->lli_lock);
1430 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1431 spin_unlock(&lli->lli_lock);
/* The enqueue may block (unless O_NONBLOCK), so it runs outside the
 * spinlock; the flag must be re-checked once it is re-taken. */
1433 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1434 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1438 spin_lock(&lli->lli_lock);
/* Another thread raced us while the spinlock was dropped: give back
 * the lock we just obtained. */
1439 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1440 spin_unlock(&lli->lli_lock);
1441 CERROR("another thread just won the race\n");
1442 cl_put_grouplock(&grouplock);
1446 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1447 fd->fd_grouplock = grouplock;
1448 spin_unlock(&lli->lli_lock);
1450 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock with id @arg previously
 * taken on this file descriptor by ll_get_grouplock().
 */
1454 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1456 struct ll_inode_info *lli = ll_i2info(inode);
1457 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1458 struct ccc_grouplock grouplock;
1461 spin_lock(&lli->lli_lock);
1462 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1463 spin_unlock(&lli->lli_lock);
1464 CWARN("no group lock held\n");
1467 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* The caller must pass the same gid it locked with. */
1469 if (fd->fd_grouplock.cg_gid != arg) {
1470 CWARN("group lock %lu doesn't match current id %lu\n",
1471 arg, fd->fd_grouplock.cg_gid);
1472 spin_unlock(&lli->lli_lock);
/* Detach the lock from the fd under the spinlock, then release it
 * outside the lock. */
1476 grouplock = fd->fd_grouplock;
1477 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1478 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1479 spin_unlock(&lli->lli_lock);
1481 cl_put_grouplock(&grouplock);
1482 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1487 * Close inode open handle
1489 * \param dentry [in] dentry which contains the inode
1490 * \param it [in,out] intent which contains open info and result
1493 * \retval <0 failure
1495 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1497 struct inode *inode = dentry->d_inode;
1498 struct obd_client_handle *och;
1504 /* Root ? Do nothing. */
1505 if (dentry->d_inode->i_sb->s_root == dentry)
1508 /* No open handle to close? Move away */
1509 if (!it_disposition(it, DISP_OPEN_OPEN))
1512 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1514 OBD_ALLOC(och, sizeof(*och));
1516 GOTO(out, rc = -ENOMEM);
/* Build a client handle from the intent's open reply and use it to
 * close the open handle held on the MDS. */
1518 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1519 ll_i2info(inode), it, och);
1521 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1524 /* this one is in place of ll_file_open */
1525 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1526 ptlrpc_req_finished(it->d.lustre.it_data);
1527 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1533 * Get size for inode for which FIEMAP mapping is requested.
1534 * Make the FIEMAP get_info call and returns the result.
1536 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1539 struct obd_export *exp = ll_i2dtexp(inode);
1540 struct lov_stripe_md *lsm = NULL;
1541 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1542 int vallen = num_bytes;
1546 /* Checks for fiemap flags */
1547 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags are unsupported by clearing the ones we
 * do understand. */
1548 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1552 /* Check for FIEMAP_FLAG_SYNC */
1553 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1554 rc = filemap_fdatawrite(inode->i_mapping);
1559 lsm = ccc_inode_lsm_get(inode);
1563 /* If the stripe_count > 1 and the application does not understand
1564 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1566 if (lsm->lsm_stripe_count > 1 &&
1567 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1568 GOTO(out, rc = -EOPNOTSUPP);
/* Identify the object set being mapped to the OSTs. */
1570 fm_key.oa.o_oi = lsm->lsm_oi;
1571 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1573 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1574 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1575 /* If filesize is 0, then there would be no objects for mapping */
1576 if (fm_key.oa.o_size == 0) {
1577 fiemap->fm_mapped_extents = 0;
1581 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1583 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1586 CERROR("obd_get_info failed: rc = %d\n", rc);
1589 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH: resolve a FID to a pathname through the MDC.
 * Requires CAP_DAC_READ_SEARCH unless the mount permits user fid2path.
 */
1593 int ll_fid2path(struct inode *inode, void *arg)
1595 struct obd_export *exp = ll_i2mdexp(inode);
1596 struct getinfo_fid2path *gfout, *gfin;
1600 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1601 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1604 /* Need to get the buflen */
1605 OBD_ALLOC_PTR(gfin);
1608 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* NOTE(review): gf_pathlen comes straight from userspace and is not
 * visibly bounded here -- verify it cannot overflow or force a huge
 * allocation. */
1613 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1614 OBD_ALLOC(gfout, outsize);
1615 if (gfout == NULL) {
1619 memcpy(gfout, gfin, sizeof(*gfout));
1622 /* Call mdc_iocontrol */
1623 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1627 if (copy_to_user(arg, gfout, outsize))
1631 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size and allocate a fiemap buffer for the
 * requested extent count, copy the request in, run ll_do_fiemap(), and
 * copy the mapped extents back to userspace.
 */
1635 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1637 struct ll_user_fiemap *fiemap_s;
1638 size_t num_bytes, ret_bytes;
1639 unsigned int extent_count;
1642 /* Get the extent count so we can calculate the size of
1643 * required fiemap buffer */
1644 if (get_user(extent_count,
1645 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count is user-controlled and the size
 * computation below is not visibly overflow-checked -- confirm an
 * upper bound is enforced elsewhere. */
1647 num_bytes = sizeof(*fiemap_s) + (extent_count *
1648 sizeof(struct ll_fiemap_extent));
1650 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1651 if (fiemap_s == NULL)
1654 /* get the fiemap value */
1655 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1657 GOTO(error, rc = -EFAULT);
1659 /* If fm_extent_count is non-zero, read the first extent since
1660 * it is used to calculate end_offset and device from previous
1663 if (copy_from_user(&fiemap_s->fm_extents[0],
1664 (char __user *)arg + sizeof(*fiemap_s),
1665 sizeof(struct ll_fiemap_extent)))
1666 GOTO(error, rc = -EFAULT);
1669 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy back the header plus however many extents were mapped. */
1673 ret_bytes = sizeof(struct ll_user_fiemap);
1675 if (extent_count != 0)
1676 ret_bytes += (fiemap_s->fm_mapped_extents *
1677 sizeof(struct ll_fiemap_extent));
1679 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1683 OBD_FREE_LARGE(fiemap_s, num_bytes);
1688 * Read the data_version for inode.
1690 * This value is computed using stripe object version on OST.
1691 * Version is computed using server side locking.
1693 * @param extent_lock Take extent lock. Not needed if a process is already
1694 * holding the OST object group locks.
1696 int ll_data_version(struct inode *inode, __u64 *data_version,
1699 struct lov_stripe_md *lsm = NULL;
1700 struct ll_sb_info *sbi = ll_i2sbi(inode);
1701 struct obdo *obdo = NULL;
1705 /* If no stripe, we consider version is 0. */
1706 lsm = ccc_inode_lsm_get(inode);
1707 if (!lsm_has_objects(lsm)) {
1709 CDEBUG(D_INODE, "No object for inode\n");
1713 OBD_ALLOC_PTR(obdo);
1715 GOTO(out, rc = -ENOMEM);
/* Ask the OSTs for attributes; the version is valid only when
 * OBD_MD_FLDATAVERSION is set in the returned obdo. */
1717 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1719 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1722 *data_version = obdo->o_data_version;
1728 ccc_inode_lsm_put(inode, lsm);
/* Scratch state for ll_swap_layouts(): both inodes, their saved
 * timestamps, and which data-version checks the caller requested. */
1732 struct ll_swap_stack {
1733 struct iattr ia1, ia2; /* saved mtime/atime for restore after swap */
1735 struct inode *inode1, *inode2;
1736 bool check_dv1, check_dv2; /* verify data version before swapping */
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: exchange the layouts (striping) of two
 * regular files on the same filesystem, optionally verifying data
 * versions first and preserving mtime/atime afterwards.
 */
1739 static int ll_swap_layouts(struct file *file1, struct file *file2,
1740 struct lustre_swap_layouts *lsl)
1742 struct mdc_swap_layouts msl;
1743 struct md_op_data *op_data;
1746 struct ll_swap_stack *llss = NULL;
1749 OBD_ALLOC_PTR(llss);
1753 llss->inode1 = file1->f_dentry->d_inode;
1754 llss->inode2 = file2->f_dentry->d_inode;
1756 if (!S_ISREG(llss->inode2->i_mode))
1757 GOTO(free, rc = -EINVAL);
/* Both files must be writable by the caller. */
1759 if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
1760 ll_permission(llss->inode2, MAY_WRITE, NULL))
1761 GOTO(free, rc = -EPERM);
1763 if (llss->inode2->i_sb != llss->inode1->i_sb)
1764 GOTO(free, rc = -EXDEV);
1766 /* we use 2 bool because it is easier to swap than 2 bits */
1767 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1768 llss->check_dv1 = true;
1770 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1771 llss->check_dv2 = true;
1773 /* we cannot use lsl->sl_dvX directly because we may swap them */
1774 llss->dv1 = lsl->sl_dv1;
1775 llss->dv2 = lsl->sl_dv2;
/* Order the pair by FID so concurrent swaps always proceed in the
 * same sequence. */
1777 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1778 if (rc == 0) /* same file, done! */
1781 if (rc < 0) { /* sequentialize it */
1782 swap(llss->inode1, llss->inode2);
1784 swap(llss->dv1, llss->dv2);
1785 swap(llss->check_dv1, llss->check_dv2);
1789 if (gid != 0) { /* application asks to flush dirty cache */
1790 rc = ll_get_grouplock(llss->inode1, file1, gid);
1794 rc = ll_get_grouplock(llss->inode2, file2, gid);
1796 ll_put_grouplock(llss->inode1, file1, gid);
1801 /* to be able to restore mtime and atime after swap
1802 * we need to first save them */
1804 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1805 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1806 llss->ia1.ia_atime = llss->inode1->i_atime;
1807 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1808 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1809 llss->ia2.ia_atime = llss->inode2->i_atime;
1810 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1813 /* ultimate check, before swaping the layouts we check if
1814 * dataversion has changed (if requested) */
1815 if (llss->check_dv1) {
1816 rc = ll_data_version(llss->inode1, &dv, 0);
1819 if (dv != llss->dv1)
1820 GOTO(putgl, rc = -EAGAIN);
1823 if (llss->check_dv2) {
1824 rc = ll_data_version(llss->inode2, &dv, 0);
1827 if (dv != llss->dv2)
1828 GOTO(putgl, rc = -EAGAIN);
1831 /* struct md_op_data is used to send the swap args to the mdt
1832 * only flags is missing, so we use struct mdc_swap_layouts
1833 * through the md_op_data->op_data */
1834 /* flags from user space have to be converted before they are send to
1835 * server, no flag is sent today, they are only used on the client */
1838 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1839 0, LUSTRE_OPC_ANY, &msl);
1840 if (IS_ERR(op_data))
1841 GOTO(free, rc = PTR_ERR(op_data));
1843 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1844 sizeof(*op_data), op_data, NULL);
1845 ll_finish_md_op_data(op_data);
1849 ll_put_grouplock(llss->inode2, file2, gid);
1850 ll_put_grouplock(llss->inode1, file1, gid);
1853 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1857 /* clear useless flags */
1858 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1859 llss->ia1.ia_valid &= ~ATTR_MTIME;
1860 llss->ia2.ia_valid &= ~ATTR_MTIME;
1863 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1864 llss->ia1.ia_valid &= ~ATTR_ATIME;
1865 llss->ia2.ia_valid &= ~ATTR_ATIME;
1868 /* update time if requested */
/* Note the crossing: ia2 is applied to inode1 and ia1 to inode2 --
 * presumably to follow the swapped layouts; TODO confirm intent. */
1870 if (llss->ia2.ia_valid != 0) {
1871 mutex_lock(&llss->inode1->i_mutex);
1872 rc = ll_setattr(file1->f_dentry, &llss->ia2);
1873 mutex_unlock(&llss->inode1->i_mutex);
1876 if (llss->ia1.ia_valid != 0) {
1879 mutex_lock(&llss->inode2->i_mutex);
1880 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
1881 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Main ioctl dispatcher for regular files: handles Lustre-specific
 * commands locally and forwards anything unrecognized to registered
 * handlers and then to the data export.
 */
1893 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1895 struct inode *inode = file->f_dentry->d_inode;
1896 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1900 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1901 inode->i_generation, inode, cmd);
1902 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1904 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1905 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1909 case LL_IOC_GETFLAGS:
1910 /* Get the current value of the file flags */
1911 return put_user(fd->fd_flags, (int *)arg);
1912 case LL_IOC_SETFLAGS:
1913 case LL_IOC_CLRFLAGS:
1914 /* Set or clear specific file flags */
1915 /* XXX This probably needs checks to ensure the flags are
1916 * not abused, and to handle any flag side effects.
1918 if (get_user(flags, (int *) arg))
/* Disabling locking is only safe for O_DIRECT I/O. */
1921 if (cmd == LL_IOC_SETFLAGS) {
1922 if ((flags & LL_FILE_IGNORE_LOCK) &&
1923 !(file->f_flags & O_DIRECT)) {
1924 CERROR("%s: unable to disable locking on "
1925 "non-O_DIRECT file\n", current->comm);
1929 fd->fd_flags |= flags;
1931 fd->fd_flags &= ~flags;
1934 case LL_IOC_LOV_SETSTRIPE:
1935 RETURN(ll_lov_setstripe(inode, file, arg));
1936 case LL_IOC_LOV_SETEA:
1937 RETURN(ll_lov_setea(inode, file, arg));
1938 case LL_IOC_LOV_SWAP_LAYOUTS: {
1940 struct lustre_swap_layouts lsl;
1942 if (copy_from_user(&lsl, (char *)arg,
1943 sizeof(struct lustre_swap_layouts)))
/* Both files involved must be open for writing. */
1946 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
1949 file2 = fget(lsl.sl_fd);
1954 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
1955 rc = ll_swap_layouts(file, file2, &lsl);
1959 case LL_IOC_LOV_GETSTRIPE:
1960 RETURN(ll_lov_getstripe(inode, arg));
1961 case LL_IOC_RECREATE_OBJ:
1962 RETURN(ll_lov_recreate_obj(inode, arg));
1963 case LL_IOC_RECREATE_FID:
1964 RETURN(ll_lov_recreate_fid(inode, arg));
1965 case FSFILT_IOC_FIEMAP:
1966 RETURN(ll_ioctl_fiemap(inode, arg));
1967 case FSFILT_IOC_GETFLAGS:
1968 case FSFILT_IOC_SETFLAGS:
1969 RETURN(ll_iocontrol(inode, file, cmd, arg));
1970 case FSFILT_IOC_GETVERSION_OLD:
1971 case FSFILT_IOC_GETVERSION:
1972 RETURN(put_user(inode->i_generation, (int *)arg));
1973 case LL_IOC_GROUP_LOCK:
1974 RETURN(ll_get_grouplock(inode, file, arg));
1975 case LL_IOC_GROUP_UNLOCK:
1976 RETURN(ll_put_grouplock(inode, file, arg));
1977 case IOC_OBD_STATFS:
1978 RETURN(ll_obd_statfs(inode, (void *)arg));
1980 /* We need to special case any other ioctls we want to handle,
1981 * to send them to the MDS/OST as appropriate and to properly
1982 * network encode the arg field.
1983 case FSFILT_IOC_SETVERSION_OLD:
1984 case FSFILT_IOC_SETVERSION:
1986 case LL_IOC_FLUSHCTX:
1987 RETURN(ll_flush_ctx(inode));
1988 case LL_IOC_PATH2FID: {
1989 if (copy_to_user((void *)arg, ll_inode2fid(inode),
1990 sizeof(struct lu_fid)))
1995 case OBD_IOC_FID2PATH:
1996 RETURN(ll_fid2path(inode, (void *)arg));
1997 case LL_IOC_DATA_VERSION: {
1998 struct ioc_data_version idv;
2001 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* LL_DV_NOFLUSH skips taking the extent lock / flushing. */
2004 rc = ll_data_version(inode, &idv.idv_version,
2005 !(idv.idv_flags & LL_DV_NOFLUSH));
2007 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2013 case LL_IOC_GET_MDTIDX: {
2016 mdtidx = ll_get_mdt_idx(inode);
2020 if (put_user((int)mdtidx, (int*)arg))
2025 case OBD_IOC_GETDTNAME:
2026 case OBD_IOC_GETMDNAME:
2027 RETURN(ll_get_obd_name(inode, cmd, arg));
2028 case LL_IOC_HSM_STATE_GET: {
2029 struct md_op_data *op_data;
2030 struct hsm_user_state *hus;
2037 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2038 LUSTRE_OPC_ANY, hus);
2039 if (IS_ERR(op_data)) {
2041 RETURN(PTR_ERR(op_data));
2044 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2047 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2050 ll_finish_md_op_data(op_data);
2054 case LL_IOC_HSM_STATE_SET: {
2055 struct md_op_data *op_data;
2056 struct hsm_state_set *hss;
2062 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2067 /* Non-root users are forbidden to set or clear flags which are
2068 * NOT defined in HSM_USER_MASK. */
2069 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2070 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2075 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2076 LUSTRE_OPC_ANY, hss);
2077 if (IS_ERR(op_data)) {
2079 RETURN(PTR_ERR(op_data));
2082 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2085 ll_finish_md_op_data(op_data);
2090 case LL_IOC_HSM_ACTION: {
2091 struct md_op_data *op_data;
2092 struct hsm_current_action *hca;
2099 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2100 LUSTRE_OPC_ANY, hca);
2101 if (IS_ERR(op_data)) {
2103 RETURN(PTR_ERR(op_data));
2106 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2109 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2112 ll_finish_md_op_data(op_data);
/* Unknown command: try dynamically registered handlers first, then
 * fall through to the data export. */
2120 ll_iocontrol_call(inode, file, cmd, arg, &err))
2123 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek: SEEK_END/SEEK_HOLE/SEEK_DATA need an up-to-date size, so
 * glimpse it from the OSTs before delegating to the generic helper.
 */
2130 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2132 struct inode *inode = file->f_dentry->d_inode;
2133 loff_t retval, eof = 0;
/* Target position, computed here only for the trace message below. */
2136 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2137 (origin == SEEK_CUR) ? file->f_pos : 0);
2138 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2139 inode->i_ino, inode->i_generation, inode, retval, retval,
2141 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2143 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2144 retval = ll_glimpse_size(inode);
2147 eof = i_size_read(inode);
2150 retval = ll_generic_file_llseek_size(file, offset, origin,
2151 ll_file_maxbytes(inode), eof);
/*
 * flush(): surface any asynchronous writeback errors recorded against
 * this inode to the closing process, unless it was already told.
 */
2155 int ll_flush(struct file *file, fl_owner_t id)
2157 struct inode *inode = file->f_dentry->d_inode;
2158 struct ll_inode_info *lli = ll_i2info(inode);
2159 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2162 LASSERT(!S_ISDIR(inode->i_mode));
2164 /* catch async errors that were recorded back when async writeback
2165 * failed for pages in this mapping. */
2166 rc = lli->lli_async_rc;
2167 lli->lli_async_rc = 0;
2168 err = lov_read_and_clear_async_rc(lli->lli_clob);
2172 /* The application has been told write failure already.
2173 * Do not report failure again. */
2174 if (fd->fd_write_failed)
/* Any recorded error collapses to -EIO for the caller. */
2176 return rc ? -EIO : 0;
2180 * Called to make sure a portion of file has been written out.
2181 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2183 * Return how many pages have been written.
2185 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2186 enum cl_fsync_mode mode, int ignore_layout)
2188 struct cl_env_nest nest;
2191 struct obd_capa *capa = NULL;
2192 struct cl_fsync_io *fio;
/* Reject any mode outside the known cl_fsync_mode set. */
2196 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2197 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2200 env = cl_env_nested_get(&nest);
2202 RETURN(PTR_ERR(env));
2204 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
/* Build a CIT_FSYNC io against the file's cl_object and run it. */
2206 io = ccc_env_thread_io(env);
2207 io->ci_obj = cl_i2info(inode)->lli_clob;
2208 io->ci_ignore_layout = ignore_layout;
2210 /* initialize parameters for sync */
2211 fio = &io->u.ci_fsync;
2212 fio->fi_capa = capa;
2213 fio->fi_start = start;
2215 fio->fi_fid = ll_inode2fid(inode);
2216 fio->fi_mode = mode;
2217 fio->fi_nr_written = 0;
2219 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2220 result = cl_io_loop(env, io);
2222 result = io->ci_result;
/* On success, report the number of pages written. */
2224 result = fio->fi_nr_written;
2225 cl_io_fini(env, io);
2226 cl_env_nested_put(&nest, env);
2234 * When dentry is provided (the 'else' case), *file->f_dentry may be
2235 * null and dentry must be used directly rather than pulled from
2236 * *file->f_dentry as is done otherwise.
2239 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2241 struct dentry *dentry = file->f_dentry;
2242 struct inode *inode = dentry->d_inode;
2243 struct ll_inode_info *lli = ll_i2info(inode);
2244 struct ptlrpc_request *req;
2245 struct obd_capa *oc;
2249 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2250 inode->i_generation, inode);
2251 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
/* Write out and wait for dirty pages in the requested range. */
2253 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2254 mutex_lock(&inode->i_mutex);
2256 /* catch async errors that were recorded back when async writeback
2257 * failed for pages in this mapping. */
2258 if (!S_ISDIR(inode->i_mode)) {
2259 err = lli->lli_async_rc;
2260 lli->lli_async_rc = 0;
2263 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync the MDS-side state for this inode. */
2268 oc = ll_mdscapa_get(inode);
2269 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2275 ptlrpc_req_finished(req);
2277 if (datasync && S_ISREG(inode->i_mode)) {
2278 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2280 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2282 if (rc == 0 && err < 0)
/* Remember whether the data sync failed so close()/flush() can
 * report the failure to the application exactly once. */
2285 fd->fd_write_failed = true;
2287 fd->fd_write_failed = false;
2290 mutex_unlock(&inode->i_mutex);
/*
 * fcntl/flock handler: translate the VFS file_lock into an LDLM flock
 * enqueue on the MDS, then mirror the result into the kernel's local
 * lock tables so VFS bookkeeping stays consistent.
 */
2294 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2296 struct inode *inode = file->f_dentry->d_inode;
2297 struct ll_sb_info *sbi = ll_i2sbi(inode);
2298 struct ldlm_enqueue_info einfo = {
2299 .ei_type = LDLM_FLOCK,
2300 .ei_cb_cp = ldlm_flock_completion_ast,
2301 .ei_cbdata = file_lock,
2303 struct md_op_data *op_data;
2304 struct lustre_handle lockh = {0};
2305 ldlm_policy_data_t flock = {{0}};
2311 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2312 inode->i_ino, file_lock);
2314 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2316 if (file_lock->fl_flags & FL_FLOCK) {
2317 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2318 /* flocks are whole-file locks */
2319 flock.l_flock.end = OFFSET_MAX;
2320 /* For flocks, the owner is determined by the local file descriptor. */
2321 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2322 } else if (file_lock->fl_flags & FL_POSIX) {
2323 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2324 flock.l_flock.start = file_lock->fl_start;
2325 flock.l_flock.end = file_lock->fl_end;
2329 flock.l_flock.pid = file_lock->fl_pid;
2331 /* Somewhat ugly workaround for svc lockd.
2332 * lockd installs custom fl_lmops->lm_compare_owner that checks
2333 * for the fl_owner to be the same (which it always is on local node
2334 * I guess between lockd processes) and then compares pid.
2335 * As such we assign pid to the owner field to make it all work,
2336 * conflict with normal locks is unlikely since pid space and
2337 * pointer space for current->files are not intersecting */
2338 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2339 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type onto an LDLM lock mode. */
2341 switch (file_lock->fl_type) {
2343 einfo.ei_mode = LCK_PR;
2346 /* An unlock request may or may not have any relation to
2347 * existing locks so we may not be able to pass a lock handle
2348 * via a normal ldlm_lock_cancel() request. The request may even
2349 * unlock a byte range in the middle of an existing lock. In
2350 * order to process an unlock request we need all of the same
2351 * information that is given with a normal read or write record
2352 * lock request. To avoid creating another ldlm unlock (cancel)
2353 * message we'll treat a LCK_NL flock request as an unlock. */
2354 einfo.ei_mode = LCK_NL;
2357 einfo.ei_mode = LCK_PW;
2360 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2361 file_lock->fl_type);
2376 flags = LDLM_FL_BLOCK_NOWAIT;
2382 flags = LDLM_FL_TEST_LOCK;
2383 /* Save the old mode so that if the mode in the lock changes we
2384 * can decrement the appropriate reader or writer refcount. */
2385 file_lock->fl_type = einfo.ei_mode;
2388 CERROR("unknown fcntl lock command: %d\n", cmd);
2392 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2393 LUSTRE_OPC_ANY, NULL);
2394 if (IS_ERR(op_data))
2395 RETURN(PTR_ERR(op_data));
2397 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2398 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2399 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2401 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2402 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* Mirror a successful (or unlocking) result into the kernel's local
 * flock/posix lock tables. */
2404 if ((file_lock->fl_flags & FL_FLOCK) &&
2405 (rc == 0 || file_lock->fl_type == F_UNLCK))
2406 rc2 = flock_lock_file_wait(file, file_lock);
2407 if ((file_lock->fl_flags & FL_POSIX) &&
2408 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2409 !(flags & LDLM_FL_TEST_LOCK))
2410 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed: undo the server-side lock by enqueueing
 * it again in NL (unlock) mode. */
2412 if (rc2 && file_lock->fl_type != F_UNLCK) {
2413 einfo.ei_mode = LCK_NL;
2414 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2415 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2419 ll_finish_md_op_data(op_data);
/* .flock/.lock handler used when file locking is disabled on this
 * mount.  NOTE(review): the body is not visible here; presumably it
 * rejects the request (e.g. -ENOSYS) -- confirm. */
2424 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2432 * test if some locks matching bits and l_req_mode are acquired
2433 * - bits can be in different locks
2434 * - if found clear the common lock bits in *bits
2435 * - the bits not found, are kept in *bits
2437 * \param bits [IN] searched lock bits [IN]
2438 * \param l_req_mode [IN] searched lock mode
2439 * \retval boolean, true iff all bits are found
2441 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2443 struct lustre_handle lockh;
2444 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match all of CR/CW/PR/PW. */
2445 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2446 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2455 fid = &ll_i2info(inode)->lli_fid;
2456 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2457 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, do not take references on matched locks. */
2459 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit separately; a matched lock may carry
 * several bits, all of which are cleared from *bits at once. */
2460 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2461 policy.l_inodebits.bits = *bits & (1 << i);
2462 if (policy.l_inodebits.bits == 0)
2465 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2466 &policy, mode, &lockh)) {
2467 struct ldlm_lock *lock;
2469 lock = ldlm_handle2lock(&lockh);
2472 ~(lock->l_policy_data.l_inodebits.bits);
2473 LDLM_LOCK_PUT(lock);
2475 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match an existing granted MDS ibits lock covering @bits on
 * @inode in any of CR/CW/PR/PW; on success *lockh references the lock.
 */
2482 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2483 struct lustre_handle *lockh, __u64 flags)
2485 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2490 fid = &ll_i2info(inode)->lli_fid;
2491 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2493 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2494 fid, LDLM_IBITS, &policy,
2495 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * Post-process a revalidation result: -ENOENT on an already-unlinked
 * inode is tolerated; any other non-zero rc is logged and propagated.
 */
2499 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2501 /* Already unlinked. Just update nlink and return success */
2502 if (rc == -ENOENT) {
2504 /* This path cannot be hit for regular files unless in
2505 * case of obscure races, so no need to validate
2507 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2509 } else if (rc != 0) {
2510 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2511 ll_get_fsname(inode->i_sb, NULL, 0),
2512 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate the inode's metadata covered by @ibits: via an intent
 * getattr when the server supports ATTRFID, otherwise (when no cached
 * MDS ibits lock covers the bits) via an explicit getattr RPC.
 */
2518 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2521 struct inode *inode = dentry->d_inode;
2522 struct ptlrpc_request *req = NULL;
2523 struct obd_export *exp;
2527 LASSERT(inode != NULL);
2529 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2530 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2532 exp = ll_i2mdexp(inode);
2534 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2535 * But under CMD case, it caused some lock issues, should be fixed
2536 * with new CMD ibits lock. See bug 12718 */
2537 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2538 struct lookup_intent oit = { .it_op = IT_GETATTR };
2539 struct md_op_data *op_data;
/* Only the LOOKUP bit requested: a lighter IT_LOOKUP will do. */
2541 if (ibits == MDS_INODELOCK_LOOKUP)
2542 oit.it_op = IT_LOOKUP;
2544 /* Call getattr by fid, so do not provide name at all. */
2545 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2546 dentry->d_inode, NULL, 0, 0,
2547 LUSTRE_OPC_ANY, NULL);
2548 if (IS_ERR(op_data))
2549 RETURN(PTR_ERR(op_data));
2551 oit.it_create_mode |= M_CHECK_STALE;
2552 rc = md_intent_lock(exp, op_data, NULL, 0,
2553 /* we are not interested in name
2556 ll_md_blocking_ast, 0);
2557 ll_finish_md_op_data(op_data);
2558 oit.it_create_mode &= ~M_CHECK_STALE;
2560 rc = ll_inode_revalidate_fini(inode, rc);
2564 rc = ll_revalidate_it_finish(req, &oit, dentry);
2566 ll_intent_release(&oit);
2570 /* Unlinked? Unhash dentry, so it is not picked up later by
2571 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2572 here to preserve get_cwd functionality on 2.6.
2574 if (!dentry->d_inode->i_nlink)
2575 d_lustre_invalidate(dentry, 0);
2577 ll_lookup_finish_locks(&oit, dentry);
2578 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2579 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2580 obd_valid valid = OBD_MD_FLGETATTR;
2581 struct md_op_data *op_data;
/* Regular files also need room for the layout EA in the reply. */
2584 if (S_ISREG(inode->i_mode)) {
2585 rc = ll_get_max_mdsize(sbi, &ealen);
2588 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2591 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2592 0, ealen, LUSTRE_OPC_ANY,
2594 if (IS_ERR(op_data))
2595 RETURN(PTR_ERR(op_data));
2597 op_data->op_valid = valid;
2598 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2599 * capa for this inode. Because we only keep capas of dirs
2601 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2602 ll_finish_md_op_data(op_data);
2604 rc = ll_inode_revalidate_fini(inode, rc);
2608 rc = ll_prep_inode(&inode, req, NULL, NULL);
2611 ptlrpc_req_finished(req);
/*
 * Revalidate dentry/inode attributes from the MDS, then refresh the
 * client-visible size/times.
 *
 * NOTE(review): this region is a lossy extraction -- the tail of the
 * parameter list (the ibits argument), local declarations, error-check
 * lines and the closing braces were dropped between the surviving lines.
 * Only comments are added here; the code text is unchanged.
 */
int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
        struct inode *inode = dentry->d_inode;

        /* Re-fetch MDS attributes under the requested inodelock bits. */
        rc = __ll_inode_revalidate_it(dentry, it, ibits);

        /* if object isn't regular file, don't validate size */
        if (!S_ISREG(inode->i_mode)) {
                /* Non-regular files: take the times straight from the
                 * lock value block cached on the inode. */
                LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
                LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
                LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;

        /* Regular file: glimpse the OSTs for an up-to-date size. */
        rc = ll_glimpse_size(inode);
/*
 * Core of the ->getattr() path: revalidate the inode with the given
 * intent, then fill *stat from the (now fresh) VFS inode fields.
 *
 * NOTE(review): lossy extraction -- the opening brace, the declaration
 * of "res", the early-return on revalidation failure, the "else" before
 * the i_ino assignment and the closing brace were dropped.
 */
int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
                  struct lookup_intent *it, struct kstat *stat)
        struct inode *inode = de->d_inode;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ll_inode_info *lli = ll_i2info(inode);

        /* Refresh both UPDATE (attribute) and LOOKUP (name) lock bits so
         * every field copied below is current. */
        res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
                                             MDS_INODELOCK_LOOKUP);
        ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);

        stat->dev = inode->i_sb->s_dev;
        /* 32-bit API callers need an inode number that fits in 32 bits;
         * build it from the FID in that case. */
        if (ll_need_32bit_api(sbi))
                stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
                stat->ino = inode->i_ino; /* NOTE(review): "else" line dropped by extraction */
        stat->mode = inode->i_mode;
        stat->nlink = inode->i_nlink;
        stat->uid = inode->i_uid;
        stat->gid = inode->i_gid;
        stat->rdev = inode->i_rdev;
        stat->atime = inode->i_atime;
        stat->mtime = inode->i_mtime;
        stat->ctime = inode->i_ctime;
        /* Derived from the inode's block-shift, not queried from the OST. */
        stat->blksize = 1 << inode->i_blkbits;

        stat->size = i_size_read(inode);
        stat->blocks = inode->i_blocks;
2672 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2674 struct lookup_intent it = { .it_op = IT_GETATTR };
2676 return ll_getattr_it(mnt, de, &it, stat);
/*
 * ->get_acl() hook: hand the VFS a referenced copy of the POSIX ACL
 * cached on the Lustre inode info.
 *
 * NOTE(review): lossy extraction -- opening brace, trailing RETURN(acl)
 * and closing brace were dropped.
 */
struct posix_acl * ll_get_acl(struct inode *inode, int type)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct posix_acl *acl = NULL;

        /* lli_lock guards lli_posix_acl against concurrent update. */
        spin_lock(&lli->lli_lock);
        /* VFS' acl_permission_check->check_acl will release the refcount */
        acl = posix_acl_dup(lli->lli_posix_acl);
        spin_unlock(&lli->lli_lock);
/*
 * ->permission() hook: revalidate the root inode if needed, route
 * remote-client setups to the MDS permission check, otherwise fall
 * through to the generic (ACL-aware) check.
 *
 * NOTE(review): lossy extraction -- the opening brace, the body of the
 * MAY_NOT_BLOCK branch, rc declarations/error checks and the closing
 * brace were dropped between the surviving lines.
 */
int ll_inode_permission(struct inode *inode, int mask)
#ifdef MAY_NOT_BLOCK
        /* RCU-walk mode: body of this branch was dropped by the
         * extraction -- presumably bails out so the VFS retries in
         * ref-walk mode; verify against upstream. */
        if (mask & MAY_NOT_BLOCK)

        /* as root inode are NOT getting validated in lookup operation,
         * need to do it before permission check. */
        if (inode == inode->i_sb->s_root->d_inode) {
                struct lookup_intent it = { .it_op = IT_LOOKUP };

                rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
                                              MDS_INODELOCK_LOOKUP);

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
               inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);

        /* Remote-client mounts check permission on the server instead. */
        if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
                return lustre_check_remote_perm(inode, mask);

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
        rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/* Slot and handler names for the read/write entries of the three
 * file_operations tables below (aio-based read/write path). */
#define READ_METHOD aio_read
#define READ_FUNCTION ll_file_aio_read
#define WRITE_METHOD aio_write
#define WRITE_FUNCTION ll_file_aio_write
/* -o localflock - only provides locally consistent flock locks */
struct file_operations ll_file_operations = {
        .read = ll_file_read,
        .READ_METHOD = READ_FUNCTION,
        .write = ll_file_write,
        .WRITE_METHOD = WRITE_FUNCTION,
        .unlocked_ioctl = ll_file_ioctl,
        .open = ll_file_open,
        .release = ll_file_release,
        .mmap = ll_file_mmap,
        .llseek = ll_file_seek,
        .splice_read = ll_file_splice_read,
        /* NOTE(review): any remaining members and the closing "};" were
         * dropped by the extraction. */
/* Same table but with flock/posix lock handlers wired up -- presumably
 * selected for cluster-coherent locking mounts; verify against the
 * mount-option handling in ll_fill_super. */
struct file_operations ll_file_operations_flock = {
        .read = ll_file_read,
        .READ_METHOD = READ_FUNCTION,
        .write = ll_file_write,
        .WRITE_METHOD = WRITE_FUNCTION,
        .unlocked_ioctl = ll_file_ioctl,
        .open = ll_file_open,
        .release = ll_file_release,
        .mmap = ll_file_mmap,
        .llseek = ll_file_seek,
        .splice_read = ll_file_splice_read,
        /* Both BSD flock and POSIX record locks go through the same
         * Lustre DLM-backed handler. */
        .flock = ll_file_flock,
        .lock = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
        .read = ll_file_read,
        .READ_METHOD = READ_FUNCTION,
        .write = ll_file_write,
        .WRITE_METHOD = WRITE_FUNCTION,
        .unlocked_ioctl = ll_file_ioctl,
        .open = ll_file_open,
        .release = ll_file_release,
        .mmap = ll_file_mmap,
        .llseek = ll_file_seek,
        .splice_read = ll_file_splice_read,
        /* Stub handlers that reject locking requests. */
        .flock = ll_file_noflock,
        .lock = ll_file_noflock
/* Inode operations for regular files: attribute, xattr and ACL hooks. */
struct inode_operations ll_file_inode_operations = {
        .setattr = ll_setattr,
        .getattr = ll_getattr,
        .permission = ll_inode_permission,
        .setxattr = ll_setxattr,
        .getxattr = ll_getxattr,
        .listxattr = ll_listxattr,
        .removexattr = ll_removexattr,
        .get_acl = ll_get_acl,
        /* NOTE(review): closing "};" dropped by the extraction. */
/* dynamic ioctl number support routines */
/*
 * Registry of externally registered ioctl handlers: a list of
 * llioc_data entries protected by a rw_semaphore.
 *
 * NOTE(review): lossy extraction -- the "} llioc = {" line joining the
 * type to its static initializer, the terminating "};", and the
 * "struct llioc_data {" opener of the second type were dropped.
 */
static struct llioc_ctl_data {
        struct rw_semaphore ioc_sem;    /* guards ioc_head */
        struct list_head ioc_head;      /* list of registered llioc_data */
        __RWSEM_INITIALIZER(llioc.ioc_sem),
        LIST_HEAD_INIT(llioc.ioc_head)

        struct list_head iocd_list;     /* linkage into llioc.ioc_head */
        unsigned int iocd_size;         /* total allocation size, bytes */
        llioc_callback_t iocd_cb;       /* handler for the commands below */
        unsigned int iocd_count;        /* number of entries in iocd_cmd[] */
        unsigned int iocd_cmd[0];       /* trailing array of ioctl numbers */
2814 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2817 struct llioc_data *in_data = NULL;
2820 if (cb == NULL || cmd == NULL ||
2821 count > LLIOC_MAX_CMD || count < 0)
2824 size = sizeof(*in_data) + count * sizeof(unsigned int);
2825 OBD_ALLOC(in_data, size);
2826 if (in_data == NULL)
2829 memset(in_data, 0, sizeof(*in_data));
2830 in_data->iocd_size = size;
2831 in_data->iocd_cb = cb;
2832 in_data->iocd_count = count;
2833 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2835 down_write(&llioc.ioc_sem);
2836 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2837 up_write(&llioc.ioc_sem);
/*
 * Remove and free the registration identified by @magic (the cookie
 * returned by ll_iocontrol_register).
 *
 * NOTE(review): lossy extraction -- the NULL-magic guard, the
 * "if (tmp == magic)" comparison inside the loop, the early returns and
 * the closing braces were dropped between the surviving lines.
 */
void ll_iocontrol_unregister(void *magic)
        struct llioc_data *tmp;

        down_write(&llioc.ioc_sem);
        list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
                /* Capture the size before the entry is freed. */
                unsigned int size = tmp->iocd_size;

                /* Unlink while holding the lock; free after dropping it. */
                list_del(&tmp->iocd_list);
                up_write(&llioc.ioc_sem);

                OBD_FREE(tmp, size);

        up_write(&llioc.ioc_sem);

        /* Fell off the list without a match: caller passed a stale or
         * bogus cookie. */
        CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2866 EXPORT_SYMBOL(ll_iocontrol_register);
2867 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Offer (cmd, arg) to each registered dynamic-ioctl handler in turn
 * until one claims it by returning LLIOC_STOP.  The handler's result is
 * presumably passed back through *rcp -- the assignment line was
 * dropped by the extraction; verify against upstream.
 */
enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
                unsigned int cmd, unsigned long arg, int *rcp)
        enum llioc_iter ret = LLIOC_CONT;
        struct llioc_data *data;
        int rc = -EINVAL, i;

        /* Readers may iterate concurrently; only (un)register writes. */
        down_read(&llioc.ioc_sem);
        list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
                for (i = 0; i < data->iocd_count; i++) {
                        /* Skip handlers not registered for this command. */
                        if (cmd != data->iocd_cmd[i])

                        /* Registered match: let the callback handle it. */
                        ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);

                if (ret == LLIOC_STOP)
        up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack for this inode
 * via cl_conf_set(), inside a nested client environment.
 *
 * NOTE(review): lossy extraction -- opening brace, env/result
 * declarations, the early RETURN when lli_clob is NULL, the IS_ERR(env)
 * test and the trailing RETURN/brace were dropped.
 */
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_env_nest nest;

        /* No cl_object attached yet: nothing to configure. */
        if (lli->lli_clob == NULL)

        env = cl_env_nested_get(&nest);
                RETURN(PTR_ERR(env));

        result = cl_conf_set(env, lli->lli_clob, conf);
        cl_env_nested_put(&nest, env);

        if (conf->coc_opc == OBJECT_CONF_SET) {
                struct ldlm_lock *lock = conf->coc_lock;

                LASSERT(lock != NULL);
                LASSERT(ldlm_has_layout(lock));

                /* it can only be allowed to match after layout is
                 * applied to inode otherwise false layout would be
                 * seen. Applying layout should happen before dropping
                 * the intent lock. */
                ldlm_lock_allow_match(lock);
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * NOTE(review): lossy extraction -- ENTRY/RETURN lines, the lmm/lvbdata
 * /lmmsize/rc declarations, several error-check lines, capa_put(), the
 * "lvbdata = NULL" on successful install and closing braces were
 * dropped between the surviving lines.
 */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct obd_capa *oc;
        struct ptlrpc_request *req;
        struct mdt_body *body;

        /* LVB already populated: the layout arrived with the lock grant. */
        if (lock->l_lvb_data != NULL)

        /* if layout lock was granted right away, the layout is returned
         * within DLM_LVB of dlm reply; otherwise if the lock was ever
         * blocked and then granted via completion ast, we have to fetch
         * layout here. Please note that we can't use the LVB buffer in
         * completion AST because it doesn't have a large enough buffer */
        oc = ll_mdscapa_get(inode);
        rc = ll_get_max_mdsize(sbi, &lmmsize);
                rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,

        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
        /* Server must not claim more EA data than we asked for. */
        if (body == NULL || body->eadatasize > lmmsize)
                GOTO(out, rc = -EPROTO);

        lmmsize = body->eadatasize;
        if (lmmsize == 0) /* empty layout */

        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
                GOTO(out, rc = -EFAULT);

        /* Copy the layout out of the RPC reply so it can outlive req. */
        OBD_ALLOC_LARGE(lvbdata, lmmsize);
        if (lvbdata == NULL)
                GOTO(out, rc = -ENOMEM);

        memcpy(lvbdata, lmm, lmmsize);
        lock_res_and_lock(lock);
        if (lock->l_lvb_data == NULL) {
                /* Install our buffer; ownership passes to the lock. */
                lock->l_lvb_data = lvbdata;
                lock->l_lvb_len = lmmsize;

        unlock_res_and_lock(lock);

        /* Buffer was not installed (another thread won the race):
         * free our copy.  (The success-path "lvbdata = NULL" line was
         * dropped by the extraction.) */
        if (lvbdata != NULL)
                OBD_FREE_LARGE(lvbdata, lmmsize);

        ptlrpc_req_finished(req);
/*
 * Apply the layout to the inode. Layout lock is held and will be released
 * before this function returns (see ldlm_lock_decref below); *gen
 * receives the resulting layout generation.
 *
 * NOTE(review): lossy extraction -- ENTRY/RETURN bookkeeping, the
 * env/rc/lvb_ready declarations, several "if (rc ...)" guards, goto
 * labels and closing braces were dropped between the surviving lines.
 * Only comments are added here.
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
                              struct inode *inode, __u32 *gen, bool reconf)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ldlm_lock *lock;
        struct lustre_md md = { NULL };
        struct cl_object_conf conf;
        bool wait_layout = false;

        LASSERT(lustre_handle_is_used(lockh));

        lock = ldlm_handle2lock(lockh);
        LASSERT(lock != NULL);
        LASSERT(ldlm_has_layout(lock));

        LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
                   inode, PFID(&lli->lli_fid), reconf);

        /* in case this is a caching lock and reinstate with new inode */
        md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);

        lock_res_and_lock(lock);
        lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
        unlock_res_and_lock(lock);
        /* checking lvb_ready is racy but this is okay. The worst case is
         * that multi processes may configure the file on the same time. */
        if (lvb_ready || !reconf) {
                /* layout_gen must be valid if layout lock is not
                 * cancelled and stripe has already set */
                *gen = lli->lli_layout_gen;

        /* Make sure the layout is present in the lock's LVB. */
        rc = ll_layout_fetch(inode, lock);

        /* for layout lock, lmm is returned in lock's lvb.
         * lvb_data is immutable if the lock is held so it's safe to access it
         * without res lock. See the description in ldlm_lock_decref_internal()
         * for the condition to free lvb_data of layout lock */
        if (lock->l_lvb_data != NULL) {
                rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
                                  lock->l_lvb_data, lock->l_lvb_len);
                        /* Zero-length layout: report the "empty" generation.
                         * (branch condition lines dropped by extraction) */
                        *gen = LL_LAYOUT_GEN_EMPTY;
                        *gen = md.lsm->lsm_layout_gen;

                        CERROR("%s: file "DFID" unpackmd error: %d\n",
                               ll_get_fsname(inode->i_sb, NULL, 0),
                               PFID(&lli->lli_fid), rc);

        /* set layout to file. Unlikely this will fail as old layout was
         * surely eliminated */
        memset(&conf, 0, sizeof conf);
        conf.coc_opc = OBJECT_CONF_SET;
        conf.coc_inode = inode;
        conf.coc_lock = lock;
        conf.u.coc_md = &md;
        rc = ll_layout_conf(inode, &conf);

        /* Release the unpacked striping; the cl_object keeps its own copy. */
        obd_free_memmd(sbi->ll_dt_exp, &md.lsm);

        /* refresh layout failed, need to wait */
        wait_layout = rc == -EBUSY;

        LDLM_LOCK_PUT(lock);
        ldlm_lock_decref(lockh, mode);

        /* wait for IO to complete if it's still being used. */
                CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
                       inode, PFID(&lli->lli_fid));

                /* OBJECT_CONF_WAIT blocks until outstanding users of the
                 * old layout drain. */
                memset(&conf, 0, sizeof conf);
                conf.coc_opc = OBJECT_CONF_WAIT;
                conf.coc_inode = inode;
                rc = ll_layout_conf(inode, &conf);

                CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
                       PFID(&lli->lli_fid), rc);
3106 * This function checks if there exists a LAYOUT lock on the client side,
3107 * or enqueues it if it doesn't have one in cache.
 * This function will not hold the layout lock, so the lock may be revoked
 * at any time after this function returns.  Any operation that depends on
 * the layout should then be redone.
3113 * This function should be called before lov_io_init() to get an uptodate
3114 * layout version, the caller should save the version number and after IO
3115 * is finished, this function should be called again to verify that layout
3116 * is not changed during IO time.
3118 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3120 struct ll_inode_info *lli = ll_i2info(inode);
3121 struct ll_sb_info *sbi = ll_i2sbi(inode);
3122 struct md_op_data *op_data;
3123 struct lookup_intent it;
3124 struct lustre_handle lockh;
3126 struct ldlm_enqueue_info einfo = {
3127 .ei_type = LDLM_IBITS,
3129 .ei_cb_bl = ll_md_blocking_ast,
3130 .ei_cb_cp = ldlm_completion_ast,
3135 *gen = lli->lli_layout_gen;
3136 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3140 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3141 LASSERT(S_ISREG(inode->i_mode));
3143 /* mostly layout lock is caching on the local side, so try to match
3144 * it before grabbing layout lock mutex. */
3145 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3146 if (mode != 0) { /* hit cached lock */
3147 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3151 /* better hold lli_layout_mutex to try again otherwise
3152 * it will have starvation problem. */
3155 /* take layout lock mutex to enqueue layout lock exclusively. */
3156 mutex_lock(&lli->lli_layout_mutex);
3159 /* try again. Maybe somebody else has done this. */
3160 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3161 if (mode != 0) { /* hit cached lock */
3162 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3166 mutex_unlock(&lli->lli_layout_mutex);
3170 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3171 0, 0, LUSTRE_OPC_ANY, NULL);
3172 if (IS_ERR(op_data)) {
3173 mutex_unlock(&lli->lli_layout_mutex);
3174 RETURN(PTR_ERR(op_data));
3177 /* have to enqueue one */
3178 memset(&it, 0, sizeof(it));
3179 it.it_op = IT_LAYOUT;
3180 lockh.cookie = 0ULL;
3182 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3183 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3184 PFID(&lli->lli_fid));
3186 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3188 if (it.d.lustre.it_data != NULL)
3189 ptlrpc_req_finished(it.d.lustre.it_data);
3190 it.d.lustre.it_data = NULL;
3192 ll_finish_md_op_data(op_data);
3194 mode = it.d.lustre.it_lock_mode;
3195 it.d.lustre.it_lock_mode = 0;
3196 ll_intent_drop_lock(&it);
3199 /* set lock data in case this is a new lock */
3200 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3201 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3205 mutex_unlock(&lli->lli_layout_mutex);