GitHub Repository: torvalds/linux
Path: blob/master/fs/ceph/inode.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/writeback.h>
#include <linux/vmalloc.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
#include <linux/sort.h>
#include <linux/iversion.h>
#include <linux/fscrypt.h>

#include "super.h"
#include "mds_client.h"
#include "cache.h"
#include "crypto.h"
#include <linux/ceph/decode.h>

/*
 * Ceph inode operations
 *
 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
 * setattr, etc.), xattr helpers, and helpers for assimilating
 * metadata returned by the MDS into our cache.
 *
 * Also define helpers for doing asynchronous writeback, invalidation,
 * and truncation for the benefit of those who can't afford to block
 * (typically because they are in the message handler path).
 */

static const struct inode_operations ceph_symlink_iops;
static const struct inode_operations ceph_encrypted_symlink_iops;

static void ceph_inode_work(struct work_struct *work);

/*
 * find or create an inode, given the ceph ino number
 */
static int ceph_set_ino_cb(struct inode *inode, void *data)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);

	ci->i_vino = *(struct ceph_vino *)data;
	inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
	inode_set_iversion_raw(inode, 0);
	percpu_counter_inc(&mdsc->metric.total_inodes);

	return 0;
}

/*
 * Check if the parent inode matches the vino from directory reply info
 */
static inline bool ceph_vino_matches_parent(struct inode *parent,
					    struct ceph_vino vino)
{
	return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap;
}

/*
 * Validate that the directory inode referenced by @req->r_parent matches the
 * inode number and snapshot id contained in the reply's directory record. If
 * they do not match – which can theoretically happen if the parent dentry was
 * moved between the time the request was issued and the reply arrived – fall
 * back to looking up the correct inode in the inode cache.
 *
 * A reference is *always* returned. Callers that receive a different inode
 * than the original @parent are responsible for dropping the extra reference
 * once the reply has been processed.
 */
static struct inode *ceph_get_reply_dir(struct super_block *sb,
					struct inode *parent,
					struct ceph_mds_reply_info_parsed *rinfo)
{
	struct ceph_vino vino;

	if (unlikely(!rinfo->diri.in))
		return parent; /* nothing to compare against */

	/* If we didn't have a cached parent inode to begin with, just bail out. */
	if (!parent)
		return NULL;

	vino.ino = le64_to_cpu(rinfo->diri.in->ino);
	vino.snap = le64_to_cpu(rinfo->diri.in->snapid);

	if (likely(ceph_vino_matches_parent(parent, vino)))
		return parent; /* matches – use the original reference */

	/* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */
	WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
		  ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);

	return ceph_get_inode(sb, vino, NULL);
}

/**
 * ceph_new_inode - allocate a new inode in advance of an expected create
 * @dir: parent directory for new inode
 * @dentry: dentry that may eventually point to new inode
 * @mode: mode of new inode
 * @as_ctx: pointer to inherited security context
 *
 * Allocate a new inode in advance of an operation to create a new inode.
 * This allocates the inode and sets up the acl_sec_ctx with appropriate
 * info for the new inode.
 *
 * Returns a pointer to the new inode or an ERR_PTR.
 */
struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
			     umode_t *mode, struct ceph_acl_sec_ctx *as_ctx)
{
	int err;
	struct inode *inode;

	inode = new_inode(dir->i_sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;

	if (!S_ISLNK(*mode)) {
		err = ceph_pre_init_acls(dir, mode, as_ctx);
		if (err < 0)
			goto out_err;
	}

	inode->i_state = 0;
	inode->i_mode = *mode;

	err = ceph_security_init_secctx(dentry, *mode, as_ctx);
	if (err < 0)
		goto out_err;

	/*
	 * We'll skip setting fscrypt context for snapshots, leaving that for
	 * handle_reply().
	 */
	if (ceph_snap(dir) != CEPH_SNAPDIR) {
		err = ceph_fscrypt_prepare_context(dir, inode, as_ctx);
		if (err)
			goto out_err;
	}

	return inode;
out_err:
	iput(inode);
	return ERR_PTR(err);
}

void ceph_as_ctx_to_req(struct ceph_mds_request *req,
			struct ceph_acl_sec_ctx *as_ctx)
{
	if (as_ctx->pagelist) {
		req->r_pagelist = as_ctx->pagelist;
		as_ctx->pagelist = NULL;
	}
	ceph_fscrypt_as_ctx_to_req(req, as_ctx);
}

/**
 * ceph_get_inode - find or create/hash a new inode
 * @sb: superblock to search and allocate in
 * @vino: vino to search for
 * @newino: optional new inode to insert if one isn't found (may be NULL)
 *
 * Search for or insert a new inode into the hash for the given vino, and
 * return a reference to it. If @newino is non-NULL, its reference is
 * consumed.
 */
struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
			     struct inode *newino)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
	struct ceph_client *cl = mdsc->fsc->client;
	struct inode *inode;

	if (ceph_vino_is_reserved(vino))
		return ERR_PTR(-EREMOTEIO);

	if (newino) {
		inode = inode_insert5(newino, (unsigned long)vino.ino,
				      ceph_ino_compare, ceph_set_ino_cb, &vino);
		if (inode != newino)
			iput(newino);
	} else {
		inode = iget5_locked(sb, (unsigned long)vino.ino,
				     ceph_ino_compare, ceph_set_ino_cb, &vino);
	}

	if (!inode) {
		doutc(cl, "no inode found for %llx.%llx\n", vino.ino, vino.snap);
		return ERR_PTR(-ENOMEM);
	}

	doutc(cl, "on %llx=%llx.%llx got %p new %d\n",
	      ceph_present_inode(inode), ceph_vinop(inode), inode,
	      !!(inode->i_state & I_NEW));
	return inode;
}

/*
 * get/construct snapdir inode for a given directory
 */
struct inode *ceph_get_snapdir(struct inode *parent)
{
	struct ceph_client *cl = ceph_inode_to_client(parent);
	struct ceph_vino vino = {
		.ino = ceph_ino(parent),
		.snap = CEPH_SNAPDIR,
	};
	struct inode *inode = ceph_get_inode(parent->i_sb, vino, NULL);
	struct ceph_inode_info *ci = ceph_inode(inode);
	int ret = -ENOTDIR;

	if (IS_ERR(inode))
		return inode;

	if (!S_ISDIR(parent->i_mode)) {
		pr_warn_once_client(cl, "bad snapdir parent type (mode=0%o)\n",
				    parent->i_mode);
		goto err;
	}

	if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
		pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n",
				    inode->i_mode);
		goto err;
	}

	inode->i_mode = parent->i_mode;
	inode->i_uid = parent->i_uid;
	inode->i_gid = parent->i_gid;
	inode_set_mtime_to_ts(inode, inode_get_mtime(parent));
	inode_set_ctime_to_ts(inode, inode_get_ctime(parent));
	inode_set_atime_to_ts(inode, inode_get_atime(parent));
	ci->i_rbytes = 0;
	ci->i_btime = ceph_inode(parent)->i_btime;

#ifdef CONFIG_FS_ENCRYPTION
	/* if encrypted, just borrow fscrypt_auth from parent */
	if (IS_ENCRYPTED(parent)) {
		struct ceph_inode_info *pci = ceph_inode(parent);

		ci->fscrypt_auth = kmemdup(pci->fscrypt_auth,
					   pci->fscrypt_auth_len,
					   GFP_KERNEL);
		if (ci->fscrypt_auth) {
			inode->i_flags |= S_ENCRYPTED;
			ci->fscrypt_auth_len = pci->fscrypt_auth_len;
		} else {
			doutc(cl, "Failed to alloc snapdir fscrypt_auth\n");
			ret = -ENOMEM;
			goto err;
		}
	}
#endif
	if (inode->i_state & I_NEW) {
		inode->i_op = &ceph_snapdir_iops;
		inode->i_fop = &ceph_snapdir_fops;
		ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
		unlock_new_inode(inode);
	}

	return inode;
err:
	if ((inode->i_state & I_NEW))
		discard_new_inode(inode);
	else
		iput(inode);
	return ERR_PTR(ret);
}

const struct inode_operations ceph_file_iops = {
	.permission = ceph_permission,
	.setattr = ceph_setattr,
	.getattr = ceph_getattr,
	.listxattr = ceph_listxattr,
	.get_inode_acl = ceph_get_acl,
	.set_acl = ceph_set_acl,
};


/*
 * We use a 'frag tree' to keep track of the MDS's directory fragments
 * for a given inode (usually there is just a single fragment). We
 * need to know when a child frag is delegated to a new MDS, or when
 * it is flagged as replicated, so we can direct our requests
 * accordingly.
 */
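
/*
 * For orientation, a frag is packed into a u32 as 8 bits of split
 * depth ("bits") and 24 bits of value, using the most significant
 * bits of the value (see include/linux/ceph/ceph_frag.h). For
 * example, ceph_frag_make(1, 0x800000) == 0x01800000 covers the
 * upper half of the 24-bit hash space; splitting it one more level
 * yields the children 0x02800000 and 0x02c00000.
 */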

/*
 * find/create a frag in the tree
 */
static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
						    u32 f)
{
	struct inode *inode = &ci->netfs.inode;
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct ceph_inode_frag *frag;
	int c;

	p = &ci->i_fragtree.rb_node;
	while (*p) {
		parent = *p;
		frag = rb_entry(parent, struct ceph_inode_frag, node);
		c = ceph_frag_compare(f, frag->frag);
		if (c < 0)
			p = &(*p)->rb_left;
		else if (c > 0)
			p = &(*p)->rb_right;
		else
			return frag;
	}

	frag = kmalloc(sizeof(*frag), GFP_NOFS);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	frag->frag = f;
	frag->split_by = 0;
	frag->mds = -1;
	frag->ndist = 0;

	rb_link_node(&frag->node, parent, p);
	rb_insert_color(&frag->node, &ci->i_fragtree);

	doutc(cl, "added %p %llx.%llx frag %x\n", inode, ceph_vinop(inode), f);
	return frag;
}

/*
 * find a specific frag @f
 */
struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
{
	struct rb_node *n = ci->i_fragtree.rb_node;

	while (n) {
		struct ceph_inode_frag *frag =
			rb_entry(n, struct ceph_inode_frag, node);
		int c = ceph_frag_compare(f, frag->frag);
		if (c < 0)
			n = n->rb_left;
		else if (c > 0)
			n = n->rb_right;
		else
			return frag;
	}
	return NULL;
}

/*
 * Choose frag containing the given value @v. If @pfrag is
 * specified, copy the frag delegation info to the caller if
 * it is present.
 */
static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
			      struct ceph_inode_frag *pfrag, int *found)
{
	struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
	u32 t = ceph_frag_make(0, 0);
	struct ceph_inode_frag *frag;
	unsigned nway, i;
	u32 n;

	if (found)
		*found = 0;

	while (1) {
		WARN_ON(!ceph_frag_contains_value(t, v));
		frag = __ceph_find_frag(ci, t);
		if (!frag)
			break; /* t is a leaf */
		if (frag->split_by == 0) {
			if (pfrag)
				memcpy(pfrag, frag, sizeof(*pfrag));
			if (found)
				*found = 1;
			break;
		}

		/* choose child */
		nway = 1 << frag->split_by;
		doutc(cl, "frag(%x) %x splits by %d (%d ways)\n", v, t,
		      frag->split_by, nway);
		for (i = 0; i < nway; i++) {
			n = ceph_frag_make_child(t, frag->split_by, i);
			if (ceph_frag_contains_value(n, v)) {
				t = n;
				break;
			}
		}
		BUG_ON(i == nway);
	}
	doutc(cl, "frag(%x) = %x\n", v, t);

	return t;
}

u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
		     struct ceph_inode_frag *pfrag, int *found)
{
	u32 ret;
	mutex_lock(&ci->i_fragtree_mutex);
	ret = __ceph_choose_frag(ci, v, pfrag, found);
	mutex_unlock(&ci->i_fragtree_mutex);
	return ret;
}

/*
 * Process dirfrag (delegation) info from the mds. Include leaf
 * fragment in tree ONLY if ndist > 0. Otherwise, only
 * branches/splits are included in i_fragtree.
 */
static int ceph_fill_dirfrag(struct inode *inode,
			     struct ceph_mds_reply_dirfrag *dirinfo)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_frag *frag;
	u32 id = le32_to_cpu(dirinfo->frag);
	int mds = le32_to_cpu(dirinfo->auth);
	int ndist = le32_to_cpu(dirinfo->ndist);
	int diri_auth = -1;
	int i;
	int err = 0;

	spin_lock(&ci->i_ceph_lock);
	if (ci->i_auth_cap)
		diri_auth = ci->i_auth_cap->mds;
	spin_unlock(&ci->i_ceph_lock);

	if (mds == -1) /* CDIR_AUTH_PARENT */
		mds = diri_auth;

	mutex_lock(&ci->i_fragtree_mutex);
	if (ndist == 0 && mds == diri_auth) {
		/* no delegation info needed. */
		frag = __ceph_find_frag(ci, id);
		if (!frag)
			goto out;
		if (frag->split_by == 0) {
			/* tree leaf, remove */
			doutc(cl, "removed %p %llx.%llx frag %x (no ref)\n",
			      inode, ceph_vinop(inode), id);
			rb_erase(&frag->node, &ci->i_fragtree);
			kfree(frag);
		} else {
			/* tree branch, keep and clear */
			doutc(cl, "cleared %p %llx.%llx frag %x referral\n",
			      inode, ceph_vinop(inode), id);
			frag->mds = -1;
			frag->ndist = 0;
		}
		goto out;
	}


	/* find/add this frag to store mds delegation info */
	frag = __get_or_create_frag(ci, id);
	if (IS_ERR(frag)) {
		/* this is not the end of the world; we can continue
		   with bad/inaccurate delegation info */
		pr_err_client(cl, "ENOMEM on mds ref %p %llx.%llx fg %x\n",
			      inode, ceph_vinop(inode),
			      le32_to_cpu(dirinfo->frag));
		err = -ENOMEM;
		goto out;
	}

	frag->mds = mds;
	frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
	for (i = 0; i < frag->ndist; i++)
		frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
	doutc(cl, "%p %llx.%llx frag %x ndist=%d\n", inode,
	      ceph_vinop(inode), frag->frag, frag->ndist);

out:
	mutex_unlock(&ci->i_fragtree_mutex);
	return err;
}

static int frag_tree_split_cmp(const void *l, const void *r)
{
	struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
	struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
	return ceph_frag_compare(le32_to_cpu(ls->frag),
				 le32_to_cpu(rs->frag));
}

static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
{
	if (!frag)
		return f == ceph_frag_make(0, 0);
	if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
		return false;
	return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
}

static int ceph_fill_fragtree(struct inode *inode,
			      struct ceph_frag_tree_head *fragtree,
			      struct ceph_mds_reply_dirfrag *dirinfo)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_inode_frag *frag, *prev_frag = NULL;
	struct rb_node *rb_node;
	unsigned i, split_by, nsplits;
	u32 id;
	bool update = false;

	mutex_lock(&ci->i_fragtree_mutex);
	nsplits = le32_to_cpu(fragtree->nsplits);
	if (nsplits != ci->i_fragtree_nsplits) {
		update = true;
	} else if (nsplits) {
		i = get_random_u32_below(nsplits);
		id = le32_to_cpu(fragtree->splits[i].frag);
		if (!__ceph_find_frag(ci, id))
			update = true;
	} else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
		rb_node = rb_first(&ci->i_fragtree);
		frag = rb_entry(rb_node, struct ceph_inode_frag, node);
		if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node))
			update = true;
	}
	if (!update && dirinfo) {
		id = le32_to_cpu(dirinfo->frag);
		if (id != __ceph_choose_frag(ci, id, NULL, NULL))
			update = true;
	}
	if (!update)
		goto out_unlock;

	if (nsplits > 1) {
		sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
		     frag_tree_split_cmp, NULL);
	}

	doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
	rb_node = rb_first(&ci->i_fragtree);
	for (i = 0; i < nsplits; i++) {
		id = le32_to_cpu(fragtree->splits[i].frag);
		split_by = le32_to_cpu(fragtree->splits[i].by);
		if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
			pr_err_client(cl, "%p %llx.%llx invalid split %d/%u, "
				      "frag %x split by %d\n", inode,
				      ceph_vinop(inode), i, nsplits, id, split_by);
			continue;
		}
		frag = NULL;
		while (rb_node) {
			frag = rb_entry(rb_node, struct ceph_inode_frag, node);
			if (ceph_frag_compare(frag->frag, id) >= 0) {
				if (frag->frag != id)
					frag = NULL;
				else
					rb_node = rb_next(rb_node);
				break;
			}
			rb_node = rb_next(rb_node);
			/* delete stale split/leaf node */
			if (frag->split_by > 0 ||
			    !is_frag_child(frag->frag, prev_frag)) {
				rb_erase(&frag->node, &ci->i_fragtree);
				if (frag->split_by > 0)
					ci->i_fragtree_nsplits--;
				kfree(frag);
			}
			frag = NULL;
		}
		if (!frag) {
			frag = __get_or_create_frag(ci, id);
			if (IS_ERR(frag))
				continue;
		}
		if (frag->split_by == 0)
			ci->i_fragtree_nsplits++;
		frag->split_by = split_by;
		doutc(cl, " frag %x split by %d\n", frag->frag, frag->split_by);
		prev_frag = frag;
	}
	while (rb_node) {
		frag = rb_entry(rb_node, struct ceph_inode_frag, node);
		rb_node = rb_next(rb_node);
		/* delete stale split/leaf node */
		if (frag->split_by > 0 ||
		    !is_frag_child(frag->frag, prev_frag)) {
			rb_erase(&frag->node, &ci->i_fragtree);
			if (frag->split_by > 0)
				ci->i_fragtree_nsplits--;
			kfree(frag);
		}
	}
out_unlock:
	mutex_unlock(&ci->i_fragtree_mutex);
	return 0;
}

/*
 * initialize a newly allocated inode.
 */
struct inode *ceph_alloc_inode(struct super_block *sb)
{
	struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
	struct ceph_inode_info *ci;
	int i;

	ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS);
	if (!ci)
		return NULL;

	doutc(fsc->client, "%p\n", &ci->netfs.inode);

	/* Set parameters for the netfs library */
	netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false);

	spin_lock_init(&ci->i_ceph_lock);

	ci->i_version = 0;
	ci->i_inline_version = 0;
	ci->i_time_warp_seq = 0;
	ci->i_ceph_flags = 0;
	atomic64_set(&ci->i_ordered_count, 1);
	atomic64_set(&ci->i_release_count, 1);
	atomic64_set(&ci->i_complete_seq[0], 0);
	atomic64_set(&ci->i_complete_seq[1], 0);
	ci->i_symlink = NULL;

	ci->i_max_bytes = 0;
	ci->i_max_files = 0;

	memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
	memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
	RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);

	ci->i_fragtree = RB_ROOT;
	mutex_init(&ci->i_fragtree_mutex);

	ci->i_xattrs.blob = NULL;
	ci->i_xattrs.prealloc_blob = NULL;
	ci->i_xattrs.dirty = false;
	ci->i_xattrs.index = RB_ROOT;
	ci->i_xattrs.count = 0;
	ci->i_xattrs.names_size = 0;
	ci->i_xattrs.vals_size = 0;
	ci->i_xattrs.version = 0;
	ci->i_xattrs.index_version = 0;

	ci->i_caps = RB_ROOT;
	ci->i_auth_cap = NULL;
	ci->i_dirty_caps = 0;
	ci->i_flushing_caps = 0;
	INIT_LIST_HEAD(&ci->i_dirty_item);
	INIT_LIST_HEAD(&ci->i_flushing_item);
	ci->i_prealloc_cap_flush = NULL;
	INIT_LIST_HEAD(&ci->i_cap_flush_list);
	init_waitqueue_head(&ci->i_cap_wq);
	ci->i_hold_caps_max = 0;
	INIT_LIST_HEAD(&ci->i_cap_delay_list);
	INIT_LIST_HEAD(&ci->i_cap_snaps);
	ci->i_head_snapc = NULL;
	ci->i_snap_caps = 0;

	ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
	for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
		ci->i_nr_by_mode[i] = 0;

	mutex_init(&ci->i_truncate_mutex);
	ci->i_truncate_seq = 0;
	ci->i_truncate_size = 0;
	ci->i_truncate_pending = 0;
	ci->i_truncate_pagecache_size = 0;

	ci->i_max_size = 0;
	ci->i_reported_size = 0;
	ci->i_wanted_max_size = 0;
	ci->i_requested_max_size = 0;

	ci->i_pin_ref = 0;
	ci->i_rd_ref = 0;
	ci->i_rdcache_ref = 0;
	ci->i_wr_ref = 0;
	ci->i_wb_ref = 0;
	ci->i_fx_ref = 0;
	ci->i_wrbuffer_ref = 0;
	ci->i_wrbuffer_ref_head = 0;
	atomic_set(&ci->i_filelock_ref, 0);
	atomic_set(&ci->i_shared_gen, 1);
	ci->i_rdcache_gen = 0;
	ci->i_rdcache_revoking = 0;

	INIT_LIST_HEAD(&ci->i_unsafe_dirops);
	INIT_LIST_HEAD(&ci->i_unsafe_iops);
	spin_lock_init(&ci->i_unsafe_lock);

	ci->i_snap_realm = NULL;
	INIT_LIST_HEAD(&ci->i_snap_realm_item);
	INIT_LIST_HEAD(&ci->i_snap_flush_item);

	INIT_WORK(&ci->i_work, ceph_inode_work);
	ci->i_work_mask = 0;
	memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
#ifdef CONFIG_FS_ENCRYPTION
	ci->i_crypt_info = NULL;
	ci->fscrypt_auth = NULL;
	ci->fscrypt_auth_len = 0;
#endif
	return &ci->netfs.inode;
}

void ceph_free_inode(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);

	kfree(ci->i_symlink);
#ifdef CONFIG_FS_ENCRYPTION
	kfree(ci->fscrypt_auth);
#endif
	fscrypt_free_inode(inode);
	kmem_cache_free(ceph_inode_cachep, ci);
}

void ceph_evict_inode(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_frag *frag;
	struct rb_node *n;

	doutc(cl, "%p ino %llx.%llx\n", inode, ceph_vinop(inode));

	percpu_counter_dec(&mdsc->metric.total_inodes);

	netfs_wait_for_outstanding_io(inode);
	truncate_inode_pages_final(&inode->i_data);
	if (inode->i_state & I_PINNING_NETFS_WB)
		ceph_fscache_unuse_cookie(inode, true);
	clear_inode(inode);

	ceph_fscache_unregister_inode_cookie(ci);
	fscrypt_put_encryption_info(inode);

	__ceph_remove_caps(ci);

	if (__ceph_has_quota(ci, QUOTA_GET_ANY))
		ceph_adjust_quota_realms_count(inode, false);

	/*
	 * we may still have a snap_realm reference if there are stray
	 * caps in i_snap_caps.
	 */
	if (ci->i_snap_realm) {
		if (ceph_snap(inode) == CEPH_NOSNAP) {
			doutc(cl, " dropping residual ref to snap realm %p\n",
			      ci->i_snap_realm);
			ceph_change_snap_realm(inode, NULL);
		} else {
			ceph_put_snapid_map(mdsc, ci->i_snapid_map);
			ci->i_snap_realm = NULL;
		}
	}

	while ((n = rb_first(&ci->i_fragtree)) != NULL) {
		frag = rb_entry(n, struct ceph_inode_frag, node);
		rb_erase(n, &ci->i_fragtree);
		kfree(frag);
	}
	ci->i_fragtree_nsplits = 0;

	__ceph_destroy_xattrs(ci);
	if (ci->i_xattrs.blob)
		ceph_buffer_put(ci->i_xattrs.blob);
	if (ci->i_xattrs.prealloc_blob)
		ceph_buffer_put(ci->i_xattrs.prealloc_blob);

	ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
	ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
}
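
/*
 * i_blocks is in 512-byte units; calc_inode_blocks() below rounds the
 * byte size up, e.g. size 0 -> 0 blocks, 1 -> 1, 1024 -> 2.
 */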
static inline blkcnt_t calc_inode_blocks(u64 size)
{
	return (size + (1<<9) - 1) >> 9;
}

/*
 * Helpers to fill in size, ctime, mtime, and atime. We have to be
 * careful because either the client or MDS may have more up to date
 * info, depending on which capabilities are held, and whether
 * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
 * and size are monotonically increasing, except when utimes() or
 * truncate() increments the corresponding _seq values.)
 */
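/*
 * For example, with an unchanged truncate_seq a smaller MDS size must
 * not shrink i_size: a client holding Fw caps may have written past
 * the size the MDS last heard about.
 */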
int ceph_fill_file_size(struct inode *inode, int issued,
			u32 truncate_seq, u64 truncate_size, u64 size)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	int queue_trunc = 0;
	loff_t isize = i_size_read(inode);

	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
	    (truncate_seq == ci->i_truncate_seq && size > isize)) {
		doutc(cl, "size %lld -> %llu\n", isize, size);
		if (size > 0 && S_ISDIR(inode->i_mode)) {
			pr_err_client(cl, "non-zero size for directory\n");
			size = 0;
		}
		i_size_write(inode, size);
		inode->i_blocks = calc_inode_blocks(size);
		/*
		 * If we're expanding, then we should be able to just update
		 * the existing cookie.
		 */
		if (size > isize)
			ceph_fscache_update(inode);
		ci->i_reported_size = size;
		if (truncate_seq != ci->i_truncate_seq) {
			doutc(cl, "truncate_seq %u -> %u\n",
			      ci->i_truncate_seq, truncate_seq);
			ci->i_truncate_seq = truncate_seq;

			/* the MDS should have revoked these caps */
			WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD |
					       CEPH_CAP_FILE_LAZYIO));
			/*
			 * If we hold the relevant caps, or if we're not the
			 * only client referencing this file and don't hold
			 * those caps, then we need to check whether the file
			 * is currently open or mmapped.
			 */
			if ((issued & (CEPH_CAP_FILE_CACHE|
				       CEPH_CAP_FILE_BUFFER)) ||
			    mapping_mapped(inode->i_mapping) ||
			    __ceph_is_file_opened(ci)) {
				ci->i_truncate_pending++;
				queue_trunc = 1;
			}
		}
	}

	/*
	 * It's possible that the new sizes from two consecutive
	 * truncations land in the same fscrypt last block, in which
	 * case we still need to truncate the corresponding page caches
	 * anyway.
	 */
	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) {
		doutc(cl, "truncate_size %lld -> %llu, encrypted %d\n",
		      ci->i_truncate_size, truncate_size,
		      !!IS_ENCRYPTED(inode));

		ci->i_truncate_size = truncate_size;

		if (IS_ENCRYPTED(inode)) {
			doutc(cl, "truncate_pagecache_size %lld -> %llu\n",
			      ci->i_truncate_pagecache_size, size);
			ci->i_truncate_pagecache_size = size;
		} else {
			ci->i_truncate_pagecache_size = truncate_size;
		}
	}
	return queue_trunc;
}

void ceph_fill_file_time(struct inode *inode, int issued,
			 u64 time_warp_seq, struct timespec64 *ctime,
			 struct timespec64 *mtime, struct timespec64 *atime)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct timespec64 ictime = inode_get_ctime(inode);
	int warn = 0;

	if (issued & (CEPH_CAP_FILE_EXCL|
		      CEPH_CAP_FILE_WR|
		      CEPH_CAP_FILE_BUFFER|
		      CEPH_CAP_AUTH_EXCL|
		      CEPH_CAP_XATTR_EXCL)) {
		if (ci->i_version == 0 ||
		    timespec64_compare(ctime, &ictime) > 0) {
			doutc(cl, "ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
			      ictime.tv_sec, ictime.tv_nsec,
			      ctime->tv_sec, ctime->tv_nsec);
			inode_set_ctime_to_ts(inode, *ctime);
		}
		if (ci->i_version == 0 ||
		    ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
			/* the MDS did a utimes() */
			doutc(cl, "mtime %lld.%09ld -> %lld.%09ld tw %d -> %d\n",
			      inode_get_mtime_sec(inode),
			      inode_get_mtime_nsec(inode),
			      mtime->tv_sec, mtime->tv_nsec,
			      ci->i_time_warp_seq, (int)time_warp_seq);

			inode_set_mtime_to_ts(inode, *mtime);
			inode_set_atime_to_ts(inode, *atime);
			ci->i_time_warp_seq = time_warp_seq;
		} else if (time_warp_seq == ci->i_time_warp_seq) {
			struct timespec64 ts;

			/* nobody did utimes(); take the max */
			ts = inode_get_mtime(inode);
			if (timespec64_compare(mtime, &ts) > 0) {
				doutc(cl, "mtime %lld.%09ld -> %lld.%09ld inc\n",
				      ts.tv_sec, ts.tv_nsec,
				      mtime->tv_sec, mtime->tv_nsec);
				inode_set_mtime_to_ts(inode, *mtime);
			}
			ts = inode_get_atime(inode);
			if (timespec64_compare(atime, &ts) > 0) {
				doutc(cl, "atime %lld.%09ld -> %lld.%09ld inc\n",
				      ts.tv_sec, ts.tv_nsec,
				      atime->tv_sec, atime->tv_nsec);
				inode_set_atime_to_ts(inode, *atime);
			}
		} else if (issued & CEPH_CAP_FILE_EXCL) {
			/* we did a utimes(); ignore mds values */
		} else {
			warn = 1;
		}
	} else {
		/* we have no write|excl caps; whatever the MDS says is true */
		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
			inode_set_ctime_to_ts(inode, *ctime);
			inode_set_mtime_to_ts(inode, *mtime);
			inode_set_atime_to_ts(inode, *atime);
			ci->i_time_warp_seq = time_warp_seq;
		} else {
			warn = 1;
		}
	}
	if (warn) /* time_warp_seq shouldn't go backwards */
		doutc(cl, "%p mds time_warp_seq %llu < %u\n", inode,
		      time_warp_seq, ci->i_time_warp_seq);
}

#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
				    const char *encsym,
				    int enclen, u8 **decsym)
{
	struct ceph_client *cl = mdsc->fsc->client;
	int declen;
	u8 *sym;

	sym = kmalloc(enclen + 1, GFP_NOFS);
	if (!sym)
		return -ENOMEM;
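
	/*
	 * Note: base64 decoding shrinks the data (roughly 3 output bytes
	 * per 4 input bytes), so the enclen + 1 buffer above always has
	 * room for the decoded bytes plus a NUL terminator.
	 */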
	declen = ceph_base64_decode(encsym, enclen, sym);
	if (declen <= 0) {
		pr_err_client(cl,
			      "can't decode symlink (%d). Content: %.*s\n",
			      declen, enclen, encsym);
		kfree(sym);
		return -EIO;
	}
	/* NUL-terminate directly after the decoded bytes */
	sym[declen] = '\0';
	*decsym = sym;
	return declen;
}
#else
static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
				    const char *encsym,
				    int symlen, u8 **decsym)
{
	return -EOPNOTSUPP;
}
#endif

/*
 * Populate an inode based on info from mds. May be called on new or
 * existing inodes.
 */
int ceph_fill_inode(struct inode *inode, struct page *locked_page,
		    struct ceph_mds_reply_info_in *iinfo,
		    struct ceph_mds_reply_dirfrag *dirinfo,
		    struct ceph_mds_session *session, int cap_fmode,
		    struct ceph_cap_reservation *caps_reservation)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_client *cl = mdsc->fsc->client;
	struct ceph_mds_reply_inode *info = iinfo->in;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int issued, new_issued, info_caps;
	struct timespec64 mtime, atime, ctime;
	struct ceph_buffer *xattr_blob = NULL;
	struct ceph_buffer *old_blob = NULL;
	struct ceph_string *pool_ns = NULL;
	struct ceph_cap *new_cap = NULL;
	int err = 0;
	bool wake = false;
	bool queue_trunc = false;
	bool new_version = false;
	bool fill_inline = false;
	umode_t mode = le32_to_cpu(info->mode);
	dev_t rdev = le32_to_cpu(info->rdev);

	lockdep_assert_held(&mdsc->snap_rwsem);

	doutc(cl, "%p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode),
	      le64_to_cpu(info->version), ci->i_version);

	/* Once I_NEW is cleared, we can't change type or dev numbers */
	if (inode->i_state & I_NEW) {
		inode->i_mode = mode;
	} else {
		if (inode_wrong_type(inode, mode)) {
			pr_warn_once_client(cl,
				"inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
				ceph_vinop(inode), inode->i_mode, mode);
			return -ESTALE;
		}

		if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) {
			pr_warn_once_client(cl,
				"dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
				ceph_vinop(inode), MAJOR(inode->i_rdev),
				MINOR(inode->i_rdev), MAJOR(rdev),
				MINOR(rdev));
			return -ESTALE;
		}
	}

	info_caps = le32_to_cpu(info->cap.caps);

	/* prealloc new cap struct */
	if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) {
		new_cap = ceph_get_cap(mdsc, caps_reservation);
		if (!new_cap)
			return -ENOMEM;
	}

	/*
	 * prealloc xattr data, if it looks like we'll need it. only
	 * if len > 4 (meaning there are actually xattrs; the first 4
	 * bytes are the xattr count).
	 */
	if (iinfo->xattr_len > 4) {
		xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
		if (!xattr_blob)
			pr_err_client(cl, "ENOMEM xattr blob %d bytes\n",
				      iinfo->xattr_len);
	}

	if (iinfo->pool_ns_len > 0)
		pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
						     iinfo->pool_ns_len);

	if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
		ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));

	spin_lock(&ci->i_ceph_lock);

	/*
	 * The provided version is odd if the inode value is projected
	 * (unstable) and even if it is stable. Skip the update if we
	 * have newer stable info (ours >= theirs, e.g. due to racing
	 * mds replies), unless we are getting projected (unstable) info
	 * (in which case the version is odd, and we want ours > theirs).
	 *   us   them
	 *   2    2     skip
	 *   3    2     skip
	 *   3    3     update
	 */
	if (ci->i_version == 0 ||
	    ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
	     le64_to_cpu(info->version) > (ci->i_version & ~1)))
		new_version = true;

	/* Update change_attribute */
	inode_set_max_iversion_raw(inode, iinfo->change_attr);

	__ceph_caps_issued(ci, &issued);
	issued |= __ceph_caps_dirty(ci);
	new_issued = ~issued & info_caps;

	__ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);

#ifdef CONFIG_FS_ENCRYPTION
	if (iinfo->fscrypt_auth_len &&
	    ((inode->i_state & I_NEW) || (ci->fscrypt_auth_len == 0))) {
		kfree(ci->fscrypt_auth);
		ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
		ci->fscrypt_auth = iinfo->fscrypt_auth;
		iinfo->fscrypt_auth = NULL;
		iinfo->fscrypt_auth_len = 0;
		inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
	}
#endif

	if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
	    (issued & CEPH_CAP_AUTH_EXCL) == 0) {
		inode->i_mode = mode;
		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
		doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode,
		      ceph_vinop(inode), inode->i_mode,
		      from_kuid(&init_user_ns, inode->i_uid),
		      from_kgid(&init_user_ns, inode->i_gid));
		ceph_decode_timespec64(&ci->i_btime, &iinfo->btime);
		ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
	}

	/* directories have fl_stripe_unit set to zero */
	if (IS_ENCRYPTED(inode))
		inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
	else if (le32_to_cpu(info->layout.fl_stripe_unit))
		inode->i_blkbits =
			fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
	else
		inode->i_blkbits = CEPH_BLOCK_SHIFT;
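	/* e.g. a common 4 MiB stripe unit gives i_blkbits = fls(0x400000) - 1 = 22 */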

	if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
	    (issued & CEPH_CAP_LINK_EXCL) == 0)
		set_nlink(inode, le32_to_cpu(info->nlink));

	if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
		/* be careful with mtime, atime, size */
		ceph_decode_timespec64(&atime, &info->atime);
		ceph_decode_timespec64(&mtime, &info->mtime);
		ceph_decode_timespec64(&ctime, &info->ctime);
		ceph_fill_file_time(inode, issued,
				    le32_to_cpu(info->time_warp_seq),
				    &ctime, &mtime, &atime);
	}

	if (new_version || (info_caps & CEPH_CAP_FILE_SHARED)) {
		ci->i_files = le64_to_cpu(info->files);
		ci->i_subdirs = le64_to_cpu(info->subdirs);
	}

	if (new_version ||
	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
		u64 size = le64_to_cpu(info->size);
		s64 old_pool = ci->i_layout.pool_id;
		struct ceph_string *old_ns;

		ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
		old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
					lockdep_is_held(&ci->i_ceph_lock));
		rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);

		if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns)
			ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;

		pool_ns = old_ns;

		if (IS_ENCRYPTED(inode) && size &&
		    iinfo->fscrypt_file_len == sizeof(__le64)) {
			u64 fsize = __le64_to_cpu(*(__le64 *)iinfo->fscrypt_file);

			if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) {
				size = fsize;
			} else {
				pr_warn_client(cl,
					"fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
					size, fsize);
			}
		}

		queue_trunc = ceph_fill_file_size(inode, issued,
					le32_to_cpu(info->truncate_seq),
					le64_to_cpu(info->truncate_size),
					size);
		/* only update max_size on auth cap */
		if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
		    ci->i_max_size != le64_to_cpu(info->max_size)) {
			doutc(cl, "max_size %lld -> %llu\n",
			      ci->i_max_size, le64_to_cpu(info->max_size));
			ci->i_max_size = le64_to_cpu(info->max_size);
		}
	}

	/* layout and rstat are not tracked by capability, update them if
	 * the inode info is from auth mds */
	if (new_version || (info->cap.flags & CEPH_CAP_FLAG_AUTH)) {
		if (S_ISDIR(inode->i_mode)) {
			ci->i_dir_layout = iinfo->dir_layout;
			ci->i_rbytes = le64_to_cpu(info->rbytes);
			ci->i_rfiles = le64_to_cpu(info->rfiles);
			ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
			ci->i_dir_pin = iinfo->dir_pin;
			ci->i_rsnaps = iinfo->rsnaps;
			ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
		}
	}

	/* xattrs */
	/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
	if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
	    le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
		if (ci->i_xattrs.blob)
			old_blob = ci->i_xattrs.blob;
		ci->i_xattrs.blob = xattr_blob;
		if (xattr_blob)
			memcpy(ci->i_xattrs.blob->vec.iov_base,
			       iinfo->xattr_data, iinfo->xattr_len);
		ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
		ceph_forget_all_cached_acls(inode);
		ceph_security_invalidate_secctx(inode);
		xattr_blob = NULL;
	}

	/* finally update i_version */
	if (le64_to_cpu(info->version) > ci->i_version)
		ci->i_version = le64_to_cpu(info->version);

	inode->i_mapping->a_ops = &ceph_aops;

	switch (inode->i_mode & S_IFMT) {
	case S_IFIFO:
	case S_IFBLK:
	case S_IFCHR:
	case S_IFSOCK:
		inode->i_blkbits = PAGE_SHIFT;
		init_special_inode(inode, inode->i_mode, rdev);
		inode->i_op = &ceph_file_iops;
		break;
	case S_IFREG:
		inode->i_op = &ceph_file_iops;
		inode->i_fop = &ceph_file_fops;
		break;
	case S_IFLNK:
		if (!ci->i_symlink) {
			u32 symlen = iinfo->symlink_len;
			char *sym;

			spin_unlock(&ci->i_ceph_lock);

			if (IS_ENCRYPTED(inode)) {
				if (symlen != i_size_read(inode))
					pr_err_client(cl,
						"%p %llx.%llx BAD symlink size %lld\n",
						inode, ceph_vinop(inode),
						i_size_read(inode));

				err = decode_encrypted_symlink(mdsc, iinfo->symlink,
							       symlen, (u8 **)&sym);
				if (err < 0) {
					pr_err_client(cl,
						"decoding encrypted symlink failed: %d\n",
						err);
					goto out;
				}
				symlen = err;
				i_size_write(inode, symlen);
				inode->i_blocks = calc_inode_blocks(symlen);
			} else {
				if (symlen != i_size_read(inode)) {
					pr_err_client(cl,
						"%p %llx.%llx BAD symlink size %lld\n",
						inode, ceph_vinop(inode),
						i_size_read(inode));
					i_size_write(inode, symlen);
					inode->i_blocks = calc_inode_blocks(symlen);
				}

				err = -ENOMEM;
				sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
				if (!sym)
					goto out;
			}

			spin_lock(&ci->i_ceph_lock);
			if (!ci->i_symlink)
				ci->i_symlink = sym;
			else
				kfree(sym); /* lost a race */
		}

		if (IS_ENCRYPTED(inode)) {
			/*
			 * Encrypted symlinks need to be decrypted before we can
			 * cache their targets in i_link. Don't touch it here.
			 */
			inode->i_op = &ceph_encrypted_symlink_iops;
		} else {
			inode->i_link = ci->i_symlink;
			inode->i_op = &ceph_symlink_iops;
		}
		break;
	case S_IFDIR:
		inode->i_op = &ceph_dir_iops;
		inode->i_fop = &ceph_dir_fops;
		break;
	default:
		pr_err_client(cl, "%p %llx.%llx BAD mode 0%o\n", inode,
			      ceph_vinop(inode), inode->i_mode);
	}

	/* were we issued a capability? */
	if (info_caps) {
		if (ceph_snap(inode) == CEPH_NOSNAP) {
			ceph_add_cap(inode, session,
				     le64_to_cpu(info->cap.cap_id),
				     info_caps,
				     le32_to_cpu(info->cap.wanted),
				     le32_to_cpu(info->cap.seq),
				     le32_to_cpu(info->cap.mseq),
				     le64_to_cpu(info->cap.realm),
				     info->cap.flags, &new_cap);

			/* set dir completion flag? */
			if (S_ISDIR(inode->i_mode) &&
			    ci->i_files == 0 && ci->i_subdirs == 0 &&
			    (info_caps & CEPH_CAP_FILE_SHARED) &&
			    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
			    !__ceph_dir_is_complete(ci)) {
				doutc(cl, " marking %p complete (empty)\n",
				      inode);
				i_size_write(inode, 0);
				__ceph_dir_set_complete(ci,
					atomic64_read(&ci->i_release_count),
					atomic64_read(&ci->i_ordered_count));
			}

			wake = true;
		} else {
			doutc(cl, " %p got snap_caps %s\n", inode,
			      ceph_cap_string(info_caps));
			ci->i_snap_caps |= info_caps;
		}
	}

	if (iinfo->inline_version > 0 &&
	    iinfo->inline_version >= ci->i_inline_version) {
		int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
		ci->i_inline_version = iinfo->inline_version;
		if (ceph_has_inline_data(ci) &&
		    (locked_page || (info_caps & cache_caps)))
			fill_inline = true;
	}

	if (cap_fmode >= 0) {
		if (!info_caps)
			pr_warn_client(cl, "mds issued no caps on %llx.%llx\n",
				       ceph_vinop(inode));
		__ceph_touch_fmode(ci, mdsc, cap_fmode);
	}

	spin_unlock(&ci->i_ceph_lock);

	ceph_fscache_register_inode_cookie(inode);

	if (fill_inline)
		ceph_fill_inline_data(inode, locked_page,
				      iinfo->inline_data, iinfo->inline_len);

	if (wake)
		wake_up_all(&ci->i_cap_wq);

	/* queue truncate if we saw i_size decrease */
	if (queue_trunc)
		ceph_queue_vmtruncate(inode);

	/* populate frag tree */
	if (S_ISDIR(inode->i_mode))
		ceph_fill_fragtree(inode, &info->fragtree, dirinfo);

	/* update delegation info? */
	if (dirinfo)
		ceph_fill_dirfrag(inode, dirinfo);

	err = 0;
out:
	if (new_cap)
		ceph_put_cap(mdsc, new_cap);
	ceph_buffer_put(old_blob);
	ceph_buffer_put(xattr_blob);
	ceph_put_string(pool_ns);
	return err;
}

/*
 * caller should hold session s_mutex and dentry->d_lock.
 */
static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
				  struct ceph_mds_reply_lease *lease,
				  struct ceph_mds_session *session,
				  unsigned long from_time,
				  struct ceph_mds_session **old_lease_session)
{
	struct ceph_client *cl = ceph_inode_to_client(dir);
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	unsigned mask = le16_to_cpu(lease->mask);
	long unsigned duration = le32_to_cpu(lease->duration_ms);
	long unsigned ttl = from_time + (duration * HZ) / 1000;
	long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
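
	/*
	 * e.g. a 30000 ms lease granted at jiffies T gives ttl = T + 30*HZ,
	 * with renewal attempted from the halfway point (half_ttl).
	 */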
1401
1402
doutc(cl, "%p duration %lu ms ttl %lu\n", dentry, duration, ttl);
1403
1404
/* only track leases on regular dentries */
1405
if (ceph_snap(dir) != CEPH_NOSNAP)
1406
return;
1407
1408
if (mask & CEPH_LEASE_PRIMARY_LINK)
1409
di->flags |= CEPH_DENTRY_PRIMARY_LINK;
1410
else
1411
di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
1412
1413
di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
1414
if (!(mask & CEPH_LEASE_VALID)) {
1415
__ceph_dentry_dir_lease_touch(di);
1416
return;
1417
}
1418
1419
if (di->lease_gen == atomic_read(&session->s_cap_gen) &&
1420
time_before(ttl, di->time))
1421
return; /* we already have a newer lease. */
1422
1423
if (di->lease_session && di->lease_session != session) {
1424
*old_lease_session = di->lease_session;
1425
di->lease_session = NULL;
1426
}
1427
1428
if (!di->lease_session)
1429
di->lease_session = ceph_get_mds_session(session);
1430
di->lease_gen = atomic_read(&session->s_cap_gen);
1431
di->lease_seq = le32_to_cpu(lease->seq);
1432
di->lease_renew_after = half_ttl;
1433
di->lease_renew_from = 0;
1434
di->time = ttl;
1435
1436
__ceph_dentry_lease_touch(di);
1437
}
1438
1439
static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry,
1440
struct ceph_mds_reply_lease *lease,
1441
struct ceph_mds_session *session,
1442
unsigned long from_time)
1443
{
1444
struct ceph_mds_session *old_lease_session = NULL;
1445
spin_lock(&dentry->d_lock);
1446
__update_dentry_lease(dir, dentry, lease, session, from_time,
1447
&old_lease_session);
1448
spin_unlock(&dentry->d_lock);
1449
ceph_put_mds_session(old_lease_session);
1450
}
1451
1452
/*
1453
* update dentry lease without having parent inode locked
1454
*/
1455
static void update_dentry_lease_careful(struct dentry *dentry,
1456
struct ceph_mds_reply_lease *lease,
1457
struct ceph_mds_session *session,
1458
unsigned long from_time,
1459
char *dname, u32 dname_len,
1460
struct ceph_vino *pdvino,
1461
struct ceph_vino *ptvino)
1462
1463
{
1464
struct inode *dir;
1465
struct ceph_mds_session *old_lease_session = NULL;
1466
1467
spin_lock(&dentry->d_lock);
1468
/* make sure dentry's name matches target */
1469
if (dentry->d_name.len != dname_len ||
1470
memcmp(dentry->d_name.name, dname, dname_len))
1471
goto out_unlock;
1472
1473
dir = d_inode(dentry->d_parent);
1474
/* make sure parent matches dvino */
1475
if (!ceph_ino_compare(dir, pdvino))
1476
goto out_unlock;
1477
1478
/* make sure dentry's inode matches target. NULL ptvino means that
1479
* we expect a negative dentry */
1480
if (ptvino) {
1481
if (d_really_is_negative(dentry))
1482
goto out_unlock;
1483
if (!ceph_ino_compare(d_inode(dentry), ptvino))
1484
goto out_unlock;
1485
} else {
1486
if (d_really_is_positive(dentry))
1487
goto out_unlock;
1488
}
1489
1490
__update_dentry_lease(dir, dentry, lease, session,
1491
from_time, &old_lease_session);
1492
out_unlock:
1493
spin_unlock(&dentry->d_lock);
1494
ceph_put_mds_session(old_lease_session);
1495
}
1496
1497
/*
1498
* splice a dentry to an inode.
1499
* caller must hold directory i_rwsem for this to be safe.
1500
*/
1501
static int splice_dentry(struct dentry **pdn, struct inode *in)
1502
{
1503
struct ceph_client *cl = ceph_inode_to_client(in);
1504
struct dentry *dn = *pdn;
1505
struct dentry *realdn;
1506
1507
BUG_ON(d_inode(dn));
1508
1509
if (S_ISDIR(in->i_mode)) {
1510
/* If inode is directory, d_splice_alias() below will remove
1511
* 'realdn' from its origin parent. We need to ensure that
1512
* origin parent's readdir cache will not reference 'realdn'
1513
*/
1514
realdn = d_find_any_alias(in);
1515
if (realdn) {
1516
struct ceph_dentry_info *di = ceph_dentry(realdn);
1517
spin_lock(&realdn->d_lock);
1518
1519
realdn->d_op->d_prune(realdn);
1520
1521
di->time = jiffies;
1522
di->lease_shared_gen = 0;
1523
di->offset = 0;
1524
1525
spin_unlock(&realdn->d_lock);
1526
dput(realdn);
1527
}
1528
}
1529
1530
/* dn must be unhashed */
1531
if (!d_unhashed(dn))
1532
d_drop(dn);
1533
realdn = d_splice_alias(in, dn);
1534
if (IS_ERR(realdn)) {
1535
pr_err_client(cl, "error %ld %p inode %p ino %llx.%llx\n",
1536
PTR_ERR(realdn), dn, in, ceph_vinop(in));
1537
return PTR_ERR(realdn);
1538
}
1539
1540
if (realdn) {
1541
doutc(cl, "dn %p (%d) spliced with %p (%d) inode %p ino %llx.%llx\n",
1542
dn, d_count(dn), realdn, d_count(realdn),
1543
d_inode(realdn), ceph_vinop(d_inode(realdn)));
1544
dput(dn);
1545
*pdn = realdn;
1546
} else {
1547
BUG_ON(!ceph_dentry(dn));
1548
doutc(cl, "dn %p attached to %p ino %llx.%llx\n", dn,
1549
d_inode(dn), ceph_vinop(d_inode(dn)));
1550
}
1551
return 0;
1552
}
1553
1554
/*
1555
* Incorporate results into the local cache. This is either just
1556
* one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
1557
* after a lookup).
1558
*
1559
* A reply may contain
1560
* a directory inode along with a dentry.
1561
* and/or a target inode
1562
*
1563
* Called with snap_rwsem (read).
1564
*/
1565
int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
1566
{
1567
struct ceph_mds_session *session = req->r_session;
1568
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1569
struct inode *in = NULL;
1570
struct ceph_vino tvino, dvino;
1571
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
1572
struct ceph_client *cl = fsc->client;
1573
struct inode *parent_dir = NULL;
1574
int err = 0;
1575
1576
doutc(cl, "%p is_dentry %d is_target %d\n", req,
1577
rinfo->head->is_dentry, rinfo->head->is_target);
1578
1579
if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
1580
doutc(cl, "reply is empty!\n");
1581
if (rinfo->head->result == 0 && req->r_parent)
1582
ceph_invalidate_dir_request(req);
1583
return 0;
1584
}
1585
1586
if (rinfo->head->is_dentry) {
1587
/*
1588
* r_parent may be stale, in cases when R_PARENT_LOCKED is not set,
1589
* so we need to get the correct inode
1590
*/
1591
parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
1592
if (unlikely(IS_ERR(parent_dir))) {
1593
err = PTR_ERR(parent_dir);
1594
goto done;
1595
}
1596
if (parent_dir) {
1597
err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri,
1598
rinfo->dirfrag, session, -1,
1599
&req->r_caps_reservation);
1600
if (err < 0)
1601
goto done;
1602
} else {
1603
WARN_ON_ONCE(1);
1604
}
1605
1606
if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
1607
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
1608
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
1609
bool is_nokey = false;
1610
struct qstr dname;
1611
struct dentry *dn, *parent;
1612
struct fscrypt_str oname = FSTR_INIT(NULL, 0);
1613
struct ceph_fname fname = { .dir = parent_dir,
1614
.name = rinfo->dname,
1615
.ctext = rinfo->altname,
1616
.name_len = rinfo->dname_len,
1617
.ctext_len = rinfo->altname_len };
1618
1619
BUG_ON(!rinfo->head->is_target);
1620
BUG_ON(req->r_dentry);
1621
1622
parent = d_find_any_alias(parent_dir);
1623
BUG_ON(!parent);
1624
1625
err = ceph_fname_alloc_buffer(parent_dir, &oname);
1626
if (err < 0) {
1627
dput(parent);
1628
goto done;
1629
}
1630
1631
err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
1632
if (err < 0) {
1633
dput(parent);
1634
ceph_fname_free_buffer(parent_dir, &oname);
1635
goto done;
1636
}
1637
dname.name = oname.name;
1638
dname.len = oname.len;
1639
dname.hash = full_name_hash(parent, dname.name, dname.len);
1640
tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1641
tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1642
retry_lookup:
1643
dn = d_lookup(parent, &dname);
1644
doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
1645
parent, dname.len, dname.name, dn);
1646
1647
if (!dn) {
1648
dn = d_alloc(parent, &dname);
1649
doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
1650
dname.len, dname.name, dn);
1651
if (!dn) {
1652
dput(parent);
1653
ceph_fname_free_buffer(parent_dir, &oname);
1654
err = -ENOMEM;
1655
goto done;
1656
}
1657
if (is_nokey) {
1658
spin_lock(&dn->d_lock);
1659
dn->d_flags |= DCACHE_NOKEY_NAME;
1660
spin_unlock(&dn->d_lock);
1661
}
1662
err = 0;
1663
} else if (d_really_is_positive(dn) &&
1664
(ceph_ino(d_inode(dn)) != tvino.ino ||
1665
ceph_snap(d_inode(dn)) != tvino.snap)) {
1666
doutc(cl, " dn %p points to wrong inode %p\n",
1667
dn, d_inode(dn));
1668
ceph_dir_clear_ordered(parent_dir);
1669
d_delete(dn);
1670
dput(dn);
1671
goto retry_lookup;
1672
}
1673
ceph_fname_free_buffer(parent_dir, &oname);
1674
1675
req->r_dentry = dn;
1676
dput(parent);
1677
}
1678
}
1679
1680
if (rinfo->head->is_target) {
1681
/* Should be filled in by handle_reply */
1682
BUG_ON(!req->r_target_inode);
1683
1684
in = req->r_target_inode;
1685
err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
1686
NULL, session,
1687
(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
1688
!test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
1689
rinfo->head->result == 0) ? req->r_fmode : -1,
1690
&req->r_caps_reservation);
1691
if (err < 0) {
1692
pr_err_client(cl, "badness %p %llx.%llx\n", in,
1693
ceph_vinop(in));
1694
req->r_target_inode = NULL;
1695
if (in->i_state & I_NEW)
1696
discard_new_inode(in);
1697
else
1698
iput(in);
1699
goto done;
1700
}
1701
if (in->i_state & I_NEW)
1702
unlock_new_inode(in);
1703
}
1704
1705
/*
1706
* ignore null lease/binding on snapdir ENOENT, or else we
1707
* will have trouble splicing in the virtual snapdir later
1708
*/
1709
if (rinfo->head->is_dentry &&
1710
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
1711
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
1712
(rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
1713
fsc->mount_options->snapdir_name,
1714
req->r_dentry->d_name.len))) {
1715
/*
1716
* lookup link rename : null -> possibly existing inode
1717
* mknod symlink mkdir : null -> new inode
1718
* unlink : linked -> null
1719
*/
1720
struct inode *dir = req->r_parent;
1721
struct dentry *dn = req->r_dentry;
1722
bool have_dir_cap, have_lease;
1723
1724
BUG_ON(!dn);
1725
BUG_ON(!dir);
1726
BUG_ON(d_inode(dn->d_parent) != dir);
1727
1728
dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
1729
dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
1730
1731
BUG_ON(ceph_ino(dir) != dvino.ino);
1732
BUG_ON(ceph_snap(dir) != dvino.snap);
1733
1734
		/* do we have a lease on the whole dir? */
		have_dir_cap =
			(le32_to_cpu(rinfo->diri.in->cap.caps) &
			 CEPH_CAP_FILE_SHARED);

		/* do we have a dn lease? */
		have_lease = have_dir_cap ||
			le32_to_cpu(rinfo->dlease->duration_ms);
		if (!have_lease)
			doutc(cl, "no dentry lease or dir cap\n");

		/* rename? */
		if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
			struct inode *olddir = req->r_old_dentry_dir;
			BUG_ON(!olddir);

			doutc(cl, " src %p '%pd' dst %p '%pd'\n",
			      req->r_old_dentry, req->r_old_dentry, dn, dn);
			doutc(cl, "doing d_move %p -> %p\n", req->r_old_dentry, dn);

			/* d_move screws up sibling dentries' offsets */
			ceph_dir_clear_ordered(dir);
			ceph_dir_clear_ordered(olddir);

			d_move(req->r_old_dentry, dn);
			doutc(cl, " src %p '%pd' dst %p '%pd'\n",
			      req->r_old_dentry, req->r_old_dentry, dn, dn);

			/* ensure target dentry is invalidated, despite
			   rehashing bug in vfs_rename_dir */
			ceph_invalidate_dentry_lease(dn);

			doutc(cl, "dn %p gets new offset %lld\n",
			      req->r_old_dentry,
			      ceph_dentry(req->r_old_dentry)->offset);

			/* swap r_dentry and r_old_dentry in case that
			 * splice_dentry() gets called later. This is safe
			 * because no other place will use them */
			req->r_dentry = req->r_old_dentry;
			req->r_old_dentry = dn;
			dn = req->r_dentry;
		}

		/* null dentry? */
		if (!rinfo->head->is_target) {
			doutc(cl, "null dentry\n");
			if (d_really_is_positive(dn)) {
				doutc(cl, "d_delete %p\n", dn);
				ceph_dir_clear_ordered(dir);
				d_delete(dn);
			} else if (have_lease) {
				if (d_unhashed(dn))
					d_add(dn, NULL);
			}

			if (!d_unhashed(dn) && have_lease)
				update_dentry_lease(dir, dn,
						    rinfo->dlease, session,
						    req->r_request_started);
			goto done;
		}

		/* attach proper inode */
		if (d_really_is_negative(dn)) {
			ceph_dir_clear_ordered(dir);
			ihold(in);
			err = splice_dentry(&req->r_dentry, in);
			if (err < 0)
				goto done;
			dn = req->r_dentry;  /* may have spliced */
		} else if (d_really_is_positive(dn) && d_inode(dn) != in) {
			doutc(cl, " %p links to %p %llx.%llx, not %llx.%llx\n",
			      dn, d_inode(dn), ceph_vinop(d_inode(dn)),
			      ceph_vinop(in));
			d_invalidate(dn);
			have_lease = false;
		}

		if (have_lease) {
			update_dentry_lease(dir, dn,
					    rinfo->dlease, session,
					    req->r_request_started);
		}
		doutc(cl, " final dn %p\n", dn);
	} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
		    req->r_op == CEPH_MDS_OP_MKSNAP) &&
		   test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
		   !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
		struct inode *dir = req->r_parent;

		/* fill out a snapdir LOOKUPSNAP dentry */
		BUG_ON(!dir);
		BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
		BUG_ON(!req->r_dentry);
		doutc(cl, " linking snapped dir %p to dn %p\n", in,
		      req->r_dentry);
		ceph_dir_clear_ordered(dir);
		ihold(in);
		err = splice_dentry(&req->r_dentry, in);
		if (err < 0)
			goto done;
	} else if (rinfo->head->is_dentry && req->r_dentry) {
		/* parent inode is not locked, be careful */
		struct ceph_vino *ptvino = NULL;
		dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
		dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
		if (rinfo->head->is_target) {
			tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
			tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
			ptvino = &tvino;
		}
		update_dentry_lease_careful(req->r_dentry, rinfo->dlease,
					    session, req->r_request_started,
					    rinfo->dname, rinfo->dname_len,
					    &dvino, ptvino);
	}
done:
	/* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */
	if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent))
		iput(parent_dir);
	doutc(cl, "done err=%d\n", err);
	return err;
}

/*
 * Prepopulate our cache with readdir results, leases, etc.
 */
static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
					   struct ceph_mds_session *session)
{
	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
	struct ceph_client *cl = session->s_mdsc->fsc->client;
	int i, err = 0;

	for (i = 0; i < rinfo->dir_nr; i++) {
		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
		struct ceph_vino vino;
		struct inode *in;
		int rc;

		vino.ino = le64_to_cpu(rde->inode.in->ino);
		vino.snap = le64_to_cpu(rde->inode.in->snapid);

		in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL);
		if (IS_ERR(in)) {
			err = PTR_ERR(in);
			doutc(cl, "badness got %d\n", err);
			continue;
		}
		rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
				     -1, &req->r_caps_reservation);
		if (rc < 0) {
			pr_err_client(cl, "inode badness on %p got %d\n", in,
				      rc);
			err = rc;
			if (in->i_state & I_NEW) {
				ihold(in);
				discard_new_inode(in);
			}
		} else if (in->i_state & I_NEW) {
			unlock_new_inode(in);
		}

		iput(in);
	}

	return err;
}

void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
{
	if (ctl->folio) {
		folio_release_kmap(ctl->folio, ctl->dentries);
		ctl->folio = NULL;
	}
}

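/*
 * Cache a readdir dentry in the dir inode's page cache. Each page holds
 * PAGE_SIZE / sizeof(struct dentry *) slots, indexed by the readdir
 * position, so that a later readdir can walk the cached dentries without
 * another round trip to the MDS.
 */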
static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
			      struct ceph_readdir_cache_control *ctl,
			      struct ceph_mds_request *req)
{
	struct ceph_client *cl = ceph_inode_to_client(dir);
	struct ceph_inode_info *ci = ceph_inode(dir);
	unsigned nsize = PAGE_SIZE / sizeof(struct dentry*);
	unsigned idx = ctl->index % nsize;
	pgoff_t pgoff = ctl->index / nsize;

	if (!ctl->folio || pgoff != ctl->folio->index) {
		ceph_readdir_cache_release(ctl);
		fgf_t fgf = FGP_LOCK;

		if (idx == 0)
			fgf |= FGP_ACCESSED | FGP_CREAT;

		ctl->folio = __filemap_get_folio(&dir->i_data, pgoff,
				fgf, mapping_gfp_mask(&dir->i_data));
		if (IS_ERR(ctl->folio)) {
			int err = PTR_ERR(ctl->folio);

			ctl->folio = NULL;
			ctl->index = -1;
			return idx == 0 ? err : 0;
		}
		/* reading/filling the cache are serialized by
		 * i_rwsem, no need to use folio lock */
		folio_unlock(ctl->folio);
		ctl->dentries = kmap_local_folio(ctl->folio, 0);
		if (idx == 0)
			memset(ctl->dentries, 0, PAGE_SIZE);
	}

	if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
	    req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
		doutc(cl, "dn %p idx %d\n", dn, ctl->index);
		ctl->dentries[idx] = dn;
		ctl->index++;
	} else {
		doutc(cl, "disable readdir cache\n");
		ctl->index = -1;
	}
	return 0;
}

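/*
 * Populate dcache entries and inodes from a full readdir reply: assign
 * fpos offsets (hash-ordered or frag-ordered), look up or allocate each
 * dentry, fill the inode from the reply, update dentry leases, and feed
 * the readdir cache above. If the request was aborted, only the inodes
 * are instantiated (see readdir_prepopulate_inodes_only()).
 */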
int ceph_readdir_prepopulate(struct ceph_mds_request *req,
			     struct ceph_mds_session *session)
{
	struct dentry *parent = req->r_dentry;
	struct inode *inode = d_inode(parent);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
	struct ceph_client *cl = session->s_mdsc->fsc->client;
	struct qstr dname;
	struct dentry *dn;
	struct inode *in;
	int err = 0, skipped = 0, ret, i;
	u32 frag = le32_to_cpu(req->r_args.readdir.frag);
	u32 last_hash = 0;
	u32 fpos_offset;
	struct ceph_readdir_cache_control cache_ctl = {};

	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
		return readdir_prepopulate_inodes_only(req, session);

	if (rinfo->hash_order) {
		if (req->r_path2) {
			last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
						  req->r_path2,
						  strlen(req->r_path2));
			last_hash = ceph_frag_value(last_hash);
		} else if (rinfo->offset_hash) {
			/* mds understands offset_hash */
			WARN_ON_ONCE(req->r_readdir_offset != 2);
			last_hash = le32_to_cpu(req->r_args.readdir.offset_hash);
		}
	}

	if (rinfo->dir_dir &&
	    le32_to_cpu(rinfo->dir_dir->frag) != frag) {
		doutc(cl, "got new frag %x -> %x\n", frag,
		      le32_to_cpu(rinfo->dir_dir->frag));
		frag = le32_to_cpu(rinfo->dir_dir->frag);
		if (!rinfo->hash_order)
			req->r_readdir_offset = 2;
	}

	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
		doutc(cl, "%d items under SNAPDIR dn %p\n",
		      rinfo->dir_nr, parent);
	} else {
		doutc(cl, "%d items under dn %p\n", rinfo->dir_nr, parent);
		if (rinfo->dir_dir)
			ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);

		if (ceph_frag_is_leftmost(frag) &&
		    req->r_readdir_offset == 2 &&
		    !(rinfo->hash_order && last_hash)) {
			/* note dir version at start of readdir so we can
			 * tell if any dentries get dropped */
			req->r_dir_release_cnt =
				atomic64_read(&ci->i_release_count);
			req->r_dir_ordered_cnt =
				atomic64_read(&ci->i_ordered_count);
			req->r_readdir_cache_idx = 0;
		}
	}

	cache_ctl.index = req->r_readdir_cache_idx;
	fpos_offset = req->r_readdir_offset;

	/* FIXME: release caps/leases if error occurs */
	for (i = 0; i < rinfo->dir_nr; i++) {
		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
		struct ceph_vino tvino;

		dname.name = rde->name;
		dname.len = rde->name_len;
		dname.hash = full_name_hash(parent, dname.name, dname.len);

		tvino.ino = le64_to_cpu(rde->inode.in->ino);
		tvino.snap = le64_to_cpu(rde->inode.in->snapid);

		if (rinfo->hash_order) {
			u32 hash = ceph_frag_value(rde->raw_hash);
			if (hash != last_hash)
				fpos_offset = 2;
			last_hash = hash;
			rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
		} else {
			rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
		}

retry_lookup:
		dn = d_lookup(parent, &dname);
		doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
		      parent, dname.len, dname.name, dn);

		if (!dn) {
			dn = d_alloc(parent, &dname);
			doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
			      dname.len, dname.name, dn);
			if (!dn) {
				doutc(cl, "d_alloc badness\n");
				err = -ENOMEM;
				goto out;
			}
			if (rde->is_nokey) {
				spin_lock(&dn->d_lock);
				dn->d_flags |= DCACHE_NOKEY_NAME;
				spin_unlock(&dn->d_lock);
			}
		} else if (d_really_is_positive(dn) &&
			   (ceph_ino(d_inode(dn)) != tvino.ino ||
			    ceph_snap(d_inode(dn)) != tvino.snap)) {
			struct ceph_dentry_info *di = ceph_dentry(dn);
			doutc(cl, " dn %p points to wrong inode %p\n",
			      dn, d_inode(dn));

			spin_lock(&dn->d_lock);
			if (di->offset > 0 &&
			    di->lease_shared_gen ==
			    atomic_read(&ci->i_shared_gen)) {
				__ceph_dir_clear_ordered(ci);
				di->offset = 0;
			}
			spin_unlock(&dn->d_lock);

			d_delete(dn);
			dput(dn);
			goto retry_lookup;
		}

		/* inode */
		if (d_really_is_positive(dn)) {
			in = d_inode(dn);
		} else {
			in = ceph_get_inode(parent->d_sb, tvino, NULL);
			if (IS_ERR(in)) {
				doutc(cl, "new_inode badness\n");
				d_drop(dn);
				dput(dn);
				err = PTR_ERR(in);
				goto out;
			}
		}

		ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
				      -1, &req->r_caps_reservation);
		if (ret < 0) {
			pr_err_client(cl, "badness on %p %llx.%llx\n", in,
				      ceph_vinop(in));
			if (d_really_is_negative(dn)) {
				if (in->i_state & I_NEW) {
					ihold(in);
					discard_new_inode(in);
				}
				iput(in);
			}
			d_drop(dn);
			err = ret;
			goto next_item;
		}
		if (in->i_state & I_NEW)
			unlock_new_inode(in);

		if (d_really_is_negative(dn)) {
			if (ceph_security_xattr_deadlock(in)) {
				doutc(cl, " skip splicing dn %p to inode %p"
				      " (security xattr deadlock)\n", dn, in);
				iput(in);
				skipped++;
				goto next_item;
			}

			err = splice_dentry(&dn, in);
			if (err < 0)
				goto next_item;
		}

		ceph_dentry(dn)->offset = rde->offset;

		update_dentry_lease(d_inode(parent), dn,
				    rde->lease, req->r_session,
				    req->r_request_started);

		if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
			ret = fill_readdir_cache(d_inode(parent), dn,
						 &cache_ctl, req);
			if (ret < 0)
				err = ret;
		}
next_item:
		dput(dn);
	}
out:
	if (err == 0 && skipped == 0) {
		set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags);
		req->r_readdir_cache_idx = cache_ctl.index;
	}
	ceph_readdir_cache_release(&cache_ctl);
	doutc(cl, "done\n");
	return err;
}

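/*
 * Update the locally cached i_size. Returns true if the caller should
 * report the new size back to the MDS (per __ceph_should_report_size()).
 */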
bool ceph_inode_set_size(struct inode *inode, loff_t size)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	bool ret;

	spin_lock(&ci->i_ceph_lock);
	doutc(cl, "set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
	i_size_write(inode, size);
	ceph_fscache_update(inode);
	inode->i_blocks = calc_inode_blocks(size);

	ret = __ceph_should_report_size(ci);

	spin_unlock(&ci->i_ceph_lock);

	return ret;
}

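/*
 * Queue deferred per-inode work on fsc->inode_wq. A reference is taken
 * on the inode before queuing; if the work item was already pending,
 * the extra reference is dropped immediately. ceph_inode_work() drops
 * the reference once the work has run.
 */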
void ceph_queue_inode_work(struct inode *inode, int work_bit)
{
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	struct ceph_client *cl = fsc->client;
	struct ceph_inode_info *ci = ceph_inode(inode);
	set_bit(work_bit, &ci->i_work_mask);

	ihold(inode);
	if (queue_work(fsc->inode_wq, &ci->i_work)) {
		doutc(cl, "%p %llx.%llx mask=%lx\n", inode,
		      ceph_vinop(inode), ci->i_work_mask);
	} else {
		doutc(cl, "%p %llx.%llx already queued, mask=%lx\n",
		      inode, ceph_vinop(inode), ci->i_work_mask);
		iput(inode);
	}
}

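/*
 * Invalidate clean pagecache pages, typically because CEPH_CAP_FILE_CACHE
 * is being revoked. i_rdcache_gen/i_rdcache_revoking detect whether a
 * concurrent read re-populated the cache while we were invalidating.
 */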
static void ceph_do_invalidate_pages(struct inode *inode)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	u32 orig_gen;
	int check = 0;

	ceph_fscache_invalidate(inode, false);

	mutex_lock(&ci->i_truncate_mutex);

	if (ceph_inode_is_shutdown(inode)) {
		pr_warn_ratelimited_client(cl,
			"%p %llx.%llx is shut down\n", inode,
			ceph_vinop(inode));
		mapping_set_error(inode->i_mapping, -EIO);
		truncate_pagecache(inode, 0);
		mutex_unlock(&ci->i_truncate_mutex);
		goto out;
	}

	spin_lock(&ci->i_ceph_lock);
	doutc(cl, "%p %llx.%llx gen %d revoking %d\n", inode,
	      ceph_vinop(inode), ci->i_rdcache_gen, ci->i_rdcache_revoking);
	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
		if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
			check = 1;
		spin_unlock(&ci->i_ceph_lock);
		mutex_unlock(&ci->i_truncate_mutex);
		goto out;
	}
	orig_gen = ci->i_rdcache_gen;
	spin_unlock(&ci->i_ceph_lock);

	if (invalidate_inode_pages2(inode->i_mapping) < 0) {
		pr_err_client(cl, "invalidate_inode_pages2 %llx.%llx failed\n",
			      ceph_vinop(inode));
	}

	spin_lock(&ci->i_ceph_lock);
	if (orig_gen == ci->i_rdcache_gen &&
	    orig_gen == ci->i_rdcache_revoking) {
		doutc(cl, "%p %llx.%llx gen %d successful\n", inode,
		      ceph_vinop(inode), ci->i_rdcache_gen);
		ci->i_rdcache_revoking--;
		check = 1;
	} else {
		doutc(cl, "%p %llx.%llx gen %d raced, now %d revoking %d\n",
		      inode, ceph_vinop(inode), orig_gen, ci->i_rdcache_gen,
		      ci->i_rdcache_revoking);
		if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
			check = 1;
	}
	spin_unlock(&ci->i_ceph_lock);
	mutex_unlock(&ci->i_truncate_mutex);
out:
	if (check)
		ceph_check_caps(ci, 0);
}

/*
 * Make sure any pending truncation is applied before doing anything
 * that may depend on it.
 */
void __ceph_do_pending_vmtruncate(struct inode *inode)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	u64 to;
	int wrbuffer_refs, finish = 0;

	mutex_lock(&ci->i_truncate_mutex);
retry:
	spin_lock(&ci->i_ceph_lock);
	if (ci->i_truncate_pending == 0) {
		doutc(cl, "%p %llx.%llx none pending\n", inode,
		      ceph_vinop(inode));
		spin_unlock(&ci->i_ceph_lock);
		mutex_unlock(&ci->i_truncate_mutex);
		return;
	}

	/*
	 * make sure any dirty snapped pages are flushed before we
	 * possibly truncate them.. so write AND block!
	 */
	if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
		spin_unlock(&ci->i_ceph_lock);
		doutc(cl, "%p %llx.%llx flushing snaps first\n", inode,
		      ceph_vinop(inode));
		filemap_write_and_wait_range(&inode->i_data, 0,
					     inode->i_sb->s_maxbytes);
		goto retry;
	}

	/* there should be no reader or writer */
	WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);

	to = ci->i_truncate_pagecache_size;
	wrbuffer_refs = ci->i_wrbuffer_ref;
	doutc(cl, "%p %llx.%llx (%d) to %lld\n", inode, ceph_vinop(inode),
	      ci->i_truncate_pending, to);
	spin_unlock(&ci->i_ceph_lock);

	ceph_fscache_resize(inode, to);
	truncate_pagecache(inode, to);

	spin_lock(&ci->i_ceph_lock);
	if (to == ci->i_truncate_pagecache_size) {
		ci->i_truncate_pending = 0;
		finish = 1;
	}
	spin_unlock(&ci->i_ceph_lock);
	if (!finish)
		goto retry;

	mutex_unlock(&ci->i_truncate_mutex);

	if (wrbuffer_refs == 0)
		ceph_check_caps(ci, 0);

	wake_up_all(&ci->i_cap_wq);
}

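/*
 * Workqueue callback for deferred inode work (writeback, pagecache
 * invalidation, pending vmtruncate, cap checks, snap flushes). Drops
 * the inode reference taken in ceph_queue_inode_work().
 */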
static void ceph_inode_work(struct work_struct *work)
{
	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
						  i_work);
	struct inode *inode = &ci->netfs.inode;
	struct ceph_client *cl = ceph_inode_to_client(inode);

	if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
		doutc(cl, "writeback %p %llx.%llx\n", inode, ceph_vinop(inode));
		filemap_fdatawrite(&inode->i_data);
	}
	if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask))
		ceph_do_invalidate_pages(inode);

	if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask))
		__ceph_do_pending_vmtruncate(inode);

	if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask))
		ceph_check_caps(ci, 0);

	if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask))
		ceph_flush_snaps(ci, NULL);

	iput(inode);
}

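/*
 * ->get_link for encrypted symlinks. A NULL dentry means we were called
 * in RCU-walk mode; bail out with -ECHILD since fscrypt_get_symlink()
 * may sleep.
 */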
static const char *ceph_encrypted_get_link(struct dentry *dentry,
					   struct inode *inode,
					   struct delayed_call *done)
{
	struct ceph_inode_info *ci = ceph_inode(inode);

	if (!dentry)
		return ERR_PTR(-ECHILD);

	return fscrypt_get_symlink(inode, ci->i_symlink, i_size_read(inode),
				   done);
}

static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap,
					  const struct path *path,
					  struct kstat *stat, u32 request_mask,
					  unsigned int query_flags)
{
	int ret;

	ret = ceph_getattr(idmap, path, stat, request_mask, query_flags);
	if (ret)
		return ret;
	return fscrypt_symlink_getattr(path, stat);
}

/*
 * symlinks
 */
static const struct inode_operations ceph_symlink_iops = {
	.get_link = simple_get_link,
	.setattr = ceph_setattr,
	.getattr = ceph_getattr,
	.listxattr = ceph_listxattr,
};

static const struct inode_operations ceph_encrypted_symlink_iops = {
	.get_link = ceph_encrypted_get_link,
	.setattr = ceph_setattr,
	.getattr = ceph_encrypted_symlink_getattr,
	.listxattr = ceph_listxattr,
};

/*
 * Transfer the encrypted last block to the MDS so that it can update it
 * on our behalf when truncating to a smaller size.
 *
 * We don't support a PAGE_SIZE that is smaller than the
 * CEPH_FSCRYPT_BLOCK_SIZE.
 */
static int fill_fscrypt_truncate(struct inode *inode,
				 struct ceph_mds_request *req,
				 struct iattr *attr)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE;
	loff_t pos, orig_pos = round_down(attr->ia_size,
					  CEPH_FSCRYPT_BLOCK_SIZE);
	u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT;
	struct ceph_pagelist *pagelist = NULL;
	struct kvec iov = {0};
	struct iov_iter iter;
	struct page *page = NULL;
	struct ceph_fscrypt_truncate_size_header header;
	int retry_op = 0;
	int len = CEPH_FSCRYPT_BLOCK_SIZE;
	loff_t i_size = i_size_read(inode);
	int got, ret, issued;
	u64 objver;

	ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got);
	if (ret < 0)
		return ret;

	issued = __ceph_caps_issued(ci, NULL);

	doutc(cl, "size %lld -> %lld got cap refs on %s, issued %s\n",
	      i_size, attr->ia_size, ceph_cap_string(got),
	      ceph_cap_string(issued));

	/* Try to write back the dirty pagecache */
	if (issued & (CEPH_CAP_FILE_BUFFER)) {
		loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1;

		ret = filemap_write_and_wait_range(inode->i_mapping,
						   orig_pos, lend);
		if (ret < 0)
			goto out;
	}

	page = __page_cache_alloc(GFP_KERNEL);
	if (page == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	pagelist = ceph_pagelist_alloc(GFP_KERNEL);
	if (!pagelist) {
		ret = -ENOMEM;
		goto out;
	}

	iov.iov_base = kmap_local_page(page);
	iov.iov_len = len;
	iov_iter_kvec(&iter, READ, &iov, 1, len);

	pos = orig_pos;
	ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver);
	if (ret < 0)
		goto out;

	/* Insert the header first */
	header.ver = 1;
	header.compat = 1;
	header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode));

	/*
	 * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE, because
	 * the MDS may need it to do the truncate.
	 */
	header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE);

	/*
	 * If we hit a hole here, just skip filling the fscrypt payload
	 * for the request: once fscrypt is enabled the file is split
	 * into blocks of CEPH_FSCRYPT_BLOCK_SIZE, so any hole must be a
	 * multiple of the block size.
	 *
	 * If the RADOS object doesn't exist, objver is left set to 0.
	 */
	if (!objver) {
		doutc(cl, "hit hole, ppos %lld < size %lld\n", pos, i_size);

		header.data_len = cpu_to_le32(8 + 8 + 4);
		header.file_offset = 0;
		ret = 0;
	} else {
		header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE);
		header.file_offset = cpu_to_le64(orig_pos);

		doutc(cl, "encrypt block boff/bsize %d/%lu\n", boff,
		      CEPH_FSCRYPT_BLOCK_SIZE);

		/* truncate and zero out the extra contents for the last block */
		memset(iov.iov_base + boff, 0, PAGE_SIZE - boff);

		/* encrypt the last block */
		ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
							 CEPH_FSCRYPT_BLOCK_SIZE,
							 0, block);
		if (ret)
			goto out;
	}

	/* Insert the header */
	ret = ceph_pagelist_append(pagelist, &header, sizeof(header));
	if (ret)
		goto out;

	if (header.block_size) {
		/* Append the last block contents to pagelist */
		ret = ceph_pagelist_append(pagelist, iov.iov_base,
					   CEPH_FSCRYPT_BLOCK_SIZE);
		if (ret)
			goto out;
	}
	req->r_pagelist = pagelist;
out:
	doutc(cl, "%p %llx.%llx size dropping cap refs on %s\n", inode,
	      ceph_vinop(inode), ceph_cap_string(got));
	ceph_put_cap_refs(ci, got);
	if (iov.iov_base)
		kunmap_local(iov.iov_base);
	if (page)
		__free_pages(page, 0);
	if (ret && pagelist)
		ceph_pagelist_release(pagelist);
	return ret;
}

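/*
 * Apply attribute changes. If we hold the right caps we can apply them
 * locally and just dirty the caps; otherwise we send a SETATTR request
 * to the MDS. For encrypted inodes, truncating to a size that is not a
 * multiple of CEPH_FSCRYPT_BLOCK_SIZE requires the read-modify-write
 * payload built by fill_fscrypt_truncate() above, and the request is
 * retried on -EAGAIN if the last block changed before the MDS could
 * take the FILE xlock.
 */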
int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
		   struct iattr *attr, struct ceph_iattr *cia)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	unsigned int ia_valid = attr->ia_valid;
	struct ceph_mds_request *req;
	struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_cap_flush *prealloc_cf;
	loff_t isize = i_size_read(inode);
	int issued;
	int release = 0, dirtied = 0;
	int mask = 0;
	int err = 0;
	int inode_dirty_flags = 0;
	bool lock_snap_rwsem = false;
	bool fill_fscrypt;
	int truncate_retry = 20; /* The RMW will take around 50ms */
	struct dentry *dentry;
	char *path;
	bool do_sync = false;

	dentry = d_find_alias(inode);
	if (!dentry) {
		do_sync = true;
	} else {
		struct ceph_path_info path_info;
		path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
		if (IS_ERR(path)) {
			do_sync = true;
			err = 0;
		} else {
			err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
		}
		ceph_mdsc_free_path_info(&path_info);
		dput(dentry);

		/* For errors other than EACCES, let the MDS do the auth check */
		if (err == -EACCES) {
			return err;
		} else if (err < 0) {
			do_sync = true;
			err = 0;
		}
	}

retry:
	prealloc_cf = ceph_alloc_cap_flush();
	if (!prealloc_cf)
		return -ENOMEM;

	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
				       USE_AUTH_MDS);
	if (IS_ERR(req)) {
		ceph_free_cap_flush(prealloc_cf);
		return PTR_ERR(req);
	}

	fill_fscrypt = false;
	spin_lock(&ci->i_ceph_lock);
	issued = __ceph_caps_issued(ci, NULL);

	if (!ci->i_head_snapc &&
	    (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
		lock_snap_rwsem = true;
		if (!down_read_trylock(&mdsc->snap_rwsem)) {
			spin_unlock(&ci->i_ceph_lock);
			down_read(&mdsc->snap_rwsem);
			spin_lock(&ci->i_ceph_lock);
			issued = __ceph_caps_issued(ci, NULL);
		}
	}

	doutc(cl, "%p %llx.%llx issued %s\n", inode, ceph_vinop(inode),
	      ceph_cap_string(issued));
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
	if (cia && cia->fscrypt_auth) {
		u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth);

		if (len > sizeof(*cia->fscrypt_auth)) {
			err = -EINVAL;
			spin_unlock(&ci->i_ceph_lock);
			goto out;
		}

		doutc(cl, "%p %llx.%llx fscrypt_auth len %u to %u)\n", inode,
		      ceph_vinop(inode), ci->fscrypt_auth_len, len);

		/* It should never be re-set once set */
		WARN_ON_ONCE(ci->fscrypt_auth);

		if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
			dirtied |= CEPH_CAP_AUTH_EXCL;
			kfree(ci->fscrypt_auth);
			ci->fscrypt_auth = (u8 *)cia->fscrypt_auth;
			ci->fscrypt_auth_len = len;
		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
			   ci->fscrypt_auth_len != len ||
			   memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) {
			req->r_fscrypt_auth = cia->fscrypt_auth;
			mask |= CEPH_SETATTR_FSCRYPT_AUTH;
			release |= CEPH_CAP_AUTH_SHARED;
		}
		cia->fscrypt_auth = NULL;
	}
#else
	if (cia && cia->fscrypt_auth) {
		err = -EINVAL;
		spin_unlock(&ci->i_ceph_lock);
		goto out;
	}
#endif /* CONFIG_FS_ENCRYPTION */

	if (ia_valid & ATTR_UID) {
		kuid_t fsuid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid);

		doutc(cl, "%p %llx.%llx uid %d -> %d\n", inode,
		      ceph_vinop(inode),
		      from_kuid(&init_user_ns, inode->i_uid),
		      from_kuid(&init_user_ns, attr->ia_uid));
		if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
			inode->i_uid = fsuid;
			dirtied |= CEPH_CAP_AUTH_EXCL;
		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
			   !uid_eq(fsuid, inode->i_uid)) {
			req->r_args.setattr.uid = cpu_to_le32(
				from_kuid(&init_user_ns, fsuid));
			mask |= CEPH_SETATTR_UID;
			release |= CEPH_CAP_AUTH_SHARED;
		}
	}
	if (ia_valid & ATTR_GID) {
		kgid_t fsgid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid);

		doutc(cl, "%p %llx.%llx gid %d -> %d\n", inode,
		      ceph_vinop(inode),
		      from_kgid(&init_user_ns, inode->i_gid),
		      from_kgid(&init_user_ns, attr->ia_gid));
		if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
			inode->i_gid = fsgid;
			dirtied |= CEPH_CAP_AUTH_EXCL;
		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
			   !gid_eq(fsgid, inode->i_gid)) {
			req->r_args.setattr.gid = cpu_to_le32(
				from_kgid(&init_user_ns, fsgid));
			mask |= CEPH_SETATTR_GID;
			release |= CEPH_CAP_AUTH_SHARED;
		}
	}
	if (ia_valid & ATTR_MODE) {
		doutc(cl, "%p %llx.%llx mode 0%o -> 0%o\n", inode,
		      ceph_vinop(inode), inode->i_mode, attr->ia_mode);
		if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
			inode->i_mode = attr->ia_mode;
			dirtied |= CEPH_CAP_AUTH_EXCL;
		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
			   attr->ia_mode != inode->i_mode) {
			inode->i_mode = attr->ia_mode;
			req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
			mask |= CEPH_SETATTR_MODE;
			release |= CEPH_CAP_AUTH_SHARED;
		}
	}

	if (ia_valid & ATTR_ATIME) {
		struct timespec64 atime = inode_get_atime(inode);

		doutc(cl, "%p %llx.%llx atime %lld.%09ld -> %lld.%09ld\n",
		      inode, ceph_vinop(inode),
		      atime.tv_sec, atime.tv_nsec,
		      attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
		if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
			ci->i_time_warp_seq++;
			inode_set_atime_to_ts(inode, attr->ia_atime);
			dirtied |= CEPH_CAP_FILE_EXCL;
		} else if (!do_sync && (issued & CEPH_CAP_FILE_WR) &&
			   timespec64_compare(&atime,
					      &attr->ia_atime) < 0) {
			inode_set_atime_to_ts(inode, attr->ia_atime);
			dirtied |= CEPH_CAP_FILE_WR;
		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
			   !timespec64_equal(&atime, &attr->ia_atime)) {
			ceph_encode_timespec64(&req->r_args.setattr.atime,
					       &attr->ia_atime);
			mask |= CEPH_SETATTR_ATIME;
			release |= CEPH_CAP_FILE_SHARED |
				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
		}
	}
	if (ia_valid & ATTR_SIZE) {
		doutc(cl, "%p %llx.%llx size %lld -> %lld\n", inode,
		      ceph_vinop(inode), isize, attr->ia_size);
		/*
		 * The RMW is only needed when the new size is smaller
		 * and not aligned to CEPH_FSCRYPT_BLOCK_SIZE.
		 */
		if (IS_ENCRYPTED(inode) && attr->ia_size < isize &&
		    (attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) {
			mask |= CEPH_SETATTR_SIZE;
			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
			set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
			mask |= CEPH_SETATTR_FSCRYPT_FILE;
			req->r_args.setattr.size =
				cpu_to_le64(round_up(attr->ia_size,
						     CEPH_FSCRYPT_BLOCK_SIZE));
			req->r_args.setattr.old_size =
				cpu_to_le64(round_up(isize,
						     CEPH_FSCRYPT_BLOCK_SIZE));
			req->r_fscrypt_file = attr->ia_size;
			fill_fscrypt = true;
		} else if (!do_sync && (issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
			if (attr->ia_size > isize) {
				i_size_write(inode, attr->ia_size);
				inode->i_blocks = calc_inode_blocks(attr->ia_size);
				ci->i_reported_size = attr->ia_size;
				dirtied |= CEPH_CAP_FILE_EXCL;
				ia_valid |= ATTR_MTIME;
			}
		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
			   attr->ia_size != isize) {
			mask |= CEPH_SETATTR_SIZE;
			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
			if (IS_ENCRYPTED(inode) && attr->ia_size) {
				set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
				mask |= CEPH_SETATTR_FSCRYPT_FILE;
				req->r_args.setattr.size =
					cpu_to_le64(round_up(attr->ia_size,
							     CEPH_FSCRYPT_BLOCK_SIZE));
				req->r_args.setattr.old_size =
					cpu_to_le64(round_up(isize,
							     CEPH_FSCRYPT_BLOCK_SIZE));
				req->r_fscrypt_file = attr->ia_size;
			} else {
				req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
				req->r_args.setattr.old_size = cpu_to_le64(isize);
				req->r_fscrypt_file = 0;
			}
		}
	}
	if (ia_valid & ATTR_MTIME) {
		struct timespec64 mtime = inode_get_mtime(inode);

		doutc(cl, "%p %llx.%llx mtime %lld.%09ld -> %lld.%09ld\n",
		      inode, ceph_vinop(inode),
		      mtime.tv_sec, mtime.tv_nsec,
		      attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
		if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
			ci->i_time_warp_seq++;
			inode_set_mtime_to_ts(inode, attr->ia_mtime);
			dirtied |= CEPH_CAP_FILE_EXCL;
		} else if (!do_sync && (issued & CEPH_CAP_FILE_WR) &&
			   timespec64_compare(&mtime, &attr->ia_mtime) < 0) {
			inode_set_mtime_to_ts(inode, attr->ia_mtime);
			dirtied |= CEPH_CAP_FILE_WR;
		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
			   !timespec64_equal(&mtime, &attr->ia_mtime)) {
			ceph_encode_timespec64(&req->r_args.setattr.mtime,
					       &attr->ia_mtime);
			mask |= CEPH_SETATTR_MTIME;
			release |= CEPH_CAP_FILE_SHARED |
				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
		}
	}

	/* these do nothing */
	if (ia_valid & ATTR_CTIME) {
		bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
					 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
		doutc(cl, "%p %llx.%llx ctime %lld.%09ld -> %lld.%09ld (%s)\n",
		      inode, ceph_vinop(inode),
		      inode_get_ctime_sec(inode),
		      inode_get_ctime_nsec(inode),
		      attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
		      only ? "ctime only" : "ignored");
		if (only) {
			/*
			 * If the kernel wants to dirty ctime but nothing
			 * else, we need to choose a cap to dirty under, or
			 * do an almost-no-op setattr.
			 */
			if (issued & CEPH_CAP_AUTH_EXCL)
				dirtied |= CEPH_CAP_AUTH_EXCL;
			else if (issued & CEPH_CAP_FILE_EXCL)
				dirtied |= CEPH_CAP_FILE_EXCL;
			else if (issued & CEPH_CAP_XATTR_EXCL)
				dirtied |= CEPH_CAP_XATTR_EXCL;
			else
				mask |= CEPH_SETATTR_CTIME;
		}
	}
	if (ia_valid & ATTR_FILE)
		doutc(cl, "%p %llx.%llx ATTR_FILE ... hrm!\n", inode,
		      ceph_vinop(inode));

	if (dirtied) {
		inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
							   &prealloc_cf);
		inode_set_ctime_to_ts(inode, attr->ia_ctime);
		inode_inc_iversion_raw(inode);
	}

	release &= issued;
	spin_unlock(&ci->i_ceph_lock);
	if (lock_snap_rwsem) {
		up_read(&mdsc->snap_rwsem);
		lock_snap_rwsem = false;
	}

	if (inode_dirty_flags)
		__mark_inode_dirty(inode, inode_dirty_flags);

	if (mask) {
		req->r_inode = inode;
		ihold(inode);
		req->r_inode_drop = release;
		req->r_args.setattr.mask = cpu_to_le32(mask);
		req->r_num_caps = 1;
		req->r_stamp = attr->ia_ctime;
		if (fill_fscrypt) {
			err = fill_fscrypt_truncate(inode, req, attr);
			if (err)
				goto out;
		}

		/*
		 * The truncate request will return -EAGAIN when the
		 * last block has been updated just before the MDS
		 * successfully gets the xlock for the FILE lock. To
		 * avoid corrupting the file contents we need to retry
		 * it.
		 */
		err = ceph_mdsc_do_request(mdsc, NULL, req);
		if (err == -EAGAIN && truncate_retry--) {
			doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote), retry it!\n",
			      inode, ceph_vinop(inode), err,
			      ceph_cap_string(dirtied), mask);
			ceph_mdsc_put_request(req);
			ceph_free_cap_flush(prealloc_cf);
			goto retry;
		}
	}
out:
	doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote)\n", inode,
	      ceph_vinop(inode), err, ceph_cap_string(dirtied), mask);

	ceph_mdsc_put_request(req);
	ceph_free_cap_flush(prealloc_cf);

	if (err >= 0 && (mask & CEPH_SETATTR_SIZE))
		__ceph_do_pending_vmtruncate(inode);

	return err;
}

/*
 * setattr
 */
int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
		 struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	int err;

	if (ceph_snap(inode) != CEPH_NOSNAP)
		return -EROFS;

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	err = fscrypt_prepare_setattr(dentry, attr);
	if (err)
		return err;

	err = setattr_prepare(idmap, dentry, attr);
	if (err != 0)
		return err;

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size > max(i_size_read(inode), fsc->max_file_size))
		return -EFBIG;

	if ((attr->ia_valid & ATTR_SIZE) &&
	    ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
		return -EDQUOT;

	err = __ceph_setattr(idmap, inode, attr, NULL);

	if (err >= 0 && (attr->ia_valid & ATTR_MODE))
		err = posix_acl_chmod(idmap, dentry, attr->ia_mode);

	return err;
}

int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
{
	int issued = ceph_caps_issued(ceph_inode(inode));

	/*
	 * If any 'x' caps are issued we can just choose the auth MDS
	 * instead of a random replica MDS. A loner client can only hold
	 * 'x' caps while the Locker is in the LOCK_EXEC state, and if we
	 * send a getattr request to a replica MDS it must auth-pin and
	 * try to rdlock from the auth MDS, forcing the auth MDS to
	 * transition the Locker state to LOCK_SYNC and then back again.
	 *
	 * These Locker state transitions are expensive and usually
	 * require revoking caps from clients.
	 *
	 * For the 'Xs' caps needed by getxattr we also choose the auth
	 * MDS, because the MDS side is buggy: setxattr doesn't notify
	 * the replica MDSes when values change, so a replica may return
	 * stale values. Though this will be fixed in the MDS code,
	 * preferring the auth MDS still makes sense for older ceph.
	 */
	if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
	    || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR)))
		return USE_AUTH_MDS;
	else
		return USE_ANY_MDS;
}

/*
 * Verify that we have a lease on the given mask. If not,
 * do a getattr against an mds.
 */
int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
		      int mask, bool force)
{
	struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
	struct ceph_client *cl = fsc->client;
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int mode;
	int err;

	if (ceph_snap(inode) == CEPH_SNAPDIR) {
		doutc(cl, "inode %p %llx.%llx SNAPDIR\n", inode,
		      ceph_vinop(inode));
		return 0;
	}

	doutc(cl, "inode %p %llx.%llx mask %s mode 0%o\n", inode,
	      ceph_vinop(inode), ceph_cap_string(mask), inode->i_mode);
	if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
		return 0;

	mode = ceph_try_to_choose_auth_mds(inode, mask);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = inode;
	ihold(inode);
	req->r_num_caps = 1;
	req->r_args.getattr.mask = cpu_to_le32(mask);
	req->r_locked_page = locked_page;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (locked_page && err == 0) {
		u64 inline_version = req->r_reply_info.targeti.inline_version;
		if (inline_version == 0) {
			/* the reply is supposed to contain inline data */
			err = -EINVAL;
		} else if (inline_version == CEPH_INLINE_NONE ||
			   inline_version == 1) {
			err = -ENODATA;
		} else {
			err = req->r_reply_info.targeti.inline_len;
		}
	}
	ceph_mdsc_put_request(req);
	doutc(cl, "result=%d\n", err);
	return err;
}

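/*
 * Fetch a ceph virtual xattr (the ceph.* namespace) directly from the
 * MDS; requires CEPHFS_FEATURE_OP_GETVXATTR. With size == 0 only the
 * value length is returned, mirroring getxattr(2) semantics.
 */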
int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
		      size_t size)
{
	struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
	struct ceph_client *cl = fsc->client;
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int mode = USE_AUTH_MDS;
	int err;
	char *xattr_value;
	size_t xattr_value_len;

	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
	if (IS_ERR(req)) {
		err = -ENOMEM;
		goto out;
	}

	req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR;
	req->r_path2 = kstrdup(name, GFP_NOFS);
	if (!req->r_path2) {
		err = -ENOMEM;
		goto put;
	}

	ihold(inode);
	req->r_inode = inode;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (err < 0)
		goto put;

	xattr_value = req->r_reply_info.xattr_info.xattr_value;
	xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;

	doutc(cl, "xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);

	err = (int)xattr_value_len;
	if (size == 0)
		goto put;

	if (xattr_value_len > size) {
		err = -ERANGE;
		goto put;
	}

	memcpy(value, xattr_value, xattr_value_len);
put:
	ceph_mdsc_put_request(req);
out:
	doutc(cl, "result=%d\n", err);
	return err;
}

/*
 * Check inode permissions. We verify we have a valid value for
 * the AUTH cap, then call the generic handler.
 */
int ceph_permission(struct mnt_idmap *idmap, struct inode *inode,
		    int mask)
{
	int err;

	if (mask & MAY_NOT_BLOCK)
		return -ECHILD;

	err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);

	if (!err)
		err = generic_permission(idmap, inode, mask);
	return err;
}

/* Craft a mask of needed caps given a set of requested statx attrs. */
static int statx_to_caps(u32 want, umode_t mode)
{
	int mask = 0;

	if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE))
		mask |= CEPH_CAP_AUTH_SHARED;

	if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) {
		/*
		 * The link count for directories depends on inode->i_subdirs,
		 * and that is only updated when Fs caps are held.
		 */
		if (S_ISDIR(mode))
			mask |= CEPH_CAP_FILE_SHARED;
		else
			mask |= CEPH_CAP_LINK_SHARED;
	}

	if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE))
		mask |= CEPH_CAP_FILE_SHARED;

	if (want & (STATX_CTIME|STATX_CHANGE_COOKIE))
		mask |= CEPH_CAP_XATTR_SHARED;

	return mask;
}

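/*
 * For example, a statx() call asking only for STATX_SIZE on a regular
 * file maps to CEPH_CAP_FILE_SHARED, so a client already holding Fs
 * caps can answer from its cached metadata without an MDS round trip
 * (see ceph_getattr() below).
 */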
/*
 * Get all the attributes. If we have sufficient caps for the requested attrs,
 * then we can avoid talking to the MDS at all.
 */
int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
		 struct kstat *stat, u32 request_mask, unsigned int flags)
{
	struct inode *inode = d_inode(path->dentry);
	struct super_block *sb = inode->i_sb;
	struct ceph_inode_info *ci = ceph_inode(inode);
	u32 valid_mask = STATX_BASIC_STATS;
	int err = 0;

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	/* Skip the getattr altogether if we're asked not to sync */
	if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
		err = ceph_do_getattr(inode,
				      statx_to_caps(request_mask, inode->i_mode),
				      flags & AT_STATX_FORCE_SYNC);
		if (err)
			return err;
	}

	generic_fillattr(idmap, request_mask, inode, stat);
	stat->ino = ceph_present_inode(inode);

	/*
	 * btime on newly-allocated inodes is 0, so if this is still set to
	 * that, then assume that it's not valid.
	 */
	if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) {
		stat->btime = ci->i_btime;
		valid_mask |= STATX_BTIME;
	}

	if (request_mask & STATX_CHANGE_COOKIE) {
		stat->change_cookie = inode_peek_iversion_raw(inode);
		valid_mask |= STATX_CHANGE_COOKIE;
	}

	if (ceph_snap(inode) == CEPH_NOSNAP)
		stat->dev = sb->s_dev;
	else
		stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;

	if (S_ISDIR(inode->i_mode)) {
		if (ceph_test_mount_opt(ceph_sb_to_fs_client(sb), RBYTES)) {
			stat->size = ci->i_rbytes;
		} else if (ceph_snap(inode) == CEPH_SNAPDIR) {
			struct ceph_inode_info *pci;
			struct ceph_snap_realm *realm;
			struct inode *parent;

			parent = ceph_lookup_inode(sb, ceph_ino(inode));
			if (IS_ERR(parent))
				return PTR_ERR(parent);

			pci = ceph_inode(parent);
			spin_lock(&pci->i_ceph_lock);
			realm = pci->i_snap_realm;
			if (realm)
				stat->size = realm->num_snaps;
			else
				stat->size = 0;
			spin_unlock(&pci->i_ceph_lock);
			iput(parent);
		} else {
			stat->size = ci->i_files + ci->i_subdirs;
		}
		stat->blocks = 0;
		stat->blksize = 65536;
		/*
		 * Some applications rely on the st_nlink value of
		 * directories being either 0 (if unlinked) or
		 * 2 + number of subdirectories.
		 */
		if (stat->nlink == 1)
			/* '.' + '..' + subdirs */
			stat->nlink = 1 + 1 + ci->i_subdirs;
	}

	stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC;
	if (IS_ENCRYPTED(inode))
		stat->attributes |= STATX_ATTR_ENCRYPTED;
	stat->attributes_mask |= (STATX_ATTR_CHANGE_MONOTONIC |
				  STATX_ATTR_ENCRYPTED);

	stat->result_mask = request_mask & valid_mask;
	return err;
}

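/*
 * Forcibly tear down an inode's caps when the filesystem is shutting
 * down: mark the inode CEPH_I_SHUTDOWN, purge every cap, then queue
 * pagecache invalidation and drop any inode references the purged caps
 * were holding.
 */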
void ceph_inode_shutdown(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct rb_node *p;
	int iputs = 0;
	bool invalidate = false;

	spin_lock(&ci->i_ceph_lock);
	ci->i_ceph_flags |= CEPH_I_SHUTDOWN;
	p = rb_first(&ci->i_caps);
	while (p) {
		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);

		p = rb_next(p);
		iputs += ceph_purge_inode_cap(inode, cap, &invalidate);
	}
	spin_unlock(&ci->i_ceph_lock);

	if (invalidate)
		ceph_queue_invalidate(inode);
	while (iputs--)
		iput(inode);
}