8a19e00450
commit bc0939fcfab0d7efb2ed12896b1af3d819954a14 upstream. We have a race between marking that an inode needs to be logged, either at btrfs_set_inode_last_trans() or at btrfs_page_mkwrite(), and btrfs_sync_log(). The following steps describe how the race happens. 1) We are at transaction N; 2) Inode I was previously fsynced in the current transaction so it has: inode->logged_trans set to N; 3) The inode's root currently has: root->log_transid set to 1 root->last_log_commit set to 0 Which means only one log transaction was committed so far, log transaction 0. When a log tree is created we set ->log_transid and ->last_log_commit of its parent root to 0 (at btrfs_add_log_tree()); 4) One more range of pages is dirtied in inode I; 5) Some task A starts an fsync against some other inode J (same root), and so it joins log transaction 1. Before task A calls btrfs_sync_log()... 6) Task B starts an fsync against inode I, which currently has the full sync flag set, so it starts delalloc and waits for the ordered extent to complete before calling btrfs_inode_in_log() at btrfs_sync_file(); 7) During ordered extent completion we have btrfs_update_inode() called against inode I, which in turn calls btrfs_set_inode_last_trans(), which does the following: spin_lock(&inode->lock); inode->last_trans = trans->transaction->transid; inode->last_sub_trans = inode->root->log_transid; inode->last_log_commit = inode->root->last_log_commit; spin_unlock(&inode->lock); So ->last_trans is set to N and ->last_sub_trans set to 1. But before setting ->last_log_commit... 
8) Task A is at btrfs_sync_log(): - it increments root->log_transid to 2 - starts writeback for all log tree extent buffers - waits for the writeback to complete - writes the super blocks - updates root->last_log_commit to 1 There are a lot of slow steps between updating root->log_transid and root->last_log_commit; 9) The task doing the ordered extent completion, currently at btrfs_set_inode_last_trans(), then finally runs: inode->last_log_commit = inode->root->last_log_commit; spin_unlock(&inode->lock); Which results in inode->last_log_commit being set to 1. The ordered extent completes; 10) Task B is resumed, and it calls btrfs_inode_in_log() which returns true because we have all the following conditions met: inode->logged_trans == N which matches fs_info->generation && inode->last_sub_trans (1) <= inode->last_log_commit (1) && inode->last_sub_trans (1) <= root->last_log_commit (1) && list inode->extent_tree.modified_extents is empty And as a consequence we return without logging the inode, so the existing logged version of the inode does not point to the extent that was written after the previous fsync. It should be impossible in practice for one task to be able to make so much progress in btrfs_sync_log() while another task is at btrfs_set_inode_last_trans() right after it reads root->log_transid and before it reads root->last_log_commit. Even if kernel preemption is enabled we know the task at btrfs_set_inode_last_trans() cannot be preempted because it is holding the inode's spinlock. However there is another place where we do the same without holding the spinlock, which is in the memory mapped write path at: vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) { (...) BTRFS_I(inode)->last_trans = fs_info->generation; BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; (...) 
So with preemption happening after setting ->last_sub_trans and before setting ->last_log_commit, it is less of a stretch to have another task do enough progress at btrfs_sync_log() such that the task doing the memory mapped write ends up with ->last_sub_trans and ->last_log_commit set to the same value. It is still a big stretch to get there, as the task doing btrfs_sync_log() has to start writeback, wait for its completion and write the super blocks. So fix this in two different ways: 1) For btrfs_set_inode_last_trans(), simply set ->last_log_commit to the value of ->last_sub_trans minus 1; 2) For btrfs_page_mkwrite() only set the inode's ->last_sub_trans, just like we do for buffered and direct writes at btrfs_file_write_iter(), which is all we need to make sure multiple writes and fsyncs to an inode in the same transaction never result in an fsync missing that the inode changed and needs to be logged. Turn this into a helper function and use it both at btrfs_page_mkwrite() and at btrfs_file_write_iter() - this also fixes the problem that at btrfs_page_mkwrite() we were setting those fields without the protection of the inode's spinlock. This is an extremely unlikely race to happen in practice. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
386 lines
10 KiB
C
386 lines
10 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* Copyright (C) 2007 Oracle. All rights reserved.
|
|
*/
|
|
|
|
#ifndef BTRFS_INODE_H
|
|
#define BTRFS_INODE_H
|
|
|
|
#include <linux/hash.h>
|
|
#include "extent_map.h"
|
|
#include "extent_io.h"
|
|
#include "ordered-data.h"
|
|
#include "delayed-inode.h"
|
|
|
|
/*
 * Per-inode runtime flag bits.
 *
 * BTRFS_INODE_ORDERED_DATA_CLOSE is set by truncate when a file that used to
 * hold good data has been truncated to zero.  While it is set, the btrfs file
 * release call adds the inode to the ordered operations list, so that any new
 * data the application may have written is flushed out before commit.
 *
 * BTRFS_INODE_READDIO_NEED_LOCK is used as a bit number in
 * btrfs_inode::runtime_flags by btrfs_inode_block_unlocked_dio() and
 * btrfs_inode_resume_unlocked_dio().
 *
 * NOTE(review): the remaining values are presumably bit numbers for the same
 * runtime_flags word - confirm against their users.
 */
enum {
	BTRFS_INODE_ORDERED_DATA_CLOSE,
	BTRFS_INODE_DUMMY,
	BTRFS_INODE_IN_DEFRAG,
	BTRFS_INODE_HAS_ASYNC_EXTENT,
	BTRFS_INODE_NEEDS_FULL_SYNC,
	BTRFS_INODE_COPY_EVERYTHING,
	BTRFS_INODE_IN_DELALLOC_LIST,
	BTRFS_INODE_READDIO_NEED_LOCK,
	BTRFS_INODE_HAS_PROPS,
	BTRFS_INODE_SNAPSHOT_FLUSH,
};
|
|
|
|
/* in memory btrfs inode */
|
|
struct btrfs_inode {
|
|
/* which subvolume this inode belongs to */
|
|
struct btrfs_root *root;
|
|
|
|
/* key used to find this inode on disk. This is used by the code
|
|
* to read in roots of subvolumes
|
|
*/
|
|
struct btrfs_key location;
|
|
|
|
/*
|
|
* Lock for counters and all fields used to determine if the inode is in
|
|
* the log or not (last_trans, last_sub_trans, last_log_commit,
|
|
* logged_trans).
|
|
*/
|
|
spinlock_t lock;
|
|
|
|
/* the extent_tree has caches of all the extent mappings to disk */
|
|
struct extent_map_tree extent_tree;
|
|
|
|
/* the io_tree does range state (DIRTY, LOCKED etc) */
|
|
struct extent_io_tree io_tree;
|
|
|
|
/* special utility tree used to record which mirrors have already been
|
|
* tried when checksums fail for a given block
|
|
*/
|
|
struct extent_io_tree io_failure_tree;
|
|
|
|
/* held while logging the inode in tree-log.c */
|
|
struct mutex log_mutex;
|
|
|
|
/* held while doing delalloc reservations */
|
|
struct mutex delalloc_mutex;
|
|
|
|
/* used to order data wrt metadata */
|
|
struct btrfs_ordered_inode_tree ordered_tree;
|
|
|
|
/* list of all the delalloc inodes in the FS. There are times we need
|
|
* to write all the delalloc pages to disk, and this list is used
|
|
* to walk them all.
|
|
*/
|
|
struct list_head delalloc_inodes;
|
|
|
|
/* node for the red-black tree that links inodes in subvolume root */
|
|
struct rb_node rb_node;
|
|
|
|
unsigned long runtime_flags;
|
|
|
|
/* Keep track of who's O_SYNC/fsyncing currently */
|
|
atomic_t sync_writers;
|
|
|
|
/* full 64 bit generation number, struct vfs_inode doesn't have a big
|
|
* enough field for this.
|
|
*/
|
|
u64 generation;
|
|
|
|
/*
|
|
* transid of the trans_handle that last modified this inode
|
|
*/
|
|
u64 last_trans;
|
|
|
|
/*
|
|
* transid that last logged this inode
|
|
*/
|
|
u64 logged_trans;
|
|
|
|
/*
|
|
* log transid when this inode was last modified
|
|
*/
|
|
int last_sub_trans;
|
|
|
|
/* a local copy of root's last_log_commit */
|
|
int last_log_commit;
|
|
|
|
/* total number of bytes pending delalloc, used by stat to calc the
|
|
* real block usage of the file
|
|
*/
|
|
u64 delalloc_bytes;
|
|
|
|
/*
|
|
* Total number of bytes pending delalloc that fall within a file
|
|
* range that is either a hole or beyond EOF (and no prealloc extent
|
|
* exists in the range). This is always <= delalloc_bytes.
|
|
*/
|
|
u64 new_delalloc_bytes;
|
|
|
|
/*
|
|
* total number of bytes pending defrag, used by stat to check whether
|
|
* it needs COW.
|
|
*/
|
|
u64 defrag_bytes;
|
|
|
|
/*
|
|
* the size of the file stored in the metadata on disk. data=ordered
|
|
* means the in-memory i_size might be larger than the size on disk
|
|
* because not all the blocks are written yet.
|
|
*/
|
|
u64 disk_i_size;
|
|
|
|
/*
|
|
* if this is a directory then index_cnt is the counter for the index
|
|
* number for new files that are created
|
|
*/
|
|
u64 index_cnt;
|
|
|
|
/* Cache the directory index number to speed the dir/file remove */
|
|
u64 dir_index;
|
|
|
|
/* the fsync log has some corner cases that mean we have to check
|
|
* directories to see if any unlinks have been done before
|
|
* the directory was logged. See tree-log.c for all the
|
|
* details
|
|
*/
|
|
u64 last_unlink_trans;
|
|
|
|
/*
|
|
* Number of bytes outstanding that are going to need csums. This is
|
|
* used in ENOSPC accounting.
|
|
*/
|
|
u64 csum_bytes;
|
|
|
|
/* flags field from the on disk inode */
|
|
u32 flags;
|
|
|
|
/*
|
|
* Counters to keep track of the number of extent item's we may use due
|
|
* to delalloc and such. outstanding_extents is the number of extent
|
|
* items we think we'll end up using, and reserved_extents is the number
|
|
* of extent items we've reserved metadata for.
|
|
*/
|
|
unsigned outstanding_extents;
|
|
|
|
struct btrfs_block_rsv block_rsv;
|
|
|
|
/*
|
|
* Cached values of inode properties
|
|
*/
|
|
unsigned prop_compress; /* per-file compression algorithm */
|
|
/*
|
|
* Force compression on the file using the defrag ioctl, could be
|
|
* different from prop_compress and takes precedence if set
|
|
*/
|
|
unsigned defrag_compress;
|
|
|
|
struct btrfs_delayed_node *delayed_node;
|
|
|
|
/* File creation time. */
|
|
struct timespec64 i_otime;
|
|
|
|
/* Hook into fs_info->delayed_iputs */
|
|
struct list_head delayed_iput;
|
|
|
|
/*
|
|
* To avoid races between lockless (i_mutex not held) direct IO writes
|
|
* and concurrent fsync requests. Direct IO writes must acquire read
|
|
* access on this semaphore for creating an extent map and its
|
|
* corresponding ordered extent. The fast fsync path must acquire write
|
|
* access on this semaphore before it collects ordered extents and
|
|
* extent maps.
|
|
*/
|
|
struct rw_semaphore dio_sem;
|
|
|
|
struct inode vfs_inode;
|
|
};
|
|
|
|
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
|
|
{
|
|
return container_of(inode, struct btrfs_inode, vfs_inode);
|
|
}
|
|
|
|
/*
 * Compute the inode hash value for @objectid within @root: the objectid is
 * XORed with the root's objectid multiplied by GOLDEN_RATIO_PRIME.  When
 * long is 32 bits wide the two halves of the 64 bit value are XOR-folded
 * together before truncation.
 */
static inline unsigned long btrfs_inode_hash(u64 objectid,
					     const struct btrfs_root *root)
{
	const u64 root_mix = root->root_key.objectid * GOLDEN_RATIO_PRIME;
	u64 hash = objectid ^ root_mix;

	/* Compile-time constant condition, the branch vanishes on 64 bit. */
	if (BITS_PER_LONG == 32)
		hash = (hash >> 32) ^ (hash & 0xffffffff);

	return (unsigned long)hash;
}
|
|
|
|
static inline void btrfs_insert_inode_hash(struct inode *inode)
|
|
{
|
|
unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
|
|
|
|
__insert_inode_hash(inode, h);
|
|
}
|
|
|
|
static inline u64 btrfs_ino(const struct btrfs_inode *inode)
|
|
{
|
|
u64 ino = inode->location.objectid;
|
|
|
|
/*
|
|
* !ino: btree_inode
|
|
* type == BTRFS_ROOT_ITEM_KEY: subvol dir
|
|
*/
|
|
if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY)
|
|
ino = inode->vfs_inode.i_ino;
|
|
return ino;
|
|
}
|
|
|
|
/*
 * Set both the VFS i_size and the btrfs disk_i_size of @inode to @size.
 */
static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
{
	struct inode *vfs_inode = &inode->vfs_inode;

	i_size_write(vfs_inode, size);
	inode->disk_i_size = size;
}
|
|
|
|
static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
|
|
{
|
|
struct btrfs_root *root = inode->root;
|
|
|
|
if (root == root->fs_info->tree_root &&
|
|
btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
|
|
return true;
|
|
if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
static inline bool is_data_inode(struct inode *inode)
|
|
{
|
|
return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID;
|
|
}
|
|
|
|
static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
|
|
int mod)
|
|
{
|
|
lockdep_assert_held(&inode->lock);
|
|
inode->outstanding_extents += mod;
|
|
if (btrfs_is_free_space_inode(inode))
|
|
return;
|
|
trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode),
|
|
mod);
|
|
}
|
|
|
|
/*
|
|
* Called every time after doing a buffered, direct IO or memory mapped write.
|
|
*
|
|
* This is to ensure that if we write to a file that was previously fsynced in
|
|
* the current transaction, then try to fsync it again in the same transaction,
|
|
* we will know that there were changes in the file and that it needs to be
|
|
* logged.
|
|
*/
|
|
static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
|
|
{
|
|
spin_lock(&inode->lock);
|
|
inode->last_sub_trans = inode->root->log_transid;
|
|
spin_unlock(&inode->lock);
|
|
}
|
|
|
|
/*
 * Check whether @inode is already fully represented in the log for
 * transaction @generation.
 *
 * Returns 1 when the inode was logged in @generation, its last_sub_trans is
 * not newer than both its own and its root's last_log_commit, and its list
 * of modified extents is empty.  Returns 0 otherwise.
 *
 * All checks are made under inode->lock.
 */
static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
	int in_log = 0;
	bool logged;

	spin_lock(&inode->lock);
	logged = inode->logged_trans == generation &&
		 inode->last_sub_trans <= inode->last_log_commit &&
		 inode->last_sub_trans <= inode->root->last_log_commit;
	if (logged) {
		/*
		 * A ranged fsync may have left behind extent maps that fall
		 * outside its range.  Report "not in log" while the list is
		 * non-empty, so that btrfs_log_inode() gets called and
		 * processes those extent maps.
		 */
		smp_mb();
		in_log = list_empty(&inode->extent_tree.modified_extents) ?
			 1 : 0;
	}
	spin_unlock(&inode->lock);

	return in_log;
}
|
|
|
|
/* NOTE(review): presumably a bit value for btrfs_dio_private::flags - confirm against its users. */
#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1
|
|
|
|
struct btrfs_dio_private {
|
|
struct inode *inode;
|
|
unsigned long flags;
|
|
u64 logical_offset;
|
|
u64 disk_bytenr;
|
|
u64 bytes;
|
|
void *private;
|
|
|
|
/* number of bios pending for this dio */
|
|
atomic_t pending_bios;
|
|
|
|
/* IO errors */
|
|
int errors;
|
|
|
|
/* orig_bio is our btrfs_io_bio */
|
|
struct bio *orig_bio;
|
|
|
|
/* dio_bio came from fs/direct-io.c */
|
|
struct bio *dio_bio;
|
|
|
|
/*
|
|
* The original bio may be split to several sub-bios, this is
|
|
* done during endio of sub-bios
|
|
*/
|
|
blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *,
|
|
blk_status_t);
|
|
};
|
|
|
|
/*
|
|
* Disable DIO read nolock optimization, so new dio readers will be forced
|
|
* to grab i_mutex. It is used to avoid the endless truncate due to
|
|
* nonlocked dio read.
|
|
*/
|
|
static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode)
|
|
{
|
|
set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
|
|
smp_mb();
|
|
}
|
|
|
|
static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
|
|
{
|
|
smp_mb__before_atomic();
|
|
clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
|
|
}
|
|
|
|
/*
 * Format and matching arguments for printing a checksum: a variable-length
 * array of bytes shown in hexadecimal, e.g. 0x1234.  CSUM_FMT consumes two
 * arguments, the length followed by the buffer, which is exactly what
 * CSUM_FMT_VALUE() expands to.
 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
|
|
|
|
static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
|
|
u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
|
|
{
|
|
struct btrfs_root *root = inode->root;
|
|
struct btrfs_super_block *sb = root->fs_info->super_copy;
|
|
const u16 csum_size = btrfs_super_csum_size(sb);
|
|
|
|
/* Output minus objectid, which is more meaningful */
|
|
if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
|
|
btrfs_warn_rl(root->fs_info,
|
|
"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
|
|
root->root_key.objectid, btrfs_ino(inode),
|
|
logical_start,
|
|
CSUM_FMT_VALUE(csum_size, csum),
|
|
CSUM_FMT_VALUE(csum_size, csum_expected),
|
|
mirror_num);
|
|
else
|
|
btrfs_warn_rl(root->fs_info,
|
|
"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
|
|
root->root_key.objectid, btrfs_ino(inode),
|
|
logical_start,
|
|
CSUM_FMT_VALUE(csum_size, csum),
|
|
CSUM_FMT_VALUE(csum_size, csum_expected),
|
|
mirror_num);
|
|
}
|
|
|
|
#endif
|