b659ef0277
Commit 3a8b36f378 ("Btrfs: fix data loss in the fast fsync path") added a
performance regression that causes an unnecessary sync of the log trees
(fs/subvol and root log trees) when 2 consecutive fsyncs are done against a
file, without any writes or any metadata updates to the inode in between
them, and if a transaction is committed before the second fsync is called.

Huang Ying reported this to lkml (https://lkml.org/lkml/2015/3/18/99) after a
sysbench test that measured a -62% decrease of file io requests per second
for that test's workload.

The test is:

  echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
  echo performance > /sys/devices/system/cpu/cpu1/cpufreq/scaling_governor
  echo performance > /sys/devices/system/cpu/cpu2/cpufreq/scaling_governor
  echo performance > /sys/devices/system/cpu/cpu3/cpufreq/scaling_governor
  mkfs -t btrfs /dev/sda2
  mount -t btrfs /dev/sda2 /fs/sda2
  cd /fs/sda2
  for ((i = 0; i < 1024; i++)); do fallocate -l 67108864 testfile.$i; done
  sysbench --test=fileio --max-requests=0 --num-threads=4 --max-time=600 \
    --file-test-mode=rndwr --file-total-size=68719476736 --file-io-mode=sync \
    --file-num=1024 run

A test on a kvm guest, running a debug kernel, gave me the following results:

  Without 3a8b36f378:                16.01 reqs/sec
  With 3a8b36f378:                    3.39 reqs/sec
  With 3a8b36f378 and this patch:    16.04 reqs/sec

Reported-by: Huang Ying <ying.huang@intel.com>
Tested-by: Huang, Ying <ying.huang@intel.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
213 lines
6.8 KiB
C
213 lines
6.8 KiB
C
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
|
|
|
|
#ifndef __BTRFS_ORDERED_DATA__
|
|
#define __BTRFS_ORDERED_DATA__
|
|
|
|
/* one of these per inode */
|
|
struct btrfs_ordered_inode_tree {
|
|
spinlock_t lock;
|
|
struct rb_root tree;
|
|
struct rb_node *last;
|
|
};
|
|
|
|
struct btrfs_ordered_sum {
|
|
/* bytenr is the start of this extent on disk */
|
|
u64 bytenr;
|
|
|
|
/*
|
|
* this is the length in bytes covered by the sums array below.
|
|
*/
|
|
int len;
|
|
struct list_head list;
|
|
/* last field is a variable length array of csums */
|
|
u32 sums[];
|
|
};
|
|
|
|
/*
 * Bits for the flags field of struct btrfs_ordered_extent.  The values are
 * bit positions (0..10), not masks.
 *
 * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
 * It is used to make sure metadata is inserted into the tree only once
 * per extent.
 *
 * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
 * rbtree, just before waking any waiters.  It is used to indicate the
 * IO is done and any metadata is inserted into the tree.
 */

/* set when all the pages are written */
#define BTRFS_ORDERED_IO_DONE 0

/* set when removed from the tree */
#define BTRFS_ORDERED_COMPLETE 1

/* set when we want to write in place */
#define BTRFS_ORDERED_NOCOW 2

/* writing a compressed extent */
#define BTRFS_ORDERED_COMPRESSED 3

/* set when writing to a preallocated extent */
#define BTRFS_ORDERED_PREALLOC 4

/* set when we're doing DIO with this extent */
#define BTRFS_ORDERED_DIRECT 5

/* we had an io error when writing this out */
#define BTRFS_ORDERED_IOERR 6

/*
 * indicates whether this ordered extent has done its due diligence in
 * updating the isize
 */
#define BTRFS_ORDERED_UPDATED_ISIZE 7

/* we've logged the csums on this ordered extent */
#define BTRFS_ORDERED_LOGGED_CSUM 8

/* set when we have to truncate an extent */
#define BTRFS_ORDERED_TRUNCATED 9

/* set when we've waited on this ordered extent in the logging code */
#define BTRFS_ORDERED_LOGGED 10
|
|
struct btrfs_ordered_extent {
|
|
/* logical offset in the file */
|
|
u64 file_offset;
|
|
|
|
/* disk byte number */
|
|
u64 start;
|
|
|
|
/* ram length of the extent in bytes */
|
|
u64 len;
|
|
|
|
/* extent length on disk */
|
|
u64 disk_len;
|
|
|
|
/* number of bytes that still need writing */
|
|
u64 bytes_left;
|
|
|
|
/*
|
|
* the end of the ordered extent which is behind it but
|
|
* didn't update disk_i_size. Please see the comment of
|
|
* btrfs_ordered_update_i_size();
|
|
*/
|
|
u64 outstanding_isize;
|
|
|
|
/*
|
|
* If we get truncated we need to adjust the file extent we enter for
|
|
* this ordered extent so that we do not expose stale data.
|
|
*/
|
|
u64 truncated_len;
|
|
|
|
/* flags (described above) */
|
|
unsigned long flags;
|
|
|
|
/* compression algorithm */
|
|
int compress_type;
|
|
|
|
/* reference count */
|
|
atomic_t refs;
|
|
|
|
/* the inode we belong to */
|
|
struct inode *inode;
|
|
|
|
/* list of checksums for insertion when the extent io is done */
|
|
struct list_head list;
|
|
|
|
/* If we need to wait on this to be done */
|
|
struct list_head log_list;
|
|
|
|
/* If the transaction needs to wait on this ordered extent */
|
|
struct list_head trans_list;
|
|
|
|
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
|
|
wait_queue_head_t wait;
|
|
|
|
/* our friendly rbtree entry */
|
|
struct rb_node rb_node;
|
|
|
|
/* a per root list of all the pending ordered extents */
|
|
struct list_head root_extent_list;
|
|
|
|
struct btrfs_work work;
|
|
|
|
struct completion completion;
|
|
struct btrfs_work flush_work;
|
|
struct list_head work_list;
|
|
};
|
|
|
|
/*
|
|
* calculates the total size you need to allocate for an ordered sum
|
|
* structure spanning 'bytes' in the file
|
|
*/
|
|
static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
|
|
unsigned long bytes)
|
|
{
|
|
int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
|
|
return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
|
|
}
|
|
|
|
static inline void
|
|
btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
|
|
{
|
|
spin_lock_init(&t->lock);
|
|
t->tree = RB_ROOT;
|
|
t->last = NULL;
|
|
}
|
|
|
|
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
|
|
void btrfs_remove_ordered_extent(struct inode *inode,
|
|
struct btrfs_ordered_extent *entry);
|
|
int btrfs_dec_test_ordered_pending(struct inode *inode,
|
|
struct btrfs_ordered_extent **cached,
|
|
u64 file_offset, u64 io_size, int uptodate);
|
|
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
|
|
struct btrfs_ordered_extent **cached,
|
|
u64 *file_offset, u64 io_size,
|
|
int uptodate);
|
|
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
|
|
u64 start, u64 len, u64 disk_len, int type);
|
|
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
|
|
u64 start, u64 len, u64 disk_len, int type);
|
|
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
|
|
u64 start, u64 len, u64 disk_len,
|
|
int type, int compress_type);
|
|
void btrfs_add_ordered_sum(struct inode *inode,
|
|
struct btrfs_ordered_extent *entry,
|
|
struct btrfs_ordered_sum *sum);
|
|
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
|
|
u64 file_offset);
|
|
void btrfs_start_ordered_extent(struct inode *inode,
|
|
struct btrfs_ordered_extent *entry, int wait);
|
|
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
|
|
struct btrfs_ordered_extent *
|
|
btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
|
|
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
|
|
u64 file_offset,
|
|
u64 len);
|
|
bool btrfs_have_ordered_extents_in_range(struct inode *inode,
|
|
u64 file_offset,
|
|
u64 len);
|
|
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
|
|
struct btrfs_ordered_extent *ordered);
|
|
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
|
|
u32 *sum, int len);
|
|
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
|
|
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
|
|
void btrfs_get_logged_extents(struct inode *inode,
|
|
struct list_head *logged_list,
|
|
const loff_t start,
|
|
const loff_t end);
|
|
void btrfs_put_logged_extents(struct list_head *logged_list);
|
|
void btrfs_submit_logged_extents(struct list_head *logged_list,
|
|
struct btrfs_root *log);
|
|
void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
|
|
struct btrfs_root *log, u64 transid);
|
|
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
|
|
int __init ordered_data_init(void);
|
|
void ordered_data_exit(void);
|
|
#endif
|