replay的iolog解读

有2种可供回放的格式

  • blkparse生成的bin文件(二进制blktrace格式), 支持得比较多
  • 纯文本的iolog格式, 常用v2、v3版本

正常使用bin文件的方式如下

采集:

blktrace /dev/sdb1 
blkparse sdb1 -d dd.bin >/dev/null

回放IO (replay):

fio --direct=1 --read_iolog="dd.bin" --replay_redirect=/dev/sdc1  --name=replay --replay_no_stall=1 --numjobs=1 --ioengine=libaio --iodepth=32


fio --read_iolog=../bb.bin --filename=fio-rand-read --name=a

iolog使用方式

从目前的初步实验来看, 多个job时最好让每个job的write_iolog文件相互独立, 否则多个job同时追加写入同一个文件可能产生冲突, 导致部分记录格式错误.

得到各job独立的iolog文件后, 再使用 fio --read_iolog="<file1>:<file2>" --merge_blktrace_file="<output_file>" 把多个job的文件合并成一个. (注意: 按fio文档, merge_blktrace_file合并的是blktrace二进制文件, 纯文本iolog能否这样合并需要自行验证.)

合并之后就可以正常使用这1个文件通过read_iolog进行回放测试了.

fio Trace file format

rbd引擎生成的iolog是v2格式

文件的第一行固定为 fio version 2 iolog

之后每一行声明一个action, 共有两种行格式. 文件管理格式:

filename action
  • add
  • open
  • close
文件IO格式:

filename action offset length
  • action
    • wait
    • read
    • write
    • sync
    • datasync
    • trim

样例如下:

fio version 2 iolog
rbd_13.0.0 add
rbd_13.0.0 open
rbd_13.0.0 write 893865984 4096
rbd_13.0.0 write 9905799168 4096
rbd_13.0.0 write 6045495296 4096
rbd_13.0.0 write 5778386944 4096
rbd_13.0.0 write 9706029056 4096
rbd_13.0.0 write 1973067776 4096
rbd_13.0.0 write 3528716288 4096
rbd_13.0.0 write 6849687552 4096
rbd_13.0.0 write 2277048320 4096
rbd_13.0.0 write 7225700352 4096
rbd_13.0.0 write 5898452992 4096
rbd_13.0.0 write 5612314624 4096
rbd_13.0.0 write 10423967744 4096
rbd_13.0.0 write 8727756800 4096
rbd_13.0.0 write 5164285952 4096
rbd_13.0.0 write 4583624704 4096
rbd_13.0.0 write 4850122752 4096
rbd_13.0.0 write 86384640 4096
rbd_13.0.0 write 6490755072 4096
rbd_13.0.0 write 7782293504 4096
rbd_13.0.0 write 122646528 4096
rbd_13.0.0 write 8404697088 4096
rbd_13.0.0 write 1540767744 4096
rbd_13.0.0 write 206385152 4096
rbd_13.0.0 write 9246814208 4096
rbd_13.0.0 write 2709151744 4096
rbd_13.0.0 write 7710785536 4096
rbd_13.0.0 write 2957721600 4096
rbd_13.0.0 write 7532285952 4096
rbd_13.0.0 write 52547584 4096
rbd_13.0.0 write 4910313472 4096
rbd_13.0.0 write 4400508928 4096
rbd_13.0.0 write 1650491392 4096
rbd_13.0.0 write 2253017088 4096
rbd_13.0.0 write 8878170112 4096
rbd_13.0.0 write 7537848320 4096
rbd_13.0.0 write 9147822080 4096
rbd_13.0.0 write 4819779584 4096
rbd_13.0.0 write 907501568 4096
rbd_13.0.0 write 3035762688 4096
rbd_13.0.0 write 7090388992 4096
rbd_13.0.0 write 5126242304 4096
rbd_13.0.0 write 6447304704 4096
rbd_13.0.0 write 6967037952 4096
rbd_13.0.0 write 4684316672 4096
rbd_13.0.0 write 4559695872 4096

iolog基于magic解析2类格式

解析逻辑比较直观, 代码如下:

# iolog.c
/*
 * Excerpt of init_iolog() (parts elided with "..."). When a read_iolog file
 * is configured, resolve the file name for this thread and load it through
 * the blktrace reader or the plain iolog reader, depending on what
 * is_blktrace() reports.
 */
bool init_iolog(struct thread_data *td)
{
bool ret;

if (td->o.read_iolog_file) {
int need_swap;
/* Name looked up from the read_iolog setting using this thread's subjob number. */
char * fname = get_name_by_idx(td->o.read_iolog_file, td->subjob_number);

/*
* Check if it's a blktrace file and load that if possible.
* Otherwise assume it's a normal log file and load that.
*/
if (is_blktrace(fname, &need_swap)) {
/* Record that this log is a blktrace; need_swap is passed on to the reader. */
td->io_log_blktrace = 1;
ret = init_blktrace_read(td, fname, need_swap);
} else {
td->io_log_blktrace = 0;
ret = init_iolog_read(td, fname);
}
/* fname is released here, after the chosen reader has returned. */
free(fname);
...
}

...

# blktrace.c

/*
* Check if this is a blktrace binary data file. We read a single trace
* into memory and check for the magic signature.
*
* filename:  path of the file to probe.
* need_swap: output; set to 0 when the magic matches as read, set to 1 when
*            it only matches after a 32-bit byte swap. It is not written
*            when the function returns false.
*
* Returns true when the first record carries the blktrace magic, false when
* the file cannot be opened, the read fails or is short, or the magic does
* not match in either byte order.
*/
bool is_blktrace(const char *filename, int *need_swap)
{
struct blk_io_trace t;
int fd, ret;

fd = open(filename, O_RDONLY);
if (fd < 0)
return false;

/* Only one record is needed; the descriptor is closed before inspecting ret. */
ret = read(fd, &t, sizeof(t));
close(fd);

if (ret < 0) {
perror("read blktrace");
return false;
} else if (ret != sizeof(t)) {
log_err("fio: short read on blktrace file\n");
return false;
}

/*
* 0xffffff00 is a mask, not the magic itself: the low 8 bits of t.magic
* are ignored in the comparison (presumably they carry the trace format
* version -- confirm against the blktrace API header).
*/
if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) {
*need_swap = 0;
return true;
}

/*
* Maybe it needs to be endian swapped...
*/
t.magic = fio_swap32(t.magic);
if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) {
*need_swap = 1;
return true;
}

return false;
}

各大评测规范的测试集

SNIA - Storage Networking Industry Association: IOTTA Repository Home

从这里可以下载到对应的iolog.bin的replay文件.

ceph之rbd

iolog中action对应的rbd接口

分别对应哪些rbd接口呢?

  • add
  • open
    • rbd_open
  • close
    • 没用, 只是设置个标签
    • 当关闭引擎的时候触发fio_rbd_cleanup, 再调用底层shutdown
  • action
    • wait
      • fio iolog内自己实现
    • read
      • rbd_aio_read
    • write
      • rbd_aio_write
    • sync
      • rbd_aio_flush
    • datasync
    • trim
      • rbd_aio_discard

文件管理类action(open/close/unlink/add)在fio中由ipo_special处理, 代码如下:

/*
 * Handle a "special" io_piece, i.e. one whose ddir is DDIR_INVAL and which
 * carries a file action instead of an I/O.
 *
 * Returns 0 when ipo is not special (nothing is done), -1 when opening the
 * file fails, and 1 in every other case, including an unrecognised file
 * action, which is only logged.
 */
static int ipo_special(struct thread_data *td, struct io_piece *ipo)
{
struct fio_file *f;
int ret;

/*
* Not a special ipo
*/
if (ipo->ddir != DDIR_INVAL)
return 0;

f = td->files[ipo->fileno];

/* Apply the recorded delay, if any, before acting on the file. */
if (ipo->delay)
iolog_delay(td, ipo->delay);
if (fio_fill_issue_time(td))
fio_gettime(&td->last_issue, NULL);
switch (ipo->file_action) {
case FIO_LOG_OPEN_FILE:
/* With replay_redirect set, an already-open file is not opened again. */
if (td->o.replay_redirect && fio_file_open(f)) {
dprint(FD_FILE, "iolog: ignoring re-open of file %s\n",
f->file_name);
break;
}
ret = td_io_open_file(td, f);
if (!ret)
break;
/* Open failed: report the error on td and abort with -1. */
td_verror(td, ret, "iolog open file");
return -1;
case FIO_LOG_CLOSE_FILE:
td_io_close_file(td, f);
break;
case FIO_LOG_UNLINK_FILE:
td_io_unlink_file(td, f);
break;
case FIO_LOG_ADD_FILE:
/*
* Nothing to do
*/
break;
default:
/* Unknown action: logged, but the ipo is still reported as special. */
log_err("fio: bad file action %d\n", ipo->file_action);
break;
}

return 1;
}

read/write/trim/sync在rbd引擎中对应的librbd调用见fio_rbd_queue:

/*
 * Queue one io_u on the rbd image as an asynchronous librbd request.
 *
 * A completion is created for the io_u, then the request is dispatched on
 * io_u->ddir:
 *   DDIR_WRITE -> rbd_aio_write
 *   DDIR_READ  -> rbd_aio_read
 *   DDIR_TRIM  -> rbd_aio_discard
 *   DDIR_SYNC  -> rbd_aio_flush
 * Any other ddir value is rejected with -EINVAL.
 *
 * Returns FIO_Q_QUEUED when the request was submitted. On any failure the
 * positive error code is stored in io_u->error, reported via td_verror, and
 * FIO_Q_COMPLETED is returned.
 */
static enum fio_q_status fio_rbd_queue(struct thread_data *td,
struct io_u *io_u)
{
struct rbd_data *rbd = td->io_ops_data;
struct fio_rbd_iou *fri = io_u->engine_data;
int r = -1;

fio_ro_check(td, io_u);

/* Reset the per-io_u completion tracking flags before submitting. */
fri->io_seen = 0;
fri->io_complete = 0;

r = rbd_aio_create_completion(fri, _fio_rbd_finish_aiocb,
&fri->completion);
if (r < 0) {
log_err("rbd_aio_create_completion failed.\n");
/* No completion exists yet, so skip the release step. */
goto failed;
}

if (io_u->ddir == DDIR_WRITE) {
r = rbd_aio_write(rbd->image, io_u->offset, io_u->xfer_buflen,
io_u->xfer_buf, fri->completion);
if (r < 0) {
log_err("rbd_aio_write failed.\n");
goto failed_comp;
}

} else if (io_u->ddir == DDIR_READ) {
r = rbd_aio_read(rbd->image, io_u->offset, io_u->xfer_buflen,
io_u->xfer_buf, fri->completion);

if (r < 0) {
log_err("rbd_aio_read failed.\n");
goto failed_comp;
}
} else if (io_u->ddir == DDIR_TRIM) {
r = rbd_aio_discard(rbd->image, io_u->offset,
io_u->xfer_buflen, fri->completion);
if (r < 0) {
log_err("rbd_aio_discard failed.\n");
goto failed_comp;
}
} else if (io_u->ddir == DDIR_SYNC) {
r = rbd_aio_flush(rbd->image, fri->completion);
if (r < 0) {
/* NOTE(review): message says rbd_flush although the call is rbd_aio_flush. */
log_err("rbd_flush failed.\n");
goto failed_comp;
}
} else {
dprint(FD_IO, "%s: Warning: unhandled ddir: %d\n", __func__,
io_u->ddir);
r = -EINVAL;
goto failed_comp;
}

return FIO_Q_QUEUED;
failed_comp:
/* The completion was created but the request was not submitted: release it. */
rbd_aio_release(fri->completion);
failed:
/* r is negative here; io_u->error receives the positive error code. */
io_u->error = -r;
td_verror(td, io_u->error, "xfer");
return FIO_Q_COMPLETED;
}

采用librbd用户态接口访问时, 如何回放?

  1. rbd map映射出来, 读取接口
  2. lttng用户态采集

这里方案2社区有样例.

### lttng采集

RBD Replay — Ceph Documentation

Capture the trace. Make sure to capture pthread_id context:

mkdir -p traces
lttng create -o traces librbd
lttng enable-event -u 'librbd:*'
lttng add-context -u -t pthread_id
lttng start
# run RBD workload here
lttng stop

# Process the trace with rbd-replay-prep:

rbd-replay-prep traces/ust/uid/*/* replay.bin
# Replay the trace with rbd-replay. Use read-only until you know it’s doing what you want:

rbd-replay --read-only replay.bin

这里的rbd-replay基本上就是专门用来解析并回放由lttng采集、再经rbd-replay-prep处理得到的埋点数据的.

FAQ

fio的iolog 和blktrace/lttng捕获到的bin文件差异是什么?

这是两种不同的格式, fio代码里对它们分别处理: 先读取文件头的magic判断是否为blktrace二进制文件, 再决定走blktrace解析还是文本iolog解析.

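为什么我用blkparse得到的bin文件, 前4个字节(hexdump显示)是7407 6561, 而不是代码中的0xffffff00?

0xffffff00只是掩码, 不是magic本身. blktrace的magic是0x65617400(BLK_IO_TRACE_MAGIC), 最低1个字节存放版本号(目前为0x07), 所以文件头实际的32位值是0x65617407. 在小端机器上它按字节存为07 74 61 65, 而hexdump默认按16位小端分组显示, 于是看到7407 6561. 代码中的(t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC正是先屏蔽掉版本字节, 再比较magic.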

linux - How to make FIO replay a trace with multiple thread - Stack Overflow

fio不支持把单个trace拆成多线程回放, 只能先用merge_blktrace_file把多个trace合并成一个, 再进行回放.