论文阅读《SplitFS: Reducing Software Overhead in File Systems for Persistent Memory》

SplitFS发表在SOSP‘19上。

拦截POSIX调用

作者通过设置环境变量LD_PRELOAD,拦截POSIX调用:

1
export LD_PRELOAD=xxx

日志操作

SplitFS着重优化了追加操作。追加操作的日志项append_log_entry定义在头文件splitfs/log.h中。

1
2
3
4
5
6
7
8
9
struct append_log_entry {
uint32_t checksum;
uint32_t file_ino;
uint32_t dr_ino;
loff_t file_offset;
loff_t dr_offset;
size_t data_size; // long unsigned int
uint8_t padding[28];
};

$$
追加操作的日志项大小 = 4 \times 3 + 8 \times 2 + 8 + 28 = 64B
$$

校验和(checksum)

P499: First, to distinguish between valid and invalid or torn log entries, we incorporate a 4B transactional checksum within the 64B log entry.

校验和(占用4个字节)使用了循环冗余校验(Cyclic Redundancy Check, CRC)。

在CRC中,有几个重要的概念:

第一,模2运算。加法不进位,减法不借位,等价于异或运算(eXclusive OR,XOR)。

第二,生成多项式(Generator Polynomial)。

第三,余数初始值(Initial Remainder)。

第四,结果异或值(XOR)
$$
CRC =(D \ll 2^r) \oplus G
$$

其中,G的长度为r + 1。

校验和的实现代码如下所示:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
// 计算r的校验码
uint32_t crc32_for_byte(uint32_t r) {
for(int j = 0; j < 8; ++j)
r = (r & 1? 0: (uint32_t)0xEDB88320L) ^ r >> 1;
return r ^ (uint32_t)0xFF000000L;
}

/*
* data: 原始数据指针
* n_bytes: 原始数据占用的字节数
* crc: 校验和指针
*/
void create_crc32(const void *data, size_t n_bytes, uint32_t* crc) {
// 使用static修饰符,表明table只计算一次,后续调用不再计算
static uint32_t table[0x100];
if(!*table)
for(size_t i = 0; i < 0x100; ++i)
table[i] = crc32_for_byte(i);
for(size_t i = 0; i < n_bytes; ++i)
*crc = table[(uint8_t)*crc ^ ((uint8_t*)data)[i]] ^ *crc >> 8;
}

其中,0xEDB88320是CRC-32的生成多项式:

$$
\mathrm{poly} = 0xEDB88320 = 0b1110\ 1101\ 1011\ 1000\ 1000\ 0011\ 0010\ 0000 \\
$$

1
2
3
4
5
6
// 计算r的校验码
uint32_t crc32_for_byte(uint32_t r) {
for(int j = 0; j < 8; ++j)
r = (r & 1? 0: (uint32_t)0xEDB88320L) ^ r >> 1;
return r ^ (uint32_t)0xFF000000L;
}

将三目运算符改写为条件判断语句:

1
2
3
4
5
6
7
8
9
10
uint32_t crc32_for_byte(uint32_t r) {
for(int j = 0; j < 8; ++j) {
if (r & 1) {
r = r >> 1;
} else {
r = ((uint32_t)0xEDB88320L) ^ (r >> 1);
}
}
return r ^ (uint32_t)0xFF000000L;
}

CAS操作

P499: Second, we maintain a tail for the log in DRAM and concurrent threads use the tail as a synchronization variable. They use compare-and-swap to atomically advance the tail and write to their respective log entries concurrently.

比较并交换(Compare And Swap,简称CAS)操作:

1
2
3
4
type __sync_fetch_and_add(type *ptr, type value, ...);

bool __sync_bool_compare_and_swap(type *ptr, type oldval, type newval, ...);
type __sync_val_compare_and_swap(type *ptr, type oldval, type newval, ...);
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
void persist_append_entry(uint32_t file_ino,
uint32_t dr_ino,
loff_t file_off,
loff_t dr_off,
size_t size) {
loff_t log_off;
struct append_log_entry app_entry;
fetch_and_add:
log_off = __sync_fetch_and_add(&app_log_tail, APPEND_LOG_ENTRY_SIZE);
if (app_log_tail > app_log_lim) {
if (__sync_bool_compare_and_swap(&clearing_app_log, 0, 1))
sync_and_clear_app_log();
goto fetch_and_add;
}
app_entry.file_ino = file_ino;
app_entry.dr_ino = dr_ino;
app_entry.file_offset = file_off;
app_entry.dr_offset = dr_off;
app_entry.data_size = size;
create_crc32((void *) &(app_entry.file_ino), 32, &(app_entry.checksum));
//app_entry.checksum = checksum; // [TODO] Calculate Checksum
MEMCPY_NON_TEMPORAL((void *)app_log + log_off,
&app_entry,
APPEND_LOG_ENTRY_SIZE);
_mm_sfence();
#if NVM_DELAY
perfmodel_add_delay(0, APPEND_LOG_ENTRY_SIZE);
#endif
}

其中,日志文件的最大容量app_log_lim

1
2
3
4
5
6
7
// log.h
#define APPEND_LOG_SIZE (128*1024*1024)

loff_t app_log_lim;

// log.c > init_append_log()
app_log_lim = APPEND_LOG_SIZE;

P499: We employ a 128MB operation log file and if it becomes full, we checkpoint the state of the application by calling relink() on all the open files that have data in staging files. We then zero out the log and reuse it.

对应代码中的条件判断:

1
2
3
4
5
6
7
fetch_and_add:
log_off = __sync_fetch_and_add(&app_log_tail, APPEND_LOG_ENTRY_SIZE);
if (app_log_tail > app_log_lim) {
if (__sync_bool_compare_and_swap(&clearing_app_log, 0, 1))
sync_and_clear_app_log();
goto fetch_and_add;
}

fsync()

1
2
3
4
#include <unistd.h>

// 将内存中的修改同步到存储设备中
int fsync(int fd);

内存屏障

内存屏障(Memory Barrier),内存栅栏(Memory Fence)sfence

1
2
3
#include <emmintrin.h>

void _mm_sfence(void);

ext4中的函数ext4_ioctl()中的EXT4_IOC_MOVE_EXT命令。

https://github.com/torvalds/linux/blob/v4.13/fs/ext4/ioctl.c#L717

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
    case EXT4_IOC_MOVE_EXT: {
struct move_extent me;
struct fd donor;
int err;

if (!(filp->f_mode & FMODE_READ) ||
!(filp->f_mode & FMODE_WRITE))
return -EBADF;

if (copy_from_user(&me,
(struct move_extent __user *)arg, sizeof(me)))
return -EFAULT;
me.moved_len = 0;

donor = fdget(me.donor_fd);
if (!donor.file)
return -EBADF;

if (!(donor.file->f_mode & FMODE_WRITE)) {
err = -EBADF;
goto mext_out;
}

if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online defrag not supported with bigalloc");
err = -EOPNOTSUPP;
goto mext_out;
} else if (IS_DAX(inode)) {
ext4_msg(sb, KERN_ERR,
"Online defrag not supported with DAX");
err = -EOPNOTSUPP;
goto mext_out;
}

err = mnt_want_write_file(filp);
if (err)
goto mext_out;

err = ext4_move_extents(filp, donor.file, me.orig_start,
me.donor_start, me.len, &me.moved_len);
mnt_drop_write_file(filp);

if (copy_to_user((struct move_extent __user *)arg,
&me, sizeof(me)))
err = -EFAULT;
mext_out:
fdput(donor);
return err;
}

参考资料

1.论文:https://www.cs.utexas.edu/~vijay/papers/sosp19-splitfs.pdf

2.代码:https://github.com/utsaslab/SplitFS

3.libcrc:https://github.com/lammertb/libcrc


----------本文结束感谢您的阅读----------
坚持原创技术分享,您的支持将鼓励我继续创作!