APFS: mmap page fault can take up to minutes after ftruncate/F_PREALLOCATE
APFS: mmap page fault can take up to minutes after ftruncate/F_PREALLOCATE
- Subject: APFS: mmap page fault can take up to minutes after ftruncate/F_PREALLOCATE
- From: Ilia K via Filesystem-dev <email@hidden>
- Date: Thu, 19 Dec 2019 16:32:36 +0300
Hi!
I am investigating performance issues with our test cases on a Mac mini (2018,
Core i7 3.2GHz, 16GB RAM) running macOS Mojave 10.14.6.
Our storage uses file-backed memory mappings, and periodically, when the
storage gets too big, we increase the file size using the corresponding
function: CreateFileMapping on Windows, posix_fallocate on Linux. On macOS, we
emulate posix_fallocate() with a function that simply calls ftruncate().
It so happened that one of our test cases repeatedly allocates and mmap()'s
chunks of size >= 4KB, without reading/writing to them. (btw, page size and
block size are also 4K).
The problem is that sometimes we have unpredictable delays in page faults:
from tens of seconds to minutes. Usually it happens when accessing the
mmap()'ed addresses with offset ~2050-2090MB.
So I tried implementing posix_fallocate() in several different ways:
* ftruncate() -- the easiest one, works both on Linux and on macOS with
HFS+. But on APFS a page fault takes about 23 seconds.
* fcntl(F_SETSIZE) -- the worst page fault time is lower than with
ftruncate() (only 11 seconds), but it requires root privileges.
* fcntl(F_PREALLOCATE) -- I found 3 ways of using it in various open source
projects: #1 & #2 seem wrong to me (see the comments in my demo for
details), and #3 can cause a page fault lasting 10 minutes.
* pwrite() -- works slowly, but without obscenely long page faults if the step
size is 4K. Otherwise, we can also wait in pwrite() for 12 seconds, or get a
13-second page fault.
Here is my posix_fallocate(), the full demo code is in the attachments
(pagefault_test.c):
```C
// Emulation of posix_fallocate() for macOS: makes sure the file behind `fd` is
// at least `offset + len` bytes long, using the strategy selected at compile
// time by one of the IMPL_* macros (the build stops at #error if none is set).
//
// The file is held under an exclusive flock() while it is inspected and grown.
//
// Returns 0 on success, otherwise an errno value (the error is returned, as
// with the real posix_fallocate()):
//   EINVAL - offset is negative or len is not positive;
//   EFBIG  - offset + len does not fit in off_t;
//   other  - the errno of the system call that failed.
int posix_fallocate(int fd, off_t offset, off_t len) {
  // POSIX guarantees 8-bit bytes, so this is the largest value off_t can hold.
  const off_t off_max =
      (off_t)((((uintmax_t)1) << (sizeof(off_t) * 8 - 1)) - 1);

  if (offset < 0 || len <= 0) return EINVAL;
  // Reject the request before offset + len can overflow (signed overflow is UB).
  if (len > off_max - offset) return EFBIG;
  const off_t end = offset + len;

  if (flock(fd, LOCK_EX) != 0) return errno;

  struct stat stat_buf;
  int err_code = fstat(fd, &stat_buf) == 0 ? 0 : errno;
  if (err_code == 0 && end > stat_buf.st_size) {
#if defined(IMPL_FTRUNCATE)
    // btw, LLVM simply uses ftruncate when posix_fallocate is not available:
    // https://github.com/llvm/llvm-project/blob/b462cdff05b82071190e8bfd1078a2c76933b19b/llvm/lib/Support/Unix/Path.inc#L559
    err_code = ftruncate(fd, end) == 0 ? 0 : errno;
#elif defined(IMPL_FCNTL_SETSIZE)
    unsigned long long arg = (unsigned long long)end;
    err_code = fcntl(fd, F_SETSIZE, &arg) != -1 ? 0 : errno;
#elif defined(IMPL_FCNTL_PREALLOCATE)
    // I found several ways to use F_PREALLOCATE:
    //
    // 1. Starting from a specific offset:
    //      fstore_t store = { F_ALLOCATEALL, F_PEOFPOSMODE, offset, len };
    //    This way is used in Chromium
    //    (https://chromium.googlesource.com/chromium/src/+/7ca4a2b489b1dd4b5c9b0046d55193b900da06ea/base/files/file_util_posix.cc#901)
    //    and in the fallocate module for Python
    //    (https://github.com/trbs/fallocate/blob/9d7aae312ad0d1de6c6451193748e8e8c7e8230d/fallocate/_fallocatemodule.c#L59),
    //    but I get EINVAL if .fst_offset != 0.
    //
    // 2. Specifying the desired file size:
    //      fstore_t store = { F_ALLOCATEALL, F_PEOFPOSMODE, 0, offset + len };
    //    Examples: Mozilla
    //    https://hg.mozilla.org/mozilla-central/file/3d846420a907/xpcom/glue/FileUtils.cpp#l61
    //    (copies here:
    //    https://github.com/mozilla/universal-search-gecko-dev/blob/33e34ae066dbdb35ff6889973e21a38792991f35/xpcom/glue/FileUtils.cpp,
    //    https://github.com/mozilla/integration-mozilla-inbound/blob/0d01aa29ce350beca861f7d3b7b4df399b246ed0/xpcom/glue/FileUtils.cpp),
    //    one guy there https://forums.developer.apple.com/thread/111312,
    //    Rust fs2 https://docs.rs/crate/fs2/0.4.3/source/src/unix.rs.
    //    But as far as I can see, this will allocate offset + len bytes per
    //    call, regardless of how big the file is at the moment.
    //
    // 3. Specifying the size difference (the variant compiled below). The only
    //    example I found is Realm core
    //    https://github.com/realm/realm-core/blob/44152d283878473db8cbf90ac4083dcae44c1852/src/realm/util/file.cpp#L783
    //    Unfortunately in this case a page fault can take longer than with
    //    ftruncate(): up to 611 seconds!!!
    //      $ ./a.out
    //      map 2179727360-2179792896
    //      page fault took 611276 milliseconds
    //      map 2181300224-2181365760
    //      page fault took 214747 milliseconds
    //      ...
    fstore_t store = {
        .fst_flags = F_ALLOCATEALL,
        .fst_posmode = F_PEOFPOSMODE,
        .fst_offset = 0,
        .fst_length = end - stat_buf.st_size,
    };
    // Preallocation alone does not change the file size, hence the ftruncate().
    err_code = fcntl(fd, F_PREALLOCATE, &store) != -1 &&
                       ftruncate(fd, end) != -1
                   ? 0
                   : errno;
#elif defined(IMPL_WRITE)
    // for 64K: pwrite() can take about 12 seconds, or we can get a 13 seconds
    // page fault.
    const int step = 65536;
    // const int step = stat_buf.st_blksize;
    assert(stat_buf.st_size % step == 0);  // precondition for this program
    assert(end % step == 0);
    printf("\n");
    // Write one zero byte at the end of every `step`-sized chunk being added.
    for (off_t ofs = stat_buf.st_size; ofs < end; ofs += step) {
      static const char pad = '\0';
      fprintf(stdout, "writing %lld, step %d\r", (long long)(ofs + step - 1),
              step);
      fflush(stdout);
      if (pwrite(fd, &pad, 1, ofs + step - 1) == -1) {
        err_code = errno;
        break;
      }
    }
    printf("\n");
#else
#error Select impl
#endif
  }

  // Always release the lock, but let the first failure win: an unlock error
  // must not hide the reason the resize itself failed.
  if (flock(fd, LOCK_UN) != 0 && err_code == 0) err_code = errno;
  return err_code;
}
```
--
- Ilia
// Compile with: clang -Wall -Wextra -O0 pagefault_test.c
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <time.h>
#include <unistd.h>
// Strategy selection for posix_fallocate(): define one of the IMPL_* macros
// below (if none is defined, the build stops at "#error Select impl").
// The trailing notes are the figures recorded for each strategy.
#define IMPL_FTRUNCATE            // 23sec page fault, when mmap'ed ofs is about 2080MB
//#define IMPL_FCNTL_SETSIZE      // 11sec page fault, 2063.5/2082.5/2799.75/2055
//#define IMPL_FCNTL_PREALLOCATE  // 611sec page fault, 2078.75
//#define IMPL_WRITE              // 13sec page fault (64K step)
// Emulation of posix_fallocate() for macOS: makes sure the file behind `fd` is
// at least `offset + len` bytes long, using the strategy selected at compile
// time by one of the IMPL_* macros (the build stops at #error if none is set).
//
// The file is held under an exclusive flock() while it is inspected and grown.
//
// Returns 0 on success, otherwise an errno value (the error is returned, as
// with the real posix_fallocate()):
//   EINVAL - offset is negative or len is not positive;
//   EFBIG  - offset + len does not fit in off_t;
//   other  - the errno of the system call that failed.
int posix_fallocate(int fd, off_t offset, off_t len) {
  // POSIX guarantees 8-bit bytes, so this is the largest value off_t can hold.
  const off_t off_max =
      (off_t)((((uintmax_t)1) << (sizeof(off_t) * 8 - 1)) - 1);

  if (offset < 0 || len <= 0) return EINVAL;
  // Reject the request before offset + len can overflow (signed overflow is UB).
  if (len > off_max - offset) return EFBIG;
  const off_t end = offset + len;

  if (flock(fd, LOCK_EX) != 0) return errno;

  struct stat stat_buf;
  int err_code = fstat(fd, &stat_buf) == 0 ? 0 : errno;
  if (err_code == 0 && end > stat_buf.st_size) {
#if defined(IMPL_FTRUNCATE)
    // btw, LLVM simply uses ftruncate when posix_fallocate is not available:
    // https://github.com/llvm/llvm-project/blob/b462cdff05b82071190e8bfd1078a2c76933b19b/llvm/lib/Support/Unix/Path.inc#L559
    err_code = ftruncate(fd, end) == 0 ? 0 : errno;
#elif defined(IMPL_FCNTL_SETSIZE)
    unsigned long long arg = (unsigned long long)end;
    err_code = fcntl(fd, F_SETSIZE, &arg) != -1 ? 0 : errno;
#elif defined(IMPL_FCNTL_PREALLOCATE)
    // I found several ways to use F_PREALLOCATE:
    //
    // 1. Starting from a specific offset:
    //      fstore_t store = { F_ALLOCATEALL, F_PEOFPOSMODE, offset, len };
    //    This way is used in Chromium
    //    (https://chromium.googlesource.com/chromium/src/+/7ca4a2b489b1dd4b5c9b0046d55193b900da06ea/base/files/file_util_posix.cc#901)
    //    and in the fallocate module for Python
    //    (https://github.com/trbs/fallocate/blob/9d7aae312ad0d1de6c6451193748e8e8c7e8230d/fallocate/_fallocatemodule.c#L59),
    //    but I get EINVAL if .fst_offset != 0.
    //
    // 2. Specifying the desired file size:
    //      fstore_t store = { F_ALLOCATEALL, F_PEOFPOSMODE, 0, offset + len };
    //    Examples: Mozilla
    //    https://hg.mozilla.org/mozilla-central/file/3d846420a907/xpcom/glue/FileUtils.cpp#l61
    //    (copies here:
    //    https://github.com/mozilla/universal-search-gecko-dev/blob/33e34ae066dbdb35ff6889973e21a38792991f35/xpcom/glue/FileUtils.cpp,
    //    https://github.com/mozilla/integration-mozilla-inbound/blob/0d01aa29ce350beca861f7d3b7b4df399b246ed0/xpcom/glue/FileUtils.cpp),
    //    one guy there https://forums.developer.apple.com/thread/111312,
    //    Rust fs2 https://docs.rs/crate/fs2/0.4.3/source/src/unix.rs.
    //    But as far as I can see, this will allocate offset + len bytes per
    //    call, regardless of how big the file is at the moment.
    //
    // 3. Specifying the size difference (the variant compiled below). The only
    //    example I found is Realm core
    //    https://github.com/realm/realm-core/blob/44152d283878473db8cbf90ac4083dcae44c1852/src/realm/util/file.cpp#L783
    //    Unfortunately in this case a page fault can take longer than with
    //    ftruncate(): up to 611 seconds!!!
    //      $ ./a.out
    //      map 2179727360-2179792896
    //      page fault took 611276 milliseconds
    //      map 2181300224-2181365760
    //      page fault took 214747 milliseconds
    //      ...
    fstore_t store = {
        .fst_flags = F_ALLOCATEALL,
        .fst_posmode = F_PEOFPOSMODE,
        .fst_offset = 0,
        .fst_length = end - stat_buf.st_size,
    };
    // Preallocation alone does not change the file size, hence the ftruncate().
    err_code = fcntl(fd, F_PREALLOCATE, &store) != -1 &&
                       ftruncate(fd, end) != -1
                   ? 0
                   : errno;
#elif defined(IMPL_WRITE)
    // for 64K: pwrite() can take about 12 seconds, or we can get a 13 seconds
    // page fault.
    const int step = 65536;
    // const int step = stat_buf.st_blksize;
    assert(stat_buf.st_size % step == 0);  // precondition for this program
    assert(end % step == 0);
    printf("\n");
    // Write one zero byte at the end of every `step`-sized chunk being added.
    for (off_t ofs = stat_buf.st_size; ofs < end; ofs += step) {
      static const char pad = '\0';
      fprintf(stdout, "writing %lld, step %d\r", (long long)(ofs + step - 1),
              step);
      fflush(stdout);
      if (pwrite(fd, &pad, 1, ofs + step - 1) == -1) {
        err_code = errno;
        break;
      }
    }
    printf("\n");
#else
#error Select impl
#endif
  }

  // Always release the lock, but let the first failure win: an unlock error
  // must not hide the reason the resize itself failed.
  if (flock(fd, LOCK_UN) != 0 && err_code == 0) err_code = errno;
  return err_code;
}
// Terminates the program with EXIT_FAILURE when COND is true, after printing
// the failed condition text together with the current errno and its message
// to stderr. Expands to a single statement, so it is safe in unbraced if/else.
#define FAIL_IF(COND)                                                         \
  do {                                                                        \
    if (COND) {                                                               \
      /* Snapshot errno first so that stdio cannot disturb the value. */      \
      const int fail_errno = errno;                                           \
      fprintf(stderr, "FAIL \"%s\" [errno = %d, %s]\n", #COND, fail_errno,    \
              strerror(fail_errno));                                          \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)
int main() {
int fd = open("file.tmp", O_RDWR | O_CREAT | O_TRUNC, 0644);
FAIL_IF(fd == -1);
const int64_t page_size = 4096, view_size = 65536, max_file_size = 4UL *
1024 * 1024 * 1024;
for (off_t off = 0; off < max_file_size; off += view_size) {
if (off % (512 * 1024 * 1024) == 0) // fallocate each 512MB
FAIL_IF(posix_fallocate(fd, off, 512 * 1024 * 1024) != 0);
fprintf(stdout, "\rmap %lld-%lld", off, off + view_size);
fflush(stdout);
void* m = mmap(NULL, view_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
off);
FAIL_IF(m == MAP_FAILED);
for (int page_num = 0; page_num < view_size / page_size; page_num++) {
if (page_num % 2 == 0) {
struct timespec from;
FAIL_IF(clock_gettime(CLOCK_REALTIME, &from) == -1);
*(int*)((char*)m + page_num * page_size) = 0x1234;
//memset((char*)m + page_num * page_size, 0xAA, page_size);
struct timespec to;
FAIL_IF(clock_gettime(CLOCK_REALTIME, &to) == -1);
uint64_t elapsed_ms = (to.tv_sec - from.tv_sec) * 1000 +
(to.tv_nsec - from.tv_nsec) / 1000000;
if (elapsed_ms >= 1000)
fprintf(stdout, "\npage fault took %llu milliseconds\n",
elapsed_ms); fflush(stdout);
}
}
munmap(m, view_size);
}
printf("exiting\n");
return 0;
}
_______________________________________________
Do not post admin requests to the list. They will be ignored.
Filesystem-dev mailing list (email@hidden)
Help/Unsubscribe/Update your Subscription:
This email sent to email@hidden