Minor optimisations and bug fixes

Fix bug in mt_mpmc.c: on Linux, mutexes are not recursive.
Add arena_trim_string() to the arena API
Removing arena->path, now paths are pushed to arena->metadata
Replacing fe->owner[128] with char *owner; the owner is now pushed as a
string to arena->metadata and trimmed with arena_trim_string()
Improving cache locality in arena->metadata; the memory layout is now
fe; fe->path; fe->owner.
Cache aligning all arenas except HasherContext->arena to sizeof(void *).
Pushing elements one by one instead of snprintf() in finalize_file() and
hash_worker().
Getting the full path of current directory instead of "."
Fixing bug in path formatting; this allows us to remove normalize_path()
from the hot loop.
This commit is contained in:
2026-05-08 15:45:57 +01:00
parent 7d2a24d0be
commit 16c6aeae65
8 changed files with 352 additions and 160 deletions

View File

@@ -1,4 +1,4 @@
#pragma once
#pragma once
#include "arena.h"
#include "base.h"
@@ -175,7 +175,6 @@ static void os_file_close(FileHandle handle) { close(handle); }
// -------------------- Thread abstraction -------------------
// Threads context
typedef struct {
mem_arena *path_arena;
mem_arena *meta_arena;
MPMCQueue *dir_queue;
@@ -187,7 +186,7 @@ typedef struct {
typedef struct {
mem_arena *arena;
MPMCQueue *file_queue;
} WorkerContext;
} HasherContext;
#if defined(_WIN32) || defined(_WIN64)
typedef HANDLE ThreadHandle;
@@ -284,7 +283,7 @@ static int thread_wait_multiple(Thread *threads, size_t count) {
#endif
// ======================== Get file metadata ========================
// -------------------- Path parsing -------------------
// -------------------- Path helpers -------------------
static void normalize_path(char *p) {
char *src = p;
char *dst = p;
@@ -353,6 +352,64 @@ static int parse_paths(char *line, char folders[][MAX_PATHLEN],
return count;
}
/*
 * Copy the absolute path of the current working directory into `buffer`.
 *
 * buffer - caller-owned destination; must not be NULL
 * size   - capacity of `buffer` in bytes, including the terminating NUL
 *
 * Returns true when the path (with its NUL terminator) was written to
 * `buffer`; false when `buffer` is NULL, `size` is 0, the path does not fit,
 * or the underlying OS call fails.
 */
#if defined(_WIN32) || defined(_WIN64)
bool platform_get_current_directory(char *buffer, size_t size) {
  if (buffer == NULL || size == 0)
    return false;

  // The API takes a DWORD length: clamp rather than let the cast wrap a
  // huge size_t down to a small (or zero) value.
  const DWORD max_len = (DWORD)-1;
  DWORD capacity = size > (size_t)max_len ? max_len : (DWORD)size;

  DWORD len = GetCurrentDirectoryA(capacity, buffer);

  // 0 means failure; a value >= capacity is the size that would have been
  // required, i.e. nothing usable was written.
  return len != 0 && len < capacity;
}
#elif defined(__linux__)
bool platform_get_current_directory(char *buffer, size_t size) {
  // glibc's getcwd() allocates its own result when handed a NULL buffer,
  // which would report success here while leaking and writing nothing.
  if (buffer == NULL || size == 0)
    return false;

  return getcwd(buffer, size) != NULL;
}
#endif
// Scratch buffer for building "<base><separator><filename>" paths: the base
// prefix is written once, then the filename part is overwritten in place.
typedef struct {
char buffer[MAX_PATHLEN]; // Holds the NUL-terminated path being built
char *base_end; // End of the base prefix inside buffer (after its trailing separator, if any)
char *filename_pos; // Where the filename is written inside buffer
size_t base_len; // Length of the base prefix, counting its trailing separator
} PathBuilder;
/*
 * Load the directory prefix `base` into `pb` and position the filename slot
 * right after it.
 *
 * A '/' is appended only when `base` is non-empty and does not already end
 * with a separator, so "dir" and "dir/" both yield "dir/". An empty `base`
 * stays empty (filenames are then written at the start of the buffer).
 *
 * pb   - builder to initialise; every field is overwritten
 * base - NUL-terminated directory path
 *
 * A `base` too long for the buffer is truncated so that the separator and the
 * terminating NUL always fit; the function never writes past pb->buffer.
 */
static void path_builder_init(PathBuilder *pb, const char *base) {
  // Keep two bytes in reserve: one separator plus the terminating NUL.
  const size_t max_base = sizeof(pb->buffer) - 2;

  size_t len = strlen(base);
  if (len > max_base)
    len = max_base;

  memcpy(pb->buffer, base, len);
  pb->base_len = len;
  pb->base_end = pb->buffer + len;

  // Only add a separator if one is not already present.
  bool ends_with_sep = len > 0 && pb->base_end[-1] == '/';
#if defined(_WIN32) || defined(_WIN64)
  // Windows bases such as "C:\" already end with a native separator.
  ends_with_sep = ends_with_sep || (len > 0 && pb->base_end[-1] == '\\');
#endif

  if (len > 0 && !ends_with_sep) {
    *pb->base_end = '/';
    pb->base_end++;
    pb->base_len++;
  }

  // Terminate the prefix; the filename will be written over this NUL.
  *pb->base_end = '\0';
  pb->filename_pos = pb->base_end;
}
/*
 * Write `filename` into the builder's filename slot, replacing whatever
 * filename was there before, and NUL-terminate the resulting path.
 *
 * pb       - builder previously set up with path_builder_init()
 * filename - bytes of the name; need not be NUL-terminated
 * name_len - number of bytes to take from `filename`
 *
 * If the name does not fit in the space left after the base prefix it is
 * truncated, so the write always stays inside pb->buffer.
 */
static void path_builder_set_filename(PathBuilder *pb, const char *filename,
                                      size_t name_len) {
  size_t used = (size_t)(pb->filename_pos - pb->buffer);

  // No room even for a terminator: leave the buffer untouched.
  if (used >= sizeof(pb->buffer))
    return;

  // Bytes available for the name, keeping one for the NUL.
  size_t room = sizeof(pb->buffer) - used - 1;
  if (name_len > room)
    name_len = room;

  memcpy(pb->filename_pos, filename, name_len);
  pb->filename_pos[name_len] = '\0'; // Ensure null termination
}
/*
 * Copy the path currently held by the builder (base prefix + filename + NUL)
 * onto `arena`.
 *
 * pb    - builder whose buffer holds a NUL-terminated path
 * arena - arena the copy is pushed onto
 * zero  - forwarded unchanged to arena_push()
 *
 * Returns the pointer handed back by arena_push(), now holding the copy.
 */
static char *path_builder_dup_arena(PathBuilder *pb, mem_arena *arena,
                                    bool zero) {
  size_t prefix_len = (size_t)(pb->filename_pos - pb->buffer);
  size_t name_len = strlen(pb->filename_pos);
  size_t bytes = prefix_len + name_len + 1; // +1 for the NUL terminator

  // NOTE(review): &arena is the address of this by-value parameter, so any
  // update arena_push() makes to the arena pointer is not seen by the
  // caller - confirm this is intended.
  char *copy = arena_push(&arena, bytes, zero);

  return memcpy(copy, pb->buffer, bytes);
}
// ------------------------- File time -------------------------
#if FILE_TIMES
#if defined(_WIN32) || defined(_WIN64)
@@ -369,6 +426,7 @@ static void format_time(uint64_t t, char *out, size_t out_sz) {
strftime(out, out_sz, "%Y-%m-%d %H:%M:%S", &tm);
}
// ------------------ Convert filetime to epoch -------------------
static uint64_t filetime_to_epoch(const FILETIME *ft) {
ULARGE_INTEGER ull;
@@ -614,7 +672,7 @@ typedef struct FileEntry {
uint64_t modified_time; // epoch seconds
#endif
#if FILE_OWNER
char owner[128]; // resolved owner name
char *owner; // resolved owner name
#endif
#if CHECK_FILE_SYSTEM // Linux only
@@ -622,46 +680,6 @@ typedef struct FileEntry {
#endif
} FileEntry;
// Scratch buffer for building "<base><separator><filename>" paths: the base
// prefix is written once, then the filename part is overwritten in place.
typedef struct {
char buffer[MAX_PATHLEN]; // Holds the NUL-terminated path being built
char *base_end; // End of the copied base path inside buffer (where the separator is placed)
char *filename_pos; // Where the filename is written inside buffer
size_t base_len; // strlen() of the base path; the separator is not counted
} PathBuilder;
// Load `base` into the builder, append the platform separator and place the
// filename slot right after it.
// The separator is written unconditionally, so a `base` that already ends
// with one ends up with two in a row.
// NOTE(review): nothing here checks that `base` plus separator and NUL fits
// in pb->buffer.
static void path_builder_init(PathBuilder *pb, const char *base) {
pb->base_len = strlen(base);
memcpy(pb->buffer, base, pb->base_len);
pb->base_end = pb->buffer + pb->base_len;
// base_end stays pointing at the separator written below
#if defined(_WIN32) || defined(_WIN64)
*pb->base_end = '\\';
#elif defined(__linux__)
*pb->base_end = '/';
#endif
// Terminate right after the separator; the filename overwrites this NUL
*(pb->base_end + 1) = '\0';
pb->filename_pos = pb->base_end + 1;
}
// Write `name_len` bytes of `filename` into the builder's filename slot and
// NUL-terminate the resulting path; `filename` itself need not be terminated.
// NOTE(review): no check that the name fits in the space left in pb->buffer.
static void path_builder_set_filename(PathBuilder *pb, const char *filename,
size_t name_len) {
memcpy(pb->filename_pos, filename, name_len);
pb->filename_pos[name_len] = '\0'; // Terminate the full path after the name
}
// Copy the builder's current path onto `arena` and return the copy.
// `zero` is forwarded unchanged to arena_push().
static char *path_builder_dup_arena(PathBuilder *pb, mem_arena *arena,
bool zero) {
// Total length = bytes before the filename slot (base + separator)
// + filename length + NUL terminator
size_t total_len =
(pb->filename_pos - pb->buffer) + strlen(pb->filename_pos) + 1;
// NOTE(review): &arena is the address of the by-value parameter, so any
// update arena_push() makes to the arena pointer is not seen by the caller.
char *dup = arena_push(&arena, total_len, zero);
memcpy(dup, pb->buffer, total_len);
return dup;
}
#if defined(_WIN32) || defined(_WIN64)
void scan_folder(const char *base, ScannerContext *ctx) {
PathBuilder pb;
@@ -669,7 +687,7 @@ void scan_folder(const char *base, ScannerContext *ctx) {
char search[MAX_PATHLEN];
memcpy(search, pb.buffer, pb.base_len + 1); // Copy base + separator
memcpy(search + pb.base_len + 1, "*", 2); // Add "*" and null
memcpy(search + pb.base_len, "*", 2); // Add "*" and null
WIN32_FIND_DATAA fd;
HANDLE h = FindFirstFileA(search, &fd);
@@ -691,29 +709,27 @@ void scan_folder(const char *base, ScannerContext *ctx) {
// If it's a directory:
if (fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
char *dir = path_builder_dup_arena(&pb, ctx->path_arena, false);
char *dir = path_builder_dup_arena(&pb, ctx->meta_arena, false);
mpmc_push_work(ctx->dir_queue, dir);
} else {
// else a file:
atomic_fetch_add(&g_files_found, 1);
FileEntry *fe = arena_push(&ctx->meta_arena, sizeof(FileEntry), true);
FileEntry *fe = arena_push(&ctx->meta_arena, sizeof(FileEntry), false);
fe->path = arena_push(&ctx->meta_arena, MAX_PATHLEN, false);
// Create a temporary copy for normalization to avoid corrupting pb.buffer
char temp_path[MAX_PATHLEN];
memcpy(temp_path, pb.buffer,
(pb.filename_pos - pb.buffer) + name_len + 1);
normalize_path(temp_path);
memcpy(fe->path, pb.buffer, (pb.filename_pos - pb.buffer) + name_len + 1);
fe->path = arena_push(&ctx->path_arena, strlen(temp_path) + 1, false);
strcpy(fe->path, temp_path);
arena_trim_string(&ctx->meta_arena, fe->path, ARENA_TRIM_NUL);
#if FILE_TIMES
platform_get_file_times(pb.buffer, &fe->created_time, &fe->modified_time);
#endif
#if FILE_OWNER
platform_get_file_owner(pb.buffer, fe->owner, sizeof(fe->owner));
fe->owner = arena_push(&ctx->meta_arena, MAX_PATHLEN, false);
platform_get_file_owner(pb.buffer, fe->owner, MAX_PATHLEN);
arena_trim_string(&ctx->meta_arena, fe->owner, ARENA_TRIM_NUL);
#endif
fe->size_bytes = ((uint64_t)fd.nFileSizeHigh << 32) | fd.nFileSizeLow;
@@ -772,14 +788,20 @@ void scan_folder(const char *base, ScannerContext *ctx) {
continue; // Skip symlinks
if (file_type == DT_DIR) {
char *dir_path = path_builder_dup_arena(&pb, ctx->path_arena, false);
char *dir_path = path_builder_dup_arena(&pb, ctx->meta_arena, false);
mpmc_push_work(ctx->dir_queue, dir_path);
continue;
}
if (file_type == DT_REG) {
atomic_fetch_add(&g_files_found, 1);
FileEntry *fe = arena_push(&ctx->meta_arena, sizeof(FileEntry), true);
FileEntry *fe = arena_push(&ctx->meta_arena, sizeof(FileEntry), false);
fe->path = arena_push(&ctx->meta_arena, MAX_PATHLEN, false);
memcpy(fe->path, pb.buffer,
(pb.filename_pos - pb.buffer) + name_len + 1);
arena_trim_string(&ctx->meta_arena, fe->path, ARENA_TRIM_NUL);
// Use fstatat for file info
struct stat st;
@@ -790,21 +812,13 @@ void scan_folder(const char *base, ScannerContext *ctx) {
#endif
#if FILE_OWNER
platform_get_file_owner_fd(dir_fd, entry->d_name, fe->owner,
sizeof(fe->owner));
fe->owner = arena_push(&ctx->meta_arena, MAX_PATHLEN, false);
platform_get_file_owner_fd(dir_fd, pb.buffer, fe->owner, MAX_PATHLEN);
arena_trim_string(&ctx->meta_arena, fe->owner, ARENA_TRIM_NUL);
#endif
fe->size_bytes = (uint64_t)st.st_size;
// Normalize path
char temp_path[MAX_PATHLEN];
memcpy(temp_path, pb.buffer,
(pb.filename_pos - pb.buffer) + name_len + 1);
normalize_path(temp_path);
fe->path = arena_push(&ctx->path_arena, strlen(temp_path) + 1, false);
strcpy(fe->path, temp_path);
#if CHECK_FILE_SYSTEM
fe->fs_type = fs_type;
#endif
@@ -822,11 +836,17 @@ void scan_folder(const char *base, ScannerContext *ctx) {
continue;
if (S_ISDIR(st.st_mode)) {
char *dir_path = path_builder_dup_arena(&pb, ctx->path_arena, false);
char *dir_path = path_builder_dup_arena(&pb, ctx->meta_arena, false);
mpmc_push_work(ctx->dir_queue, dir_path);
} else if (S_ISREG(st.st_mode)) {
atomic_fetch_add(&g_files_found, 1);
FileEntry *fe = arena_push(&ctx->meta_arena, sizeof(FileEntry), true);
FileEntry *fe = arena_push(&ctx->meta_arena, sizeof(FileEntry), false);
fe->path = arena_push(&ctx->meta_arena, MAX_PATHLEN, false);
memcpy(fe->path, pb.buffer,
(pb.filename_pos - pb.buffer) + name_len + 1);
arena_trim_string(&ctx->meta_arena, fe->path, ARENA_TRIM_NUL);
#if FILE_TIMES
platform_get_file_times(pb.buffer, &fe->created_time,
@@ -834,19 +854,13 @@ void scan_folder(const char *base, ScannerContext *ctx) {
#endif
#if FILE_OWNER
platform_get_file_owner(pb.buffer, fe->owner, sizeof(fe->owner));
fe->owner = arena_push(&ctx->meta_arena, MAX_PATHLEN, false);
platform_get_file_owner(pb.buffer, fe->owner, MAX_PATHLEN);
arena_trim_string(&ctx->meta_arena, fe->owner, ARENA_TRIM_NUL);
#endif
fe->size_bytes = (uint64_t)st.st_size;
char temp_path[MAX_PATHLEN];
memcpy(temp_path, pb.buffer,
(pb.filename_pos - pb.buffer) + name_len + 1);
normalize_path(temp_path);
fe->path = arena_push(&ctx->path_arena, strlen(temp_path) + 1, false);
strcpy(fe->path, temp_path);
#if CHECK_FILE_SYSTEM
fe->fs_type = fs_type;
#endif
@@ -906,7 +920,7 @@ static void xxh3_hash_file_stream(const char *path, char *out_hex,
// ------------------------- Hash worker --------------------------------
static THREAD_RETURN hash_worker(void *arg) {
WorkerContext *ctx = (WorkerContext *)arg;
HasherContext *ctx = (HasherContext *)arg;
void *buf = malloc(READ_BLOCK);
for (;;) {
@@ -914,35 +928,54 @@ static THREAD_RETURN hash_worker(void *arg) {
if (!fe)
break;
char hash[HASH_STRLEN];
// Hash
char *hash = arena_push(&ctx->arena, HASH_STRLEN, false);
xxh3_hash_file_stream(fe->path, hash, buf);
arena_trim_string(&ctx->arena, hash, ARENA_TRIM_TAB);
// Path
u64 path_len = strlen(fe->path) + 1;
char *path = arena_push(&ctx->arena, path_len, ARENA_TRIM_TAB);
memcpy(path, fe->path, path_len);
arena_trim_string(&ctx->arena, path, ARENA_TRIM_TAB);
// Size
double size_kib = (double)fe->size_bytes / 1024.0;
char stack_buf[KiB(4)];
int len;
char *size = arena_push(&ctx->arena, 32, false);
snprintf(size, 32, "%.2f", size_kib);
arena_trim_string(&ctx->arena, size, ARENA_TRIM_NONE);
// Times
char *separator;
#if FILE_TIMES
char created[32], modified[32];
format_time(fe->created_time, created, sizeof(created));
format_time(fe->modified_time, modified, sizeof(modified));
separator = arena_push(&ctx->arena, 1, false);
*separator = '\t';
u64 time_size = 32;
char *created = arena_push(&ctx->arena, time_size, false);
format_time(fe->created_time, created, time_size);
arena_trim_string(&ctx->arena, created, ARENA_TRIM_TAB);
char *modified = arena_push(&ctx->arena, time_size, false);
format_time(fe->modified_time, modified, time_size);
arena_trim_string(&ctx->arena, modified, ARENA_TRIM_NONE);
#endif
#if FILE_TIMES && FILE_OWNER
len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\t%s\t%s\t%s\n",
hash, fe->path, size_kib, created, modified, fe->owner);
#elif FILE_TIMES
len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\t%s\t%s\n", hash,
fe->path, size_kib, created, modified);
#elif FILE_OWNER
len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\t%s\n", hash,
fe->path, size_kib, fe->owner);
#else
len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\n", hash,
fe->path, size_kib);
// Owner
#if FILE_OWNER
separator = arena_push(&ctx->arena, 1, false);
*separator = '\t';
char *owner = arena_push(&ctx->arena, 128, false);
memcpy(owner, fe->owner, 128);
arena_trim_string(&ctx->arena, owner, ARENA_TRIM_NONE);
#endif
char *dst = arena_push(&ctx->arena, len, false);
memcpy(dst, stack_buf, len);
// Final newline
separator = arena_push(&ctx->arena, 1, false);
*separator = '\n';
atomic_fetch_add(&g_files_hashed, 1);
}
@@ -1576,8 +1609,8 @@ static int ioring_submit(ThreadIoContext *thread_ctx, uint32_t *submitted) {
static void ioring_process_completions(ThreadIoContext *thread_ctx) {
struct io_uring_cqe *cqes[NUM_BUFFERS_PER_THREAD];
unsigned cqe_count = io_uring_peek_batch_cqe(&((IoUring *)thread_ctx->ring)->ring,
cqes, NUM_BUFFERS_PER_THREAD);
unsigned cqe_count = io_uring_peek_batch_cqe(
&((IoUring *)thread_ctx->ring)->ring, cqes, NUM_BUFFERS_PER_THREAD);
if (cqe_count == 0) {
return;
@@ -1788,7 +1821,8 @@ static IoBuffer *get_free_buffer(ThreadIoContext *restrict thread_ctx) {
return buf;
}
static void return_buffer(ThreadIoContext *restrict thread_ctx, IoBuffer *restrict buf) {
static void return_buffer(ThreadIoContext *restrict thread_ctx,
IoBuffer *restrict buf) {
if (!buf)
return;
@@ -1796,8 +1830,8 @@ static void return_buffer(ThreadIoContext *restrict thread_ctx, IoBuffer *restri
}
// -------------------- File operations -----------------------
static int init_file(ThreadIoContext *restrict thread_ctx, FileReadContext *restrict file,
FileEntry *restrict fe) {
static int init_file(ThreadIoContext *restrict thread_ctx,
FileReadContext *restrict file, FileEntry *restrict fe) {
#if USE_REGISTERED_FILES
uint32_t saved_slot_id = file->slot_id;
@@ -1835,13 +1869,14 @@ static int init_file(ThreadIoContext *restrict thread_ctx, FileReadContext *rest
}
static void finalize_file(ThreadIoContext *restrict thread_ctx,
WorkerContext *worker_ctx, FileReadContext *restrict file) {
HasherContext *worker_ctx,
FileReadContext *restrict file) {
FileEntry *restrict fe = file->fe;
os_file_close(file->file_handle);
char hash[HASH_STRLEN];
char *hash = arena_push(&worker_ctx->arena, HASH_STRLEN, false);
if (file->bytes_hashed == file->file_size) {
if (file->use_incremental_hash) {
@@ -1863,40 +1898,56 @@ static void finalize_file(ThreadIoContext *restrict thread_ctx,
atomic_fetch_add(&g_io_ring_fallbacks, 1);
xxh3_hash_file_stream(fe->path, hash, thread_ctx->fallback_buffer);
}
arena_trim_string(&worker_ctx->arena, hash, ARENA_TRIM_TAB);
// Path
u64 path_len = strlen(fe->path) + 1;
char *path = arena_push(&worker_ctx->arena, path_len, ARENA_TRIM_TAB);
memcpy(path, fe->path, path_len);
arena_trim_string(&worker_ctx->arena, path, ARENA_TRIM_TAB);
// Size
double size_kib = (double)fe->size_bytes / 1024.0;
char stack_buf[KiB(4)];
int len;
char *size = arena_push(&worker_ctx->arena, 32, false);
snprintf(size, 32, "%.2f", size_kib);
arena_trim_string(&worker_ctx->arena, size, ARENA_TRIM_NONE);
// Time
char *separator;
#if FILE_TIMES
char created[32], modified[32];
format_time(fe->created_time, created, sizeof(created));
format_time(fe->modified_time, modified, sizeof(modified));
separator = arena_push(&worker_ctx->arena, 1, false);
*separator = '\t';
u64 time_size = 32;
char *created = arena_push(&worker_ctx->arena, time_size, false);
format_time(fe->created_time, created, time_size);
arena_trim_string(&worker_ctx->arena, created, ARENA_TRIM_TAB);
char *modified = arena_push(&worker_ctx->arena, time_size, false);
format_time(fe->modified_time, modified, time_size);
arena_trim_string(&worker_ctx->arena, modified, ARENA_TRIM_NONE);
#endif
#if FILE_TIMES && FILE_OWNER
len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\t%s\t%s\t%s\n",
hash, fe->path, size_kib, created, modified, fe->owner);
#elif FILE_TIMES
len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\t%s\t%s\n", hash,
fe->path, size_kib, created, modified);
#elif FILE_OWNER
len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\t%s\n", hash,
fe->path, size_kib, fe->owner);
#else
len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\n", hash, fe->path,
size_kib);
// Owner
#if FILE_OWNER
separator = arena_push(&worker_ctx->arena, 1, false);
*separator = '\t';
char *owner = arena_push(&worker_ctx->arena, 128, false);
memcpy(owner, fe->owner, 128);
arena_trim_string(&worker_ctx->arena, owner, ARENA_TRIM_NONE);
#endif
char *restrict dst = arena_push(&worker_ctx->arena, len, false);
memcpy(dst, stack_buf, len);
separator = arena_push(&worker_ctx->arena, 1, false);
*separator = '\n';
atomic_fetch_add(&g_files_hashed, 1);
}
// -------------------- Hash files -----------------------
static void hash_ready_files(ThreadIoContext *restrict thread_ctx, FileQueue *restrict fq,
WorkerContext *worker_ctx) {
static void hash_ready_files(ThreadIoContext *restrict thread_ctx,
FileQueue *restrict fq,
HasherContext *worker_ctx) {
for (int i = 0; i < fq->count; i++) {
@@ -1962,8 +2013,9 @@ static void hash_ready_files(ThreadIoContext *restrict thread_ctx, FileQueue *re
}
// ------------------ Build pending reads ----------------------
static void build_pending_reads(ThreadIoContext *restrict thread_ctx, FileQueue *restrict fq,
WorkerContext *worker_ctx) {
static void build_pending_reads(ThreadIoContext *restrict thread_ctx,
FileQueue *restrict fq,
HasherContext *worker_ctx) {
MPMCQueue *file_queue = worker_ctx->file_queue;
@@ -2049,7 +2101,7 @@ static void build_pending_reads(ThreadIoContext *restrict thread_ctx, FileQueue
// -------------------------- Hash worker I/O Ring ---------------------------
static THREAD_RETURN hash_worker_ioring(void *arg) {
WorkerContext *worker_ctx = (WorkerContext *)arg;
HasherContext *worker_ctx = (HasherContext *)arg;
// Init IO ring
ThreadIoContext *thread_ctx = ioring_init_thread();