From f37e9154895c2a27326ff77683dff7b7fa6637ce Mon Sep 17 00:00:00 2001 From: amir Date: Fri, 8 May 2026 20:06:48 +0100 Subject: [PATCH] Minor optimisations and bug fixes Fix bug in mt_mpmc.h: on Linux, mutexes are not recursive. Add arena_trim_string() to the arena API. Removing arena->path, now paths are pushed to arena->metadata. Replacing fe->owner[128] with char *owner; the owner is now pushed as a string to arena->metadata and trimmed with arena_trim_string(). Improving cache locality in arena->metadata, the memory layout is now fe; fe->path; fe->owner. Cache aligning all arenas except HasherContext->arena to sizeof(void *). Pushing elements one by one instead of snprintf() in finalize_file() and hash_worker(). Getting the full path of current directory instead of "." Fixing bug in path formatting, this allows us to remove normalize_path() from the hot loop. --- CMakeLists.txt | 10 +- arena.c | 117 +++++++++- arena.h | 2 +- base.h | 4 +- build.bat | 6 +- build.sh | 20 +- file_hasher.c => duplicate_finder.c | 27 ++- mt_mpmc.h | 14 +- platform.c | 317 ++++++++++++++++------------ 9 files changed, 353 insertions(+), 164 deletions(-) rename file_hasher.c => duplicate_finder.c (90%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e6f72d..d7b6993 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.20) -project(filehasher +project(dfin VERSION 1.0.0 - DESCRIPTION "High-performance file hasher with I/O Ring/io_uring support" + DESCRIPTION "High-performance duplicate finder with I/O Ring/io_uring support" LANGUAGES C ) @@ -106,7 +106,7 @@ endif() # --------------------------------------------------------------------------- set(SOURCES - file_hasher.c + duplicate_finder.c xxhash.c xxh_x86dispatch.c ) @@ -116,7 +116,7 @@ set(HEADERS arena.h base.h xxhash.h - lf_mpmc.h + mt_mpmc.h ) # --------------------------------------------------------------------------- @@ -281,4 +281,4 @@ message(STATUS " Compiler: ${CMAKE_C_COMPILER}") 
message(STATUS " Build Type: ${CMAKE_BUILD_TYPE}") message(STATUS " Generator: ${CMAKE_GENERATOR}") message(STATUS " Platform: ${PLATFORM_NAME}") -message(STATUS "----------------------------------------") \ No newline at end of file +message(STATUS "----------------------------------------") diff --git a/arena.c b/arena.c index beb646b..d080200 100644 --- a/arena.c +++ b/arena.c @@ -196,7 +196,7 @@ mem_arena *arena_create(arena_params *params) { // mk create arena->free_list = arena_create(&(arena_params){ .reserve_size = MiB(1), .commit_size = MiB(1), - .align = ARENA_ALIGN, + .align = ARENA_CACHE_ALIGN, .push_size = sizeof(arena_free_node), .allow_free_list = false, .free_list = NULL, @@ -620,6 +620,119 @@ void *arena_swapback_pop(mem_arena **arena_ptr, u64 index) { // mk swapback /* ============================================================ Utilities ============================================================ */ +typedef enum arena_trim_flags { + ARENA_TRIM_NONE = 0, + + ARENA_TRIM_SPACE = 1 << 0, + ARENA_TRIM_TAB = 1 << 1, + ARENA_TRIM_LF = 1 << 2, + ARENA_TRIM_CR = 1 << 3, + ARENA_TRIM_NUL = 1 << 4, + +} arena_trim_flags; + +u64 arena_trim_string(mem_arena **arena_ptr, char *str, u8 termination_flags) { + ASSERT(arena_ptr); + ASSERT(*arena_ptr); + ASSERT(str); + + if (!arena_ptr || !*arena_ptr || !str) { + return 0; + } + + mem_arena *arena = *arena_ptr; + + /* ------------------------------------------------------------ + Find owning block + ------------------------------------------------------------ */ + + mem_arena *owner = arena_block_from_ptr(arena, (u8 *)str); + + ASSERT(owner); + if (!owner) { + return 0; + } + + /* ------------------------------------------------------------ + Must be current block + ------------------------------------------------------------ */ + + if (owner != arena) { + fprintf(stderr, "arena_trim_string(): string is not " + "in current arena block.\n"); + return 0; + } + + /* 
------------------------------------------------------------ + Compute string position + ------------------------------------------------------------ */ + + u64 str_pos = arena_pos_from_ptr(arena, str); + + /* ------------------------------------------------------------ + Original reserved size + ------------------------------------------------------------ */ + + u64 allocated_size = arena->pos - str_pos; + + /* ------------------------------------------------------------ + Compute sizes + ------------------------------------------------------------ */ + + u64 str_size = strlen(str); + + char *dst = str + str_size; + u64 termination_size = 0; + + if (termination_flags & ARENA_TRIM_SPACE) { + *dst++ = ' '; + termination_size++; + } + + if (termination_flags & ARENA_TRIM_TAB) { + *dst++ = '\t'; + termination_size++; + } + + if (termination_flags & ARENA_TRIM_CR) { + *dst++ = '\r'; + termination_size++; + } + + if (termination_flags & ARENA_TRIM_LF) { + *dst++ = '\n'; + termination_size++; + } + + if (termination_flags & ARENA_TRIM_NUL) { + *dst++ = '\0'; + termination_size++; + } + /* ------------------------------------------------------------ + Final used size + ------------------------------------------------------------ */ + + u64 used_size = str_size + termination_size; + + used_size = ALIGN_UP_POW2(used_size, arena->align); + + /* ------------------------------------------------------------ + Overflow detection + ------------------------------------------------------------ */ + + if (used_size > allocated_size) { + fprintf(stderr, "arena_trim_string(): string overflow " + "detected.\n"); + } + + /* ------------------------------------------------------------ + Update arena position + ------------------------------------------------------------ */ + + arena->pos = str_pos + used_size; + + return used_size; +} void *arena_clear(mem_arena **arena_ptr) { // mk clear @@ -801,7 +914,7 @@ mem_arena_temp arena_scratch_get(mem_arena **conflicts, u32 num_conflicts) { 
arena_params params = { .reserve_size = MiB(64), .commit_size = MiB(1), - .align = ARENA_ALIGN, + .align = ARENA_CACHE_ALIGN, .push_size = 8, .allow_free_list = false, .allow_swapback = true, diff --git a/arena.h b/arena.h index a0ddb4d..9372115 100644 --- a/arena.h +++ b/arena.h @@ -239,7 +239,7 @@ void *arena_ptr_from_index(mem_arena *arena, u64 index); */ #define ARENA_HEADER_SIZE (sizeof(mem_arena)) -#define ARENA_ALIGN (sizeof(void *)) +#define ARENA_CACHE_ALIGN (sizeof(void *)) // arena config typedef enum arena_growth_policy { diff --git a/base.h b/base.h index a7ebdec..920f311 100644 --- a/base.h +++ b/base.h @@ -35,6 +35,7 @@ #include #include #include +#include #endif #include @@ -147,9 +148,6 @@ static void sleep_ms(int ms) { Sleep(ms); } #define _DEFAULT_SOURCE #endif -#include -#include - static u32 plat_get_pagesize(void) { return (u32)sysconf(_SC_PAGESIZE); } static void *plat_mem_reserve(u64 size) { diff --git a/build.bat b/build.bat index ccef9ce..8040633 100644 --- a/build.bat +++ b/build.bat @@ -1,6 +1,8 @@ @echo off setlocal enabledelayedexpansion +set PROJECT_NAME=dfin + :: ============================================================================ :: build.bat :: ============================================================================ @@ -45,7 +47,7 @@ exit /b 1 :main set BUILD_DIR=%SCRIPT_DIR%\build\windows\%BUILD_TYPE% -echo === Building filehasher (%BUILD_TYPE%) === +echo === Building %PROJECT_NAME% (%BUILD_TYPE%) === :: -------------------------------------------------------------------------- :: Clean if requested @@ -167,4 +169,4 @@ popd echo. 
echo === Build Complete === -echo Executable: %BUILD_DIR%\filehasher.exe \ No newline at end of file +echo Executable: %BUILD_DIR%\%PROJECT_NAME%.exe diff --git a/build.sh b/build.sh index 8321d80..e8593d4 100644 --- a/build.sh +++ b/build.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # ============================================================================ -# build.sh - Build script for filehasher (Linux) +# build.sh - Build script (Linux) # Usage: ./build.sh [Release|Debug] [clean] # # Compiler preference: gcc > clang @@ -9,6 +9,8 @@ set -euo pipefail +PROJECT_NAME="dfin" + # --------------------------------------------------------------------------- # Colors # --------------------------------------------------------------------------- @@ -55,7 +57,7 @@ done readonly BUILD_DIR="build/linux/${BUILD_TYPE}" readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -echo -e "${GREEN}=== Building filehasher (${BUILD_TYPE}) ===${NC}" +echo -e "${GREEN}=== Building ${PROJECT_NAME} (${BUILD_TYPE}) ===${NC}" echo "Project: ${SCRIPT_DIR}" # --------------------------------------------------------------------------- @@ -226,18 +228,18 @@ echo # --------------------------------------------------------------------------- cd "${SCRIPT_DIR}" -if [[ -f "${BUILD_DIR}/filehasher" ]]; then - echo -e "${GREEN}Executable: ${BUILD_DIR}/filehasher${NC}" +if [[ -f "${BUILD_DIR}/${PROJECT_NAME}" ]]; then + echo -e "${GREEN}Executable: ${BUILD_DIR}/${PROJECT_NAME}${NC}" if command -v file &> /dev/null; then - echo -e " Type: $(file -b ${BUILD_DIR}/filehasher)" + echo -e " Type: $(file -b ${BUILD_DIR}/${PROJECT_NAME})" fi if command -v du &> /dev/null; then - echo -e " Size: $(du -h ${BUILD_DIR}/filehasher | cut -f1)" + echo -e " Size: $(du -h ${BUILD_DIR}/${PROJECT_NAME} | cut -f1)" fi -elif [[ -f "${BUILD_DIR}/filehasher.exe" ]]; then - echo -e "${GREEN}Executable: ${BUILD_DIR}/filehasher.exe${NC}" +elif [[ -f "${BUILD_DIR}/${PROJECT_NAME}.exe" ]]; then + echo -e 
"${GREEN}Executable: ${BUILD_DIR}/${PROJECT_NAME}.exe${NC}" else echo -e "${YELLOW}Note: Could not locate executable${NC}" echo "Checking build directory:" @@ -269,4 +271,4 @@ if [[ "${EXPORT_COMPILE_COMMANDS}" == "ON" ]]; then fi echo -echo -e "${GREEN}Ready to run: ./${BUILD_DIR}/filehasher${NC}" +echo -e "${GREEN}Ready to run: ./${BUILD_DIR}/${PROJECT_NAME}${NC}" diff --git a/file_hasher.c b/duplicate_finder.c similarity index 90% rename from file_hasher.c rename to duplicate_finder.c index abb4f6d..4af6975 100644 --- a/file_hasher.c +++ b/duplicate_finder.c @@ -32,7 +32,11 @@ int main(int argc, char **argv) { buf[strcspn(buf, "\r\n")] = 0; if (buf[0] == 0) { - strcpy(folders[0], "."); + if (!platform_get_current_directory(folders[0], sizeof(folders[0]))) { + fprintf(stderr, "Failed to get current directory\n"); + return 1; + } + normalize_path(folders[0]); folder_count = 1; } else { folder_count = parse_paths(buf, folders, 64); @@ -71,7 +75,19 @@ int main(int argc, char **argv) { .max_nbre_blocks = 1, }; - mem_arena *gp_arena = arena_create(¶ms); + arena_params params_caligned = { + .reserve_size = GiB(1), + .commit_size = MiB(16), + .align = ARENA_CACHE_ALIGN, + .push_size = 0, + .allow_free_list = true, + .allow_swapback = false, + .growth_policy = ARENA_GROWTH_NORMAL, + .commit_policy = ARENA_COMMIT_LAZY, + .max_nbre_blocks = 1, + }; + + mem_arena *gp_arena = arena_create(¶ms_caligned); // ------------------------------- // Detect hardware @@ -119,7 +135,7 @@ int main(int argc, char **argv) { mpmc_init(&file_queue, MiB(1)); // Starting hash threads - WorkerContext workers[num_hash_threads]; + HasherContext workers[num_hash_threads]; Thread *hash_threads = arena_push(&gp_arena, sizeof(Thread) * num_hash_threads, true); @@ -155,8 +171,7 @@ int main(int argc, char **argv) { for (uint8_t i = 0; i < num_scan_threads; i++) { scanners[i].num_threads = num_scan_threads; - scanners[i].path_arena = arena_create(¶ms); - scanners[i].meta_arena = arena_create(¶ms); + 
scanners[i].meta_arena = arena_create(¶ms_caligned); scanners[i].dir_queue = &dir_queue; scanners[i].file_queue = &file_queue; @@ -170,7 +185,7 @@ int main(int argc, char **argv) { // Initial folder push for (int i = 0; i < folder_count; i++) { size_t len = strlen(folders[i]) + 1; - char *path = arena_push(&scanners[0].path_arena, len, false); + char *path = arena_push(&scanners[0].meta_arena, len, false); memcpy(path, folders[i], len); mpmc_push_work(&dir_queue, path); } diff --git a/mt_mpmc.h b/mt_mpmc.h index 8d1fc1e..4eec886 100644 --- a/mt_mpmc.h +++ b/mt_mpmc.h @@ -1,4 +1,4 @@ -#pragma once +#pragma once #include "base.h" @@ -214,19 +214,27 @@ static void mpmc_producers_finished(MPMCQueue *q, u8 consumer_count) { /* Done */ /* ----------------------------------------------------------- */ static void mpmc_task_done(MPMCQueue *q, u8 consumer_count) { + + bool finished = false; + mtx_lock(&q->lock); if (--q->work_count == 0) { - mpmc_producers_finished(q, consumer_count); + finished = true; } mtx_unlock(&q->lock); + + if (finished) { + mpmc_producers_finished(q, consumer_count); + } } /* ----------------------------------------------------------- */ /* MPMC Cleanup */ /* ----------------------------------------------------------- */ -// static void mpmc_finish(MPMCQueue *q) { // Comment to prevent warning: unused function +// static void mpmc_finish(MPMCQueue *q) { // Comment to prevent warning: unused +// function // if (!q) return; // // if (q->slots) { diff --git a/platform.c b/platform.c index b1e5435..559a0d7 100644 --- a/platform.c +++ b/platform.c @@ -1,8 +1,8 @@ -#pragma once +#pragma once #include "arena.h" #include "base.h" -#include "sm_mpmc.h" +#include "mt_mpmc.h" #include "arena.c" #include @@ -175,7 +175,6 @@ static void os_file_close(FileHandle handle) { close(handle); } // -------------------- Thread abstraction ------------------- // Threads context typedef struct { - mem_arena *path_arena; mem_arena *meta_arena; MPMCQueue *dir_queue; @@ 
-187,7 +186,7 @@ typedef struct { typedef struct { mem_arena *arena; MPMCQueue *file_queue; -} WorkerContext; +} HasherContext; #if defined(_WIN32) || defined(_WIN64) typedef HANDLE ThreadHandle; @@ -284,7 +283,7 @@ static int thread_wait_multiple(Thread *threads, size_t count) { #endif // ======================== Get file metadata ======================== -// -------------------- Path parsing ------------------- +// -------------------- Path helpers ------------------- static void normalize_path(char *p) { char *src = p; char *dst = p; @@ -353,6 +352,64 @@ static int parse_paths(char *line, char folders[][MAX_PATHLEN], return count; } +#if defined(_WIN32) || defined(_WIN64) +bool platform_get_current_directory(char *buffer, size_t size) { + + DWORD len = GetCurrentDirectoryA((DWORD)size, buffer); + + if (len == 0 || len >= size) + return false; + + return true; +} + +#elif defined(__linux__) +bool platform_get_current_directory(char *buffer, size_t size) { + return getcwd(buffer, size) != NULL; +} +#endif + +typedef struct { + char buffer[MAX_PATHLEN]; + char *base_end; // Points to end of base path + char *filename_pos; // Points to where filename should be written + size_t base_len; +} PathBuilder; + +static void path_builder_init(PathBuilder *pb, const char *base) { + pb->base_len = strlen(base); + memcpy(pb->buffer, base, pb->base_len); + pb->base_end = pb->buffer + pb->base_len; + + // Only add separator if not already present + if (pb->base_len > 0 && *(pb->base_end - 1) != '/') { + *pb->base_end = '/'; + pb->base_end++; + pb->base_len++; + } + + // Ensure null termination + *pb->base_end = '\0'; + pb->filename_pos = pb->base_end; +} + +static void path_builder_set_filename(PathBuilder *pb, const char *filename, + size_t name_len) { + memcpy(pb->filename_pos, filename, name_len); + pb->filename_pos[name_len] = '\0'; // Ensure null termination +} + +static char *path_builder_dup_arena(PathBuilder *pb, mem_arena *arena, + bool zero) { + // Calculate total 
length including base + separator + filename + null + // terminator + size_t total_len = + (pb->filename_pos - pb->buffer) + strlen(pb->filename_pos) + 1; + char *dup = arena_push(&arena, total_len, zero); + memcpy(dup, pb->buffer, total_len); + return dup; +} + // ------------------------- File time ------------------------- #if FILE_TIMES #if defined(_WIN32) || defined(_WIN64) @@ -614,7 +671,7 @@ typedef struct FileEntry { uint64_t modified_time; // epoch seconds #endif #if FILE_OWNER - char owner[128]; // resolved owner name + char *owner; // resolved owner name #endif #if CHECK_FILE_SYSTEM // Linux only @@ -622,46 +679,6 @@ typedef struct FileEntry { #endif } FileEntry; -typedef struct { - char buffer[MAX_PATHLEN]; - char *base_end; // Points to end of base path - char *filename_pos; // Points to where filename should be written - size_t base_len; -} PathBuilder; - -static void path_builder_init(PathBuilder *pb, const char *base) { - pb->base_len = strlen(base); - memcpy(pb->buffer, base, pb->base_len); - pb->base_end = pb->buffer + pb->base_len; - -#if defined(_WIN32) || defined(_WIN64) - *pb->base_end = '\\'; -#elif defined(__linux__) - *pb->base_end = '/'; -#endif - - // Ensure null termination - *(pb->base_end + 1) = '\0'; - pb->filename_pos = pb->base_end + 1; -} - -static void path_builder_set_filename(PathBuilder *pb, const char *filename, - size_t name_len) { - memcpy(pb->filename_pos, filename, name_len); - pb->filename_pos[name_len] = '\0'; // Ensure null termination -} - -static char *path_builder_dup_arena(PathBuilder *pb, mem_arena *arena, - bool zero) { - // Calculate total length including base + separator + filename + null - // terminator - size_t total_len = - (pb->filename_pos - pb->buffer) + strlen(pb->filename_pos) + 1; - char *dup = arena_push(&arena, total_len, zero); - memcpy(dup, pb->buffer, total_len); - return dup; -} - #if defined(_WIN32) || defined(_WIN64) void scan_folder(const char *base, ScannerContext *ctx) { PathBuilder pb; @@ 
-669,7 +686,7 @@ void scan_folder(const char *base, ScannerContext *ctx) { char search[MAX_PATHLEN]; memcpy(search, pb.buffer, pb.base_len + 1); // Copy base + separator - memcpy(search + pb.base_len + 1, "*", 2); // Add "*" and null + memcpy(search + pb.base_len, "*", 2); // Add "*" and null WIN32_FIND_DATAA fd; HANDLE h = FindFirstFileA(search, &fd); @@ -691,29 +708,27 @@ void scan_folder(const char *base, ScannerContext *ctx) { // If it's a directory: if (fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) { - char *dir = path_builder_dup_arena(&pb, ctx->path_arena, false); + char *dir = path_builder_dup_arena(&pb, ctx->meta_arena, false); mpmc_push_work(ctx->dir_queue, dir); } else { // else a file: atomic_fetch_add(&g_files_found, 1); - FileEntry *fe = arena_push(&ctx->meta_arena, sizeof(FileEntry), true); + FileEntry *fe = arena_push(&ctx->meta_arena, sizeof(FileEntry), false); + fe->path = arena_push(&ctx->meta_arena, MAX_PATHLEN, false); - // Create a temporary copy for normalization to avoid corrupting pb.buffer - char temp_path[MAX_PATHLEN]; - memcpy(temp_path, pb.buffer, - (pb.filename_pos - pb.buffer) + name_len + 1); - normalize_path(temp_path); + memcpy(fe->path, pb.buffer, (pb.filename_pos - pb.buffer) + name_len + 1); - fe->path = arena_push(&ctx->path_arena, strlen(temp_path) + 1, false); - strcpy(fe->path, temp_path); + arena_trim_string(&ctx->meta_arena, fe->path, ARENA_TRIM_NUL); #if FILE_TIMES platform_get_file_times(pb.buffer, &fe->created_time, &fe->modified_time); #endif #if FILE_OWNER - platform_get_file_owner(pb.buffer, fe->owner, sizeof(fe->owner)); + fe->owner = arena_push(&ctx->meta_arena, MAX_PATHLEN, false); + platform_get_file_owner(pb.buffer, fe->owner, MAX_PATHLEN); + arena_trim_string(&ctx->meta_arena, fe->owner, ARENA_TRIM_NUL); #endif fe->size_bytes = ((uint64_t)fd.nFileSizeHigh << 32) | fd.nFileSizeLow; @@ -772,14 +787,20 @@ void scan_folder(const char *base, ScannerContext *ctx) { continue; // Skip symlinks if (file_type == 
DT_DIR) { - char *dir_path = path_builder_dup_arena(&pb, ctx->path_arena, false); + char *dir_path = path_builder_dup_arena(&pb, ctx->meta_arena, false); mpmc_push_work(ctx->dir_queue, dir_path); continue; } if (file_type == DT_REG) { atomic_fetch_add(&g_files_found, 1); - FileEntry *fe = arena_push(&ctx->meta_arena, sizeof(FileEntry), true); + FileEntry *fe = arena_push(&ctx->meta_arena, sizeof(FileEntry), false); + fe->path = arena_push(&ctx->meta_arena, MAX_PATHLEN, false); + + memcpy(fe->path, pb.buffer, + (pb.filename_pos - pb.buffer) + name_len + 1); + + arena_trim_string(&ctx->meta_arena, fe->path, ARENA_TRIM_NUL); // Use fstatat for file info struct stat st; @@ -790,21 +811,13 @@ void scan_folder(const char *base, ScannerContext *ctx) { #endif #if FILE_OWNER - platform_get_file_owner_fd(dir_fd, entry->d_name, fe->owner, - sizeof(fe->owner)); + fe->owner = arena_push(&ctx->meta_arena, MAX_PATHLEN, false); + platform_get_file_owner_fd(dir_fd, pb.buffer, fe->owner, MAX_PATHLEN); + arena_trim_string(&ctx->meta_arena, fe->owner, ARENA_TRIM_NUL); #endif fe->size_bytes = (uint64_t)st.st_size; - // Normalize path - char temp_path[MAX_PATHLEN]; - memcpy(temp_path, pb.buffer, - (pb.filename_pos - pb.buffer) + name_len + 1); - normalize_path(temp_path); - - fe->path = arena_push(&ctx->path_arena, strlen(temp_path) + 1, false); - strcpy(fe->path, temp_path); - #if CHECK_FILE_SYSTEM fe->fs_type = fs_type; #endif @@ -822,11 +835,17 @@ void scan_folder(const char *base, ScannerContext *ctx) { continue; if (S_ISDIR(st.st_mode)) { - char *dir_path = path_builder_dup_arena(&pb, ctx->path_arena, false); + char *dir_path = path_builder_dup_arena(&pb, ctx->meta_arena, false); mpmc_push_work(ctx->dir_queue, dir_path); } else if (S_ISREG(st.st_mode)) { atomic_fetch_add(&g_files_found, 1); - FileEntry *fe = arena_push(&ctx->meta_arena, sizeof(FileEntry), true); + FileEntry *fe = arena_push(&ctx->meta_arena, sizeof(FileEntry), false); + fe->path = arena_push(&ctx->meta_arena, 
MAX_PATHLEN, false); + + memcpy(fe->path, pb.buffer, + (pb.filename_pos - pb.buffer) + name_len + 1); + + arena_trim_string(&ctx->meta_arena, fe->path, ARENA_TRIM_NUL); #if FILE_TIMES platform_get_file_times(pb.buffer, &fe->created_time, @@ -834,19 +853,13 @@ void scan_folder(const char *base, ScannerContext *ctx) { #endif #if FILE_OWNER - platform_get_file_owner(pb.buffer, fe->owner, sizeof(fe->owner)); + fe->owner = arena_push(&ctx->meta_arena, MAX_PATHLEN, false); + platform_get_file_owner(pb.buffer, fe->owner, MAX_PATHLEN); + arena_trim_string(&ctx->meta_arena, fe->owner, ARENA_TRIM_NUL); #endif fe->size_bytes = (uint64_t)st.st_size; - char temp_path[MAX_PATHLEN]; - memcpy(temp_path, pb.buffer, - (pb.filename_pos - pb.buffer) + name_len + 1); - normalize_path(temp_path); - - fe->path = arena_push(&ctx->path_arena, strlen(temp_path) + 1, false); - strcpy(fe->path, temp_path); - #if CHECK_FILE_SYSTEM fe->fs_type = fs_type; #endif @@ -906,7 +919,7 @@ static void xxh3_hash_file_stream(const char *path, char *out_hex, // ------------------------- Hash worker -------------------------------- static THREAD_RETURN hash_worker(void *arg) { - WorkerContext *ctx = (WorkerContext *)arg; + HasherContext *ctx = (HasherContext *)arg; void *buf = malloc(READ_BLOCK); for (;;) { @@ -914,35 +927,54 @@ static THREAD_RETURN hash_worker(void *arg) { if (!fe) break; - char hash[HASH_STRLEN]; + // Hash + char *hash = arena_push(&ctx->arena, HASH_STRLEN, false); xxh3_hash_file_stream(fe->path, hash, buf); + arena_trim_string(&ctx->arena, hash, ARENA_TRIM_TAB); + // Path + u64 path_len = strlen(fe->path) + 1; + char *path = arena_push(&ctx->arena, path_len, ARENA_TRIM_TAB); + memcpy(path, fe->path, path_len); + arena_trim_string(&ctx->arena, path, ARENA_TRIM_TAB); + + // Size double size_kib = (double)fe->size_bytes / 1024.0; - char stack_buf[KiB(4)]; - int len; + char *size = arena_push(&ctx->arena, 32, false); + snprintf(size, 32, "%.2f", size_kib); + arena_trim_string(&ctx->arena, 
size, ARENA_TRIM_NONE); + + // Times + char *separator; #if FILE_TIMES - char created[32], modified[32]; - format_time(fe->created_time, created, sizeof(created)); - format_time(fe->modified_time, modified, sizeof(modified)); + separator = arena_push(&ctx->arena, 1, false); + *separator = '\t'; + + u64 time_size = 32; + + char *created = arena_push(&ctx->arena, time_size, false); + format_time(fe->created_time, created, time_size); + arena_trim_string(&ctx->arena, created, ARENA_TRIM_TAB); + + char *modified = arena_push(&ctx->arena, time_size, false); + format_time(fe->modified_time, modified, time_size); + arena_trim_string(&ctx->arena, modified, ARENA_TRIM_NONE); #endif -#if FILE_TIMES && FILE_OWNER - len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\t%s\t%s\t%s\n", - hash, fe->path, size_kib, created, modified, fe->owner); -#elif FILE_TIMES - len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\t%s\t%s\n", hash, - fe->path, size_kib, created, modified); -#elif FILE_OWNER - len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\t%s\n", hash, - fe->path, size_kib, fe->owner); -#else - len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\n", hash, - fe->path, size_kib); + // Owner +#if FILE_OWNER + separator = arena_push(&ctx->arena, 1, false); + *separator = '\t'; + + char *owner = arena_push(&ctx->arena, 128, false); + memcpy(owner, fe->owner, 128); + arena_trim_string(&ctx->arena, owner, ARENA_TRIM_NONE); #endif - char *dst = arena_push(&ctx->arena, len, false); - memcpy(dst, stack_buf, len); + // Final newline + separator = arena_push(&ctx->arena, 1, false); + *separator = '\n'; atomic_fetch_add(&g_files_hashed, 1); } @@ -1576,8 +1608,8 @@ static int ioring_submit(ThreadIoContext *thread_ctx, uint32_t *submitted) { static void ioring_process_completions(ThreadIoContext *thread_ctx) { struct io_uring_cqe *cqes[NUM_BUFFERS_PER_THREAD]; - unsigned cqe_count = io_uring_peek_batch_cqe(&((IoUring *)thread_ctx->ring)->ring, - cqes, 
NUM_BUFFERS_PER_THREAD); + unsigned cqe_count = io_uring_peek_batch_cqe( + &((IoUring *)thread_ctx->ring)->ring, cqes, NUM_BUFFERS_PER_THREAD); if (cqe_count == 0) { return; @@ -1788,7 +1820,8 @@ static IoBuffer *get_free_buffer(ThreadIoContext *restrict thread_ctx) { return buf; } -static void return_buffer(ThreadIoContext *restrict thread_ctx, IoBuffer *restrict buf) { +static void return_buffer(ThreadIoContext *restrict thread_ctx, + IoBuffer *restrict buf) { if (!buf) return; @@ -1796,8 +1829,8 @@ static void return_buffer(ThreadIoContext *restrict thread_ctx, IoBuffer *restri } // -------------------- File operations ----------------------- -static int init_file(ThreadIoContext *restrict thread_ctx, FileReadContext *restrict file, - FileEntry *restrict fe) { +static int init_file(ThreadIoContext *restrict thread_ctx, + FileReadContext *restrict file, FileEntry *restrict fe) { #if USE_REGISTERED_FILES uint32_t saved_slot_id = file->slot_id; @@ -1835,13 +1868,14 @@ static int init_file(ThreadIoContext *restrict thread_ctx, FileReadContext *rest } static void finalize_file(ThreadIoContext *restrict thread_ctx, - WorkerContext *worker_ctx, FileReadContext *restrict file) { + HasherContext *worker_ctx, + FileReadContext *restrict file) { FileEntry *restrict fe = file->fe; os_file_close(file->file_handle); - char hash[HASH_STRLEN]; + char *hash = arena_push(&worker_ctx->arena, HASH_STRLEN, false); if (file->bytes_hashed == file->file_size) { if (file->use_incremental_hash) { @@ -1863,40 +1897,56 @@ static void finalize_file(ThreadIoContext *restrict thread_ctx, atomic_fetch_add(&g_io_ring_fallbacks, 1); xxh3_hash_file_stream(fe->path, hash, thread_ctx->fallback_buffer); } + arena_trim_string(&worker_ctx->arena, hash, ARENA_TRIM_TAB); + // Path + u64 path_len = strlen(fe->path) + 1; + char *path = arena_push(&worker_ctx->arena, path_len, ARENA_TRIM_TAB); + memcpy(path, fe->path, path_len); + arena_trim_string(&worker_ctx->arena, path, ARENA_TRIM_TAB); + + // Size 
double size_kib = (double)fe->size_bytes / 1024.0; - char stack_buf[KiB(4)]; - int len; + char *size = arena_push(&worker_ctx->arena, 32, false); + snprintf(size, 32, "%.2f", size_kib); + arena_trim_string(&worker_ctx->arena, size, ARENA_TRIM_NONE); + // Time + char *separator; #if FILE_TIMES - char created[32], modified[32]; - format_time(fe->created_time, created, sizeof(created)); - format_time(fe->modified_time, modified, sizeof(modified)); + separator = arena_push(&worker_ctx->arena, 1, false); + *separator = '\t'; + + u64 time_size = 32; + char *created = arena_push(&worker_ctx->arena, time_size, false); + format_time(fe->created_time, created, time_size); + arena_trim_string(&worker_ctx->arena, created, ARENA_TRIM_TAB); + + char *modified = arena_push(&worker_ctx->arena, time_size, false); + format_time(fe->modified_time, modified, time_size); + arena_trim_string(&worker_ctx->arena, modified, ARENA_TRIM_NONE); #endif -#if FILE_TIMES && FILE_OWNER - len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\t%s\t%s\t%s\n", - hash, fe->path, size_kib, created, modified, fe->owner); -#elif FILE_TIMES - len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\t%s\t%s\n", hash, - fe->path, size_kib, created, modified); -#elif FILE_OWNER - len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\t%s\n", hash, - fe->path, size_kib, fe->owner); -#else - len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\n", hash, fe->path, - size_kib); + // Owner +#if FILE_OWNER + separator = arena_push(&worker_ctx->arena, 1, false); + *separator = '\t'; + + char *owner = arena_push(&worker_ctx->arena, 128, false); + memcpy(owner, fe->owner, 128); + arena_trim_string(&worker_ctx->arena, owner, ARENA_TRIM_NONE); #endif - char *restrict dst = arena_push(&worker_ctx->arena, len, false); - memcpy(dst, stack_buf, len); + separator = arena_push(&worker_ctx->arena, 1, false); + *separator = '\n'; atomic_fetch_add(&g_files_hashed, 1); } // -------------------- Hash files 
----------------------- -static void hash_ready_files(ThreadIoContext *restrict thread_ctx, FileQueue *restrict fq, - WorkerContext *worker_ctx) { +static void hash_ready_files(ThreadIoContext *restrict thread_ctx, + FileQueue *restrict fq, + HasherContext *worker_ctx) { for (int i = 0; i < fq->count; i++) { @@ -1962,8 +2012,9 @@ static void hash_ready_files(ThreadIoContext *restrict thread_ctx, FileQueue *re } // ------------------ Build pending reads ---------------------- -static void build_pending_reads(ThreadIoContext *restrict thread_ctx, FileQueue *restrict fq, - WorkerContext *worker_ctx) { +static void build_pending_reads(ThreadIoContext *restrict thread_ctx, + FileQueue *restrict fq, + HasherContext *worker_ctx) { MPMCQueue *file_queue = worker_ctx->file_queue; @@ -2049,7 +2100,7 @@ static void build_pending_reads(ThreadIoContext *restrict thread_ctx, FileQueue // -------------------------- Hash worker I/O Ring --------------------------- static THREAD_RETURN hash_worker_ioring(void *arg) { - WorkerContext *worker_ctx = (WorkerContext *)arg; + HasherContext *worker_ctx = (HasherContext *)arg; // Init IO ring ThreadIoContext *thread_ctx = ioring_init_thread();