2 Commits

Author SHA1 Message Date
0029179fc0 Replacing printf, putchar and snprintf in the hot path with custom functions
Replacing snprintf(), localtime() and strftime() with custom formatting
functions

Reworking progress_thread(): instead of composing the progress output
with multiple printf() and putchar() calls, we compose it in a buffer and
write it at once using WriteConsole() on Windows and write() on Linux
2026-05-22 18:48:33 +01:00
f37e915489 Minor optimisations and bug fixes
Fix bug in mt_mpmc.c: on Linux, mutexes are not recursive.
Add arena_trim_string() to the arena API
Removing arena->path, now paths are pushed to arena->metadata
Replacing fe->owner[128] with char *owner; the owner is now pushed as a
string to arena->metadata and trimmed with arena_trim_string()
Improving cache locality in arena->metadata; the memory layout is now
fe; fe->path; fe->owner.
Cache aligning all arenas except HasherContext->arena to sizeof(void *).
Pushing elements one by one instead of snprintf() in finalize_file() and
hash_worker().
Getting the full path of current directory instead of "."
Fixing bug in path formatting; this allows us to remove normalize_path()
from the hot loop.
2026-05-08 20:06:48 +01:00
9 changed files with 713 additions and 230 deletions

View File

@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.20) cmake_minimum_required(VERSION 3.20)
project(filehasher project(dfin
VERSION 1.0.0 VERSION 1.0.0
DESCRIPTION "High-performance file hasher with I/O Ring/io_uring support" DESCRIPTION "High-performance duplicate finder with I/O Ring/io_uring support"
LANGUAGES C LANGUAGES C
) )
@@ -106,7 +106,7 @@ endif()
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
set(SOURCES set(SOURCES
file_hasher.c duplicate_finder.c
xxhash.c xxhash.c
xxh_x86dispatch.c xxh_x86dispatch.c
) )
@@ -116,7 +116,7 @@ set(HEADERS
arena.h arena.h
base.h base.h
xxhash.h xxhash.h
lf_mpmc.h mt_mpmc.h
) )
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -281,4 +281,4 @@ message(STATUS " Compiler: ${CMAKE_C_COMPILER}")
message(STATUS " Build Type: ${CMAKE_BUILD_TYPE}") message(STATUS " Build Type: ${CMAKE_BUILD_TYPE}")
message(STATUS " Generator: ${CMAKE_GENERATOR}") message(STATUS " Generator: ${CMAKE_GENERATOR}")
message(STATUS " Platform: ${PLATFORM_NAME}") message(STATUS " Platform: ${PLATFORM_NAME}")
message(STATUS "----------------------------------------") message(STATUS "----------------------------------------")

117
arena.c
View File

@@ -196,7 +196,7 @@ mem_arena *arena_create(arena_params *params) { // mk create
arena->free_list = arena_create(&(arena_params){ arena->free_list = arena_create(&(arena_params){
.reserve_size = MiB(1), .reserve_size = MiB(1),
.commit_size = MiB(1), .commit_size = MiB(1),
.align = ARENA_ALIGN, .align = ARENA_CACHE_ALIGN,
.push_size = sizeof(arena_free_node), .push_size = sizeof(arena_free_node),
.allow_free_list = false, .allow_free_list = false,
.free_list = NULL, .free_list = NULL,
@@ -620,6 +620,119 @@ void *arena_swapback_pop(mem_arena **arena_ptr, u64 index) { // mk swapback
/* ============================================================ /* ============================================================
Utilities Utilities
============================================================ */ ============================================================ */
/*
 * Bit flags accepted by arena_trim_string() in its `termination_flags`
 * argument. Despite the TRIM prefix, each flag selects one terminator byte
 * that arena_trim_string() appends after the string; flags may be OR-ed
 * together.
 */
typedef enum arena_trim_flags {
  ARENA_TRIM_NONE = 0x00,  /* append nothing            */
  ARENA_TRIM_SPACE = 0x01, /* append a space ' '        */
  ARENA_TRIM_TAB = 0x02,   /* append a tab '\t'         */
  ARENA_TRIM_LF = 0x04,    /* append a line feed '\n'   */
  ARENA_TRIM_CR = 0x08,    /* append a carriage return  */
  ARENA_TRIM_NUL = 0x10,   /* append a NUL byte '\0'    */
} arena_trim_flags;
/*
 * arena_trim_string()
 *
 * Shrinks the tail of the current arena block so that it ends right after
 * `str` plus the requested terminator bytes.
 *
 * Everything between `str` and the current arena position is treated as the
 * string's allocation. The string length is measured with strlen(), the
 * terminators selected by `termination_flags` (arena_trim_flags bits) are
 * written starting at the string's NUL byte, and arena->pos is moved back to
 * the end of the used bytes, rounded up to arena->align.
 *
 * Terminators are emitted in a fixed order: space, tab, CR, LF, NUL.
 * Unless ARENA_TRIM_NUL is requested, the original NUL byte is overwritten
 * and the result is no longer a C string.
 *
 * Parameters:
 *   arena_ptr          - address of the arena handle; must be non-NULL and
 *                        point to a non-NULL arena.
 *   str                - NUL-terminated string that lives in the current
 *                        arena block.
 *   termination_flags  - OR-ed arena_trim_flags bits.
 *
 * Returns the number of bytes now reserved for the string (string length +
 * terminators, aligned, never more than the original span). Returns 0 on
 * invalid arguments, when `str` is not in the current block, or when the
 * string plus terminators would not fit in the original span; in those
 * cases the arena is left untouched. Note that 0 is also the legitimate
 * result for an empty string with no terminators.
 *
 * NOTE(review): strlen() assumes the NUL byte lies inside the allocation;
 * a caller that already overran its buffer cannot be detected here.
 */
u64 arena_trim_string(mem_arena **arena_ptr, char *str, u8 termination_flags) {
  ASSERT(arena_ptr);
  ASSERT(*arena_ptr);
  ASSERT(str);

  if (!arena_ptr || !*arena_ptr || !str) {
    return 0;
  }

  mem_arena *arena = *arena_ptr;

  /* Find the block that owns the string. */
  mem_arena *owner = arena_block_from_ptr(arena, (u8 *)str);

  ASSERT(owner);

  if (!owner) {
    return 0;
  }

  /* Only the current block can have its position moved. */
  if (owner != arena) {
    fprintf(stderr, "arena_trim_string(): string is not "
                    "in current arena block.\n");
    return 0;
  }

  /* Span originally held by the string: from its start to arena->pos. */
  u64 str_pos = arena_pos_from_ptr(arena, str);
  u64 allocated_size = arena->pos - str_pos;

  u64 str_size = strlen(str);

  /* Count the terminators first so the bounds check can run before any
     byte is written. */
  u64 termination_size = 0;

  if (termination_flags & ARENA_TRIM_SPACE) {
    termination_size++;
  }
  if (termination_flags & ARENA_TRIM_TAB) {
    termination_size++;
  }
  if (termination_flags & ARENA_TRIM_CR) {
    termination_size++;
  }
  if (termination_flags & ARENA_TRIM_LF) {
    termination_size++;
  }
  if (termination_flags & ARENA_TRIM_NUL) {
    termination_size++;
  }

  /* Refuse to write past the original span; leave the arena unchanged. */
  if (str_size + termination_size > allocated_size) {
    fprintf(stderr, "arena_trim_string(): string overflow "
                    "detected.\n");
    return 0;
  }

  /* Append the terminators, starting on top of the string's NUL byte. */
  char *dst = str + str_size;

  if (termination_flags & ARENA_TRIM_SPACE) {
    *dst++ = ' ';
  }
  if (termination_flags & ARENA_TRIM_TAB) {
    *dst++ = '\t';
  }
  if (termination_flags & ARENA_TRIM_CR) {
    *dst++ = '\r';
  }
  if (termination_flags & ARENA_TRIM_LF) {
    *dst++ = '\n';
  }
  if (termination_flags & ARENA_TRIM_NUL) {
    *dst++ = '\0';
  }

  /* Final used size, rounded up to the arena alignment. */
  u64 used_size = str_size + termination_size;
  used_size = ALIGN_UP_POW2(used_size, arena->align);

  /* Trimming must never move the arena position forward: if alignment
     padding would exceed the original span, keep the original end. */
  if (used_size > allocated_size) {
    used_size = allocated_size;
  }

  arena->pos = str_pos + used_size;

  return used_size;
}
void *arena_clear(mem_arena **arena_ptr) { // mk clear void *arena_clear(mem_arena **arena_ptr) { // mk clear
@@ -801,7 +914,7 @@ mem_arena_temp arena_scratch_get(mem_arena **conflicts, u32 num_conflicts) {
arena_params params = { arena_params params = {
.reserve_size = MiB(64), .reserve_size = MiB(64),
.commit_size = MiB(1), .commit_size = MiB(1),
.align = ARENA_ALIGN, .align = ARENA_CACHE_ALIGN,
.push_size = 8, .push_size = 8,
.allow_free_list = false, .allow_free_list = false,
.allow_swapback = true, .allow_swapback = true,

View File

@@ -239,7 +239,7 @@ void *arena_ptr_from_index(mem_arena *arena, u64 index);
*/ */
#define ARENA_HEADER_SIZE (sizeof(mem_arena)) #define ARENA_HEADER_SIZE (sizeof(mem_arena))
#define ARENA_ALIGN (sizeof(void *)) #define ARENA_CACHE_ALIGN (sizeof(void *))
// arena config // arena config
typedef enum arena_growth_policy { typedef enum arena_growth_policy {

4
base.h
View File

@@ -35,6 +35,7 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/vfs.h> #include <sys/vfs.h>
#include <unistd.h> #include <unistd.h>
#include <sys/mman.h>
#endif #endif
#include <assert.h> #include <assert.h>
@@ -147,9 +148,6 @@ static void sleep_ms(int ms) { Sleep(ms); }
#define _DEFAULT_SOURCE #define _DEFAULT_SOURCE
#endif #endif
#include <sys/mman.h>
#include <unistd.h>
static u32 plat_get_pagesize(void) { return (u32)sysconf(_SC_PAGESIZE); } static u32 plat_get_pagesize(void) { return (u32)sysconf(_SC_PAGESIZE); }
static void *plat_mem_reserve(u64 size) { static void *plat_mem_reserve(u64 size) {

View File

@@ -1,6 +1,8 @@
@echo off @echo off
setlocal enabledelayedexpansion setlocal enabledelayedexpansion
set PROJECT_NAME=dfin
:: ============================================================================ :: ============================================================================
:: build.bat :: build.bat
:: ============================================================================ :: ============================================================================
@@ -45,7 +47,7 @@ exit /b 1
:main :main
set BUILD_DIR=%SCRIPT_DIR%\build\windows\%BUILD_TYPE% set BUILD_DIR=%SCRIPT_DIR%\build\windows\%BUILD_TYPE%
echo === Building filehasher (%BUILD_TYPE%) === echo === Building %PROJECT_NAME% (%BUILD_TYPE%) ===
:: -------------------------------------------------------------------------- :: --------------------------------------------------------------------------
:: Clean if requested :: Clean if requested
@@ -167,4 +169,4 @@ popd
echo. echo.
echo === Build Complete === echo === Build Complete ===
echo Executable: %BUILD_DIR%\filehasher.exe echo Executable: %BUILD_DIR%\%PROJECT_NAME%.exe

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# ============================================================================ # ============================================================================
# build.sh - Build script for filehasher (Linux) # build.sh - Build script (Linux)
# Usage: ./build.sh [Release|Debug] [clean] # Usage: ./build.sh [Release|Debug] [clean]
# #
# Compiler preference: gcc > clang # Compiler preference: gcc > clang
@@ -9,6 +9,8 @@
set -euo pipefail set -euo pipefail
PROJECT_NAME="dfin"
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Colors # Colors
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -55,7 +57,7 @@ done
readonly BUILD_DIR="build/linux/${BUILD_TYPE}" readonly BUILD_DIR="build/linux/${BUILD_TYPE}"
readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo -e "${GREEN}=== Building filehasher (${BUILD_TYPE}) ===${NC}" echo -e "${GREEN}=== Building ${PROJECT_NAME} (${BUILD_TYPE}) ===${NC}"
echo "Project: ${SCRIPT_DIR}" echo "Project: ${SCRIPT_DIR}"
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -226,18 +228,18 @@ echo
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
cd "${SCRIPT_DIR}" cd "${SCRIPT_DIR}"
if [[ -f "${BUILD_DIR}/filehasher" ]]; then if [[ -f "${BUILD_DIR}/${PROJECT_NAME}" ]]; then
echo -e "${GREEN}Executable: ${BUILD_DIR}/filehasher${NC}" echo -e "${GREEN}Executable: ${BUILD_DIR}/${PROJECT_NAME}${NC}"
if command -v file &> /dev/null; then if command -v file &> /dev/null; then
echo -e " Type: $(file -b ${BUILD_DIR}/filehasher)" echo -e " Type: $(file -b ${BUILD_DIR}/${PROJECT_NAME})"
fi fi
if command -v du &> /dev/null; then if command -v du &> /dev/null; then
echo -e " Size: $(du -h ${BUILD_DIR}/filehasher | cut -f1)" echo -e " Size: $(du -h ${BUILD_DIR}/${PROJECT_NAME} | cut -f1)"
fi fi
elif [[ -f "${BUILD_DIR}/filehasher.exe" ]]; then elif [[ -f "${BUILD_DIR}/${PROJECT_NAME}.exe" ]]; then
echo -e "${GREEN}Executable: ${BUILD_DIR}/filehasher.exe${NC}" echo -e "${GREEN}Executable: ${BUILD_DIR}/${PROJECT_NAME}.exe${NC}"
else else
echo -e "${YELLOW}Note: Could not locate executable${NC}" echo -e "${YELLOW}Note: Could not locate executable${NC}"
echo "Checking build directory:" echo "Checking build directory:"
@@ -269,4 +271,4 @@ if [[ "${EXPORT_COMPILE_COMMANDS}" == "ON" ]]; then
fi fi
echo echo
echo -e "${GREEN}Ready to run: ./${BUILD_DIR}/filehasher${NC}" echo -e "${GREEN}Ready to run: ./${BUILD_DIR}/${PROJECT_NAME}${NC}"

View File

@@ -32,7 +32,11 @@ int main(int argc, char **argv) {
buf[strcspn(buf, "\r\n")] = 0; buf[strcspn(buf, "\r\n")] = 0;
if (buf[0] == 0) { if (buf[0] == 0) {
strcpy(folders[0], "."); if (!platform_get_current_directory(folders[0], sizeof(folders[0]))) {
fprintf(stderr, "Failed to get current directory\n");
return 1;
}
normalize_path(folders[0]);
folder_count = 1; folder_count = 1;
} else { } else {
folder_count = parse_paths(buf, folders, 64); folder_count = parse_paths(buf, folders, 64);
@@ -71,7 +75,19 @@ int main(int argc, char **argv) {
.max_nbre_blocks = 1, .max_nbre_blocks = 1,
}; };
mem_arena *gp_arena = arena_create(&params); arena_params params_caligned = {
.reserve_size = GiB(1),
.commit_size = MiB(16),
.align = ARENA_CACHE_ALIGN,
.push_size = 0,
.allow_free_list = true,
.allow_swapback = false,
.growth_policy = ARENA_GROWTH_NORMAL,
.commit_policy = ARENA_COMMIT_LAZY,
.max_nbre_blocks = 1,
};
mem_arena *gp_arena = arena_create(&params_caligned);
// ------------------------------- // -------------------------------
// Detect hardware // Detect hardware
@@ -119,7 +135,7 @@ int main(int argc, char **argv) {
mpmc_init(&file_queue, MiB(1)); mpmc_init(&file_queue, MiB(1));
// Starting hash threads // Starting hash threads
WorkerContext workers[num_hash_threads]; HasherContext workers[num_hash_threads];
Thread *hash_threads = Thread *hash_threads =
arena_push(&gp_arena, sizeof(Thread) * num_hash_threads, true); arena_push(&gp_arena, sizeof(Thread) * num_hash_threads, true);
@@ -155,8 +171,7 @@ int main(int argc, char **argv) {
for (uint8_t i = 0; i < num_scan_threads; i++) { for (uint8_t i = 0; i < num_scan_threads; i++) {
scanners[i].num_threads = num_scan_threads; scanners[i].num_threads = num_scan_threads;
scanners[i].path_arena = arena_create(&params); scanners[i].meta_arena = arena_create(&params_caligned);
scanners[i].meta_arena = arena_create(&params);
scanners[i].dir_queue = &dir_queue; scanners[i].dir_queue = &dir_queue;
scanners[i].file_queue = &file_queue; scanners[i].file_queue = &file_queue;
@@ -170,7 +185,7 @@ int main(int argc, char **argv) {
// Initial folder push // Initial folder push
for (int i = 0; i < folder_count; i++) { for (int i = 0; i < folder_count; i++) {
size_t len = strlen(folders[i]) + 1; size_t len = strlen(folders[i]) + 1;
char *path = arena_push(&scanners[0].path_arena, len, false); char *path = arena_push(&scanners[0].meta_arena, len, false);
memcpy(path, folders[i], len); memcpy(path, folders[i], len);
mpmc_push_work(&dir_queue, path); mpmc_push_work(&dir_queue, path);
} }

View File

@@ -1,4 +1,4 @@
#pragma once #pragma once
#include "base.h" #include "base.h"
@@ -214,19 +214,27 @@ static void mpmc_producers_finished(MPMCQueue *q, u8 consumer_count) {
/* Done */ /* Done */
/* ----------------------------------------------------------- */ /* ----------------------------------------------------------- */
static void mpmc_task_done(MPMCQueue *q, u8 consumer_count) { static void mpmc_task_done(MPMCQueue *q, u8 consumer_count) {
bool finished = false;
mtx_lock(&q->lock); mtx_lock(&q->lock);
if (--q->work_count == 0) { if (--q->work_count == 0) {
mpmc_producers_finished(q, consumer_count); finished = true;
} }
mtx_unlock(&q->lock); mtx_unlock(&q->lock);
if (finished) {
mpmc_producers_finished(q, consumer_count);
}
} }
/* ----------------------------------------------------------- */ /* ----------------------------------------------------------- */
/* MPMC Cleanup */ /* MPMC Cleanup */
/* ----------------------------------------------------------- */ /* ----------------------------------------------------------- */
// static void mpmc_finish(MPMCQueue *q) { // Comment to prevent warning: unused function // static void mpmc_finish(MPMCQueue *q) { // Comment to prevent warning: unused
// function
// if (!q) return; // if (!q) return;
// //
// if (q->slots) { // if (q->slots) {

File diff suppressed because it is too large Load Diff