Files
filehasher/platform.c
amir 0294498538 Add support for multiple inflight files and one shot hash small files
The IO Ring now supports bashing multiple submissions and can handle
multiple files at the same time.

Hashing small files using XXH3_128bits() instead of the streaming
pipeline(XXH3_128bits_reset(), XXH3_128bits_update(),
XXH3_128bits_digest()), this reduses the overhead of creating a state
and digest, coupled with the IO Ring it improves the hashing of small
files whose size is inferior to the size of IO Ring buffers
2026-04-02 14:31:58 +01:00

1447 lines
38 KiB
C

#pragma once // ensure that a given header file is included only once in a
// single compilation unit
#include "arena.h"
#include "base.h"
#include "lf_mpmc.h"
#include "arena.c"
// xxhash include
#define XXH_STATIC_LINKING_ONLY
#include "xxh_x86dispatch.h"
// ----------------------------- Config -------------------------------------
#define FILE_HASHES_TXT "file_hashes.txt"
#define HASH_STRLEN 33 // 128-bit hex (32 chars) + null
#define MAX_PATHLEN 4096
#define READ_BLOCK (KiB(64))
// ----------------------------- Globals ------------------------------------
static atomic_uint_fast64_t g_files_found = 0;
static atomic_uint_fast64_t g_files_hashed = 0;
static atomic_uint_fast64_t g_bytes_processed = 0;
static atomic_int g_scan_done = 0;
// ================== OS-agnostic functions abstraction =====================
// ----------------------------- Timer functions --------------
typedef struct {
u64 start;
u64 now;
} HiResTimer;
#if defined(_WIN32) || defined(_WIN64)
static LARGE_INTEGER g_freq;
static void timer_init(void) { QueryPerformanceFrequency(&g_freq); }
static void timer_start(HiResTimer *t) {
LARGE_INTEGER v;
QueryPerformanceCounter(&v);
t->start = v.QuadPart;
}
static double timer_elapsed(HiResTimer *t) {
LARGE_INTEGER v;
QueryPerformanceCounter(&v);
t->now = v.QuadPart;
return (double)(t->now - t->start) / (double)g_freq.QuadPart;
}
#elif defined(__linux__)
void timer_init(void) {}
void timer_start(HiResTimer *t) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
t->start = ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}
double timer_elapsed(HiResTimer *t) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
uint64_t now = ts.tv_sec * 1000000000ULL + ts.tv_nsec;
return (double)(now - t->start) / 1e9;
}
#endif
// ----------------------------- Get HW info --------------
#if defined(_WIN32) || defined(_WIN64)
size_t platform_physical_cores(void) {
DWORD len = 0;
GetLogicalProcessorInformation(NULL, &len);
SYSTEM_LOGICAL_PROCESSOR_INFORMATION buf[len];
GetLogicalProcessorInformation(buf, &len);
DWORD count = 0;
DWORD n = len / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
for (DWORD i = 0; i < n; i++) {
if (buf[i].Relationship == RelationProcessorCore)
count++;
}
return count ? count : 1;
}
#elif defined(__linux__)
size_t platform_physical_cores(void) {
long n = sysconf(_SC_NPROCESSORS_ONLN);
return n > 0 ? (size_t)n : 1;
}
#endif
const char *get_xxhash_instruction_set(void) {
int vecID = XXH_featureTest();
switch (vecID) {
case XXH_SCALAR:
return "Scalar (portable C)";
case XXH_SSE2:
return "SSE2";
case XXH_AVX2:
return "AVX2";
case XXH_AVX512:
return "AVX-512";
default:
return "Unknown";
}
}
// -------------------- File IO -------------------
#if defined(_WIN32) || defined(_WIN64)
typedef HANDLE FileHandle;
#define INVALID_FILE_HANDLE INVALID_HANDLE_VALUE
// File open function
static FileHandle os_file_open(const char *path) {
return CreateFileA(path, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE,
NULL, OPEN_EXISTING, FILE_FLAG_SEQUENTIAL_SCAN, NULL);
}
// File read function
static int os_file_read(FileHandle handle, void *buf, size_t count,
uint64_t *bytes_read) {
DWORD read = 0;
BOOL result = ReadFile(handle, buf, (DWORD)count, &read, NULL);
*bytes_read = read;
return (result && read > 0) ? 0 : -1;
}
// File close function
static void os_file_close(FileHandle handle) { CloseHandle(handle); }
#elif defined(__linux__)
typedef int FileHandle;
#define INVALID_FILE_HANDLE (-1)
// File open function
static FileHandle os_file_open(const char *path) {
return open(path, O_RDONLY | O_NOFOLLOW);
}
// File read function
static int os_file_read(FileHandle handle, void *buf, size_t count,
uint64_t *bytes_read) {
ssize_t result = read(handle, buf, count);
if (result >= 0) {
*bytes_read = (uint64_t)result;
return 0;
}
*bytes_read = 0;
return -1;
}
// File close function
static void os_file_close(FileHandle handle) { close(handle); }
#endif
// -------------------- Thread abstraction -------------------
// Threads context
typedef struct {
u8 num_threads;
mem_arena *path_arena;
mem_arena *meta_arena;
MPMCQueue *dir_queue;
MPMCQueue *file_queue;
} ScannerContext;
typedef struct {
mem_arena *arena;
MPMCQueue *file_queue;
} WorkerContext;
#if defined(_WIN32) || defined(_WIN64)
typedef HANDLE ThreadHandle;
typedef DWORD(WINAPI *ThreadFunc)(void *);
#define THREAD_RETURN DWORD WINAPI
#define THREAD_RETURN_VALUE 0;
typedef struct {
ThreadHandle handle;
int valid; // Track if thread was successfully created
} Thread;
// Thread function wrapper to handle different return types
#define THREAD_FUNCTION(name) DWORD WINAPI name(LPVOID arg)
// Thread creation function
static int thread_create(Thread *thread, ThreadFunc func, void *arg) {
thread->handle =
CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, arg, 0, NULL);
return (thread->handle != NULL) ? 0 : -1;
}
// Thread join function
static int thread_join(Thread *thread) {
return (WaitForSingleObject(thread->handle, INFINITE) == WAIT_OBJECT_0) ? 0
: -1;
}
// Thread close/detach function
static void thread_close(Thread *thread) { CloseHandle(thread->handle); }
// Wait for multiple threads
static int thread_wait_multiple(Thread *threads, size_t count) {
HANDLE handles[64]; // Max 64 threads for Windows
for (size_t i = 0; i < count; i++) {
handles[i] = threads[i].handle;
}
return (WaitForMultipleObjects((DWORD)count, handles, TRUE, INFINITE) ==
WAIT_OBJECT_0)
? 0
: -1;
}
#elif defined(__linux__)
typedef pthread_t ThreadHandle;
typedef void *(*ThreadFunc)(void *);
#define THREAD_RETURN void *
#define THREAD_RETURN_VALUE NULL;
typedef struct {
ThreadHandle handle;
int valid; // Track if thread was successfully created
} Thread;
// Thread function wrapper to handle different return types
typedef struct {
void *(*func)(void *);
void *arg;
} ThreadWrapper;
static void *thread_start_routine(void *arg) {
ThreadWrapper *wrapper = (ThreadWrapper *)arg;
void *result = wrapper->func(wrapper->arg);
free(wrapper);
return result;
}
// Thread creation function
static int thread_create(Thread *thread, ThreadFunc func, void *arg) {
int ret = pthread_create(&thread->handle, NULL, func, arg);
if (ret == 0) {
thread->valid = 1;
}
return ret;
}
// Thread join function
static int thread_join(Thread *thread) {
int ret = pthread_join(thread->handle, NULL);
thread->valid = 0;
return ret;
}
// Thread close/detach function
static void thread_close(Thread *thread) {
if (thread->valid) {
pthread_detach(thread->handle);
thread->valid = 0;
}
}
// Wait for multiple threads
static int thread_wait_multiple(Thread *threads, size_t count) {
for (size_t i = 0; i < count; i++) {
if (thread_join(&threads[i]) != 0) {
return -1;
}
}
return 0;
}
#endif
// ======================== Get file metadata ========================
// -------------------- Path parsing -------------------
static void normalize_path(char *p) {
char *src = p;
char *dst = p;
int prev_slash = 0;
while (*src) {
char c = *src++;
if (c == '\\' || c == '/') {
if (!prev_slash) {
*dst++ = '/';
prev_slash = 1;
}
} else {
*dst++ = c;
prev_slash = 0;
}
}
*dst = '\0';
}
static int parse_paths(char *line, char folders[][MAX_PATHLEN],
int max_folders) {
int count = 0;
char *p = line;
while (*p && count < max_folders) {
while (*p && isspace((unsigned char)*p))
p++;
if (!*p)
break;
char *start;
char quote = 0;
if (*p == '"' || *p == '\'') {
quote = *p++;
start = p;
while (*p && *p != quote)
p++;
} else {
start = p;
while (*p && !isspace((unsigned char)*p))
p++;
}
size_t len = p - start;
if (len >= MAX_PATHLEN)
len = MAX_PATHLEN - 1;
memcpy(folders[count], start, len);
folders[count][len] = 0;
normalize_path(folders[count]);
count++;
if (quote && *p == quote)
p++;
}
return count;
}
// ----------------------------- File time -------------------------
#if defined(_WIN32) || defined(_WIN64)
static void format_time(uint64_t t, char *out, size_t out_sz) {
if (t == 0) {
snprintf(out, out_sz, "N/A");
return;
}
time_t tt = (time_t)t;
struct tm tm;
localtime_s(&tm, &tt);
strftime(out, out_sz, "%Y-%m-%d %H:%M:%S", &tm);
}
// ----------------------------- Convert filetime to epoch --------------
static uint64_t filetime_to_epoch(const FILETIME *ft) {
ULARGE_INTEGER ull;
ull.LowPart = ft->dwLowDateTime;
ull.HighPart = ft->dwHighDateTime;
// Windows epoch (1601) ¬ニメ Unix epoch (1970)
return (ull.QuadPart - 116444736000000000ULL) / 10000000ULL;
}
void platform_get_file_times(const char *path, uint64_t *out_created,
uint64_t *out_modified) {
WIN32_FILE_ATTRIBUTE_DATA fad;
if (GetFileAttributesExA(path, GetFileExInfoStandard, &fad)) {
*out_created = filetime_to_epoch(&fad.ftCreationTime);
*out_modified = filetime_to_epoch(&fad.ftLastWriteTime);
} else {
*out_created = 0;
*out_modified = 0;
}
}
#elif defined(__linux__)
static void format_time(uint64_t t, char *out, size_t out_sz) {
if (t == 0) {
snprintf(out, out_sz, "N/A");
return;
}
time_t tt = (time_t)t;
struct tm tm;
localtime_r(&tt, &tm);
strftime(out, out_sz, "%Y-%m-%d %H:%M:%S", &tm);
}
void platform_get_file_times(const char *path, uint64_t *out_created,
uint64_t *out_modified) {
struct stat st;
if (stat(path, &st) == 0) {
*out_created = (uint64_t)st.st_ctime;
*out_modified = (uint64_t)st.st_mtime;
} else {
*out_created = 0;
*out_modified = 0;
}
}
#endif
// ----------------------------- File owner ---------------------
#if defined(_WIN32) || defined(_WIN64)
static void get_file_owner(const char *path, char *out, size_t out_sz) {
PSID sid = NULL;
PSECURITY_DESCRIPTOR sd = NULL;
if (GetNamedSecurityInfoA(path, SE_FILE_OBJECT, OWNER_SECURITY_INFORMATION,
&sid, NULL, NULL, NULL, &sd) == ERROR_SUCCESS) {
char name[64], domain[64];
DWORD name_len = sizeof(name);
DWORD domain_len = sizeof(domain);
SID_NAME_USE use;
if (LookupAccountSidA(NULL, sid, name, &name_len, domain, &domain_len,
&use)) {
snprintf(out, out_sz, "%s\\%s", domain, name);
} else {
snprintf(out, out_sz, "UNKNOWN");
}
} else {
snprintf(out, out_sz, "UNKNOWN");
}
if (sd)
LocalFree(sd);
}
void platform_get_file_owner(const char *path, char *out_owner,
size_t out_owner_size) {
get_file_owner(path, out_owner, out_owner_size);
}
#elif defined(__linux__)
static void get_file_owner(uid_t uid, char *out, size_t out_sz) {
struct passwd *pw = getpwuid(uid);
if (pw) {
snprintf(out, out_sz, "%s", pw->pw_name);
} else {
snprintf(out, out_sz, "UNKNOWN");
}
}
void platform_get_file_owner(const char *path, char *out_owner,
size_t out_owner_size) {
struct stat st;
if (stat(path, &st) == 0) {
get_file_owner(st.st_uid, out_owner, out_owner_size);
} else {
snprintf(out_owner, out_owner_size, "UNKNOWN");
}
}
#endif
// ----------------------------- Scan helpers -----------------------------
typedef struct FileEntry {
char *path;
uint64_t size_bytes;
uint64_t created_time; // epoch
uint64_t modified_time; // epoch seconds
char owner[128]; // resolved owner name
} FileEntry;
typedef struct {
char buffer[MAX_PATHLEN];
char *base_end; // Points to end of base path
char *filename_pos; // Points to where filename should be written
size_t base_len;
} PathBuilder;
static void path_builder_init(PathBuilder *pb, const char *base) {
pb->base_len = strlen(base);
memcpy(pb->buffer, base, pb->base_len);
pb->base_end = pb->buffer + pb->base_len;
#if defined(_WIN32) || defined(_WIN64)
*pb->base_end = '\\';
#elif defined(__linux__)
*pb->base_end = '/';
#endif
// Ensure null termination
*(pb->base_end + 1) = '\0';
pb->filename_pos = pb->base_end + 1;
}
static void path_builder_set_filename(PathBuilder *pb, const char *filename,
size_t name_len) {
memcpy(pb->filename_pos, filename, name_len);
pb->filename_pos[name_len] = '\0'; // Ensure null termination
}
static char *path_builder_dup_arena(PathBuilder *pb, mem_arena *arena,
bool zero) {
// Calculate total length including base + separator + filename + null
// terminator
size_t total_len =
(pb->filename_pos - pb->buffer) + strlen(pb->filename_pos) + 1;
char *dup = arena_push(&arena, total_len, zero);
memcpy(dup, pb->buffer, total_len);
return dup;
}
#if defined(_WIN32) || defined(_WIN64)
void scan_folder(const char *base, ScannerContext *ctx) {
PathBuilder pb;
path_builder_init(&pb, base);
char search[MAX_PATHLEN];
memcpy(search, pb.buffer, pb.base_len + 1); // Copy base + separator
memcpy(search + pb.base_len + 1, "*", 2); // Add "*" and null
WIN32_FIND_DATAA fd;
HANDLE h = FindFirstFileA(search, &fd);
if (h == INVALID_HANDLE_VALUE)
return;
do {
// Skip . and ..
if (fd.cFileName[0] == '.' &&
(fd.cFileName[1] == 0 ||
(fd.cFileName[1] == '.' && fd.cFileName[2] == 0)))
continue;
if (fd.dwFileAttributes & FILE_ATTRIBUTE_REPARSE_POINT)
continue;
size_t name_len = strlen(fd.cFileName);
path_builder_set_filename(&pb, fd.cFileName, name_len);
if (fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
char *dir = path_builder_dup_arena(&pb, ctx->path_arena, false);
mpmc_push_work(ctx->dir_queue, dir);
} else {
atomic_fetch_add(&g_files_found, 1);
FileEntry *fe = arena_push(&ctx->meta_arena, sizeof(FileEntry), true);
// Create a temporary copy for normalization to avoid corrupting pb.buffer
char temp_path[MAX_PATHLEN];
memcpy(temp_path, pb.buffer,
(pb.filename_pos - pb.buffer) + name_len + 1);
normalize_path(temp_path);
fe->path = arena_push(&ctx->path_arena, strlen(temp_path) + 1, false);
strcpy(fe->path, temp_path);
platform_get_file_times(pb.buffer, &fe->created_time, &fe->modified_time);
platform_get_file_owner(pb.buffer, fe->owner, sizeof(fe->owner));
fe->size_bytes = ((uint64_t)fd.nFileSizeHigh << 32) | fd.nFileSizeLow;
mpmc_push(ctx->file_queue, fe);
}
} while (FindNextFileA(h, &fd));
FindClose(h);
}
#elif defined(__linux__)
static int platform_get_file_times_fd(int dir_fd, const char *name,
time_t *created, time_t *modified) {
struct stat st;
if (fstatat(dir_fd, name, &st, 0) == 0) {
*created = st.st_ctime; // or st.st_birthtime on systems that support it
*modified = st.st_mtime;
return 0;
}
return -1;
}
static int platform_get_file_owner_fd(int dir_fd, const char *name, char *owner,
size_t owner_size) {
struct stat st;
if (fstatat(dir_fd, name, &st, 0) == 0) {
struct passwd pw;
struct passwd *result;
char buffer[4096]; // Sufficiently large buffer for passwd data
// Reentrant version (thread-safe)
if (getpwuid_r(st.st_uid, &pw, buffer, sizeof(buffer), &result) == 0 &&
result != NULL && result->pw_name != NULL) {
strncpy(owner, result->pw_name, owner_size - 1);
owner[owner_size - 1] = '\0';
} else {
// Fallback to uid
snprintf(owner, owner_size, "uid:%d", st.st_uid);
}
return 0;
}
return -1;
void scan_folder(const char *base, ScannerContext *ctx) {
PathBuilder pb;
path_builder_init(&pb, base);
int dir_fd = open(base, O_RDONLY | O_DIRECTORY | O_NOFOLLOW);
if (dir_fd == -1)
return;
DIR *dir = fdopendir(dir_fd);
if (!dir) {
close(dir_fd);
return;
}
struct dirent *entry;
while ((entry = readdir(dir)) != NULL) {
if (entry->d_name[0] == '.' &&
(entry->d_name[1] == 0 ||
(entry->d_name[1] == '.' && entry->d_name[2] == 0)))
continue;
size_t name_len = strlen(entry->d_name);
path_builder_set_filename(&pb, entry->d_name, name_len);
int file_type = DT_UNKNOWN;
#ifdef _DIRENT_HAVE_D_TYPE
file_type = entry->d_type;
#endif
// Fast path using d_type
if (file_type != DT_UNKNOWN) {
if (file_type == DT_LNK)
continue; // Skip symlinks
if (file_type == DT_DIR) {
char *dir_path = path_builder_dup_arena(&pb, ctx->path_arena, false);
mpmc_push_work(ctx->dir_queue, dir_path);
continue;
}
if (file_type == DT_REG) {
atomic_fetch_add(&g_files_found, 1);
FileEntry *fe = arena_push(&ctx->meta_arena, sizeof(FileEntry), true);
// Use fstatat for file info
struct stat st;
if (fstatat(dir_fd, entry->d_name, &st, 0) == 0) {
// Convert times using fd variant
platform_get_file_times_fd(dir_fd, entry->d_name, &fe->created_time,
&fe->modified_time);
platform_get_file_owner_fd(dir_fd, entry->d_name, fe->owner,
sizeof(fe->owner));
fe->size_bytes = (uint64_t)st.st_size;
// Normalize path
char temp_path[MAX_PATHLEN];
memcpy(temp_path, pb.buffer,
(pb.filename_pos - pb.buffer) + name_len + 1);
normalize_path(temp_path);
fe->path =
arena_push(&ctx->path_arena, strlen(temp_path) + 1, false);
strcpy(fe->path, temp_path);
mpmc_push(ctx->file_queue, fe);
}
continue;
}
}
// Fallback for unknown types
struct stat st;
if (fstatat(dir_fd, entry->d_name, &st, AT_SYMLINK_NOFOLLOW) == 0) {
if (S_ISLNK(st.st_mode))
continue;
if (S_ISDIR(st.st_mode)) {
char *dir_path = path_builder_dup_arena(&pb, ctx->path_arena, false);
mpmc_push_work(ctx->dir_queue, dir_path);
} else if (S_ISREG(st.st_mode)) {
atomic_fetch_add(&g_files_found, 1);
FileEntry *fe = arena_push(&ctx->meta_arena, sizeof(FileEntry), true);
platform_get_file_times(pb.buffer, &fe->created_time,
&fe->modified_time);
platform_get_file_owner(pb.buffer, fe->owner, sizeof(fe->owner));
fe->size_bytes = (uint64_t)st.st_size;
char temp_path[MAX_PATHLEN];
memcpy(temp_path, pb.buffer,
(pb.filename_pos - pb.buffer) + name_len + 1);
normalize_path(temp_path);
fe->path = arena_push(&ctx->path_arena, strlen(temp_path) + 1, false);
strcpy(fe->path, temp_path);
mpmc_push(ctx->file_queue, fe);
}
}
}
closedir(dir); // Closes dir_fd automatically
}
#endif
// ------------------------- Scan worker --------------------------------
static THREAD_RETURN scan_worker(void *arg) {
ScannerContext *ctx = (ScannerContext *)arg;
for (;;) {
char *dir = mpmc_pop(ctx->dir_queue);
if (!dir)
break;
scan_folder(dir, ctx);
mpmc_task_done(ctx->dir_queue, ctx->num_threads);
}
return THREAD_RETURN_VALUE;
}
// ----------------------------- Hashing helpers -----------------------------
static void xxh3_hash_file_stream(const char *path, char *out_hex,
unsigned char *buf) {
XXH128_hash_t h;
XXH3_state_t state;
XXH3_128bits_reset(&state);
FileHandle handle = os_file_open(path);
if (handle == INVALID_FILE_HANDLE) {
strcpy(out_hex, "ERROR");
return;
}
uint64_t bytes_read;
while (os_file_read(handle, buf, READ_BLOCK, &bytes_read) == 0 &&
bytes_read > 0) {
XXH3_128bits_update(&state, buf, (size_t)bytes_read);
atomic_fetch_add(&g_bytes_processed, bytes_read);
}
os_file_close(handle);
h = XXH3_128bits_digest(&state);
snprintf(out_hex, HASH_STRLEN, "%016llx%016llx", (unsigned long long)h.high64,
(unsigned long long)h.low64);
}
// ------------------------- Hash worker --------------------------------
static THREAD_RETURN hash_worker(void *arg) {
WorkerContext *ctx = (WorkerContext *)arg;
unsigned char *buf = (unsigned char *)malloc(READ_BLOCK);
for (;;) {
FileEntry *fe = mpmc_pop(ctx->file_queue);
if (!fe)
break;
char hash[HASH_STRLEN];
xxh3_hash_file_stream(fe->path, hash, buf);
char created[32], modified[32];
format_time(fe->created_time, created, sizeof(created));
format_time(fe->modified_time, modified, sizeof(modified));
double size_kib = (double)fe->size_bytes / 1024.0;
char stack_buf[1024];
int len =
snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\t%s\t%s\t%s\n",
hash, fe->path, size_kib, created, modified, fe->owner);
char *dst = arena_push(&ctx->arena, len, false);
memcpy(dst, stack_buf, len);
atomic_fetch_add(&g_files_hashed, 1);
}
free(buf);
return THREAD_RETURN_VALUE;
}
// ------------------------- Progress display ---------------------------
static THREAD_RETURN progress_thread(void *arg) {
(void)arg;
HiResTimer progress_timer;
timer_start(&progress_timer);
uint64_t last_bytes = 0;
double last_time = 0.0;
double displayed_speed = 0.0;
const double sample_interval = 0.5;
// Hide cursor to prevent flickering
printf("\033[?25l");
for (;;) {
uint64_t found = atomic_load(&g_files_found);
uint64_t hashed = atomic_load(&g_files_hashed);
uint64_t bytes = atomic_load(&g_bytes_processed);
int scan_done = atomic_load(&g_scan_done);
double t = timer_elapsed(&progress_timer);
double dt = t - last_time;
if (dt >= sample_interval) {
uint64_t db = (bytes > last_bytes) ? bytes - last_bytes : 0;
displayed_speed = (double)db / (1024.0 * 1024.0) / dt;
last_bytes = bytes;
last_time = t;
}
printf("\r");
if (!scan_done) {
printf("\033[1mScanning:\033[0m %llu files | Hashed: %llu | \033[32m%.2f "
"MB/s\033[0m ",
(unsigned long long)found, (unsigned long long)hashed,
displayed_speed);
} else {
double pct = found ? (double)hashed / (double)found : 0.0;
int barw = 40;
int filled = (int)(pct * barw);
printf("[");
// Print filled part in Green (\033[32m)
printf("\033[32m");
for (int i = 0; i < filled; i++)
putchar('#');
// Reset color for empty part
printf("\033[0m");
for (int i = filled; i < barw; i++)
putchar('.');
printf("] %6.2f%% (%llu/%llu) \033[32m%.2f MB/s\033[0m ", pct * 100.0,
(unsigned long long)hashed, (unsigned long long)found,
displayed_speed);
}
fflush(stdout);
if (scan_done && hashed == found)
break;
sleep_ms(100);
}
// Restore cursor (\033[?25h) and move to next line
printf("\033[?25h\n");
return THREAD_RETURN_VALUE;
}
// ======================== Hash worker IO Ring ========================
// -------------------------- Configuration ---------------------------
#define IORING_BUFFER_SIZE (KiB(512))
#define NUM_BUFFERS_PER_THREAD 16
#define MAX_ACTIVE_FILES 8
#define SUBMIT_TIMEOUT_MS 30000
#define USERDATA_REGISTER 1
// Globals
u64 g_ioring_buffer_size = 4096 * 64;
static atomic_uint_fast64_t g_io_ring_fallbacks = 0;
// -------------------------- File context ---------------------------
typedef struct IoBuffer IoBuffer;
typedef struct FileReadContext {
HANDLE hFile;
uint64_t file_size;
bool use_incremental_hash;
union {
XXH3_state_t hash_state; // For incremental hash (large files)
XXH128_hash_t single_hash; // For single-shot hash (small files)
};
FileEntry *fe;
// Completion tracking
int reads_submitted;
int reads_completed;
int active_reads;
int failed_reads;
int reads_hashed;
uint64_t bytes_hashed;
// For in-order hashing
uint64_t next_hash_offset;
uint64_t next_read_offset;
IoBuffer *head;
IoBuffer *tail;
} FileReadContext;
// -------------------------- Buffer structure ---------------------------
typedef struct IoBuffer {
void *data;
uint64_t offset;
size_t size;
size_t bytes_read;
HRESULT result;
int buffer_id;
int completed;
FileReadContext *file;
struct IoBuffer *next;
} IoBuffer;
// Thread-local I/O Ring context
typedef struct ThreadIoContext {
HIORING ring;
HANDLE completion_event;
IoBuffer buffers[NUM_BUFFERS_PER_THREAD];
int buffer_pool[NUM_BUFFERS_PER_THREAD];
int free_count;
int num_submissions;
int active_files;
int initialized;
} ThreadIoContext;
static _Thread_local ThreadIoContext *g_thread_ctx = NULL;
typedef struct FileQueue {
FileReadContext files[MAX_ACTIVE_FILES];
int head;
int tail;
int count;
} FileQueue;
// ---------------------- FIFO queue operations ---------------------------
static FileReadContext *fq_push(FileQueue *fq) {
if (fq->count == MAX_ACTIVE_FILES)
return NULL;
FileReadContext *f = &fq->files[fq->tail];
fq->tail = (fq->tail + 1) % MAX_ACTIVE_FILES;
fq->count++;
return f;
}
static FileReadContext *fq_peek(FileQueue *fq) {
if (fq->count == 0)
return NULL;
return &fq->files[fq->head];
}
static void fq_pop(FileQueue *fq) {
fq->head = (fq->head + 1) % MAX_ACTIVE_FILES;
fq->count--;
}
// ---------------------- Initialize thread context ---------------------------
static ThreadIoContext *io_ring_init_thread(void) {
if (g_thread_ctx && g_thread_ctx->initialized) {
return g_thread_ctx;
}
if (!g_thread_ctx) {
g_thread_ctx = (ThreadIoContext *)calloc(1, sizeof(ThreadIoContext));
if (!g_thread_ctx)
return NULL;
}
// Create I/O Ring
IORING_CAPABILITIES caps;
QueryIoRingCapabilities(&caps);
UINT32 queue_size = min(4096, caps.MaxSubmissionQueueSize);
IORING_CREATE_FLAGS flags = {0};
HRESULT hr = CreateIoRing(caps.MaxVersion, flags, queue_size, queue_size * 2,
&g_thread_ctx->ring);
// Create completion event
g_thread_ctx->completion_event = CreateEvent(NULL, FALSE, FALSE, NULL);
if (g_thread_ctx->completion_event) {
SetIoRingCompletionEvent(g_thread_ctx->ring,
g_thread_ctx->completion_event);
}
// Initialize buffer pool
IORING_BUFFER_INFO buf_info[NUM_BUFFERS_PER_THREAD];
u64 buf_pool_size = g_ioring_buffer_size * NUM_BUFFERS_PER_THREAD;
// Reserve and Commit the entire memory chunk
void *base_ptr = plat_mem_reserve(buf_pool_size);
if (base_ptr) {
if (!plat_mem_commit(base_ptr, buf_pool_size)) {
plat_mem_release(base_ptr, 0);
return NULL;
}
} else {
return NULL;
}
for (int i = 0; i < NUM_BUFFERS_PER_THREAD; i++) {
g_thread_ctx->buffers[i].data = (u8 *)base_ptr + (i * g_ioring_buffer_size);
g_thread_ctx->buffer_pool[i] = i;
g_thread_ctx->buffers[i].buffer_id = i;
buf_info[i].Address = g_thread_ctx->buffers[i].data;
buf_info[i].Length = (ULONG)g_ioring_buffer_size;
}
g_thread_ctx->free_count = NUM_BUFFERS_PER_THREAD;
HRESULT hb = BuildIoRingRegisterBuffers(
g_thread_ctx->ring, NUM_BUFFERS_PER_THREAD, buf_info, USERDATA_REGISTER);
if (FAILED(hb)) {
printf("Buffer registration failed: 0x%lx\n", hb);
return NULL;
}
// Submit registration
SubmitIoRing(g_thread_ctx->ring, 0, 0, NULL);
g_thread_ctx->num_submissions = 0;
g_thread_ctx->active_files = 0;
g_thread_ctx->initialized = 1;
return g_thread_ctx;
}
static void io_ring_cleanup_thread(void) {
if (!g_thread_ctx)
return;
if (g_thread_ctx->completion_event)
CloseHandle(g_thread_ctx->completion_event);
if (g_thread_ctx->ring)
CloseIoRing(g_thread_ctx->ring);
plat_mem_release(g_thread_ctx->buffers[0].data, 0);
free(g_thread_ctx);
g_thread_ctx = NULL;
}
// ---------------------- Buffer get and return ------------------------
static IoBuffer *get_free_buffer(ThreadIoContext *ctx) {
if (ctx->free_count == 0) {
return NULL;
}
int idx = ctx->buffer_pool[--ctx->free_count];
IoBuffer *buf = &ctx->buffers[idx];
buf->completed = 0;
buf->bytes_read = 0;
buf->result = E_PENDING;
return buf;
}
static void return_buffer(ThreadIoContext *ctx, IoBuffer *buf) {
if (!buf)
return;
ctx->buffer_pool[ctx->free_count++] = buf->buffer_id;
}
// -------------------------- Submit async read ---------------------------
static HRESULT submit_read(ThreadIoContext *ctx, FileReadContext *file_ctx,
IoBuffer *buf, uint64_t offset, size_t size) {
buf->offset = offset;
buf->size = size;
buf->file = file_ctx;
IORING_HANDLE_REF file_ref = IoRingHandleRefFromHandle(file_ctx->hFile);
IORING_BUFFER_REF buffer_ref =
IoRingBufferRefFromIndexAndOffset(buf->buffer_id, 0);
HRESULT hr =
BuildIoRingReadFile(ctx->ring, file_ref, buffer_ref, (UINT32)size, offset,
(UINT_PTR)buf, IOSQE_FLAGS_NONE);
if (SUCCEEDED(hr)) {
file_ctx->active_reads++;
file_ctx->reads_submitted++;
ctx->num_submissions++;
} else {
buf->completed = 1;
return_buffer(ctx, buf);
}
return hr;
}
// ------------ Link completed buffers in an ordered list -------------
static void insert_buffer_ordered(FileReadContext *file, IoBuffer *buf) {
buf->next = NULL;
// empty list
if (!file->head) {
file->head = file->tail = buf;
return;
}
// insert at head
if (buf->offset < file->head->offset) {
buf->next = file->head;
file->head = buf;
return;
}
// find position
IoBuffer *cur = file->head;
while (cur->next && cur->next->offset < buf->offset) {
cur = cur->next;
}
buf->next = cur->next;
cur->next = buf;
if (!buf->next)
file->tail = buf;
}
// -------------------------- Process completions ---------------------------
static void process_completions(ThreadIoContext *ctx, FileQueue *fq) {
IORING_CQE cqe;
while (PopIoRingCompletion(ctx->ring, &cqe) == S_OK) {
if (cqe.UserData == USERDATA_REGISTER || cqe.UserData == 0)
continue;
IoBuffer *buf = (IoBuffer *)cqe.UserData;
FileReadContext *file = buf->file;
buf->result = cqe.ResultCode;
buf->bytes_read = (DWORD)cqe.Information;
buf->completed = 1;
file->active_reads--;
file->reads_completed++;
ctx->num_submissions--;
if (SUCCEEDED(cqe.ResultCode) && cqe.Information > 0) {
buf->next = NULL;
insert_buffer_ordered(file, buf);
} else {
file->failed_reads++;
return_buffer(ctx, buf);
}
}
}
// -------------------- File operations -----------------------
static int init_file(FileReadContext *f, FileEntry *fe) {
memset(f, 0, sizeof(*f));
f->fe = fe;
f->file_size = fe->size_bytes;
f->hFile = CreateFileA(
fe->path, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
OPEN_EXISTING, FILE_FLAG_OVERLAPPED | FILE_FLAG_NO_BUFFERING, NULL);
if (f->hFile == INVALID_HANDLE_VALUE) {
return 0;
}
// Determine hash method based on file size
if (f->file_size > g_ioring_buffer_size) {
f->use_incremental_hash = true;
XXH3_128bits_reset(&f->hash_state);
} else {
f->use_incremental_hash = false;
}
return 1;
}
static void finalize_file(FileReadContext *file, WorkerContext *ctx) {
FileEntry *fe = file->fe;
char hash[HASH_STRLEN];
if (file->failed_reads == 0 && file->bytes_hashed == file->file_size) {
if (file->use_incremental_hash) {
// Large file: digest the accumulated hash state
XXH128_hash_t h = XXH3_128bits_digest(&file->hash_state);
snprintf(hash, HASH_STRLEN, "%016llx%016llx",
(unsigned long long)h.high64, (unsigned long long)h.low64);
} else {
// Small file: hash already computed, stored directly in single_hash
snprintf(hash, HASH_STRLEN, "%016llx%016llx",
(unsigned long long)file->single_hash.high64,
(unsigned long long)file->single_hash.low64);
}
} else {
atomic_fetch_add(&g_io_ring_fallbacks, 1);
xxh3_hash_file_stream(fe->path, hash, NULL);
printf("Fallback for path: %s\n", fe->path);
}
char created[32], modified[32];
format_time(fe->created_time, created, sizeof(created));
format_time(fe->modified_time, modified, sizeof(modified));
double size_kib = (double)fe->size_bytes / 1024.0;
char stack_buf[1024];
int len = snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\t%s\t%s\t%s\n",
hash, fe->path, size_kib, created, modified, fe->owner);
char *dst = arena_push(&ctx->arena, len, false);
memcpy(dst, stack_buf, len);
atomic_fetch_add(&g_files_hashed, 1);
}
// -------------------- Hash head file -----------------------
static void hash_head_file(ThreadIoContext *ctx, FileQueue *fq,
WorkerContext *wctx) {
FileReadContext *file = fq_peek(fq);
if (!file)
return;
while (file->head) {
IoBuffer *buf = file->head;
// Check ordering
if (buf->offset != file->bytes_hashed)
return;
// Consume
file->head = buf->next;
if (!file->head)
file->tail = NULL;
// Calculate actual bytes to hash (handle last partial sector)
size_t bytes_to_hash = buf->bytes_read;
// If this is the last buffer and we read beyond file size, trim it
if (buf->offset + buf->bytes_read > file->file_size) {
bytes_to_hash = file->file_size - buf->offset;
}
if (bytes_to_hash > 0) {
if (file->use_incremental_hash) {
// Large file: update incremental hash state
XXH3_128bits_update(&file->hash_state, buf->data, bytes_to_hash);
} else {
// Small file: single-shot hash
file->single_hash = XXH3_128bits(buf->data, bytes_to_hash);
}
file->bytes_hashed += bytes_to_hash;
atomic_fetch_add(&g_bytes_processed, bytes_to_hash);
}
return_buffer(ctx, buf);
}
// Finalize
if (file->active_reads == 0 && file->bytes_hashed >= file->file_size) {
finalize_file(file, wctx);
CloseHandle(file->hFile);
fq_pop(fq);
ctx->active_files--;
}
}
// ------------- Submit pending reads - fill all free buffers -----------------
static void submit_pending_reads(ThreadIoContext *ctx, FileQueue *fq,
WorkerContext *worker_ctx, int *submitting) {
MPMCQueue *file_queue = worker_ctx->file_queue;
// Try to submit reads for the current head file
FileReadContext *f = fq_peek(fq);
for (;;) {
if (f) {
while (f->next_read_offset < f->file_size) {
IoBuffer *buf = get_free_buffer(ctx);
if (!buf)
return;
size_t remaining = f->file_size - f->next_read_offset;
size_t size;
// Check if this is the last read and the file size is not
// sector-aligned
BOOL is_last_read = (remaining <= g_ioring_buffer_size);
if (remaining >= g_ioring_buffer_size) {
// Normal full read
size = g_ioring_buffer_size;
} else {
// Last read - handle partial sector
if (remaining % g_pagesize != 0) {
size = ALIGN_UP_POW2(remaining, g_pagesize);
} else {
size = remaining;
}
}
HRESULT hr = submit_read(ctx, f, buf, f->next_read_offset, size);
if (FAILED(hr)) {
return_buffer(ctx, buf);
f->failed_reads++;
f->active_reads = 0;
f->reads_submitted = 0;
f->next_read_offset = f->file_size;
break;
}
f->next_read_offset += size;
if (ctx->num_submissions >= NUM_BUFFERS_PER_THREAD)
return;
}
}
// Add new file if possible
if (!*submitting)
return;
if (ctx->active_files >= MAX_ACTIVE_FILES)
return;
FileEntry *fe = mpmc_pop(file_queue);
if (!fe) {
*submitting = 0;
return;
}
FileReadContext *newf = fq_push(fq);
if (!init_file(newf, fe)) {
// File can't be opened with NO_BUFFERING, process with fallback
char hash[HASH_STRLEN];
finalize_file(newf, worker_ctx);
fq_pop(fq);
continue;
}
f = newf;
ctx->active_files++;
}
}
// -------------------------- Wait for completions ---------------------------
static void wait_for_completions(ThreadIoContext *ctx) {
// If there are in-flight I/O requests → wait for completion
if (ctx->num_submissions > 0) {
WaitForSingleObject(ctx->completion_event, SUBMIT_TIMEOUT_MS);
return;
}
// Sleep(1);
}
// -------------------------- Hash worker I/O Ring ---------------------------
static THREAD_RETURN hash_worker_io_ring(void *arg) {
WorkerContext *ctx = (WorkerContext *)arg;
// Init IO ring
ThreadIoContext *ring_ctx = io_ring_init_thread();
if (!ring_ctx || !ring_ctx->ring) {
printf("Thread %lu: I/O Ring unavailable, using buffered I/O\n",
GetCurrentThreadId());
return hash_worker(arg);
}
// Initialize pipeline state
FileQueue fq;
memset(&fq, 0, sizeof(fq));
int submitting = 1;
// Main pipeline loop
for (;;) {
// 1. Submit new reads
submit_pending_reads(ring_ctx, &fq, ctx, &submitting);
UINT32 submitted = 0;
SubmitIoRing(ring_ctx->ring, 0, 0, &submitted);
// 5. Avoid busy witing
wait_for_completions(ring_ctx);
// 2. Process completions
process_completions(ring_ctx, &fq);
// 3. Hash files
for (int i = 0; i < fq.count; i++) {
hash_head_file(ring_ctx, &fq, ctx);
}
// debug
// printf("Free buffers: %d, Submissions: %d, Active files: %d\n",
// ring_ctx->free_count, ring_ctx->num_submissions,
// ring_ctx->active_files);
// 4. Exit condition
if (!submitting && ring_ctx->active_files == 0 &&
ring_ctx->num_submissions == 0) {
break;
}
}
io_ring_cleanup_thread();
return THREAD_RETURN_VALUE;
}