Files
filehasher/file_hasher.c
amir 0294498538 Add support for multiple inflight files and one shot hash small files
The IO Ring now supports batching multiple submissions and can handle
multiple files at the same time.

Hash small files with XXH3_128bits() instead of the streaming
pipeline (XXH3_128bits_reset(), XXH3_128bits_update(),
XXH3_128bits_digest()). This reduces the overhead of creating a state
and digest; coupled with the IO Ring, it improves the hashing of small
files whose size is smaller than the size of the IO Ring buffers.
2026-04-02 14:31:58 +01:00

288 lines
8.3 KiB
C

#include "platform.c"
// ----------------------------- Main ---------------------------------------
/*
 * Program entry point.
 *
 * Pipeline, in order:
 *   1. Collect input folders from argv[1..] (at most 64 are used); when none
 *      are given, prompt on stdin (an empty answer selects ".").
 *   2. Start the hash workers, the progress thread and the scan workers.
 *   3. Seed the directory queue with the input folders and wait for the scan
 *      workers, then signal the hash workers that no more files will arrive.
 *   4. Wait for the hash workers and the progress thread.
 *   5. Dump every hash worker's arena to FILE_HASHES_TXT and print a summary.
 *
 * Returns 0 on success (including the "no files found" case) and 1 when
 * stdin cannot be read, the general purpose arena cannot be created, or the
 * export file cannot be opened/written. Failures that happen after worker
 * threads have been started terminate the process with exit(1).
 */
int main(int argc, char **argv) {
  char folders[64][MAX_PATHLEN]; // up to 64 input folders
  const int folder_cap = (int)(sizeof(folders) / sizeof(folders[0]));
  int folder_count = 0;
  int rc = 0;

  // -------------------------------
  // Parse arguments
  // -------------------------------
  for (int i = 1; i < argc; ++i) {
    if (folder_count >= folder_cap) {
      // Previously the extra folders were dropped without any notice.
      fprintf(stderr, "Warning: only the first %d folders are processed\n",
              folder_cap);
      break;
    }
    normalize_path(argv[i]);
    strncpy(folders[folder_count], argv[i], MAX_PATHLEN - 1);
    folders[folder_count][MAX_PATHLEN - 1] = 0; // strncpy may not terminate
    folder_count++;
  }

  // -------------------------------
  // Ask user if no folders provided
  // -------------------------------
  if (folder_count == 0) {
    printf("Enter folders to process (Enter = current folder): ");
    fflush(stdout);

    char buf[KiB(32)];
    if (!fgets(buf, sizeof(buf), stdin))
      return 1;

    buf[strcspn(buf, "\r\n")] = 0; // strip the trailing line break

    if (buf[0] == 0) {
      strcpy(folders[0], ".");
      folder_count = 1;
    } else {
      folder_count = parse_paths(buf, folders, folder_cap);
    }
  }

  // Display selected folders
  printf("Processing %d folder(s):\n", folder_count);
  for (int i = 0; i < folder_count; ++i) {
    printf(" - %s\n", folders[i]);
  }

  // -------------------------------
  // Scanning and total timer init
  // -------------------------------
  timer_init();
  HiResTimer total_timer;
  HiResTimer scan_timer;
  timer_start(&total_timer);
  timer_start(&scan_timer);

  // -------------------------------
  // Creating a general purpose arena
  // -------------------------------
  // The same parameters are reused for every per-thread arena below.
  arena_params params = {
      .reserve_size = GiB(1),
      .commit_size = MiB(16),
      .align = 0,
      .push_size = 0,
      .allow_free_list = true,
      .allow_swapback = false,
      .growth_policy = ARENA_GROWTH_NORMAL,
      .commit_policy = ARENA_COMMIT_LAZY,
      .max_nbre_blocks = 1,
  };
  mem_arena *gp_arena = arena_create(&params);
  if (!gp_arena) {
    fprintf(stderr, "Failed to create general purpose arena\n");
    return 1;
  }

  // -------------------------------
  // Detect hardware
  // -------------------------------
  // Physical cores, not logical threads.
  size_t hw_threads = platform_physical_cores();
  if (hw_threads == 0) {
    // A zero count would make the VLAs below zero-length (undefined
    // behaviour) and start no workers at all; fall back to a single core.
    hw_threads = 1;
  }
  // Logical threads = CPU cores * 2
  size_t num_threads = hw_threads * 2;
  printf("Starting thread pool: %zu threads (CPU cores: %zu)\n", num_threads,
         hw_threads);
  printf(" Selected instruction set: %s\n", get_xxhash_instruction_set());

  // Align IO Ring block size to the system page size
  g_ioring_buffer_size = ALIGN_UP_POW2(IORING_BUFFER_SIZE, g_pagesize);

  // -------------------------------
  // Scanning and hashing
  // -------------------------------
  MPMCQueue dir_queue;
  mpmc_init(&dir_queue, MiB(1));
  MPMCQueue file_queue;
  mpmc_init(&file_queue, MiB(1));

  // Starting hash threads.
  // All workers run hash_worker_io_ring. Earlier revisions of this code kept
  // (commented out) a variant running hash_worker and a variant that probed
  // io_ring_init() at startup to choose between the two.
  size_t num_hash_threads = num_threads;
  WorkerContext workers[num_hash_threads];
  Thread *hash_threads =
      arena_push(&gp_arena, sizeof(Thread) * num_hash_threads, true);
  if (!hash_threads) {
    fprintf(stderr, "Failed to allocate hash thread handles\n");
    return 1;
  }

  for (size_t i = 0; i < num_hash_threads; ++i) {
    workers[i].arena = arena_create(&params);
    if (!workers[i].arena) {
      fprintf(stderr, "Failed to create arena for hash thread %zu\n", i);
      exit(1);
    }
    workers[i].file_queue = &file_queue;

    if (thread_create(&hash_threads[i], (ThreadFunc)hash_worker_io_ring,
                      &workers[i]) != 0) {
      fprintf(stderr, "Failed to create hash thread %zu\n", i);
      exit(1);
    }
  }

  // Starting progress printing thread
  Thread progress_thread_handle;
  if (thread_create(&progress_thread_handle, (ThreadFunc)progress_thread,
                    NULL) != 0) {
    fprintf(stderr, "Failed to create progress thread\n");
    exit(1);
  }

  // Starting scan threads
  size_t num_scan_threads = num_threads;
  ScannerContext scanners[num_scan_threads];
  Thread *scan_threads =
      arena_push(&gp_arena, sizeof(Thread) * num_scan_threads, true);
  if (!scan_threads) {
    fprintf(stderr, "Failed to allocate scan thread handles\n");
    exit(1);
  }

  for (size_t i = 0; i < num_scan_threads; i++) {
    scanners[i].num_threads = num_scan_threads;
    scanners[i].path_arena = arena_create(&params);
    scanners[i].meta_arena = arena_create(&params);
    if (!scanners[i].path_arena || !scanners[i].meta_arena) {
      fprintf(stderr, "Failed to create arenas for scan thread %zu\n", i);
      exit(1);
    }
    scanners[i].dir_queue = &dir_queue;
    scanners[i].file_queue = &file_queue;

    if (thread_create(&scan_threads[i], (ThreadFunc)scan_worker,
                      &scanners[i]) != 0) {
      fprintf(stderr, "Failed to create scan thread %zu\n", i);
      exit(1);
    }
  }

  // Initial folder push
  // NOTE(review): these copies are allocated from scanners[0].path_arena
  // after scanner 0 has already been started. If scan_worker also pushes to
  // its path_arena and the arena is not thread-safe, this is a data race —
  // confirm against scan_worker / arena_push.
  for (int i = 0; i < folder_count; i++) {
    size_t len = strlen(folders[i]) + 1; // include the terminating NUL
    char *path = arena_push(&scanners[0].path_arena, len, false);
    if (!path) {
      fprintf(stderr, "Failed to allocate path for folder %s\n", folders[i]);
      exit(1);
    }
    memcpy(path, folders[i], len);
    mpmc_push_work(&dir_queue, path);
  }

  // Stop scan threads
  thread_wait_multiple(scan_threads, num_scan_threads);
  for (size_t i = 0; i < num_scan_threads; ++i) {
    thread_close(&scan_threads[i]);
  }

  // Scanning is over: tell the hash workers no more files will be queued.
  mpmc_producers_finished(&file_queue, num_hash_threads);
  atomic_store(&g_scan_done, 1);

  arena_free(&gp_arena, (u8 **)&scan_threads,
             sizeof(Thread) * num_scan_threads);

  double scan_seconds = timer_elapsed(&scan_timer);
  size_t total_found = atomic_load(&g_files_found);

  printf("\r%*s\r", 120, ""); // clear_console_line
  printf("Completed scanning in %.2f seconds, found %zu files\n\n",
         scan_seconds, total_found);

  // If no files found
  if (total_found == 0) {
    printf("No files found.\n");
    return 0;
  }

  // Stop hashing threads
  thread_wait_multiple(hash_threads, num_hash_threads);
  for (size_t i = 0; i < num_hash_threads; ++i) {
    thread_close(&hash_threads[i]);
  }
  arena_free(&gp_arena, (u8 **)&hash_threads,
             sizeof(Thread) * num_hash_threads);

  // Stop progress printing thread
  thread_join(&progress_thread_handle);
  thread_close(&progress_thread_handle);

  // -------------------------------
  // Export file_hashes.txt
  // -------------------------------
  // Each hash worker's arena is written out verbatim, skipping the
  // mem_arena header at the start of the arena.
  FILE *f = fopen(FILE_HASHES_TXT, "wb");
  if (!f) {
    fprintf(stderr, "Failed to open %s for writing\n", FILE_HASHES_TXT);
    rc = 1;
  } else {
    int export_ok = 1;

    for (size_t i = 0; i < num_hash_threads; i++) {
      mem_arena *arena = workers[i].arena;
      u8 *arena_base =
          (u8 *)arena + ALIGN_UP_POW2(sizeof(mem_arena), arena->align);
      size_t bytes = arena->pos;

      if (fwrite(arena_base, 1, bytes, f) != bytes) {
        export_ok = 0;
        break;
      }
    }

    // fclose flushes buffered data, so its result matters for a write stream.
    if (fclose(f) != 0) {
      export_ok = 0;
    }

    if (!export_ok) {
      fprintf(stderr, "Failed to write %s\n", FILE_HASHES_TXT);
      rc = 1;
    }
  }

  // -------------------------------
  // Print summary
  // -------------------------------
  // DEBUG
  uint64_t incomplete = atomic_load(&g_io_ring_fallbacks);
  if (incomplete > 0) {
    printf(
        "\nI/O Ring incomplete files: %llu (fallback to buffered I/O used)\n",
        (unsigned long long)incomplete);
  }

  double total_seconds = timer_elapsed(&total_timer);
  printf("Completed hashing %zu files\n", total_found);

  uint64_t total_bytes = (uint64_t)atomic_load(&g_bytes_processed);
  double total_mb = (double)total_bytes / (1024.0 * 1024.0);
  double avg_mbps = total_mb / total_seconds;
  printf("Total: %.2f MB, Average: %.2f MB/s\n", total_mb, avg_mbps);
  printf(" Total time : %.2f seconds\n\n", total_seconds);

  return rc;
}