forked from amir/filehasher
Small improvements to the LF MPMC queue: making the LF MPMC queue generic and moving it into a separate header file
604 lines
16 KiB
C
604 lines
16 KiB
C
#include "platform.h"
|
|
|
|
// ----------------------------- Globals ------------------------------------
|
|
// Number of file entries discovered so far; incremented by the directory
// scanner for each non-directory entry it hands to the hashing queue.
static atomic_uint_fast64_t g_files_found = 0;
|
|
// Number of file entries the hash workers have finished processing.
static atomic_uint_fast64_t g_files_hashed = 0;
|
|
// Running total of bytes read while hashing; the progress thread derives the
// displayed MB/s figure from it.
static atomic_uint_fast64_t g_bytes_processed = 0;
|
|
// Set to 1 by main() once every scan thread has exited.
static atomic_int g_scan_done = 0;
|
|
|
|
// ============================= Utils ======================================
|
|
// ----------------------------- Normalize path --------------
|
|
// Rewrites `p` in place so that every run of '/' and/or '\\' characters
// collapses into a single '/'. All other characters are kept unchanged.
// The result is never longer than the input, so no extra storage is needed.
static void normalize_path(char *p) {
  char *out = p;
  int in_separator_run = 0;

  for (const char *in = p; *in != '\0'; ++in) {
    const int is_sep = (*in == '/' || *in == '\\');

    // Second and later separators of a run are dropped.
    if (is_sep && in_separator_run)
      continue;

    *out++ = is_sep ? '/' : *in;
    in_separator_run = is_sep;
  }

  *out = '\0';
}
|
|
|
|
// ----------------------------- Convert filetime to epoch --------------
|
|
static uint64_t filetime_to_epoch(const FILETIME *ft) {
|
|
ULARGE_INTEGER ull;
|
|
ull.LowPart = ft->dwLowDateTime;
|
|
ull.HighPart = ft->dwHighDateTime;
|
|
|
|
// Windows epoch (1601) → Unix epoch (1970)
|
|
return (ull.QuadPart - 116444736000000000ULL) / 10000000ULL;
|
|
}
|
|
// ----------------------------- Format time helper -------------------------
|
|
// Formats the Unix timestamp `t` (seconds) as local time "YYYY-MM-DD HH:MM:SS"
// into `out` (capacity `out_sz` bytes, including the terminator).
//
// Writes "N/A" when `t` is 0, when the timestamp cannot be converted to local
// time, or when the formatted text does not fit in `out`. Does nothing if
// `out` is NULL or `out_sz` is 0.
static void format_time(uint64_t t, char *out, size_t out_sz) {
  if (out == NULL || out_sz == 0)
    return;

  if (t == 0) {
    snprintf(out, out_sz, "N/A");
    return;
  }

  time_t tt = (time_t)t;
  struct tm tm;
  int converted;

#if PLATFORM_WINDOWS
  // localtime_s reports success with a zero return value.
  converted = (localtime_s(&tm, &tt) == 0);
#else
  converted = (localtime_r(&tt, &tm) != NULL);
#endif

  // strftime returns 0 when the result does not fit, leaving `out` in an
  // unspecified state, so fall back to a well-defined string in that case.
  if (!converted || strftime(out, out_sz, "%Y-%m-%d %H:%M:%S", &tm) == 0)
    snprintf(out, out_sz, "N/A");
}
|
|
|
|
// ----------------------------- Resolve file owner ---------------------
|
|
static void get_file_owner(const char *path, char *out, size_t out_sz) {
|
|
PSID sid = NULL;
|
|
PSECURITY_DESCRIPTOR sd = NULL;
|
|
|
|
if (GetNamedSecurityInfoA(path, SE_FILE_OBJECT, OWNER_SECURITY_INFORMATION,
|
|
&sid, NULL, NULL, NULL, &sd) == ERROR_SUCCESS) {
|
|
|
|
char name[64], domain[64];
|
|
DWORD name_len = sizeof(name);
|
|
DWORD domain_len = sizeof(domain);
|
|
SID_NAME_USE use;
|
|
|
|
if (LookupAccountSidA(NULL, sid, name, &name_len, domain, &domain_len,
|
|
&use)) {
|
|
snprintf(out, out_sz, "%s\\%s", domain, name);
|
|
} else {
|
|
snprintf(out, out_sz, "UNKNOWN");
|
|
}
|
|
} else {
|
|
snprintf(out, out_sz, "UNKNOWN");
|
|
}
|
|
|
|
if (sd)
|
|
LocalFree(sd);
|
|
}
|
|
|
|
// ----------------------------- Get file metadata -------------------------
|
|
void platform_get_file_times(const char *path, uint64_t *out_created,
|
|
uint64_t *out_modified) {
|
|
WIN32_FILE_ATTRIBUTE_DATA fad;
|
|
if (GetFileAttributesExA(path, GetFileExInfoStandard, &fad)) {
|
|
*out_created = filetime_to_epoch(&fad.ftCreationTime);
|
|
*out_modified = filetime_to_epoch(&fad.ftLastWriteTime);
|
|
} else {
|
|
*out_created = 0;
|
|
*out_modified = 0;
|
|
}
|
|
}
|
|
|
|
// Public entry point for owner lookup: forwards to get_file_owner(), which
// writes the owner of `path` into `out_owner` (capacity `out_owner_size`) or
// "UNKNOWN" when it cannot be resolved.
void platform_get_file_owner(const char *path, char *out_owner,
|
|
size_t out_owner_size) {
|
|
get_file_owner(path, out_owner, out_owner_size);
|
|
}
|
|
|
|
// --------------- parallel directory scanning ----------------
|
|
// Add queue helper functions
|
|
// Appends a heap-allocated copy of `path` to the directory queue and wakes one
// waiting scan worker. The copy is owned by the queue until a worker pops it.
//
// If memory cannot be allocated (for the copy or for growing the item array)
// the directory is reported on stderr and skipped; the queue itself is left
// untouched and remains valid.
static void dirqueue_push(DirQueue *q, const char *path) {
  // Duplicate outside the lock: it needs no shared state.
  char *copy = _strdup(path);
  if (copy == NULL) {
    fprintf(stderr, "dirqueue_push: out of memory, skipping %s\n", path);
    return;
  }

  EnterCriticalSection(&q->cs);

  if (q->count + 1 > q->cap) {
    size_t new_cap = q->cap ? (size_t)q->cap * 2 : 1024;

    // Keep the old array until the new one is known to be valid.
    void *grown = realloc(q->items, new_cap * sizeof(char *));
    if (grown == NULL) {
      LeaveCriticalSection(&q->cs);
      free(copy);
      fprintf(stderr, "dirqueue_push: out of memory, skipping %s\n", path);
      return;
    }

    q->items = grown;
    q->cap = new_cap;
  }

  q->items[q->count++] = copy;

  WakeConditionVariable(&q->cv);
  LeaveCriticalSection(&q->cs);
}
|
|
|
|
// Takes the most recently pushed directory off the queue and marks the caller
// as active. Blocks while the queue is empty but some worker is still active
// (and may therefore push more work). Returns NULL once the queue is empty and
// no worker is active. The returned string is owned by the caller.
static char *dirqueue_pop(DirQueue *q) {
  char *next = NULL;

  EnterCriticalSection(&q->cs);

  while (q->active > 0 && q->count == 0)
    SleepConditionVariableCS(&q->cv, &q->cs, INFINITE);

  // Anything other than "empty and idle" means there is an item to hand out.
  if (q->count != 0 || q->active != 0) {
    q->count--;
    next = q->items[q->count];
    q->active++;
  }

  LeaveCriticalSection(&q->cs);
  return next;
}
|
|
|
|
// Marks one worker as no longer active and wakes every thread waiting on the
// queue's condition variable, so waiters in dirqueue_pop() can re-check
// whether work is available or whether scanning is finished.
static void dirqueue_done(DirQueue *q) {
|
|
EnterCriticalSection(&q->cs);
|
|
q->active--;
|
|
WakeAllConditionVariable(&q->cv);
|
|
LeaveCriticalSection(&q->cs);
|
|
}
|
|
// Thread entry point for directory scanning. `arg` is the shared DirQueue.
// Repeatedly pops a directory, scans it (which may push sub-directories),
// frees the popped path and reports completion, until dirqueue_pop() returns
// NULL. Always returns 0.
static DWORD WINAPI scan_worker(LPVOID arg) {
  DirQueue *queue = arg;
  char *current;

  while ((current = dirqueue_pop(queue)) != NULL) {
    scan_folder_windows_parallel(current, queue);

    free(current);
    dirqueue_done(queue);
  }

  return 0;
}
|
|
|
|
// Scanning directory function
|
|
// Enumerates the direct children of directory `base`.
//
//  - "." and "..", and anything flagged as a reparse point, are skipped.
//  - Sub-directories are pushed onto `q` for another worker to scan.
//  - For every other entry a FileEntry (normalised path, timestamps, owner,
//    size) is allocated and pushed onto the global file queue; ownership of
//    the entry passes to the consumer of that queue.
//
// Entries whose full path does not fit in MAX_PATHLEN, or for which memory
// cannot be allocated, are skipped rather than processed with a truncated
// path or a NULL pointer. g_files_found is only incremented for entries that
// are actually queued, so the found/hashed counters can still converge.
void scan_folder_windows_parallel(const char *base, DirQueue *q) {
  char search[MAX_PATHLEN];
  int n = snprintf(search, sizeof(search), "%s\\*", base);
  if (n < 0 || (size_t)n >= sizeof(search))
    return; // search pattern truncated: it would not describe `base`

  WIN32_FIND_DATAA fd;
  HANDLE h = FindFirstFileA(search, &fd);
  if (h == INVALID_HANDLE_VALUE)
    return;

  do {
    if (!strcmp(fd.cFileName, ".") || !strcmp(fd.cFileName, ".."))
      continue;

    if (fd.dwFileAttributes & FILE_ATTRIBUTE_REPARSE_POINT)
      continue;

    char full[MAX_PATHLEN];
    n = snprintf(full, sizeof(full), "%s\\%s", base, fd.cFileName);
    if (n < 0 || (size_t)n >= sizeof(full))
      continue; // a truncated path would name a different or missing entry

    if (fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
      dirqueue_push(q, full);
      continue;
    }

    FileEntry *fe = calloc(1, sizeof(*fe));
    if (fe == NULL)
      continue;

    // The stored path uses forward slashes; the native path in `full` is
    // still used for the Win32 calls below.
    char norm[MAX_PATHLEN];
    snprintf(norm, sizeof(norm), "%s", full);
    normalize_path(norm);

    fe->path = _strdup(norm);
    if (fe->path == NULL) {
      free(fe);
      continue;
    }

    platform_get_file_times(full, &fe->created_time, &fe->modified_time);
    platform_get_file_owner(full, fe->owner, sizeof(fe->owner));

    // size_bytes stays 0 (from calloc) when the file cannot be opened.
    HANDLE hf =
        CreateFileA(full, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE,
                    NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (hf != INVALID_HANDLE_VALUE) {
      LARGE_INTEGER size;
      if (GetFileSizeEx(hf, &size))
        fe->size_bytes = (uint64_t)size.QuadPart;
      CloseHandle(hf);
    }

    // Count before publishing so "hashed" can never run ahead of "found".
    atomic_fetch_add(&g_files_found, 1);
    mpmc_push(&g_file_queue, fe);

  } while (FindNextFileA(h, &fd));

  FindClose(h);
}
|
|
|
|
// ----------------------------- Hashing helpers -----------------------------
|
|
// Computes the XXH3 128-bit hash of the file at `path` using plain
// synchronous ReadFile calls of READ_BLOCK bytes into the caller-supplied
// scratch buffer `buf`.
//
// On success `out_hex` receives the hash as 32 lowercase hex digits (high 64
// bits first), formatted with a limit of HASH_STRLEN bytes. If the file cannot
// be opened, the hash state cannot be reset, or a read fails part-way through,
// `out_hex` receives "ERROR" instead — a hash of a partially read file is
// never reported. Every byte successfully read is added to g_bytes_processed.
static void xxh3_hash_file_stream(const char *path, char *out_hex, BYTE *buf) {
  HANDLE hFile =
      CreateFileA(path, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
                  OPEN_EXISTING, FILE_FLAG_SEQUENTIAL_SCAN, NULL);
  if (hFile == INVALID_HANDLE_VALUE) {
    strcpy(out_hex, "ERROR");
    return;
  }

  XXH3_state_t state;
  if (XXH3_128bits_reset(&state) != XXH_OK) {
    CloseHandle(hFile);
    strcpy(out_hex, "ERROR");
    return;
  }

  DWORD read = 0;
  BOOL ok;

  // A synchronous ReadFile signals end-of-file by succeeding with 0 bytes;
  // a FALSE return is a genuine I/O error.
  while ((ok = ReadFile(hFile, buf, READ_BLOCK, &read, NULL)) && read > 0) {
    XXH3_128bits_update(&state, buf, (size_t)read);
    atomic_fetch_add(&g_bytes_processed, (uint64_t)read);
  }

  CloseHandle(hFile);

  if (!ok) {
    strcpy(out_hex, "ERROR");
    return;
  }

  XXH128_hash_t h = XXH3_128bits_digest(&state);
  snprintf(out_hex, HASH_STRLEN, "%016llx%016llx", (unsigned long long)h.high64,
           (unsigned long long)h.low64);
}
|
|
|
|
// ------------------------- Hash worker --------------------------------
|
|
static DWORD WINAPI hash_worker(LPVOID arg) {
|
|
|
|
WorkerContext *ctx = (WorkerContext *)arg;
|
|
MPMCQueue *q = ctx->queue;
|
|
mem_arena *local_arena = ctx->arena;
|
|
BYTE *buf = (BYTE *)malloc(READ_BLOCK);
|
|
|
|
for (;;) {
|
|
FileEntry *fe = mpmc_pop(q);
|
|
if (!fe)
|
|
break;
|
|
|
|
char hash[HASH_STRLEN];
|
|
xxh3_hash_file_stream(fe->path, hash, buf);
|
|
|
|
char created[32], modified[32];
|
|
format_time(fe->created_time, created, sizeof(created));
|
|
format_time(fe->modified_time, modified, sizeof(modified));
|
|
|
|
double size_kib = (double)fe->size_bytes / 1024.0;
|
|
|
|
char stack_buf[1024];
|
|
|
|
int len =
|
|
snprintf(stack_buf, sizeof(stack_buf), "%s\t%s\t%.2f\t%s\t%s\t%s\n",
|
|
hash, fe->path, size_kib, created, modified, fe->owner);
|
|
|
|
char *dst = arena_push(&local_arena, len, false);
|
|
memcpy(dst, stack_buf, len);
|
|
|
|
atomic_fetch_add(&g_files_hashed, 1);
|
|
|
|
free(fe->path);
|
|
free(fe);
|
|
}
|
|
free(buf);
|
|
|
|
return 0;
|
|
}
|
|
|
|
// ----------------------------- Progress display ---------------------------
|
|
// Thread entry point that redraws a single-line progress display on stdout
// roughly every 100 ms. `arg` is unused.
//
// While scanning is in progress it shows the found/hashed counters; once
// g_scan_done is set it shows a 40-character progress bar with a percentage.
// Throughput (MB/s) is re-sampled at most every 0.5 s from g_bytes_processed.
// The thread exits, after printing a newline, once scanning is done and the
// hashed count equals the found count. Always returns 0.
DWORD WINAPI progress_thread(void *arg) {
  (void)arg;

  enum { BAR_WIDTH = 40 };

  LARGE_INTEGER freq, start;
  QueryPerformanceFrequency(&freq);
  QueryPerformanceCounter(&start);

  uint64_t last_bytes = atomic_load(&g_bytes_processed);
  double last_time = 0.0;

  double displayed_speed = 0.0;
  const double sample_interval = 0.5;

  for (;;) {
    // Read the flag BEFORE the counters: if scanning is seen as finished, the
    // counters read afterwards are at least as recent as that, so `found` is
    // not a stale value paired with a newer `hashed`.
    int scan_done = atomic_load(&g_scan_done);
    uint64_t found = atomic_load(&g_files_found);
    uint64_t hashed = atomic_load(&g_files_hashed);
    uint64_t bytes = atomic_load(&g_bytes_processed);

    LARGE_INTEGER now;
    QueryPerformanceCounter(&now);

    double t = (double)(now.QuadPart - start.QuadPart) / (double)freq.QuadPart;

    // First sample: establish the baseline for the speed measurement.
    if (last_time == 0.0) {
      last_time = t;
      last_bytes = bytes;
    }

    double dt = t - last_time;

    if (dt >= sample_interval) {
      uint64_t db = bytes - last_bytes;

      // Keep showing the previous speed when nothing was read in this window.
      if (db > 0 && dt > 0.0001) {
        displayed_speed = (double)db / (1024.0 * 1024.0) / dt;
      }

      last_bytes = bytes;
      last_time = t;
    }

    if (!scan_done) {
      printf("\rScanning: %llu files | Hashed: %llu | %.2f MB/s ",
             (unsigned long long)found, (unsigned long long)hashed,
             displayed_speed);
    } else {
      double pct = found ? (double)hashed / (double)found : 0.0;

      // Clamp so the bar can never be drawn past its buffer.
      int filled = (int)(pct * BAR_WIDTH);
      if (filled < 0)
        filled = 0;
      if (filled > BAR_WIDTH)
        filled = BAR_WIDTH;

      char bar[BAR_WIDTH + 3]; // '[' + BAR_WIDTH cells + ']' + NUL
      bar[0] = '[';
      memset(bar + 1, '#', (size_t)filled);
      memset(bar + 1 + filled, '.', (size_t)(BAR_WIDTH - filled));
      bar[BAR_WIDTH + 1] = ']';
      bar[BAR_WIDTH + 2] = '\0';

      printf("\r%s %6.2f%% (%llu / %llu) %.2f MB/s ", bar, pct * 100.0,
             (unsigned long long)hashed, (unsigned long long)found,
             displayed_speed);
    }

    fflush(stdout);

    if (scan_done && hashed == found)
      break;

    Sleep(100);
  }

  printf("\n");

  return 0;
}
|
|
|
|
// ----------------------------- Main ---------------------------------------
|
|
int main(int argc, char **argv) {
|
|
char folders[64][MAX_PATHLEN]; // up to 64 input folders
|
|
int folder_count = 0;
|
|
|
|
// -------------------------------
|
|
// Scanning and total timer init
|
|
// -------------------------------
|
|
timer_init();
|
|
|
|
HiResTimer total_timer;
|
|
HiResTimer scan_timer;
|
|
|
|
timer_start(&total_timer);
|
|
timer_start(&scan_timer);
|
|
|
|
// -------------------------------
|
|
// Parse arguments
|
|
// -------------------------------
|
|
for (int i = 1; i < argc; ++i) {
|
|
if (folder_count < 64) {
|
|
strncpy(folders[folder_count], argv[i], MAX_PATHLEN - 1);
|
|
folders[folder_count][MAX_PATHLEN - 1] = 0;
|
|
folder_count++;
|
|
}
|
|
}
|
|
|
|
// -------------------------------
|
|
// Ask user if no folders provided
|
|
// -------------------------------
|
|
if (folder_count == 0) {
|
|
printf("Enter folder to process (Enter = current folder): ");
|
|
fflush(stdout);
|
|
|
|
char buf[MAX_PATHLEN];
|
|
if (!fgets(buf, sizeof(buf), stdin))
|
|
return 1;
|
|
buf[strcspn(buf, "\r\n")] = 0;
|
|
|
|
if (buf[0] == 0)
|
|
strcpy(folders[0], ".");
|
|
else
|
|
strncpy(folders[0], buf, MAX_PATHLEN - 1);
|
|
|
|
folder_count = 1;
|
|
}
|
|
|
|
// -------------------------------
|
|
// Display selected folders
|
|
// -------------------------------
|
|
printf("Processing %d folder(s):\n", folder_count);
|
|
for (int i = 0; i < folder_count; ++i) {
|
|
printf(" - %s\n", folders[i]);
|
|
}
|
|
|
|
// -------------------------------
|
|
// Creating a general purpose arena
|
|
// -------------------------------
|
|
arena_params params = {
|
|
.reserve_size = GiB(1),
|
|
.commit_size = MiB(16),
|
|
.align = 0,
|
|
.push_size = 0,
|
|
.allow_free_list = true,
|
|
.allow_swapback = false,
|
|
.growth_policy = ARENA_GROWTH_NORMAL,
|
|
.commit_policy = ARENA_COMMIT_LAZY,
|
|
.max_nbre_blocks = 1,
|
|
};
|
|
|
|
mem_arena *gp_arena = arena_create(¶ms);
|
|
|
|
// -------------------------------
|
|
// Detect hardware threads (CPU cores)
|
|
// -------------------------------
|
|
size_t hw_threads = 1;
|
|
// --- Windows: detect PHYSICAL cores (not logical threads) ---
|
|
DWORD len = 0;
|
|
GetLogicalProcessorInformation(NULL, &len);
|
|
|
|
SYSTEM_LOGICAL_PROCESSOR_INFORMATION *buf =
|
|
(SYSTEM_LOGICAL_PROCESSOR_INFORMATION *)arena_push(&gp_arena, len, true);
|
|
|
|
if (GetLogicalProcessorInformation(buf, &len)) {
|
|
DWORD count = 0;
|
|
DWORD n = len / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
|
|
for (DWORD i = 0; i < n; i++) {
|
|
if (buf[i].Relationship == RelationProcessorCore)
|
|
count++;
|
|
}
|
|
if (count > 0)
|
|
hw_threads = count;
|
|
}
|
|
arena_free(&gp_arena, (u8 **)&buf, len);
|
|
|
|
// Add some extra threads to overlap I/O more aggressively
|
|
u8 num_threads = hw_threads * 2;
|
|
if (num_threads < 2)
|
|
num_threads = 2;
|
|
|
|
// -------------------------------
|
|
// Step 1: Scan all folders
|
|
// -------------------------------
|
|
|
|
mpmc_init(&g_file_queue, GiB(1));
|
|
|
|
DirQueue q;
|
|
memset(&q, 0, sizeof(q));
|
|
InitializeCriticalSection(&q.cs);
|
|
InitializeConditionVariable(&q.cv);
|
|
q.active = 0;
|
|
|
|
// starting hash threads
|
|
WorkerContext workers[num_threads];
|
|
|
|
for (int i = 0; i < num_threads; i++) {
|
|
workers[i].queue = &g_file_queue;
|
|
workers[i].arena = arena_create(¶ms);
|
|
}
|
|
|
|
HANDLE *hash_threads =
|
|
arena_push(&gp_arena, sizeof(HANDLE) * num_threads, true);
|
|
|
|
for (size_t i = 0; i < num_threads; ++i) {
|
|
hash_threads[i] = CreateThread(NULL, 0, hash_worker, &workers[i], 0, NULL);
|
|
}
|
|
|
|
// starting scan threads
|
|
HANDLE progress = CreateThread(NULL, 0, progress_thread, NULL, 0, NULL);
|
|
|
|
for (int i = 0; i < folder_count; ++i) {
|
|
dirqueue_push(&q, folders[i]);
|
|
}
|
|
|
|
size_t scan_threads = hw_threads;
|
|
if (scan_threads < 2)
|
|
scan_threads = 2;
|
|
|
|
HANDLE *scan_tids =
|
|
arena_push(&gp_arena, sizeof(HANDLE) * scan_threads, true);
|
|
|
|
for (size_t i = 0; i < scan_threads; ++i) {
|
|
scan_tids[i] =
|
|
CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)scan_worker, &q, 0, NULL);
|
|
}
|
|
|
|
WaitForMultipleObjects((DWORD)scan_threads, scan_tids, TRUE, INFINITE);
|
|
|
|
mpmc_producers_finished(&g_file_queue, num_threads);
|
|
|
|
atomic_store(&g_scan_done, 1);
|
|
|
|
for (size_t i = 0; i < scan_threads; ++i)
|
|
CloseHandle(scan_tids[i]);
|
|
|
|
arena_free(&gp_arena, (u8 **)&scan_tids, sizeof(HANDLE) * scan_threads);
|
|
|
|
double scan_seconds = timer_stop(&scan_timer);
|
|
size_t total_found = atomic_load(&g_files_found);
|
|
|
|
printf("\r%*s\r", 120, ""); // clear_console_line
|
|
printf("Completed scanning in %.2f seconds, found %zu files\n\n",
|
|
scan_seconds, total_found);
|
|
|
|
// if no files found
|
|
if (total_found == 0) {
|
|
printf("No files found.\n");
|
|
return 0;
|
|
}
|
|
|
|
// stop hashing threads
|
|
WaitForMultipleObjects((DWORD)num_threads, hash_threads, TRUE, INFINITE);
|
|
|
|
for (size_t i = 0; i < num_threads; ++i)
|
|
CloseHandle(hash_threads[i]);
|
|
|
|
arena_free(&gp_arena, (u8 **)&hash_threads, sizeof(HANDLE) * num_threads);
|
|
|
|
WaitForSingleObject(progress, INFINITE);
|
|
CloseHandle(progress);
|
|
|
|
// write file_hashes.txt
|
|
|
|
// FILE *f = fopen(FILE_HASHES_TXT, "wb");
|
|
//
|
|
// for (int i = 0; i < num_threads; i++) {
|
|
// mem_arena *arena = workers[i].arena;
|
|
//
|
|
// u8 *arena_base =
|
|
// (u8 *)arena + ALIGN_UP_POW2(sizeof(mem_arena), arena->align);
|
|
// fwrite(arena_base, 1, arena->pos, f);
|
|
// }
|
|
//
|
|
// fclose(f);
|
|
|
|
HANDLE h = CreateFileA(FILE_HASHES_TXT, GENERIC_WRITE, 0, NULL, CREATE_ALWAYS,
|
|
FILE_ATTRIBUTE_NORMAL, NULL);
|
|
|
|
for (int i = 0; i < num_threads; i++) {
|
|
|
|
mem_arena *local_hash_arena = workers[i].arena;
|
|
|
|
DWORD written;
|
|
|
|
u8 *arena_base = (u8 *)local_hash_arena +
|
|
ALIGN_UP_POW2(sizeof(mem_arena), local_hash_arena->align);
|
|
|
|
WriteFile(h, arena_base, (DWORD)local_hash_arena->pos, &written, NULL);
|
|
}
|
|
|
|
// done time
|
|
double total_seconds = timer_stop(&total_timer);
|
|
|
|
printf("Completed hashing %zu files\n", total_found);
|
|
|
|
uint64_t total_bytes = (uint64_t)atomic_load(&g_bytes_processed);
|
|
double total_mb = (double)total_bytes / (1024.0 * 1024.0);
|
|
double avg_mbps = total_mb / total_seconds;
|
|
printf("Total: %.2f MB, Average: %.2f MB/s\n", total_mb, avg_mbps);
|
|
printf(" Total time : %.2f seconds\n", total_seconds);
|
|
|
|
return 0;
|
|
}
|