// File: filehasher/platform_posix.c
// POSIX platform implementation of the file hasher (parallel directory
// scan + multithreaded XXH3-128 hashing).
#include "platform.h"
// ----------------------------- Globals ------------------------------------
static atomic_uint_fast64_t g_bytes_processed = 0;
FileEntry *g_entries = NULL;
size_t g_entry_count = 0;
size_t g_entry_capacity = 0;
// ----------------------------- Utils --------------------------------------
// Report the last failing library/system call via perror and terminate.
static void perror_exit(const char *msg) {
  perror(msg);
  exit(1);
}
// malloc wrapper that aborts the process on allocation failure, so
// callers never have to handle a NULL return.
static void *xmalloc(size_t n) {
  void *p = malloc(n);
  if (p == NULL)
    perror_exit("malloc");
  return p;
}
static void add_entry(const FileEntry *src) {
if (g_entry_count + 1 > g_entry_capacity) {
g_entry_capacity = g_entry_capacity ? g_entry_capacity * 2 : 1024;
g_entries = realloc(g_entries, sizeof(FileEntry) * g_entry_capacity);
if (!g_entries)
perror_exit("realloc");
}
FileEntry *dst = &g_entries[g_entry_count++];
memset(dst, 0, sizeof(*dst));
dst->size_bytes = src->size_bytes;
dst->created_time = src->created_time;
dst->modified_time = src->modified_time;
if (src->path)
dst->path = strdup(src->path);
strncpy(dst->owner, src->owner, sizeof(dst->owner) - 1);
dst->owner[sizeof(dst->owner) - 1] = '\0';
}
// Release every scanned entry (including its strdup'ed path) plus the
// table itself, and reset the globals so a later scan starts clean.
static void free_entries(void) {
  size_t i = g_entry_count;
  while (i-- > 0)
    free(g_entries[i].path);
  free(g_entries);
  g_entries = NULL;
  g_entry_capacity = 0;
  g_entry_count = 0;
}
// ----------------------------- Owner lookup ------------------------------
// Resolve a numeric uid to a user name via the passwd database; writes
// "UNKNOWN" when no entry exists. Output is always NUL-terminated.
static void get_file_owner(uid_t uid, char *out, size_t out_sz) {
  const struct passwd *pw = getpwuid(uid);
  snprintf(out, out_sz, "%s", pw != NULL ? pw->pw_name : "UNKNOWN");
}
// ----------------------------- Format time helper -------------------------
// Render a Unix timestamp as local time "YYYY-MM-DD HH:MM:SS";
// a zero timestamp (our "unknown" sentinel) becomes "N/A".
static void format_time(uint64_t t, char *out, size_t out_sz) {
  if (t == 0) {
    snprintf(out, out_sz, "N/A");
    return;
  }
  time_t when = (time_t)t;
  struct tm tmv;
#if PLATFORM_WINDOWS
  localtime_s(&tmv, &when);
#else
  localtime_r(&when, &tmv); // thread-safe variant; workers call this concurrently
#endif
  strftime(out, out_sz, "%Y-%m-%d %H:%M:%S", &tmv);
}
// --------------- parallel directory scanning ----------------
// Add queue helper functions
// Enqueue a directory path for scanning and wake one waiting worker.
// The path is copied; the caller's buffer may be reused.
//
// FIX: the previous version used unchecked malloc/strdup — on allocation
// failure it dereferenced NULL. Use the file's abort-on-OOM helpers.
static void dirqueue_push(DirQueue *q, const char *path) {
  DirJob *job = xmalloc(sizeof(*job));
  job->path = strdup(path);
  if (!job->path)
    perror_exit("strdup");
  job->next = NULL;
  pthread_mutex_lock(&q->mutex);
  if (q->tail)
    q->tail->next = job;
  else
    q->head = job;
  q->tail = job;
  pthread_cond_signal(&q->cond); // one job, one worker woken
  pthread_mutex_unlock(&q->mutex);
}
// Block until a directory job is available, or until scanning is finished.
// Returns a heap-allocated path that the caller must free(), or NULL once
// `stop` has been raised.  On a successful pop the caller is counted in
// active_workers until it calls dirqueue_done(); that counter is what lets
// the last idle worker detect that the whole tree has been scanned.
static char *dirqueue_pop(DirQueue *q) {
pthread_mutex_lock(&q->mutex);
// Sleep until a job arrives or shutdown is signalled.
while (!q->head && !q->stop)
pthread_cond_wait(&q->cond, &q->mutex);
// `stop` is only set by dirqueue_done() when the queue is empty and no
// worker is active, so there can be no job to hand out here.
if (q->stop) {
pthread_mutex_unlock(&q->mutex);
return NULL;
}
DirJob *job = q->head;
q->head = job->next;
if (!q->head)
q->tail = NULL;
// This worker now owns a job; it must balance with dirqueue_done().
q->active_workers++;
pthread_mutex_unlock(&q->mutex);
// Unwrap outside the lock: the job node is private to us now.
char *path = job->path;
free(job);
return path;
}
// Mark one directory job as finished. The last worker to go idle while
// the queue is empty raises `stop` and wakes every sleeper so the whole
// scanning pool can exit.
static void dirqueue_done(DirQueue *q) {
  pthread_mutex_lock(&q->mutex);
  q->active_workers--;
  if (q->active_workers == 0 && q->head == NULL) {
    q->stop = 1;
    pthread_cond_broadcast(&q->cond);
  }
  pthread_mutex_unlock(&q->mutex);
}
// Scanning directory worker thread function
// Directory-scanning worker: repeatedly pops a directory, scans it (which
// may push more directories), then reports the job done. Exits when
// dirqueue_pop() returns NULL (scan complete).
//
// FIX: the start routine now returns void * as pthread_create requires.
// Previously it returned void and was invoked through a
// (void *(*)(void *)) cast, which is undefined behavior (calling a
// function through an incompatible function-pointer type).
static void *scan_worker(void *arg) {
  DirQueue *q = arg;
  for (;;) {
    char *dir = dirqueue_pop(q);
    if (!dir)
      break;
    scan_folder_posix_parallel(dir, q);
    free(dir); // dirqueue_pop transferred ownership to us
    dirqueue_done(q);
  }
  return NULL;
}
// Scanning directory function
// Scan one directory (non-recursively): regular files are recorded in the
// global entry table, subdirectories are pushed back onto the queue for
// other workers. Unreadable directories and unstat-able entries are
// silently skipped.
//
// FIX: the snprintf building the child path is now checked for
// truncation; previously an over-long path was silently truncated and the
// wrong file could be stat'ed/recorded.
void scan_folder_posix_parallel(const char *base, DirQueue *q) {
  DIR *d = opendir(base);
  if (!d)
    return;
  struct dirent *ent;
  while ((ent = readdir(d))) {
    if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
      continue;
    char full[MAX_PATHLEN];
    int n = snprintf(full, sizeof(full), "%s/%s", base, ent->d_name);
    if (n < 0 || (size_t)n >= sizeof(full))
      continue; // path would not fit; skip rather than operate on a truncation
    struct stat st;
    if (lstat(full, &st) != 0) // lstat: do not follow symlinks
      continue;
    if (S_ISDIR(st.st_mode)) {
      dirqueue_push(q, full);
    } else if (S_ISREG(st.st_mode)) {
      FileEntry fe;
      memset(&fe, 0, sizeof(fe));
      normalize_path(full);
      fe.path = full; // safe: add_entry() strdups the path
      fe.size_bytes = (uint64_t)st.st_size;
      // NOTE: POSIX st_ctime is status-change time, not creation time;
      // it is the closest portable value available.
      fe.created_time = (uint64_t)st.st_ctime;
      fe.modified_time = (uint64_t)st.st_mtime;
      get_file_owner(st.st_uid, fe.owner, sizeof(fe.owner));
      add_entry(&fe);
    }
  }
  closedir(d);
}
// ----------------------------- Job queue ----------------------------------
// Initialize an empty hashing-job queue and its synchronization primitives.
static void jobqueue_init(JobQueue *q) {
  q->head = NULL;
  q->tail = NULL;
  q->stop = 0;
  atomic_store(&q->count, 0);
  pthread_mutex_init(&q->mutex, NULL);
  pthread_cond_init(&q->cond, NULL);
}
// Append a job to the tail of the queue and wake one waiting worker.
// Ownership of `job` transfers to the queue.
static void jobqueue_push(JobQueue *q, Job *job) {
  pthread_mutex_lock(&q->mutex);
  job->next = NULL;
  Job **link = q->tail ? &q->tail->next : &q->head;
  *link = job;
  q->tail = job;
  atomic_fetch_add(&q->count, 1);
  pthread_cond_signal(&q->cond);
  pthread_mutex_unlock(&q->mutex);
}
// Block for the next hashing job. Returns NULL only after jobqueue_stop()
// has been called AND the queue is fully drained; workers treat NULL as
// "shut down". The popped job is owned (and freed) by the caller.
static Job *jobqueue_pop(JobQueue *q) {
  pthread_mutex_lock(&q->mutex);
  while (q->head == NULL && !q->stop)
    pthread_cond_wait(&q->cond, &q->mutex);
  Job *job = q->head;
  if (job == NULL) { // stop requested and nothing left to hand out
    pthread_mutex_unlock(&q->mutex);
    return NULL;
  }
  q->head = job->next;
  if (q->head == NULL)
    q->tail = NULL;
  pthread_mutex_unlock(&q->mutex);
  atomic_fetch_sub(&q->count, 1);
  return job;
}
// Signal shutdown: once the queue drains, every blocked worker wakes,
// sees `stop`, and exits its loop.
static void jobqueue_stop(JobQueue *q) {
  pthread_mutex_lock(&q->mutex);
  q->stop = 1;
  pthread_cond_broadcast(&q->cond); // wake ALL waiters, not just one
  pthread_mutex_unlock(&q->mutex);
}
// ----------------------------- Hashing helpers -----------------------------
// Stream a file through XXH3-128 and write the 32-hex-digit digest into
// out_hex (which must hold at least HASH_STRLEN bytes). On any failure —
// open error, allocation failure, or a read(2) error mid-file — out_hex
// is set to "ERROR". Bytes read are added to the global throughput
// counter for the progress display.
//
// Fixes over the previous version:
//  - malloc and XXH3_createState results are checked (both were
//    dereferenced unchecked),
//  - a read() error (r < 0) now yields "ERROR" instead of silently
//    reporting the hash of whatever prefix was read.
static void xxh3_hash_file_stream(const char *path, char *out_hex) {
  int fd = open(path, O_RDONLY);
  if (fd < 0) {
    strcpy(out_hex, "ERROR");
    return;
  }
  XXH3_state_t *state = XXH3_createState();
  unsigned char *buf = malloc(READ_BLOCK);
  if (!state || !buf) {
    if (state)
      XXH3_freeState(state);
    free(buf);
    close(fd);
    strcpy(out_hex, "ERROR");
    return;
  }
  XXH3_128bits_reset(state);
  ssize_t r;
  while ((r = read(fd, buf, READ_BLOCK)) > 0) {
    XXH3_128bits_update(state, buf, (size_t)r);
    atomic_fetch_add(&g_bytes_processed, (uint64_t)r);
  }
  int read_failed = (r < 0);
  XXH128_hash_t h = XXH3_128bits_digest(state);
  XXH3_freeState(state);
  free(buf);
  close(fd);
  if (read_failed) {
    strcpy(out_hex, "ERROR");
    return;
  }
  snprintf(out_hex, HASH_STRLEN, "%016llx%016llx", (unsigned long long)h.high64,
           (unsigned long long)h.low64);
}
// ----------------------------- Worker --------------------------------------
// Hashing worker: pulls jobs off the shared queue, hashes each file, and
// appends one tab-separated result line to FILE_HASHES_TXT
// (hash, path, size in KiB, created, modified, owner).
// A process-wide mutex serializes the open/append/close so lines from
// concurrent workers never interleave. Writing incrementally means no
// hash results have to be kept in RAM.
static void *worker_thread_posix(void *argp) {
  static pthread_mutex_t append_mutex = PTHREAD_MUTEX_INITIALIZER;
  WorkerArg *w = (WorkerArg *)argp;
  JobQueue *q = w->queue;
  Job *job;
  while ((job = jobqueue_pop(q)) != NULL) {
    char hex[HASH_STRLEN];
    xxh3_hash_file_stream(job->file->path, hex);
    pthread_mutex_lock(&append_mutex);
    FILE *hf = fopen(FILE_HASHES_TXT, "a");
    if (hf) {
      char created[32], modified[32];
      format_time(job->file->created_time, created, sizeof(created));
      format_time(job->file->modified_time, modified, sizeof(modified));
      double size_kib = (double)job->file->size_bytes / (1024.0);
      fprintf(hf, "%s\t%s\t%.2f\t%s\t%s\t%s\n", hex, job->file->path, size_kib,
              created, modified, job->file->owner);
      fclose(hf);
    }
    pthread_mutex_unlock(&append_mutex);
    atomic_fetch_add(w->done_counter, 1);
    free(job);
  }
  atomic_fetch_sub(w->live_workers, 1);
  return NULL;
}
// ----------------------------- Progress display ---------------------------
// Render a 40-column "#"-filled progress bar on the current line.
// NOTE(review): this helper appears unused in this translation unit —
// main() builds its own bar with throughput display; confirm before removing.
static void print_progress(size_t done, size_t total) {
  enum { BARW = 40 };
  double frac = (total != 0) ? (double)done / (double)total : 0.0;
  int filled = (int)(frac * BARW + 0.5);
  printf("\r[");
  for (int i = 0; i < BARW; ++i)
    putchar(i < filled ? '#' : ' ');
  printf("] %6.2f%% (%zu / %zu) ", frac * 100.0, done, total);
  fflush(stdout);
}
// ----------------------------- Helpers: load/save --------------------------
// Returns 1 when `path` can be stat'ed (exists and is reachable), else 0.
static int file_exists(const char *path) {
  struct stat st;
  return stat(path, &st) == 0 ? 1 : 0;
}
// Persist the scanned file paths, one per line, so a later -resume run
// can skip re-scanning the directory tree.
static void save_file_list(const char *list_path) {
  FILE *out = fopen(list_path, "w");
  if (out == NULL) {
    perror("fopen file_list");
    return;
  }
  for (size_t idx = 0; idx < g_entry_count; ++idx)
    fprintf(out, "%s\n", g_entries[idx].path);
  fclose(out);
}
// Rebuild the in-memory entry table from a previously saved list file.
// Only paths are stored on disk, so timestamps and ownership are
// re-queried from the live filesystem for each entry. Missing list file
// is a no-op (caller checks existence separately).
static void load_file_list(const char *list_path) {
  FILE *in = fopen(list_path, "r");
  if (in == NULL)
    return;
  char line[MAX_PATHLEN];
  while (fgets(line, sizeof(line), in) != NULL) {
    line[strcspn(line, "\r\n")] = '\0';
    FileEntry fe;
    memset(&fe, 0, sizeof(fe));
    fe.path = line; // add_entry() copies, so the local buffer is safe
    platform_get_file_times(line, &fe.created_time, &fe.modified_time);
    platform_get_file_owner(line, fe.owner, sizeof(fe.owner));
    add_entry(&fe);
  }
  fclose(in);
}
// Read existing hashes into memory map for resume
// Simple linear search mapping: returns 1 if path has hash found (and writes
// into out_hex)
// Look up `path` in the hash file written by the workers (lines are
// "HASH\tPATH\tSIZE\tCREATED\tMODIFIED\tOWNER"). On a match, copies the
// hash into out_hex (HASH_STRLEN bytes) and returns 1; otherwise 0.
//
// BUG FIX: the previous implementation scanned the FIRST whitespace token
// into its "path" variable and the SECOND into "hash" — i.e. it compared
// the hash column against `path`, so no line could ever match and
// -resume re-hashed every file. It also split paths containing spaces.
// We now read whole lines and split on the tab separators.
static int find_hash_in_file(const char *hashfile, const char *path,
                             char *out_hex) {
  FILE *f = fopen(hashfile, "r");
  if (!f)
    return 0;
  int found = 0;
  char line[MAX_PATHLEN + 160]; // hash column + path + metadata columns
  while (!found && fgets(line, sizeof(line), f)) {
    char *tab = strchr(line, '\t');
    if (!tab)
      continue; // malformed line
    *tab = '\0'; // line now holds just the hash column
    char *entry_path = tab + 1;
    char *end = strchr(entry_path, '\t');
    if (end)
      *end = '\0'; // path column ends at the next tab
    else
      entry_path[strcspn(entry_path, "\r\n")] = '\0';
    if (strcmp(entry_path, path) == 0) {
      snprintf(out_hex, HASH_STRLEN, "%s", line);
      found = 1;
    }
  }
  fclose(f);
  return found;
}
// ----------------------------- Get file metadata -------------------------
// Fill in the timestamps for `path`; both outputs are 0 when stat fails.
// NOTE: POSIX has no true creation time — st_ctime is the inode
// status-change time and is used here as the closest available value.
void platform_get_file_times(const char *path, uint64_t *out_created,
                             uint64_t *out_modified) {
  struct stat st;
  const int ok = (stat(path, &st) == 0);
  *out_created = ok ? (uint64_t)st.st_ctime : 0;
  *out_modified = ok ? (uint64_t)st.st_mtime : 0;
}
// Resolve the owning user name of `path`; writes "UNKNOWN" when the file
// cannot be stat'ed (or, inside get_file_owner, when the uid has no
// passwd entry).
void platform_get_file_owner(const char *path, char *out_owner,
                             size_t out_owner_size) {
  struct stat st;
  if (stat(path, &st) != 0) {
    snprintf(out_owner, out_owner_size, "UNKNOWN");
    return;
  }
  get_file_owner(st.st_uid, out_owner, out_owner_size);
}
// ----------------------------- Main ---------------------------------------
// Program entry point: parse arguments, enumerate files (parallel scan,
// or reload a saved list with -resume), then hash everything with a pool
// of worker threads while displaying live progress and throughput.
//
// Fixes over the previous version:
//  - `tids` was never freed; freed after joining,
//  - the "nothing to do" early return leaked existing_hash and entries,
//  - calloc() result was unchecked (and then redundantly re-NULLed),
//  - avg MB/s divided by elapsed without guarding against zero,
//  - unused `last_done` local removed; allocations go through xmalloc.
int main(int argc, char **argv) {
  char folders[64][MAX_PATHLEN]; // up to 64 input folders
  int folder_count = 0;
  int resume = 0;
  // Parse arguments: "-resume" continues an interrupted run; every other
  // argument is a folder to scan (arguments beyond 64 folders are ignored).
  for (int i = 1; i < argc; ++i) {
    if (strcmp(argv[i], "-resume") == 0) {
      resume = 1;
    } else if (folder_count < 64) {
      strncpy(folders[folder_count], argv[i], MAX_PATHLEN - 1);
      folders[folder_count][MAX_PATHLEN - 1] = 0;
      folder_count++;
    }
  }
  // No folders given: ask interactively (fresh run) or default to "."
  // (resume run, where the list file already determines the files).
  if (folder_count == 0 && !resume) {
    printf("Enter folder to process (Enter = current folder): ");
    fflush(stdout);
    char buf[MAX_PATHLEN];
    if (!fgets(buf, sizeof(buf), stdin))
      return 1;
    buf[strcspn(buf, "\r\n")] = 0;
    if (buf[0] == 0) {
      strcpy(folders[0], ".");
    } else {
      strncpy(folders[0], buf, MAX_PATHLEN - 1);
      folders[0][MAX_PATHLEN - 1] = 0;
    }
    folder_count = 1;
  } else if (folder_count == 0 && resume) {
    strcpy(folders[0], ".");
    folder_count = 1;
  }
  printf("Processing %d folder(s):\n", folder_count);
  for (int i = 0; i < folder_count; ++i)
    printf(" - %s\n", folders[i]);
  // Detect hardware threads; oversubscribe 2x to overlap I/O with hashing.
  size_t hw_threads = 1;
  long cpus = sysconf(_SC_NPROCESSORS_ONLN);
  if (cpus > 0)
    hw_threads = (size_t)cpus;
  size_t num_threads = hw_threads * 2;
  if (num_threads < 2)
    num_threads = 2;
  // Step 1: enumerate files — parallel scan, or reload the saved list.
  if (!resume) {
    DirQueue q = {0};
    pthread_mutex_init(&q.mutex, NULL);
    pthread_cond_init(&q.cond, NULL);
    for (int i = 0; i < folder_count; ++i)
      dirqueue_push(&q, folders[i]);
    pthread_t *threads = xmalloc(sizeof(pthread_t) * num_threads);
    // NOTE(review): pthread start routines must return void *. The cast
    // below papers over scan_worker's signature; confirm it matches
    // void *(*)(void *) — calling through an incompatible function
    // pointer type is undefined behavior.
    for (size_t i = 0; i < num_threads; ++i)
      pthread_create(&threads[i], NULL, (void *(*)(void *))scan_worker, &q);
    for (size_t i = 0; i < num_threads; ++i)
      pthread_join(threads[i], NULL);
    free(threads);
    pthread_mutex_destroy(&q.mutex);
    pthread_cond_destroy(&q.cond);
    printf("Found %zu files. Saving to %s\n", g_entry_count, FILE_LIST_TXT);
    save_file_list(FILE_LIST_TXT);
  } else {
    if (!file_exists(FILE_LIST_TXT)) {
      fprintf(stderr, "Resume requested but %s not found\n", FILE_LIST_TXT);
      return 1;
    }
    load_file_list(FILE_LIST_TXT);
    printf("Loaded %zu files from %s\n", g_entry_count, FILE_LIST_TXT);
  }
  if (g_entry_count == 0) {
    printf("No files to process.\n");
    return 0;
  }
  // On resume, record which files already have a hash on disk.
  // calloc zero-fills, so every slot starts out NULL.
  char **existing_hash = calloc(g_entry_count, sizeof(char *));
  if (!existing_hash)
    perror_exit("calloc");
  if (resume && file_exists(FILE_HASHES_TXT)) {
    // Simple per-entry linear scan of the hash file; O(n*m) but adequate.
    for (size_t i = 0; i < g_entry_count; ++i) {
      char hex[HASH_STRLEN] = {0};
      if (find_hash_in_file(FILE_HASHES_TXT, g_entries[i].path, hex))
        existing_hash[i] = strdup(hex);
    }
  }
  // Queue only files that still need hashing (all of them when not resuming).
  JobQueue queue;
  jobqueue_init(&queue);
  size_t total_jobs = 0;
  for (size_t i = 0; i < g_entry_count; ++i) {
    if (resume && existing_hash[i])
      continue;
    Job *j = xmalloc(sizeof(Job));
    j->file = &g_entries[i];
    j->next = NULL;
    jobqueue_push(&queue, j);
    ++total_jobs;
  }
  if (total_jobs == 0) {
    printf("Nothing to do — all files already hashed.\n");
    // FIX: this early exit previously leaked the hash map and entries.
    for (size_t i = 0; i < g_entry_count; ++i)
      free(existing_hash[i]);
    free(existing_hash);
    free_entries();
    return 0;
  }
  // Fresh run: truncate the output file; workers append as they finish.
  // On resume we keep the file and only append the missing entries.
  if (!resume) {
    FILE *hf = fopen(FILE_HASHES_TXT, "w");
    if (hf)
      fclose(hf);
  }
  // Start the hashing thread pool.
  atomic_size_t done_counter;
  atomic_store(&done_counter, 0);
  atomic_int live_workers;
  atomic_store(&live_workers, (int)num_threads);
  WorkerArg warg = {.queue = &queue,
                    .done_counter = &done_counter,
                    .total_jobs = total_jobs,
                    .live_workers = &live_workers};
  printf("Starting thread pool: %zu threads (CPU cores: %zu)\n", num_threads,
         hw_threads);
  pthread_t *tids = xmalloc(sizeof(pthread_t) * num_threads);
  for (size_t i = 0; i < num_threads; ++i)
    pthread_create(&tids[i], NULL, worker_thread_posix, &warg);
  // Progress loop. Throughput is sampled every 0.5 s independently of the
  // 100 ms UI refresh so the MB/s figure stays stable.
  struct timespec tstart, tnow;
  clock_gettime(CLOCK_MONOTONIC, &tstart);
  uint64_t last_bytes = atomic_load(&g_bytes_processed);
  double last_time = 0.0;
  double displayed_speed = 0.0;
  const double sample_interval = 0.5;
  char linebuf[256];
  for (;;) {
    size_t done = (size_t)atomic_load(&done_counter);
    clock_gettime(CLOCK_MONOTONIC, &tnow);
    double now =
        (tnow.tv_sec - tstart.tv_sec) + (tnow.tv_nsec - tstart.tv_nsec) / 1e9;
    uint64_t bytes = atomic_load(&g_bytes_processed);
    if (last_time == 0.0) { // first pass: establish the sampling baseline
      last_time = now;
      last_bytes = bytes;
    }
    double dt = now - last_time;
    if (dt >= sample_interval) {
      uint64_t db = bytes - last_bytes;
      if (db > 0 && dt > 0.0001)
        displayed_speed = (double)db / (1024.0 * 1024.0) / dt;
      last_bytes = bytes;
      last_time = now;
    }
    // Build the bar in a buffer and emit with a single printf per refresh.
    const int barw = 40;
    double pct = total_jobs ? (double)done / (double)total_jobs : 0.0;
    int filled = (int)(pct * barw + 0.5);
    int p = 0;
    p += snprintf(linebuf + p, sizeof(linebuf) - p, "[");
    for (int i = 0; i < filled && p < (int)sizeof(linebuf); ++i)
      p += snprintf(linebuf + p, sizeof(linebuf) - p, "#");
    for (int i = filled; i < barw && p < (int)sizeof(linebuf); ++i)
      p += snprintf(linebuf + p, sizeof(linebuf) - p, ".");
    snprintf(linebuf + p, sizeof(linebuf) - p,
             "] %6.2f%% (%zu / %zu) %8.2f MB/s", pct * 100.0, done, total_jobs,
             displayed_speed);
    printf("\r%s", linebuf);
    fflush(stdout);
    if (done >= total_jobs)
      break;
    usleep(100000);
  }
  printf("\n\n");
  // All jobs are processed; wake any workers blocked on the empty queue
  // so they observe `stop` and exit, then join them.
  jobqueue_stop(&queue);
  for (size_t i = 0; i < num_threads; ++i)
    pthread_join(tids[i], NULL);
  free(tids); // FIX: was leaked
  clock_gettime(CLOCK_MONOTONIC, &tnow);
  double elapsed =
      (tnow.tv_sec - tstart.tv_sec) + (tnow.tv_nsec - tstart.tv_nsec) / 1e9;
  printf("Completed hashing %zu files in %.2f seconds\n", total_jobs, elapsed);
  uint64_t total_bytes = (uint64_t)atomic_load(&g_bytes_processed);
  double total_mb = (double)total_bytes / (1024.0 * 1024.0);
  // FIX: guard against division by zero for (near-)instant runs.
  double avg_mbps = (elapsed > 0.0) ? total_mb / elapsed : 0.0;
  printf("Total: %.2f MB, Average: %.2f MB/s\n", total_mb, avg_mbps);
  // Results were appended incrementally by the workers, so nothing is
  // held in RAM here — just release the bookkeeping memory.
  for (size_t i = 0; i < g_entry_count; ++i)
    free(existing_hash[i]); // free(NULL) is a no-op
  free(existing_hash);
  free_entries();
  return 0;
}