Skip to content

Instantly share code, notes, and snippets.

@jrosskopf
Created January 23, 2012 15:04
Show Gist options
  • Save jrosskopf/1663558 to your computer and use it in GitHub Desktop.
Save jrosskopf/1663558 to your computer and use it in GitHub Desktop.
14_dups
#include "dups.h"
void add_single_file_to_ht(hash_table *ht, char *absolute_file_name, struct stat *stat) {
assert(ht);
assert(absolute_file_name);
size_t file_size = (size_t)stat->st_size;
ht_insert(ht, file_size, (hash_value *)absolute_file_name);
}
char *build_absolute_name(char *directory_name, char *file_name) {
unsigned int len = strlen(directory_name) + strlen("/") + strlen(file_name) + 1;
char *abs_file_name = (char *)malloc(len * sizeof(char));
memset(abs_file_name,'\0', len);
strncat(abs_file_name, directory_name, len);
strncat(abs_file_name, "/", len);
strncat(abs_file_name, file_name, len);
return abs_file_name;
}
void add_files_to_ht(hash_table *ht, char *directory_name) {
DIR *d_handle = opendir(directory_name);
if (d_handle == NULL) {
fprintf(stderr, "**Warning** Kann Verzeichnis '%s' nicht öffnen\n", directory_name);
return;
}
struct dirent *d_entry = NULL;
for(d_entry = readdir(d_handle); d_entry != NULL; d_entry = readdir(d_handle))
{
if (strcmp(d_entry->d_name, ".") == 0 || strcmp(d_entry->d_name, "..") == 0)
continue;
char *absolute_file_name = build_absolute_name(directory_name, d_entry->d_name);
struct stat stat;
if (lstat(absolute_file_name, &stat) == -1) {
fprintf(stderr, "**Warning** Stat für '%s' fehlgeschlagen\n", absolute_file_name);
free(absolute_file_name);
continue;
}
//printf("-> %s\n", absolute_file_name);
if (S_ISDIR(stat.st_mode))
add_files_to_ht(ht, absolute_file_name);
if (S_ISREG(stat.st_mode))
add_single_file_to_ht(ht, absolute_file_name, &stat);
free(absolute_file_name);
}
closedir(d_handle);
}
void begin_add_files_to_ht(hash_table *ht, char *rel_dir_name) {
assert(ht);
assert(rel_dir_name);
if (chdir(rel_dir_name) != 0) {
fprintf(stderr, "**Warning** Kann nicht ins Verzeichnis '%s' wechseln\n", rel_dir_name);
return;
}
char abs_dir_name[MAXPATHLEN];
if (getcwd(abs_dir_name, MAXPATHLEN) == NULL) {
fprintf(stderr, "**Warning** Kann aktuelles Arbeitsverzeichnis nicht öffnen\n");
return;
}
add_files_to_ht(ht, abs_dir_name);
}
void show_duplicates_if_neccessary(hash_table *ht, size_t unique_key) {
assert(ht);
hash_query_result *res = ht_query(ht, unique_key);
if (res->n_results < 2) {
ht_query_free(ht, res);
return;
}
int i = 0;
for (hash_entry *cur_entry = res->first_entry; cur_entry != NULL; cur_entry = cur_entry->next) {
if (i == 0)
printf("+ %10lu %s\n", cur_entry->key, (char *)cur_entry->value);
else
printf("+->%10s %s\n", " ", (char *)cur_entry->value);
i++;
}
printf("\n");
ht_query_free(ht, res);
}
int main(int argc, char **argv) {
hash_table *ht = ht_new(1024);
for (int i = 1; i < argc; i++) {
begin_add_files_to_ht(ht, argv[i]);
}
for (unsigned int i = 0; i < ht->n_unique_keys; i++) {
show_duplicates_if_neccessary(ht, ht->key_buffer[i]);
}
//ht_dump(ht);
ht_free(ht);
return 0;
}
#ifndef _DUPS_H_
#define _DUPS_H_
#include <dirent.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include "hashtable.h"
#endif // _DUPS_H_
#include "hashtable.h"
unsigned int _ht_size_t_hash_key(size_t key) {
return key;
}
bool _ht_size_t_key_eq(size_t k1, size_t k2) {
return k1 == k2 ? true : false;
}
hash_entry *_ht_dup_entry(hash_entry *orig_entry) {
hash_entry *new_entry = calloc(1, sizeof(hash_entry));
new_entry->key = orig_entry->key;
new_entry->value = (hash_value *)strdup((char *)(orig_entry->value));
return new_entry;
}
void _ht_free_entry(hash_entry *entry) {
if (entry == NULL)
return;
if (entry->value != NULL)
free(entry->value);
}
hash_table *ht_new(unsigned int capacity) {
hash_table *ht = calloc(1, sizeof(hash_table));
assert(ht);
ht->n_buckets = capacity;
ht->buckets = calloc(capacity, sizeof(hash_bucket));
assert(ht->buckets);
ht->key_buffer_size = KEY_BUFFER_INCREMENT;
ht->n_unique_keys = 0;
ht->key_buffer = malloc(ht->key_buffer_size * sizeof(size_t));
ht->hash = _ht_size_t_hash_key;
ht->are_keys_eq = _ht_size_t_key_eq;
ht->dup_entry = _ht_dup_entry;
ht->free_entry = _ht_free_entry;
return ht;
}
void _ht_dump_bucket(hash_bucket bucket) {
for (hash_entry *cur_entry = bucket.first_entry; cur_entry != NULL; cur_entry = cur_entry->next) {
printf("{%lu => %s} ", cur_entry->key, (char *)cur_entry->value);
}
}
void ht_dump(hash_table *ht) {
assert(ht != NULL);
for (int i = 0; i < ht->n_buckets; i++) {
printf("[%04d] -> { ", i);
_ht_dump_bucket(ht->buckets[i]);
printf("}\n");
}
printf("Unique Keys in Hashtable: %lu\n", ht->n_unique_keys);
printf("Total Keys in Hashtable: %lu\n", ht->n_keys);
//for (size_t i = 0; i < ht->n_unique_keys; i++) {
// printf("%lu, " (ht->key_buffer[i]));
//}
printf("\n\n");
}
void _ht_free_bucket(hash_table *ht, hash_bucket bucket) {
for (hash_entry *cur_entry = bucket.first_entry; cur_entry != NULL;) {
ht->free_entry(cur_entry);
hash_entry *prev_entry = cur_entry;
cur_entry = cur_entry->next;
free(prev_entry);
}
}
void ht_free(hash_table *ht) {
if (ht == NULL)
return;
for (unsigned int i = 0; i < ht->n_buckets; i++) {
_ht_free_bucket(ht, ht->buckets[i]);
}
free(ht->buckets);
free(ht->key_buffer);
free(ht);
}
hash_entry *ht_get_entry_first_or_default(hash_table *ht, size_t key) {
assert(ht);
unsigned int hash_value = ht->hash(key) % ht->n_buckets;
hash_bucket bucket = ht->buckets[hash_value];
for (hash_entry *cur_entry = bucket.first_entry; cur_entry != NULL; cur_entry = cur_entry->next) {
if (ht->are_keys_eq(cur_entry->key, key) == true)
return cur_entry;
}
return NULL;
}
bool ht_contains_key(hash_table *ht, size_t key) {
assert(ht);
return ht_get_entry_first_or_default(ht, key) != NULL ? true : false;
}
void _ht_add_key_to_unique_buffer(hash_table *ht, size_t key) {
assert(ht);
ht->n_keys++;
if (ht_contains_key(ht, key) == true)
return;
if (ht->key_buffer_size <= ht->n_unique_keys) {
ht->key_buffer_size += KEY_BUFFER_INCREMENT;
ht->key_buffer = realloc(ht->key_buffer, ht->key_buffer_size * sizeof(size_t));
assert(ht->key_buffer);
}
ht->key_buffer[ht->n_unique_keys++] = key;
}
void ht_insert(hash_table *ht, size_t key, hash_value *value) {
assert(ht);
_ht_add_key_to_unique_buffer(ht, key);
unsigned int hash_value = ht->hash(key) % ht->n_buckets;
hash_bucket *bucket = &(ht->buckets[hash_value]);
hash_entry *entry = calloc(1, sizeof(hash_entry));
entry->key = key;
entry->value = strdup((char *)(value));
entry->next = bucket->first_entry;
bucket->first_entry = entry;
}
void ht_put(hash_table *ht, size_t key, hash_value *value) {
assert(ht);
hash_entry *entry = ht_get_entry_first_or_default(ht, key);
if (entry == NULL) {
ht_insert(ht, key, value);
}
else {
ht->free_entry(entry);
entry->key = key;
entry->value = value;
}
}
hash_value *ht_get_first_or_default(hash_table *ht, size_t key) {
assert(ht);
hash_entry *entry = ht_get_entry_first_or_default(ht, key);
if (entry == NULL)
return NULL;
return entry->value;
}
hash_query_result *ht_query(hash_table *ht, size_t key) {
assert(ht);
hash_query_result *ret = calloc(1, sizeof(hash_query_result));
ret->key = key;
unsigned int hash_value = ht->hash(key) % ht->n_buckets;
hash_bucket bucket = ht->buckets[hash_value];
for (hash_entry *cur_entry = bucket.first_entry; cur_entry != NULL; cur_entry = cur_entry->next) {
if (! ht->are_keys_eq(cur_entry->key, key))
continue;
hash_entry *entry_copy = ht->dup_entry(cur_entry);
entry_copy->next = ret->first_entry;
ret->first_entry = entry_copy;
ret->n_results++;
}
return ret;
}
void ht_query_free(hash_table *ht, hash_query_result *res) {
assert(ht);
if (res == NULL)
return;
for (hash_entry *cur_entry = res->first_entry; cur_entry != NULL;) {
ht->free_entry(cur_entry);
hash_entry *prev_entry = cur_entry;
cur_entry = cur_entry->next;
free(prev_entry);
}
free(res);
}
void ht_query_dump(hash_query_result *res) {
printf("[%04lu] { ", res->key);
for (hash_entry *cur_entry = res->first_entry; cur_entry != NULL; cur_entry = cur_entry->next) {
printf("{%s} ", (char *)cur_entry->value);
}
printf("}\n");
}
#ifndef _HASHTABLE_H_
#define _HASHTABLE_H_
#include <assert.h>
#include <dirent.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#define KEY_BUFFER_INCREMENT 1024
typedef char * hash_value;
struct _hash_entry;
typedef unsigned int (*func_hash)(size_t key);
typedef struct _hash_entry *(*func_dup_entry)(struct _hash_entry *orig_entry);
typedef void (*func_free_entry)(struct _hash_entry *entry);
typedef bool (*func_are_keys_eq)(size_t k1, size_t k2);
typedef struct _hash_entry {
size_t key;
hash_value *value;
struct _hash_entry *next;
} hash_entry;
typedef struct _hash_bucket {
hash_entry *first_entry;
} hash_bucket;
typedef struct _hash_table {
unsigned int n_buckets;
hash_bucket *buckets;
size_t key_buffer_size;
size_t n_unique_keys;
size_t n_keys;
size_t *key_buffer;
func_hash hash;
func_are_keys_eq are_keys_eq;
func_dup_entry dup_entry;
func_free_entry free_entry;
} hash_table;
typedef struct _hash_query_result {
unsigned int n_results;
size_t key;
hash_entry *first_entry;
} hash_query_result;
hash_table *ht_new(unsigned int capacity);
void ht_free(hash_table *ht);
void ht_dump(hash_table *ht);
hash_entry *ht_get_entry_first_or_default(hash_table *ht, size_t key);
bool ht_contains_key(hash_table *ht, size_t key);
void ht_insert(hash_table *ht, size_t key, hash_value *value);
void ht_put(hash_table *ht, size_t key, hash_value *value);
hash_value *ht_get_first_or_default(hash_table *ht, size_t key);
hash_query_result *ht_query(hash_table *ht, size_t key);
void ht_query_free(hash_table *ht, hash_query_result *res);
void ht_query_dump(hash_query_result *res);
#endif // _HASHTABLE_H_
CFLAGS = -Wall -g -std=gnu99
LFLAGS = -R/usr/local/lib -lm
SHARED = -fPIC -shared
CC = gcc
DEPENDFILE = .dependencies
SRC = dups.c hashtable.c
SHSRC =
OBJ = $(SRC:%.c=%.o)
SHOBJ = $(SHSRC:%.c=%.so)
BIN = dups
.PHONY: dep clean
all: $(OBJ) $(SHOBJ)
$(CC) $(CFLAGS) -o $(BIN) $(OBJ) $(LFLAGS)
dep:
$(CC) -MM $(SRC) $(SHSRC) > $(DEPENDFILE)
-include $(DEPENDFILE)
$(SHOBJ): %.so: %.c
$(CC) -o $@ $(SHARED) $(CFLAGS) $< $(LFLAGS)
%.o: %.c
$(CC) $(CFLAGS) -c $<
clean:
rm -f $(OBJ) $(SHOBJ) $(BIN) $(DEPENDFILE)
rm -rf *.dSYMS
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment