Added simple entry hashing based on size.

This commit is contained in:
Camilla Berglund
2012-01-23 22:48:32 +01:00
parent f2012c256a
commit 26eb2ca3eb
3 changed files with 46 additions and 33 deletions
+1
View File
@@ -5,6 +5,7 @@ Maintained since version 0.3.1.
2012-01-23 Camilla Berglund <elmindreda@elmindreda.org>
* duffdriver.c: Moved from single to BUCKET_COUNT entry lists, indexed by size.
* duffdriver.c (process_args): Added function.
* duffdriver.c (process_path, report_clusters): Made private to duffdriver.
* duff.c duffdriver.c: Moved argument processing to duffdriver.
+1
View File
@@ -34,6 +34,7 @@
#endif
#define SAMPLE_SIZE 512
#define HASH_BITS 8
/* Typedefs for structs and enums.
*/
+44 -33
View File
@@ -84,6 +84,9 @@
#include "duffstring.h"
#include "duff.h"
/* Number of entry buckets; always a power of two since it is 1 << HASH_BITS.
 */
#define BUCKET_COUNT (1 << HASH_BITS)
/* Maps a size to a bucket index by keeping only its low HASH_BITS bits.
 * The mask is only valid because BUCKET_COUNT is a power of two.
 */
#define BUCKET_INDEX(size) ((size) & (BUCKET_COUNT - 1))
/* These flags are defined and documented in duff.c.
*/
extern int follow_links_mode;
@@ -97,9 +100,9 @@ extern int excess_flag;
extern const char* header_format;
extern int header_uses_digest;
/* List of collected entries.
/* Buckets, each a list of collected entries.
*/
static List files;
static List buckets[1 << HASH_BITS];
/* List head for traversed directories.
*/
static Directory* directories = NULL;
@@ -262,19 +265,19 @@ static void process_file(const char* path, struct stat* sb)
{
/* TODO: Make this less pessimal */
size_t i;
size_t i, bucket = BUCKET_INDEX(sb->st_size);
for (i = 0; i < files.allocated; i++)
for (i = 0; i < buckets[bucket].allocated; i++)
{
if (files.entries[i].device == sb->st_dev &&
files.entries[i].inode == sb->st_ino)
if (buckets[bucket].entries[i].device == sb->st_dev &&
buckets[bucket].entries[i].inode == sb->st_ino)
{
return;
}
}
}
fill_entry(entry_list_alloc(&files), path, sb);
fill_entry(entry_list_alloc(&buckets[BUCKET_INDEX(sb->st_size)]), path, sb);
}
/* Initializes the driver, processes the specified arguments and reports the
@@ -282,10 +285,11 @@ static void process_file(const char* path, struct stat* sb)
*/
void process_args(int argc, char** argv)
{
size_t i;
size_t i, j;
char path[PATH_MAX];
entry_list_init(&files);
for (i = 0; i < BUCKET_COUNT; i++)
entry_list_init(&buckets[i]);
if (argc)
{
@@ -308,10 +312,13 @@ void process_args(int argc, char** argv)
report_clusters();
for (i = 0; i < files.allocated; i++)
free_entry(&files.entries[i]);
for (i = 0; i < BUCKET_COUNT; i++)
{
for (j = 0; j < buckets[i].allocated; j++)
free_entry(&buckets[i].entries[j]);
entry_list_free(&files);
entry_list_free(&buckets[i]);
}
}
/* Processes a path name, whether from the command line or from
@@ -447,37 +454,41 @@ static void report_clusters(void)
entry_list_init(&duplicates);
for (first = 0; first < files.allocated; first++)
for (i = 0; i < BUCKET_COUNT; i++)
{
if (files.entries[first].status == INVALID ||
files.entries[first].status == REPORTED)
for (first = 0; first < buckets[i].allocated; first++)
{
continue;
}
for (second = first + 1; second < files.allocated; second++)
{
if (files.entries[second].status == INVALID ||
files.entries[second].status == REPORTED)
if (buckets[i].entries[first].status == INVALID ||
buckets[i].entries[first].status == REPORTED)
{
continue;
continue;
}
if (compare_entries(&files.entries[first], &files.entries[second]) == 0)
for (second = first + 1; second < buckets[i].allocated; second++)
{
if (duplicates.allocated == 0)
*entry_list_alloc(&duplicates) = files.entries[first];
if (buckets[i].entries[second].status == INVALID ||
buckets[i].entries[second].status == REPORTED)
{
continue;
}
*entry_list_alloc(&duplicates) = files.entries[second];
if (compare_entries(&buckets[i].entries[first],
&buckets[i].entries[second]) == 0)
{
if (duplicates.allocated == 0)
*entry_list_alloc(&duplicates) = buckets[i].entries[first];
*entry_list_alloc(&duplicates) = buckets[i].entries[second];
}
}
}
if (duplicates.allocated)
{
report_cluster(&duplicates, index);
entry_list_empty(&duplicates);
if (duplicates.allocated)
{
report_cluster(&duplicates, index);
entry_list_empty(&duplicates);
index++;
index++;
}
}
}