Added simple entry hashing based on size.

2026-05-28 00:21:10 +02:00 · 2012-01-23 22:48:32 +01:00
parent f2012c256a
commit 26eb2ca3eb
3 changed files with 46 additions and 33 deletions
@@ -5,6 +5,7 @@ Maintained since version 0.3.1.

 2012-01-23  Camilla Berglund  <elmindreda@elmindreda.org>

+ * duffdriver.c: Moved from a single entry list to BUCKET_COUNT entry lists, indexed by file size.
 * duffdriver.c (process_args): Added function.
 * duffdriver.c (process_path, report_clusters): Made private to duffdriver.
 * duff.c duffdriver.c: Moved argument processing to duffdriver.
@@ -34,6 +34,7 @@
 #endif

 #define SAMPLE_SIZE 512
+#define HASH_BITS 8

 /* Typedefs for structs and enums.
 */
@@ -84,6 +84,9 @@
 #include "duffstring.h"
 #include "duff.h"

+#define BUCKET_COUNT (1 << HASH_BITS)
+#define BUCKET_INDEX(size) ((size) & (BUCKET_COUNT - 1))
+
 /* These flags are defined and documented in duff.c.
 */
 extern int follow_links_mode;
@@ -97,9 +100,9 @@ extern int excess_flag;
 extern const char* header_format;
 extern int header_uses_digest;

-/* List of collected entries.
+/* Buckets of lists of collected entries, indexed by file size.
 */
-static List files;
+static List buckets[BUCKET_COUNT];  /* one entry list per size bucket */
 /* List head for traversed directories.
 */
 static Directory* directories = NULL;
@@ -262,19 +265,19 @@ static void process_file(const char* path, struct stat* sb)
  {
    /* TODO: Make this less pessimal */

-    size_t i;
+    size_t i, bucket = BUCKET_INDEX(sb->st_size);

-    for (i = 0;  i < files.allocated;  i++)
+    for (i = 0;  i < buckets[bucket].allocated;  i++)
    {
-      if (files.entries[i].device == sb->st_dev &&
-          files.entries[i].inode == sb->st_ino)
+      if (buckets[bucket].entries[i].device == sb->st_dev &&
+          buckets[bucket].entries[i].inode == sb->st_ino)
      {
        return;
      }
    }
  }

-  fill_entry(entry_list_alloc(&files), path, sb);
+  fill_entry(entry_list_alloc(&buckets[BUCKET_INDEX(sb->st_size)]), path, sb);
 }

 /* Initializes the driver, processes the specified arguments and reports the
@@ -282,10 +285,11 @@ static void process_file(const char* path, struct stat* sb)
 */
 void process_args(int argc, char** argv)
 {
-  size_t i;
+  size_t i, j;
  char path[PATH_MAX];

-  entry_list_init(&files);
+  for (i = 0;  i < BUCKET_COUNT;  i++)
+    entry_list_init(&buckets[i]);

  if (argc)
  {
@@ -308,10 +312,13 @@ void process_args(int argc, char** argv)

  report_clusters();

-  for (i = 0;  i < files.allocated;  i++)
-    free_entry(&files.entries[i]);
+  for (i = 0;  i < BUCKET_COUNT;  i++)
+  {
+    for (j = 0;  j < buckets[i].allocated;  j++)
+      free_entry(&buckets[i].entries[j]);

-  entry_list_free(&files);
+    entry_list_free(&buckets[i]);
+  }
 }

 /* Processes a path name, whether from the command line or from
@@ -447,37 +454,41 @@ static void report_clusters(void)

  entry_list_init(&duplicates);

-  for (first = 0;  first < files.allocated;  first++)
+  for (i = 0;  i < BUCKET_COUNT;  i++)
  {
-    if (files.entries[first].status == INVALID ||
-        files.entries[first].status == REPORTED)
+    for (first = 0;  first < buckets[i].allocated;  first++)
    {
-      continue;
-    }
-
-    for (second = first + 1;  second < files.allocated;  second++)
-    {
-      if (files.entries[second].status == INVALID ||
-          files.entries[second].status == REPORTED)
+      if (buckets[i].entries[first].status == INVALID ||
+          buckets[i].entries[first].status == REPORTED)
      {
-          continue;
+        continue;
      }

-      if (compare_entries(&files.entries[first], &files.entries[second]) == 0)
+      for (second = first + 1;  second < buckets[i].allocated;  second++)
      {
-	if (duplicates.allocated == 0)
-          *entry_list_alloc(&duplicates) = files.entries[first];
+        if (buckets[i].entries[second].status == INVALID ||
+            buckets[i].entries[second].status == REPORTED)
+        {
+            continue;
+        }

-        *entry_list_alloc(&duplicates) = files.entries[second];
+        if (compare_entries(&buckets[i].entries[first],
+                            &buckets[i].entries[second]) == 0)
+        {
+          if (duplicates.allocated == 0)
+            *entry_list_alloc(&duplicates) = buckets[i].entries[first];
+
+          *entry_list_alloc(&duplicates) = buckets[i].entries[second];
+        }
      }
-    }

-    if (duplicates.allocated)
-    {
-      report_cluster(&duplicates, index);
-      entry_list_empty(&duplicates);
+      if (duplicates.allocated)
+      {
+        report_cluster(&duplicates, index);
+        entry_list_empty(&duplicates);

-      index++;
+        index++;
+      }
    }
  }