mirror of
https://github.com/phishdestroy/destroylist.git
synced 2025-12-14 20:38:37 +01:00
130 lines
3.6 KiB
Python
130 lines
3.6 KiB
Python
#!/usr/bin/env python3
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Set, List, Tuple
|
|
import tldextract
|
|
|
|
# Absolute directory containing this script, and that directory's parent.
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parent

# JSON inputs resolved relative to PROJECT_ROOT: the domain list that main()
# reads and rewrites, and the allowlist it is filtered against.
LIST_FILE = PROJECT_ROOT.joinpath('list.json')
ALLOWLIST_FILE = PROJECT_ROOT.joinpath('allow', 'allowlist.json')
|
|
|
|
def load_json_list(filepath: Path) -> List[str]:
    """Load a JSON array of domains and return the entries normalized.

    Every truthy element is converted with ``str()``, stripped of surrounding
    whitespace and lower-cased. Entries that are empty *after* stripping are
    dropped, so whitespace-only strings never reach the caller as ``""``.

    Args:
        filepath: Location of the JSON file to read.

    Returns:
        The normalized entries in file order. An empty list is returned when
        the file does not exist, when it cannot be read or parsed (the error
        is reported on stderr), or when its top-level value is not an array.
    """
    if not filepath.exists():
        return []

    # Keep the try body limited to the read/parse step so that programming
    # errors in the normalization below are not silently swallowed.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError, RecursionError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # RecursionError is what json raises for pathologically nested input.
        print(f"Error loading {filepath}: {e}", file=sys.stderr)
        return []

    if not isinstance(data, list):
        return []

    normalized = (str(item).strip().lower() for item in data if item)
    return [entry for entry in normalized if entry]
|
|
|
|
def get_registered_domain(domain: str) -> str:
    """Return the registrable part of *domain* as ``<name>.<suffix>``.

    The input is returned unchanged when tldextract reports no public suffix,
    or when there is no label in front of the suffix (the input is itself a
    bare suffix). In the second case joining the parts would yield a string
    with a leading dot, which is not a usable domain name.

    Args:
        domain: Host name to reduce.

    Returns:
        ``f"{ext.domain}.{ext.suffix}"`` when both parts are present,
        otherwise *domain* itself.
    """
    ext = tldextract.extract(domain)
    if not ext.suffix or not ext.domain:
        return domain
    return f"{ext.domain}.{ext.suffix}"
|
|
|
|
def deduplicate_subdomains(domains: List[str]) -> Tuple[List[str], int]:
    """Collapse each group of domains onto its registered domain when listed.

    Domains are grouped by ``get_registered_domain``. If a group contains the
    registered domain itself, only that entry is kept and every other member
    of the group is counted as removed. Groups that do not contain their
    registered domain are kept in full.

    Args:
        domains: Domain names to process.

    Returns:
        A tuple of the sorted surviving domains and the number of entries
        that were dropped.
    """
    groups = {}
    for name in domains:
        groups.setdefault(get_registered_domain(name), []).append(name)

    survivors = []
    dropped = 0
    for registered, members in groups.items():
        if registered not in members:
            # The registered domain is not listed: nothing covers the group.
            survivors.extend(members)
            continue
        survivors.append(registered)
        dropped += len(members) - 1

    survivors.sort()
    return survivors, dropped
|
|
|
|
def _filter_allowlisted(domains: List[str], allowlist: List[str]) -> Tuple[List[str], int]:
    """Return the domains not matched by the allowlist, plus the match count.

    Allowlist entries starting with ``'.'`` are suffix patterns: they match
    any domain ending with the pattern and also the bare domain (the pattern
    without its leading dot). All other entries must match exactly.
    """
    suffixes = tuple(entry for entry in allowlist if entry.startswith('.'))
    # Exact matches: plain entries plus the bare form of every suffix pattern.
    exact = {entry for entry in allowlist if not entry.startswith('.')}
    exact.update(pattern[1:] for pattern in suffixes)

    # str.endswith accepts a tuple, so all suffix patterns are checked in one
    # call instead of a Python-level loop per domain.
    kept = [d for d in domains if d not in exact and not d.endswith(suffixes)]
    return kept, len(domains) - len(kept)


def main() -> int:
    """Clean the domain list in place and print a summary of what changed.

    Steps, in order:
      1. Load LIST_FILE and ALLOWLIST_FILE with ``load_json_list``.
      2. Drop every domain matched by the allowlist.
      3. If ``--dedupe-subdomains`` is present in ``sys.argv``, collapse
         subdomains with ``deduplicate_subdomains``.
      4. Drop exact duplicates and sort the result.
      5. Rewrite LIST_FILE, but only when the number of entries shrank.

    Returns:
        1 when no domains were loaded from LIST_FILE, otherwise 0.
    """
    dedupe_subdomains = '--dedupe-subdomains' in sys.argv

    print("Loading lists...")
    domains = load_json_list(LIST_FILE)
    allowlist = load_json_list(ALLOWLIST_FILE)

    if not domains:
        print("No domains found in list.json")
        return 1

    print(f"Loaded {len(domains)} domains")
    print(f"Loaded {len(allowlist)} allowlist entries")

    original_count = len(domains)

    filtered, removed_by_allowlist = _filter_allowlisted(domains, allowlist)
    print(f"Removed {removed_by_allowlist} domains via allowlist")

    if dedupe_subdomains:
        deduplicated, removed_dupes = deduplicate_subdomains(filtered)
        print(f"Removed {removed_dupes} subdomain duplicates (--dedupe-subdomains enabled)")
    else:
        deduplicated = filtered
        print("Subdomain deduplication disabled (use --dedupe-subdomains to enable)")

    unique = sorted(set(deduplicated))
    removed_exact_dupes = len(deduplicated) - len(unique)
    if removed_exact_dupes > 0:
        print(f"Removed {removed_exact_dupes} exact duplicates")

    total_removed = original_count - len(unique)
    print(f"\nTotal: {original_count} -> {len(unique)} (removed {total_removed})")

    # The file is left untouched when nothing was removed.
    if total_removed == 0:
        print("No changes needed")
        return 0

    with open(LIST_FILE, 'w', encoding='utf-8') as f:
        json.dump(unique, f, indent=2, ensure_ascii=False)

    print(f"Updated {LIST_FILE.name}")
    return 0
|
|
|
|
if __name__ == "__main__":
    # Run main() and exit with its return value. Any exception derived from
    # Exception is reported on stderr with a traceback and becomes exit
    # status 1.
    try:
        status = main()
    except Exception as exc:
        import traceback

        print(f"Fatal error: {exc}", file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(status)
|