Files
destroylist-mirror/scripts/validate_json.py
phishdestroy 51dd05f1fd Fix data quality issues: invalid entries, allowlist filtering, IP handling
- Add domain validation to validate_and_clean.py (Punycode TLD support)
- Add allowlist filtering to build_rootlist.py and json_to_txt.py
- Add IP address filtering to output generation scripts
- Fix path-based entries: preserve in list.json, filter at output time
- Fix validate_json.py crash on non-string entries in JSON arrays
- Fix tldextract deprecation warning in build_rootlist.py
- Update .gitignore

Closes #13
2026-02-24 07:35:21 -11:00

78 lines
2.6 KiB
Python

#!/usr/bin/env python3
"""Validate all critical JSON files before any pipeline step."""
import json
import re
import sys
from pathlib import Path
# Repository root: the parent of the directory that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# JSON files validated on every run, listed as path components below
# PROJECT_ROOT and joined into full paths.
FILES_TO_CHECK = [
    PROJECT_ROOT.joinpath(*parts)
    for parts in (
        ("list.json",),
        ("allow", "allowlist.json"),
        ("community", "blocklist.json"),
        ("community", "live_blocklist.json"),
        ("dns", "active_domains.json"),
    )
]

# Dotted-quad shape: four dot-separated groups of 1-3 digits. The pattern
# does not check that each group is <= 255.
IPV4_RE = re.compile(r"^\d{1,3}(?:\.\d{1,3}){3}$")
def validate_file(filepath: Path) -> bool:
    """Check that a JSON file parses to an array and report suspicious entries.

    Structural problems (file cannot be read, invalid JSON, top-level value
    is not an array) are fatal: they are reported on stderr and ``False`` is
    returned. Content problems (empty or non-string entries, duplicates,
    entries with no dot before the first ``/``, IPv4-shaped entries) are only
    reported on stdout and do not fail validation. A missing file is skipped
    and counts as valid.

    Args:
        filepath: Path of the JSON file to check.

    Returns:
        ``True`` if the file is missing or parses to a JSON array,
        ``False`` otherwise.
    """
    # Paths that are not under PROJECT_ROOT (or are given as relative paths)
    # make relative_to() raise ValueError; show those exactly as given
    # instead of crashing.
    try:
        name = filepath.relative_to(PROJECT_ROOT)
    except ValueError:
        name = filepath

    if not filepath.exists():
        print(f"SKIP: {name} (not found)")
        return True

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"FATAL: {name} — invalid JSON at line {e.lineno}: {e.msg}", file=sys.stderr)
        return False
    except (OSError, UnicodeDecodeError) as e:
        # A directory, a permission problem or non-UTF-8 bytes: report it
        # like any other fatal error rather than aborting with a traceback.
        print(f"FATAL: {name} — cannot read file: {e}", file=sys.stderr)
        return False

    if not isinstance(data, list):
        print(f"FATAL: {name} — expected array, got {type(data).__name__}", file=sys.stderr)
        return False

    bad = [i for i, d in enumerate(data) if not isinstance(d, str) or not d.strip()]
    if bad:
        print(f"WARN: {name} — {len(bad)} empty/non-string entries (indices: {bad[:5]}...)")

    # Every later check works on the non-empty string entries only.
    str_entries = [d for d in data if isinstance(d, str) and d.strip()]

    # Duplicates are counted case-insensitively and ignoring surrounding whitespace.
    dupes = len(str_entries) - len({d.lower().strip() for d in str_entries})
    if dupes > 0:
        print(f"WARN: {name} — {dupes} duplicate entries")

    # Check for entries without dots (invalid domains); only the part before
    # the first "/" is inspected, so a dot in a path does not count.
    no_dots = [d for d in str_entries if "." not in d.split("/")[0]]
    if no_dots:
        print(f"WARN: {name} — {len(no_dots)} entries without dots: {no_dots[:5]}...")

    # Check for IP address entries
    ips = [d for d in str_entries if IPV4_RE.fullmatch(d.split("/")[0])]
    if ips:
        print(f"INFO: {name} — {len(ips)} IP address entries")

    print(f"OK: {name} — {len(data)} entries")
    return True
def main():
    """Validate the default files plus any existing paths given as arguments.

    Prints a failure banner to stderr and exits with status 1 if any file
    fails validation; otherwise prints a success banner.
    """
    extra = []
    for arg in sys.argv[1:]:
        candidate = Path(arg)
        if candidate.exists():
            # PROJECT_ROOT is a resolved absolute path, so resolve the extras
            # too; otherwise validate_file's relative_to() call can never
            # match a path given relative to the working directory.
            extra.append(candidate.resolve())
        else:
            # Say so instead of dropping the argument silently, so a typo
            # does not end in a misleading "All files valid".
            print(f"SKIP: {arg} (not found)")

    ok = True
    for f in FILES_TO_CHECK + extra:
        if not validate_file(f):
            ok = False

    if not ok:
        print("\n❌ Validation FAILED — fix JSON errors before proceeding", file=sys.stderr)
        sys.exit(1)
    print("\n✅ All files valid")


if __name__ == "__main__":
    main()