Anthropic-Cybersecurity-Skills/skills/validating-backup-integrity-for-recovery/scripts/agent.py

#!/usr/bin/env python3
"""Agent for validating backup integrity for disaster recovery.

Computes cryptographic hashes, compares manifests, detects corruption,
scans for ransomware artifacts, measures file entropy, and validates
backup recoverability.
"""

import argparse
import hashlib
import json
import math
import os
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path


# File suffixes treated as ransomware indicators. The scanner compares the
# lower-cased suffix of every file against this set.
RANSOMWARE_EXTENSIONS = {
    ".encrypted",
    ".locked",
    ".crypt",
    ".ransom",
    ".pay",
    ".wncry",
    ".wcry",
    ".cerber",
    ".locky",
    ".zepto",
    ".osiris",
    ".aesir",
    ".thor",
    ".odin",
    ".crypz",
    ".crypted",
    ".enc",
    ".crypto",
    ".lockbit",
}

# Substrings treated as ransom-note indicators. The scanner looks for each
# one, case-insensitively, anywhere in a file's name.
RANSOM_NOTE_PATTERNS = [
    "README_TO_DECRYPT",
    "HOW_TO_RECOVER",
    "DECRYPT_INSTRUCTIONS",
    "HELP_DECRYPT",
    "RECOVERY_INSTRUCTIONS",
    "RESTORE_FILES",
    "READ_ME_TO_DECRYPT",
    "YOUR_FILES_ARE_ENCRYPTED",
    "!README!",
    "DECRYPT_YOUR_FILES",
]


def compute_file_hash(filepath, algorithm="sha256"):
    """Return the hex digest of a file's contents.

    Args:
        filepath: Path of the file to hash.
        algorithm: Name passed to ``hashlib.new`` (default ``"sha256"``).

    Returns:
        The hexadecimal digest string, or ``"ERROR:<exception text>"`` when
        the file cannot be opened or read.
    """
    hasher = hashlib.new(algorithm)
    try:
        with open(filepath, "rb") as stream:
            # Read in 64 KiB blocks so large files are never fully in memory.
            while True:
                block = stream.read(65536)
                if not block:
                    break
                hasher.update(block)
    except (PermissionError, OSError) as exc:
        return f"ERROR:{exc}"
    return hasher.hexdigest()


def generate_manifest(directory, algorithm="sha256"):
    """Build a hash manifest covering every file beneath a directory.

    Args:
        directory: Root directory to walk recursively.
        algorithm: Hash algorithm name forwarded to ``compute_file_hash``.

    Returns:
        ``{"error": ...}`` when *directory* is not a directory; otherwise a
        dict with the directory, algorithm, UTC generation timestamp, file
        count, number of files whose hash is an ``ERROR:`` placeholder, and
        a ``hashes`` mapping of relative path -> digest.
    """
    root = Path(directory)
    if not root.is_dir():
        return {"error": f"Directory not found: {directory}"}

    hashes = {}
    failed = 0
    # Sorted traversal keeps the manifest's key order stable between runs.
    for entry in sorted(root.rglob("*")):
        if not entry.is_file():
            continue
        value = compute_file_hash(str(entry), algorithm)
        hashes[str(entry.relative_to(root))] = value
        if value.startswith("ERROR:"):
            failed += 1

    return {
        "directory": str(directory),
        "algorithm": algorithm,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "total_files": len(hashes),
        "errors": failed,
        "hashes": hashes,
    }


def compare_manifests(baseline_path, restored_path):
    """Compare two manifest JSON files and report integrity differences.

    Each manifest may be either the full structure containing a ``"hashes"``
    mapping or a flat ``{relative_path: digest}`` mapping.

    A file is reported as modified when its digests differ, or when either
    side holds an ``"ERROR:..."`` placeholder instead of a digest. Two equal
    placeholders mean the file was never hashed on either side, so it cannot
    count as verified.

    Args:
        baseline_path: Path to the baseline manifest JSON file.
        restored_path: Path to the restored/current manifest JSON file.

    Returns:
        A dict with file counts, the ``missing_files``, ``modified_files``
        and ``added_files`` lists (plus their counts) and ``integrity_pass``,
        which is True only when nothing is missing or modified. Added files
        do not affect ``integrity_pass``.

    Raises:
        OSError: If either manifest cannot be opened.
        ValueError: If either manifest is not valid JSON.
    """
    # Decode explicitly as UTF-8 so results do not depend on the platform's
    # default locale encoding.
    with open(baseline_path, "r", encoding="utf-8") as f:
        baseline = json.load(f)
    with open(restored_path, "r", encoding="utf-8") as f:
        restored = json.load(f)

    base_hashes = baseline.get("hashes", baseline)
    rest_hashes = restored.get("hashes", restored)

    def is_unverified(digest):
        # Anything that is not a real digest string cannot prove integrity.
        return isinstance(digest, str) and digest.startswith("ERROR:")

    missing = []
    modified = []

    for fname, base_hash in base_hashes.items():
        if fname not in rest_hashes:
            missing.append(fname)
            continue
        rest_hash = rest_hashes[fname]
        if (rest_hash != base_hash
                or is_unverified(base_hash)
                or is_unverified(rest_hash)):
            modified.append({"file": fname, "baseline": base_hash,
                             "restored": rest_hash})

    added = [fname for fname in rest_hashes if fname not in base_hashes]

    return {
        "baseline_files": len(base_hashes),
        "restored_files": len(rest_hashes),
        "missing_files": missing,
        "missing_count": len(missing),
        "modified_files": modified,
        "modified_count": len(modified),
        "added_files": added,
        "added_count": len(added),
        "integrity_pass": not missing and not modified,
    }


def calculate_entropy(filepath):
    """Calculate the Shannon entropy of a file in bits per byte (0-8).

    The file is streamed in fixed-size chunks so that memory use stays
    bounded regardless of file size; only the 256-entry byte histogram is
    kept in memory.

    Args:
        filepath: Path of the file to measure.

    Returns:
        The entropy rounded to 4 decimal places, ``0.0`` for an empty file,
        or ``None`` when the file cannot be opened or read.
    """
    byte_counts = Counter()
    try:
        with open(filepath, "rb") as f:
            # Iterating a bytes chunk yields ints, so update() counts bytes.
            for chunk in iter(lambda: f.read(1 << 20), b""):
                byte_counts.update(chunk)
    except (PermissionError, OSError):
        return None

    length = sum(byte_counts.values())
    if not length:
        return 0.0

    entropy = 0.0
    for count in byte_counts.values():
        # Every stored count is >= 1, so p is strictly positive here.
        p = count / length
        entropy -= p * math.log2(p)
    return round(entropy, 4)


def entropy_scan(directory, threshold=7.9):
    """Flag files whose byte entropy meets or exceeds a threshold.

    Files smaller than 1024 bytes are skipped and not counted as scanned.
    Files whose entropy cannot be computed are counted but never flagged.

    Args:
        directory: Root directory to walk recursively.
        threshold: Minimum entropy (bits per byte) for a file to be flagged.

    Returns:
        A dict with the directory, threshold, number of files scanned, the
        total number of flagged files, and at most the first 100 flagged
        entries (relative path, entropy, size in bytes).
    """
    root = Path(directory)
    flagged = []
    examined = 0

    for candidate in root.rglob("*"):
        if candidate.is_file() and candidate.stat().st_size >= 1024:
            examined += 1
            score = calculate_entropy(str(candidate))
            if score is not None and score >= threshold:
                entry = {
                    "file": str(candidate.relative_to(root)),
                    "entropy": score,
                    "size_bytes": candidate.stat().st_size,
                }
                flagged.append(entry)

    report = {
        "directory": str(directory),
        "threshold": threshold,
        "files_scanned": examined,
        "suspicious_count": len(flagged),
        # Only the listing is capped; the count above is the full total.
        "suspicious_files": flagged[:100],
    }
    return report


def scan_ransomware_artifacts(directory):
    """Look for ransomware indicators among the files under a directory.

    A file is flagged when its lower-cased suffix is in
    ``RANSOMWARE_EXTENSIONS`` and/or its name contains, case-insensitively,
    any entry of ``RANSOM_NOTE_PATTERNS``. Each file appears at most once in
    each list.

    Args:
        directory: Root directory to walk recursively.

    Returns:
        A dict with the relative paths of flagged extension files and ransom
        notes, the number of files scanned, and ``clean`` (True when both
        lists are empty).
    """
    root = Path(directory)
    extension_hits = []
    note_hits = []
    scanned = 0

    for entry in root.rglob("*"):
        if not entry.is_file():
            continue
        scanned += 1
        relative = str(entry.relative_to(root))

        if entry.suffix.lower() in RANSOMWARE_EXTENSIONS:
            extension_hits.append(relative)

        lowered_name = entry.name.lower()
        if any(marker.lower() in lowered_name
               for marker in RANSOM_NOTE_PATTERNS):
            note_hits.append(relative)

    return {
        "ransomware_extensions": extension_hits,
        "ransom_notes": note_hits,
        "total_scanned": scanned,
        "clean": not extension_hits and not note_hits,
    }


def _manifest_algorithm(manifest_path, default="sha256"):
    """Return the usable hash algorithm recorded in a manifest, else *default*."""
    try:
        with open(manifest_path, "r") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # Unreadable or invalid manifests are reported by the comparison step.
        return default
    algorithm = data.get("algorithm") if isinstance(data, dict) else None
    if not isinstance(algorithm, str):
        return default
    try:
        # Also rejects algorithms whose hexdigest() needs extra arguments.
        hashlib.new(algorithm).hexdigest()
    except (ValueError, TypeError):
        return default
    return algorithm


def validate_backup(directory, baseline_manifest=None, check_ransomware=True,
                    check_entropy=True, entropy_threshold=7.9):
    """Run the full backup validation suite on a directory.

    Checks performed:
      * ``file_stats``: file count and total size; passes when at least one
        file exists.
      * ``integrity``: when *baseline_manifest* is given, the directory is
        hashed with the algorithm recorded in that baseline (sha256 if none
        is recorded) and compared against it. A baseline path that does not
        exist is a failed check rather than a silent skip.
      * ``ransomware_scan`` / ``entropy_scan``: run when enabled.

    Args:
        directory: Backup directory to validate.
        baseline_manifest: Optional path to a baseline manifest JSON file.
        check_ransomware: Whether to run the ransomware artifact scan.
        check_entropy: Whether to run the high-entropy file scan.
        entropy_threshold: Threshold forwarded to ``entropy_scan``.

    Returns:
        ``{"error": ...}`` when *directory* is not a directory; otherwise a
        dict with ``timestamp``, ``directory``, per-check results under
        ``checks`` and the combined ``overall_pass`` verdict.
    """
    import tempfile  # local import: only this function needs it

    dir_path = Path(directory)
    if not dir_path.is_dir():
        return {"error": f"Directory not found: {directory}"}

    results = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "directory": str(directory),
        "checks": {},
    }

    # File count and size, gathered in a single walk of the tree.
    sizes = [p.stat().st_size for p in dir_path.rglob("*") if p.is_file()]
    total_size = sum(sizes)
    results["checks"]["file_stats"] = {
        "total_files": len(sizes),
        "total_size_bytes": total_size,
        "total_size_mb": round(total_size / (1024 * 1024), 2),
        "pass": len(sizes) > 0,
    }

    # Manifest comparison
    if not baseline_manifest:
        results["checks"]["integrity"] = {"skipped": True,
                                           "reason": "No baseline manifest provided"}
    elif not os.path.isfile(baseline_manifest):
        # The caller asked for an integrity check that cannot be performed;
        # reporting a pass here would hide a mistyped or missing baseline.
        results["checks"]["integrity"] = {
            "skipped": True,
            "reason": f"Baseline manifest not found: {baseline_manifest}",
            "integrity_pass": False,
        }
    else:
        current = generate_manifest(directory,
                                    _manifest_algorithm(baseline_manifest))
        # Keep the scratch manifest outside the backup: the restore may be
        # read-only, and the backup being validated should not be modified.
        fd, current_path = tempfile.mkstemp(prefix="current_manifest_",
                                            suffix=".json")
        try:
            with os.fdopen(fd, "w") as f:
                json.dump(current, f)
            results["checks"]["integrity"] = compare_manifests(
                baseline_manifest, current_path)
        finally:
            os.remove(current_path)

    # Ransomware artifact scan
    if check_ransomware:
        results["checks"]["ransomware_scan"] = scan_ransomware_artifacts(directory)

    # Entropy scan
    if check_entropy:
        results["checks"]["entropy_scan"] = entropy_scan(directory, entropy_threshold)

    # Overall verdict: checks that did not run do not count against it.
    checks = results["checks"]
    results["overall_pass"] = (
        checks.get("file_stats", {}).get("pass", False)
        and checks.get("integrity", {}).get("integrity_pass", True)
        and checks.get("ransomware_scan", {}).get("clean", True)
        and checks.get("entropy_scan", {}).get("suspicious_count", 0) == 0
    )

    return results


def main():
    """Command-line entry point.

    Runs the first requested operation in this order: manifest generation,
    manifest comparison, full validation, entropy scan, ransomware scan.
    The result is emitted as indented JSON to ``--output`` when given,
    otherwise to stdout. With no operation selected, prints the help text.
    """
    cli = argparse.ArgumentParser(
        description="Backup Integrity Validation Agent"
    )
    cli.add_argument("--generate-manifest",
                     help="Generate hash manifest for a directory")
    cli.add_argument("--compare", nargs=2, metavar=("BASELINE", "RESTORED"),
                     help="Compare two manifest JSON files")
    cli.add_argument("--validate", help="Run full validation on a backup directory")
    cli.add_argument("--baseline", help="Baseline manifest for comparison")
    cli.add_argument("--entropy-scan", help="Scan directory for high-entropy files")
    cli.add_argument("--entropy-threshold", type=float, default=7.9,
                     help="Entropy threshold (default: 7.9)")
    cli.add_argument("--ransomware-scan",
                     help="Scan directory for ransomware artifacts")
    cli.add_argument("--algorithm", default="sha256",
                     choices=["sha256", "sha512", "sha3_256", "blake2b"],
                     help="Hash algorithm (default: sha256)")
    cli.add_argument("--output", "-o", help="Output file path")
    opts = cli.parse_args()

    print("[*] Backup Integrity Validation Agent")
    result = None

    if opts.generate_manifest:
        result = generate_manifest(opts.generate_manifest, opts.algorithm)
        print(f"[*] Generated manifest: {result.get('total_files', 0)} files")

    elif opts.compare:
        baseline_file, restored_file = opts.compare
        result = compare_manifests(baseline_file, restored_file)
        if result["integrity_pass"]:
            status = "PASS"
        else:
            status = "FAIL"
        print(f"[*] Integrity check: {status}")
        if result["missing_count"]:
            print(f"[!] Missing files: {result['missing_count']}")
        if result["modified_count"]:
            print(f"[!] Modified files: {result['modified_count']}")

    elif opts.validate:
        result = validate_backup(
            opts.validate,
            baseline_manifest=opts.baseline,
            entropy_threshold=opts.entropy_threshold,
        )
        # An error result has no "overall_pass" key and is reported as FAIL.
        if result.get("overall_pass"):
            status = "PASS"
        else:
            status = "FAIL"
        print(f"[*] Overall validation: {status}")

    elif opts.entropy_scan:
        result = entropy_scan(opts.entropy_scan, opts.entropy_threshold)
        print(f"[*] Scanned {result['files_scanned']} files, "
              f"{result['suspicious_count']} suspicious")

    elif opts.ransomware_scan:
        result = scan_ransomware_artifacts(opts.ransomware_scan)
        if result["clean"]:
            status = "CLEAN"
        else:
            status = "INFECTED"
        print(f"[*] Ransomware scan: {status}")

    else:
        cli.print_help()
        return

    if not result:
        return

    rendered = json.dumps(result, indent=2)
    if not opts.output:
        print(rendered)
        return

    with open(opts.output, "w") as out_file:
        out_file.write(rendered)
    print(f"[*] Results saved to {opts.output}")


if __name__ == "__main__":
    main()