Anthropic-Cybersecurity-Skills/skills/hunting-for-data-exfiltration-indicators/scripts/agent.py

#!/usr/bin/env python3
"""Agent for hunting data exfiltration indicators in network traffic."""

import argparse
import csv
import json
import math
from collections import defaultdict
from datetime import datetime, timezone


# Detection thresholds.
# Character entropy (bits per character) that the pooled subdomain text of a
# domain must exceed before analyze_dns_queries flags it.
DNS_EXFIL_ENTROPY_THRESHOLD = 3.5
# Label-length limit; not referenced by the functions visible in this file.
DNS_LABEL_LENGTH_THRESHOLD = 40
# Per-destination upload volume (MiB) at which analyze_network_flows reports.
LARGE_UPLOAD_THRESHOLD_MB = 50

# Port number -> service label.  Not referenced by the functions visible in
# this file; presumably kept for port-based triage -- TODO confirm.
SUSPICIOUS_PORTS = {
    20: "FTP Data",
    21: "FTP",
    22: "SSH/SCP",
    53: "DNS",
    443: "HTTPS",
    993: "IMAPS",
    995: "POP3S",
    8443: "Alt HTTPS",
    6667: "IRC",
}


def shannon_entropy(data):
    """Return the Shannon entropy of ``data`` in bits per character.

    An empty (falsy) input yields ``0.0``.
    """
    if not data:
        return 0.0

    total = len(data)

    # Tally how often each character occurs, in first-seen order.
    occurrences = {}
    for symbol in data:
        occurrences[symbol] = occurrences.get(symbol, 0) + 1

    probabilities = [seen / total for seen in occurrences.values()]
    return -sum(p * math.log2(p) for p in probabilities)


def analyze_dns_queries(filepath):
    """Analyze a DNS query log for DNS-based exfiltration indicators.

    The file is read as tab-separated text whose header row must contain a
    ``query`` column.  Each query is grouped under its last two labels
    (``a.b.example.com`` -> ``example.com``); everything to the left of those
    labels is treated as the subdomain payload.  A domain is reported when it
    was queried at least 5 times, its average subdomain length exceeds 20
    characters, and the character entropy of all its subdomains concatenated
    exceeds ``DNS_EXFIL_ENTROPY_THRESHOLD``.

    NOTE(review): grouping on the last two labels lumps every name under a
    multi-label public suffix (e.g. ``co.uk``) into one bucket.

    Args:
        filepath: Path to the tab-separated DNS log.

    Returns:
        A list of ``dns_exfiltration`` finding dicts (possibly empty).  Read
        or parse errors never raise; a warning is written to stderr and
        whatever was parsed before the error is still evaluated.
    """
    import sys  # local import: only needed for the diagnostic below

    findings = []
    domain_stats = defaultdict(lambda: {"count": 0, "total_length": 0, "queries": []})
    try:
        # errors="replace" keeps undecodable bytes from aborting the whole
        # analysis; newline="" is what the csv module expects from its input.
        with open(filepath, "r", encoding="utf-8", errors="replace", newline="") as f:
            reader = csv.DictReader(f, delimiter="\t")
            for row in reader:
                # Short rows yield None for missing columns.  Strip the
                # optional root dot so "x.example.com." is not keyed as "com.".
                query = (row.get("query") or "").rstrip(".")
                if not query:
                    continue
                parts = query.split(".")
                if len(parts) < 2:
                    continue
                # DNS names compare case-insensitively, so fold the grouping
                # key; otherwise mixed-case queries split one domain into
                # several buckets that each stay under the count threshold.
                # The subdomain keeps its case because it feeds the entropy.
                domain = ".".join(parts[-2:]).lower()
                subdomain = ".".join(parts[:-2])
                stats = domain_stats[domain]
                stats["count"] += 1
                stats["total_length"] += len(subdomain)
                stats["queries"].append(subdomain)
    except (OSError, csv.Error) as exc:
        # An unreadable log must not look identical to a clean one.
        print(f"[!] Could not fully read DNS log {filepath!r}: {exc}", file=sys.stderr)

    for domain, stats in domain_stats.items():
        if stats["count"] < 5:
            continue
        avg_subdomain_len = stats["total_length"] / stats["count"]
        entropy = shannon_entropy("".join(stats["queries"]))
        if entropy > DNS_EXFIL_ENTROPY_THRESHOLD and avg_subdomain_len > 20:
            findings.append({
                "type": "dns_exfiltration",
                "domain": domain,
                "query_count": stats["count"],
                "avg_subdomain_length": round(avg_subdomain_len, 1),
                "entropy": round(entropy, 3),
                "severity": "CRITICAL",
            })
    return findings


def analyze_network_flows(filepath, threshold_mb=None):
    """Analyze network flow data for large outbound transfers.

    The file is read as tab-separated text with a header row.  The
    destination is taken from the ``id.resp_h`` column, falling back to
    ``dst``; the byte count comes from ``orig_bytes``.  Bytes are summed per
    destination and every destination whose total reaches the threshold is
    reported.

    Args:
        filepath: Path to the tab-separated flow log.
        threshold_mb: Minimum total upload in MiB (1024 * 1024 bytes) for a
            destination to be reported.  ``None`` (the default) uses
            ``LARGE_UPLOAD_THRESHOLD_MB``.

    Returns:
        A list of ``large_outbound_transfer`` finding dicts (possibly empty).
        Read or parse errors never raise; a warning is written to stderr and
        the rows parsed before the error are still evaluated.
    """
    import sys  # local import: only needed for the diagnostic below

    limit_mb = LARGE_UPLOAD_THRESHOLD_MB if threshold_mb is None else threshold_mb
    findings = []
    dest_bytes = defaultdict(int)
    try:
        # errors="replace" keeps undecodable bytes from aborting the whole
        # analysis; newline="" is what the csv module expects from its input.
        with open(filepath, "r", encoding="utf-8", errors="replace", newline="") as f:
            reader = csv.DictReader(f, delimiter="\t")
            for row in reader:
                dst = row.get("id.resp_h", row.get("dst", ""))
                try:
                    orig_bytes = int(row.get("orig_bytes") or 0)
                except ValueError:
                    # Non-numeric byte count (e.g. the "-" placeholder Zeek
                    # writes for unset fields): skip this row only instead of
                    # throwing away every total gathered so far.
                    continue
                dest_bytes[dst] += orig_bytes
    except (OSError, csv.Error, ValueError) as exc:
        # An unreadable log must not look identical to a clean one.
        print(f"[!] Could not fully read flow log {filepath!r}: {exc}", file=sys.stderr)

    for dst, total in dest_bytes.items():
        mb = total / (1024 * 1024)
        if mb >= limit_mb:
            findings.append({
                "type": "large_outbound_transfer",
                "destination": dst,
                "total_bytes": total,
                "total_mb": round(mb, 2),
                "severity": "HIGH",
            })
    return findings


def analyze_off_hours_traffic(filepath, threshold_mb=10):
    """Check for significant outbound transfers during off-hours.

    The file is read as tab-separated text with a header row.  ``ts`` is
    parsed as an epoch timestamp and converted with
    ``datetime.fromtimestamp``, i.e. into the local time zone of the machine
    running the analysis.  A row is off-hours when that hour is before 06:00
    or from 23:00 onwards.  ``orig_bytes`` of off-hours rows is summed per
    destination (``id.resp_h``, falling back to ``dst``).

    Args:
        filepath: Path to the tab-separated flow log.
        threshold_mb: Minimum off-hours upload in MiB (1024 * 1024 bytes) for
            a destination to be reported.  Defaults to 10.

    Returns:
        A list of ``off_hours_transfer`` finding dicts (possibly empty).
        Read or parse errors never raise; a warning is written to stderr and
        the rows parsed before the error are still evaluated.
    """
    import sys  # local import: only needed for the diagnostic below

    findings = []
    off_hours_transfers = defaultdict(int)
    try:
        # errors="replace" keeps undecodable bytes from aborting the whole
        # analysis; newline="" is what the csv module expects from its input.
        with open(filepath, "r", encoding="utf-8", errors="replace", newline="") as f:
            reader = csv.DictReader(f, delimiter="\t")
            for row in reader:
                raw_ts = row.get("ts")
                if not raw_ts:
                    # No timestamp (missing column or short row): the hour is
                    # unknown, so do not pretend the flow happened at epoch 0.
                    continue
                try:
                    hour = datetime.fromtimestamp(float(raw_ts)).hour
                    if not (hour < 6 or hour > 22):
                        continue
                    orig_bytes = int(row.get("orig_bytes") or 0)
                except (ValueError, OverflowError, OSError):
                    # Unparseable or out-of-range field in this row (e.g. a
                    # "-" placeholder): skip the row, keep the other totals.
                    continue
                dst = row.get("id.resp_h", row.get("dst", ""))
                off_hours_transfers[dst] += orig_bytes
    except (OSError, csv.Error, ValueError) as exc:
        # An unreadable log must not look identical to a clean one.
        print(f"[!] Could not fully read flow log {filepath!r}: {exc}", file=sys.stderr)

    for dst, total in off_hours_transfers.items():
        mb = total / (1024 * 1024)
        if mb >= threshold_mb:
            findings.append({
                "type": "off_hours_transfer",
                "destination": dst,
                "total_mb": round(mb, 2),
                "severity": "MEDIUM",
            })
    return findings


def main():
    """Command-line entry point: run the requested analyzers and emit a report.

    At least one of ``--conn-log`` / ``--dns-log`` must be given, otherwise
    argparse exits with a usage error.  The report (UTC timestamp, findings,
    overall risk level, finding count) is written as JSON to ``--output`` when
    supplied, or printed to stdout otherwise.  ``--verbose`` is accepted but
    not consulted anywhere in this function.
    """
    cli = argparse.ArgumentParser(description="Data exfiltration indicator hunter")
    cli.add_argument("--conn-log", help="Zeek conn.log or network flow CSV")
    cli.add_argument("--dns-log", help="Zeek dns.log or DNS query CSV")
    cli.add_argument("--output", "-o", help="Output JSON report")
    cli.add_argument("--verbose", "-v", action="store_true")
    options = cli.parse_args()

    if not (options.conn_log or options.dns_log):
        cli.error("At least one of --conn-log or --dns-log is required")

    print("[*] Data Exfiltration Indicator Hunter")
    findings = []
    report = {"timestamp": datetime.now(timezone.utc).isoformat(), "findings": findings}

    # DNS findings come first, then the two connection-log analyses.
    if options.dns_log:
        findings += analyze_dns_queries(options.dns_log)
    if options.conn_log:
        findings += analyze_network_flows(options.conn_log)
        findings += analyze_off_hours_traffic(options.conn_log)

    def seen(level):
        return any(item["severity"] == level for item in findings)

    # Overall risk is the worst severity present; any other finding is MEDIUM.
    if seen("CRITICAL"):
        overall = "CRITICAL"
    elif seen("HIGH"):
        overall = "HIGH"
    elif findings:
        overall = "MEDIUM"
    else:
        overall = "LOW"
    report["risk_level"] = overall
    report["total_findings"] = len(findings)

    print(f"[*] {report['total_findings']} exfiltration indicators found")
    if not options.output:
        print(json.dumps(report, indent=2))
        return

    with open(options.output, "w") as sink:
        json.dump(report, sink, indent=2)
    print(f"[*] Report saved to {options.output}")

if __name__ == "__main__":
    main()