Anthropic-Cybersecurity-Skills/skills/detecting-dns-exfiltration-with-dns-query-analysis/scripts/agent.py

#!/usr/bin/env python3
"""DNS exfiltration detection agent using entropy analysis and query pattern anomalies.

Analyzes DNS query logs for tunneling indicators: high entropy subdomains,
excessive query length, abnormal TXT record usage, and volume spikes.
"""

import argparse
import json
import math
from collections import Counter, defaultdict
from datetime import datetime

KNOWN_TUNNEL_DOMAINS = {
    "dnscat2", "iodine", "dns2tcp", "heyoka", "ozyman",
    "tuns", "dnscapy", "dns-tunnel",
}

TXT_THRESHOLD = 0.3
ENTROPY_THRESHOLD = 3.5
SUBDOMAIN_LENGTH_THRESHOLD = 40
QUERY_RATE_THRESHOLD = 100


def calculate_entropy(text):
    if not text:
        return 0.0
    freq = Counter(text)
    length = len(text)
    return -sum((count / length) * math.log2(count / length) for count in freq.values())


def parse_dns_log(filepath, log_format="zeek"):
    queries = []
    with open(filepath, "r") as f:
        if log_format == "zeek":
            headers = None
            for line in f:
                if line.startswith("#fields"):
                    headers = line.strip().split("\t")[1:]
                    continue
                if line.startswith("#"):
                    continue
                if not headers:
                    continue
                fields = line.strip().split("\t")
                if len(fields) >= len(headers):
                    record = dict(zip(headers, fields))
                    queries.append({
                        "timestamp": record.get("ts", ""),
                        "source": record.get("id.orig_h", ""),
                        "query": record.get("query", ""),
                        "qtype": record.get("qtype_name", record.get("qtype", "")),
                        "rcode": record.get("rcode_name", ""),
                        "answers": record.get("answers", ""),
                    })
        else:
            for line in f:
                parts = line.strip().split()
                if len(parts) >= 3:
                    queries.append({
                        "timestamp": parts[0],
                        "source": parts[1] if len(parts) > 3 else "",
                        "query": parts[-2] if len(parts) > 2 else parts[1],
                        "qtype": parts[-1] if len(parts) > 2 else "",
                    })
    return queries


def analyze_queries(queries):
    findings = []
    domain_stats = defaultdict(lambda: {"count": 0, "sources": set(),
                                         "entropies": [], "lengths": [],
                                         "txt_count": 0, "total": 0})
    for q in queries:
        query = q.get("query", "")
        if not query or query == "-":
            continue
        parts = query.rstrip(".").split(".")
        if len(parts) < 2:
            continue
        base_domain = ".".join(parts[-2:])
        subdomain = ".".join(parts[:-2])

        stats = domain_stats[base_domain]
        stats["count"] += 1
        stats["total"] += 1
        stats["sources"].add(q.get("source", ""))

        if subdomain:
            entropy = calculate_entropy(subdomain.replace(".", ""))
            stats["entropies"].append(entropy)
            stats["lengths"].append(len(subdomain))

            if entropy > ENTROPY_THRESHOLD and len(subdomain) > SUBDOMAIN_LENGTH_THRESHOLD:
                findings.append({
                    "type": "high_entropy_long_subdomain",
                    "query": query,
                    "subdomain": subdomain,
                    "entropy": round(entropy, 3),
                    "length": len(subdomain),
                    "source": q.get("source", ""),
                    "severity": "HIGH",
                })

        if q.get("qtype", "").upper() in ("TXT", "NULL", "CNAME"):
            stats["txt_count"] += 1

    for domain, stats in domain_stats.items():
        if stats["total"] > QUERY_RATE_THRESHOLD:
            avg_entropy = (sum(stats["entropies"]) / len(stats["entropies"])
                           if stats["entropies"] else 0)
            avg_length = (sum(stats["lengths"]) / len(stats["lengths"])
                          if stats["lengths"] else 0)
            txt_ratio = stats["txt_count"] / stats["total"]

            score = 0
            if avg_entropy > ENTROPY_THRESHOLD:
                score += 30
            if avg_length > 30:
                score += 20
            if txt_ratio > TXT_THRESHOLD:
                score += 25
            if stats["total"] > 500:
                score += 25

            if score >= 50:
                findings.append({
                    "type": "suspected_dns_tunnel",
                    "domain": domain,
                    "total_queries": stats["total"],
                    "avg_entropy": round(avg_entropy, 3),
                    "avg_subdomain_length": round(avg_length, 1),
                    "txt_ratio": round(txt_ratio, 3),
                    "tunnel_score": score,
                    "unique_sources": len(stats["sources"]),
                    "severity": "CRITICAL" if score >= 75 else "HIGH",
                })

    return findings


def main():
    global ENTROPY_THRESHOLD, SUBDOMAIN_LENGTH_THRESHOLD
    parser = argparse.ArgumentParser(description="DNS Exfiltration Detector")
    parser.add_argument("--dns-log", required=True, help="DNS log file (Zeek or text)")
    parser.add_argument("--format", choices=["zeek", "text"], default="zeek")
    parser.add_argument("--entropy-threshold", type=float, default=ENTROPY_THRESHOLD)
    parser.add_argument("--length-threshold", type=int, default=SUBDOMAIN_LENGTH_THRESHOLD)
    args = parser.parse_args()

    ENTROPY_THRESHOLD = args.entropy_threshold
    SUBDOMAIN_LENGTH_THRESHOLD = args.length_threshold

    queries = parse_dns_log(args.dns_log, args.format)
    findings = analyze_queries(queries)

    results = {
        "timestamp": datetime.utcnow().isoformat() + "Z",
        "total_queries_analyzed": len(queries),
        "findings": findings,
        "total_findings": len(findings),
    }
    print(json.dumps(results, indent=2))


if __name__ == "__main__":
    main()