#!/usr/bin/env python3
"""DNS exfiltration detection agent using entropy analysis and query pattern detection."""

import math
import os
import sys
import json
import csv
import datetime
from collections import Counter, defaultdict


def shannon_entropy(text):
    """Calculate the case-insensitive Shannon entropy of a string.

    The string is lower-cased first, then the entropy of its character
    distribution is computed in bits per character.

    Args:
        text: The string to score. Empty/falsy input scores 0.0.

    Returns:
        The entropy rounded to 4 decimal places (never negative zero).
    """
    if not text:
        return 0.0
    # Count and measure the SAME string: lower-casing can change the length
    # for some Unicode characters (e.g. "\u0130" becomes two code points), and
    # mixing the two lengths yields probabilities that do not sum to 1.
    folded = text.lower()
    total = len(folded)
    entropy = -sum(
        (count / total) * math.log2(count / total)
        for count in Counter(folded).values()
    )
    # Every term p*log2(p) is <= 0, so the negated sum is >= 0; abs() only
    # normalizes the -0.0 produced for single-symbol strings.
    return abs(round(entropy, 4))


def extract_subdomain(fqdn):
    """Return everything left of the last two labels of a domain name.

    A trailing root dot is ignored. Names with two or fewer labels have no
    subdomain part and yield an empty string.
    """
    # Splitting from the right at most twice leaves the whole subdomain
    # (inner dots included) as the first piece when there are 3+ labels.
    pieces = fqdn.rstrip(".").rsplit(".", 2)
    return pieces[0] if len(pieces) == 3 else ""


def extract_registered_domain(fqdn):
    """Return the last two labels (SLD + TLD) of a domain name.

    A trailing root dot is ignored when splitting. Input with fewer than two
    labels is returned exactly as given.
    """
    labels = fqdn.rstrip(".").split(".")
    if len(labels) < 2:
        return fqdn
    *_, sld, tld = labels
    return f"{sld}.{tld}"


def detect_tunneling(dns_records, subdomain_len_threshold=50, min_queries=20):
    """Flag registered domains that receive many long-subdomain queries.

    Only queries whose subdomain part is longer than
    ``subdomain_len_threshold`` characters are tallied, grouped by the last
    two labels of the query name. A domain is reported once it has at least
    ``min_queries`` such queries.

    Args:
        dns_records: Iterable of dicts; the "query" and "src_ip" keys are read.
        subdomain_len_threshold: Subdomain length a query must exceed.
        min_queries: Minimum number of long queries needed to raise an alert.

    Returns:
        List of alert dicts ordered by average subdomain length, longest first.
    """
    per_domain = {}
    for rec in dns_records:
        name = rec.get("query", "")
        origin = rec.get("src_ip", "unknown")
        sub_len = len(extract_subdomain(name))
        if sub_len > subdomain_len_threshold:
            bucket = per_domain.setdefault(
                extract_registered_domain(name),
                {"names": set(), "lengths": [], "origins": set()},
            )
            bucket["names"].add(name)
            bucket["lengths"].append(sub_len)
            bucket["origins"].add(origin)

    # One length is recorded per tallied query, so len(lengths) is the
    # query count for that domain.
    findings = [
        {
            "domain": domain,
            "queries": len(bucket["lengths"]),
            "unique_queries": len(bucket["names"]),
            "avg_subdomain_length": round(
                sum(bucket["lengths"]) / len(bucket["lengths"]), 1
            ),
            "max_subdomain_length": max(bucket["lengths"]),
            "sources": list(bucket["origins"]),
            "verdict": "CRITICAL - Likely DNS tunneling",
        }
        for domain, bucket in per_domain.items()
        if len(bucket["lengths"]) >= min_queries
    ]
    findings.sort(key=lambda item: item["avg_subdomain_length"], reverse=True)
    return findings


def detect_dga(dns_records, entropy_threshold=3.5, min_sld_length=12):
    """Flag queries whose second-level label looks randomly generated.

    For each query the second-to-last label is scored with
    ``shannon_entropy``; labels shorter than ``min_sld_length`` are ignored
    and only scores above ``entropy_threshold`` are kept. Results are grouped
    by the full query name (trailing dot removed).

    Args:
        dns_records: Iterable of dicts; the "query" and "src_ip" keys are read.
        entropy_threshold: Entropy a label must exceed to be suspicious.
        min_sld_length: Minimum label length considered for scoring.

    Returns:
        List of alert dicts ordered by average entropy, highest first.
    """
    flagged = {}
    for rec in dns_records:
        name = rec.get("query", "").rstrip(".")
        origin = rec.get("src_ip", "unknown")
        labels = name.split(".")
        if len(labels) < 2 or len(labels[-2]) < min_sld_length:
            continue
        score = shannon_entropy(labels[-2])
        if score > entropy_threshold:
            entry = flagged.setdefault(name, {"origins": set(), "scores": []})
            entry["origins"].add(origin)
            entry["scores"].append(score)

    # One score is stored per matching query, so len(scores) is the hit count.
    findings = [
        {
            "domain": name,
            "queries": len(entry["scores"]),
            "avg_entropy": round(sum(entry["scores"]) / len(entry["scores"]), 4),
            "sources": list(entry["origins"]),
            "verdict": "HIGH - Possible DGA domain",
        }
        for name, entry in flagged.items()
    ]
    findings.sort(key=lambda item: item["avg_entropy"], reverse=True)
    return findings


def detect_volume_anomaly(dns_records, z_score_threshold=3.0):
    """Report source hosts whose query count is a statistical outlier.

    Queries are counted per "src_ip" and each host's count is converted to a
    z-score using the mean and sample standard deviation (n - 1 denominator)
    across hosts. Nothing is reported when there are fewer than two hosts or
    when every host has the same count.

    Args:
        dns_records: Iterable of dicts; the "src_ip" key is read.
        z_score_threshold: z-score a host must exceed to be reported.

    Returns:
        List of anomaly dicts ordered by z-score, highest first.
    """
    per_host = Counter(rec.get("src_ip", "unknown") for rec in dns_records)
    host_total = len(per_host)
    if host_total < 2:
        # A spread cannot be estimated from zero or one host.
        return []

    volumes = list(per_host.values())
    average = sum(volumes) / host_total
    spread = (sum((v - average) ** 2 for v in volumes) / (host_total - 1)) ** 0.5
    if spread == 0:
        return []

    outliers = []
    for host, volume in per_host.items():
        score = (volume - average) / spread
        if score > z_score_threshold:
            outliers.append(
                {
                    "src_ip": host,
                    "queries": volume,
                    "z_score": round(score, 2),
                    "mean": round(average, 1),
                    "verdict": "HIGH - Anomalous query volume",
                }
            )
    outliers.sort(key=lambda item: item["z_score"], reverse=True)
    return outliers


def detect_txt_abuse(dns_records, threshold=100):
    """Report hosts that issue an unusually large number of TXT queries.

    A record counts as a TXT query when its "query_type", converted to an
    upper-case string, is "TXT" or "16". Hosts with more than ``threshold``
    such queries are reported with a severity of MEDIUM, HIGH (more than 500)
    or CRITICAL (more than 1000).

    Args:
        dns_records: Iterable of dicts; "query_type", "src_ip" and "query"
            keys are read.
        threshold: TXT query count a host must exceed to be reported.

    Returns:
        List of alert dicts ordered by TXT query count, highest first.
    """
    txt_markers = ("TXT", "16")
    per_host = {}
    for rec in dns_records:
        if str(rec.get("query_type", "")).upper() not in txt_markers:
            continue
        origin = rec.get("src_ip", "unknown")
        tally = per_host.setdefault(origin, {"hits": 0, "names": set()})
        tally["hits"] += 1
        tally["names"].add(rec.get("query", ""))

    findings = []
    for origin, tally in per_host.items():
        hits = tally["hits"]
        if hits <= threshold:
            continue
        if hits > 1000:
            severity = "CRITICAL"
        elif hits > 500:
            severity = "HIGH"
        else:
            severity = "MEDIUM"
        findings.append(
            {
                "src_ip": origin,
                "txt_queries": hits,
                "unique_domains": len(tally["names"]),
                "verdict": f"{severity} - Possible DNS tunneling via TXT records",
            }
        )
    findings.sort(key=lambda item: item["txt_queries"], reverse=True)
    return findings


def estimate_exfil_volume(dns_records, target_domain, decode_factor=0.75):
    """Estimate data volume encoded in DNS queries to a specific domain.

    A query belongs to ``target_domain`` when it equals the domain or ends
    with ``"." + target_domain`` (case-insensitive, trailing root dot
    ignored). Plain substring matching would also count unrelated names such
    as ``notevil.com`` or ``evil.com.attacker.net`` for target ``evil.com``.

    The encoded size of a query is the length of everything left of the
    target domain, label-separator dots included.

    Args:
        dns_records: Iterable of dicts; the "query" key is read. A missing or
            None query is treated as an empty name.
        target_domain: Domain whose queries are measured. An empty target
            matches nothing.
        decode_factor: Ratio of decoded to encoded bytes. The default 0.75
            corresponds to Base64; 0.625 would model Base32.

    Returns:
        Dict with the target domain as passed in, the number of matching
        queries, the encoded byte total and the decoded estimate in bytes,
        KiB and MiB.
    """
    target = target_domain.rstrip(".").lower()
    suffix = "." + target
    total_encoded_bytes = 0
    query_count = 0
    if target:
        for record in dns_records:
            name = (record.get("query") or "").rstrip(".").lower()
            if name == target:
                # Query for the bare domain: counted, but carries no payload.
                payload_len = 0
            elif name.endswith(suffix):
                payload_len = len(name) - len(suffix)
            else:
                continue
            total_encoded_bytes += payload_len
            query_count += 1
    decoded_bytes = int(total_encoded_bytes * decode_factor)
    return {
        "target_domain": target_domain,
        "total_queries": query_count,
        "encoded_bytes": total_encoded_bytes,
        "estimated_decoded_bytes": decoded_bytes,
        "estimated_kb": round(decoded_bytes / 1024, 1),
        "estimated_mb": round(decoded_bytes / (1024 * 1024), 3),
    }


def parse_zeek_dns_log(log_path):
    """Parse a tab-separated Zeek dns.log file into structured records.

    Column positions are taken from the log's ``#fields`` header line when it
    is present and names all required columns; otherwise the fixed positions
    0, 2, 3, 4, 9 and 13 are used. All other lines starting with ``#`` are
    skipped, as are rows too short to contain the required columns.

    Args:
        log_path: Path of the log file to read.

    Returns:
        List of dicts with "timestamp", "src_ip", "src_port", "dst_ip",
        "query" and "query_type" string values. "query_type" is an empty
        string when the row has no such column.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Fallback positions used when no usable #fields header is found.
    columns = {
        "ts": 0,
        "id.orig_h": 2,
        "id.orig_p": 3,
        "id.resp_h": 4,
        "query": 9,
        "qtype_name": 13,
    }
    required = ("ts", "id.orig_h", "id.orig_p", "id.resp_h", "query")
    min_width = max(columns[name] for name in required) + 1

    records = []
    # Undecodable bytes are replaced rather than aborting the whole parse.
    with open(log_path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            # Strip only the line terminator so empty trailing fields survive.
            line = line.rstrip("\r\n")
            if line.startswith("#"):
                if line.startswith("#fields\t"):
                    positions = {
                        name: idx
                        for idx, name in enumerate(line.split("\t")[1:])
                    }
                    if all(name in positions for name in required):
                        # qtype_name is optional; None marks it as absent.
                        columns = {name: positions.get(name) for name in columns}
                        min_width = max(columns[name] for name in required) + 1
                continue
            parts = line.split("\t")
            if len(parts) < min_width:
                continue
            qtype_idx = columns["qtype_name"]
            has_qtype = qtype_idx is not None and len(parts) > qtype_idx
            records.append({
                "timestamp": parts[columns["ts"]],
                "src_ip": parts[columns["id.orig_h"]],
                "src_port": parts[columns["id.orig_p"]],
                "dst_ip": parts[columns["id.resp_h"]],
                "query": parts[columns["query"]],
                "query_type": parts[qtype_idx] if has_qtype else "",
            })
    return records


if __name__ == "__main__":
    # Self-contained demo: runs the detectors over a small synthetic data set.
    rule = "=" * 60
    print(rule)
    print("DNS Exfiltration Detection Agent")
    print("Tunneling, DGA, volume anomaly, and TXT abuse detection")
    print(rule)

    # Synthetic traffic: 50 long-subdomain TXT queries, 5 random-looking
    # A queries and 10 ordinary lookups.
    tunnel_name = "a" * 60 + ".evil-tunnel.com"
    demo_records = []
    demo_records.extend(
        {"query": tunnel_name, "src_ip": "192.168.1.105", "query_type": "TXT"}
        for _ in range(50)
    )
    demo_records.extend(
        {"query": "x8kj2m9p4qw7nz3.xyz", "src_ip": "192.168.1.110", "query_type": "A"}
        for _ in range(5)
    )
    demo_records.extend(
        {"query": "google.com", "src_ip": "192.168.1.50", "query_type": "A"}
        for _ in range(10)
    )

    print("\n--- DNS Tunneling Detection ---")
    for hit in detect_tunneling(demo_records, subdomain_len_threshold=30, min_queries=10):
        print(f"[!] {hit['domain']}: {hit['queries']} queries, "
              f"avg subdomain len={hit['avg_subdomain_length']}")

    print("\n--- DGA Detection ---")
    # Only the five highest-entropy names are shown.
    for hit in detect_dga(demo_records, entropy_threshold=3.0, min_sld_length=10)[:5]:
        print(f"[!] {hit['domain']}: entropy={hit['avg_entropy']}")

    print("\n--- TXT Record Abuse ---")
    for hit in detect_txt_abuse(demo_records, threshold=10):
        print(f"[!] {hit['src_ip']}: {hit['txt_queries']} TXT queries")

    print("\n--- Entropy Examples ---")
    for sample in ("google", "x8kj2m9p4qw7n", "aGVsbG8gd29ybGQ"):
        print(f"  '{sample}' -> entropy={shannon_entropy(sample)}")