#!/usr/bin/env python3
"""DNS exfiltration detection agent using entropy analysis and query pattern detection."""

import math
from collections import Counter, defaultdict


def shannon_entropy(text):
    """Calculate the Shannon entropy (bits per character) of a string.

    The string is lower-cased first, so the result is case-insensitive.

    Args:
        text: String to score. Empty or falsy input yields 0.0.

    Returns:
        Entropy rounded to 4 decimal places.
    """
    if not text:
        return 0.0
    # Count and measure the *same* string: lower() can change the length of
    # some Unicode input, which would make the probabilities not sum to 1.
    lowered = text.lower()
    length = len(lowered)
    entropy = -sum(
        (count / length) * math.log2(count / length)
        for count in Counter(lowered).values()
    )
    # "+ 0.0" turns the -0.0 produced for single-symbol strings into 0.0.
    return round(entropy, 4) + 0.0


def extract_subdomain(fqdn):
    """Extract the subdomain portion from a fully qualified domain name.

    Everything left of the last two labels is treated as the subdomain, so
    multi-label public suffixes (e.g. "co.uk") are not recognised.

    Args:
        fqdn: Domain name, with or without a trailing dot.

    Returns:
        The dot-joined subdomain labels, or "" when there are two labels or fewer.
    """
    parts = fqdn.rstrip(".").split(".")
    if len(parts) > 2:
        return ".".join(parts[:-2])
    return ""


def extract_registered_domain(fqdn):
    """Extract the registered domain (SLD + TLD) from an FQDN.

    Uses the last two labels only; multi-label public suffixes are not
    recognised.

    Args:
        fqdn: Domain name, with or without a trailing dot.

    Returns:
        The last two labels joined by a dot, or ``fqdn`` unchanged when it has
        fewer than two labels.
    """
    parts = fqdn.rstrip(".").split(".")
    if len(parts) >= 2:
        return ".".join(parts[-2:])
    return fqdn


def _new_tunnel_stats():
    """Return an empty per-domain accumulator for detect_tunneling."""
    return {
        "queries": 0,
        "unique_queries": set(),
        "subdomain_lengths": [],
        "sources": set(),
    }


def detect_tunneling(dns_records, subdomain_len_threshold=50, min_queries=20):
    """Detect DNS tunneling based on subdomain length anomalies.

    Args:
        dns_records: Iterable of dicts with "query" and optionally "src_ip".
        subdomain_len_threshold: Only queries whose subdomain is strictly
            longer than this are counted.
        min_queries: Minimum number of long-subdomain queries to a registered
            domain before an alert is raised.

    Returns:
        List of alert dicts, sorted by average subdomain length, longest first.
    """
    domain_stats = defaultdict(_new_tunnel_stats)

    for record in dns_records:
        query = record.get("query") or ""  # tolerate an explicit None
        src = record.get("src_ip", "unknown")
        subdomain = extract_subdomain(query)
        if len(subdomain) <= subdomain_len_threshold:
            continue
        stats = domain_stats[extract_registered_domain(query)]
        stats["queries"] += 1
        stats["unique_queries"].add(query)
        stats["subdomain_lengths"].append(len(subdomain))
        stats["sources"].add(src)

    alerts = []
    for domain, stats in domain_stats.items():
        if stats["queries"] < min_queries:
            continue
        lengths = stats["subdomain_lengths"]
        alerts.append({
            "domain": domain,
            "queries": stats["queries"],
            "unique_queries": len(stats["unique_queries"]),
            "avg_subdomain_length": round(sum(lengths) / len(lengths), 1),
            "max_subdomain_length": max(lengths),
            # Sorted so output is stable across runs (set order is not).
            "sources": sorted(stats["sources"], key=str),
            "verdict": "CRITICAL - Likely DNS tunneling",
        })

    return sorted(alerts, key=lambda x: x["avg_subdomain_length"], reverse=True)


def detect_dga(dns_records, entropy_threshold=3.5, min_sld_length=12):
    """Detect Domain Generation Algorithm queries using entropy scoring.

    The second-to-last label of each query is scored; results are grouped by
    the full query name (trailing dot removed).

    Args:
        dns_records: Iterable of dicts with "query" and optionally "src_ip".
        entropy_threshold: A label is suspicious when its entropy is strictly
            greater than this.
        min_sld_length: Labels shorter than this are ignored.

    Returns:
        List of alert dicts, sorted by average entropy, highest first.
    """
    suspicious = defaultdict(lambda: {"count": 0, "sources": set(), "entropies": []})

    for record in dns_records:
        query = (record.get("query") or "").rstrip(".")
        src = record.get("src_ip", "unknown")
        parts = query.split(".")
        if len(parts) < 2:
            continue
        sld = parts[-2]
        if len(sld) < min_sld_length:
            continue
        ent = shannon_entropy(sld)
        if ent > entropy_threshold:
            entry = suspicious[query]
            entry["count"] += 1
            entry["sources"].add(src)
            entry["entropies"].append(ent)

    alerts = []
    for domain, data in suspicious.items():
        avg_entropy = sum(data["entropies"]) / len(data["entropies"])
        alerts.append({
            "domain": domain,
            "queries": data["count"],
            "avg_entropy": round(avg_entropy, 4),
            # Sorted so output is stable across runs (set order is not).
            "sources": sorted(data["sources"], key=str),
            "verdict": "HIGH - Possible DGA domain",
        })

    return sorted(alerts, key=lambda x: x["avg_entropy"], reverse=True)


def detect_volume_anomaly(dns_records, z_score_threshold=3.0):
    """Detect hosts with anomalously high DNS query volumes.

    Each host's query count is compared with the mean over all hosts using a
    z-score based on the sample standard deviation.

    Args:
        dns_records: Iterable of dicts, optionally carrying "src_ip".
        z_score_threshold: A host is reported when its z-score is strictly
            greater than this.

    Returns:
        List of anomaly dicts sorted by z-score, highest first. Empty when
        there are fewer than two hosts or all hosts have the same count.
    """
    host_counts = defaultdict(int)
    for record in dns_records:
        host_counts[record.get("src_ip", "unknown")] += 1

    values = list(host_counts.values())
    if len(values) < 2:
        return []

    mean_q = sum(values) / len(values)
    variance = sum((x - mean_q) ** 2 for x in values) / (len(values) - 1)
    stdev_q = variance ** 0.5
    if stdev_q == 0:
        return []

    # With a sample standard deviation the largest possible z-score among n
    # hosts is (n - 1) / sqrt(n), so the default threshold of 3.0 can only be
    # exceeded when at least 11 hosts are present.
    anomalies = []
    for host, count in host_counts.items():
        z = (count - mean_q) / stdev_q
        if z > z_score_threshold:
            anomalies.append({
                "src_ip": host,
                "queries": count,
                "z_score": round(z, 2),
                "mean": round(mean_q, 1),
                "verdict": "HIGH - Anomalous query volume",
            })

    return sorted(anomalies, key=lambda x: x["z_score"], reverse=True)


def _txt_severity(count):
    """Map a TXT query count to the severity label used in the verdict."""
    if count > 1000:
        return "CRITICAL"
    if count > 500:
        return "HIGH"
    return "MEDIUM"


def detect_txt_abuse(dns_records, threshold=100):
    """Detect excessive TXT record queries (common tunneling method).

    Args:
        dns_records: Iterable of dicts with "query_type" (name "TXT" or
            numeric type 16, as str or int), and optionally "src_ip"/"query".
        threshold: A source is reported when its TXT query count is strictly
            greater than this.

    Returns:
        List of alert dicts sorted by TXT query count, highest first.
    """
    txt_counts = defaultdict(lambda: {"count": 0, "unique_domains": set()})

    for record in dns_records:
        qtype = str(record.get("query_type", "")).upper()
        if qtype not in ("TXT", "16"):
            continue
        entry = txt_counts[record.get("src_ip", "unknown")]
        entry["count"] += 1
        entry["unique_domains"].add(record.get("query", ""))

    alerts = []
    for src, data in txt_counts.items():
        if data["count"] <= threshold:
            continue
        level = _txt_severity(data["count"])
        alerts.append({
            "src_ip": src,
            "txt_queries": data["count"],
            "unique_domains": len(data["unique_domains"]),
            "verdict": f"{level} - Possible DNS tunneling via TXT records",
        })

    return sorted(alerts, key=lambda x: x["txt_queries"], reverse=True)


def estimate_exfil_volume(dns_records, target_domain, decode_factor=0.75):
    """Estimate data volume encoded in DNS queries to a specific domain.

    A query counts when it equals ``target_domain`` or ends with
    ``"." + target_domain`` (case-insensitive, trailing dots ignored). The
    text left of the target domain, label-separating dots included, is
    treated as the encoded payload.

    Args:
        dns_records: Iterable of dicts with "query".
        target_domain: Domain the data is assumed to be sent to. An empty
            value matches nothing.
        decode_factor: Decoded bytes per encoded character. The default 0.75
            corresponds to Base64; 0.625 would correspond to Base32.

    Returns:
        Dict with the query count, encoded character count and the estimated
        decoded size in bytes, KiB and MiB.
    """
    target = (target_domain or "").strip().rstrip(".").lower()
    suffix = "." + target

    total_encoded_bytes = 0
    query_count = 0
    if target:
        for record in dns_records:
            name = (record.get("query") or "").rstrip(".").lower()
            if name == target:
                payload_len = 0
            elif name.endswith(suffix):
                # Match on a label boundary so "notevil.com" or
                # "evil.com.attacker.net" are not counted for "evil.com".
                payload_len = len(name) - len(suffix)
            else:
                continue
            total_encoded_bytes += payload_len
            query_count += 1

    decoded_bytes = int(total_encoded_bytes * decode_factor)
    return {
        "target_domain": target_domain,
        "total_queries": query_count,
        "encoded_bytes": total_encoded_bytes,
        "estimated_decoded_bytes": decoded_bytes,
        "estimated_kb": round(decoded_bytes / 1024, 1),
        "estimated_mb": round(decoded_bytes / (1024 * 1024), 3),
    }


# Column positions used when the log carries no "#fields" header.
_ZEEK_DEFAULT_COLUMNS = {
    "ts": 0,
    "id.orig_h": 2,
    "id.orig_p": 3,
    "id.resp_h": 4,
    "query": 9,
    "qtype_name": 13,
}

# (output key, Zeek field name) pairs, in output order.
_ZEEK_RECORD_FIELDS = (
    ("timestamp", "ts"),
    ("src_ip", "id.orig_h"),
    ("src_port", "id.orig_p"),
    ("dst_ip", "id.resp_h"),
    ("query", "query"),
    ("query_type", "qtype_name"),
)


def _zeek_field(parts, index):
    """Return parts[index], or "" when the column is unknown or missing."""
    if index is None or index >= len(parts):
        return ""
    return parts[index]


def parse_zeek_dns_log(log_path):
    """Parse a Zeek dns.log file into structured records.

    Lines starting with "#" are metadata. A "#fields" line, when present,
    defines the column order for the rows that follow; otherwise fixed
    default positions are used. Rows too short to contain the query column
    are skipped.

    Args:
        log_path: Path to a tab-separated Zeek dns.log file.

    Returns:
        List of dicts with keys "timestamp", "src_ip", "src_port", "dst_ip",
        "query" and "query_type" (all strings, "" when absent).

    Raises:
        OSError: If the file cannot be opened.
    """
    records = []
    columns = dict(_ZEEK_DEFAULT_COLUMNS)

    with open(log_path, "r", encoding="utf-8", errors="replace") as f:
        for raw_line in f:
            # Strip only the line ending: trailing tabs delimit empty fields.
            line = raw_line.rstrip("\r\n")
            if line.startswith("#"):
                if line.startswith("#fields"):
                    names = line.split("\t")[1:]
                    columns = {name: idx for idx, name in enumerate(names)}
                continue

            parts = line.split("\t")
            query_idx = columns.get("query")
            if query_idx is None or query_idx >= len(parts):
                continue

            records.append({
                key: _zeek_field(parts, columns.get(field))
                for key, field in _ZEEK_RECORD_FIELDS
            })

    return records


def main():
    """Run every detector against a small synthetic data set and print results."""
    print("=" * 60)
    print("DNS Exfiltration Detection Agent")
    print("Tunneling, DGA, volume anomaly, and TXT abuse detection")
    print("=" * 60)

    # Demo with synthetic DNS records
    demo_records = [
        {"query": f"{'a' * 60}.evil-tunnel.com", "src_ip": "192.168.1.105", "query_type": "TXT"}
        for _ in range(50)
    ] + [
        {"query": "x8kj2m9p4qw7nz3.xyz", "src_ip": "192.168.1.110", "query_type": "A"}
        for _ in range(5)
    ] + [
        {"query": "google.com", "src_ip": "192.168.1.50", "query_type": "A"}
        for _ in range(10)
    ]

    print("\n--- DNS Tunneling Detection ---")
    tunneling = detect_tunneling(demo_records, subdomain_len_threshold=30, min_queries=10)
    for t in tunneling:
        print(f"[!] {t['domain']}: {t['queries']} queries, "
              f"avg subdomain len={t['avg_subdomain_length']}")

    print("\n--- DGA Detection ---")
    dga = detect_dga(demo_records, entropy_threshold=3.0, min_sld_length=10)
    for d in dga[:5]:
        print(f"[!] {d['domain']}: entropy={d['avg_entropy']}")

    print("\n--- TXT Record Abuse ---")
    txt = detect_txt_abuse(demo_records, threshold=10)
    for t in txt:
        print(f"[!] {t['src_ip']}: {t['txt_queries']} TXT queries")

    print("\n--- Entropy Examples ---")
    examples = ["google", "x8kj2m9p4qw7n", "aGVsbG8gd29ybGQ"]
    for ex in examples:
        print(f"  '{ex}' -> entropy={shannon_entropy(ex)}")


if __name__ == "__main__":
    main()