Files
Anthropic-Cybersecurity-Skills/skills/hunting-for-data-exfiltration-indicators/scripts/process.py
T

262 lines
9.7 KiB
Python

#!/usr/bin/env python3
"""
Data Exfiltration Detection Script
Analyzes network logs for unusual data transfer volumes, DNS tunneling,
cloud storage uploads, and protocol abuse indicators.
"""
import argparse
import csv
import datetime
import json
import math
from collections import defaultdict
from pathlib import Path
from urllib.parse import urlsplit
# Hosts of file-sharing, paste, code-hosting and chat services that the
# cloud-exfiltration detector treats as possible upload destinations.
CLOUD_STORAGE_DOMAINS = {
    # Google
    "drive.google.com",
    "docs.google.com",
    "storage.googleapis.com",
    # Dropbox
    "dropbox.com",
    "dl.dropboxusercontent.com",
    # Box
    "box.com",
    "upload.box.com",
    # Microsoft
    "onedrive.live.com",
    "sharepoint.com",
    # MEGA
    "mega.nz",
    "mega.co.nz",
    # Ad-hoc file transfer / file lockers
    "wetransfer.com",
    "sendspace.com",
    "mediafire.com",
    "4shared.com",
    # Paste sites
    "pastebin.com",
    "paste.ee",
    "hastebin.com",
    # Code hosting
    "github.com",
    "gitlab.com",
    "bitbucket.org",
    # Chat / messaging
    "discord.com",
    "cdn.discordapp.com",
    "api.telegram.org",
    "slack.com",
}
# Base domains whose traffic is allowlisted by is_legitimate(): the volume
# and DNS detectors skip destinations that fall under any of these.
LEGITIMATE_HIGH_VOLUME = {
    # Microsoft
    "windowsupdate.com",
    "microsoft.com",
    "windows.com",
    # Google
    "googleapis.com",
    "gstatic.com",
    # Amazon
    "amazonaws.com",
    "cloudfront.net",
    # Apple
    "apple.com",
    "icloud.com",
    # Other vendors / CDNs
    "adobe.com",
    "akamai.net",
}
def parse_logs(input_path: str) -> list[dict]:
    """Load raw log events from a JSON or CSV file.

    JSON input may be a top-level list of events or an object holding the
    list under an ``"events"`` key. CSV input is read with
    ``csv.DictReader``, so each row becomes a dict keyed by the header row.

    Args:
        input_path: Path to the log file. The format is chosen from the file
            extension, compared case-insensitively.

    Returns:
        A list of event dicts. Unsupported extensions and JSON documents
        that do not contain a list of events yield an empty list; JSON
        entries that are not objects are dropped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    path = Path(input_path)
    suffix = path.suffix.lower()

    if suffix == ".json":
        # utf-8-sig decodes plain UTF-8 too, and tolerates a leading BOM
        # that json.load would otherwise reject.
        with open(path, "r", encoding="utf-8-sig") as f:
            data = json.load(f)
        if isinstance(data, dict):
            data = data.get("events", [])
        if not isinstance(data, list):
            # Scalar/null document or a non-list "events" value: no events.
            return []
        return [item for item in data if isinstance(item, dict)]

    if suffix == ".csv":
        # newline="" lets the csv module handle embedded newlines itself.
        with open(path, "r", encoding="utf-8-sig", newline="") as f:
            return [dict(row) for row in csv.DictReader(f)]

    return []
def normalize_event(event: dict) -> dict:
    """Map a raw log record onto the common field names used by the detectors.

    For each canonical field the candidate source keys are tried in order;
    the first one that is present, truthy and not the "-" placeholder wins
    and its value is stored as a string. Canonical fields with no usable
    source are set to the empty string, so every key is always present.
    """
    aliases = {
        "timestamp": ["ts", "timestamp", "_time", "@timestamp"],
        "src_ip": ["src_ip", "id.orig_h", "source_ip", "LocalIP"],
        "dst_ip": ["dst_ip", "id.resp_h", "dest_ip", "RemoteIP"],
        "domain": ["domain", "host", "query", "dest", "RemoteUrl"],
        "bytes_out": ["bytes_out", "orig_bytes", "SentBytes", "bytes_sent"],
        "bytes_in": ["bytes_in", "resp_bytes", "ReceivedBytes", "bytes_recv"],
        "method": ["method", "http_method", "Method"],
        "user": ["user", "User", "AccountName", "user.name"],
        "query_type": ["query_type", "qtype_name", "QueryType"],
    }

    def first_usable(keys):
        # Return the first candidate value worth keeping, stringified.
        for key in keys:
            if key not in event:
                continue
            if event[key] and event[key] != "-":
                return str(event[key])
        return ""

    return {canonical: first_usable(keys) for canonical, keys in aliases.items()}
def is_legitimate(domain: str, allowlist=None) -> bool:
    """Return True if *domain* is an allowlisted domain or a subdomain of one.

    Matching is case-insensitive and done on DNS label boundaries: the host
    must equal an allowlisted domain or end with ``"." + domain``. A plain
    suffix test would let ``evilmicrosoft.com`` pass as ``microsoft.com``.

    Args:
        domain: Hostname to check; a trailing root dot is ignored.
        allowlist: Optional iterable of lower-case base domains to use
            instead of ``LEGITIMATE_HIGH_VOLUME``.

    Returns:
        True when the host is covered by the allowlist, False otherwise
        (including for an empty host).
    """
    host = domain.strip().lower().rstrip(".")
    if not host:
        return False
    trusted = LEGITIMATE_HIGH_VOLUME if allowlist is None else allowlist
    return any(host == base or host.endswith("." + base) for base in trusted)
def is_cloud_storage(domain: str, services=None) -> bool:
    """Return True if *domain* points at a known cloud/file-sharing service.

    The value may be a bare hostname, ``host:port`` or a full URL; only its
    host part is compared. Matching is case-insensitive and done on DNS
    label boundaries, so ``upload.box.com`` matches ``box.com`` while
    ``xbox.com`` or a service name appearing in a URL path/query does not.

    Args:
        domain: Hostname or URL taken from a log event.
        services: Optional iterable of lower-case service domains to use
            instead of ``CLOUD_STORAGE_DOMAINS``.

    Returns:
        True when the host equals, or is a subdomain of, a listed service.
    """
    value = domain.strip().lower()
    if not value:
        return False
    try:
        parts = urlsplit(value)
        if not parts.netloc:
            # No scheme: prefix "//" so urlsplit treats the value as an
            # authority (host[:port]) rather than a path.
            parts = urlsplit("//" + value)
        host = parts.hostname or ""
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets): compare raw.
        host = value
    host = host.rstrip(".")
    if not host:
        return False
    known = CLOUD_STORAGE_DOMAINS if services is None else services
    return any(host == svc or host.endswith("." + svc) for svc in known)
def detect_volume_anomalies(
    events: list[dict], threshold_bytes: int = 100 * 1024 * 1024
) -> list[dict]:
    """Flag source hosts whose total outbound volume exceeds a threshold.

    Outbound bytes are summed per ``src_ip`` over every event whose
    destination (``domain``, falling back to ``dst_ip``) is not allowlisted
    by ``is_legitimate``. Events without a ``src_ip`` are ignored.

    Args:
        events: Event dicts carrying ``src_ip``, ``domain``/``dst_ip`` and
            ``bytes_out`` keys.
        threshold_bytes: Total a host must exceed to be reported
            (default 100 MiB).

    Returns:
        One ``VOLUME_ANOMALY`` finding per offending host, sorted by total
        outbound bytes, largest first.
    """
    host_data = defaultdict(lambda: {"bytes_out": 0, "destinations": set(), "count": 0})
    for e in events:
        src = e.get("src_ip", "")
        domain = e.get("domain", "") or e.get("dst_ip", "")
        if not src or is_legitimate(domain):
            continue
        try:
            # Go through float so values such as "1234.0" (common in CSV
            # exports) are counted instead of being dropped; negative
            # counts are invalid and must not reduce the total.
            bytes_out = max(0, int(float(e.get("bytes_out") or 0)))
        except (TypeError, ValueError, OverflowError):
            bytes_out = 0
        entry = host_data[src]
        entry["bytes_out"] += bytes_out
        if domain:
            # An unknown destination is not a distinct destination.
            entry["destinations"].add(domain)
        entry["count"] += 1

    findings = []
    for src, data in host_data.items():
        if data["bytes_out"] <= threshold_bytes:
            continue
        mb = data["bytes_out"] / (1024 * 1024)
        # Base score 30, plus 10 for every full 100 MiB sent, capped at 90.
        risk = min(90, 30 + int(mb / 100) * 10)
        findings.append({
            "detection_type": "VOLUME_ANOMALY",
            "technique": "T1041",
            "src_ip": src,
            "bytes_out": data["bytes_out"],
            "mb_out": round(mb, 2),
            "unique_destinations": len(data["destinations"]),
            "connection_count": data["count"],
            "risk_score": risk,
            "risk_level": "CRITICAL" if risk >= 70 else "HIGH" if risk >= 50 else "MEDIUM",
            "indicators": [f"High outbound volume: {round(mb, 2)} MB to {len(data['destinations'])} destinations"],
        })
    return sorted(findings, key=lambda x: x["bytes_out"], reverse=True)
def detect_cloud_exfiltration(
    events: list[dict], threshold_bytes: int = 50 * 1024 * 1024
) -> list[dict]:
    """Flag sources that upload a large volume to cloud/file-sharing services.

    Only events whose ``domain`` satisfies ``is_cloud_storage`` are
    considered. Events with an HTTP method other than POST/PUT/PATCH are
    skipped; events with no method at all are kept. Bytes are summed per
    source, where the source is ``src_ip`` or, when that is empty, ``user``.

    Args:
        events: Event dicts carrying ``domain``, ``method``, ``src_ip``,
            ``user`` and ``bytes_out`` keys.
        threshold_bytes: Total a source must exceed to be reported
            (default 50 MiB).

    Returns:
        One ``CLOUD_EXFILTRATION`` finding per offending source, sorted by
        total outbound bytes, largest first.
    """
    upload_methods = ("POST", "PUT", "PATCH", "")
    cloud_uploads = defaultdict(lambda: {"bytes_out": 0, "services": set(), "count": 0})
    for e in events:
        domain = e.get("domain") or ""
        if not is_cloud_storage(domain):
            continue
        if (e.get("method") or "").upper() not in upload_methods:
            continue
        src = e.get("src_ip", "") or e.get("user", "")
        try:
            # Go through float so values such as "1234.0" are counted rather
            # than dropped; negative counts must not reduce the total.
            bytes_out = max(0, int(float(e.get("bytes_out") or 0)))
        except (TypeError, ValueError, OverflowError):
            bytes_out = 0
        entry = cloud_uploads[src]
        entry["bytes_out"] += bytes_out
        entry["services"].add(domain)
        entry["count"] += 1

    findings = []
    for src, data in cloud_uploads.items():
        if data["bytes_out"] <= threshold_bytes:
            continue
        mb = data["bytes_out"] / (1024 * 1024)
        # Sort so the JSON output and the indicator text are reproducible;
        # set iteration order varies between runs.
        services = sorted(data["services"])
        findings.append({
            "detection_type": "CLOUD_EXFILTRATION",
            "technique": "T1567.002",
            "source": src,
            "bytes_out": data["bytes_out"],
            "mb_out": round(mb, 2),
            "cloud_services": services,
            "upload_count": data["count"],
            "risk_score": 60,
            "risk_level": "HIGH",
            "indicators": [f"Cloud upload: {round(mb, 2)} MB to {', '.join(services)}"],
        })
    return sorted(findings, key=lambda x: x["bytes_out"], reverse=True)
def detect_dns_exfiltration(events: list[dict], min_queries: int = 50) -> list[dict]:
    """Flag base domains whose subdomain pattern suggests DNS tunnelling.

    Every event whose ``domain`` has at least three labels is grouped under
    its last two labels (the "base"); the remaining labels form the
    subdomain. Bases that are allowlisted by ``is_legitimate`` or seen in
    fewer than ``min_queries`` events are skipped. The rest are scored:
    +30 for more than 50 unique subdomains, +25 for an average name length
    above 40 characters, +15 for more than 500 queries. A base is reported
    when its score reaches 30.

    Args:
        events: Event dicts carrying a ``domain`` key.
        min_queries: Minimum number of events for a base to be scored.

    Returns:
        ``DNS_EXFILTRATION`` findings sorted by risk score, highest first.
    """
    domain_stats = defaultdict(lambda: {"queries": 0, "unique_subs": set(), "total_len": 0})
    for e in events:
        # DNS names are case-insensitive and may carry a trailing root dot;
        # normalise so one name is not counted as several.
        domain = (e.get("domain") or "").strip().lower().rstrip(".")
        parts = domain.split(".")
        if len(parts) < 3:
            continue
        if all(label.isdigit() for label in parts):
            # IPv4 literal in the domain field, not a DNS name.
            continue
        # NOTE(review): taking the last two labels treats multi-label public
        # suffixes (e.g. "co.uk") as a single base domain - confirm whether
        # that matters for the logs this is run against.
        base = ".".join(parts[-2:])
        sub = ".".join(parts[:-2])
        stats = domain_stats[base]
        stats["queries"] += 1
        stats["unique_subs"].add(sub)
        stats["total_len"] += len(domain)

    findings = []
    for base, stats in domain_stats.items():
        if stats["queries"] < min_queries or is_legitimate(base):
            continue
        avg_len = stats["total_len"] / stats["queries"]
        unique = len(stats["unique_subs"])
        risk = 0
        indicators = []
        if unique > 50:
            risk += 30
            indicators.append(f"High unique subdomains: {unique}")
        if avg_len > 40:
            risk += 25
            indicators.append(f"Long query avg: {avg_len:.1f}")
        if stats["queries"] > 500:
            risk += 15
            indicators.append(f"High volume: {stats['queries']} queries")
        if risk >= 30:
            findings.append({
                "detection_type": "DNS_EXFILTRATION",
                "technique": "T1048.003",
                "domain": base,
                "query_count": stats["queries"],
                "unique_subdomains": unique,
                "avg_query_length": round(avg_len, 1),
                "risk_score": risk,
                "risk_level": "CRITICAL" if risk >= 70 else "HIGH" if risk >= 50 else "MEDIUM",
                "indicators": indicators,
            })
    return sorted(findings, key=lambda x: x["risk_score"], reverse=True)
def run_hunt(input_path: str, output_dir: str) -> None:
    """Run all detectors over a log file and write the results to disk.

    Loads and normalizes the events, runs the volume, cloud-upload and DNS
    detectors, then writes ``exfil_findings.json`` (every finding) and
    ``hunt_report.md`` (summary counts plus the highest-risk findings) into
    ``output_dir``, creating the directory if needed.

    Args:
        input_path: Path to the JSON or CSV log file.
        output_dir: Directory that receives the two output files.
    """
    report_limit = 20
    started = datetime.datetime.now()
    print(f"[*] Data Exfiltration Hunt - {started.isoformat()}")

    # Skip records that are not dicts; normalize_event cannot handle them.
    events = [normalize_event(e) for e in parse_logs(input_path) if isinstance(e, dict)]
    print(f"[*] Loaded {len(events)} events")

    vol_findings = detect_volume_anomalies(events)
    cloud_findings = detect_cloud_exfiltration(events)
    dns_findings = detect_dns_exfiltration(events)
    all_findings = vol_findings + cloud_findings + dns_findings

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    with open(output_path / "exfil_findings.json", "w", encoding="utf-8") as f:
        json.dump({
            "hunt_id": f"TH-EXFIL-{started.date().isoformat()}",
            "total_events": len(events),
            "findings": all_findings,
        }, f, indent=2)

    # Rank across detectors before truncating; otherwise the first detector's
    # findings could push higher-risk ones out of the report. sorted() is
    # stable, so equal scores keep their per-detector order.
    ranked = sorted(all_findings, key=lambda item: item["risk_score"], reverse=True)

    with open(output_path / "hunt_report.md", "w", encoding="utf-8") as f:
        f.write("# Data Exfiltration Hunt Report\n\n")
        f.write(f"**Date**: {started.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"**Volume Anomalies**: {len(vol_findings)}\n")
        f.write(f"**Cloud Exfil**: {len(cloud_findings)}\n")
        f.write(f"**DNS Exfil**: {len(dns_findings)}\n\n")
        if len(ranked) > report_limit:
            # Make the truncation visible to the reader.
            f.write(
                f"_Showing the {report_limit} highest-risk of {len(ranked)} findings; "
                "see exfil_findings.json for the full list._\n\n"
            )
        for finding in ranked[:report_limit]:
            f.write(f"### [{finding['risk_level']}] {finding['detection_type']}\n")
            f.write(f"- {', '.join(finding['indicators'])}\n\n")

    print(f"[+] {len(all_findings)} findings written to {output_dir}")
def main():
    """Command-line entry point.

    ``hunt`` runs the detectors over ``--input`` and writes results to
    ``--output``; ``queries`` prints a sample SIEM query; with no
    sub-command the usage help is shown.
    """
    cli = argparse.ArgumentParser(description="Data Exfiltration Detection")
    commands = cli.add_subparsers(dest="command")

    hunt_cmd = commands.add_parser("hunt")
    hunt_cmd.add_argument("--input", "-i", required=True)
    hunt_cmd.add_argument("--output", "-o", default="./exfil_output")
    commands.add_parser("queries")

    options = cli.parse_args()

    if options.command == "hunt":
        run_hunt(options.input, options.output)
        return

    if options.command == "queries":
        query_lines = (
            "=== Data Exfiltration Queries ===\n",
            "--- Volume Anomaly ---",
            "index=proxy | stats sum(bytes_out) as total by src_ip\n| eval MB=round(total/1048576,2)\n| where MB > 100 | sort -MB",
        )
        for line in query_lines:
            print(line)
        return

    cli.print_help()


if __name__ == "__main__":
    main()