mirror of
https://github.com/mukul975/Anthropic-Cybersecurity-Skills.git
synced 2026-06-12 22:24:56 +03:00
262 lines
9.7 KiB
Python
262 lines
9.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Data Exfiltration Detection Script
|
|
Analyzes network logs for unusual data transfer volumes, DNS tunneling,
|
|
cloud storage uploads, and protocol abuse indicators.
|
|
"""
|
|
|
|
import json
|
|
import csv
|
|
import argparse
|
|
import datetime
|
|
import math
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
# Hostnames of file-sharing, paste, code-hosting and messaging services.
# is_cloud_storage() matches event domains against this set.
CLOUD_STORAGE_DOMAINS = {
    # Google
    "drive.google.com",
    "docs.google.com",
    "storage.googleapis.com",
    # Dropbox
    "dropbox.com",
    "dl.dropboxusercontent.com",
    # Box
    "box.com",
    "upload.box.com",
    # Microsoft
    "onedrive.live.com",
    "sharepoint.com",
    # MEGA
    "mega.nz",
    "mega.co.nz",
    # File-transfer / file-hosting sites
    "wetransfer.com",
    "sendspace.com",
    "mediafire.com",
    "4shared.com",
    # Paste sites
    "pastebin.com",
    "paste.ee",
    "hastebin.com",
    # Code hosting
    "github.com",
    "gitlab.com",
    "bitbucket.org",
    # Chat / messaging
    "discord.com",
    "cdn.discordapp.com",
    "api.telegram.org",
    "slack.com",
}
|
|
|
|
# Domains that is_legitimate() accepts; the volume and DNS detectors skip
# traffic to them.
# NOTE(review): googleapis.com and amazonaws.com are accepted here, while
# storage.googleapis.com is also listed in CLOUD_STORAGE_DOMAINS - confirm
# that excluding these providers from volume checks is intended.
LEGITIMATE_HIGH_VOLUME = {
    # Microsoft
    "windowsupdate.com",
    "microsoft.com",
    "windows.com",
    # Google
    "googleapis.com",
    "gstatic.com",
    # Amazon
    "amazonaws.com",
    "cloudfront.net",
    # Apple
    "apple.com",
    "icloud.com",
    # Other vendors / CDNs
    "adobe.com",
    "akamai.net",
}
|
|
|
|
|
|
def parse_logs(input_path: str) -> list[dict]:
    """Load raw log events from a JSON or CSV file.

    The format is chosen from the file extension (case-insensitive).

    * ``.json``: the document is either a list of events or an object whose
      ``"events"`` key holds that list.
    * ``.csv``: each row becomes a dict keyed by the header row.

    Args:
        input_path: Path of the log file to read.

    Returns:
        A list of event dicts. An unsupported extension, or a JSON document
        of any other shape, yields an empty list. JSON entries that are not
        objects are dropped because downstream code indexes events by key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    path = Path(input_path)
    suffix = path.suffix.lower()

    if suffix == ".json":
        # utf-8-sig also reads plain UTF-8, and tolerates a leading BOM that
        # json.load would otherwise reject.
        with open(path, "r", encoding="utf-8-sig") as f:
            data = json.load(f)
        if isinstance(data, dict):
            data = data.get("events", [])
        if not isinstance(data, list):
            return []
        return [event for event in data if isinstance(event, dict)]

    if suffix == ".csv":
        # newline="" lets the csv module handle embedded line breaks itself.
        with open(path, "r", encoding="utf-8-sig", newline="") as f:
            return [dict(row) for row in csv.DictReader(f)]

    return []
|
|
|
|
|
|
def normalize_event(event: dict) -> dict:
    """Map one raw log event onto the fixed field names the detectors use.

    For every target field the candidate source keys are tried in order and
    the first usable value wins. A value is unusable when it is missing,
    ``None``, empty (after trimming surrounding whitespace from strings) or
    the placeholder ``"-"``. Numeric values are always usable, so a real
    ``0`` byte count is kept rather than replaced by a later alias.

    Args:
        event: Raw event dict as produced by the log parser.

    Returns:
        A dict containing every target field as a string; fields with no
        usable source value are set to ``""``.
    """
    # Target field -> source keys, in priority order.
    field_map = {
        "timestamp": ["ts", "timestamp", "_time", "@timestamp"],
        "src_ip": ["src_ip", "id.orig_h", "source_ip", "LocalIP"],
        "dst_ip": ["dst_ip", "id.resp_h", "dest_ip", "RemoteIP"],
        "domain": ["domain", "host", "query", "dest", "RemoteUrl"],
        "bytes_out": ["bytes_out", "orig_bytes", "SentBytes", "bytes_sent"],
        "bytes_in": ["bytes_in", "resp_bytes", "ReceivedBytes", "bytes_recv"],
        "method": ["method", "http_method", "Method"],
        "user": ["user", "User", "AccountName", "user.name"],
        "query_type": ["query_type", "qtype_name", "QueryType"],
    }

    normalized = {}
    for target, sources in field_map.items():
        normalized[target] = ""
        for src in sources:
            value = event.get(src)
            if isinstance(value, str):
                value = value.strip()
            # bool is an int subclass; keep it on the truthiness path.
            is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
            if is_number or (value and value != "-"):
                normalized[target] = str(value)
                break
    return normalized
|
|
|
|
|
|
def is_legitimate(domain: str, allowlist: "set[str] | None" = None) -> bool:
    """Return True when *domain* is an allowlisted domain or a subdomain of one.

    Matching is done on DNS label boundaries: ``update.microsoft.com``
    matches ``microsoft.com`` but ``evilmicrosoft.com`` does not. A plain
    suffix test would accept the latter and let a look-alike domain bypass
    detection.

    Args:
        domain: Host name to test. Case, a trailing dot, a ``scheme://``
            prefix, a ``:port`` and a ``/path`` are ignored.
        allowlist: Domains to accept. Defaults to ``LEGITIMATE_HIGH_VOLUME``.

    Returns:
        True if the host equals an allowlisted domain or ends with
        ``"." + domain``; False otherwise, including for empty input.
    """
    known = LEGITIMATE_HIGH_VOLUME if allowlist is None else allowlist

    host = (domain or "").strip().lower()
    if "://" in host:
        host = host.split("://", 1)[1]
    host = host.split("/", 1)[0].split(":", 1)[0].rstrip(".")
    if not host:
        return False

    return any(host == d or host.endswith("." + d) for d in known)
|
|
|
|
|
|
def is_cloud_storage(domain: str, services: "set[str] | None" = None) -> bool:
    """Return True when *domain* belongs to a known cloud/file-sharing service.

    The host part is extracted first, so a full URL such as
    ``https://mega.nz/file/x`` still matches. It is then compared on DNS
    label boundaries. Plain substring matching would also flag unrelated
    hosts such as ``mailbox.com`` (contains ``box.com``) or
    ``box.com.evil.example``.

    Args:
        domain: Host name or URL to test. Case, a trailing dot, a
            ``scheme://`` prefix, a ``:port`` and a ``/path`` are ignored.
        services: Service domains to match. Defaults to
            ``CLOUD_STORAGE_DOMAINS``.

    Returns:
        True if the host equals a service domain or is a subdomain of one;
        False otherwise, including for empty input.
    """
    known = CLOUD_STORAGE_DOMAINS if services is None else services

    host = (domain or "").strip().lower()
    if "://" in host:
        host = host.split("://", 1)[1]
    host = host.split("/", 1)[0].split(":", 1)[0].rstrip(".")
    if not host:
        return False

    return any(host == d or host.endswith("." + d) for d in known)
|
|
|
|
|
|
def detect_volume_anomalies(
    events: list[dict], threshold_bytes: int = 100 * 1024 * 1024
) -> list[dict]:
    """Flag source hosts whose total outbound byte count exceeds a threshold.

    Events are grouped by ``src_ip``. An event is ignored when it has no
    source, or when its destination (``domain``, falling back to ``dst_ip``)
    is accepted by ``is_legitimate``.

    Args:
        events: Event dicts carrying ``src_ip``, ``domain``/``dst_ip`` and
            ``bytes_out``.
        threshold_bytes: A host is reported only when its summed
            ``bytes_out`` is strictly greater than this value (default
            100 MB).

    Returns:
        One ``VOLUME_ANOMALY`` finding per offending host, sorted by
        ``bytes_out`` in descending order.
    """
    host_data = defaultdict(lambda: {"bytes_out": 0, "destinations": set(), "count": 0})

    for e in events:
        src = e.get("src_ip", "")
        domain = e.get("domain") or e.get("dst_ip") or ""
        if not src or is_legitimate(domain):
            continue

        # Byte counts may arrive as ints, integer strings, or float-formatted
        # strings such as "1234.0"; anything unparseable counts as 0.
        raw = e.get("bytes_out") or 0
        try:
            bytes_out = int(raw)
        except (TypeError, ValueError):
            try:
                bytes_out = int(float(raw))
            except (TypeError, ValueError, OverflowError):
                bytes_out = 0
        # A negative count is malformed; do not let it offset real traffic.
        bytes_out = max(bytes_out, 0)

        entry = host_data[src]
        entry["bytes_out"] += bytes_out
        if domain:
            entry["destinations"].add(domain)
        entry["count"] += 1

    findings = []
    for src, data in host_data.items():
        if data["bytes_out"] <= threshold_bytes:
            continue
        mb = data["bytes_out"] / (1024 * 1024)
        # Base score 30, plus 10 per full 100 MB sent, capped at 90.
        risk = min(90, 30 + int(mb / 100) * 10)
        findings.append({
            "detection_type": "VOLUME_ANOMALY",
            "technique": "T1041",
            "src_ip": src,
            "bytes_out": data["bytes_out"],
            "mb_out": round(mb, 2),
            "unique_destinations": len(data["destinations"]),
            "connection_count": data["count"],
            "risk_score": risk,
            "risk_level": "CRITICAL" if risk >= 70 else "HIGH" if risk >= 50 else "MEDIUM",
            "indicators": [f"High outbound volume: {round(mb, 2)} MB to {len(data['destinations'])} destinations"],
        })

    return sorted(findings, key=lambda x: x["bytes_out"], reverse=True)
|
|
|
|
|
|
def detect_cloud_exfiltration(
    events: list[dict], threshold_bytes: int = 50 * 1024 * 1024
) -> list[dict]:
    """Flag sources that send a large volume of data to cloud services.

    Only events whose ``domain`` is accepted by ``is_cloud_storage`` are
    considered, and only when the HTTP method is POST, PUT or PATCH, or is
    not recorded at all. Events are grouped by ``src_ip``, falling back to
    ``user``.

    Args:
        events: Event dicts carrying ``domain``, ``method``,
            ``src_ip``/``user`` and ``bytes_out``.
        threshold_bytes: A source is reported only when its summed
            ``bytes_out`` is strictly greater than this value (default
            50 MB).

    Returns:
        One ``CLOUD_EXFILTRATION`` finding per offending source, sorted by
        ``bytes_out`` in descending order. Service names inside a finding
        are sorted so repeated runs produce identical output.
    """
    cloud_uploads = defaultdict(lambda: {"bytes_out": 0, "services": set(), "count": 0})

    for e in events:
        domain = e.get("domain") or ""
        method = (e.get("method") or "").upper()
        if not is_cloud_storage(domain):
            continue
        # An empty method is kept: logs without HTTP detail cannot rule out an upload.
        if method not in ("POST", "PUT", "PATCH", ""):
            continue

        src = e.get("src_ip", "") or e.get("user", "")

        # Byte counts may arrive as ints, integer strings, or float-formatted
        # strings such as "1234.0"; anything unparseable counts as 0.
        raw = e.get("bytes_out") or 0
        try:
            bytes_out = int(raw)
        except (TypeError, ValueError):
            try:
                bytes_out = int(float(raw))
            except (TypeError, ValueError, OverflowError):
                bytes_out = 0
        # A negative count is malformed; do not let it offset real traffic.
        bytes_out = max(bytes_out, 0)

        entry = cloud_uploads[src]
        entry["bytes_out"] += bytes_out
        entry["services"].add(domain)
        entry["count"] += 1

    findings = []
    for src, data in cloud_uploads.items():
        if data["bytes_out"] <= threshold_bytes:
            continue
        mb = data["bytes_out"] / (1024 * 1024)
        services = sorted(data["services"])
        findings.append({
            "detection_type": "CLOUD_EXFILTRATION",
            "technique": "T1567.002",
            "source": src,
            "bytes_out": data["bytes_out"],
            "mb_out": round(mb, 2),
            "cloud_services": services,
            "upload_count": data["count"],
            "risk_score": 60,
            "risk_level": "HIGH",
            "indicators": [f"Cloud upload: {round(mb, 2)} MB to {', '.join(services)}"],
        })

    return sorted(findings, key=lambda x: x["bytes_out"], reverse=True)
|
|
|
|
|
|
def detect_dns_exfiltration(events: list[dict]) -> list[dict]:
    """Flag base domains whose query pattern looks like DNS tunnelling.

    Every ``domain`` with at least three labels is split into a base (its
    last two labels, lower-cased) and a subdomain (everything before).
    Per base, the function counts queries, distinct subdomains and total
    name length. Bases with fewer than 50 queries, or accepted by
    ``is_legitimate``, are skipped.

    Scoring: more than 50 distinct subdomains adds 30, an average name
    length above 40 adds 25, more than 500 queries adds 15. A base is
    reported when the score reaches 30.

    NOTE(review): the base is always the last two labels, so names under a
    multi-label public suffix (for example ``co.uk``) are grouped together.

    Args:
        events: Event dicts carrying a ``domain`` field.

    Returns:
        ``DNS_EXFILTRATION`` finding dicts sorted by ``risk_score`` in
        descending order.
    """
    domain_stats = defaultdict(lambda: {"queries": 0, "unique_subs": set(), "total_len": 0})

    for e in events:
        # A fully-qualified name may end with "."; drop it so the last label
        # is not an empty string.
        domain = (e.get("domain") or "").strip().rstrip(".")
        if "." not in domain:
            continue
        parts = domain.split(".")
        if len(parts) < 3:
            continue
        # The domain field can hold an IPv4 literal; that is not a DNS name.
        if all(p.isdigit() for p in parts):
            continue

        base = ".".join(parts[-2:]).lower()
        sub = ".".join(parts[:-2])
        stats = domain_stats[base]
        stats["queries"] += 1
        stats["unique_subs"].add(sub)
        stats["total_len"] += len(domain)

    findings = []
    for base, stats in domain_stats.items():
        if stats["queries"] < 50 or is_legitimate(base):
            continue

        avg_len = stats["total_len"] / stats["queries"]
        unique = len(stats["unique_subs"])
        risk = 0
        indicators = []

        if unique > 50:
            risk += 30
            indicators.append(f"High unique subdomains: {unique}")
        if avg_len > 40:
            risk += 25
            indicators.append(f"Long query avg: {avg_len:.1f}")
        if stats["queries"] > 500:
            risk += 15
            indicators.append(f"High volume: {stats['queries']} queries")

        if risk >= 30:
            findings.append({
                "detection_type": "DNS_EXFILTRATION",
                "technique": "T1048.003",
                "domain": base,
                "query_count": stats["queries"],
                "unique_subdomains": unique,
                "avg_query_length": round(avg_len, 1),
                "risk_score": risk,
                "risk_level": "CRITICAL" if risk >= 70 else "HIGH" if risk >= 50 else "MEDIUM",
                "indicators": indicators,
            })

    return sorted(findings, key=lambda x: x["risk_score"], reverse=True)
|
|
|
|
|
|
def run_hunt(input_path: str, output_dir: str) -> None:
    """Run all detectors on one log file and write the results to disk.

    Two files are written into *output_dir*, which is created if needed:

    * ``exfil_findings.json``: hunt id, event count and every finding, in
      detector order (volume, cloud, DNS).
    * ``hunt_report.md``: per-detector counts followed by the 20 findings
      with the highest ``risk_score``.

    Progress lines are printed to stdout.

    Args:
        input_path: Log file to analyse (see ``parse_logs``).
        output_dir: Directory that receives the two output files.
    """
    # One timestamp for the whole run keeps the banner, hunt id and report
    # date consistent even if the run crosses midnight.
    started = datetime.datetime.now()
    print(f"[*] Data Exfiltration Hunt - {started.isoformat()}")

    events = [normalize_event(e) for e in parse_logs(input_path)]
    print(f"[*] Loaded {len(events)} events")

    vol_findings = detect_volume_anomalies(events)
    cloud_findings = detect_cloud_exfiltration(events)
    dns_findings = detect_dns_exfiltration(events)
    all_findings = vol_findings + cloud_findings + dns_findings

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    with open(output_path / "exfil_findings.json", "w", encoding="utf-8") as f:
        json.dump({
            "hunt_id": f"TH-EXFIL-{started.date().isoformat()}",
            "total_events": len(events),
            "findings": all_findings,
        }, f, indent=2)

    # Rank across detectors before truncating; otherwise a long run of
    # volume findings could push higher-risk findings out of the report.
    # sorted() is stable, so equal scores keep detector order.
    top_findings = sorted(
        all_findings, key=lambda item: item.get("risk_score", 0), reverse=True
    )[:20]

    with open(output_path / "hunt_report.md", "w", encoding="utf-8") as f:
        f.write("# Data Exfiltration Hunt Report\n\n")
        f.write(f"**Date**: {started.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"**Volume Anomalies**: {len(vol_findings)}\n")
        f.write(f"**Cloud Exfil**: {len(cloud_findings)}\n")
        f.write(f"**DNS Exfil**: {len(dns_findings)}\n\n")
        for finding in top_findings:
            f.write(f"### [{finding['risk_level']}] {finding['detection_type']}\n")
            f.write(f"- {', '.join(finding['indicators'])}\n\n")

    print(f"[+] {len(all_findings)} findings written to {output_dir}")
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Sub-commands:
        hunt     Run the detectors on ``--input``/``-i`` and write results to
                 ``--output``/``-o`` (default ``./exfil_output``).
        queries  Print an example search query for volume anomalies.

    With no sub-command the usage help is printed.
    """
    cli = argparse.ArgumentParser(description="Data Exfiltration Detection")
    commands = cli.add_subparsers(dest="command")

    hunt_cmd = commands.add_parser("hunt")
    hunt_cmd.add_argument("--input", "-i", required=True)
    hunt_cmd.add_argument("--output", "-o", default="./exfil_output")

    commands.add_parser("queries")

    options = cli.parse_args()
    chosen = options.command

    if chosen == "hunt":
        run_hunt(options.input, options.output)
        return

    if chosen != "queries":
        cli.print_help()
        return

    query_lines = (
        "=== Data Exfiltration Queries ===\n",
        "--- Volume Anomaly ---",
        "index=proxy | stats sum(bytes_out) as total by src_ip\n| eval MB=round(total/1048576,2)\n| where MB > 100 | sort -MB",
    )
    for line in query_lines:
        print(line)
|
|
|
|
|
|
# Start the command-line interface only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|