#!/usr/bin/env python3
"""Threat hunt hypothesis framework builder.

Generates structured threat hunting hypotheses from MITRE ATT&CK techniques,
maps data sources, defines detection logic, and tracks hunt outcomes.
"""

import sys
import json
import datetime
import hashlib

try:
    import requests
    HAS_REQUESTS = True
except ImportError:
    HAS_REQUESTS = False


HUNT_MATURITY_LEVELS = {
    0: "Initial - ad hoc, no documentation",
    1: "Minimal - basic procedures, limited data",
    2: "Procedural - documented hypotheses, repeatable",
    3: "Innovative - custom analytics, threat intel driven",
    4: "Leading - automated, ML-assisted, continuous",
}

DATA_SOURCE_MAP = {
    "T1059.001": {"name": "PowerShell", "sources": ["Script Block Logging (4104)", "Module Logging (4103)",
                   "Process Creation (4688/Sysmon 1)"], "log_channel": "Microsoft-Windows-PowerShell/Operational"},
    "T1053.005": {"name": "Scheduled Task", "sources": ["Task Scheduler (4698/4702)", "Sysmon Event 1"],
                   "log_channel": "Microsoft-Windows-TaskScheduler/Operational"},
    "T1078": {"name": "Valid Accounts", "sources": ["Logon Events (4624/4625)", "Kerberos (4768/4769)"],
               "log_channel": "Security"},
    "T1003.001": {"name": "LSASS Memory", "sources": ["Sysmon Event 10 (ProcessAccess)", "Windows Defender alerts"],
                   "log_channel": "Microsoft-Windows-Sysmon/Operational"},
    "T1071.001": {"name": "Web Protocols C2", "sources": ["Proxy logs", "DNS query logs", "Zeek http.log"],
                   "log_channel": "Proxy/DNS"},
    "T1486": {"name": "Data Encrypted for Impact", "sources": ["File creation burst (Sysmon 11)",
               "Canary file triggers", "VSS deletion (Sysmon 1)"], "log_channel": "Sysmon"},
    "T1021.001": {"name": "Remote Desktop Protocol", "sources": ["Logon Type 10 (4624)",
                   "RDP connection (1149)"], "log_channel": "Security / TerminalServices-RemoteConnectionManager"},
}


def generate_hypothesis(technique_id, threat_actor=None, environment=None):
    """Generate a structured threat hunting hypothesis."""
    ds = DATA_SOURCE_MAP.get(technique_id, {})
    technique_name = ds.get("name", technique_id)
    hyp_id = "HYP-" + hashlib.md5(
        (technique_id + str(datetime.datetime.utcnow())).encode()
    ).hexdigest()[:8].upper()

    hypothesis = {
        "hypothesis_id": hyp_id,
        "created": datetime.datetime.utcnow().isoformat() + "Z",
        "technique_id": technique_id,
        "technique_name": technique_name,
        "hypothesis_statement": (
            "An adversary{} may be using {} ({}) within our environment{}. "
            "Evidence of this activity can be found in {}.".format(
                " (" + threat_actor + ")" if threat_actor else "",
                technique_name,
                technique_id,
                " targeting " + environment if environment else "",
                ", ".join(ds.get("sources", ["endpoint telemetry"])),
            )
        ),
        "data_sources": ds.get("sources", []),
        "log_channel": ds.get("log_channel", "Unknown"),
        "priority": "high" if technique_id in ["T1003.001", "T1486", "T1059.001"] else "medium",
        "status": "planned",
    }
    return hypothesis


def build_hunt_plan(hypotheses, analyst="SOC Analyst"):
    """Build a hunt plan from a list of hypotheses."""
    plan = {
        "plan_id": "PLAN-" + datetime.datetime.utcnow().strftime("%Y%m%d"),
        "created": datetime.datetime.utcnow().isoformat() + "Z",
        "analyst": analyst,
        "maturity_level": 2,
        "maturity_description": HUNT_MATURITY_LEVELS[2],
        "hypothesis_count": len(hypotheses),
        "hypotheses": hypotheses,
        "data_coverage": list(set(
            src for h in hypotheses for src in h.get("data_sources", [])
        )),
        "estimated_hours": len(hypotheses) * 4,
    }
    return plan


def evaluate_hunt_results(hypothesis, findings_count, true_positives, false_positives):
    """Evaluate hunt execution results and update hypothesis."""
    hypothesis["status"] = "completed"
    hypothesis["results"] = {
        "total_findings": findings_count,
        "true_positives": true_positives,
        "false_positives": false_positives,
        "precision": round(true_positives / max(findings_count, 1), 3),
        "outcome": "confirmed" if true_positives > 0 else "not_confirmed",
        "recommendation": (
            "Create detection rule" if true_positives > 0
            else "Refine hypothesis and re-hunt with broader data"
        ),
    }
    return hypothesis


def fetch_attack_techniques():
    """Fetch MITRE ATT&CK technique list."""
    if not HAS_REQUESTS:
        return list(DATA_SOURCE_MAP.keys())
    try:
        url = "https://raw.githubusercontent.com/mitre/cti/master/enterprise-attack/enterprise-attack.json"
        resp = requests.get(url, timeout=30)
        bundle = resp.json()
        techniques = [
            obj["external_references"][0]["external_id"]
            for obj in bundle.get("objects", [])
            if obj.get("type") == "attack-pattern"
            and obj.get("external_references")
            and not obj.get("x_mitre_deprecated", False)
        ]
        return techniques[:50]
    except Exception:
        return list(DATA_SOURCE_MAP.keys())


if __name__ == "__main__":
    print("=" * 60)
    print("Threat Hunt Hypothesis Framework")
    print("Hypothesis generation, hunt planning, result tracking")
    print("=" * 60)

    techniques = sys.argv[1:] if len(sys.argv) > 1 else ["T1059.001", "T1078", "T1003.001", "T1486"]
    actor = "APT29"

    hypotheses = []
    for t in techniques:
        h = generate_hypothesis(t, threat_actor=actor)
        hypotheses.append(h)

    plan = build_hunt_plan(hypotheses)
    print("\nHunt Plan: {} ({} hypotheses, ~{} hours)".format(
        plan["plan_id"], plan["hypothesis_count"], plan["estimated_hours"]))
    print("Maturity: {}".format(plan["maturity_description"]))

    print("\n--- Hypotheses ---")
    for h in hypotheses:
        print("  [{}] {} - {}".format(h["priority"].upper(), h["technique_id"], h["technique_name"]))
        print("    {}".format(h["hypothesis_statement"][:120] + "..."))
        print("    Sources: {}".format(", ".join(h["data_sources"][:3])))

    evaluated = evaluate_hunt_results(hypotheses[0], findings_count=12, true_positives=3, false_positives=9)
    print("\n--- Sample Result ---")
    print("  {} precision: {} -> {}".format(
        evaluated["technique_id"],
        evaluated["results"]["precision"],
        evaluated["results"]["recommendation"]))

    print("\n" + json.dumps({"hypotheses_generated": len(hypotheses)}, indent=2))