Anthropic-Cybersecurity-Skills/skills/implementing-siem-use-case-tuning/scripts/agent.py

#!/usr/bin/env python3
"""SIEM use case tuning agent - analyzes alert data to reduce false positives and optimize detection rules."""

import json
import csv
import math
import argparse
from collections import defaultdict
from datetime import datetime


def load_alert_data(filepath):
    """Load an alert/notable-event CSV export into a list of normalized dicts.

    Every data row is mapped to the keys ``rule_name``, ``timestamp``,
    ``disposition``, ``source``, ``user`` and ``severity``.  When the primary
    column is missing, the alternate column name is consulted instead
    (``search_name``, ``_time``, ``status``, ``src``, ``dest_user``); when
    neither yields a value the default applies (``""``, except ``"unknown"``
    for disposition and ``"medium"`` for severity).

    Args:
        filepath: Path to a CSV file whose first row is the header.

    Returns:
        A list with one dict per data row, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # (output key, primary column, alternate column or None, default value)
    field_map = (
        ("rule_name", "rule_name", "search_name", ""),
        ("timestamp", "timestamp", "_time", ""),
        ("disposition", "disposition", "status", "unknown"),
        ("source", "source", "src", ""),
        ("user", "user", "dest_user", ""),
        ("severity", "severity", None, "medium"),
    )

    def pick(row, primary, alternate, default):
        # csv.DictReader fills the missing trailing cells of a short row with
        # None, so None is treated exactly like an absent column here.
        value = row.get(primary)
        if value is None and alternate is not None:
            value = row.get(alternate)
        return default if value is None else value

    # "utf-8-sig" strips a leading byte-order mark if present; otherwise the
    # BOM would become part of the first header name and hide that column.
    with open(filepath, newline="", encoding="utf-8-sig") as f:
        return [
            {key: pick(row, primary, alternate, default)
             for key, primary, alternate, default in field_map}
            for row in csv.DictReader(f)
        ]


def calculate_rule_metrics(alerts):
    """Calculate per-rule alert volume, FP rate, and disposition breakdown.

    Args:
        alerts: Iterable of dicts with the keys ``rule_name``, ``disposition``,
            ``source`` and ``user``.  Disposition is matched case-insensitively
            with surrounding whitespace ignored; a ``None`` or unrecognized
            disposition is counted as pending.

    Returns:
        A list with one metrics dict per rule, sorted by ``fp_rate`` descending.
        ``fp_rate`` and ``precision`` are computed over reviewed alerts only
        (true positives + false positives) and are ``0.0`` when nothing has
        been reviewed.  ``top_sources`` holds up to ten sources ordered by how
        many alerts they produced for the rule (ties broken by name).
    """
    tp_labels = ("true_positive", "tp", "confirmed", "escalated")
    fp_labels = ("false_positive", "fp", "benign", "closed_fp")

    # "sources" maps source -> alert count so the busiest ones can be ranked.
    rule_stats = defaultdict(lambda: {"total": 0, "true_positive": 0, "false_positive": 0,
                                       "pending": 0, "sources": defaultdict(int), "users": set()})
    for alert in alerts:
        stats = rule_stats[alert["rule_name"]]
        stats["total"] += 1
        disp = (alert["disposition"] or "").strip().lower()
        if disp in tp_labels:
            stats["true_positive"] += 1
        elif disp in fp_labels:
            stats["false_positive"] += 1
        else:
            stats["pending"] += 1
        if alert["source"]:
            stats["sources"][alert["source"]] += 1
        if alert["user"]:
            stats["users"].add(alert["user"])

    metrics = []
    for rule, stats in rule_stats.items():
        reviewed = stats["true_positive"] + stats["false_positive"]
        fp_rate = stats["false_positive"] / reviewed if reviewed > 0 else 0.0
        precision = stats["true_positive"] / reviewed if reviewed > 0 else 0.0
        source_counts = stats["sources"]
        # Rank by frequency, then by name, so the list is meaningful and
        # identical from run to run.
        ranked_sources = sorted(source_counts, key=lambda s: (-source_counts[s], s))
        metrics.append({
            "rule_name": rule,
            "total_alerts": stats["total"],
            "true_positives": stats["true_positive"],
            "false_positives": stats["false_positive"],
            "pending": stats["pending"],
            "fp_rate": round(fp_rate, 4),
            "precision": round(precision, 4),
            "unique_sources": len(source_counts),
            "unique_users": len(stats["users"]),
            "top_sources": ranked_sources[:10],
        })
    return sorted(metrics, key=lambda x: x["fp_rate"], reverse=True)


def identify_whitelist_candidates(alerts, fp_threshold=0.8, min_observations=3):
    """Identify entities that consistently trigger false positives for a rule.

    The entity for an alert is its ``source`` or, when the source is empty,
    its ``user``; alerts with neither are skipped.  Only alerts whose
    disposition is a recognized true-positive or false-positive label are
    counted (matching is case-insensitive and ignores surrounding whitespace;
    a ``None`` disposition is ignored).

    Args:
        alerts: Iterable of dicts with the keys ``rule_name``, ``disposition``,
            ``source`` and ``user``.
        fp_threshold: Minimum FP share (FP / (TP + FP)) for an entity to be
            reported.
        min_observations: Minimum number of reviewed alerts (TP + FP) an
            entity needs for a rule before it is considered.

    Returns:
        A list of candidate dicts (``rule_name``, ``entity``, ``fp_count``,
        ``tp_count``, ``fp_ratio``, ``recommendation``) sorted by ``fp_count``
        descending.
    """
    tp_labels = ("true_positive", "tp", "confirmed", "escalated")
    fp_labels = ("false_positive", "fp", "benign", "closed_fp")

    # rule name -> entity -> reviewed-disposition tallies
    tallies = defaultdict(lambda: defaultdict(lambda: {"tp": 0, "fp": 0}))
    for alert in alerts:
        entity = alert["source"] or alert["user"]
        if not entity:
            continue
        disp = (alert["disposition"] or "").strip().lower()
        if disp in fp_labels:
            tallies[alert["rule_name"]][entity]["fp"] += 1
        elif disp in tp_labels:
            tallies[alert["rule_name"]][entity]["tp"] += 1

    candidates = []
    for rule, entities in tallies.items():
        for entity, counts in entities.items():
            # An entity only exists here after at least one TP/FP, so total >= 1.
            total = counts["tp"] + counts["fp"]
            fp_ratio = counts["fp"] / total
            if total < min_observations or fp_ratio < fp_threshold:
                continue
            candidates.append({
                "rule_name": rule,
                "entity": entity,
                "fp_count": counts["fp"],
                "tp_count": counts["tp"],
                "fp_ratio": round(fp_ratio, 4),
                # Any confirmed true positive means a blanket exclusion could hide real activity.
                "recommendation": "Add to whitelist" if counts["tp"] == 0 else "Review before whitelisting",
            })
    return sorted(candidates, key=lambda x: x["fp_count"], reverse=True)


def compute_threshold_recommendation(alerts, rule_name, field="total"):
    """Derive mean/stdev-based hourly alert thresholds for a single rule.

    Alerts belonging to ``rule_name`` are grouped into ``YYYY-MM-DD HH``
    buckets taken from their ISO-format timestamp (a ``Z`` is read as
    ``+00:00``).  Alerts whose timestamp cannot be parsed are left out.  Only
    hours holding at least one parsed alert contribute, and the spread is the
    population standard deviation (divided by the number of buckets).

    Args:
        alerts: Iterable of dicts with the keys ``rule_name`` and ``timestamp``.
        rule_name: Name of the rule to analyze.
        field: Accepted but not used by this function.

    Returns:
        A dict with the rounded hourly mean, standard deviation, the
        mean + 2 and mean + 3 standard deviation thresholds and the number of
        hour buckets, or ``None`` when no alert of the rule could be bucketed.
    """
    per_hour = {}
    for entry in (a for a in alerts if a["rule_name"] == rule_name):
        try:
            stamp = datetime.fromisoformat(entry["timestamp"].replace("Z", "+00:00"))
            bucket = stamp.strftime("%Y-%m-%d %H")
        except (ValueError, AttributeError):
            # Unparseable or non-string timestamp: leave this alert out.
            continue
        per_hour[bucket] = per_hour.get(bucket, 0) + 1

    if len(per_hour) == 0:
        return None

    counts = list(per_hour.values())
    n_hours = len(counts)
    mean = sum(counts) / n_hours
    squared_deviations = [(c - mean) ** 2 for c in counts]
    stdev = math.sqrt(sum(squared_deviations) / n_hours)

    recommendation = {"rule_name": rule_name}
    recommendation["hourly_mean"] = round(mean, 2)
    recommendation["hourly_stdev"] = round(stdev, 2)
    recommendation["suggested_threshold_2sd"] = round(mean + 2 * stdev, 0)
    recommendation["suggested_threshold_3sd"] = round(mean + 3 * stdev, 0)
    recommendation["sample_hours"] = n_hours
    return recommendation


def generate_tuning_report(metrics, whitelist, thresholds):
    """Generate the tuning report with summary figures and recommendations.

    Args:
        metrics: List of per-rule metric dicts; the keys ``fp_rate``,
            ``total_alerts`` and ``false_positives`` are read here.
        whitelist: List of whitelist-candidate dicts; ``fp_count`` is read.
        thresholds: Threshold recommendations, passed through unchanged.

    Returns:
        A JSON-serializable dict with ``analysis_time`` (UTC, ISO format with
        a trailing ``Z``), a ``summary``, the high (> 0.7) and medium
        (0.3 < rate <= 0.7) FP-rate rules, the first 20 whitelist
        recommendations, the thresholds and a prioritized ``actions`` list.
    """
    # Local import: the module only imports ``datetime`` from the datetime package.
    from datetime import timezone

    high_fp_rules = [m for m in metrics if m["fp_rate"] > 0.7]
    medium_fp_rules = [m for m in metrics if 0.3 < m["fp_rate"] <= 0.7]
    total_alerts = sum(m["total_alerts"] for m in metrics)
    total_fp = sum(m["false_positives"] for m in metrics)
    projected_reduction = sum(w["fp_count"] for w in whitelist)

    # datetime.utcnow() is deprecated since Python 3.12.  Take an aware UTC
    # time and drop the tzinfo so the text keeps the "...Z" form (no "+00:00").
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)

    return {
        "analysis_time": now_utc.isoformat() + "Z",
        "summary": {
            "total_rules_analyzed": len(metrics),
            "total_alerts": total_alerts,
            "total_false_positives": total_fp,
            # Guard against division by zero when there are no alerts at all.
            "overall_fp_rate": round(total_fp / total_alerts, 4) if total_alerts else 0,
            "high_fp_rules": len(high_fp_rules),
            "whitelist_candidates": len(whitelist),
            "projected_alert_reduction": projected_reduction,
        },
        "high_fp_rules": high_fp_rules,
        "medium_fp_rules": medium_fp_rules,
        "whitelist_recommendations": whitelist[:20],
        "threshold_recommendations": thresholds,
        "actions": [
            {"priority": "high", "action": f"Disable or rewrite {len(high_fp_rules)} rules with FP rate > 70%"},
            {"priority": "medium", "action": f"Add {len(whitelist)} whitelist entries to reduce {projected_reduction} FP alerts"},
            {"priority": "low", "action": f"Review {len(medium_fp_rules)} rules with FP rate 30-70%"},
        ],
    }


def main():
    """Command-line entry point: load alerts, analyze them, write the report.

    Reads the CSV given by ``--alert-csv``, computes per-rule metrics and
    whitelist candidates, derives hourly thresholds for the first
    ``--top-rules`` rules of the metrics list, and writes the resulting
    report as indented JSON to ``--output``.  Progress is printed to stdout.
    """
    cli = argparse.ArgumentParser(description="SIEM Use Case Tuning Agent")
    cli.add_argument("--alert-csv", required=True, help="CSV export of SIEM alerts with disposition data")
    cli.add_argument("--fp-threshold", type=float, default=0.8, help="FP ratio threshold for whitelist candidates")
    cli.add_argument("--top-rules", type=int, default=5, help="Number of top rules to compute thresholds for")
    cli.add_argument("--output", default="tuning_report.json", help="Output report path")
    opts = cli.parse_args()

    alerts = load_alert_data(opts.alert_csv)
    print(f"[+] Loaded {len(alerts)} alerts from {opts.alert_csv}")

    metrics = calculate_rule_metrics(alerts)
    print(f"[+] Analyzed {len(metrics)} unique detection rules")

    whitelist = identify_whitelist_candidates(alerts, opts.fp_threshold)
    print(f"[+] Found {len(whitelist)} whitelist candidates (FP ratio >= {opts.fp_threshold})")

    # Rules for which no timestamp could be bucketed yield None and are dropped.
    recommendations = (
        compute_threshold_recommendation(alerts, entry["rule_name"])
        for entry in metrics[:opts.top_rules]
    )
    thresholds = [rec for rec in recommendations if rec]

    report = generate_tuning_report(metrics, whitelist, thresholds)
    summary = report["summary"]
    with open(opts.output, "w") as out:
        out.write(json.dumps(report, indent=2))
    print(f"[+] Tuning report saved to {opts.output}")
    print(f"[+] Overall FP rate: {summary['overall_fp_rate']:.1%}")
    print(f"[+] Projected alert reduction from whitelisting: {summary['projected_alert_reduction']}")

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()