# Anthropic-Cybersecurity-Skills/skills/implementing-network-traffic-baselining/scripts/agent.py

#!/usr/bin/env python3
"""Network traffic baselining agent using pandas for NetFlow/IPFIX statistical analysis."""

import json
import argparse
from datetime import datetime

import pandas as pd


def load_netflow_csv(filepath):
    """Load NetFlow/IPFIX records from a CSV export.

    The file may use the canonical column names (``timestamp``, ``src_ip``,
    ``dst_ip``, ``src_port``, ``dst_port``, ``protocol``, ``bytes``,
    ``packets``) or the short aliases ``ts``, ``sa``, ``da``, ``sp``, ``dp``,
    ``pr``, ``ibyt``, ``ipkt``, which are renamed to the canonical names.

    Args:
        filepath: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        DataFrame of flow records with ``timestamp`` converted to datetimes.

    Raises:
        ValueError: If no timestamp column is present under either name, or
            if the timestamp values cannot be parsed.
    """
    required = {"timestamp", "src_ip", "dst_ip", "src_port", "dst_port", "protocol", "bytes", "packets"}
    alt_map = {"ts": "timestamp", "sa": "src_ip", "da": "dst_ip", "sp": "src_port",
               "dp": "dst_port", "pr": "protocol", "ibyt": "bytes", "ipkt": "packets"}

    # Read without parse_dates: asking read_csv to parse a "timestamp" column
    # that only exists under its alias raises before the rename can happen.
    df = pd.read_csv(filepath)

    missing = required - set(df.columns)
    if missing:
        # Only rename an alias when its canonical column is actually absent,
        # so a file carrying both names never ends up with duplicate columns.
        renames = {
            alias: canonical
            for alias, canonical in alt_map.items()
            if alias in df.columns and canonical in missing
        }
        df = df.rename(columns=renames)
        missing = required - set(df.columns)

    if "timestamp" in missing:
        raise ValueError(f"No 'timestamp' (or 'ts') column found in {filepath}")
    if missing:
        # Best effort: analyses that need these columns will fail later, but
        # the ones that do not can still run.
        print(f"[!] Missing expected columns: {sorted(missing)}")

    df["timestamp"] = pd.to_datetime(df["timestamp"])
    print(f"[+] Loaded {len(df)} flow records from {filepath}")
    return df


def compute_hourly_baseline(df):
    """Compute an hour-of-day traffic volume baseline.

    Flows are grouped by the hour component of ``timestamp``. The input
    DataFrame is not modified.

    Args:
        df: Flow records with a datetime ``timestamp`` column and numeric
            ``bytes`` and ``packets`` columns.

    Returns:
        A list of dicts, one per hour present in the data, with keys
        ``hour``, ``total_bytes``, ``total_packets``, ``flow_count``,
        ``bytes_mean`` (total bytes for that hour divided by the number of
        distinct calendar dates in the data) and ``bytes_std`` (sample
        standard deviation of per-flow bytes within that hour; 0 when the
        hour has a single flow).
    """
    # Group by a derived Series instead of writing an "hour" column into the
    # caller's DataFrame.
    hour_of_day = df["timestamp"].dt.hour.rename("hour")
    by_hour = df.groupby(hour_of_day)

    hourly = by_hour.agg(
        total_bytes=("bytes", "sum"),
        total_packets=("packets", "sum"),
        flow_count=("bytes", "count"),
    ).reset_index()

    observed_days = max(df["timestamp"].dt.date.nunique(), 1)
    hourly["bytes_mean"] = hourly["total_bytes"] / observed_days

    # Both group-bys sort by hour, so positional assignment lines up. The std
    # of a single flow is NaN, which json.dump would emit as an invalid bare
    # NaN token; use 0 like the per-host baselines do.
    hourly["bytes_std"] = by_hour["bytes"].std().fillna(0).to_numpy()
    return hourly.to_dict(orient="records")


def compute_host_baselines(df):
    """Summarise traffic per source IP.

    Returns a DataFrame with one row per ``src_ip`` holding byte and packet
    totals, the flow count, the number of distinct destination IPs and ports,
    and the mean and sample standard deviation of bytes per flow. Missing
    statistics (for example the std of a single flow) are replaced by 0.
    """
    # Output column name -> (input column, aggregation).
    aggregations = {
        "total_bytes": ("bytes", "sum"),
        "total_packets": ("packets", "sum"),
        "flow_count": ("bytes", "count"),
        "unique_dst_ips": ("dst_ip", "nunique"),
        "unique_dst_ports": ("dst_port", "nunique"),
        "mean_bytes_per_flow": ("bytes", "mean"),
        "std_bytes_per_flow": ("bytes", "std"),
    }
    per_host = df.groupby("src_ip").agg(**aggregations)
    return per_host.reset_index().fillna(0)


def compute_protocol_baseline(df):
    """Compute the protocol distribution baseline.

    Protocol numbers 6, 17 and 1 are labelled ``TCP``, ``UDP`` and ``ICMP``;
    any other value is labelled with its string form. Numeric strings such as
    ``"6"`` (which appear when the column mixes names and numbers) are treated
    as the corresponding number. The input DataFrame is not modified.

    Args:
        df: Flow records with ``protocol`` and ``bytes`` columns.

    Returns:
        A list of dicts, one per protocol label, with keys ``proto_name``,
        ``flow_count``, ``total_bytes`` and ``percentage`` (share of flows,
        rounded to two decimals).
    """
    proto_map = {6: "TCP", 17: "UDP", 1: "ICMP"}

    def label(value):
        # A column mixing names and numbers is read as strings, so "6" would
        # otherwise miss the integer-keyed lookup.
        if isinstance(value, str) and value.strip().isdigit():
            value = int(value.strip())
        return proto_map.get(value, str(value))

    # Group by a derived Series rather than adding a "proto_name" column to
    # the caller's DataFrame.
    proto_names = df["protocol"].map(label).rename("proto_name")
    proto_stats = df.groupby(proto_names).agg(
        flow_count=("bytes", "count"),
        total_bytes=("bytes", "sum"),
    ).reset_index()
    total = proto_stats["flow_count"].sum()
    proto_stats["percentage"] = (proto_stats["flow_count"] / total * 100).round(2)
    return proto_stats.to_dict(orient="records")


def detect_zscore_anomalies(df, host_baselines, threshold=3.0):
    """Detect anomalous hosts using a z-score on total bytes transferred.

    Each host's ``total_bytes`` is standardised against the mean and sample
    standard deviation across all hosts. Hosts whose absolute z-score
    (rounded to four decimals) reaches ``threshold`` are reported.
    ``host_baselines`` is not modified.

    Args:
        df: Raw flow records. Not used by this detector; kept so all
            detectors share the same call shape.
        host_baselines: DataFrame with ``src_ip``, ``total_bytes``,
            ``flow_count`` and ``unique_dst_ips`` columns.
        threshold: Minimum absolute z-score that raises an alert.

    Returns:
        A list of alert dicts. Severity is ``critical`` when the absolute
        z-score is at least 5.0, otherwise ``high``. The list is empty when
        the standard deviation is zero or undefined.
    """
    totals = host_baselines["total_bytes"]
    mean_bytes = totals.mean()
    std_bytes = totals.std()
    # The sample std is NaN with fewer than two hosts and 0 when every host
    # moved the same volume; in both cases no z-score can be computed.
    if pd.isna(std_bytes) or std_bytes == 0:
        return []

    # Keep the scores in a local Series so the caller's frame is untouched.
    zscores = ((totals - mean_bytes) / std_bytes).round(4)
    flagged = host_baselines.loc[zscores.abs() >= threshold].assign(zscore=zscores)

    return [
        {
            "detection": "Z-Score Traffic Anomaly",
            "src_ip": row["src_ip"],
            "total_bytes": int(row["total_bytes"]),
            "zscore": float(row["zscore"]),
            "threshold": threshold,
            "flow_count": int(row["flow_count"]),
            "unique_destinations": int(row["unique_dst_ips"]),
            "severity": "critical" if abs(row["zscore"]) >= 5.0 else "high",
        }
        for row in flagged.to_dict(orient="records")
    ]


def detect_iqr_anomalies(df, host_baselines):
    """Flag hosts whose mean bytes-per-flow falls outside the IQR fences.

    The fences sit 1.5 interquartile ranges below the first quartile and
    above the third quartile of ``mean_bytes_per_flow`` across all hosts.
    ``df`` is accepted but not used. Returns a list of medium-severity alert
    dicts, one per outlying host.
    """
    per_flow = host_baselines["mean_bytes_per_flow"]
    first_quartile = per_flow.quantile(0.25)
    third_quartile = per_flow.quantile(0.75)
    spread = third_quartile - first_quartile
    lower = first_quartile - 1.5 * spread
    upper = third_quartile + 1.5 * spread

    # Strictly outside the fences; values sitting on a fence are not flagged.
    is_outlier = (per_flow < lower) | (per_flow > upper)
    flagged = host_baselines[is_outlier]

    return [
        {
            "detection": "IQR Bytes-Per-Flow Outlier",
            "src_ip": host,
            "mean_bytes_per_flow": round(float(mean_bytes), 2),
            "iqr_lower": round(float(lower), 2),
            "iqr_upper": round(float(upper), 2),
            "severity": "medium",
        }
        for host, mean_bytes in zip(flagged["src_ip"], flagged["mean_bytes_per_flow"])
    ]


def detect_port_scan_pattern(df, threshold=50):
    """Flag source IPs that contacted at least ``threshold`` distinct ports.

    Returns a list of high-severity alert dicts, ordered by source IP, each
    carrying the host and its count of unique destination ports.
    """
    ports_per_host = df.groupby("src_ip")["dst_port"].nunique()
    suspects = ports_per_host[ports_per_host >= threshold]

    findings = []
    for host, port_total in suspects.items():
        findings.append({
            "detection": "Port Scan Pattern",
            "src_ip": host,
            "unique_ports": int(port_total),
            "severity": "high",
        })
    return findings


def main():
    """Command-line entry point: build baselines, run detectors, write a report.

    Parses the CLI arguments, loads the NetFlow CSV, computes the hourly,
    per-host and protocol baselines, runs the z-score, IQR and port-scan
    detectors, writes everything to a JSON report at ``--output``, and prints
    a short summary of alert counts.
    """
    # Local import so this function does not depend on the module's import
    # block; only needed to build a timezone-aware "now".
    from datetime import timezone

    parser = argparse.ArgumentParser(description="Network Traffic Baselining Agent")
    parser.add_argument("--netflow-csv", required=True, help="Path to NetFlow/IPFIX CSV export")
    parser.add_argument("--zscore-threshold", type=float, default=3.0, help="Z-score anomaly threshold")
    parser.add_argument("--scan-threshold", type=int, default=50, help="Port scan unique ports threshold")
    parser.add_argument("--output", default="traffic_baseline_report.json", help="Output report path")
    args = parser.parse_args()

    df = load_netflow_csv(args.netflow_csv)
    hourly = compute_hourly_baseline(df)
    host_baselines = compute_host_baselines(df)
    protocol = compute_protocol_baseline(df)

    zscore_alerts = detect_zscore_anomalies(df, host_baselines, args.zscore_threshold)
    iqr_alerts = detect_iqr_anomalies(df, host_baselines)
    scan_alerts = detect_port_scan_pattern(df, args.scan_threshold)

    top_talkers = host_baselines.nlargest(10, "total_bytes")[["src_ip", "total_bytes", "flow_count"]].to_dict(orient="records")

    # datetime.utcnow() is deprecated; take an aware UTC time and drop the
    # tzinfo so the rendered string keeps its "...Z" form instead of gaining
    # a "+00:00" offset.
    analysis_time = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    report = {
        "analysis_time": analysis_time,
        "total_flows": len(df),
        "date_range": {"start": str(df["timestamp"].min()), "end": str(df["timestamp"].max())},
        "baselines": {
            "hourly_profile": hourly,
            "protocol_distribution": protocol,
            "top_talkers": top_talkers,
        },
        "anomalies": {
            "zscore_anomalies": zscore_alerts,
            "iqr_outliers": iqr_alerts,
            "port_scan_patterns": scan_alerts,
        },
        "total_anomalies": len(zscore_alerts) + len(iqr_alerts) + len(scan_alerts),
    }

    # Explicit encoding so the report does not depend on the platform locale.
    with open(args.output, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot encode natively.
        json.dump(report, f, indent=2, default=str)
    print(f"[+] Z-score anomalies: {len(zscore_alerts)}")
    print(f"[+] IQR outliers: {len(iqr_alerts)}")
    print(f"[+] Port scan patterns: {len(scan_alerts)}")
    print(f"[+] Report saved to {args.output}")


if __name__ == "__main__":
    main()