# Anthropic-Cybersecurity-Skills/skills/building-detection-rule-with-splunk-spl/scripts/process.py

#!/usr/bin/env python3
"""
Splunk SPL Detection Rule Builder and Validator

Generates, validates, and manages Splunk SPL detection rules
for SOC correlation searches. Supports MITRE ATT&CK mapping
and rule quality scoring.
"""

import hashlib
import json
import re
from datetime import datetime, timezone
from typing import Optional


# Catalogue of the MITRE ATT&CK techniques this module knows about.
# Maps technique ID -> {"name": short technique name, "tactic": one tactic}.
# Only a single tactic is recorded per technique, even where ATT&CK lists
# several. Rule validation and coverage reporting both look IDs up here.
MITRE_TECHNIQUES = {
    "T1110.001": {"name": "Password Guessing", "tactic": "Credential Access"},
    "T1110.003": {"name": "Password Spraying", "tactic": "Credential Access"},
    "T1059.001": {"name": "PowerShell", "tactic": "Execution"},
    "T1059.003": {"name": "Windows Command Shell", "tactic": "Execution"},
    "T1021.002": {"name": "SMB/Windows Admin Shares", "tactic": "Lateral Movement"},
    "T1021.001": {"name": "Remote Desktop Protocol", "tactic": "Lateral Movement"},
    # T1048 is "Exfiltration Over Alternative Protocol"; "Exfiltration Over
    # C2 Channel" is the name of a different technique (T1041).
    "T1048": {"name": "Exfiltration Over Alternative Protocol", "tactic": "Exfiltration"},
    "T1048.003": {"name": "Exfiltration Over Unencrypted Protocol", "tactic": "Exfiltration"},
    "T1053.005": {"name": "Scheduled Task", "tactic": "Persistence"},
    "T1003.001": {"name": "LSASS Memory", "tactic": "Credential Access"},
    "T1078": {"name": "Valid Accounts", "tactic": "Defense Evasion"},
    "T1078.002": {"name": "Domain Accounts", "tactic": "Defense Evasion"},
    "T1547.001": {"name": "Registry Run Keys", "tactic": "Persistence"},
    "T1055": {"name": "Process Injection", "tactic": "Defense Evasion"},
    "T1071.001": {"name": "Web Protocols", "tactic": "Command and Control"},
    "T1036.005": {"name": "Match Legitimate Name", "tactic": "Defense Evasion"},
    "T1027": {"name": "Obfuscated Files or Information", "tactic": "Defense Evasion"},
    "T1218.011": {"name": "Rundll32", "tactic": "Defense Evasion"},
    "T1543.003": {"name": "Windows Service", "tactic": "Persistence"},
    "T1105": {"name": "Ingress Tool Transfer", "tactic": "Command and Control"},
}


class SplunkDetectionRule:
    """A Splunk SPL detection rule with metadata, validation and export helpers.

    Attributes set by the constructor:
        name, description, spl_query, severity, schedule_cron, time_window:
            stored exactly as given.
        mitre_techniques, data_sources, false_positive_notes:
            private list copies of the arguments (``None`` becomes ``[]``).
        created: timezone-aware ISO-8601 UTC timestamp taken at construction.
        rule_id: ``SPL-`` followed by 12 upper-case hex digits derived from
            the SHA-256 of ``"<name>:<spl_query>"``.
    """

    # Accepted severity labels and the integer written to ``alert.severity``.
    _SEVERITY_LEVELS = {"informational": 1, "low": 2, "medium": 3, "high": 4, "critical": 5}

    # Whole-word, case-insensitive keyword matchers. Plain substring tests
    # would treat `somewhere`, `mystats` or `inputlookup` as the keyword.
    _AGGREGATION_RE = re.compile(
        r"\b(?:stats|eventstats|streamstats|tstats|chart|timechart)\b", re.IGNORECASE
    )
    _WHERE_RE = re.compile(r"\bwhere\b", re.IGNORECASE)
    _LOOKUP_RE = re.compile(r"\blookup\b", re.IGNORECASE)
    _DESCRIPTION_RE = re.compile(r"\beval\s+(?:rule_)?description\b", re.IGNORECASE)
    _ACCELERATED_RE = re.compile(r"\bdatamodel\s*=|\btstats\b", re.IGNORECASE)

    def __init__(
        self,
        name: str,
        description: str,
        spl_query: str,
        mitre_techniques: list,
        severity: str = "medium",
        schedule_cron: str = "*/15 * * * *",
        time_window: str = "-20m",
        data_sources: Optional[list] = None,
        false_positive_notes: Optional[list] = None,
    ):
        """Store the rule definition and derive ``created`` and ``rule_id``.

        Args:
            name: Rule title; also used as the savedsearches.conf stanza name.
            description: Human-readable description of what the rule detects.
            spl_query: The SPL search text.
            mitre_techniques: MITRE ATT&CK technique IDs. ``None`` is treated
                as an empty list and a single string as a one-element list.
            severity: Severity label; checked by :meth:`validate`.
            schedule_cron: Cron expression written to ``cron_schedule``.
            time_window: Value written to ``dispatch.earliest_time``.
            data_sources: Optional list of data source names.
            false_positive_notes: Optional list of known false-positive notes.
        """
        self.name = name
        self.description = description
        self.spl_query = spl_query
        # A bare string would otherwise be iterated character by character.
        if isinstance(mitre_techniques, str):
            mitre_techniques = [mitre_techniques]
        # Copy the list arguments so later caller-side mutation cannot
        # silently change this rule.
        self.mitre_techniques = list(mitre_techniques or [])
        self.severity = severity
        self.schedule_cron = schedule_cron
        self.time_window = time_window
        self.data_sources = list(data_sources or [])
        self.false_positive_notes = list(false_positive_notes or [])
        # datetime.utcnow() is deprecated and returns a naive value; use an
        # aware UTC timestamp so the ISO string carries its offset.
        self.created = datetime.now(timezone.utc).isoformat()
        self.rule_id = self._generate_rule_id()

    def _generate_rule_id(self) -> str:
        """Return a deterministic ID derived from the rule name and query."""
        hash_input = f"{self.name}:{self.spl_query}"
        return f"SPL-{hashlib.sha256(hash_input.encode()).hexdigest()[:12].upper()}"

    def validate(self) -> dict:
        """Score the rule against a set of quality heuristics.

        Starts from 100 and subtracts points per finding (adds 5 for
        ``tstats``/``datamodel=`` usage). Keyword checks match whole words,
        case-insensitively.

        Returns:
            dict with keys ``rule_id``, ``rule_name``, ``valid``,
            ``quality_score`` (clamped to 0..100), ``issues`` (list of
            ``"LEVEL: message"`` strings) and ``issue_count``. ``valid`` is
            True only when the score is at least 60 and no ERROR-level issue
            was recorded.
        """
        issues = []
        score = 100
        query = self.spl_query

        # Check for missing time constraint
        if "earliest=" not in query and "span=" not in query:
            issues.append("WARNING: No time constraint in query - may scan too much data")
            score -= 10

        # Check for wildcard-heavy searches (penalty capped at 20 points)
        wildcard_count = query.count("*")
        if wildcard_count > 5:
            issues.append(f"WARNING: {wildcard_count} wildcards detected - may impact performance")
            score -= 5 * min(wildcard_count - 5, 4)

        # Check for aggregation
        if not self._AGGREGATION_RE.search(query):
            issues.append("WARNING: No aggregation command - rule may generate excessive alerts")
            score -= 15

        # Check for threshold
        if not self._WHERE_RE.search(query):
            issues.append("WARNING: No where clause - rule has no threshold filtering")
            score -= 15

        # Check for enrichment
        if not self._LOOKUP_RE.search(query):
            issues.append("INFO: No lookup enrichment - consider adding asset/identity context")
            score -= 5

        # Check MITRE mapping
        if not self.mitre_techniques:
            issues.append("WARNING: No MITRE ATT&CK technique mapped")
            score -= 10

        for tech_id in self.mitre_techniques:
            if tech_id not in MITRE_TECHNIQUES:
                issues.append(f"WARNING: Unknown MITRE technique ID: {tech_id}")
                score -= 5

        # Check severity is valid
        valid_severities = list(self._SEVERITY_LEVELS)
        if self.severity not in valid_severities:
            issues.append(f"ERROR: Invalid severity '{self.severity}' - must be one of {valid_severities}")
            score -= 20

        # Check for eval description
        if not self._DESCRIPTION_RE.search(query):
            issues.append("INFO: No description field in output - analysts will lack context")
            score -= 5

        # Bonus for using CIM-accelerated searches
        if self._ACCELERATED_RE.search(query):
            score += 5

        # An ERROR-level finding disqualifies the rule regardless of score.
        has_error = any(issue.startswith("ERROR") for issue in issues)

        return {
            "rule_id": self.rule_id,
            "rule_name": self.name,
            "valid": score >= 60 and not has_error,
            "quality_score": max(0, min(100, score)),
            "issues": issues,
            "issue_count": len(issues),
        }

    @staticmethod
    def _conf_value(value) -> str:
        """Return *value* as a .conf value, continuing extra lines with a backslash."""
        # A raw newline would end the setting and turn the rest of the value
        # into bogus key lines; .conf files continue a value by ending the
        # line with a backslash.
        return "\\\n".join(str(value).splitlines())

    def to_splunk_savedsearch_conf(self) -> str:
        """Generate Splunk savedsearches.conf stanza for the rule.

        Multi-line queries and descriptions are emitted with backslash line
        continuations; single-line values are written unchanged. Line breaks
        in the rule name are collapsed to spaces because a stanza header
        must fit on one line.
        """
        title = " ".join(str(self.name).splitlines())
        search = self._conf_value(self.spl_query)
        description = self._conf_value(self.description)
        mitre_str = ", ".join(self.mitre_techniques)
        stanza = f"""[{title}]
search = {search}
description = {description}
dispatch.earliest_time = {self.time_window}
dispatch.latest_time = now
cron_schedule = {self.schedule_cron}
is_scheduled = 1
enableSched = 1
alert.severity = {self._severity_to_int()}
alert.suppress = 1
alert.suppress.period = 1h
alert.suppress.fields = src_ip
action.notable = 1
action.notable.param.rule_title = {title}
action.notable.param.rule_description = {description}
action.notable.param.severity = {self.severity}
action.notable.param.security_domain = threat
action.notable.param.drilldown_name = View triggering events
action.notable.param.drilldown_search = {search}
action.notable.param.mitre_attack = {mitre_str}
"""
        return stanza

    def _severity_to_int(self) -> int:
        """Map the severity label to its integer level (3 when unrecognised)."""
        return self._SEVERITY_LEVELS.get(self.severity, 3)

    def to_json(self) -> str:
        """Serialise the rule definition and metadata as indented JSON."""
        return json.dumps(
            {
                "rule_id": self.rule_id,
                "name": self.name,
                "description": self.description,
                "spl_query": self.spl_query,
                "mitre_techniques": self.mitre_techniques,
                "severity": self.severity,
                "schedule_cron": self.schedule_cron,
                "time_window": self.time_window,
                "data_sources": self.data_sources,
                "false_positive_notes": self.false_positive_notes,
                "created": self.created,
            },
            indent=2,
        )


class DetectionRuleLibrary:
    """Manages a collection of Splunk detection rules.

    Rules are kept in insertion order in ``self.rules``. Each rule is expected
    to provide ``name``, ``mitre_techniques``, ``validate()`` and
    ``to_splunk_savedsearch_conf()``.
    """

    def __init__(self):
        """Create an empty library."""
        self.rules = []

    def add_rule(self, rule: "SplunkDetectionRule") -> None:
        """Append *rule* to the library."""
        self.rules.append(rule)

    def validate_all(self) -> dict:
        """Validate every rule and summarise the results.

        Returns:
            dict with ``total_rules``, ``valid_rules``, ``invalid_rules`` and
            ``details`` (the per-rule ``validate()`` results, in rule order).
        """
        details = [rule.validate() for rule in self.rules]
        valid_count = sum(1 for detail in details if detail["valid"])
        return {
            "total_rules": len(self.rules),
            "valid_rules": valid_count,
            "invalid_rules": len(details) - valid_count,
            "details": details,
        }

    def get_mitre_coverage(self) -> dict:
        """Report which MITRE ATT&CK techniques the library's rules map to.

        Returns:
            dict with:
              ``techniques_covered``: number of mapped techniques that exist
                  in ``MITRE_TECHNIQUES``.
              ``total_known_techniques``: size of ``MITRE_TECHNIQUES``.
              ``coverage_percentage``: covered / known * 100, rounded to one
                  decimal (0.0 when the catalogue is empty).
              ``coverage_map``: every mapped technique ID (known or not) ->
                  ``{"technique", "tactic", "rules"}``; unknown IDs are
                  labelled ``"Unknown"``.
              ``unknown_techniques``: mapped IDs missing from the catalogue.
        """
        coverage = {}
        for rule in self.rules:
            # dict.fromkeys de-duplicates while keeping order, so a technique
            # listed twice on one rule does not repeat the rule name.
            for tech_id in dict.fromkeys(rule.mitre_techniques):
                if tech_id not in coverage:
                    known = MITRE_TECHNIQUES.get(tech_id, {})
                    coverage[tech_id] = {
                        "technique": known.get("name", "Unknown"),
                        "tactic": known.get("tactic", "Unknown"),
                        "rules": [],
                    }
                coverage[tech_id]["rules"].append(rule.name)

        # Only catalogue techniques count towards coverage; counting unknown
        # IDs against the catalogue size could push the ratio above 100%.
        unknown = [tech_id for tech_id in coverage if tech_id not in MITRE_TECHNIQUES]
        covered = len(coverage) - len(unknown)
        total = len(MITRE_TECHNIQUES)
        percentage = round(covered / total * 100, 1) if total else 0.0
        return {
            "techniques_covered": covered,
            "total_known_techniques": total,
            "coverage_percentage": percentage,
            "coverage_map": coverage,
            "unknown_techniques": unknown,
        }

    def export_savedsearches_conf(self) -> str:
        """Render all rules as one savedsearches.conf document.

        The output starts with a three-line comment header (including a UTC
        generation timestamp) followed by each rule's stanza and a blank line.
        """
        parts = [
            "# Auto-generated Splunk savedsearches.conf\n",
            # Aware UTC timestamp; datetime.utcnow() is deprecated.
            f"# Generated: {datetime.now(timezone.utc).isoformat()}\n",
            f"# Total Rules: {len(self.rules)}\n\n",
        ]
        parts.extend(rule.to_splunk_savedsearch_conf() + "\n" for rule in self.rules)
        return "".join(parts)


def build_sample_detection_library() -> DetectionRuleLibrary:
    """Return a library pre-populated with three example SOC detection rules.

    The rules are added in this order: brute-force logins, suspicious
    PowerShell execution, and lateral movement across multiple hosts.
    """
    # --- SPL for each sample rule -------------------------------------
    brute_force_spl = (
        '| tstats summariesonly=true count from datamodel=Authentication '
        'where Authentication.action=failure by Authentication.src, Authentication.user, _time span=5m '
        '| rename "Authentication.*" as * '
        '| stats count as total_failures dc(user) as unique_users values(user) as targeted_users by src '
        '| where total_failures > 20 AND unique_users > 3 '
        '| lookup asset_lookup ip as src OUTPUT priority as asset_priority '
        '| eval severity=case(unique_users > 10, "critical", unique_users > 5, "high", true(), "medium") '
        '| eval description="Brute force detected from ".src." targeting ".unique_users." accounts"'
    )
    powershell_spl = (
        'index=wineventlog sourcetype=WinEventLog:Security EventCode=4104 '
        '| where match(ScriptBlockText, "(?i)(encodedcommand|invoke-expression|iex|downloadstring|frombase64string|net\\.webclient|invoke-mimikatz)") '
        '| stats count values(ScriptBlockText) as commands by Computer, UserName '
        '| where count > 0 '
        '| lookup identity_lookup identity as UserName OUTPUT department, manager '
        '| eval severity="high" '
        '| eval description="Suspicious PowerShell on ".Computer." by ".UserName'
    )
    lateral_movement_spl = (
        '| tstats summariesonly=true dc(Authentication.dest) as unique_hosts '
        'from datamodel=Authentication where Authentication.action=success Authentication.Logon_Type=3 '
        'by Authentication.src, Authentication.user, _time span=1h '
        '| rename "Authentication.*" as * '
        '| where unique_hosts > 5 '
        '| lookup asset_lookup ip as src OUTPUT asset_name, asset_category '
        '| eval severity=case(unique_hosts > 20, "critical", unique_hosts > 10, "high", true(), "medium") '
        '| eval description=user." accessed ".unique_hosts." hosts from ".src." in 1 hour"'
    )

    # --- Rule objects, built in the order they are registered ---------
    sample_rules = [
        SplunkDetectionRule(
            name="Brute Force - Multiple Failed Logins",
            description="Detects brute force attacks with multiple failed login attempts from a single source",
            spl_query=brute_force_spl,
            mitre_techniques=["T1110.001"],
            severity="high",
            # This rule overrides the default schedule and look-back window.
            schedule_cron="*/5 * * * *",
            time_window="-10m",
            data_sources=["Windows Security Event Log", "Linux Auth Log"],
            false_positive_notes=["Service accounts with expired passwords", "Misconfigured applications"],
        ),
        SplunkDetectionRule(
            name="Suspicious PowerShell Execution",
            description="Detects encoded or obfuscated PowerShell commands indicating potential malicious activity",
            spl_query=powershell_spl,
            mitre_techniques=["T1059.001", "T1027"],
            severity="high",
            data_sources=["Windows PowerShell Script Block Logging"],
            false_positive_notes=["IT automation scripts using encoded commands", "SCCM deployment scripts"],
        ),
        SplunkDetectionRule(
            name="Lateral Movement - Multiple Host Access",
            description="Detects a user or source IP accessing an unusual number of hosts via network logon",
            spl_query=lateral_movement_spl,
            mitre_techniques=["T1021.002", "T1078.002"],
            severity="high",
            data_sources=["Windows Security Event Log"],
            false_positive_notes=["Vulnerability scanners", "IT management tools", "Software deployment systems"],
        ),
    ]

    sample_library = DetectionRuleLibrary()
    for sample_rule in sample_rules:
        sample_library.add_rule(sample_rule)
    return sample_library


if __name__ == "__main__":
    # Demo driver: build the sample library, then print the validation
    # report, the MITRE coverage summary and the generated conf file.
    divider = "=" * 70
    rule_library = build_sample_detection_library()

    print(divider)
    print("SPLUNK SPL DETECTION RULE LIBRARY")
    print(divider)

    # Validation summary followed by one section per rule.
    summary = rule_library.validate_all()
    print(f"\nTotal Rules: {summary['total_rules']}")
    print(f"Valid Rules: {summary['valid_rules']}")
    print(f"Invalid Rules: {summary['invalid_rules']}")

    for result in summary["details"]:
        print(f"\n--- {result['rule_name']} ---")
        print(f"  Rule ID: {result['rule_id']}")
        print(f"  Quality Score: {result['quality_score']}/100")
        print(f"  Valid: {result['valid']}")
        for finding in result["issues"]:
            print(f"  {finding}")

    # MITRE ATT&CK coverage: headline ratio, then one line per technique.
    mitre = rule_library.get_mitre_coverage()
    covered = mitre["techniques_covered"]
    known = mitre["total_known_techniques"]
    percent = mitre["coverage_percentage"]
    print(f"\nMITRE ATT&CK Coverage: {covered}/{known} ({percent}%)")
    for technique_id, entry in mitre["coverage_map"].items():
        rule_names = ", ".join(entry["rules"])
        print(f"  {technique_id} ({entry['technique']}): {rule_names}")

    # Render the conf text first, then print it under its own banner.
    conf_text = rule_library.export_savedsearches_conf()
    print("\n" + divider)
    print("GENERATED savedsearches.conf")
    print(divider)
    print(conf_text)