#!/usr/bin/env python3
"""SQL Injection WAF Log Analysis Agent - Detects SQLi attacks from ModSecurity and WAF logs."""

import argparse
import json
import logging
import re
from collections import defaultdict
from datetime import datetime, timezone
from urllib.parse import unquote_plus

# Configure the root logger at import time: INFO level, timestamped messages.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
# Module-level logger used by the parsing and correlation functions below.
logger = logging.getLogger(__name__)

# Detection signatures consumed by classify_sqli(). Each tuple is
# (regex, attack-type label, severity). Every regex carries the inline (?i)
# flag, so matching is case-insensitive. classify_sqli() walks the list in
# order, so this order is also the order of labels in its result.
SQLI_PATTERNS = [
    # Query-structure manipulation and tautologies.
    (r"(?i)\bUNION\s+(?:ALL\s+)?SELECT\b", "UNION-based", "critical"),
    (r"(?i)\bOR\s+['\"]?\d+['\"]?\s*=\s*['\"]?\d+", "Tautology (OR 1=1)", "high"),
    (r"(?i)\bAND\s+['\"]?\d+['\"]?\s*=\s*['\"]?\d+", "Tautology (AND 1=1)", "high"),
    # Time-based blind probes.
    (r"(?i)\bSLEEP\s*\(\s*\d+\s*\)", "Time-based blind (SLEEP)", "critical"),
    (r"(?i)\bBENCHMARK\s*\(", "Time-based blind (BENCHMARK)", "critical"),
    (r"(?i)\bWAITFOR\s+DELAY\b", "Time-based blind (WAITFOR)", "critical"),
    # A quote, then ';', then a data-modifying statement keyword.
    (r"(?i)['\"]\s*;\s*(?:DROP|DELETE|UPDATE|INSERT)\b", "Stacked query", "critical"),
    # Schema discovery and file access.
    (r"(?i)\bINFORMATION_SCHEMA\b", "Schema enumeration", "high"),
    (r"(?i)\bLOAD_FILE\s*\(", "File read (LOAD_FILE)", "critical"),
    (r"(?i)\bINTO\s+(?:OUT|DUMP)FILE\b", "File write (INTO OUTFILE)", "critical"),
    # Data extraction and error-based functions.
    (r"(?i)\bCONCAT\s*\(.*\bSELECT\b", "Nested SELECT in CONCAT", "high"),
    (r"(?i)\bGROUP_CONCAT\s*\(", "Data extraction (GROUP_CONCAT)", "high"),
    (r"(?i)\bEXTRACTVALUE\s*\(", "Error-based (EXTRACTVALUE)", "high"),
    (r"(?i)\bUPDATEXML\s*\(", "Error-based (UPDATEXML)", "high"),
    # Matches only when the comment marker (--, # or /*) ends the payload.
    (r"(?i)(?:--|#|/\*)\s*$", "Comment termination", "medium"),
    # Encoding tricks: CHAR(n, ...) lists and hex literals of 6+ digits.
    (r"(?i)\bCHAR\s*\(\s*\d+(?:\s*,\s*\d+)*\s*\)", "CHAR() encoding bypass", "medium"),
    (r"(?i)0x[0-9a-f]{6,}", "Hex encoding bypass", "medium"),
]

# Rule ID (as a string) -> short description. analyze_modsecurity_entries()
# treats an audit-log entry as an SQLi event only when at least one triggered
# rule ID is a key of this map, so IDs missing here are ignored.
# NOTE(review): the IDs look like the OWASP CRS 942xxx (SQL injection) range,
# and only 942100-942290 are listed; confirm the IDs and descriptions against
# the CRS version that is actually deployed.
MODSEC_RULE_MAP = {
    "942100": "SQL Injection via libinjection",
    "942110": "SQL Injection (common keywords)",
    "942120": "SQL Injection operator",
    "942130": "SQL Injection tautology",
    "942140": "SQL Injection (DB names)",
    "942150": "SQL Injection (functions)",
    "942160": "SQL Injection blind test (sleep/benchmark)",
    "942170": "SQL Injection (UNION query)",
    "942180": "SQL Injection bypass (basic auth)",
    "942190": "SQL Injection (MSSQL exec)",
    "942200": "SQL Injection (MySQL comment/space obfuscation)",
    "942210": "SQL Injection (chained)",
    "942220": "SQL Injection (integer overflow)",
    "942230": "SQL Injection (conditional)",
    "942240": "SQL Injection (MySQL charset switch)",
    "942250": "SQL Injection (MATCH AGAINST)",
    "942260": "SQL Injection bypass (basic auth 2)",
    "942270": "SQL Injection (common DB names)",
    "942280": "SQL Injection (pg_sleep/waitfor)",
    "942290": "SQL Injection (MongoDB)",
}


def parse_modsecurity_audit_log(log_file):
    """Parse a ModSecurity audit log file into per-transaction entries.

    The log is a sequence of parts delimited by boundary lines of the form
    ``--<id>-<LETTER>--``. A boundary with letter ``A`` starts a new entry;
    any other boundary switches the section that following lines belong to.

    Args:
        log_file: Path of the audit log. It is read as UTF-8 and undecodable
            bytes are ignored.

    Returns:
        A list of dicts shaped like
        ``{"id": "<id>", "sections": {"A": [...], "B": [...], ...}}`` where
        each section value is the list of right-stripped lines of that part.
        If the file starts in the middle of an entry, the leading fragment is
        returned as an entry without an ``"id"`` key.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The ID is captured with a regex rather than str.strip("-A-"): strip()
    # treats its argument as a character set and would eat leading/trailing
    # "A" and "-" characters that belong to the ID itself. The character
    # class also accepts "@" and "-", which may appear inside IDs.
    boundary_re = re.compile(r"^--([\w@-]+)-([A-Z])--$")

    entries = []
    current_entry = {}
    current_section = None

    with open(log_file, "r", encoding="utf-8", errors="ignore") as f:
        for raw_line in f:
            line = raw_line.rstrip()
            boundary = boundary_re.match(line)
            if boundary:
                entry_id, section = boundary.groups()
                if section == "A":
                    # A new transaction begins: flush the previous one.
                    if current_entry:
                        entries.append(current_entry)
                    current_entry = {"id": entry_id, "sections": {}}
                current_section = section
            elif current_section:
                section_lines = current_entry.setdefault("sections", {})
                section_lines.setdefault(current_section, []).append(line)

    if current_entry:
        entries.append(current_entry)
    logger.info("Parsed %d ModSecurity audit log entries", len(entries))
    return entries


def parse_json_waf_log(log_file):
    """Parse a newline-delimited JSON WAF log (one JSON object per line).

    Blank lines are ignored. Lines that are not valid JSON, or whose JSON
    value is not an object, are skipped and counted; the count is logged as a
    warning so that a truncated or wrongly formatted file does not pass
    unnoticed.

    Args:
        log_file: Path of the log. It is read as UTF-8 and undecodable bytes
            are ignored.

    Returns:
        A list of dicts, one per successfully parsed JSON object, in file
        order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    entries = []
    skipped = 0
    with open(log_file, "r", encoding="utf-8", errors="ignore") as f:
        for raw_line in f:
            text = raw_line.strip()
            if not text:
                continue  # Blank separator lines are not errors.
            try:
                record = json.loads(text)
            except json.JSONDecodeError:
                skipped += 1
                continue
            if not isinstance(record, dict):
                # The analysis step reads fields with .get(), so scalars and
                # arrays cannot be log entries.
                skipped += 1
                continue
            entries.append(record)
    logger.info("Parsed %d JSON WAF log entries", len(entries))
    if skipped:
        logger.warning("Skipped %d malformed or non-object lines in %s", skipped, log_file)
    return entries


def classify_sqli(payload):
    """Classify the SQL injection techniques present in a request payload.

    The payload is tested both as given and after URL-decoding (up to two
    rounds, to cover double-encoding). Request URIs taken from logs are
    normally percent-encoded, so without decoding a payload such as
    ``UNION%20SELECT`` or ``OR+1=1`` could never satisfy the whitespace-based
    signatures.

    Args:
        payload: Request URI and/or query string. Non-string values are
            converted with ``str()``; ``None`` and empty values yield no
            matches.

    Returns:
        A list of ``{"type": ..., "severity": ...}`` dicts, one per signature
        in ``SQLI_PATTERNS`` that matched any variant, in signature order.
    """
    if not payload:
        return []
    text = payload if isinstance(payload, str) else str(payload)

    # Raw text first, then each distinct decoding round.
    variants = [text]
    for _ in range(2):
        decoded = unquote_plus(variants[-1])
        if decoded == variants[-1]:
            break
        variants.append(decoded)

    matches = []
    for pattern, attack_type, severity in SQLI_PATTERNS:
        if any(re.search(pattern, variant) for variant in variants):
            matches.append({"type": attack_type, "severity": severity})
    return matches


def analyze_modsecurity_entries(entries):
    """Build SQLi findings from parsed ModSecurity audit-log entries.

    An entry becomes a finding only when at least one rule ID reported in its
    section H is a key of ``MODSEC_RULE_MAP``. The request URI (second token
    of the first line of section B) is additionally classified with
    ``classify_sqli``.

    Args:
        entries: Iterable of dicts with a ``"sections"`` mapping of section
            letter to list of lines, as produced by
            ``parse_modsecurity_audit_log``.

    Returns:
        A list of finding dicts with keys ``source_ip``, ``request_uri``
        (truncated to 500 characters), ``rules_triggered``,
        ``sqli_classification`` and ``severity``. Severity is ``"critical"``
        when any classification is critical, otherwise ``"high"``.
    """
    findings = []
    for entry in entries:
        sections = entry.get("sections", {})
        request_lines = sections.get("B", [])
        header_lines = sections.get("H", [])

        request_uri = ""
        if request_lines:
            # Request line: "<METHOD> <URI> <PROTOCOL>".
            parts = request_lines[0].split(" ")
            if len(parts) >= 2:
                request_uri = parts[1]

        source_ip = ""
        rule_ids = []
        for line in header_lines:
            # findall: one line may report more than one rule ID.
            rule_ids.extend(re.findall(r"id\s*\"(\d+)\"", line))
            m = re.search(r"Remote-Addr:\s*(\S+)", line)
            if m:
                source_ip = m.group(1)

        if not source_ip:
            # Standard audit logs carry the client address in the section A
            # header line rather than in section H.
            source_ip = _modsec_client_ip(sections.get("A", []))

        # dict.fromkeys de-duplicates while keeping first-seen order, so a
        # rule that fired several times is reported once.
        sqli_rules = [rid for rid in dict.fromkeys(rule_ids) if rid in MODSEC_RULE_MAP]
        if not sqli_rules:
            continue

        sqli_classes = classify_sqli(request_uri)
        findings.append({
            "source_ip": source_ip,
            "request_uri": request_uri[:500],
            "rules_triggered": [{"id": r, "desc": MODSEC_RULE_MAP.get(r, "Unknown")} for r in sqli_rules],
            "sqli_classification": sqli_classes if sqli_classes else [{"type": "WAF rule match", "severity": "high"}],
            "severity": "critical" if any(c["severity"] == "critical" for c in sqli_classes) else "high",
        })
    return findings


def _modsec_client_ip(section_a_lines):
    """Return the client IP from a section A header line, or "" if absent.

    Expects the header layout
    ``[timestamp] unique_id source_ip source_port dest_ip dest_port``.
    """
    for line in section_a_lines:
        m = re.match(r"\[[^\]]*\]\s+\S+\s+(\S+)\s+\d+\s+\S+\s+\d+", line)
        if m:
            return m.group(1)
    return ""


def analyze_json_waf_entries(entries):
    """Scan JSON WAF log entries for SQL injection patterns.

    Two field layouts are read: a nested ``httpRequest`` object with ``uri``,
    ``args`` and ``clientIp``, with fallbacks to the top-level keys
    ``ClientRequestURI``, ``queryString`` and ``ClientIP``. The action is
    taken from ``action`` or ``Action``.

    Args:
        entries: Iterable of decoded JSON log records. Records that are not
            dicts are ignored; a missing or null ``httpRequest`` is treated
            as empty.

    Returns:
        A list of finding dicts with keys ``source_ip``, ``request_uri``
        (URI plus query string, truncated to 500 characters), ``action``,
        ``sqli_classification`` and ``severity`` (the highest severity among
        the matched classifications). Records with no match are omitted.
    """
    severity_rank = {"critical": 3, "high": 2, "medium": 1}

    findings = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        http_request = entry.get("httpRequest")
        if not isinstance(http_request, dict):
            # Covers both a missing key and an explicit JSON null.
            http_request = {}

        uri = http_request.get("uri", "") or entry.get("ClientRequestURI", "")
        args = http_request.get("args", "") or entry.get("queryString", "")
        source_ip = http_request.get("clientIp", "") or entry.get("ClientIP", "") or ""
        action = entry.get("action", "") or entry.get("Action", "")

        # Normalise to text so null or numeric fields cannot break matching.
        uri = str(uri) if uri else ""
        payload = f"{uri}?{args}" if args else uri
        sqli_classes = classify_sqli(payload)
        if not sqli_classes:
            continue

        findings.append({
            "source_ip": source_ip,
            "request_uri": payload[:500],
            "action": action,
            "sqli_classification": sqli_classes,
            "severity": max(
                (c["severity"] for c in sqli_classes),
                key=lambda s: severity_rank.get(s, 0),
            ),
        })
    return findings


def correlate_campaigns(findings, time_window_sec=300, min_requests=5):
    """Group findings by source IP and report sources that look like campaigns.

    Args:
        findings: Iterable of finding dicts carrying ``source_ip`` and,
            optionally, ``sqli_classification`` (a list of dicts with a
            ``type`` key).
        time_window_sec: Accepted for interface compatibility but currently
            unused: findings carry no timestamp, so clustering is by source
            IP over the whole input.
        min_requests: Minimum number of findings from one source IP for it to
            be reported as a campaign.

    Returns:
        A list of campaign dicts with keys ``source_ip``, ``request_count``,
        ``attack_types`` (sorted, so reports are reproducible), ``severity``
        (``"critical"`` when more than two attack types were seen, otherwise
        ``"high"``) and ``classification`` (``"automated"`` above 20
        requests, otherwise ``"manual"``).
    """
    by_source = defaultdict(list)
    for finding in findings:
        source_ip = finding.get("source_ip")
        # Findings whose source is unknown must not be merged into one
        # artificial "campaign" attributed to the empty address.
        if source_ip:
            by_source[source_ip].append(finding)

    campaigns = []
    for ip, group in by_source.items():
        if len(group) < min_requests:
            continue
        attack_types = sorted({
            c["type"]
            for finding in group
            for c in finding.get("sqli_classification", [])
        })
        campaigns.append({
            "source_ip": ip,
            "request_count": len(group),
            "attack_types": attack_types,
            "severity": "critical" if len(attack_types) > 2 else "high",
            "classification": "automated" if len(group) > 20 else "manual",
        })
        logger.warning("SQLi campaign: %s (%d requests, %d attack types)", ip, len(group), len(attack_types))
    return campaigns


def generate_report(findings, campaigns):
    """Assemble the SQLi detection report and print a one-line summary.

    Args:
        findings: List of finding dicts; ``severity`` and ``source_ip`` are
            read from each.
        campaigns: List of campaign dicts, embedded in the report unchanged.

    Returns:
        A JSON-serialisable dict with keys ``timestamp`` (timezone-aware UTC,
        ISO 8601), ``total_sqli_events``, ``critical_events``,
        ``unique_sources`` (distinct non-empty source IPs),
        ``campaigns_detected``, ``campaigns`` and ``top_findings`` (at most
        100 findings, most severe first, input order kept within a severity).
    """
    severity_rank = {"critical": 3, "high": 2, "medium": 1}

    critical = [f for f in findings if f.get("severity") == "critical"]
    # Rank before truncating so that critical events late in the log are not
    # pushed out by earlier, less severe ones. sorted() is stable, so the
    # original order is preserved among findings of equal severity.
    ranked = sorted(
        findings,
        key=lambda f: severity_rank.get(f.get("severity"), 0),
        reverse=True,
    )
    report = {
        # datetime.utcnow() is deprecated and returns a naive value; an aware
        # UTC timestamp makes the offset explicit in the report.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "total_sqli_events": len(findings),
        "critical_events": len(critical),
        "unique_sources": len({f["source_ip"] for f in findings if f.get("source_ip")}),
        "campaigns_detected": len(campaigns),
        "campaigns": campaigns,
        "top_findings": ranked[:100],
    }
    print(f"SQLI REPORT: {len(findings)} events, {len(campaigns)} campaigns, {len(critical)} critical")
    return report


def main():
    """Command-line entry point: parse a WAF log, detect SQLi, write a report.

    Reads ``--log-file`` in the layout selected by ``--format``, correlates
    the findings into campaigns, and writes the report as indented JSON to
    ``--output``.
    """
    cli = argparse.ArgumentParser(description="SQL Injection WAF Log Analysis Agent")
    cli.add_argument("--log-file", required=True, help="WAF log file path")
    cli.add_argument("--format", choices=["modsecurity", "json"], default="modsecurity")
    cli.add_argument("--output", default="sqli_report.json")
    options = cli.parse_args()

    # Each supported format maps to its (parser, analyzer) pair; argparse
    # restricts --format to exactly these two keys.
    pipelines = {
        "modsecurity": (parse_modsecurity_audit_log, analyze_modsecurity_entries),
        "json": (parse_json_waf_log, analyze_json_waf_entries),
    }
    parse_log, analyze = pipelines[options.format]
    findings = analyze(parse_log(options.log_file))

    report = generate_report(findings, correlate_campaigns(findings))
    with open(options.output, "w") as out:
        json.dump(report, out, indent=2)
    logger.info("Report saved to %s", options.output)


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()