#!/usr/bin/env python3
"""Agent for hunting web shell activity on web servers via process tree and log analysis."""

import json
import argparse
import re
from datetime import datetime
from collections import defaultdict
from pathlib import Path


# Lowercase name fragments identifying web/application server processes.
# The detectors below lowercase the parent process field of each log entry
# and test whether any of these fragments occurs in it (substring match).
WEB_SERVER_PROCESSES = [
    "w3wp.exe", "httpd", "apache2", "nginx", "tomcat", "java",
    "php-cgi", "php-fpm", "node", "iisexpress",
]

# Lowercase name fragments of command interpreters, script hosts and
# built-in utilities; a child process whose (lowercased) image contains any
# of them is treated as a shell spawn.
# NOTE(review): matching is by substring, so short entries such as "sh"
# also match longer names that merely contain them (e.g. "bash", "ssh").
SHELL_SPAWNS = [
    "cmd.exe", "powershell.exe", "pwsh.exe", "bash", "sh",
    "wscript.exe", "cscript.exe", "certutil.exe", "whoami.exe",
    "net.exe", "net1.exe", "ipconfig.exe", "systeminfo.exe",
    "tasklist.exe", "nslookup.exe",
]

# Regular expressions searched case-insensitively in every access-log line.
# The first pattern that matches a line is the one reported for that line,
# so order matters: POSTs to server-side script extensions, then
# command-style query parameters, then known webshell names.
WEBSHELL_HTTP_PATTERNS = [
    r"POST\s+.*\.(asp|aspx|php|jsp|jspx)\s+",
    r"cmd=", r"exec=", r"command=", r"shell=",
    r"c99shell", r"r57shell", r"b374k", r"weevely",
    r"china\s*chopper", r"antsword",
]


def load_process_logs(log_path):
    """Load event records from a JSON-lines file.

    Each line is parsed independently. Blank lines, lines that are not valid
    JSON, and JSON values that are not objects (numbers, arrays, strings,
    ...) are skipped, so every returned entry is a ``dict``.

    Args:
        log_path: Path of the JSON-lines file to read.

    Returns:
        A list of dicts, one per successfully parsed JSON object, in file
        order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    entries = []
    # "utf-8-sig" transparently drops a leading BOM (common in files exported
    # on Windows) that would otherwise make the first record unparsable.
    # errors="replace" keeps a single undecodable byte from aborting the load.
    with open(log_path, encoding="utf-8-sig", errors="replace") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Best effort: tolerate truncated or corrupted lines.
                continue
            # Callers access fields with .get(); anything but an object
            # would crash them.
            if isinstance(record, dict):
                entries.append(record)
    return entries


def detect_webserver_child_shells(process_logs, web_servers=None, shell_names=None):
    """Detect shell processes spawned by web server processes.

    An entry is reported when the executable name of its parent contains one
    of the web server name fragments and the executable name of the process
    itself contains one of the shell name fragments. Matching is
    case-insensitive and looks only at the last path component, so directory
    names (e.g. ``/usr/share/``) cannot trigger a match on their own.

    Args:
        process_logs: Iterable of process-creation records (dicts). Both
            Sysmon-style keys (``ParentImage``, ``Image``, ``CommandLine``,
            ``UtcTime``, ``Computer``, ``User``) and the generic keys
            (``parent_process``, ``process_name``, ``command_line``,
            ``timestamp``, ``hostname``) are read. Non-dict items are ignored.
        web_servers: Optional iterable of web server name fragments;
            defaults to ``WEB_SERVER_PROCESSES``.
        shell_names: Optional iterable of shell name fragments; defaults to
            ``SHELL_SPAWNS``.

    Returns:
        A list of finding dicts with the lowercased parent/child image, the
        command line truncated to 500 characters, and fixed ``severity``
        (``CRITICAL``) and ``technique`` (``T1505.003``) values.
    """
    if web_servers is None:
        web_servers = WEB_SERVER_PROCESSES
    if shell_names is None:
        shell_names = SHELL_SPAWNS
    web_servers = [name.lower() for name in web_servers]
    shell_names = [name.lower() for name in shell_names]

    findings = []
    for entry in process_logs:
        if not isinstance(entry, dict):
            continue
        # "or" chaining also covers keys that are present but null/empty,
        # which the two-argument .get() form would pass through as None.
        parent = str(entry.get("ParentImage") or entry.get("parent_process") or "").lower()
        child = str(entry.get("Image") or entry.get("process_name") or "").lower()
        # Compare against the executable name only; handle both path styles.
        parent_name = parent.replace("\\", "/").rsplit("/", 1)[-1]
        child_name = child.replace("\\", "/").rsplit("/", 1)[-1]
        if not any(ws in parent_name for ws in web_servers):
            continue
        if not any(sh in child_name for sh in shell_names):
            continue
        cmd = str(entry.get("CommandLine") or entry.get("command_line") or "")
        findings.append({
            "timestamp": entry.get("UtcTime", entry.get("timestamp", "")),
            "hostname": entry.get("Computer", entry.get("hostname", "")),
            "parent_process": parent,
            "child_process": child,
            "command_line": cmd[:500],
            "user": entry.get("User", ""),
            "severity": "CRITICAL",
            "technique": "T1505.003",
        })
    return findings


def analyze_web_access_logs(access_log_path, patterns=None):
    """Scan a web access log for webshell indicators.

    Every line is tested against the patterns in order; the first pattern
    that matches produces one finding for that line and the remaining
    patterns are skipped.

    Args:
        access_log_path: Path of the access log to scan.
        patterns: Optional iterable of regular-expression strings to search
            for (case-insensitively); defaults to ``WEBSHELL_HTTP_PATTERNS``.

    Returns:
        A list of finding dicts containing the 1-based line number, the first
        whitespace-delimited token of the line as ``source_ip`` (empty if the
        line starts with whitespace), the stripped line truncated to 500
        characters, the pattern string that matched and a ``HIGH`` severity.

    Raises:
        OSError: If the log cannot be opened or read.
        re.error: If one of the patterns is not a valid regular expression.
    """
    if patterns is None:
        patterns = WEBSHELL_HTTP_PATTERNS
    # Compile once instead of looking each pattern up again for every line.
    compiled = [(pattern, re.compile(pattern, re.IGNORECASE)) for pattern in patterns]
    first_token = re.compile(r"^(\S+)")

    findings = []
    # Request lines are attacker-controlled and may contain bytes that are
    # not valid UTF-8; replace them rather than aborting the whole scan.
    with open(access_log_path, encoding="utf-8", errors="replace") as f:
        for i, line in enumerate(f, 1):
            for pattern, regex in compiled:
                if regex.search(line):
                    ip_match = first_token.match(line)
                    findings.append({
                        "line_number": i,
                        "source_ip": ip_match.group(1) if ip_match else "",
                        "log_entry": line.strip()[:500],
                        "pattern_matched": pattern,
                        "severity": "HIGH",
                    })
                    break
    return findings


def detect_file_creation_in_webroot(file_events, webroot_paths=None):
    """Detect script files created inside web server directories.

    A file event is reported when its path contains one of the webroot
    fragments and ends with a server-side script extension. The comparison
    is case-insensitive and separator-insensitive, so a fragment written as
    ``inetpub/wwwroot`` also matches ``C:\\inetpub\\wwwroot\\...``.

    Args:
        file_events: Iterable of file-creation records (dicts). The path is
            read from ``TargetFilename`` or, failing that, ``file_path``.
            Non-dict items are ignored.
        webroot_paths: Optional iterable of webroot path fragments; defaults
            to a built-in list of common Linux and IIS webroots.

    Returns:
        A list of finding dicts with the lowercased file path (original
        separators kept), the creating process, host, timestamp, a
        ``CRITICAL`` severity and the reason ``script_created_in_webroot``.
    """
    if webroot_paths is None:
        webroot_paths = [
            "/var/www", "/opt/lampp/htdocs", "inetpub/wwwroot",
            "/usr/share/nginx/html", "/srv/www",
        ]
    # Normalise the fragments the same way as the event paths below;
    # otherwise mixed-case or backslash-separated webroots never match.
    normalized_roots = [root.lower().replace("\\", "/") for root in webroot_paths]
    script_extensions = (".php", ".asp", ".aspx", ".jsp", ".jspx", ".cgi", ".cfm")

    findings = []
    for event in file_events:
        if not isinstance(event, dict):
            continue
        # "or" chaining also covers keys that are present but null/empty.
        raw_path = event.get("TargetFilename") or event.get("file_path") or ""
        filepath = str(raw_path).lower()
        comparable = filepath.replace("\\", "/")
        in_webroot = any(root in comparable for root in normalized_roots)
        is_script = filepath.endswith(script_extensions)
        if in_webroot and is_script:
            findings.append({
                "timestamp": event.get("UtcTime", event.get("timestamp", "")),
                "file_path": filepath,
                "process": event.get("Image", event.get("process_name", "")),
                "hostname": event.get("Computer", event.get("hostname", "")),
                "severity": "CRITICAL",
                "reason": "script_created_in_webroot",
            })
    return findings


def detect_post_exploitation(process_logs, web_servers=None):
    """Detect reconnaissance commands run by children of web server processes.

    Only entries whose parent executable name contains one of the web server
    name fragments are inspected (case-insensitive, last path component
    only). The command line is then tested against a fixed list of recon
    patterns; the first matching pattern determines the reported category.

    Args:
        process_logs: Iterable of process-creation records (dicts). The
            parent is read from ``ParentImage`` / ``parent_process`` and the
            command line from ``CommandLine`` / ``command_line``. Non-dict
            items are ignored.
        web_servers: Optional iterable of web server name fragments;
            defaults to ``WEB_SERVER_PROCESSES``.

    Returns:
        A list of finding dicts with the timestamp, the command truncated to
        300 characters, the recon category, the lowercased parent image and a
        ``HIGH`` severity.
    """
    if web_servers is None:
        web_servers = WEB_SERVER_PROCESSES
    web_servers = [name.lower() for name in web_servers]

    # Compiled once per call; order defines which category wins when a
    # command line matches several patterns.
    recon_patterns = [
        (re.compile(r"whoami", re.IGNORECASE), "user_discovery"),
        (re.compile(r"ipconfig|ifconfig", re.IGNORECASE), "network_config"),
        (re.compile(r"net\s+(user|group|localgroup)", re.IGNORECASE), "account_enum"),
        (re.compile(r"systeminfo", re.IGNORECASE), "system_info"),
        (re.compile(r"tasklist|ps\s+aux", re.IGNORECASE), "process_enum"),
        (re.compile(r"netstat\s+-an", re.IGNORECASE), "connection_enum"),
        (re.compile(r"dir\s+/s|find\s+/|ls\s+-la", re.IGNORECASE), "file_enum"),
    ]

    findings = []
    for entry in process_logs:
        if not isinstance(entry, dict):
            continue
        # "or" chaining also covers keys that are present but null/empty.
        parent = str(entry.get("ParentImage") or entry.get("parent_process") or "").lower()
        # Compare against the executable name only; handle both path styles.
        parent_name = parent.replace("\\", "/").rsplit("/", 1)[-1]
        if not any(ws in parent_name for ws in web_servers):
            continue
        cmd = str(entry.get("CommandLine") or entry.get("command_line") or "")
        for regex, category in recon_patterns:
            if regex.search(cmd):
                findings.append({
                    "timestamp": entry.get("UtcTime", entry.get("timestamp", "")),
                    "command": cmd[:300],
                    "category": category,
                    "parent": parent,
                    "severity": "HIGH",
                })
                break
    return findings


def main(argv=None):
    """Command-line entry point: run the selected hunts and write a JSON report.

    Each hunt runs only when its input option is supplied; the report's
    ``findings`` mapping therefore contains only the sections that were
    requested. A summary line per hunt is printed to stdout.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). Defaults to ``sys.argv[1:]`` when ``None``.

    Raises:
        OSError: If an input file cannot be read or the report cannot be
            written.
    """
    # Local import: keeps this function self-contained without touching the
    # module-level import block.
    from datetime import timezone

    parser = argparse.ArgumentParser(description="Webshell Activity Hunter")
    parser.add_argument("--process-log", help="JSON lines process creation log")
    parser.add_argument("--access-log", help="Web server access log")
    parser.add_argument("--file-events", help="JSON lines file creation events")
    parser.add_argument("--output", default="webshell_hunt_report.json")
    args = parser.parse_args(argv)

    # datetime.utcnow() is deprecated and returns a naive value; use an
    # aware UTC timestamp so the report carries an explicit "+00:00" offset.
    report = {"generated_at": datetime.now(timezone.utc).isoformat(), "findings": {}}

    if args.process_log:
        logs = load_process_logs(args.process_log)
        shells = detect_webserver_child_shells(logs)
        report["findings"]["shell_spawns"] = shells
        print(f"[+] Web server shell spawns: {len(shells)}")
        recon = detect_post_exploitation(logs)
        report["findings"]["post_exploitation"] = recon
        print(f"[+] Post-exploitation commands: {len(recon)}")

    if args.access_log:
        hits = analyze_web_access_logs(args.access_log)
        report["findings"]["access_log_hits"] = hits
        print(f"[+] Access log webshell indicators: {len(hits)}")

    if args.file_events:
        # File events use the same JSON-lines format as process logs.
        events = load_process_logs(args.file_events)
        files = detect_file_creation_in_webroot(events)
        report["findings"]["webroot_files"] = files
        print(f"[+] Scripts created in webroot: {len(files)}")

    # Explicit encoding so the report does not depend on the platform locale.
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, default=str)
    print(f"[+] Report saved to {args.output}")


# Run the hunter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()