#!/usr/bin/env python3
"""Agent for hunting web shell activity on web servers via process tree and log analysis."""

import json
import argparse
import re
from datetime import datetime


WEB_SERVER_PROCESSES = [
    "w3wp.exe", "httpd", "apache2", "nginx", "tomcat", "java",
    "php-cgi", "php-fpm", "node", "iisexpress",
]

SHELL_SPAWNS = [
    "cmd.exe", "powershell.exe", "pwsh.exe", "bash", "sh",
    "wscript.exe", "cscript.exe", "certutil.exe", "whoami.exe",
    "net.exe", "net1.exe", "ipconfig.exe", "systeminfo.exe",
    "tasklist.exe", "nslookup.exe",
]

WEBSHELL_HTTP_PATTERNS = [
    r"POST\s+.*\.(asp|aspx|php|jsp|jspx)\s+",
    r"cmd=", r"exec=", r"command=", r"shell=",
    r"c99shell", r"r57shell", r"b374k", r"weevely",
    r"china\s*chopper", r"antsword",
]


def load_process_logs(log_path):
    """Load process creation logs (JSON lines)."""
    entries = []
    with open(log_path) as f:
        for line in f:
            try:
                entries.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return entries


def detect_webserver_child_shells(process_logs):
    """Detect shell processes spawned by web server processes."""
    findings = []
    for entry in process_logs:
        parent = entry.get("ParentImage", entry.get("parent_process", "")).lower()
        child = entry.get("Image", entry.get("process_name", "")).lower()
        is_web_parent = any(ws in parent for ws in WEB_SERVER_PROCESSES)
        is_shell_child = any(sh in child for sh in SHELL_SPAWNS)
        if is_web_parent and is_shell_child:
            cmd = entry.get("CommandLine", entry.get("command_line", ""))
            findings.append({
                "timestamp": entry.get("UtcTime", entry.get("timestamp", "")),
                "hostname": entry.get("Computer", entry.get("hostname", "")),
                "parent_process": parent,
                "child_process": child,
                "command_line": cmd[:500],
                "user": entry.get("User", ""),
                "severity": "CRITICAL",
                "technique": "T1505.003",
            })
    return findings


def analyze_web_access_logs(access_log_path):
    """Analyze web access logs for webshell indicators."""
    findings = []
    with open(access_log_path) as f:
        for i, line in enumerate(f, 1):
            for pattern in WEBSHELL_HTTP_PATTERNS:
                if re.search(pattern, line, re.IGNORECASE):
                    ip_match = re.match(r"^(\S+)", line)
                    findings.append({
                        "line_number": i,
                        "source_ip": ip_match.group(1) if ip_match else "",
                        "log_entry": line.strip()[:500],
                        "pattern_matched": pattern,
                        "severity": "HIGH",
                    })
                    break
    return findings


def detect_file_creation_in_webroot(file_events, webroot_paths=None):
    """Detect new script files created in web server directories."""
    if webroot_paths is None:
        webroot_paths = [
            "/var/www", "/opt/lampp/htdocs", "inetpub/wwwroot",
            "/usr/share/nginx/html", "/srv/www",
        ]
    script_extensions = [".php", ".asp", ".aspx", ".jsp", ".jspx", ".cgi", ".cfm"]
    findings = []
    for event in file_events:
        filepath = event.get("TargetFilename", event.get("file_path", "")).lower()
        in_webroot = any(wr in filepath for wr in webroot_paths)
        is_script = any(filepath.endswith(ext) for ext in script_extensions)
        if in_webroot and is_script:
            findings.append({
                "timestamp": event.get("UtcTime", event.get("timestamp", "")),
                "file_path": filepath,
                "process": event.get("Image", event.get("process_name", "")),
                "hostname": event.get("Computer", event.get("hostname", "")),
                "severity": "CRITICAL",
                "reason": "script_created_in_webroot",
            })
    return findings


def detect_post_exploitation(process_logs):
    """Detect reconnaissance commands typically run through webshells."""
    recon_patterns = [
        (r"whoami", "user_discovery"),
        (r"ipconfig|ifconfig", "network_config"),
        (r"net\s+(user|group|localgroup)", "account_enum"),
        (r"systeminfo", "system_info"),
        (r"tasklist|ps\s+aux", "process_enum"),
        (r"netstat\s+-an", "connection_enum"),
        (r"dir\s+/s|find\s+/|ls\s+-la", "file_enum"),
    ]
    findings = []
    for entry in process_logs:
        parent = entry.get("ParentImage", entry.get("parent_process", "")).lower()
        if not any(ws in parent for ws in WEB_SERVER_PROCESSES):
            continue
        cmd = entry.get("CommandLine", entry.get("command_line", ""))
        for pattern, category in recon_patterns:
            if re.search(pattern, cmd, re.IGNORECASE):
                findings.append({
                    "timestamp": entry.get("UtcTime", entry.get("timestamp", "")),
                    "command": cmd[:300],
                    "category": category,
                    "parent": parent,
                    "severity": "HIGH",
                })
                break
    return findings


def main():
    parser = argparse.ArgumentParser(description="Webshell Activity Hunter")
    parser.add_argument("--process-log", help="JSON lines process creation log")
    parser.add_argument("--access-log", help="Web server access log")
    parser.add_argument("--file-events", help="JSON lines file creation events")
    parser.add_argument("--output", default="webshell_hunt_report.json")
    args = parser.parse_args()

    report = {"generated_at": datetime.utcnow().isoformat(), "findings": {}}

    if args.process_log:
        logs = load_process_logs(args.process_log)
        shells = detect_webserver_child_shells(logs)
        report["findings"]["shell_spawns"] = shells
        print(f"[+] Web server shell spawns: {len(shells)}")
        recon = detect_post_exploitation(logs)
        report["findings"]["post_exploitation"] = recon
        print(f"[+] Post-exploitation commands: {len(recon)}")

    if args.access_log:
        hits = analyze_web_access_logs(args.access_log)
        report["findings"]["access_log_hits"] = hits
        print(f"[+] Access log webshell indicators: {len(hits)}")

    if args.file_events:
        events = load_process_logs(args.file_events)
        files = detect_file_creation_in_webroot(events)
        report["findings"]["webroot_files"] = files
        print(f"[+] Scripts created in webroot: {len(files)}")

    with open(args.output, "w") as f:
        json.dump(report, f, indent=2, default=str)
    print(f"[+] Report saved to {args.output}")


if __name__ == "__main__":
    main()