#!/usr/bin/env python3
"""
Malicious npm package triage helper.

Orchestrates GuardDog static scanning, lifecycle-script inspection, and IOC
extraction for a single npm package (by name@version) or a local tarball, and
emits a structured verdict report.

WARNING: download with scripts disabled and run only in an isolated, disposable
environment with no production credentials. Defensive / authorized use only.
"""
import argparse
import json
import re
import shutil
import subprocess
import sys
import tarfile
import tempfile
import urllib.request
from pathlib import Path

# Matches http:// or https:// followed by a run of common URL characters;
# used to harvest candidate network IOCs from package source text.
URL_RE = re.compile(r"https?://[A-Za-z0-9./?=_%:&#-]+")
# Case-insensitive markers of process execution, dynamic evaluation, base64
# decoding via Buffer.from(...) and environment access in JavaScript source.
SUSPECT_RE = re.compile(
    r"child_process|\bexec\s*\(|\bspawn\b|\beval\s*\(|Buffer\.from\([^)]*base64|process\.env",
    re.IGNORECASE,
)
# Rule names handed to the GuardDog CLI, one ``--rules`` flag per entry.
HIGH_SIGNAL_RULES = [
    "npm-install-script",
    "npm-serialize-environment",
    "npm-exec-base64",
    "npm-silent-process-execution",
    "npm-obfuscation",
    "shady-links",
    "typosquatting",
]


def run(cmd: list[str], timeout: int = 600) -> tuple[int, str, str]:
    """Execute *cmd* and report the outcome without raising.

    Args:
        cmd: Argument vector; ``cmd[0]`` is the executable.
        timeout: Seconds to wait before the child is abandoned.

    Returns:
        ``(returncode, stdout, stderr)`` with both streams decoded as text.
        A missing executable yields ``(127, "", "command not found: ...")``;
        any other ``subprocess.SubprocessError`` (a timeout included) yields
        ``(1, "", "subprocess error: ...")``.
    """
    try:
        completed = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except FileNotFoundError:
        return 127, "", f"command not found: {cmd[0]}"
    except subprocess.SubprocessError as exc:
        return 1, "", f"subprocess error: {exc}"
    return completed.returncode, completed.stdout, completed.stderr


def guarddog_scan(target: str, version: str | None) -> dict:
    """Run ``guarddog npm scan`` on *target* with the high-signal rule set.

    Args:
        target: Value passed to GuardDog as the scan target.
        version: Optional version forwarded via ``--version``.

    Returns:
        ``{"error": ...}`` when the ``guarddog`` executable is not on PATH.
        Otherwise a dict carrying ``returncode`` plus one of: ``result``
        (parsed JSON output), ``stderr`` alone (nothing was printed to
        stdout), or ``raw`` and ``stderr`` (stdout was not valid JSON).
    """
    if shutil.which("guarddog") is None:
        return {"error": "guarddog not installed (pip install guarddog)"}

    argv = ["guarddog", "npm", "scan", target, "--output-format=json"]
    if version:
        argv.extend(["--version", version])
    # One "--rules <name>" pair per selected rule.
    argv.extend(token for rule in HIGH_SIGNAL_RULES for token in ("--rules", rule))

    code, stdout, stderr = run(argv)
    if not stdout.strip():
        return {"returncode": code, "stderr": stderr}
    try:
        parsed = json.loads(stdout)
    except json.JSONDecodeError:
        return {"returncode": code, "raw": stdout, "stderr": stderr}
    return {"returncode": code, "result": parsed}


def download_tarball(
    name: str, version: str | None, dest: Path, timeout: float = 120
) -> Path | None:
    """Resolve a package's registry tarball with ``npm view`` and download it.

    Only the tarball is fetched; nothing is installed, so no lifecycle script
    runs.

    Args:
        name: npm package name.
        version: Optional version; when omitted the bare name is resolved.
        dest: Existing directory that receives ``package.tgz``.
        timeout: Socket timeout in seconds for the HTTP download.

    Returns:
        Path to the downloaded ``package.tgz``, or ``None`` when the URL could
        not be resolved, is not an HTTP(S) URL, or the download failed. A
        warning is printed to stderr in each failure case.
    """
    spec = f"{name}@{version}" if version else name
    rc, out, err = run(["npm", "view", spec, "dist.tarball"])
    if rc != 0 or not out.strip():
        print(f"[warn] could not resolve tarball for {spec}: {err}", file=sys.stderr)
        return None

    # When a spec matches several versions npm prints "name@ver 'url'" lines;
    # keep the last token of the last line and drop the surrounding quotes.
    url = out.strip().splitlines()[-1].split()[-1].strip("'\"")

    # urllib also understands file:// and ftp://; a registry answer must never
    # be able to make this tool read local files or speak other protocols.
    if not url.lower().startswith(("http://", "https://")):
        print(f"[warn] refusing non-HTTP tarball URL for {spec}: {url}", file=sys.stderr)
        return None

    tgz = dest / "package.tgz"
    try:
        # urlopen (unlike urlretrieve) accepts a timeout, so a stalled
        # registry cannot hang the triage run indefinitely.
        with urllib.request.urlopen(url, timeout=timeout) as resp, tgz.open("wb") as fh:  # noqa: S310 - scheme checked above
            shutil.copyfileobj(resp, fh)
    except Exception as exc:  # noqa: BLE001 - best effort: any failure means "no tarball"
        print(f"[warn] download failed: {exc}", file=sys.stderr)
        return None
    return tgz


def _safe_members(tf: tarfile.TarFile, root: Path) -> tuple[list[tarfile.TarInfo], list[str]]:
    """Split archive members into those safe to extract under *root* and the rejected names."""
    safe: list[tarfile.TarInfo] = []
    rejected: list[str] = []
    for member in tf.getmembers():
        try:
            target = (root / member.name).resolve()
            contained = target.is_relative_to(root)
        except (OSError, ValueError, RuntimeError):
            contained = False
        # Only plain files and directories are extracted: links and device
        # nodes could redirect later reads or writes outside the sandbox.
        if contained and (member.isfile() or member.isdir()):
            safe.append(member)
        else:
            rejected.append(member.name)
    return safe, rejected


def inspect_tarball(tgz: Path, workdir: Path) -> dict:
    """Extract an npm tarball under *workdir* and collect static indicators.

    Args:
        tgz: Path to a gzip-compressed tar archive.
        workdir: Directory under which ``package/`` is created for extraction.

    Returns:
        A dict with ``lifecycle_scripts`` (preinstall/install/postinstall
        entries of the shallowest ``package.json``), ``suspicious_lines``
        (up to 50 ``{"file", "match"}`` hits of ``SUSPECT_RE`` in ``*.js``
        files) and ``urls`` (up to 50 sorted unique ``URL_RE`` matches).
        ``skipped_members`` (up to 50 names) is present when archive members
        were refused as unsafe, and ``error`` when extraction failed.
    """
    findings: dict = {"lifecycle_scripts": {}, "suspicious_lines": [], "urls": []}
    extract = workdir / "package"
    # Python 3.12+ (and security backports) offer the "data" extraction
    # filter; use it as a second line of defence when it is available.
    extra = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    try:
        with tarfile.open(tgz, "r:gz") as tf:
            members, skipped = _safe_members(tf, extract.resolve())
            tf.extractall(extract, members=members, **extra)  # noqa: S202 - members vetted above
    except (tarfile.TarError, OSError, EOFError) as exc:
        # EOFError is what a truncated gzip stream raises.
        findings["error"] = f"extract failed: {exc}"
        return findings
    if skipped:
        # A traversal or link member is itself a strong indicator; surface it.
        findings["skipped_members"] = skipped[:50]

    # Choose the shallowest manifest so a nested dependency or test fixture
    # cannot stand in for the package's own package.json.
    pkg_json = min(
        (p for p in extract.rglob("package.json") if p.is_file()),
        key=lambda p: (len(p.parts), str(p)),
        default=None,
    )
    if pkg_json is not None:
        try:
            data = json.loads(pkg_json.read_text(encoding="utf-8"))
        except (ValueError, OSError):
            # Best effort: an unreadable, non-UTF-8 or invalid manifest simply
            # contributes no lifecycle scripts.
            data = None
        scripts = data.get("scripts") if isinstance(data, dict) else None
        if isinstance(scripts, dict):
            findings["lifecycle_scripts"] = {
                k: v for k, v in scripts.items()
                if k in ("preinstall", "install", "postinstall")
            }

    # Sorted iteration keeps the 50-entry truncation below deterministic.
    for src in sorted(extract.rglob("*.js")):
        try:
            text = src.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue
        rel = str(src.relative_to(extract))
        findings["suspicious_lines"].extend(
            {"file": rel, "match": m.group(0)} for m in SUSPECT_RE.finditer(text)
        )
        findings["urls"].extend(URL_RE.findall(text))
    findings["urls"] = sorted(set(findings["urls"]))[:50]
    findings["suspicious_lines"] = findings["suspicious_lines"][:50]
    return findings


def verdict(gd: dict, insp: dict) -> str:
    """Turn scan results into a coarse verdict string.

    Scoring: +2 for every GuardDog rule with a non-empty finding, +2 when any
    install-time lifecycle script is present, and +1 per suspicious source
    match (capped at 5). A score of 6 or more is reported as malicious, 2 or
    more as suspicious.

    Args:
        gd: Dict produced by the GuardDog step; findings are read from
            ``gd["result"]["results"]``, falling back to ``gd["result"]``
            itself when there is no ``results`` key.
        insp: Dict produced by tarball inspection.

    Returns:
        One of the ``MALICIOUS``/``SUSPICIOUS``/``benign`` strings, or
        ``INCONCLUSIVE (no analysis completed)`` when GuardDog produced no
        parsed result and inspection reported an error, so that a failed run
        is never presented as a clean bill of health.
    """
    score = 0
    res = gd.get("result")
    guarddog_ok = isinstance(res, dict)
    if guarddog_ok:
        rule_hits = res.get("results", res)
        if isinstance(rule_hits, dict):
            score += 2 * sum(1 for hit in rule_hits.values() if hit)
    if insp.get("lifecycle_scripts"):
        score += 2
    score += min(len(insp.get("suspicious_lines", [])), 5)

    if score >= 6:
        return "MALICIOUS (high confidence)"
    if score >= 2:
        return "SUSPICIOUS (manual review required)"
    if not guarddog_ok and "error" in insp:
        # Neither analysis ran: a zero score here means "unknown", not "clean".
        return "INCONCLUSIVE (no analysis completed)"
    return "benign (no strong indicators)"


def main() -> int:
    """Command-line entry point: scan, inspect, print a verdict, optionally save JSON.

    With ``--tarball`` the local archive is scanned and inspected; otherwise
    the package named by ``--package`` (and optional ``--version``) is scanned
    by GuardDog and its tarball downloaded for inspection.

    Returns:
        Always ``0``; the verdict is conveyed through stdout and the optional
        JSON report, not the exit status.
    """
    ap = argparse.ArgumentParser(description="npm malicious package triage")
    ap.add_argument("--package", help="npm package name")
    ap.add_argument("--version", help="specific version")
    ap.add_argument("--tarball", help="path to a local .tgz instead of downloading")
    ap.add_argument("--output", help="write JSON report")
    args = ap.parse_args()

    if not args.package and not args.tarball:
        ap.error("provide --package or --tarball")

    report: dict = {"package": args.package, "version": args.version}
    # The temporary directory holds the download and the extracted tree; both
    # are discarded as soon as inspection has finished.
    with tempfile.TemporaryDirectory() as td:
        work = Path(td)
        if args.tarball:
            tgz = Path(args.tarball)
            # guarddog_scan already reports a missing guarddog executable, so
            # both modes share one availability check and one error message.
            report["guarddog"] = guarddog_scan(str(tgz), None)
        else:
            report["guarddog"] = guarddog_scan(args.package, args.version)
            tgz = download_tarball(args.package, args.version, work)

        if tgz and tgz.exists():
            report["inspection"] = inspect_tarball(tgz, work)
        else:
            report["inspection"] = {"error": "no tarball"}

    report["verdict"] = verdict(report["guarddog"], report["inspection"])
    inspection = report["inspection"]
    print(f"[+] verdict: {report['verdict']}")
    if inspection.get("lifecycle_scripts"):
        print(f"[!] install scripts present: {inspection['lifecycle_scripts']}")
    if inspection.get("urls"):
        print(f"[i] {len(inspection['urls'])} URL(s) found in source")

    if args.output:
        Path(args.output).write_text(json.dumps(report, indent=2), encoding="utf-8")
        print(f"[+] report written to {args.output}")
    return 0


# Run the CLI only when executed as a script; main()'s return value is used as
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())