# Anthropic-Cybersecurity-Skills/skills/analyzing-pdf-malware-with-pdfid/scripts/agent.py

#!/usr/bin/env python3
"""PDF malware analysis agent using pdfid concepts and pdf-parser for object extraction."""

import re
import os
import sys
import hashlib
import zlib


def compute_hash(filepath):
    """Return the hex-encoded SHA-256 digest of the file at *filepath*.

    The file is consumed in 64 KiB blocks, so the whole sample is never held
    in memory at once.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as handle:
        while True:
            block = handle.read(65536)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()


# PDF name tokens that scan_pdf_keywords() counts, mapped to the
# human-readable explanation shown next to each hit.
PDF_SUSPICIOUS_KEYWORDS = {
    "/JS": "JavaScript (embedded script execution)",
    "/JavaScript": "JavaScript action",
    "/AA": "Additional Actions (auto-execute triggers)",
    "/OpenAction": "Action on document open",
    "/AcroForm": "Interactive form (can contain JavaScript)",
    "/JBIG2Decode": "JBIG2 decoder (CVE-2009-0658 exploit vector)",
    "/RichMedia": "Rich media / Flash content",
    "/Launch": "Launch action (execute external file)",
    "/EmbeddedFile": "Embedded file (potential payload)",
    "/XFA": "XML Forms Architecture (script execution)",
    "/URI": "URI action (external link)",
    "/SubmitForm": "Form submission (data exfiltration)",
    "/ObjStm": "Object Stream (can hide objects from basic parsers)",
}


def scan_pdf_keywords(filepath):
    """Scan a PDF file for suspicious keywords similar to pdfid.

    The file is read as raw bytes and searched as text; streams are not
    decompressed, so keywords hidden inside compressed streams are not seen.

    Args:
        filepath: Path of the PDF file to inspect.

    Returns:
        A ``(keywords, structure)`` tuple. ``keywords`` maps every entry of
        ``PDF_SUSPICIOUS_KEYWORDS`` that occurs at least once to
        ``{"count": int, "description": str}``. ``structure`` maps structural
        token names (``obj``, ``endobj``, ``stream``, ``endstream``, ``xref``,
        ``trailer``, ``startxref``, ``page_count``, ``encrypted``) to counts.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        data = f.read()

    # latin-1 maps each byte to exactly one code point, so decoding can never
    # fail and binary stream content does not disturb the text search.
    text = data.decode("latin-1", errors="replace")

    results = {}
    for keyword, description in PDF_SUSPICIOUS_KEYWORDS.items():
        count = text.count(keyword)
        if count > 0:
            results[keyword] = {"count": count, "description": description}

    # "endstream" contains "stream" and "startxref" contains "xref". A plain
    # substring count of the shorter token therefore also counts every
    # occurrence of the longer one; subtract those to get the real figure.
    endstream_count = text.count("endstream")
    startxref_count = text.count("startxref")

    # Count standard PDF structure elements
    structure = {
        "obj": len(re.findall(r"\d+ \d+ obj", text)),
        "endobj": text.count("endobj"),
        "stream": text.count("stream") - endstream_count,
        "endstream": endstream_count,
        "xref": text.count("xref") - startxref_count,
        "trailer": text.count("trailer"),
        "startxref": startxref_count,
        # Lookahead instead of a consuming "[^s]": a /Page at the very end of
        # the data still counts, and longer names such as /Pages or
        # /PageLabels are not mistaken for page objects.
        "page_count": len(re.findall(r"/Type\s*/Page(?![0-9A-Za-z])", text)),
        "encrypted": 1 if "/Encrypt" in text else 0,
    }
    return results, structure


def extract_pdf_version(filepath):
    """Extract the PDF version from the header.

    Adobe's viewers accept the ``%PDF-x.y`` marker anywhere within the first
    1024 bytes of a file, and malicious documents frequently prepend junk to
    evade naive checks, so that whole window is searched rather than only the
    first few bytes.

    Args:
        filepath: Path of the PDF file to inspect.

    Returns:
        The version string (for example ``"1.7"``), or ``"unknown"`` when no
        header marker is found in the first 1024 bytes.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        header = f.read(1024)
    match = re.search(rb"%PDF-(\d+\.\d+)", header)
    return match.group(1).decode() if match else "unknown"


def find_stream_objects(filepath):
    """Find and extract stream objects from the PDF.

    Each stream is tentatively inflated with zlib (FlateDecode). Streams that
    use another filter, or none, are reported as not decodable and previewed
    from their raw bytes.

    Args:
        filepath: Path of the PDF file to inspect.

    Returns:
        A list with one dict per stream, in file order, with the keys
        ``object`` ("<obj> <gen>"), ``raw_size``, ``decoded_size``,
        ``decodable`` and ``preview`` (first 200 bytes, latin-1 decoded).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        data = f.read()

    streams = []
    # The part between "obj" and "stream" must not cross an "endobj".
    # Otherwise an object without a stream would swallow the following
    # objects and the stream would be reported under the wrong object number.
    pattern = (
        rb"(\d+)\s+(\d+)\s+obj(?:(?!endobj).)*?stream\r?\n(.*?)endstream"
    )
    for match in re.finditer(pattern, data, re.DOTALL):
        obj_num = int(match.group(1))
        gen_num = int(match.group(2))
        stream_data = match.group(3)
        # NOTE: the input is untrusted and the inflate is unbounded; a crafted
        # stream can expand to a very large buffer (decompression bomb).
        try:
            decoded = zlib.decompress(stream_data)
        except zlib.error:
            decoded = None
        # Test against None explicitly: a stream that inflates to zero bytes
        # is still decoded and must not fall back to the raw bytes.
        decodable = decoded is not None
        preview_source = decoded if decodable else stream_data
        streams.append({
            "object": f"{obj_num} {gen_num}",
            "raw_size": len(stream_data),
            "decoded_size": len(decoded) if decodable else 0,
            "decodable": decodable,
            "preview": preview_source[:200].decode("latin-1", errors="replace"),
        })
    return streams


# Tokens that matter while scanning a PDF literal string: a backslash escape
# (which hides the character after it) or a parenthesis.
_PDF_LITERAL_TOKEN = re.compile(r"\\.|[()]", re.DOTALL)


def _find_literal_end(text, open_paren):
    """Return the index of the ')' closing the literal opened at *open_paren*.

    PDF literal strings may contain unescaped balanced parentheses and
    backslash-escaped ones. If the parentheses never balance (malformed
    file), the first ')' after the opening one is used instead; -1 is
    returned when there is no ')' at all.
    """
    depth = 0
    for token in _PDF_LITERAL_TOKEN.finditer(text, open_paren):
        symbol = token.group()
        if symbol == "(":
            depth += 1
        elif symbol == ")":
            depth -= 1
            if depth == 0:
                return token.start()
    return text.find(")", open_paren)


def extract_javascript(filepath):
    """Extract JavaScript code from PDF objects.

    Two encodings of the ``/JS`` value are recognised: literal strings
    (``/JS (...)``) and hexadecimal strings (``/JS <...>``). Scripts stored in
    a separate stream and referenced indirectly (``/JS 5 0 R``) are not
    followed.

    Args:
        filepath: Path of the PDF file to inspect.

    Returns:
        A list of ``{"type": ..., "code": ...}`` dicts, all ``"inline"``
        entries first and then all ``"hex_encoded"`` ones. ``code`` is
        truncated to 500 characters. Inline code is returned as written in
        the file, with PDF backslash escapes left in place.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        data = f.read()
    text = data.decode("latin-1", errors="replace")

    js_blocks = []

    # JavaScript given as a literal string. Scripts nearly always contain
    # parentheses of their own, so the closing ')' is found by balancing
    # rather than by stopping at the first one.
    opener = re.compile(r"/JS\s*\(")
    position = 0
    while True:
        match = opener.search(text, position)
        if match is None:
            break
        open_paren = match.end() - 1
        close_paren = _find_literal_end(text, open_paren)
        if close_paren == -1:
            # No ')' anywhere after this point, so no later literal can
            # terminate either.
            break
        code = text[open_paren + 1:close_paren]
        js_blocks.append({"type": "inline", "code": code[:500]})
        position = close_paren + 1

    # JavaScript given as a hexadecimal string. PDF permits white space
    # between the digits and treats a missing final digit as 0.
    hex_pattern = re.compile(r"/JS\s*<([0-9A-Fa-f \t\r\n\f\x00]+)>")
    for match in hex_pattern.finditer(text):
        digits = re.sub(r"[^0-9A-Fa-f]", "", match.group(1))
        if not digits:
            continue
        if len(digits) % 2:
            digits += "0"
        raw = bytes.fromhex(digits)
        if raw.startswith(b"\xfe\xff"):
            # A leading FE FF byte-order mark denotes a UTF-16BE text string.
            decoded = raw[2:].decode("utf-16-be", errors="replace")
        else:
            decoded = raw.decode("utf-8", errors="replace")
        js_blocks.append({"type": "hex_encoded", "code": decoded[:500]})
    return js_blocks


def extract_urls(filepath):
    """Extract URLs from the PDF content.

    Only the raw file bytes are searched; URLs inside compressed streams are
    not seen.

    Args:
        filepath: Path of the PDF file to inspect.

    Returns:
        A list of the distinct ``http://`` / ``https://`` URLs, in order of
        first appearance in the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        data = f.read()
    text = data.decode("latin-1", errors="replace")
    matches = re.findall(r"https?://[^\s<>\"')\]]+", text)
    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
    # yield a hash-dependent order that changes between runs, so any caller
    # showing only the first few URLs would show a different subset each time.
    return list(dict.fromkeys(matches))


def detect_exploits(keywords, streams):
    """Derive exploit indicators from keyword hits and stream previews.

    Args:
        keywords: Container of keyword names found in the document; only
            membership of ``"/JBIG2Decode"`` is checked.
        streams: Iterable of stream dicts; the optional ``"preview"`` text of
            each is searched case-insensitively.

    Returns:
        A list of ``{"cve", "description", "confidence"}`` dicts: the JBIG2
        indicator first (if present), then one entry per matching signature
        per stream, in stream order.
    """
    findings = []
    if "/JBIG2Decode" in keywords:
        findings.append(dict(
            cve="CVE-2009-0658",
            description="JBIG2 decoder vulnerability in Adobe Reader",
            confidence="MEDIUM",
        ))

    # Each signature: the substrings that trigger it (any one suffices) and
    # the indicator recorded on a hit. Order here is the reporting order.
    stream_signatures = (
        (
            ("shellcode", "\\x90\\x90"),
            {
                "cve": "Generic shellcode",
                "description": "Potential shellcode detected in stream",
                "confidence": "HIGH",
            },
        ),
        (
            ("util.printf", "collab.geticon"),
            {
                "cve": "CVE-2008-2992 / CVE-2009-0927",
                "description": "Known Adobe Reader JavaScript exploits",
                "confidence": "HIGH",
            },
        ),
    )

    for entry in streams:
        lowered = entry.get("preview", "").lower()
        for needles, indicator in stream_signatures:
            if any(needle in lowered for needle in needles):
                # Copy so every reported finding is an independent dict.
                findings.append(dict(indicator))
    return findings


def calculate_risk_score(keywords, structure, exploits, js_blocks):
    """Combine the individual findings into a single 0-100 risk score.

    Args:
        keywords: Container of keyword names found in the document.
        structure: Mapping of structural counters; a truthy ``"encrypted"``
            entry adds to the score.
        exploits: Sequence of exploit indicators (20 points each).
        js_blocks: Sequence of extracted JavaScript blocks (10 points each).

    Returns:
        The summed score, capped at 100.
    """
    # Each group scores once if any of its keywords is present.
    weighted_groups = (
        (("/JS", "/JavaScript"), 30),
        (("/OpenAction", "/AA"), 20),
        (("/Launch",), 25),
        (("/EmbeddedFile",), 15),
        (("/JBIG2Decode",), 20),
    )
    total = 0
    for names, weight in weighted_groups:
        if any(name in keywords for name in names):
            total += weight

    if structure.get("encrypted"):
        total += 10

    total += 20 * len(exploits) + 10 * len(js_blocks)
    return min(total, 100)


def generate_report(filepath, keywords, structure, streams, js_blocks,
                    urls, exploits, risk_score):
    """Assemble the analysis results into one report dictionary.

    The file is re-read here to compute its SHA-256 digest, size and PDF
    version; everything else is taken from the arguments. Streams,
    JavaScript blocks and URLs are reported as counts only.

    Returns:
        A dict with the keys ``file``, ``sha256``, ``size``, ``pdf_version``,
        ``structure``, ``suspicious_keywords``, ``streams``,
        ``javascript_blocks``, ``urls_found``, ``exploit_indicators``,
        ``risk_score`` and ``risk_level`` ("HIGH" from 60, "MEDIUM" from 30,
        otherwise "LOW").
    """
    report = {"file": filepath}
    report["sha256"] = compute_hash(filepath)
    report["size"] = os.path.getsize(filepath)
    report["pdf_version"] = extract_pdf_version(filepath)
    report["structure"] = structure
    report["suspicious_keywords"] = keywords
    report["streams"] = len(streams)
    report["javascript_blocks"] = len(js_blocks)
    report["urls_found"] = len(urls)
    report["exploit_indicators"] = exploits
    report["risk_score"] = risk_score

    # Bucket the numeric score into a coarse label.
    if risk_score >= 60:
        level = "HIGH"
    elif risk_score >= 30:
        level = "MEDIUM"
    else:
        level = "LOW"
    report["risk_level"] = level
    return report


if __name__ == "__main__":
    # Command-line driver: print a banner, then run every analysis stage on
    # the file named by the first argument and print each stage's findings.
    banner = "=" * 60
    print(banner)
    print("PDF Malware Analysis Agent")
    print("Keyword scanning, JavaScript extraction, exploit detection")
    print(banner)

    cli_args = sys.argv[1:]
    target = cli_args[0] if cli_args else None

    if not target or not os.path.exists(target):
        # No argument, or the path does not exist: show usage only.
        print(f"\n[DEMO] Usage: python agent.py <document.pdf>")
    else:
        print(f"\n[*] Analyzing: {target}")
        print(f"[*] SHA-256: {compute_hash(target)}")
        print(f"[*] PDF version: {extract_pdf_version(target)}")

        print("\n--- Suspicious Keywords (pdfid-style) ---")
        keywords, structure = scan_pdf_keywords(target)
        for name, hit in keywords.items():
            print(f"  [!] {name}: {hit['count']}x - {hit['description']}")

        print(f"\n--- Structure ---")
        for field, value in structure.items():
            print(f"  {field}: {value}")

        print("\n--- Stream Objects ---")
        streams = find_stream_objects(target)
        print(f"  Found: {len(streams)} streams")

        print("\n--- JavaScript Extraction ---")
        js = extract_javascript(target)
        for block in js:
            print(f"  [{block['type']}] {block['code'][:100]}...")

        print("\n--- URLs ---")
        urls = extract_urls(target)
        # Only the first ten URLs are listed to keep the output short.
        for url in urls[:10]:
            print(f"  {url}")

        print("\n--- Exploit Detection ---")
        exploits = detect_exploits(keywords, streams)
        for finding in exploits:
            print(f"  [{finding['confidence']}] {finding['cve']}: {finding['description']}")

        risk = calculate_risk_score(keywords, structure, exploits, js)
        print(f"\n[*] Risk Score: {risk}/100")