Files
Anthropic-Cybersecurity-Skills/skills/analyzing-pdf-malware-with-pdfid/scripts/agent.py
T
mukul975 c47eed6a64 Production hardening: security fixes, code quality, 724 skills complete
- Fix 25 shell=True subprocess calls with list-based commands
- Fix 49 verify=False in defensive skills (env-var override)
- Add timeout to 231 HTTP/subprocess/socket calls
- Fix 6 SQL injection patterns with whitelist validation
- Replace 8 __import__() with standard imports
- Remove 701 unused imports across 442 files
- Add authorized-testing disclaimers to all offensive skills
- Complete 11 incomplete skill directories
- Expand 10 stub SKILL.md files with full content
- Fix 2 YAML parse errors in frontmatter
- Fix 5 pre-existing syntax errors
- Convert 22 hardcoded paths/ports to environment variables
- Back up 21 redundant skill pairs to .bak
- Fix 2 global declaration errors
- 724/724 skills with full folder anatomy (SKILL.md + agent.py + api-reference.md + LICENSE)
- 0 compile errors across all 724 agent.py files
2026-03-19 13:26:49 +01:00

241 lines
8.1 KiB
Python

#!/usr/bin/env python3
"""PDF malware analysis agent using pdfid concepts and pdf-parser for object extraction."""
import re
import os
import sys
import hashlib
import zlib
def compute_hash(filepath):
    """Return the hex-encoded SHA-256 digest of the file at *filepath*.

    The file is consumed in 64 KiB chunks, so the whole sample is never
    held in memory at once.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as handle:
        while True:
            block = handle.read(65536)
            if not block:
                # An empty read marks end of file.
                break
            digest.update(block)
    return digest.hexdigest()
# PDF name tokens worth flagging, each mapped to a short human-readable
# explanation. Insertion order is the order in which hits are reported.
PDF_SUSPICIOUS_KEYWORDS = {
    # Script execution and automatic triggers
    "/JS": "JavaScript (embedded script execution)",
    "/JavaScript": "JavaScript action",
    "/AA": "Additional Actions (auto-execute triggers)",
    "/OpenAction": "Action on document open",
    "/AcroForm": "Interactive form (can contain JavaScript)",
    # Decoders, media and payload carriers
    "/JBIG2Decode": "JBIG2 decoder (CVE-2009-0658 exploit vector)",
    "/RichMedia": "Rich media / Flash content",
    "/Launch": "Launch action (execute external file)",
    "/EmbeddedFile": "Embedded file (potential payload)",
    "/XFA": "XML Forms Architecture (script execution)",
    # External communication
    "/URI": "URI action (external link)",
    "/SubmitForm": "Form submission (data exfiltration)",
    # Structure that can conceal other objects
    "/ObjStm": "Object Stream (can hide objects from basic parsers)",
}
def scan_pdf_keywords(filepath):
    """Scan a PDF file for suspicious keywords, similar to pdfid.

    Args:
        filepath: Path of the PDF file to inspect.

    Returns:
        A ``(results, structure)`` tuple.

        ``results`` maps every entry of ``PDF_SUSPICIOUS_KEYWORDS`` that
        occurs at least once in the raw file to
        ``{"count": int, "description": str}``. Matching is by substring,
        so a keyword is also counted when it is the prefix of a longer
        name.

        ``structure`` maps the names ``obj``, ``endobj``, ``stream``,
        ``endstream``, ``xref``, ``trailer``, ``startxref``,
        ``page_count`` and ``encrypted`` to integer counts
        (``encrypted`` is 1 or 0).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        data = f.read()
    # latin-1 maps every byte to exactly one character, so decoding can
    # never fail and offsets stay aligned with the raw bytes.
    text = data.decode("latin-1")

    results = {}
    for keyword, description in PDF_SUSPICIOUS_KEYWORDS.items():
        count = text.count(keyword)
        if count > 0:
            results[keyword] = {"count": count, "description": description}

    # "endstream" contains "stream" and "startxref" contains "xref", so the
    # plain substring counts must have the longer markers subtracted to
    # avoid counting each of them twice.
    endstream_count = text.count("endstream")
    startxref_count = text.count("startxref")

    structure = {
        # Any whitespace may separate the object number, generation number
        # and the "obj" keyword, not only single spaces.
        "obj": len(re.findall(r"\d+\s+\d+\s+obj", text)),
        "endobj": text.count("endobj"),
        "stream": text.count("stream") - endstream_count,
        "endstream": endstream_count,
        "xref": text.count("xref") - startxref_count,
        "trailer": text.count("trailer"),
        "startxref": startxref_count,
        # The lookahead rejects longer names such as /Pages while still
        # matching a /Page that is the very last token in the file.
        "page_count": len(re.findall(r"/Type\s*/Page(?![A-Za-z])", text)),
        "encrypted": 1 if "/Encrypt" in text else 0,
    }
    return results, structure
def extract_pdf_version(filepath):
    """Extract the PDF version from the ``%PDF-x.y`` header.

    The header is searched for within the first 1024 bytes rather than
    only at offset 0: Acrobat viewers accept a header anywhere in that
    window, and malicious samples commonly prepend junk to evade naive
    parsers.

    Args:
        filepath: Path of the PDF file to inspect.

    Returns:
        The version string (for example ``"1.7"``), or ``"unknown"`` when
        no header is found in the first 1024 bytes.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        header = f.read(1024)
    match = re.search(rb"%PDF-(\d+\.\d+)", header)
    return match.group(1).decode() if match else "unknown"
def find_stream_objects(filepath):
    """Find and extract stream objects from the PDF.

    Args:
        filepath: Path of the PDF file to inspect.

    Returns:
        A list with one dict per stream found, in file order. Each dict
        holds:

        * ``object``: ``"<object number> <generation number>"``.
        * ``raw_size``: number of bytes between ``stream`` and
          ``endstream``.
        * ``decoded_size``: size after zlib decompression, or 0 when the
          data could not be decompressed.
        * ``decodable``: whether zlib decompression succeeded.
        * ``preview``: first 200 bytes of the decompressed data (or of the
          raw data when nothing was decoded), as a latin-1 string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        data = f.read()

    # The tempered ``(?:(?!endobj).)*?`` keeps the search for "stream"
    # inside the object that was just opened. A plain ``.*?`` would run
    # past "endobj" and attribute a later object's stream to an earlier,
    # stream-less object.
    pattern = re.compile(
        rb"(\d+)\s+(\d+)\s+obj(?:(?!endobj).)*?stream\r?\n(.*?)endstream",
        re.DOTALL,
    )

    streams = []
    for match in pattern.finditer(data):
        obj_num = int(match.group(1))
        gen_num = int(match.group(2))
        stream_data = match.group(3)
        try:
            decoded = zlib.decompress(stream_data)
        except zlib.error:
            # Best effort: the data is not zlib-compressed or is corrupt.
            decoded = None
        # An empty decode result falls back to the raw bytes so the
        # preview still shows something.
        preview_source = decoded if decoded else stream_data
        streams.append({
            "object": f"{obj_num} {gen_num}",
            "raw_size": len(stream_data),
            "decoded_size": len(decoded) if decoded else 0,
            "decodable": decoded is not None,
            "preview": preview_source[:200].decode("latin-1", errors="replace"),
        })
    return streams
# Characters that matter while scanning a PDF literal string: the escape
# introducer and the two parenthesis delimiters.
_PDF_STRING_DELIMS = re.compile(r"[\\()]")


def _read_pdf_literal_string(text, start):
    """Return ``(body, end)`` for the PDF literal string opening at *start*.

    ``text[start]`` must be the opening ``(``. Balanced, unescaped
    parentheses are part of the string and a backslash escapes the next
    character, so the string ends at the ``)`` that closes the opening
    one. ``end`` is the index just past that ``)``. An unterminated
    string yields the rest of *text* and ``len(text)``.
    """
    depth = 0
    pos = start
    while True:
        hit = _PDF_STRING_DELIMS.search(text, pos)
        if hit is None:
            return text[start + 1:], len(text)
        token = hit.group()
        if token == "\\":
            # Skip the escaped character so "\(" and "\)" are not counted.
            pos = hit.end() + 1
            continue
        depth += 1 if token == "(" else -1
        if depth == 0:
            return text[start + 1:hit.start()], hit.end()
        pos = hit.end()


def extract_javascript(filepath):
    """Extract JavaScript code from ``/JS`` entries in the raw PDF.

    Two encodings of the ``/JS`` value are recognised:

    * Literal strings ``/JS (...)``. Nested and escaped parentheses are
      honoured, so code such as ``app.alert(1)`` is not cut off at its
      first ``)``. Escape sequences are kept verbatim.
    * Hex strings ``/JS <...>``. Whitespace between digits is ignored, an
      odd number of digits is padded with a trailing ``0``, and data that
      starts with the UTF-16BE byte order mark is decoded as UTF-16BE
      (otherwise UTF-8 with replacement characters).

    Args:
        filepath: Path of the PDF file to inspect.

    Returns:
        A list of ``{"type": "inline" | "hex_encoded", "code": str}``
        dicts, all inline entries first, then all hex entries. ``code``
        is truncated to 500 characters.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        data = f.read()
    text = data.decode("latin-1")
    js_blocks = []

    # Literal strings: locate each opening parenthesis, then scan for the
    # matching close instead of stopping at the first ")".
    inline_pattern = re.compile(r"/JS\s*\(")
    pos = 0
    while True:
        opener = inline_pattern.search(text, pos)
        if opener is None:
            break
        code, pos = _read_pdf_literal_string(text, opener.end() - 1)
        js_blocks.append({"type": "inline", "code": code[:500]})

    # Hex strings: PDF whitespace is allowed between the digits.
    hex_pattern = re.compile(r"/JS\s*<([0-9A-Fa-f\x00\t\n\f\r ]*)>")
    for match in hex_pattern.finditer(text):
        digits = re.sub(r"[^0-9A-Fa-f]", "", match.group(1))
        if not digits:
            continue
        if len(digits) % 2:
            # A missing final digit is treated as 0.
            digits += "0"
        raw = bytes.fromhex(digits)
        if raw.startswith(b"\xfe\xff"):
            decoded = raw[2:].decode("utf-16-be", errors="replace")
        else:
            decoded = raw.decode("utf-8", errors="replace")
        js_blocks.append({"type": "hex_encoded", "code": decoded[:500]})
    return js_blocks
def extract_urls(filepath):
    """Extract the distinct http/https URLs found in the raw PDF content.

    Args:
        filepath: Path of the PDF file to inspect.

    Returns:
        A list of unique URL strings in order of first appearance in the
        file. The order is deterministic, so callers that show only the
        first few entries get the same output on every run.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        data = f.read()
    text = data.decode("latin-1")
    matches = re.findall(r"https?://[^\s<>\"')\]]+", text)
    # dict.fromkeys removes duplicates while keeping first-seen order; a
    # set would make the order vary from run to run.
    return list(dict.fromkeys(matches))
def detect_exploits(keywords, streams):
    """Return indicators of known PDF exploits.

    Args:
        keywords: Container of keyword names that were found; only
            membership of ``/JBIG2Decode`` is tested.
        streams: Iterable of stream dicts; the optional ``preview`` string
            of each one is searched case-insensitively.

    Returns:
        A list of ``{"cve", "description", "confidence"}`` dicts: at most
        one JBIG2 entry, followed by up to two entries per stream.
    """
    shellcode_markers = ("shellcode", "\\x90\\x90")
    reader_js_markers = ("util.printf", "collab.geticon")

    findings = []
    if "/JBIG2Decode" in keywords:
        findings.append({
            "cve": "CVE-2009-0658",
            "description": "JBIG2 decoder vulnerability in Adobe Reader",
            "confidence": "MEDIUM",
        })

    for entry in streams:
        # Lower-case once so every marker comparison is case-insensitive.
        snippet = entry.get("preview", "").lower()
        if any(marker in snippet for marker in shellcode_markers):
            findings.append({
                "cve": "Generic shellcode",
                "description": "Potential shellcode detected in stream",
                "confidence": "HIGH",
            })
        if any(marker in snippet for marker in reader_js_markers):
            findings.append({
                "cve": "CVE-2008-2992 / CVE-2009-0927",
                "description": "Known Adobe Reader JavaScript exploits",
                "confidence": "HIGH",
            })
    return findings
def calculate_risk_score(keywords, structure, exploits, js_blocks):
    """Calculate a 0-100 risk score for the PDF.

    Points are added for groups of suspicious keywords, for encryption,
    for each exploit indicator (20 each) and for each extracted JavaScript
    block (10 each). The total is capped at 100.
    """
    # Each row: keywords that trigger the weight, and the weight itself.
    # A row scores once, however many of its keywords are present.
    keyword_weights = (
        (("/JS", "/JavaScript"), 30),
        (("/OpenAction", "/AA"), 20),
        (("/Launch",), 25),
        (("/EmbeddedFile",), 15),
        (("/JBIG2Decode",), 20),
    )
    total = sum(
        weight
        for names, weight in keyword_weights
        if any(name in keywords for name in names)
    )
    if structure.get("encrypted"):
        total += 10
    total += 20 * len(exploits) + 10 * len(js_blocks)
    return min(total, 100)
def generate_report(filepath, keywords, structure, streams, js_blocks,
                    urls, exploits, risk_score):
    """Assemble the PDF malware analysis report as a dict.

    The hash, size and PDF version are read from *filepath*; *streams*,
    *js_blocks* and *urls* are reported as counts only. ``risk_level`` is
    ``HIGH`` from a score of 60, ``MEDIUM`` from 30, otherwise ``LOW``.
    """
    report = {"file": filepath}
    report["sha256"] = compute_hash(filepath)
    report["size"] = os.path.getsize(filepath)
    report["pdf_version"] = extract_pdf_version(filepath)
    report["structure"] = structure
    report["suspicious_keywords"] = keywords
    report["streams"] = len(streams)
    report["javascript_blocks"] = len(js_blocks)
    report["urls_found"] = len(urls)
    report["exploit_indicators"] = exploits
    report["risk_score"] = risk_score

    if risk_score >= 60:
        level = "HIGH"
    elif risk_score >= 30:
        level = "MEDIUM"
    else:
        level = "LOW"
    report["risk_level"] = level
    return report
# Command-line entry point: analyse the PDF named by the first argument and
# print every finding, or print the usage line when no existing file is given.
if __name__ == "__main__":
    rule = "=" * 60
    print(rule)
    print("PDF Malware Analysis Agent")
    print("Keyword scanning, JavaScript extraction, exploit detection")
    print(rule)

    target = sys.argv[1] if len(sys.argv) > 1 else None
    if not target or not os.path.exists(target):
        print(f"\n[DEMO] Usage: python agent.py <document.pdf>")
    else:
        print(f"\n[*] Analyzing: {target}")
        print(f"[*] SHA-256: {compute_hash(target)}")
        print(f"[*] PDF version: {extract_pdf_version(target)}")

        print("\n--- Suspicious Keywords (pdfid-style) ---")
        keywords, structure = scan_pdf_keywords(target)
        for name, details in keywords.items():
            print(f" [!] {name}: {details['count']}x - {details['description']}")

        print(f"\n--- Structure ---")
        for field, total in structure.items():
            print(f" {field}: {total}")

        print("\n--- Stream Objects ---")
        streams = find_stream_objects(target)
        print(f" Found: {len(streams)} streams")

        print("\n--- JavaScript Extraction ---")
        js = extract_javascript(target)
        for block in js:
            print(f" [{block['type']}] {block['code'][:100]}...")

        print("\n--- URLs ---")
        urls = extract_urls(target)
        # Only the first ten URLs are shown.
        for link in urls[:10]:
            print(f" {link}")

        print("\n--- Exploit Detection ---")
        exploits = detect_exploits(keywords, streams)
        for finding in exploits:
            print(f" [{finding['confidence']}] {finding['cve']}: {finding['description']}")

        risk = calculate_risk_score(keywords, structure, exploits, js)
        print(f"\n[*] Risk Score: {risk}/100")