Files
Anthropic-Cybersecurity-Skills/skills/analyzing-malicious-pdf-with-peepdf/scripts/agent.py
T
mukul975 c47eed6a64 Production hardening: security fixes, code quality, 724 skills complete
- Fix 25 shell=True subprocess calls with list-based commands
- Fix 49 verify=False in defensive skills (env-var override)
- Add timeout to 231 HTTP/subprocess/socket calls
- Fix 6 SQL injection patterns with whitelist validation
- Replace 8 __import__() with standard imports
- Remove 701 unused imports across 442 files
- Add authorized-testing disclaimers to all offensive skills
- Complete 11 incomplete skill directories
- Expand 10 stub SKILL.md files with full content
- Fix 2 YAML parse errors in frontmatter
- Fix 5 pre-existing syntax errors
- Convert 22 hardcoded paths/ports to environment variables
- Back up 21 redundant skill pairs to .bak
- Fix 2 global declaration errors
- 724/724 skills with full folder anatomy (SKILL.md + agent.py + api-reference.md + LICENSE)
- 0 compile errors across all 724 agent.py files
2026-03-19 13:26:49 +01:00

220 lines
8.2 KiB
Python

#!/usr/bin/env python3
"""Malicious PDF Analysis Agent - static analysis using peepdf, pdfid, and pdf-parser for threat detection."""
import json
import argparse
import logging
import subprocess
import hashlib
import os
import re
from datetime import datetime
# Configure root logging once for the script: INFO and above, with a timestamp
# and the level name in front of every message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)
# PDF name tokens that run_pdfid() searches for in pdfid's output; every token
# found is recorded together with its reported count.
SUSPICIOUS_KEYWORDS = [
"/JS", "/JavaScript", "/OpenAction", "/AA", "/Launch", "/EmbeddedFile",
"/RichMedia", "/XFA", "/AcroForm", "/JBIG2Decode", "/URI", "/SubmitForm",
"/ImportData", "/Names", "/ObjStm",
]
# Subset of SUSPICIOUS_KEYWORDS that calculate_risk_score() weights at 20 points
# per occurrence; every other keyword contributes 5 points per occurrence.
HIGH_RISK_KEYWORDS = ["/JS", "/JavaScript", "/OpenAction", "/Launch", "/EmbeddedFile", "/XFA"]
def compute_hashes(filepath):
    """Return the MD5 and SHA-256 hex digests of the file at *filepath*.

    The file is read in 64 KiB pieces so both digests are produced in a single
    pass without holding the whole file in memory.

    Returns:
        A dict with the keys "md5" and "sha256", each mapping to a hex string.
    """
    hashers = {"md5": hashlib.md5(), "sha256": hashlib.sha256()}
    with open(filepath, "rb") as handle:
        while True:
            block = handle.read(65536)
            if not block:
                break
            for hasher in hashers.values():
                hasher.update(block)
    return {name: hasher.hexdigest() for name, hasher in hashers.items()}
# One pdfid result row: a PDF name, the total count and, when some occurrences
# were hex-obfuscated, that sub-count in parentheses (e.g. "/JavaScript 2(1)").
_PDFID_COUNT_LINE = re.compile(r"^(/\S+)\s+(\d+)(?:\((\d+)\))?$")


def _parse_pdfid_output(stdout):
    """Map each suspicious keyword present in pdfid's *stdout* to its count."""
    counts = {}
    for raw_line in stdout.splitlines():
        row = _PDFID_COUNT_LINE.match(raw_line.strip())
        if row is None:
            continue
        name = row.group(1).lower()
        for kw in SUSPICIOUS_KEYWORDS:
            # Compare the whole name, not a substring, so a longer name can
            # never be attributed to a shorter keyword.
            if kw.lower() == name:
                counts[kw] = int(row.group(2))
    return counts


def run_pdfid(filepath):
    """Run pdfid to triage a PDF for suspicious keywords.

    ``python3 -m pdfid`` is tried first and the standalone ``pdfid.py`` script
    second. A launcher that is missing, cannot be started or times out is
    logged and skipped instead of aborting the analysis.

    Args:
        filepath: Path of the PDF to scan.

    Returns:
        A dict mapping each entry of SUSPICIOUS_KEYWORDS that pdfid reported to
        its total occurrence count (the parenthesised obfuscated sub-count is
        not included). Empty when no pdfid variant produced usable output.
    """
    log = logging.getLogger(__name__)
    candidates = (
        ["python3", "-m", "pdfid", filepath],
        ["pdfid.py", filepath],
    )
    stdout = ""
    for cmd in candidates:
        try:
            result = subprocess.run(
                cmd, capture_output=True, text=True, errors="replace", timeout=120
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            log.warning("pdfid launcher %s failed: %s", cmd[0], exc)
            continue
        stdout = result.stdout
        if result.returncode == 0:
            break
    return _parse_pdfid_output(stdout)
# ANSI colour sequences that may surround labels in peepdf's text output.
_PEEPDF_ANSI = re.compile(r"\x1b\[[0-9;]*m")
# The bracketed object-ID list that ends most peepdf stat lines: "[1, 2, 3]".
_PEEPDF_ID_LIST = re.compile(r"\[([^\]]*)\]")


def _peepdf_object_ids(line):
    """Return the object IDs on *line*, preferring its bracketed ``[..]`` list."""
    listed = _PEEPDF_ID_LIST.search(line)
    # Reading only the bracketed list keeps the "(count)" prefix of lines such
    # as "/JS (1): [4]" from being reported as an object ID. Lines without a
    # list fall back to every number on the line, as the parser did before.
    return re.findall(r"\d+", listed.group(1) if listed else line)


def _parse_peepdf_output(stdout):
    """Turn peepdf's textual statistics into the analysis dict."""
    analysis = {
        "versions": 0,
        "objects": 0,
        "streams": 0,
        "encoded_streams": 0,
        "suspicious_objects": [],
        "js_objects": [],
        "vulns": [],
        "urls": [],
        "raw_output": stdout[:2000],
    }
    for raw_line in stdout.splitlines():
        line = _PEEPDF_ANSI.sub("", raw_line).strip()
        if not line:
            continue
        # Rule retained from the earlier implementation: a single line that
        # mentions both words carries the object total as its last number.
        if "Version" in line and "Objects" in line:
            nums = re.findall(r"\d+", line)
            if nums:
                analysis["objects"] = int(nums[-1])
        # NOTE(review): the three rules below assume peepdf's usual layout
        # ("Objects: 13", "Streams: 3", "Version 0:", "Encoded (2): [...]");
        # confirm against the peepdf version deployed.
        total = re.match(r"(Objects|Streams)\s*:\s*(\d+)$", line)
        if total:
            analysis[total.group(1).lower()] = int(total.group(2))
        if re.match(r"Version\s+\d+\s*:", line):
            analysis["versions"] += 1
        encoded = re.match(r"Encoded\s*\((\d+)\)", line)
        if encoded:
            analysis["encoded_streams"] += int(encoded.group(1))
        if "Suspicious" in line or "suspicious" in line:
            analysis["suspicious_objects"].append(line)
        if "/JS" in line or "/JavaScript" in line or "Objects with JS code" in line:
            analysis["js_objects"].extend(_peepdf_object_ids(line))
        analysis["vulns"].extend(
            cve.upper() for cve in re.findall(r"CVE-\d{4}-\d{4,}", line, re.IGNORECASE)
        )
        analysis["urls"].extend(re.findall(r"https?://[^\s\"'<>]+", line))
    # The same object is normally listed under several labels (/JS and
    # /JavaScript); keep each ID and CVE once, in first-seen order, so the
    # risk score and the extraction loop are not inflated by repeats.
    analysis["js_objects"] = list(dict.fromkeys(analysis["js_objects"]))
    analysis["vulns"] = list(dict.fromkeys(analysis["vulns"]))
    return analysis


def run_peepdf_analysis(filepath):
    """Run peepdf for detailed PDF object analysis.

    The ``peepdf`` executable is tried first and ``python3 -m peepdf`` second.
    A launcher that is missing, cannot be started or times out is logged and
    skipped, so the fallback is actually reached when ``peepdf`` is not on PATH.

    Args:
        filepath: Path of the PDF to analyse.

    Returns:
        A dict with the keys "versions", "objects", "streams",
        "encoded_streams" (ints), "suspicious_objects", "js_objects", "vulns",
        "urls" (lists of strings) and "raw_output" (first 2000 characters of
        peepdf's stdout). All counters are 0 and all lists empty when peepdf
        produced no output.
    """
    log = logging.getLogger(__name__)
    candidates = (
        ["peepdf", "-f", "-l", filepath],
        ["python3", "-m", "peepdf", "-f", "-l", filepath],
    )
    stdout = ""
    for cmd in candidates:
        try:
            result = subprocess.run(
                cmd, capture_output=True, text=True, errors="replace", timeout=120
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            log.warning("peepdf launcher %s failed: %s", cmd[0], exc)
            continue
        stdout = result.stdout
        if result.returncode == 0:
            break
    return _parse_peepdf_output(stdout)
def run_pdf_parser(filepath, object_id=None):
    """Run pdf-parser.py and return the start of its output.

    Args:
        filepath: Path of the PDF to inspect.
        object_id: When truthy, dump that object with ``-o <id> -f -d``;
            otherwise (None or 0) request whole-file statistics with
            ``--stats``.

    Returns:
        The first 3000 characters of pdf-parser.py's stdout, or an empty string
        when the tool cannot be started or does not finish within 120 seconds.
    """
    if object_id:
        cmd = ["pdf-parser.py", "-o", str(object_id), "-f", "-d", filepath]
    else:
        cmd = ["pdf-parser.py", "--stats", filepath]
    try:
        # errors="replace": object dumps can contain bytes that are not valid
        # in the locale encoding; strict decoding would raise on them.
        result = subprocess.run(
            cmd, capture_output=True, text=True, errors="replace", timeout=120
        )
    except (OSError, subprocess.TimeoutExpired) as exc:
        logging.getLogger(__name__).warning("pdf-parser.py failed for %s: %s", filepath, exc)
        return ""
    return result.stdout[:3000]
def extract_javascript(filepath, peepdf_analysis):
    """Extract JavaScript content from the objects peepdf identified.

    Each distinct ID in ``peepdf_analysis["js_objects"]`` is dumped once with
    ``pdf-parser.py -o <id> -f -w``.

    Args:
        filepath: Path of the PDF to extract from.
        peepdf_analysis: Analysis dict; only its "js_objects" list is read.

    Returns:
        A list with one dict per object that produced output, holding
        "object_id", "content_preview" (first 1000 characters) and "length"
        (size of the full output). Empty when there are no JS objects or
        pdf-parser.py cannot be started.
    """
    log = logging.getLogger(__name__)
    js_content = []
    # dict.fromkeys drops repeated IDs while keeping first-seen order, so an
    # object listed under several labels is only extracted once.
    for obj_id in dict.fromkeys(peepdf_analysis.get("js_objects") or []):
        cmd = ["pdf-parser.py", "-o", str(obj_id), "-f", "-w", filepath]
        try:
            # errors="replace": -w emits raw stream data, which is not
            # guaranteed to be decodable text.
            result = subprocess.run(
                cmd, capture_output=True, text=True, errors="replace", timeout=120
            )
        except subprocess.TimeoutExpired:
            log.warning("pdf-parser.py timed out on object %s; skipping it", obj_id)
            continue
        except OSError as exc:
            # The tool itself cannot be launched, so no later object would
            # succeed either.
            log.warning("pdf-parser.py could not be started: %s", exc)
            break
        if result.stdout:
            js_content.append({
                "object_id": obj_id,
                "content_preview": result.stdout[:1000],
                "length": len(result.stdout),
            })
    return js_content
# Indicator name -> compiled, case-insensitive pattern. Compiled once at import
# time instead of on every call.
_SHELLCODE_PATTERNS = {
    name: re.compile(pattern, re.IGNORECASE)
    for name, pattern in {
        "heap_spray": r"(%u[0-9a-fA-F]{4}){4,}",
        "nop_sled": r"(\\x90){8,}|(%u9090){4,}",
        "unescape_chain": r"unescape\s*\(",
        # \b keeps identifiers that merely end in "sc" (desc, misc, ...) from
        # being reported as a shellcode variable assignment.
        "shellcode_var": r"shellcode|payload|\bsc\s*=\s*[\"']",
        "fromcharcode": r"String\.fromCharCode",
        "eval_call": r"eval\s*\(",
        "activex": r"new\s+ActiveXObject",
    }.items()
}


def detect_shellcode_patterns(content):
    """Detect common shellcode patterns in extracted content.

    Args:
        content: Text to scan (for example extracted JavaScript). None or an
            empty string yields no detections.

    Returns:
        A dict mapping the name of every indicator that matched at least once
        to its number of non-overlapping matches.
    """
    if not content:
        return {}
    detected = {}
    for name, regex in _SHELLCODE_PATTERNS.items():
        hits = sum(1 for _ in regex.finditer(content))
        if hits:
            detected[name] = hits
    return detected
def calculate_risk_score(pdfid_results, peepdf_analysis, shellcode_patterns):
    """Combine all findings into a capped score and a risk level.

    Points: 20 per occurrence of a keyword in HIGH_RISK_KEYWORDS, 5 per
    occurrence of any other keyword, 30 per CVE, 15 per JavaScript object and
    10 per shellcode pattern hit. The level is chosen from the uncapped total
    (>= 80 critical, >= 50 high, >= 20 medium, else low); the returned score
    is capped at 100.

    Returns:
        A dict with the keys "score" and "risk_level".
    """
    keyword_points = sum(
        count * (20 if kw in HIGH_RISK_KEYWORDS else 5)
        for kw, count in pdfid_results.items()
        if count > 0
    )
    total = (
        keyword_points
        + len(peepdf_analysis.get("vulns", [])) * 30
        + len(peepdf_analysis.get("js_objects", [])) * 15
        + sum(shellcode_patterns.values()) * 10
    )
    if total >= 80:
        level = "critical"
    elif total >= 50:
        level = "high"
    elif total >= 20:
        level = "medium"
    else:
        level = "low"
    return {"score": min(total, 100), "risk_level": level}
def generate_report(filepath, hashes, pdfid_results, peepdf_analysis, js_content, shellcode, risk):
    """Generate the PDF malware analysis report as a JSON-serialisable dict.

    Args:
        filepath: Path of the analysed PDF; its base name and size on disk are
            recorded.
        hashes: Dict of digests; must contain "sha256".
        pdfid_results: Keyword -> count mapping from pdfid.
        peepdf_analysis: Analysis dict; "objects", "js_objects", "vulns" and
            "urls" are read, each with an empty default.
        js_content: Extracted JavaScript entries; only the first five are kept.
        shellcode: Shellcode indicator -> hit count mapping.
        risk: Risk assessment dict, embedded unchanged.

    Returns:
        The report dict. URLs are de-duplicated and sorted so that repeated
        runs over the same input produce the same report.

    Raises:
        OSError: If *filepath* cannot be stat'ed.
    """
    # Local import: the module only imports `datetime` from the datetime package.
    from datetime import timezone

    unique_urls = sorted(set(peepdf_analysis.get("urls", [])))
    cves = peepdf_analysis.get("vulns", [])
    report = {
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated and
        # returns a naive value that readers cannot tell apart from local time.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "file": os.path.basename(filepath),
        "file_size": os.path.getsize(filepath),
        "hashes": hashes,
        "risk_assessment": risk,
        "pdfid_keywords": pdfid_results,
        "suspicious_keyword_count": sum(1 for v in pdfid_results.values() if v > 0),
        "peepdf_analysis": {
            "objects": peepdf_analysis.get("objects", 0),
            "js_objects": peepdf_analysis.get("js_objects", []),
            "cve_references": cves,
            "extracted_urls": list(unique_urls),
        },
        "javascript_content": js_content[:5],
        "shellcode_indicators": shellcode,
        "iocs": {
            "sha256": hashes["sha256"],
            "urls": list(unique_urls),
            "cves": cves,
        },
    }
    return report
def main():
    """Command-line entry point: analyse one PDF and emit a JSON report.

    Steps: hash the file, run pdfid and peepdf, extract JavaScript from the
    objects peepdf flagged, scan the extracted previews for shellcode
    indicators, score the findings, then write the report to ``--output`` and
    print it to stdout.

    Returns:
        0 on success, 1 when the input path is not an existing regular file.
    """
    parser = argparse.ArgumentParser(description="Malicious PDF Analysis Agent")
    parser.add_argument("file", help="Path to PDF file to analyze")
    parser.add_argument("--extract-js", action="store_true", help="Extract JavaScript objects")
    parser.add_argument("--output", default="pdf_analysis_report.json")
    args = parser.parse_args()

    # isfile (not exists) so a directory is rejected here rather than failing
    # later when the hashing step tries to open it.
    if not os.path.isfile(args.file):
        logger.error("File not found or not a regular file: %s", args.file)
        return 1

    logger.info("Analyzing: %s (%d bytes)", args.file, os.path.getsize(args.file))
    hashes = compute_hashes(args.file)
    logger.info("SHA-256: %s", hashes["sha256"])
    pdfid_results = run_pdfid(args.file)
    peepdf_analysis = run_peepdf_analysis(args.file)

    js_content = []
    shellcode = {}
    if args.extract_js or peepdf_analysis.get("js_objects"):
        js_content = extract_javascript(args.file, peepdf_analysis)
        # NOTE: only the truncated "content_preview" of each object is scanned,
        # so indicators further into a long script are not seen here.
        all_js = " ".join(j["content_preview"] for j in js_content)
        shellcode = detect_shellcode_patterns(all_js)

    risk = calculate_risk_score(pdfid_results, peepdf_analysis, shellcode)
    report = generate_report(args.file, hashes, pdfid_results, peepdf_analysis, js_content, shellcode, risk)

    # Serialise once and reuse the text for both the file and stdout.
    rendered = json.dumps(report, indent=2, default=str)
    with open(args.output, "w", encoding="utf-8") as f:
        f.write(rendered)
    logger.info("Risk: %s (score %d), %d suspicious keywords, %d JS objects, %d CVEs",
                risk["risk_level"], risk["score"], report["suspicious_keyword_count"],
                len(peepdf_analysis.get("js_objects", [])), len(peepdf_analysis.get("vulns", [])))
    print(rendered)
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so a missing input yields a non-zero exit code.
    raise SystemExit(main())