#!/usr/bin/env python3
"""PDF malware analysis agent using pdfid concepts and pdf-parser for object extraction."""

import hashlib
import os
import re
import sys
import zlib

PDF_SUSPICIOUS_KEYWORDS = {
    "/JS": "JavaScript (embedded script execution)",
    "/JavaScript": "JavaScript action",
    "/AA": "Additional Actions (auto-execute triggers)",
    "/OpenAction": "Action on document open",
    "/AcroForm": "Interactive form (can contain JavaScript)",
    "/JBIG2Decode": "JBIG2 decoder (CVE-2009-0658 exploit vector)",
    "/RichMedia": "Rich media / Flash content",
    "/Launch": "Launch action (execute external file)",
    "/EmbeddedFile": "Embedded file (potential payload)",
    "/XFA": "XML Forms Architecture (script execution)",
    "/URI": "URI action (external link)",
    "/SubmitForm": "Form submission (data exfiltration)",
    "/ObjStm": "Object Stream (can hide objects from basic parsers)",
}

# Maximum number of characters of extracted JavaScript kept per block.
_JS_CODE_LIMIT = 500

# "N G obj" object header, e.g. b"12 0 obj".
_OBJ_HEADER_RE = re.compile(rb"(\d+)\s+(\d+)\s+obj")
# A "stream" keyword (not the tail of "endstream") followed by EOL, up to the
# next "endstream".
_STREAM_RE = re.compile(rb"(?<!end)stream\r?\n(.*?)endstream", re.DOTALL)
# Start of a /JS entry whose value is a literal "(...)" string.
_JS_LITERAL_RE = re.compile(r"/JS\s*\(")
# /JS entry whose value is a hex "<...>" string; whitespace between digits is
# tolerated. "<<" (a dictionary) cannot match because "<" is not a hex digit.
_JS_HEX_RE = re.compile(r"/JS\s*<([0-9A-Fa-f\s]+)>")
_URL_RE = re.compile(r"https?://[^\s<>\"')\]]+")


def _read_latin1(filepath):
    """Read *filepath* as bytes and decode it as Latin-1.

    Latin-1 maps every byte value to a code point, so the decode cannot fail
    and character offsets in the result equal byte offsets in the file.
    """
    with open(filepath, "rb") as f:
        return f.read().decode("latin-1")


def compute_hash(filepath):
    """Compute SHA-256 hash of a file.

    The file is read in 64 KiB chunks. Returns the lowercase hex digest.
    """
    sha256 = hashlib.sha256()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            sha256.update(chunk)
    return sha256.hexdigest()


def scan_pdf_keywords(filepath):
    """Scan a PDF file for suspicious keywords similar to pdfid.

    Returns a ``(results, structure)`` tuple:

    * ``results`` maps each keyword of ``PDF_SUSPICIOUS_KEYWORDS`` that occurs
      at least once to ``{"count": int, "description": str}``. Matching is a
      plain substring count over the raw file contents.
    * ``structure`` maps structural element names to their counts.
    """
    text = _read_latin1(filepath)

    results = {}
    for keyword, description in PDF_SUSPICIOUS_KEYWORDS.items():
        count = text.count(keyword)
        if count:
            results[keyword] = {"count": count, "description": description}

    endstream_count = text.count("endstream")
    startxref_count = text.count("startxref")

    # Count standard PDF structure elements
    structure = {
        "obj": len(re.findall(r"\d+ \d+ obj", text)),
        "endobj": text.count("endobj"),
        # Every "endstream" contains "stream" and every "startxref" contains
        # "xref"; subtract them so the two keywords are counted on their own.
        "stream": text.count("stream") - endstream_count,
        "endstream": endstream_count,
        "xref": text.count("xref") - startxref_count,
        "trailer": text.count("trailer"),
        "startxref": startxref_count,
        # The lookahead excludes "/Type /Pages" (the page-tree node).
        "page_count": len(re.findall(r"/Type\s*/Page(?!s)", text)),
        "encrypted": 1 if "/Encrypt" in text else 0,
    }
    return results, structure


def extract_pdf_version(filepath):
    """Extract the PDF version from the header.

    Only the first 20 bytes of the file are inspected. Returns the version
    string (e.g. ``"1.7"``) or ``"unknown"`` when no ``%PDF-x.y`` marker is
    found there.
    """
    with open(filepath, "rb") as f:
        header = f.read(20)
    match = re.search(rb"%PDF-(\d+\.\d+)", header)
    return match.group(1).decode() if match else "unknown"


def find_stream_objects(filepath):
    """Find and extract stream objects from the PDF.

    Each ``stream ... endstream`` span is attributed to the last ``N G obj``
    header that appears between the end of the previous stream and the start
    of this one; spans with no such header are skipped. A zlib decompression
    of the stream bytes is attempted, and failure is recorded rather than
    raised.

    Returns a list of dicts with the keys ``object`` (``"N G"``),
    ``raw_size``, ``decoded_size`` (0 when not decodable), ``decodable`` and
    ``preview`` (first 200 bytes of the decoded data, or of the raw data when
    decoding failed or produced nothing, as Latin-1 text).
    """
    with open(filepath, "rb") as f:
        data = f.read()

    streams = []
    search_from = 0
    for stream_match in _STREAM_RE.finditer(data):
        # Restricting the header search to the gap before this stream keeps a
        # stream from being attributed to an earlier, stream-less object.
        headers = _OBJ_HEADER_RE.findall(data, search_from, stream_match.start())
        search_from = stream_match.end()
        if not headers:
            continue
        obj_num, gen_num = (int(number) for number in headers[-1])

        stream_data = stream_match.group(1)
        try:
            decoded = zlib.decompress(stream_data)
        except zlib.error:
            # Not Flate-encoded (or corrupt): report the raw bytes instead.
            decoded = None

        preview_source = decoded if decoded else stream_data
        streams.append({
            "object": f"{obj_num} {gen_num}",
            "raw_size": len(stream_data),
            "decoded_size": len(decoded) if decoded else 0,
            "decodable": decoded is not None,
            "preview": preview_source[:200].decode("latin-1"),
        })
    return streams


def _read_literal_string(text, start, limit=_JS_CODE_LIMIT):
    """Return the body of a PDF literal string that begins at *start*.

    *start* is the index just past the opening ``(``. Nested parentheses are
    balanced and a backslash escapes the character after it. Escape sequences
    are returned verbatim, not decoded. At most *limit* characters are
    scanned; if the closing parenthesis is not found within them, the scanned
    prefix is returned.
    """
    end = min(len(text), start + limit)
    depth = 1
    pos = start
    while pos < end:
        char = text[pos]
        if char == "\\":
            pos += 2  # skip the escaped character as well
            continue
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
            if depth == 0:
                return text[start:pos]
        pos += 1
    return text[start:end]


def extract_javascript(filepath):
    """Extract JavaScript code from PDF objects.

    Looks for ``/JS`` entries whose value is stored directly as a literal
    ``(...)`` string or as a hex ``<...>`` string. JavaScript held in a
    separate (possibly compressed) stream object is not followed.

    Returns a list of ``{"type": "inline" | "hex_encoded", "code": str}``
    dicts, literal strings first, each ``code`` capped at 500 characters.
    """
    text = _read_latin1(filepath)
    js_blocks = []

    # JavaScript stored as a literal string: /JS (...)
    consumed_until = 0
    for match in _JS_LITERAL_RE.finditer(text):
        if match.start() < consumed_until:
            # This "/JS (" sits inside a string that was already extracted.
            continue
        code = _read_literal_string(text, match.end())
        consumed_until = match.end() + len(code)
        js_blocks.append({"type": "inline", "code": code[:_JS_CODE_LIMIT]})

    # JavaScript stored as a hex string: /JS <...>
    for match in _JS_HEX_RE.finditer(text):
        digits = "".join(match.group(1).split())
        if not digits:
            continue
        if len(digits) % 2:
            # PDF treats a missing final hex digit as 0.
            digits += "0"
        raw = bytes.fromhex(digits)
        if raw.startswith(b"\xfe\xff"):
            # A leading FE FF byte-order mark denotes a UTF-16BE text string.
            decoded = raw[2:].decode("utf-16-be", errors="replace")
        else:
            decoded = raw.decode("utf-8", errors="replace")
        js_blocks.append({"type": "hex_encoded", "code": decoded[:_JS_CODE_LIMIT]})

    return js_blocks


def extract_urls(filepath):
    """Extract URLs from the PDF content.

    Returns the distinct ``http://`` / ``https://`` URLs found in the raw file
    contents, in order of first appearance.
    """
    text = _read_latin1(filepath)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic between runs.
    return list(dict.fromkeys(_URL_RE.findall(text)))


def detect_exploits(keywords, streams):
    """Check for known PDF exploit indicators.

    *keywords* is the keyword mapping returned by ``scan_pdf_keywords`` and
    *streams* the list returned by ``find_stream_objects``. Only each stream's
    ``preview`` text is inspected, case-insensitively.

    Returns a list of ``{"cve", "description", "confidence"}`` dicts. One
    entry is appended per matching stream, so the same indicator may appear
    more than once.
    """
    exploits = []

    if "/JBIG2Decode" in keywords:
        exploits.append({
            "cve": "CVE-2009-0658",
            "description": "JBIG2 decoder vulnerability in Adobe Reader",
            "confidence": "MEDIUM",
        })

    for stream in streams:
        preview = stream.get("preview", "").lower()
        # "\\x90\\x90" is the textual escape sequence (backslash, x, 9, 0),
        # as it would appear in script source, not two raw 0x90 bytes.
        if "shellcode" in preview or "\\x90\\x90" in preview:
            exploits.append({
                "cve": "Generic shellcode",
                "description": "Potential shellcode detected in stream",
                "confidence": "HIGH",
            })
        if "util.printf" in preview or "collab.geticon" in preview:
            exploits.append({
                "cve": "CVE-2008-2992 / CVE-2009-0927",
                "description": "Known Adobe Reader JavaScript exploits",
                "confidence": "HIGH",
            })

    return exploits


def calculate_risk_score(keywords, structure, exploits, js_blocks):
    """Calculate a risk score for the PDF.

    Adds fixed weights for suspicious keywords and encryption, plus 20 per
    exploit indicator and 10 per extracted JavaScript block. The result is
    capped at 100.
    """
    score = 0
    if "/JS" in keywords or "/JavaScript" in keywords:
        score += 30
    if "/OpenAction" in keywords or "/AA" in keywords:
        score += 20
    if "/Launch" in keywords:
        score += 25
    if "/EmbeddedFile" in keywords:
        score += 15
    if "/JBIG2Decode" in keywords:
        score += 20
    if structure.get("encrypted"):
        score += 10
    score += len(exploits) * 20
    score += len(js_blocks) * 10
    return min(score, 100)


def generate_report(filepath, keywords, structure, streams, js_blocks, urls,
                    exploits, risk_score):
    """Generate PDF malware analysis report.

    Combines the supplied analysis results with the file's hash, size and PDF
    version (all three re-read from *filepath*) into one dict. ``risk_level``
    is ``"HIGH"`` for scores of 60 and above, ``"MEDIUM"`` for 30 and above,
    otherwise ``"LOW"``.
    """
    if risk_score >= 60:
        risk_level = "HIGH"
    elif risk_score >= 30:
        risk_level = "MEDIUM"
    else:
        risk_level = "LOW"

    return {
        "file": filepath,
        "sha256": compute_hash(filepath),
        "size": os.path.getsize(filepath),
        "pdf_version": extract_pdf_version(filepath),
        "structure": structure,
        "suspicious_keywords": keywords,
        "streams": len(streams),
        "javascript_blocks": len(js_blocks),
        "urls_found": len(urls),
        "exploit_indicators": exploits,
        "risk_score": risk_score,
        "risk_level": risk_level,
    }


def main():
    """Run the command-line analysis on the file named by ``sys.argv[1]``."""
    print("=" * 60)
    print("PDF Malware Analysis Agent")
    print("Keyword scanning, JavaScript extraction, exploit detection")
    print("=" * 60)

    target = sys.argv[1] if len(sys.argv) > 1 else None
    # isfile (rather than exists) also rejects directories, which would
    # otherwise fail later inside open().
    if not target or not os.path.isfile(target):
        if target:
            print(f"\n[!] File not found: {target}")
        print("\n[DEMO] Usage: python agent.py <pdf_file>")
        return

    print(f"\n[*] Analyzing: {target}")
    print(f"[*] SHA-256: {compute_hash(target)}")
    print(f"[*] PDF version: {extract_pdf_version(target)}")

    print("\n--- Suspicious Keywords (pdfid-style) ---")
    keywords, structure = scan_pdf_keywords(target)
    for kw, info in keywords.items():
        print(f" [!] {kw}: {info['count']}x - {info['description']}")

    print("\n--- Structure ---")
    for key, val in structure.items():
        print(f" {key}: {val}")

    print("\n--- Stream Objects ---")
    streams = find_stream_objects(target)
    print(f" Found: {len(streams)} streams")

    print("\n--- JavaScript Extraction ---")
    js = extract_javascript(target)
    for j in js:
        print(f" [{j['type']}] {j['code'][:100]}...")

    print("\n--- URLs ---")
    urls = extract_urls(target)
    for u in urls[:10]:
        print(f" {u}")

    print("\n--- Exploit Detection ---")
    exploits = detect_exploits(keywords, streams)
    for e in exploits:
        print(f" [{e['confidence']}] {e['cve']}: {e['description']}")

    risk = calculate_risk_score(keywords, structure, exploits, js)
    print(f"\n[*] Risk Score: {risk}/100")


if __name__ == "__main__":
    main()