mirror of
https://github.com/mukul975/Anthropic-Cybersecurity-Skills.git
synced 2026-06-11 21:54:56 +03:00
c47eed6a64
- Fix 25 shell=True subprocess calls with list-based commands - Fix 49 verify=False in defensive skills (env-var override) - Add timeout to 231 HTTP/subprocess/socket calls - Fix 6 SQL injection patterns with whitelist validation - Replace 8 __import__() with standard imports - Remove 701 unused imports across 442 files - Add authorized-testing disclaimers to all offensive skills - Complete 11 incomplete skill directories - Expand 10 stub SKILL.md files with full content - Fix 2 YAML parse errors in frontmatter - Fix 5 pre-existing syntax errors - Convert 22 hardcoded paths/ports to environment variables - Back up 21 redundant skill pairs to .bak - Fix 2 global declaration errors - 724/724 skills with full folder anatomy (SKILL.md + agent.py + api-reference.md + LICENSE) - 0 compile errors across all 724 agent.py files
241 lines
8.1 KiB
Python
241 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
|
"""PDF malware analysis agent using pdfid concepts and pdf-parser for object extraction."""
|
|
|
|
import re
|
|
import os
|
|
import sys
|
|
import hashlib
|
|
import zlib
|
|
|
|
|
|
def compute_hash(filepath):
    """Return the hex-encoded SHA-256 digest of the file at *filepath*.

    The file is consumed in 64 KiB pieces so that large samples are never
    held in memory all at once.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as handle:
        while True:
            block = handle.read(65536)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
|
|
|
|
|
PDF_SUSPICIOUS_KEYWORDS = {
    "/JS": "JavaScript (embedded script execution)",
    "/JavaScript": "JavaScript action",
    "/AA": "Additional Actions (auto-execute triggers)",
    "/OpenAction": "Action on document open",
    "/AcroForm": "Interactive form (can contain JavaScript)",
    "/JBIG2Decode": "JBIG2 decoder (CVE-2009-0658 exploit vector)",
    "/RichMedia": "Rich media / Flash content",
    "/Launch": "Launch action (execute external file)",
    "/EmbeddedFile": "Embedded file (potential payload)",
    "/XFA": "XML Forms Architecture (script execution)",
    "/URI": "URI action (external link)",
    "/SubmitForm": "Form submission (data exfiltration)",
    "/ObjStm": "Object Stream (can hide objects from basic parsers)",
}

# A PDF name token ends at whitespace, a delimiter character, or end of input.
# This negative lookahead rejects a match that is only the prefix of a longer
# name (e.g. "/AA" inside the subset-font name "/AAAAAA+Arial").
_PDF_NAME_END = r"(?![^\s\x00()<>\[\]{}/%])"


def _count_pdf_name(text, name):
    """Count occurrences of *name* as a complete PDF name token in *text*."""
    return len(re.findall(re.escape(name) + _PDF_NAME_END, text))


def scan_pdf_keywords(filepath):
    """Scan a PDF file for suspicious keywords similar to pdfid.

    Args:
        filepath: Path of the PDF file to inspect. The whole file is read
            into memory.

    Returns:
        A ``(results, structure)`` tuple.

        ``results`` maps each keyword of ``PDF_SUSPICIOUS_KEYWORDS`` that
        occurs at least once to ``{"count": int, "description": str}``.
        Keywords are matched as whole PDF name tokens, so a longer name that
        merely starts with a keyword is not counted.

        ``structure`` maps the structural element names ``obj``, ``endobj``,
        ``stream``, ``endstream``, ``xref``, ``trailer``, ``startxref``,
        ``page_count`` and ``encrypted`` to integer counts (``encrypted`` is
        0 or 1).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        data = f.read()

    # latin-1 maps every byte to exactly one character, so decoding cannot
    # fail and offsets/counts correspond 1:1 to the raw bytes.
    text = data.decode("latin-1")

    results = {}
    for keyword, description in PDF_SUSPICIOUS_KEYWORDS.items():
        count = _count_pdf_name(text, keyword)
        if count > 0:
            results[keyword] = {"count": count, "description": description}

    # Count standard PDF structure elements.
    structure = {
        # Any whitespace may separate the object number, generation and "obj".
        "obj": len(re.findall(r"\d+\s+\d+\s+obj", text)),
        "endobj": text.count("endobj"),
        # "stream" is a suffix of "endstream" and "xref" of "startxref";
        # the lookbehinds keep each closing/longer keyword out of the count.
        "stream": len(re.findall(r"(?<!end)stream", text)),
        "endstream": text.count("endstream"),
        "xref": len(re.findall(r"(?<!start)xref", text)),
        "trailer": text.count("trailer"),
        "startxref": text.count("startxref"),
        # Exactly "/Page": the name boundary excludes "/Pages" tree nodes.
        "page_count": len(re.findall(r"/Type\s*/Page" + _PDF_NAME_END, text)),
        "encrypted": 1 if _count_pdf_name(text, "/Encrypt") else 0,
    }
    return results, structure
|
|
|
|
|
|
def extract_pdf_version(filepath):
    """Extract the PDF version from the ``%PDF-x.y`` header.

    The first 1024 bytes are searched rather than only the very start of the
    file: Acrobat-family viewers accept a header anywhere in that window, and
    malicious documents commonly prepend junk bytes before it.

    Args:
        filepath: Path of the PDF file to inspect.

    Returns:
        The version string (for example ``"1.7"``), or ``"unknown"`` when no
        header is found in the first 1024 bytes.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        header = f.read(1024)
    match = re.search(rb"%PDF-(\d+\.\d+)", header)
    return match.group(1).decode() if match else "unknown"
|
|
|
|
|
|
def find_stream_objects(filepath):
    """Find and extract stream objects from the PDF.

    Each ``N G obj ... stream ... endstream`` construct is located with a
    regular expression and a zlib (Flate) decompression of its payload is
    attempted on a best-effort basis.

    Args:
        filepath: Path of the PDF file to inspect. The whole file is read
            into memory.

    Returns:
        A list with one dict per stream, in file order, with the keys:

        * ``object`` - ``"<object number> <generation number>"``.
        * ``raw_size`` - number of bytes between ``stream`` + EOL and
          ``endstream``.
        * ``decoded_size`` - size of the inflated data, 0 if not decodable.
        * ``decodable`` - True when zlib decompression succeeded.
        * ``preview`` - first 200 bytes of the inflated data (or of the raw
          data when not decodable), decoded as latin-1.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        data = f.read()

    # "(?:(?!endobj).)*?" walks from the object header to the "stream"
    # keyword without ever crossing an "endobj". A plain ".*?" would skip
    # over stream-less objects and label the stream with the wrong object id.
    pattern = re.compile(
        rb"(\d+)\s+(\d+)\s+obj(?:(?!endobj).)*?stream\r?\n(.*?)endstream",
        re.DOTALL,
    )

    streams = []
    for match in pattern.finditer(data):
        obj_num = int(match.group(1))
        gen_num = int(match.group(2))
        stream_data = match.group(3)
        try:
            decoded = zlib.decompress(stream_data)
        except zlib.error:
            # Not Flate-encoded (or corrupt): fall back to the raw bytes.
            decoded = None

        # Compare against None so that a stream which inflates to zero bytes
        # is still treated as decoded.
        preview_source = stream_data if decoded is None else decoded
        streams.append({
            "object": f"{obj_num} {gen_num}",
            "raw_size": len(stream_data),
            "decoded_size": 0 if decoded is None else len(decoded),
            "decodable": decoded is not None,
            "preview": preview_source[:200].decode("latin-1", errors="replace"),
        })
    return streams
|
|
|
|
|
|
def _read_pdf_literal(text, start, limit=500):
    """Return the body of a PDF literal string that begins at *start*.

    *start* is the index just after the opening "(". Parentheses nest inside
    PDF literal strings and a backslash escapes the following character, so
    the string ends at the first unescaped ")" that brings the nesting depth
    back to zero. At most *limit* characters are scanned and returned; an
    unterminated string yields whatever is available within that window.
    Escape sequences are returned as written (not unescaped).
    """
    depth = 1
    pos = start
    end = min(len(text), start + limit)
    while pos < end:
        ch = text[pos]
        if ch == "\\":
            pos += 2  # an escaped character never changes the nesting depth
            continue
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
            if depth == 0:
                break
        pos += 1
    return text[start:min(pos, end)]


def extract_javascript(filepath):
    """Extract JavaScript code from ``/JS`` entries in the PDF.

    Two encodings of the ``/JS`` value are handled: literal strings
    (``/JS (...)``) and hexadecimal strings (``/JS <...>``). Indirect
    references (``/JS 5 0 R``) are not followed.

    Args:
        filepath: Path of the PDF file to inspect. The whole file is read
            into memory.

    Returns:
        A list of ``{"type": ..., "code": ...}`` dicts: all ``"inline"``
        (literal string) blocks in file order, followed by all
        ``"hex_encoded"`` blocks in file order. ``code`` is truncated to 500
        characters.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        data = f.read()
    text = data.decode("latin-1")

    js_blocks = []

    # Literal strings: read with parenthesis balancing, because JavaScript
    # almost always contains ")" itself (e.g. "app.alert(1)").
    literal_start = re.compile(r"/JS\s*\(")
    pos = 0
    while True:
        match = literal_start.search(text, pos)
        if match is None:
            break
        code = _read_pdf_literal(text, match.end())
        js_blocks.append({"type": "inline", "code": code})
        # Resume after the consumed text so it is not rescanned.
        pos = match.end() + len(code)

    # Hex strings: whitespace between digits is ignored and a missing final
    # digit is taken to be 0, following the PDF string syntax.
    hex_pattern = re.compile(r"/JS\s*<\s*([0-9A-Fa-f][0-9A-Fa-f\s]*)>")
    for match in hex_pattern.finditer(text):
        digits = "".join(match.group(1).split())
        if len(digits) % 2:
            digits += "0"
        raw = bytes.fromhex(digits)
        if raw.startswith(b"\xfe\xff"):
            # A leading FE FF byte-order mark denotes a UTF-16BE text string.
            decoded = raw[2:].decode("utf-16-be", errors="replace")
        else:
            decoded = raw.decode("utf-8", errors="replace")
        js_blocks.append({"type": "hex_encoded", "code": decoded[:500]})
    return js_blocks
|
|
|
|
|
|
def extract_urls(filepath):
    """Extract the distinct http/https URLs found in the raw PDF bytes.

    Only text visible in the file as stored is searched; URLs inside
    compressed streams are not seen.

    Args:
        filepath: Path of the PDF file to inspect. The whole file is read
            into memory.

    Returns:
        A list of unique URL strings in order of first appearance, so the
        result (and any slice of it a caller prints) is the same on every
        run.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        data = f.read()
    text = data.decode("latin-1")
    found = re.findall(r"https?://[^\s<>\"')\]]+", text)
    # dict.fromkeys de-duplicates while preserving insertion order; a set
    # would yield an arbitrary order that changes between runs.
    return list(dict.fromkeys(found))
|
|
|
|
|
|
def detect_exploits(keywords, streams):
    """Derive known-exploit indicators from keyword hits and stream previews.

    *keywords* is tested for the "/JBIG2Decode" key; every entry of *streams*
    has its lower-cased "preview" text searched for shellcode markers and for
    two Adobe Reader JavaScript API names. Returns a list of dicts with the
    keys "cve", "description" and "confidence", one per indicator hit.
    """
    findings = []

    if "/JBIG2Decode" in keywords:
        findings.append(dict(
            cve="CVE-2009-0658",
            description="JBIG2 decoder vulnerability in Adobe Reader",
            confidence="MEDIUM",
        ))

    shellcode_markers = ("shellcode", "\\x90\\x90")
    reader_js_markers = ("util.printf", "collab.geticon")

    for entry in streams:
        snippet = entry.get("preview", "").lower()

        if any(marker in snippet for marker in shellcode_markers):
            findings.append(dict(
                cve="Generic shellcode",
                description="Potential shellcode detected in stream",
                confidence="HIGH",
            ))

        if any(marker in snippet for marker in reader_js_markers):
            findings.append(dict(
                cve="CVE-2008-2992 / CVE-2009-0927",
                description="Known Adobe Reader JavaScript exploits",
                confidence="HIGH",
            ))

    return findings
|
|
|
|
|
|
def calculate_risk_score(keywords, structure, exploits, js_blocks):
    """Combine the individual findings into a 0-100 risk score.

    Each keyword group below contributes its weight once if any of its names
    is present in *keywords*. An encrypted document adds 10, every exploit
    indicator adds 20 and every extracted JavaScript block adds 10. The sum
    is capped at 100.
    """
    weighted_groups = (
        (("/JS", "/JavaScript"), 30),
        (("/OpenAction", "/AA"), 20),
        (("/Launch",), 25),
        (("/EmbeddedFile",), 15),
        (("/JBIG2Decode",), 20),
    )

    total = sum(
        weight
        for names, weight in weighted_groups
        if any(name in keywords for name in names)
    )
    if structure.get("encrypted"):
        total += 10
    total += 20 * len(exploits) + 10 * len(js_blocks)

    return min(total, 100)
|
|
|
|
|
|
def generate_report(filepath, keywords, structure, streams, js_blocks,
                    urls, exploits, risk_score):
    """Assemble the analysis results into a single report dictionary.

    File facts (SHA-256, size in bytes, PDF version) are read from
    *filepath*; *streams*, *js_blocks* and *urls* are reported as counts,
    while *structure*, *keywords* and *exploits* are included as given. The
    risk level is "HIGH" from a score of 60, "MEDIUM" from 30, else "LOW".
    """
    if risk_score >= 60:
        level = "HIGH"
    elif risk_score >= 30:
        level = "MEDIUM"
    else:
        level = "LOW"

    report = {"file": filepath}
    report["sha256"] = compute_hash(filepath)
    report["size"] = os.path.getsize(filepath)
    report["pdf_version"] = extract_pdf_version(filepath)
    report["structure"] = structure
    report["suspicious_keywords"] = keywords
    report["streams"] = len(streams)
    report["javascript_blocks"] = len(js_blocks)
    report["urls_found"] = len(urls)
    report["exploit_indicators"] = exploits
    report["risk_score"] = risk_score
    report["risk_level"] = level
    return report
|
|
|
|
|
|
def _run_console_analysis(target):
    """Run every analysis step on *target* and print the results to stdout."""
    print(f"\n[*] Analyzing: {target}")
    print(f"[*] SHA-256: {compute_hash(target)}")
    print(f"[*] PDF version: {extract_pdf_version(target)}")

    print("\n--- Suspicious Keywords (pdfid-style) ---")
    keywords, structure = scan_pdf_keywords(target)
    for name, details in keywords.items():
        print(f" [!] {name}: {details['count']}x - {details['description']}")

    print("\n--- Structure ---")
    for element, amount in structure.items():
        print(f" {element}: {amount}")

    print("\n--- Stream Objects ---")
    streams = find_stream_objects(target)
    print(f" Found: {len(streams)} streams")

    print("\n--- JavaScript Extraction ---")
    scripts = extract_javascript(target)
    for script in scripts:
        # Only the first 100 characters of each block are shown.
        print(f" [{script['type']}] {script['code'][:100]}...")

    print("\n--- URLs ---")
    links = extract_urls(target)
    for link in links[:10]:
        print(f" {link}")

    print("\n--- Exploit Detection ---")
    findings = detect_exploits(keywords, streams)
    for finding in findings:
        print(f" [{finding['confidence']}] {finding['cve']}: {finding['description']}")

    score = calculate_risk_score(keywords, structure, findings, scripts)
    print(f"\n[*] Risk Score: {score}/100")


if __name__ == "__main__":
    banner_rule = "=" * 60
    print(banner_rule)
    print("PDF Malware Analysis Agent")
    print("Keyword scanning, JavaScript extraction, exploit detection")
    print(banner_rule)

    # First command-line argument, or None when the script is run bare.
    target = next(iter(sys.argv[1:]), None)

    if target and os.path.exists(target):
        _run_console_analysis(target)
    else:
        print("\n[DEMO] Usage: python agent.py <document.pdf>")
|