Files
Anthropic-Cybersecurity-Skills/skills/extracting-iocs-from-malware-samples/scripts/agent.py
T
mukul975 c47eed6a64 Production hardening: security fixes, code quality, 724 skills complete
- Fix 25 shell=True subprocess calls with list-based commands
- Fix 49 verify=False in defensive skills (env-var override)
- Add timeout to 231 HTTP/subprocess/socket calls
- Fix 6 SQL injection patterns with whitelist validation
- Replace 8 __import__() with standard imports
- Remove 701 unused imports across 442 files
- Add authorized-testing disclaimers to all offensive skills
- Complete 11 incomplete skill directories
- Expand 10 stub SKILL.md files with full content
- Fix 2 YAML parse errors in frontmatter
- Fix 5 pre-existing syntax errors
- Convert 22 hardcoded paths/ports to environment variables
- Back up 21 redundant skill pairs to .bak
- Fix 2 global declaration errors
- 724/724 skills with full folder anatomy (SKILL.md + agent.py + api-reference.md + LICENSE)
- 0 compile errors across all 724 agent.py files
2026-03-19 13:26:49 +01:00

272 lines
11 KiB
Python

#!/usr/bin/env python3
# For authorized testing in lab/CTF environments only
"""IOC extraction agent using pefile, yara-python, and requests for VirusTotal validation."""
import argparse
import csv
import hashlib
import json
import logging
import os
import re
import sys
import uuid
from datetime import datetime, timezone
from typing import List, Set
from urllib.parse import quote
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)
try:
import pefile
except ImportError:
sys.exit("pefile required: pip install pefile")
try:
import yara
except ImportError:
yara = None
logger.warning("yara-python not installed; YARA scanning disabled")
try:
import requests
except ImportError:
requests = None
logger.warning("requests not installed; VT validation disabled")
# Dotted-quad IPv4 address; every octet is restricted to the range 0-255.
IP_RE = re.compile(r"\b(?:(?:25[0-5]|2[0-4]\d|1?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|1?\d\d?)\b")
# Host name: one leading label (alphanumerics with inner hyphens, at most 63
# characters) followed by one or more dot-separated, letters-only labels of at
# least two characters.
DOMAIN_RE = re.compile(r"\b[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z]{2,})+\b")
# "http://" or "https://" followed by a run of characters that excludes
# whitespace and the quote/bracket/delimiter characters listed in the class.
URL_RE = re.compile(r"https?://[^\s<>\"'{}|\\^`\[\]]+")
# local-part@label.tld shape; the part after "@" is a single label plus a
# letters-only TLD of at least two characters.
EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}")
# String prefixes of IPv4 addresses that extract_network_iocs() discards; the
# 172.16-31.x.x range is checked numerically in that function instead.
PRIVATE_IP_PREFIXES = ("10.", "127.", "0.", "192.168.", "169.254.")
# File-name suffixes; extract_network_iocs() drops DOMAIN_RE hits ending in one
# of these because they are file names rather than domains.
FALSE_DOMAIN_SUFFIXES = (".dll", ".exe", ".sys", ".ocx", ".drv", ".pdb")
def compute_hashes(file_path: str) -> dict:
    """Compute MD5, SHA-1 and SHA-256 digests of a file, plus its size.

    The file is streamed in fixed-size chunks so that a large sample is never
    held in memory in one piece.

    Args:
        file_path: Path of the file to hash.

    Returns:
        Dict with lowercase hex digests under ``"md5"``, ``"sha1"`` and
        ``"sha256"``, and the number of bytes read under ``"size_bytes"``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    chunk_size = 1 << 20  # 1 MiB per read keeps memory use flat
    hashers = {
        "md5": hashlib.md5(),
        "sha1": hashlib.sha1(),
        "sha256": hashlib.sha256(),
    }
    size_bytes = 0
    with open(file_path, "rb") as f:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            size_bytes += len(chunk)
            for hasher in hashers.values():
                hasher.update(chunk)
    result = {name: hasher.hexdigest() for name, hasher in hashers.items()}
    result["size_bytes"] = size_bytes
    return result
def extract_pe_metadata(file_path: str) -> dict:
    """Extract PE metadata: imphash, compile time, sections and imports.

    Args:
        file_path: Path of the file to parse with pefile.

    Returns:
        On success a dict with:
          * ``"imphash"``: value of ``pe.get_imphash()``;
          * ``"compile_time"``: ``FILE_HEADER.TimeDateStamp`` rendered as a
            naive ISO-8601 string in UTC;
          * ``"sections"``: one dict per section (name, entropy rounded to
            two decimals, virtual size, raw size);
          * ``"imports"``: one dict per imported DLL with at most the first
            20 named functions (imports without a name are skipped).
        If pefile raises ``PEFormatError``, ``{"error": "Not a valid PE file"}``.
    """
    try:
        pe = pefile.PE(file_path)
        try:
            # Aware conversion followed by dropping tzinfo yields the same
            # string the deprecated datetime.utcfromtimestamp() produced.
            compiled_at = datetime.fromtimestamp(
                pe.FILE_HEADER.TimeDateStamp, tz=timezone.utc
            ).replace(tzinfo=None)
            meta = {
                "imphash": pe.get_imphash(),
                "compile_time": compiled_at.isoformat(),
                "sections": [],
                "imports": [],
            }
            for section in pe.sections:
                # Section names are NUL-padded byte strings.
                name = section.Name.rstrip(b"\x00").decode("ascii", errors="replace")
                meta["sections"].append({
                    "name": name,
                    "entropy": round(section.get_entropy(), 2),
                    "virtual_size": section.Misc_VirtualSize,
                    "raw_size": section.SizeOfRawData,
                })
            # The attribute is only present when the import directory was parsed.
            for entry in getattr(pe, "DIRECTORY_ENTRY_IMPORT", []):
                dll_name = entry.dll.decode("ascii", errors="replace")
                funcs = [
                    imp.name.decode("ascii", errors="replace")
                    for imp in entry.imports
                    if imp.name
                ]
                meta["imports"].append({"dll": dll_name, "functions": funcs[:20]})
            return meta
        finally:
            # Release the parsed file even when extraction fails part-way.
            pe.close()
    except pefile.PEFormatError:
        return {"error": "Not a valid PE file"}
def extract_strings(file_path: str, min_length: int = 4) -> List[str]:
    """Pull printable ASCII and UTF-16-LE strings out of a binary file.

    Args:
        file_path: Path of the file to scan.
        min_length: Minimum number of characters a run must have to be kept.

    Returns:
        All ASCII runs (in file order) followed by all UTF-16-LE runs.
    """
    with open(file_path, "rb") as handle:
        blob = handle.read()

    # Runs of printable bytes (space through tilde).
    narrow_pattern = re.compile(b"[ -~]{%d,}" % min_length)
    # Runs of printable bytes each followed by a NUL, i.e. UTF-16-LE text.
    wide_pattern = re.compile(b"(?:[ -~]\x00){%d,}" % min_length)

    found = [raw.decode("ascii") for raw in narrow_pattern.findall(blob)]
    found.extend(
        raw.decode("utf-16-le", errors="ignore") for raw in wide_pattern.findall(blob)
    )
    return found
def extract_network_iocs(strings: List[str]) -> dict:
    """Collect IPs, domains, URLs and e-mail addresses found in *strings*.

    IPs starting with one of PRIVATE_IP_PREFIXES or lying in 172.16-31.x.x
    are dropped, as are domain hits ending in one of FALSE_DOMAIN_SUFFIXES.
    Domains and e-mail addresses are lower-cased.

    Returns:
        Dict with sorted, de-duplicated lists under ``"ips"``, ``"domains"``,
        ``"urls"`` and ``"emails"``.
    """

    def _is_kept(addr: str) -> bool:
        # str.startswith accepts the whole tuple of prefixes at once.
        if addr.startswith(PRIVATE_IP_PREFIXES):
            return False
        first, second = addr.split(".")[:2]
        return not (int(first) == 172 and 16 <= int(second) <= 31)

    found_ips: Set[str] = set()
    found_domains: Set[str] = set()
    found_urls: Set[str] = set()
    found_emails: Set[str] = set()

    for text in strings:
        found_ips.update(addr for addr in IP_RE.findall(text) if _is_kept(addr))
        found_domains.update(
            host.lower()
            for host in DOMAIN_RE.findall(text)
            if not host.lower().endswith(FALSE_DOMAIN_SUFFIXES)
        )
        found_urls.update(URL_RE.findall(text))
        found_emails.update(addr.lower() for addr in EMAIL_RE.findall(text))

    return {
        "ips": sorted(found_ips),
        "domains": sorted(found_domains),
        "urls": sorted(found_urls),
        "emails": sorted(found_emails),
    }
def extract_host_iocs(strings: List[str]) -> dict:
    """Pick out Windows file paths, registry keys and mutex names.

    A string counts as a file path when it starts with an upper-case drive
    letter followed by ``:\\`` and is longer than five characters, as a
    registry key when it starts with a hive abbreviation and a backslash,
    and as a mutex when it starts with ``Global\\`` or ``Local\\`` (the last
    two checks ignore case).

    Returns:
        Dict with sorted lists capped at 30 ``"file_paths"``,
        20 ``"registry_keys"`` and 10 ``"mutexes"``.
    """
    drive_path = re.compile(r"[A-Z]:\\")
    registry_root = re.compile(r"(?i)(HKLM|HKCU|HKCR|HKU|HKCC)\\")
    mutex_namespace = re.compile(r"(?i)(Global\\|Local\\)")

    paths = set()
    reg_keys = set()
    mutex_names = set()
    for text in strings:
        if len(text) > 5 and drive_path.match(text):
            paths.add(text)
        if registry_root.match(text):
            reg_keys.add(text)
        if mutex_namespace.match(text):
            mutex_names.add(text)

    return {
        "file_paths": sorted(paths)[:30],
        "registry_keys": sorted(reg_keys)[:20],
        "mutexes": sorted(mutex_names)[:10],
    }
def run_yara_scan(file_path: str, rules_path: str) -> List[dict]:
    """Compile the rules at *rules_path* and match them against *file_path*.

    Returns:
        One dict per matching rule with ``"rule"``, ``"tags"``, ``"meta"``
        and ``"strings"`` (up to ten ``(identifier, offset)`` pairs, where
        the offset is that of the first instance or 0 when there is none).
        A single ``{"error": ...}`` entry is returned when yara-python is
        unavailable or raises ``yara.Error``.
    """
    if not yara:
        return [{"error": "yara-python not installed"}]

    def _string_hits(match):
        pairs = []
        for item in match.strings:
            first_offset = item.instances[0].offset if item.instances else 0
            pairs.append((item.identifier, first_offset))
        return pairs[:10]

    try:
        compiled = yara.compile(filepath=rules_path)
        results = []
        for match in compiled.match(file_path):
            results.append({
                "rule": match.rule,
                "tags": match.tags,
                "meta": match.meta,
                "strings": _string_hits(match),
            })
        return results
    except yara.Error as exc:
        return [{"error": str(exc)}]
def validate_ioc_virustotal(ioc_value: str, ioc_type: str, api_key: str) -> dict:
    """Look up a single IOC in the VirusTotal API v3.

    Args:
        ioc_value: The indicator itself (IP address, domain or file hash).
        ioc_type: One of ``"ip"``, ``"domain"`` or ``"hash"``.
        api_key: VirusTotal API key sent in the ``x-apikey`` header.

    Returns:
        ``{"validated": True, "malicious": int, "suspicious": int}`` when the
        API answers 200 with analysis stats; otherwise ``{"validated": False}``
        (requests missing, no key, unknown type, transport error, non-200
        status or unexpected response body). Failures are logged instead of
        being silently discarded; no exception is propagated for them.
    """
    if not requests or not api_key:
        return {"validated": False}

    collections = {"ip": "ip_addresses", "domain": "domains", "hash": "files"}
    collection = collections.get(ioc_type)
    if collection is None:
        return {"validated": False}

    # IOC values come from untrusted samples: percent-encode so the value
    # cannot alter the request path or add a query string.
    url = f"https://www.virustotal.com/api/v3/{collection}/{quote(ioc_value, safe='')}"

    try:
        resp = requests.get(url, headers={"x-apikey": api_key}, timeout=10)
    except requests.RequestException as exc:
        logger.warning("VirusTotal lookup failed for %s: %s", ioc_value, exc)
        return {"validated": False}

    if resp.status_code != 200:
        logger.info("VirusTotal returned HTTP %s for %s", resp.status_code, ioc_value)
        return {"validated": False}

    try:
        stats = resp.json()["data"]["attributes"]["last_analysis_stats"]
        return {
            "validated": True,
            "malicious": stats.get("malicious", 0),
            "suspicious": stats.get("suspicious", 0),
        }
    except (ValueError, KeyError, TypeError, AttributeError) as exc:
        # Body was not JSON or did not have the expected structure.
        logger.warning("Unexpected VirusTotal response for %s: %r", ioc_value, exc)
        return {"validated": False}
def defang_ioc(value: str) -> str:
    """Defang an IOC for safe sharing.

    Rewrites an ``http``/``https`` URL scheme to ``hxxp``/``hxxps`` and
    replaces every ``.`` with ``[.]``. Only the scheme (the token directly in
    front of ``://``) is rewritten, so a host such as ``httpbin.org`` keeps
    its name and becomes ``httpbin[.]org``.

    Args:
        value: IP address, domain, URL or free text containing them.

    Returns:
        The defanged string.
    """
    # The lookahead limits the rewrite to the scheme; the optional "s" is kept.
    defanged = re.sub(r"\bhttp(s?)(?=://)", r"hxxp\1", value, flags=re.IGNORECASE)
    return defanged.replace(".", "[.]")
def export_stix_bundle(iocs: dict, sha256: str) -> dict:
    """Build a minimal STIX 2.1 bundle from extracted IOCs.

    One indicator is emitted for the sample hash, one per IP address and one
    for each of the first 20 domains. Every indicator carries ``id``,
    ``created`` and ``modified`` in addition to the pattern fields, and the
    bundle id has the ``bundle--<UUID>`` form.

    Args:
        iocs: Dict that may contain ``"ips"`` and ``"domains"`` lists.
        sha256: SHA-256 hex digest of the analysed sample.

    Returns:
        Dict with ``"type": "bundle"``, a freshly generated ``"id"`` and the
        indicators under ``"objects"``.
    """
    # Same text format as before, without the deprecated datetime.utcnow().
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    def _indicator(name: str, pattern: str) -> dict:
        return {
            "type": "indicator",
            "spec_version": "2.1",
            "id": f"indicator--{uuid.uuid4()}",
            "created": ts,
            "modified": ts,
            "pattern": pattern,
            "pattern_type": "stix",
            "valid_from": ts,
            "name": name,
        }

    indicators = [_indicator("Malware Hash", f"[file:hashes.'SHA-256' = '{sha256}']")]
    indicators.extend(
        _indicator(f"C2 IP {ip}", f"[ipv4-addr:value = '{ip}']")
        for ip in iocs.get("ips", [])
    )
    indicators.extend(
        _indicator(f"C2 Domain {domain}", f"[domain-name:value = '{domain}']")
        for domain in iocs.get("domains", [])[:20]
    )
    return {"type": "bundle", "id": f"bundle--{uuid.uuid4()}", "objects": indicators}
def export_csv(iocs: dict, hashes: dict, output_path: str) -> None:
    """Write the sample hashes and network IOCs to a CSV file.

    The file starts with a ``type,value,context,confidence`` header, followed
    by the SHA-256 and MD5 rows and then one row per IP, domain and URL.

    Args:
        iocs: Dict that may contain ``"ips"``, ``"domains"`` and ``"urls"``.
        hashes: Dict providing at least ``"sha256"`` and ``"md5"``.
        output_path: Destination file; overwritten if it exists.
    """

    def _rows():
        # Generated lazily so rows are produced while the file is being written.
        yield ["type", "value", "context", "confidence"]
        yield ["sha256", hashes["sha256"], "malware_sample", "high"]
        yield ["md5", hashes["md5"], "malware_sample", "high"]
        for addr in iocs.get("ips", []):
            yield ["ipv4", addr, "c2_server", "high"]
        for host in iocs.get("domains", []):
            yield ["domain", host, "c2_domain", "medium"]
        for link in iocs.get("urls", []):
            yield ["url", link, "c2_url", "medium"]

    with open(output_path, "w", newline="", encoding="utf-8") as out_file:
        csv.writer(out_file).writerows(_rows())
    logger.info("Exported IOCs to %s", output_path)
def run_extraction(sample_path: str, output_dir: str, yara_rules: str = "",
                   vt_key: str = "") -> dict:
    """Run the full IOC extraction pipeline on one sample.

    Steps: hash the sample, read PE metadata, extract strings, derive network
    and host IOCs, optionally run a YARA scan and a VirusTotal hash lookup,
    then write ``iocs_stix.json`` and ``iocs.csv`` into *output_dir*.

    Args:
        sample_path: Path of the sample to analyse.
        output_dir: Directory for the STIX and CSV exports; created if needed.
        yara_rules: Optional path to a YARA rules file. A path that is not an
            existing file is skipped with a warning.
        vt_key: Optional VirusTotal API key; when empty no lookup is made.

    Returns:
        The report dict, including a ``"summary"`` of IOC counts. The
        ``"virustotal"`` key is present only when *vt_key* is given.
    """
    # main() already creates the directory; do it here too so programmatic
    # callers do not fail when writing the exports. An empty string means the
    # current directory, which os.makedirs would reject.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Naive UTC ISO string, same format datetime.utcnow().isoformat() produced.
    analysis_date = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    report = {"analysis_date": analysis_date, "sample": sample_path}
    report["hashes"] = compute_hashes(sample_path)
    report["pe_metadata"] = extract_pe_metadata(sample_path)
    strings = extract_strings(sample_path)
    report["string_count"] = len(strings)
    report["network_iocs"] = extract_network_iocs(strings)
    report["host_iocs"] = extract_host_iocs(strings)

    report["yara_matches"] = []
    if yara_rules:
        if os.path.isfile(yara_rules):
            report["yara_matches"] = run_yara_scan(sample_path, yara_rules)
        else:
            logger.warning("YARA rules file not found, skipping scan: %s", yara_rules)

    if vt_key:
        report["virustotal"] = validate_ioc_virustotal(
            report["hashes"]["sha256"], "hash", vt_key
        )

    stix = export_stix_bundle(report["network_iocs"], report["hashes"]["sha256"])
    stix_path = os.path.join(output_dir, "iocs_stix.json")
    with open(stix_path, "w", encoding="utf-8") as f:
        json.dump(stix, f, indent=2)
    export_csv(report["network_iocs"], report["hashes"], os.path.join(output_dir, "iocs.csv"))

    report["summary"] = {
        "ips": len(report["network_iocs"]["ips"]),
        "domains": len(report["network_iocs"]["domains"]),
        "urls": len(report["network_iocs"]["urls"]),
        "file_paths": len(report["host_iocs"]["file_paths"]),
        "registry_keys": len(report["host_iocs"]["registry_keys"]),
        # run_yara_scan reports failures as {"error": ...} entries; those are
        # not rule hits and must not be counted as such.
        "yara_hits": sum(1 for match in report["yara_matches"] if "error" not in match),
    }
    return report
def main():
    """Command-line entry point.

    Parses the arguments, makes sure the output directory exists, runs the
    extraction pipeline, saves the JSON report in the output directory and
    prints the same report to stdout.
    """
    cli = argparse.ArgumentParser(description="Malware IOC Extraction Agent")
    option_specs = (
        ("--sample", {"required": True, "help": "Path to malware sample"}),
        ("--yara-rules", {"default": "", "help": "Path to YARA rules file"}),
        ("--vt-key", {"default": "", "help": "VirusTotal API key"}),
        ("--output-dir", {"default": ".", "help": "Output directory"}),
        ("--output", {"default": "ioc_report.json"}),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    opts = cli.parse_args()

    os.makedirs(opts.output_dir, exist_ok=True)
    result = run_extraction(opts.sample, opts.output_dir, opts.yara_rules, opts.vt_key)

    report_file = os.path.join(opts.output_dir, opts.output)
    with open(report_file, "w") as handle:
        # default=str stringifies any value json cannot encode natively.
        json.dump(result, handle, indent=2, default=str)
    logger.info("Report saved to %s", report_file)
    print(json.dumps(result, indent=2, default=str))


if __name__ == "__main__":
    main()