Files
Anthropic-Cybersecurity-Skills/skills/extracting-iocs-from-malware-samples/scripts/agent.py
T
mukul975 27c6414ca5 Add folder anatomy (scripts/agent.py + references/api-reference.md) for 648 cybersecurity skills
Complete skill folder anatomy across all cybersecurity skills:
- scripts/agent.py: 80-150 line Python agents using real libraries (impacket,
  boto3, azure-mgmt-*, kubernetes, pefile, yara, scapy, shodan, stix2, etc.)
- references/api-reference.md: real API documentation with method signatures
- LICENSE: MIT license for all skill folders
2026-03-10 21:02:12 +01:00

272 lines
11 KiB
Python

#!/usr/bin/env python3
# For authorized testing in lab/CTF environments only
"""IOC extraction agent using pefile, yara-python, and requests for VirusTotal validation."""
import argparse
import csv
import hashlib
import json
import logging
import os
import re
import sys
from datetime import datetime
from typing import Dict, List, Optional, Set
# Configure logging at import time: INFO level, each record prefixed with
# a timestamp and the level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)
# pefile is mandatory: without it the script exits immediately with an
# installation hint.
try:
    import pefile
except ImportError:
    sys.exit("pefile required: pip install pefile")
# yara-python is optional. When it is missing the name is bound to None so
# that code using it can test for availability, and a warning is logged.
try:
    import yara
except ImportError:
    yara = None
    logger.warning("yara-python not installed; YARA scanning disabled")
# requests is optional in the same way; None means no VirusTotal lookups.
try:
    import requests
except ImportError:
    requests = None
    logger.warning("requests not installed; VT validation disabled")
# Dotted-quad IPv4 address: four octets in the range 0-255 delimited by
# word boundaries.
IP_RE = re.compile(r"\b(?:(?:25[0-5]|2[0-4]\d|1?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|1?\d\d?)\b")
# Host name: one alphanumeric label (hyphens allowed inside, at most 63
# characters) followed by one or more ".label" parts that consist of at
# least two ASCII letters each.
DOMAIN_RE = re.compile(r"\b[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z]{2,})+\b")
# http:// or https:// followed by a run of characters that stops at
# whitespace or at any of the listed quoting/bracketing characters.
URL_RE = re.compile(r"https?://[^\s<>\"'{}|\\^`\[\]]+")
# E-mail address: local part, "@", a single host label, a dot and an
# alphabetic suffix of at least two letters.
EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}")
# String prefixes of IPv4 addresses that are not reported as IOCs.
# The 172.16.0.0-172.31.255.255 range is not expressible as a simple prefix
# and is therefore not listed here.
PRIVATE_IP_PREFIXES = ("10.", "127.", "0.", "192.168.", "169.254.")
# File-name extensions that DOMAIN_RE would otherwise accept as a top-level
# domain (for example "kernel32.dll").
FALSE_DOMAIN_SUFFIXES = (".dll", ".exe", ".sys", ".ocx", ".drv", ".pdb")
def compute_hashes(file_path: str) -> dict:
    """Compute MD5, SHA-1 and SHA-256 digests of a file together with its size.

    The file is read in fixed-size chunks and fed to the three digests
    incrementally, so memory use stays constant regardless of sample size.

    Args:
        file_path: Path of the file to hash.

    Returns:
        A dict with the hex digests under ``"md5"``, ``"sha1"`` and
        ``"sha256"`` and the byte count under ``"size_bytes"``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    chunk_size = 1024 * 1024
    digests = {
        "md5": hashlib.md5(),
        "sha1": hashlib.sha1(),
        "sha256": hashlib.sha256(),
    }
    size_bytes = 0
    with open(file_path, "rb") as f:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            size_bytes += len(chunk)
            for digest in digests.values():
                digest.update(chunk)
    result = {name: digest.hexdigest() for name, digest in digests.items()}
    result["size_bytes"] = size_bytes
    return result
def extract_pe_metadata(file_path: str) -> dict:
    """Extract PE metadata: imphash, compile time, sections and imports.

    Args:
        file_path: Path of the file to parse with ``pefile``.

    Returns:
        On success a dict with:
          * ``"imphash"``: value of ``pe.get_imphash()``.
          * ``"compile_time"``: ``FILE_HEADER.TimeDateStamp`` rendered as an
            ISO-8601 string (naive, interpreted as UTC).
          * ``"sections"``: one dict per section with ``name``, ``entropy``
            (rounded to two decimals), ``virtual_size`` and ``raw_size``.
          * ``"imports"``: one dict per imported DLL with ``dll`` and
            ``functions`` (at most the first 20 imports that have a name).
        When ``pefile`` raises ``PEFormatError`` the result is
        ``{"error": "Not a valid PE file"}``.
    """
    try:
        pe = pefile.PE(file_path)
    except pefile.PEFormatError:
        return {"error": "Not a valid PE file"}

    try:
        meta = {
            "imphash": pe.get_imphash(),
            "compile_time": datetime.utcfromtimestamp(pe.FILE_HEADER.TimeDateStamp).isoformat(),
            "sections": [],
            "imports": [],
        }
        for section in pe.sections:
            # Section names are fixed-width and NUL padded.
            name = section.Name.rstrip(b"\x00").decode("ascii", errors="replace")
            meta["sections"].append({
                "name": name, "entropy": round(section.get_entropy(), 2),
                "virtual_size": section.Misc_VirtualSize, "raw_size": section.SizeOfRawData,
            })
        # The attribute only exists when the file has an import directory.
        if hasattr(pe, "DIRECTORY_ENTRY_IMPORT"):
            for entry in pe.DIRECTORY_ENTRY_IMPORT:
                dll_name = entry.dll.decode("ascii", errors="replace")
                # Imports by ordinal have no name and are skipped.
                funcs = [imp.name.decode("ascii", errors="replace")
                         for imp in entry.imports if imp.name]
                meta["imports"].append({"dll": dll_name, "functions": funcs[:20]})
        return meta
    except pefile.PEFormatError:
        return {"error": "Not a valid PE file"}
    finally:
        # Always release the parsed file, including when an exception is
        # raised while reading headers, sections or imports.
        pe.close()
def extract_strings(file_path: str, min_length: int = 4) -> List[str]:
    """Return printable strings found in a binary file.

    Two scans are run over the raw bytes: one for runs of printable ASCII
    bytes (space through ``~``), and one for runs of printable ASCII bytes
    each followed by a NUL byte, which are decoded as UTF-16-LE.

    Args:
        file_path: Path of the file to scan.
        min_length: Minimum number of characters a run must have.

    Returns:
        All ASCII matches in file order, followed by all UTF-16-LE matches
        in file order.
    """
    with open(file_path, "rb") as handle:
        blob = handle.read()

    narrow_pattern = re.compile(b"[ -~]{%d,}" % min_length)
    wide_pattern = re.compile(b"(?:[ -~]\x00){%d,}" % min_length)

    found: List[str] = []
    for raw in narrow_pattern.findall(blob):
        found.append(raw.decode("ascii"))
    for raw in wide_pattern.findall(blob):
        found.append(raw.decode("utf-16-le", errors="ignore"))
    return found
def extract_network_iocs(strings: List[str]) -> dict:
    """Collect IPv4 addresses, domains, URLs and e-mail addresses from strings.

    Addresses that start with one of ``PRIVATE_IP_PREFIXES`` or that fall in
    172.16.x.x-172.31.x.x are dropped. Domains are lower-cased and dropped
    when they end with one of ``FALSE_DOMAIN_SUFFIXES``. E-mail addresses
    are lower-cased; URLs are kept as found.

    Args:
        strings: Text fragments to search.

    Returns:
        A dict with the keys ``"ips"``, ``"domains"``, ``"urls"`` and
        ``"emails"``, each holding a sorted list of unique values.
    """
    def _reportable(addr: str) -> bool:
        # str.startswith accepts a tuple of prefixes.
        if addr.startswith(PRIVATE_IP_PREFIXES):
            return False
        first, second = addr.split(".")[:2]
        return not (int(first) == 172 and 16 <= int(second) <= 31)

    found = {"ips": set(), "domains": set(), "urls": set(), "emails": set()}
    for text in strings:
        found["ips"].update(filter(_reportable, IP_RE.findall(text)))
        found["domains"].update(
            host
            for host in map(str.lower, DOMAIN_RE.findall(text))
            if not host.endswith(FALSE_DOMAIN_SUFFIXES)
        )
        found["urls"].update(URL_RE.findall(text))
        found["emails"].update(map(str.lower, EMAIL_RE.findall(text)))
    return {kind: sorted(values) for kind, values in found.items()}
def extract_host_iocs(strings: List[str]) -> dict:
    """Pick out file paths, registry keys and mutex names from strings.

    A string is classified by how it begins:
      * an upper-case drive letter, colon and backslash (and more than five
        characters overall) -> file path;
      * HKLM, HKCU, HKCR, HKU or HKCC plus a backslash, any case -> registry
        key;
      * ``Global\\`` or ``Local\\``, any case -> mutex.

    Args:
        strings: Text fragments to classify.

    Returns:
        A dict with sorted, de-duplicated lists under ``"file_paths"`` (at
        most 30 entries), ``"registry_keys"`` (at most 20) and ``"mutexes"``
        (at most 10).
    """
    drive_prefix = re.compile(r"[A-Z]:\\")
    hive_prefix = re.compile(r"(?i)(HKLM|HKCU|HKCR|HKU|HKCC)\\")
    namespace_prefix = re.compile(r"(?i)(Global\\|Local\\)")

    paths = {text for text in strings if drive_prefix.match(text) and len(text) > 5}
    keys = {text for text in strings if hive_prefix.match(text)}
    named_objects = {text for text in strings if namespace_prefix.match(text)}

    return {
        "file_paths": sorted(paths)[:30],
        "registry_keys": sorted(keys)[:20],
        "mutexes": sorted(named_objects)[:10],
    }
def run_yara_scan(file_path: str, rules_path: str) -> List[dict]:
    """Match a file against a YARA rules file.

    Args:
        file_path: Path of the file to scan.
        rules_path: Path of the YARA rules source passed to ``yara.compile``.

    Returns:
        One dict per matching rule with ``"rule"``, ``"tags"``, ``"meta"``
        and ``"strings"``; the latter holds at most ten
        ``(identifier, offset)`` tuples, where the offset is that of the
        first instance of the string (0 when there are no instances).
        If yara-python is unavailable, or ``yara.Error`` is raised, a
        single-element list ``[{"error": <message>}]`` is returned instead.
    """
    if not yara:
        return [{"error": "yara-python not installed"}]

    try:
        compiled = yara.compile(filepath=rules_path)
        findings = []
        for hit in compiled.match(file_path):
            located = []
            for matched in hit.strings:
                first_offset = matched.instances[0].offset if matched.instances else 0
                located.append((matched.identifier, first_offset))
            findings.append({
                "rule": hit.rule,
                "tags": hit.tags,
                "meta": hit.meta,
                "strings": located[:10],
            })
        return findings
    except yara.Error as exc:
        return [{"error": str(exc)}]
def validate_ioc_virustotal(ioc_value: str, ioc_type: str, api_key: str) -> dict:
    """Look up a single IOC in the VirusTotal API v3.

    The lookup is best effort: any expected failure (network error, non-200
    status, unexpected response body) is logged and reported as
    "not validated" instead of being raised.

    Args:
        ioc_value: The indicator itself (IP address, domain or file hash).
        ioc_type: One of ``"ip"``, ``"domain"`` or ``"hash"``; any other
            value yields ``{"validated": False}``.
        api_key: VirusTotal API key sent in the ``x-apikey`` header. An
            empty key disables the lookup.

    Returns:
        ``{"validated": True, "malicious": int, "suspicious": int}`` taken
        from ``last_analysis_stats`` on an HTTP 200 response, otherwise
        ``{"validated": False}``.
    """
    # Check the key first so that a disabled lookup never touches requests.
    if not api_key or not requests:
        return {"validated": False}

    collections = {"ip": "ip_addresses", "domain": "domains", "hash": "files"}
    collection = collections.get(ioc_type)
    if not collection:
        return {"validated": False}
    url = f"https://www.virustotal.com/api/v3/{collection}/{ioc_value}"

    try:
        resp = requests.get(url, headers={"x-apikey": api_key}, timeout=10)
        if resp.status_code != 200:
            logger.warning("VirusTotal lookup for %s returned HTTP %s",
                           ioc_value, resp.status_code)
            return {"validated": False}
        stats = resp.json()["data"]["attributes"]["last_analysis_stats"]
        return {"validated": True, "malicious": stats.get("malicious", 0),
                "suspicious": stats.get("suspicious", 0)}
    except (requests.RequestException, ValueError, KeyError, TypeError,
            AttributeError) as exc:
        # Transport failures, an undecodable body, or a body that lacks the
        # expected structure: log the reason rather than hiding it.
        logger.warning("VirusTotal lookup for %s failed: %s", ioc_value, exc)
    return {"validated": False}
def defang_ioc(value: str) -> str:
    """Return *value* rewritten so that it is no longer clickable.

    Every occurrence of ``http`` becomes ``hxxp`` and every ``.`` becomes
    ``[.]``.
    """
    # Splitting on a token and re-joining with its substitute replaces every
    # occurrence of that token.
    without_scheme = "hxxp".join(value.split("http"))
    return "[.]".join(without_scheme.split("."))
def export_stix_bundle(iocs: dict, sha256: str) -> dict:
    """Build a minimal STIX 2.1 bundle of indicators from extracted IOCs.

    One indicator is emitted for the sample hash, one per IP address and
    one for each of the first 20 domains. Every indicator carries the
    properties STIX 2.1 requires (``id``, ``created``, ``modified``,
    ``pattern``, ``pattern_type``, ``valid_from``), and both the bundle and
    the indicators get freshly generated ``<type>--<UUIDv4>`` identifiers.

    Args:
        iocs: Dict that may contain the lists ``"ips"`` and ``"domains"``;
            missing keys are treated as empty.
        sha256: SHA-256 hex digest of the analysed sample.

    Returns:
        A dict with ``"type": "bundle"``, a bundle ``"id"`` and the list of
        indicator dicts under ``"objects"``.
    """
    # Local imports: these names are not imported at module level.
    import uuid
    from datetime import timezone

    now = datetime.now(timezone.utc)
    valid_from = now.strftime("%Y-%m-%dT%H:%M:%SZ")
    # created/modified are written with millisecond precision.
    created = now.strftime("%Y-%m-%dT%H:%M:%S.") + f"{now.microsecond // 1000:03d}Z"

    def _literal(value) -> str:
        # Escape backslashes and single quotes for use inside a STIX
        # pattern string literal.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    def _indicator(name: str, pattern: str) -> dict:
        return {
            "type": "indicator",
            "spec_version": "2.1",
            "id": f"indicator--{uuid.uuid4()}",
            "created": created,
            "modified": created,
            "name": name,
            "pattern": pattern,
            "pattern_type": "stix",
            "valid_from": valid_from,
        }

    indicators = [
        _indicator("Malware Hash", f"[file:hashes.'SHA-256' = '{_literal(sha256)}']"),
    ]
    for ip in iocs.get("ips", []):
        indicators.append(
            _indicator(f"C2 IP {ip}", f"[ipv4-addr:value = '{_literal(ip)}']"))
    for domain in iocs.get("domains", [])[:20]:
        indicators.append(
            _indicator(f"C2 Domain {domain}", f"[domain-name:value = '{_literal(domain)}']"))

    return {"type": "bundle", "id": f"bundle--{uuid.uuid4()}", "objects": indicators}
def export_csv(iocs: dict, hashes: dict, output_path: str) -> None:
    """Write the IOCs to a CSV file for SIEM ingestion.

    The file starts with the header ``type,value,context,confidence``,
    followed by the sample's SHA-256 and MD5, then one row per IP address,
    domain and URL. A log message is emitted once the file is written.

    Args:
        iocs: Dict that may contain the lists ``"ips"``, ``"domains"`` and
            ``"urls"``; missing keys are treated as empty.
        hashes: Dict providing the ``"sha256"`` and ``"md5"`` digests.
        output_path: Destination file; it is created or overwritten.
    """
    def _rows():
        # Rows are produced lazily, in the order they appear in the file.
        yield ["type", "value", "context", "confidence"]
        yield ["sha256", hashes["sha256"], "malware_sample", "high"]
        yield ["md5", hashes["md5"], "malware_sample", "high"]
        for address in iocs.get("ips", []):
            yield ["ipv4", address, "c2_server", "high"]
        for host in iocs.get("domains", []):
            yield ["domain", host, "c2_domain", "medium"]
        for link in iocs.get("urls", []):
            yield ["url", link, "c2_url", "medium"]

    with open(output_path, "w", newline="", encoding="utf-8") as sink:
        csv.writer(sink).writerows(_rows())
    logger.info("Exported IOCs to %s", output_path)
def run_extraction(sample_path: str, output_dir: str, yara_rules: str = "",
                   vt_key: str = "") -> dict:
    """Run the full IOC extraction pipeline on one sample.

    Steps: hash the sample, read PE metadata, extract strings, derive
    network and host IOCs, optionally scan with YARA, optionally look up the
    SHA-256 on VirusTotal, then write ``iocs_stix.json`` and ``iocs.csv``
    into *output_dir*.

    Args:
        sample_path: Path of the sample to analyse.
        output_dir: Directory for the STIX and CSV exports. It is created
            when missing; an empty string means the current directory.
        yara_rules: Optional path of a YARA rules file. If the path is given
            but is not an existing file, a warning is logged and the scan is
            skipped.
        vt_key: Optional VirusTotal API key; when empty no lookup is made
            and the report has no ``"virustotal"`` entry.

    Returns:
        The report dict with ``analysis_date``, ``sample``, ``hashes``,
        ``pe_metadata``, ``string_count``, ``network_iocs``, ``host_iocs``,
        ``yara_matches``, optionally ``virustotal``, and ``summary`` (item
        counts).
    """
    # Create the export directory up front so callers other than the CLI do
    # not fail after the analysis has already been done.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    report = {"analysis_date": datetime.utcnow().isoformat(), "sample": sample_path}
    report["hashes"] = compute_hashes(sample_path)
    report["pe_metadata"] = extract_pe_metadata(sample_path)

    strings = extract_strings(sample_path)
    report["string_count"] = len(strings)
    report["network_iocs"] = extract_network_iocs(strings)
    report["host_iocs"] = extract_host_iocs(strings)

    if yara_rules and os.path.isfile(yara_rules):
        report["yara_matches"] = run_yara_scan(sample_path, yara_rules)
    else:
        if yara_rules:
            logger.warning("YARA rules file not found, skipping scan: %s", yara_rules)
        report["yara_matches"] = []

    if vt_key:
        report["virustotal"] = validate_ioc_virustotal(
            report["hashes"]["sha256"], "hash", vt_key)

    stix = export_stix_bundle(report["network_iocs"], report["hashes"]["sha256"])
    stix_path = os.path.join(output_dir, "iocs_stix.json")
    with open(stix_path, "w", encoding="utf-8") as f:
        json.dump(stix, f, indent=2)
    export_csv(report["network_iocs"], report["hashes"], os.path.join(output_dir, "iocs.csv"))

    report["summary"] = {
        "ips": len(report["network_iocs"]["ips"]),
        "domains": len(report["network_iocs"]["domains"]),
        "urls": len(report["network_iocs"]["urls"]),
        "file_paths": len(report["host_iocs"]["file_paths"]),
        "registry_keys": len(report["host_iocs"]["registry_keys"]),
        "yara_hits": len(report["yara_matches"]),
    }
    return report
def main():
    """Command-line entry point.

    Parses the arguments, makes sure the output directory exists, runs the
    extraction pipeline, saves the report as JSON inside the output
    directory and prints the same JSON to standard output.
    """
    cli = argparse.ArgumentParser(description="Malware IOC Extraction Agent")
    option_table = (
        ("--sample", {"required": True, "help": "Path to malware sample"}),
        ("--yara-rules", {"default": "", "help": "Path to YARA rules file"}),
        ("--vt-key", {"default": "", "help": "VirusTotal API key"}),
        ("--output-dir", {"default": ".", "help": "Output directory"}),
        ("--output", {"default": "ioc_report.json"}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()

    os.makedirs(opts.output_dir, exist_ok=True)
    report = run_extraction(
        sample_path=opts.sample,
        output_dir=opts.output_dir,
        yara_rules=opts.yara_rules,
        vt_key=opts.vt_key,
    )

    report_file = os.path.join(opts.output_dir, opts.output)
    with open(report_file, "w") as sink:
        # default=str turns values json cannot encode natively into strings.
        json.dump(report, sink, indent=2, default=str)
    logger.info("Report saved to %s", report_file)
    print(json.dumps(report, indent=2, default=str))


if __name__ == "__main__":
    main()