#!/usr/bin/env python3
# For authorized testing in lab/CTF environments only
"""IOC extraction agent using pefile, yara-python, and requests for VirusTotal validation.

Pipeline (see ``run_extraction``): hash the sample, parse PE metadata, pull
printable strings, derive network and host indicators from those strings,
optionally scan with YARA and look the hash up on VirusTotal, then write a
STIX bundle and a CSV next to the JSON report.

All three third-party packages are optional at import time so the pure-Python
helpers stay usable as a library; the command-line entry point still requires
``pefile``.
"""

import argparse
import csv
import hashlib
import json
import logging
import os
import re
import sys
import uuid
from datetime import datetime, timezone
from typing import List, Set
from urllib.parse import quote

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)

try:
    import pefile
except ImportError:
    # Importing this module must not kill the interpreter; main() enforces
    # the hard requirement for command-line use instead.
    pefile = None
    logger.warning("pefile not installed; PE metadata extraction disabled")

try:
    import yara
except ImportError:
    yara = None
    logger.warning("yara-python not installed; YARA scanning disabled")

try:
    import requests
except ImportError:
    requests = None
    logger.warning("requests not installed; VT validation disabled")

IP_RE = re.compile(r"\b(?:(?:25[0-5]|2[0-4]\d|1?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|1?\d\d?)\b")
DOMAIN_RE = re.compile(r"\b[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z]{2,})+\b")
URL_RE = re.compile(r"https?://[^\s<>\"'{}|\\^`\[\]]+")
# The domain part accepts several dot-separated labels; a single-label pattern
# would cut "user@mail.example.com" down to "user@mail.example".
EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")

# Prefix-matchable ranges that are dropped from the IP results.
# 172.16.0.0/12 cannot be expressed as a prefix and is checked numerically.
PRIVATE_IP_PREFIXES = ("10.", "127.", "0.", "192.168.", "169.254.")
# File extensions that DOMAIN_RE would otherwise report as domains.
FALSE_DOMAIN_SUFFIXES = (".dll", ".exe", ".sys", ".ocx", ".drv", ".pdb")

_FILE_PATH_RE = re.compile(r"[A-Z]:\\")
_REGISTRY_KEY_RE = re.compile(r"(?i)(HKLM|HKCU|HKCR|HKU|HKCC)\\")
_MUTEX_RE = re.compile(r"(?i)(Global\\|Local\\)")
_URL_SCHEME_RE = re.compile(r"\bhttp(s?)://", re.IGNORECASE)

_HASH_CHUNK_SIZE = 1024 * 1024

_VT_API_BASE = "https://www.virustotal.com/api/v3"
_VT_COLLECTIONS = {"ip": "ip_addresses", "domain": "domains", "hash": "files"}


def _utc_now() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(timezone.utc)


def compute_hashes(file_path: str) -> dict:
    """Compute MD5, SHA-1, SHA-256 hashes of a file.

    The file is read in fixed-size chunks so hashing does not need the whole
    sample in memory.

    Args:
        file_path: Path of the file to hash.

    Returns:
        Dict with hex digests under ``md5``, ``sha1``, ``sha256`` and the
        byte count under ``size_bytes``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    md5 = hashlib.md5()
    sha1 = hashlib.sha1()
    sha256 = hashlib.sha256()
    size = 0
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(_HASH_CHUNK_SIZE), b""):
            md5.update(chunk)
            sha1.update(chunk)
            sha256.update(chunk)
            size += len(chunk)
    return {
        "md5": md5.hexdigest(),
        "sha1": sha1.hexdigest(),
        "sha256": sha256.hexdigest(),
        "size_bytes": size,
    }


def _pe_compile_time(timestamp: int):
    """Format a PE header timestamp as a naive ISO-8601 UTC string.

    Returns None when the platform cannot convert the value.
    """
    try:
        moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    except (OverflowError, OSError, ValueError):
        return None
    # tzinfo is dropped so the string has the same offset-free format the
    # report has always used; the value itself is UTC.
    return moment.replace(tzinfo=None).isoformat()


def extract_pe_metadata(file_path: str) -> dict:
    """Extract PE file metadata including imphash and compile time.

    Args:
        file_path: Path of the sample to parse.

    Returns:
        Dict with ``imphash``, ``compile_time``, ``sections`` (name, entropy,
        virtual and raw size per section) and ``imports`` (DLL name plus at
        most 20 imported function names each). On failure the dict holds a
        single ``error`` key instead.
    """
    if pefile is None:
        return {"error": "pefile not installed"}
    try:
        pe = pefile.PE(file_path)
    except pefile.PEFormatError:
        return {"error": "Not a valid PE file"}
    try:
        meta = {
            "imphash": pe.get_imphash(),
            "compile_time": _pe_compile_time(pe.FILE_HEADER.TimeDateStamp),
            "sections": [],
            "imports": [],
        }
        for section in pe.sections:
            name = section.Name.rstrip(b"\x00").decode("ascii", errors="replace")
            meta["sections"].append({
                "name": name,
                "entropy": round(section.get_entropy(), 2),
                "virtual_size": section.Misc_VirtualSize,
                "raw_size": section.SizeOfRawData,
            })
        # The attribute is only looked up when present, hence the guard.
        if hasattr(pe, "DIRECTORY_ENTRY_IMPORT"):
            for entry in pe.DIRECTORY_ENTRY_IMPORT:
                dll_name = entry.dll.decode("ascii", errors="replace")
                # Imports without a name are skipped.
                funcs = [
                    imp.name.decode("ascii", errors="replace")
                    for imp in entry.imports
                    if imp.name
                ]
                meta["imports"].append({"dll": dll_name, "functions": funcs[:20]})
        return meta
    except pefile.PEFormatError:
        return {"error": "Not a valid PE file"}
    finally:
        # Release the handle on every path, including unexpected errors.
        pe.close()


def extract_strings(file_path: str, min_length: int = 4) -> List[str]:
    """Extract ASCII and Unicode strings from binary.

    Args:
        file_path: Path of the file to scan.
        min_length: Minimum number of characters a run must have.

    Returns:
        Runs of printable ASCII bytes, followed by runs of printable ASCII
        characters stored as UTF-16-LE (each character followed by a NUL).
    """
    with open(file_path, "rb") as f:
        data = f.read()
    ascii_strs = [
        s.decode("ascii") for s in re.findall(b"[ -~]{%d,}" % min_length, data)
    ]
    unicode_strs = [
        s.decode("utf-16-le", errors="ignore")
        for s in re.findall(b"(?:[ -~]\x00){%d,}" % min_length, data)
    ]
    return ascii_strs + unicode_strs


def _is_private_ip(ip: str) -> bool:
    """Return True for addresses in the ranges excluded from IOC output."""
    if ip.startswith(PRIVATE_IP_PREFIXES):
        return True
    first, second = ip.split(".")[:2]
    return int(first) == 172 and 16 <= int(second) <= 31


def extract_network_iocs(strings: List[str]) -> dict:
    """Extract IPs, domains, URLs, emails from string list.

    Args:
        strings: Candidate strings, e.g. the output of ``extract_strings``.

    Returns:
        Dict with sorted, de-duplicated lists under ``ips``, ``domains``,
        ``urls`` and ``emails``. Addresses in the excluded private ranges and
        names ending in a known file extension are left out; domains and
        emails are lower-cased.
    """
    ips: Set[str] = set()
    domains: Set[str] = set()
    urls: Set[str] = set()
    emails: Set[str] = set()
    for s in strings:
        ips.update(ip for ip in IP_RE.findall(s) if not _is_private_ip(ip))
        for d in DOMAIN_RE.findall(s):
            lowered = d.lower()
            if not lowered.endswith(FALSE_DOMAIN_SUFFIXES):
                domains.add(lowered)
        urls.update(URL_RE.findall(s))
        emails.update(e.lower() for e in EMAIL_RE.findall(s))
    return {
        "ips": sorted(ips),
        "domains": sorted(domains),
        "urls": sorted(urls),
        "emails": sorted(emails),
    }


def extract_host_iocs(strings: List[str]) -> dict:
    """Extract file paths, registry keys, and mutexes from strings.

    A string is classified by how it starts: an upper-case drive letter plus
    ``:\\`` (and more than five characters) is a file path, a registry hive
    abbreviation plus a backslash is a registry key, ``Global\\`` or
    ``Local\\`` is a mutex name.

    Args:
        strings: Candidate strings, e.g. the output of ``extract_strings``.

    Returns:
        Dict with sorted lists capped at 30 ``file_paths``, 20
        ``registry_keys`` and 10 ``mutexes``.
    """
    file_paths: Set[str] = set()
    registry_keys: Set[str] = set()
    mutexes: Set[str] = set()
    for s in strings:
        if _FILE_PATH_RE.match(s) and len(s) > 5:
            file_paths.add(s)
        if _REGISTRY_KEY_RE.match(s):
            registry_keys.add(s)
        if _MUTEX_RE.match(s):
            mutexes.add(s)
    return {
        "file_paths": sorted(file_paths)[:30],
        "registry_keys": sorted(registry_keys)[:20],
        "mutexes": sorted(mutexes)[:10],
    }


def _yara_string_summary(string_match) -> tuple:
    """Return ``(identifier, first_offset)`` for one matched YARA string.

    Handles objects exposing ``identifier``/``instances`` as well as the
    ``(offset, identifier, data)`` tuples produced by older yara-python
    releases.
    """
    if isinstance(string_match, tuple):
        return (string_match[1], string_match[0])
    instances = string_match.instances
    return (string_match.identifier, instances[0].offset if instances else 0)


def run_yara_scan(file_path: str, rules_path: str) -> List[dict]:
    """Scan file with YARA rules.

    Args:
        file_path: Path of the sample to scan.
        rules_path: Path of the YARA rules file to compile.

    Returns:
        One dict per matching rule with ``rule``, ``tags``, ``meta`` and up
        to 10 ``strings`` entries of ``(identifier, first_offset)``. If
        yara-python is missing or YARA raises, a single-element list holding
        an ``error`` dict is returned instead.
    """
    if not yara:
        return [{"error": "yara-python not installed"}]
    try:
        rules = yara.compile(filepath=rules_path)
        matches = rules.match(file_path)
        return [
            {
                "rule": m.rule,
                "tags": m.tags,
                "meta": m.meta,
                "strings": [_yara_string_summary(s) for s in m.strings][:10],
            }
            for m in matches
        ]
    except yara.Error as exc:
        return [{"error": str(exc)}]


def validate_ioc_virustotal(ioc_value: str, ioc_type: str, api_key: str) -> dict:
    """Validate a single IOC against VirusTotal API v3.

    Args:
        ioc_value: The indicator to look up.
        ioc_type: One of ``"ip"``, ``"domain"`` or ``"hash"``.
        api_key: VirusTotal API key; an empty key disables the lookup.

    Returns:
        ``{"validated": True, "malicious": int, "suspicious": int}`` when the
        API answers with HTTP 200 and the expected payload, otherwise
        ``{"validated": False}``. Failures are logged, never raised.
    """
    not_validated = {"validated": False}
    if not requests or not api_key:
        return not_validated
    collection = _VT_COLLECTIONS.get(ioc_type)
    if not collection:
        return not_validated
    # The IOC comes from strings inside an untrusted sample, so it is quoted
    # to keep it confined to a single URL path segment.
    url = f"{_VT_API_BASE}/{collection}/{quote(ioc_value, safe='')}"
    try:
        resp = requests.get(url, headers={"x-apikey": api_key}, timeout=10)
        if resp.status_code != 200:
            logger.info(
                "VirusTotal lookup for %s %s returned HTTP %s",
                ioc_type, ioc_value, resp.status_code,
            )
            return not_validated
        stats = resp.json()["data"]["attributes"]["last_analysis_stats"]
        return {
            "validated": True,
            "malicious": stats.get("malicious", 0),
            "suspicious": stats.get("suspicious", 0),
        }
    except requests.RequestException as exc:
        logger.warning("VirusTotal request for %s %s failed: %s", ioc_type, ioc_value, exc)
    except (ValueError, KeyError, TypeError, AttributeError) as exc:
        logger.warning(
            "Unexpected VirusTotal response for %s %s: %r", ioc_type, ioc_value, exc
        )
    return not_validated


def defang_ioc(value: str) -> str:
    """Defang an IOC for safe sharing.

    Only the ``http://`` / ``https://`` scheme is rewritten to ``hxxp``; a
    blanket replacement would also corrupt hosts or paths that merely contain
    the letters "http". Every dot becomes ``[.]``.

    Args:
        value: IP address, domain or URL.

    Returns:
        The defanged text.
    """
    return _URL_SCHEME_RE.sub(r"hxxp\1://", value).replace(".", "[.]")


def _stix_escape(value: str) -> str:
    """Escape backslashes and single quotes for a STIX pattern string literal."""
    return value.replace("\\", "\\\\").replace("'", "\\'")


def _stix_indicator(pattern: str, name: str, ts: str) -> dict:
    """Build one STIX indicator object with a freshly generated identifier."""
    return {
        "type": "indicator",
        "spec_version": "2.1",
        "id": f"indicator--{uuid.uuid4()}",
        "created": ts,
        "modified": ts,
        "pattern": pattern,
        "pattern_type": "stix",
        "valid_from": ts,
        "name": name,
    }


def export_stix_bundle(iocs: dict, sha256: str) -> dict:
    """Build a minimal STIX 2.1 bundle from extracted IOCs.

    Args:
        iocs: Network IOC dict; the ``ips`` and ``domains`` lists are used.
        sha256: SHA-256 of the sample, emitted as the first indicator.

    Returns:
        A bundle dict whose ``objects`` are the hash indicator, one indicator
        per IP and one per domain (first 20 domains only). The bundle and
        every indicator get a random ``<type>--<uuid4>`` identifier, so two
        calls never return equal ids.
    """
    ts = _utc_now().strftime("%Y-%m-%dT%H:%M:%SZ")
    indicators = [
        _stix_indicator(
            f"[file:hashes.'SHA-256' = '{_stix_escape(sha256)}']", "Malware Hash", ts
        )
    ]
    for ip in iocs.get("ips", []):
        indicators.append(
            _stix_indicator(f"[ipv4-addr:value = '{_stix_escape(ip)}']", f"C2 IP {ip}", ts)
        )
    for domain in iocs.get("domains", [])[:20]:
        indicators.append(
            _stix_indicator(
                f"[domain-name:value = '{_stix_escape(domain)}']",
                f"C2 Domain {domain}",
                ts,
            )
        )
    return {"type": "bundle", "id": f"bundle--{uuid.uuid4()}", "objects": indicators}


def export_csv(iocs: dict, hashes: dict, output_path: str) -> None:
    """Export IOCs to CSV for SIEM ingestion.

    Args:
        iocs: Network IOC dict; ``ips``, ``domains`` and ``urls`` are written.
        hashes: Hash dict providing ``sha256`` and ``md5``.
        output_path: Destination file; overwritten if it exists.
    """
    rows = [
        ["type", "value", "context", "confidence"],
        ["sha256", hashes["sha256"], "malware_sample", "high"],
        ["md5", hashes["md5"], "malware_sample", "high"],
    ]
    rows.extend(["ipv4", ip, "c2_server", "high"] for ip in iocs.get("ips", []))
    rows.extend(["domain", d, "c2_domain", "medium"] for d in iocs.get("domains", []))
    rows.extend(["url", u, "c2_url", "medium"] for u in iocs.get("urls", []))
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(rows)
    logger.info("Exported IOCs to %s", output_path)


def run_extraction(sample_path: str, output_dir: str, yara_rules: str = "", vt_key: str = "") -> dict:
    """Run full IOC extraction pipeline.

    Writes ``iocs_stix.json`` and ``iocs.csv`` into ``output_dir``.

    Args:
        sample_path: Path of the sample to analyse.
        output_dir: Directory for the STIX and CSV exports; created if missing.
        yara_rules: Optional path of a YARA rules file. The scan is skipped
            when the path is empty or is not a file.
        vt_key: Optional VirusTotal API key; when set, the sample's SHA-256
            is looked up and the result stored under ``virustotal``.

    Returns:
        The report dict (hashes, PE metadata, string count, network and host
        IOCs, YARA matches, optional VirusTotal result and a count summary).
    """
    if output_dir:
        # Library callers may pass a directory that does not exist yet.
        os.makedirs(output_dir, exist_ok=True)

    report = {
        # Offset-free ISO format retained for existing consumers; value is UTC.
        "analysis_date": _utc_now().replace(tzinfo=None).isoformat(),
        "sample": sample_path,
    }
    report["hashes"] = compute_hashes(sample_path)
    report["pe_metadata"] = extract_pe_metadata(sample_path)

    strings = extract_strings(sample_path)
    report["string_count"] = len(strings)
    report["network_iocs"] = extract_network_iocs(strings)
    report["host_iocs"] = extract_host_iocs(strings)

    if yara_rules and os.path.isfile(yara_rules):
        report["yara_matches"] = run_yara_scan(sample_path, yara_rules)
    else:
        if yara_rules:
            logger.warning("YARA rules file not found: %s; skipping scan", yara_rules)
        report["yara_matches"] = []

    if vt_key:
        report["virustotal"] = validate_ioc_virustotal(
            report["hashes"]["sha256"], "hash", vt_key
        )

    stix = export_stix_bundle(report["network_iocs"], report["hashes"]["sha256"])
    stix_path = os.path.join(output_dir, "iocs_stix.json")
    with open(stix_path, "w", encoding="utf-8") as f:
        json.dump(stix, f, indent=2)
    export_csv(report["network_iocs"], report["hashes"], os.path.join(output_dir, "iocs.csv"))

    report["summary"] = {
        "ips": len(report["network_iocs"]["ips"]),
        "domains": len(report["network_iocs"]["domains"]),
        "urls": len(report["network_iocs"]["urls"]),
        "file_paths": len(report["host_iocs"]["file_paths"]),
        "registry_keys": len(report["host_iocs"]["registry_keys"]),
        "yara_hits": len(report["yara_matches"]),
    }
    return report


def main():
    """Parse command-line arguments, run the pipeline and emit the JSON report.

    The report is written to ``<output-dir>/<output>`` and also printed to
    standard output.
    """
    parser = argparse.ArgumentParser(description="Malware IOC Extraction Agent")
    parser.add_argument("--sample", required=True, help="Path to malware sample")
    parser.add_argument("--yara-rules", default="", help="Path to YARA rules file")
    parser.add_argument(
        "--vt-key",
        default=os.environ.get("VT_API_KEY", ""),
        help="VirusTotal API key (defaults to the VT_API_KEY environment variable)",
    )
    parser.add_argument("--output-dir", default=".", help="Output directory")
    parser.add_argument("--output", default="ioc_report.json")
    args = parser.parse_args()

    if pefile is None:
        sys.exit("pefile required: pip install pefile")
    if not os.path.isfile(args.sample):
        parser.error(f"sample not found: {args.sample}")

    os.makedirs(args.output_dir, exist_ok=True)
    report = run_extraction(args.sample, args.output_dir, args.yara_rules, args.vt_key)
    out_path = os.path.join(args.output_dir, args.output)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, default=str)
    logger.info("Report saved to %s", out_path)
    print(json.dumps(report, indent=2, default=str))


if __name__ == "__main__":
    main()