#!/usr/bin/env python3
"""Agent for performing malware IOC extraction from files, reports, and samples."""

import argparse
import hashlib
import json
import re
from datetime import datetime, timezone
from pathlib import Path

# One compiled pattern per indicator type. Every group is non-capturing so
# that ``findall`` returns the full match rather than a sub-group.
IOC_PATTERNS = {
    "ipv4": re.compile(r"\b(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)\b"),
    "ipv6": re.compile(r"\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b"),
    "domain": re.compile(r"\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+(?:com|net|org|io|ru|cn|xyz|top|info|biz|cc|tk|ml|ga|cf|gq|pw)\b"),
    "url": re.compile(r"https?://[^\s<>\"'\)]+"),
    "email": re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"),
    # Hash digests are hex and therefore case-insensitive; accept both cases
    # here and normalise to lowercase in extract_iocs_from_text().
    "md5": re.compile(r"\b[a-fA-F0-9]{32}\b"),
    "sha1": re.compile(r"\b[a-fA-F0-9]{40}\b"),
    "sha256": re.compile(r"\b[a-fA-F0-9]{64}\b"),
    "cve": re.compile(r"CVE-\d{4}-\d{4,7}", re.I),
    "registry_key": re.compile(r"(?:HKLM|HKCU|HKCR|HKU|HKCC)\\[^\s\"']+"),
    "file_path_windows": re.compile(r"[A-Z]:\\(?:[^\s\\\"]+\\)*[^\s\\\"]+\.\w{1,5}"),
    "file_path_unix": re.compile(r"/(?:tmp|var|etc|usr|home|opt|bin|sbin)/[^\s\"']+"),
    "mutex": re.compile(r"(?:Global|Local)\\[^\s\"']+"),
    "bitcoin_addr": re.compile(r"\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b"),
    "mitre_technique": re.compile(r"T\d{4}(?:\.\d{3})?"),
    "user_agent": re.compile(r"Mozilla/5\.0[^\n\"]{20,200}"),
}

# (pattern, replacement callback) pairs that turn defanged indicators such as
# ``1[.]2[.]3[.]4`` or ``hxxp://`` back into their live form before matching.
DEFANGED_PATTERNS = {
    "ip_defanged": (
        re.compile(r"\b\d+\[\.\]\d+\[\.\]\d+\[\.\]\d+\b"),
        lambda m: m.group().replace("[.]", "."),
    ),
    "url_defanged": (
        re.compile(r"hxxps?://[^\s]+"),
        lambda m: m.group().replace("hxxp", "http"),
    ),
    "domain_defanged": (
        re.compile(r"\b\S+\[\.\]\S+\b"),
        lambda m: m.group().replace("[.]", "."),
    ),
}

# Prefixes of the IPv4 ranges that are dropped from the results
# (10/8, 172.16/12, 192.168/16 and loopback 127/8).
_PRIVATE_IPV4 = re.compile(r"^(?:10\.|172\.(?:1[6-9]|2\d|3[01])\.|192\.168\.|127\.)")

# Indicator types whose matches are normalised to lowercase.
_HASH_TYPES = frozenset({"md5", "sha1", "sha256"})

# Upper bound on the number of indicators reported per type.
_MAX_PER_TYPE = 200

# Substrings (compared case-insensitively) that mark a string as suspicious.
_SUSPICIOUS_KEYWORDS = tuple(
    kw.lower()
    for kw in (
        "http", "socket", "connect", "download", "upload", "exec",
        "cmd.exe", "powershell", "reg add", "CreateRemoteThread",
        "VirtualAlloc", "WriteProcessMemory", "LoadLibrary",
        "GetProcAddress", "WinExec", "ShellExecute",
    )
)


def extract_iocs_from_text(text):
    """Extract all IOC types from raw text.

    Defanged indicators are refanged first, then every pattern in
    ``IOC_PATTERNS`` is applied to the refanged text.

    Args:
        text: Text to scan.

    Returns:
        A dict mapping IOC type name to a sorted, de-duplicated list of at
        most 200 matches. Types with no matches are omitted. Private and
        loopback IPv4 addresses are excluded, and hash digests are returned
        in lowercase.
    """
    refanged = text
    for pattern, fixer in DEFANGED_PATTERNS.values():
        refanged = pattern.sub(fixer, refanged)

    extracted = {}
    for ioc_type, pattern in IOC_PATTERNS.items():
        matches = set(pattern.findall(refanged))
        if ioc_type in _HASH_TYPES:
            matches = {digest.lower() for digest in matches}
        elif ioc_type == "ipv4":
            # Filter before truncating so private addresses cannot crowd
            # public ones out of the capped result.
            matches = {ip for ip in matches if not _PRIVATE_IPV4.match(ip)}
        if matches:
            extracted[ioc_type] = sorted(matches)[:_MAX_PER_TYPE]
    return extracted


def extract_from_file(file_path):
    """Extract IOCs from a file (text, PDF text, or report).

    The file is read as UTF-8; undecodable bytes are replaced rather than
    raising.

    Args:
        file_path: Path of the file to read.

    Returns:
        A dict with the source path, the total indicator count, a per-type
        count, and the indicators themselves.
    """
    content = Path(file_path).read_text(encoding="utf-8", errors="replace")
    iocs = extract_iocs_from_text(content)
    by_type = {ioc_type: len(values) for ioc_type, values in iocs.items()}
    return {
        "source": file_path,
        "total_iocs": sum(by_type.values()),
        "by_type": by_type,
        "indicators": iocs,
    }


def hash_file(file_path):
    """Calculate file hashes for malware sample identification.

    Args:
        file_path: Path of the file to hash. The whole file is read into
            memory.

    Returns:
        A dict with the path, size in bytes, and MD5/SHA1/SHA256 hex digests.
    """
    data = Path(file_path).read_bytes()
    return {
        "file": file_path,
        "size_bytes": len(data),
        "md5": hashlib.md5(data).hexdigest(),
        "sha1": hashlib.sha1(data).hexdigest(),
        "sha256": hashlib.sha256(data).hexdigest(),
    }


def extract_strings(file_path, min_length=6):
    """Extract printable strings from binary file.

    Finds runs of printable ASCII bytes, both plain and interleaved with NUL
    bytes (decoded as UTF-16-LE), then scans them for IOCs and for
    suspicious keywords.

    Args:
        file_path: Path of the file to scan.
        min_length: Minimum number of characters in a run for it to count
            as a string. Must be at least 1.

    Returns:
        A dict with the path, total string count, up to 30 suspicious
        strings (each cut to 200 characters), per-type IOC counts, and the
        IOCs themselves.

    Raises:
        ValueError: If ``min_length`` is less than 1.
    """
    # The value is interpolated into a regex quantifier; zero would match
    # empty strings everywhere and a negative value is not a quantifier.
    if min_length < 1:
        raise ValueError("min_length must be at least 1")

    data = Path(file_path).read_bytes()
    ascii_strings = re.findall(rb"[\x20-\x7e]{%d,}" % min_length, data)
    wide_strings = re.findall(rb"(?:[\x20-\x7e]\x00){%d,}" % min_length, data)

    all_strings = [s.decode("ascii", errors="replace") for s in ascii_strings]
    all_strings += [s.decode("utf-16-le", errors="replace") for s in wide_strings]

    iocs = extract_iocs_from_text("\n".join(all_strings))

    suspicious = []
    for s in all_strings:
        lowered = s.lower()
        if any(kw in lowered for kw in _SUSPICIOUS_KEYWORDS):
            suspicious.append(s[:200])

    return {
        "file": file_path,
        "total_strings": len(all_strings),
        "suspicious_strings": suspicious[:30],
        "extracted_iocs": {k: len(v) for k, v in iocs.items()},
        "ioc_details": iocs,
    }


def generate_ioc_report(file_path, output=None):
    """Generate comprehensive IOC extraction report.

    Combines the results of ``hash_file`` and ``extract_strings`` for one
    file and stamps the report with the current UTC time.

    Args:
        file_path: Path of the sample to analyse.
        output: Optional path; when given, the report is also written there
            as indented JSON.

    Returns:
        The report dict.
    """
    hashes = hash_file(file_path)
    strings = extract_strings(file_path)
    report = {
        # Timezone-aware UTC timestamp in ISO 8601 form.
        "generated": datetime.now(timezone.utc).isoformat(),
        "file_info": hashes,
        "strings_analysis": {
            "total": strings["total_strings"],
            "suspicious": strings["suspicious_strings"],
        },
        "extracted_iocs": strings["ioc_details"],
        "ioc_summary": strings["extracted_iocs"],
    }
    if output:
        with open(output, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
    return report


def main():
    """Parse command-line arguments, run the chosen sub-command, print JSON."""
    parser = argparse.ArgumentParser(description="Malware IOC Extraction Agent")
    sub = parser.add_subparsers(dest="command")

    t = sub.add_parser("text", help="Extract IOCs from text/report file")
    t.add_argument("--file", required=True)

    h = sub.add_parser("hash", help="Calculate file hashes")
    h.add_argument("--file", required=True)

    s = sub.add_parser("strings", help="Extract strings and IOCs from binary")
    s.add_argument("--file", required=True)
    s.add_argument("--min-length", type=int, default=6)

    r = sub.add_parser("report", help="Generate full IOC report")
    r.add_argument("--file", required=True)
    r.add_argument("--output", help="Output JSON file")

    args = parser.parse_args()

    if args.command == "text":
        result = extract_from_file(args.file)
    elif args.command == "hash":
        result = hash_file(args.file)
    elif args.command == "strings":
        result = extract_strings(args.file, args.min_length)
    elif args.command == "report":
        result = generate_ioc_report(args.file, args.output)
    else:
        # No sub-command given.
        parser.print_help()
        return

    print(json.dumps(result, indent=2, default=str))


if __name__ == "__main__":
    main()