#!/usr/bin/env python3
"""
Malware IOC Extraction Script

Performs static analysis on PE files to extract IOCs:
- File hash generation (MD5, SHA-1, SHA-256, imphash)
- PE header parsing and section analysis
- String extraction with IOC pattern matching
- YARA rule scanning
- STIX 2.1 bundle generation

Requirements:
    pip install pefile yara-python stix2 requests

Usage:
    python process.py --file malware.exe --output iocs.json
    python process.py --file malware.exe --yara-rules rules/ --stix-output bundle.json
    python process.py --file malware.exe --vt-check --vt-key YOUR_KEY
"""

import argparse
import hashlib
import json
import os
import re
import sys
from datetime import datetime, timezone
from typing import Optional

# pefile and yara-python are optional: the matching analysis step is skipped
# (with a message) when the package is not installed.
try:
    import pefile
except ImportError:
    pefile = None

try:
    import yara
except ImportError:
    yara = None


# Regexes applied to every extracted string, keyed by IOC category.
_IOC_PATTERNS = {
    "ipv4": re.compile(
        r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}"
        r"(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b"
    ),
    "domain": re.compile(
        r"\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+"
        r"(?:com|net|org|io|ru|cn|tk|xyz|top|info|biz|cc|ws|pw|"
        r"onion|bit|me|co|uk|de|fr|jp|kr|br)\b"
    ),
    "url": re.compile(r"https?://[^\s\"'<>\x00]{5,200}"),
    "email": re.compile(
        r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"
    ),
    "registry": re.compile(
        r"(?:HKEY_[A-Z_]+|HKLM|HKCU|HKU|HKCR)"
        r"\\[\\a-zA-Z0-9_ .{}\-]+"
    ),
    "filepath": re.compile(
        r"[A-Z]:\\(?:[^\\/:*?\"<>|\r\n\x00]+\\)*[^\\/:*?\"<>|\r\n\x00]+"
    ),
    "mutex": re.compile(r"(?:Global\\|Local\\)[a-zA-Z0-9_\-{}.]{4,}"),
    "useragent": re.compile(r"Mozilla/[45]\.0[^\"'\x00]{10,200}"),
    "pdb_path": re.compile(r"[A-Z]:\\[^\x00\"]{5,200}\.pdb"),
    "bitcoin_wallet": re.compile(
        r"\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b"
    ),
}

# Which report section each pattern category is filed under; categories not
# listed here end up in "suspicious_strings".
_NETWORK_CATEGORIES = ("ipv4", "domain", "url", "email")
_HOST_CATEGORIES = ("registry", "filepath", "mutex")
_OTHER_CATEGORIES = ("useragent", "pdb_path", "bitcoin_wallet")

# Textual prefixes of IPv4 addresses that are dropped from the report
# (10/8, 172.16/12, 192.168/16, 127/8, 0/8 and 255/8).
_FILTERED_IP_PREFIXES = (
    ("10.",)
    + tuple(f"172.{octet}." for octet in range(16, 32))
    + ("192.168.", "127.", "0.", "255.")
)

# Domains (and their subdomains) that are dropped from the report.
_BENIGN_DOMAINS = frozenset({
    "microsoft.com", "windows.com", "google.com",
    "w3.org", "xmlsoap.org", "openxmlformats.org",
    "schemas.microsoft.com",
})

# Imported API names that are flagged in the PE report.
_SUSPICIOUS_APIS = frozenset({
    "VirtualAlloc", "VirtualProtect", "CreateRemoteThread",
    "WriteProcessMemory", "NtUnmapViewOfSection",
    "IsDebuggerPresent", "GetProcAddress", "LoadLibraryA",
    "LoadLibraryW", "URLDownloadToFileA", "InternetOpenA",
    "HttpSendRequestA", "WinExec", "ShellExecuteA",
    "CreateProcessA", "RegSetValueExA", "CryptEncrypt",
    "CryptDecrypt",
})


class MalwareIOCExtractor:
    """Extract IOCs from malware samples via static analysis.

    The sample is read fully into memory on construction and hashed. Each
    analysis method then adds its findings to ``self.iocs``, the report
    dictionary returned by :meth:`get_report`.
    """

    def __init__(self, filepath: str):
        """Load the sample at *filepath* and compute its file hashes.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        self.filepath = filepath
        self.filename = os.path.basename(filepath)
        with open(filepath, "rb") as f:
            self.data = f.read()
        self.hashes = self._calculate_hashes()
        self.iocs = {
            "file": {"name": self.filename, "size": len(self.data)},
            "hashes": self.hashes,
            "pe_info": {},
            "network_iocs": {},
            "host_iocs": {},
            "yara_matches": [],
            # A dict (category -> values), the same shape extract_strings()
            # stores here.
            "suspicious_strings": {},
        }

    def _calculate_hashes(self) -> dict:
        """Return the MD5, SHA-1 and SHA-256 hex digests of the sample."""
        return {
            "md5": hashlib.md5(self.data).hexdigest(),
            "sha1": hashlib.sha1(self.data).hexdigest(),
            "sha256": hashlib.sha256(self.data).hexdigest(),
        }

    @staticmethod
    def _is_benign_domain(domain: str) -> bool:
        """Return True if *domain* is a benign domain or a subdomain of one.

        The comparison is case-insensitive and anchored on a label boundary,
        so ``www.google.com`` is benign while ``notgoogle.com`` is not.
        """
        lowered = domain.lower()
        return any(
            lowered == benign or lowered.endswith("." + benign)
            for benign in _BENIGN_DOMAINS
        )

    def analyze_pe(self):
        """Parse PE file structure and extract metadata IOCs.

        Fills ``self.iocs["pe_info"]`` and adds ``imphash`` to the hashes.
        Prints a message and returns without changes when pefile is not
        installed or the sample is not a valid PE file.
        """
        if pefile is None:
            print("[-] pefile not installed, skipping PE analysis")
            return

        try:
            pe = pefile.PE(data=self.data)
        except pefile.PEFormatError:
            print("[-] Not a valid PE file")
            return

        # try/finally so the parser is closed even if a later step raises.
        try:
            self.iocs["hashes"]["imphash"] = pe.get_imphash()

            # Compilation timestamp
            timestamp = pe.FILE_HEADER.TimeDateStamp
            try:
                # Timezone-aware conversion; tzinfo is dropped afterwards so
                # the ISO string keeps its offset-less form.
                compile_time = (
                    datetime.fromtimestamp(timestamp, tz=timezone.utc)
                    .replace(tzinfo=None)
                    .isoformat()
                )
            except (OSError, OverflowError, ValueError):
                compile_time = f"invalid ({timestamp})"

            info = {
                "compile_time": compile_time,
                "machine": hex(pe.FILE_HEADER.Machine),
                "is_dll": pe.is_dll(),
                "is_exe": pe.is_exe(),
                "entry_point": hex(pe.OPTIONAL_HEADER.AddressOfEntryPoint),
                "image_base": hex(pe.OPTIONAL_HEADER.ImageBase),
                "sections": [],
                "imports": [],
                "exports": [],
            }
            self.iocs["pe_info"] = info

            # Section analysis
            for section in pe.sections:
                name = section.Name.decode("utf-8", errors="ignore").strip("\x00")
                entropy = section.get_entropy()
                info["sections"].append({
                    "name": name,
                    "virtual_size": section.Misc_VirtualSize,
                    "raw_size": section.SizeOfRawData,
                    "entropy": round(entropy, 2),
                    "suspicious": entropy > 7.0,
                    "md5": hashlib.md5(section.get_data()).hexdigest(),
                })

            # Import table
            if hasattr(pe, "DIRECTORY_ENTRY_IMPORT"):
                for entry in pe.DIRECTORY_ENTRY_IMPORT:
                    info["imports"].append({
                        "dll": entry.dll.decode("utf-8", errors="ignore"),
                        # Entries without a name are skipped.
                        "functions": [
                            imp.name.decode("utf-8", errors="ignore")
                            for imp in entry.imports
                            if imp.name
                        ],
                    })

            # Suspicious API imports
            found_suspicious = {
                func
                for imp_entry in info["imports"]
                for func in imp_entry["functions"]
                if func in _SUSPICIOUS_APIS
            }
            info["suspicious_apis"] = sorted(found_suspicious)

            # Export table
            if hasattr(pe, "DIRECTORY_ENTRY_EXPORT"):
                for exp in pe.DIRECTORY_ENTRY_EXPORT.symbols:
                    if exp.name:
                        info["exports"].append(
                            exp.name.decode("utf-8", errors="ignore")
                        )
        finally:
            pe.close()

    def extract_strings(self, min_length: int = 4):
        """Extract and classify strings from the binary.

        Printable-ASCII runs and their UTF-16-LE equivalents of at least
        *min_length* characters are matched against the IOC patterns. The
        results are stored in ``self.iocs`` under ``network_iocs``,
        ``host_iocs`` and ``suspicious_strings``; empty categories are
        omitted.

        Raises:
            ValueError: If *min_length* is smaller than 1.
        """
        if min_length < 1:
            # A bound of 0 matches empty strings and a negative bound is
            # not a repeat count at all.
            raise ValueError("min_length must be at least 1")

        # Extract ASCII strings
        ascii_regex = re.compile(rb"[\x20-\x7e]{%d,}" % min_length)
        ascii_strings = [
            s.decode("ascii", errors="ignore")
            for s in ascii_regex.findall(self.data)
        ]

        # Extract Unicode strings
        unicode_regex = re.compile(
            rb"(?:[\x20-\x7e]\x00){%d,}" % min_length
        )
        unicode_strings = [
            s.decode("utf-16-le", errors="ignore")
            for s in unicode_regex.findall(self.data)
        ]

        network_iocs = {name: set() for name in _NETWORK_CATEGORIES}
        host_iocs = {name: set() for name in _HOST_CATEGORIES}
        other = {name: set() for name in _OTHER_CATEGORIES}

        # Resolve each category's destination set once, outside the loop.
        buckets = {}
        for group in (network_iocs, host_iocs, other):
            buckets.update(group)

        for string in ascii_strings + unicode_strings:
            for category, pattern in _IOC_PATTERNS.items():
                buckets[category].update(pattern.findall(string))

        # Filter private IPs
        network_iocs["ipv4"] = {
            ip for ip in network_iocs["ipv4"]
            if not ip.startswith(_FILTERED_IP_PREFIXES)
        }

        # Filter common benign domains (exact match or subdomain only, so
        # look-alikes such as "notgoogle.com" are kept).
        network_iocs["domain"] = {
            d for d in network_iocs["domain"]
            if not self._is_benign_domain(d)
        }

        self.iocs["network_iocs"] = {
            k: sorted(v) for k, v in network_iocs.items() if v
        }
        self.iocs["host_iocs"] = {
            k: sorted(v) for k, v in host_iocs.items() if v
        }
        self.iocs["suspicious_strings"] = {
            k: sorted(v) for k, v in other.items() if v
        }

    def scan_yara(self, rules_path: str):
        """Scan with YARA rules.

        *rules_path* is either a single rule file or a directory, in which
        case every ``.yar`` / ``.yara`` file directly inside it is compiled.
        Matches are appended to ``self.iocs["yara_matches"]``. YARA errors
        are printed, not raised.
        """
        if yara is None:
            print("[-] yara-python not installed, skipping YARA scan")
            return

        try:
            if os.path.isdir(rules_path):
                # Sorted so the compile order does not depend on the
                # directory listing order.
                rule_files = {
                    entry: os.path.join(rules_path, entry)
                    for entry in sorted(os.listdir(rules_path))
                    if entry.endswith((".yar", ".yara"))
                    and os.path.isfile(os.path.join(rules_path, entry))
                }
                rules = yara.compile(filepaths=rule_files)
            else:
                rules = yara.compile(filepath=rules_path)

            for match in rules.match(data=self.data):
                self.iocs["yara_matches"].append({
                    "rule": match.rule,
                    "tags": match.tags,
                    "meta": match.meta,
                    "string_count": len(match.strings),
                })
                print(f"[+] YARA match: {match.rule} (tags: {match.tags})")
        except yara.Error as e:
            print(f"[-] YARA error: {e}")

    def check_virustotal(self, api_key: str) -> Optional[dict]:
        """Check file hash against VirusTotal.

        Returns a summary dict (also stored under ``self.iocs["virustotal"]``)
        when the hash is known. Returns None when the hash is unknown, the
        API answers with an error status, the request fails, the response
        is not JSON, or ``requests`` is not installed.
        """
        try:
            import requests
        except ImportError:
            print("[-] requests not installed, skipping VirusTotal check")
            return None

        sha256 = self.hashes["sha256"]
        try:
            resp = requests.get(
                f"https://www.virustotal.com/api/v3/files/{sha256}",
                headers={"x-apikey": api_key},
                timeout=30,
            )
        except requests.RequestException as e:
            # A network problem must not abort the rest of the analysis.
            print(f"[-] VT request failed: {e}")
            return None

        if resp.status_code == 404:
            print(f"[!] Hash not found on VirusTotal: {sha256}")
            return None
        if resp.status_code != 200:
            print(f"[-] VT API error: {resp.status_code}")
            return None

        try:
            payload = resp.json()
        except ValueError:
            print("[-] VT API error: response is not valid JSON")
            return None

        data = payload.get("data", {}).get("attributes", {})
        stats = data.get("last_analysis_stats", {})
        vt_result = {
            "malicious": stats.get("malicious", 0),
            "suspicious": stats.get("suspicious", 0),
            "undetected": stats.get("undetected", 0),
            "total": sum(stats.values()),
            "popular_threat_name": data.get(
                "popular_threat_classification", {}
            ).get("suggested_threat_label", ""),
            "tags": data.get("tags", []),
            "type_description": data.get("type_description", ""),
            "names": data.get("names", [])[:5],
        }
        self.iocs["virustotal"] = vt_result
        print(
            f"[+] VT: {vt_result['malicious']}/{vt_result['total']} "
            f"detections - {vt_result['popular_threat_name']}"
        )
        return vt_result

    def generate_stix_bundle(self) -> dict:
        """Generate STIX 2.1 bundle from extracted IOCs.

        The bundle holds one Malware object plus, for the SHA-256 hash and
        for every extracted IPv4 address and domain, an Indicator and an
        "indicates" Relationship pointing at the Malware object.

        Raises:
            ImportError: If the ``stix2`` package is not installed.
        """
        from stix2 import Bundle, Indicator, Malware, Relationship

        # One timestamp shared by every indicator in the bundle.
        valid_from = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        sha256 = self.hashes["sha256"]

        malware_obj = Malware(
            name=self.filename,
            is_family=False,
            malware_types=["unknown"],
            description=(
                f"SHA256: {sha256}\n"
                f"MD5: {self.hashes['md5']}"
            ),
            allow_custom=True,
        )
        objects = [malware_obj]

        def add_indicator(name: str, pattern: str) -> None:
            """Append an Indicator and its link to the Malware object."""
            indicator = Indicator(
                name=name,
                pattern=pattern,
                pattern_type="stix",
                valid_from=valid_from,
                indicator_types=["malicious-activity"],
                allow_custom=True,
            )
            objects.append(indicator)
            objects.append(Relationship(
                relationship_type="indicates",
                source_ref=indicator.id,
                target_ref=malware_obj.id,
            ))

        # Hash indicator
        add_indicator(
            f"File hash: {sha256[:16]}...",
            f"[file:hashes.'SHA-256' = '{sha256}']",
        )

        # Network indicators
        network = self.iocs.get("network_iocs", {})
        for ip in network.get("ipv4", []):
            add_indicator(f"C2 IP: {ip}", f"[ipv4-addr:value = '{ip}']")
        for domain in network.get("domain", []):
            add_indicator(
                f"C2 Domain: {domain}",
                f"[domain-name:value = '{domain}']",
            )

        bundle = Bundle(objects=objects, allow_custom=True)
        return json.loads(bundle.serialize())

    def get_report(self) -> dict:
        """Get complete IOC extraction report."""
        return self.iocs


def main():
    """Command-line entry point: analyze one sample and write the reports."""
    parser = argparse.ArgumentParser(description="Malware IOC Extraction Tool")
    parser.add_argument("--file", required=True, help="Path to malware sample")
    parser.add_argument("--output", default="iocs.json", help="Output IOC file")
    parser.add_argument("--yara-rules", help="YARA rules file or directory")
    parser.add_argument("--vt-check", action="store_true", help="Check VirusTotal")
    parser.add_argument("--vt-key", help="VirusTotal API key")
    parser.add_argument("--stix-output", help="Output STIX 2.1 bundle file")
    parser.add_argument(
        "--min-string-length", type=int, default=4,
        help="Minimum string length for extraction",
    )
    args = parser.parse_args()

    if args.min_string_length < 1:
        parser.error("--min-string-length must be at least 1")

    if not os.path.isfile(args.file):
        print(f"[-] File not found: {args.file}")
        sys.exit(1)

    print(f"[*] Analyzing: {args.file}")
    extractor = MalwareIOCExtractor(args.file)

    print("[*] Calculating hashes...")
    print(f" MD5: {extractor.hashes['md5']}")
    print(f" SHA1: {extractor.hashes['sha1']}")
    print(f" SHA256: {extractor.hashes['sha256']}")

    print("[*] Parsing PE structure...")
    extractor.analyze_pe()

    print("[*] Extracting strings and IOC patterns...")
    extractor.extract_strings(min_length=args.min_string_length)

    if args.yara_rules:
        print(f"[*] Scanning with YARA rules: {args.yara_rules}")
        extractor.scan_yara(args.yara_rules)

    if args.vt_check:
        if args.vt_key:
            print("[*] Checking VirusTotal...")
            extractor.check_virustotal(args.vt_key)
        else:
            # Tell the user why the requested lookup did not happen.
            print("[-] --vt-check requires --vt-key; skipping VirusTotal check")

    report = extractor.get_report()
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, default=str)
    print(f"[+] IOC report saved to {args.output}")

    if args.stix_output:
        print("[*] Generating STIX 2.1 bundle...")
        try:
            bundle = extractor.generate_stix_bundle()
        except ImportError:
            print("[-] stix2 not installed, skipping STIX bundle")
        else:
            with open(args.stix_output, "w", encoding="utf-8") as f:
                json.dump(bundle, f, indent=2)
            print(f"[+] STIX bundle saved to {args.stix_output}")

    # Print summary
    net = report.get("network_iocs", {})
    host = report.get("host_iocs", {})
    print("\n=== IOC Summary ===")
    print(f" IPs: {len(net.get('ipv4', []))}")
    print(f" Domains: {len(net.get('domain', []))}")
    print(f" URLs: {len(net.get('url', []))}")
    print(f" Registry keys: {len(host.get('registry', []))}")
    print(f" File paths: {len(host.get('filepath', []))}")
    print(f" YARA matches: {len(report.get('yara_matches', []))}")


if __name__ == "__main__":
    main()