#!/usr/bin/env python3
"""Agent for performing malware triage with YARA.

Compiles and applies YARA rules to classify malware samples, perform batch
scanning, and generate triage reports.
"""

import hashlib
import json
import sys
from collections import Counter
from collections import defaultdict  # noqa: F401 - kept so the module namespace is unchanged
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Union

try:
    import yara
except ImportError:
    # Deferred: compile_rules() raises ImportError with install guidance, so
    # the reporting helpers stay importable without yara-python.
    yara = None

# File extensions recognised as YARA rule files.
RULE_SUFFIXES = (".yar", ".yara")
# Upper bound on matched-string records stored per rule match.
MAX_STRINGS_PER_MATCH = 10
# Matched data is decoded and truncated to this many characters.
MAX_STRING_PREVIEW = 80
# Name of the JSON report written into the output directory.
REPORT_FILENAME = "yara_triage_report.json"


class YaraTriageAgent:
    """Batch malware triage and classification using YARA rules."""

    def __init__(self, output_dir: Union[str, Path]) -> None:
        """Create the agent and make sure *output_dir* exists.

        Args:
            output_dir: Directory the JSON report is written to; created
                (including parents) if missing.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Compiled rules object; populated by compile_rules().
        self.rules = None
        # Per-file result dicts accumulated by scan_directory().
        self.results: List[Dict[str, Any]] = []

    @staticmethod
    def _collect_rule_files(rule_paths: Iterable[Union[str, Path]]) -> Dict[str, str]:
        """Map a unique namespace to each rule file found under *rule_paths*.

        The namespace is the file stem. When two different files share a stem
        the later one gets a numeric suffix (``foo``, ``foo_2``, ...) instead
        of silently replacing the earlier one.
        """
        namespaces: Dict[str, str] = {}
        seen = set()
        for raw in rule_paths:
            path = Path(raw)
            if path.is_file():
                candidates = [path] if path.suffix in RULE_SUFFIXES else []
            elif path.is_dir():
                # Sorted so namespace assignment is reproducible across runs.
                candidates = sorted(
                    entry
                    for entry in path.rglob("*")
                    if entry.suffix in RULE_SUFFIXES and entry.is_file()
                )
            else:
                candidates = []

            for rule_file in candidates:
                resolved = rule_file.resolve()
                if resolved in seen:
                    # Same file reached through two inputs: compile it once.
                    continue
                seen.add(resolved)

                namespace = rule_file.stem
                counter = 2
                while namespace in namespaces:
                    namespace = f"{rule_file.stem}_{counter}"
                    counter += 1
                namespaces[namespace] = str(rule_file)
        return namespaces

    def compile_rules(self, rule_paths) -> int:
        """Compile YARA rules from file paths or directories.

        Args:
            rule_paths: An iterable of rule files / directories (searched
                recursively for ``.yar`` and ``.yara``). A single str or Path
                is also accepted.

        Returns:
            Number of rule files compiled.

        Raises:
            ValueError: No rule files were found.
            ImportError: yara-python is not installed.
        """
        if isinstance(rule_paths, (str, Path)):
            # A bare string would otherwise be iterated character by character.
            rule_paths = [rule_paths]
        else:
            rule_paths = list(rule_paths)

        filepaths = self._collect_rule_files(rule_paths)
        if not filepaths:
            raise ValueError(f"No YARA rule files found in: {rule_paths}")
        if yara is None:
            raise ImportError(
                "yara-python is required to compile rules "
                "(install with: pip install yara-python)"
            )
        self.rules = yara.compile(filepaths=filepaths)
        return len(filepaths)

    @staticmethod
    def _extract_strings(strings, limit: int = MAX_STRINGS_PER_MATCH) -> List[Dict[str, str]]:
        """Convert a match's string hits into at most *limit* plain dicts.

        Handles both shapes yara-python has used for ``Match.strings``:
        legacy ``(offset, identifier, data)`` tuples, and objects exposing
        ``identifier`` plus ``instances`` (each with ``offset`` and
        ``matched_data``).
        """
        records: List[Dict[str, str]] = []
        for entry in strings or ():
            if isinstance(entry, tuple):
                hits = [(entry[1], entry[0], entry[2])]
            else:
                hits = [
                    (entry.identifier, instance.offset, instance.matched_data)
                    for instance in entry.instances
                ]
            for identifier, offset, data in hits:
                if len(records) >= limit:
                    return records
                records.append({
                    "identifier": identifier,
                    "offset": hex(offset),
                    "data": data.decode("utf-8", errors="replace")[:MAX_STRING_PREVIEW],
                })
        return records

    def scan_file(self, filepath) -> Optional[Dict[str, Any]]:
        """Scan a single file against compiled YARA rules.

        Args:
            filepath: Path of the sample to scan.

        Returns:
            A result dict (hashes, size, matches, classification), or
            ``None`` when *filepath* is not a regular file.

        Raises:
            RuntimeError: compile_rules() has not been called successfully.
            OSError: The file could not be read.
        """
        filepath = Path(filepath)
        if not filepath.is_file():
            return None
        if self.rules is None:
            raise RuntimeError("No YARA rules compiled; call compile_rules() first")

        data = filepath.read_bytes()
        matches = self.rules.match(data=data)

        result = {
            "filename": filepath.name,
            "path": str(filepath),
            "sha256": hashlib.sha256(data).hexdigest(),
            "md5": hashlib.md5(data).hexdigest(),
            "size": len(data),
            "matches": [
                {
                    "rule": match.rule,
                    "namespace": match.namespace,
                    "tags": match.tags,
                    "meta": match.meta,
                    "strings": self._extract_strings(match.strings),
                }
                for match in matches
            ],
            "match_count": len(matches),
            "classification": "UNKNOWN",
        }
        if result["matches"]:
            # Classification is the namespace (rule-file stem) of the first match.
            namespace = result["matches"][0].get("namespace") or "DETECTED"
            result["classification"] = namespace.upper()
        return result

    def scan_directory(self, sample_dir, recursive: bool = True) -> List[Dict[str, Any]]:
        """Scan all non-empty files in a directory.

        Files that cannot be read are reported on stderr and skipped so one
        bad sample does not abort the whole batch.

        Args:
            sample_dir: Directory containing samples.
            recursive: Descend into sub-directories when True.

        Returns:
            The accumulated ``self.results`` list.
        """
        sample_path = Path(sample_dir)
        entries = sample_path.rglob("*") if recursive else sample_path.glob("*")
        for filepath in entries:
            try:
                if not filepath.is_file() or filepath.stat().st_size == 0:
                    continue
                result = self.scan_file(filepath)
            except OSError as exc:
                print(f"Warning: skipping {filepath}: {exc}", file=sys.stderr)
                continue
            if result:
                self.results.append(result)
        return self.results

    def get_classification_summary(self) -> Dict[str, int]:
        """Summarize scan results by classification, most frequent first."""
        counts = Counter(result["classification"] for result in self.results)
        return dict(counts.most_common())

    def get_top_rules(self, limit: Optional[int] = 20) -> Dict[str, int]:
        """Get the *limit* most frequently matching rules, most frequent first."""
        counts = Counter(
            match["rule"]
            for result in self.results
            for match in result["matches"]
        )
        return dict(counts.most_common()[:limit])

    def generate_report(self) -> Dict[str, Any]:
        """Generate the triage report, write it as JSON and print a summary.

        The report is written to ``<output_dir>/yara_triage_report.json``.
        ``scan_date`` is a timezone-aware UTC ISO-8601 timestamp.

        Returns:
            The report dict that was written.
        """
        classified = [r for r in self.results if r["classification"] != "UNKNOWN"]
        total = len(self.results)
        report = {
            "scan_date": datetime.now(timezone.utc).isoformat(),
            "total_scanned": total,
            "classified": len(classified),
            "unknown": total - len(classified),
            # max(..., 1) avoids division by zero when nothing was scanned.
            "classification_rate": round(len(classified) / max(total, 1) * 100, 1),
            "classification_summary": self.get_classification_summary(),
            "top_matching_rules": self.get_top_rules(),
            "detected_samples": [
                {
                    "filename": r["filename"],
                    "sha256": r["sha256"],
                    "classification": r["classification"],
                    "rules_matched": [m["rule"] for m in r["matches"]],
                }
                for r in classified
            ],
        }

        report_path = self.output_dir / REPORT_FILENAME
        with open(report_path, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)

        print("YARA Triage Results")
        print("=" * 41)
        print(f"Scanned: {report['total_scanned']}")
        print(f"Classified: {report['classified']} ({report['classification_rate']}%)")
        print(f"Unknown: {report['unknown']}")
        print("\nClassification Summary:")
        for cls, count in report["classification_summary"].items():
            print(f" {cls}: {count}")
        print("\nTop Rules:")
        for rule, count in list(report["top_matching_rules"].items())[:10]:
            print(f" {rule}: {count} matches")
        return report


def main():
    """Command-line entry point: compile rules, scan samples, write the report."""
    if len(sys.argv) < 3:
        print("Usage: agent.py <rules_path> <samples_dir> [output_dir]")
        print(" rules_path: YARA rule file or directory of .yar files")
        print(" samples_dir: Directory of files to scan")
        sys.exit(1)

    rules_path = sys.argv[1]
    samples_dir = sys.argv[2]
    output_dir = sys.argv[3] if len(sys.argv) > 3 else "./triage_output"

    if not Path(samples_dir).is_dir():
        # Globbing a non-directory yields nothing and would produce a
        # misleading empty report.
        print(f"Error: samples directory not found: {samples_dir}", file=sys.stderr)
        sys.exit(1)

    agent = YaraTriageAgent(output_dir)
    rule_count = agent.compile_rules([rules_path])
    print(f"Compiled {rule_count} rule files")

    agent.scan_directory(samples_dir)
    agent.generate_report()


if __name__ == "__main__":
    main()