Anthropic-Cybersecurity-Skills/skills/performing-yara-rule-development-for-detection/scripts/process.py

#!/usr/bin/env python3
"""
YARA Rule Development and Testing Framework

Assists in creating, testing, and optimizing YARA rules
for malware detection.

Requirements:
    pip install yara-python pefile

Usage:
    python process.py --analyze sample.exe
    python process.py --test rule.yar --samples ./malware --clean ./goodware
    python process.py --analyze sample.exe --generate --name MalwareX --output rule.yar
"""

import argparse
import json
import os
import re
import sys
import time
from collections import Counter
from pathlib import Path

try:
    import yara
except ImportError:
    print("ERROR: yara-python not installed. Run: pip install yara-python")
    sys.exit(1)

try:
    import pefile
except ImportError:
    pefile = None


class YaraRuleBuilder:
    """Build and test YARA rules.

    Typical flow: ``analyze_sample`` harvests candidate strings and PE
    imports from one sample, ``generate_rule`` renders them as rule text,
    and ``test_rule`` measures a rule against sample / clean directories.

    Attributes:
        candidate_strings: ``(string, score)`` pairs from the most recent
            analysis, highest score first (at most 30 entries).
        candidate_hex: Placeholder list; nothing in this class fills it.
        imports: Imported function names read from the most recent sample.
    """

    # Case-insensitive substrings; each one found in a string adds 10 points.
    SUSPICIOUS_KEYWORDS = (
        'http', 'https', 'ftp', 'cmd.exe', 'powershell',
        'mutex', 'pipe', 'password', 'encrypt', 'decrypt',
        'inject', 'hook', 'shell', 'backdoor', 'keylog',
        'screenshot', 'clipboard', 'download', 'upload',
        'sandbox', 'vmware', 'virtualbox', 'debug',
    )

    # Imports reported under "suspicious_imports" by analyze_sample().
    SUSPICIOUS_IMPORTS = (
        'VirtualAlloc', 'VirtualAllocEx',
        'WriteProcessMemory', 'CreateRemoteThread',
        'NtUnmapViewOfSection', 'OpenProcess',
        'CryptEncrypt', 'InternetOpenA',
    )

    # Imports that generate_rule() turns into $api strings.
    RULE_IMPORTS = (
        'VirtualAlloc', 'VirtualAllocEx',
        'WriteProcessMemory', 'CreateRemoteThread',
    )

    _ASCII_RUN = re.compile(rb'[\x20-\x7e]{8,}')
    _WIDE_RUN = re.compile(rb'(?:[\x20-\x7e]\x00){8,}')
    _CAMEL_CASE = re.compile(r'[A-Z][a-z]+[A-Z]')

    def __init__(self):
        self.candidate_strings = []
        self.candidate_hex = []
        self.imports = []

    def analyze_sample(self, filepath):
        """Extract candidate patterns from a malware sample.

        Reads the whole file, collects printable ASCII and UTF-16LE runs of
        at least 8 characters, scores each distinct string and stores the 30
        best in ``self.candidate_strings``. ``self.imports`` is replaced
        (not extended) with the imports of this sample.

        Args:
            filepath: Path of the sample to read.

        Returns:
            dict with ``total_ascii`` / ``total_wide`` (raw occurrence
            counts), ``top_candidates`` (10 best ``(string, score)`` pairs)
            and ``suspicious_imports``.

        Raises:
            OSError: If the sample cannot be opened or read.
        """
        with open(filepath, 'rb') as f:
            data = f.read()

        ascii_strings = [
            run.decode('ascii') for run in self._ASCII_RUN.findall(data)
        ]
        wide_strings = [
            run.decode('utf-16-le') for run in self._WIDE_RUN.findall(data)
        ]

        # Score each distinct string once (first-seen order). Otherwise a
        # repeated string would occupy several candidate slots and could
        # satisfy an "N of ($str*)" threshold on its own.
        distinct = dict.fromkeys(ascii_strings + wide_strings)
        scored = [(s, self._score_string(s)) for s in distinct]
        scored.sort(key=lambda pair: -pair[1])
        self.candidate_strings = scored[:30]

        # Replace rather than extend so a reused builder does not carry the
        # previous sample's imports into this one.
        self.imports = self._read_pe_imports(filepath)

        return {
            "total_ascii": len(ascii_strings),
            "total_wide": len(wide_strings),
            "top_candidates": [(s, sc) for s, sc in scored[:10]],
            "suspicious_imports": [
                name for name in self.imports
                if name in self.SUSPICIOUS_IMPORTS
            ],
        }

    def _score_string(self, text):
        """Return the heuristic score of one extracted string."""
        lowered = text.lower()
        score = 10 * sum(1 for kw in self.SUSPICIOUS_KEYWORDS if kw in lowered)
        if len(text) > 20:
            score += 5
        if self._CAMEL_CASE.search(text):
            score += 3
        return score

    @staticmethod
    def _read_pe_imports(filepath):
        """Return imported function names of a PE file, best effort.

        Returns an empty list when pefile is unavailable. Parsing problems
        are reported on stderr instead of raised, because samples are often
        not PE files or are deliberately malformed; names collected before
        the failure are kept.
        """
        if not pefile:
            return []

        names = []
        pe = None
        try:
            pe = pefile.PE(filepath)
            for entry in getattr(pe, 'DIRECTORY_ENTRY_IMPORT', []):
                for imp in entry.imports:
                    if imp.name:
                        names.append(
                            imp.name.decode('utf-8', errors='replace')
                        )
        except Exception as exc:  # deliberate best effort, see docstring
            print(f"[!] PE import parsing skipped: {exc}", file=sys.stderr)
        finally:
            if pe is not None:
                pe.close()  # release the file mapping held by pefile
        return names

    @staticmethod
    def _escape(text):
        """Escape backslashes and double quotes for a YARA text string."""
        return str(text).replace('\\', '\\\\').replace('"', '\\"')

    @staticmethod
    def _sanitize_identifier(name):
        """Coerce ``name`` into a YARA identifier.

        Characters outside ``[A-Za-z0-9_]`` become underscores, a leading
        digit is prefixed with an underscore, and the result is capped at
        128 characters.
        """
        cleaned = re.sub(r'[^A-Za-z0-9_]', '_', str(name)) or '_'
        if cleaned[0].isdigit():
            cleaned = '_' + cleaned
        return cleaned[:128]

    def generate_rule(self, name, author="analyst", description=""):
        """Generate YARA rule from analyzed patterns.

        Uses up to 8 positively scored entries of ``self.candidate_strings``
        as ``$strN`` and up to 4 entries of ``self.imports`` listed in
        ``RULE_IMPORTS`` as ``$apiN``. The condition always requires an MZ
        header and a file size below 5MB.

        Args:
            name: Rule name; sanitised into a valid identifier.
            author: Value of the ``author`` meta field.
            description: Value of the ``description`` meta field; defaults
                to ``"Detects <name>"``.

        Returns:
            The rule source text. The ``strings:`` section is omitted when
            there is nothing to put in it.
        """
        strings_section = [
            f'$str{i} = "{self._escape(s)}" ascii wide'
            for i, (s, score) in enumerate(self.candidate_strings[:8])
            if score > 0
        ]
        str_count = len(strings_section)

        rule_apis = [
            imp for imp in dict.fromkeys(self.imports)
            if imp in self.RULE_IMPORTS
        ][:4]
        strings_section.extend(
            f'$api{i} = "{imp}" ascii' for i, imp in enumerate(rule_apis)
        )
        api_count = len(rule_apis)

        condition_parts = ['uint16(0) == 0x5A4D', 'filesize < 5MB']
        if str_count > 0:
            # Never demand more strings than the rule defines; with a single
            # string "2 of ($str*)" could never be satisfied.
            threshold = min(str_count, max(2, str_count // 2))
            condition_parts.append(f'{threshold} of ($str*)')
        if api_count > 0:
            condition_parts.append(f'{max(1, api_count - 1)} of ($api*)')

        meta_description = description or f'Detects {name}'
        lines = [
            f'rule {self._sanitize_identifier(name)} {{',
            '    meta:',
            f'        description = "{self._escape(meta_description)}"',
            f'        author = "{self._escape(author)}"',
            f'        date = "{time.strftime("%Y-%m-%d")}"',
            '        tlp = "WHITE"',
            '',
        ]
        if strings_section:
            lines.append('    strings:')
            lines.extend(f'        {decl}' for decl in strings_section)
            lines.append('')
        lines.extend([
            '    condition:',
            f"        {' and '.join(condition_parts)}",
            '}',
        ])
        return '\n'.join(lines)

    @staticmethod
    def _scan_directory(rules, directory, results,
                        match_key, miss_key, report_key, report_label):
        """Scan every file under ``directory`` and update ``results``.

        A file that matches increments ``results[match_key]``, otherwise
        ``results[miss_key]``. Files landing in ``report_key`` are listed in
        ``results["details"]`` with ``report_label``. Files that cannot be
        scanned are counted in ``scan_errors`` and listed as ``SCAN_ERROR``.
        """
        for path in Path(directory).rglob('*'):
            if not path.is_file():
                continue
            try:
                matched = bool(rules.match(str(path)))
            except Exception as exc:  # keep scanning the remaining files
                results["scan_errors"] += 1
                results["details"].append({
                    "file": str(path),
                    "result": "SCAN_ERROR",
                    "error": str(exc),
                })
                continue

            outcome = match_key if matched else miss_key
            results[outcome] += 1
            if outcome == report_key:
                results["details"].append(
                    {"file": str(path), "result": report_label}
                )

    def test_rule(self, rule_path_or_text, sample_dir, clean_dir=None):
        """Test YARA rule for detection and false positive rates.

        Args:
            rule_path_or_text: Path of a rule file, or rule source text when
                no such file exists.
            sample_dir: Directory scanned recursively; every file in it is
                expected to match.
            clean_dir: Optional directory scanned recursively; no file in it
                is expected to match.

        Returns:
            dict with the four confusion-matrix counters, ``scan_errors``,
            ``scan_time`` (seconds, both scans) and ``details`` (false
            negatives, false positives and scan errors). ``detection_rate``
            and ``fp_rate`` (percentages) are present only when the
            corresponding set contained at least one scanned file.
        """
        if os.path.isfile(rule_path_or_text):
            rules = yara.compile(filepath=rule_path_or_text)
        else:
            rules = yara.compile(source=rule_path_or_text)

        results = {
            "true_positives": 0,
            "false_negatives": 0,
            "false_positives": 0,
            "true_negatives": 0,
            "scan_errors": 0,
            "scan_time": 0,
            "details": [],
        }

        start = time.perf_counter()
        self._scan_directory(
            rules, sample_dir, results,
            match_key="true_positives", miss_key="false_negatives",
            report_key="false_negatives", report_label="FALSE_NEGATIVE",
        )
        if clean_dir:
            self._scan_directory(
                rules, clean_dir, results,
                match_key="false_positives", miss_key="true_negatives",
                report_key="false_positives", report_label="FALSE_POSITIVE",
            )
        results["scan_time"] = time.perf_counter() - start

        total_samples = results["true_positives"] + results["false_negatives"]
        if total_samples > 0:
            results["detection_rate"] = round(
                results["true_positives"] / total_samples * 100, 2
            )

        total_clean = results["false_positives"] + results["true_negatives"]
        if total_clean > 0:
            results["fp_rate"] = round(
                results["false_positives"] / total_clean * 100, 4
            )

        return results


def main():
    """Command-line entry point.

    Modes:
        --analyze SAMPLE [--generate [--name N] [--output FILE]]
            Print the analysis as JSON; with --generate also print the
            generated rule and optionally save it to FILE.
        --test RULE --samples DIR [--clean DIR]
            Print detection / false-positive statistics as JSON.

    With no mode selected the help text is printed. Inconsistent option
    combinations end with a usage error (exit status 2); an unreadable
    sample or an uncompilable rule ends with a message on stderr (exit
    status 1).
    """
    parser = argparse.ArgumentParser(
        description="YARA Rule Development Framework"
    )
    parser.add_argument("--analyze", help="Analyze sample for YARA patterns")
    parser.add_argument("--generate", action="store_true",
                        help="Generate rule from analysis")
    parser.add_argument("--name", default="MalwareDetection",
                        help="Rule name")
    parser.add_argument("--test", help="Test YARA rule file")
    parser.add_argument("--samples", help="Malware samples directory")
    parser.add_argument("--clean", help="Clean files directory")
    parser.add_argument("--output", help="Output rule file")

    args = parser.parse_args()

    # Reject combinations that used to fall through silently to the help
    # text, so the user learns which option is missing.
    if args.generate and not args.analyze:
        parser.error("--generate requires --analyze <sample>")
    if not args.analyze and bool(args.test) != bool(args.samples):
        parser.error("--test and --samples must be used together")

    if args.analyze:
        builder = YaraRuleBuilder()
        try:
            analysis = builder.analyze_sample(args.analyze)
        except OSError as exc:
            sys.exit(f"[!] Cannot read sample: {exc}")
        print(json.dumps(analysis, indent=2, default=str))

        if args.generate:
            rule = builder.generate_rule(args.name)
            print(f"\n{rule}")
            if args.output:
                # Explicit encoding keeps the file independent of the locale.
                with open(args.output, 'w', encoding='utf-8') as f:
                    f.write(rule)
                print(f"[+] Rule saved to {args.output}")

    elif args.test:
        try:
            results = YaraRuleBuilder().test_rule(
                args.test, args.samples, args.clean
            )
        except yara.Error as exc:
            sys.exit(f"[!] Cannot compile rule: {exc}")
        print(json.dumps(results, indent=2))

    else:
        parser.print_help()


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()