Anthropic-Cybersecurity-Skills/skills/implementing-endpoint-dlp-controls/scripts/agent.py

#!/usr/bin/env python3
"""Agent for implementing and monitoring endpoint DLP controls."""

import argparse
import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path


# Regular expressions used to flag sensitive content. The keys are the labels
# reported in each finding's "matches" mapping.
SENSITIVE_PATTERNS = {
    # Dashed 3-2-4 digit groups.
    "SSN": r"\b\d{3}-\d{2}-\d{4}\b",
    # Undelimited card numbers: leading 4 (13 or 16 digits), 51-55 (16 digits),
    # or 34/37 (15 digits).
    "Credit Card": r"\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13})\b",
    # The TLD class is [A-Za-z]: inside a character class "|" is a literal
    # pipe, not alternation, so it must not appear there.
    "Email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
    # "AKIA" followed by 16 uppercase letters or digits.
    "AWS Key": r"AKIA[0-9A-Z]{16}",
    # PEM private-key headers, with or without an algorithm/format qualifier.
    # The group is non-capturing so re.findall returns whole matches.
    "Private Key": r"-----BEGIN (?:RSA |EC |DSA |OPENSSH |ENCRYPTED )?PRIVATE KEY-----",
    # "api_key" / "api-key" / "apikey" assigned a 20+ character alphanumeric value.
    "API Key": r"(?:api[_-]?key|apikey)\s*[:=]\s*['\"]?[a-zA-Z0-9]{20,}",
}

# File suffixes (lower-case, including the dot) that are reported even when
# no content pattern matches.
SENSITIVE_EXTENSIONS = [
    ".pem", ".key", ".pfx", ".p12", ".kdbx", ".env",
    ".sql", ".bak", ".dump", ".mdb",
]


def scan_file_for_sensitive_data(file_path):
    """Scan a single file for sensitive data patterns.

    The file is decoded as UTF-8 (undecodable bytes are dropped) and at most
    the first 1,048,576 characters are inspected.

    Args:
        file_path: Path (str or ``Path``) of the file to inspect.

    Returns:
        ``None`` when the file cannot be opened/read, or when it neither
        matches any entry of ``SENSITIVE_PATTERNS`` nor has a suffix listed
        in ``SENSITIVE_EXTENSIONS``. Otherwise a dict with the keys
        ``file``, ``matches`` (pattern name -> match count),
        ``sensitive_extension``, ``file_size`` and ``severity``
        (``CRITICAL`` for private/AWS keys, ``HIGH`` for any other pattern
        match, ``MEDIUM`` for a sensitive extension alone).
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read(1024 * 1024)
            # Take the size from the already-open handle so a file that is
            # removed or locked right after the read cannot raise later and
            # abort the whole directory scan.
            file_size = os.fstat(f.fileno()).st_size
    except OSError:  # PermissionError is a subclass of OSError
        return None

    matches = {}
    for pattern_name, pattern in SENSITIVE_PATTERNS.items():
        found = re.findall(pattern, content)
        if found:
            matches[pattern_name] = len(found)

    is_sensitive_ext = Path(file_path).suffix.lower() in SENSITIVE_EXTENSIONS
    if not matches and not is_sensitive_ext:
        return None

    if "Private Key" in matches or "AWS Key" in matches:
        severity = "CRITICAL"
    elif matches:
        severity = "HIGH"
    else:
        # Flagged by extension only.
        severity = "MEDIUM"

    return {
        "file": str(file_path),
        "matches": matches,
        "sensitive_extension": is_sensitive_ext,
        "file_size": file_size,
        "severity": severity,
    }


def scan_directory(dir_path, max_files=10000):
    """Scan a directory tree for files containing sensitive data.

    Args:
        dir_path: Root directory; it is walked recursively.
        max_files: Maximum number of regular files to inspect. The walk
            stops as soon as this many files have been scanned.

    Returns:
        A list of the finding dicts produced by
        ``scan_file_for_sensitive_data``, ordered CRITICAL, HIGH, MEDIUM.
    """
    # Explicit ranking instead of relying on the alphabetical order of the
    # severity labels; unknown labels sort last.
    severity_rank = {"CRITICAL": 0, "HIGH": 1, "MEDIUM": 2}
    findings = []
    count = 0
    for filepath in Path(dir_path).rglob("*"):
        if count >= max_files:
            # Budget exhausted: stop walking instead of enumerating (and
            # stat-ing) the remainder of the tree for nothing.
            break
        if not filepath.is_file():
            continue
        count += 1
        result = scan_file_for_sensitive_data(filepath)
        if result:
            findings.append(result)
    return sorted(
        findings,
        key=lambda item: severity_rank.get(item["severity"], len(severity_rank)),
    )


def monitor_usb_transfers(event_log_path):
    """Collect file transfers to USB/removable devices from an event log.

    The log is read line by line and each line is parsed as one JSON
    document. Lines that are not valid JSON, are not JSON objects, or whose
    destination is not a string are skipped.

    Args:
        event_log_path: Path of the line-delimited JSON event log.

    Returns:
        A list of finding dicts (``timestamp``, ``user``, ``file``,
        ``destination``, ``size_bytes``, ``severity``, ``channel``) for
        every entry whose lower-cased ``destination`` (or ``target_path``
        when ``destination`` is absent) contains "removable", "usb" or
        "external".

    Raises:
        OSError: If the log file cannot be opened.
    """
    indicators = ("removable", "usb", "external")
    findings = []
    # Explicit encoding so parsing does not depend on the platform locale;
    # undecodable bytes are replaced rather than aborting the whole run.
    with open(event_log_path, encoding="utf-8", errors="replace") as f:
        for line in f:
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object (list, number, null, ...) has
            # no .get(); skip it instead of crashing.
            if not isinstance(entry, dict):
                continue
            dest = entry.get("destination", entry.get("target_path", ""))
            # A null or non-string destination cannot be matched.
            if not isinstance(dest, str):
                continue
            dest = dest.lower()
            if any(ind in dest for ind in indicators):
                findings.append({
                    "timestamp": entry.get("timestamp", ""),
                    "user": entry.get("user", ""),
                    "file": entry.get("file_path", entry.get("source", "")),
                    "destination": dest,
                    "size_bytes": entry.get("size", entry.get("bytes", 0)),
                    "severity": "HIGH",
                    "channel": "USB",
                })
    return findings


def monitor_cloud_uploads(event_log_path):
    """Collect file uploads to cloud storage services from an event log.

    The log is read line by line and each line is parsed as one JSON
    document. Lines that are not valid JSON, are not JSON objects, or whose
    URL is not a string are skipped.

    Args:
        event_log_path: Path of the line-delimited JSON event log.

    Returns:
        A list of finding dicts (``timestamp``, ``user``, ``url`` truncated
        to 200 characters, ``file``, ``severity``, ``channel``) for every
        entry whose lower-cased ``url`` (or ``destination`` when ``url`` is
        absent) contains one of the known cloud-storage domains.

    Raises:
        OSError: If the log file cannot be opened.
    """
    cloud_domains = ("drive.google.com", "dropbox.com", "onedrive.live.com",
                     "box.com", "wetransfer.com", "mega.nz")
    findings = []
    # Explicit encoding so parsing does not depend on the platform locale;
    # undecodable bytes are replaced rather than aborting the whole run.
    with open(event_log_path, encoding="utf-8", errors="replace") as f:
        for line in f:
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object (list, number, null, ...) has
            # no .get(); skip it instead of crashing.
            if not isinstance(entry, dict):
                continue
            url = entry.get("url", entry.get("destination", ""))
            # A null or non-string URL cannot be matched.
            if not isinstance(url, str):
                continue
            url = url.lower()
            # NOTE: plain substring match, so any URL merely containing one
            # of the domains (e.g. "sandbox.com") is also reported.
            if any(domain in url for domain in cloud_domains):
                findings.append({
                    "timestamp": entry.get("timestamp", ""),
                    "user": entry.get("user", ""),
                    "url": url[:200],
                    "file": entry.get("file_name", entry.get("filename", "")),
                    "severity": "HIGH",
                    "channel": "cloud_upload",
                })
    return findings


def generate_dlp_policy():
    """Build the static endpoint DLP policy recommendation.

    Returns:
        A new dict with three entries: ``data_classification`` (list of
        data classes), ``channels_monitored`` (list of egress channels) and
        ``actions`` (mapping of "<data>_to_<channel>" rule names to the
        recommended response).
    """
    classifications = ["PII", "Financial", "Credentials", "Source Code"]
    channels = [
        "USB",
        "Cloud Storage",
        "Email Attachments",
        "Clipboard",
        "Print",
        "Screen Capture",
    ]
    responses = dict(
        PII_to_USB="block_and_notify",
        credentials_to_cloud="block_and_alert_soc",
        source_code_to_email="encrypt_and_log",
        financial_to_print="log_and_watermark",
    )

    policy = {}
    policy["data_classification"] = classifications
    policy["channels_monitored"] = channels
    policy["actions"] = responses
    return policy


def main():
    """Command-line entry point: run the selected DLP checks and save a report.

    Parses the CLI arguments, runs each collector enabled by ``--action``
    whose input path was supplied, optionally adds the policy
    recommendation, and writes everything as JSON to ``--output``.
    """
    parser = argparse.ArgumentParser(description="Endpoint DLP Controls Agent")
    parser.add_argument("--scan-dir", help="Directory to scan for sensitive data")
    parser.add_argument("--usb-log", help="USB transfer event log")
    parser.add_argument("--cloud-log", help="Cloud upload event log")
    parser.add_argument("--output", default="endpoint_dlp_report.json")
    parser.add_argument("--action", choices=["scan", "usb", "cloud", "policy", "full"],
                        default="full")
    args = parser.parse_args()

    # Produces the same naive-UTC ISO string as datetime.utcnow().isoformat()
    # without calling utcnow(), which is deprecated since Python 3.12.
    generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    report = {"generated_at": generated_at, "findings": {}}

    # (action, input value, CLI flag, collector, report key, console label)
    collectors = (
        ("scan", args.scan_dir, "--scan-dir", scan_directory,
         "sensitive_data_scan", "Sensitive files found"),
        ("usb", args.usb_log, "--usb-log", monitor_usb_transfers,
         "usb_transfers", "USB transfer events"),
        ("cloud", args.cloud_log, "--cloud-log", monitor_cloud_uploads,
         "cloud_uploads", "Cloud upload events"),
    )
    for action, source, flag, collect, key, label in collectors:
        if args.action not in (action, "full"):
            continue
        if not source:
            # Under "full" a missing input just means that check is not
            # wanted; an explicitly requested action with no input would
            # otherwise yield an empty report with no explanation.
            if args.action == action:
                print(f"[!] --action {action} requires {flag}; nothing was collected")
            continue
        findings = collect(source)
        report["findings"][key] = findings
        print(f"[+] {label}: {len(findings)}")

    if args.action in ("policy", "full"):
        policy = generate_dlp_policy()
        report["findings"]["dlp_policy"] = policy
        print("[+] DLP policy generated")

    # json.dump escapes non-ASCII by default, so UTF-8 here only pins the
    # file encoding independently of the platform locale.
    with open(args.output, "w", encoding="utf-8") as fout:
        json.dump(report, fout, indent=2, default=str)
    print(f"[+] Report saved to {args.output}")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()