# Anthropic-Cybersecurity-Skills/skills/implementing-data-loss-prevention-with-microsoft-purview/scripts/agent.py

#!/usr/bin/env python3
# For authorized Microsoft 365 compliance administration only
"""Microsoft Purview DLP Management Agent - Automates DLP policy deployment and monitoring via Graph API."""

import argparse
import csv
import json
import logging
from collections import Counter
from datetime import datetime, timezone, timedelta
from pathlib import Path

import requests

# Configure logging when the module is loaded: INFO level, with timestamp and level name
# prefixed to every message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)

# Microsoft Graph base URLs. The v1.0 base is used for the security alerts query; the beta
# base is used for the sensitivity-label and policy-evaluation requests.
GRAPH_BASE = "https://graph.microsoft.com/v1.0"
GRAPH_BETA = "https://graph.microsoft.com/beta"


class PurviewAuthClient:
    """OAuth2 client-credentials helper that obtains and caches a Microsoft Graph token."""

    def __init__(self, tenant_id, client_id, client_secret):
        """Store the app-registration credentials; no network call is made here."""
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret
        # Token cache: both stay None until get_token() succeeds for the first time.
        self.access_token = None
        self.token_expiry = None

    def get_token(self):
        """Return a bearer token, requesting a new one if none is cached or it has expired.

        Raises:
            requests.HTTPError: if the token endpoint answers with an error status.
            KeyError: if the token response has no ``access_token`` field.
        """
        cache_is_fresh = (
            self.access_token
            and self.token_expiry
            and datetime.now(timezone.utc) < self.token_expiry
        )
        if cache_is_fresh:
            return self.access_token

        endpoint = f"https://login.microsoftonline.com/{self.tenant_id}/oauth2/v2.0/token"
        form = {
            "client_id": self.client_id,
            "client_secret": self.client_secret,
            "scope": "https://graph.microsoft.com/.default",
            "grant_type": "client_credentials",
        }
        reply = requests.post(endpoint, data=form, timeout=30)
        reply.raise_for_status()
        payload = reply.json()

        self.access_token = payload["access_token"]
        lifetime = payload.get("expires_in", 3600)
        # Treat the token as expired 300 seconds early so it is refreshed before it lapses.
        self.token_expiry = datetime.now(timezone.utc) + timedelta(seconds=lifetime - 300)
        logger.info("Obtained Graph API access token (expires in %d seconds)", lifetime)
        return self.access_token

    def headers(self):
        """Return JSON request headers carrying the current bearer token."""
        token = self.get_token()
        return {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}


def get_dlp_alerts(auth_client, days_back=7, severity=None, top=50):
    """Retrieve DLP alerts from the Microsoft Graph Security API (``alerts_v2``).

    Args:
        auth_client: Object whose ``headers()`` returns authenticated request headers.
        days_back: Only alerts created within this many days before now (UTC) are requested.
        severity: Optional severity value to filter on; no severity filter when falsy.
        top: Maximum number of alerts to return. It is sent as ``$top`` and also used as
            the overall cap while following result pages. ``None`` removes the cap.

    Returns:
        A list of alert dicts, newest first as requested via ``$orderby``, containing at
        most ``top`` entries.

    Raises:
        requests.HTTPError: if any page request returns an error status.
    """
    url = f"{GRAPH_BASE}/security/alerts_v2"
    start_date = (datetime.now(timezone.utc) - timedelta(days=days_back)).strftime(
        "%Y-%m-%dT%H:%M:%SZ"
    )
    # NOTE(review): confirm 'microsoftDataLossPrevention' is a valid serviceSource value
    # for alerts_v2; the name resembles a detectionSource value - verify against the
    # alert resource documentation.
    filter_parts = [
        "serviceSource eq 'microsoftDataLossPrevention'",
        f"createdDateTime ge {start_date}",
    ]
    if severity:
        # Double any single quote so the value cannot terminate the OData string literal.
        escaped_severity = str(severity).replace("'", "''")
        filter_parts.append(f"severity eq '{escaped_severity}'")
    params = {
        "$filter": " and ".join(filter_parts),
        "$top": top,
        "$orderby": "createdDateTime desc",
    }

    alerts = []
    while url:
        response = requests.get(url, headers=auth_client.headers(), params=params, timeout=60)
        response.raise_for_status()
        page = response.json()
        alerts.extend(page.get("value", []))
        if top is not None and len(alerts) >= top:
            break
        # The service may return fewer than $top items per page; the continuation URL
        # already embeds the query options, so they must not be sent again.
        url = page.get("@odata.nextLink")
        params = None

    alerts = alerts[:top]
    logger.info("Retrieved %d DLP alerts from last %d days", len(alerts), days_back)
    return alerts


def get_sensitivity_labels(auth_client):
    """Fetch sensitivity labels from the Graph beta information-protection endpoint.

    Returns the ``value`` list of the JSON response (empty list when the key is absent).
    Raises ``requests.HTTPError`` when the request returns an error status.
    """
    endpoint = f"{GRAPH_BETA}/security/informationProtection/sensitivityLabels"
    reply = requests.get(endpoint, headers=auth_client.headers(), timeout=30)
    reply.raise_for_status()
    payload = reply.json()
    found = payload.get("value", [])
    logger.info("Retrieved %d sensitivity labels", len(found))
    return found


def evaluate_dlp_protection_scope(auth_client, user_id):
    """POST an ``evaluateApplication`` request for *user_id* to the Graph beta endpoint.

    Returns the parsed JSON body when the response status is 200. Any other status is
    logged as a warning and ``None`` is returned instead of raising.
    """
    content_info = {
        "@odata.type": "#microsoft.graph.security.contentInfo",
        "format@odata.type": "#microsoft.graph.security.contentFormat",
        "format": "default",
    }
    endpoint = (
        f"{GRAPH_BETA}/users/{user_id}/security/informationProtection/policy/evaluateApplication"
    )
    reply = requests.post(
        endpoint,
        headers=auth_client.headers(),
        json={"contentInfo": content_info},
        timeout=30,
    )
    if reply.status_code != 200:
        logger.warning("DLP evaluation for user %s returned status %d", user_id, reply.status_code)
        return None
    return reply.json()


def generate_alert_summary(alerts):
    """Generate summary statistics from DLP alerts.

    Fields that are missing *or* explicitly null in an alert fall back to defaults
    (severity ``"informational"``, title ``"Unknown Policy"``, status ``"new"``,
    user ``"Unknown"``) so JSON ``null`` values do not raise.

    Args:
        alerts: List of alert dicts as returned by the Graph API.

    Returns:
        Dict with ``total_alerts``, ``severity_breakdown`` and ``status_breakdown``
        (always containing the well-known keys, plus any other value encountered),
        and ``top_policies`` / ``top_users`` limited to the 10 most frequent entries.
        Entries with equal counts keep first-seen order.
    """
    # Pre-seed the well-known buckets so they are reported even when their count is zero.
    severity_counts = Counter({"high": 0, "medium": 0, "low": 0, "informational": 0})
    status_counts = Counter({"new": 0, "inProgress": 0, "resolved": 0})
    policy_counts = Counter()
    user_counts = Counter()

    for alert in alerts:
        severity_counts[(alert.get("severity") or "informational").lower()] += 1
        policy_counts[alert.get("title") or "Unknown Policy"] += 1
        status_counts[alert.get("status") or "new"] += 1

        # NOTE(review): `userStates` is read here, but the alerts are fetched from
        # alerts_v2 - confirm that resource populates this field; otherwise
        # top_users will always be empty.
        for user_state in alert.get("userStates") or []:
            user_counts[user_state.get("userPrincipalName") or "Unknown"] += 1

    return {
        "total_alerts": len(alerts),
        "severity_breakdown": dict(severity_counts),
        "status_breakdown": dict(status_counts),
        "top_policies": [
            {"policy": policy, "count": count}
            for policy, count in policy_counts.most_common(10)
        ],
        "top_users": [
            {"user": user, "count": count}
            for user, count in user_counts.most_common(10)
        ],
    }


def generate_label_report(labels):
    """Generate a report of sensitivity label configuration.

    Args:
        labels: List of sensitivity-label dicts.

    Returns:
        A list with one flattened dict per label, sorted by ascending ``sensitivity``.
        A label whose sensitivity is null sorts as 0 (its reported value is left
        unchanged), so mixed null/int data no longer raises ``TypeError`` while sorting.
    """
    report = []
    for label in labels:
        # A missing or null parent yields parent_id None.
        parent = label.get("parent") or {}
        report.append({
            "id": label.get("id"),
            "name": label.get("name"),
            "description": label.get("description", ""),
            "color": label.get("color", ""),
            "sensitivity": label.get("sensitivity", 0),
            "is_active": label.get("isActive", False),
            "parent_id": parent.get("id"),
            "content_formats": label.get("contentFormats", []),
            "has_protection": bool(label.get("protectionEnabled")),
        })
    return sorted(report, key=lambda entry: entry["sensitivity"] or 0)


def check_policy_health(alerts, threshold_high=10, threshold_override_pct=20.0,
                        threshold_noisy=100, threshold_backlog=50):
    """Analyze DLP policy health based on alert patterns.

    Args:
        alerts: List of alert dicts.
        threshold_high: A HIGH_ALERT_VOLUME finding is raised when the number of
            high-severity alerts exceeds this value.
        threshold_override_pct: Accepted for backward compatibility; not used by any
            check in this function.
        threshold_noisy: A NOISY_POLICY finding is raised for every alert title that
            occurs more than this many times.
        threshold_backlog: An UNRESOLVED_ALERT_BACKLOG finding is raised when more than
            this many alerts have status ``"new"``.

    Returns:
        A list of finding dicts (``finding``, ``severity``, ``detail``,
        ``recommendation``). When no check triggers, the list contains a single
        HEALTHY entry.
    """
    findings = []

    # `or ""` guards against an explicit null severity, which would break .lower().
    high_severity = [a for a in alerts if (a.get("severity") or "").lower() == "high"]
    if len(high_severity) > threshold_high:
        findings.append({
            "finding": "HIGH_ALERT_VOLUME",
            "severity": "WARNING",
            "detail": f"{len(high_severity)} high-severity DLP alerts in the analysis period. "
                      f"Threshold: {threshold_high}. Investigate for data exfiltration patterns.",
            "recommendation": "Review top-triggered policies and affected users. Check for "
                              "compromised accounts or policy misconfiguration.",
        })

    # Counter keeps first-seen order, so findings are emitted in the order policies appear.
    policy_alert_counts = Counter(alert.get("title") or "Unknown" for alert in alerts)
    for policy, count in policy_alert_counts.items():
        if count > threshold_noisy:
            findings.append({
                "finding": "NOISY_POLICY",
                "severity": "INFO",
                "detail": f"Policy '{policy}' generated {count} alerts. May indicate "
                          f"false positive issues or overly broad matching rules.",
                "recommendation": "Review SIT confidence thresholds and policy conditions. "
                                  "Consider increasing MinConfidence or adding exclusions.",
            })

    unresolved = [a for a in alerts if a.get("status") == "new"]
    if len(unresolved) > threshold_backlog:
        findings.append({
            "finding": "UNRESOLVED_ALERT_BACKLOG",
            "severity": "WARNING",
            "detail": f"{len(unresolved)} DLP alerts in 'new' status. Alert fatigue risk.",
            "recommendation": "Assign alerts to compliance analysts. Configure auto-resolution "
                              "for low-severity informational alerts. Implement alert triage SOP.",
        })

    if not findings:
        findings.append({
            "finding": "HEALTHY",
            "severity": "INFO",
            "detail": "DLP policy health checks passed. No anomalies detected.",
            "recommendation": "Continue regular monitoring. Schedule quarterly policy review.",
        })

    return findings


def export_alerts_csv(alerts, output_path):
    """Write one CSV row per alert to *output_path* for compliance reporting.

    The ``user`` column holds the ``userPrincipalName`` of the alert's first
    ``userStates`` entry, or ``"N/A"`` when the alert has none. Every other column is
    copied from the alert key of the same name, defaulting to an empty string.
    """
    columns = [
        "id", "title", "severity", "status", "createdDateTime",
        "user", "description", "category",
    ]
    # Columns that map one-to-one onto alert keys; "user" is derived separately.
    copied_columns = [name for name in columns if name != "user"]

    with open(output_path, "w", newline="", encoding="utf-8") as handle:
        sheet = csv.DictWriter(handle, fieldnames=columns)
        sheet.writeheader()
        for record in alerts:
            states = record.get("userStates", [])
            if states:
                principal = states[0].get("userPrincipalName", "N/A")
            else:
                principal = "N/A"
            row = {name: record.get(name, "") for name in copied_columns}
            row["user"] = principal
            sheet.writerow(row)

    logger.info("Exported %d alerts to %s", len(alerts), output_path)


def generate_compliance_report(auth_client, days_back=30, output_dir="."):
    """Build the full DLP compliance report, write it to disk and print a summary.

    Fetches up to 500 alerts for the last *days_back* days plus the sensitivity
    labels, then writes ``dlp_compliance_report.json`` and ``dlp_alerts_export.csv``
    into *output_dir* (created if missing). The JSON embeds only the first 100 alerts;
    the CSV contains all fetched alerts. Returns the report dict.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    logger.info("Generating DLP compliance report for last %d days", days_back)

    # Alert-derived sections.
    alerts = get_dlp_alerts(auth_client, days_back=days_back, top=500)
    alert_summary = generate_alert_summary(alerts)
    health_findings = check_policy_health(alerts)

    # Label-derived section.
    label_report = generate_label_report(get_sensitivity_labels(auth_client))

    report = {
        "report_generated": datetime.now(timezone.utc).isoformat(),
        "analysis_period_days": days_back,
        "alert_summary": alert_summary,
        "policy_health": health_findings,
        "sensitivity_labels": label_report,
        "alert_details": alerts[:100],
    }

    report_path = target_dir / "dlp_compliance_report.json"
    serialized = json.dumps(report, indent=2, default=str)
    report_path.write_text(serialized)
    logger.info("Compliance report saved to %s", report_path)

    csv_path = target_dir / "dlp_alerts_export.csv"
    export_alerts_csv(alerts, csv_path)

    # Console summary.
    banner = "=" * 70
    print("\n" + banner)
    print("DLP COMPLIANCE REPORT SUMMARY")
    print(banner)
    print(f"Report Period: Last {days_back} days")
    print(f"Total Alerts: {alert_summary['total_alerts']}")
    by_severity = alert_summary["severity_breakdown"]
    print(
        f"Severity: High={by_severity['high']}, "
        f"Medium={by_severity['medium']}, "
        f"Low={by_severity['low']}"
    )
    print(f"Sensitivity Labels Configured: {len(label_report)}")
    print(f"\nPolicy Health Findings: {len(health_findings)}")
    for item in health_findings:
        print(f"  [{item['severity']}] {item['finding']}: {item['detail']}")

    print("\nTop Triggered Policies:")
    leading_policies = alert_summary.get("top_policies", [])[:5]
    for row in leading_policies:
        print(f"  - {row['policy']}: {row['count']} alerts")

    print("\nTop Affected Users:")
    leading_users = alert_summary.get("top_users", [])[:5]
    for row in leading_users:
        print(f"  - {row['user']}: {row['count']} alerts")

    print(banner)
    print(f"Full report: {report_path}")
    print(f"Alert export: {csv_path}")

    return report


def _write_json_report(payload, out_path):
    """Write *payload* to *out_path* as indented JSON, creating missing parent directories."""
    target = Path(out_path)
    # Without this, a not-yet-existing --output-dir makes write_text raise FileNotFoundError.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(payload, indent=2, default=str), encoding="utf-8")


def main():
    """Parse command-line arguments and run the selected DLP action.

    Actions:
        alerts: fetch alerts (optionally filtered by ``--severity``) and write them with
            a summary to ``dlp_alerts.json``.
        labels: write the sensitivity-label report to ``sensitivity_labels.json``.
        health: fetch up to 500 alerts, write the health findings to ``dlp_health.json``
            and log each finding.
        report: delegate to ``generate_compliance_report``.

    For ``alerts``, ``labels`` and ``health`` the file is written to ``--output`` when
    given, otherwise to the default name inside ``--output-dir``; the destination
    directory is created when it does not exist. ``report`` uses ``--output-dir`` only.
    """
    parser = argparse.ArgumentParser(
        description="Microsoft Purview DLP Management Agent - Monitor and report on DLP policies"
    )
    parser.add_argument("--tenant-id", required=True, help="Azure AD tenant ID")
    parser.add_argument("--client-id", required=True, help="App registration client ID")
    parser.add_argument("--client-secret", required=True,
                        help="App registration client secret")
    parser.add_argument("--action", required=True,
                        choices=["alerts", "labels", "health", "report"],
                        help="Action to perform")
    parser.add_argument("--days", type=int, default=7,
                        help="Number of days to look back for alerts (default: 7)")
    parser.add_argument("--severity", choices=["high", "medium", "low", "informational"],
                        help="Filter alerts by severity")
    parser.add_argument("--output-dir", default=".",
                        help="Directory for output files (default: current directory)")
    parser.add_argument("--output", help="Output file path (overrides default naming)")
    args = parser.parse_args()

    auth_client = PurviewAuthClient(args.tenant_id, args.client_id, args.client_secret)
    output_dir = Path(args.output_dir)

    if args.action == "alerts":
        alerts = get_dlp_alerts(auth_client, days_back=args.days, severity=args.severity)
        summary = generate_alert_summary(alerts)
        output = {"summary": summary, "alerts": alerts}
        out_path = args.output or output_dir / "dlp_alerts.json"
        _write_json_report(output, out_path)
        logger.info("Alert report saved to %s (%d alerts)", out_path, len(alerts))

    elif args.action == "labels":
        labels = get_sensitivity_labels(auth_client)
        label_report = generate_label_report(labels)
        out_path = args.output or output_dir / "sensitivity_labels.json"
        _write_json_report(label_report, out_path)
        logger.info("Label report saved to %s (%d labels)", out_path, len(labels))

    elif args.action == "health":
        alerts = get_dlp_alerts(auth_client, days_back=args.days, top=500)
        findings = check_policy_health(alerts)
        out_path = args.output or output_dir / "dlp_health.json"
        _write_json_report(findings, out_path)
        logger.info("Health report saved to %s (%d findings)", out_path, len(findings))
        for finding in findings:
            # Surface WARNING findings at warning level; everything else is informational.
            level = logging.WARNING if finding["severity"] == "WARNING" else logging.INFO
            logger.log(level, "[%s] %s: %s", finding["severity"], finding["finding"],
                       finding["detail"])

    elif args.action == "report":
        generate_compliance_report(auth_client, days_back=args.days, output_dir=args.output_dir)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()