Anthropic-Cybersecurity-Skills/skills/detecting-data-and-model-poisoning/scripts/agent.py

#!/usr/bin/env python3
# For authorized defensive ML-security use in isolated environments only.
"""Data and model poisoning detection agent.

Modes:
  integrity -- verify model-weight digests and flag unsafe pickle-based artifacts.
  labels    -- run Cleanlab label-issue detection from saved labels + pred_probs (.npy).
  activations -- run ART activation-clustering poisoning detection on a saved model.

Examples:
  python agent.py integrity --model-dir ./models --expected-sha256 ABC...
  python agent.py labels --labels labels.npy --pred-probs probs.npy
  python agent.py activations --model model.h5 --x x_train.npy --y y_train.npy
"""
import argparse
import glob
import hashlib
import json
import os
import sys
from datetime import datetime, timezone

UNSAFE_EXT = (".pt", ".bin", ".pkl", ".pickle", ".ckpt", ".pth")


def sha256_file(path, chunk=1 << 20):
    h = hashlib.sha256()
    with open(path, "rb") as fh:
        for block in iter(lambda: fh.read(chunk), b""):
            h.update(block)
    return h.hexdigest()


def run_integrity(args):
    report = {"ts": datetime.now(timezone.utc).isoformat(),
              "atlas": "AML.T0010", "unsafe_artifacts": [], "digests": {}}
    for path in glob.glob(os.path.join(args.model_dir, "**", "*"), recursive=True):
        if not os.path.isfile(path):
            continue
        if path.lower().endswith(UNSAFE_EXT):
            report["unsafe_artifacts"].append(path)
        if path.lower().endswith(".safetensors"):
            report["digests"][path] = sha256_file(path)

    if args.expected_sha256:
        match = any(d.lower() == args.expected_sha256.lower()
                    for d in report["digests"].values())
        report["expected_digest_match"] = match
        if not match:
            report["warning"] = "No .safetensors matched the expected digest"

    if report["unsafe_artifacts"]:
        report["warning_unsafe"] = (
            "Pickle-based artifacts can execute code on load; prefer safetensors")
    print(json.dumps(report, indent=2))
    return report


def run_labels(args):
    try:
        import numpy as np
        from cleanlab.filter import find_label_issues
    except ImportError:
        print("Install: pip install cleanlab numpy", file=sys.stderr)
        sys.exit(1)
    labels = np.load(args.labels)
    pred_probs = np.load(args.pred_probs)
    issues = find_label_issues(labels=labels, pred_probs=pred_probs,
                               return_indices_ranked_by="self_confidence")
    report = {"ts": datetime.now(timezone.utc).isoformat(), "atlas": "AML.T0020",
              "n_samples": int(len(labels)), "n_label_issues": int(len(issues)),
              "top_suspect_indices": [int(i) for i in issues[:50]]}
    print(json.dumps(report, indent=2))
    return report


def run_activations(args):
    try:
        import numpy as np
        from tensorflow.keras.models import load_model
        from art.estimators.classification import KerasClassifier
        from art.defences.detector.poison import ActivationDefence
    except ImportError as exc:
        print(f"Install: pip install adversarial-robustness-toolbox tensorflow numpy "
              f"({exc})", file=sys.stderr)
        sys.exit(1)
    model = load_model(args.model)
    x_train = np.load(args.x)
    y_train = np.load(args.y)
    classifier = KerasClassifier(model=model)
    defence = ActivationDefence(classifier, x_train, y_train)
    _, is_clean_lst = defence.detect_poison(nb_clusters=2, nb_dims=10, reduce="PCA")
    poisoned = [int(i) for i, c in enumerate(is_clean_lst) if c == 0]
    report = {"ts": datetime.now(timezone.utc).isoformat(), "atlas": "AML.T0018",
              "n_samples": int(len(is_clean_lst)),
              "n_poisoned_flagged": len(poisoned),
              "poisoned_indices_sample": poisoned[:50]}
    print(json.dumps(report, indent=2))
    return report


def main():
    ap = argparse.ArgumentParser(description="Data/model poisoning detection agent")
    sub = ap.add_subparsers(dest="mode", required=True)

    pi = sub.add_parser("integrity", help="Verify weight digests / flag unsafe formats")
    pi.add_argument("--model-dir", required=True)
    pi.add_argument("--expected-sha256", help="Expected safetensors SHA-256")

    pl = sub.add_parser("labels", help="Cleanlab label-issue detection")
    pl.add_argument("--labels", required=True, help=".npy integer labels")
    pl.add_argument("--pred-probs", required=True, help=".npy out-of-sample probs")

    pa = sub.add_parser("activations", help="ART activation-clustering detection")
    pa.add_argument("--model", required=True, help="Keras .h5 model")
    pa.add_argument("--x", required=True, help=".npy training inputs")
    pa.add_argument("--y", required=True, help=".npy training labels")

    args = ap.parse_args()
    if args.mode == "integrity":
        run_integrity(args)
    elif args.mode == "labels":
        run_labels(args)
    elif args.mode == "activations":
        run_activations(args)


if __name__ == "__main__":
    main()