Files
T

528 lines
22 KiB
Python

#!/usr/bin/env python3
"""
LLM Guardrails Security Agent
Implements input and output validation guardrails for LLM-powered applications.
Provides multi-layered security including prompt injection blocking, PII detection
and redaction, content policy enforcement, topic restriction, and output validation.
Supports NVIDIA NeMo Guardrails Colang integration and custom Python validators.
"""
import argparse
import json
import logging
import re
import sys
import time
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Optional
# Configure the root logger at import time: INFO and above, timestamped
# "<date time> [LEVEL] message" records.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Module-level logger used by the CLI for status and error reporting.
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Default content policy
# ---------------------------------------------------------------------------
# Built-in policy used when no custom policy is supplied; GuardrailsPipeline
# also falls back to these values for any key a custom policy omits.
# All pattern entries are regular-expression source strings that the guards
# compile with ``re.compile``.
DEFAULT_POLICY = {
    # NOTE(review): no guard visible in this file reads "allowed_topics".
    "allowed_topics": [],
    # Topic names resolved against ContentPolicyGuard's built-in topic table.
    "blocked_topics": ["violence", "illegal_activities", "weapons", "drugs", "exploitation"],
    # Request phrasings that ContentPolicyGuard blocks outright.
    "blocked_patterns": [
        r"(?i)\b(how\s+to\s+(hack|crack|break\s+into|exploit|bypass))\b",
        r"(?i)\b(create|write|generate)\b.{0,20}\b(malware|virus|trojan|ransomware|keylogger|rootkit)\b",
        r"(?i)\b(steal|exfiltrate|extract)\b.{0,20}\b(data|credentials?|passwords?|tokens?|keys?)\b",
        r"(?i)\b(make|build|synthesize)\b.{0,20}\b(bomb|weapon|explosive|poison)\b",
        r"(?i)\b(social\s+engineer|phish|spear\s*phish|impersonate)\b.{0,20}\b(someone|a\s+person|employee|user)\b",
    ],
    # PII type name -> detection regex, consumed by PIIGuard.
    "pii_patterns": {
        "US_SSN": r"\b\d{3}-\d{2}-\d{4}\b",
        # The TLD class is [A-Za-z]; a "|" inside a character class is a
        # literal pipe, not alternation, and would let "io|next" count as a TLD.
        "EMAIL_ADDRESS": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        "PHONE_NUMBER": r"\b(\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b",
        "CREDIT_CARD": r"\b(?:\d{4}[-\s]?){3}\d{4}\b",
        "IP_ADDRESS": r"\b(?:\d{1,3}\.){3}\d{1,3}\b",
        "US_PASSPORT": r"\b[A-Z]\d{8}\b",
        "AWS_ACCESS_KEY": r"\bAKIA[0-9A-Z]{16}\b",
        "GENERIC_API_KEY": r"\b(?:api[_-]?key|token|secret)[=:\s]+['\"]?[A-Za-z0-9_\-]{20,}['\"]?",
    },
    # Prompt-injection phrasings, consumed by InjectionGuard.
    "injection_patterns": [
        r"(?i)\b(ignore|disregard|forget|override|bypass)\b.{0,30}\b(previous|above|prior|all|system|initial)\b.{0,20}\b(instructions?|prompts?|rules?)\b",
        r"(?i)\b(you\s+are\s+now|act\s+as|pretend\s+(to\s+be|you\s+are)|simulate\s+being)\b",
        r"(?i)\b(do\s+not\s+follow|stop\s+following|new\s+instructions?|instead\s+(do|say|output))\b",
        r"(?i)(```\s*(system|assistant|user)\s*\n|<\s*/?\s*(system|instruction|prompt)\s*>)",
        r"(?i)\b(developer\s+mode|DAN\s+mode|jailbreak\s+mode|god\s+mode|sudo\s+mode)\b",
        r"(?i)\b(output|reveal|show|display|print|leak)\b.{0,30}\b(system\s+prompt|instructions?|config|password|api\s*key)\b",
        r"(?i)\b(what\s+(is|are)\s+your\s+(system\s+)?instructions?|repeat\s+your\s+prompt|show\s+me\s+your\s+rules)\b",
    ],
    # Character limits enforced by LengthGuard and OutputGuard respectively.
    "max_input_length": 4000,
    "max_output_length": 8000,
    # Response phrasings that OutputGuard blocks.
    "output_blocked_patterns": [
        r"(?i)\b(my\s+system\s+prompt\s+is|here\s+are\s+my\s+instructions|as\s+an?\s+ai\s+language\s+model,?\s+i\s+don'?t\s+have\s+a\s+system\s+prompt)\b",
        r"(?i)\b(sure,?\s+i'?ll\s+(help\s+you\s+)?(hack|create\s+malware|bypass\s+security|write\s+a\s+virus))\b",
    ],
}
@dataclass
class ValidationResult:
    """Outcome of a single guardrail validation pass over one text.

    Every field has a default, so an empty instance represents a clean,
    unscored result; list and dict fields get a fresh container per instance.
    """

    # Overall verdict for the validated text.
    safe: bool = True
    # Human-readable summary of why the text was rejected ("" when it was not).
    blocked_reason: str = ""
    # One dict per violation raised by the guards.
    violations: list[dict] = field(default_factory=list)
    # One dict per PII match found in the text.
    pii_detected: list[dict] = field(default_factory=list)
    # The text after PII redaction.
    sanitized_text: str = ""
    # Aggregate risk score assigned by the pipeline.
    risk_score: float = 0.0
    # Wall-clock duration of the validation pass, in milliseconds.
    validation_time_ms: float = 0.0
    # Per-guard details keyed by layer name.
    layer_results: dict = field(default_factory=dict)
class InjectionGuard:
    """Input guard that flags text matching known prompt-injection patterns."""

    def __init__(self, patterns: list[str]) -> None:
        """Compile *patterns* once, remembering each pattern's list position."""
        self._compiled = list(enumerate(map(re.compile, patterns)))

    def check(self, text: str) -> tuple[bool, list[str]]:
        """Search *text* with every pattern.

        Returns:
            ``(safe, violations)``: ``safe`` is True when nothing matched;
            ``violations`` holds one message per matching pattern, naming the
            pattern index and the first matched substring.
        """
        searched = ((position, regex.search(text)) for position, regex in self._compiled)
        violations: list[str] = [
            f"injection_pattern_{position}: matched '{hit.group()}'"
            for position, hit in searched
            if hit
        ]
        return not violations, violations
class ContentPolicyGuard:
    """Guard that applies blocked-pattern and blocked-topic rules to text."""

    def __init__(self, blocked_patterns: list[str], blocked_topics: list[str]) -> None:
        """Compile the explicit patterns and the regexes of the requested topics."""
        self._blocked_compiled = list(enumerate(map(re.compile, blocked_patterns)))
        self._blocked_topics = blocked_topics
        self._topic_patterns = self._build_topic_patterns()

    def _build_topic_patterns(self) -> list[tuple[str, re.Pattern]]:
        """Return ``(topic, compiled_regex)`` pairs for each known blocked topic.

        Topic names without an entry in the built-in table are skipped
        silently; the order of ``blocked_topics`` is preserved.
        """
        known_topics: dict[str, str] = {
            "violence": r"(?i)\b(kill|murder|assault|torture|attack\s+someone|hurt\s+people|violence\s+against)\b",
            "illegal_activities": r"(?i)\b(illegal|launder\s+money|traffic|counterfeit|forge\s+documents?|fraud\s+scheme)\b",
            "weapons": r"(?i)\b(gun|firearm|weapon|ammunition|3d\s+print.{0,10}(gun|weapon)|ghost\s+gun)\b",
            "drugs": r"(?i)\b(synthesize\s+(meth|cocaine|heroin|fentanyl)|cook\s+meth|manufacture\s+drugs?|drug\s+recipe)\b",
            "exploitation": r"(?i)\b(exploit\s+(children|minors?|vulnerable)|human\s+traffic|child\s+abuse)\b",
            "politics": r"(?i)\b(vote\s+for|political\s+party|democrat|republican|liberal|conservative)\b.{0,40}\b(best|worst|should|must)\b",
            "competitor_products": r"(?i)\b(switch\s+to|better\s+than\s+us|use\s+.{0,20}instead)\b",
        }
        return [
            (name, re.compile(known_topics[name]))
            for name in self._blocked_topics
            if name in known_topics
        ]

    def check(self, text: str) -> tuple[bool, list[str]]:
        """Evaluate *text* against the content policy.

        Returns:
            ``(safe, violations)``: blocked-pattern messages come first, then
            blocked-topic messages; ``safe`` is True when there are none.
        """
        # Explicit blocked patterns, reported by their index in the policy.
        violations: list[str] = [
            f"blocked_content_{position}: matched '{hit.group()}'"
            for position, hit in (
                (position, regex.search(text)) for position, regex in self._blocked_compiled
            )
            if hit
        ]
        # Blocked topics, reported by topic name.
        violations += [
            f"blocked_topic_{name}: matched '{hit.group()}'"
            for name, hit in ((name, regex.search(text)) for name, regex in self._topic_patterns)
            if hit
        ]
        return not violations, violations
class PIIGuard:
    """Guard that detects and redacts personally identifiable information."""

    # Placeholder substituted for each PII type; types not listed here are
    # replaced with "[REDACTED]".
    REDACTION_MAP = {
        "US_SSN": "[SSN_REDACTED]",
        "EMAIL_ADDRESS": "[EMAIL_REDACTED]",
        "PHONE_NUMBER": "[PHONE_REDACTED]",
        "CREDIT_CARD": "[CARD_REDACTED]",
        "IP_ADDRESS": "[IP_REDACTED]",
        "US_PASSPORT": "[PASSPORT_REDACTED]",
        "AWS_ACCESS_KEY": "[AWS_KEY_REDACTED]",
        "GENERIC_API_KEY": "[API_KEY_REDACTED]",
    }

    def __init__(self, pii_patterns: dict[str, str]) -> None:
        """Compile the ``{pii_type: regex}`` mapping once."""
        self._compiled: dict[str, re.Pattern] = {
            name: re.compile(pattern_str) for name, pattern_str in pii_patterns.items()
        }

    def detect(self, text: str) -> list[dict]:
        """Return every PII match in *text*.

        Each finding is a dict with ``type``, ``value``, ``start`` and ``end``
        (offsets into *text*). Findings are grouped by pattern in policy order
        and may overlap when several patterns match the same characters.
        """
        return [
            {
                "type": name,
                "value": match.group(),
                "start": match.start(),
                "end": match.end(),
            }
            for name, pattern in self._compiled.items()
            for match in pattern.finditer(text)
        ]

    def redact(self, text: str) -> tuple[str, list[dict]]:
        """Replace every PII match in *text* with its placeholder.

        Overlapping matches (for example ``api_key=AKIA...`` matching both the
        generic-key and AWS-key patterns) are merged into one span first.
        Splicing each finding independently would apply offsets computed on
        the original text to an already-shortened string and corrupt the text
        that follows. A merged span takes the placeholder of its
        earliest-starting finding, preferring the longest on ties.

        Returns:
            ``(redacted_text, findings)`` where ``findings`` is the unmodified
            result of :meth:`detect`.
        """
        findings = self.detect(text)
        if not findings:
            return text, findings

        # Earliest start first; among equal starts the longest match leads so
        # that it supplies the label for the merged span.
        ordered = sorted(findings, key=lambda f: (f["start"], -f["end"]))
        spans: list[list] = []  # each entry: [start, end, pii_type]
        for finding in ordered:
            if spans and finding["start"] < spans[-1][1]:
                # Overlaps the previous span: extend it rather than splice twice.
                spans[-1][1] = max(spans[-1][1], finding["end"])
            else:
                spans.append([finding["start"], finding["end"], finding["type"]])

        # Rebuild the text left to right from untouched slices and placeholders.
        pieces: list[str] = []
        cursor = 0
        for start, end, pii_type in spans:
            pieces.append(text[cursor:start])
            pieces.append(self.REDACTION_MAP.get(pii_type, "[REDACTED]"))
            cursor = end
        pieces.append(text[cursor:])
        return "".join(pieces), findings
class OutputGuard:
    """Guard that validates LLM-generated output for safety violations."""

    # Markers treated as signs of system-prompt leakage. Compiled once at
    # class creation instead of being re-resolved on every check() call.
    _SYSTEM_PROMPT_INDICATORS = (
        re.compile(r"(?i)(you\s+are\s+a\s+helpful\s+assistant|your\s+role\s+is\s+to|you\s+must\s+always)"),
        re.compile(r"(?i)(system\s*:\s*\n|<<\s*SYS\s*>>|<\|system\|>)"),
    )

    def __init__(
        self,
        blocked_patterns: list[str],
        max_length: int = 8000,
        pii_patterns: Optional[dict[str, str]] = None,
    ) -> None:
        """Prepare the output checks.

        Args:
            blocked_patterns: Regex source strings that must not match a response.
            max_length: Maximum allowed response length in characters.
            pii_patterns: Optional ``{pii_type: regex}`` mapping for the
                PII-in-output check. When omitted, the default policy's
                patterns are used, as before.
        """
        self._blocked = [(i, re.compile(p)) for i, p in enumerate(blocked_patterns)]
        self._max_length = max_length
        if pii_patterns is None:
            pii_patterns = DEFAULT_POLICY["pii_patterns"]
        # Built once here; previously a new PIIGuard was constructed per check().
        self._pii_guard = PIIGuard(pii_patterns)

    def check(self, response: str, original_input: str = "") -> tuple[bool, list[str]]:
        """Validate *response*.

        Args:
            response: The LLM output to inspect.
            original_input: Accepted for interface compatibility; not used by
                any check in this method.

        Returns:
            ``(safe, violations)``: ``violations`` lists, in order, the length
            violation, blocked-pattern matches, at most one system-prompt-leak
            entry, and one entry per PII finding. ``safe`` is True only when
            the list is empty.
        """
        violations: list[str] = []

        # Length limit.
        if len(response) > self._max_length:
            violations.append(f"output_too_long: {len(response)} chars exceeds {self._max_length} limit")

        # Policy-defined blocked output patterns.
        for idx, pattern in self._blocked:
            match = pattern.search(response)
            if match:
                violations.append(f"output_blocked_{idx}: matched '{match.group()}'")

        # System-prompt leakage: reported once, however many indicators match.
        if any(indicator.search(response) for indicator in self._SYSTEM_PROMPT_INDICATORS):
            violations.append("potential_system_prompt_leak: matched indicator pattern")

        # PII present in the output; only the type is reported, never the value.
        violations.extend(
            f"pii_in_output: {finding['type']} detected"
            for finding in self._pii_guard.detect(response)
        )
        return len(violations) == 0, violations
class LengthGuard:
    """Guard that rejects input longer than a configured character limit."""

    def __init__(self, max_length: int = 4000) -> None:
        """Store the maximum accepted length, in characters."""
        self._max_length = max_length

    def check(self, text: str) -> tuple[bool, list[str]]:
        """Return ``(True, [])`` when *text* fits, else ``(False, [message])``."""
        size = len(text)
        limit = self._max_length
        if size > limit:
            return False, [f"input_too_long: {size} chars exceeds {limit} limit"]
        return True, []
class GuardrailsPipeline:
    """Complete input/output validation pipeline combining all guardrail layers."""

    def __init__(self, policy: Optional[dict] = None, policy_path: Optional[str] = None) -> None:
        """Build every guard from a policy.

        Args:
            policy: Policy mapping. Used only when ``policy_path`` is not given
                and the mapping is non-empty.
            policy_path: Path to a JSON policy file; takes precedence over
                ``policy``.

        Raises:
            OSError: If ``policy_path`` cannot be opened.
            json.JSONDecodeError: If the policy file is not valid JSON.
        """
        if policy_path:
            with open(policy_path, "r", encoding="utf-8") as fh:
                overrides = json.load(fh)
        elif policy:
            overrides = policy
        else:
            overrides = {}
        # Defaults first, explicit values on top. Building a new dict keeps the
        # caller's mapping (and DEFAULT_POLICY itself) from being mutated or
        # aliased by the pipeline.
        self.policy = {**DEFAULT_POLICY, **overrides}

        # Initialize guards
        self.injection_guard = InjectionGuard(self.policy.get("injection_patterns", []))
        self.content_guard = ContentPolicyGuard(
            blocked_patterns=self.policy.get("blocked_patterns", []),
            blocked_topics=self.policy.get("blocked_topics", []),
        )
        self.pii_guard = PIIGuard(self.policy.get("pii_patterns", {}))
        self.length_guard = LengthGuard(self.policy.get("max_input_length", 4000))
        self.output_guard = OutputGuard(
            blocked_patterns=self.policy.get("output_blocked_patterns", []),
            max_length=self.policy.get("max_output_length", 8000),
        )

    def validate_input(self, text: str) -> ValidationResult:
        """Run all input layers on *text*.

        Length, injection and content-policy violations are critical and block
        the input. PII findings are recorded as warnings and redacted in
        ``sanitized_text`` but do not block on their own.
        """
        start = time.perf_counter()
        result = ValidationResult(sanitized_text=text)
        all_violations: list[dict] = []

        # Layers 1-3 share one shape: check, record the layer result, collect
        # issues as critical violations.
        blocking_layers = (
            ("length", "length_guard", self.length_guard),
            ("injection", "injection_guard", self.injection_guard),
            ("content_policy", "content_policy_guard", self.content_guard),
        )
        for guard_name, layer_key, guard in blocking_layers:
            layer_safe, layer_issues = guard.check(text)
            result.layer_results[layer_key] = {"safe": layer_safe, "issues": layer_issues}
            if not layer_safe:
                for issue in layer_issues:
                    all_violations.append({"guard": guard_name, "detail": issue})

        # Layer 4: PII detection and redaction
        redacted_text, pii_findings = self.pii_guard.redact(text)
        result.pii_detected = pii_findings
        result.sanitized_text = redacted_text
        result.layer_results["pii_guard"] = {
            "pii_found": len(pii_findings),
            "types": list(set(f["type"] for f in pii_findings)),
        }
        for finding in pii_findings:
            all_violations.append({
                "guard": "pii",
                "detail": f"detected {finding['type']}",
                "severity": "warning",
            })

        # Compute risk score: 0.35 per critical violation, 0.1 per warning, capped at 1.0
        critical_violations = sum(1 for v in all_violations if v.get("severity") != "warning")
        warning_violations = sum(1 for v in all_violations if v.get("severity") == "warning")
        result.risk_score = min(1.0, critical_violations * 0.35 + warning_violations * 0.1)

        # Final verdict: block on critical violations, warn on PII-only
        result.violations = all_violations
        if critical_violations > 0:
            result.safe = False
            reasons = [v["detail"] for v in all_violations if v.get("severity") != "warning"]
            result.blocked_reason = "; ".join(reasons[:3])
        else:
            result.safe = True
        result.validation_time_ms = round((time.perf_counter() - start) * 1000, 2)
        return result

    def validate_output(self, response: str, original_input: str = "") -> ValidationResult:
        """Validate an LLM *response*.

        Any output-guard issue other than PII-in-output is critical and blocks
        the response. PII-in-output issues are tagged ``severity: "warning"``,
        matching ``validate_input``, and are left out of ``blocked_reason``;
        the PII itself is redacted in ``sanitized_text``.
        """
        start = time.perf_counter()
        result = ValidationResult(sanitized_text=response)
        all_violations: list[dict] = []

        # Check output safety
        output_safe, output_issues = self.output_guard.check(response, original_input)
        result.layer_results["output_guard"] = {"safe": output_safe, "issues": output_issues}
        if not output_safe:
            for issue in output_issues:
                violation = {"guard": "output", "detail": issue}
                # Mark non-blocking PII findings so reports do not label them
                # as critical.
                if issue.startswith("pii_in_output:"):
                    violation["severity"] = "warning"
                all_violations.append(violation)

        # PII redaction on output
        redacted_output, pii_findings = self.pii_guard.redact(response)
        result.pii_detected = pii_findings
        result.sanitized_text = redacted_output
        result.violations = all_violations

        critical_reasons = [v["detail"] for v in all_violations if v.get("severity") != "warning"]
        critical = len(critical_reasons)
        result.risk_score = min(1.0, critical * 0.35 + len(pii_findings) * 0.1)
        if critical > 0:
            result.safe = False
            # Only the violations that caused the block are cited as the reason.
            result.blocked_reason = "; ".join(critical_reasons[:3])
        else:
            result.safe = True
        result.validation_time_ms = round((time.perf_counter() - start) * 1000, 2)
        return result

    def validate_pii_only(self, text: str) -> ValidationResult:
        """Run only the PII layer; any finding marks the text as unsafe.

        The risk score is 0.15 per finding, capped at 1.0.
        """
        start = time.perf_counter()
        result = ValidationResult(sanitized_text=text)
        redacted_text, pii_findings = self.pii_guard.redact(text)
        result.pii_detected = pii_findings
        result.sanitized_text = redacted_text
        result.safe = len(pii_findings) == 0
        if pii_findings:
            types_found = list(set(f["type"] for f in pii_findings))
            result.blocked_reason = f"PII detected: {', '.join(types_found)}"
            result.violations = [{"guard": "pii", "detail": f"detected {f['type']}"} for f in pii_findings]
            result.risk_score = min(1.0, len(pii_findings) * 0.15)
        result.validation_time_ms = round((time.perf_counter() - start) * 1000, 2)
        return result
def format_result_text(result: ValidationResult, label: str = "INPUT") -> str:
    """Render *result* as a multi-line, human-readable report.

    At most five violations are listed, PII values are cut to their first
    three characters followed by ``***``, and the sanitized text is limited
    to 200 characters. The report ends with a 70-dash rule.
    """
    if result.safe:
        verdict = "SAFE"
    else:
        verdict = "BLOCKED"

    report = [
        f"[{label}] Verdict: {verdict}",
        f" Risk Score : {result.risk_score:.4f}",
        f" Validation Time : {result.validation_time_ms:.2f} ms",
    ]

    if result.blocked_reason:
        report.append(f" Blocked Reason : {result.blocked_reason}")

    if result.violations:
        report.append(f" Violations ({len(result.violations)}):")
        for violation in result.violations[:5]:
            # Violations without an explicit severity are reported as critical.
            level = violation.get("severity", "critical").upper()
            report.append(f" [{level}] {violation['guard']}: {violation['detail']}")

    if result.pii_detected:
        report.append(f" PII Detected ({len(result.pii_detected)}):")
        for finding in result.pii_detected:
            preview = finding["value"][:3] + "***"
            report.append(f" {finding['type']}: {preview}")

    report.append(f" Sanitized Text : {result.sanitized_text[:200]}")
    report.append("-" * 70)
    return "\n".join(report)
def format_result_json(result: ValidationResult) -> str:
    """Serialize *result* to indented JSON.

    Works on a copy produced by ``asdict``, so *result* is not modified.
    The sanitized text is limited to 500 characters and each PII value is
    cut to its first three characters followed by ``***``.
    """
    payload = asdict(result)
    payload["sanitized_text"] = payload["sanitized_text"][:500]
    # Never emit raw PII values in the serialized report.
    for entry in payload.get("pii_detected", []):
        entry["value"] = entry["value"][:3] + "***"
    return json.dumps(payload, indent=2, default=str)
def main() -> None:
    """Command-line entry point.

    Parses arguments, builds the pipeline, validates every input in the
    selected mode and prints one report per result.

    Exit status: 0 when nothing was blocked or flagged, 2 when at least one
    input was, 1 on a usage or loading problem (missing file, unreadable or
    invalid policy, no inputs).
    """
    parser = argparse.ArgumentParser(
        description="LLM Guardrails Security Agent - input/output validation for LLM applications.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python agent.py --input "Tell me how to hack into a network"
python agent.py --input "My SSN is 123-45-6789" --mode pii
python agent.py --file prompts.txt --mode full --output json
python agent.py --input "Question" --response "LLM answer" --mode output-only
python agent.py --input "Some text" --policy custom_policy.json
""",
    )
    parser.add_argument("--input", "-i", type=str, help="User input to validate")
    parser.add_argument("--response", "-r", type=str, help="LLM response to validate (for output-only mode)")
    parser.add_argument("--file", "-f", type=str, help="File with one prompt per line to scan")
    parser.add_argument(
        "--mode", "-m",
        choices=["full", "input-only", "output-only", "pii"],
        default="full",
        help="Validation mode. Default: full",
    )
    parser.add_argument("--policy", "-p", type=str, help="Path to JSON content policy file")
    parser.add_argument(
        "--output", "-o",
        choices=["text", "json"],
        default="text",
        help="Output format. Default: text",
    )
    args = parser.parse_args()

    if not args.input and not args.file:
        parser.error("Provide either --input or --file")
    if args.mode == "output-only" and not args.response:
        parser.error("--response is required for output-only mode")

    # Initialize pipeline. A bad policy file is reported like a bad --file
    # argument (logged, exit 1) instead of surfacing as a traceback.
    try:
        pipeline = GuardrailsPipeline(policy_path=args.policy)
    except (OSError, ValueError, re.error) as exc:
        # ValueError covers json.JSONDecodeError; re.error covers an invalid
        # pattern in the policy.
        logger.error("Failed to load policy %s: %s", args.policy, exc)
        sys.exit(1)

    # Collect inputs
    inputs: list[str] = []
    if args.input:
        inputs.append(args.input)
    if args.file:
        filepath = Path(args.file)
        if not filepath.is_file():
            logger.error("File not found: %s", args.file)
            sys.exit(1)
        with open(filepath, "r", encoding="utf-8") as fh:
            for line in fh:
                stripped = line.strip()
                if stripped:
                    inputs.append(stripped)
    if not inputs:
        logger.error("No inputs to validate.")
        sys.exit(1)

    logger.info("Validating %d input(s) in '%s' mode ...", len(inputs), args.mode)

    def emit(result, label: str) -> None:
        """Print one result in the output format selected on the command line."""
        if args.output == "text":
            print(format_result_text(result, label=label))
        else:
            print(format_result_json(result))

    blocked_count = 0
    for idx, user_input in enumerate(inputs, 1):
        # Gather the (label, result) pairs this input produces in the chosen mode.
        if args.mode == "pii":
            reports = [("PII", pipeline.validate_pii_only(user_input))]
        elif args.mode == "output-only":
            reports = [("OUTPUT", pipeline.validate_output(args.response, original_input=user_input))]
        elif args.mode == "input-only":
            reports = [("INPUT", pipeline.validate_input(user_input))]
        else:
            # Full mode: validate the input and, if a response was provided,
            # the output as well.
            reports = [("INPUT", pipeline.validate_input(user_input))]
            if args.response:
                reports.append(
                    ("OUTPUT", pipeline.validate_output(args.response, original_input=user_input))
                )

        if args.output == "text":
            print(f"\n[{idx}/{len(inputs)}]")
        for label, result in reports:
            emit(result, label)

        # Count each input at most once, even when both its input and output
        # checks fail, so the summary can never exceed the number of inputs.
        if any(not result.safe for _, result in reports):
            blocked_count += 1

    # Summary
    if args.output == "text" and len(inputs) > 1:
        print(f"\n{'=' * 70}")
        print(f"SUMMARY: {blocked_count}/{len(inputs)} inputs blocked or flagged")

    if blocked_count > 0:
        sys.exit(2)
    sys.exit(0)


if __name__ == "__main__":
    main()