#!/usr/bin/env python3
"""
Prompt Injection Detection Agent
Multi-layered detector for identifying prompt injection attacks targeting LLM applications.
Combines regex pattern matching, heuristic anomaly scoring, and DeBERTa-based classification
to provide defense-in-depth against direct and indirect prompt injection attempts.
Based on OWASP LLM Top 10 (LLM01:2025) and Simon Willison's prompt injection taxonomy.
"""
import argparse
import json
import logging
import re
import sys
import time
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Optional
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Regex patterns for known prompt injection signatures
# ---------------------------------------------------------------------------
INJECTION_PATTERNS: list[tuple[str, str]] = [
("system_prompt_override", r"(?i)\b(ignore|disregard|forget|override|bypass)\b.{0,30}\b(previous|above|prior|all|system|initial)\b.{0,20}\b(instructions?|prompts?|rules?|directives?|context)\b"),
("role_play_escape", r"(?i)\b(you\s+are\s+now|act\s+as|pretend\s+(to\s+be|you\s+are)|simulate\s+being|switch\s+to|enter\s+.{0,10}mode)\b"),
("instruction_hijack", r"(?i)\b(do\s+not\s+follow|stop\s+following|new\s+instructions?|instead\s+(do|say|output|respond|print))\b"),
("delimiter_escape", r"(?i)(```\s*(system|assistant|user)\s*\n|<\s*/?\s*(system|instruction|prompt)\s*>|\[INST\]|\[/INST\]|<<\s*SYS\s*>>)"),
("data_exfiltration", r"(?i)\b(output|reveal|show|display|print|leak|exfiltrate|extract)\b.{0,30}\b(system\s+prompt|instructions?|config|password|secret|api\s*key|token|credentials?)\b"),
("encoding_obfuscation", r"(?i)\b(base64|rot13|hex\s*encode|url\s*encode|unicode\s*escape)\b.{0,30}\b(decode|convert|translate|interpret)\b"),
("sql_injection_via_prompt", r"(?i)(;\s*(DROP|DELETE|UPDATE|INSERT|ALTER|EXEC)\b|'\s*(OR|AND)\s+['\d]|UNION\s+SELECT)"),
("command_injection_via_prompt", r"(?i)(;\s*(rm|cat|wget|curl|bash|sh|python|exec|eval)\b|\|\s*(cat|ls|id|whoami|nc)\b|`[^`]+`)"),
("markdown_injection", r"(?i)(\!\[.*?\]\(javascript:|
]*onerror|