#!/usr/bin/env python3
"""Agent for testing sensitive data exposure vulnerabilities during authorized assessments."""

import requests
import re
import json
import argparse
import urllib3
from datetime import datetime
from urllib.parse import urljoin

# The checks below send their requests with verify=False (certificate
# validation disabled), so silence urllib3's InsecureRequestWarning to keep
# the console output readable.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Maps a human-readable secret name to the regular expression used to spot it
# in downloaded text. Every pattern must be valid for Python's `re` module.
SECRET_PATTERNS = {
    "AWS Access Key": r"AKIA[0-9A-Z]{16}",
    # Python's `re` has no bare "(?-i)" switch (it raises re.error), so the
    # case-insensitive part is confined to a scoped "(?i:...)" group instead.
    # The quoted 40-character value is matched with an explicit character class.
    "AWS Secret Key": r"(?i:aws.{0,20})['\"][0-9a-zA-Z/+]{40}['\"]",
    "Google API Key": r"AIza[0-9A-Za-z\-_]{35}",
    "Stripe Secret": r"sk_live_[0-9a-zA-Z]{24,}",
    "GitHub Token": r"ghp_[a-zA-Z0-9]{36}",
    "Slack Token": r"xox[bpsa]-[0-9a-zA-Z\-]{10,}",
    "Private Key": r"-----BEGIN (RSA |EC )?PRIVATE KEY-----",
    "Generic Secret": r"(?i)(password|secret|api_key|apikey|token)\s*[=:]\s*['\"][^'\"]{8,}['\"]",
}

# Field names searched for (as lowercase substrings) in API response bodies;
# any hit is reported as potentially excessive data exposure.
SENSITIVE_FIELDS = [
    # Credential material
    "password",
    "password_hash",
    "salt",
    # Government identifiers
    "ssn",
    "social_security",
    # Payment card data
    "credit_card",
    "card_number",
    "cvv",
    # Keys and tokens
    "secret_key",
    "api_key",
    "private_key",
    "token",
    "access_token",
    "refresh_token",
]


def scan_javascript_files(base_url, max_files=20):
    """Download scripts referenced by the page at *base_url* and scan them for secrets.

    The page is fetched once, every ``src="...js..."`` attribute is extracted,
    and up to *max_files* distinct scripts are downloaded and matched against
    each entry of ``SECRET_PATTERNS``.

    Args:
        base_url: URL of the page whose script references are inspected.
        max_files: Maximum number of distinct scripts to download
            (``None`` means no limit).

    Returns:
        A list of ``SECRET_IN_JS`` finding dicts, one per (script, pattern)
        pair that produced at least one match.
    """
    print("\n[*] Scanning JavaScript files for secrets...")
    findings = []

    # Compile every pattern up front: a single malformed expression is
    # reported and skipped instead of aborting the whole scan with re.error.
    compiled = {}
    for name, pattern in SECRET_PATTERNS.items():
        try:
            compiled[name] = re.compile(pattern)
        except re.error as exc:
            print(f"  [-] Skipping invalid pattern {name!r}: {exc}")

    try:
        resp = requests.get(base_url, timeout=15, verify=False)
    except requests.RequestException as e:
        print(f"  [-] Error: {e}")
        return findings

    js_paths = re.findall(r'src=["\']([^"\']*\.js[^"\']*)["\']', resp.text)
    # dict.fromkeys drops repeated references while keeping page order, so the
    # same script is neither downloaded nor reported twice.
    unique_paths = list(dict.fromkeys(js_paths))
    for js_path in unique_paths[:max_files]:
        if js_path.startswith("//"):
            # Protocol-relative reference: fetch it over HTTPS.
            js_url = "https:" + js_path
        else:
            # urljoin returns absolute URLs unchanged and resolves root-relative
            # and relative paths against base_url. A relative name that merely
            # starts with "http" (e.g. "httpclient.js") is thus resolved
            # correctly rather than being mistaken for an absolute URL.
            js_url = urljoin(base_url, js_path)
        try:
            js_resp = requests.get(js_url, timeout=15, verify=False)
        except requests.RequestException:
            # Best effort: an unreachable script must not stop the scan.
            continue
        for name, regex in compiled.items():
            matches = regex.findall(js_resp.text)
            if matches:
                findings.append({
                    "type": "SECRET_IN_JS", "file": js_url,
                    "pattern": name, "count": len(matches), "severity": "HIGH",
                })
                print(f"  [!] {name} found in {js_path} ({len(matches)} matches)")
    return findings


def check_config_files(base_url):
    """Probe well-known configuration file paths relative to *base_url*.

    A path is reported when the server answers 200 with more than 10 bytes of
    body and the response is not an HTML page. HTML is still accepted for
    ``.json`` and ``.php`` paths.

    Args:
        base_url: Base URL the candidate paths are joined onto.

    Returns:
        A list of ``EXPOSED_CONFIG`` finding dicts; ``size`` is the body
        length in bytes.
    """
    print("\n[*] Checking for exposed configuration files...")
    findings = []
    config_files = [
        ".env", ".env.local", ".env.production", "config.json", "settings.json",
        ".aws/credentials", ".docker/config.json", "wp-config.php",
        ".git/config", ".git/HEAD", "composer.json", "package.json",
        ".htaccess", "web.config", "phpinfo.php",
    ]
    for cf in config_files:
        url = urljoin(base_url, cf)
        try:
            resp = requests.get(url, timeout=5, verify=False)
        except requests.RequestException:
            # Best effort: an unreachable path is simply not reported.
            continue
        # Measure the raw body so the reported figure really is a byte count
        # (len(resp.text) counts decoded characters instead).
        size = len(resp.content)
        if resp.status_code != 200 or size <= 10:
            continue
        # Media types are case-insensitive, so normalise before comparing.
        content_type = resp.headers.get("Content-Type", "").lower()
        if "text/html" in content_type and not cf.endswith((".json", ".php")):
            continue
        findings.append({
            "type": "EXPOSED_CONFIG", "file": cf, "url": url,
            "size": size, "severity": "CRITICAL",
        })
        print(f"  [!] FOUND: {cf} ({size} bytes)")
    return findings


def check_api_data_exposure(base_url, token, endpoints):
    """Look for sensitive field names in authenticated API responses.

    Each endpoint is requested with a Bearer token. For every 200 response the
    lowercased body is searched for each ``SENSITIVE_FIELDS`` entry as a plain
    substring, and the endpoints with at least one hit are reported.

    Args:
        base_url: Base URL the endpoints are joined onto.
        token: Bearer token placed in the Authorization header.
        endpoints: Iterable of endpoint paths to request.

    Returns:
        A list of ``API_DATA_EXPOSURE`` finding dicts.
    """
    print("\n[*] Checking API responses for sensitive data exposure...")
    auth_headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }
    results = []
    for endpoint in endpoints:
        target = urljoin(base_url, endpoint)
        try:
            response = requests.get(target, headers=auth_headers, timeout=10, verify=False)
            if response.status_code != 200:
                continue
            body = response.text.lower()
        except requests.RequestException:
            # Unreachable endpoints are skipped without a finding.
            continue

        leaked = [field for field in SENSITIVE_FIELDS if body.find(field) != -1]
        if not leaked:
            continue
        results.append({
            "type": "API_DATA_EXPOSURE", "endpoint": endpoint,
            "exposed_fields": leaked, "severity": "HIGH",
        })
        print(f"  [!] {endpoint}: Exposes {leaked}")
    return results


def check_security_headers(base_url, sensitive_endpoints, token=None):
    """Check that sensitive endpoints forbid caching via ``Cache-Control: no-store``.

    Only the Cache-Control header is inspected. An endpoint is reported when it
    answers 200 and its Cache-Control value does not contain ``no-store``.

    Args:
        base_url: Base URL the endpoints are joined onto.
        sensitive_endpoints: Iterable of endpoint paths to request.
        token: Optional Bearer token. When given, it is sent in the
            Authorization header so endpoints that require authentication
            can be checked; when omitted, requests are unauthenticated.

    Returns:
        A list of ``MISSING_NO_STORE`` finding dicts.
    """
    print("\n[*] Checking cache headers on sensitive endpoints...")
    findings = []
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    for endpoint in sensitive_endpoints:
        url = urljoin(base_url, endpoint)
        try:
            resp = requests.get(url, headers=headers, timeout=10, verify=False)
        except requests.RequestException:
            # Unreachable endpoints are skipped without a finding.
            continue
        if resp.status_code != 200:
            continue
        cache_control = resp.headers.get("Cache-Control", "")
        # Cache-Control directive names are case-insensitive, so "No-Store"
        # must count as present.
        if "no-store" in cache_control.lower():
            continue
        findings.append({
            "type": "MISSING_NO_STORE", "endpoint": endpoint,
            "cache_control": cache_control, "severity": "MEDIUM",
        })
        print(f"  [!] {endpoint}: Missing no-store (Cache-Control: {cache_control})")
    return findings


def check_tls_config(host):
    """Check plain-HTTP handling and the HSTS header for *host*.

    Two probes are made:

    1. ``http://<host>/`` without following redirects. A finding is recorded
       when the response is not a 301/302/307/308 redirect, or when it is a
       redirect whose Location does not point at an ``https://`` URL.
    2. ``https://<host>/`` (certificate validation disabled). A finding is
       recorded when the Strict-Transport-Security header is missing or empty.

    Args:
        host: Host (and optional port) to probe, without a scheme.

    Returns:
        A list of ``NO_HTTPS_REDIRECT`` / ``MISSING_HSTS`` finding dicts.
    """
    print(f"\n[*] Checking TLS on {host}...")
    findings = []
    try:
        resp = requests.get(f"http://{host}/", timeout=5, allow_redirects=False, verify=False)
    except requests.RequestException:
        print("  [+] HTTP not accessible (HTTPS only)")
    else:
        location = resp.headers.get("Location", "")
        if resp.status_code not in (301, 302, 307, 308):
            findings.append({
                "type": "NO_HTTPS_REDIRECT", "host": host,
                "status": resp.status_code, "severity": "HIGH",
            })
            print(f"  [!] HTTP does not redirect to HTTPS (status {resp.status_code})")
        elif location.lower().startswith("https://"):
            print("  [+] HTTP redirects to HTTPS")
        else:
            # A redirect that stays on plain HTTP (or has no usable Location)
            # does not upgrade the connection, so it is reported as well.
            findings.append({
                "type": "NO_HTTPS_REDIRECT", "host": host,
                "status": resp.status_code, "location": location,
                "severity": "HIGH",
            })
            print(f"  [!] HTTP redirects to a non-HTTPS location "
                  f"({location or 'no Location header'})")

    try:
        resp = requests.get(f"https://{host}/", timeout=5, verify=False)
    except requests.RequestException as e:
        # Still best effort, but tell the operator the HSTS check did not run.
        print(f"  [-] HTTPS request failed, HSTS not checked: {e}")
    else:
        hsts = resp.headers.get("Strict-Transport-Security", "")
        if not hsts:
            findings.append({"type": "MISSING_HSTS", "host": host, "severity": "MEDIUM"})
            print("  [!] Missing HSTS header")
        else:
            print(f"  [+] HSTS: {hsts}")
    return findings


def check_error_verbosity(base_url):
    """Send a few error-provoking requests and flag responses that look verbose.

    Three fixed probes are sent: a POST with a malformed JSON body, a GET for
    a non-existent path, and a GET for a very large user id. A probe is
    reported when the lowercased response body contains any of the marker
    substrings.

    Args:
        base_url: Base URL the probe paths are joined onto.

    Returns:
        A list of ``VERBOSE_ERROR`` finding dicts.
    """
    print("\n[*] Testing error response verbosity...")
    leak_markers = ["traceback", "stack trace", "exception", "sql", "at line",
                    "file \"", "internal server", "debug"]
    probes = [
        {"method": "POST", "url": "/api/users", "data": '{"invalid": data'},
        {"method": "GET", "url": "/api/nonexistent/path"},
        {"method": "GET", "url": "/api/users/999999999"},
    ]

    results = []
    for probe in probes:
        path = probe["url"]
        target = urljoin(base_url, path)
        try:
            response = requests.request(probe["method"], target, data=probe.get("data"),
                                        timeout=10, verify=False)
            body = response.text.lower()
        except requests.RequestException:
            # A probe that cannot be delivered yields no finding.
            continue

        hits = [marker for marker in leak_markers if marker in body]
        if not hits:
            continue
        results.append({
            "type": "VERBOSE_ERROR", "url": path,
            "patterns": hits, "severity": "MEDIUM",
        })
        print(f"  [!] {path}: Verbose error ({hits})")
    return results


def generate_report(findings, output_path):
    """Write a JSON summary of *findings* to *output_path* and print a recap line.

    The report contains the generation timestamp, the total number of
    findings, a per-type tally (findings without a "type" key are counted
    under "UNKNOWN"), and the findings themselves.

    Args:
        findings: List of finding dicts.
        output_path: Path of the JSON file to create or overwrite.
    """
    total = len(findings)
    report = {
        "assessment_date": datetime.now().isoformat(),
        "total_findings": total,
        "by_type": {},
        "findings": findings,
    }

    # Seed every label with zero (in first-seen order), then count occurrences.
    labels = [entry.get("type", "UNKNOWN") for entry in findings]
    tally = dict.fromkeys(labels, 0)
    for label in labels:
        tally[label] += 1
    report["by_type"] = tally

    with open(output_path, "w") as fh:
        json.dump(report, fh, indent=2)
    print(f"\n[*] Report: {output_path} | Total: {total}")


def main(argv=None):
    """Parse command-line arguments, run every check, and write the JSON report.

    The unauthenticated checks (JavaScript secrets, exposed config files,
    error verbosity, TLS) always run. The API data exposure and cache header
    checks run only when ``--token`` is supplied.

    Args:
        argv: Optional list of argument strings; defaults to ``sys.argv[1:]``
            when omitted.

    Raises:
        SystemExit: With status 2 when the arguments are invalid, including a
            base URL that is not an absolute http(s) URL.
    """
    from urllib.parse import urlparse

    parser = argparse.ArgumentParser(description="Sensitive Data Exposure Testing Agent")
    parser.add_argument("base_url", help="Base URL of the target")
    parser.add_argument("--token", help="Bearer token for authenticated testing")
    parser.add_argument("--endpoints", nargs="+",
                        default=["/api/users/me", "/api/users", "/api/account"])
    parser.add_argument("-o", "--output", default="data_exposure_report.json")
    args = parser.parse_args(argv)

    # Fail fast on a scheme-less or host-less URL. Otherwise every request
    # fails quietly and the TLS check would misreport the target as
    # "HTTPS only".
    try:
        parsed = urlparse(args.base_url)
    except ValueError as exc:
        parser.error(f"invalid base_url: {exc}")
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        parser.error("base_url must be an absolute http(s) URL, e.g. https://example.com")

    print(f"[*] Sensitive Data Exposure Assessment: {args.base_url}")
    findings = []
    findings.extend(scan_javascript_files(args.base_url))
    findings.extend(check_config_files(args.base_url))
    findings.extend(check_error_verbosity(args.base_url))
    findings.extend(check_tls_config(parsed.netloc))
    if args.token:
        findings.extend(check_api_data_exposure(args.base_url, args.token, args.endpoints))
        findings.extend(check_security_headers(args.base_url, args.endpoints))
    generate_report(findings, args.output)


# Run the assessment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()