# Anthropic-Cybersecurity-Skills/skills/analyzing-phishing-email-headers/scripts/agent.py

#!/usr/bin/env python3
"""Phishing email header analysis agent.

Parses email headers to detect spoofing, authentication failures,
suspicious routing, and phishing indicators.
"""

import os
import sys
import json
import re
import email
import email.utils
from datetime import datetime
from collections import OrderedDict


def parse_email_file(filepath):
    """Read an email file from disk and parse it into a message object.

    The file is read as UTF-8 text; byte sequences that are not valid
    UTF-8 are substituted instead of raising a decode error.

    Args:
        filepath: Path of the email file to read.

    Returns:
        The message object produced by ``email.message_from_string``.
    """
    with open(filepath, "r", encoding="utf-8", errors="replace") as handle:
        raw_text = handle.read()
    return email.message_from_string(raw_text)


def extract_received_chain(msg):
    """Summarise every ``Received`` header of a message.

    Args:
        msg: Parsed email message.

    Returns:
        A list with one dict per ``Received`` header, in the order the
        headers appear in the message. Each dict always has ``raw`` (the
        stripped header, capped at 300 characters) and, when the matching
        pattern is found, ``from_host``, ``by_host``, ``ip`` (a bracketed
        dotted-quad address) and ``date`` (the text after a semicolon,
        capped at 60 characters).
    """
    # (output key, pattern) pairs; tuple order fixes the key order per hop.
    field_patterns = (
        ("from_host", r"from\s+([\w.-]+)"),
        ("by_host", r"by\s+([\w.-]+)"),
        ("ip", r"\[(\d+\.\d+\.\d+\.\d+)\]"),
    )
    hops = []
    for received in msg.get_all("Received", []):
        hop = {"raw": received.strip()[:300]}
        for key, pattern in field_patterns:
            found = re.search(pattern, received)
            if found:
                hop[key] = found.group(1)
        # The timestamp is whatever follows a semicolon up to the end.
        stamp = re.search(r";\s*(.+)$", received)
        if stamp:
            hop["date"] = stamp.group(1).strip()[:60]
        hops.append(hop)
    return hops


def check_spf(msg):
    """Summarise the SPF verdict recorded in the message headers.

    The verdict is taken from the result keyword that starts each
    ``Received-SPF`` header value. Searching the whole header for the
    substring "pass" is unsafe: the explanatory comment usually quotes the
    sender address, so a failing header mentioning ``bypass@...`` or
    ``password-reset@...`` would be reported as a pass.

    Args:
        msg: Parsed email message.

    Returns:
        A dict with:
          * ``status`` - ``"pass"``, ``"fail"`` (also used for softfail),
            ``"neutral"``, or ``"none"`` when no recognised result is found.
          * ``details`` - the first 200 characters of the header the status
            came from, or ``""``.
          * ``auth_result_spf`` - only present when the
            ``Authentication-Results`` header carries an ``spf=`` token;
            holds that token's value as written.
    """
    # Result keyword -> reported status. "softfail" folds into "fail", as
    # before; "hardfail" is a non-RFC spelling accepted for leniency.
    status_by_keyword = {
        "pass": "pass",
        "fail": "fail",
        "softfail": "fail",
        "hardfail": "fail",
        "neutral": "neutral",
    }
    result = {"status": "none", "details": ""}
    for header in msg.get_all("Received-SPF", []):
        keyword = re.match(r"\s*([A-Za-z]+)", header)
        if not keyword:
            continue
        status = status_by_keyword.get(keyword.group(1).lower())
        # Unrecognised keywords (none, temperror, ...) leave the result as is.
        if status:
            # A later recognised header overrides an earlier one.
            result = {"status": status, "details": header[:200]}

    auth_results = msg.get("Authentication-Results", "")
    if "spf=" in auth_results.lower():
        spf_match = re.search(r"spf=(\w+)", auth_results, re.IGNORECASE)
        if spf_match:
            result["auth_result_spf"] = spf_match.group(1)
    return result


def check_dkim(msg):
    """Report the DKIM result and signing domain found in the headers.

    Args:
        msg: Parsed email message.

    Returns:
        A dict with ``status`` (the value of the ``dkim=`` token in the
        ``Authentication-Results`` header, or ``"none"``) and ``domain``
        (the value following ``d=`` in the ``DKIM-Signature`` header, or
        ``""``).
    """
    verdict = {"status": "none", "domain": ""}

    # Signing domain as declared by the signature header itself.
    signature = msg.get("DKIM-Signature", "")
    domain_tag = re.search(r"d=([\w.-]+)", signature) if signature else None
    if domain_tag is not None:
        verdict["domain"] = domain_tag.group(1)

    # Verification outcome as recorded in Authentication-Results.
    auth_header = msg.get("Authentication-Results", "")
    if "dkim=" in auth_header.lower():
        status_tag = re.search(r"dkim=(\w+)", auth_header, re.IGNORECASE)
        if status_tag is not None:
            verdict["status"] = status_tag.group(1)

    return verdict


def check_dmarc(msg):
    """Report the DMARC result recorded in ``Authentication-Results``.

    Args:
        msg: Parsed email message.

    Returns:
        A dict whose ``status`` is the value of the ``dmarc=`` token as
        written in the header, or ``"none"`` when no such token is found.
    """
    header_value = msg.get("Authentication-Results", "")
    status = "none"
    if "dmarc=" in header_value.lower():
        token = re.search(r"dmarc=(\w+)", header_value, re.IGNORECASE)
        if token is not None:
            status = token.group(1)
    return {"status": status}


def _decode_text_payload(part):
    """Return the transfer-decoded payload of *part* as text, or ""."""
    raw = part.get_payload(decode=True)
    if not raw:
        return ""
    # Honour the charset declared on the part; without this a UTF-16 body
    # decodes to garbage and its URLs are lost.
    charset = part.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, errors="replace")
    except (LookupError, ValueError):
        # Unknown or unusable charset label: fall back to UTF-8.
        return raw.decode("utf-8", errors="replace")


def extract_urls(msg):
    """Collect the HTTP(S) URLs found in the message body.

    For a multipart message every ``text/plain`` and ``text/html`` part is
    scanned; otherwise the single payload is scanned. URLs are found both as
    bare ``http://`` / ``https://`` text and as quoted ``href`` attribute
    values starting with ``http``.

    Args:
        msg: Parsed email message.

    Returns:
        A sorted list of the distinct URLs found.
    """
    if msg.is_multipart():
        texts = [
            _decode_text_payload(part)
            for part in msg.walk()
            if part.get_content_type() in ("text/plain", "text/html")
        ]
    else:
        texts = [_decode_text_payload(msg)]
    # Join with a newline: a part's payload has no trailing line break, so
    # plain concatenation would fuse a URL ending one part with the first
    # word of the next.
    body = "\n".join(texts)

    urls = set(re.findall(r"https?://[^\s<>\"')\]]+", body))
    href_urls = re.findall(r'href=["\']([^"\']+)["\']', body)
    urls.update(u for u in href_urls if u.startswith("http"))
    return sorted(urls)


def detect_display_name_spoofing(msg):
    """Flag sender-identity inconsistencies in ``From`` and ``Reply-To``.

    Two checks are made:
      * ``email_in_display_name`` - the ``From`` display name contains "@".
      * ``reply_to_mismatch`` - the ``Reply-To`` address differs from the
        ``From`` address, compared case-insensitively.

    Args:
        msg: Parsed email message.

    Returns:
        A list of finding dicts, each with ``type`` and ``detail`` keys;
        empty when neither check fires.
    """
    sender_header = msg.get("From", "")
    reply_header = msg.get("Reply-To", "")
    display_name, sender_addr = email.utils.parseaddr(sender_header)

    issues = []
    if display_name and sender_addr and "@" in display_name:
        issues.append({
            "type": "email_in_display_name",
            "detail": f"Display name contains email: {display_name}",
        })

    if not reply_header:
        return issues

    reply_addr = email.utils.parseaddr(reply_header)[1]
    addresses_known = bool(reply_addr) and bool(sender_addr)
    if addresses_known and reply_addr.lower() != sender_addr.lower():
        issues.append({
            "type": "reply_to_mismatch",
            "detail": f"From: {sender_addr} vs Reply-To: {reply_addr}",
        })
    return issues


def detect_phishing_indicators(msg, urls):
    """Collect heuristic phishing indicators from the subject, URLs and mailer.

    Args:
        msg: Parsed email message.
        urls: Iterable of URL strings to inspect.

    Returns:
        A list of indicator dicts, each with ``type`` and ``severity``:
          * ``urgency_subject`` (MEDIUM) - at most one, carrying the first
            urgency phrase from the list that occurs in the lower-cased
            subject.
          * ``ip_url`` (HIGH) - a URL whose host is a dotted-quad address;
            the URL is capped at 100 characters.
          * ``long_url`` (MEDIUM) - a URL longer than 200 characters.
          * ``suspicious_mailer`` (MEDIUM) - ``X-Mailer`` mentions
            phpmailer or swiftmailer.
    """
    subject_lc = msg.get("Subject", "").lower()
    urgency_terms = ("urgent", "immediate", "action required", "suspended",
                     "verify", "expires today", "click here", "limited time")
    hits = []

    # Only the first matching phrase is reported.
    first_term = next((term for term in urgency_terms if term in subject_lc), None)
    if first_term is not None:
        hits.append({
            "type": "urgency_subject", "keyword": first_term, "severity": "MEDIUM",
        })

    for link in urls:
        if re.search(r"https?://\d+\.\d+\.\d+\.\d+", link):
            hits.append({
                "type": "ip_url", "url": link[:100], "severity": "HIGH",
            })
        link_length = len(link)
        if link_length > 200:
            hits.append({
                "type": "long_url", "url_length": link_length, "severity": "MEDIUM",
            })

    mailer = msg.get("X-Mailer", "")
    if mailer:
        mailer_lc = mailer.lower()
        if "phpmailer" in mailer_lc or "swiftmailer" in mailer_lc:
            hits.append({
                "type": "suspicious_mailer", "mailer": mailer, "severity": "MEDIUM",
            })
    return hits


def generate_report(filepath, msg):
    """Run every analysis on a message and assemble the result dict.

    Args:
        filepath: Path the message was loaded from; echoed under ``file``.
        msg: Parsed email message.

    Returns:
        A dict holding the basic headers, the ``Received`` chain and its
        length, the SPF/DKIM/DMARC results, the URL count with at most the
        first 20 URLs, the spoofing and phishing findings, and a
        ``verdict`` that is ``"SUSPICIOUS"`` when any finding exists or the
        SPF status is ``"fail"``, else ``"CLEAN"``.
    """
    hops = extract_received_chain(msg)
    spf_result = check_spf(msg)
    dkim_result = check_dkim(msg)
    dmarc_result = check_dmarc(msg)
    found_urls = extract_urls(msg)
    spoof_findings = detect_display_name_spoofing(msg)
    phish_findings = detect_phishing_indicators(msg, found_urls)

    report = {"file": filepath}
    # Basic headers, each defaulting to an empty string when absent.
    for key, header_name in (
        ("subject", "Subject"),
        ("from", "From"),
        ("to", "To"),
        ("date", "Date"),
        ("message_id", "Message-ID"),
    ):
        report[key] = msg.get(header_name, "")

    report["received_hops"] = len(hops)
    report["received_chain"] = hops
    report["authentication"] = {
        "spf": spf_result,
        "dkim": dkim_result,
        "dmarc": dmarc_result,
    }
    report["urls_found"] = len(found_urls)
    report["urls"] = found_urls[:20]
    report["spoofing_indicators"] = spoof_findings
    report["phishing_indicators"] = phish_findings

    if phish_findings or spoof_findings or spf_result.get("status") == "fail":
        report["verdict"] = "SUSPICIOUS"
    else:
        report["verdict"] = "CLEAN"
    return report


if __name__ == "__main__":
    # Command-line entry point: analyse one .eml file and print a summary.
    banner_rule = "=" * 60
    print(banner_rule)
    print("Phishing Email Header Analysis Agent")
    print("SPF/DKIM/DMARC, spoofing detection, URL extraction")
    print(banner_rule)

    cli_args = sys.argv[1:]
    eml_path = cli_args[0] if cli_args else None
    # A missing argument or a non-existent path prints usage and exits 0.
    if not eml_path or not os.path.exists(eml_path):
        print("\n[DEMO] Usage: python agent.py <email.eml>")
        sys.exit(0)

    parsed_msg = parse_email_file(eml_path)
    summary = generate_report(eml_path, parsed_msg)

    print(f"\n[*] Subject: {summary['subject']}")
    print(f"[*] From: {summary['from']}")
    print(f"[*] Date: {summary['date']}")
    print(f"[*] Received hops: {summary['received_hops']}")

    auth_section = summary["authentication"]
    print("\n--- Authentication ---")
    print(f"  SPF:   {auth_section['spf']['status']}")
    print(f"  DKIM:  {auth_section['dkim']['status']}")
    print(f"  DMARC: {auth_section['dmarc']['status']}")

    # Only the first five URLs are shown, each cut to 80 characters.
    print(f"\n--- URLs ({summary['urls_found']}) ---")
    for shown_url in summary["urls"][:5]:
        print(f"  {shown_url[:80]}")

    print("\n--- Indicators ---")
    all_findings = summary["phishing_indicators"] + summary["spoofing_indicators"]
    for finding in all_findings:
        severity = finding.get('severity', 'INFO')
        # Prefer the free-text detail, then the matched keyword, else blank.
        description = finding.get('detail', finding.get('keyword', ''))
        print(f"  [{severity}] {finding['type']}: {description}")

    print(f"\n[*] Verdict: {summary['verdict']}")