Anthropic-Cybersecurity-Skills/skills/analyzing-phishing-email-headers/scripts/process.py

#!/usr/bin/env python3
"""
Phishing Email Header Analyzer

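Parses raw email headers to extract authentication results, routing information,
and phishing indicators, and generates a risk assessment report. With --enrich
it additionally performs IP geolocation and reverse DNS lookups for each hop,
plus an AbuseIPDB reputation check when an API key is supplied.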

Usage:
    python process.py --file email_headers.txt
    python process.py --eml suspicious_email.eml
    python process.py --stdin < headers.txt
"""

import argparse
import email
import re
import json
import sys
import socket
import hashlib
from datetime import datetime, timezone
from email import policy
from email.parser import HeaderParser, BytesParser
from pathlib import Path
from typing import Optional
from dataclasses import dataclass, field, asdict

try:
    import requests
    HAS_REQUESTS = True
except ImportError:
    HAS_REQUESTS = False


@dataclass
class ReceivedHop:
    """Represents a single hop in the email routing chain.

    One instance is built per ``Received`` header by ``parse_received_header``.
    Every field keeps its empty/zero default when the header does not contain
    the corresponding detail.
    """
    # Host name captured after the "from" keyword of the header.
    server_from: str = ""
    # Host name captured after the "by" keyword of the header.
    server_by: str = ""
    # IPv4 address returned by extract_ip_from_received ("" when none is usable).
    ip_address: str = ""
    # Stripped text following a ";" up to the end of the header; kept as a raw
    # string, never parsed into a datetime.
    timestamp: str = ""
    # Upper-cased protocol token found after "with" (ESMTP*, SMTP*, HTTP(S), LMTP).
    protocol: str = ""
    # Position in the chain as assigned by analyze_headers: the last Received
    # header in the message is hop 1, the first (topmost) one is the highest.
    hop_number: int = 0
    # Result of geolocate_ip; only filled in when enrichment is requested.
    geo_location: str = ""
    # Result of reverse_dns_lookup; only filled in when enrichment is requested.
    reverse_dns: str = ""


@dataclass
class AuthenticationResult:
    """Email authentication check results.

    Populated by ``parse_authentication_results`` from an
    ``Authentication-Results`` style header. The three verdict fields default
    to ``"none"`` so that "no verdict found" and an explicit ``none`` verdict
    look the same to consumers.
    """
    # Lower-cased SPF verdict taken from the "spf=" token.
    spf: str = "none"
    # Value of "smtp.mailfrom="; the capture allows "@", so this may be a full
    # address rather than a bare domain.
    spf_domain: str = ""
    # Lower-cased DKIM verdict taken from the "dkim=" token.
    dkim: str = "none"
    # Value of the first "header.d=" or "header.i=" token.
    dkim_domain: str = ""
    # Lower-cased DMARC verdict taken from the "dmarc=" token.
    dmarc: str = "none"
    # Value of the "header.from=" token.
    dmarc_domain: str = ""
    # Value of the "compauth=" token, stored as found.
    # NOTE(review): presumably Microsoft's composite authentication verdict - confirm.
    compauth: str = ""


@dataclass
class PhishingIndicator:
    """A single phishing indicator found in headers.

    Instances are created by ``analyze_indicators`` (and by ``analyze_headers``
    for threat-intelligence hits) and are weighted by ``calculate_risk_score``.
    """
    # Short machine-readable tag, e.g. "sender_mismatch" or "authentication_failure".
    category: str = ""
    # Human-readable explanation shown in the text report.
    description: str = ""
    # Drives the score weight; values outside the listed set contribute 0.
    severity: str = "low"  # low, medium, high, critical
    # Supporting evidence for the finding; may be empty.
    raw_value: str = ""


@dataclass
class HeaderAnalysis:
    """Complete header analysis results.

    Built and returned by ``analyze_headers``; rendered by ``format_report``
    or serialized to JSON via ``dataclasses.asdict`` in ``main``.
    """
    # Raw Message-ID header value; an empty value raises a "missing_header" indicator.
    message_id: str = ""
    # Raw From header value and the lower-cased domain extracted from it.
    from_address: str = ""
    from_domain: str = ""
    # Raw Return-Path header value and the lower-cased domain extracted from it.
    return_path: str = ""
    return_path_domain: str = ""
    # Raw Reply-To header value and the lower-cased domain extracted from it.
    reply_to: str = ""
    reply_to_domain: str = ""
    # Raw Subject and Date header values (the date is not parsed).
    subject: str = ""
    date: str = ""
    # First IPv4-looking literal found in the X-Originating-IP header.
    x_originating_ip: str = ""
    # X-Mailer header value, falling back to User-Agent when X-Mailer is empty.
    x_mailer: str = ""
    # ReceivedHop entries in chronological order (hop 1 first).
    received_hops: list = field(default_factory=list)
    # Parsed Authentication-Results; replaced by the ARC-Authentication-Results
    # parse when the SPF verdict is still "none" and an ARC header exists.
    authentication: AuthenticationResult = field(default_factory=AuthenticationResult)
    # PhishingIndicator entries detected for this message.
    indicators: list = field(default_factory=list)
    # Sum of indicator weights, capped at 100.
    risk_score: int = 0
    # "unknown" until scored, then one of CLEAN, LOW, MEDIUM, HIGH, CRITICAL.
    risk_level: str = "unknown"
    # De-duplicated http(s) URLs found anywhere in the raw input text.
    urls_in_headers: list = field(default_factory=list)
    # SHA-256 hex digest of the raw input, for evidence tracking.
    file_hash: str = ""


def extract_ip_from_received(received_value: str) -> str:
    """Extract the first usable public IPv4 address from a Received header.

    Three textual forms are tried in order: an address in square brackets,
    an address alone in parentheses, and the first address inside the
    parenthesised comment that follows ``from <host>``. For each form only
    the first occurrence is considered.

    A candidate is rejected when it is not a valid IPv4 address (for example
    an octet above 255) or when it belongs to the RFC 1918 private ranges
    (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16) or loopback (127.0.0.0/8).

    Args:
        received_value: Raw value of one ``Received`` header.

    Returns:
        The address as a dotted-quad string, or ``""`` when no acceptable
        address is found.
    """
    # Function-scope import so the block does not depend on a file-level import.
    import ipaddress

    # Real network membership instead of string prefixes: a prefix such as
    # "172.2" also matches public addresses like 172.2.x.x and 172.200.x.x.
    internal_networks = (
        ipaddress.ip_network("10.0.0.0/8"),
        ipaddress.ip_network("172.16.0.0/12"),
        ipaddress.ip_network("192.168.0.0/16"),
        ipaddress.ip_network("127.0.0.0/8"),
    )
    ip_patterns = [
        r'\[(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\]',
        r'\((\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\)',
        r'from\s+\S+\s+\(.*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
    ]
    for pattern in ip_patterns:
        match = re.search(pattern, received_value)
        if not match:
            continue
        candidate = match.group(1)
        try:
            parsed = ipaddress.IPv4Address(candidate)
        except ValueError:
            # Looks like a dotted quad but is not a real address; try the
            # next pattern rather than reporting garbage.
            continue
        if not any(parsed in network for network in internal_networks):
            return candidate
    return ""


def extract_domain(email_address: str) -> str:
    """Extract the lower-cased domain of the actual address in a header value.

    The value is parsed as an RFC 5322 mailbox first, so that an ``@`` inside
    the display name (a common spoofing trick, e.g.
    ``"ceo@bank.com" <attacker@evil.example>``) is not mistaken for the
    sender's domain. When the parser yields no address, the last
    angle-bracketed part containing ``@`` is used, and as a last resort the
    first ``@domain`` occurrence anywhere in the text.

    Args:
        email_address: Raw header value such as ``Name <user@example.com>``.

    Returns:
        The domain in lower case, or ``""`` when none can be determined.
    """
    # Function-scope import so the block does not depend on a file-level import.
    from email.utils import parseaddr

    if not email_address:
        return ""

    _display_name, addr_spec = parseaddr(email_address)
    if "@" in addr_spec:
        # The domain is whatever follows the last "@" of the addr-spec.
        domain_match = re.match(r'[\w.-]+', addr_spec.rpartition("@")[2])
        return domain_match.group(0).lower() if domain_match else ""

    # Parser gave up (malformed input): prefer the bracketed address over any
    # "@" that may appear in free text before it.
    bracketed = re.findall(r'<([^<>]*@[^<>]*)>', email_address)
    candidate = bracketed[-1] if bracketed else email_address
    fallback_match = re.search(r'@([\w.-]+)', candidate)
    return fallback_match.group(1).lower() if fallback_match else ""


def parse_received_header(received_value: str, hop_num: int) -> ReceivedHop:
    """Turn one Received header value into a ReceivedHop.

    Args:
        received_value: Raw value of a single ``Received`` header.
        hop_num: Hop number to record on the result.

    Returns:
        A ReceivedHop whose fields are filled for every detail that could be
        recognised; unrecognised details keep their defaults.
    """
    def first_capture(pattern, flags=0):
        # First capture group of the first match, or None when nothing matches.
        found = re.search(pattern, received_value, flags)
        return found.group(1) if found else None

    hop = ReceivedHop(hop_number=hop_num)

    sending_host = first_capture(r'from\s+([\w.\-]+)', re.IGNORECASE)
    if sending_host is not None:
        hop.server_from = sending_host

    receiving_host = first_capture(r'by\s+([\w.\-]+)', re.IGNORECASE)
    if receiving_host is not None:
        hop.server_by = receiving_host

    hop.ip_address = extract_ip_from_received(received_value)

    # The timestamp is whatever follows a ";" and runs to the end of the value.
    trailing_text = first_capture(r';\s*(.+)$')
    if trailing_text is not None:
        hop.timestamp = trailing_text.strip()

    transfer_protocol = first_capture(
        r'with\s+(ESMTP[SA]*|SMTP[SA]*|HTTP[S]?|LMTP)', re.IGNORECASE)
    if transfer_protocol is not None:
        hop.protocol = transfer_protocol.upper()

    return hop


def parse_authentication_results(auth_header: str) -> AuthenticationResult:
    """Parse an Authentication-Results header into an AuthenticationResult.

    Each field is filled from the first case-insensitive match of its pattern;
    verdict fields (spf, dkim, dmarc) are stored lower-cased, all other fields
    are stored exactly as they appear. Fields with no match keep their
    defaults.

    Args:
        auth_header: Raw header value.

    Returns:
        The populated AuthenticationResult.
    """
    # (attribute name, pattern with one capture group, store lower-cased?)
    extraction_rules = (
        ("spf", r'spf=(pass|fail|softfail|neutral|none|temperror|permerror)', True),
        ("spf_domain", r'smtp\.mailfrom=([\w.\-@]+)', False),
        ("dkim", r'dkim=(pass|fail|none|neutral|temperror|permerror)', True),
        ("dkim_domain", r'header\.[di]=([\w.\-]+)', False),
        ("dmarc", r'dmarc=(pass|fail|none|bestguesspass|temperror|permerror)', True),
        ("dmarc_domain", r'header\.from=([\w.\-]+)', False),
        ("compauth", r'compauth=(\w+)', False),
    )

    parsed = AuthenticationResult()
    for attribute, pattern, store_lowercase in extraction_rules:
        hit = re.search(pattern, auth_header, re.IGNORECASE)
        if hit is None:
            continue
        captured = hit.group(1)
        setattr(parsed, attribute, captured.lower() if store_lowercase else captured)
    return parsed


def geolocate_ip(ip_address: str) -> str:
    """Look up a coarse location for an IP address via ip-api.com.

    This is a best-effort enrichment: any failure (missing ``requests``
    package, empty address, HTTP error, unsuccessful lookup, or any exception
    raised along the way) yields ``"unknown"``.

    Args:
        ip_address: Address to look up.

    Returns:
        ``"<city>, <country> (<org>)"`` on success, otherwise ``"unknown"``.
    """
    not_found = "unknown"
    if not (HAS_REQUESTS and ip_address):
        return not_found

    try:
        resp = requests.get(
            f"http://ip-api.com/json/{ip_address}",
            params={"fields": "country,city,org,status"},
            timeout=5,
        )
        if resp.status_code != 200:
            return not_found
        data = resp.json()
        if data.get("status") != "success":
            return not_found
        return f"{data.get('city', '')}, {data.get('country', '')} ({data.get('org', '')})"
    except Exception:
        # Enrichment must never abort the analysis.
        return not_found


def reverse_dns_lookup(ip_address: str) -> str:
    """Resolve an IP address to its primary host name.

    Args:
        ip_address: Address to resolve.

    Returns:
        The primary host name reported by the resolver, or ``""`` when the
        address is empty or the lookup fails.
    """
    if ip_address:
        try:
            primary_name, _aliases, _addresses = socket.gethostbyaddr(ip_address)
        except (socket.herror, socket.gaierror, OSError):
            return ""
        return primary_name
    return ""


def check_abuseipdb(ip_address: str, api_key: str = "") -> dict:
    """Query AbuseIPDB for reports about an IP address.

    Best-effort: returns an empty dict when the ``requests`` package is
    unavailable, when the key or address is empty, on a non-200 response, or
    when any exception is raised.

    Args:
        ip_address: Address to check.
        api_key: AbuseIPDB API key.

    Returns:
        The ``data`` object of the API response, or ``{}``.
    """
    if not (HAS_REQUESTS and api_key and ip_address):
        return {}

    try:
        resp = requests.get(
            "https://api.abuseipdb.com/api/v2/check",
            headers={"Key": api_key, "Accept": "application/json"},
            params={"ipAddress": ip_address, "maxAgeInDays": "90"},
            timeout=10,
        )
        return resp.json().get("data", {}) if resp.status_code == 200 else {}
    except Exception:
        # Threat-intel lookups are optional; never let them abort the run.
        return {}


def analyze_indicators(analysis: HeaderAnalysis) -> list:
    """Derive phishing indicators from an already-parsed header analysis.

    The checks run in a fixed order and each one contributes at most one
    indicator: sender/Return-Path mismatch, sender/Reply-To mismatch, SPF,
    DKIM and DMARC failures, missing Message-ID, suspicious mailer, too few
    Received hops, and complete absence of authentication verdicts.

    Args:
        analysis: Analysis whose basic fields, hops and authentication results
            have been populated.

    Returns:
        A list of PhishingIndicator objects in detection order.
    """
    found = []

    def flag(category, description, severity, raw_value=""):
        # Record one finding.
        found.append(PhishingIndicator(
            category=category,
            description=description,
            severity=severity,
            raw_value=raw_value,
        ))

    auth = analysis.authentication

    # Envelope sender disagrees with the visible sender.
    if (analysis.from_domain and analysis.return_path_domain
            and analysis.from_domain != analysis.return_path_domain):
        flag(
            "sender_mismatch",
            f"From domain ({analysis.from_domain}) differs from "
            f"Return-Path domain ({analysis.return_path_domain})",
            "high",
            f"From: {analysis.from_domain}, Return-Path: {analysis.return_path_domain}",
        )

    # Replies would go somewhere other than the visible sender.
    if (analysis.from_domain and analysis.reply_to_domain
            and analysis.from_domain != analysis.reply_to_domain):
        flag(
            "reply_to_mismatch",
            f"From domain ({analysis.from_domain}) differs from "
            f"Reply-To domain ({analysis.reply_to_domain})",
            "high",
            f"From: {analysis.from_domain}, Reply-To: {analysis.reply_to_domain}",
        )

    # SPF: a hard fail weighs more than a softfail.
    if analysis.authentication.spf in ("fail", "softfail"):
        spf_severity = "medium"
        if analysis.authentication.spf == "fail":
            spf_severity = "high"
        flag(
            "authentication_failure",
            f"SPF check returned {analysis.authentication.spf}",
            spf_severity,
            f"spf={analysis.authentication.spf}",
        )

    if analysis.authentication.dkim == "fail":
        flag("authentication_failure", "DKIM signature verification failed",
             "high", "dkim=fail")

    if analysis.authentication.dmarc == "fail":
        flag("authentication_failure", "DMARC policy check failed",
             "critical", "dmarc=fail")

    if not analysis.message_id:
        flag("missing_header", "Message-ID header is missing", "medium", "")

    # Known bulk/phishing tooling announced in X-Mailer (case-insensitive).
    suspicious_mailers = [
        "PHPMailer", "King Phisher", "GoPhish", "Swaks",
        "Sendinblue", "Mass Mailer", "Bulk Mailer"
    ]
    if analysis.x_mailer and any(
            mailer.lower() in analysis.x_mailer.lower()
            for mailer in suspicious_mailers):
        flag(
            "suspicious_mailer",
            f"Suspicious X-Mailer detected: {analysis.x_mailer}",
            "high",
            analysis.x_mailer,
        )

    # Zero or one hop suggests the message was injected directly.
    if len(analysis.received_hops) <= 1:
        flag(
            "routing_anomaly",
            "Very few Received hops - possible direct SMTP injection",
            "medium",
            f"Hop count: {len(analysis.received_hops)}",
        )

    # No verdict at all for any of the three mechanisms.
    if (auth.spf, auth.dkim, auth.dmarc) == ("none", "none", "none"):
        flag(
            "no_authentication",
            "No email authentication results found (SPF, DKIM, DMARC all absent)",
            "high",
            "",
        )

    return found


def calculate_risk_score(indicators: list) -> tuple:
    """Score a list of indicators and map the score to a risk level.

    Each indicator contributes a weight based on its ``severity`` attribute
    (critical 30, high 20, medium 10, low 5, anything else 0). The total is
    capped at 100.

    Args:
        indicators: Objects exposing a ``severity`` attribute.

    Returns:
        ``(score, level)`` where level is CRITICAL (>= 70), HIGH (>= 50),
        MEDIUM (>= 30), LOW (>= 10) or CLEAN.
    """
    weight_by_severity = {"critical": 30, "high": 20, "medium": 10, "low": 5}
    raw_total = sum(weight_by_severity.get(item.severity, 0) for item in indicators)
    capped = min(raw_total, 100)

    # Thresholds from most to least severe; the first one reached wins.
    bands = ((70, "CRITICAL"), (50, "HIGH"), (30, "MEDIUM"), (10, "LOW"))
    for minimum, label in bands:
        if capped >= minimum:
            return capped, label
    return capped, "CLEAN"


def analyze_headers(raw_headers: str, enrich: bool = False,
                    abuseipdb_key: str = "") -> HeaderAnalysis:
    """Run the full header analysis pipeline on raw header text.

    Steps: hash the input, parse it with the standard email parser, copy the
    basic header fields, parse every Received header into a hop, parse the
    authentication results (falling back to the ARC variant when SPF is still
    "none"), collect URLs, detect indicators and compute the risk score.

    Args:
        raw_headers: Raw header text (or a whole message).
        enrich: When true, geolocate and reverse-resolve each hop's IP and,
            if a key is given, query AbuseIPDB for the originating IP.
        abuseipdb_key: AbuseIPDB API key; only used together with ``enrich``.

    Returns:
        The populated HeaderAnalysis.
    """
    analysis = HeaderAnalysis()

    # Digest of the exact input, for evidence tracking.
    analysis.file_hash = hashlib.sha256(raw_headers.encode()).hexdigest()

    msg = HeaderParser().parsestr(raw_headers)

    def header(name):
        # Header value, or "" when the header is absent.
        return msg.get(name, "")

    # Basic fields.
    analysis.from_address = header("From")
    analysis.from_domain = extract_domain(analysis.from_address)
    analysis.return_path = header("Return-Path")
    analysis.return_path_domain = extract_domain(analysis.return_path)
    analysis.reply_to = header("Reply-To")
    analysis.reply_to_domain = extract_domain(analysis.reply_to)
    analysis.message_id = header("Message-ID")
    analysis.subject = header("Subject")
    analysis.date = header("Date")
    analysis.x_mailer = header("X-Mailer") or header("User-Agent")

    # Originating IP, when the header carries an IPv4-looking literal.
    originating_header = header("X-Originating-IP")
    if originating_header:
        originating_match = re.search(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
                                      originating_header)
        if originating_match:
            analysis.x_originating_ip = originating_match.group(1)

    # Received headers are listed newest first, so the topmost header gets the
    # highest hop number and the list is stored oldest first.
    received_values = msg.get_all("Received", [])
    hop_total = len(received_values)
    newest_first = []
    for offset, received_value in enumerate(received_values):
        hop = parse_received_header(received_value, hop_total - offset)
        if enrich and hop.ip_address:
            hop.geo_location = geolocate_ip(hop.ip_address)
            hop.reverse_dns = reverse_dns_lookup(hop.ip_address)
        newest_first.append(hop)
    analysis.received_hops.extend(reversed(newest_first))

    # Authentication verdicts, with ARC as a fallback when SPF is unknown.
    primary_auth = header("Authentication-Results")
    if primary_auth:
        analysis.authentication = parse_authentication_results(primary_auth)

    arc_auth = header("ARC-Authentication-Results")
    if arc_auth and analysis.authentication.spf == "none":
        analysis.authentication = parse_authentication_results(arc_auth)

    # Unique URLs anywhere in the raw input.
    unique_urls = set(re.findall(r'https?://[^\s<>"\')\]>]+', raw_headers))
    analysis.urls_in_headers = list(unique_urls)

    analysis.indicators = analyze_indicators(analysis)
    analysis.risk_score, analysis.risk_level = calculate_risk_score(analysis.indicators)

    # Optional threat-intelligence lookup for the originating IP.
    if not (enrich and analysis.x_originating_ip and abuseipdb_key):
        return analysis

    abuse_data = check_abuseipdb(analysis.x_originating_ip, abuseipdb_key)
    if abuse_data and abuse_data.get("abuseConfidenceScore", 0) > 50:
        analysis.indicators.append(PhishingIndicator(
            category="threat_intelligence",
            description=f"IP {analysis.x_originating_ip} has abuse confidence "
                        f"score of {abuse_data['abuseConfidenceScore']}%",
            severity="critical",
            raw_value=json.dumps(abuse_data)
        ))
        # The extra indicator changes the score, so compute it again.
        analysis.risk_score, analysis.risk_level = calculate_risk_score(analysis.indicators)

    return analysis


def format_report(analysis: HeaderAnalysis) -> str:
    """Render an analysis as a plain-text report.

    Sections appear in this order: banner with generation time and a
    truncated evidence hash, risk level, sender information, authentication
    results, routing path, phishing indicators, and (when present) up to ten
    URLs found in the input.

    Args:
        analysis: The analysis to render.

    Returns:
        The report as a single newline-joined string.
    """
    def verdict_label(result, failing_results):
        # Collapse a raw verdict into PASS / FAIL / NONE for display.
        if result == "pass":
            return "PASS"
        return "FAIL" if result in failing_results else "NONE"

    auth = analysis.authentication
    spf_icon = verdict_label(auth.spf, ("fail", "softfail"))
    dkim_icon = verdict_label(auth.dkim, ("fail",))
    dmarc_icon = verdict_label(auth.dmarc, ("fail",))

    # Banner, risk summary and sender details.
    lines = [
        "=" * 70,
        "  PHISHING EMAIL HEADER ANALYSIS REPORT",
        "=" * 70,
        f"  Generated: {datetime.now(timezone.utc).isoformat()}",
        f"  Evidence Hash: {analysis.file_hash[:16]}...",
        "",
        f"  RISK LEVEL: {analysis.risk_level} (Score: {analysis.risk_score}/100)",
        "-" * 70,
        "\n[SENDER INFORMATION]",
        f"  From:        {analysis.from_address}",
        f"  Return-Path: {analysis.return_path}",
        f"  Reply-To:    {analysis.reply_to}",
        f"  Subject:     {analysis.subject}",
        f"  Date:        {analysis.date}",
        f"  Message-ID:  {analysis.message_id}",
        f"  X-Mailer:    {analysis.x_mailer}",
    ]
    if analysis.x_originating_ip:
        lines.append(f"  Origin IP:   {analysis.x_originating_ip}")

    # Authentication verdicts followed by the routing header line.
    lines += [
        "\n[AUTHENTICATION RESULTS]",
        f"  SPF:   {spf_icon} ({auth.spf}) domain={auth.spf_domain}",
        f"  DKIM:  {dkim_icon} ({auth.dkim}) domain={auth.dkim_domain}",
        f"  DMARC: {dmarc_icon} ({auth.dmarc}) domain={auth.dmarc_domain}",
        f"\n[ROUTING PATH] ({len(analysis.received_hops)} hops)",
    ]
    for hop in analysis.received_hops:
        lines.append(f"  Hop {hop.hop_number}: {hop.server_from} -> {hop.server_by}")
        # Optional per-hop details, each shown only when it has a value.
        optional_details = (
            (hop.ip_address,
             f"           IP: {hop.ip_address}"),
            (hop.geo_location and hop.geo_location != "unknown",
             f"           Location: {hop.geo_location}"),
            (hop.protocol,
             f"           Protocol: {hop.protocol}"),
            (hop.timestamp,
             f"           Time: {hop.timestamp}"),
        )
        lines.extend(text for wanted, text in optional_details if wanted)

    # Findings.
    if not analysis.indicators:
        lines.append("\n[PHISHING INDICATORS] None detected")
    else:
        lines.append(f"\n[PHISHING INDICATORS] ({len(analysis.indicators)} found)")
        for i, ind in enumerate(analysis.indicators, 1):
            lines.append(f"  {i}. [{ind.severity.upper()}] {ind.description}")
            if ind.raw_value:
                lines.append(f"     Value: {ind.raw_value}")

    # URLs: the count covers all of them, the listing stops at ten.
    if analysis.urls_in_headers:
        lines.append(f"\n[URLS IN HEADERS] ({len(analysis.urls_in_headers)} found)")
        lines.extend(f"  - {url}" for url in analysis.urls_in_headers[:10])

    lines += ["\n" + "=" * 70, "  END OF REPORT", "=" * 70]

    return "\n".join(lines)


def main():
    """Command-line entry point.

    Reads headers from exactly one of ``--file``, ``--eml`` or ``--stdin``,
    runs the analysis, emits a text or JSON report to stdout or to the file
    given with ``--output``, and exits with 2 for CRITICAL/HIGH risk, 1 for
    MEDIUM risk and 0 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Analyze email headers for phishing indicators"
    )

    # Exactly one input source must be chosen.
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--file", "-f", help="Path to file containing raw headers")
    source.add_argument("--eml", "-e", help="Path to .eml file")
    source.add_argument("--stdin", action="store_true", help="Read headers from stdin")

    parser.add_argument("--enrich", action="store_true",
                        help="Enrich with IP geolocation and reverse DNS")
    parser.add_argument("--abuseipdb-key", default="",
                        help="AbuseIPDB API key for threat intelligence")
    parser.add_argument("--json", action="store_true",
                        help="Output results as JSON")
    parser.add_argument("--output", "-o", help="Write report to file")

    args = parser.parse_args()

    # Load the raw text to analyze.
    if args.stdin:
        raw_headers = sys.stdin.read()
    elif args.eml:
        with open(args.eml, "rb") as eml_file:
            parsed_message = BytesParser(policy=policy.default).parse(eml_file)
        raw_headers = str(parsed_message)
    else:
        raw_headers = Path(args.file).read_text(encoding="utf-8", errors="replace")

    analysis = analyze_headers(
        raw_headers,
        enrich=args.enrich,
        abuseipdb_key=args.abuseipdb_key
    )

    # Render.
    if args.json:
        output = json.dumps(asdict(analysis), indent=2, default=str)
    else:
        output = format_report(analysis)

    # Deliver.
    if not args.output:
        print(output)
    else:
        with open(args.output, "w", encoding="utf-8") as report_file:
            report_file.write(output)
        print(f"Report written to {args.output}")

    # Exit status reflects the risk level so callers can branch on it.
    exit_status_by_level = {"CRITICAL": 2, "HIGH": 2, "MEDIUM": 1}
    sys.exit(exit_status_by_level.get(analysis.risk_level, 0))


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()