Files
Anthropic-Cybersecurity-Skills/skills/analyzing-phishing-email-headers/scripts/process.py
T

567 lines
20 KiB
Python

#!/usr/bin/env python3
"""
Phishing Email Header Analyzer
Parses raw email headers to extract authentication results, routing information,
and phishing indicators, then generates a risk assessment report. With --enrich
it also performs IP geolocation and reverse DNS lookups on each hop; with
--abuseipdb-key it additionally checks the originating IP against AbuseIPDB.

Usage:
    python process.py --file email_headers.txt
    python process.py --eml suspicious_email.eml
    python process.py --stdin < headers.txt
"""
import argparse
import email
import hashlib
import ipaddress
import json
import re
import socket
import sys
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from email import policy
from email.parser import BytesParser, HeaderParser
from email.utils import parseaddr
from pathlib import Path
from typing import Optional

try:
    import requests
    HAS_REQUESTS = True
except ImportError:
    HAS_REQUESTS = False
@dataclass
class ReceivedHop:
    """Structured view of one Received header, i.e. one hop in the routing chain.

    Instances are built by parse_received_header(). Every field defaults to an
    empty value, so a header that yields no match for a field still produces a
    usable record.
    """
    # Host token that follows the "from" keyword.
    server_from: str = ""
    # Host token that follows the "by" keyword.
    server_by: str = ""
    # IPv4 address chosen by extract_ip_from_received(); "" when none is found.
    ip_address: str = ""
    # Text after the ';' separator, stripped of surrounding whitespace.
    timestamp: str = ""
    # Upper-cased protocol named after "with" (ESMTP/SMTP/HTTP/LMTP variants).
    protocol: str = ""
    # Position number supplied by the caller of parse_received_header().
    hop_number: int = 0
    # Set by analyze_headers() from geolocate_ip() when enrichment is enabled.
    geo_location: str = ""
    # Set by analyze_headers() from reverse_dns_lookup() when enrichment is enabled.
    reverse_dns: str = ""
@dataclass
class AuthenticationResult:
    """Verdicts and domains pulled from an Authentication-Results header.

    The three verdict fields default to "none", which is also the value that
    remains when parse_authentication_results() finds no token for that
    mechanism.
    """
    # SPF verdict, lower-cased (e.g. "pass", "fail", "softfail").
    spf: str = "none"
    # Value of the smtp.mailfrom= property; may be a full address or a bare domain.
    spf_domain: str = ""
    # DKIM verdict, lower-cased.
    dkim: str = "none"
    # Value of the first header.d= or header.i= property.
    dkim_domain: str = ""
    # DMARC verdict, lower-cased.
    dmarc: str = "none"
    # Value of the header.from= property.
    dmarc_domain: str = ""
    # Raw value of the compauth= token; stays empty when the token is absent.
    compauth: str = ""
@dataclass
class PhishingIndicator:
    """One suspicious finding derived from the parsed headers."""
    # Machine-readable group, e.g. "sender_mismatch" or "authentication_failure".
    category: str = ""
    # Human-readable explanation shown in the report.
    description: str = ""
    # One of: low, medium, high, critical. calculate_risk_score() weights the
    # indicator by this value; any other string contributes nothing.
    severity: str = "low"
    # The header fragment or value that triggered the finding; may be empty.
    raw_value: str = ""
@dataclass
class HeaderAnalysis:
    """Everything analyze_headers() learns about one message.

    The *_domain fields are derived from the matching address fields with
    extract_domain(), so they are lower-cased and empty when no domain is found.
    """
    # Raw Message-ID header value.
    message_id: str = ""
    # Raw From header value and the domain extracted from it.
    from_address: str = ""
    from_domain: str = ""
    # Raw Return-Path header value and the domain extracted from it.
    return_path: str = ""
    return_path_domain: str = ""
    # Raw Reply-To header value and the domain extracted from it.
    reply_to: str = ""
    reply_to_domain: str = ""
    # Raw Subject and Date header values.
    subject: str = ""
    date: str = ""
    # First dotted-quad found in the X-Originating-IP header.
    x_originating_ip: str = ""
    # X-Mailer header value, or User-Agent when X-Mailer is empty.
    x_mailer: str = ""
    # ReceivedHop objects; analyze_headers() reverses the header order before returning.
    received_hops: list = field(default_factory=list)
    # Parsed Authentication-Results (or ARC-Authentication-Results) data.
    authentication: AuthenticationResult = field(default_factory=AuthenticationResult)
    # PhishingIndicator objects produced by analyze_indicators() and enrichment.
    indicators: list = field(default_factory=list)
    # Score and level returned by calculate_risk_score().
    risk_score: int = 0
    risk_level: str = "unknown"
    # http/https URLs found anywhere in the analysed text.
    urls_in_headers: list = field(default_factory=list)
    # SHA-256 hex digest of the analysed text, kept for evidence tracking.
    file_hash: str = ""
# Address ranges that identify internal relays rather than the real sender:
# the three RFC 1918 private blocks plus loopback.
_INTERNAL_NETWORKS = tuple(
    ipaddress.ip_network(cidr)
    for cidr in ("10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16", "127.0.0.0/8")
)


def extract_ip_from_received(received_value: str) -> str:
    """Return the first public IPv4 address found in a Received header value.

    Three patterns are tried in order: an address in square brackets, an
    address alone in parentheses, and an address anywhere inside the
    parenthesised comment that follows the "from" host. For each pattern only
    the first match is considered.

    A candidate is skipped when it is not a valid IPv4 address (for example an
    octet above 255) or when it lies in a private or loopback range. Range
    membership is tested against real CIDR blocks, so public addresses such as
    172.2.0.1 or 172.200.0.1 are no longer mistaken for the private
    172.16.0.0/12 block.

    Args:
        received_value: The text of one Received header.

    Returns:
        The address as a dotted-quad string, or "" when no pattern yields a
        valid public address.
    """
    ip_patterns = (
        r'\[(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\]',
        r'\((\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\)',
        r'from\s+\S+\s+\(.*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
    )
    for pattern in ip_patterns:
        match = re.search(pattern, received_value)
        if not match:
            continue
        candidate = match.group(1)
        try:
            address = ipaddress.IPv4Address(candidate)
        except ValueError:
            # Looks like a dotted quad but is not a real address; try the next pattern.
            continue
        if not any(address in network for network in _INTERNAL_NETWORKS):
            return candidate
    return ""
def extract_domain(email_address: str) -> str:
    """Return the lower-cased domain of the address in a From-style header value.

    The value is first parsed as an RFC 5322 address so that the domain comes
    from the actual address, not from the display name. This matters for
    spoofed senders such as ``"support@paypal.com" <attacker@evil.example>``,
    where a plain scan for the first "@" would report the display-name domain
    and hide the mismatch. When parsing yields no usable address, the whole
    value is scanned for the first "@domain" instead.

    Args:
        email_address: A raw header value such as ``Name <user@example.com>``,
            ``<user@example.com>`` or a bare address. May be empty.

    Returns:
        The domain in lower case, or "" when none is found.
    """
    if not email_address:
        return ""
    _display_name, addr_spec = parseaddr(email_address)
    # parseaddr gives up on some malformed values; scan the raw text in that case.
    candidate = addr_spec if "@" in addr_spec else email_address
    match = re.search(r'@([\w.-]+)', candidate)
    return match.group(1).lower() if match else ""
def parse_received_header(received_value: str, hop_num: int) -> ReceivedHop:
    """Turn one Received header value into a ReceivedHop record.

    Args:
        received_value: The text of a single Received header.
        hop_num: The position number to store on the record.

    Returns:
        A ReceivedHop whose fields are filled from the header; any field with
        no match keeps its empty default.
    """

    def first_group(pattern: str, flags: int = 0) -> str:
        # Captured text of the first match, or "" when the pattern is absent.
        found = re.search(pattern, received_value, flags)
        return found.group(1) if found else ""

    return ReceivedHop(
        hop_number=hop_num,
        server_from=first_group(r'from\s+([\w.\-]+)', re.IGNORECASE),
        server_by=first_group(r'by\s+([\w.\-]+)', re.IGNORECASE),
        ip_address=extract_ip_from_received(received_value),
        # The receiving server appends its timestamp after the ';' separator.
        timestamp=first_group(r';\s*(.+)$').strip(),
        protocol=first_group(
            r'with\s+(ESMTP[SA]*|SMTP[SA]*|HTTP[S]?|LMTP)', re.IGNORECASE
        ).upper(),
    )
def parse_authentication_results(auth_header: str) -> AuthenticationResult:
    """Extract SPF, DKIM, DMARC and compauth data from an Authentication-Results value.

    Matching is case-insensitive and uses the first occurrence of each token.
    Verdicts are stored lower-cased; domain and compauth values are stored as
    they appear. Fields with no matching token keep their defaults.

    Args:
        auth_header: The text of an Authentication-Results (or
            ARC-Authentication-Results) header.

    Returns:
        A populated AuthenticationResult.
    """
    # (attribute, pattern, store lower-cased?) in the order the fields are filled.
    extraction_rules = (
        ("spf", r'spf=(pass|fail|softfail|neutral|none|temperror|permerror)', True),
        ("spf_domain", r'smtp\.mailfrom=([\w.\-@]+)', False),
        ("dkim", r'dkim=(pass|fail|none|neutral|temperror|permerror)', True),
        ("dkim_domain", r'header\.[di]=([\w.\-]+)', False),
        ("dmarc", r'dmarc=(pass|fail|none|bestguesspass|temperror|permerror)', True),
        ("dmarc_domain", r'header\.from=([\w.\-]+)', False),
        ("compauth", r'compauth=(\w+)', False),
    )
    result = AuthenticationResult()
    for attribute, pattern, lowercase in extraction_rules:
        found = re.search(pattern, auth_header, re.IGNORECASE)
        if found is None:
            continue
        value = found.group(1)
        setattr(result, attribute, value.lower() if lowercase else value)
    return result
def geolocate_ip(ip_address: str) -> str:
    """Look up the city, country and organisation for an IP via ip-api.com.

    This is a best-effort enrichment: any failure (requests not installed,
    empty address, network error, non-200 reply, unsuccessful lookup) yields
    the string "unknown" and never raises.

    Args:
        ip_address: The address to look up.

    Returns:
        "<city>, <country> (<org>)" on success, otherwise "unknown".
    """
    fallback = "unknown"
    if not (HAS_REQUESTS and ip_address):
        return fallback
    try:
        resp = requests.get(
            f"http://ip-api.com/json/{ip_address}",
            timeout=5,
            params={"fields": "country,city,org,status"},
        )
        if resp.status_code != 200:
            return fallback
        data = resp.json()
        if data.get("status") != "success":
            return fallback
        city = data.get('city', '')
        country = data.get('country', '')
        org = data.get('org', '')
        return f"{city}, {country} ({org})"
    except Exception:
        # Enrichment must never abort the analysis.
        return fallback
def reverse_dns_lookup(ip_address: str) -> str:
    """Resolve an IP address to its primary host name.

    Args:
        ip_address: The address to resolve.

    Returns:
        The primary host name, or "" when the address is empty or the lookup
        fails.
    """
    if ip_address:
        try:
            lookup = socket.gethostbyaddr(ip_address)
        except (socket.herror, socket.gaierror, OSError):
            return ""
        # gethostbyaddr returns (hostname, aliases, addresses).
        return lookup[0]
    return ""
def check_abuseipdb(ip_address: str, api_key: str = "") -> dict:
    """Query the AbuseIPDB v2 "check" endpoint for an IP address.

    This is a best-effort enrichment that never raises.

    Args:
        ip_address: The address to check.
        api_key: AbuseIPDB API key; the lookup is skipped when empty.

    Returns:
        The "data" object from the response, or {} when requests is not
        installed, an argument is empty, the reply is not HTTP 200, or any
        error occurs.
    """
    if not (HAS_REQUESTS and api_key and ip_address):
        return {}
    try:
        resp = requests.get(
            "https://api.abuseipdb.com/api/v2/check",
            headers={"Key": api_key, "Accept": "application/json"},
            params={"ipAddress": ip_address, "maxAgeInDays": "90"},
            timeout=10,
        )
        if resp.status_code == 200:
            return resp.json().get("data", {})
    except Exception:
        # Threat-intel lookups are optional; fall through to the empty result.
        pass
    return {}
def analyze_indicators(analysis: HeaderAnalysis) -> list:
    """Run every header-based phishing check against a parsed analysis.

    The checks, in the order their findings are emitted:
      1. From domain differs from Return-Path domain (high)
      2. From domain differs from Reply-To domain (high)
      3. SPF fail (high) or softfail (medium)
      4. DKIM fail (high)
      5. DMARC fail (critical)
      6. Missing Message-ID (medium)
      7. X-Mailer names a known bulk or phishing tool (high, at most once)
      8. One or zero Received hops (medium)
      9. SPF, DKIM and DMARC all "none" (high)

    Args:
        analysis: The analysis whose parsed fields are inspected. It is not
            modified.

    Returns:
        A list of PhishingIndicator objects, possibly empty.
    """
    findings = []

    def flag(category, description, severity, raw_value=""):
        findings.append(PhishingIndicator(
            category=category,
            description=description,
            severity=severity,
            raw_value=raw_value,
        ))

    sender = analysis.from_domain
    auth = analysis.authentication

    # Envelope sender disagrees with the visible sender.
    bounce = analysis.return_path_domain
    if sender and bounce and sender != bounce:
        flag(
            "sender_mismatch",
            f"From domain ({sender}) differs from "
            f"Return-Path domain ({bounce})",
            "high",
            f"From: {sender}, Return-Path: {bounce}",
        )

    # Replies would go somewhere other than the visible sender.
    reply = analysis.reply_to_domain
    if sender and reply and sender != reply:
        flag(
            "reply_to_mismatch",
            f"From domain ({sender}) differs from "
            f"Reply-To domain ({reply})",
            "high",
            f"From: {sender}, Reply-To: {reply}",
        )

    # A softfail is weaker evidence than a hard fail, hence the lower severity.
    if auth.spf in ("fail", "softfail"):
        flag(
            "authentication_failure",
            f"SPF check returned {auth.spf}",
            "high" if auth.spf == "fail" else "medium",
            f"spf={auth.spf}",
        )

    if auth.dkim == "fail":
        flag(
            "authentication_failure",
            "DKIM signature verification failed",
            "high",
            "dkim=fail",
        )

    if auth.dmarc == "fail":
        flag(
            "authentication_failure",
            "DMARC policy check failed",
            "critical",
            "dmarc=fail",
        )

    if not analysis.message_id:
        flag("missing_header", "Message-ID header is missing", "medium")

    # Report the mailer once, even if several tool names appear in it.
    suspicious_mailers = [
        "PHPMailer", "King Phisher", "GoPhish", "Swaks",
        "Sendinblue", "Mass Mailer", "Bulk Mailer"
    ]
    mailer = analysis.x_mailer
    if mailer and any(tool.lower() in mailer.lower() for tool in suspicious_mailers):
        flag(
            "suspicious_mailer",
            f"Suspicious X-Mailer detected: {mailer}",
            "high",
            mailer,
        )

    # A message that passed through at most one server may have been injected directly.
    hop_count = len(analysis.received_hops)
    if hop_count <= 1:
        flag(
            "routing_anomaly",
            "Very few Received hops - possible direct SMTP injection",
            "medium",
            f"Hop count: {hop_count}",
        )

    if (auth.spf, auth.dkim, auth.dmarc) == ("none", "none", "none"):
        flag(
            "no_authentication",
            "No email authentication results found (SPF, DKIM, DMARC all absent)",
            "high",
        )

    return findings
def calculate_risk_score(indicators: list) -> tuple:
    """Combine indicator severities into a capped score and a risk level.

    Each indicator adds a weight based on its ``severity`` attribute
    (critical 30, high 20, medium 10, low 5, anything else 0). The total is
    capped at 100 and mapped to a level: 70+ CRITICAL, 50+ HIGH, 30+ MEDIUM,
    10+ LOW, otherwise CLEAN.

    Args:
        indicators: Objects exposing a ``severity`` attribute.

    Returns:
        A ``(score, level)`` tuple.
    """
    severity_weights = {"critical": 30, "high": 20, "medium": 10, "low": 5}
    total = sum(severity_weights.get(item.severity, 0) for item in indicators)
    score = min(total, 100)

    # Ordered from the highest threshold down; the first one reached wins.
    thresholds = ((70, "CRITICAL"), (50, "HIGH"), (30, "MEDIUM"), (10, "LOW"))
    for minimum, label in thresholds:
        if score >= minimum:
            return score, label
    return score, "CLEAN"
def analyze_headers(raw_headers: str, enrich: bool = False,
                    abuseipdb_key: str = "") -> HeaderAnalysis:
    """Parse raw email headers and produce a complete HeaderAnalysis.

    Steps: hash the input, parse it with the standard email header parser,
    copy the sender fields, build the hop list from the Received headers,
    read the authentication results, collect URLs, derive phishing indicators
    and score them. With ``enrich`` the hops are geolocated and
    reverse-resolved, and the X-Originating-IP is checked against AbuseIPDB
    when a key is supplied.

    Args:
        raw_headers: The header text (a full message also works; URL
            extraction then covers the body too).
        enrich: When True, perform the network lookups described above.
        abuseipdb_key: AbuseIPDB API key; only used when ``enrich`` is True.

    Returns:
        The populated HeaderAnalysis.
    """
    analysis = HeaderAnalysis()

    # Hash of the exact text analysed, kept for evidence tracking.
    analysis.file_hash = hashlib.sha256(raw_headers.encode()).hexdigest()

    msg = HeaderParser().parsestr(raw_headers)

    # Sender-related fields and the domains derived from them.
    analysis.from_address = msg.get("From", "")
    analysis.from_domain = extract_domain(analysis.from_address)
    analysis.return_path = msg.get("Return-Path", "")
    analysis.return_path_domain = extract_domain(analysis.return_path)
    analysis.reply_to = msg.get("Reply-To", "")
    analysis.reply_to_domain = extract_domain(analysis.reply_to)
    analysis.message_id = msg.get("Message-ID", "")
    analysis.subject = msg.get("Subject", "")
    analysis.date = msg.get("Date", "")
    analysis.x_mailer = msg.get("X-Mailer", "") or msg.get("User-Agent", "")

    x_orig = msg.get("X-Originating-IP", "")
    if x_orig:
        ip_match = re.search(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', x_orig)
        if ip_match:
            analysis.x_originating_ip = ip_match.group(1)

    # Received headers are listed newest first, so the first one gets the
    # highest hop number; the list is then reversed into chronological order.
    received_headers = msg.get_all("Received", [])
    for i, received in enumerate(received_headers):
        hop = parse_received_header(received, len(received_headers) - i)
        if enrich and hop.ip_address:
            hop.geo_location = geolocate_ip(hop.ip_address)
            hop.reverse_dns = reverse_dns_lookup(hop.ip_address)
        analysis.received_hops.append(hop)
    analysis.received_hops.reverse()

    auth_results = msg.get("Authentication-Results", "")
    if auth_results:
        analysis.authentication = parse_authentication_results(auth_results)

    # Fall back to the ARC copy when the primary header gave no SPF verdict.
    arc_auth = msg.get("ARC-Authentication-Results", "")
    if arc_auth and analysis.authentication.spf == "none":
        analysis.authentication = parse_authentication_results(arc_auth)

    # De-duplicate while keeping first-seen order. A set would give an order
    # that varies between runs, and the report only prints the first ten URLs.
    url_pattern = r'https?://[^\s<>"\')\]>]+'
    analysis.urls_in_headers = list(dict.fromkeys(re.findall(url_pattern, raw_headers)))

    analysis.indicators = analyze_indicators(analysis)
    analysis.risk_score, analysis.risk_level = calculate_risk_score(analysis.indicators)

    # Optional threat-intelligence enrichment on the originating IP.
    if enrich and analysis.x_originating_ip and abuseipdb_key:
        abuse_data = check_abuseipdb(analysis.x_originating_ip, abuseipdb_key)
        if abuse_data and abuse_data.get("abuseConfidenceScore", 0) > 50:
            analysis.indicators.append(PhishingIndicator(
                category="threat_intelligence",
                description=f"IP {analysis.x_originating_ip} has abuse confidence "
                            f"score of {abuse_data['abuseConfidenceScore']}%",
                severity="critical",
                raw_value=json.dumps(abuse_data)
            ))
            # The new indicator changes the total, so score again.
            analysis.risk_score, analysis.risk_level = calculate_risk_score(analysis.indicators)

    return analysis
def format_report(analysis: HeaderAnalysis) -> str:
    """Render an analysis as a plain-text report.

    Sections appear in a fixed order: banner with generation time and
    truncated evidence hash, risk level, sender information, authentication
    results, routing path, phishing indicators, URLs (first ten only) and a
    closing banner.

    Args:
        analysis: The completed analysis to render.

    Returns:
        The report as a single newline-joined string.
    """

    def verdict_label(value, failing_values):
        # Anything that is neither a pass nor a listed failure shows as NONE.
        if value == "pass":
            return "PASS"
        if value in failing_values:
            return "FAIL"
        return "NONE"

    heavy_rule = "=" * 70
    auth = analysis.authentication
    hops = analysis.received_hops
    findings = analysis.indicators
    urls = analysis.urls_in_headers

    report = [
        heavy_rule,
        " PHISHING EMAIL HEADER ANALYSIS REPORT",
        heavy_rule,
        f" Generated: {datetime.now(timezone.utc).isoformat()}",
        f" Evidence Hash: {analysis.file_hash[:16]}...",
        "",
        f" RISK LEVEL: {analysis.risk_level} (Score: {analysis.risk_score}/100)",
        "-" * 70,
        "\n[SENDER INFORMATION]",
        f" From: {analysis.from_address}",
        f" Return-Path: {analysis.return_path}",
        f" Reply-To: {analysis.reply_to}",
        f" Subject: {analysis.subject}",
        f" Date: {analysis.date}",
        f" Message-ID: {analysis.message_id}",
        f" X-Mailer: {analysis.x_mailer}",
    ]
    if analysis.x_originating_ip:
        report.append(f" Origin IP: {analysis.x_originating_ip}")

    spf_icon = verdict_label(auth.spf, ("fail", "softfail"))
    dkim_icon = verdict_label(auth.dkim, ("fail",))
    dmarc_icon = verdict_label(auth.dmarc, ("fail",))
    report += [
        "\n[AUTHENTICATION RESULTS]",
        f" SPF: {spf_icon} ({auth.spf}) domain={auth.spf_domain}",
        f" DKIM: {dkim_icon} ({auth.dkim}) domain={auth.dkim_domain}",
        f" DMARC: {dmarc_icon} ({auth.dmarc}) domain={auth.dmarc_domain}",
    ]

    report.append(f"\n[ROUTING PATH] ({len(hops)} hops)")
    for hop in hops:
        report.append(f" Hop {hop.hop_number}: {hop.server_from} -> {hop.server_by}")
        if hop.ip_address:
            report.append(f" IP: {hop.ip_address}")
        location = hop.geo_location
        if location and location != "unknown":
            report.append(f" Location: {location}")
        if hop.protocol:
            report.append(f" Protocol: {hop.protocol}")
        if hop.timestamp:
            report.append(f" Time: {hop.timestamp}")

    if not findings:
        report.append("\n[PHISHING INDICATORS] None detected")
    else:
        report.append(f"\n[PHISHING INDICATORS] ({len(findings)} found)")
        for number, finding in enumerate(findings, start=1):
            report.append(f" {number}. [{finding.severity.upper()}] {finding.description}")
            if finding.raw_value:
                report.append(f" Value: {finding.raw_value}")

    if urls:
        report.append(f"\n[URLS IN HEADERS] ({len(urls)} found)")
        # Only the first ten are listed to keep the report short.
        report.extend(f" - {url}" for url in urls[:10])

    report += ["\n" + heavy_rule, " END OF REPORT", heavy_rule]
    return "\n".join(report)
def main():
    """Command-line entry point.

    Reads headers from exactly one of --file, --eml or --stdin, analyses
    them, prints or writes the report (text, or JSON with --json), and exits
    with a status that reflects the risk level: 2 for CRITICAL or HIGH, 1 for
    MEDIUM, 0 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Analyze email headers for phishing indicators"
    )
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--file", "-f", help="Path to file containing raw headers")
    source.add_argument("--eml", "-e", help="Path to .eml file")
    source.add_argument("--stdin", action="store_true", help="Read headers from stdin")
    parser.add_argument("--enrich", action="store_true",
                        help="Enrich with IP geolocation and reverse DNS")
    parser.add_argument("--abuseipdb-key", default="",
                        help="AbuseIPDB API key for threat intelligence")
    parser.add_argument("--json", action="store_true",
                        help="Output results as JSON")
    parser.add_argument("--output", "-o", help="Write report to file")
    args = parser.parse_args()

    # Load the header text from whichever source was selected.
    if args.stdin:
        raw_headers = sys.stdin.read()
    elif args.eml:
        with open(args.eml, "rb") as eml_file:
            parsed_message = BytesParser(policy=policy.default).parse(eml_file)
        raw_headers = str(parsed_message)
    else:
        with open(args.file, "r", encoding="utf-8", errors="replace") as header_file:
            raw_headers = header_file.read()

    analysis = analyze_headers(
        raw_headers,
        enrich=args.enrich,
        abuseipdb_key=args.abuseipdb_key
    )

    output = (
        json.dumps(asdict(analysis), indent=2, default=str)
        if args.json
        else format_report(analysis)
    )

    if not args.output:
        print(output)
    else:
        with open(args.output, "w", encoding="utf-8") as report_file:
            report_file.write(output)
        print(f"Report written to {args.output}")

    # The exit status lets calling scripts branch on the risk level.
    exit_codes = {"CRITICAL": 2, "HIGH": 2, "MEDIUM": 1}
    sys.exit(exit_codes.get(analysis.risk_level, 0))


if __name__ == "__main__":
    main()