Files
Anthropic-Cybersecurity-Skills/skills/analyzing-malicious-url-with-urlscan/scripts/process.py
T

440 lines
15 KiB
Python

#!/usr/bin/env python3
"""
URLScan.io URL Analysis Automation
Submits suspicious URLs to URLScan.io for analysis, retrieves results,
extracts IOCs, and cross-references with threat intelligence sources.
Usage:
python process.py scan --url "https://suspicious-site.com"
python process.py scan --url-file urls.txt
python process.py result --uuid <scan-uuid>
python process.py search --query "domain:evil.com"
python process.py ioc --uuid <scan-uuid>
"""
import argparse
import json
import sys
import time
import os
import hashlib
from datetime import datetime, timezone
from pathlib import Path
from dataclasses import dataclass, field, asdict
try:
import requests
HAS_REQUESTS = True
except ImportError:
HAS_REQUESTS = False
# URLScan API key taken from the environment; an empty string means "no key",
# in which case requests are sent without an API-Key header.
URLSCAN_API_KEY = os.environ.get("URLSCAN_API_KEY", "")
# Base URL shared by every URLScan v1 endpoint used in this script
# (scan/, result/<uuid>/, search/).
URLSCAN_BASE = "https://urlscan.io/api/v1"
# VirusTotal API key taken from the environment; check_virustotal() returns an
# empty dict when neither this nor an explicit key is available.
VT_API_KEY = os.environ.get("VT_API_KEY", "")
@dataclass
class URLScanResult:
    """Flat, serialisable summary of one URLScan scan.

    Instances are populated by ``parse_result`` and consumed by
    ``extract_iocs`` and ``format_report``. Every field has a default, so a
    bare ``URLScanResult()`` is a valid "empty" result; list and dict fields
    use ``default_factory`` so instances never share mutable state.
    """

    # --- Scan identity (from the "task" section) ---
    uuid: str = ""
    url: str = ""  # URL as submitted

    # --- Landing page (from the "page" section) ---
    effective_url: str = ""  # URL of the page that was actually loaded
    status_code: int = 0
    domain: str = ""
    ip: str = ""
    asn: str = ""
    asn_name: str = ""
    country: str = ""
    server: str = ""
    title: str = ""

    # --- First certificate listed for the scan ---
    tls_issuer: str = ""
    tls_subject: str = ""
    tls_valid_from: str = ""
    tls_valid_to: str = ""

    # --- Artefact links derived from the scan UUID ---
    screenshot_url: str = ""
    dom_url: str = ""

    # --- Observed behaviour ---
    technologies: list = field(default_factory=list)
    redirects: list = field(default_factory=list)
    domains_contacted: list = field(default_factory=list)
    ips_contacted: list = field(default_factory=list)
    urls_contacted: list = field(default_factory=list)
    has_login_form: bool = False
    # Each entry is a dict with "url", "hash", "size" and "mimeType" keys.
    resource_hashes: list = field(default_factory=list)

    # --- Assessment ---
    verdicts: dict = field(default_factory=dict)
    is_malicious: bool = False
    risk_indicators: list = field(default_factory=list)
def submit_scan(url: str, visibility: str = "private",
                api_key: str = "") -> dict:
    """Submit a URL to URLScan for scanning.

    Args:
        url: The URL to scan.
        visibility: Scan visibility sent to the API ("public", "unlisted"
            or "private").
        api_key: URLScan API key; falls back to ``URLSCAN_API_KEY`` when
            empty. Without any key the request is sent unauthenticated.

    Returns:
        The decoded JSON response on HTTP 200, otherwise a dict with a single
        ``"error"`` key. This function never raises: transport and decoding
        failures are reported through the ``"error"`` key as well.
    """
    api_key = api_key or URLSCAN_API_KEY
    if not api_key:
        print("Warning: No URLScan API key provided. Using public submission.", file=sys.stderr)

    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["API-Key"] = api_key
    payload = {"url": url, "visibility": visibility}
    endpoint = f"{URLSCAN_BASE}/scan/"

    try:
        resp = requests.post(endpoint, headers=headers, json=payload, timeout=30)
        if resp.status_code == 429:
            # One retry after a fixed back-off; a second failure is reported
            # to the caller like any other non-200 response.
            print("Rate limited. Waiting 10 seconds...", file=sys.stderr)
            time.sleep(10)
            resp = requests.post(endpoint, headers=headers, json=payload, timeout=30)
        if resp.status_code == 200:
            return resp.json()
        # Same error shape for first-try and retry failures so callers always
        # see the HTTP status.
        return {"error": f"HTTP {resp.status_code}: {resp.text}"}
    except Exception as exc:
        # Deliberate catch-all boundary: callers only inspect the "error" key.
        return {"error": str(exc)}
def get_result(uuid: str, api_key: str = "", max_wait: int = 60) -> dict:
    """Fetch scan results, polling until they are ready.

    The result endpoint answers 404 while a scan is still in progress, so a
    404 is retried every five seconds until roughly ``max_wait`` seconds have
    passed.

    Args:
        uuid: Scan UUID returned by the submission call.
        api_key: URLScan API key; falls back to ``URLSCAN_API_KEY`` when empty.
        max_wait: Approximate polling budget in seconds. At least one request
            is always made, even when ``max_wait`` is below the poll interval.

    Returns:
        The decoded JSON result on HTTP 200, otherwise a dict with a single
        ``"error"`` key (non-404 HTTP error, transport failure, or timeout).
    """
    api_key = api_key or URLSCAN_API_KEY
    headers = {}
    if api_key:
        headers["API-Key"] = api_key

    poll_interval = 5
    # max(1, ...) guarantees the endpoint is queried at least once; the plain
    # integer division yields zero attempts for max_wait < 5.
    attempts = max(1, max_wait // poll_interval)

    for attempt in range(attempts):
        try:
            resp = requests.get(f"{URLSCAN_BASE}/result/{uuid}/",
                                headers=headers, timeout=30)
            if resp.status_code == 200:
                return resp.json()
        except Exception as exc:
            # Deliberate catch-all boundary: callers only inspect "error".
            return {"error": str(exc)}

        if resp.status_code != 404:
            return {"error": f"HTTP {resp.status_code}: {resp.text}"}

        # Still processing. Only sleep when another attempt will follow.
        if attempt < attempts - 1:
            time.sleep(poll_interval)

    return {"error": "Timeout waiting for scan results"}
def search_scans(query: str, api_key: str = "", size: int = 10) -> list:
    """Search the URLScan database.

    Args:
        query: Search expression, e.g. ``domain:evil.com``. It is passed as a
            query parameter and URL-encoded by ``requests``, so spaces and
            characters such as ``&`` or ``#`` are safe.
        api_key: URLScan API key; falls back to ``URLSCAN_API_KEY`` when empty.
        size: Maximum number of results requested.

    Returns:
        The ``"results"`` list from the response, or an empty list when the
        request fails. Failures are reported on stderr rather than raised, so
        the search stays best-effort.
    """
    api_key = api_key or URLSCAN_API_KEY
    headers = {}
    if api_key:
        headers["API-Key"] = api_key

    try:
        resp = requests.get(f"{URLSCAN_BASE}/search/",
                            params={"q": query, "size": size},
                            headers=headers, timeout=30)
        if resp.status_code == 200:
            return resp.json().get("results", [])
        print(f"Search failed: HTTP {resp.status_code}: {resp.text}", file=sys.stderr)
    except Exception as exc:
        # Best-effort lookup: report the problem but keep the [] contract.
        print(f"Search failed: {exc}", file=sys.stderr)
    return []
def parse_result(raw_result: dict) -> URLScanResult:
    """Parse a raw URLScan API result into a ``URLScanResult``.

    Missing sections and keys fall back to empty defaults, so a partial
    document still yields a usable result.

    Args:
        raw_result: Decoded JSON document for one scan. The sections read are
            ``task``, ``page``, ``lists``, ``meta``, ``data`` and ``verdicts``.

    Returns:
        A populated ``URLScanResult``, including heuristic ``risk_indicators``.
    """
    # Local import: the file's top-level imports do not include urllib.
    from urllib.parse import urlsplit

    result = URLScanResult()

    task = raw_result.get("task", {})
    result.uuid = task.get("uuid", "")
    result.url = task.get("url", "")

    page = raw_result.get("page", {})
    result.effective_url = page.get("url", "")
    result.status_code = page.get("status", 0)
    result.domain = page.get("domain", "")
    result.ip = page.get("ip", "")
    result.asn = page.get("asn", "")
    result.asn_name = page.get("asnname", "")
    result.country = page.get("country", "")
    result.server = page.get("server", "")
    result.title = page.get("title", "")

    lists = raw_result.get("lists", {})

    # TLS info: only the first listed certificate is recorded.
    certificates = lists.get("certificates", [])
    if certificates:
        cert = certificates[0]
        result.tls_issuer = cert.get("issuer", "")
        result.tls_subject = cert.get("subjectName", "")
        result.tls_valid_from = cert.get("validFrom", "")
        result.tls_valid_to = cert.get("validTo", "")

    # Screenshot and DOM URLs are derived from the scan UUID.
    result.screenshot_url = f"https://urlscan.io/screenshots/{result.uuid}.png"
    result.dom_url = f"https://urlscan.io/dom/{result.uuid}/"

    # Technologies: any processor whose "data" is a list of dicts with "app".
    meta = raw_result.get("meta", {})
    for processor in meta.get("processors", {}).values():
        if not (isinstance(processor, dict) and "data" in processor):
            continue
        techs = processor["data"]
        if not isinstance(techs, list):
            continue
        result.technologies.extend(
            tech["app"] for tech in techs
            if isinstance(tech, dict) and "app" in tech
        )

    data = raw_result.get("data", {})
    request_log = data.get("requests", [])

    # Redirects (heuristic): among the first five logged requests, any whose
    # URL differs from the submitted URL. Entries without a URL are skipped so
    # they cannot inflate the "excessive redirects" count below.
    for entry in request_log[:5]:
        req_url = entry.get("request", {}).get("request", {}).get("url", "")
        if req_url and req_url != result.url:
            result.redirects.append(req_url)

    # Domains and IPs contacted.
    result.domains_contacted = lists.get("domains", [])
    result.ips_contacted = lists.get("ips", [])
    result.urls_contacted = lists.get("urls", [])[:50]

    # Resource hashes for every response that carries one.
    for entry in request_log:
        resp_data = entry.get("response", {}).get("response", {})
        resp_hash = resp_data.get("hash", "")
        if resp_hash:
            result.resource_hashes.append({
                "url": resp_data.get("url", ""),
                "hash": resp_hash,
                "size": resp_data.get("size", 0),
                "mimeType": resp_data.get("mimeType", ""),
            })

    # Login form check on the DOM text, when the document carries one.
    # NOTE(review): any "<form" tag counts as a login form here, not only
    # forms with a password input - confirm this breadth is intended.
    dom_content = data.get("dom", "")
    if isinstance(dom_content, str):
        dom_lower = dom_content.lower()
        if ('type="password"' in dom_lower or
                'input type=password' in dom_lower or
                '<form' in dom_lower):
            result.has_login_form = True

    # Verdicts.
    verdicts = raw_result.get("verdicts", {})
    overall = verdicts.get("overall", {})
    result.verdicts = {
        "overall_score": overall.get("score", 0),
        "overall_malicious": overall.get("malicious", False),
        "urlscan_score": verdicts.get("urlscan", {}).get("score", 0),
        "engines": verdicts.get("engines", {}).get("malicious", []),
        "community_score": verdicts.get("community", {}).get("score", 0),
    }
    result.is_malicious = overall.get("malicious", False)

    # Risk indicators.
    # Take the submitted host from a real URL parse: indexing the result of
    # split("/") raises IndexError on empty or scheme-less URLs and leaves
    # ports and userinfo attached to the host.
    try:
        submitted_host = urlsplit(result.url).hostname or ""
    except ValueError:
        submitted_host = ""
    if (result.has_login_form and submitted_host and result.domain
            and result.domain != submitted_host):
        result.risk_indicators.append("Credential harvesting form on non-origin domain")
    if result.url != result.effective_url:
        result.risk_indicators.append(f"URL redirected: {result.url} -> {result.effective_url}")
    if result.is_malicious:
        result.risk_indicators.append("Flagged as malicious by URLScan verdicts")
    if len(result.redirects) > 3:
        result.risk_indicators.append(f"Excessive redirects ({len(result.redirects)})")
    return result
def extract_iocs(result: URLScanResult) -> dict:
    """Extract indicators of compromise from a parsed scan result.

    Every IOC list is deduplicated while keeping first-seen order, so the
    output is stable between runs (plain ``set`` ordering is not).

    Args:
        result: Parsed scan result.

    Returns:
        A dict with ``domains``, ``ips``, ``urls`` and ``hashes`` lists plus
        ``tls_fingerprint`` (the certificate subject), ``scan_uuid`` and
        ``scan_date`` (the UTC time of extraction, ISO 8601).
    """
    def _unique(values):
        # dict.fromkeys keeps insertion order while dropping duplicates.
        return list(dict.fromkeys(values))

    candidate_urls = [result.url, result.effective_url, *result.redirects]
    return {
        "domains": _unique(result.domains_contacted),
        "ips": _unique(result.ips_contacted),
        # Empty URLs (e.g. a missing effective URL) are not indicators.
        "urls": _unique(u for u in candidate_urls if u),
        "hashes": _unique(
            h["hash"] for h in result.resource_hashes if h.get("hash")
        ),
        "tls_fingerprint": result.tls_subject,
        "scan_uuid": result.uuid,
        "scan_date": datetime.now(timezone.utc).isoformat(),
    }
def check_virustotal(url: str, api_key: str = "") -> dict:
    """Look up a URL's latest analysis stats on VirusTotal (API v3).

    Args:
        url: The URL to look up.
        api_key: VirusTotal API key; falls back to ``VT_API_KEY`` when empty.

    Returns:
        A dict with ``malicious``, ``suspicious``, ``harmless`` and
        ``undetected`` counts, or an empty dict when no key is configured,
        the response is not HTTP 200, or the request fails.
    """
    # Local import: the file's top-level imports do not include base64.
    import base64

    api_key = api_key or VT_API_KEY
    if not api_key:
        return {}

    # VirusTotal v3 identifies a URL either by the SHA-256 of its
    # *canonicalised* form or by the unpadded URL-safe base64 of the URL.
    # Hashing the raw string does not reproduce VT's canonicalisation, so the
    # base64 form is the one that can be computed reliably client-side.
    url_id = base64.urlsafe_b64encode(url.encode()).decode().rstrip("=")
    headers = {"x-apikey": api_key}
    try:
        resp = requests.get(f"https://www.virustotal.com/api/v3/urls/{url_id}",
                            headers=headers, timeout=15)
        if resp.status_code == 200:
            attributes = resp.json().get("data", {}).get("attributes", {})
            stats = attributes.get("last_analysis_stats", {})
            return {
                "malicious": stats.get("malicious", 0),
                "suspicious": stats.get("suspicious", 0),
                "harmless": stats.get("harmless", 0),
                "undetected": stats.get("undetected", 0),
            }
    except Exception as exc:
        # Best-effort enrichment: report the problem but keep the {} contract.
        print(f"VirusTotal lookup failed: {exc}", file=sys.stderr)
    return {}
def format_report(result: URLScanResult) -> str:
    """Render a parsed scan result as a plain-text report.

    The report always contains the header, page info and infrastructure
    sections. The TLS, redirect (first ten shown) and risk-indicator sections
    appear only when the result has data for them.

    Args:
        result: Parsed scan result.

    Returns:
        The report as a single newline-joined string (no trailing newline).
    """
    rule = "=" * 60
    malicious_flag = "YES" if result.is_malicious else "NO"
    login_flag = "DETECTED" if result.has_login_form else "Not found"

    report = [
        rule,
        " URL ANALYSIS REPORT (URLScan.io)",
        rule,
        f" Scan UUID: {result.uuid}",
        f" Submitted URL: {result.url}",
        f" Effective URL: {result.effective_url}",
        f" Status Code: {result.status_code}",
        f" Malicious: {malicious_flag}",
        "",
        "[PAGE INFO]",
        f" Title: {result.title}",
        f" Domain: {result.domain}",
        f" IP: {result.ip}",
        f" ASN: {result.asn} ({result.asn_name})",
        f" Country: {result.country}",
        f" Server: {result.server}",
        f" Login Form: {login_flag}",
        f" Screenshot: {result.screenshot_url}",
        "",
    ]

    if result.tls_issuer:
        report += [
            "[TLS CERTIFICATE]",
            f" Issuer: {result.tls_issuer}",
            f" Subject: {result.tls_subject}",
            "",
        ]

    if result.redirects:
        report.append(f"[REDIRECTS] ({len(result.redirects)} found)")
        report.extend(f" -> {hop}" for hop in result.redirects[:10])
        report.append("")

    if result.risk_indicators:
        report.append(f"[RISK INDICATORS] ({len(result.risk_indicators)})")
        report.extend(f" - {indicator}" for indicator in result.risk_indicators)
        report.append("")

    report += [
        "[INFRASTRUCTURE]",
        f" Domains contacted: {len(result.domains_contacted)}",
        f" IPs contacted: {len(result.ips_contacted)}",
        f" Resource hashes: {len(result.resource_hashes)}",
        rule,
    ]
    return "\n".join(report)
def _render_result(result, as_json):
    """Render a parsed scan as JSON text or as the plain-text report."""
    if as_json:
        return json.dumps(asdict(result), indent=2, default=str)
    return format_report(result)


def main():
    """Command-line entry point.

    Sub-commands: ``scan`` (submit one URL or a file of URLs, optionally
    waiting for results), ``result`` (fetch and render a scan), ``search``
    (query the URLScan database) and ``ioc`` (print extracted IOCs as JSON).

    The global options ``--api-key``, ``--json`` and ``--output`` belong to
    the top-level parser and therefore go before the sub-command. With
    ``--output`` everything written to stdout is also saved to that file.
    Exits with status 1 when ``requests`` is missing or a result cannot be
    fetched, and with argparse's usage error when ``scan`` is given no URL.
    """
    parser = argparse.ArgumentParser(description="URLScan.io URL Analysis Tool")
    subparsers = parser.add_subparsers(dest="command")

    scan_parser = subparsers.add_parser("scan", help="Scan a URL")
    scan_parser.add_argument("--url", help="Single URL to scan")
    scan_parser.add_argument("--url-file", help="File with URLs (one per line)")
    scan_parser.add_argument("--visibility", default="private",
                             choices=["public", "unlisted", "private"])
    scan_parser.add_argument("--wait", action="store_true", help="Wait for results")

    result_parser = subparsers.add_parser("result", help="Get scan result")
    result_parser.add_argument("--uuid", required=True)

    search_parser = subparsers.add_parser("search", help="Search URLScan database")
    search_parser.add_argument("--query", "-q", required=True)
    search_parser.add_argument("--size", type=int, default=10)

    ioc_parser = subparsers.add_parser("ioc", help="Extract IOCs from scan")
    ioc_parser.add_argument("--uuid", required=True)

    parser.add_argument("--api-key", default=URLSCAN_API_KEY)
    parser.add_argument("--json", action="store_true")
    parser.add_argument("--output", "-o")
    args = parser.parse_args()

    if not HAS_REQUESTS:
        print("Error: 'requests' library required", file=sys.stderr)
        sys.exit(1)

    api_key = args.api_key
    captured = []  # stdout lines mirrored to --output at the end

    def emit(text):
        """Print to stdout and remember the text for --output."""
        print(text)
        captured.append(text)

    if args.command == "scan":
        if args.url:
            urls = [args.url]
        elif args.url_file:
            with open(args.url_file, encoding="utf-8") as f:
                urls = [line.strip() for line in f if line.strip()]
        else:
            # Without this check the command would silently do nothing.
            scan_parser.error("one of --url or --url-file is required")

        for index, url in enumerate(urls):
            if index:
                time.sleep(2)  # Rate limiting between submissions only
            emit(f"Scanning: {url}")
            scan_result = submit_scan(url, args.visibility, api_key)
            if "error" in scan_result:
                print(f" Error: {scan_result['error']}", file=sys.stderr)
                continue
            uuid = scan_result.get("uuid", "")
            emit(f" UUID: {uuid}")
            emit(f" Result URL: https://urlscan.io/result/{uuid}/")
            if args.wait and uuid:
                print(" Waiting for results...")
                time.sleep(10)
                raw = get_result(uuid, api_key)
                if "error" in raw:
                    # Surface the failure instead of dropping it silently.
                    print(f" Error: {raw['error']}", file=sys.stderr)
                    continue
                emit(_render_result(parse_result(raw), args.json))

    elif args.command == "result":
        raw = get_result(args.uuid, api_key)
        if "error" in raw:
            print(f"Error: {raw['error']}", file=sys.stderr)
            sys.exit(1)
        emit(_render_result(parse_result(raw), args.json))

    elif args.command == "search":
        results = search_scans(args.query, api_key, args.size)
        if not results:
            print("No results found.", file=sys.stderr)
        for r in results:
            task = r.get("task", {})
            page = r.get("page", {})
            emit(f" {task.get('time', '')} | {task.get('url', '')} | "
                 f"{page.get('domain', '')} | {page.get('ip', '')}")

    elif args.command == "ioc":
        raw = get_result(args.uuid, api_key)
        if "error" in raw:
            print(f"Error: {raw['error']}", file=sys.stderr)
            sys.exit(1)
        iocs = extract_iocs(parse_result(raw))
        emit(json.dumps(iocs, indent=2))

    else:
        parser.print_help()

    # --output used to be accepted and ignored; mirror stdout into the file.
    if args.output and captured:
        Path(args.output).write_text("\n".join(captured) + "\n", encoding="utf-8")
        print(f"Output written to {args.output}", file=sys.stderr)
if __name__ == "__main__":
main()