Files
Anthropic-Cybersecurity-Skills/skills/performing-ai-driven-osint-correlation/scripts/agent.py
T
mukul975 c47eed6a64 Production hardening: security fixes, code quality, 724 skills complete
- Fix 25 shell=True subprocess calls with list-based commands
- Fix 49 verify=False in defensive skills (env-var override)
- Add timeout to 231 HTTP/subprocess/socket calls
- Fix 6 SQL injection patterns with whitelist validation
- Replace 8 __import__() with standard imports
- Remove 701 unused imports across 442 files
- Add authorized-testing disclaimers to all offensive skills
- Complete 11 incomplete skill directories
- Expand 10 stub SKILL.md files with full content
- Fix 2 YAML parse errors in frontmatter
- Fix 5 pre-existing syntax errors
- Convert 22 hardcoded paths/ports to environment variables
- Back up 21 redundant skill pairs to .bak
- Fix 2 global declaration errors
- 724/724 skills with full folder anatomy (SKILL.md + agent.py + api-reference.md + LICENSE)
- 0 compile errors across all 724 agent.py files
2026-03-19 13:26:49 +01:00

439 lines
15 KiB
Python

#!/usr/bin/env python3
"""Agent for performing AI-driven OSINT correlation.
Collects and normalizes OSINT data from multiple sources (Sherlock,
theHarvester, SpiderFoot, breach databases), performs cross-source
entity resolution and correlation, and generates unified intelligence
profiles with confidence scoring.
"""
import argparse
import csv
import json
import os
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
# `requests` is treated as an optional dependency: when it is not installed the
# name is bound to None instead of failing at import time, so code that wants to
# use it can check `requests is None` first.
try:
    import requests
except ImportError:
    requests = None
# Confidence weight per correlation type, keyed by the name of the signal
# that links two findings. All values are fractions between 0 and 1.
CORRELATION_WEIGHTS = dict(
    exact_username_match=0.85,
    exact_email_match=0.95,
    domain_match=0.60,
    similar_username=0.45,
    same_ip_infrastructure=0.70,
    breach_email_match=0.90,
    co_registration_temporal=0.40,
)
def _parse_sherlock_csv(filepath):
    """Parse a Sherlock CSV export.

    Returns a list of findings, or ``None`` when the file does not look like
    a Sherlock CSV (its header has neither an ``exists`` nor a ``status``
    column, or the csv module rejects it) so the caller can fall back to the
    plain-text parser.
    """
    findings = []
    try:
        # newline="" is what the csv module expects for files it reads.
        with open(filepath, "r", encoding="utf-8", errors="replace",
                  newline="") as f:
            reader = csv.DictReader(f)
            if not {"exists", "status"} & set(reader.fieldnames or ()):
                return None
            for row in reader:
                # Short rows are padded with None by DictReader, so every
                # lookup falls back to "" before string methods are applied.
                status = (row.get("exists") or row.get("status") or "")
                if status.strip().lower() not in ("claimed", "true", "yes"):
                    continue
                findings.append({
                    "source": "sherlock",
                    "type": "social_profile",
                    "platform": row.get("name") or row.get("platform") or "",
                    "url": row.get("url_user") or row.get("url") or "",
                    "username": row.get("username") or "",
                    "collected_at": datetime.now(timezone.utc).isoformat(),
                })
    except csv.Error:
        return None
    return findings


def _parse_sherlock_text(filepath):
    """Parse Sherlock plain-text output: one finding per ``[+]``/URL line."""
    findings = []
    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line.startswith(("[+]", "http")):
                continue
            url_match = re.search(r"(https?://\S+)", line)
            if not url_match:
                continue
            url = url_match.group(1)
            # First label of the host with any "www." removed, e.g.
            # "https://www.github.com/bob" -> "github".
            platform = url.split("/")[2].replace("www.", "").split(".")[0]
            findings.append({
                "source": "sherlock",
                "type": "social_profile",
                "platform": platform,
                "url": url,
                "collected_at": datetime.now(timezone.utc).isoformat(),
            })
    return findings


def load_sherlock_results(filepath):
    """Load and normalize Sherlock username enumeration results.

    Two input formats are accepted:

    * CSV with columns ``username, name, url_user, exists, http_status``
      (``status``, ``platform`` and ``url`` are accepted as alternative
      column names). Only rows whose status is ``claimed``, ``true`` or
      ``yes`` (case-insensitive) are kept.
    * Plain-text output, where every line starting with ``[+]`` or ``http``
      that contains a URL becomes a finding.

    The format is chosen from the CSV header rather than by waiting for a
    parse error, because plain text is usually still readable as CSV and
    would otherwise yield no findings at all.

    Args:
        filepath: Path to the Sherlock output file.

    Returns:
        A list of ``social_profile`` finding dicts; empty when the file does
        not exist or contains no usable rows.
    """
    if not os.path.isfile(filepath):
        return []
    findings = _parse_sherlock_csv(filepath)
    if findings is None:
        findings = _parse_sherlock_text(filepath)
    return findings
def load_harvester_results(filepath):
    """Load and normalize theHarvester results.

    Reads a JSON object and turns the entries of its ``emails``, ``hosts``
    and ``ips`` lists into findings of type ``email``, ``hostname`` and
    ``ip_address`` respectively, in that order.

    Args:
        filepath: Path to theHarvester JSON output.

    Returns:
        A list of finding dicts. Empty when the file is missing, is not
        valid JSON, or its top level is not a JSON object. Sections that are
        absent, null or not lists are skipped.
    """
    findings = []
    if not os.path.isfile(filepath):
        return findings
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, ValueError) as exc:
        print(f"[!] Could not parse {filepath} as JSON: {exc}", file=sys.stderr)
        return findings
    if not isinstance(data, dict):
        return findings

    # One timestamp for the whole file: all entries were loaded together.
    collected_at = datetime.now(timezone.utc).isoformat()
    sections = (
        ("emails", "email"),
        ("hosts", "hostname"),
        ("ips", "ip_address"),
    )
    for key, finding_type in sections:
        values = data.get(key)
        if not isinstance(values, list):
            continue
        for value in values:
            findings.append({
                "source": "theHarvester",
                "type": finding_type,
                "value": value,
                "collected_at": collected_at,
            })
    return findings
def load_spiderfoot_results(filepath):
    """Load and normalize SpiderFoot scan results.

    Accepts either a JSON list of result objects or a JSON object holding
    that list under ``results``. For each result object the ``type``,
    ``data`` (falling back to ``value``) and ``module`` fields are copied
    into a finding.

    Args:
        filepath: Path to the SpiderFoot JSON export.

    Returns:
        A list of finding dicts. Empty when the file is missing, is not
        valid JSON, or has an unexpected top-level shape. Entries that are
        not JSON objects are skipped.
    """
    findings = []
    if not os.path.isfile(filepath):
        return findings
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, ValueError) as exc:
        print(f"[!] Could not parse {filepath} as JSON: {exc}", file=sys.stderr)
        return findings

    if isinstance(data, dict):
        data = data.get("results", [])
    if not isinstance(data, list):
        return findings

    collected_at = datetime.now(timezone.utc).isoformat()
    for item in data:
        if not isinstance(item, dict):
            continue
        findings.append({
            "source": "spiderfoot",
            "type": item.get("type", "unknown"),
            "value": item.get("data", item.get("value", "")),
            "module": item.get("module", ""),
            "collected_at": collected_at,
        })
    return findings
def load_breach_results(filepath):
    """Load and normalize breach/HIBP results.

    Accepts a JSON list of breach objects or a single breach object. The
    name, date and data classes are read from the ``Name``/``BreachDate``/
    ``DataClasses`` keys, falling back to ``name``/``date``/``data_types``.

    Note that the resulting findings carry no username or email of their
    own; only the breach metadata listed above is copied.

    Args:
        filepath: Path to the breach JSON file.

    Returns:
        A list of ``breach_exposure`` finding dicts. Empty when the file is
        missing or is not valid JSON. Entries that are not JSON objects are
        skipped.
    """
    findings = []
    if not os.path.isfile(filepath):
        return findings
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, ValueError) as exc:
        print(f"[!] Could not parse {filepath} as JSON: {exc}", file=sys.stderr)
        return findings

    breaches = data if isinstance(data, list) else [data]
    collected_at = datetime.now(timezone.utc).isoformat()
    for breach in breaches:
        # A bare scalar/null (or such an element inside the list) has no
        # fields to read, so it is ignored instead of raising.
        if not isinstance(breach, dict):
            continue
        findings.append({
            "source": "breach_database",
            "type": "breach_exposure",
            "breach_name": breach.get("Name", breach.get("name", "")),
            "breach_date": breach.get("BreachDate", breach.get("date", "")),
            "data_classes": breach.get("DataClasses", breach.get("data_types", [])),
            "collected_at": collected_at,
        })
    return findings
def _load_generic_findings(filepath):
    """Read pre-normalized findings: a JSON list, or an object with ``findings``."""
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, ValueError) as exc:
        print(f"[!] Could not parse {filepath} as JSON: {exc}", file=sys.stderr)
        return []
    if isinstance(data, dict):
        data = data.get("findings")
    if not isinstance(data, list):
        return []
    # Later stages call .get() on every finding, so anything that is not a
    # JSON object is dropped here rather than crashing them.
    return [item for item in data if isinstance(item, dict)]


def normalize_all_sources(source_files):
    """Load and combine findings from all OSINT sources.

    Args:
        source_files: Mapping of source type (``sherlock``, ``harvester``,
            ``spiderfoot``, ``breach`` or ``generic``) to a file path. Entries
            whose path is empty/None or does not point to an existing file
            are skipped, as are unrecognized source types.

    Returns:
        A single list with the findings of every source, in the iteration
        order of ``source_files``.
    """
    all_findings = []
    for source_type, filepath in source_files.items():
        if not filepath or not os.path.isfile(filepath):
            continue
        if source_type == "generic":
            all_findings.extend(_load_generic_findings(filepath))
        elif source_type == "sherlock":
            all_findings.extend(load_sherlock_results(filepath))
        elif source_type == "harvester":
            all_findings.extend(load_harvester_results(filepath))
        elif source_type == "spiderfoot":
            all_findings.extend(load_spiderfoot_results(filepath))
        elif source_type == "breach":
            all_findings.extend(load_breach_results(filepath))
    return all_findings
def extract_identifiers(findings):
    """Extract unique identifiers (usernames, emails, IPs, domains) from findings.

    For every finding:

    * a ``username`` is recorded lower-cased, whatever the finding type;
    * a ``url`` is recorded as-is;
    * an ``email`` value is recorded lower-cased, and the part after the
      last ``@`` is recorded as a domain;
    * a ``hostname`` value is recorded lower-cased as a domain;
    * an ``ip_address`` value is recorded as-is.

    Values that are empty or not strings are ignored, so malformed records
    cannot break lower-casing or the final sort.

    Args:
        findings: Iterable of finding dicts.

    Returns:
        Dict with keys ``usernames``, ``emails``, ``domains``,
        ``ip_addresses`` and ``urls``, each a sorted list of unique strings.
    """
    identifiers = {
        "usernames": set(),
        "emails": set(),
        "domains": set(),
        "ip_addresses": set(),
        "urls": set(),
    }
    for finding in findings:
        username = finding.get("username", "")
        if isinstance(username, str) and username:
            identifiers["usernames"].add(username.lower())

        url = finding.get("url", "")
        if isinstance(url, str) and url:
            identifiers["urls"].add(url)

        value = finding.get("value", "")
        if not isinstance(value, str) or not value:
            continue

        finding_type = finding.get("type", "")
        if finding_type == "email":
            identifiers["emails"].add(value.lower())
            domain = value.split("@")[-1] if "@" in value else ""
            if domain:
                identifiers["domains"].add(domain.lower())
        elif finding_type == "hostname":
            identifiers["domains"].add(value.lower())
        elif finding_type == "ip_address":
            identifiers["ip_addresses"].add(value)
    return {name: sorted(values) for name, values in identifiers.items()}
def correlate_findings(findings):
    """Perform cross-source correlation to identify linked entities.

    Findings are grouped under these keys:

    * ``user:<username>`` for a finding with a ``username`` (lower-cased);
    * ``email:<address>`` for a finding of type ``email`` (lower-cased);
    * ``user:<local part>`` for the part of that address before the ``@``,
      which links an address to a matching username.

    A finding with neither a username nor an email value joins no group and
    therefore appears in no entity. A finding is added to a given group at
    most once, even when its username equals its email local part.

    Each group becomes one entity. Confidence starts at 0.5; with more than
    one distinct source it becomes ``0.5 + 0.15 * sources`` capped at 0.95,
    and more than three findings carrying a platform add 0.1 (capped at
    0.98). Risk is ``high`` when the group contains a ``breach_exposure``
    finding, ``medium`` with three or more sources, else ``low``.

    Args:
        findings: Iterable of finding dicts.

    Returns:
        List of entity dicts sorted by confidence, highest first; entities
        with equal confidence keep the order in which their group was first
        seen.
    """
    groups = defaultdict(list)
    for finding in findings:
        username = finding.get("username")
        username = username.lower() if isinstance(username, str) else ""
        value = finding.get("value")
        is_email = finding.get("type") == "email" and isinstance(value, str)
        email = value.lower() if is_email else ""

        keys = []
        if username:
            keys.append(f"user:{username}")
        if email:
            keys.append(f"email:{email}")
            local_part = email.split("@")[0] if "@" in email else ""
            if local_part:
                keys.append(f"user:{local_part}")
        # dict.fromkeys de-duplicates while keeping first-seen order.
        for key in dict.fromkeys(keys):
            groups[key].append(finding)

    entities = []
    for key, members in groups.items():
        sources_seen = {str(m.get("source") or "") for m in members}
        platform_count = sum(1 for m in members if m.get("platform"))

        # Confidence grows with cross-source corroboration.
        confidence = 0.5
        if len(sources_seen) > 1:
            confidence = min(0.95, 0.5 + 0.15 * len(sources_seen))
        if platform_count > 3:
            confidence = min(0.98, confidence + 0.1)

        # Only the first ":" separates the kind from the identifier, so an
        # identifier that itself contains ":" is kept intact.
        identifier_type, _, identifier = key.partition(":")
        entity = {
            "identifier": identifier,
            "identifier_type": identifier_type,
            "confidence": round(confidence, 2),
            "sources": sorted(sources_seen),
            "source_count": len(sources_seen),
            "linked_accounts": [
                {
                    "source": m.get("source", ""),
                    "platform": m.get("platform", ""),
                    "url": m.get("url", ""),
                    "type": m.get("type", ""),
                    "value": m.get("value", m.get("username", "")),
                }
                for m in members
            ],
            "flags": [],
        }

        breach_count = sum(1 for m in members if m.get("type") == "breach_exposure")
        if breach_count:
            entity["flags"].append(f"Exposed in {breach_count} breach(es)")
            entity["risk_level"] = "high"
        elif len(sources_seen) >= 3:
            entity["risk_level"] = "medium"
        else:
            entity["risk_level"] = "low"
        entities.append(entity)

    entities.sort(key=lambda e: e["confidence"], reverse=True)
    return entities
def generate_report(findings, entities, target="unknown"):
    """Assemble the structured OSINT correlation report.

    Args:
        findings: Normalized finding dicts.
        entities: Entity dicts, each optionally carrying a ``risk_level``.
        target: Label stored in the report metadata.

    Returns:
        Dict with ``meta``, ``identifiers``, ``entities`` and
        ``risk_summary`` sections.
    """
    def count_risk(level):
        # Number of entities tagged with the given risk level.
        return sum(1 for entity in entities if entity.get("risk_level") == level)

    sources_used = sorted({finding.get("source", "") for finding in findings})
    identifier_summary = extract_identifiers(findings)

    meta = {
        "target": target,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "sources_used": sources_used,
        "total_findings": len(findings),
        "total_entities": len(entities),
    }
    risk_summary = {
        "high_risk": count_risk("high"),
        "medium_risk": count_risk("medium"),
        "low_risk": count_risk("low"),
    }
    return {
        "meta": meta,
        "identifiers": identifier_summary,
        "entities": entities,
        "risk_summary": risk_summary,
    }
def _md_cell(value):
    """Render *value* as text that stays inside one Markdown table cell."""
    text = str(value)
    # An unescaped "|" starts a new column and a line break ends the row.
    return text.replace("|", "\\|").replace("\r", " ").replace("\n", " ")


def generate_markdown_report(report, output_path):
    """Generate a Markdown intelligence profile from the report.

    Writes a header with the report metadata, a risk summary table and one
    section per entity (at most the first 50) listing its linked accounts
    and any flags. Table cell contents are escaped so that ``|`` characters
    or line breaks in collected data cannot break the table layout.

    Args:
        report: Report dict with optional ``meta``, ``risk_summary`` and
            ``entities`` sections; missing sections fall back to defaults.
        output_path: Destination file, written as UTF-8 and overwritten if
            it exists.

    Returns:
        None. A confirmation line is printed to stdout.
    """
    meta = report.get("meta", {})
    risk = report.get("risk_summary", {})

    parts = [
        "# OSINT Correlation Report\n\n",
        f"**Target:** {meta.get('target', 'N/A')}\n",
        f"**Generated:** {meta.get('generated_at', '')}\n",
        f"**Sources:** {', '.join(meta.get('sources_used', []))}\n",
        f"**Total Findings:** {meta.get('total_findings', 0)}\n",
        f"**Entities Identified:** {meta.get('total_entities', 0)}\n\n",
        "## Risk Summary\n\n",
        "| Risk Level | Count |\n|-----------|-------|\n",
        f"| High | {risk.get('high_risk', 0)} |\n",
        f"| Medium | {risk.get('medium_risk', 0)} |\n",
        f"| Low | {risk.get('low_risk', 0)} |\n\n",
        "## Entity Profiles\n\n",
    ]

    for entity in report.get("entities", [])[:50]:
        eid = entity.get("identifier", "Unknown")
        conf = entity.get("confidence", 0)
        risk_level = entity.get("risk_level", "N/A")
        parts.append(f"### {eid} (Confidence: {conf:.0%}, Risk: {risk_level})\n\n")
        parts.append("| Source | Platform | Value |\n|--------|----------|-------|\n")
        for link in entity.get("linked_accounts", []):
            parts.append(
                f"| {_md_cell(link.get('source', ''))} "
                f"| {_md_cell(link.get('platform', ''))} "
                f"| {_md_cell(link.get('value', ''))} |\n"
            )
        for flag in entity.get("flags", []):
            parts.append(f"\n- WARNING: {flag}\n")
        parts.append("\n")

    with open(output_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    print(f"[*] Markdown report saved to {output_path}")
def _write_or_print(text, path, label):
    """Write *text* to *path* as UTF-8, or print it to stdout when no path is given."""
    if path:
        with open(path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"[*] {label} saved to {path}")
    else:
        print(text)


def main():
    """Command-line entry point.

    Parses the arguments, loads every source file that was supplied, and
    then either dumps the normalized findings (``--normalize-only``) or runs
    the correlation and emits the JSON report, plus a Markdown report when
    ``--markdown`` is given. JSON goes to ``--output`` when set, otherwise to
    stdout. With no source file at all, the help text is printed instead.
    """
    parser = argparse.ArgumentParser(
        description="AI-Driven OSINT Correlation Agent"
    )
    parser.add_argument("--target", default="unknown",
                        help="Target identifier (domain, username, etc.)")
    parser.add_argument("--sherlock", help="Sherlock results file (CSV or text)")
    parser.add_argument("--harvester", help="theHarvester results file (JSON)")
    parser.add_argument("--spiderfoot", help="SpiderFoot results file (JSON)")
    parser.add_argument("--breach", help="Breach/HIBP results file (JSON)")
    parser.add_argument("--generic", help="Generic normalized findings JSON")
    parser.add_argument("--normalize-only", action="store_true",
                        help="Only normalize data, skip correlation")
    parser.add_argument("--markdown", help="Output Markdown report path")
    parser.add_argument("--output", "-o", help="Output JSON report path")
    args = parser.parse_args()

    print("[*] AI-Driven OSINT Correlation Agent")
    source_files = {
        "sherlock": args.sherlock,
        "harvester": args.harvester,
        "spiderfoot": args.spiderfoot,
        "breach": args.breach,
        "generic": args.generic,
    }
    active_sources = {k: v for k, v in source_files.items() if v}
    if not active_sources:
        parser.print_help()
        print("\n[!] Provide at least one data source file.")
        return

    print(f"[*] Loading data from {len(active_sources)} source(s): "
          f"{', '.join(active_sources.keys())}")
    # Paths that do not exist are skipped by the loaders; say so, otherwise a
    # typo in a path just looks like an empty result.
    for name, path in active_sources.items():
        if not os.path.isfile(path):
            print(f"[!] {name} file not found, skipping: {path}", file=sys.stderr)

    findings = normalize_all_sources(source_files)
    print(f"[*] Normalized {len(findings)} findings")

    if args.normalize_only:
        _write_or_print(json.dumps(findings, indent=2), args.output,
                        "Normalized findings")
        return

    print("[*] Performing cross-source correlation...")
    entities = correlate_findings(findings)
    print(f"[*] Identified {len(entities)} entities")
    report = generate_report(findings, entities, target=args.target)
    _write_or_print(json.dumps(report, indent=2), args.output, "JSON report")
    if args.markdown:
        generate_markdown_report(report, args.markdown)


if __name__ == "__main__":
    main()