Files
T

605 lines
21 KiB
Python

#!/usr/bin/env python3
"""
OSINT Gathering Automation Tool
Performs automated open source intelligence collection including:
- Subdomain enumeration via Certificate Transparency logs
- DNS record collection
- WHOIS information gathering
- Technology fingerprinting
- Google dorking query generation
- Email pattern discovery
- Shodan/Censys integration (listed as a goal; no Shodan/Censys module is implemented in this file)
Usage:
python process.py --domain targetdomain.com --output ./osint_report
python process.py --domain targetdomain.com --modules all
python process.py --domain targetdomain.com --modules dns,subdomains,emails
Requirements:
pip install requests dnspython python-whois beautifulsoup4 rich
"""
import argparse
import json
import re
import socket
import sys
from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import urlparse
try:
import dns.resolver
import requests
from bs4 import BeautifulSoup
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn
except ImportError:
print("[!] Missing dependencies. Install with:")
print(" pip install requests dnspython beautifulsoup4 rich")
sys.exit(1)
# Shared rich console used for every status, warning and summary message.
console = Console()

# One HTTP session reused by all modules so they share connections and headers.
SESSION = requests.Session()
SESSION.headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
)
def resolve_dns_records(domain: str) -> dict:
    """Collect DNS records for a domain.

    Each record type is queried independently; a failure for one type never
    prevents the remaining types from being queried.

    Args:
        domain: Domain name to query.

    Returns:
        Mapping of record type (e.g. ``"A"``, ``"MX"``) to the list of string
        representations of the returned records. Record types that yielded no
        answer, or whose lookup failed, are omitted.
    """
    record_types = ("A", "AAAA", "MX", "NS", "TXT", "SOA", "CNAME", "SRV")
    # Outcomes that simply mean "nothing to report" for this record type.
    expected_misses = (
        dns.resolver.NoAnswer,
        dns.resolver.NXDOMAIN,
        dns.resolver.NoNameservers,
    )

    records: dict[str, list[str]] = {}
    for rtype in record_types:
        try:
            answers = dns.resolver.resolve(domain, rtype)
            records[rtype] = [str(rdata) for rdata in answers]
        except expected_misses:
            continue
        except Exception as e:
            # Still best-effort, but surface anything unexpected (timeouts,
            # malformed names, ...) so it is not mistaken for "no records".
            console.print(f"[yellow][!] DNS {rtype} lookup failed: {e}[/yellow]")
    return records
def enumerate_subdomains_ct(domain: str) -> list[str]:
    """Enumerate subdomains via crt.sh plus a small common-prefix DNS probe.

    Two sources are combined:

    1. Certificate Transparency entries returned by crt.sh for ``%.<domain>``.
       Wildcard names (starting with ``*``) are skipped.
    2. A fixed list of common host prefixes; a prefix is kept when its
       ``A`` record lookup succeeds.

    Args:
        domain: Target domain. It is lower-cased and stripped of surrounding
            whitespace and a trailing dot before use, because certificate
            names are compared in lower case.

    Returns:
        Sorted list of unique host names (the domain itself may be included
        when it appears in a certificate).
    """
    target = domain.strip().lower().rstrip(".")
    suffix = f".{target}"
    subdomains: set[str] = set()

    try:
        # Passing the query through ``params`` lets requests percent-encode
        # the ``%`` wildcard instead of relying on URL repair.
        response = SESSION.get(
            "https://crt.sh/",
            params={"q": f"%.{target}", "output": "json"},
            timeout=30,
        )
        if response.status_code == 200:
            for entry in response.json():
                # One certificate entry may list several newline-separated names.
                for raw_name in entry.get("name_value", "").split("\n"):
                    name = raw_name.strip().lower()
                    if name.startswith("*"):
                        continue  # skip wildcard entries
                    if name == target or name.endswith(suffix):
                        subdomains.add(name)
        else:
            console.print(
                f"[yellow][!] crt.sh returned HTTP {response.status_code}[/yellow]"
            )
    except Exception as e:
        console.print(f"[yellow][!] crt.sh query failed: {e}[/yellow]")

    # Also try common subdomain prefixes.
    # NOTE(review): if the zone has a wildcard DNS record every prefix will
    # resolve, so these hits are candidates rather than confirmed hosts.
    common_prefixes = [
        "www", "mail", "ftp", "smtp", "pop", "imap", "webmail",
        "vpn", "remote", "portal", "admin", "dev", "staging",
        "test", "api", "app", "blog", "shop", "store", "cdn",
        "ns1", "ns2", "dns", "mx", "exchange", "owa", "autodiscover",
        "sso", "login", "auth", "git", "gitlab", "jenkins",
        "jira", "confluence", "wiki", "docs", "support", "help",
    ]
    for prefix in common_prefixes:
        candidate = f"{prefix}.{target}"
        try:
            dns.resolver.resolve(candidate, "A")
        except Exception:
            # Any failure (NXDOMAIN, timeout, ...) just means "not confirmed".
            continue
        subdomains.add(candidate)

    return sorted(subdomains)
def perform_whois_lookup(domain: str) -> dict:
    """Fetch WHOIS registration details for ``domain``.

    The ``whois`` module is imported lazily inside the ``try`` so that a
    missing package is reported like any other lookup failure.

    Returns:
        Dict with the keys ``domain_name``, ``registrar``, ``creation_date``,
        ``expiration_date``, ``name_servers``, ``registrant_org``,
        ``registrant_country`` and ``emails``. Scalar fields are stringified
        and fall back to ``"N/A"``; list fields fall back to ``[]``.
        An empty dict is returned when the lookup fails for any reason.
    """

    def text_or_na(value: object) -> str:
        # Falsy values (None, empty string/list) are reported as "N/A".
        return str(value) if value else "N/A"

    try:
        import whois as python_whois

        record = python_whois.whois(domain)
        return {
            "domain_name": text_or_na(record.domain_name),
            "registrar": text_or_na(record.registrar),
            "creation_date": text_or_na(record.creation_date),
            "expiration_date": text_or_na(record.expiration_date),
            "name_servers": record.name_servers or [],
            "registrant_org": text_or_na(record.org),
            "registrant_country": text_or_na(record.country),
            "emails": record.emails or [],
        }
    except Exception as err:
        console.print(f"[yellow][!] WHOIS lookup failed: {err}[/yellow]")
        return {}
def discover_email_format(domain: str) -> dict:
    """Collect candidate email formats and addresses published on the site.

    No third-party email service is queried: the function lists common
    address formats and scrapes the home page plus a few well-known pages
    (``/contact``, ``/about``, ...) over HTTPS for ``...@<domain>`` addresses.

    Args:
        domain: Target domain whose addresses are searched for.

    Returns:
        Dict with:
        - ``domain``: the input domain,
        - ``format_guess``: common address format templates for the domain,
        - ``discovered_emails``: sorted, de-duplicated addresses found.
    """
    # Common email format patterns
    common_formats = [
        "{first}.{last}@" + domain,
        "{first}{last}@" + domain,
        "{f}{last}@" + domain,
        "{first}_{last}@" + domain,
        "{first}@" + domain,
        "{last}@" + domain,
    ]

    # Compiled once for all pages. The trailing lookahead rejects addresses
    # whose domain merely *starts* with the target, e.g.
    # ``a@target.com.evil.org`` or ``a@target.community``, while still
    # accepting an address followed by sentence punctuation.
    email_pattern = re.compile(
        rf"[a-zA-Z0-9._%+-]+@{re.escape(domain)}(?![a-zA-Z0-9-]|\.[a-zA-Z0-9])",
        re.IGNORECASE,
    )

    # "" is the home page; the rest are pages that commonly list contacts.
    pages = ("", "/contact", "/about", "/team", "/about-us", "/contact-us")

    discovered: set[str] = set()
    for page in pages:
        try:
            response = SESSION.get(f"https://{domain}{page}", timeout=10)
        except requests.RequestException:
            # Best-effort scrape: an unreachable page is simply skipped.
            continue
        if response.status_code == 200:
            discovered.update(email_pattern.findall(response.text))

    return {
        "domain": domain,
        "format_guess": common_formats,
        # Sorted so reports and JSON dumps are stable between runs.
        "discovered_emails": sorted(discovered),
    }
def fingerprint_web_technologies(domain: str) -> dict:
    """Identify web technologies from one HTTPS response for ``domain``.

    A single ``GET https://<domain>`` (following redirects) is inspected for
    server banners, security headers, cookies, CDN-specific headers and
    CMS/framework markers in the HTML. All detections are heuristics.

    Args:
        domain: Host to request.

    Returns:
        Dict with the keys ``server``, ``powered_by``, ``frameworks``,
        ``security_headers``, ``cookies``, ``cdn`` and ``cms``. If the request
        fails, a warning is printed and the fields keep their initial empty
        values (``None`` / empty containers).
    """
    tech = {
        "server": None,
        "powered_by": None,
        "frameworks": [],
        "security_headers": {},
        "cookies": [],
        "cdn": None,
        "cms": None,
    }
    security_headers = (
        "Strict-Transport-Security",
        "Content-Security-Policy",
        "X-Frame-Options",
        "X-Content-Type-Options",
        "X-XSS-Protection",
        "Referrer-Policy",
        "Permissions-Policy",
        "Cross-Origin-Opener-Policy",
        "Cross-Origin-Resource-Policy",
    )
    cdn_indicators = {
        "cloudflare": ["cf-ray", "cf-cache-status"],
        "akamai": ["x-akamai-transformed"],
        "cloudfront": ["x-amz-cf-id", "x-amz-cf-pop"],
        "fastly": ["x-served-by", "x-fastly-request-id"],
        "incapsula": ["x-iinfo"],
    }
    # Checked in order; the first CMS whose marker appears wins.
    cms_markers = (
        ("WordPress", ("wp-content", "wordpress")),
        ("Drupal", ("drupal",)),
        ("Joomla", ("joomla",)),
        ("Shopify", ("shopify",)),
    )
    # Word-boundary patterns (matched against lower-cased HTML) so that e.g.
    # "padding-" no longer counts as Angular's "ng-", nor "revue" as Vue.
    framework_patterns = (
        ("React", r"\breact(?:dom|js|root|id)?\b"),
        ("Angular", r"\bangular(?:js)?\b|\bng-[a-z]"),
        ("Vue.js", r"\bvue(?:js)?\b"),
        ("jQuery", r"jquery"),
        ("Bootstrap", r"bootstrap"),
    )

    try:
        response = SESSION.get(f"https://{domain}", timeout=10, allow_redirects=True)
        headers = response.headers

        # Server identification
        tech["server"] = headers.get("Server", "Not disclosed")
        tech["powered_by"] = headers.get("X-Powered-By", "Not disclosed")

        # Security headers
        for header in security_headers:
            value = headers.get(header)
            tech["security_headers"][header] = value if value else "MISSING"

        # Cookie analysis
        for cookie in response.cookies:
            # Non-standard cookie attributes keep the spelling the server
            # sent (usually "HttpOnly"), so compare case-insensitively.
            extra_attrs = getattr(cookie, "_rest", None) or {}
            tech["cookies"].append(
                {
                    "name": cookie.name,
                    "secure": cookie.secure,
                    "httponly": any(
                        str(attr).lower() == "httponly" for attr in extra_attrs
                    ),
                    "domain": cookie.domain,
                }
            )

        # CDN detection: response headers support case-insensitive membership
        # tests; stop at the first CDN with a matching header.
        for cdn, indicators in cdn_indicators.items():
            if any(indicator in headers for indicator in indicators):
                tech["cdn"] = cdn
                break

        html = response.text.lower()

        # CMS detection from HTML
        for cms, markers in cms_markers:
            if any(marker in html for marker in markers):
                tech["cms"] = cms
                break

        # Framework detection
        tech["frameworks"].extend(
            name for name, pattern in framework_patterns if re.search(pattern, html)
        )
    except Exception as e:
        console.print(f"[yellow][!] Web fingerprinting failed: {e}[/yellow]")
    return tech
def generate_google_dorks(domain: str) -> list[str]:
    """Build the list of search-engine dork queries for ``domain``.

    Queries come in three groups, emitted in this order: ``site:<domain>``
    scoped queries, quoted-domain queries against code/paste sites, and
    cloud-storage queries using the first label of the domain.

    Args:
        domain: Target domain, e.g. ``"example.com"``.

    Returns:
        The query strings, in a fixed order.
    """
    site = f"site:{domain}"
    quoted = f'"{domain}"'
    first_label = domain.split(".")[0]

    file_types = (
        "pdf", "xlsx", "docx", "csv", "sql",
        "log", "bak", "conf", "env", "xml",
    )
    site_clauses = (
        # Configuration and credentials
        "inurl:admin",
        "inurl:login",
        "inurl:wp-admin",
        "inurl:wp-login",
        'intitle:"index of"',
        'intitle:"dashboard"',
        "inurl:config",
        "inurl:setup",
        # Error messages
        '"error" "sql syntax"',
        '"php error" "on line"',
        '"ORA-" "error"',
        '"mysql" "error"',
        '"stack trace" "at"',
        # Sensitive information
        '"confidential"',
        '"internal use only"',
        '"not for distribution"',
        '"password" filetype:txt',
        '"api_key" OR "apikey" OR "api-key"',
        # Infrastructure
        "inurl:vpn",
        "inurl:remote",
        "inurl:portal",
        "inurl:citrix",
        "inurl:owa",
    )
    leak_clauses = (
        "password site:github.com",
        "api_key site:github.com",
        "secret site:github.com",
        "token site:github.com",
        "site:pastebin.com",
    )
    storage_hosts = (
        "s3.amazonaws.com",
        "blob.core.windows.net",
        "storage.googleapis.com",
    )

    # Sensitive files first, then the remaining site-scoped queries.
    queries = [f"{site} filetype:{ext}" for ext in file_types]
    queries.extend(f"{site} {clause}" for clause in site_clauses)
    # GitHub / paste-site leaks
    queries.extend(f"{quoted} {clause}" for clause in leak_clauses)
    # Cloud storage
    queries.extend(f'site:{host} "{first_label}"' for host in storage_hosts)
    return queries
def check_security_txt(domain: str) -> dict | None:
    """Look for a security.txt file (RFC 9116) on ``domain``.

    The ``/.well-known/`` location is tried first, then the site root. A
    candidate counts only when it returns HTTP 200 and its body mentions
    "contact" (case-insensitive).

    Returns:
        ``{"url": ..., "content": ...}`` for the first matching location, with
        the content truncated to 2000 characters, or ``None`` if none matches.
    """
    candidates = (
        f"https://{domain}/.well-known/security.txt",
        f"https://{domain}/security.txt",
    )
    for candidate in candidates:
        try:
            reply = SESSION.get(candidate, timeout=10)
            if reply.status_code != 200:
                continue
            if "contact" not in reply.text.lower():
                continue
            return {
                "url": candidate,
                "content": reply.text[:2000],
            }
        except Exception:
            # Best-effort probe: move on to the next candidate location.
            continue
    return None
def check_robots_txt(domain: str) -> dict | None:
    """Fetch robots.txt over HTTPS and extract its ``Disallow`` paths.

    Returns:
        ``{"content": ..., "disallowed_paths": [...]}`` when the file is
        served with HTTP 200. The content is truncated to 2000 characters and
        the path list omits empty values and the bare ``/``. ``None`` is
        returned for any other status or when the request fails.
    """
    try:
        reply = SESSION.get(f"https://{domain}/robots.txt", timeout=10)
        if reply.status_code != 200:
            return None
        body = reply.text
        # Value part of every "Disallow:" directive (directive match is
        # case-insensitive and ignores surrounding whitespace).
        rule_values = (
            line.split(":", 1)[1].strip()
            for line in body.split("\n")
            if line.strip().lower().startswith("disallow:")
        )
        interesting = [path for path in rule_values if path and path != "/"]
        return {
            "content": body[:2000],
            "disallowed_paths": interesting,
        }
    except Exception:
        return None
def generate_report(domain: str, results: dict, output_dir: Path):
    """Write the Markdown report and the raw JSON dump for one target.

    Two files are created in ``output_dir`` (which is created if missing):
    ``osint_report_<domain>.md`` and ``osint_data_<domain>.json``. Characters
    in ``domain`` other than word characters, ``.`` and ``-`` are replaced by
    ``_`` in the file names so the name cannot contain path separators.

    While rendering the subdomain section, every subdomain is resolved with
    ``socket.gethostbyname`` so its address can be listed.

    Args:
        domain: Target domain, used in headings and file names.
        results: Module outputs keyed by ``"whois"``, ``"dns"``,
            ``"subdomains"``, ``"technology"``, ``"emails"``, ``"dorks"``,
            ``"security_txt"`` and ``"robots_txt"``. Missing or ``None``
            entries are rendered as empty sections.
        output_dir: Directory receiving both files.
    """
    max_dorks_in_report = 20  # the JSON dump still contains every query

    output_dir.mkdir(parents=True, exist_ok=True)
    safe_name = re.sub(r"[^\w.-]", "_", domain)

    # Fragments are collected in a list and joined once at the end.
    parts = [
        f"# OSINT Report: {domain}\n",
        f"## Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
        "---\n",
        "## 1. Domain Information\n",
        "### WHOIS Data\n",
    ]

    whois_data = results.get("whois") or {}
    if whois_data:
        parts.extend(f"- **{key}**: {value}\n" for key, value in whois_data.items())
    else:
        parts.append("WHOIS data not available.\n")

    parts.append("\n### DNS Records\n\n")
    for rtype, records in (results.get("dns") or {}).items():
        parts.append(f"#### {rtype} Records\n")
        parts.extend(f"- `{record}`\n" for record in records)
        parts.append("\n")

    subdomains = results.get("subdomains") or []
    parts.append("## 2. Subdomain Enumeration\n\n")
    parts.append(f"**Total subdomains discovered:** {len(subdomains)}\n\n")
    for sub in subdomains:
        try:
            ip = socket.gethostbyname(sub)
        except (OSError, UnicodeError):
            # OSError covers resolver failures; UnicodeError covers names
            # that cannot be encoded for the lookup.
            parts.append(f"- `{sub}` -> [unresolvable]\n")
        else:
            parts.append(f"- `{sub}` -> `{ip}`\n")

    parts.append("\n## 3. Web Technology Fingerprint\n\n")
    tech = results.get("technology") or {}
    if tech:
        # The fingerprint dict always contains these keys, possibly with a
        # None value, so ``or`` (not a ``.get`` default) supplies the fallback.
        frameworks = ", ".join(tech.get("frameworks") or []) or "None detected"
        parts.append(f"- **Server:** {tech.get('server') or 'N/A'}\n")
        parts.append(f"- **Powered By:** {tech.get('powered_by') or 'N/A'}\n")
        parts.append(f"- **CMS:** {tech.get('cms') or 'None detected'}\n")
        parts.append(f"- **CDN:** {tech.get('cdn') or 'None detected'}\n")
        parts.append(f"- **Frameworks:** {frameworks}\n\n")
        parts.append("### Security Headers\n\n")
        parts.append("| Header | Status |\n|--------|--------|\n")
        for header, value in (tech.get("security_headers") or {}).items():
            status = "MISSING" if value == "MISSING" else "Present"
            parts.append(f"| {header} | {status} |\n")

    parts.append("\n## 4. Email Intelligence\n\n")
    email_data = results.get("emails") or {}
    if email_data:
        parts.append("### Discovered Emails\n")
        parts.extend(
            f"- `{email}`\n" for email in email_data.get("discovered_emails", [])
        )
        parts.append("\n### Likely Email Formats\n")
        parts.extend(f"- `{fmt}`\n" for fmt in email_data.get("format_guess", []))

    parts.append("\n## 5. Google Dorking Queries\n\n")
    dorks = results.get("dorks") or []
    parts.extend(f"- `{dork}`\n" for dork in dorks[:max_dorks_in_report])

    parts.append("\n## 6. Additional Findings\n\n")
    security_txt = results.get("security_txt")
    if security_txt:
        parts.append(f"### security.txt Found\n- URL: {security_txt['url']}\n\n")
    robots_txt = results.get("robots_txt")
    if robots_txt:
        parts.append("### Interesting robots.txt Paths\n")
        parts.extend(
            f"- `{path}`\n" for path in robots_txt.get("disallowed_paths", [])
        )

    # Static closing section of the report.
    parts.append(
        "\n"
        "---\n"
        "## 7. Recommendations for Attack Planning\n"
        "### Priority Initial Access Vectors\n"
        "1. Review discovered subdomains for vulnerable web applications\n"
        "2. Validate credential leaks against target systems\n"
        "3. Use discovered email format for spearphishing campaign\n"
        "4. Investigate missing security headers for potential exploitation\n"
        "5. Check disallowed paths from robots.txt for sensitive content\n"
        "### Social Engineering Targets\n"
        "- Use discovered personnel for targeted phishing\n"
        "- Leverage technology stack knowledge for pretexting\n"
        "- Utilize physical location data for physical access testing\n"
        "---\n"
        "*Report generated by OSINT Automation Tool*\n"
        "*Classification: CONFIDENTIAL - Red Team Use Only*\n"
    )

    report_path = output_dir / f"osint_report_{safe_name}.md"
    # Explicit UTF-8: page-derived data may contain non-ASCII characters and
    # the platform default encoding is not guaranteed to handle them.
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    console.print(f"[green][+] Report saved to: {report_path}[/green]")

    # Save raw data as JSON. A value that cannot be serialized as-is is
    # stored as its string representation instead of aborting the dump.
    json_path = output_dir / f"osint_data_{safe_name}.json"
    serializable_results = {}
    for key, value in results.items():
        try:
            json.dumps(value)
            serializable_results[key] = value
        except (TypeError, ValueError):
            serializable_results[key] = str(value)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(serializable_results, f, indent=2, default=str)
    console.print(f"[green][+] Raw data saved to: {json_path}[/green]")
def _parse_modules(spec: str) -> list[str]:
    """Turn the ``--modules`` value into module names in execution order.

    Names are comma-separated; surrounding whitespace and letter case are
    ignored. ``all`` (alone or mixed with other names) selects every module.

    Raises:
        ValueError: if no module is named or a name is not recognised.
    """
    known = [
        "dns", "subdomains", "whois", "emails", "tech", "dorks",
        "security_txt", "robots_txt",
    ]
    requested = [name.strip().lower() for name in spec.split(",") if name.strip()]
    if not requested:
        raise ValueError("no modules selected")
    if "all" in requested:
        return known
    unknown = [name for name in requested if name not in known]
    if unknown:
        raise ValueError(
            f"unknown module(s): {', '.join(unknown)} "
            f"(choose from: {', '.join(known)}, all)"
        )
    # Modules always run in the fixed order above, whatever order was typed.
    return [name for name in known if name in requested]


def main():
    """Command-line entry point: run the selected modules and write the report.

    Parses ``--domain``, ``--output`` and ``--modules``, runs each selected
    module against the domain while showing progress, writes the Markdown and
    JSON outputs via ``generate_report`` and prints a summary table.
    """
    parser = argparse.ArgumentParser(
        description="OSINT Gathering Automation Tool"
    )
    parser.add_argument("--domain", required=True, help="Target domain")
    parser.add_argument(
        "--output", default="./osint_output", help="Output directory"
    )
    parser.add_argument(
        "--modules",
        default="all",
        help=(
            "Comma-separated modules: dns,subdomains,whois,emails,tech,dorks,"
            "security_txt,robots_txt,all"
        ),
    )
    args = parser.parse_args()

    try:
        modules = _parse_modules(args.modules)
    except ValueError as exc:
        # Exits with a usage message instead of silently running nothing.
        parser.error(str(exc))

    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    console.print(
        Panel(
            f"[bold red]OSINT Gathering Tool[/bold red]\n"
            f"Target: {args.domain}\n"
            f"Modules: {', '.join(modules)}\n"
            f"Output: {args.output}",
            title="Configuration",
        )
    )

    # (module name, results key, runner, in-progress text, completion text)
    # The completion text is a callable receiving the module's result.
    steps = [
        ("dns", "dns", resolve_dns_records,
         "Collecting DNS records...",
         lambda result: "DNS records collected"),
        ("subdomains", "subdomains", enumerate_subdomains_ct,
         "Enumerating subdomains...",
         lambda result: f"Found {len(result)} subdomains"),
        ("whois", "whois", perform_whois_lookup,
         "Performing WHOIS lookup...",
         lambda result: "WHOIS data collected"),
        ("emails", "emails", discover_email_format,
         "Discovering email formats...",
         lambda result: "Email discovery complete"),
        ("tech", "technology", fingerprint_web_technologies,
         "Fingerprinting technologies...",
         lambda result: "Technology fingerprint complete"),
        ("dorks", "dorks", generate_google_dorks,
         "Generating Google dorks...",
         lambda result: f"Generated {len(result)} dork queries"),
        ("security_txt", "security_txt", check_security_txt,
         "Checking security.txt...",
         lambda result: "security.txt check complete"),
        ("robots_txt", "robots_txt", check_robots_txt,
         "Checking robots.txt...",
         lambda result: "robots.txt check complete"),
    ]

    results = {}
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console,
    ) as progress:
        for module, key, runner, start_text, done_text in steps:
            if module not in modules:
                continue
            task = progress.add_task(f"[cyan]{start_text}", total=None)
            results[key] = runner(args.domain)
            # Give the task a total so it is actually marked finished; with
            # total=None it stays indeterminate and the spinner keeps running.
            progress.update(
                task,
                total=1,
                completed=1,
                description=f"[green]{done_text(results[key])}",
            )

    generate_report(args.domain, results, output_dir)

    # Display summary table
    table = Table(title=f"OSINT Summary: {args.domain}")
    table.add_column("Category", style="cyan")
    table.add_column("Count/Status", style="green")
    table.add_row("DNS Record Types", str(len(results.get("dns", {}))))
    table.add_row("Subdomains Found", str(len(results.get("subdomains", []))))
    table.add_row(
        "Emails Discovered",
        str(len(results.get("emails", {}).get("discovered_emails", []))),
    )
    table.add_row("Google Dorks Generated", str(len(results.get("dorks", []))))
    table.add_row("security.txt", "Found" if results.get("security_txt") else "Not Found")
    table.add_row("robots.txt", "Found" if results.get("robots_txt") else "Not Found")
    console.print(table)


if __name__ == "__main__":
    main()