#!/usr/bin/env python3
"""
OSINT Gathering Automation Tool

Performs automated open source intelligence collection including:
- Subdomain enumeration via Certificate Transparency logs (crt.sh)
- DNS record collection
- WHOIS information gathering
- Technology fingerprinting
- Google dorking query generation
- Email pattern discovery
- security.txt and robots.txt checks

NOTE: Shodan/Censys integration is not implemented in this file.

Use only against targets you are explicitly authorized to assess.

Usage:
    python process.py --domain targetdomain.com --output ./osint_report
    python process.py --domain targetdomain.com --modules all
    python process.py --domain targetdomain.com --modules dns,subdomains,emails

Requirements:
    pip install requests dnspython python-whois beautifulsoup4 rich
"""

import argparse
import json
import re
import socket
import sys
from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import urlparse

try:
    import dns.resolver
    import requests
    from bs4 import BeautifulSoup
    from rich.console import Console
    from rich.markup import escape
    from rich.table import Table
    from rich.panel import Panel
    from rich.progress import Progress, SpinnerColumn, TextColumn
except ImportError:
    print("[!] Missing dependencies. Install with:")
    print("    pip install requests dnspython beautifulsoup4 rich")
    sys.exit(1)

console = Console()

SESSION = requests.Session()
SESSION.headers.update(
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
)

# Timeout (seconds) for ordinary page fetches; crt.sh gets a longer one below.
_HTTP_TIMEOUT = 10

_DNS_RECORD_TYPES = ("A", "AAAA", "MX", "NS", "TXT", "SOA", "CNAME", "SRV")

_COMMON_SUBDOMAIN_PREFIXES = (
    "www", "mail", "ftp", "smtp", "pop", "imap", "webmail",
    "vpn", "remote", "portal", "admin", "dev", "staging", "test",
    "api", "app", "blog", "shop", "store", "cdn",
    "ns1", "ns2", "dns", "mx", "exchange", "owa", "autodiscover",
    "sso", "login", "auth", "git", "gitlab", "jenkins", "jira",
    "confluence", "wiki", "docs", "support", "help",
)

# Pages scanned for e-mail addresses; "" is the site root.
_EMAIL_PAGES = ("", "/contact", "/about", "/team", "/about-us", "/contact-us")

_SECURITY_HEADERS = (
    "Strict-Transport-Security",
    "Content-Security-Policy",
    "X-Frame-Options",
    "X-Content-Type-Options",
    "X-XSS-Protection",
    "Referrer-Policy",
    "Permissions-Policy",
    "Cross-Origin-Opener-Policy",
    "Cross-Origin-Resource-Policy",
)

# CDN name -> response headers whose presence indicates that CDN.
_CDN_INDICATORS = {
    "cloudflare": ("cf-ray", "cf-cache-status"),
    "akamai": ("x-akamai-transformed",),
    "cloudfront": ("x-amz-cf-id", "x-amz-cf-pop"),
    "fastly": ("x-served-by", "x-fastly-request-id"),
    "incapsula": ("x-iinfo",),
}

# (label, substrings of the lowercased HTML). Order matters for the CMS table:
# the first CMS whose marker is found wins.
_CMS_MARKERS = (
    ("WordPress", ("wp-content", "wordpress")),
    ("Drupal", ("drupal",)),
    ("Joomla", ("joomla",)),
    ("Shopify", ("shopify",)),
)
_FRAMEWORK_MARKERS = (
    ("React", ("react",)),
    ("Angular", ("angular", "ng-")),
    ("Vue.js", ("vue",)),
    ("jQuery", ("jquery",)),
    ("Bootstrap", ("bootstrap",)),
)

_REPORT_FOOTER = """
---

## 7. Recommendations for Attack Planning

### Priority Initial Access Vectors
1. Review discovered subdomains for vulnerable web applications
2. Validate credential leaks against target systems
3. Use discovered email format for spearphishing campaign
4. Investigate missing security headers for potential exploitation
5. Check disallowed paths from robots.txt for sensitive content

### Social Engineering Targets
- Use discovered personnel for targeted phishing
- Leverage technology stack knowledge for pretexting
- Utilize physical location data for physical access testing

---
*Report generated by OSINT Automation Tool*
*Classification: CONFIDENTIAL - Red Team Use Only*
"""


def _warn(message: str) -> None:
    """Print a yellow ``[!]`` warning, escaping text Rich would parse as markup."""
    console.print(f"[yellow]{escape('[!] ' + message)}[/yellow]")


def _normalize_domain(raw: str) -> str:
    """Reduce user input (bare hostname or full URL) to a lowercase hostname.

    Returns an empty string when no hostname can be extracted.
    """
    candidate = raw.strip()
    if "://" not in candidate:
        # urlparse only fills in .hostname when a netloc marker is present.
        candidate = f"//{candidate}"
    try:
        host = urlparse(candidate).hostname or ""
    except ValueError:
        return ""
    return host.rstrip(".").lower()


def resolve_dns_records(domain: str) -> dict:
    """Collect DNS records for a domain.

    Args:
        domain: Name to query.

    Returns:
        Mapping of record type (e.g. ``"MX"``) to the list of record strings.
        Types that have no answer or fail to resolve are omitted.
    """
    records = {}
    for rtype in _DNS_RECORD_TYPES:
        try:
            answers = dns.resolver.resolve(domain, rtype)
        except Exception:
            # Best effort: a missing record type (NoAnswer/NXDOMAIN), a
            # timeout or a resolver failure simply means no entry for it.
            continue
        records[rtype] = [str(rdata) for rdata in answers]
    return records


def enumerate_subdomains_ct(domain: str) -> list[str]:
    """Enumerate subdomains via crt.sh plus a small common-prefix DNS probe.

    Args:
        domain: Apex domain to enumerate (matched case-insensitively).

    Returns:
        Sorted list of unique, lowercase hostnames. Wildcard certificate
        entries are skipped.
    """
    # Certificate names are lowercased below, so the domain must be too.
    domain = domain.strip().lower().rstrip(".")
    suffix = f".{domain}"
    subdomains: set[str] = set()

    try:
        # Pass the query via params so the SQL-style "%" wildcard is
        # percent-encoded instead of being sent as a malformed escape.
        response = SESSION.get(
            "https://crt.sh/",
            params={"q": f"%.{domain}", "output": "json"},
            timeout=30,
        )
        if response.status_code == 200:
            for entry in response.json():
                # name_value may hold several newline-separated names.
                for name in entry.get("name_value", "").split("\n"):
                    name = name.strip().lower()
                    if name.startswith("*"):
                        continue  # skip wildcard entries
                    if name == domain or name.endswith(suffix):
                        subdomains.add(name)
        else:
            _warn(f"crt.sh returned HTTP {response.status_code}")
    except Exception as e:
        _warn(f"crt.sh query failed: {e}")

    # Also try common subdomain prefixes; a successful A lookup counts as a hit.
    for prefix in _COMMON_SUBDOMAIN_PREFIXES:
        subdomain = f"{prefix}{suffix}"
        try:
            dns.resolver.resolve(subdomain, "A")
        except Exception:
            continue
        subdomains.add(subdomain)

    return sorted(subdomains)


def perform_whois_lookup(domain: str) -> dict:
    """Perform a WHOIS lookup for a domain.

    Requires the optional ``python-whois`` package (imported lazily so the
    rest of the tool works without it).

    Returns:
        Dict of selected WHOIS fields, or ``{}`` if the lookup fails.
    """
    try:
        import whois as python_whois
    except ImportError:
        _warn("WHOIS lookup failed: python-whois is not installed")
        return {}

    def text_or_na(value: Any) -> str:
        return str(value) if value else "N/A"

    try:
        w = python_whois.whois(domain)
        return {
            "domain_name": text_or_na(w.domain_name),
            "registrar": text_or_na(w.registrar),
            "creation_date": text_or_na(w.creation_date),
            "expiration_date": text_or_na(w.expiration_date),
            "name_servers": w.name_servers if w.name_servers else [],
            "registrant_org": text_or_na(w.org),
            "registrant_country": text_or_na(w.country),
            "emails": w.emails if w.emails else [],
        }
    except Exception as e:
        _warn(f"WHOIS lookup failed: {e}")
        return {}


def discover_email_format(domain: str) -> dict:
    """List common e-mail formats and scrape addresses from the target's site.

    Fetches the site root and a few common pages over HTTPS and extracts
    addresses ending in ``@<domain>``. No third-party e-mail service is used.

    Returns:
        Dict with ``domain``, ``format_guess`` (template strings) and
        ``discovered_emails`` (sorted, lowercase, de-duplicated).
    """
    common_formats = [
        "{first}.{last}@" + domain,
        "{first}{last}@" + domain,
        "{f}{last}@" + domain,
        "{first}_{last}@" + domain,
        "{first}@" + domain,
        "{last}@" + domain,
    ]

    email_pattern = re.compile(
        rf"[a-zA-Z0-9._%+-]+@{re.escape(domain)}", re.IGNORECASE
    )
    found: set[str] = set()
    for page in _EMAIL_PAGES:
        try:
            response = SESSION.get(f"https://{domain}{page}", timeout=_HTTP_TIMEOUT)
        except requests.RequestException:
            continue  # unreachable page: nothing to harvest
        if response.status_code == 200:
            # Lowercase so case variants of one address are not double-counted.
            found.update(m.lower() for m in email_pattern.findall(response.text))

    return {
        "domain": domain,
        "format_guess": common_formats,
        "discovered_emails": sorted(found),
    }


def fingerprint_web_technologies(domain: str) -> dict:
    """Identify web technologies using HTTP headers and response analysis.

    Makes a single GET to ``https://<domain>`` (following redirects).

    Returns:
        Dict with ``server``, ``powered_by``, ``frameworks``,
        ``security_headers``, ``cookies``, ``cdn`` and ``cms``. On request
        failure the defaults (``None`` / empty) are returned and a warning
        is printed. CMS/framework detection is a coarse substring heuristic.
    """
    tech = {
        "server": None,
        "powered_by": None,
        "frameworks": [],
        "security_headers": {},
        "cookies": [],
        "cdn": None,
        "cms": None,
    }

    try:
        response = SESSION.get(
            f"https://{domain}", timeout=_HTTP_TIMEOUT, allow_redirects=True
        )
        headers = response.headers  # case-insensitive mapping

        tech["server"] = headers.get("Server", "Not disclosed")
        tech["powered_by"] = headers.get("X-Powered-By", "Not disclosed")

        for header in _SECURITY_HEADERS:
            tech["security_headers"][header] = headers.get(header) or "MISSING"

        for cookie in response.cookies:
            # http.cookiejar keeps non-standard attributes under the exact
            # key the server sent ("HttpOnly", "httponly", ...), so the
            # comparison has to be case-insensitive.
            extra_attrs = getattr(cookie, "_rest", None) or {}
            tech["cookies"].append(
                {
                    "name": cookie.name,
                    "secure": cookie.secure,
                    "httponly": any(k.lower() == "httponly" for k in extra_attrs),
                    "domain": cookie.domain,
                }
            )

        # First CDN with any indicator header present wins.
        tech["cdn"] = next(
            (
                cdn
                for cdn, indicators in _CDN_INDICATORS.items()
                if any(indicator in headers for indicator in indicators)
            ),
            None,
        )

        html = response.text.lower()
        tech["cms"] = next(
            (
                label
                for label, markers in _CMS_MARKERS
                if any(marker in html for marker in markers)
            ),
            None,
        )
        tech["frameworks"].extend(
            label
            for label, markers in _FRAMEWORK_MARKERS
            if any(marker in html for marker in markers)
        )
    except Exception as e:
        _warn(f"Web fingerprinting failed: {e}")

    return tech


def generate_google_dorks(domain: str) -> list[str]:
    """Generate Google dorking queries for the target domain.

    Pure string generation; no network access.

    Returns:
        List of query strings, grouped by category in a fixed order.
    """
    # First DNS label, used as a likely bucket/organisation name.
    label = domain.split(".")[0]

    file_types = (
        "pdf", "xlsx", "docx", "csv", "sql", "log", "bak", "conf", "env", "xml",
    )
    site_scoped = (
        # Configuration and credentials
        "inurl:admin",
        "inurl:login",
        "inurl:wp-admin",
        "inurl:wp-login",
        'intitle:"index of"',
        'intitle:"dashboard"',
        "inurl:config",
        "inurl:setup",
        # Error messages
        '"error" "sql syntax"',
        '"php error" "on line"',
        '"ORA-" "error"',
        '"mysql" "error"',
        '"stack trace" "at"',
        # Sensitive information
        '"confidential"',
        '"internal use only"',
        '"not for distribution"',
        '"password" filetype:txt',
        '"api_key" OR "apikey" OR "api-key"',
        # Infrastructure
        "inurl:vpn",
        "inurl:remote",
        "inurl:portal",
        "inurl:citrix",
        "inurl:owa",
    )
    cloud_hosts = (
        "s3.amazonaws.com",
        "blob.core.windows.net",
        "storage.googleapis.com",
    )

    dorks = [f"site:{domain} filetype:{ext}" for ext in file_types]
    dorks += [f"site:{domain} {query}" for query in site_scoped]
    # Code/paste site leaks
    dorks += [
        f'"{domain}" {keyword} site:github.com'
        for keyword in ("password", "api_key", "secret", "token")
    ]
    dorks.append(f'"{domain}" site:pastebin.com')
    # Cloud storage
    dorks += [f'site:{host} "{label}"' for host in cloud_hosts]
    return dorks


def check_security_txt(domain: str) -> dict | None:
    """Check for a security.txt file per RFC 9116.

    Returns:
        ``{"url", "content"}`` (content truncated to 2000 chars) for the first
        location that answers 200 and mentions "contact", else ``None``.
    """
    urls = [
        f"https://{domain}/.well-known/security.txt",
        f"https://{domain}/security.txt",
    ]
    for url in urls:
        try:
            response = SESSION.get(url, timeout=_HTTP_TIMEOUT)
        except requests.RequestException:
            continue
        if response.status_code == 200 and "contact" in response.text.lower():
            return {
                "url": url,
                "content": response.text[:2000],
            }
    return None


def check_robots_txt(domain: str) -> dict | None:
    """Check robots.txt for interesting paths.

    Returns:
        ``{"content", "disallowed_paths"}`` when robots.txt answers 200
        (content truncated to 2000 chars; empty and ``/`` rules are ignored),
        else ``None``.
    """
    try:
        response = SESSION.get(
            f"https://{domain}/robots.txt", timeout=_HTTP_TIMEOUT
        )
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None

    disallowed = []
    for line in response.text.split("\n"):
        if not line.strip().lower().startswith("disallow:"):
            continue
        # Drop the directive name, then any trailing "# comment".
        path = line.split(":", 1)[1].split("#", 1)[0].strip()
        if path and path != "/":
            disallowed.append(path)
    return {
        "content": response.text[:2000],
        "disallowed_paths": disallowed,
    }


def generate_report(domain: str, results: dict, output_dir: Path):
    """Generate the Markdown OSINT report and a raw JSON dump.

    Writes ``osint_report_<domain>.md`` and ``osint_data_<domain>.json`` into
    ``output_dir`` (created if missing). Each discovered subdomain is
    resolved with ``socket.gethostbyname`` while the report is built.

    Args:
        domain: Target domain (used in the title and the file names).
        results: Module results keyed by ``dns``, ``subdomains``, ``whois``,
            ``emails``, ``technology``, ``dorks``, ``security_txt``,
            ``robots_txt``; missing keys are treated as empty.
        output_dir: Directory to write both files into.
    """
    parts = [
        f"# OSINT Report: {domain}\n"
        f"## Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
        "\n---\n\n"
        "## 1. Domain Information\n\n"
        "### WHOIS Data\n"
    ]

    whois_data = results.get("whois", {})
    if whois_data:
        parts.extend(f"- **{key}**: {value}\n" for key, value in whois_data.items())
    else:
        parts.append("WHOIS data not available.\n")

    parts.append("\n### DNS Records\n\n")
    for rtype, records in (results.get("dns") or {}).items():
        parts.append(f"#### {rtype} Records\n")
        parts.extend(f"- `{record}`\n" for record in records)
        parts.append("\n")

    parts.append("## 2. Subdomain Enumeration\n\n")
    subdomains = results.get("subdomains", [])
    parts.append(f"**Total subdomains discovered:** {len(subdomains)}\n\n")
    for sub in subdomains:
        try:
            ip = socket.gethostbyname(sub)
        except (OSError, UnicodeError):
            parts.append(f"- `{sub}` -> [unresolvable]\n")
        else:
            parts.append(f"- `{sub}` -> `{ip}`\n")

    parts.append("\n## 3. Web Technology Fingerprint\n\n")
    tech = results.get("technology", {})
    if tech:
        # The keys always exist (possibly as None), so a .get() default would
        # never apply; fall back with "or" instead.
        frameworks = ", ".join(tech.get("frameworks") or []) or "None detected"
        parts.append(f"- **Server:** {tech.get('server') or 'N/A'}\n")
        parts.append(f"- **Powered By:** {tech.get('powered_by') or 'N/A'}\n")
        parts.append(f"- **CMS:** {tech.get('cms') or 'None detected'}\n")
        parts.append(f"- **CDN:** {tech.get('cdn') or 'None detected'}\n")
        parts.append(f"- **Frameworks:** {frameworks}\n\n")

        parts.append("### Security Headers\n\n")
        parts.append("| Header | Status |\n|--------|--------|\n")
        for header, value in tech.get("security_headers", {}).items():
            status = "MISSING" if value == "MISSING" else "Present"
            parts.append(f"| {header} | {status} |\n")

    parts.append("\n## 4. Email Intelligence\n\n")
    email_data = results.get("emails", {})
    if email_data:
        parts.append("### Discovered Emails\n")
        parts.extend(
            f"- `{email}`\n" for email in email_data.get("discovered_emails", [])
        )
        parts.append("\n### Likely Email Formats\n")
        parts.extend(f"- `{fmt}`\n" for fmt in email_data.get("format_guess", []))

    parts.append("\n## 5. Google Dorking Queries\n\n")
    # Only the first 20 queries go in the report; the JSON dump has them all.
    parts.extend(f"- `{dork}`\n" for dork in results.get("dorks", [])[:20])

    parts.append("\n## 6. Additional Findings\n\n")
    security_txt = results.get("security_txt")
    if security_txt:
        parts.append(f"### security.txt Found\n- URL: {security_txt['url']}\n\n")

    robots_txt = results.get("robots_txt")
    if robots_txt:
        parts.append("### Interesting robots.txt Paths\n")
        parts.extend(
            f"- `{path}`\n" for path in robots_txt.get("disallowed_paths", [])
        )

    parts.append(_REPORT_FOOTER)

    output_dir.mkdir(parents=True, exist_ok=True)
    # Keep path separators and other odd characters out of the file names.
    safe_name = re.sub(r"[^A-Za-z0-9._-]+", "_", domain)

    report_path = output_dir / f"osint_report_{safe_name}.md"
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    console.print(f"[green][+] Report saved to: {report_path}[/green]")

    # Save raw data as JSON; values that cannot be serialized are stringified.
    json_path = output_dir / f"osint_data_{safe_name}.json"
    serializable_results = {}
    for key, value in results.items():
        try:
            json.dumps(value)
        except (TypeError, ValueError):
            serializable_results[key] = str(value)
        else:
            serializable_results[key] = value

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(serializable_results, f, indent=2, default=str)
    console.print(f"[green][+] Raw data saved to: {json_path}[/green]")


# (module name, results key, collector, in-progress text, done text).
# "{count}" in the done text is replaced by len() of a list/dict result.
# Modules always run in this order.
_MODULE_SPECS = (
    ("dns", "dns", resolve_dns_records,
     "Collecting DNS records...", "DNS records collected"),
    ("subdomains", "subdomains", enumerate_subdomains_ct,
     "Enumerating subdomains...", "Found {count} subdomains"),
    ("whois", "whois", perform_whois_lookup,
     "Performing WHOIS lookup...", "WHOIS data collected"),
    ("emails", "emails", discover_email_format,
     "Discovering email formats...", "Email discovery complete"),
    ("tech", "technology", fingerprint_web_technologies,
     "Fingerprinting technologies...", "Technology fingerprint complete"),
    ("dorks", "dorks", generate_google_dorks,
     "Generating Google dorks...", "Generated {count} dork queries"),
    ("security_txt", "security_txt", check_security_txt,
     "Checking security.txt...", "security.txt check complete"),
    ("robots_txt", "robots_txt", check_robots_txt,
     "Checking robots.txt...", "robots.txt check complete"),
)


def main():
    """Parse CLI arguments, run the selected modules and write the report."""
    module_names = [spec[0] for spec in _MODULE_SPECS]

    parser = argparse.ArgumentParser(
        description="OSINT Gathering Automation Tool"
    )
    parser.add_argument("--domain", required=True, help="Target domain")
    parser.add_argument(
        "--output", default="./osint_output", help="Output directory"
    )
    parser.add_argument(
        "--modules",
        default="all",
        help=f"Comma-separated modules: {','.join(module_names)},all",
    )
    args = parser.parse_args()

    # Accept "https://example.com/" as well as "example.com".
    domain = _normalize_domain(args.domain)
    if not domain:
        parser.error(f"could not extract a hostname from --domain {args.domain!r}")

    requested = [m.strip().lower() for m in args.modules.split(",") if m.strip()]
    unknown = sorted(set(requested) - set(module_names) - {"all"})
    if unknown:
        parser.error(
            f"unknown module(s): {', '.join(unknown)} "
            f"(choose from {', '.join(module_names)}, all)"
        )
    if not requested or "all" in requested:
        modules = module_names
    else:
        modules = [name for name in module_names if name in requested]

    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    console.print(
        Panel(
            f"[bold red]OSINT Gathering Tool[/bold red]\n"
            f"Target: {escape(domain)}\n"
            f"Modules: {', '.join(modules)}\n"
            f"Output: {escape(args.output)}",
            title="Configuration",
        )
    )

    results = {}

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console,
    ) as progress:
        for name, key, collector, running_text, done_text in _MODULE_SPECS:
            if name not in modules:
                continue
            task = progress.add_task(f"[cyan]{running_text}", total=None)
            result = collector(domain)
            results[key] = result
            count = len(result) if isinstance(result, (list, dict)) else 0
            # A task created with total=None never counts as finished, so
            # give it a total to stop its spinner.
            progress.update(
                task,
                total=1,
                completed=1,
                description=f"[green]{done_text.format(count=count)}",
            )

    generate_report(domain, results, output_dir)

    # Display summary table
    table = Table(title=f"OSINT Summary: {domain}")
    table.add_column("Category", style="cyan")
    table.add_column("Count/Status", style="green")

    emails_found = results.get("emails", {}).get("discovered_emails", [])
    table.add_row("DNS Record Types", str(len(results.get("dns", {}))))
    table.add_row("Subdomains Found", str(len(results.get("subdomains", []))))
    table.add_row("Emails Discovered", str(len(emails_found)))
    table.add_row("Google Dorks Generated", str(len(results.get("dorks", []))))
    table.add_row(
        "security.txt", "Found" if results.get("security_txt") else "Not Found"
    )
    table.add_row(
        "robots.txt", "Found" if results.get("robots_txt") else "Not Found"
    )

    console.print(table)


if __name__ == "__main__":
    main()