#!/usr/bin/env python3
"""
SAST Pipeline Orchestration Script

Runs CodeQL and Semgrep scans, aggregates SARIF results, evaluates quality
gates, and produces a consolidated report. Designed to be invoked from
GitHub Actions or any CI/CD platform.

Usage:
    python process.py --repo-path /path/to/repo --output report.json
    python process.py --repo-path . --severity-threshold high --fail-on-findings
"""

import argparse
import json
import os
import subprocess
import sys
import time
from collections import Counter
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterator, Optional


@dataclass
class ScanFinding:
    """A single normalized finding extracted from a SARIF result."""

    rule_id: str
    severity: str  # one of the SEVERITY_ORDER keys
    message: str
    file_path: str
    start_line: int
    end_line: int
    tool: str
    cwe: str = ""  # first CWE-looking tag of the rule, if any
    owasp: str = ""  # first OWASP-looking tag of the rule, if any
    fingerprint: str = ""  # SARIF primaryLocationLineHash, if present


@dataclass
class ScanResult:
    """Outcome of one scanner invocation (or one pre-existing SARIF file)."""

    tool: str
    findings: list = field(default_factory=list)
    rules_evaluated: int = 0
    scan_duration_seconds: float = 0.0
    exit_code: int = 0  # scanner exit code; -1 when the scanner could not run
    error_message: str = ""


# Lower rank means more severe; the quality gate blocks on rank <= threshold rank.
SEVERITY_ORDER = {"critical": 0, "high": 1, "medium": 2, "low": 3, "note": 4, "none": 5}

# Fallback mapping from a SARIF result/rule ``level`` to a severity label.
_SARIF_LEVEL_TO_SEVERITY = {
    "error": "high",
    "warning": "medium",
    "note": "low",
    "none": "none",
}


def _load_sarif(sarif_path: str) -> dict:
    """Read and decode a SARIF (JSON) file.

    ``utf-8-sig`` is used so that both plain UTF-8 and BOM-prefixed files are
    accepted regardless of the platform's default encoding.
    """
    with open(sarif_path, "r", encoding="utf-8-sig") as f:
        return json.load(f)


def _iter_rules(run: dict) -> Iterator[dict]:
    """Yield every rule descriptor of a SARIF run (driver first, then extensions)."""
    tool = run.get("tool", {})
    yield from tool.get("driver", {}).get("rules", [])
    # Rule metadata may also live in tool extensions (e.g. query packs).
    for extension in tool.get("extensions") or []:
        yield from extension.get("rules", [])


def _count_rules(sarif_data: dict) -> int:
    """Return the total number of rule descriptors across all runs."""
    return sum(1 for run in sarif_data.get("runs", []) for _ in _iter_rules(run))


def _severity_from_score(raw_score) -> Optional[str]:
    """Map a ``security-severity`` score to a severity label.

    Returns ``None`` when the score is missing, empty or not a usable number,
    so the caller can fall back to the SARIF level.
    """
    if not raw_score:
        return None
    try:
        score = float(raw_score)
    except (TypeError, ValueError):
        return None
    if score != score:  # NaN compares unequal to itself
        return None
    if score >= 9.0:
        return "critical"
    if score >= 7.0:
        return "high"
    if score >= 4.0:
        return "medium"
    return "low"


def _extract_findings(sarif_data: dict, tool_name: str) -> list:
    """Convert an already-loaded SARIF document into ``ScanFinding`` objects."""
    findings = []
    for run in sarif_data.get("runs", []):
        # Index rule metadata once per run so each result is an O(1) lookup.
        rules_map = {}
        for rule in _iter_rules(run):
            properties = rule.get("properties", {})
            tags = [t for t in properties.get("tags", []) if isinstance(t, str)]
            cwe_tags = [t for t in tags if "cwe" in t.lower()]
            owasp_tags = [t for t in tags if "owasp" in t.lower()]
            # setdefault: the first descriptor for an id wins (driver before extensions).
            rules_map.setdefault(rule.get("id", ""), {
                "cwe": cwe_tags[0] if cwe_tags else "",
                "owasp": owasp_tags[0] if owasp_tags else "",
                "severity": rule.get("defaultConfiguration", {}).get("level", "warning"),
                "security_severity": properties.get("security-severity"),
            })

        for item in run.get("results", []):
            rule_id = item.get("ruleId") or item.get("rule", {}).get("id") or "unknown"
            rule_info = rules_map.get(rule_id, {})

            # A numeric security-severity score takes precedence over the level.
            level = item.get("level", rule_info.get("severity", "warning"))
            severity = (
                _severity_from_score(rule_info.get("security_severity"))
                or _SARIF_LEVEL_TO_SEVERITY.get(level, "medium")
            )

            locations = item.get("locations") or [{}]
            physical = locations[0].get("physicalLocation", {})
            artifact = physical.get("artifactLocation", {})
            region = physical.get("region", {})
            start_line = region.get("startLine", 0)

            findings.append(ScanFinding(
                rule_id=rule_id,
                severity=severity,
                message=item.get("message", {}).get("text", ""),
                file_path=artifact.get("uri", "unknown"),
                start_line=start_line,
                end_line=region.get("endLine", start_line),
                tool=tool_name,
                cwe=rule_info.get("cwe", ""),
                owasp=rule_info.get("owasp", ""),
                fingerprint=str(item.get("fingerprints", {}).get("primaryLocationLineHash", "")),
            ))
    return findings


def _discard_stale_output(path: str) -> None:
    """Best-effort removal of a SARIF file left behind by a previous run.

    Without this, a scanner that fails before writing its output would have
    the previous run's findings attributed to it.
    """
    try:
        os.remove(path)
    except OSError:
        # Nothing to remove, or the path is unusable; in the latter case the
        # scanner itself will report the failure when it tries to write.
        pass


def _run_tool(cmd: list, repo_path: str, timeout: int, result: ScanResult,
              timeout_message: str, missing_message: str):
    """Run a scanner command and record failures/duration on ``result``.

    Returns the ``CompletedProcess`` on completion, or ``None`` when the
    command could not be run (timeout, missing binary, missing directory), in
    which case ``result.exit_code`` is -1 and ``result.error_message`` is set.
    """
    if not os.path.isdir(repo_path):
        # subprocess raises FileNotFoundError for a bad cwd as well, which
        # would otherwise be misreported as a missing binary.
        result.error_message = f"Repository path does not exist: {repo_path}"
        result.exit_code = -1
        return None

    started = time.monotonic()
    try:
        return subprocess.run(
            cmd, cwd=repo_path, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        result.error_message = timeout_message
        result.exit_code = -1
    except FileNotFoundError:
        result.error_message = missing_message
        result.exit_code = -1
    finally:
        result.scan_duration_seconds = time.monotonic() - started
    return None


def _collect_sarif(result: ScanResult, sarif_output: str, tool_name: str) -> None:
    """Populate findings and rule count on ``result`` from ``sarif_output`` if it exists."""
    if not os.path.exists(sarif_output):
        return
    sarif_data = _load_sarif(sarif_output)
    result.findings = _extract_findings(sarif_data, tool_name)
    result.rules_evaluated = _count_rules(sarif_data)


def run_semgrep(repo_path: str, config: str = "auto", extra_configs: Optional[list] = None) -> ScanResult:
    """Run Semgrep scan and return structured results.

    Args:
        repo_path: Directory to scan; also where ``semgrep-results.sarif`` is written.
        config: Primary Semgrep configuration.
        extra_configs: Additional configurations, each passed as ``--config``.

    Returns:
        A ``ScanResult``. Exit codes 0 and 1 are treated as a completed scan;
        anything else is recorded in ``error_message`` with no findings.

    Raises:
        OSError, ValueError: if the SARIF file Semgrep produced cannot be read
            or is not valid JSON.
    """
    result = ScanResult(tool="semgrep")
    sarif_output = os.path.join(repo_path, "semgrep-results.sarif")

    # Only SARIF is requested: it is the only output this script reads, and
    # Semgrep's output-format flags are not meant to be combined.
    cmd = [
        "semgrep", "ci",
        "--config", config,
        "--sarif",
        "--output", sarif_output,
        "--quiet",
    ]
    for cfg in extra_configs or []:
        cmd.extend(["--config", cfg])

    _discard_stale_output(sarif_output)
    proc = _run_tool(
        cmd, repo_path, 600, result,
        timeout_message="Semgrep scan timed out after 600 seconds",
        missing_message="semgrep binary not found. Install with: pip install semgrep",
    )
    if proc is None:
        return result

    result.exit_code = proc.returncode
    if proc.returncode not in (0, 1):
        result.error_message = (
            proc.stderr[:500] or f"semgrep exited with code {proc.returncode}"
        )
        return result

    _collect_sarif(result, sarif_output, "semgrep")
    return result


def run_codeql_query(repo_path: str, language: str, database_path: str) -> ScanResult:
    """Run CodeQL analysis on a pre-created database and return structured results.

    Args:
        repo_path: Working directory; also where the SARIF output is written.
        language: CodeQL language identifier used to pick the query suite.
        database_path: Path to an existing CodeQL database.

    Returns:
        A ``ScanResult``. A non-zero exit code is recorded in
        ``error_message``; any SARIF the run still produced is parsed.

    Raises:
        OSError, ValueError: if the SARIF file CodeQL produced cannot be read
            or is not valid JSON.
    """
    tool_name = f"codeql-{language}"
    result = ScanResult(tool=tool_name)
    sarif_output = os.path.join(repo_path, f"codeql-{language}-results.sarif")

    cmd = [
        "codeql", "database", "analyze",
        database_path,
        f"codeql/{language}-queries:codeql-suites/{language}-security-extended.qls",
        "--format=sarifv2.1.0",
        f"--output={sarif_output}",
        "--threads=0",
    ]

    _discard_stale_output(sarif_output)
    proc = _run_tool(
        cmd, repo_path, 1200, result,
        timeout_message="CodeQL analysis timed out after 1200 seconds",
        missing_message="codeql binary not found. Install from https://github.com/github/codeql-cli-binaries",
    )
    if proc is None:
        return result

    result.exit_code = proc.returncode
    if proc.returncode != 0:
        result.error_message = (
            proc.stderr[:500] or f"codeql exited with code {proc.returncode}"
        )

    _collect_sarif(result, sarif_output, tool_name)
    return result


def parse_sarif(sarif_path: str, tool_name: str) -> list:
    """Parse a SARIF file and extract findings as ScanFinding objects.

    Severity is derived from the rule's ``security-severity`` score when it is
    a usable number (>=9 critical, >=7 high, >=4 medium, otherwise low) and
    from the SARIF ``level`` otherwise.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the file is not valid UTF-8 JSON.
    """
    return _extract_findings(_load_sarif(sarif_path), tool_name)


def evaluate_quality_gate(findings: list, severity_threshold: str) -> dict:
    """Evaluate quality gate based on finding severities.

    A finding is blocking when its severity is at least as severe as
    ``severity_threshold``. An unrecognized threshold is treated as "high";
    an unrecognized finding severity is treated as "none".
    """
    threshold_level = SEVERITY_ORDER.get(severity_threshold.lower(), 1)
    blocking_findings = [
        f for f in findings
        if SEVERITY_ORDER.get(f.severity.lower(), 5) <= threshold_level
    ]
    severity_counts = dict(Counter(f.severity.lower() for f in findings))

    return {
        "passed": not blocking_findings,
        "threshold": severity_threshold,
        "total_findings": len(findings),
        "blocking_findings": len(blocking_findings),
        "severity_counts": severity_counts,
        "blocking_details": [
            {
                "rule_id": f.rule_id,
                "severity": f.severity,
                "file": f.file_path,
                "line": f.start_line,
                "tool": f.tool,
                "message": f.message[:200],
            }
            for f in blocking_findings
        ],
    }


def generate_report(scan_results: list, quality_gate: dict, repo_path: str) -> dict:
    """Generate a consolidated SAST report.

    The report holds metadata, a per-tool summary, the quality gate result,
    the ten most frequent CWE tags and all findings ordered most severe first.
    """
    all_findings = [f for sr in scan_results for f in sr.findings]
    cwe_counts = Counter(f.cwe for f in all_findings if f.cwe)

    return {
        "report_metadata": {
            "generated_at": datetime.now(timezone.utc).isoformat(),
            "repository": repo_path,
            "report_version": "1.0.0",
        },
        "scan_summary": [
            {
                "tool": sr.tool,
                "findings_count": len(sr.findings),
                "rules_evaluated": sr.rules_evaluated,
                "duration_seconds": sr.scan_duration_seconds,
                "status": "success" if sr.exit_code in (0, 1) else "error",
                "error": sr.error_message,
            }
            for sr in scan_results
        ],
        "quality_gate": quality_gate,
        "top_cwes": cwe_counts.most_common(10),
        "findings": [
            {
                "rule_id": f.rule_id,
                "severity": f.severity,
                "tool": f.tool,
                "file": f.file_path,
                "line": f.start_line,
                "cwe": f.cwe,
                "owasp": f.owasp,
                "message": f.message[:300],
            }
            for f in sorted(all_findings, key=lambda x: SEVERITY_ORDER.get(x.severity.lower(), 5))
        ],
    }


def main():
    """Parse CLI arguments, run or load the scans, write the report and apply the gate.

    Exit status: 1 when ``--fail-on-findings`` is set and the quality gate
    fails (or a ``--sarif-only`` file is unreadable), 2 when
    ``--fail-on-scan-error`` is set and a scanner did not complete, 0 otherwise.
    """
    parser = argparse.ArgumentParser(description="SAST Pipeline Orchestration")
    parser.add_argument("--repo-path", required=True, help="Path to the repository to scan")
    parser.add_argument("--output", default="sast-report.json", help="Output report file path")
    parser.add_argument("--severity-threshold", default="high",
                        choices=["critical", "high", "medium", "low"],
                        help="Minimum severity to block pipeline")
    parser.add_argument("--fail-on-findings", action="store_true",
                        help="Exit with non-zero code if quality gate fails")
    parser.add_argument("--fail-on-scan-error", action="store_true",
                        help="Exit with non-zero code if any scanner failed to run")
    parser.add_argument("--semgrep-config", default="auto",
                        help="Semgrep configuration (default: auto)")
    parser.add_argument("--semgrep-extra-configs", nargs="*",
                        help="Additional Semgrep config paths")
    parser.add_argument("--skip-semgrep", action="store_true", help="Skip Semgrep scan")
    parser.add_argument("--skip-codeql", action="store_true", help="Skip CodeQL scan")
    parser.add_argument("--codeql-language", default=None, help="Language for CodeQL analysis")
    parser.add_argument("--codeql-db-path", default=None, help="Path to CodeQL database")
    parser.add_argument("--sarif-only", nargs="*",
                        help="Only parse existing SARIF files instead of running scans")
    args = parser.parse_args()

    repo_path = os.path.abspath(args.repo_path)
    scan_results = []

    if args.sarif_only:
        for sarif_file in args.sarif_only:
            tool_name = Path(sarif_file).stem
            try:
                sarif_data = _load_sarif(sarif_file)
            except (OSError, ValueError) as exc:
                # Stay fail-closed: an unreadable input must not yield a passing gate.
                sys.exit(f"[!] Could not read SARIF file {sarif_file}: {exc}")
            scan_results.append(ScanResult(
                tool=tool_name,
                findings=_extract_findings(sarif_data, tool_name),
                rules_evaluated=_count_rules(sarif_data),
            ))
    else:
        if not args.skip_semgrep:
            print("[*] Running Semgrep scan...")
            semgrep_result = run_semgrep(
                repo_path,
                config=args.semgrep_config,
                extra_configs=args.semgrep_extra_configs,
            )
            scan_results.append(semgrep_result)
            print(f" Found {len(semgrep_result.findings)} findings in {semgrep_result.scan_duration_seconds:.1f}s")
            if semgrep_result.error_message:
                print(f" Warning: {semgrep_result.error_message}")

        if not args.skip_codeql:
            if args.codeql_language and args.codeql_db_path:
                print(f"[*] Running CodeQL analysis for {args.codeql_language}...")
                codeql_result = run_codeql_query(repo_path, args.codeql_language, args.codeql_db_path)
                scan_results.append(codeql_result)
                print(f" Found {len(codeql_result.findings)} findings in {codeql_result.scan_duration_seconds:.1f}s")
                if codeql_result.error_message:
                    print(f" Warning: {codeql_result.error_message}")
            elif args.codeql_language or args.codeql_db_path:
                # Only one half of the pair was supplied; say so instead of skipping silently.
                print("[!] Skipping CodeQL: both --codeql-language and --codeql-db-path are required")

    all_findings = [f for sr in scan_results for f in sr.findings]
    quality_gate = evaluate_quality_gate(all_findings, args.severity_threshold)
    report = generate_report(scan_results, quality_gate, repo_path)

    output_path = os.path.abspath(args.output)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print(f"\n[*] Report written to {output_path}")

    if quality_gate["passed"]:
        print(f"[PASS] Quality gate passed. {quality_gate['total_findings']} findings, none blocking.")
    else:
        print(f"[FAIL] Quality gate failed. {quality_gate['blocking_findings']} blocking findings:")
        for detail in quality_gate["blocking_details"]:
            print(f" - [{detail['severity'].upper()}] {detail['rule_id']} in {detail['file']}:{detail['line']}")

    if args.fail_on_findings and not quality_gate["passed"]:
        sys.exit(1)

    failed_scans = [sr.tool for sr in scan_results if sr.exit_code not in (0, 1)]
    if args.fail_on_scan_error and failed_scans:
        print(f"[FAIL] Scanner(s) did not complete: {', '.join(failed_scans)}")
        sys.exit(2)


if __name__ == "__main__":
    main()