# Anthropic-Cybersecurity-Skills/skills/integrating-sast-into-github-actions-pipeline/scripts/process.py

#!/usr/bin/env python3
"""
SAST Pipeline Orchestration Script

Runs CodeQL and Semgrep scans, aggregates SARIF results, evaluates quality gates,
and produces a consolidated report. Designed to be invoked from GitHub Actions
or any CI/CD platform.

Usage:
    python process.py --repo-path /path/to/repo --output report.json
    python process.py --repo-path . --severity-threshold high --fail-on-findings
"""

import argparse
import json
import os
import subprocess
import sys
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional


@dataclass
class ScanFinding:
    """One finding extracted from a SARIF result, in tool-independent form."""

    # Identifier of the rule that produced the finding ("unknown" if the
    # SARIF result carries none).
    rule_id: str
    # Normalized severity label: "critical", "high", "medium", "low" or "none".
    severity: str
    # Human-readable message text of the result.
    message: str
    # URI of the affected artifact as given in the SARIF location.
    file_path: str
    # First and last line of the reported region (0 when not provided).
    start_line: int
    end_line: int
    # Label of the tool the finding is attributed to.
    tool: str
    # First CWE tag of the rule, or "" when the rule has none.
    cwe: str = ""
    # First OWASP tag of the rule, or "" when the rule has none.
    owasp: str = ""
    # Line-hash fingerprint of the result, or "" when not provided.
    fingerprint: str = ""


@dataclass
class ScanResult:
    """Outcome of one scanner invocation (or of one parsed SARIF file)."""

    # Label of the tool, e.g. "semgrep" or "codeql-<language>".
    tool: str
    # ScanFinding objects produced by the scan.
    findings: list = field(default_factory=list)
    # Number of rules listed in the tool's SARIF output.
    rules_evaluated: int = 0
    # Wall-clock duration of the scan in seconds.
    scan_duration_seconds: float = 0.0
    # Exit code of the scanner process; -1 is used when the process timed out
    # or its binary could not be found.
    exit_code: int = 0
    # Description of what went wrong; empty when nothing was recorded.
    error_message: str = ""


# Rank of each normalized severity label; a lower number is more severe.
# Used to compare findings against the quality-gate threshold and to sort
# findings in the report.
SEVERITY_ORDER = {"critical": 0, "high": 1, "medium": 2, "low": 3, "note": 4, "none": 5}


def run_semgrep(repo_path: str, config: str = "auto", extra_configs: Optional[list] = None) -> ScanResult:
    """Run Semgrep scan and return structured results.

    Invokes ``semgrep ci`` inside ``repo_path`` with a 600 second timeout and
    has it write SARIF to ``<repo_path>/semgrep-results.sarif``, which is then
    parsed into findings.

    Args:
        repo_path: Directory to scan; also used as the working directory.
        config: Value for the primary ``--config`` option.
        extra_configs: Additional values, each passed as its own ``--config``.

    Returns:
        A ``ScanResult`` for tool ``"semgrep"``. Exit codes 0 and 1 are
        treated as a completed scan. Any other exit code, a timeout, a missing
        binary or repository directory, or unreadable SARIF output yields a
        result with ``error_message`` set and no findings; ``exit_code`` is
        -1 when the failure is not a process exit code.
    """
    result = ScanResult(tool="semgrep")
    sarif_output = os.path.join(repo_path, "semgrep-results.sarif")

    # `--json` is intentionally not passed: its output was never consumed
    # here and it competes with `--sarif` for the single output format.
    cmd = [
        "semgrep", "ci",
        "--config", config,
        "--sarif",
        "--output", sarif_output,
        "--quiet",
    ]
    for cfg in extra_configs or []:
        cmd.extend(["--config", cfg])

    start_time = datetime.now(timezone.utc)

    try:
        proc = subprocess.run(
            cmd,
            cwd=repo_path,
            capture_output=True,
            text=True,
            timeout=600,
        )
    except subprocess.TimeoutExpired:
        result.error_message = "Semgrep scan timed out after 600 seconds"
        result.exit_code = -1
        return result
    except FileNotFoundError:
        # Raised for a missing executable and for a missing working
        # directory alike; report the one that actually applies.
        if os.path.exists(repo_path):
            result.error_message = "semgrep binary not found. Install with: pip install semgrep"
        else:
            result.error_message = f"Repository path not found: {repo_path}"
        result.exit_code = -1
        return result
    finally:
        # Record the elapsed time on every path, including failures.
        result.scan_duration_seconds = (datetime.now(timezone.utc) - start_time).total_seconds()

    result.exit_code = proc.returncode
    if proc.returncode not in (0, 1):
        # Never leave the message empty, otherwise the failure is invisible.
        result.error_message = proc.stderr[:500] or f"semgrep exited with code {proc.returncode}"
        return result

    if os.path.exists(sarif_output):
        try:
            findings = parse_sarif(sarif_output, "semgrep")
            with open(sarif_output, "r", encoding="utf-8") as f:
                sarif_data = json.load(f)
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError is a ValueError.
            result.error_message = f"Could not read Semgrep SARIF output: {exc}"
            result.exit_code = -1
            return result

        result.findings = findings
        # Sum across runs instead of keeping only the last run's count.
        result.rules_evaluated = sum(
            len(run.get("tool", {}).get("driver", {}).get("rules", []))
            for run in sarif_data.get("runs", [])
        )

    return result


def run_codeql_query(repo_path: str, language: str, database_path: str) -> ScanResult:
    """Run CodeQL analysis on a pre-created database and return structured results.

    Invokes ``codeql database analyze`` with the ``<language>-security-extended``
    query suite inside ``repo_path`` (1200 second timeout) and has it write
    SARIF to ``<repo_path>/codeql-<language>-results.sarif``.

    Args:
        repo_path: Working directory for the command and location of the
            SARIF output file.
        language: CodeQL language name used to select the query suite.
        database_path: Path to an existing CodeQL database.

    Returns:
        A ``ScanResult`` for tool ``"codeql-<language>"``. A non-zero exit
        code, a timeout, a missing binary or repository directory, or
        unreadable SARIF output yields a result with ``error_message`` set and
        no findings; ``exit_code`` is -1 when the failure is not a process
        exit code.
    """
    result = ScanResult(tool=f"codeql-{language}")
    sarif_output = os.path.join(repo_path, f"codeql-{language}-results.sarif")

    cmd = [
        "codeql", "database", "analyze",
        database_path,
        f"codeql/{language}-queries:codeql-suites/{language}-security-extended.qls",
        "--format=sarifv2.1.0",
        f"--output={sarif_output}",
        "--threads=0",
    ]

    start_time = datetime.now(timezone.utc)

    try:
        proc = subprocess.run(
            cmd,
            cwd=repo_path,
            capture_output=True,
            text=True,
            timeout=1200,
        )
    except subprocess.TimeoutExpired:
        result.error_message = "CodeQL analysis timed out after 1200 seconds"
        result.exit_code = -1
        return result
    except FileNotFoundError:
        # Raised for a missing executable and for a missing working
        # directory alike; report the one that actually applies.
        if os.path.exists(repo_path):
            result.error_message = "codeql binary not found. Install from https://github.com/github/codeql-cli-binaries"
        else:
            result.error_message = f"Repository path not found: {repo_path}"
        result.exit_code = -1
        return result
    finally:
        # Record the elapsed time on every path, including failures.
        result.scan_duration_seconds = (datetime.now(timezone.utc) - start_time).total_seconds()

    result.exit_code = proc.returncode
    if proc.returncode != 0:
        # Stop here: a SARIF file found at the output path after a failed
        # analysis may be left over from an earlier run.
        result.error_message = proc.stderr[:500] or f"codeql exited with code {proc.returncode}"
        return result

    if os.path.exists(sarif_output):
        try:
            findings = parse_sarif(sarif_output, f"codeql-{language}")
            with open(sarif_output, "r", encoding="utf-8") as f:
                sarif_data = json.load(f)
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError is a ValueError.
            result.error_message = f"Could not read CodeQL SARIF output: {exc}"
            result.exit_code = -1
            return result

        result.findings = findings
        # Sum across runs, and include rules that the SARIF lists under tool
        # extensions (query packs) rather than under the driver.
        rules_total = 0
        for run in sarif_data.get("runs", []):
            tool = run.get("tool", {})
            rules_total += len(tool.get("driver", {}).get("rules", []))
            for extension in tool.get("extensions", []):
                rules_total += len(extension.get("rules", []))
        result.rules_evaluated = rules_total

    return result


def _severity_from_security_score(raw_score) -> Optional[str]:
    """Translate a rule's ``security-severity`` value into a severity label.

    Returns ``None`` when the value is missing, empty or not a number, so the
    caller can fall back to the level-based severity.
    """
    if not raw_score:
        return None
    try:
        score = float(raw_score)
    except (TypeError, ValueError):
        return None
    if score >= 9.0:
        return "critical"
    if score >= 7.0:
        return "high"
    if score >= 4.0:
        return "medium"
    return "low"


def parse_sarif(sarif_path: str, tool_name: str) -> list:
    """Parse a SARIF file and extract findings as ScanFinding objects.

    Severity is taken from the rule's ``security-severity`` score when it is
    a usable number, otherwise from the result's ``level`` (falling back to
    the rule's default level, then to ``warning``).

    Args:
        sarif_path: Path of the SARIF (JSON) file to read.
        tool_name: Label stored in each finding's ``tool`` field.

    Returns:
        A list of ``ScanFinding`` objects, one per SARIF result, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    level_to_severity = {"error": "high", "warning": "medium", "note": "low", "none": "none"}
    findings = []

    with open(sarif_path, "r", encoding="utf-8") as f:
        sarif_data = json.load(f)

    for run in sarif_data.get("runs", []):
        tool = run.get("tool", {})
        # Rules may be declared on the driver or on tool extensions.
        rule_defs = list(tool.get("driver", {}).get("rules", []))
        for extension in tool.get("extensions", []):
            rule_defs.extend(extension.get("rules", []))

        # Index the rule metadata once so each result is a dictionary lookup
        # instead of a scan over every rule.
        rules_map = {}
        for rule in rule_defs:
            properties = rule.get("properties", {})
            tags = [t for t in properties.get("tags", []) if isinstance(t, str)]

            cwe = next((t for t in tags if t.startswith("CWE")), "")
            if not cwe:
                # CodeQL spells CWE tags as "external/cwe/cwe-079".
                cwe = next(
                    (t.rsplit("/", 1)[-1].upper() for t in tags
                     if t.lower().startswith("external/cwe/cwe-")),
                    "",
                )

            # The first definition of a rule id wins.
            rules_map.setdefault(rule.get("id", ""), {
                "cwe": cwe,
                "owasp": next((t for t in tags if "owasp" in t.lower()), ""),
                "severity": rule.get("defaultConfiguration", {}).get("level", "warning"),
                "security_severity": properties.get("security-severity"),
            })

        for result in run.get("results", []):
            rule_id = result.get("ruleId") or result.get("rule", {}).get("id") or "unknown"
            rule_info = rules_map.get(rule_id, {})

            level = result.get("level", rule_info.get("severity", "warning"))
            severity = (
                _severity_from_security_score(rule_info.get("security_severity"))
                or level_to_severity.get(level, "medium")
            )

            locations = result.get("locations") or [{}]
            physical = locations[0].get("physicalLocation", {})
            artifact = physical.get("artifactLocation", {})
            region = physical.get("region", {})

            # The line hash is normally published under partialFingerprints;
            # the original "fingerprints" location is still honoured first.
            fingerprint = ""
            for container in ("fingerprints", "partialFingerprints"):
                line_hash = (result.get(container) or {}).get("primaryLocationLineHash")
                if line_hash:
                    fingerprint = str(line_hash)
                    break

            findings.append(ScanFinding(
                rule_id=rule_id,
                severity=severity,
                message=result.get("message", {}).get("text", ""),
                file_path=artifact.get("uri", "unknown"),
                start_line=region.get("startLine", 0),
                end_line=region.get("endLine", region.get("startLine", 0)),
                tool=tool_name,
                cwe=rule_info.get("cwe", ""),
                owasp=rule_info.get("owasp", ""),
                fingerprint=fingerprint,
            ))

    return findings


def evaluate_quality_gate(findings: list, severity_threshold: str) -> dict:
    """Decide whether the findings pass the gate for the given threshold.

    A finding blocks the gate when its severity ranks at or above the
    threshold in ``SEVERITY_ORDER``. An unrecognised threshold is treated
    like "high"; an unrecognised finding severity ranks lowest.

    Returns:
        A dict with ``passed``, ``threshold``, ``total_findings``,
        ``blocking_findings`` (a count), ``severity_counts`` and
        ``blocking_details`` (one summary dict per blocking finding, with the
        message cut to 200 characters).
    """
    cutoff = SEVERITY_ORDER.get(severity_threshold.lower(), 1)

    blockers = [
        finding for finding in findings
        if SEVERITY_ORDER.get(finding.severity.lower(), 5) <= cutoff
    ]

    # Count findings per lower-cased severity label.
    tally = {}
    for finding in findings:
        label = finding.severity.lower()
        tally[label] = 1 + tally.get(label, 0)

    details = []
    for blocker in blockers:
        details.append({
            "rule_id": blocker.rule_id,
            "severity": blocker.severity,
            "file": blocker.file_path,
            "line": blocker.start_line,
            "tool": blocker.tool,
            "message": blocker.message[:200],
        })

    return {
        "passed": not blockers,
        "threshold": severity_threshold,
        "total_findings": len(findings),
        "blocking_findings": len(blockers),
        "severity_counts": tally,
        "blocking_details": details,
    }


def generate_report(scan_results: list, quality_gate: dict, repo_path: str) -> dict:
    """Assemble the consolidated report dictionary.

    Args:
        scan_results: Scan result objects whose findings are merged.
        quality_gate: Gate evaluation, embedded unchanged.
        repo_path: Repository path recorded in the metadata.

    Returns:
        A dict with ``report_metadata``, ``scan_summary`` (one entry per
        scan), ``quality_gate``, ``top_cwes`` (up to ten ``(cwe, count)``
        pairs, most frequent first) and ``findings`` (most severe first,
        messages cut to 300 characters).
    """
    merged = [finding for scan in scan_results for finding in scan.findings]

    # Tally findings per CWE, ignoring findings without one.
    cwe_tally = {}
    for finding in merged:
        if not finding.cwe:
            continue
        cwe_tally[finding.cwe] = 1 + cwe_tally.get(finding.cwe, 0)

    def severity_rank(finding):
        return SEVERITY_ORDER.get(finding.severity.lower(), 5)

    metadata = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "repository": repo_path,
        "report_version": "1.0.0",
    }

    summaries = []
    for scan in scan_results:
        completed = scan.exit_code in (0, 1)
        summaries.append({
            "tool": scan.tool,
            "findings_count": len(scan.findings),
            "rules_evaluated": scan.rules_evaluated,
            "duration_seconds": scan.scan_duration_seconds,
            "status": "success" if completed else "error",
            "error": scan.error_message,
        })

    # Stable sort: CWEs with equal counts keep first-seen order.
    ranked_cwes = sorted(cwe_tally.items(), key=lambda entry: -entry[1])[:10]

    finding_rows = []
    for finding in sorted(merged, key=severity_rank):
        finding_rows.append({
            "rule_id": finding.rule_id,
            "severity": finding.severity,
            "tool": finding.tool,
            "file": finding.file_path,
            "line": finding.start_line,
            "cwe": finding.cwe,
            "owasp": finding.owasp,
            "message": finding.message[:300],
        })

    return {
        "report_metadata": metadata,
        "scan_summary": summaries,
        "quality_gate": quality_gate,
        "top_cwes": ranked_cwes,
        "findings": finding_rows,
    }


def main():
    """Command-line entry point.

    Parses the arguments, then either parses the SARIF files given with
    ``--sarif-only`` or runs Semgrep and (when both a language and a database
    path are given) CodeQL. The merged findings are checked against the
    quality gate, the consolidated report is written as JSON to ``--output``
    and a summary is printed.

    Exits with status 1 when ``--fail-on-findings`` is set and the gate
    fails, and with status 2 when a ``--sarif-only`` file cannot be read.
    """
    parser = argparse.ArgumentParser(description="SAST Pipeline Orchestration")
    parser.add_argument("--repo-path", required=True, help="Path to the repository to scan")
    parser.add_argument("--output", default="sast-report.json", help="Output report file path")
    parser.add_argument("--severity-threshold", default="high",
                        choices=["critical", "high", "medium", "low"],
                        help="Minimum severity to block pipeline")
    parser.add_argument("--fail-on-findings", action="store_true",
                        help="Exit with non-zero code if quality gate fails")
    parser.add_argument("--semgrep-config", default="auto",
                        help="Semgrep configuration (default: auto)")
    parser.add_argument("--semgrep-extra-configs", nargs="*",
                        help="Additional Semgrep config paths")
    parser.add_argument("--skip-semgrep", action="store_true", help="Skip Semgrep scan")
    parser.add_argument("--skip-codeql", action="store_true", help="Skip CodeQL scan")
    parser.add_argument("--codeql-language", default=None, help="Language for CodeQL analysis")
    parser.add_argument("--codeql-db-path", default=None, help="Path to CodeQL database")
    parser.add_argument("--sarif-only", nargs="*",
                        help="Only parse existing SARIF files instead of running scans")
    args = parser.parse_args()

    repo_path = os.path.abspath(args.repo_path)
    scan_results = []

    if args.sarif_only:
        for sarif_file in args.sarif_only:
            tool_name = Path(sarif_file).stem
            try:
                findings = parse_sarif(sarif_file, tool_name)
            except (OSError, ValueError) as exc:
                # Still a hard failure (json.JSONDecodeError is a ValueError),
                # but with a readable message instead of a traceback.
                print(f"[ERROR] Cannot read SARIF file {sarif_file}: {exc}", file=sys.stderr)
                sys.exit(2)
            scan_results.append(ScanResult(tool=tool_name, findings=findings))
    else:
        if not args.skip_semgrep:
            print("[*] Running Semgrep scan...")
            semgrep_result = run_semgrep(
                repo_path,
                config=args.semgrep_config,
                extra_configs=args.semgrep_extra_configs
            )
            scan_results.append(semgrep_result)
            print(f"    Found {len(semgrep_result.findings)} findings in {semgrep_result.scan_duration_seconds:.1f}s")

            if semgrep_result.error_message:
                print(f"    Warning: {semgrep_result.error_message}")

        if not args.skip_codeql:
            if args.codeql_language and args.codeql_db_path:
                print(f"[*] Running CodeQL analysis for {args.codeql_language}...")
                codeql_result = run_codeql_query(repo_path, args.codeql_language, args.codeql_db_path)
                scan_results.append(codeql_result)
                print(f"    Found {len(codeql_result.findings)} findings in {codeql_result.scan_duration_seconds:.1f}s")

                # Surface CodeQL problems the same way as Semgrep problems.
                if codeql_result.error_message:
                    print(f"    Warning: {codeql_result.error_message}")
            elif args.codeql_language or args.codeql_db_path:
                # Only one of the two options was given: say why CodeQL is skipped.
                print("[!] Skipping CodeQL: both --codeql-language and --codeql-db-path are required")

    all_findings = []
    for sr in scan_results:
        all_findings.extend(sr.findings)

    quality_gate = evaluate_quality_gate(all_findings, args.severity_threshold)

    report = generate_report(scan_results, quality_gate, repo_path)

    output_path = os.path.abspath(args.output)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print(f"\n[*] Report written to {output_path}")

    # A scan that errored contributes no findings, so make that visible next
    # to the gate verdict.
    # NOTE(review): scanner errors do not change the exit status; consider
    # whether the gate should fail closed in that case.
    failed_tools = [sr.tool for sr in scan_results if sr.error_message]
    if failed_tools:
        print(f"[WARN] Scan errors reported by: {', '.join(failed_tools)}. Findings may be incomplete.")

    if quality_gate["passed"]:
        print(f"[PASS] Quality gate passed. {quality_gate['total_findings']} findings, none blocking.")
    else:
        print(f"[FAIL] Quality gate failed. {quality_gate['blocking_findings']} blocking findings:")
        for detail in quality_gate["blocking_details"]:
            print(f"  - [{detail['severity'].upper()}] {detail['rule_id']} in {detail['file']}:{detail['line']}")

    if args.fail_on_findings and not quality_gate["passed"]:
        sys.exit(1)


# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()