Files
Anthropic-Cybersecurity-Skills/skills/implementing-secret-scanning-with-gitleaks/scripts/process.py
T

300 lines
9.8 KiB
Python

#!/usr/bin/env python3
"""
Gitleaks Secret Scanning Pipeline Script
Runs Gitleaks scans, manages baselines, evaluates findings,
and generates remediation reports.
Usage:
python process.py --repo-path /path/to/repo --scan-type detect
python process.py --repo-path . --scan-type protect --staged
python process.py --repo-path . --baseline .gitleaks-baseline.json --output report.json
"""
import argparse
import json
import os
import subprocess
import sys
import tempfile
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional
@dataclass
class SecretFinding:
    """A single secret reported by Gitleaks, in the form this script works with.

    Instances are built by ``parse_findings`` from the raw Gitleaks JSON
    entries; the per-field notes below describe what that function stores.
    """

    # --- which rule fired ---
    rule_id: str          # Gitleaks "RuleID" ("unknown" when absent)
    description: str      # Gitleaks "Description"

    # --- where it was found ---
    file: str             # Gitleaks "File"
    line: int             # Gitleaks "StartLine"
    commit: str           # first 8 characters of Gitleaks "Commit"
    author: str           # Gitleaks "Author"
    date: str             # Gitleaks "Date", kept as the raw string

    # --- what was found ---
    secret: str           # already redacted via redact_secret()
    entropy: float        # Gitleaks "Entropy"
    match: str            # first 100 characters of Gitleaks "Match"
    tags: list = field(default_factory=list)  # fresh list per instance

    # Flipped to False by classify_findings() when the finding is in the baseline.
    is_new: bool = True
@dataclass
class ScanResult:
    """Mutable container for the outcome of one scan run.

    Every field has a default, so ``ScanResult()`` yields an empty result
    that the caller fills in attribute by attribute.
    """

    # All parsed findings, then the same findings split by baseline membership.
    findings: list = field(default_factory=list)
    new_findings: list = field(default_factory=list)
    baseline_findings: list = field(default_factory=list)

    # Scan statistics; both are copied into the report metadata.
    # NOTE(review): nothing in the visible code assigns commits_scanned,
    # so it stays at its default of 0 -- confirm whether that is intended.
    commits_scanned: int = 0
    scan_duration: float = 0.0  # seconds

    # NOTE(review): not read or written anywhere in the visible code.
    error: str = ""
def run_gitleaks(repo_path: str, scan_type: str = "detect",
                 baseline_path: Optional[str] = None,
                 commit_range: Optional[str] = None,
                 staged: bool = False,
                 config_path: Optional[str] = None) -> dict:
    """Execute Gitleaks and return its JSON findings.

    The JSON report is written to a private temporary file and read back
    from there. Routing the report through the process's stdout is fragile:
    ``/dev/stdout`` does not exist on every platform, and anything else the
    tool prints to stdout ends up interleaved with the JSON.

    The function fails closed: if Gitleaks exits non-zero without a report,
    or the report is not valid JSON, the result carries ``exit_code == -1``
    instead of an empty findings list, so a broken scan is never mistaken
    for a clean one.

    Args:
        repo_path: Directory passed to Gitleaks as ``--source``.
        scan_type: Gitleaks sub-command to run (e.g. ``detect``/``protect``).
        baseline_path: Optional baseline report; forwarded as
            ``--baseline-path`` only when the file exists.
        commit_range: Optional value forwarded as ``--log-opts``.
        staged: When true, ``--staged`` is appended to the command.
        config_path: Optional config file forwarded as ``--config``.

    Returns:
        A dict with three keys:
          * ``"findings"``: list of raw finding dicts (empty on failure or
            when the report is not a JSON list),
          * ``"exit_code"``: the Gitleaks exit status, or ``-1`` when the
            scan could not be run or its report could not be used,
          * ``"stderr"``: Gitleaks' stderr, or an error description when
            ``exit_code`` is ``-1``.
    """
    with tempfile.TemporaryDirectory(prefix="gitleaks-") as work_dir:
        report_path = os.path.join(work_dir, "report.json")

        cmd = ["gitleaks", scan_type, "--source", repo_path,
               "--report-format", "json", "--report-path", report_path]
        if baseline_path and os.path.exists(baseline_path):
            cmd.extend(["--baseline-path", baseline_path])
        if commit_range:
            cmd.extend(["--log-opts", commit_range])
        if staged:
            cmd.append("--staged")
        if config_path:
            cmd.extend(["--config", config_path])
        # --verbose is intentionally not passed: the report file is the only
        # source of findings here, and stdout is captured but never used.

        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=300,
            )
        except subprocess.TimeoutExpired:
            return {"findings": [], "exit_code": -1,
                    "stderr": "Scan timed out after 300s"}
        except FileNotFoundError:
            return {"findings": [], "exit_code": -1,
                    "stderr": "gitleaks not found. Install from https://github.com/gitleaks/gitleaks"}

        # Read the report while the temporary directory still exists.
        try:
            with open(report_path, "r", encoding="utf-8") as report_file:
                report_text = report_file.read()
        except OSError:
            report_text = ""

    if not report_text.strip():
        if proc.returncode != 0:
            # Non-zero exit with no report means the scan itself failed
            # (bad flag, not a repository, ...), not that it found nothing.
            detail = proc.stderr.strip()
            message = f"gitleaks exited with code {proc.returncode} without producing a report"
            if detail:
                message = f"{message}: {detail}"
            return {"findings": [], "exit_code": -1, "stderr": message}
        findings = []
    else:
        try:
            findings = json.loads(report_text)
        except json.JSONDecodeError as exc:
            # Never treat an unreadable report as "no secrets found".
            return {"findings": [], "exit_code": -1,
                    "stderr": f"Could not parse Gitleaks JSON report: {exc}"}

    return {
        "findings": findings if isinstance(findings, list) else [],
        "exit_code": proc.returncode,
        "stderr": proc.stderr,
    }
def parse_findings(raw_findings: list) -> list:
    """Convert raw Gitleaks JSON entries into ``SecretFinding`` objects.

    Each entry is read with ``dict.get`` so missing keys fall back to a
    default. The secret is redacted before it is stored, the commit hash is
    shortened to 8 characters and the match text to 100 characters.

    Args:
        raw_findings: List of dicts as found in a Gitleaks JSON report.

    Returns:
        A list of ``SecretFinding`` instances, one per input entry, in order.
    """
    return [
        SecretFinding(
            # The secret is redacted first so the raw value is never stored.
            secret=redact_secret(raw.get("Secret", "")),
            rule_id=raw.get("RuleID", "unknown"),
            description=raw.get("Description", ""),
            file=raw.get("File", ""),
            line=raw.get("StartLine", 0),
            commit=raw.get("Commit", "")[:8],
            author=raw.get("Author", ""),
            date=raw.get("Date", ""),
            entropy=raw.get("Entropy", 0.0),
            match=raw.get("Match", "")[:100],
            tags=raw.get("Tags", []),
        )
        for raw in raw_findings
    ]
def redact_secret(secret: str) -> str:
    """Return a display-safe preview of *secret*.

    Secrets longer than 12 characters keep their first and last 4
    characters with ``...`` in between; anything shorter is replaced
    entirely by asterisks of the same length.
    """
    size = len(secret)
    if size > 12:
        head, tail = secret[:4], secret[-4:]
        return "...".join((head, tail))
    return "*" * size
def load_baseline(baseline_path: str) -> set:
    """Load baseline fingerprints for comparison.

    A fingerprint has the form ``"<RuleID>:<File>:<Commit[:8]>"``. The
    commit is shortened to 8 characters because ``parse_findings`` stores
    ``SecretFinding.commit`` in that shortened form and
    ``classify_findings`` builds its lookup key from it; using the full
    hash here would make baseline entries with a real commit never match.

    Loading is best-effort: a missing or unreadable file, invalid JSON, or
    a top-level value that is not a list all yield an empty set, and
    entries that are not JSON objects are skipped.

    Args:
        baseline_path: Path to a Gitleaks JSON report used as the baseline.

    Returns:
        The set of fingerprint strings found in the baseline.
    """
    if not os.path.exists(baseline_path):
        return set()
    try:
        with open(baseline_path, "r", encoding="utf-8") as handle:
            baseline = json.load(handle)
    except (OSError, json.JSONDecodeError, UnicodeDecodeError):
        return set()

    if not isinstance(baseline, list):
        return set()

    fingerprints = set()
    for entry in baseline:
        if not isinstance(entry, dict):
            continue
        # `or ""` also covers an explicit JSON null for the commit.
        short_commit = str(entry.get("Commit") or "")[:8]
        fingerprints.add(
            f"{entry.get('RuleID', '')}:{entry.get('File', '')}:{short_commit}"
        )
    return fingerprints
def classify_findings(findings: list, baseline_fingerprints: set) -> tuple:
    """Split findings into those absent from and those present in the baseline.

    Each finding is keyed as ``"<rule_id>:<file>:<commit>"`` and looked up
    in *baseline_fingerprints*. The finding's ``is_new`` attribute is
    updated in place to reflect the outcome.

    Args:
        findings: Objects exposing ``rule_id``, ``file``, ``commit`` and a
            writable ``is_new`` attribute.
        baseline_fingerprints: Set of fingerprint strings to match against.

    Returns:
        A ``(new_findings, baseline_findings)`` tuple of lists, each in the
        original input order.
    """
    unseen, known = [], []
    for finding in findings:
        fingerprint = f"{finding.rule_id}:{finding.file}:{finding.commit}"
        finding.is_new = fingerprint not in baseline_fingerprints
        bucket = unseen if finding.is_new else known
        bucket.append(finding)
    return unseen, known
def generate_report(scan_result: "ScanResult", repo_path: str) -> dict:
    """Generate a structured, JSON-serialisable scan report.

    Args:
        scan_result: Object exposing ``findings``, ``new_findings``,
            ``baseline_findings``, ``scan_duration`` and ``commits_scanned``.
        repo_path: Repository path recorded in the report metadata.

    Returns:
        A dict with the sections ``report_metadata``, ``summary``,
        ``quality_gate`` (passes only when there are no new findings),
        ``new_findings`` (one entry per new finding) and
        ``remediation_steps`` (one sequentially numbered step per new
        finding).
    """
    new_findings = scan_result.new_findings

    # Count every finding per rule, but only new findings per author.
    rule_summary = {}
    for finding in scan_result.findings:
        rule_summary[finding.rule_id] = rule_summary.get(finding.rule_id, 0) + 1

    author_summary = {}
    for finding in new_findings:
        author_summary[finding.author] = author_summary.get(finding.author, 0) + 1

    return {
        "report_metadata": {
            "repository": repo_path,
            "scan_date": datetime.now(timezone.utc).isoformat(),
            "duration_seconds": scan_result.scan_duration,
            "commits_scanned": scan_result.commits_scanned,
        },
        "summary": {
            "total_findings": len(scan_result.findings),
            "new_findings": len(new_findings),
            "baseline_findings": len(scan_result.baseline_findings),
            "unique_rules_triggered": len(rule_summary),
            "rules_breakdown": rule_summary,
            "authors_with_new_findings": author_summary,
        },
        "quality_gate": {
            "passed": len(new_findings) == 0,
            "blocking_count": len(new_findings),
        },
        "new_findings": [
            {
                "rule_id": finding.rule_id,
                "description": finding.description,
                "file": finding.file,
                "line": finding.line,
                "commit": finding.commit,
                "author": finding.author,
                "date": finding.date,
                "secret_preview": finding.secret,
                "entropy": finding.entropy,
            }
            for finding in new_findings
        ],
        # Number the steps 1, 2, 3, ... rather than labelling each one "1.".
        "remediation_steps": [
            f"{step}. Rotate the {finding.rule_id} credential found in {finding.file}"
            for step, finding in enumerate(new_findings, start=1)
        ],
    }
def main():
    """Command-line entry point for the Gitleaks scanning pipeline.

    Parses the CLI options, runs the scan, optionally splits findings
    against a baseline, and then either writes a new baseline file
    (``--create-baseline``) or writes the JSON report and prints a summary.

    Exit status: 2 when the scan reports ``exit_code == -1``; 1 when
    ``--fail-on-findings`` is set and new findings exist; otherwise the
    function returns normally.
    """
    cli = argparse.ArgumentParser(description="Gitleaks Secret Scanning Pipeline")
    cli.add_argument("--repo-path", required=True, help="Path to git repository")
    cli.add_argument("--scan-type", default="detect", choices=["detect", "protect"],
                     help="Scan type: detect (history) or protect (staged/pre-commit)")
    cli.add_argument("--baseline", default=None, help="Path to baseline JSON file")
    cli.add_argument("--commit-range", default=None,
                     help="Git log range (e.g., HEAD~10..HEAD)")
    cli.add_argument("--staged", action="store_true",
                     help="Scan only staged changes (for pre-commit)")
    cli.add_argument("--config", default=None, help="Path to .gitleaks.toml config")
    cli.add_argument("--output", default="gitleaks-report.json", help="Output report path")
    cli.add_argument("--fail-on-findings", action="store_true",
                     help="Exit non-zero on new findings")
    cli.add_argument("--create-baseline", action="store_true",
                     help="Generate baseline from current findings")
    opts = cli.parse_args()

    target = os.path.abspath(opts.repo_path)
    started_at = datetime.now(timezone.utc)
    print(f"[*] Running Gitleaks {opts.scan_type} on {target}")

    scan_output = run_gitleaks(
        target,
        scan_type=opts.scan_type,
        baseline_path=opts.baseline,
        commit_range=opts.commit_range,
        staged=opts.staged,
        config_path=opts.config,
    )
    # -1 is the sentinel run_gitleaks uses when the scan could not be run.
    if scan_output["exit_code"] == -1:
        print(f"[ERROR] {scan_output['stderr']}")
        sys.exit(2)

    raw_findings = scan_output["findings"]
    outcome = ScanResult()
    outcome.findings = parse_findings(raw_findings)
    elapsed = datetime.now(timezone.utc) - started_at
    outcome.scan_duration = elapsed.total_seconds()

    if not opts.baseline:
        # Without a baseline every finding counts as new.
        outcome.new_findings = outcome.findings
        outcome.baseline_findings = []
    else:
        known_fingerprints = load_baseline(opts.baseline)
        fresh, preexisting = classify_findings(outcome.findings, known_fingerprints)
        outcome.new_findings = fresh
        outcome.baseline_findings = preexisting

    if opts.create_baseline:
        # Baseline mode stores the raw Gitleaks entries and skips the report.
        baseline_file = os.path.join(target, ".gitleaks-baseline.json")
        with open(baseline_file, "w") as sink:
            json.dump(raw_findings, sink, indent=2)
        print(f"[*] Baseline created: {baseline_file}")
        print(f" Contains {len(raw_findings)} findings")
        return

    report = generate_report(outcome, target)
    report_file = os.path.abspath(opts.output)
    with open(report_file, "w") as sink:
        json.dump(report, sink, indent=2)
    print(f"[*] Report: {report_file}")

    total_count = len(outcome.findings)
    new_count = len(outcome.new_findings)
    baseline_count = len(outcome.baseline_findings)
    print(f"\n[*] Total: {total_count} | "
          f"New: {new_count} | "
          f"Baseline: {baseline_count}")

    if outcome.new_findings:
        print("\n[!] New secrets detected:")
        for item in outcome.new_findings:
            print(f" [{item.rule_id}] {item.file}:{item.line} (commit: {item.commit}, author: {item.author})")
            print(f" Secret: {item.secret}")

    if report["quality_gate"]["passed"]:
        print("\n[PASS] No new secrets detected.")
        return

    print(f"\n[FAIL] {new_count} new secrets found. Rotate immediately.")
    if opts.fail_on_findings:
        sys.exit(1)


if __name__ == "__main__":
    main()