diff --git a/tools/README.md b/tools/README.md new file mode 100644 index 00000000..19e8c64b --- /dev/null +++ b/tools/README.md @@ -0,0 +1,30 @@ +# Skill Validation Tools + +## validate-skill.py + +Validate SKILL.md metadata before submitting a PR. + +### Usage + +```bash +# Validate a single skill +python tools/validate-skill.py skills/my-new-skill/ + +# Validate all skills +python tools/validate-skill.py --all +``` + +### What it checks + +- SKILL.md exists in the skill directory +- Valid YAML frontmatter (between `---` markers) +- Required fields present: `name`, `description`, `domain`, `subdomain`, `tags` +- Name is kebab-case, 1–64 characters +- Description is at least 50 characters (no upper limit; multi-line folded scalars are valid) +- Domain is `cybersecurity` +- Subdomain is from the allowed list +- Tags is a list with at least 2 items + +### Requirements + +Python 3.8+ (stdlib only, no external dependencies) diff --git a/tools/validate-skill.py b/tools/validate-skill.py new file mode 100755 index 00000000..9932abc8 --- /dev/null +++ b/tools/validate-skill.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +"""Validate SKILL.md metadata for the Anthropic-Cybersecurity-Skills repository. + +Usage: + python tools/validate-skill.py skills/my-skill/ + python tools/validate-skill.py --all +""" +import os +import re +import sys +import glob + +REQUIRED_FIELDS = ["name", "description", "domain", "subdomain", "tags"] + +# Canonical subdomain → set of accepted aliases (including canonical itself). +# When a skill uses an alias, the validator accepts it but the canonical form +# is the first entry in each group below. New skills should use the canonical. 
_SUBDOMAIN_ALIASES = {
    # identity
    "identity-access-management": {
        "identity-access-management",
        "identity-and-access-management",
        "identity-security",
    },
    # zero-trust
    "zero-trust-architecture": {"zero-trust-architecture", "zero-trust"},
    # OT/ICS
    "ot-ics-security": {"ot-ics-security", "ot-security"},
    # SOC / security ops
    "soc-operations": {"soc-operations", "security-operations"},
    # red team
    "red-teaming": {"red-teaming", "red-team"},
    # remaining subdomains (most have no alias; a few accept exactly one)
    "web-application-security": {"web-application-security", "application-security"},
    "network-security": {"network-security"},
    "penetration-testing": {"penetration-testing", "offensive-security"},
    "digital-forensics": {"digital-forensics"},
    "malware-analysis": {"malware-analysis"},
    "threat-intelligence": {"threat-intelligence"},
    "cloud-security": {"cloud-security"},
    "container-security": {"container-security"},
    "cryptography": {"cryptography"},
    "vulnerability-management": {"vulnerability-management"},
    "compliance-governance": {"compliance-governance", "governance-risk-compliance"},
    "devsecops": {"devsecops"},
    "threat-hunting": {"threat-hunting"},
    "incident-response": {"incident-response"},
    "endpoint-security": {"endpoint-security"},
    "phishing-defense": {"phishing-defense", "social-engineering-defense"},
    "api-security": {"api-security"},
    "mobile-security": {"mobile-security"},
    "ransomware-defense": {"ransomware-defense"},
    "threat-detection": {"threat-detection"},
    "blockchain-security": {"blockchain-security"},
    "data-protection": {"data-protection"},
    "deception-technology": {"deception-technology"},
    "firmware-analysis": {"firmware-analysis", "firmware-security"},
    "privacy-compliance": {"privacy-compliance"},
    "purple-team": {"purple-team"},
    "supply-chain-security": {"supply-chain-security"},
    "wireless-security": {"wireless-security"},
    "ai-security": {"ai-security"},
}

# Flat set of all accepted subdomain values (canonical + aliases).
ALLOWED_SUBDOMAINS: set = {v for group in _SUBDOMAIN_ALIASES.values() for v in group}

# Reverse map: alias → canonical (for warning messages).
_ALIAS_TO_CANONICAL: dict = {
    alias: canonical
    for canonical, aliases in _SUBDOMAIN_ALIASES.items()
    for alias in aliases
}

KEBAB_RE = re.compile(r"^[a-z0-9]+(-[a-z0-9]+)*$")

# Minimum description length. Other repo tooling uses 50 chars; align here.
DESCRIPTION_MIN_CHARS = 50

RED = "\033[91m"
GREEN = "\033[92m"
YELLOW = "\033[93m"
RESET = "\033[0m"

# Line patterns used by parse_frontmatter; compiled once because they are
# applied to every frontmatter line of every skill.
_INLINE_LIST_RE = re.compile(r"^(\w[\w_-]*):\s*\[(.*)\]\s*$")
_BLOCK_SCALAR_RE = re.compile(r"^(\w[\w_-]*):\s*([>|])[+-]?\s*$")
_KEY_VALUE_RE = re.compile(r"^(\w[\w_-]*):\s*(.*)$")


def _unquote(value):
    """Strip surrounding whitespace and then any wrapping double/single quotes."""
    return value.strip().strip('"').strip("'")


def parse_frontmatter(text):
    """Extract YAML frontmatter as a dict (simple stdlib-only parser).

    Handles the common SKILL.md patterns:
      - key: scalar value
      - key: [inline, list]      (``[]`` yields an empty list)
      - key:\\n  - list\\n  - items
      - key: > / >- / >+         (folded scalar, lines joined with a space)
      - key: | / |- / |+         (literal scalar, lines joined with a newline)

    The frontmatter must open with ``---`` at the very start of the text (a
    leading UTF-8 BOM is tolerated) and is closed by the next *line* that
    starts with ``---``. Matching the closing marker per line, rather than as
    a substring, means a value that merely contains ``---`` no longer cuts the
    block short.

    Known limitations: key lines are matched after stripping indentation, so
    keys of a nested mapping are read as if they were top-level keys; inline
    ``# comments`` after a value are kept as part of the value; a key with no
    value followed by lines that are neither list items nor ``key:`` lines
    simply stays value-less.

    Returns:
        A dict mapping keys to ``str`` or ``list`` values, or ``None`` when
        the text has no opening marker or no closing marker line.
    """
    text = text.lstrip("\ufeff")
    if not text.startswith("---"):
        return None

    lines = text.split("\n")
    closing = next(
        (idx for idx, line in enumerate(lines[1:], start=1) if line.startswith("---")),
        None,
    )
    if closing is None:
        return None
    # Anything after the opening marker on the first line is kept as a body line.
    body = [lines[0][3:]] + lines[1:closing]

    data = {}
    current_key = None
    list_values = []
    block_key = None  # key of the block scalar being collected, if any
    block_sep = " "  # " " for folded (>) scalars, "\n" for literal (|) ones
    block_lines = []

    for line in body:
        stripped = line.strip()

        if block_key is not None:
            if not stripped:
                continue
            if line[0] in " \t":
                block_lines.append(stripped)
                continue
            # An unindented line ends the block scalar; store it, then fall
            # through so this line is parsed as a regular top-level line.
            if block_lines:
                data[block_key] = block_sep.join(block_lines)
            block_key = None
            block_lines = []
            current_key = None

        if not stripped or stripped.startswith("#"):
            continue

        # List items are checked before "key: value" so that an item which
        # itself contains a colon is not mistaken for a new key.
        if stripped.startswith("- ") and current_key:
            list_values.append(_unquote(stripped[2:]))
            data[current_key] = list(list_values)  # copy so later appends don't leak
            continue

        m = _INLINE_LIST_RE.match(stripped)
        if m:
            current_key = m.group(1)
            list_values = [item for item in map(_unquote, m.group(2).split(",")) if item]
            data[current_key] = list(list_values)
            continue

        m = _BLOCK_SCALAR_RE.match(stripped)
        if m:
            current_key = block_key = m.group(1)
            block_sep = " " if m.group(2) == ">" else "\n"
            block_lines = []
            list_values = []
            continue

        m = _KEY_VALUE_RE.match(stripped)
        if m:
            current_key = m.group(1)
            list_values = []  # a new key never inherits a previous key's list
            value = _unquote(m.group(2))
            if value:
                data[current_key] = value
            # An empty value leaves the key unset (e.g. the start of a block list).
            continue

    # Flush a block scalar that runs up to the closing marker.
    if block_key is not None and block_lines:
        data[block_key] = block_sep.join(block_lines)

    return data


def _string_field(fm, field, errors):
    """Return ``fm[field]`` if it is a non-empty string, else ``None``.

    A list value is recorded in *errors* instead of being returned, so callers
    never pass a list to string-only checks (regex match, set membership).
    """
    value = fm.get(field)
    if isinstance(value, list):
        errors.append(f"{field.capitalize()} must be a string value, not a list")
        return None
    return value or None


def validate_skill(skill_dir):
    """Validate a single skill directory.

    Reads ``SKILL.md`` inside *skill_dir*, parses its frontmatter and checks
    the required fields, name format and length, description length, domain,
    subdomain and tag count. A subdomain given as an accepted alias is not an
    error, but a warning is printed to stdout.

    Returns:
        A list of error strings; an empty list means the skill is valid.
    """
    skill_md = os.path.join(skill_dir, "SKILL.md")

    if not os.path.isfile(skill_md):
        return [f"SKILL.md not found in {skill_dir}"]

    try:
        with open(skill_md, encoding="utf-8") as f:
            content = f.read()
    except OSError as e:
        return [f"Could not read SKILL.md: {e}"]
    except UnicodeDecodeError as e:
        return [f"Encoding error in SKILL.md (not valid UTF-8): {e}"]

    fm = parse_frontmatter(content)
    if fm is None:
        return ["No valid YAML frontmatter found (must start with ---)"]

    # Missing fields are reported once here; the per-field checks below only
    # run for fields that are present, so nothing is reported twice.
    errors = [
        f"Missing required field: {field}"
        for field in REQUIRED_FIELDS
        if field not in fm
    ]

    # Validate name.
    name = _string_field(fm, "name", errors)
    if name:
        if not KEBAB_RE.match(name):
            errors.append(
                f"Name '{name}' is not valid kebab-case (lowercase letters, digits, hyphens only)"
            )
        if len(name) > 64:
            errors.append(f"Name too long ({len(name)} chars, max 64)")

    # Validate description. No upper limit is enforced: multi-line block
    # scalars legitimately produce long strings.
    desc = _string_field(fm, "description", errors)
    if desc and len(desc) < DESCRIPTION_MIN_CHARS:
        errors.append(
            f"Description too short ({len(desc)} chars, min {DESCRIPTION_MIN_CHARS})"
        )

    # Validate domain.
    domain = _string_field(fm, "domain", errors)
    if domain and domain != "cybersecurity":
        errors.append(f"Domain must be 'cybersecurity', got '{domain}'")

    # Validate subdomain.
    subdomain = _string_field(fm, "subdomain", errors)
    if subdomain:
        if subdomain not in ALLOWED_SUBDOMAINS:
            errors.append(
                f"Unknown subdomain '{subdomain}'. Allowed: {', '.join(sorted(ALLOWED_SUBDOMAINS))}"
            )
        else:
            canonical = _ALIAS_TO_CANONICAL.get(subdomain, subdomain)
            if subdomain != canonical:
                # Warn (non-blocking) — alias is accepted but canonical is preferred.
                print(
                    f"{YELLOW}WARN{RESET} subdomain '{subdomain}' is an alias;"
                    f" canonical form is '{canonical}'"
                )

    # Validate tags. A scalar value counts as a single tag.
    if "tags" in fm:
        tags = fm["tags"]
        if isinstance(tags, str):
            tags = [tags]
        if len(tags) < 2:
            errors.append(f"Need at least 2 tags, got {len(tags)}")

    return errors


def main():
    """Command-line entry point.

    Validates every directory named on the command line, or every
    ``skills/*/`` directory when the first argument is ``--all``. Prints one
    PASS/FAIL line per skill plus a summary, then exits with status 0 only if
    nothing failed. A requested path that is not a directory counts as a
    failure, so a mistyped path cannot produce a passing run.
    """
    args = sys.argv[1:]
    if not args:
        print(f"Usage: {sys.argv[0]} SKILL_DIR [SKILL_DIR ...] | --all")
        sys.exit(1)

    if args[0] == "--all":
        skill_dirs = sorted(glob.glob("skills/*/"))
        if not skill_dirs:
            print("ERROR: No skill directories found. Run from the repository root.")
            sys.exit(1)
    else:
        skill_dirs = args

    # glob returns entries with the platform separator, so strip both forms.
    trailing = "/" + os.sep
    passed = 0
    failed = 0

    for raw_dir in skill_dirs:
        skill_dir = raw_dir.rstrip(trailing) or raw_dir

        if not os.path.isdir(skill_dir):
            failed += 1
            print(f"{RED}FAIL{RESET} {raw_dir} — not a directory")
            continue

        errors = validate_skill(skill_dir)
        name = os.path.basename(skill_dir)
        if errors:
            failed += 1
            print(f"{RED}FAIL{RESET} {name}")
            for e in errors:
                print(f" {YELLOW}→ {e}{RESET}")
        else:
            passed += 1
            print(f"{GREEN}PASS{RESET} {name}")

    total = passed + failed
    print("\n" + "=" * 50)
    print(f"Total: {total} {GREEN}Passed: {passed}{RESET} {RED}Failed: {failed}{RESET}")

    sys.exit(0 if failed == 0 else 1)


if __name__ == "__main__":
    main()