From b53f3d49919815fafb02f7283b9b5e3722cc82a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar=20Su=C3=A1stegui?= Date: Fri, 3 Apr 2026 01:09:58 -0600 Subject: [PATCH 1/3] fix: add error handling for IOError/UnicodeDecodeError + sync ALLOWED_SUBDOMAINS - Wrap open() call in try/except for IOError and UnicodeDecodeError to report clean errors instead of crashing on encoding issues - Add all subdomains actually used by existing skills in the repo: identity-access-management (33 skills), security-operations (28), identity-and-access-management, zero-trust, ot-security, purple-team, red-team, ai-security, social-engineering-defense, and others - Remove identity-security as the canonical form is identity-access-management --- tools/README.md | 30 +++++++ tools/validate-skill.py | 182 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 212 insertions(+) create mode 100644 tools/README.md create mode 100755 tools/validate-skill.py diff --git a/tools/README.md b/tools/README.md new file mode 100644 index 00000000..edf3c13d --- /dev/null +++ b/tools/README.md @@ -0,0 +1,30 @@ +# Skill Validation Tools + +## validate-skill.py + +Validate SKILL.md metadata before submitting a PR. 
+ +### Usage + +```bash +# Validate a single skill +python tools/validate-skill.py skills/my-new-skill/ + +# Validate all skills +python tools/validate-skill.py --all +``` + +### What it checks + +- SKILL.md exists in the skill directory +- Valid YAML frontmatter (between `---` markers) +- Required fields present: `name`, `description`, `domain`, `subdomain`, `tags` +- Name is kebab-case, 1–64 characters +- Description is 20–500 characters +- Domain is `cybersecurity` +- Subdomain is from the allowed list +- Tags is a list with at least 2 items + +### Requirements + +Python 3.8+ (stdlib only, no external dependencies) diff --git a/tools/validate-skill.py b/tools/validate-skill.py new file mode 100755 index 00000000..96953de7 --- /dev/null +++ b/tools/validate-skill.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +"""Validate SKILL.md metadata for the Anthropic-Cybersecurity-Skills repository. + +Usage: + python tools/validate-skill.py skills/my-skill/ + python tools/validate-skill.py --all +""" +import os +import re +import sys +import glob + +REQUIRED_FIELDS = ["name", "description", "domain", "subdomain", "tags"] + +ALLOWED_SUBDOMAINS = { + "web-application-security", "network-security", "penetration-testing", + "red-teaming", "digital-forensics", "malware-analysis", "threat-intelligence", + "cloud-security", "container-security", "identity-security", + "identity-access-management", "identity-and-access-management", + "cryptography", "vulnerability-management", "compliance-governance", + "zero-trust-architecture", "zero-trust", "ot-ics-security", "ot-security", + "devsecops", "threat-hunting", "soc-operations", "security-operations", + "incident-response", "endpoint-security", "phishing-defense", + "api-security", "mobile-security", "ransomware-defense", "threat-detection", + "application-security", "blockchain-security", "data-protection", + "deception-technology", "firmware-analysis", "firmware-security", + "governance-risk-compliance", "offensive-security", 
"privacy-compliance", + "purple-team", "red-team", "supply-chain-security", "wireless-security", + "ai-security", "social-engineering-defense", +} + +KEBAB_RE = re.compile(r"^[a-z0-9]+(-[a-z0-9]+)*$") + +RED = "\033[91m" +GREEN = "\033[92m" +YELLOW = "\033[93m" +RESET = "\033[0m" + + +def parse_frontmatter(text): + """Extract YAML frontmatter as a dict (simple parser, stdlib only).""" + if not text.startswith("---"): + return None + end = text.find("---", 3) + if end == -1: + return None + block = text[3:end].strip() + data = {} + current_key = None + list_values = [] + for line in block.split("\n"): + stripped = line.strip() + if not stripped or stripped.startswith("#"): + continue + # Handle list items + if stripped.startswith("- ") and current_key: + list_values.append(stripped[2:].strip().strip('"').strip("'")) + data[current_key] = list_values + continue + # Handle inline list: tags: [a, b, c] + match = re.match(r"^(\w[\w_-]*):\s*\[(.+)\]\s*$", stripped) + if match: + current_key = match.group(1) + items = [i.strip().strip('"').strip("'") for i in match.group(2).split(",")] + data[current_key] = items + list_values = items + continue + # Handle key: value + match = re.match(r'^(\w[\w_-]*):\s*(.*)$', stripped) + if match: + current_key = match.group(1) + val = match.group(2).strip().strip('"').strip("'") + if val: + data[current_key] = val + list_values = [] + else: + list_values = [] + continue + return data + + +def validate_skill(skill_dir): + """Validate a single skill directory. 
Returns list of errors.""" + errors = [] + skill_md = os.path.join(skill_dir, "SKILL.md") + + if not os.path.isfile(skill_md): + return [f"SKILL.md not found in {skill_dir}"] + + try: + with open(skill_md, encoding="utf-8") as f: + content = f.read() + except IOError as e: + return [f"Could not read SKILL.md: {e}"] + except UnicodeDecodeError as e: + return [f"Encoding error in SKILL.md (not valid UTF-8): {e}"] + + fm = parse_frontmatter(content) + if fm is None: + return [f"No valid YAML frontmatter found (must start with ---)"] + + # Check required fields + for field in REQUIRED_FIELDS: + if field not in fm: + errors.append(f"Missing required field: {field}") + + # Validate name + name = fm.get("name", "") + if name: + if not KEBAB_RE.match(name): + errors.append(f"Name '{name}' is not valid kebab-case (lowercase, hyphens only)") + if len(name) > 64: + errors.append(f"Name too long ({len(name)} chars, max 64)") + + # Validate description + desc = fm.get("description", "") + if isinstance(desc, str): + if len(desc) < 20: + errors.append(f"Description too short ({len(desc)} chars, min 20)") + if len(desc) > 500: + errors.append(f"Description too long ({len(desc)} chars, max 500)") + + # Validate domain + domain = fm.get("domain", "") + if domain and domain != "cybersecurity": + errors.append(f"Domain must be 'cybersecurity', got '{domain}'") + + # Validate subdomain + subdomain = fm.get("subdomain", "") + if subdomain and subdomain not in ALLOWED_SUBDOMAINS: + errors.append(f"Unknown subdomain '{subdomain}'. 
Allowed: {', '.join(sorted(ALLOWED_SUBDOMAINS))}") + + # Validate tags + tags = fm.get("tags", []) + if isinstance(tags, str): + tags = [tags] + if len(tags) < 2: + errors.append(f"Need at least 2 tags, got {len(tags)}") + + return errors + + +def main(): + if len(sys.argv) < 2: + print(f"Usage: {sys.argv[0]} | --all") + sys.exit(1) + + if sys.argv[1] == "--all": + skill_dirs = sorted(glob.glob("skills/*/")) + else: + skill_dirs = [sys.argv[1].rstrip("/") + "/"] + + total = 0 + passed = 0 + failed = 0 + + for skill_dir in skill_dirs: + if not os.path.isdir(skill_dir.rstrip("/")): + print(f"{RED}SKIP{RESET} {skill_dir} — not a directory") + continue + + total += 1 + errors = validate_skill(skill_dir.rstrip("/")) + + name = os.path.basename(skill_dir.rstrip("/")) + if errors: + failed += 1 + print(f"{RED}FAIL{RESET} {name}") + for e in errors: + print(f" {YELLOW}→ {e}{RESET}") + else: + passed += 1 + print(f"{GREEN}PASS{RESET} {name}") + + print(f"\n{'='*50}") + print(f"Total: {total} {GREEN}Passed: {passed}{RESET} {RED}Failed: {failed}{RESET}") + + sys.exit(0 if failed == 0 else 1) + + +if __name__ == "__main__": + main() From 31f745385b083aaa37c4745e38ecc834b4860eb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar=20Su=C3=A1stegui?= Date: Fri, 3 Apr 2026 09:51:27 -0600 Subject: [PATCH 2/3] fix(validator): address all review feedback from @mukul975 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Required changes: - Error handling: IOError and UnicodeDecodeError already wrapped in try/except from previous commit — still present and correct. - ALLOWED_SUBDOMAINS: synced with actual repo usage (audited all 754 skills). identity-access-management (34 skills) added; identity-security was the placeholder in its place. New in this commit: 1. Description minimum: raised from 20 → 50 chars to align with other repo tooling as requested. 2. 
Folded scalar support: parse_frontmatter now handles YAML `>-` and `>` folded scalars, preventing incorrect parse of multi-line descriptions. Added a comment documenting the one remaining edge case (value-less key followed by non-list content — treated as no-value, acceptable for well-formed SKILL.md files). 3. Canonical subdomain warnings: alias subdomain values (e.g. security-operations vs soc-operations) now print a WARN line pointing to the canonical form, but are non-blocking. A _SUBDOMAIN_ALIASES dict documents canonical/alias pairs explicitly. 4. Description upper limit: removed hard cap — folded scalars legitimately produce long strings in existing skills. 5. PR description: removed false mention of type hints (there are none in this file). Validator now passes 754/754 skills in the repo with 0 errors. --- tools/validate-skill.py | 198 ++++++++++++++++++++++++++++++---------- 1 file changed, 151 insertions(+), 47 deletions(-) diff --git a/tools/validate-skill.py b/tools/validate-skill.py index 96953de7..9411b9a3 100755 --- a/tools/validate-skill.py +++ b/tools/validate-skill.py @@ -12,25 +12,66 @@ import glob REQUIRED_FIELDS = ["name", "description", "domain", "subdomain", "tags"] -ALLOWED_SUBDOMAINS = { - "web-application-security", "network-security", "penetration-testing", - "red-teaming", "digital-forensics", "malware-analysis", "threat-intelligence", - "cloud-security", "container-security", "identity-security", - "identity-access-management", "identity-and-access-management", - "cryptography", "vulnerability-management", "compliance-governance", - "zero-trust-architecture", "zero-trust", "ot-ics-security", "ot-security", - "devsecops", "threat-hunting", "soc-operations", "security-operations", - "incident-response", "endpoint-security", "phishing-defense", - "api-security", "mobile-security", "ransomware-defense", "threat-detection", - "application-security", "blockchain-security", "data-protection", - "deception-technology", "firmware-analysis", 
"firmware-security", - "governance-risk-compliance", "offensive-security", "privacy-compliance", - "purple-team", "red-team", "supply-chain-security", "wireless-security", - "ai-security", "social-engineering-defense", +# Canonical subdomain → set of accepted aliases (including canonical itself). +# When a skill uses an alias, the validator accepts it but the canonical form +# is the dict key of each group below (sets are unordered). New skills should use the canonical. +_SUBDOMAIN_ALIASES = { + # identity + "identity-access-management": {"identity-access-management", "identity-and-access-management", "identity-security"}, + # zero-trust + "zero-trust-architecture": {"zero-trust-architecture", "zero-trust"}, + # OT/ICS + "ot-ics-security": {"ot-ics-security", "ot-security"}, + # SOC / security ops + "soc-operations": {"soc-operations", "security-operations"}, + # red team + "red-teaming": {"red-teaming", "red-team"}, + # remaining groups (most have no aliases) + "web-application-security": {"web-application-security", "application-security"}, + "network-security": {"network-security"}, + "penetration-testing": {"penetration-testing", "offensive-security"}, + "digital-forensics": {"digital-forensics"}, + "malware-analysis": {"malware-analysis"}, + "threat-intelligence": {"threat-intelligence"}, + "cloud-security": {"cloud-security"}, + "container-security": {"container-security"}, + "cryptography": {"cryptography"}, + "vulnerability-management": {"vulnerability-management"}, + "compliance-governance": {"compliance-governance", "governance-risk-compliance"}, + "devsecops": {"devsecops"}, + "threat-hunting": {"threat-hunting"}, + "incident-response": {"incident-response"}, + "endpoint-security": {"endpoint-security"}, + "phishing-defense": {"phishing-defense", "social-engineering-defense"}, + "api-security": {"api-security"}, + "mobile-security": {"mobile-security"}, + "ransomware-defense": {"ransomware-defense"}, + "threat-detection": {"threat-detection"}, + 
"blockchain-security": 
{"blockchain-security"}, + "data-protection": {"data-protection"}, + "deception-technology": {"deception-technology"}, + "firmware-analysis": {"firmware-analysis", "firmware-security"}, + "privacy-compliance": {"privacy-compliance"}, + "purple-team": {"purple-team"}, + "supply-chain-security": {"supply-chain-security"}, + "wireless-security": {"wireless-security"}, + "ai-security": {"ai-security"}, } +# Flat set of all accepted subdomain values (canonical + aliases). +ALLOWED_SUBDOMAINS: set = {v for group in _SUBDOMAIN_ALIASES.values() for v in group} + +# Reverse map: alias → canonical (for warning messages). +_ALIAS_TO_CANONICAL: dict = {} +for canonical, aliases in _SUBDOMAIN_ALIASES.items(): + for alias in aliases: + _ALIAS_TO_CANONICAL[alias] = canonical + KEBAB_RE = re.compile(r"^[a-z0-9]+(-[a-z0-9]+)*$") +# Minimum description length. Other repo tooling uses 50 chars; align here. +DESCRIPTION_MIN_CHARS = 50 + RED = "\033[91m" GREEN = "\033[92m" YELLOW = "\033[93m" @@ -38,7 +79,22 @@ RESET = "\033[0m" def parse_frontmatter(text): - """Extract YAML frontmatter as a dict (simple parser, stdlib only).""" + """Extract YAML frontmatter as a dict (simple stdlib-only parser). + + Handles the common SKILL.md patterns: + - key: scalar value + - key: [inline, list] + - key:\n - list\n - items + - key: >- (folded scalar — content on following indented lines) + + Edge case note: ``list_values`` is reset to ``[]`` whenever a new key + with a scalar value is encountered, so a list from a prior block cannot + leak into an unrelated key. The only remaining theoretical edge case is + a key with *no* value that is immediately followed by non-list, non-empty + lines that look like scalars — those lines are currently ignored (the key + is treated as having no value). This is acceptable for well-formed SKILL.md + files and matches the behaviour contributors expect. 
+ """ if not text.startswith("---"): return None end = text.find("---", 3) @@ -47,40 +103,73 @@ def parse_frontmatter(text): block = text[3:end].strip() data = {} current_key = None - list_values = [] + list_values: list = [] + in_folded = False # True when we are collecting a YAML >- / > folded scalar + folded_lines: list = [] + for line in block.split("\n"): stripped = line.strip() + + # Flush a completed folded scalar when we hit the next top-level key. + if in_folded and stripped and not line.startswith(" ") and not line.startswith("\t"): + if current_key and folded_lines: + data[current_key] = " ".join(folded_lines) + in_folded = False + folded_lines = [] + current_key = None + + if in_folded: + if stripped: + folded_lines.append(stripped) + continue + if not stripped or stripped.startswith("#"): continue - # Handle list items + + # Handle list items (must come before key: value to avoid misparse). if stripped.startswith("- ") and current_key: list_values.append(stripped[2:].strip().strip('"').strip("'")) - data[current_key] = list_values + data[current_key] = list(list_values) # copy so future mutations don't leak continue + # Handle inline list: tags: [a, b, c] - match = re.match(r"^(\w[\w_-]*):\s*\[(.+)\]\s*$", stripped) - if match: - current_key = match.group(1) - items = [i.strip().strip('"').strip("'") for i in match.group(2).split(",")] + m = re.match(r"^(\w[\w_-]*):\s*\[(.+)\]\s*$", stripped) + if m: + current_key = m.group(1) + items = [i.strip().strip('"').strip("'") for i in m.group(2).split(",")] data[current_key] = items - list_values = items + list_values = list(items) continue - # Handle key: value - match = re.match(r'^(\w[\w_-]*):\s*(.*)$', stripped) - if match: - current_key = match.group(1) - val = match.group(2).strip().strip('"').strip("'") + + # Handle key: >- or key: > (folded scalar start) + m = re.match(r"^(\w[\w_-]*):\s*>[-|]?\s*$", stripped) + if m: + current_key = m.group(1) + list_values = [] + in_folded = True + folded_lines = [] 
+ continue + + # Handle key: value (plain scalar) + m = re.match(r'^(\w[\w_-]*):\s*(.*)$', stripped) + if m: + current_key = m.group(1) + val = m.group(2).strip().strip('"').strip("'") + list_values = [] # reset; new scalar key cannot inherit a prior list if val: data[current_key] = val - list_values = [] - else: - list_values = [] + # If val is empty the key is present but value-less (e.g. start of block list) continue + + # Flush any trailing folded scalar. + if in_folded and current_key and folded_lines: + data[current_key] = " ".join(folded_lines) + return data def validate_skill(skill_dir): - """Validate a single skill directory. Returns list of errors.""" + """Validate a single skill directory. Returns list of error strings.""" errors = [] skill_md = os.path.join(skill_dir, "SKILL.md") @@ -97,40 +186,55 @@ def validate_skill(skill_dir): fm = parse_frontmatter(content) if fm is None: - return [f"No valid YAML frontmatter found (must start with ---)"] + return ["No valid YAML frontmatter found (must start with ---)"] - # Check required fields + # Check required fields. for field in REQUIRED_FIELDS: if field not in fm: errors.append(f"Missing required field: {field}") - # Validate name + # Validate name. name = fm.get("name", "") if name: if not KEBAB_RE.match(name): - errors.append(f"Name '{name}' is not valid kebab-case (lowercase, hyphens only)") + errors.append( + f"Name '{name}' is not valid kebab-case (lowercase letters, digits, hyphens only)" + ) if len(name) > 64: errors.append(f"Name too long ({len(name)} chars, max 64)") - # Validate description + # Validate description. 
desc = fm.get("description", "") if isinstance(desc, str): - if len(desc) < 20: - errors.append(f"Description too short ({len(desc)} chars, min 20)") - if len(desc) > 500: - errors.append(f"Description too long ({len(desc)} chars, max 500)") + if len(desc) < DESCRIPTION_MIN_CHARS: + errors.append( + f"Description too short ({len(desc)} chars, min {DESCRIPTION_MIN_CHARS})" + ) + # No hard upper-limit enforced; multi-line folded scalars (>-) produce + # long strings that are valid and common in this repo. - # Validate domain + # Validate domain. domain = fm.get("domain", "") if domain and domain != "cybersecurity": errors.append(f"Domain must be 'cybersecurity', got '{domain}'") - # Validate subdomain + # Validate subdomain. subdomain = fm.get("subdomain", "") - if subdomain and subdomain not in ALLOWED_SUBDOMAINS: - errors.append(f"Unknown subdomain '{subdomain}'. Allowed: {', '.join(sorted(ALLOWED_SUBDOMAINS))}") + if subdomain: + if subdomain not in ALLOWED_SUBDOMAINS: + errors.append( + f"Unknown subdomain '{subdomain}'. Allowed: {', '.join(sorted(ALLOWED_SUBDOMAINS))}" + ) + else: + canonical = _ALIAS_TO_CANONICAL.get(subdomain, subdomain) + if subdomain != canonical: + # Warn (non-blocking) — alias is accepted but canonical is preferred + print( + f"{YELLOW}WARN{RESET} subdomain '{subdomain}' is an alias;" + f" canonical form is '{canonical}'" + ) - # Validate tags + # Validate tags. tags = fm.get("tags", []) if isinstance(tags, str): tags = [tags] From efc9598525511b5bbdbfb98a30c711510794b702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar=20Su=C3=A1stegui?= Date: Sat, 4 Apr 2026 05:34:31 -0600 Subject: [PATCH 3/3] fix(validator): address all remaining review feedback from @mukul975 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three issues fixed: 1. Description list check — added elif isinstance(desc, list) branch that emits 'Description must be a string value, not a list'. 
Previously the block was silently skipped when YAML returned a list, causing the skill to pass without validating the description field. 2. tools/README.md synced — updated description constraint from '20-500 characters' to 'at least 50 characters (no upper limit)' to match the current code (DESCRIPTION_MIN_CHARS=50, no max enforced). 3. --all with wrong CWD now exits 1 — if glob returns no skill dirs, the script prints an error and exits with code 1 instead of reporting 'Total: 0 Passed: 0 Failed: 0' and exiting 0, which would cause CI to silently pass while validating nothing. All 754 skills continue to pass (0 regressions). --- tools/README.md | 2 +- tools/validate-skill.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/README.md b/tools/README.md index edf3c13d..19e8c64b 100644 --- a/tools/README.md +++ b/tools/README.md @@ -20,7 +20,7 @@ python tools/validate-skill.py --all - Valid YAML frontmatter (between `---` markers) - Required fields present: `name`, `description`, `domain`, `subdomain`, `tags` - Name is kebab-case, 1–64 characters -- Description is 20–500 characters +- Description is at least 50 characters (no upper limit; multi-line folded scalars are valid) - Domain is `cybersecurity` - Subdomain is from the allowed list - Tags is a list with at least 2 items diff --git a/tools/validate-skill.py b/tools/validate-skill.py index 9411b9a3..9932abc8 100755 --- a/tools/validate-skill.py +++ b/tools/validate-skill.py @@ -205,7 +205,9 @@ def validate_skill(skill_dir): # Validate description. 
desc = fm.get("description", "") - if isinstance(desc, str): + if isinstance(desc, list): + errors.append("Description must be a string value, not a list") + elif isinstance(desc, str): if len(desc) < DESCRIPTION_MIN_CHARS: errors.append( f"Description too short ({len(desc)} chars, min {DESCRIPTION_MIN_CHARS})" @@ -251,6 +253,9 @@ def main(): if sys.argv[1] == "--all": skill_dirs = sorted(glob.glob("skills/*/")) + if not skill_dirs: + print("ERROR: No skill directories found. Run from the repository root.") + sys.exit(1) else: skill_dirs = [sys.argv[1].rstrip("/") + "/"]