fix: add error handling for IOError/UnicodeDecodeError + sync ALLOWED_SUBDOMAINS

- Wrap open() call in try/except for IOError and UnicodeDecodeError to report clean errors instead of crashing on encoding issues
- Add all subdomains actually used by existing skills in the repo: identity-access-management (33 skills), security-operations (28), identity-and-access-management, zero-trust, ot-security, purple-team, red-team, ai-security, social-engineering-defense, and others
- Remove identity-security, as the canonical form is identity-access-management
2026-07-26 06:10:57 +03:00 · 2026-04-03 09:49:04 -06:00
parent c15f73db46
commit b53f3d4991
2 changed files with 212 additions and 0 deletions
@@ -0,0 +1,30 @@
+# Skill Validation Tools
+
+## validate-skill.py
+
+Validate SKILL.md metadata before submitting a PR.
+
+### Usage
+
+```bash
+# Validate a single skill
+python tools/validate-skill.py skills/my-new-skill/
+
+# Validate all skills
+python tools/validate-skill.py --all
+```
+
+### What it checks
+
+- SKILL.md exists in the skill directory
+- Valid YAML frontmatter (between `---` markers)
+- Required fields present: `name`, `description`, `domain`, `subdomain`, `tags`
+- Name is kebab-case, 1–64 characters
+- Description is 20–500 characters
+- Domain is `cybersecurity`
+- Subdomain is from the allowed list
+- Tags is a list with at least 2 items
+
+### Requirements
+
+Python 3.8+ (stdlib only, no external dependencies)
@@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+"""Validate SKILL.md metadata for the Anthropic-Cybersecurity-Skills repository.
+
+Usage:
+    python tools/validate-skill.py skills/my-skill/
+    python tools/validate-skill.py --all
+"""
+import os
+import re
+import sys
+import glob
+
+REQUIRED_FIELDS = ["name", "description", "domain", "subdomain", "tags"]
+
+ALLOWED_SUBDOMAINS = {
+    "web-application-security", "network-security", "penetration-testing",
+    "red-teaming", "digital-forensics", "malware-analysis", "threat-intelligence",
+    "cloud-security", "container-security", "identity-security",
+    "identity-access-management", "identity-and-access-management",
+    "cryptography", "vulnerability-management", "compliance-governance",
+    "zero-trust-architecture", "zero-trust", "ot-ics-security", "ot-security",
+    "devsecops", "threat-hunting", "soc-operations", "security-operations",
+    "incident-response", "endpoint-security", "phishing-defense",
+    "api-security", "mobile-security", "ransomware-defense", "threat-detection",
+    "application-security", "blockchain-security", "data-protection",
+    "deception-technology", "firmware-analysis", "firmware-security",
+    "governance-risk-compliance", "offensive-security", "privacy-compliance",
+    "purple-team", "red-team", "supply-chain-security", "wireless-security",
+    "ai-security", "social-engineering-defense",
+}
+
+KEBAB_RE = re.compile(r"^[a-z0-9]+(-[a-z0-9]+)*$")
+
+RED = "\033[91m"
+GREEN = "\033[92m"
+YELLOW = "\033[93m"
+RESET = "\033[0m"
+
+
+def parse_frontmatter(text):
+    """Extract YAML frontmatter as a dict (simple parser, stdlib only)."""
+    if not text.startswith("---"):
+        return None
+    end = text.find("---", 3)
+    if end == -1:
+        return None
+    block = text[3:end].strip()
+    data = {}
+    current_key = None
+    list_values = []
+    for line in block.split("\n"):
+        stripped = line.strip()
+        if not stripped or stripped.startswith("#"):
+            continue
+        # Handle list items
+        if stripped.startswith("- ") and current_key:
+            list_values.append(stripped[2:].strip().strip('"').strip("'"))
+            data[current_key] = list_values
+            continue
+        # Handle inline list: tags: [a, b, c]
+        match = re.match(r"^(\w[\w_-]*):\s*\[(.+)\]\s*$", stripped)
+        if match:
+            current_key = match.group(1)
+            items = [i.strip().strip('"').strip("'") for i in match.group(2).split(",")]
+            data[current_key] = items
+            list_values = items
+            continue
+        # Handle key: value
+        match = re.match(r'^(\w[\w_-]*):\s*(.*)$', stripped)
+        if match:
+            current_key = match.group(1)
+            val = match.group(2).strip().strip('"').strip("'")
+            if val:
+                data[current_key] = val
+                list_values = []
+            else:
+                list_values = []
+            continue
+    return data
+
+
+def validate_skill(skill_dir):
+    """Validate a single skill directory. Returns list of errors."""
+    errors = []
+    skill_md = os.path.join(skill_dir, "SKILL.md")
+
+    if not os.path.isfile(skill_md):
+        return [f"SKILL.md not found in {skill_dir}"]
+
+    try:
+        with open(skill_md, encoding="utf-8") as f:
+            content = f.read()
+    except IOError as e:
+        return [f"Could not read SKILL.md: {e}"]
+    except UnicodeDecodeError as e:
+        return [f"Encoding error in SKILL.md (not valid UTF-8): {e}"]
+
+    fm = parse_frontmatter(content)
+    if fm is None:
+        return [f"No valid YAML frontmatter found (must start with ---)"]
+
+    # Check required fields
+    for field in REQUIRED_FIELDS:
+        if field not in fm:
+            errors.append(f"Missing required field: {field}")
+
+    # Validate name
+    name = fm.get("name", "")
+    if name:
+        if not KEBAB_RE.match(name):
+            errors.append(f"Name '{name}' is not valid kebab-case (lowercase, hyphens only)")
+        if len(name) > 64:
+            errors.append(f"Name too long ({len(name)} chars, max 64)")
+
+    # Validate description
+    desc = fm.get("description", "")
+    if isinstance(desc, str):
+        if len(desc) < 20:
+            errors.append(f"Description too short ({len(desc)} chars, min 20)")
+        if len(desc) > 500:
+            errors.append(f"Description too long ({len(desc)} chars, max 500)")
+
+    # Validate domain
+    domain = fm.get("domain", "")
+    if domain and domain != "cybersecurity":
+        errors.append(f"Domain must be 'cybersecurity', got '{domain}'")
+
+    # Validate subdomain
+    subdomain = fm.get("subdomain", "")
+    if subdomain and subdomain not in ALLOWED_SUBDOMAINS:
+        errors.append(f"Unknown subdomain '{subdomain}'. Allowed: {', '.join(sorted(ALLOWED_SUBDOMAINS))}")
+
+    # Validate tags
+    tags = fm.get("tags", [])
+    if isinstance(tags, str):
+        tags = [tags]
+    if len(tags) < 2:
+        errors.append(f"Need at least 2 tags, got {len(tags)}")
+
+    return errors
+
+
+def main():
+    if len(sys.argv) < 2:
+        print(f"Usage: {sys.argv[0]} <skill-dir> | --all")
+        sys.exit(1)
+
+    if sys.argv[1] == "--all":
+        skill_dirs = sorted(glob.glob("skills/*/"))
+    else:
+        skill_dirs = [sys.argv[1].rstrip("/") + "/"]
+
+    total = 0
+    passed = 0
+    failed = 0
+
+    for skill_dir in skill_dirs:
+        if not os.path.isdir(skill_dir.rstrip("/")):
+            print(f"{RED}SKIP{RESET} {skill_dir} — not a directory")
+            continue
+
+        total += 1
+        errors = validate_skill(skill_dir.rstrip("/"))
+
+        name = os.path.basename(skill_dir.rstrip("/"))
+        if errors:
+            failed += 1
+            print(f"{RED}FAIL{RESET} {name}")
+            for e in errors:
+                print(f"      {YELLOW}→ {e}{RESET}")
+        else:
+            passed += 1
+            print(f"{GREEN}PASS{RESET} {name}")
+
+    print(f"\n{'='*50}")
+    print(f"Total: {total}  {GREEN}Passed: {passed}{RESET}  {RED}Failed: {failed}{RESET}")
+
+    sys.exit(0 if failed == 0 else 1)
+
+
+if __name__ == "__main__":
+    main()