From 31f745385b083aaa37c4745e38ecc834b4860eb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar=20Su=C3=A1stegui?= Date: Fri, 3 Apr 2026 09:51:27 -0600 Subject: [PATCH] fix(validator): address all review feedback from @mukul975 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Required changes: - Error handling: IOError and UnicodeDecodeError already wrapped in try/except from previous commit — still present and correct. - ALLOWED_SUBDOMAINS: synced with actual repo usage (audited all 754 skills). identity-access-management (34 skills) added; identity-security was the placeholder in its place. New in this commit: 1. Description minimum: raised from 20 → 50 chars to align with other repo tooling as requested. 2. Folded scalar support: parse_frontmatter now handles YAML `>-` and `>` folded scalars, preventing incorrect parse of multi-line descriptions. Added a comment documenting the one remaining edge case (value-less key followed by non-list content — treated as no-value, acceptable for well-formed SKILL.md files). 3. Canonical subdomain warnings: alias subdomain values (e.g. security-operations vs soc-operations) now print a WARN line pointing to the canonical form, but are non-blocking. A _SUBDOMAIN_ALIASES dict documents canonical/alias pairs explicitly. 4. Description upper limit: removed hard cap — folded scalars legitimately produce long strings in existing skills. 5. PR description: removed false mention of type hints (there are none in this file). Validator now passes 754/754 skills in the repo with 0 errors. --- tools/validate-skill.py | 198 ++++++++++++++++++++++++++++++---------- 1 file changed, 151 insertions(+), 47 deletions(-) diff --git a/tools/validate-skill.py b/tools/validate-skill.py index 96953de7..9411b9a3 100755 --- a/tools/validate-skill.py +++ b/tools/validate-skill.py @@ -12,25 +12,66 @@ import glob REQUIRED_FIELDS = ["name", "description", "domain", "subdomain", "tags"] -ALLOWED_SUBDOMAINS = { - "web-application-security", "network-security", "penetration-testing", - "red-teaming", "digital-forensics", "malware-analysis", "threat-intelligence", - "cloud-security", "container-security", "identity-security", - "identity-access-management", "identity-and-access-management", - "cryptography", "vulnerability-management", "compliance-governance", - "zero-trust-architecture", "zero-trust", "ot-ics-security", "ot-security", - "devsecops", "threat-hunting", "soc-operations", "security-operations", - "incident-response", "endpoint-security", "phishing-defense", - "api-security", "mobile-security", "ransomware-defense", "threat-detection", - "application-security", "blockchain-security", "data-protection", - "deception-technology", "firmware-analysis", "firmware-security", - "governance-risk-compliance", "offensive-security", "privacy-compliance", - "purple-team", "red-team", "supply-chain-security", "wireless-security", - "ai-security", "social-engineering-defense", +# Canonical subdomain → set of accepted aliases (including canonical itself). +# When a skill uses an alias, the validator accepts it but the canonical form +# is the first entry in each group below. New skills should use the canonical. +_SUBDOMAIN_ALIASES = { + # identity + "identity-access-management": {"identity-access-management", "identity-and-access-management", "identity-security"}, + # zero-trust + "zero-trust-architecture": {"zero-trust-architecture", "zero-trust"}, + # OT/ICS + "ot-ics-security": {"ot-ics-security", "ot-security"}, + # SOC / security ops + "soc-operations": {"soc-operations", "security-operations"}, + # red team + "red-teaming": {"red-teaming", "red-team"}, + # standalone (no aliases) + "web-application-security": {"web-application-security", "application-security"}, + "network-security": {"network-security"}, + "penetration-testing": {"penetration-testing", "offensive-security"}, + "digital-forensics": {"digital-forensics"}, + "malware-analysis": {"malware-analysis"}, + "threat-intelligence": {"threat-intelligence"}, + "cloud-security": {"cloud-security"}, + "container-security": {"container-security"}, + "cryptography": {"cryptography"}, + "vulnerability-management": {"vulnerability-management"}, + "compliance-governance": {"compliance-governance", "governance-risk-compliance"}, + "devsecops": {"devsecops"}, + "threat-hunting": {"threat-hunting"}, + "incident-response": {"incident-response"}, + "endpoint-security": {"endpoint-security"}, + "phishing-defense": {"phishing-defense", "social-engineering-defense"}, + "api-security": {"api-security"}, + "mobile-security": {"mobile-security"}, + "ransomware-defense": {"ransomware-defense"}, + "threat-detection": {"threat-detection"}, + "blockchain-security": {"blockchain-security"}, + "data-protection": {"data-protection"}, + "deception-technology": {"deception-technology"}, + "firmware-analysis": {"firmware-analysis", "firmware-security"}, + "privacy-compliance": {"privacy-compliance"}, + "purple-team": {"purple-team"}, + "supply-chain-security": {"supply-chain-security"}, + "wireless-security": {"wireless-security"}, + "ai-security": {"ai-security"}, } +# Flat set of all accepted subdomain values (canonical + aliases). +ALLOWED_SUBDOMAINS: set = {v for group in _SUBDOMAIN_ALIASES.values() for v in group} + +# Reverse map: alias → canonical (for warning messages). +_ALIAS_TO_CANONICAL: dict = {} +for canonical, aliases in _SUBDOMAIN_ALIASES.items(): + for alias in aliases: + _ALIAS_TO_CANONICAL[alias] = canonical + KEBAB_RE = re.compile(r"^[a-z0-9]+(-[a-z0-9]+)*$") +# Minimum description length. Other repo tooling uses 50 chars; align here. +DESCRIPTION_MIN_CHARS = 50 + RED = "\033[91m" GREEN = "\033[92m" YELLOW = "\033[93m" @@ -38,7 +79,22 @@ RESET = "\033[0m" def parse_frontmatter(text): - """Extract YAML frontmatter as a dict (simple parser, stdlib only).""" + """Extract YAML frontmatter as a dict (simple stdlib-only parser). + + Handles the common SKILL.md patterns: + - key: scalar value + - key: [inline, list] + - key:\n - list\n - items + - key: >- (folded scalar — content on following indented lines) + + Edge case note: ``list_values`` is reset to ``[]`` whenever a new key + with a scalar value is encountered, so a list from a prior block cannot + leak into an unrelated key. The only remaining theoretical edge case is + a key with *no* value that is immediately followed by non-list, non-empty + lines that look like scalars — those lines are currently ignored (the key + is treated as having no value). This is acceptable for well-formed SKILL.md + files and matches the behaviour contributors expect. + """ if not text.startswith("---"): return None end = text.find("---", 3) @@ -47,40 +103,73 @@ def parse_frontmatter(text): block = text[3:end].strip() data = {} current_key = None - list_values = [] + list_values: list = [] + in_folded = False # True when we are collecting a YAML >- / > folded scalar + folded_lines: list = [] + for line in block.split("\n"): stripped = line.strip() + + # Flush a completed folded scalar when we hit the next top-level key. + if in_folded and stripped and not line.startswith(" ") and not line.startswith("\t"): + if current_key and folded_lines: + data[current_key] = " ".join(folded_lines) + in_folded = False + folded_lines = [] + current_key = None + + if in_folded: + if stripped: + folded_lines.append(stripped) + continue + if not stripped or stripped.startswith("#"): continue - # Handle list items + + # Handle list items (must come before key: value to avoid misparse). if stripped.startswith("- ") and current_key: list_values.append(stripped[2:].strip().strip('"').strip("'")) - data[current_key] = list_values + data[current_key] = list(list_values) # copy so future mutations don't leak continue + # Handle inline list: tags: [a, b, c] - match = re.match(r"^(\w[\w_-]*):\s*\[(.+)\]\s*$", stripped) - if match: - current_key = match.group(1) - items = [i.strip().strip('"').strip("'") for i in match.group(2).split(",")] + m = re.match(r"^(\w[\w_-]*):\s*\[(.+)\]\s*$", stripped) + if m: + current_key = m.group(1) + items = [i.strip().strip('"').strip("'") for i in m.group(2).split(",")] data[current_key] = items - list_values = items + list_values = list(items) continue - # Handle key: value - match = re.match(r'^(\w[\w_-]*):\s*(.*)$', stripped) - if match: - current_key = match.group(1) - val = match.group(2).strip().strip('"').strip("'") + + # Handle key: >- or key: > (folded scalar start) + m = re.match(r"^(\w[\w_-]*):\s*>[-|]?\s*$", stripped) + if m: + current_key = m.group(1) + list_values = [] + in_folded = True + folded_lines = [] + continue + + # Handle key: value (plain scalar) + m = re.match(r'^(\w[\w_-]*):\s*(.*)$', stripped) + if m: + current_key = m.group(1) + val = m.group(2).strip().strip('"').strip("'") + list_values = [] # reset; new scalar key cannot inherit a prior list if val: data[current_key] = val - list_values = [] - else: - list_values = [] + # If val is empty the key is present but value-less (e.g. start of block list) continue + + # Flush any trailing folded scalar. + if in_folded and current_key and folded_lines: + data[current_key] = " ".join(folded_lines) + return data def validate_skill(skill_dir): - """Validate a single skill directory. Returns list of errors.""" + """Validate a single skill directory. Returns list of error strings.""" errors = [] skill_md = os.path.join(skill_dir, "SKILL.md") @@ -97,40 +186,55 @@ def validate_skill(skill_dir): fm = parse_frontmatter(content) if fm is None: - return [f"No valid YAML frontmatter found (must start with ---)"] + return ["No valid YAML frontmatter found (must start with ---)"] - # Check required fields + # Check required fields. for field in REQUIRED_FIELDS: if field not in fm: errors.append(f"Missing required field: {field}") - # Validate name + # Validate name. name = fm.get("name", "") if name: if not KEBAB_RE.match(name): - errors.append(f"Name '{name}' is not valid kebab-case (lowercase, hyphens only)") + errors.append( + f"Name '{name}' is not valid kebab-case (lowercase letters, digits, hyphens only)" + ) if len(name) > 64: errors.append(f"Name too long ({len(name)} chars, max 64)") - # Validate description + # Validate description. desc = fm.get("description", "") if isinstance(desc, str): - if len(desc) < 20: - errors.append(f"Description too short ({len(desc)} chars, min 20)") - if len(desc) > 500: - errors.append(f"Description too long ({len(desc)} chars, max 500)") + if len(desc) < DESCRIPTION_MIN_CHARS: + errors.append( + f"Description too short ({len(desc)} chars, min {DESCRIPTION_MIN_CHARS})" + ) + # No hard upper-limit enforced; multi-line folded scalars (>-) produce + # long strings that are valid and common in this repo. - # Validate domain + # Validate domain. domain = fm.get("domain", "") if domain and domain != "cybersecurity": errors.append(f"Domain must be 'cybersecurity', got '{domain}'") - # Validate subdomain + # Validate subdomain. subdomain = fm.get("subdomain", "") - if subdomain and subdomain not in ALLOWED_SUBDOMAINS: - errors.append(f"Unknown subdomain '{subdomain}'. Allowed: {', '.join(sorted(ALLOWED_SUBDOMAINS))}") + if subdomain: + if subdomain not in ALLOWED_SUBDOMAINS: + errors.append( + f"Unknown subdomain '{subdomain}'. Allowed: {', '.join(sorted(ALLOWED_SUBDOMAINS))}" + ) + else: + canonical = _ALIAS_TO_CANONICAL.get(subdomain, subdomain) + if subdomain != canonical: + # Warn (non-blocking) — alias is accepted but canonical is preferred + print( + f"{YELLOW}WARN{RESET} subdomain '{subdomain}' is an alias;" + f" canonical form is '{canonical}'" + ) - # Validate tags + # Validate tags. tags = fm.get("tags", []) if isinstance(tags, str): tags = [tags]