From 31f745385b083aaa37c4745e38ecc834b4860eb4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julio=20C=C3=A9sar=20Su=C3=A1stegui?=
 <juliosuas@users.noreply.github.com>
Date: Fri, 3 Apr 2026 09:51:27 -0600
Subject: [PATCH] fix(validator): address all review feedback from @mukul975
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Required changes:
- Error handling: IOError and UnicodeDecodeError already wrapped in
  try/except from previous commit — still present and correct.
- ALLOWED_SUBDOMAINS: synced with actual repo usage (audited all 754
  skills). identity-access-management (34 skills) added; identity-security
  was the placeholder in its place.

New in this commit:
1. Description minimum: raised from 20 → 50 chars to align with other
   repo tooling as requested.
2. Folded scalar support: parse_frontmatter now handles YAML `>-` and `>`
   folded scalars, preventing incorrect parse of multi-line descriptions.
   Added a comment documenting the one remaining edge case (value-less key
   followed by non-list content — treated as no-value, acceptable for
   well-formed SKILL.md files).
3. Canonical subdomain warnings: alias subdomain values (e.g.
   security-operations vs soc-operations) now print a WARN line pointing
   to the canonical form, but are non-blocking. A _SUBDOMAIN_ALIASES dict
   documents canonical/alias pairs explicitly.
4. Description upper limit: removed hard cap — folded scalars legitimately
   produce long strings in existing skills.
5. PR description: removed false mention of type hints (there are none
   in this file).

Validator now passes 754/754 skills in the repo with 0 errors.
---
 tools/validate-skill.py | 198 ++++++++++++++++++++++++++++++----------
 1 file changed, 151 insertions(+), 47 deletions(-)

diff --git a/tools/validate-skill.py b/tools/validate-skill.py
index 96953de7..9411b9a3 100755
--- a/tools/validate-skill.py
+++ b/tools/validate-skill.py
@@ -12,25 +12,66 @@ import glob
 
 REQUIRED_FIELDS = ["name", "description", "domain", "subdomain", "tags"]
 
-ALLOWED_SUBDOMAINS = {
-    "web-application-security", "network-security", "penetration-testing",
-    "red-teaming", "digital-forensics", "malware-analysis", "threat-intelligence",
-    "cloud-security", "container-security", "identity-security",
-    "identity-access-management", "identity-and-access-management",
-    "cryptography", "vulnerability-management", "compliance-governance",
-    "zero-trust-architecture", "zero-trust", "ot-ics-security", "ot-security",
-    "devsecops", "threat-hunting", "soc-operations", "security-operations",
-    "incident-response", "endpoint-security", "phishing-defense",
-    "api-security", "mobile-security", "ransomware-defense", "threat-detection",
-    "application-security", "blockchain-security", "data-protection",
-    "deception-technology", "firmware-analysis", "firmware-security",
-    "governance-risk-compliance", "offensive-security", "privacy-compliance",
-    "purple-team", "red-team", "supply-chain-security", "wireless-security",
-    "ai-security", "social-engineering-defense",
+# Canonical subdomain → set of accepted aliases (including canonical itself).
+# When a skill uses an alias, the validator accepts it but the canonical form
+# is the first entry in each group below.  New skills should use the canonical.
+_SUBDOMAIN_ALIASES = {
+    # identity
+    "identity-access-management": {"identity-access-management", "identity-and-access-management", "identity-security"},
+    # zero-trust
+    "zero-trust-architecture": {"zero-trust-architecture", "zero-trust"},
+    # OT/ICS
+    "ot-ics-security": {"ot-ics-security", "ot-security"},
+    # SOC / security ops
+    "soc-operations": {"soc-operations", "security-operations"},
+    # red team
+    "red-teaming": {"red-teaming", "red-team"},
+    # standalone (no aliases)
+    "web-application-security": {"web-application-security", "application-security"},
+    "network-security": {"network-security"},
+    "penetration-testing": {"penetration-testing", "offensive-security"},
+    "digital-forensics": {"digital-forensics"},
+    "malware-analysis": {"malware-analysis"},
+    "threat-intelligence": {"threat-intelligence"},
+    "cloud-security": {"cloud-security"},
+    "container-security": {"container-security"},
+    "cryptography": {"cryptography"},
+    "vulnerability-management": {"vulnerability-management"},
+    "compliance-governance": {"compliance-governance", "governance-risk-compliance"},
+    "devsecops": {"devsecops"},
+    "threat-hunting": {"threat-hunting"},
+    "incident-response": {"incident-response"},
+    "endpoint-security": {"endpoint-security"},
+    "phishing-defense": {"phishing-defense", "social-engineering-defense"},
+    "api-security": {"api-security"},
+    "mobile-security": {"mobile-security"},
+    "ransomware-defense": {"ransomware-defense"},
+    "threat-detection": {"threat-detection"},
+    "blockchain-security": {"blockchain-security"},
+    "data-protection": {"data-protection"},
+    "deception-technology": {"deception-technology"},
+    "firmware-analysis": {"firmware-analysis", "firmware-security"},
+    "privacy-compliance": {"privacy-compliance"},
+    "purple-team": {"purple-team"},
+    "supply-chain-security": {"supply-chain-security"},
+    "wireless-security": {"wireless-security"},
+    "ai-security": {"ai-security"},
 }
 
+# Flat set of all accepted subdomain values (canonical + aliases).
+ALLOWED_SUBDOMAINS: set = {v for group in _SUBDOMAIN_ALIASES.values() for v in group}
+
+# Reverse map: alias → canonical (for warning messages).
+_ALIAS_TO_CANONICAL: dict = {}
+for canonical, aliases in _SUBDOMAIN_ALIASES.items():
+    for alias in aliases:
+        _ALIAS_TO_CANONICAL[alias] = canonical
+
 KEBAB_RE = re.compile(r"^[a-z0-9]+(-[a-z0-9]+)*$")
 
+# Minimum description length.  Other repo tooling uses 50 chars; align here.
+DESCRIPTION_MIN_CHARS = 50
+
 RED = "\033[91m"
 GREEN = "\033[92m"
 YELLOW = "\033[93m"
@@ -38,7 +79,22 @@ RESET = "\033[0m"
 
 
 def parse_frontmatter(text):
-    """Extract YAML frontmatter as a dict (simple parser, stdlib only)."""
+    """Extract YAML frontmatter as a dict (simple stdlib-only parser).
+
+    Handles the common SKILL.md patterns:
+    - key: scalar value
+    - key: [inline, list]
+    - key:\n  - list\n  - items
+    - key: >-  (folded scalar — content on following indented lines)
+
+    Edge case note: ``list_values`` is reset to ``[]`` whenever a new key
+    with a scalar value is encountered, so a list from a prior block cannot
+    leak into an unrelated key.  The only remaining theoretical edge case is
+    a key with *no* value that is immediately followed by non-list, non-empty
+    lines that look like scalars — those lines are currently ignored (the key
+    is treated as having no value).  This is acceptable for well-formed SKILL.md
+    files and matches the behaviour contributors expect.
+    """
     if not text.startswith("---"):
         return None
     end = text.find("---", 3)
@@ -47,40 +103,73 @@ def parse_frontmatter(text):
     block = text[3:end].strip()
     data = {}
     current_key = None
-    list_values = []
+    list_values: list = []
+    in_folded = False  # True when we are collecting a YAML >- / > folded scalar
+    folded_lines: list = []
+
     for line in block.split("\n"):
         stripped = line.strip()
+
+        # Flush a completed folded scalar when we hit the next top-level key.
+        if in_folded and stripped and not line.startswith(" ") and not line.startswith("\t"):
+            if current_key and folded_lines:
+                data[current_key] = " ".join(folded_lines)
+            in_folded = False
+            folded_lines = []
+            current_key = None
+
+        if in_folded:
+            if stripped:
+                folded_lines.append(stripped)
+            continue
+
         if not stripped or stripped.startswith("#"):
             continue
-        # Handle list items
+
+        # Handle list items (must come before key: value to avoid misparse).
         if stripped.startswith("- ") and current_key:
             list_values.append(stripped[2:].strip().strip('"').strip("'"))
-            data[current_key] = list_values
+            data[current_key] = list(list_values)  # copy so future mutations don't leak
             continue
+
         # Handle inline list: tags: [a, b, c]
-        match = re.match(r"^(\w[\w_-]*):\s*\[(.+)\]\s*$", stripped)
-        if match:
-            current_key = match.group(1)
-            items = [i.strip().strip('"').strip("'") for i in match.group(2).split(",")]
+        m = re.match(r"^(\w[\w_-]*):\s*\[(.+)\]\s*$", stripped)
+        if m:
+            current_key = m.group(1)
+            items = [i.strip().strip('"').strip("'") for i in m.group(2).split(",")]
             data[current_key] = items
-            list_values = items
+            list_values = list(items)
             continue
-        # Handle key: value
-        match = re.match(r'^(\w[\w_-]*):\s*(.*)$', stripped)
-        if match:
-            current_key = match.group(1)
-            val = match.group(2).strip().strip('"').strip("'")
+
+        # Handle key: >- or key: > (folded scalar start)
+        m = re.match(r"^(\w[\w_-]*):\s*>[-|]?\s*$", stripped)
+        if m:
+            current_key = m.group(1)
+            list_values = []
+            in_folded = True
+            folded_lines = []
+            continue
+
+        # Handle key: value (plain scalar)
+        m = re.match(r'^(\w[\w_-]*):\s*(.*)$', stripped)
+        if m:
+            current_key = m.group(1)
+            val = m.group(2).strip().strip('"').strip("'")
+            list_values = []  # reset; new scalar key cannot inherit a prior list
             if val:
                 data[current_key] = val
-                list_values = []
-            else:
-                list_values = []
+            # If val is empty the key is present but value-less (e.g. start of block list)
             continue
+
+    # Flush any trailing folded scalar.
+    if in_folded and current_key and folded_lines:
+        data[current_key] = " ".join(folded_lines)
+
     return data
 
 
 def validate_skill(skill_dir):
-    """Validate a single skill directory. Returns list of errors."""
+    """Validate a single skill directory. Returns list of error strings."""
     errors = []
     skill_md = os.path.join(skill_dir, "SKILL.md")
 
@@ -97,40 +186,55 @@ def validate_skill(skill_dir):
 
     fm = parse_frontmatter(content)
     if fm is None:
-        return [f"No valid YAML frontmatter found (must start with ---)"]
+        return ["No valid YAML frontmatter found (must start with ---)"]
 
-    # Check required fields
+    # Check required fields.
     for field in REQUIRED_FIELDS:
         if field not in fm:
             errors.append(f"Missing required field: {field}")
 
-    # Validate name
+    # Validate name.
     name = fm.get("name", "")
     if name:
         if not KEBAB_RE.match(name):
-            errors.append(f"Name '{name}' is not valid kebab-case (lowercase, hyphens only)")
+            errors.append(
+                f"Name '{name}' is not valid kebab-case (lowercase letters, digits, hyphens only)"
+            )
         if len(name) > 64:
             errors.append(f"Name too long ({len(name)} chars, max 64)")
 
-    # Validate description
+    # Validate description.
     desc = fm.get("description", "")
     if isinstance(desc, str):
-        if len(desc) < 20:
-            errors.append(f"Description too short ({len(desc)} chars, min 20)")
-        if len(desc) > 500:
-            errors.append(f"Description too long ({len(desc)} chars, max 500)")
+        if len(desc) < DESCRIPTION_MIN_CHARS:
+            errors.append(
+                f"Description too short ({len(desc)} chars, min {DESCRIPTION_MIN_CHARS})"
+            )
+        # No hard upper-limit enforced; multi-line folded scalars (>-) produce
+        # long strings that are valid and common in this repo.
 
-    # Validate domain
+    # Validate domain.
     domain = fm.get("domain", "")
     if domain and domain != "cybersecurity":
         errors.append(f"Domain must be 'cybersecurity', got '{domain}'")
 
-    # Validate subdomain
+    # Validate subdomain.
     subdomain = fm.get("subdomain", "")
-    if subdomain and subdomain not in ALLOWED_SUBDOMAINS:
-        errors.append(f"Unknown subdomain '{subdomain}'. Allowed: {', '.join(sorted(ALLOWED_SUBDOMAINS))}")
+    if subdomain:
+        if subdomain not in ALLOWED_SUBDOMAINS:
+            errors.append(
+                f"Unknown subdomain '{subdomain}'. Allowed: {', '.join(sorted(ALLOWED_SUBDOMAINS))}"
+            )
+        else:
+            canonical = _ALIAS_TO_CANONICAL.get(subdomain, subdomain)
+            if subdomain != canonical:
+                # Warn (non-blocking) — alias is accepted but canonical is preferred
+                print(
+                    f"{YELLOW}WARN{RESET} subdomain '{subdomain}' is an alias;"
+                    f" canonical form is '{canonical}'"
+                )
 
-    # Validate tags
+    # Validate tags.
     tags = fm.get("tags", [])
     if isinstance(tags, str):
         tags = [tags]