diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 9a8b161..84895b3 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -1,8 +1,8 @@
 {
   "$schema": "https://anthropic.com/claude-code/marketplace.schema.json",
   "name": "pm-skills",
-  "version": "1.0.1",
-  "description": "Structured AI workflows for better product decisions. 65 domain-specific skills and 36 chained workflows across 8 PM plugins — from discovery to strategy, execution, launch, and growth.",
+  "version": "2.0.0",
+  "description": "Structured AI workflows for better product decisions. 68 domain-specific skills and 42 chained workflows across 9 PM plugins — from discovery to strategy, execution, launch, growth, and shipping AI-built software.",
   "owner": {
     "name": "Paweł Huryn",
     "email": "pawel@productcompass.pm",
@@ -56,6 +56,12 @@
       "description": "PM utility skills: resume review, NDA drafting, privacy policy generation, and grammar/flow checking. Essential tools for product managers beyond core product work.",
       "source": "./pm-toolkit",
       "category": "product-management"
+    },
+    {
+      "name": "pm-ai-shipping",
+      "description": "AI Shipping Kit — for PMs and founders accountable for AI-built code. Document a vibe-coded app, audit it for intended-vs-implemented security gaps and performance issues, and produce a reviewer-ready shipping packet.",
+      "source": "./pm-ai-shipping",
+      "category": "product-management"
     }
   ]
 }
diff --git a/.docs/images/examples.png b/.docs/images/examples.png
new file mode 100644
index 0000000..fe1e554
Binary files /dev/null and b/.docs/images/examples.png differ
diff --git a/.docs/images/plugins.png b/.docs/images/plugins.png
new file mode 100644
index 0000000..3de3110
Binary files /dev/null and b/.docs/images/plugins.png differ
diff --git a/CLAUDE.md b/CLAUDE.md
index 99c174f..d346c47 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -4,7 +4,7 @@ Guidance for AI agents (Claude Code, Cowork, and others) working in this reposit
 
 ## Project Overview
 
-**PM Skills** (`phuryn/pm-skills`) — a marketplace of **8 independent plugins** (65 skills, 36 commands) that bring structured product-management workflows to AI coding assistants. Built for Claude Code and Claude Cowork; the skills are also compatible with other agents (Gemini CLI, Cursor, Codex CLI).
+**PM Skills** (`phuryn/pm-skills`) — a marketplace of **9 independent plugins** (68 skills, 42 commands) that bring structured product-management workflows to AI coding assistants. Built for Claude Code and Claude Cowork; the skills are also compatible with other agents (Gemini CLI, Cursor, Codex CLI).
 
 Owner: Paweł Huryn — pawel@productcompass.pm — https://www.productcompass.pm
 
@@ -12,7 +12,7 @@ Owner: Paweł Huryn — pawel@productcompass.pm — https://www.productcompass.p
 
 ```
 pm-skills/                           <- repo root
-├── .claude-plugin/marketplace.json  <- root marketplace manifest (lists all 8 plugins)
+├── .claude-plugin/marketplace.json  <- root marketplace manifest (lists all 9 plugins)
 ├── .docs/images/                    <- images used by README (webp, gif)
 ├── .gitattributes
 ├── .gitignore
@@ -22,25 +22,26 @@ pm-skills/                           <- repo root
 ├── README.md                        <- public documentation (GitHub)
 ├── LICENSE                          <- MIT
 ├── validate_plugins.py              <- plugin validator
-└── pm-{name}/                       <- 8 plugin directories
+└── pm-{name}/                       <- 9 plugin directories
     ├── .claude-plugin/plugin.json   <- per-plugin manifest
     ├── skills/{skill}/SKILL.md      <- one folder per skill
     ├── commands/{command}.md        <- one file per command
     └── README.md                    <- per-plugin documentation
 ```
 
-### The 8 plugins
+### The 9 plugins
 
 | Plugin | Focus |
 |--------|-------|
 | `pm-product-discovery` | Ideation, experiments, assumption testing, prioritization, interview synthesis |
 | `pm-product-strategy` | Vision, strategy/lean/business-model canvas, SWOT, PESTLE, Ansoff, Porter, monetization |
-| `pm-execution` | PRDs, OKRs, roadmaps, sprints, pre-mortems, stakeholder maps, user stories |
+| `pm-execution` | PRDs, OKRs, roadmaps, sprints, pre-mortems, stakeholder maps, user stories, red-teaming |
 | `pm-market-research` | Personas, segmentation, sentiment analysis, competitive analysis, market sizing |
 | `pm-data-analytics` | SQL query generation, cohort/retention analysis |
 | `pm-go-to-market` | GTM strategy, growth loops, motions, beachhead segments, ICPs |
 | `pm-marketing-growth` | Marketing ideas, value-prop statements, North Star metrics, naming, positioning |
 | `pm-toolkit` | Resume review, NDA drafting, privacy policy, grammar/flow checking |
+| `pm-ai-shipping` | AI Shipping Kit: document a vibe-coded app, map test coverage, audit security/performance against intended behavior, compile a shipping packet |
 
 ## Key Design Rules
 
@@ -68,9 +69,9 @@ Descriptions in `plugin.json` and the repo `README.md` should stay aligned (iden
 
 ## Versioning
 
-- All versions are currently **1.0.1** — `marketplace.json` and all 8 `plugin.json` files.
+- All versions are currently **2.0.0** — `marketplace.json` and all 9 `plugin.json` files.
 - **Keep every version in sync.** There is no independent per-plugin versioning.
-- Bump any `plugin.json` → also bump `marketplace.json`, and vice-versa (bump all 8 to match).
+- Bump any `plugin.json` → also bump `marketplace.json`, and vice-versa (bump all 9 to match).
 
 ## Article Links in Skills (Further Reading)
 
diff --git a/README.md b/README.md
index d3c1285..4d73444 100644
--- a/README.md
+++ b/README.md
@@ -5,9 +5,9 @@
 
 # PM Skills Marketplace: The AI Operating System for Better Product Decisions
 
-> 65 PM skills and 36 chained workflows across 8 plugins. Claude Code, Cowork, and more. From discovery to strategy, execution, launch, and growth. 
+> 68 PM skills and 42 chained workflows across 9 plugins. Claude Code, Cowork, and more. From discovery to strategy, execution, launch, growth, and shipping AI-built code. 
 
-![Plugin overview](.docs/images/plugins-overview.webp)
+![PM Skills marketplace: skills, commands, and all 9 plugins at a glance](.docs/images/plugins.png)
 
 Designed for Claude Code and Cowork. Skills compatible with other AI assistants.
 
@@ -31,13 +31,15 @@ The result: better product decisions, not just faster documents.
 
 ## How It Works (Skills, Commands, Plugins)
 
+![Example prompts: a skill and two commands (/write-prd, /ship-check) in action](.docs/images/examples.png)
+
 **Skills** are the building blocks of the marketplace. Each skill gives Claude domain knowledge, analytical frameworks, or a guided workflow for a specific PM task. Some skills also work as reusable foundations that multiple commands share. 
 
 Skills are loaded automatically when relevant to the conversation — no explicit invocation needed. If needed (e.g., prioritizing skills over general knowledge), you can **force loading skills** with `/plugin-name:skill-name` or `/skill-name` (Claude will add the prefix).
 
 **Commands** are user-triggered workflows invoked with `/command-name`. They chain one or more skills into an end-to-end process. For example, `/discover` chains four skills together: brainstorm-ideas → identify-assumptions → prioritize-assumptions → brainstorm-experiments.
 
-**Plugins** group related skills and commands into installable packages. Each plugin covers a PM domain — discovery, strategy, execution, and so on. Installing the marketplace gives you all 8 plugins at once.
+**Plugins** group related skills and commands into installable packages. Each plugin covers a PM domain — discovery, strategy, execution, and so on. Installing the marketplace gives you all 9 plugins at once.
 
 ![How skills work](.docs/images/how-skills-work.webp)
 
@@ -54,7 +56,7 @@ Commands are designed to flow into each other, matching the PM workflow. After a
 3. Select **Add marketplace from GitHub**
 4. Enter: `phuryn/pm-skills`
 
-All 8 plugins install automatically. You get both commands (`/discover`, `/strategy`, etc.) and skills.
+All 9 plugins install automatically. You get both commands (`/discover`, `/strategy`, etc.) and skills.
 
 ![Installing PM Skills in Claude Cowork](.docs/images/pm-skills-install.gif)
 
@@ -73,6 +75,7 @@ claude plugin install pm-data-analytics@pm-skills
 claude plugin install pm-marketing-growth@pm-skills
 claude plugin install pm-go-to-market@pm-skills
 claude plugin install pm-execution@pm-skills
+claude plugin install pm-ai-shipping@pm-skills
 ```
 
 ### Other AI assistants (skills only)
@@ -188,11 +191,11 @@ Commands:
 </details>
 
 <details>
-<summary><strong>3. pm-execution</strong> — PRDs, OKRs, roadmaps, sprints, retros, release notes, stakeholder management (15 skills, 10 commands)</summary>
+<summary><strong>3. pm-execution</strong> — PRDs, OKRs, roadmaps, sprints, retros, release notes, stakeholder management (16 skills, 11 commands)</summary>
 
 Day-to-day product management: PRDs, OKRs, roadmaps, sprints, retrospectives, release notes, pre-mortems, stakeholder management, user stories, and prioritization frameworks.
 
-**Skills (15):**
+**Skills (16):**
 
 - `create-prd` — Comprehensive 8-section PRD template
 - `brainstorm-okrs` — Team-level OKRs aligned with company objectives
@@ -209,14 +212,16 @@ Day-to-day product management: PRDs, OKRs, roadmaps, sprints, retrospectives, re
 - `test-scenarios` — Test scenarios: happy paths, edge cases, error handling
 - `dummy-dataset` — Realistic dummy datasets as CSV, JSON, SQL, or Python
 - `prioritization-frameworks` — Reference guide to 9 prioritization frameworks (Opportunity Score, ICE, RICE, MoSCoW, Kano, etc.)
+- `strategy-red-team` — Adversarial stress-test of a plan: surface load-bearing assumptions, name what would make each one fail, and rank by cheapest test
 
-**Commands (10):**
+**Commands (11):**
 
 - `/write-prd` — Create a PRD from a feature idea or problem statement
 - `/plan-okrs` — Brainstorm team-level OKRs
 - `/transform-roadmap` — Convert a feature-based roadmap into outcome-focused
 - `/sprint` — Sprint lifecycle (`plan|retro|release`)
 - `/pre-mortem` — Pre-mortem risk analysis on a PRD or launch plan
+- `/red-team-prd` — Adversarially stress-test a PRD, roadmap, or strategy and rank the riskiest assumptions by cheapest test
 - `/meeting-notes` — Summarize a meeting transcript into structured notes
 - `/stakeholder-map` — Map stakeholders and create a communication plan
 - `/write-stories` — Break features into backlog items (`user|job|wwa`)
@@ -400,6 +405,38 @@ Commands:
 
 </details>
 
+<details>
+<summary><strong>9. pm-ai-shipping</strong> — AI Shipping Kit: document a vibe-coded app, audit security and performance, map test coverage, compile a shipping packet (2 skills, 5 commands)</summary>
+
+For PMs and founders accountable for AI-built code. AI agents write code fast but leave no record of *intent* — what the system should do, who may do what, where the secrets live, which rules are actually verified. This kit restores reviewability: it documents the system, then audits the gap between what the docs say and what the code actually does — the class of bug generic scanners miss.
+
+**Skills (2):**
+
+- `shipping-artifacts` — The durable documentation set that makes an AI-built app reviewable: a core every app needs (architecture, user/permission flows, permissions, variables/secrets, test-coverage map) plus conditional docs added only when they apply (emails, cron, SEO, embedded agents/automation). Defines what each doc must capture and how a reviewer uses it
+- `intended-vs-implemented` — The method for finding the gap between what a system is documented to do and what the code actually does, with cited evidence on both sides
+
+**Commands (5):**
+
+- `/ship-check` — Turn a vibe-coded repo into a reviewer-ready shipping packet: document, wire agent context, run security and performance audits, map test coverage, and compile the results
+- `/document-app` — Reverse-engineer a codebase into the system documents reviewers and auditors need — a core set (architecture, flows, permissions, variables) plus conditional docs (emails, cron, SEO, automation) when they apply
+- `/derive-tests` — Turn documented intent into a test-coverage map: inventory the tests that exist today, separate them from proposed tests and unverified gaps, and recommend a green-before-merge CI gate
+- `/security-audit-static` — Static security audit: map trust boundaries, cross-reference documented intent, self-refute every finding, and report only evidence-backed risks
+- `/performance-audit-static` — Static performance audit: find over-fetching, missing indexes, and caching opportunities, ranked by effort and impact
+
+**Examples:**
+
+Skills:
+- `What documentation does my Supabase app need before someone can review it?`
+- `Where does what this code does diverge from what the docs say it should do?`
+
+Commands:
+- `/ship-check the payments service`
+- `/document-app — Reverse-engineer the system docs for this repo`
+- `/derive-tests — Which documented rules have no test yet?`
+- `/security-audit-static src/api`
+
+</details>
+
 ---
 
 ## About
diff --git a/pm-ai-shipping/.claude-plugin/plugin.json b/pm-ai-shipping/.claude-plugin/plugin.json
new file mode 100644
index 0000000..fb213e2
--- /dev/null
+++ b/pm-ai-shipping/.claude-plugin/plugin.json
@@ -0,0 +1,23 @@
+{
+  "name": "pm-ai-shipping",
+  "version": "2.0.0",
+  "description": "AI Shipping Kit — for PMs and founders accountable for AI-built code. Document a vibe-coded app, audit it for intended-vs-implemented security gaps and performance issues, and produce a reviewer-ready shipping packet.",
+  "author": {
+    "name": "Paweł Huryn",
+    "email": "pawel@productcompass.pm",
+    "url": "https://www.productcompass.pm"
+  },
+  "keywords": [
+    "product-management",
+    "ai-shipping",
+    "vibe-coding",
+    "security-audit",
+    "performance-audit",
+    "code-review",
+    "documentation",
+    "owasp",
+    "shipping"
+  ],
+  "homepage": "https://www.productcompass.pm",
+  "license": "MIT"
+}
diff --git a/pm-ai-shipping/README.md b/pm-ai-shipping/README.md
new file mode 100644
index 0000000..4c3b97b
--- /dev/null
+++ b/pm-ai-shipping/README.md
@@ -0,0 +1,34 @@
+# pm-ai-shipping — AI Shipping Kit
+
+For PMs and founders accountable for AI-built code. Document a vibe-coded app, audit it for intended-vs-implemented security gaps and performance issues, and produce a reviewer-ready shipping packet.
+
+## Overview
+
+AI agents write code fast but leave no record of *intent* — what the system should do, who may do what, where the secrets live. Without that record, no human and no auditing agent can tell whether the code is safe to ship. This kit restores reviewability: it documents the system, then audits the gap between what the docs say and what the code does — the class of bug generic scanners miss because they have no model of intent.
+
+Start with `/ship-check` for the full sequence, or run a single stage with the specialist commands.
+
+## Install
+
+Install from the [pm-skills marketplace](https://github.com/phuryn/pm-skills) and enable the `pm-ai-shipping` plugin. Each command can be triggered with `/pm-ai-shipping:<command>` or its short `/<command>` form; skills auto-load when the topic matches.
+
+## Skills (2)
+
+- **shipping-artifacts** — The durable documentation set that makes an AI-built app reviewable: a core every app needs (architecture, user/permission flows, permissions, variables/secrets, test-coverage map) plus conditional docs added only when they apply (emails, cron, SEO, embedded agents/automation). Defines what each doc must capture and how a reviewer uses it.
+- **intended-vs-implemented** — The method for finding the gap between what a system is documented to do and what the code actually does, with cited evidence on both sides and without hand-wavy findings.
+
+## Commands (5)
+
+- `/pm-ai-shipping:ship-check` — Turn a vibe-coded repo into a reviewer-ready shipping packet: document, wire agent context, run security and performance audits, map test coverage, and compile the results.
+- `/pm-ai-shipping:document-app` — Reverse-engineer a codebase into the system documents reviewers and auditors need — a core set (architecture, flows, permissions, variables) plus conditional docs (emails, cron, SEO, automation) when they apply.
+- `/pm-ai-shipping:derive-tests` — Turn documented intent into a test-coverage map: inventory the tests that exist today, separate them from proposed tests and unverified gaps, mark each unit / integration / guarded-live / manual, and recommend a green-before-merge CI gate.
+- `/pm-ai-shipping:security-audit-static` — Static security audit: map trust boundaries, cross-reference documented intent, self-refute every finding, and report only evidence-backed risks.
+- `/pm-ai-shipping:performance-audit-static` — Static performance audit: find over-fetching, missing indexes, and caching opportunities, ranked by effort and impact.
+
+## Author
+
+Paweł Huryn — [The Product Compass Newsletter](https://www.productcompass.pm)
+
+## License
+
+MIT
diff --git a/pm-ai-shipping/commands/derive-tests.md b/pm-ai-shipping/commands/derive-tests.md
new file mode 100644
index 0000000..a7a0ef0
--- /dev/null
+++ b/pm-ai-shipping/commands/derive-tests.md
@@ -0,0 +1,114 @@
+---
+description: Turn documented intent into a test-coverage map — inventory the tests that exist today, derive test cases from the system docs, separate existing coverage from proposed tests and unverified gaps, mark each unit / integration / guarded-live / manual, and recommend a green-before-merge CI gate
+argument-hint: "<repo path or area; defaults to the whole repository>"
+---
+
+# /derive-tests -- Turn Intent Into Tests
+
+The docs say what the system *should* do. An audit finds where the code *doesn't*. Tests are what stop that gap from reopening after the next AI edit. This command reads the documented intent, turns each load-bearing rule into a concrete test case, sorts them into what to automate, what needs a guarded live run, and what stays manual — then recommends the CI gate that keeps `main` honest.
+
+This produces a coverage map (`tests.md`) and concrete test cases, not a finished suite — you or the next agent implement the deterministic ones.
+
+## Invocation
+
+```
+/derive-tests
+/derive-tests the checkout flow
+/derive-tests supabase/functions
+```
+
+## Prerequisite: documented intent
+
+Tests are derived from the docs, so the docs come first. If `/documentation/*.md` is missing or thin, run `/document-app` first — `/derive-tests` reads `flows.md`, `permissions.md`, and `automation.md` most heavily. You cannot map coverage to rules you never wrote down — where intent is absent, say so rather than inventing rules to test.
+
+## The workflow
+
+### 1. Read the intent — and the tests that already exist
+
+Read the applicable system docs (architecture, flows, permissions, variables, and any of emails, cron, seo, automation that exist). Apply the **shipping-artifacts** skill for what each doc should contain, and the **intended-vs-implemented** skill for the discipline of treating docs as claims to verify, not proof.
+
+Then inventory the **existing test suite** — the test files, what they actually assert, and what runs in CI today. The map you produce must distinguish coverage that exists *now* from coverage you're *proposing*; skipping this step yields a falsely-green map that claims rules are pinned when nothing checks them. If there are no tests, say so plainly — that is itself a finding.
+
+### 2. Extract the rules worth testing
+
+Pull out the load-bearing, deterministic rules — the ones whose violation crosses a trust, data, money, tenant, or privacy boundary:
+
+- authorization allow **and deny** cases (especially the boundary crossings in `flows.md` and the matrix in `permissions.md`),
+- input validation and output encoding at each sink,
+- idempotency of jobs and dedup keys,
+- fail-closed defaults (error / timeout / cache-miss / flag paths that must deny, not allow),
+- side-effect conditions (exactly when an email sends, a write commits, a paid action fires),
+- public-data-only constraints on public or bot routes,
+- the output-contract and tool-surface limits of any agent in `automation.md`.
+
+Skip cosmetic behavior. A rule earns a test when getting it wrong harms someone other than the actor.
+
+### 3. Build the coverage map
+
+One row per use case: **rule → expected behavior (incl. the negative case) → evidence source (doc + code) → test type → status (existing / proposed / none)**. The status column is what keeps the map honest — mark a rule *existing* only when a test in the repo actually asserts it today.
+
+Test types:
+
+- **unit** — pure and deterministic, no external services.
+- **integration (deterministic)** — exercises real wiring against a local or in-memory dependency (test DB, mocked provider) and runs the same way every time.
+- **guarded live** — needs a real external DB, email provider, LLM, or third party. Runs only behind an explicit flag, never in the default CI run.
+- **manual** — UI/visual or judgment calls. A reviewer checklist item, not an automated test.
+
+**What CI must require:** the deterministic local set — unit plus deterministic integration tests, the ones that pass or fail the same way on every run with no live dependencies. Prefer **unit** where the decision logic can be isolated; reach for **integration** when the rule lives in the wiring (middleware, RLS, auth guards) and only a real-but-local dependency can exercise it. Guarded-live and manual rows never gate the default run.
+
+When a rule can only be exercised live, you can extract its *decision* into a pure helper so the logic is unit-testable — but only as a **complement, not a replacement** for testing the real enforcement. The unit test proves the helper's logic; it does **not** prove the framework actually calls it. Wiring and policy enforcement (route middleware, DB row-level security, auth guards, provider config) still need an integration or guarded-live check, or the helper becomes a policy shadow that passes while the real path is unprotected.
+
+### 4. Propose the tests
+
+For each rule you can pin with a deterministic automated test (unit or integration), write the case: name, arrange/act/assert intent, and the negative case it must reject. Group cases by the doc or flow they defend. Prefer the smallest test that pins the rule — one clear assertion per boundary beats a sprawling integration test that fails for ten reasons.
+
+### 5. Recommend the CI gate
+
+Recommend — don't silently install — a CI setup matched to the repo's stack and existing tooling:
+
+- run the **deterministic local set on every pull request** (unit + any integration test that runs without live services),
+- keep **guarded-live tests opt-in** (manual or scheduled, never blocking),
+- **gate merges to `main` on green** via a required status check + branch protection.
+
+Output the workflow file and the branch-protection setting as a clearly-labeled suggestion for the user to approve, not an applied change.
+
+### 6. Report coverage and gaps
+
+Write `tests.md` in three clearly separated sections:
+
+- **Existing coverage** — rules a test in the repo pins *today* (from the Step 1 inventory).
+- **Proposed tests** — the cases you're recommending but that don't exist yet, by type.
+- **Gaps** — documented rules with **no verification at all**, ranked by what crossing them exposes.
+
+The gaps are the backlog, and they are exactly where the next AI edit can silently break a boundary. Be honest that proposed ≠ existing: a rule isn't covered until a test actually asserts it.
+
+## Output
+
+```
+Test Coverage: [scope]
+
+| Use case | Rule (doc) | Expected behavior (+ deny case) | Evidence | Type | Status |
+|----------|-----------|---------------------------------|----------|------|--------|
+[status: existing / proposed / none]
+
+### Existing coverage
+[tests already in the repo, each tied to the rule it pins]
+
+### Proposed tests
+[grouped by flow/doc — name · assert · negative case · type]
+
+### Recommended CI gate
+[workflow snippet for the detected stack + "green-before-merge" branch-protection note]
+
+### Gaps — documented but unverified
+[rules with no test yet, ranked by what crossing them exposes]
+```
+
+Write the coverage map to `/documentation/tests.md`; optionally also write the full report to `/reports/test_plan_{timestamp}.md`.
+
+## Notes
+
+- This is the verification half of "documented == implemented": the audits find today's gap, these tests stop it from reopening tomorrow.
+- Don't fabricate rules to manufacture coverage. If the docs are silent, the gap is in the docs — fix them with `/document-app` first.
+- Don't wire external services into the default CI run; flaky live tests erode the green-before-merge gate until people start ignoring it.
+- Covers test derivation only. For the gap audit itself use `/security-audit-static`; for the full document → audit → test → packet sequence use `/ship-check`.
diff --git a/pm-ai-shipping/commands/document-app.md b/pm-ai-shipping/commands/document-app.md
new file mode 100644
index 0000000..dd3a618
--- /dev/null
+++ b/pm-ai-shipping/commands/document-app.md
@@ -0,0 +1,60 @@
+---
+description: Reverse-engineer an AI-built codebase into the system documents reviewers and auditors need — a core set (architecture, flows, permissions, variables) plus conditional docs (emails, cron, SEO, automation) when they apply
+argument-hint: "<repo path or area; defaults to the whole repository>"
+---
+
+# /document-app -- Make the System Reviewable
+
+Produce the durable documentation an AI-built app is missing: an honest map of what the system is, who can do what, and where the risk lives. These docs are the foundation every later audit compares the code against.
+
+## Invocation
+
+```
+/document-app
+/document-app supabase/functions
+/document-app the backend
+```
+
+## Workflow
+
+### Step 1: Scope
+
+Document **$ARGUMENTS**. If empty, document the whole repository, prioritizing backend code, auth, data access, background jobs, and anything that sends, schedules, or exposes data.
+
+### Step 2: Reverse-Engineer the Docs
+
+Apply the **shipping-artifacts** skill. Reading the code as the source of truth, produce the applicable documents in `/documentation/`.
+
+**Core (always):**
+
+- `architecture.md` — system overview, stack, auth flow, trust boundaries
+- `flows.md` — the permission-relevant journeys: each protected step's authz check, the trust-boundary crossings, and the side effects each flow causes
+- `permissions.md` — roles, scope derivation, resource × operation × role matrix, RLS vs. code-enforced checks
+- `variables.md` — config & secrets mapped to risk and rotation
+
+**Conditional (only if the capability exists — otherwise note its absence in one line):**
+
+- `emails.md` — notification path, templates, retry/backoff, failure visibility
+- `cron.md` — scheduled-work inventory, idempotency, internal-call auth
+- `seo.md` — SPA preview approach, route coverage, metadata sanitization
+- `automation.md` — embedded agents/automations: trigger, tool surface, steering vs. hard guardrails, output contract, app-owned side effects, approval gates
+
+Be brutally honest about the current state without being paranoid. Skip any conditional document that doesn't apply and say so. Add a "Related Documents" reference in `architecture.md` for each doc produced. (The test-coverage map, `tests.md`, is produced separately by `/derive-tests`.)
+
+### Step 3: Report
+
+Summarize what was created or updated, what was skipped and why, and any gaps where the code was too unclear to document confidently (those are the first things to fix).
+
+### Step 4: Offer Next Steps
+
+- "Want me to **derive a test-coverage map** (`/derive-tests`) so each documented rule has a verification plan?"
+- "Want me to **run a security audit** now that the intended behavior is documented?"
+- "Should I **check for performance issues** — over-fetching, missing indexes, caching?"
+- "Want me to **run `/ship-check`** to wire agent context and produce a full shipping packet?"
+
+## Notes
+
+- These docs describe *this* system — keep generic theory and finished templates out.
+- Write for two readers: a human reviewer and the next AI coding agent.
+- Don't include an "updated date" line.
+- The agent operating-context file (`CLAUDE.md` / `AGENTS.md`) is produced separately at the `/ship-check` handoff step — it's instructions derived from these docs, not system documentation.
diff --git a/pm-ai-shipping/commands/performance-audit-static.md b/pm-ai-shipping/commands/performance-audit-static.md
new file mode 100644
index 0000000..6ef86a3
--- /dev/null
+++ b/pm-ai-shipping/commands/performance-audit-static.md
@@ -0,0 +1,59 @@
+---
+description: Static performance audit of AI-built code — find over-fetching, missing indexes, and caching opportunities, ranked by effort and impact
+argument-hint: "<repo path or area; defaults to the whole repository>"
+---
+
+# /performance-audit-static -- Find What Won't Scale
+
+A focused performance review for AI-built code. Agents optimize for "it works on my seed data," not "it holds at 100× the rows." This command finds the three failure modes that surface as data grows — over-fetching, missing indexes, and absent caching — and ranks fixes by effort and impact.
+
+This is a static review of code and queries, not a load test.
+
+## Invocation
+
+```
+/performance-audit-static
+/performance-audit-static src/views
+```
+
+## Scope
+
+Audit **$ARGUMENTS**. If empty, review the whole repository, prioritizing list and dashboard views, frequently hit endpoints, and large tables.
+
+## The audit
+
+### 1. Over-fetch in view payloads
+
+Review components that render list or dashboard views. Identify fields fetched from the database but never used in the frontend, `SELECT *` on wide tables, missing pagination, absent lazy loading, and redundant loads. Suggest a minimal field set per component or route.
+
+### 2. Missing or inefficient indexes
+
+Review queries, filters, and RPCs used in production views. Identify missing or inefficient indexes based on sort, filter, and join conditions, focusing on large tables and hot endpoints. Give specific index definitions, not "add an index."
+
+### 3. Caching opportunities
+
+Review endpoints and data-access patterns for frequently called paths that return static or rarely changing data. Identify where frontend or backend caching helps, and specify the invalidation rule for each — caching without an invalidation plan is a correctness bug in waiting.
+
+## Output
+
+Report findings per view, route, or table:
+
+```
+Performance Audit: [scope]
+
+<view / route / table>:
+  - Finding: <what is slow or wasteful>
+  - Recommendation: <specific change — field set, index definition, cache + invalidation>
+  - Effort: Low | Medium | High
+  - Priority: Low | Medium | High
+  - Expected effect: <e.g. payload size, query time, load time>
+```
+
+End with what's already efficient (say it explicitly) and what needs runtime profiling to confirm. Optionally write the report to `/reports/performance_audit_{timestamp}.md`.
+
+## Notes
+
+- Rank by impact-per-effort — one missing index on a hot table usually beats ten micro-optimizations.
+- Don't flag theoretical inefficiency with no growth path; flag what breaks as rows or traffic scale.
+- This command covers performance only. For authorization, injection, and data-exposure risks, use `/security-audit-static`.
+- For an end-to-end pass with documentation and a shipping packet, use `/ship-check`.
diff --git a/pm-ai-shipping/commands/security-audit-static.md b/pm-ai-shipping/commands/security-audit-static.md
new file mode 100644
index 0000000..dc40d9b
--- /dev/null
+++ b/pm-ai-shipping/commands/security-audit-static.md
@@ -0,0 +1,86 @@
+---
+description: Static security audit of AI-built code — map trust boundaries, cross-reference documented intent, self-refute every finding, and report only evidence-backed risks
+argument-hint: "<repo path or area; defaults to the whole repository>"
+---
+
+# /security-audit-static -- Audit the Code You Already Have
+
+A focused, self-contained security audit for AI-built code. It keeps a small, durable engine — map the boundaries, check intent against implementation, refute before reporting — and refuses to emit anything it can't back with cited evidence.
+
+This is a review, not a guarantee: it produces code-review findings, not confirmed exploits.
+
+> Method adapted from the public, Apache-2.0 `security-guidance` plugin in Anthropic's
+> `claude-plugins-official` repository. Not affiliated with or endorsed by Anthropic.
+
+## Invocation
+
+```
+/security-audit-static
+/security-audit-static supabase/functions
+```
+
+## Scope
+
+Audit **$ARGUMENTS**. If empty, audit the whole repository, prioritizing request handlers, auth, data access, background jobs, and anything that renders, fetches, executes, logs, or stores user-controlled data. For non-trivial scopes, fan out with parallel subagents — one per function/module cluster, each running the mapping, inspection, and intent cross-reference (steps 1–3); then merge their candidates and run the self-refute (step 4) yourself over the full set.
+
+## The audit (small engine, strong constraint)
+
+### 1. Map entry points to trust boundaries and sinks
+
+Optimize for recall first — read every file in scope in full, then grep for handler, route, RPC, and shared-helper names to find callers and downstream sinks. Reading the file that contains the bug is what prevents missing it.
+
+Entry points: HTTP/RPC handlers, edge/serverless functions, webhooks, queue consumers, upload handlers, auth callbacks, cron-triggered endpoints. Sinks: raw SQL / query filters, shell/exec, `eval` / `new Function` / dynamic imports, HTML render and templates, outbound fetches, filesystem paths, IAM/role writes, logs and analytics, deserializers (incl. YAML/XML and archive extraction), response headers / cache-control, and **LLM prompts and tool calls** (prompt injection). For every value reaching a sink, decide whether an attacker can influence it and trace it back to its source.
+
+### 2. Inspect the four high-value paths
+
+Authorization, data access, session/identity, and input→output encoding. Compare sibling handlers — if one enforces a check another omits, the omission is a finding. Follow cross-file flows; input in module A reaching a dangerous operation in module B is where the real bugs hide.
+
+### 3. Cross-reference intended vs. implemented
+
+Apply the **intended-vs-implemented** skill against `/documentation/*.md`. A rule documented but not enforced in code is a finding on its own. If the docs are absent, note it and recommend `/document-app` first — an intent audit needs intent on record.
+
+### 4. Self-refute every candidate
+
+For each finding, try to disprove it. Default to **keep** unless you find cited evidence (file + line) for one of: a real sanitizer/encoder/validator/authorization check stops the exploit *at the sink*; the sink is non-dangerous (typed, hardcoded, isolated, schema-decoded); a frontend gate is independently re-enforced on the backend; an unvalidated credential is immediately forwarded to an upstream system that validates it; a config/flag gates the path and users can't influence it per request; or the path isn't reachable in production.
+
+Name the **attacker** and the **victim**: refute if the only victim is the attacker on their own machine/account/tenant/data and no shared system or privilege boundary is crossed; keep if the impact reaches other users, tenants, shared infrastructure, billing, email reputation, secrets, or compliance-sensitive data. **Never apply attacker-equals-victim refutation to SSRF/outbound-network sinks, shared billing or quota sinks, data-exposure findings, cross-tenant or cross-principal flows, or server-side execution/rendering** — those harm someone other than the attacker by definition. Never refute a finding merely because the code is pre-existing — pre-existing bugs are the point. Do not speculate.
+
+### 5. Report only what survives
+
+## High-miss checklist (technology-shaped, not stack-specific)
+
+Apply these — they're where AI-built apps most often fail:
+
+- **Service-role / disabled-RLS boundaries** — if the DB client bypasses row-level security, *every* authorization decision must be in code; flag queries missing the org/owner filter.
+- **Auth-provider drift** — claims from an external identity provider (e.g. Clerk) trusted without verifying how they map to data scope.
+- **Gate/action field mismatch** — permission checked on one ID, action performed on an independent ID never proven to belong to it.
+- **Forgeable request signals** — endpoints gated by `?source=cron`, `?bot=1`, guessable headers, or unsigned webhook-like payloads instead of real auth. Raise severity when the endpoint mutates data, sends email, or triggers paid usage.
+- **Output encoding vs. input validation** — user data interpolated into HTML, `<title>`, attributes, JSON-LD, SQL, or Markdown must be encoded for *that* sink; input validation doesn't count. (XSS, CSP gaps.)
+- **SSRF / renderer abuse** — attacker-influenced URLs, HTML, SVG, or Markdown reaching an outbound fetch or a renderer (headless browser, PDF/OG-image generator).
+- **Parser / validator differentials** — the validator accepts a value the consumer interprets differently: unanchored regex, `startsWith`/substring allowlists, URL-parser disagreement, encoding/case/slash/path-normalization mismatch, or validation on one representation and execution on another.
+- **Fail-open paths** — error, `catch`, timeout, cancellation, cache-miss, stale-cache, feature-flag, or boundary-value branches that default to *allow*. AI code loves a permissive fallback.
+- **Secrets / PII to observability** — credentials, tokens, emails, or sensitive data reaching logs, traces, analytics, or error bodies; check error branches especially.
+- **Public-data-only violations** — SPA/SEO bot routes or "public" endpoints over-fetching private fields.
+
+## Output
+
+Group surviving findings by file, sorted by severity, in the standard format:
+
+```
+Security Audit: [scope]
+
+<file>:
+  N. [SEVERITY] [Category] <location>
+     Risk Level: Critical | High | Medium | Low
+     Attack Scenario: <attacker -> sink -> impact, step by step>
+     Impact: <what data or functionality is compromised>
+     Solution: <concrete code change>
+```
+
+End with: the root-cause theme across findings; **what is well-built — say it explicitly**; and what you could not verify and the user should double-check. Optionally write the report to `/reports/security_audit_{timestamp}.md`.
+
+## Notes
+
+- Don't report generic hardening with no concrete impact, outdated deps without a reachable path, or test/mock code unless it ships. Logic and authorization bugs with no classic sink still count.
+- This command covers security only. For over-fetching, indexes, and caching, use `/performance-audit-static`.
+- For an end-to-end pass that documents first and produces a shipping packet, use `/ship-check`.
diff --git a/pm-ai-shipping/commands/ship-check.md b/pm-ai-shipping/commands/ship-check.md
new file mode 100644
index 0000000..c380f76
--- /dev/null
+++ b/pm-ai-shipping/commands/ship-check.md
@@ -0,0 +1,76 @@
+---
+description: Turn a vibe-coded repo into a reviewer-ready shipping packet — document the app, wire agent context, run security and performance audits, map test coverage, and compile the results
+argument-hint: "<repo path or area; defaults to the whole repository>"
+---
+
+# /ship-check -- Is This Safe to Ship?
+
+Your AI wrote the code. This command answers the question you actually have — *is it safe to ship?* — by running the full shipping sequence and compiling the results into one reviewer-ready packet a human can sign off on.
+
+`/ship-check` does not replace the specialist commands. It coordinates them and produces the final artifact none of them produce alone: the **shipping packet**.
+
+## Invocation
+
+```
+/ship-check
+/ship-check the payments service
+/ship-check supabase/functions
+```
+
+## The shipping sequence
+
+Run on **$ARGUMENTS** (or the whole repository if empty). Each step builds on the last — the ordering is the point, because every audit is only as good as the documented intent it can compare the code against.
+
+### Step 1: Document the system
+
+Ensure the system docs exist and are current (run `/document-app` if they're missing or stale). Apply the **shipping-artifacts** skill — the core docs `/document-app` produces (architecture, flows, permissions, variables; the fifth core doc, `tests.md`, is derived in Step 5) plus any conditional docs that apply (emails, cron, seo, automation). These docs are the intended-state baseline for everything that follows.
+
+### Step 2: Wire the agent operating context
+
+Create or refresh `CLAUDE.md` (and a thin `AGENTS.md` pointing to it) **derived from** the system docs — the operating instructions the next AI coding agent inherits: what the system is, the trust boundaries, what may and may not be touched, where the guardrails are. This is a different artifact from the system docs: instructions, not description.
+
+### Step 3: Security audit
+
+Run the security pass (`/security-audit-static`), applying the **intended-vs-implemented** skill to flag where the code diverges from `permissions.md`, `flows.md`, and `architecture.md`. Summarize surviving findings.
+
+### Step 4: Performance audit
+
+Run the performance pass (`/performance-audit-static`) — over-fetching, missing indexes, caching. Summarize findings.
+
+### Step 5: Derive the test-coverage map
+
+Run `/derive-tests` to turn the documented rules — and the gaps the audits just surfaced — into a coverage map (`tests.md`): which rules are pinned by tests that exist *today*, which are only proposed, which are guarded-live or manual, and which have no verification at all. Running this **after** the audits is deliberate: each confirmed finding becomes a concrete regression test to pin, so the same gap can't silently reopen on the next AI edit. This is the operational form of "documented == implemented," and the unverified boundary rules feed straight into the launch-blocker assessment below.
+
+### Step 6: Compile the shipping packet
+
+```
+## Shipping Packet: [repo / area]
+
+### Documentation Inventory
+| Doc | Status (present / stale / missing / n/a) | Notes |
+
+### Agent Context
+CLAUDE.md / AGENTS.md: [created / updated / already current]
+
+### Test Coverage
+[Rules pinned by tests that exist today · proposed but not yet written · guarded-live/manual · and the documented rules nothing verifies yet]
+
+### Security Summary
+[Counts by severity + the surviving findings, each: Risk · Attack · Impact · Fix]
+
+### Performance Summary
+[Findings by view/route/table, each: Recommendation · Effort · Priority]
+
+### Launch Blockers
+[Unresolved Critical/High items — including any boundary rule that is both unverified and unaudited — that should stop a ship]
+
+### Recommended Next Actions
+[Concrete owner actions or commands to run next]
+```
+
+## Notes
+
+- This is a handoff compiler: the value is sequencing plus synthesis, not re-deriving each audit.
+- If documentation is missing, the packet says so loudly — an audit without documented intent is incomplete, and the inventory makes that visible rather than hiding it.
+- Findings are code-review results, not confirmed exploits; the packet is a basis for human sign-off, not a substitute for it.
+- Run the specialist commands directly (`/document-app`, `/derive-tests`, `/security-audit-static`, `/performance-audit-static`) when you only need one stage.
diff --git a/pm-ai-shipping/skills/intended-vs-implemented/SKILL.md b/pm-ai-shipping/skills/intended-vs-implemented/SKILL.md
new file mode 100644
index 0000000..48ce042
--- /dev/null
+++ b/pm-ai-shipping/skills/intended-vs-implemented/SKILL.md
@@ -0,0 +1,41 @@
+---
+name: intended-vs-implemented
+description: "The method for finding the gap between what a system is supposed to do and what the code actually does — the class of bug generic scanners miss because they have no model of intent. Defines what counts as documented intent, what counts as implementation evidence, which mismatches matter, and how to avoid hand-wavy findings. Use when auditing AI-built code, reviewing access control against documented permissions, or checking whether a codebase matches its own documentation."
+---
+
+# Intended vs. Implemented: Auditing the Gap
+
+## Purpose
+
+A linter scans code in a vacuum. It can tell you the code is *internally* consistent; it cannot tell you the code does what you *meant*, because it has no model of your intent. The highest-value security and correctness bugs live in that gap — a permission documented but never enforced, a "cron-only" endpoint anyone can call, a field marked public-only that leaks private data.
+
+This skill is the method for finding that gap. It is the differentiator: it only works when intent has been written down first (see the **shipping-artifacts** skill), and that's exactly why commodity tools can't replicate it.
+
+## Context
+
+Use this when documented intent exists — `permissions.md`, `architecture.md`, `variables.md`, etc. If those docs are absent or stale, that absence is itself the first finding: you cannot audit intent you never recorded. Recommend documenting first, then auditing.
+
+## Method
+
+1. **Establish intent.** Read the `/documentation/*.md` set as the record of what *should* be true: who may access what, which boundaries are trusted, which data is public. Treat the docs as claims to verify against the code, not as proof of what the code does.
+
+2. **Gather implementation evidence.** Read the code that enforces (or fails to enforce) each claim. Evidence is a cited file and line — the actual authorization check, the actual query filter, the actual sanitizer. "It's probably handled upstream" is not evidence; the code path is.
+
+3. **Compare claim to code, one boundary at a time.** For each documented rule, ask: does an enforcement point actually implement it, on the server, on every path? Distrust comments like "internal only," "admin only," or "validated elsewhere" — verify them in code.
+
+4. **Classify each mismatch by whether it matters.** A mismatch matters when crossing it lets a real actor reach data, money, infrastructure, or another tenant they shouldn't. It does not matter when the only person affected is the actor themselves on their own data. Drop cosmetic drift; keep boundary-crossing drift.
+
+5. **Avoid hand-wavy findings.** Every finding names: the **documented intent** (quote the doc), the **implemented reality** (cite the code), the **attacker and victim**, and the **concrete fix**. If you cannot cite both sides of the gap, it is a question to investigate, not a finding to report.
+
+## What counts
+
+- **Intent:** a documented rule, boundary, scope, or public/private classification.
+- **Implementation evidence:** a cited enforcement point (or its provable absence) in the code.
+- **A mismatch that matters:** doc says one thing, code does another, and the difference crosses a trust, cost, data, or tenant boundary.
+
+## Notes
+
+- Documented-but-unenforced is a finding on its own — rank it by what crossing the gap exposes.
+- Undocumented-but-enforced is usually fine, but flag it: the docs are now stale, which weakens the next audit.
+- This method feeds the security and performance audits; it does not replace their sink-level analysis — it adds the intent axis they lack.
+- Never fabricate intent to manufacture a gap. If the docs are silent, say the docs are silent.
diff --git a/pm-ai-shipping/skills/shipping-artifacts/SKILL.md b/pm-ai-shipping/skills/shipping-artifacts/SKILL.md
new file mode 100644
index 0000000..7cbc3c6
--- /dev/null
+++ b/pm-ai-shipping/skills/shipping-artifacts/SKILL.md
@@ -0,0 +1,79 @@
+---
+name: shipping-artifacts
+description: "The durable documentation set that makes an AI-built (vibe-coded) app reviewable before shipping. A small core every app needs — architecture, user/permission flows, permissions, variables/secrets, and a test-coverage map — plus conditional docs added only when they apply: emails, scheduled work, SEO, and embedded agents/automation. Defines what each doc must capture and how a reviewer or auditor uses it. Use when documenting a codebase for handoff, mapping user journeys and trust-boundary crossings, planning test coverage, or preparing for a security or performance audit."
+---
+
+# Shipping Artifacts: The Docs That Make AI-Built Code Reviewable
+
+## Purpose
+
+AI agents write code fast, but they leave no durable record of *intent* — what the system is supposed to do, who is allowed to do what, where the secrets live, which rules are actually verified. Without that record, no human (and no auditing agent) can tell whether the code is safe to ship. This skill defines the small set of documents that restore reviewability.
+
+These docs live in `/documentation/` and are written for two readers: a human reviewer and the next AI coding agent. They are the **intended-state** half of every later audit — a security or performance review is only as good as the intent it can compare the code against.
+
+## How the set is organized
+
+The set is **not** a fixed list — it is a small **core** plus **conditional** docs you add only when the capability exists.
+
+- **Core docs** — every reviewable app has these surfaces, so always produce them.
+- **Conditional docs** — include one only if the app actually has that capability. If it doesn't, write a single line in `architecture.md` ("No scheduled work — no `cron.md`.") rather than inventing an empty document. Reviewability comes from an honest map, and "we don't do X" is part of the map.
+- Most docs are reverse-engineered from code by `/document-app`. The one exception is `tests.md`, which is *derived from the other docs* by `/derive-tests` — it is the verification map, not a description of a subsystem.
+
+Be brutally honest about the current state without being paranoid. The job is an accurate map, not a clean bill of health. Each doc is short, table-and-bullet heavy, and skips generic theory.
+
+## Core documents
+
+Each entry: file · one-line purpose · what it must capture · how a reviewer uses it.
+
+1. **`architecture.md`** — what the system is and how it hangs together.
+   - Must capture: product overview + key assumptions; tech stack; how auth/sessions/claims flow end to end; the trust boundaries (e.g. service-role vs. client); a short **Known risks / assumptions** list (each entry backed by where it shows up in the code, not a generic checklist); a "Related Documents" index of every other doc produced.
+   - Reviewer use: the root document — everything else is cross-referenced from here.
+
+2. **`flows.md`** — the journeys where permissions and side effects are actually exercised.
+   - Must capture: each load-bearing flow as actor + precondition + success outcome; the step-by-step sequence across UI → server → data → jobs → providers → agents; the **authz check at each protected step** (which claim/role/scope, on which resource, and the expected *deny* case); the **trust-boundary crossings** (browser→server, server→provider, job→app, agent→tool, webhook→app); the state changes and side effects each step causes (writes, emails queued, jobs triggered, outbound calls).
+   - Reviewer use: the runtime view a static `permissions.md` matrix can't show — *where* and *in what order* authorization is enforced, and where it can be skipped.
+   - **Anti-PRD rule:** a flow that doesn't touch permissions, data integrity, external side effects, money, privacy, or operational safety does not belong here. This is a security/operations map, not a feature spec.
+
+3. **`permissions.md`** — who is allowed to do what.
+   - Must capture: roles/claims; where scope is derived (token vs. DB); a resource × operation × role matrix; which tables have row-level security and which rely on code-enforced checks.
+   - Reviewer use: the baseline an access-control audit compares the code against. `flows.md` shows it in motion; this is the static reference.
+
+4. **`variables.md`** — configuration and secrets, mapped to risk.
+   - Must capture: a table of Name · used-by · scope (server/client) · source · rotation · risk; explicit confirmation that no secret is bundled client-side; a pre-go-live checklist.
+   - Reviewer use: the secrets/PII-leak surface and the rotation plan during incident response.
+
+5. **`tests.md`** — the verification map: which documented rules are actually checked, which are only proposed, and which are checked by nothing.
+   - Must capture, in three clearly separated sections so the map can't read falsely green:
+     - **Existing coverage** — tests that are in the repo *today*, each tied to the rule it pins (so the map reflects reality, not a wish-list).
+     - **Proposed tests** — recommended cases not yet written, marked by **test type** (automated unit/integration · guarded live · manual review).
+     - **Gaps** — documented rules with no verification at all, ranked by what crossing them exposes.
+   - Each row carries: use-case → rule → expected behavior (including the deny/negative case) → evidence source (doc + code) → status (existing / proposed / none). It also notes which checks are CI-required and gate merges to `main`.
+   - Reviewer use: the operational form of "documented == implemented" — it shows whether each rule the other docs claim is actually pinned by a test today, only proposed, or unverified.
+   - Produced by `/derive-tests` (not `/document-app`), because it is derived from the other docs and the existing test suite rather than read off a subsystem.
+
+## Conditional documents (include only when the capability exists)
+
+6. **`emails.md`** — every notification the system sends. *Include only if the app sends transactional or automated email.*
+   - Must capture: the queue → processor → provider path; templates and the variables they accept; retry/backoff behavior; where to look when a send fails.
+   - Reviewer use: spotting unvalidated template inputs and PII exposure boundaries.
+
+7. **`cron.md`** — all scheduled work and how to operate it safely. *Include only if scheduled or background jobs exist.*
+   - Must capture: an inventory table (job → schedule → function → secrets → limits → retry); how each job stays idempotent; how internal calls authenticate; where to see last runs.
+   - Reviewer use: finding forgeable triggers and unbounded background jobs.
+
+8. **`seo.md`** — how a single-page app handles SEO and social previews. *Include only if there are public/indexable or bot-facing routes.*
+   - Must capture: the preview approach (static meta / prerender / edge HTML); a route → needs-SEO → public-data-only table; how dynamic metadata is sanitized; bot-vs-human routing.
+   - Reviewer use: catching public-data-only violations and metadata injection on bot routes.
+
+9. **`automation.md`** — embedded agents and other automation paths. *Include only if the app embeds AI agents, LLM workflows, tool-calling, webhooks, or external automation.*
+   - Must capture, per automation/agent: trigger + owner + whether it runs automatically or only after approval; the inputs it may read and the **exact tools/APIs it may call** (the tool surface is itself a hard guardrail); where **steering** lives (the prompt) vs. the **non-prompt hard guardrails**; the **output contract** back to the app (schema, validation, failure handling); **app-owned side effects vs. agent-owned suggestions**; and the controls — approval gates, audit/timeline logging, rate limits, retries, kill switch.
+   - Reviewer use: makes hidden automation paths visible and draws the line between what an agent *proposes* and what the app *enforces* — the highest-risk surface in modern AI-built apps.
+
+## Notes
+
+- Each produced doc adds a reference to itself in `architecture.md` under a "Related Documents" section, so the set stays discoverable.
+- Skip any conditional document that doesn't apply, and say so in one line rather than inventing content.
+- Keep examples and finished templates out of these docs — they describe *this* system, not the general method.
+- The agent operating-context file (`CLAUDE.md` / `AGENTS.md`) is a *different* artifact — instructions derived from these docs, not system documentation. It is produced by `/ship-check` in its agent-context step, not here.
+- `tests.md` is produced by `/derive-tests`; the rest are produced by `/document-app`.
+- Do not include an "updated date" line; the file's history is the source of truth.
diff --git a/pm-data-analytics/.claude-plugin/plugin.json b/pm-data-analytics/.claude-plugin/plugin.json
index e561fda..44cbbac 100644
--- a/pm-data-analytics/.claude-plugin/plugin.json
+++ b/pm-data-analytics/.claude-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "pm-data-analytics",
-  "version": "1.0.1",
+  "version": "2.0.0",
   "description": "Data analytics skills for PMs: SQL query generation and cohort analysis. Analyze user data, generate queries, and identify retention patterns.",
   "author": {
     "name": "Paweł Huryn",
diff --git a/pm-execution/.claude-plugin/plugin.json b/pm-execution/.claude-plugin/plugin.json
index 4dfcf99..86c1fc4 100644
--- a/pm-execution/.claude-plugin/plugin.json
+++ b/pm-execution/.claude-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "pm-execution",
-  "version": "1.0.1",
+  "version": "2.0.0",
   "description": "Execution and product management skills: PRDs, OKRs, roadmaps, sprints, pre-mortems, stakeholder maps, user stories, prioritization frameworks, and more.",
   "author": {
     "name": "Paweł Huryn",
diff --git a/pm-execution/README.md b/pm-execution/README.md
index 47c584d..a41eac2 100644
--- a/pm-execution/README.md
+++ b/pm-execution/README.md
@@ -2,7 +2,7 @@
 
 Execution and product management skills: PRDs, OKRs, roadmaps, sprints, pre-mortems, stakeholder maps, user stories, prioritization frameworks, and more.
 
-## Skills (15)
+## Skills (16)
 
 - **brainstorm-okrs** — Brainstorm team-level OKRs aligned with company objectives.
 - **create-prd** — Create a Product Requirements Document using a comprehensive 8-section template covering summary, background, objectives, market segments, value propositions, solution details, and release planning.
@@ -15,17 +15,19 @@ Execution and product management skills: PRDs, OKRs, roadmaps, sprints, pre-mort
 - **retro** — Facilitate a structured sprint retrospective.
 - **sprint-plan** — Plan a sprint with capacity estimation, story selection, dependency mapping, and risk identification.
 - **stakeholder-map** — Build a stakeholder map using a power/interest grid, identify communication strategies per quadrant, and generate a communication plan.
+- **strategy-red-team** — Red-team a PRD, roadmap, or strategy by attacking its load-bearing assumptions; rank failure modes and return the cheapest test and kill criteria for each.
 - **summarize-meeting** — Summarize a meeting transcript into a structured template with date, participants, topic, summary points, and action items.
 - **test-scenarios** — Create comprehensive test scenarios from user stories with test objectives, starting conditions, user roles, step-by-step actions, and expected outcomes.
 - **user-stories** — Create user stories following the 3 C's (Card, Conversation, Confirmation) and INVEST criteria with descriptions, design links, and acceptance criteria.
 - **wwas** — Create product backlog items in Why-What-Acceptance format.
 
-## Commands (10)
+## Commands (11)
 
 - `/pm-execution:generate-data` — Generate realistic dummy datasets for testing — CSV, JSON, SQL inserts, or Python scripts.
 - `/pm-execution:meeting-notes` — Summarize a meeting transcript into structured notes with decisions, action items, and follow-ups.
 - `/pm-execution:plan-okrs` — Brainstorm team-level OKRs aligned with company objectives — qualitative objectives with measurable key results.
 - `/pm-execution:pre-mortem` — Run a pre-mortem risk analysis on a PRD, launch plan, or feature — identify what could go wrong before it does.
+- `/pm-execution:red-team-prd` — Red-team a PRD, roadmap, or strategy — attack its load-bearing assumptions and return the cheapest test for each before you commit.
 - `/pm-execution:sprint` — Sprint lifecycle — plan a sprint, run a retrospective, or generate release notes.
 - `/pm-execution:stakeholder-map` — Map stakeholders on a Power × Interest grid and create a tailored communication plan.
 - `/pm-execution:test-scenarios` — Generate comprehensive test scenarios from user stories or feature specs — happy paths, edge cases, and error handling.
diff --git a/pm-execution/commands/red-team-prd.md b/pm-execution/commands/red-team-prd.md
new file mode 100644
index 0000000..c631f9e
--- /dev/null
+++ b/pm-execution/commands/red-team-prd.md
@@ -0,0 +1,66 @@
+---
+description: Red-team a PRD, roadmap, or strategy — attack its load-bearing assumptions and return the cheapest test for each before you commit
+argument-hint: "<PRD, roadmap, strategy, or the current doc>"
+---
+
+# /red-team-prd -- Attack the Plan Before Reality Does
+
+Most plans only survived polite feedback. This command finds the assumptions that would make yours fail, attacks them honestly, and hands you the cheapest test for each — so you can kill a bad bet this week instead of at launch.
+
+## Invocation
+
+```
+/red-team-prd [paste or upload a PRD, roadmap, or strategy]
+/red-team-prd Prioritize AI onboarding — activation is our bottleneck
+/red-team-prd the current doc
+```
+
+## Workflow
+
+### Step 1: Accept the Plan
+
+Take it in any form — PRD, roadmap, strategy memo, one-line bet, or an uploaded doc. If the user says "the current doc," use the document in context.
+
+### Step 2: Red-Team It
+
+Apply the **strategy-red-team** skill:
+
+- Extract every claim; keep only the **load-bearing** ones (false → plan dies).
+- **Steelman each, then attack the steelman** — no strawmen.
+- Write each failure mode as "**Fails if ___**."
+- Rank by **(impact if wrong) × (likelihood wrong) × (cheapness to test)**.
+- Default to "the risk is real" unless the plan cites evidence against it — but **say plainly what's well-reasoned**, and never fabricate a weakness.
+
+### Step 3: Return the Output
+
+```
+## Red-Team: [plan in one line]
+
+### Top Kill-Assumptions (ranked)
+- **Claim:** [load-bearing assertion]
+  - **Fails if:** [concrete, falsifiable]
+  - **Evidence to get this week:** [specific]
+  - **Kill criterion:** [threshold]
+  - **Cheapest test:** [smallest experiment]
+[3–5 max]
+
+### What's Well-Reasoned
+[State it explicitly — don't manufacture doubt.]
+
+### What I Couldn't Assess
+[Where the plan didn't give enough to judge.]
+```
+
+### Step 4: Offer Next Steps
+
+- "Want me to **turn the top kill-assumption into an experiment** you can run this week?"
+- "Should I **run a pre-mortem** to complement this — imagine it already failed and trace the path?"
+- "Want me to **rewrite the riskiest section** of the plan to address what survived?"
+
+## Notes
+
+- Lead with the ranking — the cheapest high-impact test is the whole point.
+- Five real kill-assumptions with tests beat twenty generic risks. Cut ruthlessly.
+- Distinct from `/pre-mortem`: a pre-mortem imagines the plan has already failed and narrates why; a red-team attacks the live assumptions now and hands you the test.
+- If the plan is genuinely strong, the most valuable output is saying so — and naming the one thing still worth checking.
+- For a second-opinion pass, ask the user before adding cross-model friction; different model families miss different things, but most plans don't need it.
diff --git a/pm-execution/skills/strategy-red-team/SKILL.md b/pm-execution/skills/strategy-red-team/SKILL.md
new file mode 100644
index 0000000..fe7b7fe
--- /dev/null
+++ b/pm-execution/skills/strategy-red-team/SKILL.md
@@ -0,0 +1,72 @@
+---
+name: strategy-red-team
+description: "Red-team a PRD, roadmap, or strategy by attacking its load-bearing assumptions before reality does. Steelmans then attacks each claim, ranks failure modes by impact × likelihood × cheapness-to-test, and returns the cheapest test and kill criteria for each. Use when stress-testing a plan, pressure-testing a strategy, challenging assumptions, or preparing a doc for executive review."
+---
+
+# Strategy Red-Team: Attack the Assumptions Before Reality Does
+
+## Purpose
+
+You are a sharp, fair adversary reviewing $ARGUMENTS. Most plans only survived polite feedback. This skill finds the load-bearing assumptions that would make the plan fail, attacks them honestly, and returns — for each — the evidence to get this week, the kill criterion, and the cheapest test.
+
+## Context
+
+A red-team is not a pre-mortem. A pre-mortem imagines the plan already failed and narrates why. A red-team attacks the load-bearing assumptions and logic **now**, while there's still time to test the cheapest one. It improves judgment, not just confidence.
+
+The goal is a sharper decision, not a longer risk list. Five real kill-assumptions with tests beat twenty generic risks.
+
+## Instructions
+
+1. **Extract every claim.** Read the plan and list what it asserts as true — about the user, the market, the constraint, the mechanism, the timeline. Separate **load-bearing** claims (if false, the plan dies) from cosmetic ones. Only load-bearing claims are worth attacking.
+
+2. **Steelman, then attack.** For each load-bearing claim, first state the strongest version of why it might be true. Then attack *that* — not a strawman. An attack on a weak version of the claim is worthless.
+
+3. **Write each failure mode as "Fails if ___."** Be concrete and falsifiable. "Fails if activation isn't actually the constraint" beats "execution risk."
+
+4. **Rank by (impact if wrong) × (likelihood wrong) × (cheapness to test).** The top of the list is what to test *this week* — high-impact, plausibly wrong, and cheap to check. Surface that ranking; don't bury the lede.
+
+5. **Self-refute, don't fabricate.** Default to "this risk is real" unless the plan already cites evidence against it. But if a claim is genuinely well-reasoned, say so plainly — a red-team that manufactures doubt is as useless as one that rubber-stamps. Never invent a weakness the plan doesn't have.
+
+6. **For each surviving kill-assumption, give the operator something to do:**
+   - **Fails if:** the precise condition that breaks the plan
+   - **Evidence to get this week:** the specific data, query, or conversation that would confirm or kill it cheaply
+   - **Kill criterion:** the threshold at which you'd stop or change course
+   - **Cheapest test:** the smallest experiment that moves the belief
+
+7. **Optional cross-model mode.** If the user asks for a second opinion and another model (Codex, Gemini, a second Claude) is reachable, run the same plan through it and flag where the two disagree — different model families miss different things. Default is single-model; don't add this friction unless asked.
+
+8. **Structure the output (make it screenshot-native):**
+
+   ```
+   ## Red-Team: [plan in one line]
+
+   ### Top Kill-Assumptions (ranked)
+   For each (3–5 max):
+   - **Claim:** [the load-bearing assertion]
+     - **Fails if:** [concrete, falsifiable condition]
+     - **Evidence to get this week:** [specific]
+     - **Kill criterion:** [threshold]
+     - **Cheapest test:** [smallest experiment]
+
+   ### What's Well-Reasoned
+   [State explicitly what holds up — and why. Don't manufacture doubt.]
+
+   ### What I Couldn't Assess
+   [Gaps where the plan didn't give enough to judge.]
+   ```
+
+## Notes
+
+- No strawmanning — attack the steelman or don't attack.
+- No generic risk lists — every item must be specific to *this* plan.
+- No fabrication — if it's sound, say so.
+- Rank ruthlessly — the cheapest high-impact test is the whole point.
+- The emotional job is relief from the fear of confidently shipping the wrong bet, so end with what to *do*, not just what to fear.
+
+---
+
+### Further Reading
+
+- [Assumption Prioritization Canvas: How to Identify And Test The Right Assumptions](https://www.productcompass.pm/p/assumption-prioritization-canvas)
+- [How to Manage Risks as a Product Manager](https://www.productcompass.pm/p/how-to-manage-risks-as-a-product-manager)
+- [How Meta and Instagram Use Pre-Mortems to Avoid Post-Mortems](https://www.productcompass.pm/p/how-to-run-pre-mortem-template)
diff --git a/pm-go-to-market/.claude-plugin/plugin.json b/pm-go-to-market/.claude-plugin/plugin.json
index 2801a6a..166d046 100644
--- a/pm-go-to-market/.claude-plugin/plugin.json
+++ b/pm-go-to-market/.claude-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "pm-go-to-market",
-  "version": "1.0.1",
+  "version": "2.0.0",
   "description": "Go-to-market skills for PMs: GTM strategy, growth loops, GTM motions, beachhead segments, and ideal customer profiles.",
   "author": {
     "name": "Paweł Huryn",
diff --git a/pm-market-research/.claude-plugin/plugin.json b/pm-market-research/.claude-plugin/plugin.json
index fd658f4..ad8c5b7 100644
--- a/pm-market-research/.claude-plugin/plugin.json
+++ b/pm-market-research/.claude-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "pm-market-research",
-  "version": "1.0.1",
+  "version": "2.0.0",
   "description": "Market research skills for PMs: user personas, market segmentation, sentiment analysis, and competitive analysis.",
   "author": {
     "name": "Paweł Huryn",
diff --git a/pm-marketing-growth/.claude-plugin/plugin.json b/pm-marketing-growth/.claude-plugin/plugin.json
index 3495296..a9da88e 100644
--- a/pm-marketing-growth/.claude-plugin/plugin.json
+++ b/pm-marketing-growth/.claude-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "pm-marketing-growth",
-  "version": "1.0.1",
+  "version": "2.0.0",
   "description": "Product marketing and growth skills: marketing ideas, value proposition statements, North Star metrics, product naming, and positioning.",
   "author": {
     "name": "Paweł Huryn",
diff --git a/pm-product-discovery/.claude-plugin/plugin.json b/pm-product-discovery/.claude-plugin/plugin.json
index 1264016..7ad80a2 100644
--- a/pm-product-discovery/.claude-plugin/plugin.json
+++ b/pm-product-discovery/.claude-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "pm-product-discovery",
-  "version": "1.0.1",
+  "version": "2.0.0",
   "description": "Product discovery skills for PMs: ideation, experiments, assumption testing, feature prioritization, and customer interview synthesis.",
   "author": {
     "name": "Paweł Huryn",
diff --git a/pm-product-strategy/.claude-plugin/plugin.json b/pm-product-strategy/.claude-plugin/plugin.json
index e6148d5..223ffed 100644
--- a/pm-product-strategy/.claude-plugin/plugin.json
+++ b/pm-product-strategy/.claude-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "pm-product-strategy",
-  "version": "1.0.1",
+  "version": "2.0.0",
   "description": "Product strategy skills for PMs: vision, strategy canvas, value propositions, lean canvas, business model canvas, SWOT, PESTLE, Ansoff Matrix, Porter's Five Forces, and monetization.",
   "author": {
     "name": "Paweł Huryn",
diff --git a/pm-toolkit/.claude-plugin/plugin.json b/pm-toolkit/.claude-plugin/plugin.json
index 3390833..fac588b 100644
--- a/pm-toolkit/.claude-plugin/plugin.json
+++ b/pm-toolkit/.claude-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "pm-toolkit",
-  "version": "1.0.1",
+  "version": "2.0.0",
   "description": "PM utility skills: resume review, NDA drafting, privacy policy generation, and grammar/flow checking. Essential tools for product managers beyond core product work.",
   "author": {
     "name": "Paweł Huryn",