From def78d4754e748fd4c6500c2f86d50467830ed49 Mon Sep 17 00:00:00 2001
From: Yutaka Kurosaki <>
Date: Sat, 9 May 2026 11:18:26 +0900
Subject: [PATCH] Address final review: Vimeo regex boundary + spec accuracy

- Vimeo regex now rejects URLs like vimeo.com/123abc that were
  silently truncated to ID 123 and produced broken iframes. Negative
  lookahead (?![A-Za-z0-9]) ensures the captured digits are not
  followed by alphanumerics. Two false-positive test cases added.
- Spec corrected: HtmlInline nodes ARE filtered regardless of
  insertion path; the implementation uses a dedicated MediaEmbedNode
  + renderer to bypass the filter only for trusted programmatic embeds.
  Components list updated to include the two extra files.
- Plan Task 6 regex updated for consistency.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../plans/2026-05-09-media-embed.md           |  2 +-
 .../specs/2026-05-09-media-embed-design.md    | 30 +++++++++++++++----
 src/app/Markdown/MediaUrlResolver.php         |  2 +-
 .../Unit/Markdown/MediaUrlResolverTest.php    | 14 +++++++++
 4 files changed, 41 insertions(+), 7 deletions(-)
diff --git a/docs/superpowers/plans/2026-05-09-media-embed.md b/docs/superpowers/plans/2026-05-09-media-embed.md
index 1326b5d..8802c46 100644
--- a/docs/superpowers/plans/2026-05-09-media-embed.md
+++ b/docs/superpowers/plans/2026-05-09-media-embed.md
@@ -687,7 +687,7 @@ In `MediaUrlResolver.php`, update `resolve()` and add Vimeo methods:
 
     private function detectVimeo(string $url): ?string
     {
-        if (!preg_match('#^https?://(?:www\.|player\.)?vimeo\.com/(?:video/)?(\d+)#', $url, $m)) {
+        if (!preg_match('~^https?://(?:www\.|player\.)?vimeo\.com/(?:video/)?(\d+)(?![A-Za-z0-9])~', $url, $m)) {
             return null;
         }
         $videoId = $m[1];
diff --git a/docs/superpowers/specs/2026-05-09-media-embed-design.md b/docs/superpowers/specs/2026-05-09-media-embed-design.md
index 07d98f7..a248417 100644
--- a/docs/superpowers/specs/2026-05-09-media-embed-design.md
+++ b/docs/superpowers/specs/2026-05-09-media-embed-design.md
@@ -89,7 +89,23 @@ Pure URL classification class with no external dependencies. Highly testable.
 Thin glue layer. Receives `DocumentParsedEvent`, walks the AST, and delegates URL classification to `MediaUrlResolver`.
 
 - Public API: `handle(DocumentParsedEvent $event): void`
-- For each `Image` node: call resolver; if non-null, replace node with `HtmlInline`
+- For each `Image` node: call resolver; if non-null, replace node with a `MediaEmbedNode`
+
+#### `src/app/Markdown/MediaEmbedNode.php`
+
+Custom AST node that carries the pre-rendered embed HTML string.
+
+- Extends `AbstractStringContainer`
+- Does NOT implement `RawMarkupContainerInterface` — this is intentional so the node is not subject to `HtmlFilter`
+- Holds its literal content (the HTML string) for direct output by its renderer
+
+#### `src/app/Markdown/MediaEmbedNodeRenderer.php`
+
+Dedicated renderer for `MediaEmbedNode`.
+
+- Implements `NodeRendererInterface`
+- Returns the node's literal content directly, without invoking any HTML filter
+- This is the mechanism that allows trusted embed HTML to survive the `html_input => 'strip'` policy
 
 ### Modified files
 
@@ -206,11 +222,15 @@ All output URLs are passed through `htmlspecialchars($url, ENT_QUOTES, 'UTF-8')`
 
 ### Relation to `html_input => 'strip'`
 
-The `'strip'` setting is preserved. CommonMark strips raw HTML written by users in source Markdown. However, `HtmlInline` nodes inserted programmatically by a registered extension are not stripped. Therefore:
+The `'strip'` setting is preserved. All `HtmlInline` nodes — whether written by the user in Markdown source or inserted programmatically by an extension — go through `HtmlFilter::filter()`, which strips their content under `'strip'` mode. To emit the embed HTML safely without bypassing this policy, the extension introduces a custom node type:
 
-- User-written `<script>` in Markdown source → still stripped
-- `<video>` / `<iframe>` inserted by `MediaEmbedExtension` → output as intended
-- The security boundary becomes "the extension is responsible for its own escaping," which is enforced by passing all dynamic URL fragments through `htmlspecialchars`.
+- `MediaEmbedNode` extends `AbstractStringContainer` and deliberately does NOT implement `RawMarkupContainerInterface`
+- `MediaEmbedNodeRenderer` returns the node's literal content directly, without invoking any HTML filter
+
+Therefore:
+- User-written `<script>` in Markdown source → produces `HtmlInline` → still stripped
+- `<video>` / `<audio>` / `<iframe>` inserted by `MediaEmbedExtension` → produces `MediaEmbedNode` → output as intended
+- The security boundary is "only the explicitly trusted node type bypasses filtering," and that node type is reachable only through `MediaEmbedListener` after `MediaUrlResolver` has classified the URL as a known media pattern.
 
 ### `alt` and `title`
 
diff --git a/src/app/Markdown/MediaUrlResolver.php b/src/app/Markdown/MediaUrlResolver.php
index 23d266e..c5ba41c 100644
--- a/src/app/Markdown/MediaUrlResolver.php
+++ b/src/app/Markdown/MediaUrlResolver.php
@@ -101,7 +101,7 @@ private function parseTimestamp(string $t): ?int
 
     private function detectVimeo(string $url): ?string
     {
-        if (!preg_match('~^https?://(?:www\.|player\.)?vimeo\.com/(?:video/)?(\d+)~', $url, $m)) {
+        if (!preg_match('~^https?://(?:www\.|player\.)?vimeo\.com/(?:video/)?(\d+)(?![A-Za-z0-9])~', $url, $m)) {
             return null;
         }
         $videoId = $m[1];
diff --git a/src/tests/Unit/Markdown/MediaUrlResolverTest.php b/src/tests/Unit/Markdown/MediaUrlResolverTest.php
index 2934fd7..b45582d 100644
--- a/src/tests/Unit/Markdown/MediaUrlResolverTest.php
+++ b/src/tests/Unit/Markdown/MediaUrlResolverTest.php
@@ -186,4 +186,18 @@ public function test_vimeo_invalid_id_returns_null(): void
     {
         $this->assertNull($this->resolver->resolve('https://vimeo.com/notanumber'));
     }
+
+    #[DataProvider('vimeoFalsePositives')]
+    public function test_vimeo_false_positives_return_null(string $url): void
+    {
+        $this->assertNull($this->resolver->resolve($url));
+    }
+
+    public static function vimeoFalsePositives(): array
+    {
+        return [
+            'digits then letter' => ['https://vimeo.com/123abc'],
+            'digits then x' => ['https://vimeo.com/123x'],
+        ];
+    }
 }