From fb1f8313a719e89195716fbd08a93f4ac4d5defd Mon Sep 17 00:00:00 2001
From: Ryan Benson <ryan@dfir.blog>
Date: Fri, 17 Apr 2026 12:22:58 -0700
Subject: [PATCH 1/2] Fix fragment parsing. Add text fragment highlighting
 parsing.

---
 unfurl/app.py                 |  2 +-
 unfurl/parsers/parse_url.py   | 64 +++++++++++++++++++++++++++--------
 unfurl/templates/graph.html   | 11 +++++-
 unfurl/tests/unit/test_url.py | 48 ++++++++++++++++++++++++++
 4 files changed, 108 insertions(+), 17 deletions(-)
diff --git a/unfurl/app.py b/unfurl/app.py
index 210449b..d2f09d0 100644
--- a/unfurl/app.py
+++ b/unfurl/app.py
@@ -88,7 +88,7 @@ class JsonVisJS(Resource):
     def get(self):
         if 'url' not in request.args:
             return {}
-        unfurl_this = unquote(request.args['url'])
+        unfurl_this = request.args['url']
         return run(
             unfurl_this,
             return_type='json',
diff --git a/unfurl/parsers/parse_url.py b/unfurl/parsers/parse_url.py
index ea1cb86..13113a3 100644
--- a/unfurl/parsers/parse_url.py
+++ b/unfurl/parsers/parse_url.py
@@ -142,24 +142,58 @@ def run(unfurl, node):
                               'Numbering starts at 1.', parent_id=node.node_id, incoming_edge_config=urlparse_edge)
 
     elif node.data_type == 'url.query' or node.data_type == 'url.fragment':
-        parsed_qs = urllib.parse.parse_qs(node.value, keep_blank_values=True)
-        for key, value in parsed_qs.items():
-            assert type(value) is list, 'parsed_qs should result in type list, but did not.'
-            # In the majority of cases, query string keys are unique, but the spec is ambiguous. In the case of
-            # duplicate keys, urllib.parse.parsed_qs adds them to a list. Unfurl will loop over and create a
-            # node for each value in that list of values (this is typically only one value, but could be more).
-            for v in value:
+        fragment_value = node.value
+        fragment_directive = None
+
+        # Text Fragments (ref: https://wicg.github.io/scroll-to-text-fragment/) use :~: as a
+        # fragment directive delimiter. The part before :~: is the traditional fragment; the part
+        # after contains directives like text= that tell the browser to highlight/scroll to text.
+        if node.data_type == 'url.fragment' and ':~:' in node.value:
+            fragment_value, fragment_directive = node.value.split(':~:', 1)
+
+        # If we split off a fragment directive, the remaining fragment_value is the traditional
+        # anchor (e.g. "heading1"). Only parse it as query string pairs if it looks like one
+        # (contains '='). Otherwise, treat it as a plain anchor identifier.
+        if fragment_value and fragment_directive is not None and '=' not in fragment_value:
+            unfurl.add_to_queue(
+                data_type='url.fragment.anchor', key='Fragment Anchor', value=fragment_value,
+                label=f'Fragment Anchor: {fragment_value}',
+                hover='This is the traditional URL <b>fragment</b> (anchor) that identifies '
+                      'a specific section of the page.',
+                parent_id=node.node_id, incoming_edge_config=urlparse_edge)
+
+        if fragment_value and (fragment_directive is None or '=' in fragment_value):
+            parsed_qs = urllib.parse.parse_qs(fragment_value, keep_blank_values=True)
+            for key, value in parsed_qs.items():
+                assert type(value) is list, 'parsed_qs should result in type list, but did not.'
+                # In the majority of cases, query string keys are unique, but the spec is ambiguous. In the case of
+                # duplicate keys, urllib.parse.parse_qs adds them to a list. Unfurl will loop over and create a
+                # node for each value in that list of values (this is typically only one value, but could be more).
+                for v in value:
+                    unfurl.add_to_queue(
+                        data_type='url.query.pair', key=key, value=v, label=f'{key}: {v}',
+                        parent_id=node.node_id, incoming_edge_config=urlparse_edge)
+
+            # If the query string or fragment is actually another URL (as seen in some redirectors), we want to
+            # continue doing subsequent parsing on it. For that, we need to recognize it and change the data_type to url.
+            if not parsed_qs:
+                parsed = try_url_parse(unfurl, node)
+                if parsed:
+                    return
+
+        if fragment_directive:
+            # Parse each directive (separated by &). Currently only text= is defined in the spec.
+            directives = urllib.parse.parse_qs(fragment_directive, keep_blank_values=True)
+            for directive_values in directives.get('text', []):
+                decoded_text = urllib.parse.unquote_plus(directive_values)
                 unfurl.add_to_queue(
-                    data_type='url.query.pair', key=key, value=v, label=f'{key}: {v}',
+                    data_type='url.fragment.text-fragment', key='Text Fragment', value=decoded_text,
+                    label=f'Text Fragment: {decoded_text}',
+                    hover='A <b>Text Fragment</b> tells the browser to scroll to and highlight '
+                          'the specified text on the page. '
+                          '<a href="https://wicg.github.io/scroll-to-text-fragment/" target="_blank">[spec]</a>',
                     parent_id=node.node_id, incoming_edge_config=urlparse_edge)
 
-        # If the query string or fragment is actually another URL (as seen in some redirectors), we want to
-        # continue doing subsequent parsing on it. For that, we need to recognize it and change the data_type to url.
-        if not parsed_qs:
-            parsed = try_url_parse(unfurl, node)
-            if parsed:
-                return
-
     elif node.data_type == 'url.params':
         split_params_re = re.compile(r'^(?P<key>[^=]+?)=(?P<value>[^=?]+)(?P<delim>[;,|])')
         split_params = split_params_re.match(node.value)
diff --git a/unfurl/templates/graph.html b/unfurl/templates/graph.html
index d80d5c6..ab61686 100644
--- a/unfurl/templates/graph.html
+++ b/unfurl/templates/graph.html
@@ -179,7 +179,16 @@ <h3>You can move and select nodes, zoom and pan the camera, and save the graph i
 
     const urlParams = new URLSearchParams(window.location.search);
     if (urlParams.get('url')) {
-        document.getElementById("text_to_unfurl").value = decodeURI(urlParams.get('url') + window.location.hash);
+        // Reconstruct the original URL exactly as the user entered it. The input is
+        // evidence being analyzed, so percent-encoding must not be silently altered.
+        // encodeURIComponent in the submit handler double-encodes (e.g. %20 → %2520),
+        // so one decodeURIComponent here restores the original input faithfully.
+        var rawMatch = window.location.search.match(/[?&]url=([^&]*)/);
+        if (rawMatch) {
+            document.getElementById("text_to_unfurl").value = decodeURIComponent(rawMatch[1]);
+        } else {
+            document.getElementById("text_to_unfurl").value = urlParams.get('url') + window.location.hash;
+        }
         var url = new URL(`${window.location.protocol}//${window.location.host}/json/visjs`);
         url.searchParams.set('url', urlParams.get('url') + window.location.hash);
         fetch(url).then(response => {
diff --git a/unfurl/tests/unit/test_url.py b/unfurl/tests/unit/test_url.py
index fc190ff..80c197f 100644
--- a/unfurl/tests/unit/test_url.py
+++ b/unfurl/tests/unit/test_url.py
@@ -47,6 +47,54 @@ def test_file_path_url(self):
         self.assertIn('File Extension: .png', test.nodes[13].label)
 
 
+    def test_text_fragment(self):
+        """Test that Text Fragments (#:~:text=...) are parsed.
+
+        Regression test for https://github.com/RyanDFIR/unfurl/issues/140
+        """
+
+        test = Unfurl()
+        test.add_to_queue(
+            data_type='url', key=None,
+            value='https://blog.chromium.org/2019/12/chrome-80-content-indexing-es-modules.html'
+                  '#:~:text=ECMAScript%20Modules%20in%20Web%20Workers')
+        test.parse_queue()
+
+        # confirm the text fragment is parsed out with the decoded text
+        text_fragments = [node for node in test.nodes.values()
+                          if node.data_type == 'url.fragment.text-fragment']
+        self.assertEqual(1, len(text_fragments))
+        self.assertEqual('ECMAScript Modules in Web Workers', text_fragments[0].value)
+
+    def test_text_fragment_multiple(self):
+        """Test that multiple Text Fragments are each parsed as separate nodes."""
+
+        test = Unfurl()
+        test.add_to_queue(
+            data_type='url', key=None,
+            value='https://example.com/page#:~:text=first%20match&text=second%20match')
+        test.parse_queue()
+
+        text_fragments = [node for node in test.nodes.values()
+                          if node.data_type == 'url.fragment.text-fragment']
+        self.assertEqual(2, len(text_fragments))
+        self.assertEqual('first match', text_fragments[0].value)
+        self.assertEqual('second match', text_fragments[1].value)
+
+    def test_text_fragment_with_anchor(self):
+        """Test a fragment that has both a traditional anchor and a text fragment."""
+
+        test = Unfurl()
+        test.add_to_queue(
+            data_type='url', key=None,
+            value='https://example.com/page#heading1:~:text=highlighted%20text')
+        test.parse_queue()
+
+        text_fragments = [node for node in test.nodes.values()
+                          if node.data_type == 'url.fragment.text-fragment']
+        self.assertEqual(1, len(text_fragments))
+        self.assertEqual('highlighted text', text_fragments[0].value)
+
     def test_query_param_no_value(self):
         """Test that query parameters with no value are preserved."""
 

From 440184829c7dde0c8bc5f4bbbce1b9f8c011d972 Mon Sep 17 00:00:00 2001
From: Ryan Benson <ryan@dfir.blog>
Date: Fri, 17 Apr 2026 14:00:12 -0700
Subject: [PATCH 2/2] Fix parsing when a fragment route is followed by query parameters.

---
 unfurl/parsers/parse_url.py | 23 ++++++++++++++++-------
 unfurl/templates/graph.html |  2 +-
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/unfurl/parsers/parse_url.py b/unfurl/parsers/parse_url.py
index 13113a3..e60b3e3 100644
--- a/unfurl/parsers/parse_url.py
+++ b/unfurl/parsers/parse_url.py
@@ -151,18 +151,27 @@ def run(unfurl, node):
         if node.data_type == 'url.fragment' and ':~:' in node.value:
             fragment_value, fragment_directive = node.value.split(':~:', 1)
 
-        # If we split off a fragment directive, the remaining fragment_value is the traditional
-        # anchor (e.g. "heading1"). Only parse it as query string pairs if it looks like one
-        # (contains '='). Otherwise, treat it as a plain anchor identifier.
-        if fragment_value and fragment_directive is not None and '=' not in fragment_value:
+        # Fragments can contain SPA-style routes with a '?' separating the path from query
+        # params (e.g. "#/im?p=@user"). Split on '?' so parse_qs only sees the query part.
+        fragment_path = None
+        if fragment_value and node.data_type == 'url.fragment' and '?' in fragment_value:
+            fragment_path, fragment_value = fragment_value.split('?', 1)
+
+        # A fragment value without '=' is a plain anchor (e.g. "heading1"), not a query string.
+        # Only parse through parse_qs if it contains key=value pairs.
+        if fragment_value and node.data_type == 'url.fragment' and '=' not in fragment_value:
+            fragment_path = fragment_value
+            fragment_value = None
+
+        if fragment_path:
             unfurl.add_to_queue(
-                data_type='url.fragment.anchor', key='Fragment Anchor', value=fragment_value,
-                label=f'Fragment Anchor: {fragment_value}',
+                data_type='url.fragment.anchor', key='Fragment Anchor', value=fragment_path,
+                label=f'Fragment Anchor: {fragment_path}',
                 hover='This is the traditional URL <b>fragment</b> (anchor) that identifies '
                       'a specific section of the page.',
                 parent_id=node.node_id, incoming_edge_config=urlparse_edge)
 
-        if fragment_value and (fragment_directive is None or '=' in fragment_value):
+        if fragment_value:
             parsed_qs = urllib.parse.parse_qs(fragment_value, keep_blank_values=True)
             for key, value in parsed_qs.items():
                 assert type(value) is list, 'parsed_qs should result in type list, but did not.'
diff --git a/unfurl/templates/graph.html b/unfurl/templates/graph.html
index ab61686..5dc8c73 100644
--- a/unfurl/templates/graph.html
+++ b/unfurl/templates/graph.html
@@ -112,7 +112,7 @@ <h3>You can move and select nodes, zoom and pan the camera, and save the graph i
             evt.preventDefault();
             // Spaces aren't allowed in URLs, but people sometimes add them by accident
             value_to_send = document.getElementById("text_to_unfurl").value.replace(/ /g, "");
-            window.location.replace("/" + value_to_send);
+            window.location.replace("/graph?url=" + encodeURIComponent(value_to_send));
         }
         return false;
     });