From fb1f8313a719e89195716fbd08a93f4ac4d5defd Mon Sep 17 00:00:00 2001 From: Ryan Benson Date: Fri, 17 Apr 2026 12:22:58 -0700 Subject: [PATCH 1/2] Fix fragment parsing. Add text fragment highlighting parsing. --- unfurl/app.py | 2 +- unfurl/parsers/parse_url.py | 64 +++++++++++++++++++++++++++-------- unfurl/templates/graph.html | 11 +++++- unfurl/tests/unit/test_url.py | 48 ++++++++++++++++++++++++++ 4 files changed, 108 insertions(+), 17 deletions(-) diff --git a/unfurl/app.py b/unfurl/app.py index 210449b..d2f09d0 100644 --- a/unfurl/app.py +++ b/unfurl/app.py @@ -88,7 +88,7 @@ class JsonVisJS(Resource): def get(self): if 'url' not in request.args: return {} - unfurl_this = unquote(request.args['url']) + unfurl_this = request.args['url'] return run( unfurl_this, return_type='json', diff --git a/unfurl/parsers/parse_url.py b/unfurl/parsers/parse_url.py index ea1cb86..13113a3 100644 --- a/unfurl/parsers/parse_url.py +++ b/unfurl/parsers/parse_url.py @@ -142,24 +142,58 @@ def run(unfurl, node): 'Numbering starts at 1.', parent_id=node.node_id, incoming_edge_config=urlparse_edge) elif node.data_type == 'url.query' or node.data_type == 'url.fragment': - parsed_qs = urllib.parse.parse_qs(node.value, keep_blank_values=True) - for key, value in parsed_qs.items(): - assert type(value) is list, 'parsed_qs should result in type list, but did not.' - # In the majority of cases, query string keys are unique, but the spec is ambiguous. In the case of - # duplicate keys, urllib.parse.parsed_qs adds them to a list. Unfurl will loop over and create a - # node for each value in that list of values (this is typically only one value, but could be more). - for v in value: + fragment_value = node.value + fragment_directive = None + + # Text Fragments (ref: https://wicg.github.io/scroll-to-text-fragment/) use :~: as a + # fragment directive delimiter. The part before :~: is the traditional fragment; the part + # after contains directives like text= that tell the browser to highlight/scroll to text. + if node.data_type == 'url.fragment' and ':~:' in node.value: + fragment_value, fragment_directive = node.value.split(':~:', 1) + + # If we split off a fragment directive, the remaining fragment_value is the traditional + # anchor (e.g. "heading1"). Only parse it as query string pairs if it looks like one + # (contains '='). Otherwise, treat it as a plain anchor identifier. + if fragment_value and fragment_directive is not None and '=' not in fragment_value: + unfurl.add_to_queue( + data_type='url.fragment.anchor', key='Fragment Anchor', value=fragment_value, + label=f'Fragment Anchor: {fragment_value}', + hover='This is the traditional URL fragment (anchor) that identifies ' + 'a specific section of the page.', + parent_id=node.node_id, incoming_edge_config=urlparse_edge) + + if fragment_value and (fragment_directive is None or '=' in fragment_value): + parsed_qs = urllib.parse.parse_qs(fragment_value, keep_blank_values=True) + for key, value in parsed_qs.items(): + assert type(value) is list, 'parsed_qs should result in type list, but did not.' + # In the majority of cases, query string keys are unique, but the spec is ambiguous. In the case of + # duplicate keys, urllib.parse.parsed_qs adds them to a list. Unfurl will loop over and create a + # node for each value in that list of values (this is typically only one value, but could be more). + for v in value: + unfurl.add_to_queue( + data_type='url.query.pair', key=key, value=v, label=f'{key}: {v}', + parent_id=node.node_id, incoming_edge_config=urlparse_edge) + + # If the query string or fragment is actually another URL (as seen in some redirectors), we want to + # continue doing subsequent parsing on it. For that, we need to recognize it and change the data_type to url. + if not parsed_qs: + parsed = try_url_parse(unfurl, node) + if parsed: + return + + if fragment_directive: + # Parse each directive (separated by &). Currently only text= is defined in the spec. + directives = urllib.parse.parse_qs(fragment_directive, keep_blank_values=True) + for directive_values in directives.get('text', []): + decoded_text = urllib.parse.unquote_plus(directive_values) unfurl.add_to_queue( - data_type='url.query.pair', key=key, value=v, label=f'{key}: {v}', + data_type='url.fragment.text-fragment', key='Text Fragment', value=decoded_text, + label=f'Text Fragment: {decoded_text}', + hover='A Text Fragment tells the browser to scroll to and highlight ' + 'the specified text on the page. ' + '[spec]', parent_id=node.node_id, incoming_edge_config=urlparse_edge) - # If the query string or fragment is actually another URL (as seen in some redirectors), we want to - # continue doing subsequent parsing on it. For that, we need to recognize it and change the data_type to url. - if not parsed_qs: - parsed = try_url_parse(unfurl, node) - if parsed: - return - elif node.data_type == 'url.params': split_params_re = re.compile(r'^(?P[^=]+?)=(?P[^=?]+)(?P[;,|])') split_params = split_params_re.match(node.value) diff --git a/unfurl/templates/graph.html b/unfurl/templates/graph.html index d80d5c6..ab61686 100644 --- a/unfurl/templates/graph.html +++ b/unfurl/templates/graph.html @@ -179,7 +179,16 @@

You can move and select nodes, zoom and pan the camera, and save the graph i const urlParams = new URLSearchParams(window.location.search); if (urlParams.get('url')) { - document.getElementById("text_to_unfurl").value = decodeURI(urlParams.get('url') + window.location.hash); + // Reconstruct the original URL exactly as the user entered it. The input is + // evidence being analyzed, so percent-encoding must not be silently altered. + // encodeURIComponent in the submit handler double-encodes (e.g. %20 → %2520), + // so one decodeURIComponent here restores the original input faithfully. + var rawMatch = window.location.search.match(/[?&]url=([^&]*)/); + if (rawMatch) { + document.getElementById("text_to_unfurl").value = decodeURIComponent(rawMatch[1]); + } else { + document.getElementById("text_to_unfurl").value = urlParams.get('url') + window.location.hash; + } var url = new URL(`${window.location.protocol}//${window.location.host}/json/visjs`); url.searchParams.set('url', urlParams.get('url') + window.location.hash); fetch(url).then(response => { diff --git a/unfurl/tests/unit/test_url.py b/unfurl/tests/unit/test_url.py index fc190ff..80c197f 100644 --- a/unfurl/tests/unit/test_url.py +++ b/unfurl/tests/unit/test_url.py @@ -47,6 +47,54 @@ def test_file_path_url(self): self.assertIn('File Extension: .png', test.nodes[13].label) + def test_text_fragment(self): + """Test that Text Fragments (#:~:text=...) are parsed. + + Regression test for https://github.com/RyanDFIR/unfurl/issues/140 + """ + + test = Unfurl() + test.add_to_queue( + data_type='url', key=None, + value='https://blog.chromium.org/2019/12/chrome-80-content-indexing-es-modules.html' + '#:~:text=ECMAScript%20Modules%20in%20Web%20Workers') + test.parse_queue() + + # confirm the text fragment is parsed out with the decoded text + text_fragments = [node for node in test.nodes.values() + if node.data_type == 'url.fragment.text-fragment'] + self.assertEqual(1, len(text_fragments)) + self.assertEqual('ECMAScript Modules in Web Workers', text_fragments[0].value) + + def test_text_fragment_multiple(self): + """Test that multiple Text Fragments are each parsed as separate nodes.""" + + test = Unfurl() + test.add_to_queue( + data_type='url', key=None, + value='https://example.com/page#:~:text=first%20match&text=second%20match') + test.parse_queue() + + text_fragments = [node for node in test.nodes.values() + if node.data_type == 'url.fragment.text-fragment'] + self.assertEqual(2, len(text_fragments)) + self.assertEqual('first match', text_fragments[0].value) + self.assertEqual('second match', text_fragments[1].value) + + def test_text_fragment_with_anchor(self): + """Test a fragment that has both a traditional anchor and a text fragment.""" + + test = Unfurl() + test.add_to_queue( + data_type='url', key=None, + value='https://example.com/page#heading1:~:text=highlighted%20text') + test.parse_queue() + + text_fragments = [node for node in test.nodes.values() + if node.data_type == 'url.fragment.text-fragment'] + self.assertEqual(1, len(text_fragments)) + self.assertEqual('highlighted text', text_fragments[0].value) + def test_query_param_no_value(self): """Test that query parameters with no value are preserved.""" From 440184829c7dde0c8bc5f4bbbce1b9f8c011d972 Mon Sep 17 00:00:00 2001 From: Ryan Benson Date: Fri, 17 Apr 2026 14:00:12 -0700 Subject: [PATCH 2/2] Fixes for when fragment has parameters after it. --- unfurl/parsers/parse_url.py | 23 ++++++++++++++++------- unfurl/templates/graph.html | 2 +- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/unfurl/parsers/parse_url.py b/unfurl/parsers/parse_url.py index 13113a3..e60b3e3 100644 --- a/unfurl/parsers/parse_url.py +++ b/unfurl/parsers/parse_url.py @@ -151,18 +151,27 @@ def run(unfurl, node): if node.data_type == 'url.fragment' and ':~:' in node.value: fragment_value, fragment_directive = node.value.split(':~:', 1) - # If we split off a fragment directive, the remaining fragment_value is the traditional - # anchor (e.g. "heading1"). Only parse it as query string pairs if it looks like one - # (contains '='). Otherwise, treat it as a plain anchor identifier. - if fragment_value and fragment_directive is not None and '=' not in fragment_value: + # Fragments can contain SPA-style routes with a '?' separating the path from query + # params (e.g. "#/im?p=@user"). Split on '?' so parse_qs only sees the query part. + fragment_path = None + if fragment_value and node.data_type == 'url.fragment' and '?' in fragment_value: + fragment_path, fragment_value = fragment_value.split('?', 1) + + # A fragment value without '=' is a plain anchor (e.g. "heading1"), not a query string. + # Only parse through parse_qs if it contains key=value pairs. + if fragment_value and node.data_type == 'url.fragment' and '=' not in fragment_value: + fragment_path = fragment_value + fragment_value = None + + if fragment_path: unfurl.add_to_queue( - data_type='url.fragment.anchor', key='Fragment Anchor', value=fragment_value, - label=f'Fragment Anchor: {fragment_value}', + data_type='url.fragment.anchor', key='Fragment Anchor', value=fragment_path, + label=f'Fragment Anchor: {fragment_path}', hover='This is the traditional URL fragment (anchor) that identifies ' 'a specific section of the page.', parent_id=node.node_id, incoming_edge_config=urlparse_edge) - if fragment_value and (fragment_directive is None or '=' in fragment_value): + if fragment_value: parsed_qs = urllib.parse.parse_qs(fragment_value, keep_blank_values=True) for key, value in parsed_qs.items(): assert type(value) is list, 'parsed_qs should result in type list, but did not.' diff --git a/unfurl/templates/graph.html b/unfurl/templates/graph.html index ab61686..5dc8c73 100644 --- a/unfurl/templates/graph.html +++ b/unfurl/templates/graph.html @@ -112,7 +112,7 @@

You can move and select nodes, zoom and pan the camera, and save the graph i evt.preventDefault(); // Spaces aren't allowed in URLs, but people sometimes add them by accident value_to_send = document.getElementById("text_to_unfurl").value.replace(/ /g, ""); - window.location.replace("/" + value_to_send); + window.location.replace("/graph?url=" + encodeURIComponent(value_to_send)); } return false; });