diff --git a/unfurl/app.py b/unfurl/app.py index 210449b..d2f09d0 100644 --- a/unfurl/app.py +++ b/unfurl/app.py @@ -88,7 +88,7 @@ class JsonVisJS(Resource): def get(self): if 'url' not in request.args: return {} - unfurl_this = unquote(request.args['url']) + unfurl_this = request.args['url'] return run( unfurl_this, return_type='json', diff --git a/unfurl/parsers/parse_url.py b/unfurl/parsers/parse_url.py index ea1cb86..e60b3e3 100644 --- a/unfurl/parsers/parse_url.py +++ b/unfurl/parsers/parse_url.py @@ -142,24 +142,67 @@ def run(unfurl, node): 'Numbering starts at 1.', parent_id=node.node_id, incoming_edge_config=urlparse_edge) elif node.data_type == 'url.query' or node.data_type == 'url.fragment': - parsed_qs = urllib.parse.parse_qs(node.value, keep_blank_values=True) - for key, value in parsed_qs.items(): - assert type(value) is list, 'parsed_qs should result in type list, but did not.' - # In the majority of cases, query string keys are unique, but the spec is ambiguous. In the case of - # duplicate keys, urllib.parse.parsed_qs adds them to a list. Unfurl will loop over and create a - # node for each value in that list of values (this is typically only one value, but could be more). - for v in value: + fragment_value = node.value + fragment_directive = None + + # Text Fragments (ref: https://wicg.github.io/scroll-to-text-fragment/) use :~: as a + # fragment directive delimiter. The part before :~: is the traditional fragment; the part + # after contains directives like text= that tell the browser to highlight/scroll to text. + if node.data_type == 'url.fragment' and ':~:' in node.value: + fragment_value, fragment_directive = node.value.split(':~:', 1) + + # Fragments can contain SPA-style routes with a '?' separating the path from query + # params (e.g. "#/im?p=@user"). Split on '?' so parse_qs only sees the query part. + fragment_path = None + if fragment_value and node.data_type == 'url.fragment' and '?' 
in fragment_value: + fragment_path, fragment_value = fragment_value.split('?', 1) + + # A fragment value without '=' is a plain anchor (e.g. "heading1"), not a query string. + # Only parse through parse_qs if it contains key=value pairs. + if fragment_value and node.data_type == 'url.fragment' and '=' not in fragment_value: + fragment_path = fragment_value + fragment_value = None + + if fragment_path: + unfurl.add_to_queue( + data_type='url.fragment.anchor', key='Fragment Anchor', value=fragment_path, + label=f'Fragment Anchor: {fragment_path}', + hover='This is the traditional URL fragment (anchor) that identifies ' + 'a specific section of the page.', + parent_id=node.node_id, incoming_edge_config=urlparse_edge) + + if fragment_value: + parsed_qs = urllib.parse.parse_qs(fragment_value, keep_blank_values=True) + for key, value in parsed_qs.items(): + assert type(value) is list, 'parsed_qs should result in type list, but did not.' + # In the majority of cases, query string keys are unique, but the spec is ambiguous. In the case of + # duplicate keys, urllib.parse.parsed_qs adds them to a list. Unfurl will loop over and create a + # node for each value in that list of values (this is typically only one value, but could be more). + for v in value: + unfurl.add_to_queue( + data_type='url.query.pair', key=key, value=v, label=f'{key}: {v}', + parent_id=node.node_id, incoming_edge_config=urlparse_edge) + + # If the query string or fragment is actually another URL (as seen in some redirectors), we want to + # continue doing subsequent parsing on it. For that, we need to recognize it and change the data_type to url. + if not parsed_qs: + parsed = try_url_parse(unfurl, node) + if parsed: + return + + if fragment_directive: + # Parse each directive (separated by &). Currently only text= is defined in the spec. 
+ directives = [d[len('text='):] for d in fragment_directive.split('&') if d.startswith('text=')] + for directive_values in directives: # raw text= values; percent-decode exactly once, and do not turn plus signs into spaces + decoded_text = urllib.parse.unquote(directive_values) unfurl.add_to_queue( - data_type='url.query.pair', key=key, value=v, label=f'{key}: {v}', + data_type='url.fragment.text-fragment', key='Text Fragment', value=decoded_text, + label=f'Text Fragment: {decoded_text}', + hover='A Text Fragment tells the browser to scroll to and highlight ' + 'the specified text on the page. ' + '[spec]', parent_id=node.node_id, incoming_edge_config=urlparse_edge) - # If the query string or fragment is actually another URL (as seen in some redirectors), we want to - # continue doing subsequent parsing on it. For that, we need to recognize it and change the data_type to url. - if not parsed_qs: - parsed = try_url_parse(unfurl, node) - if parsed: - return - elif node.data_type == 'url.params': split_params_re = re.compile(r'^(?P[^=]+?)=(?P[^=?]+)(?P[;,|])') split_params = split_params_re.match(node.value) diff --git a/unfurl/templates/graph.html b/unfurl/templates/graph.html index d80d5c6..5dc8c73 100644 --- a/unfurl/templates/graph.html +++ b/unfurl/templates/graph.html @@ -112,7 +112,7 @@

You can move and select nodes, zoom and pan the camera, and save the graph i evt.preventDefault(); // Spaces aren't allowed in URLs, but people sometimes add them by accident value_to_send = document.getElementById("text_to_unfurl").value.replace(/ /g, ""); - window.location.replace("/" + value_to_send); + window.location.replace("/graph?url=" + encodeURIComponent(value_to_send)); } return false; }); @@ -179,7 +179,16 @@

You can move and select nodes, zoom and pan the camera, and save the graph i const urlParams = new URLSearchParams(window.location.search); if (urlParams.get('url')) { - document.getElementById("text_to_unfurl").value = decodeURI(urlParams.get('url') + window.location.hash); + // Reconstruct the original URL exactly as the user entered it. The input is + // evidence being analyzed, so percent-encoding must not be silently altered. + // encodeURIComponent in the submit handler double-encodes (e.g. %20 → %2520), + // so one decodeURIComponent here restores the original input faithfully. + var rawMatch = window.location.search.match(/[?&]url=([^&]*)/); + if (rawMatch) { + document.getElementById("text_to_unfurl").value = decodeURIComponent(rawMatch[1]) + window.location.hash; // keep any client-side #fragment so the box shows the same URL the fetch below sends + } else { + document.getElementById("text_to_unfurl").value = urlParams.get('url') + window.location.hash; + } var url = new URL(`${window.location.protocol}//${window.location.host}/json/visjs`); url.searchParams.set('url', urlParams.get('url') + window.location.hash); fetch(url).then(response => { diff --git a/unfurl/tests/unit/test_url.py b/unfurl/tests/unit/test_url.py index fc190ff..80c197f 100644 --- a/unfurl/tests/unit/test_url.py +++ b/unfurl/tests/unit/test_url.py @@ -47,6 +47,54 @@ def test_file_path_url(self): self.assertIn('File Extension: .png', test.nodes[13].label) + def test_text_fragment(self): + """Test that Text Fragments (#:~:text=...) are parsed.
+ + Regression test for https://github.com/RyanDFIR/unfurl/issues/140 + """ + + test = Unfurl() + test.add_to_queue( + data_type='url', key=None, + value='https://blog.chromium.org/2019/12/chrome-80-content-indexing-es-modules.html' + '#:~:text=ECMAScript%20Modules%20in%20Web%20Workers') + test.parse_queue() + + # confirm the text fragment is parsed out with the decoded text + text_fragments = [node for node in test.nodes.values() + if node.data_type == 'url.fragment.text-fragment'] + self.assertEqual(1, len(text_fragments)) + self.assertEqual('ECMAScript Modules in Web Workers', text_fragments[0].value) + + def test_text_fragment_multiple(self): + """Test that multiple Text Fragments are each parsed as separate nodes.""" + + test = Unfurl() + test.add_to_queue( + data_type='url', key=None, + value='https://example.com/page#:~:text=first%20match&text=second%20match') + test.parse_queue() + + text_fragments = [node for node in test.nodes.values() + if node.data_type == 'url.fragment.text-fragment'] + self.assertEqual(2, len(text_fragments)) + self.assertEqual('first match', text_fragments[0].value) + self.assertEqual('second match', text_fragments[1].value) + + def test_text_fragment_with_anchor(self): + """Test a fragment that has both a traditional anchor and a text fragment.""" + + test = Unfurl() + test.add_to_queue( + data_type='url', key=None, + value='https://example.com/page#heading1:~:text=highlighted%20text') + test.parse_queue() + + text_fragments = [node for node in test.nodes.values() + if node.data_type == 'url.fragment.text-fragment'] + self.assertEqual(1, len(text_fragments)) + self.assertEqual('highlighted text', text_fragments[0].value) + def test_query_param_no_value(self): """Test that query parameters with no value are preserved."""