diff --git a/unfurl/app.py b/unfurl/app.py
index 210449b..d2f09d0 100644
--- a/unfurl/app.py
+++ b/unfurl/app.py
@@ -88,7 +88,7 @@ class JsonVisJS(Resource):
def get(self):
if 'url' not in request.args:
return {}
- unfurl_this = unquote(request.args['url'])
+ unfurl_this = request.args['url']
return run(
unfurl_this,
return_type='json',
diff --git a/unfurl/parsers/parse_url.py b/unfurl/parsers/parse_url.py
index ea1cb86..e60b3e3 100644
--- a/unfurl/parsers/parse_url.py
+++ b/unfurl/parsers/parse_url.py
@@ -142,24 +142,67 @@ def run(unfurl, node):
'Numbering starts at 1.', parent_id=node.node_id, incoming_edge_config=urlparse_edge)
elif node.data_type == 'url.query' or node.data_type == 'url.fragment':
- parsed_qs = urllib.parse.parse_qs(node.value, keep_blank_values=True)
- for key, value in parsed_qs.items():
- assert type(value) is list, 'parsed_qs should result in type list, but did not.'
- # In the majority of cases, query string keys are unique, but the spec is ambiguous. In the case of
- # duplicate keys, urllib.parse.parsed_qs adds them to a list. Unfurl will loop over and create a
- # node for each value in that list of values (this is typically only one value, but could be more).
- for v in value:
+ fragment_value = node.value
+ fragment_directive = None
+
+ # Text Fragments (ref: https://wicg.github.io/scroll-to-text-fragment/) use :~: as a
+ # fragment directive delimiter. The part before :~: is the traditional fragment; the part
+ # after contains directives like text= that tell the browser to highlight/scroll to text.
+ if node.data_type == 'url.fragment' and ':~:' in node.value:
+ fragment_value, fragment_directive = node.value.split(':~:', 1)
+
+ # Fragments can contain SPA-style routes with a '?' separating the path from query
+ # params (e.g. "#/im?p=@user"). Split on '?' so parse_qs only sees the query part.
+ fragment_path = None
+ if fragment_value and node.data_type == 'url.fragment' and '?' in fragment_value:
+ fragment_path, fragment_value = fragment_value.split('?', 1)
+
+ # A fragment value without '=' is a plain anchor (e.g. "heading1"), not a query string.
+ # Only parse through parse_qs if it contains key=value pairs.
+ if fragment_value and node.data_type == 'url.fragment' and '=' not in fragment_value:
+ fragment_path = fragment_value
+ fragment_value = None
+
+ if fragment_path:
+ unfurl.add_to_queue(
+ data_type='url.fragment.anchor', key='Fragment Anchor', value=fragment_path,
+ label=f'Fragment Anchor: {fragment_path}',
+ hover='This is the traditional URL fragment (anchor) that identifies '
+ 'a specific section of the page.',
+ parent_id=node.node_id, incoming_edge_config=urlparse_edge)
+
+ if fragment_value:
+ parsed_qs = urllib.parse.parse_qs(fragment_value, keep_blank_values=True)
+ for key, value in parsed_qs.items():
+ assert type(value) is list, 'parsed_qs should result in type list, but did not.'
+ # In the majority of cases, query string keys are unique, but the spec is ambiguous. In the case of
+            # duplicate keys, urllib.parse.parse_qs adds them to a list. Unfurl will loop over and create a
+ # node for each value in that list of values (this is typically only one value, but could be more).
+ for v in value:
+ unfurl.add_to_queue(
+ data_type='url.query.pair', key=key, value=v, label=f'{key}: {v}',
+ parent_id=node.node_id, incoming_edge_config=urlparse_edge)
+
+ # If the query string or fragment is actually another URL (as seen in some redirectors), we want to
+ # continue doing subsequent parsing on it. For that, we need to recognize it and change the data_type to url.
+ if not parsed_qs:
+ parsed = try_url_parse(unfurl, node)
+ if parsed:
+ return
+
+ if fragment_directive:
+ # Parse each directive (separated by &). Currently only text= is defined in the spec.
+ directives = urllib.parse.parse_qs(fragment_directive, keep_blank_values=True)
+ for directive_values in directives.get('text', []):
+ decoded_text = urllib.parse.unquote_plus(directive_values)
unfurl.add_to_queue(
- data_type='url.query.pair', key=key, value=v, label=f'{key}: {v}',
+ data_type='url.fragment.text-fragment', key='Text Fragment', value=decoded_text,
+ label=f'Text Fragment: {decoded_text}',
+ hover='A Text Fragment tells the browser to scroll to and highlight '
+ 'the specified text on the page. '
+ '[spec]',
parent_id=node.node_id, incoming_edge_config=urlparse_edge)
- # If the query string or fragment is actually another URL (as seen in some redirectors), we want to
- # continue doing subsequent parsing on it. For that, we need to recognize it and change the data_type to url.
- if not parsed_qs:
- parsed = try_url_parse(unfurl, node)
- if parsed:
- return
-
elif node.data_type == 'url.params':
        split_params_re = re.compile(r'^(?P<key>[^=]+?)=(?P<value>[^=?]+)(?P<sep>[;,|])')
split_params = split_params_re.match(node.value)
diff --git a/unfurl/templates/graph.html b/unfurl/templates/graph.html
index d80d5c6..5dc8c73 100644
--- a/unfurl/templates/graph.html
+++ b/unfurl/templates/graph.html
@@ -112,7 +112,7 @@ You can move and select nodes, zoom and pan the camera, and save the graph i
evt.preventDefault();
// Spaces aren't allowed in URLs, but people sometimes add them by accident
value_to_send = document.getElementById("text_to_unfurl").value.replace(/ /g, "");
- window.location.replace("/" + value_to_send);
+ window.location.replace("/graph?url=" + encodeURIComponent(value_to_send));
}
return false;
});
@@ -179,7 +179,16 @@ You can move and select nodes, zoom and pan the camera, and save the graph i
const urlParams = new URLSearchParams(window.location.search);
if (urlParams.get('url')) {
- document.getElementById("text_to_unfurl").value = decodeURI(urlParams.get('url') + window.location.hash);
+ // Reconstruct the original URL exactly as the user entered it. The input is
+ // evidence being analyzed, so percent-encoding must not be silently altered.
+ // encodeURIComponent in the submit handler double-encodes (e.g. %20 → %2520),
+ // so one decodeURIComponent here restores the original input faithfully.
+ var rawMatch = window.location.search.match(/[?&]url=([^&]*)/);
+ if (rawMatch) {
+ document.getElementById("text_to_unfurl").value = decodeURIComponent(rawMatch[1]);
+ } else {
+ document.getElementById("text_to_unfurl").value = urlParams.get('url') + window.location.hash;
+ }
var url = new URL(`${window.location.protocol}//${window.location.host}/json/visjs`);
url.searchParams.set('url', urlParams.get('url') + window.location.hash);
fetch(url).then(response => {
diff --git a/unfurl/tests/unit/test_url.py b/unfurl/tests/unit/test_url.py
index fc190ff..80c197f 100644
--- a/unfurl/tests/unit/test_url.py
+++ b/unfurl/tests/unit/test_url.py
@@ -47,6 +47,54 @@ def test_file_path_url(self):
self.assertIn('File Extension: .png', test.nodes[13].label)
+ def test_text_fragment(self):
+ """Test that Text Fragments (#:~:text=...) are parsed.
+
+ Regression test for https://github.com/RyanDFIR/unfurl/issues/140
+ """
+
+ test = Unfurl()
+ test.add_to_queue(
+ data_type='url', key=None,
+ value='https://blog.chromium.org/2019/12/chrome-80-content-indexing-es-modules.html'
+ '#:~:text=ECMAScript%20Modules%20in%20Web%20Workers')
+ test.parse_queue()
+
+ # confirm the text fragment is parsed out with the decoded text
+ text_fragments = [node for node in test.nodes.values()
+ if node.data_type == 'url.fragment.text-fragment']
+ self.assertEqual(1, len(text_fragments))
+ self.assertEqual('ECMAScript Modules in Web Workers', text_fragments[0].value)
+
+ def test_text_fragment_multiple(self):
+ """Test that multiple Text Fragments are each parsed as separate nodes."""
+
+ test = Unfurl()
+ test.add_to_queue(
+ data_type='url', key=None,
+ value='https://example.com/page#:~:text=first%20match&text=second%20match')
+ test.parse_queue()
+
+ text_fragments = [node for node in test.nodes.values()
+ if node.data_type == 'url.fragment.text-fragment']
+ self.assertEqual(2, len(text_fragments))
+ self.assertEqual('first match', text_fragments[0].value)
+ self.assertEqual('second match', text_fragments[1].value)
+
+ def test_text_fragment_with_anchor(self):
+ """Test a fragment that has both a traditional anchor and a text fragment."""
+
+ test = Unfurl()
+ test.add_to_queue(
+ data_type='url', key=None,
+ value='https://example.com/page#heading1:~:text=highlighted%20text')
+ test.parse_queue()
+
+ text_fragments = [node for node in test.nodes.values()
+ if node.data_type == 'url.fragment.text-fragment']
+ self.assertEqual(1, len(text_fragments))
+ self.assertEqual('highlighted text', text_fragments[0].value)
+
def test_query_param_no_value(self):
"""Test that query parameters with no value are preserved."""