Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion unfurl/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ class JsonVisJS(Resource):
def get(self):
if 'url' not in request.args:
return {}
unfurl_this = unquote(request.args['url'])
unfurl_this = request.args['url']
return run(
unfurl_this,
return_type='json',
Expand Down
73 changes: 58 additions & 15 deletions unfurl/parsers/parse_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,24 +142,67 @@ def run(unfurl, node):
'Numbering starts at 1.', parent_id=node.node_id, incoming_edge_config=urlparse_edge)

elif node.data_type == 'url.query' or node.data_type == 'url.fragment':
parsed_qs = urllib.parse.parse_qs(node.value, keep_blank_values=True)
for key, value in parsed_qs.items():
assert type(value) is list, 'parsed_qs should result in type list, but did not.'
# In the majority of cases, query string keys are unique, but the spec is ambiguous. In the case of
# duplicate keys, urllib.parse.parse_qs adds them to a list. Unfurl will loop over and create a
# node for each value in that list of values (this is typically only one value, but could be more).
for v in value:
fragment_value = node.value
fragment_directive = None

# Text Fragments (ref: https://wicg.github.io/scroll-to-text-fragment/) use :~: as a
# fragment directive delimiter. The part before :~: is the traditional fragment; the part
# after contains directives like text= that tell the browser to highlight/scroll to text.
if node.data_type == 'url.fragment' and ':~:' in node.value:
fragment_value, fragment_directive = node.value.split(':~:', 1)

# Fragments can contain SPA-style routes with a '?' separating the path from query
# params (e.g. "#/im?p=@user"). Split on '?' so parse_qs only sees the query part.
fragment_path = None
if fragment_value and node.data_type == 'url.fragment' and '?' in fragment_value:
fragment_path, fragment_value = fragment_value.split('?', 1)

# A fragment value without '=' is a plain anchor (e.g. "heading1"), not a query string.
# Only parse through parse_qs if it contains key=value pairs.
if fragment_value and node.data_type == 'url.fragment' and '=' not in fragment_value:
fragment_path = fragment_value
fragment_value = None

if fragment_path:
unfurl.add_to_queue(
data_type='url.fragment.anchor', key='Fragment Anchor', value=fragment_path,
label=f'Fragment Anchor: {fragment_path}',
hover='This is the traditional URL <b>fragment</b> (anchor) that identifies '
'a specific section of the page.',
parent_id=node.node_id, incoming_edge_config=urlparse_edge)

if fragment_value:
parsed_qs = urllib.parse.parse_qs(fragment_value, keep_blank_values=True)
for key, value in parsed_qs.items():
assert type(value) is list, 'parsed_qs should result in type list, but did not.'
# In the majority of cases, query string keys are unique, but the spec is ambiguous. In the case of
# duplicate keys, urllib.parse.parse_qs adds them to a list. Unfurl will loop over and create a
# node for each value in that list of values (this is typically only one value, but could be more).
for v in value:
unfurl.add_to_queue(
data_type='url.query.pair', key=key, value=v, label=f'{key}: {v}',
parent_id=node.node_id, incoming_edge_config=urlparse_edge)

# If the query string or fragment is actually another URL (as seen in some redirectors), we want to
# continue doing subsequent parsing on it. For that, we need to recognize it and change the data_type to url.
if not parsed_qs:
parsed = try_url_parse(unfurl, node)
if parsed:
return

if fragment_directive:
# Parse each directive (separated by &). Currently only text= is defined in the spec.
directives = urllib.parse.parse_qs(fragment_directive, keep_blank_values=True)
for directive_values in directives.get('text', []):
decoded_text = urllib.parse.unquote_plus(directive_values)
unfurl.add_to_queue(
data_type='url.query.pair', key=key, value=v, label=f'{key}: {v}',
data_type='url.fragment.text-fragment', key='Text Fragment', value=decoded_text,
label=f'Text Fragment: {decoded_text}',
hover='A <b>Text Fragment</b> tells the browser to scroll to and highlight '
'the specified text on the page. '
'<a href="https://wicg.github.io/scroll-to-text-fragment/" target="_blank">[spec]</a>',
parent_id=node.node_id, incoming_edge_config=urlparse_edge)

# If the query string or fragment is actually another URL (as seen in some redirectors), we want to
# continue doing subsequent parsing on it. For that, we need to recognize it and change the data_type to url.
if not parsed_qs:
parsed = try_url_parse(unfurl, node)
if parsed:
return

elif node.data_type == 'url.params':
split_params_re = re.compile(r'^(?P<key>[^=]+?)=(?P<value>[^=?]+)(?P<delim>[;,|])')
split_params = split_params_re.match(node.value)
Expand Down
13 changes: 11 additions & 2 deletions unfurl/templates/graph.html
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ <h3>You can move and select nodes, zoom and pan the camera, and save the graph i
evt.preventDefault();
// Spaces aren't allowed in URLs, but people sometimes add them by accident
value_to_send = document.getElementById("text_to_unfurl").value.replace(/ /g, "");
window.location.replace("/" + value_to_send);
window.location.replace("/graph?url=" + encodeURIComponent(value_to_send));
}
return false;
});
Expand Down Expand Up @@ -179,7 +179,16 @@ <h3>You can move and select nodes, zoom and pan the camera, and save the graph i

const urlParams = new URLSearchParams(window.location.search);
if (urlParams.get('url')) {
document.getElementById("text_to_unfurl").value = decodeURI(urlParams.get('url') + window.location.hash);
// Reconstruct the original URL exactly as the user entered it. The input is
// evidence being analyzed, so percent-encoding must not be silently altered.
// encodeURIComponent in the submit handler escapes every reserved character, including
// any '%' already present in the input (e.g. a literal %20 becomes %2520), so exactly
// one decodeURIComponent on the raw query value restores the original input faithfully.
var rawMatch = window.location.search.match(/[?&]url=([^&]*)/);
if (rawMatch) {
document.getElementById("text_to_unfurl").value = decodeURIComponent(rawMatch[1]);
} else {
document.getElementById("text_to_unfurl").value = urlParams.get('url') + window.location.hash;
}
var url = new URL(`${window.location.protocol}//${window.location.host}/json/visjs`);
url.searchParams.set('url', urlParams.get('url') + window.location.hash);
fetch(url).then(response => {
Expand Down
48 changes: 48 additions & 0 deletions unfurl/tests/unit/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,54 @@ def test_file_path_url(self):
self.assertIn('File Extension: .png', test.nodes[13].label)


def test_text_fragment(self):
    """A lone ``#:~:text=`` directive yields exactly one decoded Text Fragment node.

    Regression test for https://github.com/RyanDFIR/unfurl/issues/140
    """
    wanted_type = 'url.fragment.text-fragment'
    url_under_test = (
        'https://blog.chromium.org/2019/12/chrome-80-content-indexing-es-modules.html'
        '#:~:text=ECMAScript%20Modules%20in%20Web%20Workers')

    test = Unfurl()
    test.add_to_queue(data_type='url', key=None, value=url_under_test)
    test.parse_queue()

    # Gather every node the parser tagged as a text fragment.
    matches = []
    for candidate in test.nodes.values():
        if candidate.data_type == wanted_type:
            matches.append(candidate)

    # Exactly one fragment is expected, and its value must be percent-decoded.
    self.assertEqual(1, len(matches))
    self.assertEqual('ECMAScript Modules in Web Workers', matches[0].value)

def test_text_fragment_multiple(self):
    """Each ``text=`` directive in one fragment becomes its own node, in order."""
    url_under_test = 'https://example.com/page#:~:text=first%20match&text=second%20match'

    test = Unfurl()
    test.add_to_queue(data_type='url', key=None, value=url_under_test)
    test.parse_queue()

    # Keep only the text-fragment nodes, preserving the order in which they were created.
    matches = list(filter(
        lambda candidate: candidate.data_type == 'url.fragment.text-fragment',
        test.nodes.values()))

    self.assertEqual(2, len(matches))
    first, second = matches[0], matches[1]
    self.assertEqual('first match', first.value)
    self.assertEqual('second match', second.value)

def test_text_fragment_with_anchor(self):
    """Test a fragment that has both a traditional anchor and a text fragment.

    The fragment ``heading1:~:text=highlighted%20text`` is split on the ``:~:``
    delimiter: the part before it is the plain anchor and the part after it
    carries the text directive. Both halves must surface as their own nodes.
    """

    test = Unfurl()
    test.add_to_queue(
        data_type='url', key=None,
        value='https://example.com/page#heading1:~:text=highlighted%20text')
    test.parse_queue()

    # The directive half: one text-fragment node holding the percent-decoded text.
    text_fragments = [node for node in test.nodes.values()
                      if node.data_type == 'url.fragment.text-fragment']
    self.assertEqual(1, len(text_fragments))
    self.assertEqual('highlighted text', text_fragments[0].value)

    # The anchor half: previously unchecked, so a regression that dropped the
    # traditional anchor would have gone unnoticed despite the test's name.
    anchors = [node for node in test.nodes.values()
               if node.data_type == 'url.fragment.anchor']
    self.assertEqual(1, len(anchors))
    self.assertEqual('heading1', anchors[0].value)

def test_query_param_no_value(self):
"""Test that query parameters with no value are preserved."""

Expand Down
Loading