Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions unfurl/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,27 @@ def preceding_domain_contains(self, node, label):
labels = preceding.split('.')
return label in labels

def find_preceding_path(self, node):
    """Return the ``url.path`` value of the nearest ``url`` ancestor of a node.

    The predecessors of ``node`` are examined in order. When a predecessor
    has ``data_type == 'url'``, its successors are scanned and the value of
    the first ``url.path`` successor is returned (or ``''`` if it has none).
    For any other predecessor the search recurses upwards, and the first
    non-empty result wins.

    Returns ``''`` when ``node`` has no predecessors or no path is found.
    """
    predecessors = self.get_predecessor_node(node)

    # A falsy result (no predecessors) simply yields nothing to walk.
    for ancestor in predecessors or ():
        if ancestor.data_type != 'url':
            # Not a URL yet: keep climbing through this ancestor.
            inherited_path = self.find_preceding_path(ancestor)
            if inherited_path:
                return inherited_path
            continue

        # Reached the URL ancestor: its path (if any) is among its successors.
        return next(
            (
                sibling.value
                for sibling in self.get_successor_nodes(ancestor)
                if sibling.data_type == 'url.path'
            ),
            '',
        )

    return ''

def get_id(self):
new_id = self.next_id
self.next_id += 1
Expand Down
5 changes: 5 additions & 0 deletions unfurl/parsers/parse_google.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,11 @@ def run(unfurl, node):
parent_id=node.node_id, incoming_edge_config=google_edge)

elif node.key == 'q':
# On /url redirect pages, q is the destination URL, not a search query.
# Skip here and let the google site_def annotate it with the correct hover text.
if unfurl.find_preceding_path(node) == '/url':
return

unfurl.add_to_queue(
data_type='google.q', key=None, value=f'Search Query: {node.value}',
hover='Terms used in the Google search', parent_id=node.node_id, incoming_edge_config=google_edge)
Expand Down
6 changes: 6 additions & 0 deletions unfurl/parsers/parse_site_defs.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,12 @@ def _check_query_rule(unfurl, node, rule, site_def):
if node.key != rule.get('key'):
return False

# Optional path scoping: only apply this rule when the URL path matches.
if 'path' in rule:
preceding_path = unfurl.find_preceding_path(node)
if preceding_path != rule['path']:
return False

apply = rule['apply']

# hover_only: just set hover text on the existing node, don't create a child
Expand Down
17 changes: 17 additions & 0 deletions unfurl/parsers/site_defs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,22 @@ use `hover_only`:
Content type/rendering flags.
```

### Scoping by path

A query parameter can mean different things depending on the URL path. Use `path` to
restrict a rule to a specific path:

```yaml
# q means "redirect target" on /url, but "search query" on /search
- key: q
path: /url
apply:
hover_only: true
hover: The redirect target URL.
```

Without `path`, the rule fires on every URL for the domain whose query string contains the matching key, regardless of the URL's path.

### Parsing values as URLs

Set `data_type: url` to have unfurl parse the parameter value as a full URL:
Expand Down Expand Up @@ -301,4 +317,5 @@ class TestExample(unittest.TestCase):
See the existing definitions in this directory for complete examples:
- `github.yaml` - Path rules, query rules, fragment rules, exclude_sibling
- `facebook.yaml` - Complex path rules, wildcard excludes, hover_only query rules
- `google.yaml` - Path-scoped query rules, hover_only for context-dependent parameters
- `instagram.yaml` - Multiple URL formats for the same content type
157 changes: 157 additions & 0 deletions unfurl/parsers/site_defs/google.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
# Google URL parser definition (non-search endpoints)
# Search-specific parameters (ei, ved, gs_l, aqs, etc.) are handled
# by parse_google.py. This file covers other Google URL types.
# Parsed by parse_site_defs.py

name: Google
domains:
- google.com
edge:
color: "#4285F4"
title: Google
label: "G"

# --- /url redirect pages ---
# google.com/url?q=DESTINATION&sa=D&ust=TIMESTAMP&usg=SIGNATURE
query_rules:

- key: q
path: /url
apply:
hover_only: true
hover: >-
The <b>redirect target URL</b> in a Google redirect link (/url).
Google wraps outbound links through this redirect for
click tracking and malware warnings. This is <em>not</em> a
search query — it is the destination the user was sent to.

- key: url
path: /url
apply:
hover_only: true
hover: >-
The <b>redirect target URL</b> in a Google redirect link (/url).
Alternate parameter name for the redirect target
(used in some older Google redirect formats).

- key: sa
path: /url
apply:
hover_only: true
hover: >-
Google redirect action type. Common values:
<b>t</b> = standard redirect (from search results),
<b>D</b> = redirect from a Google product (Docs, Hangouts, etc.),
<b>U</b> = redirect from a Google Cache page.

- key: usg
path: /url
apply:
hover_only: true
hover: >-
Google URL signature. A hash used to verify the redirect
link was generated by Google and has not been tampered with.

- key: ust
path: /url
apply:
hover_only: true
hover: >-
Google redirect timestamp (microseconds since Unix epoch).
Believed to indicate when the redirect link was generated
or when the referring page was loaded.

- key: ved
path: /url
apply:
hover_only: true
hover: >-
Google tracking parameter encoding the link's position and type
on the referring page. Contains a protobuf-encoded structure.

- key: rct
path: /url
apply:
hover_only: true
hover: >-
Redirect confirmation type. Typically <b>j</b> (JavaScript redirect).

- key: esrc
path: /url
apply:
hover_only: true
hover: >-
Source of the redirect. Typically <b>s</b> (search).

# --- /imgres (image result pages) ---

- key: imgurl
apply:
label: "Image URL: {value}"
data_type: url
hover: >-
The direct URL of the full-size image shown in Google Image results.

- key: imgrefurl
apply:
label: "Image Source Page: {value}"
data_type: url
hover: >-
The URL of the web page that contains/hosts the image.

- key: h
path: /imgres
apply:
label: "Image Height: {value}px"
data_type: descriptor
hover: >-
The height of the original image in pixels.

- key: w
path: /imgres
apply:
label: "Image Width: {value}px"
data_type: descriptor
hover: >-
The width of the original image in pixels.

- key: tbnid
path: /imgres
apply:
hover_only: true
hover: >-
Google Image thumbnail ID. A unique identifier for this image
in Google's index. Can be used to correlate the same image
across different search sessions.

- key: docid
path: /imgres
apply:
hover_only: true
hover: >-
Google Image document ID. Identifies the web page hosting the
image in Google's index.

- key: tbnh
path: /imgres
apply:
label: "Thumbnail Height: {value}px"
data_type: descriptor
hover: >-
The height of the thumbnail image displayed in search results.

- key: tbnw
path: /imgres
apply:
label: "Thumbnail Width: {value}px"
data_type: descriptor
hover: >-
The width of the thumbnail image displayed in search results.

- key: iact
path: /imgres
apply:
hover_only: true
hover: >-
Interaction action type — how the user reached this image result.
Common values: <b>rc</b> = right-click, <b>hc</b> = hover-click.
44 changes: 44 additions & 0 deletions unfurl/tests/unit/test_google.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from unfurl.core import Unfurl
from urllib.parse import urlparse
import unittest


Expand Down Expand Up @@ -96,5 +97,48 @@ def test_google_search_with_aqs(self):
self.assertEqual(len(test.edges), 0)


def test_google_url_redirect(self):
    """Check parsing of a google.com/url redirect link.

    On /url the q parameter carries the redirect destination, so no
    'google.q' (search query) node may be created for it, and its hover
    text should describe it as the redirect target. The destination URL
    must still be parsed, and sa / usg must carry their hover text.
    """

    unfurl = Unfurl()
    unfurl.remote_lookups = False
    unfurl.add_to_queue(
        data_type='url', key=None,
        value='https://www.google.com/url?q=https://example.org/landing'
              '&sa=D&ust=1546552999624000&usg=AFQjCNGESR0jI6krt8QOg3NlJ0GS60RxJg')
    unfurl.parse_queue()

    # Snapshot once; every check below inspects the same parsed nodes.
    parsed_nodes = list(unfurl.nodes.values())

    def query_pair(key):
        # First url.query.pair node for the given parameter name.
        return next(
            candidate for candidate in parsed_nodes
            if candidate.data_type == 'url.query.pair' and candidate.key == key)

    # q must NOT be labeled as "Search Query"
    search_query_nodes = [
        candidate for candidate in parsed_nodes
        if candidate.data_type == 'google.q'
    ]
    self.assertEqual(0, len(search_query_nodes))

    # q carries the redirect hover text
    self.assertIn('redirect target', query_pair('q').hover.lower())

    # the destination URL is parsed as its own url node
    destination_nodes = []
    for candidate in parsed_nodes:
        if candidate.data_type != 'url':
            continue
        if urlparse(str(candidate.value)).hostname == 'example.org':
            destination_nodes.append(candidate)
    self.assertGreaterEqual(len(destination_nodes), 1)

    # sa carries hover text
    self.assertIn('action type', query_pair('sa').hover.lower())

    # usg carries hover text
    self.assertIn('signature', query_pair('usg').hover.lower())


if __name__ == '__main__':
unittest.main()
Loading