From 41e4e7b10136028f1a9ed891dd93044350b1066e Mon Sep 17 00:00:00 2001
From: Ryan Benson
Date: Sat, 18 Apr 2026 17:59:01 -0700
Subject: [PATCH 1/2] Migrate non-search Google URL parsing to site_defs YAML.

Move /url redirect and /imgres parameter handling into
site_defs/google.yaml; parse_google.py focuses on search-specific
params (ei, ved, gs_l, aqs, etc.).
---
 unfurl/core.py                       |  21 ++++
 unfurl/parsers/parse_google.py       |   5 +
 unfurl/parsers/parse_site_defs.py    |   6 +
 unfurl/parsers/site_defs/README.md   |  17 +++
 unfurl/parsers/site_defs/google.yaml | 157 +++++++++++++++++++++++++++
 unfurl/tests/unit/test_google.py     |  40 +++++++
 6 files changed, 246 insertions(+)
 create mode 100644 unfurl/parsers/site_defs/google.yaml

diff --git a/unfurl/core.py b/unfurl/core.py
index 533f638..f5feaa7 100644
--- a/unfurl/core.py
+++ b/unfurl/core.py
@@ -253,6 +253,27 @@ def preceding_domain_contains(self, node, label):
         labels = preceding.split('.')
         return label in labels
 
+    def find_preceding_path(self, node):
+        """Find the URL path associated with a node by traversing up to the URL
+        ancestor and looking for a url.path sibling."""
+        parent_nodes = self.get_predecessor_node(node)
+
+        if not parent_nodes:
+            return ''
+
+        for parent_node in parent_nodes:
+            if parent_node.data_type == 'url':
+                for child_node in self.get_successor_nodes(parent_node):
+                    if child_node.data_type == 'url.path':
+                        return child_node.value
+                return ''
+            else:
+                result = self.find_preceding_path(parent_node)
+                if result:
+                    return result
+
+        return ''
+
     def get_id(self):
         new_id = self.next_id
         self.next_id += 1
diff --git a/unfurl/parsers/parse_google.py b/unfurl/parsers/parse_google.py
index 2860fe2..f4f8f96 100644
--- a/unfurl/parsers/parse_google.py
+++ b/unfurl/parsers/parse_google.py
@@ -365,6 +365,11 @@ def run(unfurl, node):
                     parent_id=node.node_id, incoming_edge_config=google_edge)
 
             elif node.key == 'q':
+                # On /url redirect pages, q is the destination URL, not a search query.
+                # Skip here and let the google site_def handle it with the correct label.
+                if unfurl.find_preceding_path(node) == '/url':
+                    return
+
                 unfurl.add_to_queue(
                     data_type='google.q', key=None, value=f'Search Query: {node.value}',
                     hover='Terms used in the Google search', parent_id=node.node_id, incoming_edge_config=google_edge)
diff --git a/unfurl/parsers/parse_site_defs.py b/unfurl/parsers/parse_site_defs.py
index 29eb995..820f7ac 100644
--- a/unfurl/parsers/parse_site_defs.py
+++ b/unfurl/parsers/parse_site_defs.py
@@ -154,6 +154,12 @@ def _check_query_rule(unfurl, node, rule, site_def):
     if node.key != rule.get('key'):
         return False
 
+    # Optional path scoping: only apply this rule when the URL path matches.
+    if 'path' in rule:
+        preceding_path = unfurl.find_preceding_path(node)
+        if preceding_path != rule['path']:
+            return False
+
     apply = rule['apply']
 
     # hover_only: just set hover text on the existing node, don't create a child
diff --git a/unfurl/parsers/site_defs/README.md b/unfurl/parsers/site_defs/README.md
index a456ed5..4e53acb 100644
--- a/unfurl/parsers/site_defs/README.md
+++ b/unfurl/parsers/site_defs/README.md
@@ -204,6 +204,22 @@ use `hover_only`:
         Content type/rendering flags.
 ```
 
+### Scoping by path
+
+A query parameter can mean different things depending on the URL path. Use `path` to
+restrict a rule to a specific path:
+
+```yaml
+  # q means "redirect target" on /url, but "search query" on /search
+  - key: q
+    path: /url
+    apply:
+      hover_only: true
+      hover: The redirect target URL.
+```
+
+Without `path`, the rule fires on every URL for the domain that has the matching key.
+
 ### Parsing values as URLs
 
 Set `data_type: url` to have unfurl parse the parameter value as a full URL:
@@ -301,4 +317,5 @@ class TestExample(unittest.TestCase):
 See the existing definitions in this directory for complete examples:
 - `github.yaml` - Path rules, query rules, fragment rules, exclude_sibling
 - `facebook.yaml` - Complex path rules, wildcard excludes, hover_only query rules
+- `google.yaml` - Path-scoped query rules, hover_only for context-dependent parameters
 - `instagram.yaml` - Multiple URL formats for the same content type
diff --git a/unfurl/parsers/site_defs/google.yaml b/unfurl/parsers/site_defs/google.yaml
new file mode 100644
index 0000000..9dd63aa
--- /dev/null
+++ b/unfurl/parsers/site_defs/google.yaml
@@ -0,0 +1,157 @@
+# Google URL parser definition (non-search endpoints)
+# Search-specific parameters (ei, ved, gs_l, aqs, etc.) are handled
+# by parse_google.py. This file covers other Google URL types.
+# Parsed by parse_site_defs.py
+
+name: Google
+domains:
+  - google.com
+edge:
+  color: "#4285F4"
+  title: Google
+  label: "G"
+
+# --- /url redirect pages ---
+# google.com/url?q=DESTINATION&sa=D&ust=TIMESTAMP&usg=SIGNATURE
+query_rules:
+
+  - key: q
+    path: /url
+    apply:
+      hover_only: true
+      hover: >-
+        The redirect target URL in a Google redirect link (/url).
+        Google wraps outbound links through this redirect for
+        click tracking and malware warnings. This is not a
+        search query — it is the destination the user was sent to.
+
+  - key: url
+    path: /url
+    apply:
+      hover_only: true
+      hover: >-
+        The redirect target URL in a Google redirect link (/url).
+        Alternate parameter name for the redirect target
+        (used in some older Google redirect formats).
+
+  - key: sa
+    path: /url
+    apply:
+      hover_only: true
+      hover: >-
+        Google redirect action type. Common values:
+        t = standard redirect (from search results),
+        D = redirect from a Google product (Docs, Hangouts, etc.),
+        U = redirect from a Google Cache page.
+
+  - key: usg
+    path: /url
+    apply:
+      hover_only: true
+      hover: >-
+        Google URL signature. A hash used to verify the redirect
+        link was generated by Google and has not been tampered with.
+
+  - key: ust
+    path: /url
+    apply:
+      hover_only: true
+      hover: >-
+        Google redirect timestamp (microseconds since Unix epoch).
+        Believed to indicate when the redirect link was generated
+        or when the referring page was loaded.
+
+  - key: ved
+    path: /url
+    apply:
+      hover_only: true
+      hover: >-
+        Google tracking parameter encoding the link's position and type
+        on the referring page. Contains a protobuf-encoded structure.
+
+  - key: rct
+    path: /url
+    apply:
+      hover_only: true
+      hover: >-
+        Redirect confirmation type. Typically j (JavaScript redirect).
+
+  - key: esrc
+    path: /url
+    apply:
+      hover_only: true
+      hover: >-
+        Source of the redirect. Typically s (search).
+
+  # --- /imgres (image result pages) ---
+
+  - key: imgurl
+    apply:
+      label: "Image URL: {value}"
+      data_type: url
+      hover: >-
+        The direct URL of the full-size image shown in Google Image results.
+
+  - key: imgrefurl
+    apply:
+      label: "Image Source Page: {value}"
+      data_type: url
+      hover: >-
+        The URL of the web page that contains/hosts the image.
+
+  - key: h
+    path: /imgres
+    apply:
+      label: "Image Height: {value}px"
+      data_type: descriptor
+      hover: >-
+        The height of the original image in pixels.
+
+  - key: w
+    path: /imgres
+    apply:
+      label: "Image Width: {value}px"
+      data_type: descriptor
+      hover: >-
+        The width of the original image in pixels.
+
+  - key: tbnid
+    path: /imgres
+    apply:
+      hover_only: true
+      hover: >-
+        Google Image thumbnail ID. A unique identifier for this image
+        in Google's index. Can be used to correlate the same image
+        across different search sessions.
+
+  - key: docid
+    path: /imgres
+    apply:
+      hover_only: true
+      hover: >-
+        Google Image document ID. Identifies the web page hosting the
+        image in Google's index.
+
+  - key: tbnh
+    path: /imgres
+    apply:
+      label: "Thumbnail Height: {value}px"
+      data_type: descriptor
+      hover: >-
+        The height of the thumbnail image displayed in search results.
+
+  - key: tbnw
+    path: /imgres
+    apply:
+      label: "Thumbnail Width: {value}px"
+      data_type: descriptor
+      hover: >-
+        The width of the thumbnail image displayed in search results.
+
+  - key: iact
+    path: /imgres
+    apply:
+      hover_only: true
+      hover: >-
+        Interaction action type — how the user reached this image result.
+        Common values: rc = right-click, hc = hover-click.
diff --git a/unfurl/tests/unit/test_google.py b/unfurl/tests/unit/test_google.py
index 6c1d965..a193f38 100644
--- a/unfurl/tests/unit/test_google.py
+++ b/unfurl/tests/unit/test_google.py
@@ -96,5 +96,45 @@ def test_google_search_with_aqs(self):
         self.assertEqual(len(test.edges), 0)
 
 
+    def test_google_url_redirect(self):
+        """Test that google.com/url redirects are parsed correctly.
+
+        The q parameter should NOT be labeled as a search query; it is
+        a redirect target URL. The hover text should explain this.
+        """
+
+        test = Unfurl()
+        test.remote_lookups = False
+        test.add_to_queue(
+            data_type='url', key=None,
+            value='https://www.google.com/url?q=https://example.org/landing'
+                  '&sa=D&ust=1546552999624000&usg=AFQjCNGESR0jI6krt8QOg3NlJ0GS60RxJg')
+        test.parse_queue()
+
+        # confirm q is NOT labeled as "Search Query"
+        google_q_nodes = [n for n in test.nodes.values() if n.data_type == 'google.q']
+        self.assertEqual(0, len(google_q_nodes))
+
+        # confirm q has the redirect hover text
+        q_node = next(n for n in test.nodes.values()
+                      if n.data_type == 'url.query.pair' and n.key == 'q')
+        self.assertIn('redirect target', q_node.hover.lower())
+
+        # confirm the destination URL is parsed
+        dest_urls = [n for n in test.nodes.values()
+                     if n.data_type == 'url' and 'example.org' in str(n.value)]
+        self.assertGreaterEqual(len(dest_urls), 1)
+
+        # confirm sa has hover text
+        sa_node = next(n for n in test.nodes.values()
+                       if n.data_type == 'url.query.pair' and n.key == 'sa')
+        self.assertIn('action type', sa_node.hover.lower())
+
+        # confirm usg has hover text
+        usg_node = next(n for n in test.nodes.values()
+                        if n.data_type == 'url.query.pair' and n.key == 'usg')
+        self.assertIn('signature', usg_node.hover.lower())
+
+
 if __name__ == '__main__':
     unittest.main()

From 20b1a2cf82be3b3d599f45ce8ebb039255c73f04 Mon Sep 17 00:00:00 2001
From: Ryan Benson
Date: Sat, 18 Apr 2026 18:05:07 -0700
Subject: [PATCH 2/2] Make test more specific

---
 unfurl/tests/unit/test_google.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/unfurl/tests/unit/test_google.py b/unfurl/tests/unit/test_google.py
index a193f38..8621a30 100644
--- a/unfurl/tests/unit/test_google.py
+++ b/unfurl/tests/unit/test_google.py
@@ -1,4 +1,5 @@
 from unfurl.core import Unfurl
+from urllib.parse import urlparse
 import unittest
 
 
@@ -121,8 +122,11 @@ def test_google_url_redirect(self):
         self.assertIn('redirect target', q_node.hover.lower())
 
         # confirm the destination URL is parsed
-        dest_urls = [n for n in test.nodes.values()
-                     if n.data_type == 'url' and 'example.org' in str(n.value)]
+        dest_urls = [
+            n for n in test.nodes.values()
+            if n.data_type == 'url'
+            and urlparse(str(n.value)).hostname == 'example.org'
+        ]
         self.assertGreaterEqual(len(dest_urls), 1)
 
         # confirm sa has hover text