From f7757d1641f080235dfabad785e43a635ff62974 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ji=C5=99=C3=AD=20B=C4=9Blka?=
Date: Mon, 17 Feb 2025 19:27:00 +0100
Subject: [PATCH 1/6] support import from a file; work around pages without a
 body using bs4

---
 v7/import_page/README.md      |  2 +-
 v7/import_page/import_page.py | 75 +++++++++++++++++++++--------------
 2 files changed, 47 insertions(+), 30 deletions(-)

diff --git a/v7/import_page/README.md b/v7/import_page/README.md
index 0a1ec271..3dbc4c3e 100644
--- a/v7/import_page/README.md
+++ b/v7/import_page/README.md
@@ -1,4 +1,4 @@
-Plugin to import arbitrary web pages.
+Plugin to import arbitrary web pages (from a URL or a local file).
 
 Usage:
 
diff --git a/v7/import_page/import_page.py b/v7/import_page/import_page.py
index 618d6bfc..532841d4 100644
--- a/v7/import_page/import_page.py
+++ b/v7/import_page/import_page.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright © 2015 Roberto Alsina and others
+# Copyright © 2025 Roberto Alsina and others
 
 # Permission is hereby granted, free of charge, to any
 # person obtaining a copy of this software and associated
@@ -29,10 +29,9 @@
 import codecs
 
 try:
-    import libextract.api
+    from bs4 import BeautifulSoup, FeatureNotFound
 except ImportError:
-    libextract = None
-import lxml.html
+    BeautifulSoup = None
 import requests
 import sys
 
@@ -61,34 +60,52 @@ class CommandImportPage(Command):
 
     def _execute(self, options, args):
         """Import a Page."""
-        if libextract is None:
-            utils.req_missing(['libextract'], 'use the import_page plugin')
+        if BeautifulSoup is None:
+            utils.req_missing(['bs4'], 'use the import_page plugin')
         for url in args:
             self._import_page(url)
 
     def _import_page(self, url):
-        r = requests.get(url)
-        if 199 < r.status_code < 300:  # Got it
-            # Use the page's title
-            doc = lxml.html.fromstring(r.content)
-            title = doc.find('*//title').text
-            if sys.version_info[0] == 2 and isinstance(title, str):
-                title = title.decode('utf-8')
+        parse = requests.utils.urlparse(url)
+        if 'http' in parse.scheme:
+            r = requests.get(url)
+            if not (199 < r.status_code < 300):  # Did not get it
+                LOGGER.error(f'Error fetching URL: {url}')
+                return 1
+            html = r.content.decode(r.encoding).encode('utf-8') if r.encoding and 'utf-8' \
+                not in r.encoding.lower() else r.content
+        else:
             try:
-                slug = utils.slugify(title, lang='')
-            except TypeError:
-                slug = utils.slugify(title)
-            nodes = list(libextract.api.extract(r.content))
-            # Let's assume the node with more text is the good one
-            lengths = [len(n.text_content()) for n in nodes]
-            node = nodes[lengths.index(max(lengths))]
-            document = doc_template.format(
-                title=title,
-                slug=slug,
-                content=lxml.html.tostring(node, encoding='utf8', method='html', pretty_print=True).decode('utf8')
-            )
-            with codecs.open(slug + '.html', 'w+', encoding='utf-8') as outf:
-                outf.write(document)
-
+                with open(url, 'rb') as f:
+                    html = f.read()
+            except FileNotFoundError:
+                LOGGER.error(f'Error: file does not exist: {url}')
+                return 1
+            except (OSError, IOError) as e:
+                LOGGER.error(f'Error opening file "{url}": {e}')
+                return 1
+
+        try:
+            soup = BeautifulSoup(html, "lxml")
+        except FeatureNotFound:  # bs4 raises this, not ImportError, when lxml is unavailable
+            soup = BeautifulSoup(html, "html.parser")
+
+        title = soup.title.text if soup.title else "Untitled Page"
+        try:
+            slug = utils.slugify(title, lang='')
+        except TypeError:
+            slug = utils.slugify(title)
+
+        candidates = soup.find_all(["p", "div", "article", "section"])
+        if candidates:
+            node = max(candidates, key=lambda n: len(n.get_text(strip=True)))
         else:
-            LOGGER.error('Error fetching URL: {}'.format(url))
+            node = soup  # no candidates (e.g. a page without a body); fall back to the whole tree
+
+        document = doc_template.format(
+            title=title,
+            slug=slug,
+            content=node.get_text(strip=True)
+        )
+        with codecs.open(slug + '.html', 'w+', encoding='utf-8') as outf:
+            outf.write(document)
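The heart of PATCH 1 is the replacement for libextract: among a fixed set of candidate tags, the one with the most stripped text is assumed to be the main content. A minimal standalone sketch of that heuristic (the sample markup and variable names here are illustrative, not taken from the plugin):

    from bs4 import BeautifulSoup

    html = "<html><title>T</title><div>short</div><article>a much longer run of body text</article></html>"
    soup = BeautifulSoup(html, "html.parser")
    candidates = soup.find_all(["p", "div", "article", "section"])
    # Longest stripped text wins; fall back to the whole tree if nothing matched.
    node = max(candidates, key=lambda n: len(n.get_text(strip=True))) if candidates else soup
    print(node.name)  # -> article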
From 1df98ecf836f49bf55795153768a387427b3cd6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ji=C5=99=C3=AD=20B=C4=9Blka?=
Date: Mon, 17 Feb 2025 19:27:41 +0100
Subject: [PATCH 2/6] move to v8, as the plugin works on Nikola v8.3.1

---
 {v7 => v8}/import_page/README.md          | 0
 {v7 => v8}/import_page/conf.py.sample     | 0
 {v7 => v8}/import_page/import_page.plugin | 0
 {v7 => v8}/import_page/import_page.py     | 0
 {v7 => v8}/import_page/requirements.txt   | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename {v7 => v8}/import_page/README.md (100%)
 rename {v7 => v8}/import_page/conf.py.sample (100%)
 rename {v7 => v8}/import_page/import_page.plugin (100%)
 rename {v7 => v8}/import_page/import_page.py (100%)
 rename {v7 => v8}/import_page/requirements.txt (100%)

diff --git a/v7/import_page/README.md b/v8/import_page/README.md
similarity index 100%
rename from v7/import_page/README.md
rename to v8/import_page/README.md
diff --git a/v7/import_page/conf.py.sample b/v8/import_page/conf.py.sample
similarity index 100%
rename from v7/import_page/conf.py.sample
rename to v8/import_page/conf.py.sample
diff --git a/v7/import_page/import_page.plugin b/v8/import_page/import_page.plugin
similarity index 100%
rename from v7/import_page/import_page.plugin
rename to v8/import_page/import_page.plugin
diff --git a/v7/import_page/import_page.py b/v8/import_page/import_page.py
similarity index 100%
rename from v7/import_page/import_page.py
rename to v8/import_page/import_page.py
diff --git a/v7/import_page/requirements.txt b/v8/import_page/requirements.txt
similarity index 100%
rename from v7/import_page/requirements.txt
rename to v8/import_page/requirements.txt

From a06667231d07bccaee4f3bb5a4183f49e08e79be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ji=C5=99=C3=AD=20B=C4=9Blka?=
Date: Mon, 17 Feb 2025 21:11:28 +0100
Subject: [PATCH 3/6] fix: drop manual HTML re-encoding (bs4 handles encoding
 detection)

---
 v8/import_page/import_page.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/v8/import_page/import_page.py b/v8/import_page/import_page.py
index 532841d4..9af6fc30 100644
--- a/v8/import_page/import_page.py
+++ b/v8/import_page/import_page.py
@@ -72,8 +72,7 @@ def _import_page(self, url):
             if not (199 < r.status_code < 300):  # Did not get it
                 LOGGER.error(f'Error fetching URL: {url}')
                 return 1
-            html = r.content.decode(r.encoding).encode('utf-8') if r.encoding and 'utf-8' \
-                not in r.encoding.lower() else r.content
+            html = r.content
         else:
             try:
                 with open(url, 'rb') as f:

From e8a5507ca825b9b5aae5a57f24122d76d64e58a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ji=C5=99=C3=AD=20B=C4=9Blka?=
Date: Mon, 17 Feb 2025 21:12:29 +0100
Subject: [PATCH 4/6] fix: we want HTML, not plain text

---
 v8/import_page/import_page.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/v8/import_page/import_page.py b/v8/import_page/import_page.py
index 9af6fc30..e8d749d7 100644
--- a/v8/import_page/import_page.py
+++ b/v8/import_page/import_page.py
@@ -104,7 +104,7 @@ def _import_page(self, url):
         document = doc_template.format(
             title=title,
             slug=slug,
-            content=node.get_text(strip=True)
+            content=node.prettify()
         )
         with codecs.open(slug + '.html', 'w+', encoding='utf-8') as outf:
             outf.write(document)
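PATCH 4's one-line change is easy to misread: get_text(strip=True) flattens a node to bare text, discarding every tag and even the whitespace between text runs, while prettify() serializes the node back to indented HTML. A quick comparison on made-up markup:

    from bs4 import BeautifulSoup

    node = BeautifulSoup("<div><p>Hello <b>world</b></p></div>", "html.parser").div
    print(node.get_text(strip=True))  # -> "Helloworld": markup and inter-run whitespace are gone
    print(node.prettify())            # -> the <div> subtree re-emitted as indented HTML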
From ab15bed9f0090269fc5b20a2c8d503b7804d9890 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ji=C5=99=C3=AD=20B=C4=9Blka?=
Date: Tue, 18 Feb 2025 01:43:08 +0100
Subject: [PATCH 5/6] introduce selector and extractor; fix requirements.txt

---
 v8/import_page/import_page.py   | 57 ++++++++++++++++++++++++++++-----
 v8/import_page/requirements.txt |  2 +-
 2 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/v8/import_page/import_page.py b/v8/import_page/import_page.py
index e8d749d7..b1d65abd 100644
--- a/v8/import_page/import_page.py
+++ b/v8/import_page/import_page.py
@@ -40,6 +40,10 @@
 
 LOGGER = utils.get_logger('import_page', utils.STDERR_HANDLER)
 
+args = sys.argv[1:]
+selector = None  # 'body'
+extractor = None  # 'lambda node: BeautifulSoup(node.decode_contents(), "html.parser").prettify()'
+path_or_url = None
 doc_template = '''
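The remainder of PATCH 5 (and PATCH 6/6) is not included above, so the wiring of the new selector/extractor hooks is not visible here. Judging only from their commented defaults ('body' and a prettify lambda), they would plausibly compose along these lines; this sketch is a guess for orientation, not the patch's actual code:

    from bs4 import BeautifulSoup

    def pick_content(soup, selector=None, extractor=None):
        # selector narrows the tree via CSS (e.g. 'body'); extractor renders the
        # chosen node to the HTML string that ends up in the generated page.
        node = soup.select_one(selector) if selector else soup
        return extractor(node) if extractor else node.prettify()

    soup = BeautifulSoup("<html><body><p>hi</p></body></html>", "html.parser")
    print(pick_content(soup, selector="body",
                       extractor=lambda node: BeautifulSoup(node.decode_contents(), "html.parser").prettify()))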