diff --git a/pyproject.toml b/pyproject.toml index b106896..fc9aafc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,9 @@ dependencies = [ "inotify==0.2.12", "tld==0.13.1", "warc2zim @ git+https://github.com/openzim/warc2zim@main", + "beautifulsoup4==4.14.3", + "warcio==1.7.5", + "argostranslate" ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] @@ -221,5 +224,5 @@ exclude_lines = [ include = ["src", "tests", "tasks.py"] exclude = [".env/**", ".venv/**"] extraPaths = ["src"] -pythonVersion = "3.14" +pythonVersion = "3.11" typeCheckingMode="basic" diff --git a/src/zimit/translate.py b/src/zimit/translate.py new file mode 100644 index 0000000..87e161b --- /dev/null +++ b/src/zimit/translate.py @@ -0,0 +1,163 @@ +from warcio.warcwriter import WARCWriter +from warcio.archiveiterator import ArchiveIterator +import argostranslate.package, argostranslate.translate +import bs4 +from bs4 import BeautifulSoup +from argostranslate.tags import Tag, translate_tags +from io import BytesIO +import tempfile +import os +from pathlib import Path +import re + + +from_code = "en" +to_code = "es" + +NON_TRANSLATEABLE_TAGS = [ + "address", + "applet", + "audio", + "canvas", + "code", + "embed", + "script", + "style", + "time", + "video", +] + +# Download and install Argos Translate package +available_packages = argostranslate.package.get_available_packages() +available_package = list( + filter( + lambda x: x.from_code == from_code and x.to_code == to_code, available_packages + ) +)[0] +download_path = available_package.download() +argostranslate.package.install_from_path(download_path) + + +def itag_of_soup(soup): + if isinstance(soup, bs4.element.NavigableString): + return str(soup) + translateable = ( + soup.name not in NON_TRANSLATEABLE_TAGS and soup.get("translate") != "no" + ) + to_return = Tag([itag_of_soup(content) for content in soup.contents], translateable) + to_return.soup = soup + return to_return + + +def soup_of_itag(itag): + if 
isinstance(itag, str): + return bs4.element.NavigableString(itag) + soup = itag.soup + soup.clear() + soup.extend([soup_of_itag(child) for child in itag.children]) + return soup + + +def translate_html(underlying_translation, html): + soup = BeautifulSoup(html, "html.parser") + itag = itag_of_soup(soup) + translated_tag = translate_tags(underlying_translation, itag) + translated_soup = soup_of_itag(translated_tag) + return translated_soup + + +def translate(html, target_language="en"): + + + # Translate + installed_languages = argostranslate.translate.get_installed_languages() + from_lang = list(filter(lambda x: x.code == from_code, installed_languages))[0] + to_lang = list(filter(lambda x: x.code == to_code, installed_languages))[0] + + translation = from_lang.get_translation(to_lang) + + translated_soup = translate_html(translation, html) + return str(translated_soup) + + +def get_charset(content_type: str | None): + if not content_type: + return None + match = re.search(r"charset=([^\s;]+)", content_type, flags=re.IGNORECASE) + if not match: + return None + return match.group(1).strip("\"'") + + +def content_type_with_utf8(content_type: str): + if re.search(r"charset=", content_type, flags=re.IGNORECASE): + return re.sub( + r"charset=([^\s;]+)", + "charset=utf-8", + content_type, + flags=re.IGNORECASE, + ) + return f"{content_type}; charset=utf-8" + + + +def translate_warc(warc_path, target_language): + warc_path = Path(warc_path) + tmp_path = tempfile.NamedTemporaryFile(delete=False).name + + stale_digest_headers = { + "WARC-Block-Digest", + "WARC-Payload-Digest", + "Content-Length", + } + + with warc_path.open("rb") as inp, open(tmp_path, "wb") as out: + writer = WARCWriter(out, gzip="".join(warc_path.suffixes[-2:]) == ".warc.gz") + + for record in ArchiveIterator(inp): + + if record.rec_type == "response" and record.http_headers: + ct = record.http_headers.get_header("Content-Type") + + if ct and "text/html" in ct: + html_bytes = 
record.content_stream().read() + charset = get_charset(ct) + if charset: + html = html_bytes.decode(charset, errors="replace") + else: + try: + html = html_bytes.decode("utf-8") + except UnicodeDecodeError: + html = html_bytes.decode("latin-1", errors="replace") + translated = translate(html, target_language) + if isinstance(translated, str): + translated = translated.encode("utf-8") + + record.http_headers.replace_header( + "Content-Length", + str(len(translated)) + ) + record.http_headers.replace_header( + "Content-Type", + content_type_with_utf8(ct), + ) + + warc_headers = [ + (name, value) + for (name, value) in record.rec_headers.headers + if name not in stale_digest_headers + ] + new_record = writer.create_warc_record( + record.rec_headers.get_header("WARC-Target-URI"), + "response", + payload=BytesIO(translated), + warc_headers_dict=warc_headers, + http_headers=record.http_headers, + ) + + writer.write_record(new_record) + continue + + writer.write_record(record) + + os.replace(tmp_path, str(warc_path)) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index b205007..60fbb7f 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -17,6 +17,7 @@ from argparse import ArgumentParser from multiprocessing import Process from pathlib import Path +from zimit.translate import translate_warc import inotify import inotify.adapters @@ -36,6 +37,22 @@ temp_root_dir: Path | None = None +def iter_warc_files(paths: list[Path]): + """Yield concrete WARC files from a mixed list of files/directories.""" + for path in paths: + if path.is_file(): + if path.suffix == ".warc" or "".join(path.suffixes[-2:]) == ".warc.gz": + yield path + continue + if path.is_dir(): + for warc_file in sorted(path.rglob("*.warc")): + if warc_file.is_file(): + yield warc_file + for warc_file in sorted(path.rglob("*.warc.gz")): + if warc_file.is_file(): + yield warc_file + + class ProgressFileWatcher: def __init__( self, crawl_stats_path: Path, warc2zim_stats_path, zimit_stats_path: Path @@ 
#!/usr/bin/env bash
# Smoke test for the --translate option: crawls the same seed twice with the
# zimit:latest image, once without and once with translation, writing each
# result to its own output directory next to this script.
#
# The image is NOT rebuilt by default; run with BUILD_IMAGE=1 to build it from
# the Dockerfile located next to this script first.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OUT_NO_TRANSLATE="${ROOT_DIR}/output-no-translate"
OUT_WITH_TRANSLATE="${ROOT_DIR}/output-with-translate"

SEED_URL="https://quotes.toscrape.com/page/1/"
PAGE_LIMIT="10"

mkdir -p "${OUT_NO_TRANSLATE}" "${OUT_WITH_TRANSLATE}"

# Only announce a build when one actually happens.
if [[ "${BUILD_IMAGE:-0}" == "1" ]]; then
  echo "Building Docker image zimit:latest from ${ROOT_DIR}/Dockerfile"
  docker build -t zimit:latest "${ROOT_DIR}"
else
  echo "Skipping Docker build, using existing zimit:latest (set BUILD_IMAGE=1 to rebuild)"
fi

echo "Running crawl without translation"
docker run --rm \
  -v "${OUT_NO_TRANSLATE}:/output" \
  zimit:latest zimit \
  --seeds "${SEED_URL}" \
  --pageLimit "${PAGE_LIMIT}" \
  --name "quotes-no-translate"

echo "Running crawl with translation enabled"
docker run --rm \
  -v "${OUT_WITH_TRANSLATE}:/output" \
  zimit:latest zimit \
  --seeds "${SEED_URL}" \
  --pageLimit "${PAGE_LIMIT}" \
  --translate "es" \
  --name "quotes-with-translate"

echo "Done."
echo "Outputs:"
echo "  - ${OUT_NO_TRANSLATE}"
echo "  - ${OUT_WITH_TRANSLATE}"