openzim · ItzCobaltboy · Feb 14, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,9 @@ dependencies = [
   "inotify==0.2.12",
   "tld==0.13.1",
   "warc2zim @ git+https://github.com/openzim/warc2zim@main",
+  "beautifulsoup4==4.14.3",
+  "warcio==1.7.5",
+  "argostranslate"
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
 
@@ -221,5 +224,5 @@ exclude_lines = [
 include = ["src", "tests", "tasks.py"]
 exclude = [".env/**", ".venv/**"]
 extraPaths = ["src"]
-pythonVersion = "3.14"
+pythonVersion = "3.11"
 typeCheckingMode="basic"
diff --git a/src/zimit/translate.py b/src/zimit/translate.py
@@ -0,0 +1,163 @@
+from warcio.warcwriter import WARCWriter
+from warcio.archiveiterator import ArchiveIterator
+import argostranslate.package, argostranslate.translate
+import bs4
+from bs4 import BeautifulSoup
+from argostranslate.tags import Tag, translate_tags
+from io import BytesIO
+import tempfile
+import os
+from pathlib import Path
+import re
+
+
+# Language pair of the Argos Translate model installed below.
+from_code = "en"
+to_code = "es"
+
+# HTML elements whose subtree is marked as not translateable.
+NON_TRANSLATEABLE_TAGS = [
+    "address",
+    "applet",
+    "audio",
+    "canvas",
+    "code",
+    "embed",
+    "script",
+    "style",
+    "time",
+    "video",
+]
+
+# Download and install the Argos Translate package for from_code -> to_code.
+# NOTE(review): these statements run at import time, so importing this module
+# already triggers the package lookup, download and install -- confirm that
+# is intended. The ``[0]`` raises IndexError when no package matches the pair.
+available_packages = argostranslate.package.get_available_packages()
+available_package = [
+    candidate
+    for candidate in available_packages
+    if candidate.from_code == from_code and candidate.to_code == to_code
+][0]
+download_path = available_package.download()
+argostranslate.package.install_from_path(download_path)
+
+
+def itag_of_soup(soup):
+    """Convert a BeautifulSoup node into an Argos Translate tag tree.
+
+    Text nodes become plain ``str`` values. Every other node becomes a
+    ``Tag`` built recursively from its children, flagged as not translateable
+    when the element name is listed in ``NON_TRANSLATEABLE_TAGS`` or the
+    element carries ``translate="no"``. The originating node is stored on the
+    result as ``.soup`` so the tree can be mapped back onto the document.
+
+    NOTE(review): bs4 ``Comment`` / ``Doctype`` nodes also subclass
+    ``NavigableString``, so they appear to be flattened to plain strings
+    here -- confirm that is intended.
+    """
+    if isinstance(soup, bs4.element.NavigableString):
+        return str(soup)
+    excluded = soup.name in NON_TRANSLATEABLE_TAGS or soup.get("translate") == "no"
+    children = [itag_of_soup(node) for node in soup.contents]
+    itag = Tag(children, not excluded)
+    itag.soup = soup
+    return itag
+
+
+def soup_of_itag(itag):
+    """Rebuild BeautifulSoup nodes from an Argos Translate tag tree.
+
+    Plain strings become new ``NavigableString`` nodes. For a tag, the node
+    stored on it as ``.soup`` is emptied and refilled with the nodes rebuilt
+    from ``itag.children``; that same node object is returned.
+    """
+    if not isinstance(itag, str):
+        node = itag.soup
+        node.clear()
+        node.extend(list(map(soup_of_itag, itag.children)))
+        return node
+    return bs4.element.NavigableString(itag)
+
+
+def translate_html(underlying_translation, html):
+    """Parse *html*, translate it with *underlying_translation*, return the soup.
+
+    The markup is parsed with ``html.parser``, converted to a tag tree,
+    handed to ``translate_tags`` and converted back into BeautifulSoup nodes.
+    """
+    parsed = BeautifulSoup(html, "html.parser")
+    translated_tree = translate_tags(underlying_translation, itag_of_soup(parsed))
+    return soup_of_itag(translated_tree)
+
+
+def translate(html, target_language="en"):
+    """Translate an HTML document from ``from_code`` into *target_language*.
+
+    Args:
+        html: HTML markup as text.
+        target_language: Language code to translate into. It is now honoured;
+            previously it was ignored and the module-level ``to_code`` was
+            always used, whatever language the caller asked for.
+
+    Returns:
+        The translated markup as a string. When *target_language* equals
+        ``from_code`` there is nothing to translate and *html* is returned
+        unchanged.
+
+    Raises:
+        ValueError: if no installed Argos Translate model covers
+            ``from_code`` -> *target_language*.
+    """
+    if target_language == from_code:
+        return html
+
+    installed_languages = argostranslate.translate.get_installed_languages()
+    from_lang = next(
+        (lang for lang in installed_languages if lang.code == from_code), None
+    )
+    to_lang = next(
+        (lang for lang in installed_languages if lang.code == target_language), None
+    )
+    # Fail loudly instead of silently producing output in another language.
+    translation = (
+        from_lang.get_translation(to_lang)
+        if from_lang is not None and to_lang is not None
+        else None
+    )
+    if translation is None:
+        raise ValueError(
+            f"No Argos Translate model installed for {from_code} -> {target_language}"
+        )
+
+    return str(translate_html(translation, html))
+
+
+def get_charset(content_type: str | None):
+    """Extract the ``charset`` parameter from a Content-Type header value.
+
+    Returns the charset with surrounding quotes stripped, or ``None`` when
+    *content_type* is empty/``None`` or has no ``charset=`` value.
+    """
+    if content_type:
+        found = re.search(r"charset=([^\s;]+)", content_type, flags=re.IGNORECASE)
+        if found:
+            return found.group(1).strip("\"'")
+    return None
+
+
+def content_type_with_utf8(content_type: str):
+    """Return *content_type* with its charset parameter forced to UTF-8.
+
+    An existing ``charset=...`` parameter (matched case-insensitively) is
+    rewritten to ``charset=utf-8``; when there is none, ``; charset=utf-8``
+    is appended.
+    """
+    # ``*`` rather than ``+`` so that an empty ``charset=`` is rewritten too;
+    # previously it was detected but never replaced, leaving the header
+    # without a usable charset.
+    rewritten, replaced = re.subn(
+        r"charset=[^\s;]*", "charset=utf-8", content_type, flags=re.IGNORECASE
+    )
+    if replaced:
+        return rewritten
+    return f"{content_type}; charset=utf-8"
+
+
+
+def _decode_html(html_bytes, charset):
+    """Decode an HTTP body to text, tolerating a wrong or unknown charset label."""
+    if charset:
+        try:
+            return html_bytes.decode(charset, errors="replace")
+        except LookupError:
+            # The declared codec is unknown to Python; fall back to the same
+            # guessing that is used when no charset is declared at all.
+            pass
+    try:
+        return html_bytes.decode("utf-8")
+    except UnicodeDecodeError:
+        # latin-1 maps every byte to a code point, so this cannot fail.
+        return html_bytes.decode("latin-1")
+
+
+def translate_warc(warc_path, target_language):
+    """Rewrite a WARC file in place with its HTML responses translated.
+
+    Every ``response`` record whose HTTP ``Content-Type`` contains
+    ``text/html`` is decoded, passed to ``translate`` and written back as
+    UTF-8; all other records are copied through unchanged.
+
+    Args:
+        warc_path: Path of the ``.warc`` / ``.warc.gz`` file to rewrite.
+        target_language: Language code forwarded to ``translate``.
+
+    The output is written to a temporary file next to *warc_path* and moved
+    over the original only after every record was processed. On any error
+    the temporary file is removed and the original file is left untouched.
+    """
+    warc_path = Path(warc_path)
+    is_gzipped = "".join(warc_path.suffixes[-2:]) == ".warc.gz"
+
+    # WARC headers describing the old payload; they must not be copied over.
+    stale_digest_headers = {
+        "WARC-Block-Digest",
+        "WARC-Payload-Digest",
+        "Content-Length",
+    }
+    # The body is re-emitted as plain UTF-8 bytes, so encodings that applied
+    # to the original payload no longer describe it.
+    stale_http_headers = ("Content-Encoding", "Transfer-Encoding")
+
+    # Same directory as the target: os.replace() cannot move a file across
+    # filesystems, which a temp file in the system temp dir may require.
+    tmp_file = tempfile.NamedTemporaryFile(
+        dir=warc_path.parent,
+        prefix=f".{warc_path.name}.",
+        suffix=".tmp",
+        delete=False,
+    )
+    tmp_path = tmp_file.name
+
+    try:
+        with tmp_file as out, warc_path.open("rb") as inp:
+            writer = WARCWriter(out, gzip=is_gzipped)
+
+            for record in ArchiveIterator(inp):
+                ct = None
+                if record.rec_type == "response" and record.http_headers:
+                    ct = record.http_headers.get_header("Content-Type")
+
+                if not ct or "text/html" not in ct:
+                    writer.write_record(record)
+                    continue
+
+                html = _decode_html(record.content_stream().read(), get_charset(ct))
+                translated = translate(html, target_language).encode("utf-8")
+
+                http_headers = record.http_headers
+                for header_name in stale_http_headers:
+                    http_headers.remove_header(header_name)
+                http_headers.replace_header("Content-Length", str(len(translated)))
+                http_headers.replace_header(
+                    "Content-Type", content_type_with_utf8(ct)
+                )
+
+                # create_warc_record() takes a mapping here, not a list of
+                # pairs. NOTE(review): a WARC header repeated on the source
+                # record keeps only its last value in this dict.
+                warc_headers = {
+                    name: value
+                    for (name, value) in record.rec_headers.headers
+                    if name not in stale_digest_headers
+                }
+                new_record = writer.create_warc_record(
+                    record.rec_headers.get_header("WARC-Target-URI"),
+                    "response",
+                    payload=BytesIO(translated),
+                    warc_headers_dict=warc_headers,
+                    http_headers=http_headers,
+                )
+                writer.write_record(new_record)
+
+        # Both files are closed at this point, so the swap is safe.
+        os.replace(tmp_path, str(warc_path))
+    except BaseException:
+        # Do not leave a partial file behind next to the real WARC files.
+        try:
+            os.unlink(tmp_path)
+        except OSError:
+            pass
+        raise
diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
@@ -17,6 +17,7 @@
 from argparse import ArgumentParser
 from multiprocessing import Process
 from pathlib import Path
+from zimit.translate import translate_warc
 
 import inotify
 import inotify.adapters
@@ -36,6 +37,22 @@
 temp_root_dir: Path | None = None
 
 
+def iter_warc_files(paths: list[Path]):
+    """Yield the WARC files designated by *paths*.
+
+    A file entry is yielded when it ends in ``.warc`` or ``.warc.gz``. A
+    directory entry is searched recursively: first all ``*.warc`` files in
+    sorted order, then all ``*.warc.gz`` files in sorted order. Entries that
+    are neither a file nor a directory are ignored.
+    """
+    for entry in paths:
+        if entry.is_file():
+            double_suffix = "".join(entry.suffixes[-2:])
+            if entry.suffix == ".warc" or double_suffix == ".warc.gz":
+                yield entry
+        elif entry.is_dir():
+            for pattern in ("*.warc", "*.warc.gz"):
+                yield from (
+                    found for found in sorted(entry.rglob(pattern)) if found.is_file()
+                )
+
+
 class ProgressFileWatcher:
     def __init__(
         self, crawl_stats_path: Path, warc2zim_stats_path, zimit_stats_path: Path
@@ -778,6 +795,12 @@ def run(raw_args):
         " used). Single value with individual error codes separated by comma",
     )
 
+    parser.add_argument(
+        "--translate",
+        help="If Set, translates the scrapped content into specific language "
+        "(ISO 639-1 code, ex: en, fr, es...). Default is no translation.",
+    )
+
     # by design, all unknown args are for warc2zim ; known one are either for crawler
     # or shared
     known_args, warc2zim_args = parser.parse_known_args(raw_args)
@@ -1095,6 +1118,15 @@ def run(raw_args):
         f"Processing WARC files in/at "
         f"{' '.join(str(warc_file) for warc_file in warc_files)}"
     )
+
+    if known_args.translate:  # Call Translate before warc2zim if called
+        logger.info(f"Translating content to {known_args.translate} before warc2zim")
+        translation_targets = list(iter_warc_files(warc_files))
+        if len(translation_targets) == 0:
+            raise RuntimeError("No WARC files found to translate")
+        for warc_file in translation_targets:
+            translate_warc(warc_file, known_args.translate)
+
     warc2zim_args.extend(str(warc_file) for warc_file in warc_files)
 
     logger.info(f"Calling warc2zim with these args: {warc2zim_args}")

diff --git a/test_commands.sh b/test_commands.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# Crawl the same seed twice with the zimit:latest image -- once without and
+# once with --translate -- so the two outputs can be compared side by side.
+#
+# Set BUILD_IMAGE=1 to rebuild zimit:latest from this checkout first; by
+# default the image that already exists locally is used.
+set -euo pipefail
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+OUT_NO_TRANSLATE="${ROOT_DIR}/output-no-translate"
+OUT_WITH_TRANSLATE="${ROOT_DIR}/output-with-translate"
+
+SEED_URL="https://quotes.toscrape.com/page/1/"
+PAGE_LIMIT="10"
+
+mkdir -p "${OUT_NO_TRANSLATE}" "${OUT_WITH_TRANSLATE}"
+
+# Only announce a build when one actually runs; the build step used to be
+# commented out while its message was still printed.
+if [[ "${BUILD_IMAGE:-0}" == "1" ]]; then
+  echo "Building Docker image zimit:latest from ${ROOT_DIR}/Dockerfile"
+  docker build -t zimit:latest "${ROOT_DIR}"
+else
+  echo "Skipping Docker build (set BUILD_IMAGE=1 to rebuild zimit:latest)"
+fi
+
+echo "Running crawl without translation"
+docker run --rm \
+  -v "${OUT_NO_TRANSLATE}:/output" \
+  zimit:latest zimit \
+  --seeds "${SEED_URL}" \
+  --pageLimit "${PAGE_LIMIT}" \
+  --name "quotes-no-translate"
+
+echo "Running crawl with translation enabled"
+docker run --rm \
+  -v "${OUT_WITH_TRANSLATE}:/output" \
+  zimit:latest zimit \
+  --seeds "${SEED_URL}" \
+  --pageLimit "${PAGE_LIMIT}" \
+  --translate "es" \
+  --name "quotes-with-translate"
+
+echo "Done."
+echo "Outputs:"
+echo "  - ${OUT_NO_TRANSLATE}"
+echo "  - ${OUT_WITH_TRANSLATE}"