Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ dependencies = [
"inotify==0.2.12",
"tld==0.13.1",
"warc2zim @ git+https://github.com/openzim/warc2zim@main",
"beautifulsoup4==4.14.3",
"warcio==1.7.5",
"argostranslate"
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

Expand Down Expand Up @@ -221,5 +224,5 @@ exclude_lines = [
include = ["src", "tests", "tasks.py"]
exclude = [".env/**", ".venv/**"]
extraPaths = ["src"]
pythonVersion = "3.14"
pythonVersion = "3.11"
typeCheckingMode="basic"
163 changes: 163 additions & 0 deletions src/zimit/translate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
from warcio.warcwriter import WARCWriter
from warcio.archiveiterator import ArchiveIterator
import argostranslate.package, argostranslate.translate
import bs4
from bs4 import BeautifulSoup
from argostranslate.tags import Tag, translate_tags
from io import BytesIO
import tempfile
import os
from pathlib import Path
import re


# Default language pair. The source language is assumed to be English; the
# matching Argos Translate package for this pair is installed below.
from_code = "en"
to_code = "es"

# HTML elements whose content must be left untouched by the translator.
NON_TRANSLATEABLE_TAGS = [
    "address",
    "applet",
    "audio",
    "canvas",
    "code",
    "embed",
    "script",
    "style",
    "time",
    "video",
]

# Download and install the Argos Translate package for the default pair.
# NOTE(review): this runs when the module is imported, so importing the module
# triggers a package download even when no translation is requested -- confirm
# this is intended before merging.
available_packages = argostranslate.package.get_available_packages()
available_package = next(
    (
        package
        for package in available_packages
        if package.from_code == from_code and package.to_code == to_code
    ),
    None,
)
if available_package is None:
    # Fail with an actionable message instead of a bare IndexError.
    raise LookupError(
        f"No Argos Translate package available for {from_code} -> {to_code}"
    )
download_path = available_package.download()
argostranslate.package.install_from_path(download_path)


def itag_of_soup(soup):
    """Convert a BeautifulSoup node into the Argos Translate tag structure.

    A ``NavigableString`` is returned as a plain ``str``. Any other node becomes
    a ``Tag`` built from the recursively converted ``soup.contents``; the source
    node is attached to it as ``.soup`` so it can be refilled after translation.

    The tag is marked translatable unless the node's name is listed in
    ``NON_TRANSLATEABLE_TAGS`` or the node carries ``translate="no"``.
    """
    # NOTE(review): bs4 comments and doctypes are NavigableString subclasses, so
    # they presumably take this text path as well -- confirm that is intended.
    if isinstance(soup, bs4.element.NavigableString):
        return str(soup)

    children = [itag_of_soup(child) for child in soup.contents]
    opted_out = soup.get("translate") == "no"
    itag = Tag(children, soup.name not in NON_TRANSLATEABLE_TAGS and not opted_out)
    itag.soup = soup
    return itag


def soup_of_itag(itag):
    """Rebuild BeautifulSoup nodes from an Argos Translate tag structure.

    A plain ``str`` is wrapped in a new ``NavigableString``. For any other
    value, the BeautifulSoup node stored on ``itag.soup`` is emptied and
    refilled with the recursively rebuilt ``itag.children``, then returned.
    The stored node is modified in place.
    """
    if not isinstance(itag, str):
        node = itag.soup
        node.clear()
        # Convert every child first, then attach them all in one call.
        node.extend(list(map(soup_of_itag, itag.children)))
        return node
    return bs4.element.NavigableString(itag)


def translate_html(underlying_translation, html):
    """Translate *html* with *underlying_translation* and return the soup.

    The markup is parsed with ``html.parser``, converted to the Argos Translate
    tag structure, run through ``translate_tags`` and converted back into
    BeautifulSoup nodes.
    """
    parsed = BeautifulSoup(html, "html.parser")
    translated = translate_tags(underlying_translation, itag_of_soup(parsed))
    return soup_of_itag(translated)


def translate(html, target_language="en"):
    """Translate an HTML document and return the translated markup as text.

    Args:
        html: HTML source as ``str``.
        target_language: Code of the language to translate into. It is looked
            up among the languages already installed in Argos Translate; this
            function does not download anything.

    Returns:
        The translated document serialized with ``str()``.

    Raises:
        LookupError: if the source language (module-level ``from_code``), the
            target language, or a translation between the two is not installed.
    """
    installed_languages = argostranslate.translate.get_installed_languages()

    def find_language(code):
        # First installed language with a matching code, or None.
        return next((lang for lang in installed_languages if lang.code == code), None)

    from_lang = find_language(from_code)
    if from_lang is None:
        raise LookupError(f"Source language {from_code!r} is not installed")

    # Honour the requested target instead of a hard-coded language.
    to_lang = find_language(target_language)
    if to_lang is None:
        raise LookupError(f"Target language {target_language!r} is not installed")

    translation = from_lang.get_translation(to_lang)
    if translation is None:
        # Guard against a language pair with no installed translation.
        raise LookupError(
            f"No installed translation from {from_code!r} to {target_language!r}"
        )

    return str(translate_html(translation, html))


def get_charset(content_type: str | None):
if not content_type:
return None
match = re.search(r"charset=([^\s;]+)", content_type, flags=re.IGNORECASE)
if not match:
return None
return match.group(1).strip("\"'")


def content_type_with_utf8(content_type: str):
if re.search(r"charset=", content_type, flags=re.IGNORECASE):
return re.sub(
r"charset=([^\s;]+)",
"charset=utf-8",
content_type,
flags=re.IGNORECASE,
)
return f"{content_type}; charset=utf-8"



def _decode_html(html_bytes, charset):
    """Decode an HTML payload, tolerating a missing or unknown charset."""
    if charset:
        try:
            return html_bytes.decode(charset, errors="replace")
        except LookupError:
            # The declared charset is not a codec Python knows; guess instead.
            pass
    try:
        return html_bytes.decode("utf-8")
    except UnicodeDecodeError:
        return html_bytes.decode("latin-1", errors="replace")


def translate_warc(warc_path, target_language):
    """Translate every HTML response in a WARC file, rewriting it in place.

    Response records whose HTTP ``Content-Type`` contains ``text/html`` are
    decoded, translated with ``translate`` and written back as UTF-8; all other
    records are copied through unchanged. The result is written to a temporary
    file next to the original, which then atomically replaces it.

    Args:
        warc_path: Path to a ``.warc`` or ``.warc.gz`` file. Output is gzipped
            only when the name ends in ``.warc.gz``.
        target_language: Language code forwarded to ``translate``.

    Raises:
        Whatever reading, translating or writing raises; in that case the
        temporary file is removed and the original WARC is left untouched.
    """
    warc_path = Path(warc_path)

    # WARC headers that no longer describe the record once the body changes;
    # they are dropped so the writer can recompute them.
    stale_digest_headers = {
        "WARC-Block-Digest",
        "WARC-Payload-Digest",
        "Content-Length",
    }
    # Per the warcio docs, content_stream() yields the decoded (de-chunked,
    # decompressed) body, so these HTTP headers would misdescribe the payload.
    stale_http_headers = ("Content-Encoding", "Transfer-Encoding")

    # Create the temp file beside the target: os.replace() cannot move a file
    # across filesystems. The ".tmp" suffix keeps it from looking like a WARC.
    fd, tmp_path = tempfile.mkstemp(
        dir=warc_path.parent, prefix=".translate-", suffix=".tmp"
    )
    try:
        with os.fdopen(fd, "wb") as out, warc_path.open("rb") as inp:
            writer = WARCWriter(
                out, gzip="".join(warc_path.suffixes[-2:]) == ".warc.gz"
            )

            for record in ArchiveIterator(inp):
                ct = None
                if record.rec_type == "response" and record.http_headers:
                    ct = record.http_headers.get_header("Content-Type")

                # Media types are case-insensitive, hence the lower().
                if not ct or "text/html" not in ct.lower():
                    writer.write_record(record)
                    continue

                html_bytes = record.content_stream().read()
                html = _decode_html(html_bytes, get_charset(ct))
                translated = translate(html, target_language).encode("utf-8")

                for header_name in stale_http_headers:
                    record.http_headers.remove_header(header_name)
                record.http_headers.replace_header(
                    "Content-Length", str(len(translated))
                )
                record.http_headers.replace_header(
                    "Content-Type", content_type_with_utf8(ct)
                )

                # warc_headers_dict is documented as a mapping, so build a dict
                # rather than passing a list of pairs. Repeated header names
                # collapse to their last value.
                warc_headers = {
                    name: value
                    for (name, value) in record.rec_headers.headers
                    if name not in stale_digest_headers
                }
                new_record = writer.create_warc_record(
                    record.rec_headers.get_header("WARC-Target-URI"),
                    "response",
                    payload=BytesIO(translated),
                    warc_headers_dict=warc_headers,
                    http_headers=record.http_headers,
                )
                writer.write_record(new_record)

        os.replace(tmp_path, warc_path)
    except BaseException:
        # Never leave a partial temp file behind next to the WARC.
        Path(tmp_path).unlink(missing_ok=True)
        raise
32 changes: 32 additions & 0 deletions src/zimit/zimit.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from argparse import ArgumentParser
from multiprocessing import Process
from pathlib import Path
from zimit.translate import translate_warc

import inotify
import inotify.adapters
Expand All @@ -36,6 +37,22 @@
temp_root_dir: Path | None = None


def iter_warc_files(paths: list[Path]):
    """Yield the WARC files designated by *paths*.

    A file entry is yielded when its name ends in ``.warc`` or ``.warc.gz``.
    A directory entry is searched recursively: first every ``*.warc`` file in
    sorted order, then every ``*.warc.gz`` file in sorted order. Entries that
    are neither an existing file nor a directory are ignored.
    """
    for entry in paths:
        if entry.is_file():
            is_plain = entry.suffix == ".warc"
            is_gzipped = "".join(entry.suffixes[-2:]) == ".warc.gz"
            if is_plain or is_gzipped:
                yield entry
        elif entry.is_dir():
            for pattern in ("*.warc", "*.warc.gz"):
                # A directory may itself match the pattern; keep files only.
                yield from (
                    candidate
                    for candidate in sorted(entry.rglob(pattern))
                    if candidate.is_file()
                )


class ProgressFileWatcher:
def __init__(
self, crawl_stats_path: Path, warc2zim_stats_path, zimit_stats_path: Path
Expand Down Expand Up @@ -778,6 +795,12 @@ def run(raw_args):
" used). Single value with individual error codes separated by comma",
)

parser.add_argument(
"--translate",
help="If Set, translates the scrapped content into specific language "
"(ISO 639-1 code, ex: en, fr, es...). Default is no translation.",
)

# by design, all unknown args are for warc2zim ; known one are either for crawler
# or shared
known_args, warc2zim_args = parser.parse_known_args(raw_args)
Expand Down Expand Up @@ -1095,6 +1118,15 @@ def run(raw_args):
f"Processing WARC files in/at "
f"{' '.join(str(warc_file) for warc_file in warc_files)}"
)

if known_args.translate: # Call Translate before warc2zim if called
logger.info(f"Translating content to {known_args.translate} before warc2zim")
translation_targets = list(iter_warc_files(warc_files))
if len(translation_targets) == 0:
raise RuntimeError("No WARC files found to translate")
for warc_file in translation_targets:
translate_warc(warc_file, known_args.translate)

warc2zim_args.extend(str(warc_file) for warc_file in warc_files)

logger.info(f"Calling warc2zim with these args: {warc2zim_args}")
Expand Down
36 changes: 36 additions & 0 deletions test_commands.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
# Smoke test: crawl the same seed twice with the zimit:latest image, once
# without and once with --translate, writing each run to its own directory.
#
# Optional environment overrides:
#   BUILD_IMAGE=1   build zimit:latest from this directory first (default: skip)
#   SEED_URL        seed URL to crawl
#   PAGE_LIMIT      maximum number of pages to crawl
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OUT_NO_TRANSLATE="${ROOT_DIR}/output-no-translate"
OUT_WITH_TRANSLATE="${ROOT_DIR}/output-with-translate"

SEED_URL="${SEED_URL:-https://quotes.toscrape.com/page/1/}"
PAGE_LIMIT="${PAGE_LIMIT:-10}"

mkdir -p "${OUT_NO_TRANSLATE}" "${OUT_WITH_TRANSLATE}"

# Building is opt-in so the message always matches what actually happens.
if [[ "${BUILD_IMAGE:-0}" == "1" ]]; then
  echo "Building Docker image zimit:latest from ${ROOT_DIR}/Dockerfile"
  docker build -t zimit:latest "${ROOT_DIR}"
else
  echo "Skipping Docker build; using existing zimit:latest (set BUILD_IMAGE=1 to build)"
fi

# run_crawl OUT_DIR NAME [EXTRA_ZIMIT_ARGS...]
# Runs one crawl with OUT_DIR mounted as /output in the container.
run_crawl() {
  local out_dir="$1"
  local name="$2"
  shift 2
  docker run --rm \
    -v "${out_dir}:/output" \
    zimit:latest zimit \
    --seeds "${SEED_URL}" \
    --pageLimit "${PAGE_LIMIT}" \
    "$@" \
    --name "${name}"
}

echo "Running crawl without translation"
run_crawl "${OUT_NO_TRANSLATE}" "quotes-no-translate"

echo "Running crawl with translation enabled"
run_crawl "${OUT_WITH_TRANSLATE}" "quotes-with-translate" --translate "es"

echo "Done."
echo "Outputs:"
echo " - ${OUT_NO_TRANSLATE}"
echo " - ${OUT_WITH_TRANSLATE}"