diff --git a/README.md b/README.md index eafd168..a78b6f4 100644 --- a/README.md +++ b/README.md @@ -28,9 +28,17 @@ pip install -r requirements.txt Install these extras when you need richer file handling: -- `PyPDF2` – enables text extraction from PDF attachments so their contents can be sent to the selected backend. -- `python-docx` – parses DOCX files and pulls paragraph text for inclusion in prompts. +- `PyPDF2` – used with `--extract-text` to pull PDF contents into prompts before sending them to the selected backend. +- `python-docx` – used with `--extract-text` to parse DOCX files and include paragraph text inline. - `opencv-python` – extracts representative PNG frames from video files when using `--frame-by-frame` processing. +- `pypdfium2` + `Pillow` – render PDFs into PNG previews that are uploaded to vision-capable backends when `--extract-text` is **not** supplied. +- `python-docx` + `Pillow` – render DOCX paragraph text into PNG snapshots so word-processing files can be viewed as images when skipping text extraction. + +Install the preview toolchain (for PDFs and DOCX files) with: + +```bash +pip install pillow pypdfium2 python-docx +``` ## Usage @@ -68,3 +76,10 @@ send extracted video frames individually and concatenate the responses. Use `-o`/`--output` to save the model response to a file. When no filename is supplied, the first attached file name with `.txt` appended is used; if no files are attached, `response.txt` is created. + +By default the CLI renders PDF and DOCX files into PNG preview images and sends +those to vision-capable backends (such as Ollama multimodal models). When the +preview toolchain is unavailable, the binary document is attached instead with a +note explaining how to enable previews. Use `--extract-text` when you prefer to +run local extraction tools (PDF via PyPDF2, DOCX via python-docx) before +embedding the contents into the prompt. 
# Limits applied when documents are rendered to preview images instead of
# being text-extracted.
PDF_PREVIEW_MAX_PAGES = 3
DOCX_PREVIEW_MAX_IMAGES = 3
DOCX_PREVIEW_WRAP = 90
DOCX_PREVIEW_LINES_PER_IMAGE = 40


def render_text_block_to_image_b64(lines: Iterable[str]) -> Optional[str]:
    """Render text lines onto a white PNG canvas and return it as base64.

    Each line is drawn in black with Pillow's default font, starting 20 px
    from the left/top edges.  The canvas is at least 640x80 px and at most
    2048 px wide; lines wider than that are clipped by the canvas.

    Args:
        lines: Already-wrapped lines of text.  An empty iterable is rendered
            as a single blank line.

    Returns:
        The base64-encoded (ASCII) PNG, or ``None`` when any of the PIL
        modules is unavailable or the default font cannot be loaded.
    """
    image_module = _optional_import("PIL.Image")
    draw_module = _optional_import("PIL.ImageDraw")
    font_module = _optional_import("PIL.ImageFont")

    if not image_module or not draw_module or not font_module:
        LOG.debug("Pillow is required to render text previews.")
        return None

    try:
        font = font_module.load_default()
    except Exception as exc:  # pragma: no cover - extremely unlikely
        LOG.warning("Failed to load Pillow default font: %s", exc)
        return None

    lines_list = list(lines) or [""]

    try:
        sample_bbox = font.getbbox("Mg")
        char_height = int(sample_bbox[3] - sample_bbox[1])
        # The sample is two glyphs wide; halve it (rounding up) so the value
        # really is a per-character estimate.
        char_width = max(1, (int(sample_bbox[2] - sample_bbox[0]) + 1) // 2)
    except Exception:  # pragma: no cover - defensive guard
        char_height = 12
        char_width = 7

    def measure(line: str) -> int:
        """Return the right-hand pixel extent of ``line`` when drawn."""
        try:
            return int(font.getbbox(line)[2])
        except Exception:  # pragma: no cover - fall back to an estimate
            return char_width * len(line)

    # Size the canvas from the widest *measured* line rather than from
    # ``len(line)`` times a sample width, which over-estimated the width.
    text_width = max(measure(line) for line in lines_list)
    width = max(640, min(2048, 40 + text_width))
    line_height = char_height + 6
    height = max(80, 40 + line_height * len(lines_list))

    image = image_module.new("RGB", (width, height), "white")
    draw = draw_module.Draw(image)

    y = 20
    for line in lines_list:
        draw.text((20, y), line, fill="black", font=font)
        y += line_height

    with io.BytesIO() as buf:
        image.save(buf, format="PNG")
        return base64.b64encode(buf.getvalue()).decode("ascii")


def wrap_text_for_preview(text: str, *, width: int) -> List[str]:
    """Split ``text`` into display lines no longer than ``width`` characters.

    Paragraph boundaries (as found by ``str.splitlines``) are preserved and
    empty paragraphs become empty lines.  Whitespace inside a paragraph is
    neither replaced nor dropped by the wrapper.

    Returns:
        A non-empty list of lines; empty input yields ``[""]``.
    """
    wrapper = textwrap.TextWrapper(
        width=width,
        replace_whitespace=False,
        drop_whitespace=False,
    )
    lines: List[str] = []
    for paragraph in text.splitlines():
        # ``wrap`` returns [] for an empty paragraph; keep it as a blank line.
        lines.extend(wrapper.wrap(paragraph) if paragraph else [""] or [""])
        if not lines:
            lines.append("")
    return lines or [""]


def _split_preview_lines(
    lines: List[str], lines_per_image: int, max_images: int
) -> Tuple[List[List[str]], bool]:
    """Group ``lines`` into per-image chunks and report whether any were cut."""
    per_image = max(1, lines_per_image)  # a step of 0 would make range() raise
    capacity = per_image * max(0, max_images)
    visible = lines[:capacity]
    chunks = [
        visible[start : start + per_image]
        for start in range(0, len(visible), per_image)
    ]
    return chunks, len(lines) > capacity


def convert_docx_blob_to_image_previews(
    blob: bytes,
    *,
    max_chars: int,
    wrap: int = DOCX_PREVIEW_WRAP,
    max_images: int = DOCX_PREVIEW_MAX_IMAGES,
    lines_per_image: int = DOCX_PREVIEW_LINES_PER_IMAGE,
) -> Tuple[List[str], bool]:
    """Render the paragraph text of a DOCX document as PNG previews.

    The paragraph text is joined, cut to ``max_chars``, wrapped to ``wrap``
    columns and drawn onto at most ``max_images`` images of
    ``lines_per_image`` lines each.

    Args:
        blob: Raw bytes of the DOCX file.
        max_chars: Maximum number of characters of text to render.
        wrap: Column width used when wrapping the text.
        max_images: Maximum number of preview images to produce.
        lines_per_image: Number of wrapped lines drawn on each image
            (values below 1 are treated as 1).

    Returns:
        ``(images, truncated)`` where ``images`` is a list of base64 PNGs and
        ``truncated`` is true when text was omitted, either because it
        exceeded ``max_chars`` or because it did not fit in ``max_images``
        images.  ``([], False)`` is returned when the ``docx`` module is
        unavailable, the document cannot be opened, or it has no text.
    """
    docx_module = _optional_import("docx")
    if docx_module is None:
        LOG.debug("python-docx is required for DOCX previews.")
        return [], False

    try:
        document = docx_module.Document(io.BytesIO(blob))
    except Exception as exc:
        LOG.warning("Could not open DOCX for preview: %s", exc)
        return [], False

    text = "\n".join(p.text for p in document.paragraphs).strip()
    if not text:
        return [], False

    truncated = len(text) > max_chars
    if truncated:
        text = text[:max_chars] + "\n\n[... truncated for preview ...]"

    wrapped_lines = wrap_text_for_preview(text, width=wrap)
    chunks, overflowed = _split_preview_lines(
        wrapped_lines, lines_per_image, max_images
    )
    # Lines beyond the image budget are dropped too, so report them as
    # truncation instead of silently presenting a partial document.
    truncated = truncated or overflowed

    previews: List[str] = []
    for chunk in chunks:
        image_b64 = render_text_block_to_image_b64(chunk)
        if image_b64:
            previews.append(image_b64)

    if not previews:
        LOG.debug("DOCX preview rendering produced no images (Pillow missing?).")

    return previews, truncated


def _close_quietly(resource: object, label: str) -> None:
    """Close ``resource`` if it exists, logging (not raising) any failure."""
    if resource is None:
        return
    try:
        resource.close()
    except Exception as exc:  # best-effort cleanup must never mask results
        LOG.debug("Ignoring error while closing %s: %s", label, exc)


def convert_pdf_blob_to_image_previews(
    blob: bytes,
    *,
    max_pages: int = PDF_PREVIEW_MAX_PAGES,
    dpi: int = 200,
) -> Tuple[List[str], int]:
    """Render the first ``max_pages`` pages of a PDF as PNG previews.

    Args:
        blob: Raw bytes of the PDF file.
        max_pages: Maximum number of leading pages to render.
        dpi: Render resolution; pages are rendered at ``dpi / 72`` scale.

    Returns:
        ``(images, total_pages)`` where ``images`` is a list of base64 PNGs.
        ``([], 0)`` is returned when ``pypdfium2`` is unavailable or the
        document cannot be opened, and ``([], total_pages)`` when a rendered
        bitmap cannot be converted to a PIL image.  Pages that fail to render
        or encode are logged and skipped.
    """
    pdfium = _optional_import("pypdfium2")
    if pdfium is None:
        LOG.debug("pypdfium2 is required for PDF previews.")
        return [], 0

    try:
        pdf = pdfium.PdfDocument(io.BytesIO(blob))
    except Exception as exc:
        LOG.warning("Could not open PDF for preview: %s", exc)
        return [], 0

    previews: List[str] = []
    total_pages = 0

    try:
        total_pages = len(pdf)
        for index in range(min(total_pages, max_pages)):
            # Reset per page so cleanup never touches an unbound name or a
            # handle that belongs to (and was closed with) an earlier page.
            page = None
            bitmap = None
            try:
                page = pdf[index]
                bitmap = page.render(scale=dpi / 72)
                try:
                    image = bitmap.to_pil()
                except Exception as exc:
                    LOG.warning(
                        "PDF preview rendering requires Pillow (page %d): %s",
                        index + 1,
                        exc,
                    )
                    return [], total_pages
                # Encode while the bitmap is still open: per the pypdfium2
                # docs the PIL image may reference the bitmap's buffer.
                with io.BytesIO() as buf:
                    image.save(buf, format="PNG")
                    previews.append(
                        base64.b64encode(buf.getvalue()).decode("ascii")
                    )
            except Exception as exc:
                LOG.warning("Failed to render PDF page %d: %s", index + 1, exc)
                continue
            finally:
                _close_quietly(bitmap, "PDF bitmap")
                _close_quietly(page, "PDF page")
    finally:
        _close_quietly(pdf, "PDF document")

    if not previews:
        LOG.debug("No PDF preview images were generated.")

    return previews, total_pages


VIDEO_EXT = {".mp4"}  # extend as needed: .mov, .mkv ...
IMAGE_MIME_PREFIXES = ("image/",) @@ -426,6 +623,9 @@ def send_with_fallback( imgs = payload_chat["messages"][0].get("images", []) if imgs: payload_generate["images"] = imgs + documents = payload_chat["messages"][0].get("documents") + if documents: + payload_generate["documents"] = documents try: if stream: @@ -665,6 +865,81 @@ def process_single(args) -> None: images_b64: List[Tuple[str, str]] = [] text_attachments: List[Tuple[str, str]] = [] video_notes: List[str] = [] + document_attachments: List[dict] = [] + + def record_document_attachment( + name: str, data: bytes, mime_type: Optional[str] + ) -> None: + """Render previews for binary documents and fall back to attachments.""" + + mime = mime_type or "application/octet-stream" + + if mime == "application/pdf": + previews, total_pages = convert_pdf_blob_to_image_previews(data) + if previews: + plural = "s" if len(previews) != 1 else "" + total_display = total_pages or len(previews) + note = ( + f"[PDF preview ready: {name} | {len(previews)} image{plural}" + f" | {len(data)} bytes | first {len(previews)} of {total_display} pages" + " | use --extract-text for local parsing]" + ) + text_attachments.append((name, note)) + for image_b64 in previews: + images_b64.append((image_b64, "image/png")) + LOG.info( + "Rendered %d preview image(s) for PDF '%s'.", + len(previews), + name, + ) + return + placeholder = ( + f"[PDF preview unavailable: {name} | {len(data)} bytes | install" + " 'pypdfium2'+'Pillow' or use --extract-text]" + ) + elif ( + mime + == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ): + previews, truncated = convert_docx_blob_to_image_previews( + data, max_chars=args.max_chars + ) + if previews: + plural = "s" if len(previews) != 1 else "" + suffix = " (truncated)" if truncated else "" + note = ( + f"[DOCX preview ready: {name} | {len(previews)} image{plural}" + f" | {len(data)} bytes{suffix} | use --extract-text for text mode]" + ) + text_attachments.append((name, note)) + 
for image_b64 in previews: + images_b64.append((image_b64, "image/png")) + LOG.info( + "Rendered %d preview image(s) for DOCX '%s'.", + len(previews), + name, + ) + return + placeholder = ( + f"[DOCX preview unavailable: {name} | {len(data)} bytes | install" + " 'python-docx'+'Pillow' or use --extract-text]" + ) + else: + placeholder = ( + f"[Document attached: {name} | MIME: {mime} | {len(data)} bytes]" + ) + + document_attachments.append( + { + "name": name, + "data": base64.b64encode(data).decode("ascii"), + "mime_type": mime, + } + ) + text_attachments.append((name, placeholder)) + LOG.info( + "Stored binary attachment for '%s' (mime=%s, %d bytes)", name, mime, len(data) + ) # --- Gather URLs --- for url in args.urls: @@ -714,22 +989,37 @@ def process_single(args) -> None: # default: treat as doc/text header_main = header_ct.split(";")[0].lower() - if not args.no_extract and header_main == "application/pdf": - with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp: - tmp.write(resp.content) - tmp.flush() - extracted = try_extract_pdf_text(tmp.name) - content = extracted or resp.text + if not header_main: + _, guess_ext = os.path.splitext(name) + if guess_ext.lower() == ".pdf": + header_main = "application/pdf" + elif guess_ext.lower() == ".docx": + header_main = ( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ) + if header_main == "application/pdf": + if args.extract_text: + with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp: + tmp.write(resp.content) + tmp.flush() + extracted = try_extract_pdf_text(tmp.name) + content = extracted or resp.text + else: + record_document_attachment(name, resp.content, header_main) + continue elif ( - not args.no_extract - and header_main + header_main == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ): - with tempfile.NamedTemporaryFile(suffix=".docx") as tmp: - tmp.write(resp.content) - tmp.flush() - extracted = try_extract_docx_text(tmp.name) - content = extracted 
or resp.text + if args.extract_text: + with tempfile.NamedTemporaryFile(suffix=".docx") as tmp: + tmp.write(resp.content) + tmp.flush() + extracted = try_extract_docx_text(tmp.name) + content = extracted or resp.text + else: + record_document_attachment(name, resp.content, header_main) + continue else: content = resp.text @@ -790,21 +1080,47 @@ def process_single(args) -> None: text_attachments.append((os.path.basename(path), content)) continue - if not args.no_extract and ext == ".pdf": - extracted = try_extract_pdf_text(path) - if extracted: - if len(extracted) > args.max_chars: - extracted = extracted[: args.max_chars] + "\n\n[... truncated ...]" - text_attachments.append((os.path.basename(path), extracted)) - continue - - if not args.no_extract and ext == ".docx": - extracted = try_extract_docx_text(path) - if extracted: - if len(extracted) > args.max_chars: - extracted = extracted[: args.max_chars] + "\n\n[... truncated ...]" - text_attachments.append((os.path.basename(path), extracted)) - continue + if ext == ".pdf": + if args.extract_text: + extracted = try_extract_pdf_text(path) + if extracted: + if len(extracted) > args.max_chars: + extracted = ( + extracted[: args.max_chars] + "\n\n[... truncated ...]" + ) + text_attachments.append((os.path.basename(path), extracted)) + continue + else: + try: + with open(path, "rb") as fh: + blob = fh.read() + record_document_attachment( + os.path.basename(path), blob, "application/pdf" + ) + continue + except Exception as e: + LOG.warning("Failed to read PDF as binary (%s): %s", path, e) + + if ext == ".docx": + if args.extract_text: + extracted = try_extract_docx_text(path) + if extracted: + if len(extracted) > args.max_chars: + extracted = ( + extracted[: args.max_chars] + "\n\n[... 
truncated ...]" + ) + text_attachments.append((os.path.basename(path), extracted)) + continue + else: + try: + with open(path, "rb") as fh: + blob = fh.read() + record_document_attachment( + os.path.basename(path), blob, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ) + continue + except Exception as e: + LOG.warning("Failed to read DOCX as binary (%s): %s", path, e) # Fallback: try as text (may be binary) content, _ = read_text_file(path, max_chars=args.max_chars) @@ -881,6 +1197,8 @@ def process_single(args) -> None: ], "stream": True if args.stream else False, # <<< IMPORTANT } + if document_attachments: + payload["messages"][0]["documents"] = document_attachments resp = send_with_fallback( args.host, payload, @@ -965,6 +1283,8 @@ def process_single(args) -> None: "Images in payload: %d (including possible video frames)", len(images_b64), ) + if document_attachments: + payload["messages"][0]["documents"] = document_attachments # Send (with fallback & tracing) response = send_with_fallback( @@ -1071,9 +1391,9 @@ def main(): help="Max chars per text file (before truncation)", ) parser.add_argument( - "--no-extract", + "--extract-text", action="store_true", - help="Don't attempt text extraction for PDF/DOCX", + help="Use local text extraction tools for PDF/DOCX", ) parser.add_argument( "--stream", action="store_true", help="Stream response as server-sent events" diff --git a/tests/unit/test_text_extraction_flag.py b/tests/unit/test_text_extraction_flag.py new file mode 100644 index 0000000..6463a60 --- /dev/null +++ b/tests/unit/test_text_extraction_flag.py @@ -0,0 +1,75 @@ +import sys + +from clair import main + + +def test_pdf_default_skips_local_extraction(monkeypatch, tmp_path): + pdf_path = tmp_path / "sample.pdf" + pdf_path.write_bytes(b"%PDF-1.4\n1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF") + + def fake_extract(path): + raise AssertionError("local extraction should be disabled by default") + + def fake_preview(blob): + return 
["cHJldmlldy1pbWFnZQ=="], 4 + + captured = {} + + def fake_send(host, payload, images_present, user_content, stream): + captured["user_content"] = user_content + captured["payload"] = payload + return "" + + monkeypatch.setattr("clair.try_extract_pdf_text", fake_extract) + monkeypatch.setattr("clair.convert_pdf_blob_to_image_previews", fake_preview) + monkeypatch.setattr("clair.send_with_fallback", fake_send) + monkeypatch.setattr( + sys, + "argv", + ["prog", "-p", "hi", "-f", str(pdf_path)], + ) + + main() + + assert "hi" in captured["user_content"] + assert "sample.pdf" in captured["user_content"] + message = captured["payload"]["messages"][0] + images = message["images"] + assert images == ["cHJldmlldy1pbWFnZQ=="] + assert "documents" not in message + + +def test_pdf_extract_text_flag_enables_local_tools(monkeypatch, tmp_path): + pdf_path = tmp_path / "sample.pdf" + pdf_path.write_bytes(b"%PDF-1.4\n1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF") + + calls = {"extract": False} + captured = {} + + def fake_extract(path): + calls["extract"] = True + return "EXTRACTED CONTENT" + + def fail_preview(blob): + raise AssertionError("preview should not be generated when --extract-text is set") + + def fake_send(host, payload, images_present, user_content, stream): + assert "EXTRACTED CONTENT" in user_content + captured["payload"] = payload + return "" + + monkeypatch.setattr("clair.try_extract_pdf_text", fake_extract) + monkeypatch.setattr("clair.convert_pdf_blob_to_image_previews", fail_preview) + monkeypatch.setattr("clair.send_with_fallback", fake_send) + monkeypatch.setattr( + sys, + "argv", + ["prog", "-p", "hi", "-f", str(pdf_path), "--extract-text"], + ) + + main() + + assert calls["extract"] is True + message = captured["payload"]["messages"][0] + assert "images" not in message + assert "documents" not in message