Improve the performance of table-extraction by judging whether to do … #4797

monchin · 2025-11-20T01:57:39Z

…"make_chars" or "make_edges" by checking strategy

Motivation

Hello, I used viztracer to do some table-extraction tests, and found that make_chars and make_edges would always be run whatever the strategy is, which would waste a lot time. So I change the code to run make_chars only if the strategies include "text" and run make_edges only if the strategies include "lines"/"lines_strict"

Below is my test code

from pathlib import Path

p = Path("my-test-pdf.pdf")

import pymupdf
from pymupdf import Document
from pymupdf.table import *
from viztracer import VizTracer



def new_find_tables(
    page,
    clip=None,
    vertical_strategy: str = "lines",
    horizontal_strategy: str = "lines",
    vertical_lines: list = None,
    horizontal_lines: list = None,
    snap_tolerance: float = DEFAULT_SNAP_TOLERANCE,
    snap_x_tolerance: float = None,
    snap_y_tolerance: float = None,
    join_tolerance: float = DEFAULT_JOIN_TOLERANCE,
    join_x_tolerance: float = None,
    join_y_tolerance: float = None,
    edge_min_length: float = 3,
    min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL,
    min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL,
    intersection_tolerance: float = 3,
    intersection_x_tolerance: float = None,
    intersection_y_tolerance: float = None,
    text_tolerance=3,
    text_x_tolerance=3,
    text_y_tolerance=3,
    strategy=None,  # offer abbreviation
    add_lines=None,  # user-specified lines
    add_boxes=None,  # user-specified rectangles
    paths=None,  # accept vector graphics as parameter
):
    pymupdf._warn_layout_once()
    global CHARS, EDGES
    CHARS = []
    EDGES = []
    old_small = bool(pymupdf.TOOLS.set_small_glyph_heights())  # save old value
    pymupdf.TOOLS.set_small_glyph_heights(True)  # we need minimum bboxes
    if page.rotation != 0:
        page, old_xref, old_rot, old_mediabox = page_rotation_set0(page)
    else:
        old_xref, old_rot, old_mediabox = None, None, None

    if snap_x_tolerance is None:
        snap_x_tolerance = UNSET
    if snap_y_tolerance is None:
        snap_y_tolerance = UNSET
    if join_x_tolerance is None:
        join_x_tolerance = UNSET
    if join_y_tolerance is None:
        join_y_tolerance = UNSET
    if intersection_x_tolerance is None:
        intersection_x_tolerance = UNSET
    if intersection_y_tolerance is None:
        intersection_y_tolerance = UNSET
    if strategy is not None:
        vertical_strategy = strategy
        horizontal_strategy = strategy

    settings = {
        "vertical_strategy": vertical_strategy,
        "horizontal_strategy": horizontal_strategy,
        "explicit_vertical_lines": vertical_lines,
        "explicit_horizontal_lines": horizontal_lines,
        "snap_tolerance": snap_tolerance,
        "snap_x_tolerance": snap_x_tolerance,
        "snap_y_tolerance": snap_y_tolerance,
        "join_tolerance": join_tolerance,
        "join_x_tolerance": join_x_tolerance,
        "join_y_tolerance": join_y_tolerance,
        "edge_min_length": edge_min_length,
        "min_words_vertical": min_words_vertical,
        "min_words_horizontal": min_words_horizontal,
        "intersection_tolerance": intersection_tolerance,
        "intersection_x_tolerance": intersection_x_tolerance,
        "intersection_y_tolerance": intersection_y_tolerance,
        "text_tolerance": text_tolerance,
        "text_x_tolerance": text_x_tolerance,
        "text_y_tolerance": text_y_tolerance,
    }

    old_quad_corrections = pymupdf.TOOLS.unset_quad_corrections()
    try:
        page.get_layout()
        if page.layout_information:
            pymupdf.TOOLS.unset_quad_corrections(True)
            boxes = [
                pymupdf.Rect(b[:4]) for b in page.layout_information if b[-1] == "table"
            ]
        else:
            boxes = []

        if boxes:  # layout did find some tables
            pass
        elif page.layout_information is not None:
            # layout was executed but found no tables
            # make sure we exit quickly with an empty TableFinder
            tbf = TableFinder(page)
            return tbf

        tset = TableSettings.resolve(settings=settings)
        page.table_settings = tset

        if tset.vertical_strategy == "text" or tset.horizontal_strategy == "text":
            make_chars(page, clip=clip)  # create character list of page
        if tset.vertical_strategy.startswith("lines") or tset.horizontal_strategy.startswith(
            "lines"
        ):
            make_edges(
                page,
                clip=clip,
                tset=tset,
                paths=paths,
                add_lines=add_lines,
                add_boxes=add_boxes,
            )  # create lines and curves

        tbf = TableFinder(page, settings=tset)

        if boxes:
            # only keep Finder tables that match a layout box
            tbf.tables = [
                tab
                for tab in tbf.tables
                if any(_iou(tab.bbox, r) >= 0.6 for r in boxes)
            ]
        # build the complementary list of layout table boxes
        my_boxes = [
            r for r in boxes if all(_iou(r, tab.bbox) < 0.6 for tab in tbf.tables)
        ]
        if my_boxes:
            word_rects = [pymupdf.Rect(w[:4]) for w in TEXTPAGE.extractWORDS()]
            tp2 = page.get_textpage(flags=TABLE_DETECTOR_FLAGS)
        for rect in my_boxes:
            cells = make_table_from_bbox(tp2, word_rects, rect)  # pylint: disable=E0606
            tbf.tables.append(Table(page, cells))
    except Exception as e:
        pymupdf.message("find_tables: exception occurred: %s" % str(e))
        return None
    finally:
        pymupdf.TOOLS.set_small_glyph_heights(old_small)
        if old_xref is not None:
            page = page_rotation_reset(page, old_xref, old_rot, old_mediabox)
        pymupdf.TOOLS.unset_quad_corrections(old_quad_corrections)

    return tbf


def test():
    with Document(p) as doc:
        page = doc[0]
        with VizTracer(output_file="result.json"):
            old_tab_lines = page.find_tables()
            new_tab_lines = new_find_tables(page)
            
            old_tab_text = page.find_tables(vertical_strategy="text", horizontal_strategy="text")
            new_tab_text = new_find_tables(page, vertical_strategy="text", horizontal_strategy="text")

    
    assert len(old_tab_lines.tables) == 1
    assert old_tab_lines.cells == new_tab_lines.cells

    assert len(old_tab_text.tables) == 1
    assert old_tab_text.cells == new_tab_text.cells



if __name__ == "__main__":
    test()

From the result.json (checked by "vizviewer result.json"), we can see the performance improves a lot when "lines" and "text" strategies do not appear simultaneously.

"result.json" is too large to upload, so I paste a picture here. Old find_tables costs about 190ms and new find_tables costs about 101ms when both using default "lines" strategy.

…"make_chars" or "make_edges" by checking strategy

github-actions · 2025-11-20T01:57:51Z

All contributors have signed the CLA ✍️ ✅
_{Posted by the CLA Assistant Lite bot.}

monchin · 2025-11-20T02:10:31Z

I have read the CLA Document and I hereby sign the CLA

monchin · 2025-11-20T02:12:45Z

recheck

Basilakis · 2025-11-24T19:55:54Z

Updates? this is much needed

monchin · 2025-11-25T06:17:27Z

Hello @JorjMcKie @julian-smith-artifex-com would you please kindly look at this PR and give some comments and suggestions?

monchin · 2025-11-25T09:32:54Z

Hello @JorjMcKie @julian-smith-artifex-com would you please kindly look at this PR and give some comments and suggestions?

I've found that the comparation must be done when both functions in the "pymupdf.tables.py". At first I thought I could accelerate the program without changing the package by writing a new function in my own code like my test code above, but the speed was slowed down instead. But if I change the function directly in the "table.py", and this time the speed does be fast a lot. So please have a look at this PR, thanks!

In the source code, we can see that make_chars changes the CHARS list and make_edges changes the EDGES list, but if the strategies are both "lines"/"lines_strict", CHARS is not needed; and if the strategies are both "text", EDGES is not needed. Of course, I found that CHARS is needed in Table.extract(), so I add one more global variable CHARS_MADE to judge whether make_chars needs to be run.

JorjMcKie · 2025-12-14T15:16:44Z

Sorry for the delay looking into this, and thanks again for considering ways for improvement.
But I don't think that we need this PR to be honest.

The fact that CHARS are needed for extracting cell text is enough reason to always extract it. Nobody in reality cares about a table when not intending to actually extract its text.

In addition, I dislike adding another global variable (I regret to have started doing so anyway). Far for more logical would be to check whether the list is still empty. Apart from all that, the CHARS array is also needed when identifying the table header.

A similar thought applies to the edges:
The "text" options for detection in general are "emergency" parameters: used only when no gridlines are available. And we know that in this case we are pretty lost ... whatever we specify. The detection logic here is far from being satisfactory.

This means that people won't use "text" - unless there are no edges. In which case trying to extract edges won't cost time anyway.

monchin · 2025-12-15T02:42:58Z

Sorry for the delay looking into this, and thanks again for considering ways for improvement. But I don't think that we need this PR to be honest.

The fact that CHARS are needed for extracting cell text is enough reason to always extract it. Nobody in reality cares about a table when not intending to actually extract its text.

In addition, I dislike adding another global variable (I regret to have started doing so anyway). Far for more logical would be to check whether the list is still empty. Apart from all that, the CHARS array is also needed when identifying the table header.

A similar thought applies to the edges: The "text" options for detection in general are "emergency" parameters: used only when no gridlines are available. And we know that in this case we are pretty lost ... whatever we specify. The detection logic here is far from being satisfactory.

This means that people won't use "text" - unless there are no edges. In which case trying to extract edges won't cost time anyway.

Thank you for your kind reply! In fact I need to generate bookmarks from pdf contents, and for some cases it's hard to judge whether 2 lines of contents should be merged as they may have the same size and boldness, so I extract all the table cells' bboxes, if the 2 lines are in the same cell, they should be merged, otherwise they shouldn't, and I don't need text in this scenario. But I know that my request is rare. Anyway, thank you for reviewing this PR. I'll close it.

JorjMcKie · 2025-12-15T12:09:05Z

Thanks a lot for your understanding!
I hope you will continue to suggest improvements where you see potential. We would welcome this very much.

Improve the performance of table-extraction by judging whether to do …

95f590d

…"make_chars" or "make_edges" by checking strategy

Call make_chars before Table.extract if it is not called before

4aebd33

github-actions bot added a commit that referenced this pull request Nov 26, 2025

@monchin has signed the CLA in #4797

952aaad

fix: error on argument passing

cca0681

monchin closed this Dec 15, 2025

github-actions bot locked and limited conversation to collaborators Dec 15, 2025

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Improve the performance of table-extraction by judging whether to do … #4797

Improve the performance of table-extraction by judging whether to do … #4797

monchin commented Nov 20, 2025

Uh oh!

github-actions bot commented Nov 20, 2025 •

edited

Loading

Uh oh!

monchin commented Nov 20, 2025

Uh oh!

monchin commented Nov 20, 2025

Uh oh!

Basilakis commented Nov 24, 2025

Uh oh!

monchin commented Nov 25, 2025

Uh oh!

monchin commented Nov 25, 2025 •

edited

Loading

Uh oh!

JorjMcKie commented Dec 14, 2025

Uh oh!

monchin commented Dec 15, 2025

Uh oh!

JorjMcKie commented Dec 15, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

Improve the performance of table-extraction by judging whether to do … #4797

Improve the performance of table-extraction by judging whether to do … #4797

Conversation

monchin commented Nov 20, 2025

Motivation

Uh oh!

github-actions bot commented Nov 20, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

monchin commented Nov 20, 2025

Uh oh!

monchin commented Nov 20, 2025

Uh oh!

Basilakis commented Nov 24, 2025

Uh oh!

monchin commented Nov 25, 2025

Uh oh!

monchin commented Nov 25, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

JorjMcKie commented Dec 14, 2025

Uh oh!

monchin commented Dec 15, 2025

Uh oh!

JorjMcKie commented Dec 15, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

github-actions bot commented Nov 20, 2025 •

edited

Loading

monchin commented Nov 25, 2025 •

edited

Loading