import unittest

import cluster
import pdfplumber
import debug


def _first_char_box(column):
    """Build the box dict handed to ``debug.draw_boxes`` for one result item.

    The box copies the coordinates of the item's first character
    (``column.chars[0]``) and is labelled with the item's ``n`` attribute.
    """
    first_char = column.chars[0]
    return {
        "x0": first_char["x0"],
        "x1": first_char["x1"],
        "y0": first_char["y0"],
        "y1": first_char["y1"],
        "debug_label": column.n,
    }


class TestChars(unittest.TestCase):
    """Smoke test for ``cluster.Chars.divide_into_columns`` on sample PDFs."""

    # Fixture documents the test iterates over, relative to the working dir.
    SAMPLE_PDFS = (
        "./testdata/1-column_half-image.pdf",
        "./testdata/2-column_2-row.pdf",
        "./testdata/2-column_fancy-font.pdf",
        "./testdata/2-column_happy.pdf",
        "./testdata/2-column_non-interrupting-image.pdf",
    )

    def test_divide_into_columns(self):
        """Run ``divide_into_columns`` on every page of every sample PDF.

        For each page the result is passed to ``debug.draw_boxes`` as one box
        per returned item. Each file and each page runs in its own
        ``subTest`` so that an exception is reported together with the
        offending path/page and does not prevent the remaining fixtures from
        being exercised.

        NOTE(review): nothing is asserted about the returned items; the test
        only fails when an exception is raised. Consider adding expectations
        (e.g. the number of columns per fixture) once they are known.
        """
        for path in self.SAMPLE_PDFS:
            # The outer subTest also covers failures while opening the file.
            with self.subTest(path=path), pdfplumber.open(path) as pdf:
                for page_index, page in enumerate(pdf.pages):
                    with self.subTest(page=page_index):
                        columns = cluster.Chars(
                            page.chars, page
                        ).divide_into_columns()
                        print(path)
                        debug.draw_boxes(
                            page,
                            [_first_char_box(column) for column in columns],
                        )


if __name__ == "__main__":
    unittest.main()