import unittest

import cluster
import pdfplumber
import debug


def _word_box(label, word):
    """Build the box dict passed to ``debug.draw_boxes`` for one extracted word.

    The word's ``top``/``bottom`` entries are copied into the ``y0``/``y1``
    keys; ``x0``/``x1`` are copied through unchanged.
    """
    return {
        "debug_label": label,
        "x0": word["x0"],
        "x1": word["x1"],
        "y0": word["top"],
        "y1": word["bottom"],
    }


def _column_box(column):
    """Build the box dict for one column from the first entry of its ``chars``.

    The box is labelled with the column's ``n`` attribute.
    """
    first_char = column.chars[0]
    return {
        "x0": first_char["x0"],
        "x1": first_char["x1"],
        "y0": first_char["y0"],
        "y1": first_char["y1"],
        "debug_label": column.n,
    }


class TestChars(unittest.TestCase):
    """Debug-drawing checks for ``cluster.Chars`` on sample PDF pages."""

    def test_divide_into_columns(self):
        """Draw labelled word boxes for every page of each enabled sample PDF.

        Makes no assertions: it only fails if opening a PDF or one of the
        ``debug`` calls raises.
        """
        # Fixtures are toggled by (un)commenting entries.
        pdf_paths = [
            "./testdata/2-column_2-row.pdf",
            #"./testdata/1-column_half-image.pdf",
            #"./testdata/2-column_fancy-font.pdf",
            #"./testdata/2-column_happy.pdf",
            #"./testdata/2-column_non-interrupting-image.pdf",
        ]
        for pdf_path in pdf_paths:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    # The return value is never read below; the call itself is
                    # kept because it may set up state inside `debug`
                    # (NOTE(review): confirm whether the binding is needed).
                    debug_image = debug.debug_im(page)

                    words = page.extract_words()
                    word_boxes = [
                        _word_box(index, word)
                        for index, word in enumerate(words)
                    ]
                    debug.draw_boxes(page, word_boxes)

                    # NOTE(review): this `continue` makes the rest of the loop
                    # body unreachable, so `divide_into_columns` is never
                    # exercised. It looks like a temporary debugging
                    # short-circuit -- confirm before removing it.
                    continue

                    got = cluster.Chars(page.chars, page).divide_into_columns()
                    print(pdf_path)
                    column_boxes = [_column_box(column) for column in got]
                    debug.draw_boxes(page, column_boxes)


if __name__ == "__main__":
    unittest.main()