import unittest

import cluster
import pdfplumber
import debug


class TestChars(unittest.TestCase):
    """Visual smoke test for ``cluster.Chars.divide_into_columns``."""

    def test_divide_into_columns(self):
        """Run ``divide_into_columns`` on every enabled sample PDF and draw the output.

        No assertions are made.  For each page the PDF path is printed and the
        first char of every returned item is drawn on the source page; then,
        item by item, the item's page height/width and ``_box()`` are printed
        and the same box is drawn on the item's own page.
        """
        # Commented-out entries are kept so they can be toggled back on quickly.
        sample_paths = [
            #"./testdata/2-column_2-row.pdf",
            #"./testdata/1-column_half-image.pdf",
            #"./testdata/2-column_fancy-font.pdf",
            #"./testdata/2-column_happy.pdf",
            "./testdata/2-column_non-interrupting-image.pdf",
        ]

        def first_char_box(item):
            """Return the debug box for the first char of *item*, labelled ``item.n``."""
            # NOTE(review): the char boxes use the chars' "y0"/"y1" keys, while the
            # disabled word snippet below maps "top"/"bottom" to y0/y1 -- confirm
            # which vertical convention debug.draw_boxes expects.
            head = item.chars[0]
            return {
                "x0": head["x0"],
                "x1": head["x1"],
                "y0": head["y0"],
                "y1": head["y1"],
                "debug_label": item.n,
            }

        for p in sample_paths:
            with pdfplumber.open(p) as pdf:
                for page in pdf.pages:
                    # Disabled word-level debugging aid: draws every extracted
                    # word's box labelled by its index, then skips the page.
                    #im = debug.debug_im(page)
                    #words = page.extract_words()
                    #debug.draw_boxes(page, [
                    #    {
                    #        "debug_label": i,
                    #        "x0": words[i]["x0"],
                    #        "x1": words[i]["x1"],
                    #        "y0": words[i]["top"],
                    #        "y1": words[i]["bottom"],
                    #    }
                    #    for i in range(len(words))
                    #])
                    #continue

                    chars = cluster.Chars(p, page.chars, page)
                    got = chars.divide_into_columns()
                    print(p)

                    # First pass: all items together on the original page.
                    debug.draw_boxes(page, list(map(first_char_box, got)))

                    # Second pass: each item alone, on the page it carries.
                    for part in got:
                        print(part.page.height, part.page.width, part._box())
                        debug.draw_boxes(part.page, [first_char_box(part)])


if __name__ == "__main__":
    unittest.main()