diff --git a/debug.py b/debug.py index dcdab81..1498a99 100644 --- a/debug.py +++ b/debug.py @@ -7,14 +7,13 @@ from PIL import ImageFont def draw_boxes(page, boxes): im = debug_im(page) - #for i in boxes: - # continue - # i["y0"] = page.height - i["y0"] - # i["y1"] = page.height - i["y1"] - # #i["x0"] *= im.original.height / page.height - # #i["x1"] *= im.original.height / page.height - # #i["y0"] *= im.original.height / page.height - # #i["y1"] *= im.original.height / page.height + for i in boxes: + i["y0"] = page.height - i["y0"] + i["y1"] = page.height - i["y1"] + #i["x0"] *= im.original.height / page.height + #i["x1"] *= im.original.height / page.height + #i["y0"] *= im.original.height / page.height + #i["y1"] *= im.original.height / page.height for box in boxes: im.draw_line(((box["x0"], box["y0"]), (box["x1"], box["y0"]))) im.draw_line(((box["x1"], box["y0"]), (box["x1"], box["y1"]))) diff --git a/test_cluster.py b/test_cluster.py index 8a92aaf..c7ba767 100644 --- a/test_cluster.py +++ b/test_cluster.py @@ -8,26 +8,26 @@ class TestChars(unittest.TestCase): def test_divide_into_columns(self): for p in [ "./testdata/2-column_2-row.pdf", - #"./testdata/1-column_half-image.pdf", - #"./testdata/2-column_fancy-font.pdf", - #"./testdata/2-column_happy.pdf", - #"./testdata/2-column_non-interrupting-image.pdf", + "./testdata/1-column_half-image.pdf", + "./testdata/2-column_fancy-font.pdf", + "./testdata/2-column_happy.pdf", + "./testdata/2-column_non-interrupting-image.pdf", ]: with pdfplumber.open(p) as pdf: for page in pdf.pages: - im = debug.debug_im(page) - words = page.extract_words() - debug.draw_boxes(page, [ - { - "debug_label": i, - "x0": words[i]["x0"], - "x1": words[i]["x1"], - "y0": words[i]["top"], - "y1": words[i]["bottom"], - } - for i in range(len(words)) - ]) - continue + #im = debug.debug_im(page) + #words = page.extract_words() + #debug.draw_boxes(page, [ + # { + # "debug_label": i, + # "x0": words[i]["x0"], + # "x1": words[i]["x1"], + # "y0": words[i]["top"], + # "y1": words[i]["bottom"], + # } + # for i in range(len(words)) + #]) + #continue got = cluster.Chars(page.chars, page).divide_into_columns() print(p) debug.draw_boxes(page, [ @@ -41,6 +41,5 @@ class TestChars(unittest.TestCase): for i in got ]) - if __name__ == "__main__": unittest.main()