dnd-pdf-to-txt/test_cluster.py

57 lines
1.9 KiB
Python

import unittest
import cluster
import pdfplumber
import debug
class TestChars(unittest.TestCase):
    """Visual smoke test for ``cluster.Chars.divide_into_columns``.

    The test makes no assertions. It runs the column split on every page of
    each enabled fixture PDF, prints some details about the result and passes
    one box per returned item to ``debug.draw_boxes`` for manual inspection.
    """

    @staticmethod
    def _first_char_box(column):
        """Build a labelled box dict from the first char of ``column``.

        ``column`` is one item returned by ``divide_into_columns``; the box
        coordinates come from ``column.chars[0]`` and the label from
        ``column.n``.
        """
        first_char = column.chars[0]
        return {
            "x0": first_char["x0"],
            "x1": first_char["x1"],
            "y0": first_char["y0"],
            "y1": first_char["y1"],
            "debug_label": column.n,
        }

    def test_divide_into_columns(self):
        """Split each fixture page into columns and draw the results."""
        fixture_paths = [
            # Additional fixtures, currently disabled:
            #"./testdata/2-column_2-row.pdf",
            #"./testdata/1-column_half-image.pdf",
            #"./testdata/2-column_fancy-font.pdf",
            #"./testdata/2-column_happy.pdf",
            "./testdata/2-column_non-interrupting-image.pdf",
        ]
        for pdf_path in fixture_paths:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    # Disabled alternative: draw a box around every extracted
                    # word, labelled with its index, and skip the column split.
                    #im = debug.debug_im(page)
                    #words = page.extract_words()
                    #debug.draw_boxes(page, [
                    #    {
                    #        "debug_label": i,
                    #        "x0": words[i]["x0"],
                    #        "x1": words[i]["x1"],
                    #        "y0": words[i]["top"],
                    #        "y1": words[i]["bottom"],
                    #    }
                    #    for i in range(len(words))
                    #])
                    #continue
                    columns = cluster.Chars(
                        pdf_path, page.chars, page
                    ).divide_into_columns()
                    print(pdf_path)

                    # Overview: one box per returned item, all drawn on the
                    # original page.
                    overview_boxes = [
                        self._first_char_box(column) for column in columns
                    ]
                    debug.draw_boxes(page, overview_boxes)

                    # Detail: each item drawn on its own ``page`` attribute.
                    for column in columns:
                        print(
                            column.page.height,
                            column.page.width,
                            column._box(),
                        )
                        debug.draw_boxes(
                            column.page,
                            [self._first_char_box(column)],
                        )
# Run the tests in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()