57 lines
1.9 KiB
Python
57 lines
1.9 KiB
Python
import unittest
|
|
|
|
import cluster
|
|
import pdfplumber
|
|
import debug
|
|
|
|
class TestChars(unittest.TestCase):
    """Visual smoke test for ``cluster.Chars.divide_into_columns``.

    Nothing is asserted: the test prints and draws whatever the call
    returns, so it only fails when something raises.
    """

    def test_divide_into_columns(self):
        """Run the splitter on every page of each enabled fixture PDF."""

        def first_char_box(part):
            # Box around the first character of ``part``, labelled with
            # ``part.n``, in the dict layout handed to ``debug.draw_boxes``.
            first = part.chars[0]
            return {
                "x0": first["x0"],
                "x1": first["x1"],
                "y0": first["y0"],
                "y1": first["y1"],
                "debug_label": part.n,
            }

        fixtures = [
            # Disabled fixtures; uncomment to include them in the run.
            # "./testdata/2-column_2-row.pdf",
            # "./testdata/1-column_half-image.pdf",
            # "./testdata/2-column_fancy-font.pdf",
            # "./testdata/2-column_happy.pdf",
            "./testdata/2-column_non-interrupting-image.pdf",
        ]

        for pdf_path in fixtures:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    # Debugging aid that used to live here (disabled): draw
                    # every entry of ``page.extract_words()`` via
                    # ``debug.draw_boxes`` using its x0/x1/top/bottom, labelled
                    # with its index, then ``continue`` to skip the split.
                    got = cluster.Chars(
                        pdf_path, page.chars, page
                    ).divide_into_columns()
                    print(pdf_path)

                    # First pass: all results drawn together on the source page.
                    debug.draw_boxes(
                        page, [first_char_box(part) for part in got]
                    )

                    # Second pass: each result drawn alone on its own page.
                    for part in got:
                        print(part.page.height, part.page.width, part._box())
                        debug.draw_boxes(part.page, [first_char_box(part)])
|
|
|
|
# Allow running this file directly as a script.
if __name__ == "__main__":
    unittest.main()
|