improved but now i need to omit headers, footers in a safer way
parent
def34e1536
commit
43ee3d53d2
15
debug.py
15
debug.py
|
|
@ -7,14 +7,13 @@ from PIL import ImageFont
|
||||||
|
|
||||||
def draw_boxes(page, boxes):
|
def draw_boxes(page, boxes):
|
||||||
im = debug_im(page)
|
im = debug_im(page)
|
||||||
#for i in boxes:
|
for i in boxes:
|
||||||
# continue
|
i["y0"] = page.height - i["y0"]
|
||||||
# i["y0"] = page.height - i["y0"]
|
i["y1"] = page.height - i["y1"]
|
||||||
# i["y1"] = page.height - i["y1"]
|
#i["x0"] *= im.original.height / page.height
|
||||||
# #i["x0"] *= im.original.height / page.height
|
#i["x1"] *= im.original.height / page.height
|
||||||
# #i["x1"] *= im.original.height / page.height
|
#i["y0"] *= im.original.height / page.height
|
||||||
# #i["y0"] *= im.original.height / page.height
|
#i["y1"] *= im.original.height / page.height
|
||||||
# #i["y1"] *= im.original.height / page.height
|
|
||||||
for box in boxes:
|
for box in boxes:
|
||||||
im.draw_line(((box["x0"], box["y0"]), (box["x1"], box["y0"])))
|
im.draw_line(((box["x0"], box["y0"]), (box["x1"], box["y0"])))
|
||||||
im.draw_line(((box["x1"], box["y0"]), (box["x1"], box["y1"])))
|
im.draw_line(((box["x1"], box["y0"]), (box["x1"], box["y1"])))
|
||||||
|
|
|
||||||
|
|
@ -8,26 +8,26 @@ class TestChars(unittest.TestCase):
|
||||||
def test_divide_into_columns(self):
|
def test_divide_into_columns(self):
|
||||||
for p in [
|
for p in [
|
||||||
"./testdata/2-column_2-row.pdf",
|
"./testdata/2-column_2-row.pdf",
|
||||||
#"./testdata/1-column_half-image.pdf",
|
"./testdata/1-column_half-image.pdf",
|
||||||
#"./testdata/2-column_fancy-font.pdf",
|
"./testdata/2-column_fancy-font.pdf",
|
||||||
#"./testdata/2-column_happy.pdf",
|
"./testdata/2-column_happy.pdf",
|
||||||
#"./testdata/2-column_non-interrupting-image.pdf",
|
"./testdata/2-column_non-interrupting-image.pdf",
|
||||||
]:
|
]:
|
||||||
with pdfplumber.open(p) as pdf:
|
with pdfplumber.open(p) as pdf:
|
||||||
for page in pdf.pages:
|
for page in pdf.pages:
|
||||||
im = debug.debug_im(page)
|
#im = debug.debug_im(page)
|
||||||
words = page.extract_words()
|
#words = page.extract_words()
|
||||||
debug.draw_boxes(page, [
|
#debug.draw_boxes(page, [
|
||||||
{
|
# {
|
||||||
"debug_label": i,
|
# "debug_label": i,
|
||||||
"x0": words[i]["x0"],
|
# "x0": words[i]["x0"],
|
||||||
"x1": words[i]["x1"],
|
# "x1": words[i]["x1"],
|
||||||
"y0": words[i]["top"],
|
# "y0": words[i]["top"],
|
||||||
"y1": words[i]["bottom"],
|
# "y1": words[i]["bottom"],
|
||||||
}
|
# }
|
||||||
for i in range(len(words))
|
# for i in range(len(words))
|
||||||
])
|
#])
|
||||||
continue
|
#continue
|
||||||
got = cluster.Chars(page.chars, page).divide_into_columns()
|
got = cluster.Chars(page.chars, page).divide_into_columns()
|
||||||
print(p)
|
print(p)
|
||||||
debug.draw_boxes(page, [
|
debug.draw_boxes(page, [
|
||||||
|
|
@ -41,6 +41,5 @@ class TestChars(unittest.TestCase):
|
||||||
for i in got
|
for i in got
|
||||||
])
|
])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue