Improved, but now I need to omit headers and footers in a safer way
parent
def34e1536
commit
43ee3d53d2
15
debug.py
15
debug.py
|
|
@ -7,14 +7,13 @@ from PIL import ImageFont
|
|||
|
||||
def draw_boxes(page, boxes):
|
||||
im = debug_im(page)
|
||||
#for i in boxes:
|
||||
# continue
|
||||
# i["y0"] = page.height - i["y0"]
|
||||
# i["y1"] = page.height - i["y1"]
|
||||
# #i["x0"] *= im.original.height / page.height
|
||||
# #i["x1"] *= im.original.height / page.height
|
||||
# #i["y0"] *= im.original.height / page.height
|
||||
# #i["y1"] *= im.original.height / page.height
|
||||
for i in boxes:
|
||||
i["y0"] = page.height - i["y0"]
|
||||
i["y1"] = page.height - i["y1"]
|
||||
#i["x0"] *= im.original.height / page.height
|
||||
#i["x1"] *= im.original.height / page.height
|
||||
#i["y0"] *= im.original.height / page.height
|
||||
#i["y1"] *= im.original.height / page.height
|
||||
for box in boxes:
|
||||
im.draw_line(((box["x0"], box["y0"]), (box["x1"], box["y0"])))
|
||||
im.draw_line(((box["x1"], box["y0"]), (box["x1"], box["y1"])))
|
||||
|
|
|
|||
|
|
@ -8,26 +8,26 @@ class TestChars(unittest.TestCase):
|
|||
def test_divide_into_columns(self):
|
||||
for p in [
|
||||
"./testdata/2-column_2-row.pdf",
|
||||
#"./testdata/1-column_half-image.pdf",
|
||||
#"./testdata/2-column_fancy-font.pdf",
|
||||
#"./testdata/2-column_happy.pdf",
|
||||
#"./testdata/2-column_non-interrupting-image.pdf",
|
||||
"./testdata/1-column_half-image.pdf",
|
||||
"./testdata/2-column_fancy-font.pdf",
|
||||
"./testdata/2-column_happy.pdf",
|
||||
"./testdata/2-column_non-interrupting-image.pdf",
|
||||
]:
|
||||
with pdfplumber.open(p) as pdf:
|
||||
for page in pdf.pages:
|
||||
im = debug.debug_im(page)
|
||||
words = page.extract_words()
|
||||
debug.draw_boxes(page, [
|
||||
{
|
||||
"debug_label": i,
|
||||
"x0": words[i]["x0"],
|
||||
"x1": words[i]["x1"],
|
||||
"y0": words[i]["top"],
|
||||
"y1": words[i]["bottom"],
|
||||
}
|
||||
for i in range(len(words))
|
||||
])
|
||||
continue
|
||||
#im = debug.debug_im(page)
|
||||
#words = page.extract_words()
|
||||
#debug.draw_boxes(page, [
|
||||
# {
|
||||
# "debug_label": i,
|
||||
# "x0": words[i]["x0"],
|
||||
# "x1": words[i]["x1"],
|
||||
# "y0": words[i]["top"],
|
||||
# "y1": words[i]["bottom"],
|
||||
# }
|
||||
# for i in range(len(words))
|
||||
#])
|
||||
#continue
|
||||
got = cluster.Chars(page.chars, page).divide_into_columns()
|
||||
print(p)
|
||||
debug.draw_boxes(page, [
|
||||
|
|
@ -41,6 +41,5 @@ class TestChars(unittest.TestCase):
|
|||
for i in got
|
||||
])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
|
|||
Loading…
Reference in New Issue