Improved, but now I need to omit headers and footers in a safer way

master
Bel LaPointe 2023-02-20 09:01:30 -07:00
parent def34e1536
commit 43ee3d53d2
2 changed files with 24 additions and 26 deletions

View File

@@ -7,14 +7,13 @@ from PIL import ImageFont
def draw_boxes(page, boxes): def draw_boxes(page, boxes):
im = debug_im(page) im = debug_im(page)
#for i in boxes: for i in boxes:
# continue i["y0"] = page.height - i["y0"]
# i["y0"] = page.height - i["y0"] i["y1"] = page.height - i["y1"]
# i["y1"] = page.height - i["y1"] #i["x0"] *= im.original.height / page.height
# #i["x0"] *= im.original.height / page.height #i["x1"] *= im.original.height / page.height
# #i["x1"] *= im.original.height / page.height #i["y0"] *= im.original.height / page.height
# #i["y0"] *= im.original.height / page.height #i["y1"] *= im.original.height / page.height
# #i["y1"] *= im.original.height / page.height
for box in boxes: for box in boxes:
im.draw_line(((box["x0"], box["y0"]), (box["x1"], box["y0"]))) im.draw_line(((box["x0"], box["y0"]), (box["x1"], box["y0"])))
im.draw_line(((box["x1"], box["y0"]), (box["x1"], box["y1"]))) im.draw_line(((box["x1"], box["y0"]), (box["x1"], box["y1"])))

View File

@@ -8,26 +8,26 @@ class TestChars(unittest.TestCase):
def test_divide_into_columns(self): def test_divide_into_columns(self):
for p in [ for p in [
"./testdata/2-column_2-row.pdf", "./testdata/2-column_2-row.pdf",
#"./testdata/1-column_half-image.pdf", "./testdata/1-column_half-image.pdf",
#"./testdata/2-column_fancy-font.pdf", "./testdata/2-column_fancy-font.pdf",
#"./testdata/2-column_happy.pdf", "./testdata/2-column_happy.pdf",
#"./testdata/2-column_non-interrupting-image.pdf", "./testdata/2-column_non-interrupting-image.pdf",
]: ]:
with pdfplumber.open(p) as pdf: with pdfplumber.open(p) as pdf:
for page in pdf.pages: for page in pdf.pages:
im = debug.debug_im(page) #im = debug.debug_im(page)
words = page.extract_words() #words = page.extract_words()
debug.draw_boxes(page, [ #debug.draw_boxes(page, [
{ # {
"debug_label": i, # "debug_label": i,
"x0": words[i]["x0"], # "x0": words[i]["x0"],
"x1": words[i]["x1"], # "x1": words[i]["x1"],
"y0": words[i]["top"], # "y0": words[i]["top"],
"y1": words[i]["bottom"], # "y1": words[i]["bottom"],
} # }
for i in range(len(words)) # for i in range(len(words))
]) #])
continue #continue
got = cluster.Chars(page.chars, page).divide_into_columns() got = cluster.Chars(page.chars, page).divide_into_columns()
print(p) print(p)
debug.draw_boxes(page, [ debug.draw_boxes(page, [
@@ -41,6 +41,5 @@ class TestChars(unittest.TestCase):
for i in got for i in got
]) ])
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()