master
Bel LaPointe 2023-02-19 08:46:26 -07:00
parent 1d73f84172
commit 0c1fee915a
1 changed files with 7 additions and 17 deletions

24
main.py
View File

@ -4,6 +4,7 @@ import time
import subprocess
# Runtime switches, all read from environment variables with string defaults.
# DEBUG: any non-empty value turns on the debug drawing branches (e.g. in crop).
DEBUG = os.environ.get("DEBUG", "")
# DEBUG_NO_SHOW: any non-empty value makes debug_show save the image but skip
# launching the preview command.
DEBUG_NO_SHOW = os.environ.get("DEBUG_NO_SHOW", "")
# INPUT: path of the PDF to process; falls back to the bundled test document.
INPUT = os.environ.get("INPUT", "./testdata/input.pdf")
def main():
@ -14,24 +15,13 @@ def main():
print(splitpage.extract_text())
print("/main")
def crop(page, x0, y0, x1, y1):
    """Return ``page.crop((x0, y0, x1, y1))``.

    When the module-level ``DEBUG`` flag is truthy, the four edges of the
    requested rectangle are first drawn on a debug image of ``page`` (stroke
    width 5) and that image is handed to ``debug_show``.

    Args:
        page: Object providing ``crop(bbox)``; presumably a pdfplumber page
            (verify against the caller).
        x0, y0: One corner of the bounding box.
        x1, y1: The opposite corner of the bounding box.

    Returns:
        Whatever ``page.crop`` returns for the 4-tuple ``(x0, y0, x1, y1)``.
    """
    if DEBUG:
        im = debug_im(page)
        # Walk the rectangle corner by corner; pairing each corner with the
        # next one (wrapping around to the first) yields the four edges.
        corners = [(x0, y0), (x0, y1), (x1, y1), (x1, y0)]
        edges = list(zip(corners, corners[1:] + corners[:1]))
        im.draw_lines(edges, stroke_width=5)
        debug_show(im)
    bbox = (x0, y0, x1, y1)
    return page.crop(bbox)
def debug_im(page):
    """Return the image produced by ``page.to_image(height=800)``.

    Used by the debug branches as the canvas to draw on before the image is
    passed to ``debug_show``.
    """
    canvas = page.to_image(height=800)
    return canvas
def debug_show(im):
    """Save ``im`` to ``/tmp/out.jpg`` and pass a ``qlmanage -p`` command to ``go``.

    ``go`` is defined elsewhere in this file; presumably it runs the given
    shell command (confirm there). ``qlmanage -p`` is the macOS Quick Look
    preview tool; its output is redirected to ``/dev/null``.
    """
    out_path = "/tmp/out.jpg"
    im.save(out_path)
    go("qlmanage -p " + out_path + " &> /dev/null")
def debug_show(im, name=None):
    """Save ``im`` as a JPEG under ``/tmp`` and optionally preview it.

    The file is ``/tmp/dnd-pdf-to-txt.jpg`` when ``name`` is falsy, otherwise
    ``/tmp/dnd-pdf-to-txt-<name>.jpg``, so differently named debug images do
    not overwrite each other.

    Args:
        im: Image object providing ``save(path)``.
        name: Optional string appended to the file name after a dash.

    The preview command (``qlmanage -p``, the macOS Quick Look tool) is passed
    to ``go`` only when the module-level ``DEBUG_NO_SHOW`` flag is falsy.
    ``go`` is defined elsewhere in this file; presumably it runs the shell
    command (confirm there).
    """
    # Build the target path once so the saved file and the previewed file
    # can never drift apart.
    suffix = '-' + name if name else ''
    out_path = f"/tmp/dnd-pdf-to-txt{suffix}.jpg"
    im.save(out_path)
    if DEBUG_NO_SHOW:
        return
    go(f"qlmanage -p {out_path} &> /dev/null")
def v_split(page):
clusters = cluster(page)
@ -51,7 +41,7 @@ def v_split(page):
im = debug_im(page)
for x_cluster in x_clusters:
im.draw_line(((x_cluster, 0), (x_cluster, page.height)))
debug_show(im)
debug_show(im, name=f'v-split-xclusters-{page.page_number}')
if len(x_clusters) != 2:
return [page]
x = sum(x_clusters) / len(x_clusters)
@ -115,7 +105,7 @@ def cluster(page):
((i.x1, page.height-i.y1), (i.x1, page.height-i.y0)),
((i.x1, page.height-i.y0), (i.x0, page.height-i.y0)),
], stroke_width=5)
debug_show(im)
debug_show(im, name=f'cluster-{page.page_number}')
return clusters
# Module-level list, empty at import time.
# NOTE(review): the name suggests it collects processes started by the shell
# helper; the code that uses it is outside this view — confirm before relying on it.
__subprocesses__ = []