master
Bel LaPointe 2023-02-19 08:43:03 -07:00
parent c67ec83de9
commit 1d73f84172
1 changed file with 10 additions and 5 deletions

15
main.py
View File

@ -9,7 +9,7 @@ INPUT = os.environ.get("INPUT", "./testdata/input.pdf")
def main():
    """Print the extracted text of every page of the input PDF.

    Opens the file named by ``INPUT`` with pdfplumber, passes each page
    through ``v_split`` and prints the text of every part it returns.
    The literal markers ``main`` and ``/main`` are printed before and
    after the run.
    """
    print("main")
    with pdfplumber.open(INPUT) as pdf:
        # Whole-document slice; this replaces the earlier single-page
        # slice ``pdf.pages[4:5]``.
        for page in pdf.pages[:]:
            for splitpage in v_split(page):
                print(splitpage.extract_text())
    print("/main")
@ -35,8 +35,8 @@ def debug_show(im):
def v_split(page):
clusters = cluster(page)
points = [i.x0 for i in clusters]
points += [i.x1 for i in clusters]
points = [i.x0 for i in clusters if i.x0 > page.width//4]
points += [i.x1 for i in clusters if i.x1 < 3*page.width//4]
x_clusters = []
for point in points:
merged = False
@ -47,9 +47,14 @@ def v_split(page):
if not merged:
x_clusters.append(point)
x_clusters = sorted(x_clusters)
if len(x_clusters) != 4:
if DEBUG:
im = debug_im(page)
for x_cluster in x_clusters:
im.draw_line(((x_cluster, 0), (x_cluster, page.height)))
debug_show(im)
if len(x_clusters) != 2:
return [page]
x = (x_clusters[2] + x_clusters[1]) / 2
x = sum(x_clusters) / len(x_clusters)
result = [page.within_bbox((0, 0, x, page.height)), page.within_bbox((x, 0, page.width, page.height))]
if DEBUG:
for page in result: