diff --git a/main.py b/main.py index 703ae20..773a67a 100644 --- a/main.py +++ b/main.py @@ -9,7 +9,7 @@ INPUT = os.environ.get("INPUT", "./testdata/input.pdf") def main(): print("main") with pdfplumber.open(INPUT) as pdf: - for page in pdf.pages[4:5]: + for page in pdf.pages[:]: for splitpage in v_split(page): print(splitpage.extract_text()) print("/main") @@ -35,8 +35,8 @@ def debug_show(im): def v_split(page): clusters = cluster(page) - points = [i.x0 for i in clusters] - points += [i.x1 for i in clusters] + points = [i.x0 for i in clusters if i.x0 > page.width//4] + points += [i.x1 for i in clusters if i.x1 < 3*page.width//4] x_clusters = [] for point in points: merged = False @@ -47,9 +47,14 @@ def v_split(page): if not merged: x_clusters.append(point) x_clusters = sorted(x_clusters) - if len(x_clusters) != 4: + if DEBUG: + im = debug_im(page) + for x_cluster in x_clusters: + im.draw_line(((x_cluster, 0), (x_cluster, page.height))) + debug_show(im) + if len(x_clusters) != 2: return [page] - x = (x_clusters[2] + x_clusters[1]) / 2 + x = sum(x_clusters) / len(x_clusters) result = [page.within_bbox((0, 0, x, page.height)), page.within_bbox((x, 0, page.width, page.height))] if DEBUG: for page in result: