"""Print the text of a PDF, splitting a page into left/right halves when two
vertical boundaries between blocks of characters are detected.

Configuration is read from environment variables (see the constants below).
"""

import os
import subprocess
import time

import pdfplumber

# Any non-empty value turns on the debug images written by debug_show().
DEBUG = os.environ.get("DEBUG", "")
# Any non-empty value keeps debug images on disk without launching a viewer.
DEBUG_NO_SHOW = os.environ.get("DEBUG_NO_SHOW", "")
# Path of the PDF to read.
INPUT = os.environ.get("INPUT", "./testdata/input.pdf")

# Two candidate x positions closer than sqrt(this) are treated as one.
_X_MERGE_DIST_SQ = 100
# A cluster must have absorbed more than this many points to be kept.
_MIN_CLUSTER_POINTS = 100


def main():
    """Print the layout-mode text of every (possibly split) page of INPUT."""
    print("main")
    with pdfplumber.open(INPUT) as pdf:
        for page in pdf.pages:
            for splitpage in v_split(page):
                print(splitpage.extract_text(layout=True))
    print("/main")


def debug_im(page):
    """Return a rendered image of *page*, requested at a height of 800."""
    return page.to_image(height=800)


def debug_show(im, name=None):
    """Save *im* under /tmp and, unless DEBUG_NO_SHOW is set, open a viewer.

    Args:
        im: Image object exposing ``save(path)``.
        name: Optional suffix appended to the file name after a dash.
    """
    suffix = f"-{name}" if name else ""
    path = f"/tmp/dnd-pdf-to-txt{suffix}.jpg"
    im.save(path)
    if not DEBUG_NO_SHOW:
        # An argv list plus DEVNULL replaces the former shell string with the
        # bash-only ``&>`` redirection, so it no longer depends on which shell
        # /bin/sh is. (qlmanage is presumably the macOS Quick Look CLI.)
        go(["qlmanage", "-p", path], quiet=True)


def v_split(page):
    """Split *page* into a left and a right half when that looks appropriate.

    Candidate split positions are the left edges of character clusters that
    start right of the first quarter of the page, plus the right edges of
    clusters that end left of the last quarter. Near-identical candidates are
    collapsed. Only when exactly two distinct candidates remain is the page
    cut at their midpoint (presumably the gutter between two columns).

    Returns:
        ``[left, right]`` cropped pages, or ``[page]`` when no split is made.
    """
    clusters = cluster(page)
    points = [c.x0 for c in clusters if c.x0 > page.width // 4]
    points += [c.x1 for c in clusters if c.x1 < 3 * page.width // 4]

    # Keep the first representative of each group of near-identical x values.
    x_clusters = []
    for point in points:
        if not any((x - point) ** 2 < _X_MERGE_DIST_SQ for x in x_clusters):
            x_clusters.append(point)
    x_clusters.sort()

    if DEBUG:
        im = debug_im(page)
        for x_cluster in x_clusters:
            im.draw_line(((x_cluster, 0), (x_cluster, page.height)))
        debug_show(im, name=f"v-split-xclusters-{page.page_number}")

    if len(x_clusters) != 2:
        return [page]

    x = sum(x_clusters) / len(x_clusters)
    result = [
        page.within_bbox((0, 0, x, page.height)),
        page.within_bbox((x, 0, page.width, page.height)),
    ]

    if DEBUG:
        # A separate loop variable keeps the ``page`` argument intact.
        for index, half in enumerate(result):
            debug_show(
                debug_im(half),
                name=f"v-split-postsplit-{half.page_number}_{index}",
            )
    return result


class _Cluster:
    """Axis-aligned box that grows to enclose the points merged into it.

    Attributes are ``x0, y0, x1, y1`` (box corners) and ``len`` (number of
    points absorbed, including the seed point).
    """

    def __init__(self, x, y):
        # The seed box is one unit wide and tall, anchored at the first point.
        self.x0 = x
        self.y0 = y
        self.x1 = x + 1
        self.y1 = y + 1
        self.len = 1

    def merge(self, x, y):
        """Grow the box just enough to contain ``(x, y)`` and count the point."""
        if x < self.x0:
            self.x0 = x
        elif x > self.x1:
            self.x1 = x
        if y < self.y0:
            self.y0 = y
        elif y > self.y1:
            self.y1 = y
        self.len += 1

    def dist(self, x, y):
        """Return the squared distance from ``(x, y)`` to the box (0 if inside)."""
        x_delta = 0
        y_delta = 0
        if x < self.x0:
            x_delta = self.x0 - x
        elif x > self.x1:
            x_delta = x - self.x1
        if y < self.y0:
            y_delta = self.y0 - y
        elif y > self.y1:
            y_delta = y - self.y1
        return x_delta ** 2 + y_delta ** 2

    def __str__(self):
        return f"({int(self.x0)}, {int(self.y0)}, {int(self.x1)}, {int(self.y1)})"


def cluster(page):
    """Greedily group the corner points of ``page.chars`` into boxes.

    Every character contributes its ``(x0, y0)`` and ``(x1, y1)`` corners.
    Each point joins the first existing box whose squared distance is below
    ``(page.width / 50) ** 2``; otherwise it seeds a new box. Boxes that
    absorbed no more than ``_MIN_CLUSTER_POINTS`` points are discarded.

    Returns:
        List of boxes exposing ``x0, y0, x1, y1`` and ``len``.
    """
    chars = page.chars
    # Order matters for the greedy pass: all (x0, y0) corners come first.
    points = [(c["x0"], c["y0"]) for c in chars]
    points += [(c["x1"], c["y1"]) for c in chars]

    threshold = (page.width / 50) ** 2
    clusters = []
    for x, y in points:
        for candidate in clusters:
            if candidate.dist(x, y) < threshold:
                candidate.merge(x, y)
                break
        else:
            clusters.append(_Cluster(x, y))

    clusters = [c for c in clusters if c.len > _MIN_CLUSTER_POINTS]

    if DEBUG:
        im = debug_im(page)
        for c in clusters:
            # y is flipped (page.height - y) before drawing, as in the
            # original; presumably chars' y0/y1 are measured from the bottom
            # of the page while the image origin is top-left -- TODO confirm.
            im.draw_lines(
                [
                    ((c.x0, page.height - c.y0), (c.x0, page.height - c.y1)),
                    ((c.x0, page.height - c.y1), (c.x1, page.height - c.y1)),
                    ((c.x1, page.height - c.y1), (c.x1, page.height - c.y0)),
                    ((c.x1, page.height - c.y0), (c.x0, page.height - c.y0)),
                ],
                stroke_width=5,
            )
        debug_show(im, name=f"cluster-{page.page_number}")
    return clusters


# Child processes started by go(); waited on before the script exits.
__subprocesses__ = []


def go(cmd, *, quiet=False):
    """Start *cmd* in the background and remember it for the final wait.

    Args:
        cmd: A shell command string (run through the shell, as before) or a
            sequence of arguments (run directly, without a shell).
        quiet: When true, discard the child's stdout and stderr.
    """
    sink = subprocess.DEVNULL if quiet else None
    proc = subprocess.Popen(
        cmd,
        shell=isinstance(cmd, str),
        stdout=sink,
        stderr=sink,
    )
    __subprocesses__.append(proc)


if __name__ == "__main__":
    main()
    # wait() already reaps each child, so no terminate() is needed afterwards.
    for p in __subprocesses__:
        p.wait()