"""Split two-column PDF pages into their columns and print the text.

Configuration is read from the environment:

* ``INPUT`` - path of the PDF to read (default ``./testdata/input.pdf``).
* ``DEBUG`` - any non-empty value enables visual debugging: intermediate
  page images are written to ``/tmp`` and opened with ``qlmanage``.
"""

import os
import shlex
import subprocess
import tempfile
import time

import pdfplumber

DEBUG = os.environ.get("DEBUG", "")
INPUT = os.environ.get("INPUT", "./testdata/input.pdf")

# Two candidate split positions whose squared distance is below this value
# (i.e. closer than 10 page units) are treated as the same position.
_X_MERGE_DIST_SQ = 100

# Clusters that collected this many points or fewer are discarded.
_MIN_CLUSTER_POINTS = 100


def main():
    """Print the extracted text of every page of INPUT, piece by piece.

    Each page is passed through ``v_split``; the text of every returned
    piece is printed in order, framed by the markers ``main`` / ``/main``.
    """
    print("main")
    with pdfplumber.open(INPUT) as pdf:
        for page in pdf.pages:
            for splitpage in v_split(page):
                print(splitpage.extract_text())
    print("/main")


def crop(page, x0, y0, x1, y1):
    """Return ``page.crop((x0, y0, x1, y1))``.

    When DEBUG is set, the rectangle is first outlined on an image of the
    page and that image is shown via ``debug_show``.
    """
    if DEBUG:
        im = debug_im(page)
        im.draw_lines([
            ((x0, y0), (x0, y1)),
            ((x0, y1), (x1, y1)),
            ((x1, y1), (x1, y0)),
            ((x1, y0), (x0, y0)),
        ], stroke_width=5)
        debug_show(im)
    return page.crop((x0, y0, x1, y1))


def debug_im(page):
    """Render ``page`` to an image object with ``height=800``."""
    return page.to_image(height=800)


def debug_show(im):
    """Save ``im`` to a fresh file under ``/tmp`` and open it with qlmanage.

    The viewer is started asynchronously through ``go``, so each image gets
    its own file: reusing a single path would let a later call overwrite the
    image before an earlier viewer has finished reading it.
    """
    fd, path = tempfile.mkstemp(prefix="out-", suffix=".jpg", dir="/tmp")
    os.close(fd)  # only the unique name is needed; ``im.save`` reopens it
    im.save(path)
    # ``go`` runs the command through /bin/sh, so use the portable redirect
    # form instead of the bash-only ``&>``.
    go(f"qlmanage -p {shlex.quote(path)} > /dev/null 2>&1")


def v_split(page):
    """Split ``page`` vertically into a left and a right part if possible.

    Candidate split positions are the left edges of clusters that start
    right of the first quarter of the page width, plus the right edges of
    clusters that end left of the third quarter. Candidates that lie within
    10 units of an already kept candidate are dropped. Only when exactly two
    candidates remain is the page cut at their midpoint; presumably these
    are the two sides of the gutter between two text columns.

    Returns:
        ``[left, right]`` (results of ``page.within_bbox``) when a split
        position was found, otherwise ``[page]``.
    """
    clusters = cluster(page)
    points = [c.x0 for c in clusters if c.x0 > page.width // 4]
    points += [c.x1 for c in clusters if c.x1 < 3 * page.width // 4]

    # Keep the first representative of every group of nearby candidates.
    x_clusters = []
    for point in points:
        if not any((kept - point) ** 2 < _X_MERGE_DIST_SQ
                   for kept in x_clusters):
            x_clusters.append(point)
    x_clusters = sorted(x_clusters)

    if DEBUG:
        im = debug_im(page)
        for x_cluster in x_clusters:
            im.draw_line(((x_cluster, 0), (x_cluster, page.height)))
        debug_show(im)

    if len(x_clusters) != 2:
        return [page]

    x = sum(x_clusters) / len(x_clusters)
    result = [
        page.within_bbox((0, 0, x, page.height)),
        page.within_bbox((x, 0, page.width, page.height)),
    ]
    if DEBUG:
        # A separate loop name keeps the ``page`` parameter intact.
        for part in result:
            debug_show(debug_im(part))
    return result


class _Cluster:
    """Axis-aligned bounding box that grows as points are merged into it.

    Attributes:
        x0, y0: lower bounds of the box.
        x1, y1: upper bounds of the box.
        len: number of points the cluster was built from.
    """

    def __init__(self, x, y):
        # A new cluster starts as a 1x1 box anchored at the seed point.
        self.x0 = x
        self.y0 = y
        self.x1 = x + 1
        self.y1 = y + 1
        self.len = 1

    def merge(self, x, y):
        """Extend the box so that it contains ``(x, y)`` and count the point."""
        if x < self.x0:
            self.x0 = x
        elif x > self.x1:
            self.x1 = x
        if y < self.y0:
            self.y0 = y
        elif y > self.y1:
            self.y1 = y
        self.len += 1

    def dist(self, x, y):
        """Return the squared distance from ``(x, y)`` to the box.

        The result is 0 when the point lies inside the box or on its border.
        """
        x_delta = 0
        y_delta = 0
        if x < self.x0:
            x_delta = self.x0 - x
        elif x > self.x1:
            x_delta = x - self.x1
        if y < self.y0:
            y_delta = self.y0 - y
        elif y > self.y1:
            y_delta = y - self.y1
        return x_delta ** 2 + y_delta ** 2

    def __str__(self):
        return f'({int(self.x0)}, {int(self.y0)}, {int(self.x1)}, {int(self.y1)})'


def cluster(page):
    """Group the character corners of ``page`` into bounding-box clusters.

    The ``(x0, y0)`` and ``(x1, y1)`` corners of every entry in
    ``page.chars`` are processed in order. A point joins the first existing
    cluster whose box is closer than ``page.width / 50``; otherwise it
    starts a new cluster. Clusters built from ``_MIN_CLUSTER_POINTS`` points
    or fewer are dropped.

    Returns:
        A list of cluster objects exposing ``x0``, ``y0``, ``x1``, ``y1``
        and ``len``.
    """
    points = [(c["x0"], c["y0"]) for c in page.chars]
    points += [(c["x1"], c["y1"]) for c in page.chars]

    # Squared merge radius; constant for the page, so computed once.
    max_dist_sq = (page.width / 50) ** 2

    clusters = []
    for x, y in points:
        for candidate in clusters:
            if candidate.dist(x, y) < max_dist_sq:
                candidate.merge(x, y)
                break
        else:
            # No existing cluster was close enough.
            clusters.append(_Cluster(x, y))

    clusters = [c for c in clusters if c.len > _MIN_CLUSTER_POINTS]

    if DEBUG:
        im = debug_im(page)
        # y is flipped with ``page.height - y`` before drawing; presumably
        # the char y0/y1 values are measured from the bottom of the page
        # while the image is drawn from the top - TODO confirm.
        for c in clusters:
            im.draw_lines([
                ((c.x0, page.height - c.y0), (c.x0, page.height - c.y1)),
                ((c.x0, page.height - c.y1), (c.x1, page.height - c.y1)),
                ((c.x1, page.height - c.y1), (c.x1, page.height - c.y0)),
                ((c.x1, page.height - c.y0), (c.x0, page.height - c.y0)),
            ], stroke_width=5)
        debug_show(im)
    return clusters


# Child processes started by ``go``; waited for before the script exits.
__subprocesses__ = []


def go(cmd):
    """Start shell command ``cmd`` without waiting for it to finish.

    The process handle is recorded in ``__subprocesses__`` so that the
    script can wait for it before exiting.
    """
    __subprocesses__.append(subprocess.Popen(cmd, shell=True))


if __name__ == "__main__":
    main()
    # Let every viewer that was started finish before the script exits.
    for p in __subprocesses__:
        p.wait()