# Source: git patch c67ec83de98b26e3264ebd6b45bc620d63de311e ("whee",
# Bel LaPointe, 2023-02-19) creating main.py. The patch framing and the "+"
# line prefixes have been stripped so the file reads as plain Python.
"""Split a two-column PDF page into its columns and print each column's text.

Configuration comes from the environment:

* ``INPUT`` -- path of the PDF to read (default ``./testdata/input.pdf``).
* ``DEBUG`` -- when non-empty, intermediate images are rendered and opened
  with ``qlmanage``.
"""

import os
import subprocess
import tempfile
import time  # noqa: F401  (unused here; retained from the original imports)

DEBUG = os.environ.get("DEBUG", "")
INPUT = os.environ.get("INPUT", "./testdata/input.pdf")

# Two cluster x-edges closer than this (in page units) count as one edge.
_EDGE_MERGE_DISTANCE = 10
# Clusters holding this many corner points or fewer are discarded.
_NOISE_POINT_LIMIT = 100
# A point joins a cluster when it lies closer than page.width / this divisor.
_CLUSTER_REACH_DIVISOR = 50


def main():
    """Print the text of every column found on the fifth page of INPUT."""
    # Imported here, not at file level, so the geometry helpers below can be
    # imported and exercised without pdfplumber installed.
    import pdfplumber

    print("main")
    with pdfplumber.open(INPUT) as pdf:
        for page in pdf.pages[4:5]:
            for splitpage in v_split(page):
                print(splitpage.extract_text())
    print("/main")


def _rect_outline(x0, y0, x1, y1):
    """Return the four line segments outlining the rectangle (x0, y0, x1, y1)."""
    return [
        ((x0, y0), (x0, y1)),
        ((x0, y1), (x1, y1)),
        ((x1, y1), (x1, y0)),
        ((x1, y0), (x0, y0)),
    ]


def crop(page, x0, y0, x1, y1):
    """Return ``page.crop((x0, y0, x1, y1))``.

    When DEBUG is set, the crop rectangle is first drawn on a rendering of
    the page and shown.
    """
    if DEBUG:
        im = debug_im(page)
        im.draw_lines(_rect_outline(x0, y0, x1, y1), stroke_width=5)
        debug_show(im)
    return page.crop((x0, y0, x1, y1))


def debug_im(page):
    """Render ``page`` to an image 800 pixels high for debug drawing."""
    return page.to_image(height=800)


def debug_show(im):
    """Save ``im`` to its own temporary file and open it with ``qlmanage``.

    Every call writes a fresh file: the viewer is started asynchronously, so
    reusing one path would let a later image overwrite the file an earlier
    viewer is still loading. The files are left behind in the temp directory.
    """
    fd, path = tempfile.mkstemp(prefix="pdf-debug-", suffix=".jpg")
    os.close(fd)  # only the unique path is needed; im.save() reopens it
    im.save(path)
    try:
        go(["qlmanage", "-p", path], quiet=True)
    except OSError as err:
        # Best effort: the viewer being unavailable must not abort the run.
        print(f"debug image saved to {path}; could not launch qlmanage: {err}")


def v_split(page):
    """Split ``page`` vertically into a left and a right half.

    The x-extents of the text clusters found by ``cluster`` are collected and
    near-identical values are merged. If exactly four distinct edges remain
    (left/right of two columns), the page is cut midway between the two inner
    edges and ``[left, right]`` is returned; otherwise ``[page]`` is returned.
    """
    clusters = cluster(page)
    edges = [c.x0 for c in clusters] + [c.x1 for c in clusters]

    # Greedy de-duplication: keep an edge only if no kept edge is close to it.
    limit_sq = _EDGE_MERGE_DISTANCE ** 2
    x_clusters = []
    for edge in edges:
        if all((kept - edge) ** 2 >= limit_sq for kept in x_clusters):
            x_clusters.append(edge)
    x_clusters.sort()

    if len(x_clusters) != 4:
        return [page]

    x = (x_clusters[1] + x_clusters[2]) / 2
    result = [
        page.within_bbox((0, 0, x, page.height)),
        page.within_bbox((x, 0, page.width, page.height)),
    ]
    if DEBUG:
        for half in result:
            debug_show(debug_im(half))
    return result


class _Cluster:
    """Axis-aligned bounding box that grows as nearby points are merged in."""

    def __init__(self, x, y):
        # Seeded as a 1x1 box anchored at the first point.
        self.x0 = x
        self.y0 = y
        self.x1 = x + 1
        self.y1 = y + 1
        self.len = 1  # number of points absorbed so far

    def merge(self, x, y):
        """Grow the box to include (x, y) and count the point."""
        if x < self.x0:
            self.x0 = x
        elif x > self.x1:
            self.x1 = x
        if y < self.y0:
            self.y0 = y
        elif y > self.y1:
            self.y1 = y
        self.len += 1

    def dist(self, x, y):
        """Return the squared distance from (x, y) to the box (0 if inside)."""
        x_delta = 0
        y_delta = 0
        if x < self.x0:
            x_delta = self.x0 - x
        elif x > self.x1:
            x_delta = x - self.x1
        if y < self.y0:
            y_delta = self.y0 - y
        elif y > self.y1:
            y_delta = y - self.y1
        return x_delta ** 2 + y_delta ** 2

    def __str__(self):
        return f'({int(self.x0)}, {int(self.y0)}, {int(self.x1)}, {int(self.y1)})'


def cluster(page):
    """Group the characters of ``page`` into bounding-box clusters.

    Both the (x0, y0) and the (x1, y1) corner of every entry in ``page.chars``
    are fed, in order, to the first existing cluster lying closer than
    ``page.width / 50``; a point with no such cluster starts a new one.
    Clusters that end up with 100 points or fewer are dropped.

    Returns a list of objects exposing ``x0``, ``y0``, ``x1``, ``y1`` and
    ``len``, in order of creation.
    """
    points = [(ch["x0"], ch["y0"]) for ch in page.chars]
    points += [(ch["x1"], ch["y1"]) for ch in page.chars]

    reach_sq = (page.width / _CLUSTER_REACH_DIVISOR) ** 2
    clusters = []
    for x, y in points:
        for candidate in clusters:
            if candidate.dist(x, y) < reach_sq:
                candidate.merge(x, y)
                break
        else:
            clusters.append(_Cluster(x, y))

    clusters = [c for c in clusters if c.len > _NOISE_POINT_LIMIT]

    if DEBUG:
        im = debug_im(page)
        for c in clusters:
            # The y values are flipped against page.height before drawing.
            # NOTE(review): presumably because char y0/y1 are measured from
            # the bottom of the page while the image is drawn top-down.
            im.draw_lines(
                _rect_outline(
                    c.x0, page.height - c.y0, c.x1, page.height - c.y1
                ),
                stroke_width=5,
            )
        debug_show(im)
    return clusters


__subprocesses__ = []


def go(cmd, quiet=False):
    """Start ``cmd`` in the background and remember it for the final wait.

    ``cmd`` may be a shell string (run through the shell, as before) or an
    argv list (run directly, no shell involved). With ``quiet`` the child's
    stdout and stderr are discarded. Returns the ``subprocess.Popen`` object.
    """
    sink = subprocess.DEVNULL if quiet else None
    proc = subprocess.Popen(
        cmd, shell=isinstance(cmd, str), stdout=sink, stderr=sink
    )
    __subprocesses__.append(proc)
    return proc


if __name__ == "__main__":
    try:
        main()
    finally:
        # Keep the script alive until every viewer window has been closed,
        # even when main() failed part-way through.
        for p in __subprocesses__:
            p.wait()