From ae9e9bcc0dc910c6773d00c67e2d48070ff77103 Mon Sep 17 00:00:00 2001 From: Bel LaPointe Date: Mon, 20 Feb 2023 08:43:03 -0700 Subject: [PATCH] no more of u --- config.py | 1 + debug.py | 17 +++++-- poc.py | 122 ------------------------------------------------ test_cluster.py | 21 +++++++-- 4 files changed, 30 insertions(+), 131 deletions(-) delete mode 100644 poc.py diff --git a/config.py b/config.py index 6f303d3..604ee63 100644 --- a/config.py +++ b/config.py @@ -2,4 +2,5 @@ import os DEBUG = os.environ.get("DEBUG", "") DEBUG_NO_SHOW = os.environ.get("DEBUG_NO_SHOW", "") +DEBUG_HEIGHT = int(os.environ.get("DEBUG_HEIGHT", "800")) INPUT = os.environ.get("INPUT", "./testdata/input.pdf") diff --git a/debug.py b/debug.py index 337e9e4..57bf27e 100644 --- a/debug.py +++ b/debug.py @@ -2,6 +2,7 @@ import pdfplumber import os import time import subprocess +import config def draw_boxes(page, boxes): im = debug_im(page) @@ -18,10 +19,16 @@ def draw_boxes(page, boxes): debug_show(im) def debug_im(page): - return page.to_image(height=800) + return page.to_image(height=config.DEBUG_HEIGHT) def debug_show(im, name=None): - im.show() - #im.save(f"/tmp/dnd-pdf-to-txt{'' if not name else '-'+name}.jpg") - #if not DEBUG_NO_SHOW: - # go(f"qlmanage -p /tmp/dnd-pdf-to-txt{'' if not name else '-'+name}.jpg &> /dev/null") + #im.show() + im.save(f"/tmp/dnd-pdf-to-txt{'' if not name else '-'+name}.jpg") + if not config.DEBUG_NO_SHOW: + go(f"qlmanage -p /tmp/dnd-pdf-to-txt{'' if not name else '-'+name}.jpg &> /dev/null") + +__subprocesses__ = [] +def go(cmd): + global __subprocesses__ + __subprocesses__.append(subprocess.Popen(cmd, shell=True)) + diff --git a/poc.py b/poc.py deleted file mode 100644 index 83a6a38..0000000 --- a/poc.py +++ /dev/null @@ -1,122 +0,0 @@ -import pdfplumber -import os -import time -import subprocess - -DEBUG = os.environ.get("DEBUG", "") -DEBUG_NO_SHOW = os.environ.get("DEBUG_NO_SHOW", "") -INPUT = os.environ.get("INPUT", "./testdata/input.pdf") - -def main(): - print("main") - with pdfplumber.open(INPUT) as pdf: - for page in pdf.pages[:]: - for splitpage in v_split(page): - print(splitpage.extract_text(layout=True)) - print("/main") - -def debug_im(page): - return page.to_image(height=800) - -def debug_show(im, name=None): - im.save(f"/tmp/dnd-pdf-to-txt{'' if not name else '-'+name}.jpg") - if not DEBUG_NO_SHOW: - go(f"qlmanage -p /tmp/dnd-pdf-to-txt{'' if not name else '-'+name}.jpg &> /dev/null") - -def v_split(page): - clusters = cluster(page) - points = [i.x0 for i in clusters if i.x0 > page.width//4] - points += [i.x1 for i in clusters if i.x1 < 3*page.width//4] - x_clusters = [] - for point in points: - merged = False - for x_cluster in x_clusters: - if (x_cluster - point) ** 2 < 100: - merged = True - break - if not merged: - x_clusters.append(point) - x_clusters = sorted(x_clusters) - if DEBUG: - im = debug_im(page) - for x_cluster in x_clusters: - im.draw_line(((x_cluster, 0), (x_cluster, page.height))) - debug_show(im, name=f'v-split-xclusters-{page.page_number}') - if len(x_clusters) != 2: - return [page] - x = sum(x_clusters) / len(x_clusters) - result = [page.within_bbox((0, 0, x, page.height)), page.within_bbox((x, 0, page.width, page.height))] - if DEBUG: - i = 0 - for page in result: - debug_show(debug_im(page), name=f'v-split-postsplit-{page.page_number}_{i}') - i += 1 - return result - -def cluster(page): - points = [(i["x0"], i["y0"]) for i in page.chars] - points += [(i["x1"], i["y1"]) for i in page.chars] - clusters = [] - class cluster: - def __init__(self, x, y): - self.x0 = x - self.y0 = y - self.x1 = x+1 - self.y1 = y+1 - self.len = 1 - def merge(self, x, y): - if x < self.x0: - self.x0 = x - elif x > self.x1: - self.x1 = x - if y < self.y0: - self.y0 = y - elif y > self.y1: - self.y1 = y - self.len += 1 - def dist(self, x, y): - x_delta = 0 - y_delta = 0 - if x < self.x0: - x_delta = self.x0 - x - elif x > self.x1: - x_delta = x - self.x1 - if y < self.y0 : - y_delta = self.y0 - y - elif y > self.y1: - y_delta = y - self.y1 - return x_delta**2 + y_delta**2 - def __str__(self): - return f'({int(self.x0)}, {int(self.y0)}, {int(self.x1)}, {int(self.y1)})' - for point in points: - merged = False - for a_cluster in clusters: - if a_cluster.dist(point[0], point[1]) < (page.width/50)**2: - a_cluster.merge(point[0], point[1]) - merged = True - break - if not merged: - clusters.append(cluster(point[0], point[1])) - clusters = [i for i in clusters if i.len > 100] - if DEBUG: - im = debug_im(page) - for i in clusters: - im.draw_lines([ - ((i.x0, page.height-i.y0), (i.x0, page.height-i.y1)), - ((i.x0, page.height-i.y1), (i.x1, page.height-i.y1)), - ((i.x1, page.height-i.y1), (i.x1, page.height-i.y0)), - ((i.x1, page.height-i.y0), (i.x0, page.height-i.y0)), - ], stroke_width=5) - debug_show(im, name=f'cluster-{page.page_number}') - return clusters - -__subprocesses__ = [] -def go(cmd): - global __subprocesses__ - __subprocesses__.append(subprocess.Popen(cmd, shell=True)) - -if __name__ == "__main__": - main() - for p in __subprocesses__: - p.wait() - p.terminate() diff --git a/test_cluster.py b/test_cluster.py index 65c4dc9..8a92aaf 100644 --- a/test_cluster.py +++ b/test_cluster.py @@ -7,14 +7,27 @@ import debug class TestChars(unittest.TestCase): def test_divide_into_columns(self): for p in [ - "./testdata/1-column_half-image.pdf", "./testdata/2-column_2-row.pdf", - "./testdata/2-column_fancy-font.pdf", - "./testdata/2-column_happy.pdf", - "./testdata/2-column_non-interrupting-image.pdf", + #"./testdata/1-column_half-image.pdf", + #"./testdata/2-column_fancy-font.pdf", + #"./testdata/2-column_happy.pdf", + #"./testdata/2-column_non-interrupting-image.pdf", ]: with pdfplumber.open(p) as pdf: for page in pdf.pages: + im = debug.debug_im(page) + words = page.extract_words() + debug.draw_boxes(page, [ + { + "debug_label": i, + "x0": words[i]["x0"], + "x1": words[i]["x1"], + "y0": words[i]["top"], + "y1": words[i]["bottom"], + } + for i in range(len(words)) + ]) + continue got = cluster.Chars(page.chars, page).divide_into_columns() print(p) debug.draw_boxes(page, [