diff --git a/__pycache__/cluster.cpython-310.pyc b/__pycache__/cluster.cpython-310.pyc
new file mode 100644
index 0000000..f560a59
Binary files /dev/null and b/__pycache__/cluster.cpython-310.pyc differ
diff --git a/__pycache__/config.cpython-310.pyc b/__pycache__/config.cpython-310.pyc
new file mode 100644
index 0000000..20462ef
Binary files /dev/null and b/__pycache__/config.cpython-310.pyc differ
diff --git a/__pycache__/debug.cpython-310.pyc b/__pycache__/debug.cpython-310.pyc
new file mode 100644
index 0000000..2422337
Binary files /dev/null and b/__pycache__/debug.cpython-310.pyc differ
diff --git a/cluster.py b/cluster.py
new file mode 100644
index 0000000..9be7ae8
--- /dev/null
+++ b/cluster.py
@@ -0,0 +1,137 @@
+"""Cluster character boxes of a PDF page by their vertical extent."""
+import config
+import debug
+
+
+class Chars:
+    """A group of character boxes that belong to one page.
+
+    Each entry of ``chars`` is a mapping with at least the keys ``x0``,
+    ``x1``, ``y0`` and ``y1``; ``test_cluster.py`` passes pdfplumber's
+    ``page.chars``.
+    """
+
+    def __init__(self, chars, page):
+        self.chars = chars
+        self.page = page
+
+    def divide_into_columns(self):
+        """Group consecutive chars whose y-ranges overlap and draw the groups.
+
+        A char joins the most recent group when its y-range overlaps any
+        char already in that group; otherwise it starts a new group. Every
+        group is then collapsed to its bounding box, and the boxes are drawn
+        on the page through ``debug.draw_boxes``.
+
+        Returns:
+            The list of merged ``Chars`` groups, in input order. Empty when
+            there are no chars (nothing is drawn in that case).
+        """
+        if not self.chars:
+            return []
+
+        result = [Chars([self.chars[0]], self.page)]
+        for char in self.chars[1:]:
+            if result[-1].overlapping_y_coordinates(char):
+                result[-1].chars.append(char)
+            else:
+                result.append(Chars([char], self.page))
+
+        # TODO: split clusters: find median horizontal distance between each
+        # item. Earlier sketch: sort the groups by y0, measure dist() between
+        # neighbours, then keep merging neighbours whose gap is below twice
+        # the median gap until a pass changes nothing.
+        for group in result:
+            group.merge()
+
+        # NOTE(review): drawn on every call; config.DEBUG is not consulted.
+        debug.draw_boxes(self.page, [group.chars[0] for group in result])
+        return result
+
+    def merge(self):
+        """Collapse this group into a single box covering all of its chars.
+
+        The merged box is a copy of the first char with ``x0``, ``x1``,
+        ``y0`` and ``y1`` replaced by the group's outer bounds, so the
+        mappings that were passed in are not modified.
+        """
+        x_min, x_max, y_min, y_max = self.outer_bounds()
+        merged = dict(self.chars[0])
+        merged.update(x0=x_min, x1=x_max, y0=y_min, y1=y_max)
+        self.chars = [merged]
+
+    def outer_bounds(self):
+        """Return ``(x_min, x_max, y_min, y_max)`` over all chars.
+
+        The group must hold at least one char.
+        """
+        return (
+            min(char["x0"] for char in self.chars),
+            max(char["x1"] for char in self.chars),
+            min(char["y0"] for char in self.chars),
+            max(char["y1"] for char in self.chars),
+        )
+
+    def dist(self, other):
+        """Return the squared distance between the bounds of two groups.
+
+        Along each axis the gap is 0 when the two ranges overlap, otherwise
+        the smallest absolute difference between their end points.
+        """
+        mine = self.outer_bounds()
+        theirs = other.outer_bounds()
+        x_delta = Chars._axis_gap(*mine[:2], *theirs[:2])
+        y_delta = Chars._axis_gap(*mine[2:], *theirs[2:])
+        return x_delta ** 2 + y_delta ** 2
+
+    @staticmethod
+    def _axis_gap(my_min, my_max, other_min, other_max):
+        """Return the gap between two 1-D ranges, or 0 when they overlap."""
+        if Chars.char_overlaps(my_min, my_max, other_min, other_max):
+            return 0
+        return min(
+            abs(mine - theirs)
+            for mine in (my_min, my_max)
+            for theirs in (other_min, other_max)
+        )
+
+    def overlapping_y_coordinates(self, other_char):
+        """Return True if ``other_char`` overlaps any char here in y."""
+        return any(
+            Chars.char_overlapping_y_coordinates(other_char, self_char)
+            for self_char in self.chars
+        )
+
+    @staticmethod
+    def char_overlapping_y_coordinates(candidate, established):
+        """Return True if the y-ranges of the two chars overlap."""
+        result = Chars.char_overlaps(
+            established["y0"],
+            established["y1"],
+            candidate["y0"],
+            candidate["y1"],
+        )
+        if config.DEBUG:
+            # Trace each comparison only when debugging was requested.
+            print(
+                established["y0"], "..", established["y1"], result,
+                candidate["y0"], "..", candidate["y1"],
+            )
+        return result
+
+    @staticmethod
+    def char_overlaps(my_min, my_max, other_min, other_max):
+        """Return True if two closed 1-D ranges overlap or touch.
+
+        The four cases assume ``min <= max`` within each range.
+        """
+        return (
+            # my..other..other..my
+            my_min <= other_min and other_max <= my_max
+            # other..my..my..other
+            or other_min <= my_min and my_max <= other_max
+            # my..other..my..other
+            or my_min <= other_min <= my_max <= other_max
+            # other..my..other..my
+            or other_min <= my_min <= other_max <= my_max
+        )
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..6f303d3
--- /dev/null
+++ b/config.py
@@ -0,0 +1,9 @@
+"""Settings read from environment variables."""
+import os
+
+# Any non-empty value enables debug tracing.
+DEBUG = os.environ.get("DEBUG", "")
+# Reserved for suppressing the debug preview; not consulted yet.
+DEBUG_NO_SHOW = os.environ.get("DEBUG_NO_SHOW", "")
+# Input file path; defaults to the sample PDF under ./testdata.
+INPUT = os.environ.get("INPUT", "./testdata/input.pdf")
diff --git a/debug.py b/debug.py
new file mode 100644
index 0000000..e125e41
--- /dev/null
+++ b/debug.py
@@ -0,0 +1,36 @@
+"""Helpers for drawing debug overlays on a rendered PDF page."""
+import pdfplumber
+import os
+import time
+import subprocess
+
+
+def draw_boxes(page, boxes):
+    """Outline every box on an image of ``page`` and display the image.
+
+    Each box needs ``x0``, ``x1``, ``y0`` and ``y1``. The y values are
+    subtracted from ``page.height`` because pdfplumber measures ``y0`` and
+    ``y1`` from the bottom of the page while the image origin is at the top.
+    """
+    im = debug_im(page)
+    for box in boxes:
+        left, right = box["x0"], box["x1"]
+        bottom = page.height - box["y0"]
+        top = page.height - box["y1"]
+        corners = [(left, bottom), (right, bottom), (right, top), (left, top)]
+        # Join each corner to the next one, wrapping around to close the box.
+        for start, end in zip(corners, corners[1:] + corners[:1]):
+            im.draw_line((start, end))
+    debug_show(im)
+
+
+def debug_im(page):
+    """Render ``page`` as an image 800 pixels high."""
+    return page.to_image(height=800)
+
+
+def debug_show(im, name=None):
+    """Display ``im``; ``name`` is accepted but not used yet."""
+    im.show()
+    # TODO: optionally save to /tmp/dnd-pdf-to-txt[-NAME].jpg and preview it
+    # with qlmanage unless DEBUG_NO_SHOW is set.
diff --git a/test_cluster.py b/test_cluster.py
new file mode 100644
index 0000000..6bdc5db
--- /dev/null
+++ b/test_cluster.py
@@ -0,0 +1,18 @@
+import unittest
+
+import pdfplumber
+
+import cluster
+
+
+class TestChars(unittest.TestCase):
+    def test_divide_into_columns(self):
+        """Smoke test: clustering runs on every page of the sample PDF."""
+        with pdfplumber.open("./testdata/2-column_2-row.pdf") as pdf:
+            for page in pdf.pages:
+                groups = cluster.Chars(page.chars, page).divide_into_columns()
+                self.assertIsInstance(groups, list)
+
+
+if __name__ == "__main__":
+    unittest.main()