diff --git a/cluster.py b/cluster.py index 9be7ae8..5d65604 100644 --- a/cluster.py +++ b/cluster.py @@ -1,105 +1,153 @@ import config -import debug class Chars: def __init__(self, chars, page): self.chars = chars self.page = page + self.n = 0 + + def _box(self): + xs = [i["x0"] for i in self.chars] + xs += [i["x1"] for i in self.chars] + ys = [i["y0"] for i in self.chars] + ys += [i["y1"] for i in self.chars] + return Box( + Point(min(xs), min(ys)), + Point(max(xs), max(ys)), + ) def divide_into_columns(self): + # for every sequential pair of chars on same y-coordinate + # what is median distance? + distances_when_sequential_and_same_y_coordinate = [] + for i in range(len(self.chars)-1): + box_0 = Box.from_char(self.chars[i]) + box_1 = Box.from_char(self.chars[i+1]) + if box_0.overlaps_y(box_1): + delta = box_0.delta_x(box_1) + distances_when_sequential_and_same_y_coordinate.append(delta) + median_x_delta_when_same_y = sorted(distances_when_sequential_and_same_y_coordinate)[len(distances_when_sequential_and_same_y_coordinate) // 2] + median_x_delta_when_same_y = max([5, median_x_delta_when_same_y]) + + # merge all naive overlapping boxes result = [Chars([self.chars[0]], self.page)] for char in self.chars[1:]: - if result[-1].overlapping_y_coordinates(char): - result[-1].chars.append(char) + if result[-1]._box().overlaps(Box.from_char(char), clearance=median_x_delta_when_same_y): + result[-1].merge_in(char) else: result.append(Chars([char], self.page)) - # TODO: split clusters: find median horizontal distance between each item - [i.merge() for i in result] - #result = sorted(result, key=lambda x: x.chars[0]["y0"]) - #gaps = [ result[i+1].dist(result[i]) for i in range(0, len(result)-1)] - #median_gap = sorted(gaps)[len(gaps)//2] - #changed = True - #iteration = 0 - #while changed: - # iteration += 1 - # changed = False - # for i in range(len(gaps)-1, 0, -1): - # gap = gaps[i] - # print(iteration, "//", gap < median_gap*2, "//", gap, "between", result[i].outer_bounds(), "and", result[i+1].outer_bounds(), "is <", median_gap, "*2") - # if gap < median_gap*2: - # result[i].chars.append(result[i+1].chars[0]) - # result[i].merge() - # result = result[:i+1] + result[i+2:] - # changed = True - # result = sorted(result, key=lambda x: x.chars[0]["y0"]) - # gaps = [ result[i+1].dist(result[i]) for i in range(0, len(result)-1)] - debug.draw_boxes(self.page, [i.chars[0] for i in result]) + result = [i for i in result if i.n > 2] - def merge(self): - bounds = self.outer_bounds() - self.chars[0]["x0"] = bounds[0] - self.chars[0]["x1"] = bounds[1] - self.chars[0]["y0"] = bounds[2] - self.chars[0]["y1"] = bounds[3] - self.chars = self.chars[:1] + # merge all vertically overlapping boxes + changed = True + while changed: + changed = False + result2 = [result[0]] + for sub in result[1:]: + found = False + for sub2 in result2: + if sub2._box().overlaps_x(sub._box(), clearance=median_x_delta_when_same_y): + sub2.merge_in(sub) + found = True + changed = True + if not found: + result2.append(sub) + result = result2 - def outer_bounds(self): - x_min = self.chars[0]["x0"] - x_max = self.chars[0]["x1"] - y_min = self.chars[0]["y0"] - y_max = self.chars[0]["y1"] - for char in self.chars[1:]: - if char["x0"] < x_min: - x_min = char["x0"] - if char["x1"] > x_max: - x_max = char["x1"] - if char["y0"] < y_min: - x_min = char["y0"] - if char["y1"] > y_max: - y_max = char["y1"] - return (x_min, x_max, y_min, y_max) - - def dist(self, other): - my_bounds = self.outer_bounds() - other_bounds = other.outer_bounds() - - x_delta = 0 - if not Chars.char_overlaps(my_bounds[0], my_bounds[1], other_bounds[0], other_bounds[1]): - x_delta = min([abs(i) for i in [ - my_bounds[0] - other_bounds[0], - my_bounds[0] - other_bounds[1], - my_bounds[1] - other_bounds[0], - my_bounds[1] - other_bounds[1], - ]]) - - y_delta = 0 - if not Chars.char_overlaps(my_bounds[2], my_bounds[3], other_bounds[2], other_bounds[3]): - y_delta = min([abs(i) for i in [ - my_bounds[2] - other_bounds[2], - my_bounds[2] - other_bounds[3], - my_bounds[3] - other_bounds[2], - my_bounds[3] - other_bounds[3], - ]]) - - return x_delta ** 2 + y_delta ** 2 - - def overlapping_y_coordinates(self, other_char): - for self_char in self.chars: - if Chars.char_overlapping_y_coordinates(other_char, self_char): - return True - return False - - def char_overlapping_y_coordinates(candidate, established): - result = Chars.char_overlaps( - established["y0"], - established["y1"], - candidate["y0"], - candidate["y1"], - ) - print(established["y0"], "..", established["y1"], result, candidate["y0"], "..", candidate["y1"]) return result - def char_overlaps(my_min, my_max, other_min, other_max): + def merge_in(self, other): + if isinstance(other, Chars): + self.chars.extend(other.chars) + else: + self.chars.append(other) + self.merge() + + def merge(self): + self.n += len(self.chars)-1 + box = self._box() + self.chars[0]["x0"] = box.corners[0].x + self.chars[0]["x1"] = box.corners[1].x + self.chars[0]["y0"] = box.corners[0].y + self.chars[0]["y1"] = box.corners[1].y + self.chars = self.chars[:1] + +class Box: + def __init__(self, corner1, corner2): + self.corners = [corner1, corner2] + self.diagonal = Line(corner1, corner2) + + def __str__(self): + xs = sorted(["{:.1f}".format(i.x) for i in self.corners]) + ys = sorted(["{:.1f}".format(i.y) for i in self.corners]) + return f'x=[{xs[0]}..{xs[-1]}],y=[{ys[0]}..{ys[1]}]' + + def from_char(char): + return Box( + Point(char["x0"], char["y0"]), + Point(char["x1"], char["y1"]), + ) + + def overlaps_x(self, other, clearance=0): + return self.diagonal.overlaps_x(other.diagonal, clearance=clearance) + + def overlaps_y(self, other, clearance=0): + return self.diagonal.overlaps_y(other.diagonal, clearance=clearance) + + def overlaps(self, other, clearance=0): + return self.overlaps_x(other, clearance=clearance) and self.overlaps_y(other, clearance=clearance) + + def delta_x(self, other): + if self.overlaps_x(other): + return 0 + my_xs = sorted([i.x for i in self.corners]) + other_xs = sorted([i.x for i in other.corners]) + return Box.delta(my_xs[0], my_xs[1], other_xs[0], other_xs[1]) + + def delta_y(self, other): + if self.overlaps_y(other): + return 0 + my_ys = sorted([i.y for i in self.corners]) + other_ys = sorted([i.y for i in other.corners]) + return Box.delta(my_ys[0], my_ys[1], other_ys[0], other_ys[1]) + + def delta(a0, a1, b0, b1): + return min([abs(i) for i in [a0-b0, a0-b1, a1-b0, a1-b1]]) + +class Line: + def __init__(self, pointA, pointB): + self.pointA = pointA + self.pointB = pointB + + def overlaps_x(self, other, clearance=0): + mine = self.xs() + others = other.xs() + return Line.overlaps( + mine[0], mine[1], + others[0], others[1], + clearance=clearance, + ) + + def overlaps_y(self, other, clearance=0): + mine = self.ys() + others = other.ys() + return Line.overlaps( + mine[0], mine[1], + others[0], others[1], + clearance=clearance, + ) + + def xs(self): + return sorted([self.pointA.x, self.pointB.x]) + + def ys(self): + return sorted([self.pointA.y, self.pointB.y]) + + def overlaps(my_min, my_max, other_min, other_max, clearance=0): + my_min -= clearance + my_max += clearance + other_min -= clearance + other_max += clearance # my.. other..other ..my if my_min <= other_min and other_max <= my_max: return True @@ -113,3 +161,8 @@ class Chars: elif other_min <= my_min and my_min <= other_max and other_max <= my_max: return True return False + +class Point: + def __init__(self, x, y): + self.x = x + self.y = y diff --git a/debug.py b/debug.py index e125e41..337e9e4 100644 --- a/debug.py +++ b/debug.py @@ -10,6 +10,11 @@ def draw_boxes(page, boxes): im.draw_line(((box["x1"], page.height - box["y0"]), (box["x1"], page.height - box["y1"]))) im.draw_line(((box["x1"], page.height - box["y1"]), (box["x0"], page.height - box["y1"]))) im.draw_line(((box["x0"], page.height - box["y1"]), (box["x0"], page.height - box["y0"]))) + if "debug_label" in box: + im.draw.text( + xy=(box["x0"], page.height-box["y0"]), + text=str(box["debug_label"]), + ) debug_show(im) def debug_im(page): diff --git a/test_cluster.py b/test_cluster.py index 6bdc5db..65c4dc9 100644 --- a/test_cluster.py +++ b/test_cluster.py @@ -2,12 +2,32 @@ import unittest import cluster import pdfplumber +import debug class TestChars(unittest.TestCase): def test_divide_into_columns(self): - with pdfplumber.open("./testdata/2-column_2-row.pdf") as pdf: - for page in pdf.pages: - cluster.Chars(page.chars, page).divide_into_columns() + for p in [ + "./testdata/1-column_half-image.pdf", + "./testdata/2-column_2-row.pdf", + "./testdata/2-column_fancy-font.pdf", + "./testdata/2-column_happy.pdf", + "./testdata/2-column_non-interrupting-image.pdf", + ]: + with pdfplumber.open(p) as pdf: + for page in pdf.pages: + got = cluster.Chars(page.chars, page).divide_into_columns() + print(p) + debug.draw_boxes(page, [ + { + "x0": i.chars[0]["x0"], + "x1": i.chars[0]["x1"], + "y0": i.chars[0]["y0"], + "y1": i.chars[0]["y1"], + "debug_label": i.n, + } + for i in got + ]) + if __name__ == "__main__": unittest.main()