import config


class Chars:
    """A group of character records that can be merged into one bounding box.

    Each record in ``chars`` is a mapping with numeric ``"x0"``, ``"x1"``,
    ``"y0"`` and ``"y1"`` entries describing its rectangle.

    Attributes:
        chars: The list of character records in this group.
        page: Opaque page object, stored and passed on to derived groups.
        n: Running count incremented by ``merge`` (number of records that
            have been folded into the first record so far).
    """

    def __init__(self, chars, page):
        self.chars = chars
        self.page = page
        self.n = 0

    def _box(self):
        """Return the bounding ``Box`` of all records in ``self.chars``.

        Raises:
            ValueError: If ``self.chars`` is empty (``min`` of no values).
        """
        xs = [i["x0"] for i in self.chars]
        xs += [i["x1"] for i in self.chars]
        ys = [i["y0"] for i in self.chars]
        ys += [i["y1"] for i in self.chars]
        return Box(
            Point(min(xs), min(ys)),
            Point(max(xs), max(ys)),
        )

    def divide_into_columns(self, min_clearance=5):
        """Cluster the characters into groups whose x-ranges do not overlap.

        The method works in three steps:

        1. Measure the horizontal gap between every pair of consecutive
           characters whose y-ranges overlap and take the (upper) median.
           The clearance used below is that median, but never less than
           ``min_clearance``.
        2. Walk the characters in order, folding each one into the current
           group when its box overlaps the group's box (with clearance),
           otherwise starting a new group. Groups that absorbed fewer than
           three characters are dropped.
        3. Repeatedly merge groups whose x-ranges overlap (with clearance)
           until a full pass makes no change.

        The caller's character records are not modified: every group is
        seeded with a shallow copy of its first record, and only that copy
        is rewritten by ``merge``.

        Args:
            min_clearance: Lower bound for the clearance applied in the
                overlap tests. Defaults to 5, the previously hard-coded
                value.

        Returns:
            A list of ``Chars`` objects, one per detected column. Each holds
            a single record whose coordinates are the column's bounding box.
            The list is empty when there are no characters or when no group
            survives the size filter.
        """
        if not self.chars:
            return []

        # Horizontal gaps between consecutive chars sharing a y-range.
        gaps = []
        for first, second in zip(self.chars, self.chars[1:]):
            box_0 = Box.from_char(first)
            box_1 = Box.from_char(second)
            if box_0.overlaps_y(box_1):
                gaps.append(box_0.delta_x(box_1))

        # Upper median; fall back to 0 when no pair shared a y-range so that
        # min_clearance alone decides the clearance.
        median_gap = sorted(gaps)[len(gaps) // 2] if gaps else 0
        clearance = max(min_clearance, median_gap)

        # Step 2: merge consecutive chars whose boxes overlap.
        # dict(...) copies the seed record because merge() overwrites its
        # coordinates in place; without the copy the caller's data would be
        # altered and a second call would see different input.
        groups = [Chars([dict(self.chars[0])], self.page)]
        for char in self.chars[1:]:
            current = groups[-1]
            if current._box().overlaps(Box.from_char(char), clearance=clearance):
                current.merge_in(char)
            else:
                groups.append(Chars([dict(char)], self.page))
        groups = [group for group in groups if group.n > 2]
        if not groups:
            return []

        # Step 3: merge groups whose x-ranges overlap, until stable. Every
        # pass that sets `changed` also shrinks the list, so this terminates.
        changed = True
        while changed:
            changed = False
            merged = [groups[0]]
            for candidate in groups[1:]:
                found = False
                for target in merged:
                    if target._box().overlaps_x(candidate._box(), clearance=clearance):
                        target.merge_in(candidate)
                        found = True
                        changed = True
                if not found:
                    merged.append(candidate)
            groups = merged
        return groups

    def merge_in(self, other):
        """Add ``other`` to this group and collapse the group to one record.

        Args:
            other: Either another ``Chars`` (all of its records are added)
                or a single character record.
        """
        if isinstance(other, Chars):
            self.chars.extend(other.chars)
        else:
            self.chars.append(other)
        self.merge()

    def merge(self):
        """Collapse ``self.chars`` to its first record, grown to the bounding box.

        The first record is updated in place with the bounding coordinates of
        all current records, ``self.n`` is increased by the number of records
        folded away, and ``self.chars`` is rebound to a one-element list.
        """
        self.n += len(self.chars) - 1
        box = self._box()
        low, high = box.corners
        merged = self.chars[0]
        merged["x0"] = low.x
        merged["x1"] = high.x
        merged["y0"] = low.y
        merged["y1"] = high.y
        self.chars = self.chars[:1]


class Box:
    """Axis-aligned rectangle described by two opposite corner points."""

    def __init__(self, corner1, corner2):
        self.corners = [corner1, corner2]
        self.diagonal = Line(corner1, corner2)

    def __str__(self):
        # Sort the numbers first and format afterwards: sorting the formatted
        # strings would order them lexicographically ("10.0" before "9.0").
        xs = sorted(corner.x for corner in self.corners)
        ys = sorted(corner.y for corner in self.corners)
        return f'x=[{xs[0]:.1f}..{xs[-1]:.1f}],y=[{ys[0]:.1f}..{ys[-1]:.1f}]'

    @staticmethod
    def from_char(char):
        """Build a ``Box`` from a record with ``x0``/``y0``/``x1``/``y1`` entries."""
        return Box(
            Point(char["x0"], char["y0"]),
            Point(char["x1"], char["y1"]),
        )

    def overlaps_x(self, other, clearance=0):
        """Return True if the x-ranges of both boxes overlap (see ``Line.overlaps``)."""
        return self.diagonal.overlaps_x(other.diagonal, clearance=clearance)

    def overlaps_y(self, other, clearance=0):
        """Return True if the y-ranges of both boxes overlap (see ``Line.overlaps``)."""
        return self.diagonal.overlaps_y(other.diagonal, clearance=clearance)

    def overlaps(self, other, clearance=0):
        """Return True if the boxes overlap on both axes."""
        return (
            self.overlaps_x(other, clearance=clearance)
            and self.overlaps_y(other, clearance=clearance)
        )

    def delta_x(self, other):
        """Return the horizontal gap to ``other``, or 0 if the x-ranges overlap."""
        if self.overlaps_x(other):
            return 0
        my_xs = sorted(corner.x for corner in self.corners)
        other_xs = sorted(corner.x for corner in other.corners)
        return Box.delta(my_xs[0], my_xs[1], other_xs[0], other_xs[1])

    def delta_y(self, other):
        """Return the vertical gap to ``other``, or 0 if the y-ranges overlap."""
        if self.overlaps_y(other):
            return 0
        my_ys = sorted(corner.y for corner in self.corners)
        other_ys = sorted(corner.y for corner in other.corners)
        return Box.delta(my_ys[0], my_ys[1], other_ys[0], other_ys[1])

    @staticmethod
    def delta(a0, a1, b0, b1):
        """Return the smallest distance between an endpoint of a and one of b."""
        return min(abs(diff) for diff in (a0 - b0, a0 - b1, a1 - b0, a1 - b1))


class Line:
    """Segment between two points, used for per-axis interval tests."""

    def __init__(self, pointA, pointB):
        self.pointA = pointA
        self.pointB = pointB

    def overlaps_x(self, other, clearance=0):
        """Return True if the x-extents of both segments overlap."""
        mine = self.xs()
        others = other.xs()
        return Line.overlaps(
            mine[0], mine[1],
            others[0], others[1],
            clearance=clearance,
        )

    def overlaps_y(self, other, clearance=0):
        """Return True if the y-extents of both segments overlap."""
        mine = self.ys()
        others = other.ys()
        return Line.overlaps(
            mine[0], mine[1],
            others[0], others[1],
            clearance=clearance,
        )

    def xs(self):
        """Return the two x coordinates in ascending order."""
        return sorted([self.pointA.x, self.pointB.x])

    def ys(self):
        """Return the two y coordinates in ascending order."""
        return sorted([self.pointA.y, self.pointB.y])

    @staticmethod
    def overlaps(my_min, my_max, other_min, other_max, clearance=0):
        """Return True if two closed intervals overlap or touch.

        Both intervals are widened by ``clearance`` on each side before the
        comparison, so intervals separated by a gap of up to twice the
        clearance still count as overlapping.
        """
        my_min -= clearance
        my_max += clearance
        other_min -= clearance
        other_max += clearance

        # my.. other..other ..my
        if my_min <= other_min and other_max <= my_max:
            return True
        # other.. my..my ..other
        if other_min <= my_min and my_max <= other_max:
            return True
        # my..other..my..other
        if my_min <= other_min <= my_max <= other_max:
            return True
        # other..my..other..my
        if other_min <= my_min <= other_max <= my_max:
            return True
        return False


class Point:
    """Plain 2-D coordinate pair."""

    def __init__(self, x, y):
        self.x = x
        self.y = y