import config
import debug


class Chars:
    """A group of character boxes that belong to one page.

    Every character is a mutable mapping that provides at least the keys
    ``"x0"``, ``"x1"``, ``"y0"`` and ``"y1"`` (the only keys this class
    reads or writes).
    """

    def __init__(self, chars, page):
        """Store the character list and the page it was taken from.

        Args:
            chars: List of character mappings (see the class docstring).
            page: Page object; only forwarded to ``debug.draw_boxes``.
        """
        self.chars = chars
        self.page = page

    def divide_into_columns(self):
        """Group consecutive characters by vertical overlap and draw them.

        Walks ``self.chars`` in order. A character joins the current group
        when its y-range overlaps the y-range of any character already in
        that group; otherwise it starts a new group. Each group is then
        collapsed to a single bounding box with ``merge()`` and the boxes
        are passed to ``debug.draw_boxes``.

        Note that ``merge()`` overwrites the coordinates of the first
        character mapping of every group in place, so the mappings held in
        ``self.chars`` are modified as a side effect.

        Returns:
            None. Does nothing when ``self.chars`` is empty.
        """
        # Guard against an empty group: indexing chars[0] would raise.
        if not self.chars:
            return

        result = [Chars([self.chars[0]], self.page)]
        for char in self.chars[1:]:
            if result[-1].overlapping_y_coordinates(char):
                result[-1].chars.append(char)
            else:
                result.append(Chars([char], self.page))

        # TODO: split clusters: find median horizontal distance between each item
        for group in result:
            group.merge()

        # Unfinished draft of the clustering step referred to by the TODO:
        #
        # result = sorted(result, key=lambda x: x.chars[0]["y0"])
        # gaps = [ result[i+1].dist(result[i]) for i in range(0, len(result)-1)]
        # median_gap = sorted(gaps)[len(gaps)//2]
        # changed = True
        # iteration = 0
        # while changed:
        #     iteration += 1
        #     changed = False
        #     for i in range(len(gaps)-1, 0, -1):
        #         gap = gaps[i]
        #         print(iteration, "//", gap < median_gap*2, "//", gap, "between", result[i].outer_bounds(), "and", result[i+1].outer_bounds(), "is <", median_gap, "*2")
        #         if gap < median_gap*2:
        #             result[i].chars.append(result[i+1].chars[0])
        #             result[i].merge()
        #             result = result[:i+1] + result[i+2:]
        #             changed = True
        #     result = sorted(result, key=lambda x: x.chars[0]["y0"])
        #     gaps = [ result[i+1].dist(result[i]) for i in range(0, len(result)-1)]

        debug.draw_boxes(self.page, [group.chars[0] for group in result])

    def merge(self):
        """Collapse the group into its first character.

        The first character mapping is overwritten in place with the outer
        bounds of the whole group, and all other characters are dropped
        from ``self.chars``.
        """
        x_min, x_max, y_min, y_max = self.outer_bounds()
        first = self.chars[0]
        first["x0"] = x_min
        first["x1"] = x_max
        first["y0"] = y_min
        first["y1"] = y_max
        self.chars = self.chars[:1]

    def outer_bounds(self):
        """Return the bounding box enclosing every character in the group.

        Returns:
            Tuple ``(x_min, x_max, y_min, y_max)``. Note the order: both x
            values come first, then both y values.

        Raises:
            IndexError: If the group contains no characters.
        """
        first = self.chars[0]
        x_min, x_max = first["x0"], first["x1"]
        y_min, y_max = first["y0"], first["y1"]
        for char in self.chars[1:]:
            x_min = min(x_min, char["x0"])
            x_max = max(x_max, char["x1"])
            # The lower y bound must update y_min; it previously overwrote
            # x_min, corrupting the left edge and never lowering y_min.
            y_min = min(y_min, char["y0"])
            y_max = max(y_max, char["y1"])
        return (x_min, x_max, y_min, y_max)

    def dist(self, other):
        """Return the squared distance between two groups' bounding boxes.

        Along each axis the gap is 0 when the two ranges overlap (as decided
        by ``char_overlaps``); otherwise it is the smallest absolute
        difference between any pair of range endpoints.

        Args:
            other: Another ``Chars`` instance.

        Returns:
            ``x_gap ** 2 + y_gap ** 2`` (no square root is taken).
        """
        my_x0, my_x1, my_y0, my_y1 = self.outer_bounds()
        other_x0, other_x1, other_y0, other_y1 = other.outer_bounds()
        x_delta = Chars._axis_gap(my_x0, my_x1, other_x0, other_x1)
        y_delta = Chars._axis_gap(my_y0, my_y1, other_y0, other_y1)
        return x_delta ** 2 + y_delta ** 2

    @staticmethod
    def _axis_gap(my_min, my_max, other_min, other_max):
        """Return the gap between two 1-D ranges, or 0 if they overlap."""
        if Chars.char_overlaps(my_min, my_max, other_min, other_max):
            return 0
        return min(
            abs(my_min - other_min),
            abs(my_min - other_max),
            abs(my_max - other_min),
            abs(my_max - other_max),
        )

    def overlapping_y_coordinates(self, other_char):
        """Tell whether ``other_char`` overlaps any group member vertically.

        Args:
            other_char: Character mapping with ``"y0"`` and ``"y1"``.

        Returns:
            True as soon as one character of the group has a y-range that
            overlaps the y-range of ``other_char``; False otherwise
            (including for an empty group).
        """
        return any(
            Chars.char_overlapping_y_coordinates(other_char, self_char)
            for self_char in self.chars
        )

    @staticmethod
    def char_overlapping_y_coordinates(candidate, established):
        """Tell whether two characters overlap vertically.

        Also prints both y-ranges and the verdict to stdout as a debug
        trace.

        Args:
            candidate: Character mapping being tested.
            established: Character mapping already part of a group.

        Returns:
            Result of ``char_overlaps`` applied to the two y-ranges.
        """
        result = Chars.char_overlaps(
            established["y0"], established["y1"],
            candidate["y0"], candidate["y1"],
        )
        print(established["y0"], "..", established["y1"], result, candidate["y0"], "..", candidate["y1"])
        return result

    @staticmethod
    def char_overlaps(my_min, my_max, other_min, other_max):
        """Tell whether the ranges [my_min, my_max] and [other_min, other_max] overlap.

        Ranges that merely touch at an endpoint count as overlapping, since
        every comparison is inclusive.

        Returns:
            True if one of the four arrangements below holds, else False.
        """
        # my.. other..other ..my  (other lies inside my)
        if my_min <= other_min and other_max <= my_max:
            return True
        # other.. my..my ..other  (my lies inside other)
        if other_min <= my_min and my_max <= other_max:
            return True
        # my..other..my..other  (other starts inside my and ends after it)
        if my_min <= other_min <= my_max <= other_max:
            return True
        # other..my..other..my  (my starts inside other and ends after it)
        if other_min <= my_min <= other_max <= my_max:
            return True
        return False