169 lines
5.3 KiB
Python
169 lines
5.3 KiB
Python
import config
|
|
|
|
class Chars:
    """A group of character dicts on one page, with column-clustering helpers.

    Each entry of ``chars`` is a mapping with at least the keys ``"x0"``,
    ``"x1"``, ``"y0"`` and ``"y1"`` (the edges of the character's box).
    ``page`` is stored and handed on to every group created by
    ``divide_into_columns``; this class does not otherwise use it.
    """

    def __init__(self, chars, page):
        self.chars = chars
        self.page = page
        # Running count of merges folded into this group (see ``merge``).
        self.n = 0

    def _box(self):
        """Return the bounding ``Box`` around every char in ``self.chars``."""
        xs = [c["x0"] for c in self.chars]
        xs += [c["x1"] for c in self.chars]
        ys = [c["y0"] for c in self.chars]
        ys += [c["y1"] for c in self.chars]
        return Box(
            Point(min(xs), min(ys)),
            Point(max(xs), max(ys)),
        )

    def divide_into_columns(self, *, min_clearance=5, min_merges=2):
        """Cluster the chars into column-like groups and return them.

        The work happens in three steps:

        1. Measure the horizontal gap between each consecutive pair of chars
           whose boxes overlap vertically, and take the (upper) median of
           those gaps, floored at ``min_clearance``, as the merge clearance.
        2. Walk the chars in order, growing the current group while the next
           char overlaps its bounding box (with that clearance) and starting
           a new group otherwise. Groups with ``n <= min_merges`` are dropped.
        3. Repeatedly merge groups whose bounding boxes overlap horizontally
           until a full pass makes no merge.

        Args:
            min_clearance: Lower bound for the merge clearance; also used
                as the clearance when no consecutive pair shares a row.
            min_merges: A group from step 2 is kept only if its merge
                counter ``n`` is strictly greater than this value.

        Returns:
            A list of ``Chars`` objects, each holding a single char dict
            whose coordinates are the group's bounding box. The list is
            empty when there are no chars or no group survives step 2.

        Note:
            Merging writes the bounding box into the first char dict of each
            group in place, so some of the dicts in ``self.chars`` are
            modified by this call.
        """
        if not self.chars:
            return []

        # Step 1: typical horizontal spacing between neighbours on one row.
        gaps = []
        for current, following in zip(self.chars, self.chars[1:]):
            box_0 = Box.from_char(current)
            box_1 = Box.from_char(following)
            if box_0.overlaps_y(box_1):
                gaps.append(box_0.delta_x(box_1))
        if gaps:
            clearance = max([min_clearance, sorted(gaps)[len(gaps) // 2]])
        else:
            # No two consecutive chars share a row; fall back to the floor.
            clearance = min_clearance

        # Step 2: greedily merge each char into the running group.
        result = [Chars([self.chars[0]], self.page)]
        for char in self.chars[1:]:
            current_box = result[-1]._box()
            if current_box.overlaps(Box.from_char(char), clearance=clearance):
                result[-1].merge_in(char)
            else:
                result.append(Chars([char], self.page))
        result = [group for group in result if group.n > min_merges]
        if not result:
            return []

        # Step 3: merge groups that overlap horizontally, to a fixed point.
        changed = True
        while changed:
            changed = False
            merged = [result[0]]
            for sub in result[1:]:
                found = False
                # Deliberately no ``break``: ``sub`` is folded into every
                # group it overlaps, and those groups are then merged with
                # each other on a later pass.
                for sub2 in merged:
                    if sub2._box().overlaps_x(sub._box(), clearance=clearance):
                        sub2.merge_in(sub)
                        found = True
                        changed = True
                if not found:
                    merged.append(sub)
            result = merged

        return result

    def merge_in(self, other):
        """Absorb ``other`` (a ``Chars`` group or a single char dict)."""
        if isinstance(other, Chars):
            self.chars.extend(other.chars)
        else:
            self.chars.append(other)
        self.merge()

    def merge(self):
        """Collapse ``self.chars`` into one char spanning the bounding box.

        The first char dict is overwritten in place with the bounding-box
        coordinates, every other char is dropped, and ``n`` grows by the
        number of chars that were dropped.
        """
        self.n += len(self.chars) - 1
        box = self._box()
        self.chars[0]["x0"] = box.corners[0].x
        self.chars[0]["x1"] = box.corners[1].x
        self.chars[0]["y0"] = box.corners[0].y
        self.chars[0]["y1"] = box.corners[1].y
        self.chars = self.chars[:1]
|
|
|
|
class Box:
    """An axis-aligned rectangle described by two opposite corner points."""

    def __init__(self, corner1, corner2):
        self.corners = [corner1, corner2]
        # Overlap tests are delegated to the line joining the two corners.
        self.diagonal = Line(corner1, corner2)

    def __str__(self):
        """Return the extents as ``x=[lo..hi],y=[lo..hi]``, one decimal each."""
        # Sort the numbers, not their formatted text: as strings "10.0"
        # sorts before "9.0" and negative values come out reversed.
        xs = sorted(corner.x for corner in self.corners)
        ys = sorted(corner.y for corner in self.corners)
        return f'x=[{xs[0]:.1f}..{xs[-1]:.1f}],y=[{ys[0]:.1f}..{ys[-1]:.1f}]'

    @classmethod
    def from_char(cls, char):
        """Build a box from a mapping with ``x0``, ``y0``, ``x1``, ``y1`` keys."""
        return cls(
            Point(char["x0"], char["y0"]),
            Point(char["x1"], char["y1"]),
        )

    def overlaps_x(self, other, clearance=0):
        """Return whether the x-extents of the two boxes overlap.

        ``clearance`` is forwarded to ``Line.overlaps_x``.
        """
        return self.diagonal.overlaps_x(other.diagonal, clearance=clearance)

    def overlaps_y(self, other, clearance=0):
        """Return whether the y-extents of the two boxes overlap.

        ``clearance`` is forwarded to ``Line.overlaps_y``.
        """
        return self.diagonal.overlaps_y(other.diagonal, clearance=clearance)

    def overlaps(self, other, clearance=0):
        """Return whether the boxes overlap on both axes."""
        return (
            self.overlaps_x(other, clearance=clearance)
            and self.overlaps_y(other, clearance=clearance)
        )

    def delta_x(self, other):
        """Return the horizontal gap to ``other``, or 0 if they overlap in x."""
        if self.overlaps_x(other):
            return 0
        my_xs = sorted(corner.x for corner in self.corners)
        other_xs = sorted(corner.x for corner in other.corners)
        return Box.delta(my_xs[0], my_xs[1], other_xs[0], other_xs[1])

    def delta_y(self, other):
        """Return the vertical gap to ``other``, or 0 if they overlap in y."""
        if self.overlaps_y(other):
            return 0
        my_ys = sorted(corner.y for corner in self.corners)
        other_ys = sorted(corner.y for corner in other.corners)
        return Box.delta(my_ys[0], my_ys[1], other_ys[0], other_ys[1])

    @staticmethod
    def delta(a0, a1, b0, b1):
        """Return the smallest distance between an end of a and an end of b."""
        return min(abs(diff) for diff in (a0 - b0, a0 - b1, a1 - b0, a1 - b1))
|
|
|
|
class Line:
    """A segment between two points, used for per-axis interval overlap tests.

    Both points only need ``x`` and ``y`` attributes.
    """

    def __init__(self, pointA, pointB):
        self.pointA = pointA
        self.pointB = pointB

    def overlaps_x(self, other, clearance=0):
        """Return whether the x-ranges of the two lines overlap.

        See ``overlaps`` for how ``clearance`` widens the ranges.
        """
        mine = self.xs()
        others = other.xs()
        return Line.overlaps(
            mine[0], mine[1],
            others[0], others[1],
            clearance=clearance,
        )

    def overlaps_y(self, other, clearance=0):
        """Return whether the y-ranges of the two lines overlap.

        See ``overlaps`` for how ``clearance`` widens the ranges.
        """
        mine = self.ys()
        others = other.ys()
        return Line.overlaps(
            mine[0], mine[1],
            others[0], others[1],
            clearance=clearance,
        )

    def xs(self):
        """Return the two x-coordinates in ascending order."""
        return sorted([self.pointA.x, self.pointB.x])

    def ys(self):
        """Return the two y-coordinates in ascending order."""
        return sorted([self.pointA.y, self.pointB.y])

    @staticmethod
    def overlaps(my_min, my_max, other_min, other_max, clearance=0):
        """Return whether two closed 1-D intervals overlap.

        ``clearance`` widens BOTH intervals at both ends before the test, so
        two intervals separated by a gap of up to ``2 * clearance`` count as
        overlapping. Touching end points count as overlapping.

        Declared as a static method so it can be called either as
        ``Line.overlaps(...)`` or through an instance.
        """
        my_min -= clearance
        my_max += clearance
        other_min -= clearance
        other_max += clearance
        return (
            # my.. other..other ..my
            (my_min <= other_min and other_max <= my_max)
            # other.. my..my ..other
            or (other_min <= my_min and my_max <= other_max)
            # my..other..my..other
            or (my_min <= other_min and other_min <= my_max and my_max <= other_max)
            # other..my..other..my
            or (other_min <= my_min and my_min <= other_max and other_max <= my_max)
        )
|
|
|
|
class Point:
    """A 2-D coordinate pair exposing ``x`` and ``y`` attributes."""

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __repr__(self):
        # Readable form for debugging and log output.
        return f"{type(self).__name__}(x={self.x!r}, y={self.y!r})"
|