successful vertical split; can't handle funky conch shell
parent
4a0b7a9ff6
commit
035b1f8f6d
225
cluster.py
225
cluster.py
|
|
@ -1,105 +1,153 @@
|
|||
import config
|
||||
import debug
|
||||
|
||||
class Chars:
|
||||
def __init__(self, chars, page):
    """Store the char dicts and their page, and reset the merge counter.

    chars: list of char dicts; the other methods read their
        "x0" / "x1" / "y0" / "y1" keys.
    page: page object, handed on to sub-clusters and to debug drawing.
    """
    self.chars = chars
    self.page = page
    # Count of chars folded into chars[0]; merge() increments it.
    self.n = 0
|
||||
|
||||
def _box(self):
    """Return the Box that encloses every char in this cluster.

    Gathers both x edges and both y edges of each char dict and spans
    the Box from (min x, min y) to (max x, max y).

    Raises ValueError (from min()) when self.chars is empty.
    """
    xs = [char[key] for key in ("x0", "x1") for char in self.chars]
    ys = [char[key] for key in ("y0", "y1") for char in self.chars]
    return Box(
        Point(min(xs), min(ys)),
        Point(max(xs), max(ys)),
    )
|
||||
|
||||
def divide_into_columns(self):
|
||||
# for every sequential pair of chars on same y-coordinate
|
||||
# what is median distance?
|
||||
distances_when_sequential_and_same_y_coordinate = []
|
||||
for i in range(len(self.chars)-1):
|
||||
box_0 = Box.from_char(self.chars[i])
|
||||
box_1 = Box.from_char(self.chars[i+1])
|
||||
if box_0.overlaps_y(box_1):
|
||||
delta = box_0.delta_x(box_1)
|
||||
distances_when_sequential_and_same_y_coordinate.append(delta)
|
||||
median_x_delta_when_same_y = sorted(distances_when_sequential_and_same_y_coordinate)[len(distances_when_sequential_and_same_y_coordinate) // 2]
|
||||
median_x_delta_when_same_y = max([5, median_x_delta_when_same_y])
|
||||
|
||||
# merge all naive overlapping boxes
|
||||
result = [Chars([self.chars[0]], self.page)]
|
||||
for char in self.chars[1:]:
|
||||
if result[-1].overlapping_y_coordinates(char):
|
||||
result[-1].chars.append(char)
|
||||
if result[-1]._box().overlaps(Box.from_char(char), clearance=median_x_delta_when_same_y):
|
||||
result[-1].merge_in(char)
|
||||
else:
|
||||
result.append(Chars([char], self.page))
|
||||
# TODO: split clusters: find median horizontal distance between each item
|
||||
[i.merge() for i in result]
|
||||
#result = sorted(result, key=lambda x: x.chars[0]["y0"])
|
||||
#gaps = [ result[i+1].dist(result[i]) for i in range(0, len(result)-1)]
|
||||
#median_gap = sorted(gaps)[len(gaps)//2]
|
||||
#changed = True
|
||||
#iteration = 0
|
||||
#while changed:
|
||||
# iteration += 1
|
||||
# changed = False
|
||||
# for i in range(len(gaps)-1, 0, -1):
|
||||
# gap = gaps[i]
|
||||
# print(iteration, "//", gap < median_gap*2, "//", gap, "between", result[i].outer_bounds(), "and", result[i+1].outer_bounds(), "is <", median_gap, "*2")
|
||||
# if gap < median_gap*2:
|
||||
# result[i].chars.append(result[i+1].chars[0])
|
||||
# result[i].merge()
|
||||
# result = result[:i+1] + result[i+2:]
|
||||
# changed = True
|
||||
# result = sorted(result, key=lambda x: x.chars[0]["y0"])
|
||||
# gaps = [ result[i+1].dist(result[i]) for i in range(0, len(result)-1)]
|
||||
debug.draw_boxes(self.page, [i.chars[0] for i in result])
|
||||
result = [i for i in result if i.n > 2]
|
||||
|
||||
def merge(self):
    """Collapse the cluster into its first char, grown to the outer bounds.

    The first char dict is mutated in place; all other chars are dropped.
    NOTE(review): a second `merge` is defined further down this listing;
    if both live in the same class, the later definition wins.
    """
    # outer_bounds() returns (x_min, x_max, y_min, y_max).
    bounds = self.outer_bounds()
    merged = self.chars[0]
    merged["x0"], merged["x1"] = bounds[0], bounds[1]
    merged["y0"], merged["y1"] = bounds[2], bounds[3]
    self.chars = self.chars[:1]
|
||||
# merge all vertically overlapping boxes
|
||||
changed = True
|
||||
while changed:
|
||||
changed = False
|
||||
result2 = [result[0]]
|
||||
for sub in result[1:]:
|
||||
found = False
|
||||
for sub2 in result2:
|
||||
if sub2._box().overlaps_x(sub._box(), clearance=median_x_delta_when_same_y):
|
||||
sub2.merge_in(sub)
|
||||
found = True
|
||||
changed = True
|
||||
if not found:
|
||||
result2.append(sub)
|
||||
result = result2
|
||||
|
||||
def outer_bounds(self):
    """Return (x_min, x_max, y_min, y_max) across every char in the cluster.

    Raises IndexError when self.chars is empty.
    """
    first = self.chars[0]
    x_min, x_max = first["x0"], first["x1"]
    y_min, y_max = first["y0"], first["y1"]
    for char in self.chars[1:]:
        x_min = min(x_min, char["x0"])
        x_max = max(x_max, char["x1"])
        # Fixed: a lower y0 used to be written into x_min, corrupting the
        # x range and leaving y_min stuck at the first char's value.
        y_min = min(y_min, char["y0"])
        y_max = max(y_max, char["y1"])
    return (x_min, x_max, y_min, y_max)
|
||||
|
||||
def dist(self, other):
    """Return the squared gap between this cluster's bounds and other's.

    Each axis contributes 0 when Chars.char_overlaps reports that the two
    ranges overlap (its body is outside this view), otherwise the smallest
    absolute difference between any edge of self and any edge of other.
    The result is x_gap ** 2 + y_gap ** 2; no square root is taken.
    """
    mine = self.outer_bounds()
    theirs = other.outer_bounds()

    def axis_gap(lo, hi):
        # lo / hi index the (min, max) pair of one axis in the bounds tuples.
        if Chars.char_overlaps(mine[lo], mine[hi], theirs[lo], theirs[hi]):
            return 0
        return min(
            abs(a - b)
            for a in (mine[lo], mine[hi])
            for b in (theirs[lo], theirs[hi])
        )

    x_delta = axis_gap(0, 1)
    y_delta = axis_gap(2, 3)
    return x_delta ** 2 + y_delta ** 2
|
||||
|
||||
def overlapping_y_coordinates(self, other_char):
    """Return True if other_char overlaps in y with any char of the cluster.

    Stops at the first overlapping char; returns False for an empty cluster.
    """
    return any(
        Chars.char_overlapping_y_coordinates(other_char, own_char)
        for own_char in self.chars
    )
|
||||
|
||||
def char_overlapping_y_coordinates(candidate, established):
    """Return whether two char dicts overlap on the y axis.

    Takes no self: it is called through the class as
    Chars.char_overlapping_y_coordinates(candidate, established).
    The decision is delegated to Chars.char_overlaps with the established
    char's y range first and the candidate's second.
    """
    result = Chars.char_overlaps(
        established["y0"],
        established["y1"],
        candidate["y0"],
        candidate["y1"],
    )
    # NOTE(review): debug trace printed on every comparison; consider
    # removing it or routing it through the debug module.
    print(established["y0"], "..", established["y1"], result, candidate["y0"], "..", candidate["y1"])
    return result
|
||||
|
||||
def char_overlaps(my_min, my_max, other_min, other_max):
|
||||
def merge_in(self, other):
    """Absorb another cluster, or a single char dict, then re-merge.

    other: a Chars instance (all of its chars are taken over) or one
        char dict (appended as is).
    """
    if isinstance(other, Chars):
        self.chars.extend(other.chars)
    else:
        self.chars.append(other)
    # NOTE(review): indentation was lost in this listing; merge() is
    # assumed to run for both branches rather than only the else branch.
    self.merge()
|
||||
|
||||
def merge(self):
    """Fold every char into chars[0], grown to the cluster's bounding box.

    Adds the number of absorbed chars (all but the first) to self.n,
    overwrites the first char dict's x0/x1/y0/y1 in place with the
    corners of self._box(), and drops the remaining chars.
    """
    self.n += len(self.chars) - 1
    box = self._box()
    low_corner = box.corners[0]
    high_corner = box.corners[1]
    merged = self.chars[0]
    merged["x0"], merged["x1"] = low_corner.x, high_corner.x
    merged["y0"], merged["y1"] = low_corner.y, high_corner.y
    self.chars = self.chars[:1]
|
||||
|
||||
class Box:
    """Axis-aligned rectangle described by two opposite corner Points."""

    def __init__(self, corner1, corner2):
        """Keep both corners and the diagonal Line that joins them."""
        self.corners = [corner1, corner2]
        # All overlap checks are delegated to the diagonal's axis ranges.
        self.diagonal = Line(corner1, corner2)

    def __str__(self):
        """Format as 'x=[lo..hi],y=[lo..hi]' with one decimal place."""
        # Sort the numbers, then format. Sorting the formatted strings
        # compared them lexicographically and put "10.0" before "9.0".
        xs = ["{:.1f}".format(v) for v in sorted(c.x for c in self.corners)]
        ys = ["{:.1f}".format(v) for v in sorted(c.y for c in self.corners)]
        return f'x=[{xs[0]}..{xs[-1]}],y=[{ys[0]}..{ys[-1]}]'

    @staticmethod
    def from_char(char):
        """Build a Box from a char dict's x0/y0 and x1/y1 entries."""
        return Box(
            Point(char["x0"], char["y0"]),
            Point(char["x1"], char["y1"]),
        )

    def overlaps_x(self, other, clearance=0):
        """Return whether the two boxes' x ranges overlap (see Line.overlaps_x)."""
        return self.diagonal.overlaps_x(other.diagonal, clearance=clearance)

    def overlaps_y(self, other, clearance=0):
        """Return whether the two boxes' y ranges overlap (see Line.overlaps_y)."""
        return self.diagonal.overlaps_y(other.diagonal, clearance=clearance)

    def overlaps(self, other, clearance=0):
        """Return True only when the boxes overlap on both axes."""
        return (
            self.overlaps_x(other, clearance=clearance)
            and self.overlaps_y(other, clearance=clearance)
        )

    def delta_x(self, other):
        """Return the horizontal gap to other, or 0 when the x ranges overlap."""
        if self.overlaps_x(other):
            return 0
        my_lo, my_hi = sorted(c.x for c in self.corners)
        other_lo, other_hi = sorted(c.x for c in other.corners)
        return Box.delta(my_lo, my_hi, other_lo, other_hi)

    def delta_y(self, other):
        """Return the vertical gap to other, or 0 when the y ranges overlap."""
        if self.overlaps_y(other):
            return 0
        my_lo, my_hi = sorted(c.y for c in self.corners)
        other_lo, other_hi = sorted(c.y for c in other.corners)
        return Box.delta(my_lo, my_hi, other_lo, other_hi)

    @staticmethod
    def delta(a0, a1, b0, b1):
        """Return the smallest absolute difference between an a-edge and a b-edge."""
        return min(abs(a0 - b0), abs(a0 - b1), abs(a1 - b0), abs(a1 - b1))
|
||||
|
||||
class Line:
|
||||
def __init__(self, pointA, pointB):
    """Keep the two end points; xs() and ys() read their .x and .y."""
    self.pointA = pointA
    self.pointB = pointB
|
||||
|
||||
def overlaps_x(self, other, clearance=0):
    """Return whether this line's x range overlaps other's.

    Both ranges come from xs() (sorted low to high) and are handed to
    Line.overlaps together with the clearance.
    """
    my_lo, my_hi = self.xs()
    other_lo, other_hi = other.xs()
    return Line.overlaps(my_lo, my_hi, other_lo, other_hi, clearance=clearance)
|
||||
|
||||
def overlaps_y(self, other, clearance=0):
    """Return whether this line's y range overlaps other's.

    Both ranges come from ys() (sorted low to high) and are handed to
    Line.overlaps together with the clearance.
    """
    my_lo, my_hi = self.ys()
    other_lo, other_hi = other.ys()
    return Line.overlaps(my_lo, my_hi, other_lo, other_hi, clearance=clearance)
|
||||
|
||||
def xs(self):
    """Return the two end points' x values as a list, lowest first."""
    return sorted([self.pointA.x, self.pointB.x])
|
||||
|
||||
def ys(self):
    """Return the two end points' y values as a list, lowest first."""
    return sorted([self.pointA.y, self.pointB.y])
|
||||
|
||||
def overlaps(my_min, my_max, other_min, other_max, clearance=0):
|
||||
my_min -= clearance
|
||||
my_max += clearance
|
||||
other_min -= clearance
|
||||
other_max += clearance
|
||||
# my.. other..other ..my
|
||||
if my_min <= other_min and other_max <= my_max:
|
||||
return True
|
||||
|
|
@ -113,3 +161,8 @@ class Chars:
|
|||
elif other_min <= my_min and my_min <= other_max and other_max <= my_max:
|
||||
return True
|
||||
return False
|
||||
|
||||
class Point:
    """A plain 2-D coordinate pair."""

    def __init__(self, x, y):
        """Store the x and y coordinates unchanged."""
        self.x = x
        self.y = y

    def __repr__(self):
        """Return a constructor-style representation for debugging."""
        return f"Point({self.x!r}, {self.y!r})"
|
||||
|
|
|
|||
5
debug.py
5
debug.py
|
|
@ -10,6 +10,11 @@ def draw_boxes(page, boxes):
|
|||
im.draw_line(((box["x1"], page.height - box["y0"]), (box["x1"], page.height - box["y1"])))
|
||||
im.draw_line(((box["x1"], page.height - box["y1"]), (box["x0"], page.height - box["y1"])))
|
||||
im.draw_line(((box["x0"], page.height - box["y1"]), (box["x0"], page.height - box["y0"])))
|
||||
if "debug_label" in box:
|
||||
im.draw.text(
|
||||
xy=(box["x0"], page.height-box["y0"]),
|
||||
text=str(box["debug_label"]),
|
||||
)
|
||||
debug_show(im)
|
||||
|
||||
def debug_im(page):
|
||||
|
|
|
|||
|
|
@ -2,12 +2,32 @@ import unittest
|
|||
|
||||
import cluster
|
||||
import pdfplumber
|
||||
import debug
|
||||
|
||||
class TestChars(unittest.TestCase):
|
||||
def test_divide_into_columns(self):
|
||||
with pdfplumber.open("./testdata/2-column_2-row.pdf") as pdf:
|
||||
for page in pdf.pages:
|
||||
cluster.Chars(page.chars, page).divide_into_columns()
|
||||
for p in [
|
||||
"./testdata/1-column_half-image.pdf",
|
||||
"./testdata/2-column_2-row.pdf",
|
||||
"./testdata/2-column_fancy-font.pdf",
|
||||
"./testdata/2-column_happy.pdf",
|
||||
"./testdata/2-column_non-interrupting-image.pdf",
|
||||
]:
|
||||
with pdfplumber.open(p) as pdf:
|
||||
for page in pdf.pages:
|
||||
got = cluster.Chars(page.chars, page).divide_into_columns()
|
||||
print(p)
|
||||
debug.draw_boxes(page, [
|
||||
{
|
||||
"x0": i.chars[0]["x0"],
|
||||
"x1": i.chars[0]["x1"],
|
||||
"y0": i.chars[0]["y0"],
|
||||
"y1": i.chars[0]["y1"],
|
||||
"debug_label": i.n,
|
||||
}
|
||||
for i in got
|
||||
])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
|
|||
Loading…
Reference in New Issue