dnd-pdf-to-txt/cluster.py

225 lines
7.5 KiB
Python

import config
import pypdf
import pdfplumber
class Chars:
    """A group of PDF character records on one page, reducible to one box.

    Every element of ``chars`` is a mapping that provides at least the keys
    ``"x0"``, ``"x1"``, ``"y0"`` and ``"y1"`` (presumably pdfplumber char
    dicts -- confirm against the caller).  ``n`` counts how many records
    have been folded into the group by :meth:`merge`.
    """

    def __init__(self, path, chars, page):
        """Remember the source PDF path, the character records and the page.

        ``page`` must expose ``page_number``; ``divide_into_columns`` uses
        it as a 1-based index into the pypdf page list.
        """
        self.path = path
        self.chars = chars
        self.page = page
        # Number of records absorbed by merge(); a fresh group starts at 0.
        self.n = 0

    def _box(self):
        """Return the smallest ``Box`` enclosing every record in ``chars``.

        Raises ``ValueError`` (from ``min``) when ``chars`` is empty.
        """
        xs = [c["x0"] for c in self.chars] + [c["x1"] for c in self.chars]
        ys = [c["y0"] for c in self.chars] + [c["y1"] for c in self.chars]
        return Box(
            Point(min(xs), min(ys)),
            Point(max(xs), max(ys)),
        )

    def divide_into_columns(self):
        """Cluster the page's characters into columns; crop one PDF per column.

        The algorithm runs in five passes:

        1. Take the (upper) median character height and the (upper) median
           horizontal gap between consecutive characters whose y-ranges
           overlap.  The gap, floored at 5, is the merge clearance.
        2. Walk the characters in order, growing the current cluster while
           the next character overlaps its bounding box (with clearance);
           clusters that absorbed two or fewer records are dropped.
        3. Drop clusters that are shorter than the median height and lie
           entirely outside the y-span of the median-or-taller characters
           (treated as header/footer).
        4. Repeatedly merge clusters whose x-ranges overlap until a full
           pass changes nothing.
        5. For each surviving cluster, write a single-page PDF to
           ``config.TEMP_DIR`` whose trim/crop boxes are set from the
           cluster bounds, and point the cluster at that file.

        Returns:
            ``[self]`` when there are no characters at all; ``[]`` when no
            cluster survives the filters; otherwise the list of merged
            ``Chars`` clusters, each holding exactly one record.

        Side effects:
            ``merge`` rewrites the first record of every cluster in place,
            and those records are the same objects held in ``self.chars``.
        """
        heights = [c["y1"] - c["y0"] for c in self.chars]
        if not heights:
            return [self]
        median_height = sorted(heights)[len(heights) // 2]
        tall_chars = sorted(
            (c for c in self.chars if c["y1"] - c["y0"] >= median_height),
            key=lambda c: c["y0"],
        )

        # Horizontal gaps between consecutive characters on the same line.
        gaps = []
        for left, right in zip(self.chars, self.chars[1:]):
            left_box = Box.from_char(left)
            right_box = Box.from_char(right)
            if left_box.overlaps_y(right_box):
                gaps.append(left_box.delta_x(right_box))
        # A page with no same-line neighbours (e.g. a single character) has
        # no gaps to take a median of; fall back to the floor value alone.
        median_gap = sorted(gaps)[len(gaps) // 2] if gaps else 0
        clearance = max(5, median_gap)

        # Pass 2: merge each character into the running cluster if it touches.
        clusters = []
        for char in self.chars:
            if clusters and clusters[-1]._box().overlaps(
                Box.from_char(char), clearance=clearance
            ):
                clusters[-1].merge_in(char)
            else:
                clusters.append(Chars(self.path, [char], self.page))
        clusters = [cluster for cluster in clusters if cluster.n > 2]

        # Pass 3: header/footer removal.  The limits are read only now, after
        # the merge pass, because merge() may already have rewritten these
        # very records in place.
        low_limit = tall_chars[0]["y0"]
        high_limit = tall_chars[-1]["y1"]
        kept = []
        for cluster in clusters:
            cluster_box = cluster._box()
            is_small = cluster_box.y1 - cluster_box.y0 < median_height
            is_outside = cluster_box.y1 < low_limit or cluster_box.y0 > high_limit
            if not (is_small and is_outside):
                kept.append(cluster)
        clusters = kept
        if not clusters:
            # Everything was filtered out: there are no columns to report.
            return []

        # Pass 4: merge clusters that share an x-range.  A candidate may be
        # absorbed by more than one target in a single pass; the following
        # pass then merges those targets with each other.
        changed = True
        while changed:
            changed = False
            merged = [clusters[0]]
            for candidate in clusters[1:]:
                found = False
                for target in merged:
                    if target._box().overlaps_x(
                        candidate._box(), clearance=clearance
                    ):
                        target.merge_in(candidate)
                        found = True
                        changed = True
                if not found:
                    merged.append(candidate)
            clusters = merged

        # Pass 5: emit one cropped single-page PDF per column.
        for index, cluster in enumerate(clusters, start=1):
            cluster.merge()
            assert len(cluster.chars) == 1
            bounds = cluster._box()
            # The source is re-read for every column so each crop starts
            # from an unmodified page object.
            original_reader = pypdf.PdfReader(self.path)
            modified_writer = pypdf.PdfWriter()
            modified_page = original_reader.pages[self.page.page_number - 1]
            modified_page.trimbox.upper_right = (bounds.x0, bounds.y0)
            modified_page.trimbox.upper_left = (bounds.x1, bounds.y0)
            modified_page.trimbox.lower_right = (bounds.x0, bounds.y1)
            modified_page.trimbox.lower_left = (bounds.x1, bounds.y1)
            # The crop box is padded vertically by one median character height.
            modified_page.cropbox.upper_right = (bounds.x0, bounds.y0 - median_height)
            modified_page.cropbox.upper_left = (bounds.x1, bounds.y0 - median_height)
            modified_page.cropbox.lower_right = (bounds.x0, bounds.y1 + median_height)
            modified_page.cropbox.lower_left = (bounds.x1, bounds.y1 + median_height)
            modified_writer.add_page(modified_page)
            modified_path = "{}/{}-{:03d}-{}.modified.pdf".format(
                config.TEMP_DIR,
                self.path.split("/")[-1],
                self.page.page_number,
                index,
            )
            with open(modified_path, "wb") as mwf:
                modified_writer.write(mwf)
            # NOTE(review): the page object outlives this ``with`` block;
            # confirm pdfplumber pages stay usable once their PDF is closed.
            with pdfplumber.open(modified_path) as modified_pdf:
                cluster.path = modified_path
                cluster.page = modified_pdf.pages[0]
        return clusters

    def merge_in(self, other):
        """Absorb ``other`` (a ``Chars`` group or one record) and re-merge."""
        if isinstance(other, Chars):
            self.chars.extend(other.chars)
        else:
            self.chars.append(other)
        self.merge()

    def merge(self):
        """Collapse ``chars`` to one record spanning the group's bounding box.

        The first record is rewritten in place with the bounding
        coordinates and every other record is discarded; ``n`` grows by the
        number of records discarded.
        """
        self.n += len(self.chars) - 1
        box = self._box()
        first = self.chars[0]
        first["x0"] = box.corners[0].x
        first["x1"] = box.corners[1].x
        first["y0"] = box.corners[0].y
        first["y1"] = box.corners[1].y
        self.chars = self.chars[:1]
class Box:
    """Axis-aligned rectangle described by two opposite corner points.

    ``corners`` keeps the points as given; ``x0 <= x1`` and ``y0 <= y1`` are
    the normalised extents, and ``diagonal`` is the ``Line`` joining the two
    corners, which performs the interval-overlap tests.
    """

    def __init__(self, corner1, corner2):
        """Build the box from two points exposing ``x`` and ``y``."""
        self.corners = [corner1, corner2]
        self.diagonal = Line(corner1, corner2)
        self.x0 = min(corner1.x, corner2.x)
        self.x1 = max(corner1.x, corner2.x)
        self.y0 = min(corner1.y, corner2.y)
        self.y1 = max(corner1.y, corner2.y)

    def __str__(self):
        return f'x=[{self.x0},{self.x1}],y=[{self.y0},{self.y1}]'

    @classmethod
    def from_char(cls, char):
        """Build a box from a mapping with ``x0``, ``y0``, ``x1``, ``y1`` keys."""
        return cls(
            Point(char["x0"], char["y0"]),
            Point(char["x1"], char["y1"]),
        )

    def overlaps_x(self, other, clearance=0):
        """Return whether the x-extents overlap (see ``Line.overlaps_x``)."""
        return self.diagonal.overlaps_x(other.diagonal, clearance=clearance)

    def overlaps_y(self, other, clearance=0):
        """Return whether the y-extents overlap (see ``Line.overlaps_y``)."""
        return self.diagonal.overlaps_y(other.diagonal, clearance=clearance)

    def overlaps(self, other, clearance=0):
        """Return whether the boxes overlap on both axes."""
        return (
            self.overlaps_x(other, clearance=clearance)
            and self.overlaps_y(other, clearance=clearance)
        )

    def delta_x(self, other):
        """Return the horizontal gap to ``other``; 0 if the x-extents overlap."""
        if self.overlaps_x(other):
            return 0
        return Box.delta(self.x0, self.x1, other.x0, other.x1)

    def delta_y(self, other):
        """Return the vertical gap to ``other``; 0 if the y-extents overlap."""
        if self.overlaps_y(other):
            return 0
        return Box.delta(self.y0, self.y1, other.y0, other.y1)

    @staticmethod
    def delta(a0, a1, b0, b1):
        """Return the smallest distance between an endpoint of a and one of b."""
        return min(abs(d) for d in (a0 - b0, a0 - b1, a1 - b0, a1 - b1))
class Line:
    """Segment between two points, used for per-axis interval overlap tests."""

    def __init__(self, pointA, pointB):
        """Store the two endpoints; each must expose ``x`` and ``y``."""
        self.pointA = pointA
        self.pointB = pointB

    def overlaps_x(self, other, clearance=0):
        """Return whether the x-intervals of the two segments overlap."""
        my_min, my_max = self.xs()
        other_min, other_max = other.xs()
        return Line.overlaps(
            my_min, my_max, other_min, other_max, clearance=clearance
        )

    def overlaps_y(self, other, clearance=0):
        """Return whether the y-intervals of the two segments overlap."""
        my_min, my_max = self.ys()
        other_min, other_max = other.ys()
        return Line.overlaps(
            my_min, my_max, other_min, other_max, clearance=clearance
        )

    def xs(self):
        """Return the endpoints' x-coordinates in ascending order."""
        return sorted([self.pointA.x, self.pointB.x])

    def ys(self):
        """Return the endpoints' y-coordinates in ascending order."""
        return sorted([self.pointA.y, self.pointB.y])

    @staticmethod
    def overlaps(my_min, my_max, other_min, other_max, clearance=0):
        """Return whether two closed intervals overlap or touch.

        Both intervals are widened by ``clearance`` on each side before the
        comparison, so intervals up to ``2 * clearance`` apart still count
        as overlapping.
        """
        lo_a = my_min - clearance
        hi_a = my_max + clearance
        lo_b = other_min - clearance
        hi_b = other_max + clearance
        return (
            # a fully contains b
            (lo_a <= lo_b and hi_b <= hi_a)
            # b fully contains a
            or (lo_b <= lo_a and hi_a <= hi_b)
            # a starts first, b starts inside a and ends at or after a's end
            or (lo_a <= lo_b <= hi_a <= hi_b)
            # b starts first, a starts inside b and ends at or after b's end
            or (lo_b <= lo_a <= hi_b <= hi_a)
        )
class Point:
    """A 2-D coordinate pair exposed as the attributes ``x`` and ``y``."""

    def __init__(self, x, y):
        """Record the horizontal (``x``) and vertical (``y``) coordinates."""
        self.x, self.y = x, y