import config
import pypdf
import pdfplumber


class Chars:
    """A group of character dicts taken from one page of a PDF file.

    Each character is a mapping that provides at least the keys ``"x0"``,
    ``"x1"``, ``"y0"`` and ``"y1"`` (presumably pdfplumber char dicts --
    confirm against the caller).

    Attributes:
        path: Path of the PDF file the characters come from.
        chars: List of character dicts.
        page: Page object; ``page.page_number`` is read as a 1-based index.
        n: Number of characters that have been folded into ``chars[0]`` by
            ``merge``.
    """

    def __init__(self, path, chars, page):
        self.path = path
        self.chars = chars
        self.page = page
        self.n = 0

    def _box(self):
        """Return the bounding ``Box`` of every character in ``self.chars``.

        Raises:
            ValueError: If ``self.chars`` is empty (``min`` of no values).
        """
        xs = [c["x0"] for c in self.chars] + [c["x1"] for c in self.chars]
        ys = [c["y0"] for c in self.chars] + [c["y1"] for c in self.chars]
        return Box(
            Point(min(xs), min(ys)),
            Point(max(xs), max(ys)),
        )

    def divide_into_columns(self):
        """Split the characters into columns and write one cropped PDF each.

        Steps:
          1. Take the (upper) median character height and the median
             horizontal gap between consecutive characters that overlap in
             y; the gap is floored at 5 and used as merge clearance.
          2. Greedily cluster consecutive characters whose boxes overlap
             within that clearance, keeping clusters with more than two
             merged characters.
          3. Drop small clusters lying entirely beyond the lowest / highest
             character of at least median height (header / footer).
          4. Repeatedly merge clusters whose x-ranges overlap.
          5. For each remaining cluster, write a single-page PDF under
             ``config.TEMP_DIR`` with trim/crop boxes set from the cluster
             bounds and point the cluster at that file and its first page.

        Returns:
            ``[self]`` when there are no characters, ``[]`` when no cluster
            survives filtering, otherwise the list of column ``Chars``.

        NOTE(review): clustering goes through ``merge``, which rewrites the
        first character dict of each cluster in place; those dicts are the
        same objects held in ``self.chars``, so the caller's data changes.
        """
        heights = [c["y1"] - c["y0"] for c in self.chars]
        if not heights:
            return [self]
        median_height = sorted(heights)[len(heights) // 2]
        at_least_median_height = sorted(
            (c for c in self.chars if c["y1"] - c["y0"] >= median_height),
            key=lambda c: c["y0"],
        )

        # Median horizontal gap between consecutive characters that share a
        # y-range.
        gaps = []
        for first, second in zip(self.chars, self.chars[1:]):
            box_0 = Box.from_char(first)
            box_1 = Box.from_char(second)
            if box_0.overlaps_y(box_1):
                gaps.append(box_0.delta_x(box_1))
        # With no same-row neighbours (e.g. a single character) there is no
        # median to take; 0 lets the floor of 5 below apply instead of
        # raising IndexError on an empty list.
        median_gap = sorted(gaps)[len(gaps) // 2] if gaps else 0
        median_x_delta_when_same_y = max([5, median_gap])

        # Greedy pass: extend the current cluster while the next character
        # overlaps its bounding box (with clearance), else start a new one.
        result = []
        for char in self.chars:
            if result and result[-1]._box().overlaps(
                Box.from_char(char), clearance=median_x_delta_when_same_y
            ):
                result[-1].merge_in(char)
            else:
                result.append(Chars(self.path, [char], self.page))
        result = [cluster for cluster in result if cluster.n > 2]

        # Clusters shorter than the median character height that lie wholly
        # outside the span of the tall characters are header/footer.
        # NOTE(review): the threshold dicts may already have been rewritten
        # by merge() above if they seeded a cluster -- confirm intended.
        lowest_tall = at_least_median_height[0]
        highest_tall = at_least_median_height[-1]
        kept = []
        for cluster in result:
            cluster_box = cluster._box()
            are_small = cluster_box.y1 - cluster_box.y0 < median_height
            are_header = cluster_box.y1 < lowest_tall["y0"]
            are_footer = cluster_box.y0 > highest_tall["y1"]
            if not (are_small and (are_header or are_footer)):
                kept.append(cluster)
        result = kept

        # Everything was filtered out: there are no columns to emit, and
        # the merge loop below would fail on result[0].
        if not result:
            return []

        # Merge clusters whose x-ranges overlap until a pass changes nothing.
        changed = True
        while changed:
            changed = False
            merged = [result[0]]
            for sub in result[1:]:
                found = False
                for sub2 in merged:
                    if sub2._box().overlaps_x(
                        sub._box(), clearance=median_x_delta_when_same_y
                    ):
                        sub2.merge_in(sub)
                        found = True
                        changed = True
                if not found:
                    merged.append(sub)
            result = merged

        for j, column in enumerate(result, start=1):
            column.merge()
            assert len(column.chars) == 1
            bounds = column._box()

            # A fresh reader per column gives an unmodified page object.
            original_reader = pypdf.PdfReader(self.path)
            modified_writer = pypdf.PdfWriter()
            modified_page = original_reader.pages[self.page.page_number - 1]

            # NOTE(review): the corner names look swapped relative to the
            # coordinates assigned (e.g. upper_right receives x0, y0); the
            # four assignments together overwrite the whole rectangle.
            # Kept exactly as-is -- confirm the intended orientation.
            modified_page.trimbox.upper_right = (bounds.x0, bounds.y0)
            modified_page.trimbox.upper_left = (bounds.x1, bounds.y0)
            modified_page.trimbox.lower_right = (bounds.x0, bounds.y1)
            modified_page.trimbox.lower_left = (bounds.x1, bounds.y1)
            # The crop box is the trim box padded vertically by one median
            # character height.
            modified_page.cropbox.upper_right = (bounds.x0, bounds.y0 - median_height)
            modified_page.cropbox.upper_left = (bounds.x1, bounds.y0 - median_height)
            modified_page.cropbox.lower_right = (bounds.x0, bounds.y1 + median_height)
            modified_page.cropbox.lower_left = (bounds.x1, bounds.y1 + median_height)
            modified_writer.add_page(modified_page)

            modified_path = "{}/{}-{:03d}-{}.modified.pdf".format(
                config.TEMP_DIR,
                self.path.split("/")[-1],
                self.page.page_number,
                j,
            )
            with open(modified_path, "wb") as mwf:
                modified_writer.write(mwf)
            # NOTE(review): the page object is kept after the pdfplumber
            # document is closed by this ``with`` -- confirm it stays usable.
            with pdfplumber.open(modified_path) as modified_pdf:
                column.path = modified_path
                column.page = modified_pdf.pages[0]
        return result

    def merge_in(self, other):
        """Add ``other`` to this group and collapse to one bounding char.

        Args:
            other: Either another ``Chars`` (all its chars are added) or a
                single character dict.
        """
        if isinstance(other, Chars):
            self.chars.extend(other.chars)
        else:
            self.chars.append(other)
        self.merge()

    def merge(self):
        """Collapse ``self.chars`` to its first entry, resized to the bounds.

        The first character dict is modified in place so that its
        ``x0/x1/y0/y1`` span the bounding box of all current characters;
        ``self.n`` grows by the number of characters folded away.
        """
        self.n += len(self.chars) - 1
        box = self._box()
        first = self.chars[0]
        first["x0"] = box.corners[0].x
        first["x1"] = box.corners[1].x
        first["y0"] = box.corners[0].y
        first["y1"] = box.corners[1].y
        self.chars = self.chars[:1]


class Box:
    """Axis-aligned rectangle defined by two opposite corner ``Point``s.

    Attributes:
        corners: The two corners, in the order given.
        diagonal: ``Line`` joining the two corners.
        x0, x1, y0, y1: Normalised extents (``x0 <= x1``, ``y0 <= y1``).
    """

    def __init__(self, corner1, corner2):
        self.corners = [corner1, corner2]
        self.diagonal = Line(corner1, corner2)
        self.x0 = min([corner1.x, corner2.x])
        self.x1 = max([corner1.x, corner2.x])
        self.y0 = min([corner1.y, corner2.y])
        self.y1 = max([corner1.y, corner2.y])

    def __str__(self):
        return f'x=[{self.x0},{self.x1}],y=[{self.y0},{self.y1}]'

    @staticmethod
    def from_char(char):
        """Build a ``Box`` from a mapping with ``x0``, ``y0``, ``x1``, ``y1``."""
        return Box(
            Point(char["x0"], char["y0"]),
            Point(char["x1"], char["y1"]),
        )

    def overlaps_x(self, other, clearance=0):
        """Return True if the x-ranges overlap once grown by ``clearance``."""
        return self.diagonal.overlaps_x(other.diagonal, clearance=clearance)

    def overlaps_y(self, other, clearance=0):
        """Return True if the y-ranges overlap once grown by ``clearance``."""
        return self.diagonal.overlaps_y(other.diagonal, clearance=clearance)

    def overlaps(self, other, clearance=0):
        """Return True if the boxes overlap on both axes."""
        return self.overlaps_x(other, clearance=clearance) and self.overlaps_y(
            other, clearance=clearance
        )

    def delta_x(self, other):
        """Return the horizontal gap to ``other``; 0 if the x-ranges overlap."""
        if self.overlaps_x(other):
            return 0
        return Box.delta(self.x0, self.x1, other.x0, other.x1)

    def delta_y(self, other):
        """Return the vertical gap to ``other``; 0 if the y-ranges overlap."""
        if self.overlaps_y(other):
            return 0
        return Box.delta(self.y0, self.y1, other.y0, other.y1)

    @staticmethod
    def delta(a0, a1, b0, b1):
        """Return the smallest absolute distance between an a- and a b-endpoint."""
        return min(abs(d) for d in (a0 - b0, a0 - b1, a1 - b0, a1 - b1))


class Line:
    """Segment between two ``Point``s, used for 1-D range overlap tests."""

    def __init__(self, pointA, pointB):
        self.pointA = pointA
        self.pointB = pointB

    def overlaps_x(self, other, clearance=0):
        """Return True if the x-extents of the two lines overlap."""
        mine = self.xs()
        others = other.xs()
        return Line.overlaps(
            mine[0], mine[1],
            others[0], others[1],
            clearance=clearance,
        )

    def overlaps_y(self, other, clearance=0):
        """Return True if the y-extents of the two lines overlap."""
        mine = self.ys()
        others = other.ys()
        return Line.overlaps(
            mine[0], mine[1],
            others[0], others[1],
            clearance=clearance,
        )

    def xs(self):
        """Return the two x-coordinates in ascending order."""
        return sorted([self.pointA.x, self.pointB.x])

    def ys(self):
        """Return the two y-coordinates in ascending order."""
        return sorted([self.pointA.y, self.pointB.y])

    @staticmethod
    def overlaps(my_min, my_max, other_min, other_max, clearance=0):
        """Return True if two 1-D ranges overlap.

        Both ranges are first widened by ``clearance`` at each end. The four
        cases are kept explicit so results are unchanged for any input.
        """
        my_min -= clearance
        my_max += clearance
        other_min -= clearance
        other_max += clearance
        # my.. other..other ..my
        if my_min <= other_min and other_max <= my_max:
            return True
        # other.. my..my ..other
        if other_min <= my_min and my_max <= other_max:
            return True
        # my..other..my..other
        if my_min <= other_min and other_min <= my_max and my_max <= other_max:
            return True
        # other..my..other..my
        if other_min <= my_min and my_min <= other_max and other_max <= my_max:
            return True
        return False


class Point:
    """A 2-D point with ``x`` and ``y`` attributes."""

    def __init__(self, x, y):
        self.x = x
        self.y = y