headers footers seem k

master
Bel LaPointe 2023-02-20 09:44:54 -07:00
parent faa336fca4
commit 0707390c6e
2 changed files with 32 additions and 17 deletions

View File

@ -17,6 +17,13 @@ class Chars:
)
def divide_into_columns(self):
# given median character size
# drop all above+below first instance
heights = [i["y1"]-i["y0"] for i in self.chars]
median_height = sorted(heights)[len(heights)//2]
at_least_median_height = [i for i in self.chars if i["y1"]-i["y0"] >= median_height]
at_least_median_height = sorted(at_least_median_height, key=lambda x:x["y0"])
# for every sequential pair of chars on same y-coordinate
# what is median distance?
distances_when_sequential_and_same_y_coordinate = []
@ -30,15 +37,25 @@ class Chars:
median_x_delta_when_same_y = max([5, median_x_delta_when_same_y])
# merge all naive overlapping boxes
result = [Chars([self.chars[0]], self.page)]
for char in self.chars[1:]:
if result[-1]._box().overlaps(Box.from_char(char), clearance=median_x_delta_when_same_y):
result = []
for char in self.chars:
if result and result[-1]._box().overlaps(Box.from_char(char), clearance=median_x_delta_when_same_y):
result[-1].merge_in(char)
else:
result.append(Chars([char], self.page))
# TODO remove headers, footers; maybe median font size vs. max of cluster?
result = [i for i in result if i.n > 2]
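# a cluster is a header/footer when its box is shorter than the median character height AND it lies entirely before the first or after the last median-or-taller character (ordered by y0)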
result2 = []
for chars in result:
chars_box = chars._box()
are_small = chars_box.y1 - chars_box.y0 < median_height
are_header = chars_box.y1 < at_least_median_height[0]["y0"]
are_footer = chars_box.y0 > at_least_median_height[-1]["y1"]
if not (are_small and (are_header or are_footer)):
result2.append(chars)
result = result2
# merge all vertically overlapping boxes
changed = True
while changed:
@ -77,11 +94,13 @@ class Box:
def __init__(self, corner1, corner2):
self.corners = [corner1, corner2]
self.diagonal = Line(corner1, corner2)
self.x0 = min([corner1.x, corner2.x])
self.x1 = max([corner1.x, corner2.x])
self.y0 = min([corner1.y, corner2.y])
self.y1 = max([corner1.y, corner2.y])
def __str__(self):
xs = sorted(["{:.1f}".format(i.x) for i in self.corners])
ys = sorted(["{:.1f}".format(i.y) for i in self.corners])
return f'x=[{xs[0]}..{xs[-1]}],y=[{ys[0]}..{ys[1]}]'
return f'x=[{self.x0},{self.x1}],y=[{self.y0},{self.y1}]'
def from_char(char):
return Box(
@ -101,16 +120,12 @@ class Box:
def delta_x(self, other):
if self.overlaps_x(other):
return 0
my_xs = sorted([i.x for i in self.corners])
other_xs = sorted([i.x for i in other.corners])
return Box.delta(my_xs[0], my_xs[1], other_xs[0], other_xs[1])
return Box.delta(self.x0, self.x1, other.x0, other.x1)
def delta_y(self, other):
if self.overlaps_y(other):
return 0
my_ys = sorted([i.y for i in self.corners])
other_ys = sorted([i.y for i in other.corners])
return Box.delta(my_ys[0], my_ys[1], other_ys[0], other_ys[1])
return Box.delta(self.y0, self.y1, other.y0, other.y1)
def delta(a0, a1, b0, b1):
return min([abs(i) for i in [a0-b0, a0-b1, a1-b0, a1-b1]])

View File

@ -7,10 +7,10 @@ import debug
class TestChars(unittest.TestCase):
def test_divide_into_columns(self):
for p in [
"./testdata/2-column_2-row.pdf",
"./testdata/1-column_half-image.pdf",
"./testdata/2-column_fancy-font.pdf",
"./testdata/2-column_happy.pdf",
#"./testdata/2-column_2-row.pdf",
#"./testdata/1-column_half-image.pdf",
#"./testdata/2-column_fancy-font.pdf",
#"./testdata/2-column_happy.pdf",
"./testdata/2-column_non-interrupting-image.pdf",
]:
with pdfplumber.open(p) as pdf: