wheee happy case is probably slow but probably ok

master
Bel LaPointe 2023-02-20 09:51:49 -07:00
parent 0707390c6e
commit 6be1203d28
2 changed files with 32 additions and 0 deletions

View File

@@ -20,6 +20,8 @@ class Chars:
# given median character size
# drop all above+below first instance
heights = [i["y1"]-i["y0"] for i in self.chars]
if not heights:
return [self]
median_height = sorted(heights)[len(heights)//2]
at_least_median_height = [i for i in self.chars if i["y1"]-i["y0"] >= median_height]
at_least_median_height = sorted(at_least_median_height, key=lambda x:x["y0"])
@@ -72,6 +74,14 @@ class Chars:
result2.append(sub)
result = result2
for i in result:
i.merge()
for j in range(len(i.chars)):
i.chars[j]["x0"] -= median_height
i.chars[j]["x1"] += median_height
i.chars[j]["y0"] -= median_height * 3
i.chars[j]["y1"] += median_height * 3
return result
def merge_in(self, other):

22
main.py Normal file
View File

@@ -0,0 +1,22 @@
import debug
import cluster
import config
import pdfplumber
def main():
    """Draw a labelled debug box for every non-empty group on each PDF page.

    Opens ``config.INPUT`` with pdfplumber and, for each page, wraps the
    page's chars in ``cluster.Chars`` and calls ``divide_into_columns()``.
    One box per resulting group that still has chars is handed to
    ``debug.draw_boxes``.
    """
    with pdfplumber.open(config.INPUT) as pdf:
        for page in pdf.pages:
            groups = cluster.Chars(page.chars, page).divide_into_columns()
            boxes = []
            for group in groups:
                # Groups with no chars have no first entry to read a box from.
                if not group.chars:
                    continue
                # The box is read from the first entry of the group's chars.
                # NOTE(review): presumably that entry holds the group's merged
                # bounds by this point -- confirm against cluster.Chars.
                first = group.chars[0]
                boxes.append(
                    {
                        "x0": first["x0"],
                        "x1": first["x1"],
                        "y0": first["y0"],
                        "y1": first["y1"],
                        "debug_label": group.n,
                    }
                )
            debug.draw_boxes(page, boxes)
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()