wheee happy case is probably slow but probably ok
parent
0707390c6e
commit
6be1203d28
10
cluster.py
10
cluster.py
|
|
@ -20,6 +20,8 @@ class Chars:
|
|||
# given median character size
|
||||
# drop all above+below first instance
|
||||
heights = [i["y1"]-i["y0"] for i in self.chars]
|
||||
if not heights:
|
||||
return [self]
|
||||
median_height = sorted(heights)[len(heights)//2]
|
||||
at_least_median_height = [i for i in self.chars if i["y1"]-i["y0"] >= median_height]
|
||||
at_least_median_height = sorted(at_least_median_height, key=lambda x:x["y0"])
|
||||
|
|
@ -72,6 +74,14 @@ class Chars:
|
|||
result2.append(sub)
|
||||
result = result2
|
||||
|
||||
for i in result:
|
||||
i.merge()
|
||||
for j in range(len(i.chars)):
|
||||
i.chars[j]["x0"] -= median_height
|
||||
i.chars[j]["x1"] += median_height
|
||||
i.chars[j]["y0"] -= median_height * 3
|
||||
i.chars[j]["y1"] += median_height * 3
|
||||
|
||||
return result
|
||||
|
||||
def merge_in(self, other):
|
||||
|
|
|
|||
|
|
@ -0,0 +1,22 @@
|
|||
import debug
|
||||
import cluster
|
||||
import config
|
||||
import pdfplumber
|
||||
|
||||
def main():
    """Draw one debug box per non-empty column group on every page.

    Opens the PDF at ``config.INPUT`` with pdfplumber, runs
    ``cluster.Chars(page.chars, page).divide_into_columns()`` on each page,
    and passes the resulting boxes to ``debug.draw_boxes``.
    """
    with pdfplumber.open(config.INPUT) as pdf:
        for page in pdf.pages:
            groups = cluster.Chars(page.chars, page).divide_into_columns()

            boxes = []
            for group in groups:
                # Groups without any characters have nothing to draw.
                if not group.chars:
                    continue
                # Only the first character's box is drawn for each group.
                # NOTE(review): presumably divide_into_columns() has already
                # collapsed each group into one bounding entry -- confirm
                # against cluster.py.
                first = group.chars[0]
                boxes.append({
                    "x0": first["x0"],
                    "x1": first["x1"],
                    "y0": first["y0"],
                    "y1": first["y1"],
                    "debug_label": group.n,
                })

            debug.draw_boxes(page, boxes)
|
||||
|
||||
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue