wheee happy case is probably slow but probably ok
parent
0707390c6e
commit
6be1203d28
10
cluster.py
10
cluster.py
|
|
@ -20,6 +20,8 @@ class Chars:
|
||||||
# given median character size
|
# given median character size
|
||||||
# drop all above+below first instance
|
# drop all above+below first instance
|
||||||
heights = [i["y1"]-i["y0"] for i in self.chars]
|
heights = [i["y1"]-i["y0"] for i in self.chars]
|
||||||
|
if not heights:
|
||||||
|
return [self]
|
||||||
median_height = sorted(heights)[len(heights)//2]
|
median_height = sorted(heights)[len(heights)//2]
|
||||||
at_least_median_height = [i for i in self.chars if i["y1"]-i["y0"] >= median_height]
|
at_least_median_height = [i for i in self.chars if i["y1"]-i["y0"] >= median_height]
|
||||||
at_least_median_height = sorted(at_least_median_height, key=lambda x:x["y0"])
|
at_least_median_height = sorted(at_least_median_height, key=lambda x:x["y0"])
|
||||||
|
|
@ -72,6 +74,14 @@ class Chars:
|
||||||
result2.append(sub)
|
result2.append(sub)
|
||||||
result = result2
|
result = result2
|
||||||
|
|
||||||
|
for i in result:
|
||||||
|
i.merge()
|
||||||
|
for j in range(len(i.chars)):
|
||||||
|
i.chars[j]["x0"] -= median_height
|
||||||
|
i.chars[j]["x1"] += median_height
|
||||||
|
i.chars[j]["y0"] -= median_height * 3
|
||||||
|
i.chars[j]["y1"] += median_height * 3
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def merge_in(self, other):
|
def merge_in(self, other):
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,22 @@
|
||||||
|
import debug
|
||||||
|
import cluster
|
||||||
|
import config
|
||||||
|
import pdfplumber
|
||||||
|
|
||||||
|
def main():
    """Draw one debug box per detected column on every page of the input PDF.

    Opens ``config.INPUT`` with pdfplumber, splits each page's characters
    into columns with ``cluster.Chars.divide_into_columns`` and hands
    ``debug.draw_boxes`` one box per non-empty column, labelled with that
    column's ``n`` attribute.
    """
    with pdfplumber.open(config.INPUT) as pdf:
        for page in pdf.pages:
            columns = cluster.Chars(page.chars, page).divide_into_columns()

            boxes = []
            for column in columns:
                if not column.chars:
                    # An empty column has no character to take a box from.
                    continue
                # NOTE(review): only the first character's box is drawn;
                # presumably the column has been merged down to a single
                # bounding entry by this point -- confirm in cluster.py.
                first = column.chars[0]
                boxes.append({
                    "x0": first["x0"],
                    "x1": first["x1"],
                    "y0": first["y0"],
                    "y1": first["y1"],
                    "debug_label": column.n,
                })

            debug.draw_boxes(page, boxes)


if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in New Issue