whee
parent
c67ec83de9
commit
1d73f84172
15
main.py
15
main.py
|
|
@ -9,7 +9,7 @@ INPUT = os.environ.get("INPUT", "./testdata/input.pdf")
|
||||||
def main():
|
def main():
|
||||||
print("main")
|
print("main")
|
||||||
with pdfplumber.open(INPUT) as pdf:
|
with pdfplumber.open(INPUT) as pdf:
|
||||||
for page in pdf.pages[4:5]:
|
for page in pdf.pages[:]:
|
||||||
for splitpage in v_split(page):
|
for splitpage in v_split(page):
|
||||||
print(splitpage.extract_text())
|
print(splitpage.extract_text())
|
||||||
print("/main")
|
print("/main")
|
||||||
|
|
@ -35,8 +35,8 @@ def debug_show(im):
|
||||||
|
|
||||||
def v_split(page):
|
def v_split(page):
|
||||||
clusters = cluster(page)
|
clusters = cluster(page)
|
||||||
points = [i.x0 for i in clusters]
|
points = [i.x0 for i in clusters if i.x0 > page.width//4]
|
||||||
points += [i.x1 for i in clusters]
|
points += [i.x1 for i in clusters if i.x1 < 3*page.width//4]
|
||||||
x_clusters = []
|
x_clusters = []
|
||||||
for point in points:
|
for point in points:
|
||||||
merged = False
|
merged = False
|
||||||
|
|
@ -47,9 +47,14 @@ def v_split(page):
|
||||||
if not merged:
|
if not merged:
|
||||||
x_clusters.append(point)
|
x_clusters.append(point)
|
||||||
x_clusters = sorted(x_clusters)
|
x_clusters = sorted(x_clusters)
|
||||||
if len(x_clusters) != 4:
|
if DEBUG:
|
||||||
|
im = debug_im(page)
|
||||||
|
for x_cluster in x_clusters:
|
||||||
|
im.draw_line(((x_cluster, 0), (x_cluster, page.height)))
|
||||||
|
debug_show(im)
|
||||||
|
if len(x_clusters) != 2:
|
||||||
return [page]
|
return [page]
|
||||||
x = (x_clusters[2] + x_clusters[1]) / 2
|
x = sum(x_clusters) / len(x_clusters)
|
||||||
result = [page.within_bbox((0, 0, x, page.height)), page.within_bbox((x, 0, page.width, page.height))]
|
result = [page.within_bbox((0, 0, x, page.height)), page.within_bbox((x, 0, page.width, page.height))]
|
||||||
if DEBUG:
|
if DEBUG:
|
||||||
for page in result:
|
for page in result:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue