dnd-pdf-to-txt/main.py

123 lines
3.7 KiB
Python

import pdfplumber
import os
import time
import subprocess
# Runtime switches, read once from the environment at import time.
# Any non-empty DEBUG value enables the debug image dumps in v_split/cluster.
DEBUG = os.getenv("DEBUG", "")
# Any non-empty DEBUG_NO_SHOW value makes debug_show save images without
# launching a preview command for them.
DEBUG_NO_SHOW = os.getenv("DEBUG_NO_SHOW", "")
# Path of the PDF that main() opens.
INPUT = os.getenv("INPUT", "./testdata/input.pdf")
def main():
    """Print the extracted text of every page of INPUT.

    Each page goes through v_split first, so a page that was split into
    two parts is printed as its first part followed by its second part.
    The "main" / "/main" lines bracket the output.
    """
    print("main")
    with pdfplumber.open(INPUT) as document:
        for sheet in document.pages:
            for part in v_split(sheet):
                text = part.extract_text()
                print(text)
    print("/main")
def debug_im(page):
    """Return page.to_image(height=800), the canvas used for debug drawings."""
    image = page.to_image(height=800)
    return image
def debug_show(im, name=None):
    """Save a debug image under /tmp and, unless suppressed, preview it.

    im:   object exposing save(path), e.g. the result of debug_im.
    name: optional string appended to the file name as "-<name>"; a falsy
          value (None or "") yields the bare "/tmp/dnd-pdf-to-txt.jpg".

    When DEBUG_NO_SHOW is empty, a `qlmanage -p` preview command for the
    saved file is started in the background through go().
    """
    if name:
        suffix = '-' + name
    else:
        suffix = ''
    path = f"/tmp/dnd-pdf-to-txt{suffix}.jpg"
    im.save(path)
    if DEBUG_NO_SHOW:
        return
    go(f"qlmanage -p {path} &> /dev/null")
def v_split(page):
    """Split a page into a left and a right part at its column gap.

    The inner edges of the text clusters found by cluster() are collected:
    left edges (x0) lying right of the first quarter of the page, and right
    edges (x1) lying left of the last quarter. Edges closer than 10 units to
    an already kept edge are dropped. If exactly two distinct edges remain,
    the page is cut at their midpoint and [left, right] is returned;
    otherwise the page is returned unsplit as [page].

    With DEBUG set, the candidate edges and the resulting parts are dumped
    through debug_show.
    """
    boxes = cluster(page)
    first_quarter = page.width // 4
    last_quarter = 3 * page.width // 4
    candidates = [box.x0 for box in boxes if box.x0 > first_quarter]
    candidates += [box.x1 for box in boxes if box.x1 < last_quarter]

    # Keep only the first representative of each group of nearby edges
    # (squared distance below 100, i.e. less than 10 units apart).
    edges = []
    for candidate in candidates:
        if not any((edge - candidate) ** 2 < 100 for edge in edges):
            edges.append(candidate)
    edges.sort()

    if DEBUG:
        canvas = debug_im(page)
        for edge in edges:
            canvas.draw_line(((edge, 0), (edge, page.height)))
        debug_show(canvas, name=f'v-split-xclusters-{page.page_number}')

    if len(edges) != 2:
        return [page]

    # Cut halfway between the two remaining edges.
    gutter = sum(edges) / len(edges)
    halves = [
        page.within_bbox((0, 0, gutter, page.height)),
        page.within_bbox((gutter, 0, page.width, page.height)),
    ]
    if DEBUG:
        for index, half in enumerate(halves):
            debug_show(debug_im(half), name=f'v-split-postsplit-{half.page_number}_{index}')
    return halves
class _Cluster:
    """Axis-aligned bounding box that grows as points are merged into it."""

    def __init__(self, x, y):
        # Seed the box one unit wide and tall, anchored at the first point.
        self.x0 = x
        self.y0 = y
        self.x1 = x + 1
        self.y1 = y + 1
        self.len = 1  # number of points absorbed so far

    def merge(self, x, y):
        """Grow the box just enough to contain (x, y) and count the point."""
        self.x0 = min(self.x0, x)
        self.x1 = max(self.x1, x)
        self.y0 = min(self.y0, y)
        self.y1 = max(self.y1, y)
        self.len += 1

    def dist(self, x, y):
        """Return the squared distance from (x, y) to the box (0 if inside)."""
        x_delta = 0
        y_delta = 0
        if x < self.x0:
            x_delta = self.x0 - x
        elif x > self.x1:
            x_delta = x - self.x1
        if y < self.y0:
            y_delta = self.y0 - y
        elif y > self.y1:
            y_delta = y - self.y1
        return x_delta**2 + y_delta**2

    def __str__(self):
        return f'({int(self.x0)}, {int(self.y0)}, {int(self.x1)}, {int(self.y1)})'


def _debug_draw_clusters(page, clusters):
    """Outline every cluster on a debug image of page and pass it to debug_show."""
    im = debug_im(page)
    height = page.height
    for box in clusters:
        # y values are mirrored against the page height before drawing;
        # presumably char y0/y1 are measured from the page bottom while the
        # image origin is at the top -- TODO confirm against pdfplumber.
        left, right = box.x0, box.x1
        lower, upper = height - box.y0, height - box.y1
        im.draw_lines([
            ((left, lower), (left, upper)),
            ((left, upper), (right, upper)),
            ((right, upper), (right, lower)),
            ((right, lower), (left, lower)),
        ], stroke_width=5)
    debug_show(im, name=f'cluster-{page.page_number}')


def cluster(page):
    """Group the characters of page into large rectangular text clusters.

    Two corner points per character, (x0, y0) and (x1, y1), are fed one by
    one into a greedy clustering: a point joins the first existing cluster
    whose bounding box is closer than page.width / 50, otherwise it starts a
    new cluster. Clusters that absorbed 100 points or fewer are discarded.

    Returns a list of objects with x0, y0, x1, y1 (bounding box) and len
    (point count) attributes, in order of creation. With DEBUG set, the
    surviving clusters are also drawn and dumped through debug_show.
    """
    chars = page.chars
    points = [(char["x0"], char["y0"]) for char in chars]
    points += [(char["x1"], char["y1"]) for char in chars]

    # Loop-invariant: squared merge radius, compared against squared distances.
    max_dist_sq = (page.width / 50) ** 2

    clusters = []
    for x, y in points:
        for candidate in clusters:
            if candidate.dist(x, y) < max_dist_sq:
                candidate.merge(x, y)
                break
        else:
            # No existing cluster was close enough.
            clusters.append(_Cluster(x, y))

    # Drop small clusters (stray marks, page numbers and the like).
    clusters = [box for box in clusters if box.len > 100]

    if DEBUG:
        _debug_draw_clusters(page, clusters)
    return clusters
# Every process started through go(); the script entry point waits on these
# before exiting.
__subprocesses__ = []


def go(cmd):
    """Start cmd through the shell without waiting for it.

    cmd is a shell command line (it is run with shell=True, so redirections
    and the like are interpreted by the shell). The resulting Popen handle
    is recorded in __subprocesses__; nothing is returned.
    """
    process = subprocess.Popen(cmd, shell=True)
    __subprocesses__.append(process)
if __name__ == "__main__":
    main()
    # Keep the script alive until every command started through go() has
    # exited on its own. wait() only returns once the child is gone, so a
    # terminate() after it has nothing left to signal and is not needed.
    for p in __subprocesses__:
        p.wait()