# Viewer metadata: 126 lines, 3.5 KiB, Python
import os
import shlex
import subprocess
import tempfile
import time

import pdfplumber
# Debug rendering is on when the DEBUG environment variable is set to anything
# other than an empty string or a common "off" spelling (case-insensitive), so
# that DEBUG=0 or DEBUG=false no longer switches it on by accident.
DEBUG = os.environ.get("DEBUG", "").strip().lower() not in (
    "", "0", "false", "no", "off",
)
# PDF to process; override with the INPUT environment variable.
INPUT = os.environ.get("INPUT", "./testdata/input.pdf")


def main(input_path=None, page_slice=slice(4, 5)):
    """Print the extracted text of the selected pages, one split part at a time.

    Each selected page is passed through ``v_split`` and the text of every
    returned part is printed in order.

    Args:
        input_path: Path of the PDF to open. ``None`` (the default) uses the
            module-level ``INPUT`` setting.
        page_slice: ``slice`` applied to ``pdf.pages``. The default selects
            only the page at index 4.
    """
    if input_path is None:
        input_path = INPUT
    print("main")
    with pdfplumber.open(input_path) as pdf:
        for page in pdf.pages[page_slice]:
            for splitpage in v_split(page):
                print(splitpage.extract_text())
    print("/main")


def crop(page, x0, y0, x1, y1):
    """Return ``page`` cropped to the box ``(x0, y0, x1, y1)``.

    When ``DEBUG`` is truthy, the outline of the box is first drawn on a
    rendering of the page and displayed through ``debug_show``.

    Args:
        page: Object exposing ``crop(bbox)``.
        x0, y0, x1, y1: Box coordinates, forwarded unchanged to ``page.crop``.

    Returns:
        Whatever ``page.crop((x0, y0, x1, y1))`` returns.
    """
    if DEBUG:
        im = debug_im(page)
        corners = [(x0, y0), (x0, y1), (x1, y1), (x1, y0)]
        # Pair every corner with the next one (wrapping around) to obtain the
        # four edges of the rectangle.
        edges = list(zip(corners, corners[1:] + corners[:1]))
        im.draw_lines(edges, stroke_width=5)
        debug_show(im)
    return page.crop((x0, y0, x1, y1))


def debug_im(page, height=800):
    """Render ``page`` to an image that debug shapes can be drawn on.

    Args:
        page: Object exposing ``to_image(height=...)``.
        height: Value forwarded as the ``height`` keyword of ``to_image``.
            Defaults to 800.

    Returns:
        Whatever ``page.to_image`` returns.
    """
    return page.to_image(height=height)


def debug_show(im, path=None):
    """Save ``im`` to disk and open it with ``qlmanage -p`` in the background.

    The viewer is started through ``go`` and is not waited for here, so several
    previews can be open at once. For that reason every call writes to its own
    file by default: reusing one fixed path would let a later call overwrite
    the image before an earlier viewer has loaded it.

    Args:
        im: Object exposing ``save(path)``.
        path: Destination file. ``None`` (the default) creates a fresh
            temporary file, which is left on disk for the viewer to read and
            is not cleaned up afterwards.
    """
    if path is None:
        # NOTE(review): ".png" is used on the assumption that `im` is a
        # pdfplumber PageImage, whose save() writes PNG by default -- confirm.
        with tempfile.NamedTemporaryFile(
            prefix="pdf-debug-", suffix=".png", delete=False
        ) as tmp:
            path = tmp.name
    im.save(path)
    # "> /dev/null 2>&1" is the portable spelling of bash's "&>", and the path
    # is quoted because the command string is interpreted by a shell.
    go(f"qlmanage -p {shlex.quote(path)} > /dev/null 2>&1")


def v_split(page, tolerance=10):
    """Split ``page`` vertically into a left and a right part.

    The left (``x0``) and right (``x1``) edges of every cluster returned by
    ``cluster(page)`` are collected and de-duplicated: an edge is dropped when
    it lies closer than ``tolerance`` to an edge already kept. The page is
    split only when exactly four distinct edges remain; the cut is placed
    halfway between the second and third edge in sorted order.

    Args:
        page: Page object exposing ``bbox`` and ``within_bbox(bbox)``, and
            accepted by ``cluster``.
        tolerance: Maximum distance (exclusive) at which two edges count as
            the same edge. Defaults to 10.

    Returns:
        ``[page]`` when the number of distinct edges is not four, otherwise
        ``[left_part, right_part]`` as returned by ``page.within_bbox``.
    """
    boxes = cluster(page)
    edges = [box.x0 for box in boxes] + [box.x1 for box in boxes]

    # Greedy 1-D de-duplication; compared on squared distances.
    tolerance_sq = tolerance ** 2
    x_clusters = []
    for edge in edges:
        if not any((kept - edge) ** 2 < tolerance_sq for kept in x_clusters):
            x_clusters.append(edge)
    x_clusters.sort()

    if len(x_clusters) != 4:
        return [page]

    cut = (x_clusters[1] + x_clusters[2]) / 2
    # Use the page's own bounding box rather than (0, 0, width, height) so the
    # split also works when the page's box does not start at the origin.
    left, top, right, bottom = page.bbox
    result = [
        page.within_bbox((left, top, cut, bottom)),
        page.within_bbox((cut, top, right, bottom)),
    ]
    if DEBUG:
        for part in result:
            debug_show(debug_im(part))
    return result


class _Cluster:
    """Axis-aligned bounding box that grows as points are merged into it.

    Attributes ``x0``/``y0``/``x1``/``y1`` hold the current box and ``len``
    counts the points absorbed so far (including the seed point).
    """

    def __init__(self, x, y):
        # The seed box spans one unit from the first point in both directions.
        self.x0 = x
        self.y0 = y
        self.x1 = x + 1
        self.y1 = y + 1
        self.len = 1

    def merge(self, x, y):
        """Grow the box so that it contains ``(x, y)`` and count the point."""
        self.x0 = min(self.x0, x)
        self.x1 = max(self.x1, x)
        self.y0 = min(self.y0, y)
        self.y1 = max(self.y1, y)
        self.len += 1

    def dist(self, x, y):
        """Return the squared distance from ``(x, y)`` to the box (0 inside)."""
        x_delta = max(self.x0 - x, 0, x - self.x1)
        y_delta = max(self.y0 - y, 0, y - self.y1)
        return x_delta ** 2 + y_delta ** 2

    def __str__(self):
        return f'({int(self.x0)}, {int(self.y0)}, {int(self.x1)}, {int(self.y1)})'


def cluster(page, min_points=100):
    """Group the character corners of ``page`` into bounding-box clusters.

    Two points are taken from every entry of ``page.chars``: ``(x0, y0)`` and
    ``(x1, y1)``. Points are processed in order (all ``(x0, y0)`` points first)
    and each one is merged into the first existing cluster whose box is closer
    than ``page.width / 50``; otherwise it starts a new cluster. Clusters that
    end up with ``min_points`` points or fewer are discarded.

    When ``DEBUG`` is truthy, the surviving boxes are drawn on a rendering of
    the page and displayed through ``debug_show``.

    Args:
        page: Object exposing ``chars`` (an iterable of mappings with the keys
            ``x0``, ``y0``, ``x1``, ``y1``), ``width`` and ``height``.
        min_points: A cluster is kept only when it holds strictly more than
            this many points. Defaults to 100.

    Returns:
        List of cluster objects with ``x0``, ``y0``, ``x1``, ``y1`` and ``len``
        attributes, in order of creation.
    """
    chars = page.chars
    points = [(char["x0"], char["y0"]) for char in chars]
    points += [(char["x1"], char["y1"]) for char in chars]

    # Loop-invariant merge threshold, kept squared to match _Cluster.dist().
    max_dist_sq = (page.width / 50) ** 2

    clusters = []
    for x, y in points:
        for candidate in clusters:
            if candidate.dist(x, y) < max_dist_sq:
                candidate.merge(x, y)
                break
        else:
            # No existing cluster was close enough.
            clusters.append(_Cluster(x, y))

    clusters = [found for found in clusters if found.len > min_points]

    if DEBUG:
        im = debug_im(page)
        height = page.height
        for box in clusters:
            # y values are flipped against the page height before drawing;
            # presumably the chars' y0/y1 are measured from the bottom of the
            # page while drawing uses a top-left origin -- confirm.
            top, bottom = height - box.y1, height - box.y0
            im.draw_lines([
                ((box.x0, bottom), (box.x0, top)),
                ((box.x0, top), (box.x1, top)),
                ((box.x1, top), (box.x1, bottom)),
                ((box.x1, bottom), (box.x0, bottom)),
            ], stroke_width=5)
        debug_show(im)
    return clusters


# Handles of every background process started through go(); the script's
# entry point waits on them before exiting.
__subprocesses__ = []


def go(cmd):
    """Start ``cmd`` in a shell without waiting for it to finish.

    The process handle is recorded in ``__subprocesses__`` so it can be waited
    on later.

    Args:
        cmd: Command line passed to the shell as-is (``shell=True``). Callers
            are responsible for quoting any untrusted or path-like parts.

    Returns:
        The ``subprocess.Popen`` handle of the started process.
    """
    proc = subprocess.Popen(cmd, shell=True)
    # The list is only mutated, never rebound, so no `global` is needed.
    __subprocesses__.append(proc)
    return proc


if __name__ == "__main__":
    main()
    # Keep the script alive until every background process started through
    # go() has exited. wait() already reaps each process, so a terminate()
    # call afterwards would have nothing left to signal.
    for p in __subprocesses__:
        p.wait()