no more of u
parent
035b1f8f6d
commit
ae9e9bcc0d
|
|
@ -2,4 +2,5 @@ import os
|
|||
|
||||
# Runtime switches, all read once from the environment at import time.
# The string-valued flags default to "" (falsy); any non-empty value is truthy.
DEBUG = os.getenv("DEBUG", "")
DEBUG_NO_SHOW = os.getenv("DEBUG_NO_SHOW", "")
# Parsed as an integer; defaults to 800 when the variable is unset.
DEBUG_HEIGHT = int(os.getenv("DEBUG_HEIGHT", "800"))
# Path string; defaults to the bundled test PDF.
INPUT = os.getenv("INPUT", "./testdata/input.pdf")
|
||||
|
|
|
|||
17
debug.py
17
debug.py
|
|
@ -2,6 +2,7 @@ import pdfplumber
|
|||
import os
|
||||
import time
|
||||
import subprocess
|
||||
import config
|
||||
|
||||
def draw_boxes(page, boxes):
|
||||
im = debug_im(page)
|
||||
|
|
@ -18,10 +19,16 @@ def draw_boxes(page, boxes):
|
|||
debug_show(im)
|
||||
|
||||
def debug_im(page):
    """Render *page* to an image for debugging.

    The image height comes from ``config.DEBUG_HEIGHT`` instead of a
    hard-coded value, so it can be changed without editing this module.

    Args:
        page: object exposing ``to_image(height=...)``
            (presumably a pdfplumber page -- confirm against callers).

    Returns:
        Whatever ``page.to_image`` returns.
    """
    return page.to_image(height=config.DEBUG_HEIGHT)
|
||||
|
||||
def debug_show(im, name=None):
    """Save a debug image under /tmp and, unless suppressed, preview it.

    The file is ``/tmp/dnd-pdf-to-txt.jpg``, or
    ``/tmp/dnd-pdf-to-txt-<name>.jpg`` when *name* is non-empty.

    Args:
        im: image object exposing ``save(path)``.
        name: optional suffix distinguishing this image from other dumps.

    The preview is skipped when ``config.DEBUG_NO_SHOW`` is truthy; the
    image is still written in that case.
    """
    # Build the path once so the saved file and the previewed file cannot drift.
    path = f"/tmp/dnd-pdf-to-txt{'' if not name else '-'+name}.jpg"
    im.save(path)
    if not config.DEBUG_NO_SHOW:
        # go() starts the command without waiting, so the caller is not blocked.
        go(f"qlmanage -p {path} &> /dev/null")
|
||||
|
||||
# Handles of every process started through go(), in launch order.
__subprocesses__ = []


def go(cmd):
    """Start *cmd* through the shell without waiting for it to finish.

    The resulting ``subprocess.Popen`` handle is appended to the module-level
    ``__subprocesses__`` list. Returns ``None``.
    """
    # The list is only mutated, never rebound, so no ``global`` is required.
    handle = subprocess.Popen(cmd, shell=True)
    __subprocesses__.append(handle)
|
||||
|
||||
|
|
|
|||
122
poc.py
122
poc.py
|
|
@ -1,122 +0,0 @@
|
|||
import pdfplumber
|
||||
import os
|
||||
import time
|
||||
import subprocess
|
||||
|
||||
# Runtime switches read once from the environment at import time.
# Both flags default to "" (falsy); any non-empty value turns them on.
DEBUG = os.getenv("DEBUG", "")
DEBUG_NO_SHOW = os.getenv("DEBUG_NO_SHOW", "")
# PDF opened by main(); defaults to the bundled test document.
INPUT = os.getenv("INPUT", "./testdata/input.pdf")
|
||||
|
||||
def main():
    """Print the layout-preserving text of every page of ``INPUT``.

    Each page is first passed through ``v_split``; the text of every
    returned piece is printed in order. The output is bracketed by the
    marker lines ``main`` and ``/main``.
    """
    print("main")
    with pdfplumber.open(INPUT) as document:
        all_pages = document.pages[:]
        for whole_page in all_pages:
            pieces = v_split(whole_page)
            for piece in pieces:
                text = piece.extract_text(layout=True)
                print(text)
    print("/main")
|
||||
|
||||
def debug_im(page):
    """Render *page* to a debug image with a fixed height of 800.

    Returns whatever ``page.to_image`` returns.
    """
    rendered = page.to_image(height=800)
    return rendered
|
||||
|
||||
def debug_show(im, name=None):
    """Write *im* to a JPEG under /tmp and preview it unless suppressed.

    The file is ``/tmp/dnd-pdf-to-txt.jpg``, or
    ``/tmp/dnd-pdf-to-txt-<name>.jpg`` when *name* is non-empty. The preview
    command is launched through ``go`` only when ``DEBUG_NO_SHOW`` is falsy.
    """
    suffix = "-" + name if name else ""
    target = f"/tmp/dnd-pdf-to-txt{suffix}.jpg"
    im.save(target)
    if DEBUG_NO_SHOW:
        return
    go(f"qlmanage -p {target} &> /dev/null")
|
||||
|
||||
def v_split(page):
    """Split *page* into a left and a right part when a single gutter is found.

    Candidate x positions are taken from the clusters returned by
    ``cluster(page)``: left edges lying right of the first quarter of the
    page and right edges lying left of the last quarter. Candidates whose
    squared distance to an already kept one is below 100 are dropped.

    Returns:
        ``[page]`` unless exactly two distinct positions remain; otherwise
        two crops of the page, cut at the mean of those two positions.
    """
    boxes = cluster(page)
    quarter = page.width // 4
    three_quarters = 3 * page.width // 4
    candidates = [box.x0 for box in boxes if box.x0 > quarter] + [
        box.x1 for box in boxes if box.x1 < three_quarters
    ]

    # Keep the first representative of each group of near-identical positions.
    edges = []
    for candidate in candidates:
        if not any((kept - candidate) ** 2 < 100 for kept in edges):
            edges.append(candidate)
    edges.sort()

    if DEBUG:
        overview = debug_im(page)
        for edge in edges:
            overview.draw_line(((edge, 0), (edge, page.height)))
        debug_show(overview, name=f'v-split-xclusters-{page.page_number}')

    if len(edges) != 2:
        return [page]

    x = sum(edges) / len(edges)
    left = page.within_bbox((0, 0, x, page.height))
    right = page.within_bbox((x, 0, page.width, page.height))
    result = [left, right]

    if DEBUG:
        for index, part in enumerate(result):
            debug_show(debug_im(part), name=f'v-split-postsplit-{part.page_number}_{index}')
    return result
|
||||
|
||||
def cluster(page):
    """Group the corner points of ``page.chars`` into bounding boxes.

    Every char contributes two points, ``(x0, y0)`` and ``(x1, y1)``. Points
    are processed in order (all first corners, then all second corners); each
    joins the first existing box whose squared distance to it is below
    ``(page.width / 50) ** 2``, otherwise it starts a new box. Only boxes that
    absorbed more than 100 points are returned.

    Returns:
        A list of box objects with ``x0``, ``y0``, ``x1``, ``y1`` and ``len``.
    """

    class cluster:
        """Axis-aligned box that grows as points are merged into it."""

        def __init__(self, x, y):
            # A fresh box spans one unit from its seed point in both axes.
            self.x0, self.y0 = x, y
            self.x1, self.y1 = x + 1, y + 1
            self.len = 1

        def merge(self, x, y):
            """Extend the box to include (x, y) and count the point."""
            if x < self.x0:
                self.x0 = x
            elif x > self.x1:
                self.x1 = x
            if y < self.y0:
                self.y0 = y
            elif y > self.y1:
                self.y1 = y
            self.len += 1

        def dist(self, x, y):
            """Return the squared distance from (x, y) to the box (0 inside)."""
            x_gap = self.x0 - x if x < self.x0 else (x - self.x1 if x > self.x1 else 0)
            y_gap = self.y0 - y if y < self.y0 else (y - self.y1 if y > self.y1 else 0)
            return x_gap ** 2 + y_gap ** 2

        def __str__(self):
            return f'({int(self.x0)}, {int(self.y0)}, {int(self.x1)}, {int(self.y1)})'

    chars = page.chars
    corners = [(ch["x0"], ch["y0"]) for ch in chars] + [(ch["x1"], ch["y1"]) for ch in chars]
    threshold = (page.width / 50) ** 2

    boxes = []
    for px, py in corners:
        for box in boxes:
            if box.dist(px, py) < threshold:
                box.merge(px, py)
                break
        else:
            # No existing box was close enough: seed a new one.
            boxes.append(cluster(px, py))

    # Discard sparse boxes.
    boxes = [box for box in boxes if box.len > 100]

    if DEBUG:
        canvas = debug_im(page)
        for box in boxes:
            # y values are flipped against the page height before drawing
            # (presumably char y0/y1 are measured from the page bottom -- confirm).
            near = page.height - box.y0
            far = page.height - box.y1
            canvas.draw_lines([
                ((box.x0, near), (box.x0, far)),
                ((box.x0, far), (box.x1, far)),
                ((box.x1, far), (box.x1, near)),
                ((box.x1, near), (box.x0, near)),
            ], stroke_width=5)
        debug_show(canvas, name=f'cluster-{page.page_number}')
    return boxes
|
||||
|
||||
# Handles of every process started through go(), in launch order.
__subprocesses__ = []


def go(cmd):
    """Launch *cmd* via the shell in the background and remember its handle.

    The ``subprocess.Popen`` object is appended to the module-level
    ``__subprocesses__`` list. Returns ``None``.
    """
    # Appending mutates the existing list, so no ``global`` statement is needed.
    child = subprocess.Popen(cmd, shell=True)
    __subprocesses__.append(child)
|
||||
|
||||
if __name__ == "__main__":
    main()
    # Wait for every process launched through go() before exiting;
    # terminate() is still called on each handle after its wait() returns.
    for child in __subprocesses__:
        child.wait()
        child.terminate()
|
||||
|
|
@ -7,14 +7,27 @@ import debug
|
|||
class TestChars(unittest.TestCase):
|
||||
def test_divide_into_columns(self):
|
||||
for p in [
|
||||
"./testdata/1-column_half-image.pdf",
|
||||
"./testdata/2-column_2-row.pdf",
|
||||
"./testdata/2-column_fancy-font.pdf",
|
||||
"./testdata/2-column_happy.pdf",
|
||||
"./testdata/2-column_non-interrupting-image.pdf",
|
||||
#"./testdata/1-column_half-image.pdf",
|
||||
#"./testdata/2-column_fancy-font.pdf",
|
||||
#"./testdata/2-column_happy.pdf",
|
||||
#"./testdata/2-column_non-interrupting-image.pdf",
|
||||
]:
|
||||
with pdfplumber.open(p) as pdf:
|
||||
for page in pdf.pages:
|
||||
im = debug.debug_im(page)
|
||||
words = page.extract_words()
|
||||
debug.draw_boxes(page, [
|
||||
{
|
||||
"debug_label": i,
|
||||
"x0": words[i]["x0"],
|
||||
"x1": words[i]["x1"],
|
||||
"y0": words[i]["top"],
|
||||
"y1": words[i]["bottom"],
|
||||
}
|
||||
for i in range(len(words))
|
||||
])
|
||||
continue
|
||||
got = cluster.Chars(page.chars, page).divide_into_columns()
|
||||
print(p)
|
||||
debug.draw_boxes(page, [
|
||||
|
|
|
|||
Loading…
Reference in New Issue