master
bel 2023-02-19 20:19:20 -07:00
parent cc8a66e283
commit 2b68a39cc7
7 changed files with 155 additions and 0 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

115
cluster.py Normal file
View File

@ -0,0 +1,115 @@
import config
import debug
class Chars:
    """A group of character boxes that belong to one page.

    Each entry of ``chars`` is a dict with at least the keys ``"x0"``,
    ``"x1"``, ``"y0"`` and ``"y1"`` (the only keys this class reads).
    ``page`` is stored untouched and only handed on to ``debug.draw_boxes``.
    """

    def __init__(self, chars, page):
        self.chars = chars
        self.page = page

    def divide_into_columns(self):
        """Group consecutive chars whose y-ranges overlap, then draw the groups.

        Chars are visited in their given order.  A char joins the current
        group when its y-range overlaps any char already in that group;
        otherwise it starts a new group.  Every group is then collapsed to
        a single bounding box (see ``merge``) and the boxes are passed to
        ``debug.draw_boxes``.

        Returns:
            The list of ``Chars`` groups, each holding exactly one merged
            box.  An empty list (and no drawing) when there are no chars.
        """
        if not self.chars:
            return []

        groups = [Chars([self.chars[0]], self.page)]
        for char in self.chars[1:]:
            if groups[-1].overlapping_y_coordinates(char):
                groups[-1].chars.append(char)
            else:
                groups.append(Chars([char], self.page))

        for group in groups:
            group.merge()

        # TODO: split/merge clusters by gap size.  Sketch from the first
        # draft: sort groups by their box's "y0", compute dist() between
        # neighbours, take the median gap, and repeatedly merge neighbours
        # whose gap is below 2 * median until nothing changes.

        # NOTE(review): drawing is unconditional; consider gating it on
        # config.DEBUG once callers consume the return value instead.
        debug.draw_boxes(self.page, [group.chars[0] for group in groups])
        return groups

    def merge(self):
        """Collapse ``self.chars`` into one box covering all of them.

        The merged box is a copy of the first char with its four
        coordinates replaced by the outer bounds, so the caller's original
        char dicts are left unmodified.
        """
        x_min, x_max, y_min, y_max = self.outer_bounds()
        merged = dict(self.chars[0])
        merged["x0"] = x_min
        merged["x1"] = x_max
        merged["y0"] = y_min
        merged["y1"] = y_max
        self.chars = [merged]

    def outer_bounds(self):
        """Return ``(x_min, x_max, y_min, y_max)`` over all chars.

        Raises:
            IndexError: if the group holds no chars.
        """
        if not self.chars:
            raise IndexError("outer_bounds() requires at least one char")
        return (
            min(char["x0"] for char in self.chars),
            max(char["x1"] for char in self.chars),
            min(char["y0"] for char in self.chars),
            max(char["y1"] for char in self.chars),
        )

    def dist(self, other):
        """Return the squared distance between this group's box and ``other``'s.

        The gap along an axis is 0 when the two ranges overlap on that
        axis; the result is ``x_gap ** 2 + y_gap ** 2`` (no square root).
        """
        mine = self.outer_bounds()
        theirs = other.outer_bounds()
        x_delta = Chars._axis_gap(mine[0], mine[1], theirs[0], theirs[1])
        y_delta = Chars._axis_gap(mine[2], mine[3], theirs[2], theirs[3])
        return x_delta ** 2 + y_delta ** 2

    @staticmethod
    def _axis_gap(my_min, my_max, other_min, other_max):
        """Return the smallest endpoint distance of two ranges, 0 if they overlap."""
        if Chars.char_overlaps(my_min, my_max, other_min, other_max):
            return 0
        return min(
            abs(mine - theirs)
            for mine in (my_min, my_max)
            for theirs in (other_min, other_max)
        )

    def overlapping_y_coordinates(self, other_char):
        """Return True when ``other_char`` overlaps any char here on the y axis."""
        return any(
            Chars.char_overlapping_y_coordinates(other_char, self_char)
            for self_char in self.chars
        )

    @staticmethod
    def char_overlapping_y_coordinates(candidate, established):
        """Return True when the y-ranges of the two char dicts overlap."""
        result = Chars.char_overlaps(
            established["y0"],
            established["y1"],
            candidate["y0"],
            candidate["y1"],
        )
        # NOTE(review): debugging trace emitted for every comparison; kept
        # as-is, but a candidate for removal or gating on config.DEBUG.
        print(established["y0"], "..", established["y1"], result, candidate["y0"], "..", candidate["y1"])
        return result

    @staticmethod
    def char_overlaps(my_min, my_max, other_min, other_max):
        """Return True when range ``my`` and range ``other`` overlap or touch.

        The four accepted layouts are spelled out explicitly so the result
        matches the original checks even for unusual inputs.
        """
        return (
            # my.. other..other ..my
            (my_min <= other_min and other_max <= my_max)
            # other.. my..my ..other
            or (other_min <= my_min and my_max <= other_max)
            # my..other..my..other
            or (my_min <= other_min and other_min <= my_max and my_max <= other_max)
            # other..my..other..my
            or (other_min <= my_min and my_min <= other_max and other_max <= my_max)
        )

5
config.py Normal file
View File

@ -0,0 +1,5 @@
import os
# Settings read once at import time from environment variables of the same
# name.  DEBUG and DEBUG_NO_SHOW fall back to the empty string (falsy);
# INPUT falls back to the bundled sample PDF path.
DEBUG = os.getenv("DEBUG", "")
DEBUG_NO_SHOW = os.getenv("DEBUG_NO_SHOW", "")
INPUT = os.getenv("INPUT", "./testdata/input.pdf")

22
debug.py Normal file
View File

@ -0,0 +1,22 @@
import pdfplumber
import os
import time
import subprocess
def draw_boxes(page, boxes):
    """Outline every box on an image of ``page`` and display the image.

    Each box is a mapping with ``"x0"``, ``"x1"``, ``"y0"`` and ``"y1"``.
    Its rectangle is drawn as four line segments, one edge at a time.
    """
    canvas = debug_im(page)
    for rect in boxes:
        left, right = rect["x0"], rect["x1"]
        # y values are subtracted from the page height before drawing
        # (presumably flipping bottom-up coordinates to top-down ones).
        edge_a = page.height - rect["y0"]
        edge_b = page.height - rect["y1"]
        corners = [
            (left, edge_a),
            (right, edge_a),
            (right, edge_b),
            (left, edge_b),
        ]
        # Pair each corner with the next one, wrapping back to the first.
        for start, end in zip(corners, corners[1:] + corners[:1]):
            canvas.draw_line((start, end))
    debug_show(canvas)
def debug_im(page):
    """Return the result of ``page.to_image`` called with ``height=800``."""
    rendered = page.to_image(height=800)
    return rendered
def debug_show(im, name=None):
    """Display ``im`` by calling its ``show()`` method.

    ``name`` is accepted but currently unused.
    """
    # TODO: earlier draft saved the image to
    # /tmp/dnd-pdf-to-txt[-<name>].jpg and, unless DEBUG_NO_SHOW was set,
    # previewed it with `qlmanage -p` instead of calling show().
    im.show()

13
test_cluster.py Normal file
View File

@ -0,0 +1,13 @@
import unittest
import cluster
import pdfplumber
class TestChars(unittest.TestCase):
    """Smoke test for ``cluster.Chars`` using a sample PDF."""

    def test_divide_into_columns(self):
        """Run ``divide_into_columns`` on each page; passes if nothing raises."""
        with pdfplumber.open("./testdata/2-column_2-row.pdf") as document:
            for current_page in document.pages:
                clustered = cluster.Chars(current_page.chars, current_page)
                clustered.divide_into_columns()
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()