cluster.Chars has .path

master
bel 2023-02-21 12:39:22 -07:00
parent 7636ce0038
commit 1119c46b97
3 changed files with 7 additions and 6 deletions

View File

@@ -1,7 +1,8 @@
import config import config
class Chars: class Chars:
def __init__(self, chars, page): def __init__(self, path, chars, page):
self.path = path
self.chars = chars self.chars = chars
self.page = page self.page = page
self.n = 0 self.n = 0
@@ -44,7 +45,7 @@ class Chars:
if result and result[-1]._box().overlaps(Box.from_char(char), clearance=median_x_delta_when_same_y): if result and result[-1]._box().overlaps(Box.from_char(char), clearance=median_x_delta_when_same_y):
result[-1].merge_in(char) result[-1].merge_in(char)
else: else:
result.append(Chars([char], self.page)) result.append(Chars(self.path, [char], self.page))
result = [i for i in result if i.n > 2] result = [i for i in result if i.n > 2]
# any clusters shorter than median character and high/lower are header/footer # any clusters shorter than median character and high/lower are header/footer

View File

@@ -16,7 +16,7 @@ def de_column_ify():
with pdfplumber.open(config.INPUT) as pdf: with pdfplumber.open(config.INPUT) as pdf:
with ThreadPool(4) as pool: with ThreadPool(4) as pool:
for i in range(len(pdf.pages)): for i in range(len(pdf.pages)):
pool.apply_async(de_columnify_page, (q, pdf.pages[i], )) pool.apply_async(de_columnify_page, (q, config.INPUT, pdf.pages[i], ))
for i in range(len(pdf.pages)): for i in range(len(pdf.pages)):
log("getting", i, "of", len(pdf.pages)) log("getting", i, "of", len(pdf.pages))
got = q.get() got = q.get()
@@ -24,8 +24,8 @@ def de_column_ify():
if got_i.chars: if got_i.chars:
debug.debug_show(debug.debug_im(got_i.page)) debug.debug_show(debug.debug_im(got_i.page))
def de_columnify_page(q, page): def de_columnify_page(q, path, page):
result = cluster.Chars(page.chars, page).divide_into_columns() result = cluster.Chars(path, page.chars, page).divide_into_columns()
log("putting", page.page_number) log("putting", page.page_number)
q.put((page, result)) q.put((page, result))

View File

@@ -28,7 +28,7 @@ class TestChars(unittest.TestCase):
# for i in range(len(words)) # for i in range(len(words))
#]) #])
#continue #continue
got = cluster.Chars(page.chars, page).divide_into_columns() got = cluster.Chars(p, page.chars, page).divide_into_columns()
print(p) print(p)
debug.draw_boxes(page, [ debug.draw_boxes(page, [
{ {