From 1119c46b97101398183ede18e5b55e742f66ab51 Mon Sep 17 00:00:00 2001 From: bel Date: Tue, 21 Feb 2023 12:39:22 -0700 Subject: [PATCH] cluster.Chars has .path --- cluster.py | 5 +++-- main.py | 6 +++--- test_cluster.py | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cluster.py b/cluster.py index 4319e76..c8c0619 100644 --- a/cluster.py +++ b/cluster.py @@ -1,7 +1,8 @@ import config class Chars: - def __init__(self, chars, page): + def __init__(self, path, chars, page): + self.path = path self.chars = chars self.page = page self.n = 0 @@ -44,7 +45,7 @@ class Chars: if result and result[-1]._box().overlaps(Box.from_char(char), clearance=median_x_delta_when_same_y): result[-1].merge_in(char) else: - result.append(Chars([char], self.page)) + result.append(Chars(self.path, [char], self.page)) result = [i for i in result if i.n > 2] # any clusters shorter than median character and high/lower are header/footer diff --git a/main.py b/main.py index 4e241fc..f9b09fd 100644 --- a/main.py +++ b/main.py @@ -16,7 +16,7 @@ def de_column_ify(): with pdfplumber.open(config.INPUT) as pdf: with ThreadPool(4) as pool: for i in range(len(pdf.pages)): - pool.apply_async(de_columnify_page, (q, pdf.pages[i], )) + pool.apply_async(de_columnify_page, (q, config.INPUT, pdf.pages[i], )) for i in range(len(pdf.pages)): log("getting", i, "of", len(pdf.pages)) got = q.get() @@ -24,8 +24,8 @@ def de_column_ify(): if got_i.chars: debug.debug_show(debug.debug_im(got_i.page)) -def de_columnify_page(q, page): - result = cluster.Chars(page.chars, page).divide_into_columns() +def de_columnify_page(q, path, page): + result = cluster.Chars(path, page.chars, page).divide_into_columns() log("putting", page.page_number) q.put((page, result)) diff --git a/test_cluster.py b/test_cluster.py index ffac0d9..adda9ab 100644 --- a/test_cluster.py +++ b/test_cluster.py @@ -28,7 +28,7 @@ class TestChars(unittest.TestCase): # for i in range(len(words)) #]) #continue - got = cluster.Chars(page.chars, page).divide_into_columns() + got = cluster.Chars(p, page.chars, page).divide_into_columns() print(p) debug.draw_boxes(page, [ {