From a08cb2927ad3294471035bcbd846279fdb0921dc Mon Sep 17 00:00:00 2001 From: bel Date: Tue, 21 Feb 2023 11:50:47 -0700 Subject: [PATCH] threading is coo --- main.py | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/main.py b/main.py index e66e420..127b3d3 100644 --- a/main.py +++ b/main.py @@ -2,21 +2,41 @@ import debug import cluster import config import pdfplumber +from multiprocessing.pool import ThreadPool +import queue def main(): + de_column_ify() + +def log(*args): + print(*args, flush=True) + +def de_column_ify(): + q = queue.Queue(maxsize=4) with pdfplumber.open(config.INPUT) as pdf: - for page in pdf.pages: - got = cluster.Chars(page.chars, page).divide_into_columns() - debug.draw_boxes(page, [ - { - "x0": i.chars[0]["x0"], - "x1": i.chars[0]["x1"], - "y0": i.chars[0]["y0"], - "y1": i.chars[0]["y1"], - "debug_label": i.n, - } - for i in got if i.chars - ]) + with ThreadPool(4) as pool: + n = 0 + for i in range(len(pdf.pages)): + pool.apply_async(de_columnify_page, (q, n, pdf.pages[i], )) + n += 1 + for i in range(n): + log("getting", i, "of", n) + got = q.get() + debug.draw_boxes(got[1] , [ + { + "x0": i.chars[0]["x0"], + "x1": i.chars[0]["x1"], + "y0": i.chars[0]["y0"], + "y1": i.chars[0]["y1"], + "debug_label": i.n, + } + for i in got[2] if i.chars + ]) + +def de_columnify_page(q, idx, page): + result = cluster.Chars(page.chars, page).divide_into_columns() + log("putting", idx) + q.put((idx, page, result)) if __name__ == "__main__": main()