diff --git a/main.py b/main.py
index e66e420..127b3d3 100644
--- a/main.py
+++ b/main.py
@@ -2,21 +2,66 @@ import debug
 import cluster
 import config
 import pdfplumber
+from multiprocessing.pool import ThreadPool
+import queue
 
 def main():
+    """Entry point: run the column-splitting pass over the configured PDF."""
+    de_column_ify()
+
+def log(*args):
+    """Print args space-separated and flush stdout immediately."""
+    print(*args, flush=True)
+
+def de_column_ify():
+    """Split each page of config.INPUT into columns on a thread pool.
+
+    As each page finishes, a debug box is drawn at the first character of
+    every non-empty column. Re-raises the first worker exception received.
+    """
+    # Bounded: workers block in put() once four results are waiting.
+    q = queue.Queue(maxsize=4)
     with pdfplumber.open(config.INPUT) as pdf:
-        for page in pdf.pages:
-            got = cluster.Chars(page.chars, page).divide_into_columns()
-            debug.draw_boxes(page, [
-                {
-                    "x0": i.chars[0]["x0"],
-                    "x1": i.chars[0]["x1"],
-                    "y0": i.chars[0]["y0"],
-                    "y1": i.chars[0]["y1"],
-                    "debug_label": i.n,
-                }
-                for i in got if i.chars
-            ])
+        # NOTE(review): confirm pdfplumber tolerates page.chars being read
+        # from several threads at once; all pages share this one open PDF.
+        pages = pdf.pages
+        n = len(pages)
+        with ThreadPool(4) as pool:
+            for idx, page in enumerate(pages):
+                pool.apply_async(de_columnify_page, (q, idx, page))
+            for i in range(n):
+                log("getting", i, "of", n)
+                # Results arrive in completion order, not page order.
+                _, page, columns, error = q.get()
+                if error is not None:
+                    # apply_async discards exceptions unless its result is
+                    # read, so the worker forwards them here instead.
+                    raise error
+                debug.draw_boxes(page, [
+                    {
+                        "x0": col.chars[0]["x0"],
+                        "x1": col.chars[0]["x1"],
+                        "y0": col.chars[0]["y0"],
+                        "y1": col.chars[0]["y1"],
+                        "debug_label": col.n,
+                    }
+                    for col in columns if col.chars
+                ])
+
+def de_columnify_page(q, idx, page):
+    """Divide one page into columns and queue the outcome.
+
+    Always puts exactly one (idx, page, columns, error) tuple on q. On
+    failure columns is None and error holds the exception, so the
+    consumer is never left waiting for a result that will not come.
+    """
+    columns = error = None
+    try:
+        columns = cluster.Chars(page.chars, page).divide_into_columns()
+    except Exception as exc:  # handed to the consumer, which re-raises it
+        error = exc
+    log("putting", idx)
+    q.put((idx, page, columns, error))
 
 if __name__ == "__main__":
     main()