"""Run column division over every page of the PDF named by ``config.INPUT``.

Pages are processed on a small thread pool; results are handed back to the
main thread through a bounded queue and passed to the ``debug`` helpers.
"""

import queue
from multiprocessing.pool import ThreadPool

import pdfplumber

import cluster
import config
import debug

# Number of pool threads processing pages concurrently.
_WORKER_COUNT = 4
# Maximum number of finished-but-unconsumed page results; once this many are
# waiting, workers block in ``q.put`` until the main thread catches up.
_MAX_PENDING_RESULTS = 4


def main():
    """Script entry point: process the configured PDF."""
    de_column_ify()


def log(*args):
    """Print *args* like ``print`` does, flushing stdout immediately."""
    print(*args, flush=True)


def de_column_ify():
    """Process every page of ``config.INPUT`` and show the debug output.

    Each page is submitted to a thread pool. Results are consumed in
    completion order (not page order) from a bounded queue. For every entry
    of a page's result whose ``chars`` is truthy, the entry's ``page`` is
    passed to ``debug.debug_im`` and the outcome to ``debug.debug_show``.

    Raises:
        Exception: whatever a worker raised while processing a page. Without
            this forwarding, ``apply_async`` would silently park the error in
            an unread ``AsyncResult`` and this function would block forever
            waiting for a queue item that never arrives.
    """
    results = queue.Queue(maxsize=_MAX_PENDING_RESULTS)
    with pdfplumber.open(config.INPUT) as pdf:
        with ThreadPool(_WORKER_COUNT) as pool:
            pages = pdf.pages
            total = len(pages)
            for page in pages:
                pool.apply_async(_de_columnify_page_guarded, (results, page))

            # Consume inside the pool's ``with`` block: leaving it terminates
            # the pool, so results must be drained before that happens.
            # Every submitted task puts exactly one item on the queue.
            for i in range(total):
                log("getting", i, "of", total)
                got = results.get()
                if isinstance(got, Exception):
                    # A worker failed; surface it instead of hanging.
                    raise got
                _page, columns = got
                for column in columns:
                    if column.chars:
                        debug.debug_show(debug.debug_im(column.page))


def _de_columnify_page_guarded(q, page):
    """Run ``de_columnify_page`` and forward any failure through *q*.

    Guarantees that exactly one item reaches the queue per task: either the
    ``(page, result)`` tuple put by ``de_columnify_page`` (its ``put`` is the
    last statement, so it cannot both put and then fail), or the exception
    instance that was raised.
    """
    try:
        de_columnify_page(q, page)
    except Exception as exc:  # thread boundary: re-raised by the consumer
        q.put(exc)


def de_columnify_page(q, page):
    """Divide one page into columns and put ``(page, result)`` on *q*.

    Args:
        q: queue receiving the ``(page, result)`` tuple; ``put`` blocks while
            the queue is full.
        page: a page object from the opened PDF; its ``chars`` and
            ``page_number`` attributes are read here.
    """
    result = cluster.Chars(page.chars, page).divide_into_columns()
    log("putting", page.page_number)
    q.put((page, result))


if __name__ == "__main__":
    main()