import debug
import cluster
import config
import pdfplumber
from multiprocessing.pool import ThreadPool
import queue

# Number of worker threads running the column extraction.
_WORKER_COUNT = 4
# Capacity of the result queue: workers block in q.put() once this many
# finished pages are waiting to be consumed.
_RESULT_QUEUE_SIZE = 4


def main():
    """Script entry point: run the column extraction on the configured PDF."""
    de_column_ify()


def log(*args):
    """Print *args* exactly like ``print`` and flush stdout immediately."""
    print(*args, flush=True)


def de_column_ify():
    """Divide every page of ``config.INPUT`` into columns and display them.

    Each page is submitted to a thread pool. Workers push their outcome onto
    a bounded queue, and this function consumes one outcome per page, in the
    order the workers finish (not necessarily page order). For every column
    that has chars, the column's page is rendered through
    ``debug.debug_im`` and passed to ``debug.debug_show``.

    Raises:
        Exception: the first exception raised while extracting a page is
            re-raised here. Without this, a failing worker would never put
            anything on the queue and the consumer loop would block forever.
    """
    q = queue.Queue(maxsize=_RESULT_QUEUE_SIZE)
    with pdfplumber.open(config.INPUT) as pdf:
        pages = pdf.pages
        total = len(pages)
        with ThreadPool(_WORKER_COUNT) as pool:
            for page in pages:
                # apply_async discards worker exceptions unless the
                # AsyncResult is inspected, so the wrapper reports failures
                # through the queue instead.
                pool.apply_async(_de_columnify_page_or_report, (q, page))

            # Exactly one queue item is produced per submitted page.
            for i in range(total):
                log("getting", i, "of", total)
                _page, outcome = q.get()
                if isinstance(outcome, BaseException):
                    # Fail fast rather than waiting for results that a
                    # broken worker will never deliver.
                    raise outcome
                for column in outcome:
                    if column.chars:
                        debug.debug_show(debug.debug_im(column.page))
                        # Disabled debugging aid, kept for reference: draws
                        # a labelled box around the column's first char.
                        #debug.draw_boxes(column.page, [{
                        #    "x0": column.chars[0]["x0"],
                        #    "x1": column.chars[0]["x1"],
                        #    "y0": column.chars[0]["y0"],
                        #    "y1": column.chars[0]["y1"],
                        #    "debug_label": column.n,
                        #}])


def _de_columnify_page_or_report(q, page):
    """Run ``de_columnify_page``; on failure put ``(page, exception)`` on *q*.

    This guarantees that every submitted page yields exactly one queue item,
    so the consumer in ``de_column_ify`` can never wait on a missing result.
    """
    try:
        de_columnify_page(q, page)
    except Exception as exc:
        # de_columnify_page puts its result as its last step, so reaching
        # this handler means nothing was queued for this page yet.
        q.put((page, exc))


def de_columnify_page(q, page):
    """Divide one page into columns and put ``(page, result)`` on *q*.

    Args:
        q: queue-like object with a blocking ``put``; receives the result.
        page: page object exposing ``chars`` and ``page_number`` (the pages
            of the PDF opened by ``de_column_ify``).
    """
    result = cluster.Chars(page.chars, page).divide_into_columns()
    log("putting", page.page_number)
    q.put((page, result))


if __name__ == "__main__":
    main()