"""Print the text of the PDF at ``config.INPUT``, one detected column at a time."""

import queue
from multiprocessing.pool import ThreadPool

import pdfplumber

import debug
import cluster
import config

# Number of worker threads analysing pages concurrently.
WORKER_COUNT = 4


def main() -> None:
    """Script entry point: run the column-splitting pipeline."""
    de_column_ify()


def log(*args) -> None:
    """Print a progress message and flush immediately.

    Note that this writes to stdout, the same stream ``textify`` prints the
    extracted text to.
    """
    print(*args, flush=True)


def de_column_ify() -> None:
    """Split every page of ``config.INPUT`` into columns and print their text.

    Pages are analysed on a thread pool, but their text is emitted strictly
    in page order. ``ThreadPool.imap`` is used instead of fire-and-forget
    ``apply_async`` calls feeding a shared queue because:

    * results come back in submission order, so the output no longer depends
      on which worker happens to finish first;
    * an exception raised in a worker is re-raised here, rather than being
      dropped and leaving the consumer blocked forever on an empty queue.
    """
    with pdfplumber.open(config.INPUT) as pdf:
        pages = pdf.pages
        total = len(pages)
        with ThreadPool(WORKER_COUNT) as pool:
            results = pool.imap(_columns_for_page, pages)
            for index in range(total):
                # Log before blocking so slow pages still show progress.
                log("getting", index, "of", total)
                columns = next(results)
                for column in columns:
                    if column.chars:
                        # Debug preview is deliberately short-circuited:
                        # `True or ...` never evaluates its right-hand side.
                        True or debug.debug_show(debug.debug_im(column.page))
                        textify(column.page)


def _columns_for_page(page):
    """Return the columns ``cluster`` detects on ``page`` (pool worker body)."""
    return cluster.Chars(page.chars, page).divide_into_columns()


def de_columnify_page(q: queue.Queue, page) -> None:
    """Compute the columns of ``page`` and put ``(page, columns)`` on ``q``.

    Kept for callers that still drive the pipeline through a queue;
    ``de_column_ify`` itself now collects results via ``ThreadPool.imap``.
    """
    result = _columns_for_page(page)
    log("putting", page.page_number)
    q.put((page, result))


def textify(page) -> None:
    """Print the layout-preserving text of ``page`` as joined paragraphs.

    The median leading-space count over all lines is treated as the left
    margin and removed from lines that have at least that much indentation.
    A line that is still indented after that starts a new paragraph. Each
    paragraph start appends two lists, so an empty paragraph (printed as a
    blank line) precedes every paragraph.
    """
    lines = page.extract_text(layout=True).split("\n")

    # Upper median of the per-line indentation widths.
    indents = sorted(len(line) - len(line.lstrip()) for line in lines)
    margin = indents[len(indents) // 2]
    margin_prefix = " " * margin

    paragraphs = []
    for line in lines:
        if line.startswith(margin_prefix):
            line = line[margin:]
        # Remaining indentation marks the first line of a paragraph.
        if line.startswith(" "):
            line = "\t" + line[1:]
        if not paragraphs or line.startswith("\t"):
            paragraphs.append([])
            paragraphs.append([])
        paragraphs[-1].append(line.rstrip())

    for paragraph in paragraphs:
        print(" ".join(paragraph))


if __name__ == "__main__":
    main()