import debug
import cluster
import config
import pdfplumber
from multiprocessing.pool import ThreadPool
import pypdf
import queue
import subprocess

# How long the consumer waits on the result queue before checking whether a
# worker task has failed.
_RESULT_POLL_SECONDS = 0.5


def main():
    """Script entry point: de-column-ify the PDF named by ``config.INPUT``."""
    de_column_ify()


def log(*args):
    """Print ``args`` and flush immediately so progress output is not buffered."""
    print(*args, flush=True)


def de_column_ify():
    """Split every page of ``config.INPUT`` into columns and merge them.

    Each page is handed to ``de_columnify_page`` (on a thread pool when
    ``config.PARALLEL`` is truthy, inline otherwise).  The per-column PDF
    files it reports are then concatenated, one page from each file, into
    ``<config.INPUT>.de-column-ified.pdf``.

    Raises:
        Exception: if any page still fails after ``de_columnify_page`` has
            exhausted its retries (in both parallel and sequential mode).
    """
    with pdfplumber.open(config.INPUT) as pdf:
        cropped_pages = _collect_cropped_pages(pdf)

        log("merging", len(cropped_pages), "de-column-ified pages")
        # NOTE(review): tuples sort by page number first and then by path
        # string, so columns within one page are ordered by their file path
        # rather than by the order divide_into_columns() returned them in.
        # Confirm that this matches the intended reading order.
        cropped_pages = sorted(cropped_pages)
        writer = pypdf.PdfWriter()
        for _, cropped_path in cropped_pages:
            with open(cropped_path, "rb") as f:
                reader = pypdf.PdfReader(f)
                writer.add_page(reader.pages[0])

        log("dumping de-column-ified pages")
        output = f'{config.INPUT}.de-column-ified.pdf'
        with open(output, "wb") as f:
            writer.write(f)
        log(output)


def _collect_cropped_pages(pdf):
    """Run ``de_columnify_page`` on every page and gather the results.

    Returns a list of ``(page_number, path)`` tuples, one per column that
    was reported for a page, in the order the results arrived.
    """
    pages = pdf.pages
    total = len(pages)
    # A bounded queue in parallel mode applies back-pressure to the workers;
    # in sequential mode every result is queued before anything is consumed,
    # so the queue has to be unbounded (maxsize=0).
    q = queue.Queue(maxsize=4 if config.PARALLEL else 0)
    pending = []
    cropped_pages = []
    with ThreadPool(4) as pool:
        for page in pages:
            if config.PARALLEL:
                # Keep the AsyncResult: it is the only place a worker's
                # exception is recorded.
                pending.append(
                    pool.apply_async(de_columnify_page, (q, config.INPUT, page))
                )
            else:
                de_columnify_page(q, config.INPUT, page)

        # Results must be drained while the pool is still alive: leaving the
        # ``with`` block terminates it.
        for i in range(total):
            log("getting", i, "of", total)
            page_number, columns = _next_result(q, pending)
            for column in columns:
                cropped_pages.append((page_number, column.path))
                # Debugging aid (disabled):
                # if column.chars:
                #     debug.debug_show(debug.debug_im(column.page))
    return cropped_pages


def _next_result(q, pending):
    """Return the next queued result, re-raising any worker failure.

    A failed ``apply_async`` task never puts anything on the queue, so a
    plain blocking ``q.get()`` would wait forever.  Instead the queue is
    polled with a timeout, and whenever it is empty the outstanding tasks
    are checked for one that finished with an exception.
    """
    while True:
        try:
            return q.get(timeout=_RESULT_POLL_SECONDS)
        except queue.Empty:
            pass
        for task in pending:
            if task.ready() and not task.successful():
                task.get()  # re-raises the worker's exception here


def de_columnify_page(q, path, page):
    """Divide one page into columns and put the result on ``q``.

    Makes up to three attempts.  On success, puts
    ``(page.page_number, result)`` on the queue, where ``result`` is
    whatever ``cluster.Chars(...).divide_into_columns()`` returned.

    Args:
        q: queue receiving the ``(page_number, result)`` tuple.
        path: path of the source PDF, passed through to ``cluster.Chars``.
        page: page object providing ``chars`` and ``page_number``.

    Raises:
        Exception: if all three attempts fail; the last underlying error is
            attached as ``__cause__``.
    """
    last_error = None
    for _ in range(3):
        try:
            result = cluster.Chars(path, page.chars, page).divide_into_columns()
            log("putting", page.page_number, len(result))
            q.put((page.page_number, result))
            return
        except Exception as e:
            log(page.page_number, "encountered", e)
            last_error = e
    raise Exception(f"failure for {page.page_number}") from last_error


def textify(page):
    """Print the page's text re-flowed into paragraphs.

    The (upper) median count of leading whitespace characters across all
    lines is treated as the left margin and stripped.  A line that still
    starts with a space after that is treated as the first line of a new
    paragraph.  Each paragraph is printed as a single space-joined line.
    """
    lines = page.extract_text(layout=True).split("\n")
    leading_spaces = [len(line) - len(line.lstrip()) for line in lines]
    median_leading_spaces = sorted(leading_spaces)[len(leading_spaces) // 2]
    margin = median_leading_spaces * " "

    paragraphs = []
    for line in lines:
        if line.startswith(margin):
            line = line[median_leading_spaces:]
        if line.startswith(" "):
            # Mark an indented line with a tab so it opens a new paragraph.
            line = "\t" + line[1:]
        if not paragraphs or line.startswith("\t"):
            # NOTE(review): two lists are appended; the first stays empty and
            # prints as a blank line.  Presumably an intentional paragraph
            # separator -- confirm.
            paragraphs.append([])
            paragraphs.append([])
        paragraphs[-1].append(line.rstrip())

    for paragraph in paragraphs:
        print(" ".join(paragraph))


if __name__ == "__main__":
    main()