diff --git a/config.py b/config.py index 2b908a4..07dad9f 100644 --- a/config.py +++ b/config.py @@ -5,4 +5,5 @@ DEBUG_NO_SHOW = os.environ.get("DEBUG_NO_SHOW", "") DEBUG_HEIGHT = int(os.environ.get("DEBUG_HEIGHT", "800")) INPUT = os.environ.get("INPUT", "./testdata/input.pdf") TEMP_DIR = os.environ.get("TEMP_DIR", "/tmp/dnd-pdf-to-txt.d") +PARALLEL = int(os.environ.get("PARALLEL", "0")) os.makedirs(TEMP_DIR, exist_ok=True) diff --git a/main.py b/main.py index 5722594..621173e 100644 --- a/main.py +++ b/main.py @@ -13,19 +13,22 @@ def log(*args): print(*args, flush=True) def de_column_ify(): - q = queue.Queue(maxsize=4) + q = queue.Queue(maxsize=4 if config.PARALLEL else 0) with pdfplumber.open(config.INPUT) as pdf: cropped_pages = [] with ThreadPool(4) as pool: for i in range(len(pdf.pages)): - pool.apply_async(de_columnify_page, (q, config.INPUT, pdf.pages[i], )) + if config.PARALLEL: + pool.apply_async(de_columnify_page, (q, config.INPUT, pdf.pages[i], )) + else: + de_columnify_page(q, config.INPUT, pdf.pages[i]) for i in range(len(pdf.pages)): log("getting", i, "of", len(pdf.pages)) got = q.get() for got_i in got[1]: cropped_pages.append((got[0], got_i.path)) - #if got_i.chars: - # debug.debug_show(debug.debug_im(got_i.page)) + if got_i.chars: + debug.debug_show(debug.debug_im(got_i.page)) cropped_pages = sorted(cropped_pages) writer = pypdf.PdfWriter()