ok serial

master
bel 2023-02-21 13:02:00 -07:00
parent cbd964c868
commit 10b280c606
2 changed files with 8 additions and 4 deletions

View File

@@ -5,4 +5,5 @@ DEBUG_NO_SHOW = os.environ.get("DEBUG_NO_SHOW", "")
DEBUG_HEIGHT = int(os.environ.get("DEBUG_HEIGHT", "800"))
INPUT = os.environ.get("INPUT", "./testdata/input.pdf")
TEMP_DIR = os.environ.get("TEMP_DIR", "/tmp/dnd-pdf-to-txt.d")
PARALLEL = int(os.environ.get("PARALLEL", "0"))
os.makedirs(TEMP_DIR, exist_ok=True)

11
main.py
View File

@@ -13,19 +13,22 @@ def log(*args):
print(*args, flush=True)
def de_column_ify():
q = queue.Queue(maxsize=4)
q = queue.Queue(maxsize=4 if config.PARALLEL else 0)
with pdfplumber.open(config.INPUT) as pdf:
cropped_pages = []
with ThreadPool(4) as pool:
for i in range(len(pdf.pages)):
pool.apply_async(de_columnify_page, (q, config.INPUT, pdf.pages[i], ))
if config.PARALLEL:
pool.apply_async(de_columnify_page, (q, config.INPUT, pdf.pages[i], ))
else:
de_columnify_page(q, config.INPUT, pdf.pages[i])
for i in range(len(pdf.pages)):
log("getting", i, "of", len(pdf.pages))
got = q.get()
for got_i in got[1]:
cropped_pages.append((got[0], got_i.path))
#if got_i.chars:
# debug.debug_show(debug.debug_im(got_i.page))
if got_i.chars:
debug.debug_show(debug.debug_im(got_i.page))
cropped_pages = sorted(cropped_pages)
writer = pypdf.PdfWriter()