# dnd-pdf-to-txt/main.py

# 41 lines
# 1.2 KiB
# Python

import debug
import cluster
import config
import pdfplumber
from multiprocessing.pool import ThreadPool
import queue
def main():
    """Script entry point: run the column-splitting pass over the input PDF."""
    de_column_ify()
def log(*args):
    """Print *args* like ``print`` does, flushing stdout immediately.

    The flush keeps messages written from the worker threads and from the
    consumer loop visible as soon as they are produced.
    """
    print(*args, flush=True)
def _next_result(q, pending, poll_seconds=0.5):
    """Return the next item from *q*, surfacing worker failures.

    A worker that raises never reaches its ``q.put()``, and ``apply_async``
    stores the exception on the ``AsyncResult`` instead of propagating it.
    A plain blocking ``q.get()`` would therefore wait forever. Polling with
    a timeout lets us notice a failed task and re-raise its exception.

    Args:
        q: The queue the workers put their results on.
        pending: The ``AsyncResult`` objects returned by ``apply_async``.
        poll_seconds: How long to wait on the queue between failure checks.

    Raises:
        Exception: Whatever exception a failed worker task raised.
    """
    while True:
        try:
            return q.get(timeout=poll_seconds)
        except queue.Empty:
            for task in pending:
                # ready() first: successful() is only valid on finished tasks.
                if task.ready() and not task.successful():
                    task.get()  # re-raises the exception stored by the pool


def de_column_ify():
    """Process every page of ``config.INPUT`` on a thread pool and show results.

    Each page is handed to ``de_columnify_page`` on a 4-thread pool. Workers
    push ``(page, result)`` tuples onto a bounded queue, and this function
    consumes exactly one tuple per page, in completion order (not page
    order). For every element of ``result`` that has a truthy ``chars``
    attribute, ``debug.debug_im(element.page)`` is passed to
    ``debug.debug_show``.

    Raises:
        Exception: Re-raises the exception of any worker task that failed,
            instead of blocking forever on the result queue.
    """
    # Bounded queue: workers block in put() once 4 results are waiting, so
    # finished pages do not pile up faster than they are consumed.
    q = queue.Queue(maxsize=4)
    with pdfplumber.open(config.INPUT) as pdf:
        pages = pdf.pages
        total = len(pages)
        # The consumer loop must stay inside the pool's ``with`` block:
        # leaving the block terminates the pool and drops unfinished work.
        with ThreadPool(4) as pool:
            pending = [
                pool.apply_async(de_columnify_page, (q, page))
                for page in pages
            ]
            for i in range(total):
                log("getting", i, "of", total)
                _page, result = _next_result(q, pending)
                for group in result:
                    if group.chars:
                        debug.debug_show(debug.debug_im(group.page))
                        # Disabled debug overlay, kept for reference
                        # (``got_i`` is ``group`` in the loop above):
                        #debug.draw_boxes(got_i.page, [{
                        #    "x0": got_i.chars[0]["x0"],
                        #    "x1": got_i.chars[0]["x1"],
                        #    "y0": got_i.chars[0]["y0"],
                        #    "y1": got_i.chars[0]["y1"],
                        #    "debug_label": got_i.n,
                        #}])
def de_columnify_page(q, page):
    """Worker task: run the column division for one page and queue the result.

    Args:
        q: Queue that receives a ``(page, result)`` tuple. When it is the
            bounded queue created by ``de_column_ify``, ``put`` blocks until
            the consumer has made room.
        page: A page object exposing ``chars`` and ``page_number``
            (presumably a pdfplumber page; confirm against the caller).
    """
    page_chars = cluster.Chars(page.chars, page)
    divided = page_chars.divide_into_columns()
    log("putting", page.page_number)
    payload = (page, divided)
    q.put(payload)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()