40 lines
1.1 KiB
Python
40 lines
1.1 KiB
Python
import debug
|
|
import cluster
|
|
import config
|
|
import pdfplumber
|
|
from multiprocessing.pool import ThreadPool
|
|
import queue
|
|
|
|
def main():
    """Script entry point: run the de-columnification pass over the input PDF."""
    # All of the work lives in de_column_ify(); main() only exists so the
    # module has a single, conventional entry point.
    de_column_ify()
|
|
|
|
def log(*args):
    """Print *args* like ``print`` does, flushing stdout immediately.

    The flush keeps progress messages emitted from worker threads visible
    as soon as they are written rather than when the buffer fills.
    """
    print(*args, flush=True)
|
|
|
|
def de_column_ify():
    """Split every page of ``config.INPUT`` into columns and draw debug boxes.

    Pages are processed by ``de_columnify_page`` on a pool of 4 threads. Each
    worker puts a ``(page, columns)`` tuple on a bounded queue (at most 4
    pending results, so workers block instead of piling up results), and this
    function consumes exactly one queue item per page. For every column that
    has at least one char, a box around the column's first char is drawn via
    ``debug.draw_boxes`` and labelled with the column's ``n``.

    Raises:
        Exception: the first exception raised by any worker, re-raised after
            all pages have been accounted for.
    """
    q = queue.Queue(maxsize=4)
    failures = []
    with pdfplumber.open(config.INPUT) as pdf:
        pages = pdf.pages
        total = len(pages)
        with ThreadPool(4) as pool:
            for page in pages:
                # A failing task never reaches its own q.put(), so without an
                # error callback the consumer below would wait forever for an
                # item that is never produced. Routing the exception through
                # the same queue guarantees exactly one item per task.
                pool.apply_async(
                    de_columnify_page,
                    (q, page),
                    error_callback=q.put,
                )
            for i in range(total):
                log("getting", i, "of", total)
                got = q.get()
                if isinstance(got, BaseException):
                    # Keep draining so no producer is left blocked on a full
                    # queue; the failure is reported once everything is in.
                    failures.append(got)
                    continue
                for column in got[1]:
                    if not column.chars:
                        continue
                    first_char = column.chars[0]
                    debug.draw_boxes(column.page, [{
                        "x0": first_char["x0"],
                        "x1": first_char["x1"],
                        "y0": first_char["y0"],
                        "y1": first_char["y1"],
                        "debug_label": column.n,
                    }])
    if failures:
        raise failures[0]
|
|
|
|
def de_columnify_page(q, page):
    """Divide one page into columns and hand the result to the consumer.

    Args:
        q: queue that receives a ``(page, columns)`` tuple; ``put`` blocks
            while the queue is full.
        page: page object exposing ``chars`` and ``page_number``.

    The columns are whatever ``cluster.Chars(...).divide_into_columns()``
    returns; the consumer in this file iterates over that value.
    """
    page_chars = cluster.Chars(page.chars, page)
    columns = page_chars.divide_into_columns()
    log("putting", page.page_number)
    q.put((page, columns))
|
|
|
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|