threading is coo

master
bel 2023-02-21 11:50:47 -07:00
parent f940365111
commit a08cb2927a
1 changed files with 32 additions and 12 deletions

44
main.py
View File

@ -2,21 +2,41 @@ import debug
import cluster
import config
import pdfplumber
from multiprocessing.pool import ThreadPool
import queue
def main():
    """Entry point of the script; delegates all work to ``de_column_ify``."""
    de_column_ify()
def log(*args):
    """Print ``args`` like ``print`` does, flushing the stream immediately.

    The flush makes each message appear as soon as it is emitted instead of
    waiting in the output buffer.
    """
    print(*args, flush=True)
def de_column_ify():
    """Divide every page of the configured PDF into columns and draw debug boxes.

    Opens ``config.INPUT`` with pdfplumber and submits one
    ``de_columnify_page`` task per page to a pool of four threads. Each task
    puts ``(idx, page, result)`` on a bounded queue. This function takes one
    item off the queue per submitted page, in whatever order the items
    arrive, and hands it to ``debug.draw_boxes``.

    Each page is processed exactly once, by the pool.

    Raises:
        Exception: whatever a page task raised, re-raised here as soon as
            that task's entry is taken off the queue.
    """
    q = queue.Queue(maxsize=4)

    def run_page(idx, page):
        # apply_async drops a task's exception unless someone calls .get() on
        # the returned AsyncResult. A failed task would then never put
        # anything on the queue and the consumer loop below would block in
        # q.get() forever, so the failure is queued in place of the result.
        try:
            de_columnify_page(q, idx, page)
        except Exception as exc:
            q.put((idx, page, exc))

    with pdfplumber.open(config.INPUT) as pdf, ThreadPool(4) as pool:
        pages = pdf.pages
        n = len(pages)
        for idx, page in enumerate(pages):
            pool.apply_async(run_page, (idx, page))

        # Exactly one queue entry is produced per submitted page (a result or
        # a failure), so n gets drain the queue completely.
        for i in range(n):
            log("getting", i, "of", n)
            _, page, columns = q.get()
            if isinstance(columns, Exception):
                raise columns
            # One box per non-empty column, taken from the bounds of that
            # column's first character and labelled with the column's ``n``.
            debug.draw_boxes(page, [
                {
                    "x0": column.chars[0]["x0"],
                    "x1": column.chars[0]["x1"],
                    "y0": column.chars[0]["y0"],
                    "y1": column.chars[0]["y1"],
                    "debug_label": column.n,
                }
                for column in columns if column.chars
            ])
def de_columnify_page(q, idx, page):
    """Divide one page into columns and put the outcome on ``q``.

    Args:
        q: Object with a ``put`` method that receives the outcome.
        idx: Value identifying the page; logged and passed through unchanged.
        page: Page object; its ``chars`` attribute and the page itself are
            given to ``cluster.Chars``.

    The item put on ``q`` is the tuple ``(idx, page, columns)``, where
    ``columns`` is whatever ``divide_into_columns()`` returned.
    """
    page_chars = cluster.Chars(page.chars, page)
    columns = page_chars.divide_into_columns()
    log("putting", idx)
    outcome = (idx, page, columns)
    q.put(outcome)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()