master
bel 2023-02-21 11:52:16 -07:00
parent a08cb2927a
commit f54dbfbeec
1 changed file with 8 additions and 10 deletions

18
main.py
View File

@@ -15,14 +15,12 @@ def de_column_ify():
q = queue.Queue(maxsize=4) q = queue.Queue(maxsize=4)
with pdfplumber.open(config.INPUT) as pdf: with pdfplumber.open(config.INPUT) as pdf:
with ThreadPool(4) as pool: with ThreadPool(4) as pool:
n = 0
for i in range(len(pdf.pages)): for i in range(len(pdf.pages)):
pool.apply_async(de_columnify_page, (q, n, pdf.pages[i], )) pool.apply_async(de_columnify_page, (q, pdf.pages[i], ))
n += 1 for i in range(len(pdf.pages)):
for i in range(n): log("getting", i, "of", len(pdf.pages))
log("getting", i, "of", n)
got = q.get() got = q.get()
debug.draw_boxes(got[1] , [ debug.draw_boxes(got[0] , [
{ {
"x0": i.chars[0]["x0"], "x0": i.chars[0]["x0"],
"x1": i.chars[0]["x1"], "x1": i.chars[0]["x1"],
@@ -30,13 +28,13 @@ def de_column_ify():
"y1": i.chars[0]["y1"], "y1": i.chars[0]["y1"],
"debug_label": i.n, "debug_label": i.n,
} }
for i in got[2] if i.chars for i in got[1] if i.chars
]) ])
def de_columnify_page(q, idx, page): def de_columnify_page(q, page):
result = cluster.Chars(page.chars, page).divide_into_columns() result = cluster.Chars(page.chars, page).divide_into_columns()
log("putting", idx) log("putting", page.page_number)
q.put((idx, page, result)) q.put((page, result))
if __name__ == "__main__": if __name__ == "__main__":
main() main()