80 lines
2.5 KiB
Python
80 lines
2.5 KiB
Python
import debug
|
|
import cluster
|
|
import config
|
|
import pdfplumber
|
|
from multiprocessing.pool import ThreadPool
|
|
import pypdf
|
|
import queue
|
|
import subprocess
|
|
|
|
def main():
    """Script entry point: run the de-column-ify pipeline once."""
    return de_column_ify()
|
|
|
|
def log(*args):
    """Print ``args`` separated by spaces on one line, flushing the stream.

    Flushing on every call keeps progress messages visible immediately,
    including those emitted from worker threads.
    """
    print(*args, sep=" ", end="\n", flush=True)
|
|
|
|
def de_column_ify():
    """Split every page of ``config.INPUT`` into columns and merge them into one PDF.

    Each page is passed to ``de_columnify_page``, which puts a
    ``(page_number, columns)`` tuple on a shared queue. When
    ``config.PARALLEL`` is truthy the pages are processed on a 4-thread pool
    and the queue is bounded to 4 entries; otherwise pages are processed
    inline and the queue is unbounded so the producer never blocks.

    The ``path`` of every returned column is collected, sorted by
    ``(page_number, path)``, and the first page of each of those files is
    appended to ``<config.INPUT>.de-column-ified.pdf``.

    Raises:
        Exception: whatever ``de_columnify_page`` raised for a page. In
            parallel mode the first worker failure is re-raised here
            instead of leaving the consumer blocked on the queue forever.
    """
    q = queue.Queue(maxsize=4 if config.PARALLEL else 0)
    # Exceptions raised inside pool workers; filled by error_callback.
    # list.append never blocks, so the pool's result thread is not held up.
    failures = []
    cropped_pages = []

    with pdfplumber.open(config.INPUT) as pdf:
        pages = pdf.pages
        total = len(pages)
        with ThreadPool(4) as pool:
            for page in pages:
                if config.PARALLEL:
                    pool.apply_async(
                        de_columnify_page,
                        (q, config.INPUT, page),
                        error_callback=failures.append,
                    )
                else:
                    de_columnify_page(q, config.INPUT, page)

            for i in range(total):
                log("getting", i, "of", total)
                # Poll with a timeout so a failed worker (which never puts a
                # result) is noticed rather than causing a permanent hang.
                got = None
                while got is None:
                    if failures:
                        raise failures[0]
                    try:
                        got = q.get(timeout=1)
                    except queue.Empty:
                        pass
                page_number, columns = got
                for column in columns:
                    cropped_pages.append((page_number, column.path))
                    # Debug aid:
                    # if column.chars:
                    #     debug.debug_show(debug.debug_im(column.page))

    log("merging", len(cropped_pages), "de-column-ified pages")
    # Results arrive in completion order; sorting restores page order
    # (columns within a page are ordered by their path string).
    cropped_pages = sorted(cropped_pages)
    writer = pypdf.PdfWriter()
    for _, column_path in cropped_pages:
        with open(column_path, "rb") as f:
            reader = pypdf.PdfReader(f)
            writer.add_page(reader.pages[0])

    log("dumping de-column-ified pages")
    output = f'{config.INPUT}.de-column-ified.pdf'
    with open(output, "wb") as f:
        writer.write(f)
    log(output)
|
|
|
|
def de_columnify_page(q, path, page, attempts=3):
    """Divide one page into columns and put the result on ``q``.

    Args:
        q: queue receiving a ``(page.page_number, result)`` tuple on success.
        path: forwarded unchanged to ``cluster.Chars``.
        page: page object; its ``chars`` and ``page_number`` attributes are read.
        attempts: how many times to try the column split before giving up
            (defaults to 3, the previously hard-coded value).

    Raises:
        Exception: ``"failure for <page_number>"`` once every attempt has
            failed, chained to the last underlying error so the root cause
            stays visible in the traceback.
    """
    last_error = None
    for _ in range(attempts):
        # Only the column split is retried; logging and queueing the result
        # happen outside the try so their errors are not mistaken for a
        # failed split.
        try:
            result = cluster.Chars(path, page.chars, page).divide_into_columns()
        except Exception as e:
            log(page.page_number, "encountered", e)
            last_error = e
            continue
        log("putting", page.page_number, len(result))
        q.put((page.page_number, result))
        return
    raise Exception(f"failure for {page.page_number}") from last_error
|
|
|
|
def textify(page):
    """Print the page's layout-preserving text re-flowed into paragraphs.

    The median amount of leading whitespace across all lines is treated as
    the left margin and removed from lines that start with that many spaces.
    A line that still begins with a space after that is considered indented:
    its first space becomes a tab and it opens a new paragraph. Lines of a
    paragraph are joined with single spaces and printed, with a blank line
    printed before each paragraph.

    Args:
        page: object whose ``extract_text(layout=True)`` returns a string.
    """
    raw_lines = page.extract_text(layout=True).split("\n")

    # Upper median of the per-line leading-whitespace counts.
    indents = sorted(len(raw) - len(raw.lstrip()) for raw in raw_lines)
    margin_width = indents[len(indents) // 2]
    margin = " " * margin_width

    paragraphs = []
    for raw in raw_lines:
        text = raw[margin_width:] if raw.startswith(margin) else raw
        if text.startswith(" "):
            text = "\t" + text[1:]
        starts_paragraph = text.startswith("\t") or not paragraphs
        if starts_paragraph:
            # The first (empty) list prints as a blank separator line; the
            # second one collects the lines of the new paragraph.
            paragraphs.extend(([], []))
        paragraphs[-1].append(text.rstrip())

    for chunk in paragraphs:
        print(" ".join(chunk))
|
|
|
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|