diff --git a/cluster.py b/cluster.py
index 4138bcb..5ddf250 100644
--- a/cluster.py
+++ b/cluster.py
@@ -91,10 +91,15 @@ class Chars:
         original_reader = pypdf.PdfReader(self.path)
         modified_writer = pypdf.PdfWriter()
         modified_page = original_reader.pages[self.page.page_number-1]
-        modified_page.mediabox.upper_right = (bounds.x0, bounds.y0)
-        modified_page.mediabox.upper_left = (bounds.x1, bounds.y0)
-        modified_page.mediabox.lower_right = (bounds.x0, bounds.y1)
-        modified_page.mediabox.lower_left = (bounds.x1, bounds.y1)
+        # Set the trim and crop boxes from the bounds; the mediabox is no longer modified.
+        modified_page.trimbox.upper_right = (bounds.x0, bounds.y0)
+        modified_page.trimbox.upper_left = (bounds.x1, bounds.y0)
+        modified_page.trimbox.lower_right = (bounds.x0, bounds.y1)
+        modified_page.trimbox.lower_left = (bounds.x1, bounds.y1)
+        modified_page.cropbox.upper_right = (bounds.x0, bounds.y0)
+        modified_page.cropbox.upper_left = (bounds.x1, bounds.y0)
+        modified_page.cropbox.lower_right = (bounds.x0, bounds.y1)
+        modified_page.cropbox.lower_left = (bounds.x1, bounds.y1)
         modified_writer.add_page(modified_page)
         modified_path = "{}/{}-{:03d}-{}.modified.pdf".format(
             config.TEMP_DIR,
diff --git a/main.py b/main.py
index 1de3801..0017fdf 100644
--- a/main.py
+++ b/main.py
@@ -5,6 +5,7 @@ import pdfplumber
 from multiprocessing.pool import ThreadPool
 import pypdf
 import queue
+import subprocess
 
 def main():
     de_column_ify()
@@ -30,14 +31,25 @@ def de_column_ify():
         #if got_i.chars:
         #    debug.debug_show(debug.debug_im(got_i.page))
 
+    log("merging", len(cropped_pages), "de-column-ified pages")
     cropped_pages = sorted(cropped_pages)
     writer = pypdf.PdfWriter()
     for cropped_page in cropped_pages:
         with open(cropped_page[1], "rb") as f:
             reader = pypdf.PdfReader(f)
             writer.add_page(reader.pages[0])
-    with open(config.INPUT + ".de_column_ified.pdf", "wb") as f:
+    log("dumping de-column-ified pages")
+    # The merged PDF is written under TEMP_DIR first; Ghostscript then rewrites it next to the input.
+    fat_output = f'{config.TEMP_DIR}/{config.INPUT.split("/")[-1]}.de_column_ified.pdf'
+    with open(fat_output, "wb") as f:
         writer.write(f)
+    output = f'{config.INPUT}.de-column-ified.pdf'
+    log("shrinking de-column-ified pages")
+    # Pass argv as a list (no shell) so quotes or spaces in the paths cannot break
+    # or inject into the command; check=True raises CalledProcessError if gs fails.
+    gs_command = ["gs", "-q", "-dNOPAUSE", "-dBATCH", "-sDEVICE=pdfwrite", f"-sOutputFile={output}", fat_output]
+    subprocess.run(gs_command, check=True)
+    log(output)
 
 def de_columnify_page(q, path, page):
     for _ in range(3):