master
bel 2023-02-21 14:12:59 -07:00
parent 7e93ba51aa
commit 64324508b8
2 changed files with 17 additions and 5 deletions

View File

@@ -91,10 +91,14 @@ class Chars:
original_reader = pypdf.PdfReader(self.path) original_reader = pypdf.PdfReader(self.path)
modified_writer = pypdf.PdfWriter() modified_writer = pypdf.PdfWriter()
modified_page = original_reader.pages[self.page.page_number-1] modified_page = original_reader.pages[self.page.page_number-1]
modified_page.mediabox.upper_right = (bounds.x0, bounds.y0) modified_page.trimbox.upper_right = (bounds.x0, bounds.y0)
modified_page.mediabox.upper_left = (bounds.x1, bounds.y0) modified_page.trimbox.upper_left = (bounds.x1, bounds.y0)
modified_page.mediabox.lower_right = (bounds.x0, bounds.y1) modified_page.trimbox.lower_right = (bounds.x0, bounds.y1)
modified_page.mediabox.lower_left = (bounds.x1, bounds.y1) modified_page.trimbox.lower_left = (bounds.x1, bounds.y1)
modified_page.cropbox.upper_right = (bounds.x0, bounds.y0)
modified_page.cropbox.upper_left = (bounds.x1, bounds.y0)
modified_page.cropbox.lower_right = (bounds.x0, bounds.y1)
modified_page.cropbox.lower_left = (bounds.x1, bounds.y1)
modified_writer.add_page(modified_page) modified_writer.add_page(modified_page)
modified_path = "{}/{}-{:03d}-{}.modified.pdf".format( modified_path = "{}/{}-{:03d}-{}.modified.pdf".format(
config.TEMP_DIR, config.TEMP_DIR,

10
main.py
View File

@@ -5,6 +5,7 @@ import pdfplumber
from multiprocessing.pool import ThreadPool from multiprocessing.pool import ThreadPool
import pypdf import pypdf
import queue import queue
import subprocess
def main(): def main():
de_column_ify() de_column_ify()
@@ -30,14 +31,21 @@ def de_column_ify():
#if got_i.chars: #if got_i.chars:
# debug.debug_show(debug.debug_im(got_i.page)) # debug.debug_show(debug.debug_im(got_i.page))
log("merging", len(cropped_pages), "de-column-ified pages")
cropped_pages = sorted(cropped_pages) cropped_pages = sorted(cropped_pages)
writer = pypdf.PdfWriter() writer = pypdf.PdfWriter()
for cropped_page in cropped_pages: for cropped_page in cropped_pages:
with open(cropped_page[1], "rb") as f: with open(cropped_page[1], "rb") as f:
reader = pypdf.PdfReader(f) reader = pypdf.PdfReader(f)
writer.add_page(reader.pages[0]) writer.add_page(reader.pages[0])
with open(config.INPUT + ".de_column_ified.pdf", "wb") as f: log("dumping de-column-ified pages")
fat_output = f'{config.TEMP_DIR}/{config.INPUT.split("/")[-1]}.de_column_ified.pdf'
with open(fat_output, "wb") as f:
writer.write(f) writer.write(f)
output = f'{config.INPUT}.de-column-ified.pdf'
log("shrinking de-column-ified pages")
subprocess.run(f"gs -q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -sOutputFile='{output}' '{fat_output}'", shell=True)
log(output)
def de_columnify_page(q, path, page): def de_columnify_page(q, path, page):
for _ in range(3): for _ in range(3):