little
parent
7e93ba51aa
commit
64324508b8
12
cluster.py
12
cluster.py
|
|
@ -91,10 +91,14 @@ class Chars:
|
||||||
original_reader = pypdf.PdfReader(self.path)
|
original_reader = pypdf.PdfReader(self.path)
|
||||||
modified_writer = pypdf.PdfWriter()
|
modified_writer = pypdf.PdfWriter()
|
||||||
modified_page = original_reader.pages[self.page.page_number-1]
|
modified_page = original_reader.pages[self.page.page_number-1]
|
||||||
modified_page.mediabox.upper_right = (bounds.x0, bounds.y0)
|
modified_page.trimbox.upper_right = (bounds.x0, bounds.y0)
|
||||||
modified_page.mediabox.upper_left = (bounds.x1, bounds.y0)
|
modified_page.trimbox.upper_left = (bounds.x1, bounds.y0)
|
||||||
modified_page.mediabox.lower_right = (bounds.x0, bounds.y1)
|
modified_page.trimbox.lower_right = (bounds.x0, bounds.y1)
|
||||||
modified_page.mediabox.lower_left = (bounds.x1, bounds.y1)
|
modified_page.trimbox.lower_left = (bounds.x1, bounds.y1)
|
||||||
|
modified_page.cropbox.upper_right = (bounds.x0, bounds.y0)
|
||||||
|
modified_page.cropbox.upper_left = (bounds.x1, bounds.y0)
|
||||||
|
modified_page.cropbox.lower_right = (bounds.x0, bounds.y1)
|
||||||
|
modified_page.cropbox.lower_left = (bounds.x1, bounds.y1)
|
||||||
modified_writer.add_page(modified_page)
|
modified_writer.add_page(modified_page)
|
||||||
modified_path = "{}/{}-{:03d}-{}.modified.pdf".format(
|
modified_path = "{}/{}-{:03d}-{}.modified.pdf".format(
|
||||||
config.TEMP_DIR,
|
config.TEMP_DIR,
|
||||||
|
|
|
||||||
10
main.py
10
main.py
|
|
@ -5,6 +5,7 @@ import pdfplumber
|
||||||
from multiprocessing.pool import ThreadPool
|
from multiprocessing.pool import ThreadPool
|
||||||
import pypdf
|
import pypdf
|
||||||
import queue
|
import queue
|
||||||
|
import subprocess
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
de_column_ify()
|
de_column_ify()
|
||||||
|
|
@ -30,14 +31,21 @@ def de_column_ify():
|
||||||
#if got_i.chars:
|
#if got_i.chars:
|
||||||
# debug.debug_show(debug.debug_im(got_i.page))
|
# debug.debug_show(debug.debug_im(got_i.page))
|
||||||
|
|
||||||
|
log("merging", len(cropped_pages), "de-column-ified pages")
|
||||||
cropped_pages = sorted(cropped_pages)
|
cropped_pages = sorted(cropped_pages)
|
||||||
writer = pypdf.PdfWriter()
|
writer = pypdf.PdfWriter()
|
||||||
for cropped_page in cropped_pages:
|
for cropped_page in cropped_pages:
|
||||||
with open(cropped_page[1], "rb") as f:
|
with open(cropped_page[1], "rb") as f:
|
||||||
reader = pypdf.PdfReader(f)
|
reader = pypdf.PdfReader(f)
|
||||||
writer.add_page(reader.pages[0])
|
writer.add_page(reader.pages[0])
|
||||||
with open(config.INPUT + ".de_column_ified.pdf", "wb") as f:
|
log("dumping de-column-ified pages")
|
||||||
|
fat_output = f'{config.TEMP_DIR}/{config.INPUT.split("/")[-1]}.de_column_ified.pdf'
|
||||||
|
with open(fat_output, "wb") as f:
|
||||||
writer.write(f)
|
writer.write(f)
|
||||||
|
output = f'{config.INPUT}.de-column-ified.pdf'
|
||||||
|
log("shrinking de-column-ified pages")
|
||||||
|
subprocess.run(f"gs -q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -sOutputFile='{output}' '{fat_output}'", shell=True)
|
||||||
|
log(output)
|
||||||
|
|
||||||
def de_columnify_page(q, path, page):
|
def de_columnify_page(q, path, page):
|
||||||
for _ in range(3):
|
for _ in range(3):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue