From ca2f4c0fac9d11e55c628bfa84770a6d1ab0e249 Mon Sep 17 00:00:00 2001 From: bel Date: Tue, 21 Feb 2023 12:48:50 -0700 Subject: [PATCH] ok we exporting cropped pdfs ok --- cluster.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/cluster.py b/cluster.py index c8c0619..c60ebb9 100644 --- a/cluster.py +++ b/cluster.py @@ -1,4 +1,6 @@ import config +import pypdf +import pdfplumber class Chars: def __init__(self, path, chars, page): @@ -75,7 +77,9 @@ class Chars: result2.append(sub) result = result2 + j = 0 for i in result: + j += 1 i.merge() assert(len(i.chars) == 1) i.chars[0]["x0"] -= median_height @@ -83,12 +87,25 @@ class Chars: i.chars[0]["y0"] -= median_height i.chars[0]["y1"] += median_height bounds = i._box() - i.page = self.page.crop(( - bounds.x0, - self.page.height - bounds.y1, - bounds.x1, - self.page.height - bounds.y0, - ), relative=True) + + original_reader = pypdf.PdfReader(self.path) + modified_writer = pypdf.PdfWriter() + modified_page = original_reader.pages[self.page.page_number-1] + modified_page.mediabox.upper_right = (bounds.x0, bounds.y0) + modified_page.mediabox.upper_left = (bounds.x1, bounds.y0) + modified_page.mediabox.lower_right = (bounds.x0, bounds.y1) + modified_page.mediabox.lower_left = (bounds.x1, bounds.y1) + modified_writer.add_page(modified_page) + modified_path = "/tmp/{}-{:03d}-{}.modified.pdf".format( + self.path.split("/")[-1], + self.page.page_number, + j, + ) + with open(modified_path, "wb") as mwf: + modified_writer.write(mwf) + with pdfplumber.open(modified_path) as modified_pdf: + i.path = modified_path + i.page = modified_pdf.pages[0] return result