ok, we're exporting cropped PDFs, ok
parent
1119c46b97
commit
ca2f4c0fac
29
cluster.py
29
cluster.py
|
|
@@ -1,4 +1,6 @@
|
|||
import config
|
||||
import pypdf
|
||||
import pdfplumber
|
||||
|
||||
class Chars:
|
||||
def __init__(self, path, chars, page):
|
||||
|
|
@@ -75,7 +77,9 @@ class Chars:
|
|||
result2.append(sub)
|
||||
result = result2
|
||||
|
||||
j = 0
|
||||
for i in result:
|
||||
j += 1
|
||||
i.merge()
|
||||
assert(len(i.chars) == 1)
|
||||
i.chars[0]["x0"] -= median_height
|
||||
|
|
@@ -83,12 +87,25 @@ class Chars:
|
|||
i.chars[0]["y0"] -= median_height
|
||||
i.chars[0]["y1"] += median_height
|
||||
bounds = i._box()
|
||||
i.page = self.page.crop((
|
||||
bounds.x0,
|
||||
self.page.height - bounds.y1,
|
||||
bounds.x1,
|
||||
self.page.height - bounds.y0,
|
||||
), relative=True)
|
||||
|
||||
original_reader = pypdf.PdfReader(self.path)
|
||||
modified_writer = pypdf.PdfWriter()
|
||||
modified_page = original_reader.pages[self.page.page_number-1]
|
||||
modified_page.mediabox.upper_right = (bounds.x0, bounds.y0)
|
||||
modified_page.mediabox.upper_left = (bounds.x1, bounds.y0)
|
||||
modified_page.mediabox.lower_right = (bounds.x0, bounds.y1)
|
||||
modified_page.mediabox.lower_left = (bounds.x1, bounds.y1)
|
||||
modified_writer.add_page(modified_page)
|
||||
modified_path = "/tmp/{}-{:03d}-{}.modified.pdf".format(
|
||||
self.path.split("/")[-1],
|
||||
self.page.page_number,
|
||||
j,
|
||||
)
|
||||
with open(modified_path, "wb") as mwf:
|
||||
modified_writer.write(mwf)
|
||||
with pdfplumber.open(modified_path) as modified_pdf:
|
||||
i.path = modified_path
|
||||
i.page = modified_pdf.pages[0]
|
||||
|
||||
return result
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue