Compare commits
10 Commits
689998d71f...0b1814fb81
| Author | SHA1 | Date | |
|---|---|---|---|
| | 0b1814fb81 | | |
| | 1730116aff | | |
| | fa71cc49f7 | | |
| | 64324508b8 | | |
| | 7e93ba51aa | | |
| | 138f1f51d9 | | |
| | 10b280c606 | | |
| | cbd964c868 | | |
| | ab878e9de8 | | |
| | 006a66941f | | |
@@ -1,3 +1,5 @@
|
||||
# dnd-pdf-to-txt
|
||||
|
||||
[original](https://stackoverflow.com/questions/55100037/how-to-extract-text-from-two-column-pdf-with-python)
|
||||
|
||||
[TODO](https://github.com/EllatharTheHalfling/DnD-Books/tree/master/5e)
|
||||
|
||||
20
cluster.py
20
cluster.py
@@ -82,19 +82,23 @@ class Chars:
|
||||
j += 1
|
||||
i.merge()
|
||||
assert(len(i.chars) == 1)
|
||||
i.chars[0]["x0"] -= median_height
|
||||
i.chars[0]["x1"] += median_height
|
||||
i.chars[0]["y0"] -= median_height
|
||||
i.chars[0]["y1"] += median_height
|
||||
#i.chars[0]["x0"] -= median_height
|
||||
#i.chars[0]["x1"] += median_height
|
||||
#i.chars[0]["y0"] -= median_height
|
||||
#i.chars[0]["y1"] += median_height
|
||||
bounds = i._box()
|
||||
|
||||
original_reader = pypdf.PdfReader(self.path)
|
||||
modified_writer = pypdf.PdfWriter()
|
||||
modified_page = original_reader.pages[self.page.page_number-1]
|
||||
modified_page.mediabox.upper_right = (bounds.x0, bounds.y0)
|
||||
modified_page.mediabox.upper_left = (bounds.x1, bounds.y0)
|
||||
modified_page.mediabox.lower_right = (bounds.x0, bounds.y1)
|
||||
modified_page.mediabox.lower_left = (bounds.x1, bounds.y1)
|
||||
modified_page.trimbox.upper_right = (bounds.x0, bounds.y0)
|
||||
modified_page.trimbox.upper_left = (bounds.x1, bounds.y0)
|
||||
modified_page.trimbox.lower_right = (bounds.x0, bounds.y1)
|
||||
modified_page.trimbox.lower_left = (bounds.x1, bounds.y1)
|
||||
modified_page.cropbox.upper_right = (bounds.x0, bounds.y0-median_height)
|
||||
modified_page.cropbox.upper_left = (bounds.x1, bounds.y0-median_height)
|
||||
modified_page.cropbox.lower_right = (bounds.x0, bounds.y1+median_height)
|
||||
modified_page.cropbox.lower_left = (bounds.x1, bounds.y1+median_height)
|
||||
modified_writer.add_page(modified_page)
|
||||
modified_path = "{}/{}-{:03d}-{}.modified.pdf".format(
|
||||
config.TEMP_DIR,
|
||||
|
||||
@@ -5,4 +5,5 @@ DEBUG_NO_SHOW = os.environ.get("DEBUG_NO_SHOW", "")
|
||||
DEBUG_HEIGHT = int(os.environ.get("DEBUG_HEIGHT", "800"))
|
||||
INPUT = os.environ.get("INPUT", "./testdata/input.pdf")
|
||||
TEMP_DIR = os.environ.get("TEMP_DIR", "/tmp/dnd-pdf-to-txt.d")
|
||||
PARALLEL = int(os.environ.get("PARALLEL", "0"))
|
||||
os.makedirs(TEMP_DIR, exist_ok=True)
|
||||
|
||||
40
main.py
40
main.py
@@ -3,7 +3,9 @@ import cluster
|
||||
import config
|
||||
import pdfplumber
|
||||
from multiprocessing.pool import ThreadPool
|
||||
import pypdf
|
||||
import queue
|
||||
import subprocess
|
||||
|
||||
def main():
|
||||
de_column_ify()
|
||||
@@ -12,22 +14,46 @@ def log(*args):
|
||||
print(*args, flush=True)
|
||||
|
||||
def de_column_ify():
|
||||
q = queue.Queue(maxsize=4)
|
||||
q = queue.Queue(maxsize=4 if config.PARALLEL else 0)
|
||||
with pdfplumber.open(config.INPUT) as pdf:
|
||||
cropped_pages = []
|
||||
with ThreadPool(4) as pool:
|
||||
for i in range(len(pdf.pages)):
|
||||
pool.apply_async(de_columnify_page, (q, config.INPUT, pdf.pages[i], ))
|
||||
if config.PARALLEL:
|
||||
pool.apply_async(de_columnify_page, (q, config.INPUT, pdf.pages[i], ))
|
||||
else:
|
||||
de_columnify_page(q, config.INPUT, pdf.pages[i])
|
||||
for i in range(len(pdf.pages)):
|
||||
log("getting", i, "of", len(pdf.pages))
|
||||
got = q.get()
|
||||
for got_i in got[1]:
|
||||
if got_i.chars:
|
||||
debug.debug_show(debug.debug_im(got_i.page))
|
||||
cropped_pages.append((got[0], got_i.path))
|
||||
#if got_i.chars:
|
||||
# debug.debug_show(debug.debug_im(got_i.page))
|
||||
|
||||
log("merging", len(cropped_pages), "de-column-ified pages")
|
||||
cropped_pages = sorted(cropped_pages)
|
||||
writer = pypdf.PdfWriter()
|
||||
for cropped_page in cropped_pages:
|
||||
with open(cropped_page[1], "rb") as f:
|
||||
reader = pypdf.PdfReader(f)
|
||||
writer.add_page(reader.pages[0])
|
||||
log("dumping de-column-ified pages")
|
||||
output = f'{config.INPUT}.de-column-ified.pdf'
|
||||
with open(output, "wb") as f:
|
||||
writer.write(f)
|
||||
log(output)
|
||||
|
||||
def de_columnify_page(q, path, page):
|
||||
result = cluster.Chars(path, page.chars, page).divide_into_columns()
|
||||
log("putting", page.page_number)
|
||||
q.put((page, result))
|
||||
for _ in range(3):
|
||||
try:
|
||||
result = cluster.Chars(path, page.chars, page).divide_into_columns()
|
||||
log("putting", page.page_number, len(result))
|
||||
q.put((page.page_number, result))
|
||||
return
|
||||
except Exception as e:
|
||||
log(page.page_number, "encountered", e)
|
||||
raise Exception(f"failure for {page.page_number}")
|
||||
|
||||
def textify(page):
|
||||
lines = page.extract_text(layout=True).split("\n")
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
pdfplumber
|
||||
pypdf
|
||||
|
||||
Reference in New Issue
Block a user