Compare commits
10 Commits
689998d71f
...
0b1814fb81
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0b1814fb81 | ||
|
|
1730116aff | ||
|
|
fa71cc49f7 | ||
|
|
64324508b8 | ||
|
|
7e93ba51aa | ||
|
|
138f1f51d9 | ||
|
|
10b280c606 | ||
|
|
cbd964c868 | ||
|
|
ab878e9de8 | ||
|
|
006a66941f |
@@ -1,3 +1,5 @@
|
|||||||
# dnd-pdf-to-txt
|
# dnd-pdf-to-txt
|
||||||
|
|
||||||
[original](https://stackoverflow.com/questions/55100037/how-to-extract-text-from-two-column-pdf-with-python)
|
[original](https://stackoverflow.com/questions/55100037/how-to-extract-text-from-two-column-pdf-with-python)
|
||||||
|
|
||||||
|
[TODO](https://github.com/EllatharTheHalfling/DnD-Books/tree/master/5e)
|
||||||
|
|||||||
20
cluster.py
20
cluster.py
@@ -82,19 +82,23 @@ class Chars:
|
|||||||
j += 1
|
j += 1
|
||||||
i.merge()
|
i.merge()
|
||||||
assert(len(i.chars) == 1)
|
assert(len(i.chars) == 1)
|
||||||
i.chars[0]["x0"] -= median_height
|
#i.chars[0]["x0"] -= median_height
|
||||||
i.chars[0]["x1"] += median_height
|
#i.chars[0]["x1"] += median_height
|
||||||
i.chars[0]["y0"] -= median_height
|
#i.chars[0]["y0"] -= median_height
|
||||||
i.chars[0]["y1"] += median_height
|
#i.chars[0]["y1"] += median_height
|
||||||
bounds = i._box()
|
bounds = i._box()
|
||||||
|
|
||||||
original_reader = pypdf.PdfReader(self.path)
|
original_reader = pypdf.PdfReader(self.path)
|
||||||
modified_writer = pypdf.PdfWriter()
|
modified_writer = pypdf.PdfWriter()
|
||||||
modified_page = original_reader.pages[self.page.page_number-1]
|
modified_page = original_reader.pages[self.page.page_number-1]
|
||||||
modified_page.mediabox.upper_right = (bounds.x0, bounds.y0)
|
modified_page.trimbox.upper_right = (bounds.x0, bounds.y0)
|
||||||
modified_page.mediabox.upper_left = (bounds.x1, bounds.y0)
|
modified_page.trimbox.upper_left = (bounds.x1, bounds.y0)
|
||||||
modified_page.mediabox.lower_right = (bounds.x0, bounds.y1)
|
modified_page.trimbox.lower_right = (bounds.x0, bounds.y1)
|
||||||
modified_page.mediabox.lower_left = (bounds.x1, bounds.y1)
|
modified_page.trimbox.lower_left = (bounds.x1, bounds.y1)
|
||||||
|
modified_page.cropbox.upper_right = (bounds.x0, bounds.y0-median_height)
|
||||||
|
modified_page.cropbox.upper_left = (bounds.x1, bounds.y0-median_height)
|
||||||
|
modified_page.cropbox.lower_right = (bounds.x0, bounds.y1+median_height)
|
||||||
|
modified_page.cropbox.lower_left = (bounds.x1, bounds.y1+median_height)
|
||||||
modified_writer.add_page(modified_page)
|
modified_writer.add_page(modified_page)
|
||||||
modified_path = "{}/{}-{:03d}-{}.modified.pdf".format(
|
modified_path = "{}/{}-{:03d}-{}.modified.pdf".format(
|
||||||
config.TEMP_DIR,
|
config.TEMP_DIR,
|
||||||
|
|||||||
@@ -5,4 +5,5 @@ DEBUG_NO_SHOW = os.environ.get("DEBUG_NO_SHOW", "")
|
|||||||
DEBUG_HEIGHT = int(os.environ.get("DEBUG_HEIGHT", "800"))
|
DEBUG_HEIGHT = int(os.environ.get("DEBUG_HEIGHT", "800"))
|
||||||
INPUT = os.environ.get("INPUT", "./testdata/input.pdf")
|
INPUT = os.environ.get("INPUT", "./testdata/input.pdf")
|
||||||
TEMP_DIR = os.environ.get("TEMP_DIR", "/tmp/dnd-pdf-to-txt.d")
|
TEMP_DIR = os.environ.get("TEMP_DIR", "/tmp/dnd-pdf-to-txt.d")
|
||||||
|
PARALLEL = int(os.environ.get("PARALLEL", "0"))
|
||||||
os.makedirs(TEMP_DIR, exist_ok=True)
|
os.makedirs(TEMP_DIR, exist_ok=True)
|
||||||
|
|||||||
40
main.py
40
main.py
@@ -3,7 +3,9 @@ import cluster
|
|||||||
import config
|
import config
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
from multiprocessing.pool import ThreadPool
|
from multiprocessing.pool import ThreadPool
|
||||||
|
import pypdf
|
||||||
import queue
|
import queue
|
||||||
|
import subprocess
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
de_column_ify()
|
de_column_ify()
|
||||||
@@ -12,22 +14,46 @@ def log(*args):
|
|||||||
print(*args, flush=True)
|
print(*args, flush=True)
|
||||||
|
|
||||||
def de_column_ify():
|
def de_column_ify():
|
||||||
q = queue.Queue(maxsize=4)
|
q = queue.Queue(maxsize=4 if config.PARALLEL else 0)
|
||||||
with pdfplumber.open(config.INPUT) as pdf:
|
with pdfplumber.open(config.INPUT) as pdf:
|
||||||
|
cropped_pages = []
|
||||||
with ThreadPool(4) as pool:
|
with ThreadPool(4) as pool:
|
||||||
for i in range(len(pdf.pages)):
|
for i in range(len(pdf.pages)):
|
||||||
pool.apply_async(de_columnify_page, (q, config.INPUT, pdf.pages[i], ))
|
if config.PARALLEL:
|
||||||
|
pool.apply_async(de_columnify_page, (q, config.INPUT, pdf.pages[i], ))
|
||||||
|
else:
|
||||||
|
de_columnify_page(q, config.INPUT, pdf.pages[i])
|
||||||
for i in range(len(pdf.pages)):
|
for i in range(len(pdf.pages)):
|
||||||
log("getting", i, "of", len(pdf.pages))
|
log("getting", i, "of", len(pdf.pages))
|
||||||
got = q.get()
|
got = q.get()
|
||||||
for got_i in got[1]:
|
for got_i in got[1]:
|
||||||
if got_i.chars:
|
cropped_pages.append((got[0], got_i.path))
|
||||||
debug.debug_show(debug.debug_im(got_i.page))
|
#if got_i.chars:
|
||||||
|
# debug.debug_show(debug.debug_im(got_i.page))
|
||||||
|
|
||||||
|
log("merging", len(cropped_pages), "de-column-ified pages")
|
||||||
|
cropped_pages = sorted(cropped_pages)
|
||||||
|
writer = pypdf.PdfWriter()
|
||||||
|
for cropped_page in cropped_pages:
|
||||||
|
with open(cropped_page[1], "rb") as f:
|
||||||
|
reader = pypdf.PdfReader(f)
|
||||||
|
writer.add_page(reader.pages[0])
|
||||||
|
log("dumping de-column-ified pages")
|
||||||
|
output = f'{config.INPUT}.de-column-ified.pdf'
|
||||||
|
with open(output, "wb") as f:
|
||||||
|
writer.write(f)
|
||||||
|
log(output)
|
||||||
|
|
||||||
def de_columnify_page(q, path, page):
|
def de_columnify_page(q, path, page):
|
||||||
result = cluster.Chars(path, page.chars, page).divide_into_columns()
|
for _ in range(3):
|
||||||
log("putting", page.page_number)
|
try:
|
||||||
q.put((page, result))
|
result = cluster.Chars(path, page.chars, page).divide_into_columns()
|
||||||
|
log("putting", page.page_number, len(result))
|
||||||
|
q.put((page.page_number, result))
|
||||||
|
return
|
||||||
|
except Exception as e:
|
||||||
|
log(page.page_number, "encountered", e)
|
||||||
|
raise Exception(f"failure for {page.page_number}")
|
||||||
|
|
||||||
def textify(page):
|
def textify(page):
|
||||||
lines = page.extract_text(layout=True).split("\n")
|
lines = page.extract_text(layout=True).split("\n")
|
||||||
|
|||||||
@@ -1 +1,2 @@
|
|||||||
pdfplumber
|
pdfplumber
|
||||||
|
pypdf
|
||||||
|
|||||||
Reference in New Issue
Block a user