Compare commits

..

10 Commits

Author SHA1 Message Date
Bel LaPointe
0b1814fb81 2do 2023-03-08 16:15:01 -07:00
bel
1730116aff gs reflows and that is bad for copy pasting 2023-02-21 14:28:10 -07:00
bel
fa71cc49f7 y buffer only 2023-02-21 14:20:36 -07:00
bel
64324508b8 little 2023-02-21 14:12:59 -07:00
bel
7e93ba51aa req 2023-02-21 13:48:18 -07:00
bel
138f1f51d9 no debug img 2023-02-21 13:04:00 -07:00
bel
10b280c606 ok serial 2023-02-21 13:02:00 -07:00
bel
cbd964c868 gettin close tho 2023-02-21 12:59:39 -07:00
bel
ab878e9de8 it outputs... 2023-02-21 12:55:00 -07:00
bel
006a66941f gather pages 2023-02-21 12:52:57 -07:00
5 changed files with 49 additions and 15 deletions

View File

@@ -1,3 +1,5 @@
# dnd-pdf-to-txt # dnd-pdf-to-txt
[original](https://stackoverflow.com/questions/55100037/how-to-extract-text-from-two-column-pdf-with-python) [original](https://stackoverflow.com/questions/55100037/how-to-extract-text-from-two-column-pdf-with-python)
[TODO](https://github.com/EllatharTheHalfling/DnD-Books/tree/master/5e)

View File

@@ -82,19 +82,23 @@ class Chars:
j += 1 j += 1
i.merge() i.merge()
assert(len(i.chars) == 1) assert(len(i.chars) == 1)
i.chars[0]["x0"] -= median_height #i.chars[0]["x0"] -= median_height
i.chars[0]["x1"] += median_height #i.chars[0]["x1"] += median_height
i.chars[0]["y0"] -= median_height #i.chars[0]["y0"] -= median_height
i.chars[0]["y1"] += median_height #i.chars[0]["y1"] += median_height
bounds = i._box() bounds = i._box()
original_reader = pypdf.PdfReader(self.path) original_reader = pypdf.PdfReader(self.path)
modified_writer = pypdf.PdfWriter() modified_writer = pypdf.PdfWriter()
modified_page = original_reader.pages[self.page.page_number-1] modified_page = original_reader.pages[self.page.page_number-1]
modified_page.mediabox.upper_right = (bounds.x0, bounds.y0) modified_page.trimbox.upper_right = (bounds.x0, bounds.y0)
modified_page.mediabox.upper_left = (bounds.x1, bounds.y0) modified_page.trimbox.upper_left = (bounds.x1, bounds.y0)
modified_page.mediabox.lower_right = (bounds.x0, bounds.y1) modified_page.trimbox.lower_right = (bounds.x0, bounds.y1)
modified_page.mediabox.lower_left = (bounds.x1, bounds.y1) modified_page.trimbox.lower_left = (bounds.x1, bounds.y1)
modified_page.cropbox.upper_right = (bounds.x0, bounds.y0-median_height)
modified_page.cropbox.upper_left = (bounds.x1, bounds.y0-median_height)
modified_page.cropbox.lower_right = (bounds.x0, bounds.y1+median_height)
modified_page.cropbox.lower_left = (bounds.x1, bounds.y1+median_height)
modified_writer.add_page(modified_page) modified_writer.add_page(modified_page)
modified_path = "{}/{}-{:03d}-{}.modified.pdf".format( modified_path = "{}/{}-{:03d}-{}.modified.pdf".format(
config.TEMP_DIR, config.TEMP_DIR,

View File

@@ -5,4 +5,5 @@ DEBUG_NO_SHOW = os.environ.get("DEBUG_NO_SHOW", "")
DEBUG_HEIGHT = int(os.environ.get("DEBUG_HEIGHT", "800")) DEBUG_HEIGHT = int(os.environ.get("DEBUG_HEIGHT", "800"))
INPUT = os.environ.get("INPUT", "./testdata/input.pdf") INPUT = os.environ.get("INPUT", "./testdata/input.pdf")
TEMP_DIR = os.environ.get("TEMP_DIR", "/tmp/dnd-pdf-to-txt.d") TEMP_DIR = os.environ.get("TEMP_DIR", "/tmp/dnd-pdf-to-txt.d")
PARALLEL = int(os.environ.get("PARALLEL", "0"))
os.makedirs(TEMP_DIR, exist_ok=True) os.makedirs(TEMP_DIR, exist_ok=True)

40
main.py
View File

@@ -3,7 +3,9 @@ import cluster
import config import config
import pdfplumber import pdfplumber
from multiprocessing.pool import ThreadPool from multiprocessing.pool import ThreadPool
import pypdf
import queue import queue
import subprocess
def main(): def main():
de_column_ify() de_column_ify()
@@ -12,22 +14,46 @@ def log(*args):
print(*args, flush=True) print(*args, flush=True)
def de_column_ify(): def de_column_ify():
q = queue.Queue(maxsize=4) q = queue.Queue(maxsize=4 if config.PARALLEL else 0)
with pdfplumber.open(config.INPUT) as pdf: with pdfplumber.open(config.INPUT) as pdf:
cropped_pages = []
with ThreadPool(4) as pool: with ThreadPool(4) as pool:
for i in range(len(pdf.pages)): for i in range(len(pdf.pages)):
pool.apply_async(de_columnify_page, (q, config.INPUT, pdf.pages[i], )) if config.PARALLEL:
pool.apply_async(de_columnify_page, (q, config.INPUT, pdf.pages[i], ))
else:
de_columnify_page(q, config.INPUT, pdf.pages[i])
for i in range(len(pdf.pages)): for i in range(len(pdf.pages)):
log("getting", i, "of", len(pdf.pages)) log("getting", i, "of", len(pdf.pages))
got = q.get() got = q.get()
for got_i in got[1]: for got_i in got[1]:
if got_i.chars: cropped_pages.append((got[0], got_i.path))
debug.debug_show(debug.debug_im(got_i.page)) #if got_i.chars:
# debug.debug_show(debug.debug_im(got_i.page))
log("merging", len(cropped_pages), "de-column-ified pages")
cropped_pages = sorted(cropped_pages)
writer = pypdf.PdfWriter()
for cropped_page in cropped_pages:
with open(cropped_page[1], "rb") as f:
reader = pypdf.PdfReader(f)
writer.add_page(reader.pages[0])
log("dumping de-column-ified pages")
output = f'{config.INPUT}.de-column-ified.pdf'
with open(output, "wb") as f:
writer.write(f)
log(output)
def de_columnify_page(q, path, page): def de_columnify_page(q, path, page):
result = cluster.Chars(path, page.chars, page).divide_into_columns() for _ in range(3):
log("putting", page.page_number) try:
q.put((page, result)) result = cluster.Chars(path, page.chars, page).divide_into_columns()
log("putting", page.page_number, len(result))
q.put((page.page_number, result))
return
except Exception as e:
log(page.page_number, "encountered", e)
raise Exception(f"failure for {page.page_number}")
def textify(page): def textify(page):
lines = page.extract_text(layout=True).split("\n") lines = page.extract_text(layout=True).split("\n")

View File

@@ -1 +1,2 @@
pdfplumber pdfplumber
pypdf