Don't think I will use this, but: convert a page to plaintext

master
bel 2023-02-21 12:15:30 -07:00
parent 1c5998d0a1
commit 54372c88f3
1 changed files with 22 additions and 1 deletions

23
main.py
View File

@ -22,12 +22,33 @@ def de_column_ify():
got = q.get() got = q.get()
for got_i in got[1]: for got_i in got[1]:
if got_i.chars: if got_i.chars:
debug.debug_show(debug.debug_im(got_i.page)) True or debug.debug_show(debug.debug_im(got_i.page))
textify(got_i.page)
def de_columnify_page(q, page): def de_columnify_page(q, page):
result = cluster.Chars(page.chars, page).divide_into_columns() result = cluster.Chars(page.chars, page).divide_into_columns()
log("putting", page.page_number) log("putting", page.page_number)
q.put((page, result)) q.put((page, result))
def textify(page):
    """Print the text of *page* as paragraphs, one paragraph per output line.

    The page text is obtained from ``page.extract_text(layout=True)`` and
    split into lines.  The median indentation of the lines that contain
    text is taken as the left margin and removed from every line that has
    at least that much indentation.  A line that is still indented after
    that has its first space replaced by a tab and starts a new paragraph;
    a blank line ends the current paragraph.  The lines of a paragraph are
    right-stripped and joined with single spaces.

    Each paragraph is printed to stdout preceded by one empty line.
    Nothing is printed when the page holds no non-blank line.

    Args:
        page: Any object with an ``extract_text(layout=True)`` method that
            returns a string (presumably a pdfplumber page -- TODO confirm
            against the caller).

    Returns:
        None.  The result is written to stdout.
    """
    lines = page.extract_text(layout=True).split("\n")

    # Estimate the margin from lines that actually contain text.  For a
    # whitespace-only line ``len(line) - len(line.lstrip())`` is the whole
    # line length, so counting blank lines drags the median away from the
    # real margin.
    # NOTE(review): layout-mode extraction appears to pad blank lines with
    # spaces to full width, which makes this skew large -- confirm.
    indents = sorted(
        len(line) - len(line.lstrip()) for line in lines if line.strip()
    )
    if not indents:
        # Only blank lines: there is no text to print.
        return
    margin = indents[len(indents) // 2]  # upper median
    margin_prefix = " " * margin

    paragraphs = []
    start_new = True  # the first line with text always opens a paragraph
    for line in lines:
        if not line.strip():
            # A blank line carries no text; it only closes the paragraph.
            start_new = True
            continue
        if line.startswith(margin_prefix):
            line = line[margin:]
        if line.startswith(" "):
            # Still indented past the margin: mark as a paragraph start.
            line = "\t" + line[1:]
        if start_new or line.startswith("\t"):
            paragraphs.append([])
            start_new = False
        paragraphs[-1].append(line.rstrip())

    for paragraph in paragraphs:
        # Blank separator line before every paragraph (including the first).
        print()
        print(" ".join(paragraph))
if __name__ == "__main__": if __name__ == "__main__":
main() main()