Don't think I will use this, but: convert page to plaintext
parent
1c5998d0a1
commit
54372c88f3
23
main.py
23
main.py
|
|
@ -22,12 +22,33 @@ def de_column_ify():
|
|||
got = q.get()
|
||||
for got_i in got[1]:
|
||||
if got_i.chars:
|
||||
debug.debug_show(debug.debug_im(got_i.page))
|
||||
True or debug.debug_show(debug.debug_im(got_i.page))
|
||||
textify(got_i.page)
|
||||
|
||||
def de_columnify_page(q, page):
    """Divide one page's characters into columns and put the result on ``q``.

    Args:
        q: queue-like object; receives a ``(page, columns)`` tuple via
            ``q.put``.
        page: page object exposing ``chars`` and ``page_number``.
    """
    char_cluster = cluster.Chars(page.chars, page)
    columns = char_cluster.divide_into_columns()
    # Log before enqueueing, in the same order as the original statements.
    log("putting", page.page_number)
    q.put((page, columns))
|
||||
|
||||
def textify(page):
    """Print the page's text as paragraphs, one paragraph per output line.

    The text comes from ``page.extract_text(layout=True)``. The median
    indentation of the non-blank lines is treated as the left margin and
    stripped. A line that still starts with a space after that is taken to
    open a new paragraph, and that space is replaced by a tab. The lines of a
    paragraph are joined with single spaces, and an empty line is printed
    before every paragraph.

    Whitespace-only lines are ignored: they would otherwise count their whole
    length as "indentation" (skewing the median) and then show up as empty
    tab-marked paragraphs or as double spaces inside a paragraph.

    Args:
        page: object whose ``extract_text(layout=True)`` returns the page
            text as a string (presumably a pdfplumber page -- TODO confirm
            against the caller). A falsy result is treated as an empty page.

    Returns:
        None. The paragraphs are written to stdout with ``print``; nothing is
        printed when the page has no non-blank lines.
    """
    text = page.extract_text(layout=True) or ""
    # Drop blank lines up front so they influence neither the margin estimate
    # nor the paragraph contents.
    lines = [line for line in text.split("\n") if line.strip()]
    if not lines:
        return

    indents = sorted(len(line) - len(line.lstrip()) for line in lines)
    # Upper median of the indentation widths, expressed as a run of spaces.
    margin = " " * indents[len(indents) // 2]

    paragraphs = []
    for line in lines:
        if line.startswith(margin):
            line = line[len(margin):]
        # Any indentation beyond the margin marks the start of a paragraph;
        # the marker tab is kept in the printed output.
        if line.startswith(" "):
            line = "\t" + line[1:]
        if not paragraphs or line.startswith("\t"):
            paragraphs.append([])
        paragraphs[-1].append(line.rstrip())

    for paragraph in paragraphs:
        # Empty separator line before each paragraph (including the first),
        # matching the output of the original double ``append([])``.
        print()
        print(" ".join(paragraph))
|
||||
|
||||
# Script entry point: run main() only when this file is executed directly.
if __name__ == "__main__":
    main()
|
||||
|
|
|
|||
Loading…
Reference in New Issue