From 54372c88f334f4940b485164787c3c499266ed69 Mon Sep 17 00:00:00 2001
From: bel
Date: Tue, 21 Feb 2023 12:15:30 -0700
Subject: [PATCH] dont think i will use but page to plaintext

---
 main.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/main.py b/main.py
index f9a0332..64ad4d5 100644
--- a/main.py
+++ b/main.py
@@ -22,12 +22,33 @@ def de_column_ify():
         got = q.get()
         for got_i in got[1]:
             if got_i.chars:
-                debug.debug_show(debug.debug_im(got_i.page))
+                True or debug.debug_show(debug.debug_im(got_i.page))
+                textify(got_i.page)
 
 def de_columnify_page(q, page):
     result = cluster.Chars(page.chars, page).divide_into_columns()
     log("putting", page.page_number)
     q.put((page, result))
 
+def textify(page):
+    lines = page.extract_text(layout=True).split("\n")
+    leading_spaces = []
+    for line in lines:
+        leading_spaces.append(len(line) - len(line.lstrip()))
+    median_leading_spaces = sorted(leading_spaces)[len(leading_spaces)//2]
+
+    paragraphs = []
+    for line in lines:
+        if line.startswith(median_leading_spaces * " "):
+            line = line[median_leading_spaces:]
+        if line.startswith(" "):
+            line = "\t" + line[1:]
+        if not paragraphs or line.startswith("\t"):
+            paragraphs.append([])
+            paragraphs.append([])
+        paragraphs[-1].append(line.rstrip())
+    for paragraph in paragraphs:
+        print(" ".join(paragraph))
+
 if __name__ == "__main__":
     main()