cluster.Chars has .path
parent
7636ce0038
commit
1119c46b97
|
|
@ -1,7 +1,8 @@
|
||||||
import config
|
import config
|
||||||
|
|
||||||
class Chars:
|
class Chars:
|
||||||
def __init__(self, chars, page):
|
def __init__(self, path, chars, page):
|
||||||
|
self.path = path
|
||||||
self.chars = chars
|
self.chars = chars
|
||||||
self.page = page
|
self.page = page
|
||||||
self.n = 0
|
self.n = 0
|
||||||
|
|
@ -44,7 +45,7 @@ class Chars:
|
||||||
if result and result[-1]._box().overlaps(Box.from_char(char), clearance=median_x_delta_when_same_y):
|
if result and result[-1]._box().overlaps(Box.from_char(char), clearance=median_x_delta_when_same_y):
|
||||||
result[-1].merge_in(char)
|
result[-1].merge_in(char)
|
||||||
else:
|
else:
|
||||||
result.append(Chars([char], self.page))
|
result.append(Chars(self.path, [char], self.page))
|
||||||
result = [i for i in result if i.n > 2]
|
result = [i for i in result if i.n > 2]
|
||||||
|
|
||||||
# any clusters shorter than median character and high/lower are header/footer
|
# any clusters shorter than median character and high/lower are header/footer
|
||||||
|
|
|
||||||
6
main.py
6
main.py
|
|
@ -16,7 +16,7 @@ def de_column_ify():
|
||||||
with pdfplumber.open(config.INPUT) as pdf:
|
with pdfplumber.open(config.INPUT) as pdf:
|
||||||
with ThreadPool(4) as pool:
|
with ThreadPool(4) as pool:
|
||||||
for i in range(len(pdf.pages)):
|
for i in range(len(pdf.pages)):
|
||||||
pool.apply_async(de_columnify_page, (q, pdf.pages[i], ))
|
pool.apply_async(de_columnify_page, (q, config.INPUT, pdf.pages[i], ))
|
||||||
for i in range(len(pdf.pages)):
|
for i in range(len(pdf.pages)):
|
||||||
log("getting", i, "of", len(pdf.pages))
|
log("getting", i, "of", len(pdf.pages))
|
||||||
got = q.get()
|
got = q.get()
|
||||||
|
|
@ -24,8 +24,8 @@ def de_column_ify():
|
||||||
if got_i.chars:
|
if got_i.chars:
|
||||||
debug.debug_show(debug.debug_im(got_i.page))
|
debug.debug_show(debug.debug_im(got_i.page))
|
||||||
|
|
||||||
def de_columnify_page(q, page):
|
def de_columnify_page(q, path, page):
|
||||||
result = cluster.Chars(page.chars, page).divide_into_columns()
|
result = cluster.Chars(path, page.chars, page).divide_into_columns()
|
||||||
log("putting", page.page_number)
|
log("putting", page.page_number)
|
||||||
q.put((page, result))
|
q.put((page, result))
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,7 @@ class TestChars(unittest.TestCase):
|
||||||
# for i in range(len(words))
|
# for i in range(len(words))
|
||||||
#])
|
#])
|
||||||
#continue
|
#continue
|
||||||
got = cluster.Chars(page.chars, page).divide_into_columns()
|
got = cluster.Chars(p, page.chars, page).divide_into_columns()
|
||||||
print(p)
|
print(p)
|
||||||
debug.draw_boxes(page, [
|
debug.draw_boxes(page, [
|
||||||
{
|
{
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue