Reviewer notes (free text ahead of the first "diff --git" header; patch tools skip it):
  * cluster.py: return [self] early when there are no chars, so the
    median-height lookup never indexes an empty list; before the final return,
    call merge() on each result and pad every char box by the median char
    height (1x on the x axis, 3x on the y axis).
  * main.py: new script that runs cluster.Chars(...).divide_into_columns() on
    every page of config.INPUT and hands the first char box of each non-empty
    result to debug.draw_boxes.
  * NOTE(review): line breaks and indentation were rebuilt from the hunk line
    counts; the leading whitespace of the cluster.py context lines is inferred
    (the depth of "result2.append(sub)" in particular) -- confirm against the
    real file before applying.
  * NOTE(review): the index lines are kept from the original patch, so their
    post-image blob ids no longer match the edited content.

diff --git a/cluster.py b/cluster.py
index e3885a1..4598a43 100644
--- a/cluster.py
+++ b/cluster.py
@@ -20,6 +20,9 @@ class Chars:
         # given median character size
         # drop all above+below first instance
         heights = [i["y1"]-i["y0"] for i in self.chars]
+        # No chars: the median lookup below would index an empty list.
+        if not heights:
+            return [self]
         median_height = sorted(heights)[len(heights)//2]
         at_least_median_height = [i for i in self.chars if i["y1"]-i["y0"] >= median_height]
         at_least_median_height = sorted(at_least_median_height, key=lambda x:x["y0"])
@@ -72,6 +75,15 @@ class Chars:
                 result2.append(sub)
         result = result2
 
+        # Pad every char box: one median height sideways, three up and down.
+        for group in result:
+            group.merge()
+            for char in group.chars:
+                char["x0"] -= median_height
+                char["x1"] += median_height
+                char["y0"] -= median_height * 3
+                char["y1"] += median_height * 3
+
         return result
 
     def merge_in(self, other):
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..e66e420
--- /dev/null
+++ b/main.py
@@ -0,0 +1,23 @@
+import debug
+import cluster
+import config
+import pdfplumber
+
+def main():
+    """Draw the first char box of each non-empty cluster on every page of config.INPUT."""
+    with pdfplumber.open(config.INPUT) as pdf:
+        for page in pdf.pages:
+            got = cluster.Chars(page.chars, page).divide_into_columns()
+            debug.draw_boxes(page, [
+                {
+                    "x0": i.chars[0]["x0"],
+                    "x1": i.chars[0]["x1"],
+                    "y0": i.chars[0]["y0"],
+                    "y1": i.chars[0]["y1"],
+                    "debug_label": i.n,
+                }
+                for i in got if i.chars
+            ])
+
+if __name__ == "__main__":
+    main()