# dnd-pdf-to-txt/test_cluster.py

import unittest

import cluster
import pdfplumber
import debug

class TestChars(unittest.TestCase):
   """Visual smoke test for ``cluster.Chars.divide_into_columns``.

   The test makes no assertions; it prints information about each returned
   item and hands boxes to ``debug.draw_boxes`` so the result can be
   inspected by eye.
   """

   @staticmethod
   def _first_char_box(column):
      """Build the ``debug.draw_boxes`` entry for the first char of *column*.

      The box copies the ``x0``/``x1``/``y0``/``y1`` values of
      ``column.chars[0]`` and uses ``column.n`` as its ``debug_label``.
      """
      first = column.chars[0]
      return {
         "x0": first["x0"],
         "x1": first["x1"],
         "y0": first["y0"],
         "y1": first["y1"],
         "debug_label": column.n,
      }

   def test_divide_into_columns(self):
      """Run ``divide_into_columns`` on every page of each enabled fixture."""
      fixtures = [
         # Disabled fixtures; uncomment to include them in a run.
         #"./testdata/2-column_2-row.pdf",
         #"./testdata/1-column_half-image.pdf",
         #"./testdata/2-column_fancy-font.pdf",
         #"./testdata/2-column_happy.pdf",
         "./testdata/2-column_non-interrupting-image.pdf",
      ]
      for path in fixtures:
         with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
               # Disabled debugging aid: draw a box around every extracted
               # word, labelled with its index, and skip the column split.
               #im = debug.debug_im(page)
               #words = page.extract_words()
               #debug.draw_boxes(page, [
               #   {
               #      "debug_label": idx,
               #      "x0": word["x0"],
               #      "x1": word["x1"],
               #      "y0": word["top"],
               #      "y1": word["bottom"],
               #   }
               #   for idx, word in enumerate(words)
               #])
               #continue
               columns = cluster.Chars(path, page.chars, page).divide_into_columns()
               print(path)

               # Overview: one box per returned item, drawn on the source page.
               overview = [self._first_char_box(column) for column in columns]
               debug.draw_boxes(page, overview)

               # Detail: the same box again, drawn on each item's own page
               # object, preceded by that page's dimensions and the item's box.
               for column in columns:
                  print(column.page.height, column.page.width, column._box())
                  debug.draw_boxes(column.page, [self._first_char_box(column)])

# Run this module's test cases when the file is executed directly as a script.
if __name__ == "__main__":
   unittest.main()