From 2b68a39cc7893700501a6200b4791faa1bbc9d0c Mon Sep 17 00:00:00 2001 From: bel Date: Sun, 19 Feb 2023 20:19:20 -0700 Subject: [PATCH] boo --- __pycache__/cluster.cpython-310.pyc | Bin 0 -> 3233 bytes __pycache__/config.cpython-310.pyc | Bin 0 -> 308 bytes __pycache__/debug.cpython-310.pyc | Bin 0 -> 902 bytes cluster.py | 115 ++++++++++++++++++++++++++++ config.py | 5 ++ debug.py | 22 ++++++ test_cluster.py | 13 ++++ 7 files changed, 155 insertions(+) create mode 100644 __pycache__/cluster.cpython-310.pyc create mode 100644 __pycache__/config.cpython-310.pyc create mode 100644 __pycache__/debug.cpython-310.pyc create mode 100644 cluster.py create mode 100644 config.py create mode 100644 debug.py create mode 100644 test_cluster.py diff --git a/__pycache__/cluster.cpython-310.pyc b/__pycache__/cluster.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f560a59a3bbf22f7ce9ee55fa1df0d055f4c4f85 GIT binary patch literal 3233 zcmZ`*%}*Og6rY*>@Y;NY07`J1kBZW$u7pjFl@N`Tw8^32lBx&PXk~U5V`6(@cAbK? zNR>#XUVH5cZslK^Yo$tDn_Cqr{k<8lZJ_LGX5PH{dha*Cc{9n(Ooie5Z|4~M ziz<_!gUS=6bQzgsl6TmaE%_FI&ZLmeJ0_hIu_Yvb#cJ*a`mxQ3^2tx2@&qaU9$Cz` zxMVa!ig#?wk&bjxy3&(AN>7$#fYO&`SwUHnQ*s(*AZO$(%CfvA=TKJUZLB!2y{EfP zl}2Q0@sUhRK1E957t49hLY52USQ^9lW=&`>qLJF&Z?@u^qun&_>?j(mTx91=`6&K< z(b!G8absJ>ai{S-X{0J@w31d@Z}+I7zTIi=C+%Jq*Q2D{kUhD&FLze6WHmd?>ar1a z`e_!c`u>p)!m!rjJ>{hEOIWLDm!&@amLk(t?ltsm+G7y%;%6X7e?DztX5(Z-T|#= z>;%0s@6@MO>5sK%f`b4+u(YYj&`4#X>q zZ;flO8>?1)F0`nxY2G%l+pL6?<7&PlWssLM&4}Qox-{jS+@u8szdu~+_-qoEh(na@rrHg7IZ5}DH(MKs3;P~ zB+}L7ZLGkM{dTVv9>Kyy$#$=q#i{n8jeAmiGT!dDbVaJ>&*64*7^gKy5k<7G;= z_%+hFM&nIre4{B%XaZ=6rwACt)6rQhTPA67xTeSf2#+raMlFukF327Tk8xh7lRi!o zAnNyIYA~*eAy7jd9hEV2)D)_ZkW$*Z0C3oUkq@RUAclCCAfOqDxCVkhA;4tBJ#xBA zKom7Nk3&|}T-2OV4KN(Lrsj{1`r68v)svVnchK*t zl}$ea*G0dV(3uquK1Wy+xs$s)?)X5eDvVa2Pr+IKGiC?DqVBTUoxmgiyXYtY00g{3E1B4;V z5VH*19LbN}jGu}eR|K3E1~zC36q3)}^qXvWZ4CRE)+OpY>SbyK__)p1kT0ErDa5aJ$(aww^*&!LLy~S!Jx}?x*^~lZqqh6*q8aYveZelerM}%Eqw1o@oF2&Kn zvfpgcB_-4vjnVFpFl;NU?K2h!~Sf~5MCdfrEN zqb2GK>SEaOh@_j=d`XS8X3bSZ9upV!OoTud4ntz99dK~O$Ryj1l`eOWh6fzSois(Q zbQsFGlQnhe$O=~|s$Gd#zk#JvGAsZyUGW+6f~}PgZs~G;t~98O0lCCciaHd64M&$D z&~e7iI&~}zS8(vogqp=&h-c3MSow;bJDWdzhsaGst+x**y}f<`6j8on{iM#iOmC2o z#4GNpc})8wQc5qj3QnER#Ya~R9$F7>yk@SsMX|uR^#Vf1PK)4<2Lvc@VjXhGXD*zj zhXh4G2PL&5<_+tn&q;|x2O7JZ<8U(&^*Wp*0;?~PX>VWQe#R5495s8gEph$nsW{D= z+qj>1W2xv4zUuH}=zc;n4>%X@+TZ=K;=6pXV%;XB3O-#&IHL1Yd?b@e#&-!Jhvjd$ zsvv@LDrqapKgf%StPYh%XZtOqjruJQ+mp7On%4Zq}ygvDs^O<1o~fFzhC> z-ywNA3=jIvj%`sCh1C*ebW_@M+8~M6QnUeP`3aN8b`{~t*cH%ubA$Uq5X=Qjfj?a_ kyN@0BqolXfZYc_2mJwtBMw97a-CDRougR43G~>MZ4+aB{AOHXW literal 0 HcmV?d00001 diff --git a/__pycache__/config.cpython-310.pyc b/__pycache__/config.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20462efbefa717d4cb1d0067170e5dd1b90c7b37 GIT binary patch literal 308 zcmYk0zfQw25XNmMX#-L^AYP(F?2QQ_gg}8MtyC(cE>)UyLr`Kzwu^vQVWA_hWJV@d zMy6h>4t&ymcR!u(k4}Q;)Ck+Kn)7S@!a(v{`4>KlC3 w6rHY>lf2Ze00k)?Nu5sJi{l_lt=4tLyH_8dw;DE$yr8(&#%M7au@OC?JCB1;{Qv*} literal 0 HcmV?d00001 diff --git a/__pycache__/debug.cpython-310.pyc b/__pycache__/debug.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2422337c1766bbc71eeb169ae80d55426753dd2c GIT binary patch literal 902 zcmZ`%J#W-N5Zzt-F7_!22pZDSx;UF4N{AwXpu2|75{k~AJ)e>7HM>6LB3-Ja4n(dPcb7&}HWN2q`SuUOh! zo^rPrsc_ayJrLksveXA3251Kmzz}vWc{&9CkwxJhp1~%$AhTXR+5qJ(in&GAu@$@E z=VHa*u}`yeZzZny$^(9Y@y`N1aloFj8F}7zY$mQbXX}c8vKtkCf69L<*4Yi*hbsPd z^-#sS(;nl?u*17tV1K}ni^0T-i4&P`m@NV zp>Yqoiw-^;jtz*fGy>kFe2&?OYkKsE+n`f;U%U&3XcFB8di995giFp(8XCl-{l)-M zoa7$f6}x6V%{P347{237XzIrL&9s)=C?RH^plGLPr=OyVcI*h5X{$w@No|EP*6+$% q+L2jgtyZ}-#xda9(yop;+B)6_uWJR1N*)n8J&TS< x_max: + x_max = char["x1"] + if char["y0"] < y_min: + x_min = char["y0"] + if char["y1"] > y_max: + y_max = char["y1"] + return (x_min, x_max, y_min, y_max) + + def dist(self, other): + my_bounds = self.outer_bounds() + other_bounds = other.outer_bounds() + + x_delta = 0 + if not Chars.char_overlaps(my_bounds[0], my_bounds[1], other_bounds[0], other_bounds[1]): + x_delta = min([abs(i) for i in [ + my_bounds[0] - other_bounds[0], + my_bounds[0] - other_bounds[1], + my_bounds[1] - other_bounds[0], + my_bounds[1] - other_bounds[1], + ]]) + + y_delta = 0 + if not Chars.char_overlaps(my_bounds[2], my_bounds[3], other_bounds[2], other_bounds[3]): + y_delta = min([abs(i) for i in [ + my_bounds[2] - other_bounds[2], + my_bounds[2] - other_bounds[3], + my_bounds[3] - other_bounds[2], + my_bounds[3] - other_bounds[3], + ]]) + + return x_delta ** 2 + y_delta ** 2 + + def overlapping_y_coordinates(self, other_char): + for self_char in self.chars: + if Chars.char_overlapping_y_coordinates(other_char, self_char): + return True + return False + + def char_overlapping_y_coordinates(candidate, established): + result = Chars.char_overlaps( + established["y0"], + established["y1"], + candidate["y0"], + candidate["y1"], + ) + print(established["y0"], "..", established["y1"], result, candidate["y0"], "..", candidate["y1"]) + return result + + def char_overlaps(my_min, my_max, other_min, other_max): + # my.. other..other ..my + if my_min <= other_min and other_max <= my_max: + return True + # other.. my..my ..other + elif other_min <= my_min and my_max <= other_max: + return True + # my..other..my..other + elif my_min <= other_min and other_min <= my_max and my_max <= other_max: + return True + # other..my..other..my + elif other_min <= my_min and my_min <= other_max and other_max <= my_max: + return True + return False diff --git a/config.py b/config.py new file mode 100644 index 0000000..6f303d3 --- /dev/null +++ b/config.py @@ -0,0 +1,5 @@ +import os + +DEBUG = os.environ.get("DEBUG", "") +DEBUG_NO_SHOW = os.environ.get("DEBUG_NO_SHOW", "") +INPUT = os.environ.get("INPUT", "./testdata/input.pdf") diff --git a/debug.py b/debug.py new file mode 100644 index 0000000..e125e41 --- /dev/null +++ b/debug.py @@ -0,0 +1,22 @@ +import pdfplumber +import os +import time +import subprocess + +def draw_boxes(page, boxes): + im = debug_im(page) + for box in boxes: + im.draw_line(((box["x0"], page.height - box["y0"]), (box["x1"], page.height - box["y0"]))) + im.draw_line(((box["x1"], page.height - box["y0"]), (box["x1"], page.height - box["y1"]))) + im.draw_line(((box["x1"], page.height - box["y1"]), (box["x0"], page.height - box["y1"]))) + im.draw_line(((box["x0"], page.height - box["y1"]), (box["x0"], page.height - box["y0"]))) + debug_show(im) + +def debug_im(page): + return page.to_image(height=800) + +def debug_show(im, name=None): + im.show() + #im.save(f"/tmp/dnd-pdf-to-txt{'' if not name else '-'+name}.jpg") + #if not DEBUG_NO_SHOW: + # go(f"qlmanage -p /tmp/dnd-pdf-to-txt{'' if not name else '-'+name}.jpg &> /dev/null") diff --git a/test_cluster.py b/test_cluster.py new file mode 100644 index 0000000..6bdc5db --- /dev/null +++ b/test_cluster.py @@ -0,0 +1,13 @@ +import unittest + +import cluster +import pdfplumber + +class TestChars(unittest.TestCase): + def test_divide_into_columns(self): + with pdfplumber.open("./testdata/2-column_2-row.pdf") as pdf: + for page in pdf.pages: + cluster.Chars(page.chars, page).divide_into_columns() + +if __name__ == "__main__": + unittest.main()