From 8c87cdf0b26866841b04902be643c19f0e3145de Mon Sep 17 00:00:00 2001 From: Bel LaPointe Date: Wed, 16 Feb 2022 15:09:08 -0700 Subject: [PATCH] simplify google docs markdown --- crawler/google.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/crawler/google.sh b/crawler/google.sh index 9c7249c..2c43f12 100644 --- a/crawler/google.sh +++ b/crawler/google.sh @@ -26,11 +26,27 @@ google() ( echo "" if [ "${downloaded##*.}" == "csv" ]; then _csv_to_md "$downloaded" + elif [ "${downloaded##*.}" == "html" ]; then + _html_to_md "$downloaded" else cat "$downloaded" fi } + _html_to_md() { + which pandoc &> /dev/null + local f="$1" + #log f=$f + cat "$f" \ + | sed 's/.*.*/<\/body>/' \ + | sed 's/<[\/]*span[^>]*>//g' \ + | perl -pe 's|
.*?<\/div>||g' \ + | sed 's/<\([a-z][a-z]*\)[^>]*/<\1/g' \ + | pandoc - -f html -t commonmark -s -o - \ + | sed 's/^<[\/]*div>$//g' + } + _csv_to_md() { local f="$1" (