diff --git a/crawler/google.sh b/crawler/google.sh index 9c7249c..2c43f12 100644 --- a/crawler/google.sh +++ b/crawler/google.sh @@ -26,11 +26,27 @@ google() ( echo "" if [ "${downloaded##*.}" == "csv" ]; then _csv_to_md "$downloaded" + elif [ "${downloaded##*.}" == "html" ]; then + _html_to_md "$downloaded" else cat "$downloaded" fi } + _html_to_md() { + which pandoc &> /dev/null + local f="$1" + #log f=$f + cat "$f" \ + | sed 's/.*.*/<\/body>/' \ + | sed 's/<[\/]*span[^>]*>//g' \ + | perl -pe 's|
.*?<\/div>||g' \ + | sed 's/<\([a-z][a-z]*\)[^>]*/<\1/g' \ + | pandoc - -f html -t commonmark -s -o - \ + | sed 's/^<[\/]*div>$//g' + } + _csv_to_md() { local f="$1" (