diff --git a/crawler/google.sh b/crawler/google.sh index 9c7249c..2c43f12 100644 --- a/crawler/google.sh +++ b/crawler/google.sh @@ -26,11 +26,27 @@ google() ( echo "" if [ "${downloaded##*.}" == "csv" ]; then _csv_to_md "$downloaded" + elif [ "${downloaded##*.}" == "html" ]; then + _html_to_md "$downloaded" else cat "$downloaded" fi } + _html_to_md() { + which pandoc &> /dev/null + local f="$1" + #log f=$f + cat "$f" \ + | sed 's/.*
.*/<\/body>/' \ + | sed 's/<[\/]*span[^>]*>//g' \ + | perl -pe 's|