diff --git a/scrape.sh b/scrape.sh index 8fd865c..8285afb 100644 --- a/scrape.sh +++ b/scrape.sh @@ -237,25 +237,36 @@ html_to_markdown() { cat > "$f" local clean="$( cat "$f" \ + | awk '/CONTENT/,/FOOTER/' \ | readability "file://$(realpath $f)" \ | pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \ | sed 's/{[^}]*data[^}]*}//g' \ - | sed 's/\[](#[^)]*)//' + | sed 's/\[](#[^)]*)//' \ + | cat \ + | perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \ + | sed 's/{[^}]*}//' \ + | sed 's/ *$//' \ + | tr '\n' '\r' \ + | sed 's/\r\r *\([A-Za-z0-9][^\r]*\)\r--*\r\r/\r\r## \1\r\r/g' \ + | sed 's/
\(.*\)<\/div>/\1/g' \ + | tr '\r' '\n' \ )" rm "$f" - echo "$clean" | ( - lastline="" - while read -r line; do - if [ "$line" != "${line#----}" ]; then - echo "# $lastline" - echo "" - break - fi - lastline="$line" - done - cat &> /dev/null - ) - echo "$clean" + ( + echo "$clean" \ + | grep '^#' \ + | head -n 1 \ + | sed 's/^##*/#/' + echo + echo "$clean" + ) \ + | tr '\n' '\r' \ + | sed 's/\r\r:::\r\r/\r/g' \ + | sed 's/\r\r:::\r*$//' \ + | sed 's/:::/```/g' \ + | sed 's/&.dquo;/"/g' \ + | sed "s/&.squo;/'/g" \ + | tr '\r' '\n' return $? }