diff --git a/scrape.sh b/scrape.sh index 8fd865c..8285afb 100644 --- a/scrape.sh +++ b/scrape.sh @@ -237,25 +237,36 @@ html_to_markdown() { cat > "$f" local clean="$( cat "$f" \ + | awk '/CONTENT/,/FOOTER/' \ | readability "file://$(realpath $f)" \ | pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \ | sed 's/{[^}]*data[^}]*}//g' \ - | sed 's/\[](#[^)]*)//' + | sed 's/\[](#[^)]*)//' \ + | cat \ + | perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \ + | sed 's/{[^}]*}//' \ + | sed 's/ *$//' \ + | tr '\n' '\r' \ + | sed 's/\r\r *\([A-Za-z0-9][^\r]*\)\r--*\r\r/\r\r## \1\r\r/g' \ + | sed 's/