From 0977f17c5e1affe8c4634611d8fe590cdb969fb0 Mon Sep 17 00:00:00 2001 From: bel Date: Sun, 3 Jul 2022 23:31:59 -0600 Subject: [PATCH] scrape a little bit cleaner about pre-purging lines, tables as preformatted, rm excess ::: --- scrape.sh | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/scrape.sh b/scrape.sh index 8fd865c..8285afb 100644 --- a/scrape.sh +++ b/scrape.sh @@ -237,25 +237,36 @@ html_to_markdown() { cat > "$f" local clean="$( cat "$f" \ + | awk '/CONTENT/,/FOOTER/' \ | readability "file://$(realpath $f)" \ | pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \ | sed 's/{[^}]*data[^}]*}//g' \ - | sed 's/\[](#[^)]*)//' + | sed 's/\[](#[^)]*)//' \ + | cat \ + | perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \ + | sed 's/{[^}]*}//' \ + | sed 's/ *$//' \ + | tr '\n' '\r' \ + | sed 's/\r\r *\([A-Za-z0-9][^\r]*\)\r--*\r\r/\r\r## \1\r\r/g' \ + | sed 's/
\(.*\)<\/div>/\1/g' \ + | tr '\r' '\n' \ )" rm "$f" - echo "$clean" | ( - lastline="" - while read -r line; do - if [ "$line" != "${line#----}" ]; then - echo "# $lastline" - echo "" - break - fi - lastline="$line" - done - cat &> /dev/null - ) - echo "$clean" + ( + echo "$clean" \ + | grep '^#' \ + | head -n 1 \ + | sed 's/^##*/#/' + echo + echo "$clean" + ) \ + | tr '\n' '\r' \ + | sed 's/\r\r:::\r\r/\r/g' \ + | sed 's/\r\r:::\r*$//' \ + | sed 's/:::/```/g' \ + | sed 's/&.dquo;/"/g' \ + | sed "s/&.squo;/'/g" \ + | tr '\r' '\n' return $? }