scrape a little bit cleaner about pre-purging lines, tables as preformatted, rm excess :::

master
bel 2022-07-03 23:31:59 -06:00
parent f878486d80
commit 0977f17c5e
1 changed file with 25 additions and 14 deletions

View File

@@ -237,25 +237,36 @@ html_to_markdown() {
 cat > "$f"
 local clean="$(
 cat "$f" \
+| awk '/CONTENT/,/FOOTER/' \
 | readability "file://$(realpath $f)" \
 | pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
 | sed 's/{[^}]*data[^}]*}//g' \
-| sed 's/\[](#[^)]*)//'
+| sed 's/\[](#[^)]*)//' \
+| cat \
+| perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \
+| sed 's/{[^}]*}//' \
+| sed 's/ *$//' \
+| tr '\n' '\r' \
+| sed 's/\r\r *\([A-Za-z0-9][^\r]*\)\r--*\r\r/\r\r## \1\r\r/g' \
+| sed 's/<div>\(.*\)<\/div>/\1/g' \
+| tr '\r' '\n' \
 )"
 rm "$f"
-echo "$clean" | (
-lastline=""
-while read -r line; do
-if [ "$line" != "${line#----}" ]; then
-echo "# $lastline"
-echo ""
-break
-fi
-lastline="$line"
-done
-cat &> /dev/null
-)
-echo "$clean"
+(
+echo "$clean" \
+| grep '^#' \
+| head -n 1 \
+| sed 's/^##*/#/'
+echo
+echo "$clean"
+) \
+| tr '\n' '\r' \
+| sed 's/\r\r:::\r\r/\r/g' \
+| sed 's/\r\r:::\r*$//' \
+| sed 's/:::/```/g' \
+| sed 's/&.dquo;/"/g' \
+| sed "s/&.squo;/'/g" \
+| tr '\r' '\n'
 return $?
 }