scrape: a little bit cleaner about pre-purging lines, render tables as preformatted, remove excess `:::` markers

master
bel 2022-07-03 23:31:59 -06:00
parent f878486d80
commit 0977f17c5e
1 changed file with 25 additions and 14 deletions

View File

@@ -237,25 +237,36 @@ html_to_markdown() {
cat > "$f"
local clean="$(
cat "$f" \
| awk '/CONTENT/,/FOOTER/' \
| readability "file://$(realpath $f)" \
| pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
| sed 's/{[^}]*data[^}]*}//g' \
| sed 's/\[](#[^)]*)//'
| sed 's/\[](#[^)]*)//' \
| cat \
| perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \
| sed 's/{[^}]*}//' \
| sed 's/ *$//' \
| tr '\n' '\r' \
| sed 's/\r\r *\([A-Za-z0-9][^\r]*\)\r--*\r\r/\r\r## \1\r\r/g' \
| sed 's/<div>\(.*\)<\/div>/\1/g' \
| tr '\r' '\n' \
)"
rm "$f"
echo "$clean" | (
lastline=""
while read -r line; do
if [ "$line" != "${line#----}" ]; then
echo "# $lastline"
echo ""
break
fi
lastline="$line"
done
cat &> /dev/null
)
echo "$clean"
(
echo "$clean" \
| grep '^#' \
| head -n 1 \
| sed 's/^##*/#/'
echo
echo "$clean"
) \
| tr '\n' '\r' \
| sed 's/\r\r:::\r\r/\r/g' \
| sed 's/\r\r:::\r*$//' \
| sed 's/:::/```/g' \
| sed 's/&.dquo;/"/g' \
| sed "s/&.squo;/'/g" \
| tr '\r' '\n'
return $?
}