scrape: be a little bit cleaner about pre-purging lines, render tables as preformatted, remove excess :::
parent
f878486d80
commit
0977f17c5e
39
scrape.sh
39
scrape.sh
|
|
@@ -237,25 +237,36 @@ html_to_markdown() {
|
||||||
cat > "$f"
|
cat > "$f"
|
||||||
local clean="$(
|
local clean="$(
|
||||||
cat "$f" \
|
cat "$f" \
|
||||||
|
| awk '/CONTENT/,/FOOTER/' \
|
||||||
| readability "file://$(realpath $f)" \
|
| readability "file://$(realpath $f)" \
|
||||||
| pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
|
| pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
|
||||||
| sed 's/{[^}]*data[^}]*}//g' \
|
| sed 's/{[^}]*data[^}]*}//g' \
|
||||||
| sed 's/\[](#[^)]*)//'
|
| sed 's/\[](#[^)]*)//' \
|
||||||
|
| cat \
|
||||||
|
| perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \
|
||||||
|
| sed 's/{[^}]*}//' \
|
||||||
|
| sed 's/ *$//' \
|
||||||
|
| tr '\n' '\r' \
|
||||||
|
| sed 's/\r\r *\([A-Za-z0-9][^\r]*\)\r--*\r\r/\r\r## \1\r\r/g' \
|
||||||
|
| sed 's/<div>\(.*\)<\/div>/\1/g' \
|
||||||
|
| tr '\r' '\n' \
|
||||||
)"
|
)"
|
||||||
rm "$f"
|
rm "$f"
|
||||||
echo "$clean" | (
|
(
|
||||||
lastline=""
|
echo "$clean" \
|
||||||
while read -r line; do
|
| grep '^#' \
|
||||||
if [ "$line" != "${line#----}" ]; then
|
| head -n 1 \
|
||||||
echo "# $lastline"
|
| sed 's/^##*/#/'
|
||||||
echo ""
|
echo
|
||||||
break
|
echo "$clean"
|
||||||
fi
|
) \
|
||||||
lastline="$line"
|
| tr '\n' '\r' \
|
||||||
done
|
| sed 's/\r\r:::\r\r/\r/g' \
|
||||||
cat &> /dev/null
|
| sed 's/\r\r:::\r*$//' \
|
||||||
)
|
| sed 's/:::/```/g' \
|
||||||
echo "$clean"
|
| sed 's/&.dquo;/"/g' \
|
||||||
|
| sed "s/&.squo;/'/g" \
|
||||||
|
| tr '\r' '\n'
|
||||||
return $?
|
return $?
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue