scrape a little bit cleaner about pre-purging lines, tables as preformatted, rm excess :::
parent
f878486d80
commit
0977f17c5e
39
scrape.sh
39
scrape.sh
|
|
@ -237,25 +237,36 @@ html_to_markdown() {
|
|||
cat > "$f"
|
||||
local clean="$(
|
||||
cat "$f" \
|
||||
| awk '/CONTENT/,/FOOTER/' \
|
||||
| readability "file://$(realpath $f)" \
|
||||
| pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
|
||||
| sed 's/{[^}]*data[^}]*}//g' \
|
||||
| sed 's/\[](#[^)]*)//'
|
||||
| sed 's/\[](#[^)]*)//' \
|
||||
| cat \
|
||||
| perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \
|
||||
| sed 's/{[^}]*}//' \
|
||||
| sed 's/ *$//' \
|
||||
| tr '\n' '\r' \
|
||||
| sed 's/\r\r *\([A-Za-z0-9][^\r]*\)\r--*\r\r/\r\r## \1\r\r/g' \
|
||||
| sed 's/<div>\(.*\)<\/div>/\1/g' \
|
||||
| tr '\r' '\n' \
|
||||
)"
|
||||
rm "$f"
|
||||
echo "$clean" | (
|
||||
lastline=""
|
||||
while read -r line; do
|
||||
if [ "$line" != "${line#----}" ]; then
|
||||
echo "# $lastline"
|
||||
echo ""
|
||||
break
|
||||
fi
|
||||
lastline="$line"
|
||||
done
|
||||
cat &> /dev/null
|
||||
)
|
||||
echo "$clean"
|
||||
(
|
||||
echo "$clean" \
|
||||
| grep '^#' \
|
||||
| head -n 1 \
|
||||
| sed 's/^##*/#/'
|
||||
echo
|
||||
echo "$clean"
|
||||
) \
|
||||
| tr '\n' '\r' \
|
||||
| sed 's/\r\r:::\r\r/\r/g' \
|
||||
| sed 's/\r\r:::\r*$//' \
|
||||
| sed 's/:::/```/g' \
|
||||
| sed 's/&.dquo;/"/g' \
|
||||
| sed "s/&.squo;/'/g" \
|
||||
| tr '\r' '\n'
|
||||
return $?
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue