cleanup table displays
parent
b445441b84
commit
cfa69a88fc
45
scrape.sh
45
scrape.sh
|
|
@ -3,7 +3,7 @@
|
||||||
main() {
|
main() {
|
||||||
ensure
|
ensure
|
||||||
|
|
||||||
local output="${1:-"$TMPDIR/result"}"
|
local output="$TMPDIR/result"
|
||||||
mkdir -p "$output"
|
mkdir -p "$output"
|
||||||
log "$output"
|
log "$output"
|
||||||
|
|
||||||
|
|
@ -71,7 +71,8 @@ scrape_books() {
|
||||||
| sed 's/^href="//' \
|
| sed 's/^href="//' \
|
||||||
| sed 's/^\///' \
|
| sed 's/^\///' \
|
||||||
| sed 's/^/\//' \
|
| sed 's/^/\//' \
|
||||||
| sed 's/^/https:\/\/www.dndbeyond.com/'
|
| sed 's/^/https:\/\/www.dndbeyond.com/' \
|
||||||
|
| grep "${SCRAPE_BOOKS_PATTERN:-.*}"
|
||||||
}
|
}
|
||||||
|
|
||||||
scrape_book() {
|
scrape_book() {
|
||||||
|
|
@ -237,29 +238,48 @@ html_to_markdown() {
|
||||||
local f="$(mktemp)"
|
local f="$(mktemp)"
|
||||||
log url=$1
|
log url=$1
|
||||||
cat > "$f"
|
cat > "$f"
|
||||||
#| perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \
|
|
||||||
|
|
||||||
local clean="$(
|
local clean="$(
|
||||||
cat "$f" \
|
cat "$f" \
|
||||||
| awk '/CONTENT/,/FOOTER/' \
|
| awk '/CONTENT/,/FOOTER/' \
|
||||||
|
| tr '\n' '\r' \
|
||||||
|
| perl -pe 's|<blockquote[^>]*>(.*?)</blockquote>|\1|g' \
|
||||||
|
| perl -pe 's|\r<p[^\r]*<strong>([^<]*)</strong></p>\r|\r<h5>\1</h5>\r|g' \
|
||||||
|
| perl -pe 's|<div[^>]*stat-block-ability-scores-data[^>]*>(.*?)</div>|<span>\1</span>|g' \
|
||||||
|
| perl -pe 's|<div[^>]*stat-block-ability-scores-heading[^>]*>(.*?)</div>|<strong>\1</strong>|g' \
|
||||||
|
| perl -pe 's|<div[^>]*stat-block-ability-scores-stat[^>]*>(.*?)</div>|<p>\1</p>|g' \
|
||||||
|
| perl -pe 's|<div[^>]*stat-block-ability-scores[^>]*>(.*?)</div>|\1|g' \
|
||||||
|
| tr '\r' '\n' \
|
||||||
|
| sed 's/<br>/ /g' \
|
||||||
|
| sed 's/colspan="[^"]*"//g' \
|
||||||
| readability "file://$(realpath $f)" \
|
| readability "file://$(realpath $f)" \
|
||||||
| pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
|
| tr '\n' '\r' \
|
||||||
|
| perl -pe 's|<t(.)><p>([^<]*)</p></t.>|<t\1>\2</t\1>|g' \
|
||||||
|
| perl -pe 's|<thead>\r*<tr>(?:(?!</tr>).)+</tr>\r*<tr>|<thead><tr>|g' \
|
||||||
|
| tr '\r' '\n' \
|
||||||
|
| pandoc \
|
||||||
|
-f html \
|
||||||
|
-t markdown+pipe_tables-simple_tables-multiline_tables \
|
||||||
|
--atx-headers \
|
||||||
|
--ascii \
|
||||||
|
--toc \
|
||||||
|
--wrap=none \
|
||||||
|
--strip-comments \
|
||||||
|
-o - \
|
||||||
| sed 's/{[^}]*data[^}]*}//g' \
|
| sed 's/{[^}]*data[^}]*}//g' \
|
||||||
| sed 's/\[](#[^)]*)//' \
|
| sed 's/\[](#[^)]*)//' \
|
||||||
| cat \
|
| cat \
|
||||||
| sed 's/{[^}]*}//' \
|
| sed 's/{[^}]*}//' \
|
||||||
| sed 's/ *$//' \
|
| sed 's/ *$//' \
|
||||||
| tr '\n' '\r' \
|
| tr '\n' '\r' \
|
||||||
| sed 's/\r\r *\([A-Za-z0-9][^\r]*\)\r--*\r\r/\r\r## \1\r\r/g' \
|
|
||||||
| sed 's/<div>\(.*\)<\/div>/\1/g' \
|
| sed 's/<div>\(.*\)<\/div>/\1/g' \
|
||||||
| tr '\r' '\n' \
|
| tr '\r' '\n' \
|
||||||
)"
|
)"
|
||||||
rm "$f"
|
|
||||||
(
|
(
|
||||||
echo "$clean" \
|
grep -o '<title.*' "$f" \
|
||||||
| grep '^#' \
|
| sed 's/^[^>]*>//' \
|
||||||
| head -n 1 \
|
| sed 's/-.*//' \
|
||||||
| sed 's/^##*/#/' \
|
| sed 's/^/# /' \
|
||||||
| grep . || echo "# ${1##*/}"
|
| grep . || echo "# ${1##*/}"
|
||||||
echo
|
echo
|
||||||
echo "$clean"
|
echo "$clean"
|
||||||
|
|
@ -267,11 +287,14 @@ html_to_markdown() {
|
||||||
| tr '\n' '\r' \
|
| tr '\n' '\r' \
|
||||||
| sed 's/\r\r:::\r\r/\r/g' \
|
| sed 's/\r\r:::\r\r/\r/g' \
|
||||||
| sed 's/\r\r:::\r*$//' \
|
| sed 's/\r\r:::\r*$//' \
|
||||||
| sed 's/:::\(.*\):::/```\1```/g' \
|
|
||||||
| sed 's/::://g' \
|
| sed 's/::://g' \
|
||||||
| sed 's/&.dquo;/"/g' \
|
| sed 's/&.dquo;/"/g' \
|
||||||
| sed "s/&.squo;/'/g" \
|
| sed "s/&.squo;/'/g" \
|
||||||
| tr '\r' '\n'
|
| tr '\r' '\n'
|
||||||
|
|
||||||
|
#| sed 's/:::\(.*\):::/```\1```/g' \
|
||||||
|
|
||||||
|
rm "$f"
|
||||||
return $?
|
return $?
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue