cleanup table displays
parent
b445441b84
commit
cfa69a88fc
45
scrape.sh
45
scrape.sh
|
|
@ -3,7 +3,7 @@
|
|||
main() {
|
||||
ensure
|
||||
|
||||
local output="${1:-"$TMPDIR/result"}"
|
||||
local output="$TMPDIR/result"
|
||||
mkdir -p "$output"
|
||||
log "$output"
|
||||
|
||||
|
|
@ -71,7 +71,8 @@ scrape_books() {
|
|||
| sed 's/^href="//' \
|
||||
| sed 's/^\///' \
|
||||
| sed 's/^/\//' \
|
||||
| sed 's/^/https:\/\/www.dndbeyond.com/'
|
||||
| sed 's/^/https:\/\/www.dndbeyond.com/' \
|
||||
| grep "${SCRAPE_BOOKS_PATTERN:-.*}"
|
||||
}
|
||||
|
||||
scrape_book() {
|
||||
|
|
@ -237,29 +238,48 @@ html_to_markdown() {
|
|||
local f="$(mktemp)"
|
||||
log url=$1
|
||||
cat > "$f"
|
||||
#| perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \
|
||||
|
||||
local clean="$(
|
||||
cat "$f" \
|
||||
| awk '/CONTENT/,/FOOTER/' \
|
||||
| tr '\n' '\r' \
|
||||
| perl -pe 's|<blockquote[^>]*>(.*?)</blockquote>|\1|g' \
|
||||
| perl -pe 's|\r<p[^\r]*<strong>([^<]*)</strong></p>\r|\r<h5>\1</h5>\r|g' \
|
||||
| perl -pe 's|<div[^>]*stat-block-ability-scores-data[^>]*>(.*?)</div>|<span>\1</span>|g' \
|
||||
| perl -pe 's|<div[^>]*stat-block-ability-scores-heading[^>]*>(.*?)</div>|<strong>\1</strong>|g' \
|
||||
| perl -pe 's|<div[^>]*stat-block-ability-scores-stat[^>]*>(.*?)</div>|<p>\1</p>|g' \
|
||||
| perl -pe 's|<div[^>]*stat-block-ability-scores[^>]*>(.*?)</div>|\1|g' \
|
||||
| tr '\r' '\n' \
|
||||
| sed 's/<br>/ /g' \
|
||||
| sed 's/colspan="[^"]*"//g' \
|
||||
| readability "file://$(realpath $f)" \
|
||||
| pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
|
||||
| tr '\n' '\r' \
|
||||
| perl -pe 's|<t(.)><p>([^<]*)</p></t.>|<t\1>\2</t\1>|g' \
|
||||
| perl -pe 's|<thead>\r*<tr>(?:(?!</tr>).)+</tr>\r*<tr>|<thead><tr>|g' \
|
||||
| tr '\r' '\n' \
|
||||
| pandoc \
|
||||
-f html \
|
||||
-t markdown+pipe_tables-simple_tables-multiline_tables \
|
||||
--atx-headers \
|
||||
--ascii \
|
||||
--toc \
|
||||
--wrap=none \
|
||||
--strip-comments \
|
||||
-o - \
|
||||
| sed 's/{[^}]*data[^}]*}//g' \
|
||||
| sed 's/\[](#[^)]*)//' \
|
||||
| cat \
|
||||
| sed 's/{[^}]*}//' \
|
||||
| sed 's/ *$//' \
|
||||
| tr '\n' '\r' \
|
||||
| sed 's/\r\r *\([A-Za-z0-9][^\r]*\)\r--*\r\r/\r\r## \1\r\r/g' \
|
||||
| sed 's/<div>\(.*\)<\/div>/\1/g' \
|
||||
| tr '\r' '\n' \
|
||||
)"
|
||||
rm "$f"
|
||||
(
|
||||
echo "$clean" \
|
||||
| grep '^#' \
|
||||
| head -n 1 \
|
||||
| sed 's/^##*/#/' \
|
||||
grep -o '<title.*' "$f" \
|
||||
| sed 's/^[^>]*>//' \
|
||||
| sed 's/-.*//' \
|
||||
| sed 's/^/# /' \
|
||||
| grep . || echo "# ${1##*/}"
|
||||
echo
|
||||
echo "$clean"
|
||||
|
|
@ -267,11 +287,14 @@ html_to_markdown() {
|
|||
| tr '\n' '\r' \
|
||||
| sed 's/\r\r:::\r\r/\r/g' \
|
||||
| sed 's/\r\r:::\r*$//' \
|
||||
| sed 's/:::\(.*\):::/```\1```/g' \
|
||||
| sed 's/::://g' \
|
||||
| sed 's/&.dquo;/"/g' \
|
||||
| sed "s/&.squo;/'/g" \
|
||||
| tr '\r' '\n'
|
||||
|
||||
#| sed 's/:::\(.*\):::/```\1```/g' \
|
||||
|
||||
rm "$f"
|
||||
return $?
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue