cleanup table displays

master
bel 2022-07-04 10:25:18 -06:00
parent b445441b84
commit cfa69a88fc
1 changed file with 34 additions and 11 deletions

View File

@@ -3,7 +3,7 @@
main() { main() {
ensure ensure
local output="${1:-"$TMPDIR/result"}" local output="$TMPDIR/result"
mkdir -p "$output" mkdir -p "$output"
log "$output" log "$output"
@@ -71,7 +71,8 @@ scrape_books() {
| sed 's/^href="//' \ | sed 's/^href="//' \
| sed 's/^\///' \ | sed 's/^\///' \
| sed 's/^/\//' \ | sed 's/^/\//' \
| sed 's/^/https:\/\/www.dndbeyond.com/' | sed 's/^/https:\/\/www.dndbeyond.com/' \
| grep "${SCRAPE_BOOKS_PATTERN:-.*}"
} }
scrape_book() { scrape_book() {
@@ -237,29 +238,48 @@ html_to_markdown() {
local f="$(mktemp)" local f="$(mktemp)"
log url=$1 log url=$1
cat > "$f" cat > "$f"
#| perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \
local clean="$( local clean="$(
cat "$f" \ cat "$f" \
| awk '/CONTENT/,/FOOTER/' \ | awk '/CONTENT/,/FOOTER/' \
| tr '\n' '\r' \
| perl -pe 's|<blockquote[^>]*>(.*?)</blockquote>|\1|g' \
| perl -pe 's|\r<p[^\r]*<strong>([^<]*)</strong></p>\r|\r<h5>\1</h5>\r|g' \
| perl -pe 's|<div[^>]*stat-block-ability-scores-data[^>]*>(.*?)</div>|<span>\1</span>|g' \
| perl -pe 's|<div[^>]*stat-block-ability-scores-heading[^>]*>(.*?)</div>|<strong>\1</strong>|g' \
| perl -pe 's|<div[^>]*stat-block-ability-scores-stat[^>]*>(.*?)</div>|<p>\1</p>|g' \
| perl -pe 's|<div[^>]*stat-block-ability-scores[^>]*>(.*?)</div>|\1|g' \
| tr '\r' '\n' \
| sed 's/<br>/ /g' \
| sed 's/colspan="[^"]*"//g' \
| readability "file://$(realpath $f)" \ | readability "file://$(realpath $f)" \
| pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \ | tr '\n' '\r' \
| perl -pe 's|<t(.)><p>([^<]*)</p></t.>|<t\1>\2</t\1>|g' \
| perl -pe 's|<thead>\r*<tr>(?:(?!</tr>).)+</tr>\r*<tr>|<thead><tr>|g' \
| tr '\r' '\n' \
| pandoc \
-f html \
-t markdown+pipe_tables-simple_tables-multiline_tables \
--atx-headers \
--ascii \
--toc \
--wrap=none \
--strip-comments \
-o - \
| sed 's/{[^}]*data[^}]*}//g' \ | sed 's/{[^}]*data[^}]*}//g' \
| sed 's/\[](#[^)]*)//' \ | sed 's/\[](#[^)]*)//' \
| cat \ | cat \
| sed 's/{[^}]*}//' \ | sed 's/{[^}]*}//' \
| sed 's/ *$//' \ | sed 's/ *$//' \
| tr '\n' '\r' \ | tr '\n' '\r' \
| sed 's/\r\r *\([A-Za-z0-9][^\r]*\)\r--*\r\r/\r\r## \1\r\r/g' \
| sed 's/<div>\(.*\)<\/div>/\1/g' \ | sed 's/<div>\(.*\)<\/div>/\1/g' \
| tr '\r' '\n' \ | tr '\r' '\n' \
)" )"
rm "$f"
( (
echo "$clean" \ grep -o '<title.*' "$f" \
| grep '^#' \ | sed 's/^[^>]*>//' \
| head -n 1 \ | sed 's/-.*//' \
| sed 's/^##*/#/' \ | sed 's/^/# /' \
| grep . || echo "# ${1##*/}" | grep . || echo "# ${1##*/}"
echo echo
echo "$clean" echo "$clean"
@@ -267,11 +287,14 @@ html_to_markdown() {
| tr '\n' '\r' \ | tr '\n' '\r' \
| sed 's/\r\r:::\r\r/\r/g' \ | sed 's/\r\r:::\r\r/\r/g' \
| sed 's/\r\r:::\r*$//' \ | sed 's/\r\r:::\r*$//' \
| sed 's/:::\(.*\):::/```\1```/g' \
| sed 's/::://g' \ | sed 's/::://g' \
| sed 's/&.dquo;/"/g' \ | sed 's/&.dquo;/"/g' \
| sed "s/&.squo;/'/g" \ | sed "s/&.squo;/'/g" \
| tr '\r' '\n' | tr '\r' '\n'
#| sed 's/:::\(.*\):::/```\1```/g' \
rm "$f"
return $? return $?
} }