ok now i think

master
bel 2022-06-27 18:15:06 -06:00
parent 28b9a82004
commit e6d8cfae94
1 changed file with 36 additions and 63 deletions

View File

@ -4,7 +4,8 @@
main() { main() {
ensure ensure
local output="${1:-$(mktemp -d)}" local output="${1:-"$TMPDIR/result"}"
mkdir -p "$output"
log "$output" log "$output"
#set -x #set -x
@ -21,9 +22,10 @@ main() {
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/krenkos-way #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/krenkos-way
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/bestiary #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/bestiary
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/azorius-senate #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/azorius-senate
scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e #scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e
return $?
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/basic-rules/appendix-b-gods-of-the-multiverse
#return $?
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/phb/credits | less #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/phb/credits | less
@ -33,6 +35,8 @@ main() {
#scrape_book "$output" https://www.dndbeyond.com/sources/phb #scrape_book "$output" https://www.dndbeyond.com/sources/phb
log "books=$(scrape_books)"
sleep 1
for book in $(scrape_books); do for book in $(scrape_books); do
book="${book%/}" book="${book%/}"
log book=$book log book=$book
@ -155,9 +159,11 @@ scrape_dndbeyond() {
local f="$d/$(echo "$*" | base64)" local f="$d/$(echo "$*" | base64)"
if [ -f "$f" ]; then if [ -f "$f" ]; then
if cat "$f" | grep .; then if cat "$f" | grep .; then
log "foo:scrape_dndbeyond: yes cached: $f: $@"
return return
fi fi
fi fi
log "foo:scrape_dndbeyond: not cached: $f: $@"
_scrape_dndbeyond "$@" | tee "$f" _scrape_dndbeyond "$@" | tee "$f"
} }
@ -176,7 +182,7 @@ _scrape_dndbeyond() {
-H 'sec-fetch-user: ?1' \ -H 'sec-fetch-user: ?1' \
-H 'sec-fetch-dest: document' \ -H 'sec-fetch-dest: document' \
-H 'accept-language: en-US,en;q=0.9' \ -H 'accept-language: en-US,en;q=0.9' \
-H 'Cookie: _pxhd=qvJb-g60h1HSymUFEiAmb-Noh35mNbUmcFgIE0qEiisCj/BqXrgrrsRUlODrjAaSGknpuHBn0S2b4rGTBFvaIA==:5/4beZX4ojNLKbAeZH9rQFcGIQ/ijS7lK29wvhLkgB4O0L2GFpRuU5FOpRztEiKg9EoUZr2cPB2Z8w3DKVkaKJPlwcIt23DM5PgQLZEPoA8=; ResponsiveSwitch.DesktopMode=1; LoginState=0ae9ef26-a1ae-4cb2-8af7-f18c21b99811; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FawE7bpRlluZsWq7Iny1xQ.nBX_qdZq5varuXNGkl70QeJ03JIRT3sxxQi6iCzazt-SjlK1hEuINX0sK85gJKVT.8wlUwvfZiME9fAdBKj0rIw; Preferences.Language=1; Pref…sername=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"5ce909a9-34aa-4b15-9b6d-00ee48e13e05"}; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF08335AE71D0834BC2696233572C85266F7F9F4B5BF89DD4952FDA9AD462250417F8B7C4975F1633599CC9D18D74C3B57CBA; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF08335AE71D0834BC2696233572C85266F7F9F4B5BF89DD4952FDA9AD462250417F8B7C4975F1633599CC9D18D74C3B57CBA; ddbSiteBanner:bb279d28-f265-49d5-ad41-6d716915bbdf=true; WarningNotification.Lock=1' \ -H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=qvJb-g60h1HSymUFEiAmb-Noh35mNbUmcFgIE0qEiisCj/BqXrgrrsRUlODrjAaSGknpuHBn0S2b4rGTBFvaIA==:5/4beZX4ojNLKbAeZH9rQFcGIQ/ijS7lK29wvhLkgB4O0L2GFpRuU5FOpRztEiKg9EoUZr2cPB2Z8w3DKVkaKJPlwcIt23DM5PgQLZEPoA8=; ResponsiveSwitch.DesktopMode=1; LoginState=0ae9ef26-a1ae-4cb2-8af7-f18c21b99811; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FawE7bpRlluZsWq7Iny1xQ.nBX_qdZq5varuXNGkl70QeJ03JIRT3sxxQi6iCzazt-SjlK1hEuINX0sK85gJKVT.8wlUwv…sername=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"5ce909a9-34aa-4b15-9b6d-00ee48e13e05"}; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF08335AE71D0834BC2696233572C85266F7F9F4B5BF89DD4952FDA9AD462250417F8B7C4975F1633599CC9D18D74C3B57CBA; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF08335AE71D0834BC2696233572C85266F7F9F4B5BF89DD4952FDA9AD462250417F8B7C4975F1633599CC9D18D74C3B57CBA; 
ddbSiteBanner:bb279d28-f265-49d5-ad41-6d716915bbdf=true; WarningNotification.Lock=1' \
-H 'dnt: 1' \ -H 'dnt: 1' \
--compressed --compressed
return return
@ -221,69 +227,36 @@ url_to_title_candidate() {
html_to_markdown() { html_to_markdown() {
local f="$(mktemp)" local f="$(mktemp)"
log url=$1 log url=$1
log 1: url=$1
cat > "$f" cat > "$f"
grep '<title>' "$f" | sed 's/.*>\([^<]*\)<.title>.*/\1/g' >&2 log 2: url=$1
#grep '<title>' "$f" | sed 's/.*>\([^<]*\)<.title>.*/# \1/g'
local clean="$(
cat "$f" \ cat "$f" \
| readability "file://$(realpath $f)" \ | readability "file://$(realpath $f)" \
| pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \ | pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
| sed 's/{[^}]*data[^}]*}//g' \ | sed 's/{[^}]*data[^}]*}//g' \
| sed 's/\[](#[^)]*)//' \ | sed 's/\[](#[^)]*)//'
| cat )"
log 3: url=$1
rm "$f" rm "$f"
log 4: url=$1
echo "$clean" | (
lastline=""
while read -r line; do
if [ "$line" != "${line#----}" ]; then
echo "# $lastline"
echo ""
break
fi
lastline="$line"
done
cat &> /dev/null
)
log 5: url=$1
echo "$clean"
log 6: url=$1
return $? return $?
local input="$(cat)"
local url_title_candidate="$(url_to_title_candidate "$1")"
local url_title_candidate_2="$(url_to_title_candidate "$(dirname "$1")")"
local out="$(
echo "$input" \
| pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
| sed '/^[ ]*:::.*/d' \
| sed -e '/^Share$/,$d' \
| sed 's/^\(\#\#*\) \[](#\([^)]*\))/\1 /' \
| sed 's/{#\([^ ]*\)[^}]*}/\n\n<a name="\1"><\/a>/' \
| sed 's/\[\([^]]*\)]\(([^)]*)\)*\({[^}]*}\)*/\1/g' \
| sed "s/\\\\\([\"']\)/\1/g"
)"
local min_level_title=0
for levels in $(echo "$out" | grep -o '^##*'); do
if ((min_level_title<${#levels})); then
min_level_title=${#levels}
fi
done
local less_n_levels=0
if ((min_level_title>1)); then
less_n_levels=$((min_level_title-1))
fi
local trim_prefix=""
for i in $(seq 1 $less_n_levels); do
trim_prefix+="#"
done
local target="$(
title="$(
echo "$input" \
| grep -o '<title>.*'
)"
title="${title%% - *}"
title="${title#<title>}"
echo "$title"
)"
log target=$target
local buffer=("# $target")
for line in $(
echo "$out" | while read -r line; do
echo "$line" | base64
done
); do
line="$(echo "$line" | base64 --decode)"
if echo "$line" | grep -q "^$target$"; then
buffer=("# $target")
else
buffer+=("$line")
fi
done
for line in "${buffer[@]}"; do
echo "${line#$trim_prefix}"
done
} }
rate_limit_1s() { rate_limit_1s() {