diff --git a/deps.txt b/deps.txt new file mode 100644 index 0000000..46aee22 --- /dev/null +++ b/deps.txt @@ -0,0 +1,2 @@ +pandoc +https://github.com/NightMachinery/readability-cli diff --git a/scrape.sh b/scrape.sh index 4cf6b25..ff9a635 100644 --- a/scrape.sh +++ b/scrape.sh @@ -2,8 +2,7 @@ main() { - set -e - set -o pipefail + ensure local output="${1:-$(mktemp -d)}" log "$output" @@ -22,7 +21,8 @@ main() { #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/krenkos-way #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/bestiary #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/azorius-senate - #scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e + scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e + return $? #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/phb/credits | less @@ -43,6 +43,19 @@ main() { log "$output" } +ensure() { + set -e + set -o pipefail + for exe in pandoc readability; do + if ! which $exe; then + echo "$exe not installed" >&2 + return 1 + fi + done + export TMPDIR=/tmp/scrape.${SEED:-${RANDOM:-$(date +%s)}} + mkdir -p $TMPDIR +} + log() { echo "$(date) > $*" >&2 } @@ -137,7 +150,7 @@ list_chapters_in_book() { scrape_dndbeyond() { log foo:scrape_dndbeyond $@ - local d="${TMPDIR:-/tmp}/scrape_cache" + local d="${TMPDIR}/scrape_cache" mkdir -p "$d" local f="$d/$(echo "$*" | base64)" if [ -f "$f" ]; then @@ -163,7 +176,7 @@ _scrape_dndbeyond() { -H 'sec-fetch-user: ?1' \ -H 'sec-fetch-dest: document' \ -H 'accept-language: en-US,en;q=0.9' \ - -H 'cookie: Preferences=undefined; Preferences=undefined; Geo={%22region%22:%22UT%22%2C%22country%22:%22US%22%2C%22continent%22:%22NA%22}; ResponsiveSwitch.DesktopMode=1; sublevel=ANON; Preferences=undefined; ddbSiteBanner:fd981371-c501-4638-a64b-0d9a5c8da68b=true; G_ENABLED_IDPS=google; User.ID=109926924; User.Username=squeaky2x3; Preferences.Language=1; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"5993614e-774f-4b55-89de-c88120abea59"}; RequestVerificationToken=6dc61cff-204f-45a0-a39f-edcea6a23072; Preferences.TimeZoneID=1; LoginState=aa3ada1f-7ab9-4804-a177-858bd2b4d693; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..ZYTOkNPFaI619J4f0rSamg.Wu7iDGvwCv-S2uiAftwHIienWEyGno2diWV6KU_KMnjzJGog2bIr8XorEpMshuGW.GN5g3TsaP_jm0d3THUnftA; ddbSiteBanner:d7e00153-5c1f-442c-a584-b3b814abc854=true; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E9527B680F5A93AEBF8E5ADAA49A8F75A9276; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E9527B680F5A93AEBF8E5ADAA49A8F75A9276; _pxhd=fFcuMtI4ivDtBeMpMP/C7iQM0Tq6VQ3S7Ez2cFTQR3660-FAmmxJN1nf10BpNrlWlqlcOEbI4RNwUBHISjoo/A==:daB4hDYpQC3vi3ldLNM4PFvrSLt41cDr81ohF5gDoCaFGENWNSNvkbM9u5BjZ2zgHKWbzxAz4B0PTu0FnstnbJzhd3uPJGcXecasPyv7C04=; WarningNotification.Lock=1' \ + -H 'Cookie: _pxhd=qvJb-g60h1HSymUFEiAmb-Noh35mNbUmcFgIE0qEiisCj/BqXrgrrsRUlODrjAaSGknpuHBn0S2b4rGTBFvaIA==:5/4beZX4ojNLKbAeZH9rQFcGIQ/ijS7lK29wvhLkgB4O0L2GFpRuU5FOpRztEiKg9EoUZr2cPB2Z8w3DKVkaKJPlwcIt23DM5PgQLZEPoA8=; ResponsiveSwitch.DesktopMode=1; LoginState=0ae9ef26-a1ae-4cb2-8af7-f18c21b99811; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FawE7bpRlluZsWq7Iny1xQ.nBX_qdZq5varuXNGkl70QeJ03JIRT3sxxQi6iCzazt-SjlK1hEuINX0sK85gJKVT.8wlUwvfZiME9fAdBKj0rIw; Preferences.Language=1; Pref…sername=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"5ce909a9-34aa-4b15-9b6d-00ee48e13e05"}; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF08335AE71D0834BC2696233572C85266F7F9F4B5BF89DD4952FDA9AD462250417F8B7C4975F1633599CC9D18D74C3B57CBA; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF08335AE71D0834BC2696233572C85266F7F9F4B5BF89DD4952FDA9AD462250417F8B7C4975F1633599CC9D18D74C3B57CBA; ddbSiteBanner:bb279d28-f265-49d5-ad41-6d716915bbdf=true; WarningNotification.Lock=1' \ -H 'dnt: 1' \ --compressed return @@ -206,10 +219,23 @@ url_to_title_candidate() { } html_to_markdown() { + local f="$(mktemp)" + log url=$1 + cat > "$f" + cat "$f" \ + | readability "file://$(realpath $f)" \ + | pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \ + | sed 's/{[^}]*data[^}]*}//g' \ + | sed 's/\[](#[^)]*)//' \ + | cat + rm "$f" + return $? + local input="$(cat)" local url_title_candidate="$(url_to_title_candidate "$1")" local url_title_candidate_2="$(url_to_title_candidate "$(dirname "$1")")" local out="$( - pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \ + echo "$input" \ + | pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \ | sed '/^[ ]*:::.*/d' \ | sed -e '/^Share$/,$d' \ | sed 's/^\(\#\#*\) \[](#\([^)]*\))/\1 /' \ @@ -217,39 +243,51 @@ html_to_markdown() { | sed 's/\[\([^]]*\)]\(([^)]*)\)*\({[^}]*}\)*/\1/g' \ | sed "s/\\\\\([\"']\)/\1/g" )" - local target="$( - if echo "$out" | grep '^# Appendix' &> /dev/null; then - echo Appendix - elif echo "$out" | grep '^# Chapter' &> /dev/null; then - echo Chapter - elif echo "$out" | grep '^# Preface' &> /dev/null; then - echo Preface - elif echo "$out" | grep '^# Introduction' &> /dev/null; then - echo Introduction - elif echo "$out" | grep '^# Credits' &> /dev/null; then - echo Credits - elif echo "$out" | grep '^# Monsters' &> /dev/null; then - echo Monsters - elif echo "$out" | grep -i '^# '"$url_title_candidate" &> /dev/null; then - target="$(echo "$out" | grep -i '^# '"$url_title_candidate" | sed 's/^\#[ ]*//' | head -n 1)" - log WARNING: URL title candidate $target - echo "$target" - else - log WARNING: NOTHING INDICITIVE OF TYPE OF CONTENT FOUND, including matching $url_title_candidate - echo "$out" | grep '^#' | while read -r line; do - log candidate="$line" - done - echo Chapter + local min_level_title=0 + for levels in $(echo "$out" | grep -o '^##*'); do + if ((min_level_title<${#levels})); then + min_level_title=${#levels} fi + done + local less_n_levels=0 + if ((min_level_title>1)); then + less_n_levels=$((min_level_title-1)) + fi + local trim_prefix="" + for i in $(seq 1 $less_n_levels); do + trim_prefix+="#" + done + local target="$( + title="$( + echo "$input" \ + | grep -o '.*' + )" + title="${title%% - *}" + title="${title#<title>}" + echo "$title" )" log target=$target - echo "$out" \ - | sed -e '/^# '"$target"'/p' -e '0,/^# '"$target"'/d' + local buffer=("# $target") + for line in $( + echo "$out" | while read -r line; do + echo "$line" | base64 + done + ); do + line="$(echo "$line" | base64 --decode)" + if echo "$line" | grep -q "^$target$"; then + buffer=("# $target") + else + buffer+=("$line") + fi + done + for line in "${buffer[@]}"; do + echo "${line#$trim_prefix}" + done } rate_limit_1s() { local name="$1" - local d="${TMPDIR:-/tmp}"/rate_limited + local d="${TMPDIR}"/rate_limited mkdir -p "$d" local last_run="$(date -r "$d/$name" +%s)" local now="$(date +%s)"