From 95518fc3a8713d234479cd70c89aa28a3db1adde Mon Sep 17 00:00:00 2001 From: Bel LaPointe Date: Wed, 15 Dec 2021 06:01:12 -0700 Subject: [PATCH] mvp --- poc.sh | 112 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 88 insertions(+), 24 deletions(-) diff --git a/poc.sh b/poc.sh index 03e9851..ed2735c 100644 --- a/poc.sh +++ b/poc.sh @@ -1,28 +1,88 @@ #! /bin/bash + main() { - _main "$@" + set -e + set -o pipefail + + local output="$(mktemp -d)" + log "$output" + + #list_chapters_in_book https://www.dndbeyond.com/sources/sdw + + #scrape_book_chapter https://www.dndbeyond.com/sources/phb/races | less + #scrape_book_chapter https://www.dndbeyond.com/sources/phb/appendix-a-conditions | less + + #scrape_book "$output" https://www.dndbeyond.com/sources/phb + + for book in $(scrape_books); do + book="${book%/}" + log book=$book + scrape_book "$output/${book##*/}" "$book" + break + done + + + log "$output" } -_main() { - for url in $( - scrape_dndbeyond https://www.dndbeyond.com/sources/phb \ - | grep -o 'https:..www.dndbeyond.com.compendium[^"]*' \ +log() { + echo "$(date) > $*" >&2 +} + +scrape_books() { + scrape_dndbeyond "https://www.dndbeyond.com/sources#Sourcebooks" \ + | grep class..sources-listing--item \ + | grep href=.*sources \ + | grep -o 'href="[^"]*' \ + | sed 's/^href="//' \ + | sed 's/^\///' \ + | sed 's/^/\//' \ + | sed 's/^/https:\/\/www.dndbeyond.com/' +} + +scrape_book() { + local output="$1" + local book_url="$2" + + mkdir -p "$output" + local f="$(mktemp)" + + for chapter in $(list_chapters_in_book "$book_url"); do + log chapter=$chapter + scrape_book_chapter "$chapter" > "$f" + local title="$(grep -E 'Chapter|Appendix' "$f" | head -n 1 | sed 's/^\#*//' | sed 's/^[ ]*//' | sed 's/[ ]*$//')" + mv "$f" "$output"/"$title" + done + + rm "$f" +} + +scrape_book_chapter() { + local url="$1" + scrape_dndbeyond $url \ + | html_to_markdown +} + +list_chapters_in_book() { + local book_url="${1%/}" + local raw="$(scrape_dndbeyond "$book_url")" + if echo "$raw" | grep -q 'https:..www.dndbeyond.com.compendium[^"]*'; then + echo "$raw" | grep -o 'https:..www.dndbeyond.com.compendium[^"]*' + elif echo "$raw" | grep -q "$book_url/"; then + echo "$raw" | grep -o "${book_url//\//.}\/[^\"]*" + else + echo "ERROR: FOUND NO CHAPTERS IN $book_url" >&2 + return 1 + fi \ | sed 's/#.*//' \ | sort -u - ); do - echo $url - scrape_dndbeyond $url \ - | html_to_markdown \ - | less - break - done } scrape_dndbeyond() { rate_limit_1s scrape_dndbeyond curl -L -sS \ - "${1:-https://www.dndbeyond.com/sources/phb}" \ + "$1" \ --compressed \ -H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=NHdkEjOXnAEHojER86i0Egmnm85GLvifiQxURPISwVtzWLNwuqQbIk1Y/2MjheQqNP4CsBXhuc4UG5-qcmLjYw==:hGx3TOEjeVaJPMfrXXnmyG7-btpQhZutwvpVBKwEbNXTAkEQvgmIs/faKB6s8537Nivj6epH9HpOWYURQeazeMwSMReW6EMazW09Rci-PJI=; ResponsiveSwitch.DesktopMode=1; ddbSiteBanner:00523ca3-81a4-4d2a-8f86-9e40273af2e2=true; LoginState=2c011278-8a3d-4cf7-a141-221f42694333; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..vh9-_2G_aEjMvJ2PFAAE4w.eheZuOJLp-3NyiAJZzKqaSc91lH0ganCRsDZPYMGIG3265AppaHh0uMQLu_-P6WX.bHRi_gX8PKtQGZwVmNeb1Q; Preferences.Language=1; Preferences.TimeZoneID=1; ddbSiteBanner:d8879cae-c68e-4411-9ea0-a49d77fe5454=true; Ratings=null; _pxvid=f6abe72c-49a6-11ec-b844-57447254714c; ddbSiteBanner:fd981371-c501-4638-a64b-0d9a5c8da68b=true; Geo={%22region%22:%22ON%22%2C%22country%22:%22CA%22%2C%22continent%22:%22NA%22}; RequestVerificationToken=983cd9ce-c03c-4b8b-b6c1-a0561b39cb47; sublevel=ANON; Preferences=undefined; User.ID=109926924; User.Username=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"77896779-c672-42d1-903b-f6caba6ec599"}; pxcts=5eb1b230-525e-11ec-aa22-793bf1473016; WarningNotification.Lock=1' \ -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0' \ @@ -42,14 +102,18 @@ scrape_dndbeyond() { } html_to_markdown() { - pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \ - | sed '/^:::.*/d' \ - | sed -e '/^# Chapter/p' -e '0,/^# Chapter/d' \ - | sed -e '/^Share$/,$d' \ - | sed 's/^\(\#\#*\) \[](#\([^)]*\))/\1 /' \ - | sed 's/{#\([^ ]*\)[^}]*}/\n\n<\/a>/' \ - | sed 's/\[\([^]]*\)]\(([^)]*)\)*\({[^}]*}\)*/\1/g' \ - | sed "s/\\\\\([\"']\)/\1/g" + local out="$( + pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \ + | sed '/^:::.*/d' \ + | sed -e '/^Share$/,$d' \ + | sed 's/^\(\#\#*\) \[](#\([^)]*\))/\1 /' \ + | sed 's/{#\([^ ]*\)[^}]*}/\n\n<\/a>/' \ + | sed 's/\[\([^]]*\)]\(([^)]*)\)*\({[^}]*}\)*/\1/g' \ + | sed "s/\\\\\([\"']\)/\1/g" + )" + local target="$(echo "$out" | grep -q '^[#]* Appendix' && echo Appendix || echo Chapter)" + echo "$out" \ + | sed -e '/^# '"$target"'/p' -e '0,/^# '"$target"'/d' } rate_limit_1s() { @@ -58,14 +122,14 @@ rate_limit_1s() { mkdir -p "$d" local last_run="$(date -r "$d/$name" +%s)" local now="$(date +%s)" - local interval=2 - echo "$(date +%s): should sleep while $(($(date +%s)-last_run)) < $interval" >&2 + local interval=3 + log "should sleep while $(($(date +%s)-last_run)) < $interval" if [ -f "$d/$name" ]; then while [ "$(($(date +%s)-last_run))" -lt $interval ]; do sleep 1 done fi - echo "$(date +%s): done sleeping" >&2 + log "done sleeping" touch "$d/$name" }