#! /bin/bash
#
# Scrape D&D Beyond sourcebooks into one markdown file per chapter.
#
# Usage:   <script> [OUTPUT_DIR]     (defaults to a fresh `mktemp -d` directory)
# Needs:   bash, curl, pandoc, base64, awk and a sed that understands `\n` in
#          the replacement text (written against GNU sed).
# Environment:
#   TMPDIR            base directory for the response cache (scrape_cache/)
#                     and the rate-limit stamp (rate_limited/); default /tmp
#   DNDBEYOND_COOKIE  optional; replaces the built-in session cookie
#
# All diagnostics go to stderr via log(); stdout carries only data.

#######################################
# Entry point: scrape every book listed on the sources page.
# Arguments: $1 - output directory (optional)
# Note:      enables `set -e` and `set -o pipefail` for the whole shell.
#######################################
main() {
  set -e
  set -o pipefail
  local output="${1:-$(mktemp -d)}"
  log "$output"
  # Ad-hoc debugging invocations kept from development:
  #set -x
  #list_chapters_in_book https://www.dndbeyond.com/sources/sacoc
  #list_chapters_in_book https://www.dndbeyond.com/sources/twbtw
  #list_chapters_in_book https://www.dndbeyond.com/sources/hftt
  #list_chapters_in_book https://www.dndbeyond.com/sources/sdw
  #should_scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits || true
  #scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits
  #scrape_book_chapter https://www.dndbeyond.com/sources/ai
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/krenkos-way
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/bestiary
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/azorius-senate
  #scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/phb/credits | less
  #scrape_book_chapter https://www.dndbeyond.com/sources/phb/races | less
  #scrape_book_chapter https://www.dndbeyond.com/sources/phb/appendix-a-conditions | less
  #scrape_book "$output" https://www.dndbeyond.com/sources/phb
  local book
  # Read one URL per line on fd 3 so nothing inside the loop can eat the list
  # from stdin, and so URLs are never word-split or glob-expanded.
  while IFS= read -r book <&3; do
    [ -n "$book" ] || continue
    book="${book%/}"
    log "book=$book"
    # Each book lands in a directory named after the last URL path segment.
    scrape_book "$output/${book##*/}" "$book"
  done 3< <(scrape_books)
  log "$output"
}

#######################################
# Write a timestamped message to stderr.
# Arguments: message words (joined with spaces)
#######################################
log() {
  printf '%s > %s\n' "$(date)" "$*" >&2
}

#######################################
# List the URLs of all books on the sources page.
# Outputs: one absolute https://www.dndbeyond.com/... URL per line
#######################################
scrape_books() {
  # hrefs are normalised to exactly one leading slash before the domain is
  # prefixed; an href that already carries the domain has it stripped first so
  # it is not prefixed twice.
  scrape_dndbeyond "https://www.dndbeyond.com/sources#Sourcebooks" \
    | grep 'class..sources-listing--item' \
    | grep 'href=.*sources' \
    | grep -o 'href="[^"]*' \
    | sed -e 's/^href="//' \
      -e 's#^https://www\.dndbeyond\.com##' \
      -e 's#^/##' \
      -e 's#^#https://www.dndbeyond.com/#'
}

#######################################
# Scrape every purchasable chapter of one book into a directory.
# Arguments: $1 - output directory for this book
#            $2 - book URL
# Outputs:   one file per chapter, named after the chapter's first heading
# Returns:   1 when a chapter yields no heading to use as a title
# A `.wip` marker exists while the book is in progress, so an interrupted book
# is scraped again on the next run.
#######################################
scrape_book() {
  local output="$1"
  local book_url="$2"
  # Done already: the directory has visible entries and no in-progress marker.
  if [ -d "$output" ] && compgen -G "$output/*" > /dev/null && [ ! -f "$output/.wip" ]; then
    log "$book_url already in $output"
    return
  fi
  mkdir -p "$output"
  touch "$output/.wip"
  local f
  f="$(mktemp)"
  local chapter title
  while IFS= read -r chapter <&4; do
    [ -n "$chapter" ] || continue
    log scraping book chapter "$chapter" of "$book_url"
    if ! should_scrape_book_chapter "$chapter"; then
      continue
    fi
    scrape_book_chapter "$chapter" > "$f"
    log scraping book title from "$f"
    # First heading line, with the leading #'s and surrounding spaces removed.
    # One sed process (no grep|head) so pipefail cannot see a SIGPIPE here.
    title="$(sed -n '/^#/{s/^#*//;s/^[ ]*//;s/[ ]*$//;p;q;}' "$f")"
    if [ -z "$title" ]; then
      log "WARNING: NO TITLE FOUND IN $f ($chapter): $(head -n 3 "$f")"
      rm -f "$f"
      return 1
    fi
    # A slash in the title would be taken as a path separator by mv.
    title="${title//\//-}"
    mv "$f" "$output/$title"
  done 4< <(list_chapters_in_book "$book_url")
  if [ -f "$f" ]; then
    rm "$f"
  fi
  rm "$output/.wip"
}

#######################################
# Decide whether a page is available to scrape.
# Arguments: $1 - page URL
# Returns:   1 when the page contains "add to cart" (case-insensitive),
#            0 otherwise
#######################################
should_scrape_book_chapter() {
  local url="$1"
  log foo:should_scrape_book_chapter "$url"
  # Deliberately not `grep -q`: with pipefail an early-exiting grep can kill
  # the producer with SIGPIPE and flip this test to "no match".
  if scrape_dndbeyond "$url" | grep -i "add to cart" > /dev/null; then
    log "not purchased: $url"
    return 1
  fi
}

#######################################
# Fetch one chapter page and convert it to markdown.
# Arguments: $1 - chapter URL
# Outputs:   markdown on stdout
#######################################
scrape_book_chapter() {
  local url="$1"
  log foo:scrape_book_chapter "$url"
  scrape_dndbeyond "$url" \
    | html_to_markdown "$url"
}

#######################################
# List the chapter URLs linked from a book's landing page.
# Arguments: $1 - book URL
# Outputs:   unique chapter URLs, fragments removed, sorted, one per line
# Returns:   non-zero when no chapter links are found; 0 with no output when
#            the book page itself is not scrapeable
#######################################
list_chapters_in_book() {
  local book_url="${1%/}"
  local book_domain="${book_url%%.com/*}.com"
  log "domain=$book_domain"
  local book_url_path="${book_url##*.com}"
  book_url_path="${book_url_path%#*}"
  if ! should_scrape_book_chapter "$book_url"; then
    return
  fi
  local raw
  # A failed fetch is tolerated here; it ends in the "no chapters" branch.
  raw="$(scrape_dndbeyond "$book_url")" || true
  # Slashes become "." so the URL can be dropped into a grep pattern.
  local url_re="${book_url//\//.}"
  local path_re="${book_url_path//\//.}"
  # Three link layouts are tried in order: absolute compendium links, absolute
  # links below the book URL, then site-relative links below the book path.
  if grep -q 'href="https:..www.dndbeyond.com.compendium[^"]*' <<< "$raw"; then
    grep -o 'https:..www.dndbeyond.com.compendium[^"]*' <<< "$raw"
  elif grep -q "href=\"$book_url/" <<< "$raw"; then
    grep -o "${url_re}/[^\"]*" <<< "$raw"
  elif grep -q "href=\"${path_re}/" <<< "$raw"; then
    grep -o "href=\"${path_re}/[^\"]*" <<< "$raw" \
      | sed -e 's/^href="//' -e "s|^|${book_domain}|"
  else
    echo "ERROR: FOUND NO CHAPTERS IN $book_url" >&2
    return 1
  fi \
    | sed 's/#.*//' \
    | sort -u
}

#######################################
# Fetch a URL through an on-disk cache.
# Arguments: $1 - URL (all arguments together form the cache key)
# Outputs:   response body on stdout
# Returns:   status of the underlying fetch; 0 on a cache hit
#######################################
scrape_dndbeyond() {
  log foo:scrape_dndbeyond "$@"
  local d="${TMPDIR:-/tmp}/scrape_cache"
  mkdir -p "$d"
  # base64 output may be line-wrapped and may contain "/" and "+", none of
  # which belong in a single file name.
  local key
  key="$(printf '%s\n' "$*" | base64 | tr -d '\n' | tr '/+' '_-')"
  local f="$d/$key"
  # A cache entry counts only if it has at least one non-empty line; it is
  # replayed verbatim, blank lines included.
  if [ -f "$f" ] && grep -q . "$f"; then
    cat "$f"
    return
  fi
  # Fetch into a scratch file and promote it only on success, so a failed or
  # partial download never becomes a cache entry.
  local tmp
  local status=0
  tmp="$(mktemp "$d/.fetch.XXXXXX")"
  _scrape_dndbeyond "$@" > "$tmp" || status=$?
  cat "$tmp"
  if [ "$status" -eq 0 ]; then
    mv -f "$tmp" "$f"
  else
    rm -f "$tmp"
  fi
  return "$status"
}

#######################################
# Perform the rate-limited HTTP request behind scrape_dndbeyond.
# Arguments: $1 - URL
# Globals:   DNDBEYOND_COOKIE (read, optional)
# Outputs:   response body on stdout
# Returns:   curl's exit status
# An unreachable second curl invocation that followed an unconditional
# `return` in the original was removed.
#######################################
_scrape_dndbeyond() {
  log foo:_scrape_dndbeyond "$@"
  rate_limit_1s scrape_dndbeyond
  # NOTE(review): this session cookie is committed to source. Prefer supplying
  # DNDBEYOND_COOKIE and removing (and rotating) the literal below.
  local cookie='Preferences=undefined; Preferences=undefined; Geo={%22region%22:%22UT%22%2C%22country%22:%22US%22%2C%22continent%22:%22NA%22}; ResponsiveSwitch.DesktopMode=1; sublevel=ANON; Preferences=undefined; ddbSiteBanner:fd981371-c501-4638-a64b-0d9a5c8da68b=true; G_ENABLED_IDPS=google; User.ID=109926924; User.Username=squeaky2x3; Preferences.Language=1; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"5993614e-774f-4b55-89de-c88120abea59"}; RequestVerificationToken=6dc61cff-204f-45a0-a39f-edcea6a23072; Preferences.TimeZoneID=1; LoginState=aa3ada1f-7ab9-4804-a177-858bd2b4d693; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..ZYTOkNPFaI619J4f0rSamg.Wu7iDGvwCv-S2uiAftwHIienWEyGno2diWV6KU_KMnjzJGog2bIr8XorEpMshuGW.GN5g3TsaP_jm0d3THUnftA; ddbSiteBanner:d7e00153-5c1f-442c-a584-b3b814abc854=true; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E9527B680F5A93AEBF8E5ADAA49A8F75A9276; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E9527B680F5A93AEBF8E5ADAA49A8F75A9276; _pxhd=fFcuMtI4ivDtBeMpMP/C7iQM0Tq6VQ3S7Ez2cFTQR3660-FAmmxJN1nf10BpNrlWlqlcOEbI4RNwUBHISjoo/A==:daB4hDYpQC3vi3ldLNM4PFvrSLt41cDr81ohF5gDoCaFGENWNSNvkbM9u5BjZ2zgHKWbzxAz4B0PTu0FnstnbJzhd3uPJGcXecasPyv7C04=; WarningNotification.Lock=1'
  if [ -n "${DNDBEYOND_COOKIE:-}" ]; then
    cookie="$DNDBEYOND_COOKIE"
  fi
  # The timeouts keep one stalled connection from hanging the whole scrape.
  curl -L -sS "$1" \
    --connect-timeout 15 \
    --max-time 300 \
    -H 'authority: www.dndbeyond.com' \
    -H 'cache-control: max-age=0' \
    -H 'upgrade-insecure-requests: 1' \
    -H 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36' \
    -H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
    -H 'sec-gpc: 1' \
    -H 'sec-fetch-site: none' \
    -H 'sec-fetch-mode: navigate' \
    -H 'sec-fetch-user: ?1' \
    -H 'sec-fetch-dest: document' \
    -H 'accept-language: en-US,en;q=0.9' \
    -H "cookie: $cookie" \
    -H 'dnt: 1' \
    --compressed
}

#######################################
# Build a loose regex from the last path segment of a URL.
# The segment is lower-cased, every character outside a-z becomes ".", and
# ".*" is placed before, between and after the characters.
# Arguments: $1 - URL (fragment and trailing slash are ignored)
# Outputs:   the pattern, e.g. ".../Races#Elf" gives ".*r.*a.*c.*e.*s.*"
#######################################
url_to_title_candidate() {
  local url="${1%#*}"
  url="${url%/}"
  local slug="${url##*/}"
  slug="$(
    printf '%s\n' "$slug" \
      | tr '[:upper:]' '[:lower:]' \
      | sed 's/[^a-z]/./g'
  )"
  local pattern=".*"
  local i
  for ((i = 0; i < ${#slug}; i++)); do
    pattern+="${slug:i:1}.*"
  done
  printf '%s\n' "$pattern"
}

#######################################
# Convert an HTML page on stdin to markdown, trimmed to start at its main
# heading.
# Arguments: $1 - URL of the page (used only to guess the heading)
# Outputs:   markdown on stdout, starting at the first "# <target>" line;
#            nothing when no such heading exists
# Returns:   1 when the pandoc/sed conversion pipeline fails
#######################################
html_to_markdown() {
  local url_title_candidate
  url_title_candidate="$(url_to_title_candidate "$1")"
  local out
  # The sed stages stay separate processes: stage 4 inserts newlines, and the
  # later stages must see those as real line breaks. In order they:
  #   1. drop lines that begin (after optional spaces) with ":::"
  #   2. drop everything from a line that is exactly "Share" to the end
  #   3. remove an empty "[](#id)" link directly after a heading marker
  #   4. replace the first "{#id ...}" block on a line with a blank line and
  #      a literal "</a>"
  #      NOTE(review): the captured id is unused - an opening <a ...> tag may
  #      have been lost from this replacement; confirm the intended output
  #   5. reduce "[text](target){attrs}" to "text"
  #   6. turn backslash-escaped quotes back into plain quotes
  if ! out="$(
    pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
      | sed '/^[ ]*:::.*/d' \
      | sed -e '/^Share$/,$d' \
      | sed 's/^\(\#\#*\) \[](#\([^)]*\))/\1 /' \
      | sed 's/{#\([^ ]*\)[^}]*}/\n\n<\/a>/' \
      | sed 's/\[\([^]]*\)]\(([^)]*)\)*\({[^}]*}\)*/\1/g' \
      | sed "s/\\\\\([\"']\)/\1/g"
  )"; then
    log "ERROR: html to markdown conversion failed for $1"
    return 1
  fi

  # Pick the heading the chapter starts at: a well-known section word first,
  # then a heading resembling the URL slug, then fall back to "Chapter".
  local target=""
  local kind
  for kind in Appendix Chapter Preface Introduction Credits Monsters; do
    if grep -q "^# $kind" <<< "$out"; then
      target="$kind"
      break
    fi
  done
  if [ -z "$target" ]; then
    if grep -qi '^# '"$url_title_candidate" <<< "$out"; then
      target="$(grep -i -m 1 '^# '"$url_title_candidate" <<< "$out" | sed 's/^#[ ]*//')"
      log WARNING: URL title candidate "$target"
    else
      log WARNING: NOTHING INDICITIVE OF TYPE OF CONTENT FOUND, including matching "$url_title_candidate"
      local line
      # Process substitution keeps a "no headings" grep status away from
      # set -e / pipefail.
      while IFS= read -r line; do
        log "candidate=$line"
      done < <(grep '^#' <<< "$out")
      target="Chapter"
    fi
  fi
  log "target=$target"
  # Print from the first line that starts with "# <target>" to the end. The
  # match is a literal prefix test, so heading text containing "/" or regex
  # characters is safe, and later matching headings are printed only once.
  printf '%s\n' "$out" \
    | HEADING="# $target" awk 'emit || index($0, ENVIRON["HEADING"]) == 1 { emit = 1; print }'
}

#######################################
# Block until at least `interval` seconds have passed since the previous call
# made under the same name, then record this call.
# Despite the function's name, the interval is 6 seconds.
# Arguments: $1 - name of the rate-limited resource (used as the stamp file)
#######################################
rate_limit_1s() {
  local name="$1"
  local d="${TMPDIR:-/tmp}/rate_limited"
  local interval=6
  mkdir -p "$d"
  local stamp="$d/$name"
  # Only consult the stamp when it exists; on a first run there is nothing
  # to wait for.
  if [ -f "$stamp" ]; then
    local last_run
    last_run="$(date -r "$stamp" +%s)"
    log "should sleep while $(($(date +%s) - last_run)) < $interval"
    while [ "$(($(date +%s) - last_run))" -lt "$interval" ]; do
      sleep 1
    done
  fi
  touch "$stamp"
}

# Run main only when executed directly, not when sourced.
if [[ "$0" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi