#!/usr/bin/env bash
#
# Scrape D&D Beyond source books into Markdown files, one file per chapter.
#
# Usage: run the script directly; it takes no arguments.
#
# Environment:
#   TMPDIR                 Working directory for the cache, rate-limit stamps,
#                          cookie jar and results ("$TMPDIR/result").
#                          Defaults to /tmp/scrape.<SEED>.
#   SEED                   Suffix for the default TMPDIR; reuse the same value
#                          to resume an earlier run and reuse its cache.
#   SCRAPE_BOOKS_PATTERN   grep pattern restricting which book URLs are
#                          scraped (default: all).
#   DNDBEYOND_COOKIE       Value of the Cookie request header for a logged-in
#                          dndbeyond.com session. Never commit it to the file.
#
# Requires: pandoc, curl, readability (see ensure).

#######################################
# Entry point: scrape every book listed on the sources page.
# Globals:   TMPDIR (set by ensure, read here)
# Outputs:   progress on stderr; chapters under "$TMPDIR/result/<book>/"
#######################################
main() {
  ensure
  local output="$TMPDIR/result"
  mkdir -p "$output"
  log "$output"

  # Manual debugging entry points, kept for convenience:
  #set -x
  #list_chapters_in_book https://www.dndbeyond.com/sources/sacoc
  #list_chapters_in_book https://www.dndbeyond.com/sources/twbtw
  #list_chapters_in_book https://www.dndbeyond.com/sources/hftt
  #list_chapters_in_book https://www.dndbeyond.com/sources/sdw
  #should_scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits || true
  #scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits
  #scrape_book_chapter https://www.dndbeyond.com/sources/ai
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/krenkos-way
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/bestiary
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/azorius-senate
  #scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/basic-rules/appendix-b-gods-of-the-multiverse
  #return $?
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/phb/credits | less
  #scrape_book_chapter https://www.dndbeyond.com/sources/phb/races | less
  #scrape_book_chapter https://www.dndbeyond.com/sources/phb/appendix-a-conditions | less
  #scrape_book "$output" https://www.dndbeyond.com/sources/phb

  # Fetch the listing once. The trailing grep in scrape_books exits non-zero
  # when nothing matches, which is not fatal here.
  local books
  books="$(scrape_books)" || true
  log "books=$books"
  sleep 1

  # Read on fd 3 so that prompts inside scrape_book can still use stdin.
  local book
  while IFS= read -r book <&3; do
    [[ -n "$book" ]] || continue
    book="${book%/}"
    log "book=$book"
    scrape_book "$output/${book##*/}" "$book"
  done 3<<<"$books"

  log "$output"
}

#######################################
# Enable strict error handling, check required tools, prepare TMPDIR.
# Globals:   TMPDIR (exported), SEED (read)
# Returns:   1 if a required executable is missing
#######################################
ensure() {
  set -e
  set -o pipefail
  # NOTE(review): readability is only referenced from a disabled stage of
  # html_to_markdown; the check is kept from the original - confirm it is
  # still wanted.
  local exe
  for exe in pandoc readability curl; do
    if ! command -v "$exe" > /dev/null; then
      echo "$exe not installed" >&2
      return 1
    fi
  done
  export TMPDIR="${TMPDIR:-"/tmp/scrape.${SEED:-${RANDOM:-$(date +%s)}}"}"
  mkdir -p "$TMPDIR"
}

#######################################
# Write a timestamped message to stderr.
# Arguments: message words
#######################################
log() {
  printf '%s > %s\n' "$(date)" "$*" >&2
}

#######################################
# List absolute URLs of the books linked from the sources page.
# Globals:   SCRAPE_BOOKS_PATTERN (read)
# Outputs:   one URL per line on stdout
#######################################
scrape_books() {
  scrape_dndbeyond "https://www.dndbeyond.com/sources#Sourcebooks" \
    | grep 'class..sources-listing--item' \
    | grep 'href=.*sources' \
    | grep -o 'href="[^"]*' \
    | sed \
      -e 's/^href="//' \
      -e 's/^\///' \
      -e 's/^/\//' \
      -e 's/^/https:\/\/www.dndbeyond.com/' \
    | grep "${SCRAPE_BOOKS_PATTERN:-.*}"
}

#######################################
# Scrape every chapter of one book into a directory, one file per chapter.
# A ".wip" marker is kept in the directory while work is in progress; a
# directory that has content and no marker is treated as finished.
# Arguments: $1 - output directory
#            $2 - book URL
#######################################
scrape_book() {
  local output="$1"
  local book_url="$2"

  if [[ -d "$output" && ! -f "$output/.wip" ]] \
    && compgen -G "$output/*" > /dev/null; then
    log "$book_url already in $output"
    return 0
  fi

  mkdir -p "$output"
  touch "$output/.wip"

  # A failed listing already reports itself on stderr; carry on with no
  # chapters, as the original loop did.
  local chapters
  chapters="$(list_chapters_in_book "$book_url")" || true

  local f
  f="$(mktemp)"

  local chapter title
  while IFS= read -r chapter <&3; do
    [[ -n "$chapter" ]] || continue
    log "scraping book chapter $chapter of $book_url"
    if ! should_scrape_book_chapter "$chapter"; then
      continue
    fi
    scrape_book_chapter "$chapter" > "$f"

    log "scraping book title from $f"
    # The first Markdown heading becomes the file name: strip the #s, trim and
    # collapse spaces, replace anything non-alphanumeric with "_", and drop
    # whatever follows an (already underscored) "&mdash;" entity.
    title="$(
      grep -m 1 '^#' "$f" \
        | sed \
          -e 's/^#*//' \
          -e 's/^[ ]*//' \
          -e 's/[ ]*$//' \
          -e 's/[ ][ ]*/ /g' \
          -e 's/[^a-zA-Z0-9]/_/g' \
          -e 's/mdash.*//' \
          -e 's/Disclaimer.*/Disclaimer/g'
    )" || true

    if [[ -z "$title" ]]; then
      # Fall back to the last URL path segment, without any query string.
      title="${chapter%/}"
      title="${title##*/}"
      title="${title%%"?"*}"
      log "WARNING: NO TITLE FOUND IN $f ($chapter): $(head -n 3 "$f"); using $title from $chapter (enter to continue)"
      # Only wait for Enter when someone can press it; a bare read at EOF
      # would abort the whole run under set -e.
      if [[ -t 0 ]]; then
        read -r _ || true
      fi
    fi
    mv -- "$f" "$output/$title"
  done 3<<<"$chapters"

  rm -f -- "$f"
  rm "$output/.wip"
}

#######################################
# Decide whether a page is worth scraping: a page offering "add to cart" is
# treated as not purchased and its cached copy is dropped.
# Arguments: $1 - page URL
# Returns:   0 to scrape, 1 to skip
#######################################
should_scrape_book_chapter() {
  local url="$1"
  log "foo:should_scrape_book_chapter $url"
  # No grep -q here: an early exit would SIGPIPE the producer and, under
  # pipefail, turn a positive match into a failed pipeline.
  if scrape_dndbeyond "$url" | grep -i "add to cart" &> /dev/null; then
    log "not purchased: $url"
    _purge_cached_scrape_dndbeyond "$url"
    return 1
  fi
  return 0
}

#######################################
# Fetch one chapter and print it as Markdown.
# Arguments: $1 - chapter URL
#######################################
scrape_book_chapter() {
  local url="$1"
  log "foo:scrape_book_chapter $url"
  scrape_dndbeyond "$url" \
    | html_to_markdown "$url"
}

#######################################
# Print the chapter URLs linked from a book's landing page.
# Three link styles are tried in order: absolute compendium links, absolute
# links below the book URL, and site-relative links below the book path.
# Arguments: $1 - book URL
# Outputs:   sorted unique URLs without fragments, one per line
# Returns:   1 if the page cannot be fetched or no chapter link is found
#######################################
list_chapters_in_book() {
  local book_url="${1%/}"
  local book_domain="${book_url%%.com/*}.com"
  log "domain=$book_domain"
  local book_url_path="${book_url##*.com}"
  book_url_path="${book_url_path%#*}"

  if ! should_scrape_book_chapter "$book_url"; then
    return 0
  fi

  local raw
  if ! raw="$(scrape_dndbeyond "$book_url")"; then
    echo "ERROR: COULD NOT FETCH $book_url" >&2
    return 1
  fi

  # Slashes become "." so the URL can be dropped into a grep pattern.
  local url_re="${book_url//\//.}"
  local path_re="${book_url_path//\//.}"

  local links
  if grep -q 'href="https:..www.dndbeyond.com.compendium[^"]*' <<<"$raw"; then
    links="$(grep -o 'https:..www.dndbeyond.com.compendium[^"]*' <<<"$raw")"
  elif grep -q "href=\"$book_url/" <<<"$raw"; then
    links="$(grep -o "${url_re}\/[^\"]*" <<<"$raw")"
  elif grep -q "href=\"${path_re}\\/" <<<"$raw"; then
    links="$(
      grep -o "href=\"${path_re}\/[^\"]*" <<<"$raw" \
        | sed \
          -e 's/href=\"//' \
          -e "s/^/${book_domain//\//\\/}/g"
    )"
  else
    echo "ERROR: FOUND NO CHAPTERS IN $book_url" >&2
    return 1
  fi

  printf '%s\n' "$links" \
    | sed 's/#.*//' \
    | sort -u
}

#######################################
# Print a page, from the on-disk cache when possible, otherwise by fetching
# it and caching the response.
# Arguments: $1 - URL
# Returns:   non-zero if the page had to be fetched and the fetch failed
#######################################
scrape_dndbeyond() {
  log foo:scrape_dndbeyond "$@"
  local f
  f="$(_scrape_dndbeyond_cache_f "$@")"

  # Replay the cached bytes unchanged, so a cache hit produces the same
  # output as the original fetch. An empty file does not count as cached.
  if [[ -f "$f" ]] && grep -q . "$f"; then
    cat "$f"
    log "foo:scrape_dndbeyond: yes cached: $f: $*"
    return 0
  fi

  log "foo:scrape_dndbeyond: not cached: $f: $*"
  if ! _scrape_dndbeyond "$@" | tee "$f"; then
    # Do not let a truncated response be served as a cache hit later on.
    rm -f -- "$f"
    return 1
  fi
}

#######################################
# Remove the cached copy of a page, if there is one.
# Arguments: $1 - URL
#######################################
_purge_cached_scrape_dndbeyond() {
  local f
  f="$(_scrape_dndbeyond_cache_f "$@")"
  rm -f -- "$f"
}

#######################################
# Print the cache file path for a URL, creating the cache directory.
# Arguments: $1 - URL
# Globals:   TMPDIR (read)
#######################################
_scrape_dndbeyond_cache_f() {
  local d="${TMPDIR}/scrape_cache"
  mkdir -p "$d"
  # base64 output may contain "/", which would turn the name into a path
  # through a directory that does not exist; map it to "_".
  printf '%s/%s\n' "$d" "$(printf '%s\n' "$*" | base64 | tr -d '\n' | tr '/' '_')"
}

#######################################
# Fetch a URL with curl using browser-like request headers, after waiting
# for the rate limiter.
# Globals:   TMPDIR (read; cookie jar at "$TMPDIR/cookies.txt")
#            DNDBEYOND_COOKIE (read; session cookie header value)
# Arguments: $1 - URL
# Outputs:   response body on stdout
# Returns:   curl's exit status
#######################################
_scrape_dndbeyond() {
  log foo:_scrape_dndbeyond "$@"
  rate_limit_1s scrape_dndbeyond
  touch "$TMPDIR/cookies.txt"

  local -a curl_args=(
    -L -sS "$1"
    -H 'authority: www.dndbeyond.com'
    -H 'cache-control: max-age=0'
    -H 'upgrade-insecure-requests: 1'
    -H 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    -H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    -H 'sec-gpc: 1'
    -H 'sec-fetch-site: none'
    -H 'sec-fetch-mode: navigate'
    -H 'sec-fetch-user: ?1'
    -H 'sec-fetch-dest: document'
    -H 'accept-language: en-US,en;q=0.9'
    -H 'dnt: 1'
    -b "$TMPDIR/cookies.txt"
    -c "$TMPDIR/cookies.txt"
    --compressed
  )

  # The session cookie used to be hardcoded here. It is a credential, so it
  # now comes from the environment and reaches curl on stdin ("-H @-"),
  # keeping it out of the source and out of the process argument list.
  if [[ -n "${DNDBEYOND_COOKIE:-}" ]]; then
    curl "${curl_args[@]}" -H @- <<<"Cookie: ${DNDBEYOND_COOKIE}"
  else
    log "WARNING: DNDBEYOND_COOKIE is not set; fetching $1 without a session cookie"
    curl "${curl_args[@]}"
  fi
}

#######################################
# Turn the last path segment of a URL into a loose regex: lower-cased, every
# character outside a-z replaced by ".", and ".*" placed around each
# character.
# Arguments: $1 - URL
# Outputs:   the pattern on stdout
#######################################
url_to_title_candidate() {
  local url="${1%#*}"
  url="${url%/}"
  local slug
  slug="$(
    printf '%s\n' "${url##*/}" \
      | tr '[:upper:]' '[:lower:]' \
      | sed 's/[^a-z]/./g'
  )"

  local pattern=".*"
  local i
  # Stop at the last character; walking one index further only appended a
  # redundant ".*".
  for ((i = 0; i < ${#slug}; i++)); do
    pattern+="${slug:i:1}.*"
  done
  printf '%s\n' "$pattern"
}

#######################################
# Convert a D&D Beyond HTML page (stdin) to Markdown (stdout).
# Only the part of the page between the first line matching CONTENT and the
# next line matching FOOTER is converted.
# Arguments: $1 - page URL; its last path segment is the fallback title
#######################################
html_to_markdown() {
  local url="$1"
  local f
  f="$(mktemp)"
  log "url=$url"
  cat > "$f"

  # NOTE(review): in the copy of this script under review, every regex that
  # held a literal HTML tag had the tag stripped out. The stages listed below
  # were left destructive, syntactically invalid or unable to match, so they
  # are disabled here instead of being guessed at - restore them from version
  # control. What survives of each, in original order (the perl stages ran on
  # text whose newlines had been swapped for \r):
  #   a. perl, unwrap an element: pattern remnant "]*>(.*?)" -> "\1". The tag
  #      name is lost; as written it deleted every ">" in the page.
  #   b. perl, pattern remnant "\r([^<]*)" ... "\r" -> "\r" ... "\1" ... "\r",
  #      with the surrounding tags lost.
  #   c. perl, replace an element by its second capture group ("(.*?)" ->
  #      "\2"); the tag and the first group are lost.
  #   d. perl, delete an element: remnant "\r*(?:(?!).)+\r*"; the closing tag
  #      inside the negative lookahead is lost, so it could never match.
  #   e. stages c and d ran a second time after the colspan removal.
  # Three commented-out perl stages for "stat-block-ability-scores-data",
  # "-heading" and "-stat" elements and a commented-out
  #   readability "file://$(realpath $f)"
  # stage were already disabled in the original and lost their tags too.
  #
  # NOTE(review): "<br>" below is a best guess for a tag of which only a line
  # break survived - confirm. "<div>" is inferred from the surviving
  # "<\/div>" in the same expression.
  #
  # NOTE(review): --atx-headers is a deprecated spelling in recent pandoc
  # releases and may be rejected by the newest ones - check it against the
  # installed version.
  local clean
  clean="$(
    awk '/CONTENT/,/FOOTER/' "$f" \
      | sed \
        -e 's/<br>/ /g' \
        -e 's/colspan="[^"]*"//g' \
      | pandoc \
        -f html \
        -t markdown+pipe_tables-simple_tables-multiline_tables \
        --atx-headers \
        --ascii \
        --toc \
        --wrap=none \
        --strip-comments \
        -o - \
      | sed \
        -e 's/{[^}]*data[^}]*}//g' \
        -e 's/\[](#[^)]*)//' \
        -e 's/{[^}]*}//' \
        -e 's/ *$//' \
      | tr '\n' '\r' \
      | sed 's/<div>\(.*\)<\/div>/\1/g' \
      | tr '\r' '\n'
  )" || log "WARNING: HTML conversion failed for $url"

  # NOTE(review): "<title" is a best guess - the element name in this
  # grep/sed pair was lost as well; confirm against version control.
  #
  # pandoc runs with --ascii, so dashes and similar characters reach this
  # point as named entities. The copy under review had the decoded characters
  # in these expressions; both spellings are handled.
  {
    grep -o '<title[^<]*' "$f" \
      | sed \
        -e 's/<title[^>]*>//' \
        -e 's/-.*//' \
        -e 's/^/# /' \
      | grep . \
      || echo "# ${url##*/}"
    echo
    printf '%s\n' "$clean"
  } \
    | tr '\n' '\r' \
    | sed \
      -e 's/\r\r:::\r\r/\r/g' \
      -e 's/\r\r:::\r*$//' \
      -e 's/::://g' \
      -e 's/&.dquo;/"/g' \
      -e "s/&.squo;/'/g" \
      -e 's/&mdash;/--/g' \
      -e $'s/\xe2\x80\x94/--/g' \
      -e 's/&ndash;/-/g' \
      -e $'s/\xe2\x80\x93/-/g' \
      -e 's/&times;/*/g' \
      -e $'s/\xc3\x97/*/g' \
      -e 's/&shy;//g' \
      -e $'s/\xc2\xad//g' \
      -e 's/{\.[^}]*}//g' \
    | tr '\r' '\n' \
    | sed \
      -e 's/===+/===|/g' \
      -e 's/+===/|===/g' \
      -e 's/---+/---|/g' \
      -e 's/+---/|---/g' \
    | grep -v '^|-[-|]*-|$' \
    | sed '/^|[=|]*|$/s/=/-/g'
  # Disabled in the original:
  #| sed 's/:::\(.*\):::/```\1```/g' \

  rm -f -- "$f"
  return $?
}

#######################################
# Block until at least 6 seconds have passed since the previous call made
# with the same name, then record the current time for that name.
# (The "1s" in the function name does not match the interval; the name is
# kept for callers.)
# Arguments: $1 - limiter name
# Globals:   TMPDIR (read; stamp files under "$TMPDIR/rate_limited")
#######################################
rate_limit_1s() {
  local name="$1"
  local d="${TMPDIR}/rate_limited"
  local stamp="$d/$name"
  local interval=6
  local last_run=0
  mkdir -p "$d"

  # Only ask for the stamp's mtime once the stamp exists; on the first call
  # there is nothing to wait for.
  if [[ -f "$stamp" ]]; then
    last_run="$(date -r "$stamp" +%s)" || last_run=0
  fi

  log "should sleep while $(($(date +%s) - last_run)) < $interval"
  while (($(date +%s) - last_run < interval)); do
    sleep 1
  done
  touch "$stamp"
}

if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
  main "$@"
fi