master
Bel LaPointe 2021-12-15 06:01:12 -07:00
parent 52623ed8d9
commit 95518fc3a8
1 changed files with 88 additions and 24 deletions

112
poc.sh
View File

@ -1,28 +1,88 @@
#! /bin/bash #! /bin/bash
main() { main() {
_main "$@" set -e
set -o pipefail
local output="$(mktemp -d)"
log "$output"
#list_chapters_in_book https://www.dndbeyond.com/sources/sdw
#scrape_book_chapter https://www.dndbeyond.com/sources/phb/races | less
#scrape_book_chapter https://www.dndbeyond.com/sources/phb/appendix-a-conditions | less
#scrape_book "$output" https://www.dndbeyond.com/sources/phb
for book in $(scrape_books); do
book="${book%/}"
log book=$book
scrape_book "$output/${book##*/}" "$book"
break
done
log "$output"
} }
_main() { log() {
for url in $( echo "$(date) > $*" >&2
scrape_dndbeyond https://www.dndbeyond.com/sources/phb \ }
| grep -o 'https:..www.dndbeyond.com.compendium[^"]*' \
# List the URL of every sourcebook linked from the D&D Beyond sources page.
#
# Arguments: none
# Outputs:   one absolute https://www.dndbeyond.com/... URL per line on stdout,
#            in the order the links appear in the fetched page
# Returns:   status of the final pipeline stage (or of any stage when the
#            caller has enabled pipefail)
scrape_books() {
  # All patterns are quoted so the shell can never glob-expand them against
  # files in the current directory before grep sees them.
  scrape_dndbeyond "https://www.dndbeyond.com/sources#Sourcebooks" \
    | grep 'class..sources-listing--item' \
    | grep 'href=.*sources' \
    | grep -o 'href="[^"]*' \
    | sed \
      -e 's#^href="##' \
      -e 's#^https\{0,1\}://www\.dndbeyond\.com##' \
      -e 's#^/##' \
      -e 's#^#https://www.dndbeyond.com/#'
  # sed steps: drop the attribute name, drop a site prefix that is already
  # present (otherwise it would be doubled), normalise the leading slash, then
  # prepend the site root so every line is an absolute URL.
}
# Scrape every chapter of one book into its own markdown file.
#
# Arguments:
#   $1 - output directory (created if missing)
#   $2 - URL of the book's landing page
# Outputs:
#   one file per chapter under $1, named after the first scraped line that
#   mentions "Chapter" or "Appendix" (leading '#' and surrounding spaces
#   removed); when no such line exists the last path segment of the chapter
#   URL is used instead. Progress is reported through log.
# Returns:
#   0 when the loop completes; non-zero if the output directory or the
#   temporary file cannot be created.
scrape_book() {
  local output="$1"
  local book_url="$2"
  local f chapters chapter title

  mkdir -p -- "$output" || return
  f="$(mktemp)" || return

  # Listing stays best-effort: a book without chapters yields an empty loop,
  # exactly as the word-split `for` did, instead of aborting the caller.
  chapters="$(list_chapters_in_book "$book_url")" || true

  # Read on fd 3 so nothing run inside the loop can swallow the URL list
  # from stdin; one URL per line, no word splitting or globbing.
  while IFS= read -r chapter <&3; do
    [[ -n "$chapter" ]] || continue
    log chapter="$chapter"
    scrape_book_chapter "$chapter" > "$f"

    # grep finding nothing (or head closing the pipe early) is not an error
    # here, so the pipeline status is deliberately ignored.
    title="$(
      grep -E 'Chapter|Appendix' -- "$f" \
        | head -n 1 \
        | sed -e 's/^#*//' -e 's/^ *//' -e 's/ *$//'
    )" || true

    # Without a heading the file would land in the directory under its random
    # temp name; fall back to the last path segment of the chapter URL.
    if [[ -z "$title" ]]; then
      title="${chapter%/}"
      title="${title##*/}"
    fi
    # A '/' in the heading would make mv target a non-existent subdirectory.
    title="${title//\//-}"

    mv -- "$f" "$output/$title"
  done 3<<< "$chapters"

  # The temp file has normally been moved away by the last iteration, so its
  # absence must not count as a failure (plain `rm` aborted under `set -e`).
  rm -f -- "$f"
}
# Fetch a single chapter page and convert it to markdown.
#
# Arguments: $1 - URL of the chapter page
# Outputs:   the converted chapter on stdout
# Returns:   status of the conversion stage (or of either stage when the
#            caller has enabled pipefail)
scrape_book_chapter() {
  local url="$1"
  # Quoted on purpose: URLs may contain '?', '*', '[' or whitespace, which
  # would otherwise be glob-expanded or split into several arguments.
  scrape_dndbeyond "$url" \
    | html_to_markdown
}
list_chapters_in_book() {
local book_url="${1%/}"
local raw="$(scrape_dndbeyond "$book_url")"
if echo "$raw" | grep -q 'https:..www.dndbeyond.com.compendium[^"]*'; then
echo "$raw" | grep -o 'https:..www.dndbeyond.com.compendium[^"]*'
elif echo "$raw" | grep -q "$book_url/"; then
echo "$raw" | grep -o "${book_url//\//.}\/[^\"]*"
else
echo "ERROR: FOUND NO CHAPTERS IN $book_url" >&2
return 1
fi \
| sed 's/#.*//' \ | sed 's/#.*//' \
| sort -u | sort -u
); do
echo $url
scrape_dndbeyond $url \
| html_to_markdown \
| less
break
done
} }
scrape_dndbeyond() { scrape_dndbeyond() {
rate_limit_1s scrape_dndbeyond rate_limit_1s scrape_dndbeyond
curl -L -sS \ curl -L -sS \
"${1:-https://www.dndbeyond.com/sources/phb}" \ "$1" \
--compressed \ --compressed \
-H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=NHdkEjOXnAEHojER86i0Egmnm85GLvifiQxURPISwVtzWLNwuqQbIk1Y/2MjheQqNP4CsBXhuc4UG5-qcmLjYw==:hGx3TOEjeVaJPMfrXXnmyG7-btpQhZutwvpVBKwEbNXTAkEQvgmIs/faKB6s8537Nivj6epH9HpOWYURQeazeMwSMReW6EMazW09Rci-PJI=; ResponsiveSwitch.DesktopMode=1; ddbSiteBanner:00523ca3-81a4-4d2a-8f86-9e40273af2e2=true; LoginState=2c011278-8a3d-4cf7-a141-221f42694333; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..vh9-_2G_aEjMvJ2PFAAE4w.eheZuOJLp-3NyiAJZzKqaSc91lH0ganCRsDZPYMGIG3265AppaHh0uMQLu_-P6WX.bHRi_gX8PKtQGZwVmNeb1Q; Preferences.Language=1; Preferences.TimeZoneID=1; ddbSiteBanner:d8879cae-c68e-4411-9ea0-a49d77fe5454=true; Ratings=null; _pxvid=f6abe72c-49a6-11ec-b844-57447254714c; ddbSiteBanner:fd981371-c501-4638-a64b-0d9a5c8da68b=true; Geo={%22region%22:%22ON%22%2C%22country%22:%22CA%22%2C%22continent%22:%22NA%22}; RequestVerificationToken=983cd9ce-c03c-4b8b-b6c1-a0561b39cb47; sublevel=ANON; Preferences=undefined; User.ID=109926924; User.Username=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"77896779-c672-42d1-903b-f6caba6ec599"}; pxcts=5eb1b230-525e-11ec-aa22-793bf1473016; WarningNotification.Lock=1' \ -H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=NHdkEjOXnAEHojER86i0Egmnm85GLvifiQxURPISwVtzWLNwuqQbIk1Y/2MjheQqNP4CsBXhuc4UG5-qcmLjYw==:hGx3TOEjeVaJPMfrXXnmyG7-btpQhZutwvpVBKwEbNXTAkEQvgmIs/faKB6s8537Nivj6epH9HpOWYURQeazeMwSMReW6EMazW09Rci-PJI=; ResponsiveSwitch.DesktopMode=1; ddbSiteBanner:00523ca3-81a4-4d2a-8f86-9e40273af2e2=true; LoginState=2c011278-8a3d-4cf7-a141-221f42694333; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..vh9-_2G_aEjMvJ2PFAAE4w.eheZuOJLp-3NyiAJZzKqaSc91lH0ganCRsDZPYMGIG3265AppaHh0uMQLu_-P6WX.bHRi_gX8PKtQGZwVmNeb1Q; Preferences.Language=1; Preferences.TimeZoneID=1; ddbSiteBanner:d8879cae-c68e-4411-9ea0-a49d77fe5454=true; Ratings=null; 
_pxvid=f6abe72c-49a6-11ec-b844-57447254714c; ddbSiteBanner:fd981371-c501-4638-a64b-0d9a5c8da68b=true; Geo={%22region%22:%22ON%22%2C%22country%22:%22CA%22%2C%22continent%22:%22NA%22}; RequestVerificationToken=983cd9ce-c03c-4b8b-b6c1-a0561b39cb47; sublevel=ANON; Preferences=undefined; User.ID=109926924; User.Username=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"77896779-c672-42d1-903b-f6caba6ec599"}; pxcts=5eb1b230-525e-11ec-aa22-793bf1473016; WarningNotification.Lock=1' \
-H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0' \ -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0' \
@ -42,14 +102,18 @@ scrape_dndbeyond() {
} }
html_to_markdown() { html_to_markdown() {
pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \ local out="$(
| sed '/^:::.*/d' \ pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
| sed -e '/^# Chapter/p' -e '0,/^# Chapter/d' \ | sed '/^:::.*/d' \
| sed -e '/^Share$/,$d' \ | sed -e '/^Share$/,$d' \
| sed 's/^\(\#\#*\) \[](#\([^)]*\))/\1 /' \ | sed 's/^\(\#\#*\) \[](#\([^)]*\))/\1 /' \
| sed 's/{#\([^ ]*\)[^}]*}/\n\n<a name="\1"><\/a>/' \ | sed 's/{#\([^ ]*\)[^}]*}/\n\n<a name="\1"><\/a>/' \
| sed 's/\[\([^]]*\)]\(([^)]*)\)*\({[^}]*}\)*/\1/g' \ | sed 's/\[\([^]]*\)]\(([^)]*)\)*\({[^}]*}\)*/\1/g' \
| sed "s/\\\\\([\"']\)/\1/g" | sed "s/\\\\\([\"']\)/\1/g"
)"
local target="$(echo "$out" | grep -q '^[#]* Appendix' && echo Appendix || echo Chapter)"
echo "$out" \
| sed -e '/^# '"$target"'/p' -e '0,/^# '"$target"'/d'
} }
rate_limit_1s() { rate_limit_1s() {
@ -58,14 +122,14 @@ rate_limit_1s() {
mkdir -p "$d" mkdir -p "$d"
local last_run="$(date -r "$d/$name" +%s)" local last_run="$(date -r "$d/$name" +%s)"
local now="$(date +%s)" local now="$(date +%s)"
local interval=2 local interval=3
echo "$(date +%s): should sleep while $(($(date +%s)-last_run)) < $interval" >&2 log "should sleep while $(($(date +%s)-last_run)) < $interval"
if [ -f "$d/$name" ]; then if [ -f "$d/$name" ]; then
while [ "$(($(date +%s)-last_run))" -lt $interval ]; do while [ "$(($(date +%s)-last_run))" -lt $interval ]; do
sleep 1 sleep 1
done done
fi fi
echo "$(date +%s): done sleeping" >&2 log "done sleeping"
touch "$d/$name" touch "$d/$name"
} }