292 lines
10 KiB
Bash
292 lines
10 KiB
Bash
#! /bin/bash
|
|
|
|
#######################################
# Entry point: scrape every sourcebook returned by scrape_books into one
# sub-directory per book.
# Globals:   TMPDIR (exported by ensure; base of the default output directory)
# Arguments: $1 - output directory (optional; defaults to "$TMPDIR/result")
# Outputs:   progress messages on stderr via log
# Returns:   ensure enables `set -e`, so a failing step aborts the script
#######################################
main() {
  ensure

  local output="${1:-"$TMPDIR/result"}"
  mkdir -p "$output"
  log "$output"

  # Manual debugging entry points, kept for reference:
  #set -x

  #list_chapters_in_book https://www.dndbeyond.com/sources/sacoc
  #list_chapters_in_book https://www.dndbeyond.com/sources/twbtw
  #list_chapters_in_book https://www.dndbeyond.com/sources/hftt
  #list_chapters_in_book https://www.dndbeyond.com/sources/sdw

  #should_scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits || true
  #scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits

  #scrape_book_chapter https://www.dndbeyond.com/sources/ai
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/krenkos-way
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/bestiary
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/azorius-senate
  #scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists
  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/basic-rules/appendix-b-gods-of-the-multiverse
  #return $?

  #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/phb/credits | less

  #scrape_book_chapter https://www.dndbeyond.com/sources/phb/races | less
  #scrape_book_chapter https://www.dndbeyond.com/sources/phb/appendix-a-conditions | less

  #scrape_book "$output" https://www.dndbeyond.com/sources/phb

  # Read the book list exactly once, one URL per line, so entries are never
  # word-split or glob-expanded and the listing is not fetched a second time.
  local -a books=()
  local book
  while IFS= read -r book; do
    if [ -n "$book" ]; then
      books+=("$book")
    fi
  done < <(scrape_books)

  log "books=${books[*]}"
  sleep 1

  for book in "${books[@]}"; do
    book="${book%/}"
    log "book=$book"
    # The last path component of the book URL names the book's directory.
    scrape_book "$output/${book##*/}" "$book"
  done

  log "$output"
}
|
|
|
|
#######################################
# Enable strict error handling, verify required tools and prepare TMPDIR.
# Globals:   SEED (read, optional) - fixed suffix for the working directory,
#              so the same directory is selected again on a later run
#            TMPDIR (exported) - set to /tmp/scrape.<SEED | RANDOM | epoch>
# Arguments: none
# Outputs:   "<tool> not installed" on stderr when a tool is missing;
#            nothing on stdout
# Returns:   1 if pandoc or readability is not on PATH, 0 otherwise
#######################################
ensure() {
  # These options stay in effect for the rest of the script.
  set -e
  set -o pipefail

  local exe
  for exe in pandoc readability; do
    # `command -v` is the portable lookup; its output is discarded so the
    # resolved paths do not end up on the script's stdout.
    if ! command -v "$exe" > /dev/null; then
      echo "$exe not installed" >&2
      return 1
    fi
  done

  export TMPDIR="/tmp/scrape.${SEED:-${RANDOM:-$(date +%s)}}"
  mkdir -p "$TMPDIR"
}
|
|
|
|
#######################################
# Write one timestamped diagnostic line to stderr.
# Arguments: $@ - message words; they are joined into a single string
# Outputs:   "<date output> > <message>" on stderr, nothing on stdout
#######################################
log() {
  printf '%s > %s\n' "$(date)" "$*" 1>&2
}
|
|
|
|
#######################################
# List the sourcebook URLs found on the D&D Beyond sources page.
# Arguments: none
# Outputs:   one URL per line on stdout, each prefixed with
#            https://www.dndbeyond.com/
# Returns:   status of the pipeline (non-zero under pipefail if a stage fails,
#            e.g. when no listing item matches)
#######################################
scrape_books() {
  # All patterns are quoted: an unquoted `href=.*sources` is a glob and would
  # be replaced by a matching file name in the current directory.
  scrape_dndbeyond "https://www.dndbeyond.com/sources#Sourcebooks" \
    | grep 'class..sources-listing--item' \
    | grep 'href=.*sources' \
    | grep -o 'href="[^"]*' \
    | sed \
      -e 's/^href="//' \
      -e 's|^/||' \
      -e 's|^|https://www.dndbeyond.com/|'
}
|
|
|
|
#######################################
# Scrape every accessible chapter of one book into a directory, one file per
# chapter named after the chapter's first markdown heading.
# Arguments: $1 - output directory for this book
#            $2 - book URL
# Outputs:   progress on stderr via log; chapter files under $1
# Returns:   0 when the book is already present or was scraped completely;
#            1 when a chapter has no heading. In that case "$1/.wip" is left
#            in place so the next run starts the book again.
#######################################
scrape_book() {
  local output="$1"
  local book_url="$2"

  # A non-empty directory without the .wip marker is a finished book.
  if [ -d "$output" ] && ls "$output"/* &> /dev/null && [ ! -f "$output/.wip" ]; then
    log "$book_url already in $output"
    return
  fi

  mkdir -p "$output"
  touch "$output/.wip"

  local f
  f="$(mktemp)"

  # The chapter list is collected up front. A listing failure yields an empty
  # list instead of aborting, as it did when the call sat in the loop header.
  local chapters
  chapters="$(list_chapters_in_book "$book_url")" || true

  local chapter title
  # The list is read on fd 3 so nothing in the loop body can consume it
  # through stdin.
  while IFS= read -r chapter <&3; do
    if [ -z "$chapter" ]; then
      continue
    fi
    log "scraping book chapter $chapter of $book_url"
    if ! should_scrape_book_chapter "$chapter"; then
      continue
    fi
    scrape_book_chapter "$chapter" > "$f"
    log "scraping book title from $f"
    # First line starting with '#': drop the '#' run and surrounding spaces.
    title="$(sed -n '/^#/{s/^#*//;s/^ *//;s/ *$//;p;q;}' "$f")"
    if [ -z "$title" ]; then
      # $f is deliberately left behind: the message points at it.
      log "WARNING: NO TITLE FOUND IN $f ($chapter): $(head -n 3 "$f")"
      return 1
    fi
    # A '/' in the heading would be read as a directory separator by mv.
    mv "$f" "$output/${title//\//-}"
  done 3<<< "$chapters"

  rm -f "$f"
  rm "$output/.wip"
}
|
|
|
|
#######################################
# Decide whether a page is worth scraping: a page containing "add to cart"
# (any letter case) is treated as not purchased.
# Arguments: $1 - page URL
# Outputs:   progress on stderr via log
# Returns:   1 when the page looks unpurchased (its cached copy is purged),
#            0 otherwise
#######################################
should_scrape_book_chapter() {
  local url="$1"
  # Quoted so a URL containing `?` or `*` is logged verbatim, not globbed.
  log "foo:should_scrape_book_chapter $url"

  # grep's output is discarded rather than using -q, so grep reads the whole
  # page; presumably this avoids a broken-pipe failure of the producer under
  # pipefail.
  if scrape_dndbeyond "$url" | grep -i "add to cart" > /dev/null 2>&1; then
    log "not purchased: $url"
    _purge_cached_scrape_dndbeyond "$url"
    return 1
  fi
}
|
|
|
|
#######################################
# Fetch one chapter page and convert it to markdown.
# Arguments: $1 - chapter URL
# Outputs:   output of html_to_markdown on stdout; progress on stderr via log
# Returns:   status of the fetch/convert pipeline
#######################################
scrape_book_chapter() {
  local url="$1"
  # Quoted so a URL containing `?` or `*` is logged verbatim, not globbed.
  log "foo:scrape_book_chapter $url"
  scrape_dndbeyond "$url" | html_to_markdown "$url"
}
|
|
|
|
#######################################
# List the chapter URLs linked from a book's landing page.
# Three link styles are tried in order:
#   1. absolute https://www.dndbeyond.com/compendium/... links
#   2. absolute links below the book URL itself
#   3. links relative to the site root, below the book's path
# Arguments: $1 - book URL (a trailing slash is ignored)
# Outputs:   unique chapter URLs, sorted, one per line, fragments removed;
#            nothing when the book fails should_scrape_book_chapter
# Returns:   1 (with an ERROR line on stderr) when no chapter link is found,
#            whether or not pipefail is enabled; 0 otherwise
#######################################
list_chapters_in_book() {
  local book_url="${1%/}"
  local book_domain="${book_url%%.com/*}.com"
  log "domain=$book_domain"
  local book_url_path="${book_url##*.com}"
  book_url_path="${book_url_path%#*}"

  if ! should_scrape_book_chapter "$book_url"; then
    return
  fi

  # A failed fetch leaves whatever was received; if that holds no links the
  # function ends in the ERROR branch below.
  local raw
  raw="$(scrape_dndbeyond "$book_url")" || true

  # '/' is replaced by '.' so the URLs can be dropped into grep patterns.
  local url_re="${book_url//\//.}"
  local path_re="${book_url_path//\//.}"
  local links

  if grep -q 'href="https:..www.dndbeyond.com.compendium[^"]*' <<< "$raw"; then
    links="$(grep -o 'https:..www.dndbeyond.com.compendium[^"]*' <<< "$raw")"
  elif grep -q "href=\"$book_url/" <<< "$raw"; then
    links="$(grep -o "$url_re/[^\"]*" <<< "$raw")"
  elif grep -q "href=\"$path_re/" <<< "$raw"; then
    links="$(
      grep -o "href=\"$path_re/[^\"]*" <<< "$raw" \
        | sed -e 's/^href="//' -e "s|^|$book_domain|"
    )"
  else
    echo "ERROR: FOUND NO CHAPTERS IN $book_url" >&2
    return 1
  fi

  printf '%s\n' "$links" \
    | sed 's/#.*//' \
    | sort -u
}
|
|
|
|
#######################################
# Fetch a page through an on-disk cache.
# Arguments: $@ - passed to _scrape_dndbeyond; also forms the cache key
# Outputs:   page content on stdout; progress on stderr via log
# Returns:   0 on a cache hit, otherwise the status of the fetch pipeline
#######################################
scrape_dndbeyond() {
  log "foo:scrape_dndbeyond $*"

  local f
  f="$(_scrape_dndbeyond_cache_f "$@")"

  # A cache file counts only if it has at least one non-empty line. It is
  # replayed with cat so blank lines survive and a cache hit returns exactly
  # what the original fetch returned.
  if [ -f "$f" ] && grep -q . "$f"; then
    cat "$f"
    log "foo:scrape_dndbeyond: yes cached: $f: $*"
    return
  fi

  log "foo:scrape_dndbeyond: not cached: $f: $*"
  _scrape_dndbeyond "$@" | tee "$f"
}
|
|
|
|
#######################################
# Drop the cached copy of a page, if one exists.
# Arguments: $@ - the same arguments the page was fetched with
# Returns:   0 when there is no cache file, otherwise the status of rm
#######################################
_purge_cached_scrape_dndbeyond() {
  local cache_file="$(_scrape_dndbeyond_cache_f "$@")"
  [ ! -f "$cache_file" ] || rm -f "$cache_file"
}
|
|
|
|
#######################################
# Map fetch arguments to the path of their cache file.
# Globals:   TMPDIR (read) - the cache lives in "$TMPDIR/scrape_cache"
# Arguments: $@ - fetch arguments; joined into one string to form the key
# Outputs:   cache file path on stdout (the directory is created on demand)
#######################################
_scrape_dndbeyond_cache_f() {
  local d="${TMPDIR}/scrape_cache"
  mkdir -p "$d"

  # The key is the base64 of the joined arguments plus a newline. Base64 can
  # emit '/', which would point into a sub-directory that does not exist, so
  # it is mapped to '_'. Keys without '/' are unaffected.
  local key
  key="$(printf '%s\n' "$*" | base64 | tr -d '\n' | tr '/' '_')"
  printf '%s\n' "$d/$key"
}
|
|
|
|
#######################################
# Fetch one page with curl, uncached, using browser-like request headers.
# Globals:   DNDBEYOND_COOKIE (read, optional) - value of the Cookie header
#              for a logged-in session. Session cookies are credentials and
#              must not be committed to the script. When unset, the request
#              is sent without a Cookie header and a warning is logged.
# Arguments: $1 - URL to fetch
# Outputs:   response body on stdout; progress on stderr via log
# Returns:   exit status of curl
#######################################
_scrape_dndbeyond() {
  log "foo:_scrape_dndbeyond $*"
  rate_limit_1s scrape_dndbeyond

  local -a auth=()
  if [ -n "${DNDBEYOND_COOKIE:-}" ]; then
    # NOTE: the cookie is passed on curl's command line and is therefore
    # visible in the process list while the request runs.
    auth=(-H "Cookie: $DNDBEYOND_COOKIE")
  else
    log "WARNING: DNDBEYOND_COOKIE is not set; requesting $1 without a session"
  fi

  curl -L -sS "$1" \
    -H 'authority: www.dndbeyond.com' \
    -H 'cache-control: max-age=0' \
    -H 'upgrade-insecure-requests: 1' \
    -H 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36' \
    -H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
    -H 'sec-gpc: 1' \
    -H 'sec-fetch-site: none' \
    -H 'sec-fetch-mode: navigate' \
    -H 'sec-fetch-user: ?1' \
    -H 'sec-fetch-dest: document' \
    -H 'accept-language: en-US,en;q=0.9' \
    "${auth[@]}" \
    -H 'dnt: 1' \
    --compressed
}
|
|
|
|
#######################################
# Turn the last path segment of a URL into a loose regular expression:
# the segment is lower-cased, every character outside a-z becomes '.', and
# '.*' is placed before, between and after the characters.
# Arguments: $1 - URL (the part from the last '#' on and one trailing '/'
#                 are ignored)
# Outputs:   the pattern on stdout, e.g. ".../phb/Ab-1" -> ".*a.*b.*..*..*"
#######################################
url_to_title_candidate() {
  local url="${1%#*}"
  url="${url%/}"

  local slug="${url##*/}"
  slug="$(
    printf '%s\n' "$slug" \
      | tr '[:upper:]' '[:lower:]' \
      | sed 's/[^a-z]/./g'
  )"

  # Iterate over the characters only (0 .. length-1); going one index further
  # would append a redundant second '.*' at the end.
  local pattern=".*"
  local i
  for ((i = 0; i < ${#slug}; i++)); do
    pattern+="${slug:i:1}.*"
  done

  printf '%s\n' "$pattern"
}
|
|
|
|
#######################################
# Convert an HTML page read from stdin to markdown.
# The page is passed through readability and pandoc, then attribute blocks
# containing "data" and the first empty "[](#...)" link on each line are
# removed. If a line starting with "----" is found, the line before it is
# emitted first as a "# " heading, followed by the full markdown.
# Arguments: $1 - URL of the page (used in log messages only)
# Outputs:   markdown on stdout; progress on stderr via log
# Returns:   1 if the conversion pipeline fails, 0 otherwise
#######################################
html_to_markdown() {
  local url="$1"
  local f
  f="$(mktemp)"

  log "url=$url"
  log "1: url=$url"
  cat > "$f"
  log "2: url=$url"

  #grep '<title>' "$f" | sed 's/.*>\([^<]*\)<.title>.*/# \1/g'

  # Declared separately from the assignment so a failing converter is noticed
  # instead of silently producing an empty document.
  local clean
  if ! clean="$(
    readability "file://$(realpath "$f")" < "$f" \
      | pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
      | sed \
        -e 's/{[^}]*data[^}]*}//g' \
        -e 's/\[](#[^)]*)//'
  )"; then
    rm -f "$f"
    log "ERROR: could not convert $url to markdown"
    return 1
  fi

  log "3: url=$url"
  rm -f "$f"
  log "4: url=$url"

  # read without IFS= trims surrounding blanks, which keeps the heading tidy.
  # Reading from a here-string means the loop can stop at the first match
  # without leaving a writer blocked on a pipe.
  local line lastline=""
  while read -r line; do
    if [[ "$line" == ----* ]]; then
      printf '# %s\n\n' "$lastline"
      break
    fi
    lastline="$line"
  done <<< "$clean"

  log "5: url=$url"
  printf '%s\n' "$clean"
  log "6: url=$url"
}
|
|
|
|
#######################################
# Block until a minimum interval has passed since the previous call made
# under the same name, then record the current call.
# Globals:   TMPDIR (read) - stamp files live in "$TMPDIR/rate_limited"
# Arguments: $1 - name identifying the rate-limited action
#            $2 - minimum interval in seconds (optional, default 6)
# Outputs:   progress on stderr via log
#######################################
rate_limit_1s() {
  local name="$1"
  local interval="${2:-6}"
  local d="${TMPDIR}/rate_limited"
  mkdir -p "$d"
  local stamp="$d/$name"

  # The stamp's modification time is the time of the previous call. On first
  # use there is no stamp, so there is nothing to read or wait for.
  if [ -f "$stamp" ]; then
    local last_run
    last_run="$(date -r "$stamp" +%s)"
    log "should sleep while $(($(date +%s) - last_run)) < $interval"
    while [ "$(($(date +%s) - last_run))" -lt "$interval" ]; do
      sleep 1
    done
  fi

  touch "$stamp"
}
|
|
|
|
# Run main only when this file is executed directly, not when it is sourced.
if [[ "$0" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi
|