support wilder matches from url and cache
parent
08f17f56b6
commit
1d21cead02
89
poc.sh
89
poc.sh
|
|
@ -5,12 +5,17 @@ main() {
|
||||||
set -e
|
set -e
|
||||||
set -o pipefail
|
set -o pipefail
|
||||||
|
|
||||||
|
|
||||||
local output="${1:-$(mktemp -d)}"
|
local output="${1:-$(mktemp -d)}"
|
||||||
log "$output"
|
log "$output"
|
||||||
|
|
||||||
#set -x
|
#set -x
|
||||||
|
|
||||||
|
#should_scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits || true
|
||||||
|
#scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits
|
||||||
|
|
||||||
|
#scrape_book_chapter https://www.dndbeyond.com/sources/ai
|
||||||
|
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/krenkos-way
|
||||||
|
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/bestiary
|
||||||
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/azorius-senate
|
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/azorius-senate
|
||||||
#scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e
|
#scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e
|
||||||
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists
|
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists
|
||||||
|
|
@ -64,6 +69,9 @@ scrape_book() {
|
||||||
|
|
||||||
for chapter in $(list_chapters_in_book "$book_url"); do
|
for chapter in $(list_chapters_in_book "$book_url"); do
|
||||||
log scraping book chapter $chapter of $book_url
|
log scraping book chapter $chapter of $book_url
|
||||||
|
if ! should_scrape_book_chapter "$chapter"; then
|
||||||
|
continue
|
||||||
|
fi
|
||||||
scrape_book_chapter "$chapter" > "$f"
|
scrape_book_chapter "$chapter" > "$f"
|
||||||
log scraping book title from $f
|
log scraping book title from $f
|
||||||
local title="$(grep '^\#' "$f" | head -n 1 | sed 's/^\#*//' | sed 's/^[ ]*//' | sed 's/[ ]*$//')"
|
local title="$(grep '^\#' "$f" | head -n 1 | sed 's/^\#*//' | sed 's/^[ ]*//' | sed 's/[ ]*$//')"
|
||||||
|
|
@ -80,10 +88,20 @@ scrape_book() {
|
||||||
rm "$output/.wip"
|
rm "$output/.wip"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Decide whether a chapter page should be scraped.
#
# Arguments: $1 - URL of the chapter page
# Returns:   1 when the fetched page contains "add to cart" (matched
#            case-insensitively; logged as "not purchased"), 0 otherwise.
should_scrape_book_chapter() {
    local url="$1"
    log foo:should_scrape_book_chapter "$url"

    # Capture the page before inspecting it. Piping the fetch straight into
    # grep makes the verdict depend on the fetcher's exit status once
    # `set -o pipefail` is active: a non-zero fetch status would hide a
    # successful "add to cart" match and report the chapter as scrapeable.
    # The fetch status is deliberately ignored here; only the captured text
    # decides the outcome.
    local page
    page="$(scrape_dndbeyond "$url")" || true

    if grep -i "add to cart" <<< "$page" &> /dev/null; then
        log "not purchased: $url"
        return 1
    fi
    return 0
}
|
||||||
|
|
||||||
# Fetch one chapter page and convert it to markdown.
#
# Arguments: $1 - URL of the chapter page
# Outputs:   whatever html_to_markdown writes to stdout for that page
scrape_book_chapter() {
    local url="$1"
    log foo:scrape_book_chapter "$url"
    # Quote the URL at every use so it always reaches both stages as exactly
    # one argument (no word splitting or glob expansion on '?' / '*').
    scrape_dndbeyond "$url" \
        | html_to_markdown "$url"
}
|
||||||
|
|
||||||
list_chapters_in_book() {
|
list_chapters_in_book() {
|
||||||
|
|
@ -103,7 +121,37 @@ list_chapters_in_book() {
|
||||||
}
|
}
|
||||||
|
|
||||||
scrape_dndbeyond() {
|
# Fetch a page through an on-disk cache.
#
# Arguments: "$@" - forwarded unchanged to _scrape_dndbeyond; the joined
#                   argument string is also the cache key.
# Globals:   TMPDIR (read) - cache lives in "${TMPDIR:-/tmp}/scrape_cache"
# Outputs:   the page body on stdout (identical for cache hits and misses)
# Returns:   0 on success, otherwise the exit status of _scrape_dndbeyond
scrape_dndbeyond() {
    log foo:scrape_dndbeyond "$@"

    local cache_dir="${TMPDIR:-/tmp}/scrape_cache"
    mkdir -p "$cache_dir"

    # Raw base64 is not a safe file name: its alphabet contains '/' (e.g. a
    # key of "ab?" encodes to "YWI/Cg==") and some implementations wrap long
    # output with newlines. Join the lines and switch to the URL-safe
    # alphabet so every key maps to a single path component.
    local key
    key="$(printf '%s\n' "$*" | base64 | tr -d '\n' | tr '/+' '_-')"
    local cache_file="$cache_dir/$key"

    # A cache entry counts only if it holds at least one non-empty line.
    # Replay it verbatim so a hit yields the same bytes as a fresh fetch
    # (filtering through `grep .` would drop blank lines).
    if [ -f "$cache_file" ] && grep -q . "$cache_file"; then
        cat "$cache_file"
        return
    fi

    # Download to a scratch file first and publish it only on success, so a
    # failed or interrupted fetch never leaves a partial page in the cache.
    local scratch="$cache_file.tmp.$$"
    local rc=0
    _scrape_dndbeyond "$@" > "$scratch" || rc=$?
    if [ "$rc" -ne 0 ]; then
        rm -f "$scratch"
        return "$rc"
    fi
    mv "$scratch" "$cache_file"
    cat "$cache_file"
}
|
||||||
|
|
||||||
|
_scrape_dndbeyond() {
|
||||||
|
log foo:_scrape_dndbeyond $@
|
||||||
rate_limit_1s scrape_dndbeyond
|
rate_limit_1s scrape_dndbeyond
|
||||||
|
curl -L -sS "$1" \
|
||||||
|
-H 'authority: www.dndbeyond.com' \
|
||||||
|
-H 'cache-control: max-age=0' \
|
||||||
|
-H 'upgrade-insecure-requests: 1' \
|
||||||
|
-H 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36' \
|
||||||
|
-H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
|
||||||
|
-H 'sec-gpc: 1' \
|
||||||
|
-H 'sec-fetch-site: none' \
|
||||||
|
-H 'sec-fetch-mode: navigate' \
|
||||||
|
-H 'sec-fetch-user: ?1' \
|
||||||
|
-H 'sec-fetch-dest: document' \
|
||||||
|
-H 'accept-language: en-US,en;q=0.9' \
|
||||||
|
-H 'cookie: Preferences=undefined; Preferences=undefined; Geo={%22region%22:%22UT%22%2C%22country%22:%22US%22%2C%22continent%22:%22NA%22}; ResponsiveSwitch.DesktopMode=1; sublevel=ANON; Preferences=undefined; ddbSiteBanner:fd981371-c501-4638-a64b-0d9a5c8da68b=true; G_ENABLED_IDPS=google; User.ID=109926924; User.Username=squeaky2x3; Preferences.Language=1; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"5993614e-774f-4b55-89de-c88120abea59"}; RequestVerificationToken=6dc61cff-204f-45a0-a39f-edcea6a23072; Preferences.TimeZoneID=1; LoginState=aa3ada1f-7ab9-4804-a177-858bd2b4d693; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..ZYTOkNPFaI619J4f0rSamg.Wu7iDGvwCv-S2uiAftwHIienWEyGno2diWV6KU_KMnjzJGog2bIr8XorEpMshuGW.GN5g3TsaP_jm0d3THUnftA; ddbSiteBanner:d7e00153-5c1f-442c-a584-b3b814abc854=true; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E9527B680F5A93AEBF8E5ADAA49A8F75A9276; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E9527B680F5A93AEBF8E5ADAA49A8F75A9276; _pxhd=fFcuMtI4ivDtBeMpMP/C7iQM0Tq6VQ3S7Ez2cFTQR3660-FAmmxJN1nf10BpNrlWlqlcOEbI4RNwUBHISjoo/A==:daB4hDYpQC3vi3ldLNM4PFvrSLt41cDr81ohF5gDoCaFGENWNSNvkbM9u5BjZ2zgHKWbzxAz4B0PTu0FnstnbJzhd3uPJGcXecasPyv7C04=; WarningNotification.Lock=1' \
|
||||||
|
-H 'dnt: 1' \
|
||||||
|
--compressed
|
||||||
|
return
|
||||||
curl -L -sS \
|
curl -L -sS \
|
||||||
"$1" \
|
"$1" \
|
||||||
--compressed \
|
--compressed \
|
||||||
|
|
@ -124,7 +172,7 @@ scrape_dndbeyond() {
|
||||||
-H 'TE: trailers';
|
-H 'TE: trailers';
|
||||||
}
|
}
|
||||||
|
|
||||||
html_to_markdown() {
|
url_to_title_candidate() {
|
||||||
local url="${1%#*}"
|
local url="${1%#*}"
|
||||||
url="${url%/}"
|
url="${url%/}"
|
||||||
local url_title_candidate="${url##*/}"
|
local url_title_candidate="${url##*/}"
|
||||||
|
|
@ -133,9 +181,21 @@ html_to_markdown() {
|
||||||
| tr '[:upper:]' '[:lower:]' \
|
| tr '[:upper:]' '[:lower:]' \
|
||||||
| sed 's/[^a-z]/./g'
|
| sed 's/[^a-z]/./g'
|
||||||
)"
|
)"
|
||||||
|
local with_wildcards=".*"
|
||||||
|
for i in $(seq 0 ${#url_title_candidate}); do
|
||||||
|
with_wildcards+=${url_title_candidate:$i:1}
|
||||||
|
with_wildcards+=".*"
|
||||||
|
done
|
||||||
|
url_title_candidate="$with_wildcards"
|
||||||
|
echo "$url_title_candidate"
|
||||||
|
}
|
||||||
|
|
||||||
|
html_to_markdown() {
|
||||||
|
local url_title_candidate="$(url_to_title_candidate "$1")"
|
||||||
|
local url_title_candidate_2="$(url_to_title_candidate "$(dirname "$1")")"
|
||||||
local out="$(
|
local out="$(
|
||||||
pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
|
pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
|
||||||
| sed '/^:::.*/d' \
|
| sed '/^[ ]*:::.*/d' \
|
||||||
| sed -e '/^Share$/,$d' \
|
| sed -e '/^Share$/,$d' \
|
||||||
| sed 's/^\(\#\#*\) \[](#\([^)]*\))/\1 /' \
|
| sed 's/^\(\#\#*\) \[](#\([^)]*\))/\1 /' \
|
||||||
| sed 's/{#\([^ ]*\)[^}]*}/\n\n<a name="\1"><\/a>/' \
|
| sed 's/{#\([^ ]*\)[^}]*}/\n\n<a name="\1"><\/a>/' \
|
||||||
|
|
@ -143,30 +203,31 @@ html_to_markdown() {
|
||||||
| sed "s/\\\\\([\"']\)/\1/g"
|
| sed "s/\\\\\([\"']\)/\1/g"
|
||||||
)"
|
)"
|
||||||
local target="$(
|
local target="$(
|
||||||
if echo "$out" | grep '^##* Appendix' &> /dev/null; then
|
if echo "$out" | grep '^# Appendix' &> /dev/null; then
|
||||||
echo Appendix
|
echo Appendix
|
||||||
elif echo "$out" | grep '^##* Chapter' &> /dev/null; then
|
elif echo "$out" | grep '^# Chapter' &> /dev/null; then
|
||||||
echo Chapter
|
echo Chapter
|
||||||
elif echo "$out" | grep '^##* Preface' &> /dev/null; then
|
elif echo "$out" | grep '^# Preface' &> /dev/null; then
|
||||||
echo Preface
|
echo Preface
|
||||||
elif echo "$out" | grep '^##* Introduction' &> /dev/null; then
|
elif echo "$out" | grep '^# Introduction' &> /dev/null; then
|
||||||
echo Introduction
|
echo Introduction
|
||||||
elif echo "$out" | grep '^##* Credits' &> /dev/null; then
|
elif echo "$out" | grep '^# Credits' &> /dev/null; then
|
||||||
echo Credits
|
echo Credits
|
||||||
elif echo "$out" | grep '^##* Monsters' &> /dev/null; then
|
elif echo "$out" | grep '^# Monsters' &> /dev/null; then
|
||||||
echo Monsters
|
echo Monsters
|
||||||
elif echo "$out" | grep -i '^##* '"$url_title_candidate" &> /dev/null; then
|
elif echo "$out" | grep -i '^# '"$url_title_candidate" &> /dev/null; then
|
||||||
target="$(echo "$out" | grep -i '^##* '"$url_title_candidate" | sed 's/^\#\#*[ ]*//' | head -n 1)"
|
target="$(echo "$out" | grep -i '^# '"$url_title_candidate" | sed 's/^\#[ ]*//' | head -n 1)"
|
||||||
log WARNING: URL title candidate $target
|
log WARNING: URL title candidate $target
|
||||||
echo "$target"
|
echo "$target"
|
||||||
else
|
else
|
||||||
log WARNING: NOTHING INDICITIVE OF TYPE OF CONTENT FOUND
|
log WARNING: NOTHING INDICITIVE OF TYPE OF CONTENT FOUND, including matching $url_title_candidate
|
||||||
echo "$out" | grep '^#' | while read -r line; do
|
echo "$out" | grep '^#' | while read -r line; do
|
||||||
log candidate="$line"
|
log candidate="$line"
|
||||||
done
|
done
|
||||||
echo Chapter
|
echo Chapter
|
||||||
fi
|
fi
|
||||||
)"
|
)"
|
||||||
|
log target=$target
|
||||||
echo "$out" \
|
echo "$out" \
|
||||||
| sed -e '/^# '"$target"'/p' -e '0,/^# '"$target"'/d'
|
| sed -e '/^# '"$target"'/p' -e '0,/^# '"$target"'/d'
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue