cush
parent
dabe67cfa5
commit
df46dcdd9d
|
|
@ -0,0 +1,2 @@
|
||||||
|
pandoc
|
||||||
|
https://github.com/NightMachinery/readability-cli
|
||||||
102
scrape.sh
102
scrape.sh
|
|
@ -2,8 +2,7 @@
|
||||||
|
|
||||||
|
|
||||||
main() {
|
main() {
|
||||||
set -e
|
ensure
|
||||||
set -o pipefail
|
|
||||||
|
|
||||||
local output="${1:-$(mktemp -d)}"
|
local output="${1:-$(mktemp -d)}"
|
||||||
log "$output"
|
log "$output"
|
||||||
|
|
@ -22,7 +21,8 @@ main() {
|
||||||
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/krenkos-way
|
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/krenkos-way
|
||||||
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/bestiary
|
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/bestiary
|
||||||
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/azorius-senate
|
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/azorius-senate
|
||||||
#scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e
|
scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e
|
||||||
|
return $?
|
||||||
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists
|
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists
|
||||||
|
|
||||||
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/phb/credits | less
|
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/phb/credits | less
|
||||||
|
|
@ -43,6 +43,19 @@ main() {
|
||||||
log "$output"
|
log "$output"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ensure() {
|
||||||
|
set -e
|
||||||
|
set -o pipefail
|
||||||
|
for exe in pandoc readability; do
|
||||||
|
if ! which $exe; then
|
||||||
|
echo "$exe not installed" >&2
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
export TMPDIR=/tmp/scrape.${SEED:-${RANDOM:-$(date +%s)}}
|
||||||
|
mkdir -p $TMPDIR
|
||||||
|
}
|
||||||
|
|
||||||
log() {
|
log() {
|
||||||
echo "$(date) > $*" >&2
|
echo "$(date) > $*" >&2
|
||||||
}
|
}
|
||||||
|
|
@ -137,7 +150,7 @@ list_chapters_in_book() {
|
||||||
|
|
||||||
scrape_dndbeyond() {
|
scrape_dndbeyond() {
|
||||||
log foo:scrape_dndbeyond $@
|
log foo:scrape_dndbeyond $@
|
||||||
local d="${TMPDIR:-/tmp}/scrape_cache"
|
local d="${TMPDIR}/scrape_cache"
|
||||||
mkdir -p "$d"
|
mkdir -p "$d"
|
||||||
local f="$d/$(echo "$*" | base64)"
|
local f="$d/$(echo "$*" | base64)"
|
||||||
if [ -f "$f" ]; then
|
if [ -f "$f" ]; then
|
||||||
|
|
@ -163,7 +176,7 @@ _scrape_dndbeyond() {
|
||||||
-H 'sec-fetch-user: ?1' \
|
-H 'sec-fetch-user: ?1' \
|
||||||
-H 'sec-fetch-dest: document' \
|
-H 'sec-fetch-dest: document' \
|
||||||
-H 'accept-language: en-US,en;q=0.9' \
|
-H 'accept-language: en-US,en;q=0.9' \
|
||||||
-H 'cookie: Preferences=undefined; Preferences=undefined; Geo={%22region%22:%22UT%22%2C%22country%22:%22US%22%2C%22continent%22:%22NA%22}; ResponsiveSwitch.DesktopMode=1; sublevel=ANON; Preferences=undefined; ddbSiteBanner:fd981371-c501-4638-a64b-0d9a5c8da68b=true; G_ENABLED_IDPS=google; User.ID=109926924; User.Username=squeaky2x3; Preferences.Language=1; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"5993614e-774f-4b55-89de-c88120abea59"}; RequestVerificationToken=6dc61cff-204f-45a0-a39f-edcea6a23072; Preferences.TimeZoneID=1; LoginState=aa3ada1f-7ab9-4804-a177-858bd2b4d693; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..ZYTOkNPFaI619J4f0rSamg.Wu7iDGvwCv-S2uiAftwHIienWEyGno2diWV6KU_KMnjzJGog2bIr8XorEpMshuGW.GN5g3TsaP_jm0d3THUnftA; ddbSiteBanner:d7e00153-5c1f-442c-a584-b3b814abc854=true; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E9527B680F5A93AEBF8E5ADAA49A8F75A9276; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E9527B680F5A93AEBF8E5ADAA49A8F75A9276; _pxhd=fFcuMtI4ivDtBeMpMP/C7iQM0Tq6VQ3S7Ez2cFTQR3660-FAmmxJN1nf10BpNrlWlqlcOEbI4RNwUBHISjoo/A==:daB4hDYpQC3vi3ldLNM4PFvrSLt41cDr81ohF5gDoCaFGENWNSNvkbM9u5BjZ2zgHKWbzxAz4B0PTu0FnstnbJzhd3uPJGcXecasPyv7C04=; WarningNotification.Lock=1' \
|
-H 'Cookie: _pxhd=qvJb-g60h1HSymUFEiAmb-Noh35mNbUmcFgIE0qEiisCj/BqXrgrrsRUlODrjAaSGknpuHBn0S2b4rGTBFvaIA==:5/4beZX4ojNLKbAeZH9rQFcGIQ/ijS7lK29wvhLkgB4O0L2GFpRuU5FOpRztEiKg9EoUZr2cPB2Z8w3DKVkaKJPlwcIt23DM5PgQLZEPoA8=; ResponsiveSwitch.DesktopMode=1; LoginState=0ae9ef26-a1ae-4cb2-8af7-f18c21b99811; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FawE7bpRlluZsWq7Iny1xQ.nBX_qdZq5varuXNGkl70QeJ03JIRT3sxxQi6iCzazt-SjlK1hEuINX0sK85gJKVT.8wlUwvfZiME9fAdBKj0rIw; Preferences.Language=1; Pref…sername=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"5ce909a9-34aa-4b15-9b6d-00ee48e13e05"}; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF08335AE71D0834BC2696233572C85266F7F9F4B5BF89DD4952FDA9AD462250417F8B7C4975F1633599CC9D18D74C3B57CBA; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF08335AE71D0834BC2696233572C85266F7F9F4B5BF89DD4952FDA9AD462250417F8B7C4975F1633599CC9D18D74C3B57CBA; ddbSiteBanner:bb279d28-f265-49d5-ad41-6d716915bbdf=true; WarningNotification.Lock=1' \
|
||||||
-H 'dnt: 1' \
|
-H 'dnt: 1' \
|
||||||
--compressed
|
--compressed
|
||||||
return
|
return
|
||||||
|
|
@ -206,10 +219,23 @@ url_to_title_candidate() {
|
||||||
}
|
}
|
||||||
|
|
||||||
html_to_markdown() {
|
html_to_markdown() {
|
||||||
|
local f="$(mktemp)"
|
||||||
|
log url=$1
|
||||||
|
cat > "$f"
|
||||||
|
cat "$f" \
|
||||||
|
| readability "file://$(realpath $f)" \
|
||||||
|
| pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
|
||||||
|
| sed 's/{[^}]*data[^}]*}//g' \
|
||||||
|
| sed 's/\[](#[^)]*)//' \
|
||||||
|
| cat
|
||||||
|
rm "$f"
|
||||||
|
return $?
|
||||||
|
local input="$(cat)"
|
||||||
local url_title_candidate="$(url_to_title_candidate "$1")"
|
local url_title_candidate="$(url_to_title_candidate "$1")"
|
||||||
local url_title_candidate_2="$(url_to_title_candidate "$(dirname "$1")")"
|
local url_title_candidate_2="$(url_to_title_candidate "$(dirname "$1")")"
|
||||||
local out="$(
|
local out="$(
|
||||||
pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
|
echo "$input" \
|
||||||
|
| pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
|
||||||
| sed '/^[ ]*:::.*/d' \
|
| sed '/^[ ]*:::.*/d' \
|
||||||
| sed -e '/^Share$/,$d' \
|
| sed -e '/^Share$/,$d' \
|
||||||
| sed 's/^\(\#\#*\) \[](#\([^)]*\))/\1 /' \
|
| sed 's/^\(\#\#*\) \[](#\([^)]*\))/\1 /' \
|
||||||
|
|
@ -217,39 +243,51 @@ html_to_markdown() {
|
||||||
| sed 's/\[\([^]]*\)]\(([^)]*)\)*\({[^}]*}\)*/\1/g' \
|
| sed 's/\[\([^]]*\)]\(([^)]*)\)*\({[^}]*}\)*/\1/g' \
|
||||||
| sed "s/\\\\\([\"']\)/\1/g"
|
| sed "s/\\\\\([\"']\)/\1/g"
|
||||||
)"
|
)"
|
||||||
local target="$(
|
local min_level_title=0
|
||||||
if echo "$out" | grep '^# Appendix' &> /dev/null; then
|
for levels in $(echo "$out" | grep -o '^##*'); do
|
||||||
echo Appendix
|
if ((min_level_title<${#levels})); then
|
||||||
elif echo "$out" | grep '^# Chapter' &> /dev/null; then
|
min_level_title=${#levels}
|
||||||
echo Chapter
|
|
||||||
elif echo "$out" | grep '^# Preface' &> /dev/null; then
|
|
||||||
echo Preface
|
|
||||||
elif echo "$out" | grep '^# Introduction' &> /dev/null; then
|
|
||||||
echo Introduction
|
|
||||||
elif echo "$out" | grep '^# Credits' &> /dev/null; then
|
|
||||||
echo Credits
|
|
||||||
elif echo "$out" | grep '^# Monsters' &> /dev/null; then
|
|
||||||
echo Monsters
|
|
||||||
elif echo "$out" | grep -i '^# '"$url_title_candidate" &> /dev/null; then
|
|
||||||
target="$(echo "$out" | grep -i '^# '"$url_title_candidate" | sed 's/^\#[ ]*//' | head -n 1)"
|
|
||||||
log WARNING: URL title candidate $target
|
|
||||||
echo "$target"
|
|
||||||
else
|
|
||||||
log WARNING: NOTHING INDICITIVE OF TYPE OF CONTENT FOUND, including matching $url_title_candidate
|
|
||||||
echo "$out" | grep '^#' | while read -r line; do
|
|
||||||
log candidate="$line"
|
|
||||||
done
|
|
||||||
echo Chapter
|
|
||||||
fi
|
fi
|
||||||
|
done
|
||||||
|
local less_n_levels=0
|
||||||
|
if ((min_level_title>1)); then
|
||||||
|
less_n_levels=$((min_level_title-1))
|
||||||
|
fi
|
||||||
|
local trim_prefix=""
|
||||||
|
for i in $(seq 1 $less_n_levels); do
|
||||||
|
trim_prefix+="#"
|
||||||
|
done
|
||||||
|
local target="$(
|
||||||
|
title="$(
|
||||||
|
echo "$input" \
|
||||||
|
| grep -o '<title>.*'
|
||||||
|
)"
|
||||||
|
title="${title%% - *}"
|
||||||
|
title="${title#<title>}"
|
||||||
|
echo "$title"
|
||||||
)"
|
)"
|
||||||
log target=$target
|
log target=$target
|
||||||
echo "$out" \
|
local buffer=("# $target")
|
||||||
| sed -e '/^# '"$target"'/p' -e '0,/^# '"$target"'/d'
|
for line in $(
|
||||||
|
echo "$out" | while read -r line; do
|
||||||
|
echo "$line" | base64
|
||||||
|
done
|
||||||
|
); do
|
||||||
|
line="$(echo "$line" | base64 --decode)"
|
||||||
|
if echo "$line" | grep -q "^$target$"; then
|
||||||
|
buffer=("# $target")
|
||||||
|
else
|
||||||
|
buffer+=("$line")
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
for line in "${buffer[@]}"; do
|
||||||
|
echo "${line#$trim_prefix}"
|
||||||
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
rate_limit_1s() {
|
rate_limit_1s() {
|
||||||
local name="$1"
|
local name="$1"
|
||||||
local d="${TMPDIR:-/tmp}"/rate_limited
|
local d="${TMPDIR}"/rate_limited
|
||||||
mkdir -p "$d"
|
mkdir -p "$d"
|
||||||
local last_run="$(date -r "$d/$name" +%s)"
|
local last_run="$(date -r "$d/$name" +%s)"
|
||||||
local now="$(date +%s)"
|
local now="$(date +%s)"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue