Compare commits
9 Commits
704e956daf
...
v0.0.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
882de95140 | ||
|
|
d6df61c282 | ||
|
|
cfa69a88fc | ||
|
|
b445441b84 | ||
|
|
0977f17c5e | ||
|
|
f878486d80 | ||
|
|
260815e53c | ||
|
|
69a2003398 | ||
|
|
7c06268f8e |
10
README.md
Normal file
10
README.md
Normal file
@@ -0,0 +1,10 @@
|
||||
# dnd beyond scrape
|
||||
|
||||
* run `SEED=123 bash scrape.sh`
|
||||
* see cached files, cookies, and output in `$TMPDIR` (defaults to `/tmp/scrape.$SEED` when `TMPDIR` is not already set)
|
||||
* if the session cookie has expired, you may have to replace the `Cookie: ` header in scrape.sh
|
||||
|
||||
## Todo
|
||||
|
||||
* fix tables to pre-formatted
|
||||
* download and localize images (webp!)
|
||||
139
scrape.sh
139
scrape.sh
@@ -3,7 +3,7 @@
|
||||
main() {
|
||||
ensure
|
||||
|
||||
local output="${1:-"$TMPDIR/result"}"
|
||||
local output="$TMPDIR/result"
|
||||
mkdir -p "$output"
|
||||
log "$output"
|
||||
|
||||
@@ -55,7 +55,7 @@ ensure() {
|
||||
return 1
|
||||
fi
|
||||
done
|
||||
export TMPDIR=/tmp/scrape.${SEED:-${RANDOM:-$(date +%s)}}
|
||||
export TMPDIR="${TMPDIR:-"/tmp/scrape.${SEED:-${RANDOM:-$(date +%s)}}"}"
|
||||
mkdir -p $TMPDIR
|
||||
}
|
||||
|
||||
@@ -71,7 +71,8 @@ scrape_books() {
|
||||
| sed 's/^href="//' \
|
||||
| sed 's/^\///' \
|
||||
| sed 's/^/\//' \
|
||||
| sed 's/^/https:\/\/www.dndbeyond.com/'
|
||||
| sed 's/^/https:\/\/www.dndbeyond.com/' \
|
||||
| grep "${SCRAPE_BOOKS_PATTERN:-.*}"
|
||||
}
|
||||
|
||||
scrape_book() {
|
||||
@@ -94,10 +95,23 @@ scrape_book() {
|
||||
fi
|
||||
scrape_book_chapter "$chapter" > "$f"
|
||||
log scraping book title from $f
|
||||
local title="$(grep '^\#' "$f" | head -n 1 | sed 's/^\#*//' | sed 's/^[ ]*//' | sed 's/[ ]*$//')"
|
||||
local title="$(
|
||||
grep '^\#' "$f" \
|
||||
| head -n 1 \
|
||||
| sed 's/^\#*//' \
|
||||
| sed 's/^[ ]*//' \
|
||||
| sed 's/[ ]*$//' \
|
||||
| sed 's/[ ][ ]*/ /g' \
|
||||
| sed 's/[^a-zA-Z0-9]/_/g' \
|
||||
| sed 's/mdash.*//' \
|
||||
| sed 's/Disclaimer.*/Disclaimer/g' \
|
||||
)"
|
||||
if [ -z "$title" ]; then
|
||||
log "WARNING: NO TITLE FOUND IN $f ($chapter): $(head -n 3 $f)"
|
||||
return 1
|
||||
title="${chapter%/}"
|
||||
title="${title##*/}"
|
||||
title="${title%%"?"*}"
|
||||
log "WARNING: NO TITLE FOUND IN $f ($chapter): $(head -n 3 $f); using $title from $chapter (enter to continue)"
|
||||
read
|
||||
fi
|
||||
mv "$f" "$output"/"$title"
|
||||
done
|
||||
@@ -181,6 +195,7 @@ _scrape_dndbeyond_cache_f() {
|
||||
_scrape_dndbeyond() {
|
||||
log foo:_scrape_dndbeyond $@
|
||||
rate_limit_1s scrape_dndbeyond
|
||||
touch "$TMPDIR/cookies.txt"
|
||||
curl -L -sS "$1" \
|
||||
-H 'authority: www.dndbeyond.com' \
|
||||
-H 'cache-control: max-age=0' \
|
||||
@@ -193,28 +208,12 @@ _scrape_dndbeyond() {
|
||||
-H 'sec-fetch-user: ?1' \
|
||||
-H 'sec-fetch-dest: document' \
|
||||
-H 'accept-language: en-US,en;q=0.9' \
|
||||
-H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=qvJb-g60h1HSymUFEiAmb-Noh35mNbUmcFgIE0qEiisCj/BqXrgrrsRUlODrjAaSGknpuHBn0S2b4rGTBFvaIA==:5/4beZX4ojNLKbAeZH9rQFcGIQ/ijS7lK29wvhLkgB4O0L2GFpRuU5FOpRztEiKg9EoUZr2cPB2Z8w3DKVkaKJPlwcIt23DM5PgQLZEPoA8=; ResponsiveSwitch.DesktopMode=1; LoginState=0ae9ef26-a1ae-4cb2-8af7-f18c21b99811; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FawE7bpRlluZsWq7Iny1xQ.nBX_qdZq5varuXNGkl70QeJ03JIRT3sxxQi6iCzazt-SjlK1hEuINX0sK85gJKVT.8wlUwv…sername=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"5ce909a9-34aa-4b15-9b6d-00ee48e13e05"}; ddbSiteBanner:bb279d28-f265-49d5-ad41-6d716915bbdf=true; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E95275F44EAFDFB8393A8DEFA33DD0B90284D; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E95275F44EAFDFB8393A8DEFA33DD0B90284D; WarningNotification.Lock=1' \
|
||||
-H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=dGa4DRIksfo8m7Siqu0z-stTfu9R0jC2I53wbDc/nEiS/YpOxaQaWLECaBu4-wCfqyCDmn2c/PpjW1ESR3pWow==:RH/b5abeFbAM04R3V7bm14eVCFGzYpPFEt6-0X4z2CCnEnFxcFIQkGBwdews8JTWKuNsgYx758Xp3RcXupSxnPMBLfFLJ8MSNyzyaOozezE=; Geo={%22region%22:%22UT%22%2C%22country%22:%22US%22%2C%22continent%22:%22NA%22}; ResponsiveSwitch.DesktopMode=1; RequestVerificationToken=2cdddc90-fbbb-45a4-929a-1fff6879a852; sublevel=ANON; Preferences=undefined; marketplace_filter_show_owned=false; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..YZH4-9MvXBhf0v5KdkH8vA.J5jwdmKCBGfAjmlGY1UOr6UD0am0hS0qiyY-xBx-j3CEM8CrhlzyxRDo04YK3nc3.Z5M5dZnstMa9OIkrEN29HQ; User.ID=109926924; User.Username=squeaky2x3; Preferences.Language=1; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"71f1f0eb-3523-4f94-96d9-22d3511ea6b6"}; Preferences.TimeZoneID=1; LoginState=8ab45ac1-5777-4613-a3f5-7439058fec58; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF083AB9FE2016C3192A7805F9642538006F9B284A9F525C1AA0DF220CB30AEE9DCF61CB6C98E40F1ADC8CB1E6C93F2D5E4FC; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF083AB9FE2016C3192A7805F9642538006F9B284A9F525C1AA0DF220CB30AEE9DCF61CB6C98E40F1ADC8CB1E6C93F2D5E4FC; cobalt-token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJodHRwOi8vc2NoZW1hcy54bWxzb2FwLm9yZy93cy8yMDA1LzA1L2lkZW50aXR5L2NsYWltcy9uYW1laWRlbnRpZmllciI6IjEwOTkyNjkyNCIsImh0dHA6Ly9zY2hlbWFzLnhtbHNvYXAub3JnL3dzLzIwMDUvMDUvaWRlbnRpdHkvY2xhaW1zL25hbWUiOiJzcXVlYWt5MngzIiwiaHR0cDovL3NjaGVtYXMueG1sc29hcC5vcmcvd3MvMjAwNS8wNS9pZGVudGl0eS9jbGFpbXMvZW1haWxhZGRyZXNzIjoic3F1ZWFreTJ4M0BnbWFpbC5jb20iLCJkaXNwbGF5TmFtZSI6InNxdWVha3kyeDMiLCJodHRwOi8vc2NoZW1hcy5taWNyb3NvZnQuY29tL3dzLzIwMDgvMDYvaWRlbnRpdHkvY2xhaW1zL3JvbGUiOiJSZWdpc3RlcmVkIFVzZXJzIiwibmJmIjoxNjU2Mzc2ODE0LCJleHAiOjE2NTYzNzcxMTQsImlzcyI6ImRuZGJleW9uZC5jb20iLCJhdWQiOiJkbmRiZXlvbmQuY29tIn0.ygIntjw5eL4hlACI7T0VpdWKUghzaJZtrGsVllsgEtk; WarningNotification.Lock=1' \
|
||||
-H 'dnt: 1' \
|
||||
-b "$TMPDIR/cookies.txt" \
|
||||
-c "$TMPDIR/cookies.txt" \
|
||||
--compressed
|
||||
return
|
||||
curl -L -sS \
|
||||
"$1" \
|
||||
--compressed \
|
||||
-H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=NHdkEjOXnAEHojER86i0Egmnm85GLvifiQxURPISwVtzWLNwuqQbIk1Y/2MjheQqNP4CsBXhuc4UG5-qcmLjYw==:hGx3TOEjeVaJPMfrXXnmyG7-btpQhZutwvpVBKwEbNXTAkEQvgmIs/faKB6s8537Nivj6epH9HpOWYURQeazeMwSMReW6EMazW09Rci-PJI=; ResponsiveSwitch.DesktopMode=1; ddbSiteBanner:00523ca3-81a4-4d2a-8f86-9e40273af2e2=true; LoginState=2c011278-8a3d-4cf7-a141-221f42694333; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..vh9-_2G_aEjMvJ2PFAAE4w.eheZuOJLp-3NyiAJZzKqaSc91lH0ganCRsDZPYMGIG3265AppaHh0uMQLu_-P6WX.bHRi_gX8PKtQGZwVmNeb1Q; Preferences.Language=1; Preferences.TimeZoneID=1; ddbSiteBanner:d8879cae-c68e-4411-9ea0-a49d77fe5454=true; Ratings=null; _pxvid=f6abe72c-49a6-11ec-b844-57447254714c; ddbSiteBanner:fd981371-c501-4638-a64b-0d9a5c8da68b=true; Geo={%22region%22:%22ON%22%2C%22country%22:%22CA%22%2C%22continent%22:%22NA%22}; RequestVerificationToken=983cd9ce-c03c-4b8b-b6c1-a0561b39cb47; sublevel=ANON; Preferences=undefined; User.ID=109926924; User.Username=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"77896779-c672-42d1-903b-f6caba6ec599"}; pxcts=5eb1b230-525e-11ec-aa22-793bf1473016; WarningNotification.Lock=1' \
|
||||
-H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0' \
|
||||
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' \
|
||||
-H 'Accept-Language: en-US,en;q=0.5' \
|
||||
-H 'Referer: https://www.dndbeyond.com/' \
|
||||
-H 'DNT: 1' \
|
||||
-H 'Connection: keep-alive' \
|
||||
-H 'Upgrade-Insecure-Requests: 1' \
|
||||
-H 'Sec-Fetch-Dest: document' \
|
||||
-H 'Sec-Fetch-Mode: navigate' \
|
||||
-H 'Sec-Fetch-Site: same-origin' \
|
||||
-H 'Sec-Fetch-User: ?1' \
|
||||
-H 'Pragma: no-cache' \
|
||||
-H 'Cache-Control: no-cache' \
|
||||
-H 'TE: trailers';
|
||||
}
|
||||
|
||||
url_to_title_candidate() {
|
||||
@@ -238,35 +237,79 @@ url_to_title_candidate() {
|
||||
html_to_markdown() {
|
||||
local f="$(mktemp)"
|
||||
log url=$1
|
||||
log 1: url=$1
|
||||
cat > "$f"
|
||||
log 2: url=$1
|
||||
#grep '<title>' "$f" | sed 's/.*>\([^<]*\)<.title>.*/# \1/g'
|
||||
#| perl -pe 's|<div[^>]*stat-block-ability-scores-data[^>]*>(.*?)</div>|<span>\1</span>|g' \
|
||||
#| perl -pe 's|<div[^>]*stat-block-ability-scores-heading[^>]*>(.*?)</div>|<strong>\1</strong>|g' \
|
||||
#| perl -pe 's|<div[^>]*stat-block-ability-scores-stat[^>]*>(.*?)</div>|<p>\1</p>|g' \
|
||||
#| readability "file://$(realpath $f)" \
|
||||
|
||||
local clean="$(
|
||||
cat "$f" \
|
||||
| readability "file://$(realpath $f)" \
|
||||
| pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
|
||||
| awk '/CONTENT/,/FOOTER/' \
|
||||
| tr '\n' '\r' \
|
||||
| perl -pe 's|<blockquote[^>]*>(.*?)</blockquote>|\1|g' \
|
||||
| perl -pe 's|\r<p[^\r]*<strong>([^<]*)</strong></p>\r|\r<h5>\1</h5>\r|g' \
|
||||
| perl -pe 's|<t(.)><p>(.*?)</p></t.>|<t\1>\2</t\1>|g' \
|
||||
| perl -pe 's|<thead>\r*<tr>(?:(?!</tr>).)+</tr>\r*<tr>|<thead><tr>|g' \
|
||||
| tr '\r' '\n' \
|
||||
| sed 's/<br>/ /g' \
|
||||
| sed 's/colspan="[^"]*"//g' \
|
||||
| tr '\n' '\r' \
|
||||
| perl -pe 's|<t(.)><p>(.*?)</p></t.>|<t\1>\2</t\1>|g' \
|
||||
| perl -pe 's|<thead>\r*<tr>(?:(?!</tr>).)+</tr>\r*<tr>|<thead><tr>|g' \
|
||||
| tr '\r' '\n' \
|
||||
| pandoc \
|
||||
-f html \
|
||||
-t markdown+pipe_tables-simple_tables-multiline_tables \
|
||||
--atx-headers \
|
||||
--ascii \
|
||||
--toc \
|
||||
--wrap=none \
|
||||
--strip-comments \
|
||||
-o - \
|
||||
| grep -v '^\+[-+]*\+$' \
|
||||
| sed 's/{[^}]*data[^}]*}//g' \
|
||||
| sed 's/\[](#[^)]*)//'
|
||||
| sed 's/\[](#[^)]*)//' \
|
||||
| cat \
|
||||
| sed 's/{[^}]*}//' \
|
||||
| sed 's/ *$//' \
|
||||
| tr '\n' '\r' \
|
||||
| sed 's/<div>\(.*\)<\/div>/\1/g' \
|
||||
| tr '\r' '\n' \
|
||||
)"
|
||||
log 3: url=$1
|
||||
(
|
||||
grep -o '<title.*' "$f" \
|
||||
| sed 's/^[^>]*>//' \
|
||||
| sed 's/-.*//' \
|
||||
| sed 's/^/# /' \
|
||||
| grep . || echo "# ${1##*/}"
|
||||
echo
|
||||
echo "$clean"
|
||||
) \
|
||||
| tr '\n' '\r' \
|
||||
| sed 's/\r\r:::\r\r/\r/g' \
|
||||
| sed 's/\r\r:::\r*$//' \
|
||||
| sed 's/::://g' \
|
||||
| sed 's/&.dquo;/"/g' \
|
||||
| sed "s/&.squo;/'/g" \
|
||||
| sed 's/—/--/g' \
|
||||
| sed 's/–/-/g' \
|
||||
| sed 's/×/*/g' \
|
||||
| sed 's/­//g' \
|
||||
| sed 's/{\.[^}]*}//g' \
|
||||
| tr '\r' '\n' \
|
||||
| sed 's/===+/===|/g' \
|
||||
| sed 's/+===/|===/g' \
|
||||
| sed 's/---+/---|/g' \
|
||||
| sed 's/+---/|---/g' \
|
||||
| sed '/^|[=|]*|$/s/=/-/g' \
|
||||
|
||||
#| grep -v '^|-[-|]*-|$' \
|
||||
#| sed '/ |\r|-[-|]*-|\r| /s/-/=/g' \
|
||||
#| sed 's/\r\r| \([^\r]*\) |\r|[^=]/\r\r| \1 |\r|=============================|\r|/g' \
|
||||
#| sed 's/:::\(.*\):::/```\1```/g' \
|
||||
|
||||
rm "$f"
|
||||
log 4: url=$1
|
||||
echo "$clean" | (
|
||||
lastline=""
|
||||
while read -r line; do
|
||||
if [ "$line" != "${line#----}" ]; then
|
||||
echo "# $lastline"
|
||||
echo ""
|
||||
break
|
||||
fi
|
||||
lastline="$line"
|
||||
done
|
||||
cat &> /dev/null
|
||||
)
|
||||
log 5: url=$1
|
||||
echo "$clean"
|
||||
log 6: url=$1
|
||||
return $?
|
||||
}
|
||||
|
||||
|
||||
46
to_meili.sh
46
to_meili.sh
@@ -31,24 +31,41 @@ clean_id() {
|
||||
echo "$*" | sed 's/[^a-zA-Z0-9_-]/_/g'
|
||||
}
|
||||
|
||||
# Strip leading and trailing spaces from every line of the input.
# Arguments: the text to trim (all arguments are joined with single spaces);
#            when no argument is given, the text is read from stdin instead.
# Outputs:   the trimmed text on stdout.
spaceless() {
  local input="$*"
  if [ "$#" -lt 1 ]; then
    input="$(cat)"
  fi
  # printf rather than echo: echo would treat inputs such as "-n" or "-e"
  # as its own options and print nothing (or mangle the text).
  printf '%s\n' "$input" | sed -e 's/^ *//' -e 's/ *$//'
}
|
||||
|
||||
scraped_to_meili() {
|
||||
find "$SCRAPED" -type f \
|
||||
| sort \
|
||||
| while read -r md_path; do
|
||||
file_id="$(clean_id "$md_path")"
|
||||
h1="$(head -n 1 "$md_path" | sed 's/^# //')"
|
||||
grep '^## ' "$md_path" | sed 's/^## //' | while read -r h2; do
|
||||
h1="$(grep '^#' "$md_path" | head -n 1 | sed 's/^##*//' | spaceless)"
|
||||
grep '^##*' "$md_path" | while read -r h2_line; do
|
||||
local h2="$(echo "$h2_line" | sed 's/^##*//' | spaceless)"
|
||||
local content="$(
|
||||
echo "# $h1"
|
||||
echo
|
||||
echo ""
|
||||
echo "## $h2"
|
||||
echo
|
||||
cat "$md_path" \
|
||||
| sed -e '0,/^\#\# '"$h2"'/d' \
|
||||
| sed -e '/^\#\# .*/,$d' \
|
||||
| head -n 25
|
||||
echo ""
|
||||
found_h2=false
|
||||
found_stopper=false
|
||||
cat "$md_path" | while read -r line; do
|
||||
if ! $found_h2 && [ "$line" == "$h2_line" ]; then
|
||||
found_h2=true
|
||||
elif $found_h2 && [ "$line" != "${line#"#"}" ]; then
|
||||
found_stopper=true
|
||||
fi
|
||||
if $found_stopper; then continue; fi
|
||||
if $found_h2; then echo "$line"; fi
|
||||
done
|
||||
)"
|
||||
if [ $(echo "$content" | wc -l | awk '{print $NF}') -lt 5 ]; then
|
||||
content="$(echo "$content" | head -n 25)"
|
||||
if [ $(echo "$content" | wc -w | awk '{print $NF}') -lt 25 ]; then
|
||||
log "skipping content because less than 25 words found: $content"
|
||||
continue
|
||||
fi
|
||||
local id="$(clean_id "${file_id}_${h1}_${h2}")"
|
||||
@@ -56,6 +73,7 @@ scraped_to_meili() {
|
||||
log h1=$h1
|
||||
log h2=$h2
|
||||
log content="${#content}"
|
||||
log "submitting $id/$h1/$h2"
|
||||
curl -sS \
|
||||
"$MEILI"/indexes/scraped/documents \
|
||||
-X POST \
|
||||
@@ -69,3 +87,11 @@ scraped_to_meili() {
|
||||
done
|
||||
done
|
||||
}
|
||||
|
||||
# Entry point when the script is run without arguments: ask for confirmation
# before ingesting everything under $SCRAPED, then print the exit status of
# the ingest.
if [ "$#" -eq 0 ]; then
  # The prompt runs in a subshell so the answer variable does not leak into
  # the rest of the script. Only answers that START with y/Y count as
  # consent; a substring match would also accept e.g. "okay" or "nay".
  if (
    read -r -p "would you like to ingest from $SCRAPED? " yn
    case "$yn" in
      [yY]*) exit 0 ;;
      *) exit 1 ;;
    esac
  ); then
    scraped_to_meili
    echo $?
  fi
fi
|
||||
|
||||
Reference in New Issue
Block a user