Compare commits

...

10 Commits

Author SHA1 Message Date
bel
04acc7ff4f woo 2022-07-04 11:51:58 -06:00
bel
882de95140 pbh actually looks good including tables 2022-07-04 11:45:55 -06:00
bel
d6df61c282 tables without thead dont render 2022-07-04 11:27:44 -06:00
bel
cfa69a88fc cleanup table displays 2022-07-04 10:25:18 -06:00
bel
b445441b84 fix 2022-07-03 23:54:43 -06:00
bel
0977f17c5e scrape a little bit cleaner about pre-purging lines, tables as preformatted, rm excess ::: 2022-07-03 23:31:59 -06:00
bel
f878486d80 update to meili for new scrape format 2022-07-03 21:18:20 -06:00
bel
260815e53c rm logs 2022-06-27 19:25:17 -06:00
bel
69a2003398 fix illegal chars in title 2022-06-27 19:24:55 -06:00
bel
7c06268f8e readme 2022-06-27 19:06:43 -06:00
3 changed files with 138 additions and 58 deletions

10
README.md Normal file
View File

@@ -0,0 +1,10 @@
# dnd beyond scrape
* run `SEED=123 bash scrape.sh`
* see cached files, cookie, and output in `$TMPDIR` (defaults to `/tmp/scrape.$SEED` when `TMPDIR` is not already set)
* you may have to replace the `Cookie: ` header value in scrape.sh if the session has expired
## Todo
* fix tables to pre-formatted
* download and localize images (webp!)

140
scrape.sh
View File

@@ -3,7 +3,7 @@
main() {
ensure
local output="${1:-"$TMPDIR/result"}"
local output="$TMPDIR/result"
mkdir -p "$output"
log "$output"
@@ -55,7 +55,7 @@ ensure() {
return 1
fi
done
export TMPDIR=/tmp/scrape.${SEED:-${RANDOM:-$(date +%s)}}
export TMPDIR="${TMPDIR:-"/tmp/scrape.${SEED:-${RANDOM:-$(date +%s)}}"}"
mkdir -p $TMPDIR
}
@@ -71,7 +71,8 @@ scrape_books() {
| sed 's/^href="//' \
| sed 's/^\///' \
| sed 's/^/\//' \
| sed 's/^/https:\/\/www.dndbeyond.com/'
| sed 's/^/https:\/\/www.dndbeyond.com/' \
| grep "${SCRAPE_BOOKS_PATTERN:-.*}"
}
scrape_book() {
@@ -94,10 +95,23 @@ scrape_book() {
fi
scrape_book_chapter "$chapter" > "$f"
log scraping book title from $f
local title="$(grep '^\#' "$f" | head -n 1 | sed 's/^\#*//' | sed 's/^[ ]*//' | sed 's/[ ]*$//')"
local title="$(
grep '^\#' "$f" \
| head -n 1 \
| sed 's/^\#*//' \
| sed 's/^[ ]*//' \
| sed 's/[ ]*$//' \
| sed 's/[ ][ ]*/ /g' \
| sed 's/[^a-zA-Z0-9]/_/g' \
| sed 's/mdash.*//' \
| sed 's/Disclaimer.*/Disclaimer/g' \
)"
if [ -z "$title" ]; then
log "WARNING: NO TITLE FOUND IN $f ($chapter): $(head -n 3 $f)"
return 1
title="${chapter%/}"
title="${title##*/}"
title="${title%%"?"*}"
log "WARNING: NO TITLE FOUND IN $f ($chapter): $(head -n 3 $f); using $title from $chapter (enter to continue)"
read
fi
mv "$f" "$output"/"$title"
done
@@ -181,6 +195,7 @@ _scrape_dndbeyond_cache_f() {
_scrape_dndbeyond() {
log foo:_scrape_dndbeyond $@
rate_limit_1s scrape_dndbeyond
touch "$TMPDIR/cookies.txt"
curl -L -sS "$1" \
-H 'authority: www.dndbeyond.com' \
-H 'cache-control: max-age=0' \
@@ -193,28 +208,12 @@ _scrape_dndbeyond() {
-H 'sec-fetch-user: ?1' \
-H 'sec-fetch-dest: document' \
-H 'accept-language: en-US,en;q=0.9' \
-H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=qvJb-g60h1HSymUFEiAmb-Noh35mNbUmcFgIE0qEiisCj/BqXrgrrsRUlODrjAaSGknpuHBn0S2b4rGTBFvaIA==:5/4beZX4ojNLKbAeZH9rQFcGIQ/ijS7lK29wvhLkgB4O0L2GFpRuU5FOpRztEiKg9EoUZr2cPB2Z8w3DKVkaKJPlwcIt23DM5PgQLZEPoA8=; ResponsiveSwitch.DesktopMode=1; LoginState=0ae9ef26-a1ae-4cb2-8af7-f18c21b99811; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FawE7bpRlluZsWq7Iny1xQ.nBX_qdZq5varuXNGkl70QeJ03JIRT3sxxQi6iCzazt-SjlK1hEuINX0sK85gJKVT.8wlUwv…sername=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"5ce909a9-34aa-4b15-9b6d-00ee48e13e05"}; ddbSiteBanner:bb279d28-f265-49d5-ad41-6d716915bbdf=true; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E95275F44EAFDFB8393A8DEFA33DD0B90284D; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E95275F44EAFDFB8393A8DEFA33DD0B90284D; WarningNotification.Lock=1' \
-H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=dGa4DRIksfo8m7Siqu0z-stTfu9R0jC2I53wbDc/nEiS/YpOxaQaWLECaBu4-wCfqyCDmn2c/PpjW1ESR3pWow==:RH/b5abeFbAM04R3V7bm14eVCFGzYpPFEt6-0X4z2CCnEnFxcFIQkGBwdews8JTWKuNsgYx758Xp3RcXupSxnPMBLfFLJ8MSNyzyaOozezE=; Geo={%22region%22:%22UT%22%2C%22country%22:%22US%22%2C%22continent%22:%22NA%22}; ResponsiveSwitch.DesktopMode=1; RequestVerificationToken=2cdddc90-fbbb-45a4-929a-1fff6879a852; sublevel=ANON; Preferences=undefined; marketplace_filter_show_owned=false; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..YZH4-9MvXBhf0v5KdkH8vA.J5jwdmKCBGfAjmlGY1UOr6UD0am0hS0qiyY-xBx-j3CEM8CrhlzyxRDo04YK3nc3.Z5M5dZnstMa9OIkrEN29HQ; User.ID=109926924; User.Username=squeaky2x3; Preferences.Language=1; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"71f1f0eb-3523-4f94-96d9-22d3511ea6b6"}; Preferences.TimeZoneID=1; LoginState=8ab45ac1-5777-4613-a3f5-7439058fec58; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF083AB9FE2016C3192A7805F9642538006F9B284A9F525C1AA0DF220CB30AEE9DCF61CB6C98E40F1ADC8CB1E6C93F2D5E4FC; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF083AB9FE2016C3192A7805F9642538006F9B284A9F525C1AA0DF220CB30AEE9DCF61CB6C98E40F1ADC8CB1E6C93F2D5E4FC; cobalt-token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJodHRwOi8vc2NoZW1hcy54bWxzb2FwLm9yZy93cy8yMDA1LzA1L2lkZW50aXR5L2NsYWltcy9uYW1laWRlbnRpZmllciI6IjEwOTkyNjkyNCIsImh0dHA6Ly9zY2hlbWFzLnhtbHNvYXAub3JnL3dzLzIwMDUvMDUvaWRlbnRpdHkvY2xhaW1zL25hbWUiOiJzcXVlYWt5MngzIiwiaHR0cDovL3NjaGVtYXMueG1sc29hcC5vcmcvd3MvMjAwNS8wNS9pZGVudGl0eS9jbGFpbXMvZW1haWxhZGRyZXNzIjoic3F1ZWFreTJ4M0BnbWFpbC5jb20iLCJkaXNwbGF5TmFtZSI6InNxdWVha3kyeDMiLCJodHRwOi8vc2NoZW1hcy5taWNyb3NvZnQuY29tL3dzLzIwMDgvMDYvaWRlbnRpdHkvY2xhaW1zL3JvbGUiOiJSZWdpc3RlcmVkIFVzZXJzIiwibmJmIjoxNjU2Mzc2ODE0LCJleHAiOjE2NTYzNzcxMTQsImlzcyI6ImRuZGJleW9uZC5jb20iLCJhdWQiOiJkbmRiZXlvbmQuY29tIn0.ygIntjw5eL4hlACI7T0VpdWKUghzaJZtrGsVllsgEtk; WarningNotification.Lock=1' \
-H 'dnt: 1' \
-b "$TMPDIR/cookies.txt" \
-c "$TMPDIR/cookies.txt" \
--compressed
return
curl -L -sS \
"$1" \
--compressed \
-H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=NHdkEjOXnAEHojER86i0Egmnm85GLvifiQxURPISwVtzWLNwuqQbIk1Y/2MjheQqNP4CsBXhuc4UG5-qcmLjYw==:hGx3TOEjeVaJPMfrXXnmyG7-btpQhZutwvpVBKwEbNXTAkEQvgmIs/faKB6s8537Nivj6epH9HpOWYURQeazeMwSMReW6EMazW09Rci-PJI=; ResponsiveSwitch.DesktopMode=1; ddbSiteBanner:00523ca3-81a4-4d2a-8f86-9e40273af2e2=true; LoginState=2c011278-8a3d-4cf7-a141-221f42694333; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..vh9-_2G_aEjMvJ2PFAAE4w.eheZuOJLp-3NyiAJZzKqaSc91lH0ganCRsDZPYMGIG3265AppaHh0uMQLu_-P6WX.bHRi_gX8PKtQGZwVmNeb1Q; Preferences.Language=1; Preferences.TimeZoneID=1; ddbSiteBanner:d8879cae-c68e-4411-9ea0-a49d77fe5454=true; Ratings=null; _pxvid=f6abe72c-49a6-11ec-b844-57447254714c; ddbSiteBanner:fd981371-c501-4638-a64b-0d9a5c8da68b=true; Geo={%22region%22:%22ON%22%2C%22country%22:%22CA%22%2C%22continent%22:%22NA%22}; RequestVerificationToken=983cd9ce-c03c-4b8b-b6c1-a0561b39cb47; sublevel=ANON; Preferences=undefined; User.ID=109926924; User.Username=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"77896779-c672-42d1-903b-f6caba6ec599"}; pxcts=5eb1b230-525e-11ec-aa22-793bf1473016; WarningNotification.Lock=1' \
-H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0' \
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' \
-H 'Accept-Language: en-US,en;q=0.5' \
-H 'Referer: https://www.dndbeyond.com/' \
-H 'DNT: 1' \
-H 'Connection: keep-alive' \
-H 'Upgrade-Insecure-Requests: 1' \
-H 'Sec-Fetch-Dest: document' \
-H 'Sec-Fetch-Mode: navigate' \
-H 'Sec-Fetch-Site: same-origin' \
-H 'Sec-Fetch-User: ?1' \
-H 'Pragma: no-cache' \
-H 'Cache-Control: no-cache' \
-H 'TE: trailers';
}
url_to_title_candidate() {
@@ -238,35 +237,80 @@ url_to_title_candidate() {
html_to_markdown() {
local f="$(mktemp)"
log url=$1
log 1: url=$1
cat > "$f"
log 2: url=$1
#grep '<title>' "$f" | sed 's/.*>\([^<]*\)<.title>.*/# \1/g'
#| perl -pe 's|<div[^>]*stat-block-ability-scores-data[^>]*>(.*?)</div>|<span>\1</span>|g' \
#| perl -pe 's|<div[^>]*stat-block-ability-scores-heading[^>]*>(.*?)</div>|<strong>\1</strong>|g' \
#| perl -pe 's|<div[^>]*stat-block-ability-scores-stat[^>]*>(.*?)</div>|<p>\1</p>|g' \
#| readability "file://$(realpath $f)" \
local clean="$(
cat "$f" \
| readability "file://$(realpath $f)" \
| pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
| awk '/CONTENT/,/FOOTER/' \
| tr '\n' '\r' \
| perl -pe 's|<blockquote[^>]*>(.*?)</blockquote>|\1|g' \
| perl -pe 's|\r<p[^\r]*<strong>([^<]*)</strong></p>\r|\r<h5>\1</h5>\r|g' \
| perl -pe 's|<t(.)><p>(.*?)</p></t.>|<t\1>\2</t\1>|g' \
| perl -pe 's|<thead>\r*<tr>(?:(?!</tr>).)+</tr>\r*<tr>|<thead><tr>|g' \
| tr '\r' '\n' \
| sed 's/<br>/ /g' \
| sed 's/colspan="[^"]*"//g' \
| tr '\n' '\r' \
| perl -pe 's|<t(.)><p>(.*?)</p></t.>|<t\1>\2</t\1>|g' \
| perl -pe 's|<thead>\r*<tr>(?:(?!</tr>).)+</tr>\r*<tr>|<thead><tr>|g' \
| tr '\r' '\n' \
| sed 's/<a[^>]*>\([^<]*\)<\/a>/\1/g' \
| pandoc \
-f html \
-t markdown+pipe_tables-simple_tables-multiline_tables \
--atx-headers \
--ascii \
--toc \
--wrap=none \
--strip-comments \
-o - \
| grep -v '^\+[-+]*\+$' \
| sed 's/{[^}]*data[^}]*}//g' \
| sed 's/\[](#[^)]*)//'
| sed 's/\[](#[^)]*)//' \
| cat \
| sed 's/{[^}]*}//' \
| sed 's/ *$//' \
| tr '\n' '\r' \
| sed 's/<div>\(.*\)<\/div>/\1/g' \
| tr '\r' '\n' \
)"
log 3: url=$1
rm "$f"
log 4: url=$1
echo "$clean" | (
lastline=""
while read -r line; do
if [ "$line" != "${line#----}" ]; then
echo "# $lastline"
echo ""
break
fi
lastline="$line"
done
cat &> /dev/null
)
log 5: url=$1
(
grep -o '<title.*' "$f" \
| sed 's/^[^>]*>//' \
| sed 's/-.*//' \
| sed 's/^/# /' \
| grep . || echo "# ${1##*/}"
echo
echo "$clean"
log 6: url=$1
) \
| tr '\n' '\r' \
| sed 's/\r\r:::\r\r/\r/g' \
| sed 's/\r\r:::\r*$//' \
| sed 's/::://g' \
| sed 's/&.dquo;/"/g' \
| sed "s/&.squo;/'/g" \
| sed 's/&mdash;/--/g' \
| sed 's/&ndash;/-/g' \
| sed 's/&times;/*/g' \
| sed 's/&shy;//g' \
| sed 's/{\.[^}]*}//g' \
| tr '\r' '\n' \
| sed 's/===+/===|/g' \
| sed 's/+===/|===/g' \
| sed 's/---+/---|/g' \
| sed 's/+---/|---/g' \
| sed '/^|[=|]*|$/s/=/-/g' \
#| grep -v '^|-[-|]*-|$' \
#| sed '/ |\r|-[-|]*-|\r| /s/-/=/g' \
#| sed 's/\r\r| \([^\r]*\) |\r|[^=]/\r\r| \1 |\r|=============================|\r|/g' \
#| sed 's/:::\(.*\):::/```\1```/g' \
rm "$f"
return $?
}

View File

@@ -31,24 +31,41 @@ clean_id() {
echo "$*" | sed 's/[^a-zA-Z0-9_-]/_/g'
}
#######################################
# Strip leading and trailing spaces from every line of the input.
# Arguments: optional text; all arguments are joined with a single space.
#            When no argument is given, the text is read from stdin.
# Outputs:   the trimmed text on stdout.
#######################################
spaceless() {
  local input="$*"
  if [ "$#" -lt 1 ]; then
    input="$(cat)"
  fi
  # printf instead of echo: echo would treat input such as "-n" or "-e"
  # as an option and print nothing.
  printf '%s\n' "$input" | sed -e 's/^ *//' -e 's/ *$//'
}
scraped_to_meili() {
find "$SCRAPED" -type f \
| sort \
| while read -r md_path; do
file_id="$(clean_id "$md_path")"
h1="$(head -n 1 "$md_path" | sed 's/^# //')"
grep '^## ' "$md_path" | sed 's/^## //' | while read -r h2; do
h1="$(grep '^#' "$md_path" | head -n 1 | sed 's/^##*//' | spaceless)"
grep '^##*' "$md_path" | while read -r h2_line; do
local h2="$(echo "$h2_line" | sed 's/^##*//' | spaceless)"
local content="$(
echo "# $h1"
echo
echo ""
echo "## $h2"
echo
cat "$md_path" \
| sed -e '0,/^\#\# '"$h2"'/d' \
| sed -e '/^\#\# .*/,$d' \
| head -n 25
echo ""
found_h2=false
found_stopper=false
cat "$md_path" | while read -r line; do
if ! $found_h2 && [ "$line" == "$h2_line" ]; then
found_h2=true
elif $found_h2 && [ "$line" != "${line#"#"}" ]; then
found_stopper=true
fi
if $found_stopper; then continue; fi
if $found_h2; then echo "$line"; fi
done
)"
if [ $(echo "$content" | wc -l | awk '{print $NF}') -lt 5 ]; then
content="$(echo "$content" | head -n 25)"
if [ $(echo "$content" | wc -w | awk '{print $NF}') -lt 25 ]; then
log "skipping content because less than 25 words found: $content"
continue
fi
local id="$(clean_id "${file_id}_${h1}_${h2}")"
@@ -56,6 +73,7 @@ scraped_to_meili() {
log h1=$h1
log h2=$h2
log content="${#content}"
log "submitting $id/$h1/$h2"
curl -sS \
"$MEILI"/indexes/scraped/documents \
-X POST \
@@ -69,3 +87,11 @@ scraped_to_meili() {
done
done
}
# Script entry point: when invoked without arguments, ask for confirmation
# and, if the reply is affirmative, ingest everything under $SCRAPED and
# print the exit status of scraped_to_meili.
if [ "$#" -eq 0 ]; then
  # The prompt runs in a subshell so the reply variable does not leak into
  # the script's global scope.
  if (
    # -r keeps backslashes in the reply literal.
    read -r -p "would you like to ingest from $SCRAPED? " yn
    # Only a reply that starts with y/Y counts as yes; matching "y" anywhere
    # would also accept answers such as "nay" or "no way".
    [[ "$yn" == [yY]* ]]
  ); then
    scraped_to_meili
    echo "$?"
  fi
fi