readme
parent
704e956daf
commit
7c06268f8e
|
|
@ -0,0 +1,10 @@
|
|||
# dnd beyond scrape
|
||||
|
||||
* run `SEED=123 bash scrape.sh`
|
||||
* see cached files, cookie, output in $TMPDIR/scrape.$SEED
|
||||
* may have to replace `Cookie: ` in scrape.sh if expired
|
||||
|
||||
## Todo
|
||||
|
||||
* fix tables to pre-formatted
|
||||
* download and localize images (webp!)
|
||||
30
scrape.sh
30
scrape.sh
|
|
@ -96,8 +96,11 @@ scrape_book() {
|
|||
log scraping book title from $f
|
||||
local title="$(grep '^\#' "$f" | head -n 1 | sed 's/^\#*//' | sed 's/^[ ]*//' | sed 's/[ ]*$//')"
|
||||
if [ -z "$title" ]; then
|
||||
log "WARNING: NO TITLE FOUND IN $f ($chapter): $(head -n 3 $f)"
|
||||
return 1
|
||||
title="${chapter%/}"
|
||||
title="${title##*/}"
|
||||
title="${title%%"?"*}"
|
||||
log "WARNING: NO TITLE FOUND IN $f ($chapter): $(head -n 3 $f); using $title from $chapter (enter to continue)"
|
||||
read
|
||||
fi
|
||||
mv "$f" "$output"/"$title"
|
||||
done
|
||||
|
|
@ -181,6 +184,7 @@ _scrape_dndbeyond_cache_f() {
|
|||
_scrape_dndbeyond() {
|
||||
log foo:_scrape_dndbeyond $@
|
||||
rate_limit_1s scrape_dndbeyond
|
||||
touch "$TMPDIR/cookies.txt"
|
||||
curl -L -sS "$1" \
|
||||
-H 'authority: www.dndbeyond.com' \
|
||||
-H 'cache-control: max-age=0' \
|
||||
|
|
@ -193,28 +197,12 @@ _scrape_dndbeyond() {
|
|||
-H 'sec-fetch-user: ?1' \
|
||||
-H 'sec-fetch-dest: document' \
|
||||
-H 'accept-language: en-US,en;q=0.9' \
|
||||
-H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=qvJb-g60h1HSymUFEiAmb-Noh35mNbUmcFgIE0qEiisCj/BqXrgrrsRUlODrjAaSGknpuHBn0S2b4rGTBFvaIA==:5/4beZX4ojNLKbAeZH9rQFcGIQ/ijS7lK29wvhLkgB4O0L2GFpRuU5FOpRztEiKg9EoUZr2cPB2Z8w3DKVkaKJPlwcIt23DM5PgQLZEPoA8=; ResponsiveSwitch.DesktopMode=1; LoginState=0ae9ef26-a1ae-4cb2-8af7-f18c21b99811; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FawE7bpRlluZsWq7Iny1xQ.nBX_qdZq5varuXNGkl70QeJ03JIRT3sxxQi6iCzazt-SjlK1hEuINX0sK85gJKVT.8wlUwv…sername=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"5ce909a9-34aa-4b15-9b6d-00ee48e13e05"}; ddbSiteBanner:bb279d28-f265-49d5-ad41-6d716915bbdf=true; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E95275F44EAFDFB8393A8DEFA33DD0B90284D; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E95275F44EAFDFB8393A8DEFA33DD0B90284D; WarningNotification.Lock=1' \
|
||||
-H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=dGa4DRIksfo8m7Siqu0z-stTfu9R0jC2I53wbDc/nEiS/YpOxaQaWLECaBu4-wCfqyCDmn2c/PpjW1ESR3pWow==:RH/b5abeFbAM04R3V7bm14eVCFGzYpPFEt6-0X4z2CCnEnFxcFIQkGBwdews8JTWKuNsgYx758Xp3RcXupSxnPMBLfFLJ8MSNyzyaOozezE=; Geo={%22region%22:%22UT%22%2C%22country%22:%22US%22%2C%22continent%22:%22NA%22}; ResponsiveSwitch.DesktopMode=1; RequestVerificationToken=2cdddc90-fbbb-45a4-929a-1fff6879a852; sublevel=ANON; Preferences=undefined; marketplace_filter_show_owned=false; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..YZH4-9MvXBhf0v5KdkH8vA.J5jwdmKCBGfAjmlGY1UOr6UD0am0hS0qiyY-xBx-j3CEM8CrhlzyxRDo04YK3nc3.Z5M5dZnstMa9OIkrEN29HQ; User.ID=109926924; User.Username=squeaky2x3; Preferences.Language=1; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"71f1f0eb-3523-4f94-96d9-22d3511ea6b6"}; Preferences.TimeZoneID=1; LoginState=8ab45ac1-5777-4613-a3f5-7439058fec58; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF083AB9FE2016C3192A7805F9642538006F9B284A9F525C1AA0DF220CB30AEE9DCF61CB6C98E40F1ADC8CB1E6C93F2D5E4FC; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF083AB9FE2016C3192A7805F9642538006F9B284A9F525C1AA0DF220CB30AEE9DCF61CB6C98E40F1ADC8CB1E6C93F2D5E4FC; cobalt-token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJodHRwOi8vc2NoZW1hcy54bWxzb2FwLm9yZy93cy8yMDA1LzA1L2lkZW50aXR5L2NsYWltcy9uYW1laWRlbnRpZmllciI6IjEwOTkyNjkyNCIsImh0dHA6Ly9zY2hlbWFzLnhtbHNvYXAub3JnL3dzLzIwMDUvMDUvaWRlbnRpdHkvY2xhaW1zL25hbWUiOiJzcXVlYWt5MngzIiwiaHR0cDovL3NjaGVtYXMueG1sc29hcC5vcmcvd3MvMjAwNS8wNS9pZGVudGl0eS9jbGFpbXMvZW1haWxhZGRyZXNzIjoic3F1ZWFreTJ4M0BnbWFpbC5jb20iLCJkaXNwbGF5TmFtZSI6InNxdWVha3kyeDMiLCJodHRwOi8vc2NoZW1hcy5taWNyb3NvZnQuY29tL3dzLzIwMDgvMDYvaWRlbnRpdHkvY2xhaW1zL3JvbGUiOiJSZWdpc3RlcmVkIFVzZXJzIiwibmJmIjoxNjU2Mzc2ODE0LCJleHAiOjE2NTYzNzcxMTQsImlzcyI6ImRuZGJleW9uZC5jb20iLCJhdWQiOiJkbmRiZXlvbmQuY29tIn0.ygIntjw5eL4hlACI7T0VpdWKUghzaJZtrGsVllsgEtk; WarningNotification.Lock=1' \
|
||||
-H 'dnt: 1' \
|
||||
-b "$TMPDIR/cookies.txt" \
|
||||
-c "$TMPDIR/cookies.txt" \
|
||||
--compressed
|
||||
return
|
||||
curl -L -sS \
|
||||
"$1" \
|
||||
--compressed \
|
||||
-H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=NHdkEjOXnAEHojER86i0Egmnm85GLvifiQxURPISwVtzWLNwuqQbIk1Y/2MjheQqNP4CsBXhuc4UG5-qcmLjYw==:hGx3TOEjeVaJPMfrXXnmyG7-btpQhZutwvpVBKwEbNXTAkEQvgmIs/faKB6s8537Nivj6epH9HpOWYURQeazeMwSMReW6EMazW09Rci-PJI=; ResponsiveSwitch.DesktopMode=1; ddbSiteBanner:00523ca3-81a4-4d2a-8f86-9e40273af2e2=true; LoginState=2c011278-8a3d-4cf7-a141-221f42694333; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..vh9-_2G_aEjMvJ2PFAAE4w.eheZuOJLp-3NyiAJZzKqaSc91lH0ganCRsDZPYMGIG3265AppaHh0uMQLu_-P6WX.bHRi_gX8PKtQGZwVmNeb1Q; Preferences.Language=1; Preferences.TimeZoneID=1; ddbSiteBanner:d8879cae-c68e-4411-9ea0-a49d77fe5454=true; Ratings=null; _pxvid=f6abe72c-49a6-11ec-b844-57447254714c; ddbSiteBanner:fd981371-c501-4638-a64b-0d9a5c8da68b=true; Geo={%22region%22:%22ON%22%2C%22country%22:%22CA%22%2C%22continent%22:%22NA%22}; RequestVerificationToken=983cd9ce-c03c-4b8b-b6c1-a0561b39cb47; sublevel=ANON; Preferences=undefined; User.ID=109926924; User.Username=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"77896779-c672-42d1-903b-f6caba6ec599"}; pxcts=5eb1b230-525e-11ec-aa22-793bf1473016; WarningNotification.Lock=1' \
|
||||
-H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0' \
|
||||
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' \
|
||||
-H 'Accept-Language: en-US,en;q=0.5' \
|
||||
-H 'Referer: https://www.dndbeyond.com/' \
|
||||
-H 'DNT: 1' \
|
||||
-H 'Connection: keep-alive' \
|
||||
-H 'Upgrade-Insecure-Requests: 1' \
|
||||
-H 'Sec-Fetch-Dest: document' \
|
||||
-H 'Sec-Fetch-Mode: navigate' \
|
||||
-H 'Sec-Fetch-Site: same-origin' \
|
||||
-H 'Sec-Fetch-User: ?1' \
|
||||
-H 'Pragma: no-cache' \
|
||||
-H 'Cache-Control: no-cache' \
|
||||
-H 'TE: trailers';
|
||||
}
|
||||
|
||||
url_to_title_candidate() {
|
||||
|
|
|
|||
Loading…
Reference in New Issue