diff --git a/README.md b/README.md new file mode 100644 index 0000000..6ca6f1e --- /dev/null +++ b/README.md @@ -0,0 +1,10 @@ +# dnd beyond scrape + +* run `SEED=123 bash scrape.sh` +* see cached files, cookie, output in $TMPDIR/scrape.$SEED +* may have to replace `Cookie: ` in scrape.sh if expired + +## Todo + +* fix tables to pre-formatted +* download and localize images (webp!) diff --git a/scrape.sh b/scrape.sh index 41c0972..737950d 100644 --- a/scrape.sh +++ b/scrape.sh @@ -96,8 +96,11 @@ scrape_book() { log scraping book title from $f local title="$(grep '^\#' "$f" | head -n 1 | sed 's/^\#*//' | sed 's/^[ ]*//' | sed 's/[ ]*$//')" if [ -z "$title" ]; then - log "WARNING: NO TITLE FOUND IN $f ($chapter): $(head -n 3 $f)" - return 1 + title="${chapter%/}" + title="${title##*/}" + title="${title%%"?"*}" + log "WARNING: NO TITLE FOUND IN $f ($chapter): $(head -n 3 $f); using $title from $chapter (enter to continue)" + read fi mv "$f" "$output"/"$title" done @@ -181,6 +184,7 @@ _scrape_dndbeyond_cache_f() { _scrape_dndbeyond() { log foo:_scrape_dndbeyond $@ rate_limit_1s scrape_dndbeyond + touch "$TMPDIR/cookies.txt" curl -L -sS "$1" \ -H 'authority: www.dndbeyond.com' \ -H 'cache-control: max-age=0' \ @@ -193,28 +197,12 @@ _scrape_dndbeyond() { -H 'sec-fetch-user: ?1' \ -H 'sec-fetch-dest: document' \ -H 'accept-language: en-US,en;q=0.9' \ - -H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=qvJb-g60h1HSymUFEiAmb-Noh35mNbUmcFgIE0qEiisCj/BqXrgrrsRUlODrjAaSGknpuHBn0S2b4rGTBFvaIA==:5/4beZX4ojNLKbAeZH9rQFcGIQ/ijS7lK29wvhLkgB4O0L2GFpRuU5FOpRztEiKg9EoUZr2cPB2Z8w3DKVkaKJPlwcIt23DM5PgQLZEPoA8=; ResponsiveSwitch.DesktopMode=1; LoginState=0ae9ef26-a1ae-4cb2-8af7-f18c21b99811; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FawE7bpRlluZsWq7Iny1xQ.nBX_qdZq5varuXNGkl70QeJ03JIRT3sxxQi6iCzazt-SjlK1hEuINX0sK85gJKVT.8wlUwv…sername=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"5ce909a9-34aa-4b15-9b6d-00ee48e13e05"}; ddbSiteBanner:bb279d28-f265-49d5-ad41-6d716915bbdf=true; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E95275F44EAFDFB8393A8DEFA33DD0B90284D; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF083E14A2E133FB99235E1D3CC671549BC8DF3DD0A01DD4815C023F988F55E0E95275F44EAFDFB8393A8DEFA33DD0B90284D; WarningNotification.Lock=1' \ + -H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=dGa4DRIksfo8m7Siqu0z-stTfu9R0jC2I53wbDc/nEiS/YpOxaQaWLECaBu4-wCfqyCDmn2c/PpjW1ESR3pWow==:RH/b5abeFbAM04R3V7bm14eVCFGzYpPFEt6-0X4z2CCnEnFxcFIQkGBwdews8JTWKuNsgYx758Xp3RcXupSxnPMBLfFLJ8MSNyzyaOozezE=; Geo={%22region%22:%22UT%22%2C%22country%22:%22US%22%2C%22continent%22:%22NA%22}; ResponsiveSwitch.DesktopMode=1; RequestVerificationToken=2cdddc90-fbbb-45a4-929a-1fff6879a852; sublevel=ANON; Preferences=undefined; marketplace_filter_show_owned=false; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..YZH4-9MvXBhf0v5KdkH8vA.J5jwdmKCBGfAjmlGY1UOr6UD0am0hS0qiyY-xBx-j3CEM8CrhlzyxRDo04YK3nc3.Z5M5dZnstMa9OIkrEN29HQ; User.ID=109926924; User.Username=squeaky2x3; Preferences.Language=1; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"71f1f0eb-3523-4f94-96d9-22d3511ea6b6"}; Preferences.TimeZoneID=1; LoginState=8ab45ac1-5777-4613-a3f5-7439058fec58; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF083AB9FE2016C3192A7805F9642538006F9B284A9F525C1AA0DF220CB30AEE9DCF61CB6C98E40F1ADC8CB1E6C93F2D5E4FC; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF083AB9FE2016C3192A7805F9642538006F9B284A9F525C1AA0DF220CB30AEE9DCF61CB6C98E40F1ADC8CB1E6C93F2D5E4FC; cobalt-token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJodHRwOi8vc2NoZW1hcy54bWxzb2FwLm9yZy93cy8yMDA1LzA1L2lkZW50aXR5L2NsYWltcy9uYW1laWRlbnRpZmllciI6IjEwOTkyNjkyNCIsImh0dHA6Ly9zY2hlbWFzLnhtbHNvYXAub3JnL3dzLzIwMDUvMDUvaWRlbnRpdHkvY2xhaW1zL25hbWUiOiJzcXVlYWt5MngzIiwiaHR0cDovL3NjaGVtYXMueG1sc29hcC5vcmcvd3MvMjAwNS8wNS9pZGVudGl0eS9jbGFpbXMvZW1haWxhZGRyZXNzIjoic3F1ZWFreTJ4M0BnbWFpbC5jb20iLCJkaXNwbGF5TmFtZSI6InNxdWVha3kyeDMiLCJodHRwOi8vc2NoZW1hcy5taWNyb3NvZnQuY29tL3dzLzIwMDgvMDYvaWRlbnRpdHkvY2xhaW1zL3JvbGUiOiJSZWdpc3RlcmVkIFVzZXJzIiwibmJmIjoxNjU2Mzc2ODE0LCJleHAiOjE2NTYzNzcxMTQsImlzcyI6ImRuZGJleW9uZC5jb20iLCJhdWQiOiJkbmRiZXlvbmQuY29tIn0.ygIntjw5eL4hlACI7T0VpdWKUghzaJZtrGsVllsgEtk; WarningNotification.Lock=1' \ -H 'dnt: 1' \ + -b "$TMPDIR/cookies.txt" \ + -c "$TMPDIR/cookies.txt" \ --compressed return - curl -L -sS \ - "$1" \ - --compressed \ - -H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=NHdkEjOXnAEHojER86i0Egmnm85GLvifiQxURPISwVtzWLNwuqQbIk1Y/2MjheQqNP4CsBXhuc4UG5-qcmLjYw==:hGx3TOEjeVaJPMfrXXnmyG7-btpQhZutwvpVBKwEbNXTAkEQvgmIs/faKB6s8537Nivj6epH9HpOWYURQeazeMwSMReW6EMazW09Rci-PJI=; ResponsiveSwitch.DesktopMode=1; ddbSiteBanner:00523ca3-81a4-4d2a-8f86-9e40273af2e2=true; LoginState=2c011278-8a3d-4cf7-a141-221f42694333; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..vh9-_2G_aEjMvJ2PFAAE4w.eheZuOJLp-3NyiAJZzKqaSc91lH0ganCRsDZPYMGIG3265AppaHh0uMQLu_-P6WX.bHRi_gX8PKtQGZwVmNeb1Q; Preferences.Language=1; Preferences.TimeZoneID=1; ddbSiteBanner:d8879cae-c68e-4411-9ea0-a49d77fe5454=true; Ratings=null; _pxvid=f6abe72c-49a6-11ec-b844-57447254714c; ddbSiteBanner:fd981371-c501-4638-a64b-0d9a5c8da68b=true; Geo={%22region%22:%22ON%22%2C%22country%22:%22CA%22%2C%22continent%22:%22NA%22}; RequestVerificationToken=983cd9ce-c03c-4b8b-b6c1-a0561b39cb47; sublevel=ANON; Preferences=undefined; User.ID=109926924; User.Username=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"77896779-c672-42d1-903b-f6caba6ec599"}; pxcts=5eb1b230-525e-11ec-aa22-793bf1473016; WarningNotification.Lock=1' \ - -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0' \ - -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' \ - -H 'Accept-Language: en-US,en;q=0.5' \ - -H 'Referer: https://www.dndbeyond.com/' \ - -H 'DNT: 1' \ - -H 'Connection: keep-alive' \ - -H 'Upgrade-Insecure-Requests: 1' \ - -H 'Sec-Fetch-Dest: document' \ - -H 'Sec-Fetch-Mode: navigate' \ - -H 'Sec-Fetch-Site: same-origin' \ - -H 'Sec-Fetch-User: ?1' \ - -H 'Pragma: no-cache' \ - -H 'Cache-Control: no-cache' \ - -H 'TE: trailers'; } url_to_title_candidate() {