From e6d8cfae9421e79b886df58623731b4c49e1bfb4 Mon Sep 17 00:00:00 2001 From: bel Date: Mon, 27 Jun 2022 18:15:06 -0600 Subject: [PATCH] ok nwo i think --- scrape.sh | 99 ++++++++++++++++++++----------------------------------- 1 file changed, 36 insertions(+), 63 deletions(-) diff --git a/scrape.sh b/scrape.sh index 8bb257a..cc02bee 100644 --- a/scrape.sh +++ b/scrape.sh @@ -4,7 +4,8 @@ main() { ensure - local output="${1:-$(mktemp -d)}" + local output="${1:-"$TMPDIR/result"}" + mkdir -p "$output" log "$output" #set -x @@ -21,9 +22,10 @@ main() { #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/krenkos-way #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/bestiary #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/azorius-senate - scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e - return $? + #scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists + #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/basic-rules/appendix-b-gods-of-the-multiverse + #return $? #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/phb/credits | less @@ -33,6 +35,8 @@ main() { #scrape_book "$output" https://www.dndbeyond.com/sources/phb + log "books=$(scrape_books)" + sleep 1 for book in $(scrape_books); do book="${book%/}" log book=$book @@ -155,9 +159,11 @@ scrape_dndbeyond() { local f="$d/$(echo "$*" | base64)" if [ -f "$f" ]; then if cat "$f" | grep .; then + log "foo:scrape_dndbeyond: yes cached: $f: $@" return fi fi + log "foo:scrape_dndbeyond: not cached: $f: $@" _scrape_dndbeyond "$@" | tee "$f" } @@ -176,7 +182,7 @@ _scrape_dndbeyond() { -H 'sec-fetch-user: ?1' \ -H 'sec-fetch-dest: document' \ -H 'accept-language: en-US,en;q=0.9' \ - -H 'Cookie: _pxhd=qvJb-g60h1HSymUFEiAmb-Noh35mNbUmcFgIE0qEiisCj/BqXrgrrsRUlODrjAaSGknpuHBn0S2b4rGTBFvaIA==:5/4beZX4ojNLKbAeZH9rQFcGIQ/ijS7lK29wvhLkgB4O0L2GFpRuU5FOpRztEiKg9EoUZr2cPB2Z8w3DKVkaKJPlwcIt23DM5PgQLZEPoA8=; ResponsiveSwitch.DesktopMode=1; LoginState=0ae9ef26-a1ae-4cb2-8af7-f18c21b99811; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FawE7bpRlluZsWq7Iny1xQ.nBX_qdZq5varuXNGkl70QeJ03JIRT3sxxQi6iCzazt-SjlK1hEuINX0sK85gJKVT.8wlUwvfZiME9fAdBKj0rIw; Preferences.Language=1; Pref…sername=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"5ce909a9-34aa-4b15-9b6d-00ee48e13e05"}; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF08335AE71D0834BC2696233572C85266F7F9F4B5BF89DD4952FDA9AD462250417F8B7C4975F1633599CC9D18D74C3B57CBA; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF08335AE71D0834BC2696233572C85266F7F9F4B5BF89DD4952FDA9AD462250417F8B7C4975F1633599CC9D18D74C3B57CBA; ddbSiteBanner:bb279d28-f265-49d5-ad41-6d716915bbdf=true; WarningNotification.Lock=1' \ + -H 'Cookie: Preferences=undefined; Preferences=undefined; _pxhd=qvJb-g60h1HSymUFEiAmb-Noh35mNbUmcFgIE0qEiisCj/BqXrgrrsRUlODrjAaSGknpuHBn0S2b4rGTBFvaIA==:5/4beZX4ojNLKbAeZH9rQFcGIQ/ijS7lK29wvhLkgB4O0L2GFpRuU5FOpRztEiKg9EoUZr2cPB2Z8w3DKVkaKJPlwcIt23DM5PgQLZEPoA8=; ResponsiveSwitch.DesktopMode=1; LoginState=0ae9ef26-a1ae-4cb2-8af7-f18c21b99811; G_ENABLED_IDPS=google; CobaltSession=eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..FawE7bpRlluZsWq7Iny1xQ.nBX_qdZq5varuXNGkl70QeJ03JIRT3sxxQi6iCzazt-SjlK1hEuINX0sK85gJKVT.8wlUwv…sername=squeaky2x3; UserInfo={"UserId":109926924,"UserJoinDate":"2021-03-25","UserSessionId":"5ce909a9-34aa-4b15-9b6d-00ee48e13e05"}; AWSELB=17A593B6CA59C3C4856B812F84CD401A582EF08335AE71D0834BC2696233572C85266F7F9F4B5BF89DD4952FDA9AD462250417F8B7C4975F1633599CC9D18D74C3B57CBA; AWSELBCORS=17A593B6CA59C3C4856B812F84CD401A582EF08335AE71D0834BC2696233572C85266F7F9F4B5BF89DD4952FDA9AD462250417F8B7C4975F1633599CC9D18D74C3B57CBA; ddbSiteBanner:bb279d28-f265-49d5-ad41-6d716915bbdf=true; WarningNotification.Lock=1' \ -H 'dnt: 1' \ --compressed return @@ -221,69 +227,36 @@ url_to_title_candidate() { html_to_markdown() { local f="$(mktemp)" log url=$1 + log 1: url=$1 cat > "$f" - grep '' "$f" | sed 's/.*>\([^<]*\)<.title>.*/\1/g' >&2 - cat "$f" \ - | readability "file://$(realpath $f)" \ - | pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \ - | sed 's/{[^}]*data[^}]*}//g' \ - | sed 's/\[](#[^)]*)//' \ - | cat - rm "$f" - return $? - local input="$(cat)" - local url_title_candidate="$(url_to_title_candidate "$1")" - local url_title_candidate_2="$(url_to_title_candidate "$(dirname "$1")")" - local out="$( - echo "$input" \ + log 2: url=$1 + #grep '<title>' "$f" | sed 's/.*>\([^<]*\)<.title>.*/# \1/g' + local clean="$( + cat "$f" \ + | readability "file://$(realpath $f)" \ | pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \ - | sed '/^[ ]*:::.*/d' \ - | sed -e '/^Share$/,$d' \ - | sed 's/^\(\#\#*\) \[](#\([^)]*\))/\1 /' \ - | sed 's/{#\([^ ]*\)[^}]*}/\n\n<a name="\1"><\/a>/' \ - | sed 's/\[\([^]]*\)]\(([^)]*)\)*\({[^}]*}\)*/\1/g' \ - | sed "s/\\\\\([\"']\)/\1/g" + | sed 's/{[^}]*data[^}]*}//g' \ + | sed 's/\[](#[^)]*)//' )" - local min_level_title=0 - for levels in $(echo "$out" | grep -o '^##*'); do - if ((min_level_title<${#levels})); then - min_level_title=${#levels} - fi - done - local less_n_levels=0 - if ((min_level_title>1)); then - less_n_levels=$((min_level_title-1)) - fi - local trim_prefix="" - for i in $(seq 1 $less_n_levels); do - trim_prefix+="#" - done - local target="$( - title="$( - echo "$input" \ - | grep -o '<title>.*' - )" - title="${title%% - *}" - title="${title#<title>}" - echo "$title" - )" - log target=$target - local buffer=("# $target") - for line in $( - echo "$out" | while read -r line; do - echo "$line" | base64 + log 3: url=$1 + rm "$f" + log 4: url=$1 + echo "$clean" | ( + lastline="" + while read -r line; do + if [ "$line" != "${line#----}" ]; then + echo "# $lastline" + echo "" + break + fi + lastline="$line" done - ); do - line="$(echo "$line" | base64 --decode)" - if echo "$line" | grep -q "^$target$"; then - buffer=("# $target") - else - buffer+=("$line") - fi - done - for line in "${buffer[@]}"; do - echo "${line#$trim_prefix}" - done + cat &> /dev/null + ) + log 5: url=$1 + echo "$clean" + log 6: url=$1 + return $? } rate_limit_1s() {