From d6e7061738d32e30f01640ce6caae01c4b06c71e Mon Sep 17 00:00:00 2001 From: Bel LaPointe Date: Wed, 15 Dec 2021 06:15:59 -0700 Subject: [PATCH] fix intro vs chap vs appendix --- poc.sh | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/poc.sh b/poc.sh index ed2735c..fdd3758 100644 --- a/poc.sh +++ b/poc.sh @@ -4,6 +4,7 @@ main() { set -e set -o pipefail + #set -x local output="$(mktemp -d)" log "$output" @@ -49,9 +50,14 @@ scrape_book() { local f="$(mktemp)" for chapter in $(list_chapters_in_book "$book_url"); do - log chapter=$chapter + log scraping book chapter $chapter of $book_url scrape_book_chapter "$chapter" > "$f" - local title="$(grep -E 'Chapter|Appendix' "$f" | head -n 1 | sed 's/^\#*//' | sed 's/^[ ]*//' | sed 's/[ ]*$//')" + log scraping book title from $f + local title="$(grep '^\#' "$f" | head -n 1 | sed 's/^\#*//' | sed 's/^[ ]*//' | sed 's/[ ]*$//')" + if [ -z "$title" ]; then + log "WARNING: NO TITLE FOUND IN $f ($chapter): $(head -n 3 $f)" + return 1 + fi mv "$f" "$output"/"$title" done @@ -67,6 +73,7 @@ scrape_book_chapter() { list_chapters_in_book() { local book_url="${1%/}" local raw="$(scrape_dndbeyond "$book_url")" + if echo "$raw" | grep -q 'https:..www.dndbeyond.com.compendium[^"]*'; then echo "$raw" | grep -o 'https:..www.dndbeyond.com.compendium[^"]*' elif echo "$raw" | grep -q "$book_url/"; then @@ -111,7 +118,12 @@ html_to_markdown() { | sed 's/\[\([^]]*\)]\(([^)]*)\)*\({[^}]*}\)*/\1/g' \ | sed "s/\\\\\([\"']\)/\1/g" )" - local target="$(echo "$out" | grep -q '^[#]* Appendix' && echo Appendix || echo Chapter)" + local target="$( + (echo "$out" | grep -q '^##* Appendix' && echo Appendix) \ + || (echo "$out" | grep -q '^##* Chapter' && echo Chapter) \ + || (echo "$out" | grep -q '^##* Introduction' && echo Introduction) \ + || echo Chapter + )" echo "$out" \ | sed -e '/^# '"$target"'/p' -e '0,/^# '"$target"'/d' } @@ -122,14 +134,13 @@ rate_limit_1s() { mkdir -p "$d" local last_run="$(date -r "$d/$name" +%s)" local now="$(date +%s)" - local interval=3 + local interval=6 log "should sleep while $(($(date +%s)-last_run)) < $interval" if [ -f "$d/$name" ]; then while [ "$(($(date +%s)-last_run))" -lt $interval ]; do sleep 1 done fi - log "done sleeping" touch "$d/$name" }