diff --git a/poc.sh b/poc.sh index 21079dd..4cf6b25 100644 --- a/poc.sh +++ b/poc.sh @@ -10,6 +10,11 @@ main() { #set -x + #list_chapters_in_book https://www.dndbeyond.com/sources/sacoc + #list_chapters_in_book https://www.dndbeyond.com/sources/twbtw + #list_chapters_in_book https://www.dndbeyond.com/sources/hftt + #list_chapters_in_book https://www.dndbeyond.com/sources/sdw + #should_scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits || true #scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits @@ -22,7 +27,6 @@ main() { #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/phb/credits | less - #list_chapters_in_book https://www.dndbeyond.com/sources/sdw #scrape_book_chapter https://www.dndbeyond.com/sources/phb/races | less #scrape_book_chapter https://www.dndbeyond.com/sources/phb/appendix-a-conditions | less @@ -106,12 +110,23 @@ scrape_book_chapter() { list_chapters_in_book() { local book_url="${1%/}" + local book_domain="${book_url%%.com/*}.com" + log domain=$book_domain + local book_url_path="${book_url##*.com}" + book_url_path="${book_url_path%#*}" + + if ! should_scrape_book_chapter "$book_url"; then + return + fi + local raw="$(scrape_dndbeyond "$book_url")" - if echo "$raw" | grep -q 'https:..www.dndbeyond.com.compendium[^"]*'; then + if echo "$raw" | grep 'href="https:..www.dndbeyond.com.compendium[^"]*' &> /dev/null; then echo "$raw" | grep -o 'https:..www.dndbeyond.com.compendium[^"]*' - elif echo "$raw" | grep -q "$book_url/"; then + elif echo "$raw" | grep "href=\"$book_url/" &> /dev/null; then echo "$raw" | grep -o "${book_url//\//.}\/[^\"]*" + elif echo "$raw" | grep "href=\"${book_url_path//\//.}\\/" &> /dev/null; then + echo "$raw" | grep -o "href=\"${book_url_path//\//.}\/[^\"]*" | sed 's/href=\"//' | sed "s/^/${book_domain//\//\\/}/g" else echo "ERROR: FOUND NO CHAPTERS IN $book_url" >&2 return 1