only hrefs when seeking chapter list

master
Bel LaPointe 2021-12-15 12:29:58 -07:00
parent 1d21cead02
commit 95a66d1a18
1 changed file with 18 additions and 3 deletions

21
poc.sh
View File

@@ -10,6 +10,11 @@ main() {
#set -x #set -x
#list_chapters_in_book https://www.dndbeyond.com/sources/sacoc
#list_chapters_in_book https://www.dndbeyond.com/sources/twbtw
#list_chapters_in_book https://www.dndbeyond.com/sources/hftt
#list_chapters_in_book https://www.dndbeyond.com/sources/sdw
#should_scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits || true #should_scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits || true
#scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits #scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits
@@ -22,7 +27,6 @@ main() {
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/phb/credits | less #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/phb/credits | less
#list_chapters_in_book https://www.dndbeyond.com/sources/sdw
#scrape_book_chapter https://www.dndbeyond.com/sources/phb/races | less #scrape_book_chapter https://www.dndbeyond.com/sources/phb/races | less
#scrape_book_chapter https://www.dndbeyond.com/sources/phb/appendix-a-conditions | less #scrape_book_chapter https://www.dndbeyond.com/sources/phb/appendix-a-conditions | less
@@ -106,12 +110,23 @@ scrape_book_chapter() {
list_chapters_in_book() { list_chapters_in_book() {
local book_url="${1%/}" local book_url="${1%/}"
local book_domain="${book_url%%.com/*}.com"
log domain=$book_domain
local book_url_path="${book_url##*.com}"
book_url_path="${book_url_path%#*}"
if ! should_scrape_book_chapter "$book_url"; then
return
fi
local raw="$(scrape_dndbeyond "$book_url")" local raw="$(scrape_dndbeyond "$book_url")"
if echo "$raw" | grep -q 'https:..www.dndbeyond.com.compendium[^"]*'; then if echo "$raw" | grep 'href="https:..www.dndbeyond.com.compendium[^"]*' &> /dev/null; then
echo "$raw" | grep -o 'https:..www.dndbeyond.com.compendium[^"]*' echo "$raw" | grep -o 'https:..www.dndbeyond.com.compendium[^"]*'
elif echo "$raw" | grep -q "$book_url/"; then elif echo "$raw" | grep "href=\"$book_url/" &> /dev/null; then
echo "$raw" | grep -o "${book_url//\//.}\/[^\"]*" echo "$raw" | grep -o "${book_url//\//.}\/[^\"]*"
elif echo "$raw" | grep "href=\"${book_url_path//\//.}\\/" &> /dev/null; then
echo "$raw" | grep -o "href=\"${book_url_path//\//.}\/[^\"]*" | sed 's/href=\"//' | sed "s/^/${book_domain//\//\\/}/g"
else else
echo "ERROR: FOUND NO CHAPTERS IN $book_url" >&2 echo "ERROR: FOUND NO CHAPTERS IN $book_url" >&2
return 1 return 1