only hrefs when seeking chapter list
parent
1d21cead02
commit
95a66d1a18
21
poc.sh
21
poc.sh
|
|
@ -10,6 +10,11 @@ main() {
|
|||
|
||||
#set -x
|
||||
|
||||
#list_chapters_in_book https://www.dndbeyond.com/sources/sacoc
|
||||
#list_chapters_in_book https://www.dndbeyond.com/sources/twbtw
|
||||
#list_chapters_in_book https://www.dndbeyond.com/sources/hftt
|
||||
#list_chapters_in_book https://www.dndbeyond.com/sources/sdw
|
||||
|
||||
#should_scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits || true
|
||||
#scrape_book_chapter https://www.dndbeyond.com/sources/ai/credits
|
||||
|
||||
|
|
@ -22,7 +27,6 @@ main() {
|
|||
|
||||
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/phb/credits | less
|
||||
|
||||
#list_chapters_in_book https://www.dndbeyond.com/sources/sdw
|
||||
|
||||
#scrape_book_chapter https://www.dndbeyond.com/sources/phb/races | less
|
||||
#scrape_book_chapter https://www.dndbeyond.com/sources/phb/appendix-a-conditions | less
|
||||
|
|
@ -106,12 +110,23 @@ scrape_book_chapter() {
|
|||
|
||||
list_chapters_in_book() {
|
||||
local book_url="${1%/}"
|
||||
local book_domain="${book_url%%.com/*}.com"
|
||||
log domain=$book_domain
|
||||
local book_url_path="${book_url##*.com}"
|
||||
book_url_path="${book_url_path%#*}"
|
||||
|
||||
if ! should_scrape_book_chapter "$book_url"; then
|
||||
return
|
||||
fi
|
||||
|
||||
local raw="$(scrape_dndbeyond "$book_url")"
|
||||
|
||||
if echo "$raw" | grep -q 'https:..www.dndbeyond.com.compendium[^"]*'; then
|
||||
if echo "$raw" | grep 'href="https:..www.dndbeyond.com.compendium[^"]*' &> /dev/null; then
|
||||
echo "$raw" | grep -o 'https:..www.dndbeyond.com.compendium[^"]*'
|
||||
elif echo "$raw" | grep -q "$book_url/"; then
|
||||
elif echo "$raw" | grep "href=\"$book_url/" &> /dev/null; then
|
||||
echo "$raw" | grep -o "${book_url//\//.}\/[^\"]*"
|
||||
elif echo "$raw" | grep "href=\"${book_url_path//\//.}\\/" &> /dev/null; then
|
||||
echo "$raw" | grep -o "href=\"${book_url_path//\//.}\/[^\"]*" | sed 's/href=\"//' | sed "s/^/${book_domain//\//\\/}/g"
|
||||
else
|
||||
echo "ERROR: FOUND NO CHAPTERS IN $book_url" >&2
|
||||
return 1
|
||||
|
|
|
|||
Loading…
Reference in New Issue