use url as possible title candidate

master
Bel LaPointe 2021-12-15 09:12:39 -07:00
parent c5d03aca6d
commit 08f17f56b6
1 changed file with 19 additions and 1 deletion

20
poc.sh
View File

@ -11,6 +11,7 @@ main() {
#set -x #set -x
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/azorius-senate
#scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e #scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists
@ -82,7 +83,7 @@ scrape_book() {
scrape_book_chapter() { scrape_book_chapter() {
local url="$1" local url="$1"
scrape_dndbeyond $url \ scrape_dndbeyond $url \
| html_to_markdown | html_to_markdown $url
} }
list_chapters_in_book() { list_chapters_in_book() {
@ -124,6 +125,14 @@ scrape_dndbeyond() {
} }
html_to_markdown() { html_to_markdown() {
local url="${1%#*}"
url="${url%/}"
local url_title_candidate="${url##*/}"
url_title_candidate="$(
echo "$url_title_candidate" \
| tr '[:upper:]' '[:lower:]' \
| sed 's/[^a-z]/./g'
)"
local out="$( local out="$(
pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \ pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
| sed '/^:::.*/d' \ | sed '/^:::.*/d' \
@ -138,14 +147,23 @@ html_to_markdown() {
echo Appendix echo Appendix
elif echo "$out" | grep '^##* Chapter' &> /dev/null; then elif echo "$out" | grep '^##* Chapter' &> /dev/null; then
echo Chapter echo Chapter
elif echo "$out" | grep '^##* Preface' &> /dev/null; then
echo Preface
elif echo "$out" | grep '^##* Introduction' &> /dev/null; then elif echo "$out" | grep '^##* Introduction' &> /dev/null; then
echo Introduction echo Introduction
elif echo "$out" | grep '^##* Credits' &> /dev/null; then elif echo "$out" | grep '^##* Credits' &> /dev/null; then
echo Credits echo Credits
elif echo "$out" | grep '^##* Monsters' &> /dev/null; then elif echo "$out" | grep '^##* Monsters' &> /dev/null; then
echo Monsters echo Monsters
elif echo "$out" | grep -i '^##* '"$url_title_candidate" &> /dev/null; then
target="$(echo "$out" | grep -i '^##* '"$url_title_candidate" | sed 's/^\#\#*[ ]*//' | head -n 1)"
log WARNING: URL title candidate $target
echo "$target"
else else
log WARNING: NOTHING INDICITIVE OF TYPE OF CONTENT FOUND log WARNING: NOTHING INDICITIVE OF TYPE OF CONTENT FOUND
echo "$out" | grep '^#' | while read -r line; do
log candidate="$line"
done
echo Chapter echo Chapter
fi fi
)" )"