use url as possible
parent
c5d03aca6d
commit
08f17f56b6
20
poc.sh
20
poc.sh
|
|
@ -11,6 +11,7 @@ main() {
|
||||||
|
|
||||||
#set -x
|
#set -x
|
||||||
|
|
||||||
|
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/azorius-senate
|
||||||
#scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e
|
#scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e
|
||||||
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists
|
#scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists
|
||||||
|
|
||||||
|
|
@ -82,7 +83,7 @@ scrape_book() {
|
||||||
scrape_book_chapter() {
|
scrape_book_chapter() {
|
||||||
local url="$1"
|
local url="$1"
|
||||||
scrape_dndbeyond $url \
|
scrape_dndbeyond $url \
|
||||||
| html_to_markdown
|
| html_to_markdown $url
|
||||||
}
|
}
|
||||||
|
|
||||||
list_chapters_in_book() {
|
list_chapters_in_book() {
|
||||||
|
|
@ -124,6 +125,14 @@ scrape_dndbeyond() {
|
||||||
}
|
}
|
||||||
|
|
||||||
html_to_markdown() {
|
html_to_markdown() {
|
||||||
|
local url="${1%#*}"
|
||||||
|
url="${url%/}"
|
||||||
|
local url_title_candidate="${url##*/}"
|
||||||
|
url_title_candidate="$(
|
||||||
|
echo "$url_title_candidate" \
|
||||||
|
| tr '[:upper:]' '[:lower:]' \
|
||||||
|
| sed 's/[^a-z]/./g'
|
||||||
|
)"
|
||||||
local out="$(
|
local out="$(
|
||||||
pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
|
pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
|
||||||
| sed '/^:::.*/d' \
|
| sed '/^:::.*/d' \
|
||||||
|
|
@ -138,14 +147,23 @@ html_to_markdown() {
|
||||||
echo Appendix
|
echo Appendix
|
||||||
elif echo "$out" | grep '^##* Chapter' &> /dev/null; then
|
elif echo "$out" | grep '^##* Chapter' &> /dev/null; then
|
||||||
echo Chapter
|
echo Chapter
|
||||||
|
elif echo "$out" | grep '^##* Preface' &> /dev/null; then
|
||||||
|
echo Preface
|
||||||
elif echo "$out" | grep '^##* Introduction' &> /dev/null; then
|
elif echo "$out" | grep '^##* Introduction' &> /dev/null; then
|
||||||
echo Introduction
|
echo Introduction
|
||||||
elif echo "$out" | grep '^##* Credits' &> /dev/null; then
|
elif echo "$out" | grep '^##* Credits' &> /dev/null; then
|
||||||
echo Credits
|
echo Credits
|
||||||
elif echo "$out" | grep '^##* Monsters' &> /dev/null; then
|
elif echo "$out" | grep '^##* Monsters' &> /dev/null; then
|
||||||
echo Monsters
|
echo Monsters
|
||||||
|
elif echo "$out" | grep -i '^##* '"$url_title_candidate" &> /dev/null; then
|
||||||
|
target="$(echo "$out" | grep -i '^##* '"$url_title_candidate" | sed 's/^\#\#*[ ]*//' | head -n 1)"
|
||||||
|
log WARNING: URL title candidate $target
|
||||||
|
echo "$target"
|
||||||
else
|
else
|
||||||
log WARNING: NOTHING INDICITIVE OF TYPE OF CONTENT FOUND
|
log WARNING: NOTHING INDICITIVE OF TYPE OF CONTENT FOUND
|
||||||
|
echo "$out" | grep '^#' | while read -r line; do
|
||||||
|
log candidate="$line"
|
||||||
|
done
|
||||||
echo Chapter
|
echo Chapter
|
||||||
fi
|
fi
|
||||||
)"
|
)"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue