From 08f17f56b67f4ed02ff94c9ca3093c1158e6a01d Mon Sep 17 00:00:00 2001
From: Bel LaPointe
Date: Wed, 15 Dec 2021 09:12:39 -0700
Subject: [PATCH] use url as possible

scrape_book_chapter now passes the chapter URL to html_to_markdown.
html_to_markdown turns the last path segment of that URL into a
case-insensitive heading pattern and, when none of the fixed keywords
(Appendix, Chapter, Preface, Introduction, Credits, Monsters) match,
uses the first heading matching that pattern as the title.

The URL fallback is skipped when no pattern could be derived (no or
empty URL argument); otherwise the empty pattern would match any
heading. When nothing matches at all, every heading line is logged as
a candidate before defaulting to "Chapter".

---
 poc.sh | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/poc.sh b/poc.sh
index d175015..6c244f9 100644
--- a/poc.sh
+++ b/poc.sh
@@ -11,6 +11,7 @@ main() {
 
     #set -x
 
+    #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/ggtr/azorius-senate
     #scrape_book_chapter https://www.dndbeyond.com/sources/mm/monsters-e
     #scrape_book_chapter https://www.dndbeyond.com/compendium/rules/dmg/appendix-b-monster-lists
 
@@ -82,7 +83,7 @@ scrape_book() {
 scrape_book_chapter() {
     local url="$1"
     scrape_dndbeyond $url \
-        | html_to_markdown
+        | html_to_markdown "$url"
 }
 
 list_chapters_in_book() {
@@ -124,6 +125,18 @@ scrape_dndbeyond() {
 }
 
 html_to_markdown() {
+    # Build a heading pattern from the last path segment of the URL in $1:
+    # drop any #fragment and trailing slash, lowercase it, and replace every
+    # non-letter with '.' so the regex accepts any character in its place.
+    local url="${1:-}"
+    url="${url%#*}"
+    url="${url%/}"
+    local url_title_candidate="${url##*/}"
+    url_title_candidate="$(
+        echo "$url_title_candidate" \
+        | tr '[:upper:]' '[:lower:]' \
+        | sed 's/[^a-z]/./g'
+    )"
     local out="$(
         pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
         | sed '/^:::.*/d' \
@@ -138,14 +151,25 @@ html_to_markdown() {
             echo Appendix
         elif echo "$out" | grep '^##* Chapter' &> /dev/null; then
             echo Chapter
+        elif echo "$out" | grep '^##* Preface' &> /dev/null; then
+            echo Preface
         elif echo "$out" | grep '^##* Introduction' &> /dev/null; then
             echo Introduction
         elif echo "$out" | grep '^##* Credits' &> /dev/null; then
             echo Credits
         elif echo "$out" | grep '^##* Monsters' &> /dev/null; then
             echo Monsters
+        # Fall back to the first heading matching the URL-derived pattern.
+        # Skipped when the pattern is empty, which would match any heading.
+        elif [ -n "$url_title_candidate" ] && echo "$out" | grep -i '^##* '"$url_title_candidate" &> /dev/null; then
+            target="$(echo "$out" | grep -i '^##* '"$url_title_candidate" | sed 's/^\#\#*[ ]*//' | head -n 1)"
+            log WARNING: URL title candidate "$target"
+            echo "$target"
         else
             log WARNING: NOTHING INDICITIVE OF TYPE OF CONTENT FOUND
+            echo "$out" | grep '^#' | while read -r line; do
+                log candidate="$line"
+            done
             echo Chapter
         fi
     )"