fix intro vs chap vs appendix
parent
95518fc3a8
commit
d6e7061738
21
poc.sh
21
poc.sh
|
|
@ -4,6 +4,7 @@
|
||||||
main() {
|
main() {
|
||||||
set -e
|
set -e
|
||||||
set -o pipefail
|
set -o pipefail
|
||||||
|
#set -x
|
||||||
|
|
||||||
local output="$(mktemp -d)"
|
local output="$(mktemp -d)"
|
||||||
log "$output"
|
log "$output"
|
||||||
|
|
@ -49,9 +50,14 @@ scrape_book() {
|
||||||
local f="$(mktemp)"
|
local f="$(mktemp)"
|
||||||
|
|
||||||
for chapter in $(list_chapters_in_book "$book_url"); do
|
for chapter in $(list_chapters_in_book "$book_url"); do
|
||||||
log chapter=$chapter
|
log scraping book chapter $chapter of $book_url
|
||||||
scrape_book_chapter "$chapter" > "$f"
|
scrape_book_chapter "$chapter" > "$f"
|
||||||
local title="$(grep -E 'Chapter|Appendix' "$f" | head -n 1 | sed 's/^\#*//' | sed 's/^[ ]*//' | sed 's/[ ]*$//')"
|
log scraping book title from $f
|
||||||
|
local title="$(grep '^\#' "$f" | head -n 1 | sed 's/^\#*//' | sed 's/^[ ]*//' | sed 's/[ ]*$//')"
|
||||||
|
if [ -z "$title" ]; then
|
||||||
|
log "WARNING: NO TITLE FOUND IN $f ($chapter): $(head -n 3 $f)"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
mv "$f" "$output"/"$title"
|
mv "$f" "$output"/"$title"
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|
@ -67,6 +73,7 @@ scrape_book_chapter() {
|
||||||
list_chapters_in_book() {
|
list_chapters_in_book() {
|
||||||
local book_url="${1%/}"
|
local book_url="${1%/}"
|
||||||
local raw="$(scrape_dndbeyond "$book_url")"
|
local raw="$(scrape_dndbeyond "$book_url")"
|
||||||
|
|
||||||
if echo "$raw" | grep -q 'https:..www.dndbeyond.com.compendium[^"]*'; then
|
if echo "$raw" | grep -q 'https:..www.dndbeyond.com.compendium[^"]*'; then
|
||||||
echo "$raw" | grep -o 'https:..www.dndbeyond.com.compendium[^"]*'
|
echo "$raw" | grep -o 'https:..www.dndbeyond.com.compendium[^"]*'
|
||||||
elif echo "$raw" | grep -q "$book_url/"; then
|
elif echo "$raw" | grep -q "$book_url/"; then
|
||||||
|
|
@ -111,7 +118,12 @@ html_to_markdown() {
|
||||||
| sed 's/\[\([^]]*\)]\(([^)]*)\)*\({[^}]*}\)*/\1/g' \
|
| sed 's/\[\([^]]*\)]\(([^)]*)\)*\({[^}]*}\)*/\1/g' \
|
||||||
| sed "s/\\\\\([\"']\)/\1/g"
|
| sed "s/\\\\\([\"']\)/\1/g"
|
||||||
)"
|
)"
|
||||||
local target="$(echo "$out" | grep -q '^[#]* Appendix' && echo Appendix || echo Chapter)"
|
local target="$(
|
||||||
|
(echo "$out" | grep -q '^##* Appendix' && echo Appendix) \
|
||||||
|
|| (echo "$out" | grep -q '^##* Chapter' && echo Chapter) \
|
||||||
|
|| (echo "$out" | grep -q '^##* Introduction' && echo Introduction) \
|
||||||
|
|| echo Chapter
|
||||||
|
)"
|
||||||
echo "$out" \
|
echo "$out" \
|
||||||
| sed -e '/^# '"$target"'/p' -e '0,/^# '"$target"'/d'
|
| sed -e '/^# '"$target"'/p' -e '0,/^# '"$target"'/d'
|
||||||
}
|
}
|
||||||
|
|
@ -122,14 +134,13 @@ rate_limit_1s() {
|
||||||
mkdir -p "$d"
|
mkdir -p "$d"
|
||||||
local last_run="$(date -r "$d/$name" +%s)"
|
local last_run="$(date -r "$d/$name" +%s)"
|
||||||
local now="$(date +%s)"
|
local now="$(date +%s)"
|
||||||
local interval=3
|
local interval=6
|
||||||
log "should sleep while $(($(date +%s)-last_run)) < $interval"
|
log "should sleep while $(($(date +%s)-last_run)) < $interval"
|
||||||
if [ -f "$d/$name" ]; then
|
if [ -f "$d/$name" ]; then
|
||||||
while [ "$(($(date +%s)-last_run))" -lt $interval ]; do
|
while [ "$(($(date +%s)-last_run))" -lt $interval ]; do
|
||||||
sleep 1
|
sleep 1
|
||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
log "done sleeping"
|
|
||||||
touch "$d/$name"
|
touch "$d/$name"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue