diff --git a/scrape.sh b/scrape.sh
index 611300b..77a63da 100644
--- a/scrape.sh
+++ b/scrape.sh
@@ -3,7 +3,7 @@
main() {
ensure
- local output="${1:-"$TMPDIR/result"}"
+ local output="$TMPDIR/result"
mkdir -p "$output"
log "$output"
@@ -71,7 +71,8 @@ scrape_books() {
| sed 's/^href="//' \
| sed 's/^\///' \
| sed 's/^/\//' \
- | sed 's/^/https:\/\/www.dndbeyond.com/'
+ | sed 's/^/https:\/\/www.dndbeyond.com/' \
+ | grep "${SCRAPE_BOOKS_PATTERN:-.*}"
}
scrape_book() {
@@ -237,29 +238,48 @@ html_to_markdown() {
local f="$(mktemp)"
log url=$1
cat > "$f"
- #| perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \
local clean="$(
cat "$f" \
| awk '/CONTENT/,/FOOTER/' \
+ | tr '\n' '\r' \
+ | perl -pe 's|
]*>(.*?)
|\1|g' \
+ | perl -pe 's|\r([^<]*)
\r|\r\1
\r|g' \
+ | perl -pe 's|]*stat-block-ability-scores-data[^>]*>(.*?)
|\1|g' \
+ | perl -pe 's|]*stat-block-ability-scores-heading[^>]*>(.*?)
|\1|g' \
+ | perl -pe 's|]*stat-block-ability-scores-stat[^>]*>(.*?)
|\1
|g' \
+ | perl -pe 's|]*stat-block-ability-scores[^>]*>(.*?)
|\1|g' \
+ | tr '\r' '\n' \
+ | sed 's/
/ /g' \
+ | sed 's/colspan="[^"]*"//g' \
| readability "file://$(realpath $f)" \
- | pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \
+ | tr '\n' '\r' \
+ | perl -pe 's|([^<]*)
|\2|g' \
+ | perl -pe 's|\r*(?:(?!
).)+\r*||g' \
+ | tr '\r' '\n' \
+ | pandoc \
+ -f html \
+ -t markdown+pipe_tables-simple_tables-multiline_tables \
+ --atx-headers \
+ --ascii \
+ --toc \
+ --wrap=none \
+ --strip-comments \
+ -o - \
| sed 's/{[^}]*data[^}]*}//g' \
| sed 's/\[](#[^)]*)//' \
| cat \
| sed 's/{[^}]*}//' \
| sed 's/ *$//' \
| tr '\n' '\r' \
- | sed 's/\r\r *\([A-Za-z0-9][^\r]*\)\r--*\r\r/\r\r## \1\r\r/g' \
| sed 's/\(.*\)<\/div>/\1/g' \
| tr '\r' '\n' \
)"
- rm "$f"
(
- echo "$clean" \
- | grep '^#' \
- | head -n 1 \
- | sed 's/^##*/#/' \
+ grep -o '
]*>//' \
+ | sed 's/-.*//' \
+ | sed 's/^/# /' \
| grep . || echo "# ${1##*/}"
echo
echo "$clean"
@@ -267,11 +287,14 @@ html_to_markdown() {
| tr '\n' '\r' \
| sed 's/\r\r:::\r\r/\r/g' \
| sed 's/\r\r:::\r*$//' \
- | sed 's/:::\(.*\):::/```\1```/g' \
| sed 's/::://g' \
| sed 's/&.dquo;/"/g' \
| sed "s/&.squo;/'/g" \
| tr '\r' '\n'
+
+ #| sed 's/:::\(.*\):::/```\1```/g' \
+
+ rm "$f"
return $?
}