From cfa69a88fc7051a45cf1d8b88f0c0b50c1fe1c4a Mon Sep 17 00:00:00 2001 From: bel Date: Mon, 4 Jul 2022 10:25:18 -0600 Subject: [PATCH] cleanup table displays --- scrape.sh | 45 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/scrape.sh b/scrape.sh index 611300b..77a63da 100644 --- a/scrape.sh +++ b/scrape.sh @@ -3,7 +3,7 @@ main() { ensure - local output="${1:-"$TMPDIR/result"}" + local output="$TMPDIR/result" mkdir -p "$output" log "$output" @@ -71,7 +71,8 @@ scrape_books() { | sed 's/^href="//' \ | sed 's/^\///' \ | sed 's/^/\//' \ - | sed 's/^/https:\/\/www.dndbeyond.com/' + | sed 's/^/https:\/\/www.dndbeyond.com/' \ + | grep "${SCRAPE_BOOKS_PATTERN:-.*}" } scrape_book() { @@ -237,29 +238,48 @@ html_to_markdown() { local f="$(mktemp)" log url=$1 cat > "$f" - #| perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \ local clean="$( cat "$f" \ | awk '/CONTENT/,/FOOTER/' \ + | tr '\n' '\r' \ + | perl -pe 's|]*>(.*?)|\1|g' \ + | perl -pe 's|\r([^<]*)

\r|\r
\1
\r|g' \ + | perl -pe 's|]*stat-block-ability-scores-data[^>]*>(.*?)|\1|g' \ + | perl -pe 's|]*stat-block-ability-scores-heading[^>]*>(.*?)|\1|g' \ + | perl -pe 's|]*stat-block-ability-scores-stat[^>]*>(.*?)|

\1

|g' \ + | perl -pe 's|]*stat-block-ability-scores[^>]*>(.*?)|\1|g' \ + | tr '\r' '\n' \ + | sed 's/
/ /g' \ + | sed 's/colspan="[^"]*"//g' \ | readability "file://$(realpath $f)" \ - | pandoc -f html -t markdown --ascii --toc --wrap=none --strip-comments -o - \ + | tr '\n' '\r' \ + | perl -pe 's|

([^<]*)

|\2|g' \ + | perl -pe 's|\r*(?:(?!).)+\r*||g' \ + | tr '\r' '\n' \ + | pandoc \ + -f html \ + -t markdown+pipe_tables-simple_tables-multiline_tables \ + --atx-headers \ + --ascii \ + --toc \ + --wrap=none \ + --strip-comments \ + -o - \ | sed 's/{[^}]*data[^}]*}//g' \ | sed 's/\[](#[^)]*)//' \ | cat \ | sed 's/{[^}]*}//' \ | sed 's/ *$//' \ | tr '\n' '\r' \ - | sed 's/\r\r *\([A-Za-z0-9][^\r]*\)\r--*\r\r/\r\r## \1\r\r/g' \ | sed 's/
\(.*\)<\/div>/\1/g' \ | tr '\r' '\n' \ )" - rm "$f" ( - echo "$clean" \ - | grep '^#' \ - | head -n 1 \ - | sed 's/^##*/#/' \ + grep -o ']*>//' \ + | sed 's/-.*//' \ + | sed 's/^/# /' \ | grep . || echo "# ${1##*/}" echo echo "$clean" @@ -267,11 +287,14 @@ html_to_markdown() { | tr '\n' '\r' \ | sed 's/\r\r:::\r\r/\r/g' \ | sed 's/\r\r:::\r*$//' \ - | sed 's/:::\(.*\):::/```\1```/g' \ | sed 's/::://g' \ | sed 's/&.dquo;/"/g' \ | sed "s/&.squo;/'/g" \ | tr '\r' '\n' + + #| sed 's/:::\(.*\):::/```\1```/g' \ + + rm "$f" return $? }