diff --git a/scrape.sh b/scrape.sh index 77a63da..0975ec0 100644 --- a/scrape.sh +++ b/scrape.sh @@ -238,6 +238,10 @@ html_to_markdown() { local f="$(mktemp)" log url=$1 cat > "$f" + #| perl -pe 's|]*stat-block-ability-scores-data[^>]*>(.*?)|\1|g' \ + #| perl -pe 's|]*stat-block-ability-scores-heading[^>]*>(.*?)|\1|g' \ + #| perl -pe 's|]*stat-block-ability-scores-stat[^>]*>(.*?)|

\1

|g' \ + #| readability "file://$(realpath $f)" \ local clean="$( cat "$f" \ @@ -245,16 +249,13 @@ html_to_markdown() { | tr '\n' '\r' \ | perl -pe 's|]*>(.*?)|\1|g' \ | perl -pe 's|\r([^<]*)

\r|\r
\1
\r|g' \ - | perl -pe 's|]*stat-block-ability-scores-data[^>]*>(.*?)|\1|g' \ - | perl -pe 's|]*stat-block-ability-scores-heading[^>]*>(.*?)|\1|g' \ - | perl -pe 's|]*stat-block-ability-scores-stat[^>]*>(.*?)|

\1

|g' \ - | perl -pe 's|]*stat-block-ability-scores[^>]*>(.*?)|\1|g' \ + | perl -pe 's|

(.*?)

|\2|g' \ + | perl -pe 's|\r*(?:(?!).)+\r*||g' \ | tr '\r' '\n' \ | sed 's/
/ /g' \ | sed 's/colspan="[^"]*"//g' \ - | readability "file://$(realpath $f)" \ | tr '\n' '\r' \ - | perl -pe 's|

([^<]*)

|\2|g' \ + | perl -pe 's|

(.*?)

|\2|g' \ | perl -pe 's|\r*(?:(?!).)+\r*||g' \ | tr '\r' '\n' \ | pandoc \ @@ -290,7 +291,18 @@ html_to_markdown() { | sed 's/::://g' \ | sed 's/&.dquo;/"/g' \ | sed "s/&.squo;/'/g" \ - | tr '\r' '\n' + | sed 's/—/--/g' \ + | sed 's/–/-/g' \ + | sed 's/×/*/g' \ + | sed 's/­//g' \ + | sed 's/{\.[^}]*}//g' \ + | tr '\r' '\n' \ + | sed 's/===+/===|/g' \ + | sed 's/+===/|===/g' \ + | sed 's/---+/---|/g' \ + | sed 's/+---/|---/g' \ + | grep -v '^|-[-|]*-|$' \ + | sed '/^|[=|]*|$/s/=/-/g' \ #| sed 's/:::\(.*\):::/```\1```/g' \