Tables without thead don't render
parent
cfa69a88fc
commit
d6df61c282
26
scrape.sh
26
scrape.sh
|
|
@ -238,6 +238,10 @@ html_to_markdown() {
|
|||
local f="$(mktemp)"
|
||||
log url=$1
|
||||
cat > "$f"
|
||||
#| perl -pe 's|<div[^>]*stat-block-ability-scores-data[^>]*>(.*?)</div>|<span>\1</span>|g' \
|
||||
#| perl -pe 's|<div[^>]*stat-block-ability-scores-heading[^>]*>(.*?)</div>|<strong>\1</strong>|g' \
|
||||
#| perl -pe 's|<div[^>]*stat-block-ability-scores-stat[^>]*>(.*?)</div>|<p>\1</p>|g' \
|
||||
#| readability "file://$(realpath $f)" \
|
||||
|
||||
local clean="$(
|
||||
cat "$f" \
|
||||
|
|
@ -245,16 +249,13 @@ html_to_markdown() {
|
|||
| tr '\n' '\r' \
|
||||
| perl -pe 's|<blockquote[^>]*>(.*?)</blockquote>|\1|g' \
|
||||
| perl -pe 's|\r<p[^\r]*<strong>([^<]*)</strong></p>\r|\r<h5>\1</h5>\r|g' \
|
||||
| perl -pe 's|<div[^>]*stat-block-ability-scores-data[^>]*>(.*?)</div>|<span>\1</span>|g' \
|
||||
| perl -pe 's|<div[^>]*stat-block-ability-scores-heading[^>]*>(.*?)</div>|<strong>\1</strong>|g' \
|
||||
| perl -pe 's|<div[^>]*stat-block-ability-scores-stat[^>]*>(.*?)</div>|<p>\1</p>|g' \
|
||||
| perl -pe 's|<div[^>]*stat-block-ability-scores[^>]*>(.*?)</div>|\1|g' \
|
||||
| perl -pe 's|<t(.)><p>(.*?)</p></t.>|<t\1>\2</t\1>|g' \
|
||||
| perl -pe 's|<thead>\r*<tr>(?:(?!</tr>).)+</tr>\r*<tr>|<thead><tr>|g' \
|
||||
| tr '\r' '\n' \
|
||||
| sed 's/<br>/ /g' \
|
||||
| sed 's/colspan="[^"]*"//g' \
|
||||
| readability "file://$(realpath $f)" \
|
||||
| tr '\n' '\r' \
|
||||
| perl -pe 's|<t(.)><p>([^<]*)</p></t.>|<t\1>\2</t\1>|g' \
|
||||
| perl -pe 's|<t(.)><p>(.*?)</p></t.>|<t\1>\2</t\1>|g' \
|
||||
| perl -pe 's|<thead>\r*<tr>(?:(?!</tr>).)+</tr>\r*<tr>|<thead><tr>|g' \
|
||||
| tr '\r' '\n' \
|
||||
| pandoc \
|
||||
|
|
@ -290,7 +291,18 @@ html_to_markdown() {
|
|||
| sed 's/::://g' \
|
||||
| sed 's/&.dquo;/"/g' \
|
||||
| sed "s/&.squo;/'/g" \
|
||||
| tr '\r' '\n'
|
||||
| sed 's/—/--/g' \
|
||||
| sed 's/–/-/g' \
|
||||
| sed 's/×/*/g' \
|
||||
| sed 's/­//g' \
|
||||
| sed 's/{\.[^}]*}//g' \
|
||||
| tr '\r' '\n' \
|
||||
| sed 's/===+/===|/g' \
|
||||
| sed 's/+===/|===/g' \
|
||||
| sed 's/---+/---|/g' \
|
||||
| sed 's/+---/|---/g' \
|
||||
| grep -v '^|-[-|]*-|$' \
|
||||
| sed '/^|[=|]*|$/s/=/-/g' \
|
||||
|
||||
#| sed 's/:::\(.*\):::/```\1```/g' \
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue