master
bel 2022-07-03 23:54:43 -06:00
parent 0977f17c5e
commit b445441b84
1 changed file with 9 additions and 4 deletions

View File

@@ -55,7 +55,7 @@ ensure() {
return 1
fi
done
export TMPDIR=/tmp/scrape.${SEED:-${RANDOM:-$(date +%s)}}
export TMPDIR="${TMPDIR:-"/tmp/scrape.${SEED:-${RANDOM:-$(date +%s)}}"}"
mkdir -p $TMPDIR
}
@@ -102,6 +102,8 @@ scrape_book() {
| sed 's/[ ]*$//' \
| sed 's/[ ][ ]*/ /g' \
| sed 's/[^a-zA-Z0-9]/_/g' \
| sed 's/mdash.*//' \
| sed 's/Disclaimer.*/Disclaimer/g' \
)"
if [ -z "$title" ]; then
title="${chapter%/}"
@@ -235,6 +237,8 @@ html_to_markdown() {
local f="$(mktemp)"
log url=$1
cat > "$f"
#| perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \
local clean="$(
cat "$f" \
| awk '/CONTENT/,/FOOTER/' \
@@ -243,7 +247,6 @@ html_to_markdown() {
| sed 's/{[^}]*data[^}]*}//g' \
| sed 's/\[](#[^)]*)//' \
| cat \
| perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \
| sed 's/{[^}]*}//' \
| sed 's/ *$//' \
| tr '\n' '\r' \
@@ -256,14 +259,16 @@ html_to_markdown() {
echo "$clean" \
| grep '^#' \
| head -n 1 \
| sed 's/^##*/#/'
| sed 's/^##*/#/' \
| grep . || echo "# ${1##*/}"
echo
echo "$clean"
) \
| tr '\n' '\r' \
| sed 's/\r\r:::\r\r/\r/g' \
| sed 's/\r\r:::\r*$//' \
| sed 's/:::/```/g' \
| sed 's/:::\(.*\):::/```\1```/g' \
| sed 's/::://g' \
| sed 's/&.dquo;/"/g' \
| sed "s/&.squo;/'/g" \
| tr '\r' '\n'