master
bel 2022-07-03 23:54:43 -06:00
parent 0977f17c5e
commit b445441b84
1 changed file with 9 additions and 4 deletions

View File

@@ -55,7 +55,7 @@ ensure() {
return 1 return 1
fi fi
done done
export TMPDIR=/tmp/scrape.${SEED:-${RANDOM:-$(date +%s)}} export TMPDIR="${TMPDIR:-"/tmp/scrape.${SEED:-${RANDOM:-$(date +%s)}}"}"
mkdir -p $TMPDIR mkdir -p $TMPDIR
} }
@@ -102,6 +102,8 @@ scrape_book() {
| sed 's/[ ]*$//' \ | sed 's/[ ]*$//' \
| sed 's/[ ][ ]*/ /g' \ | sed 's/[ ][ ]*/ /g' \
| sed 's/[^a-zA-Z0-9]/_/g' \ | sed 's/[^a-zA-Z0-9]/_/g' \
| sed 's/mdash.*//' \
| sed 's/Disclaimer.*/Disclaimer/g' \
)" )"
if [ -z "$title" ]; then if [ -z "$title" ]; then
title="${chapter%/}" title="${chapter%/}"
@@ -235,6 +237,8 @@ html_to_markdown() {
local f="$(mktemp)" local f="$(mktemp)"
log url=$1 log url=$1
cat > "$f" cat > "$f"
#| perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \
local clean="$( local clean="$(
cat "$f" \ cat "$f" \
| awk '/CONTENT/,/FOOTER/' \ | awk '/CONTENT/,/FOOTER/' \
@@ -243,7 +247,6 @@ html_to_markdown() {
| sed 's/{[^}]*data[^}]*}//g' \ | sed 's/{[^}]*data[^}]*}//g' \
| sed 's/\[](#[^)]*)//' \ | sed 's/\[](#[^)]*)//' \
| cat \ | cat \
| perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \
| sed 's/{[^}]*}//' \ | sed 's/{[^}]*}//' \
| sed 's/ *$//' \ | sed 's/ *$//' \
| tr '\n' '\r' \ | tr '\n' '\r' \
@@ -256,14 +259,16 @@ html_to_markdown() {
echo "$clean" \ echo "$clean" \
| grep '^#' \ | grep '^#' \
| head -n 1 \ | head -n 1 \
| sed 's/^##*/#/' | sed 's/^##*/#/' \
| grep . || echo "# ${1##*/}"
echo echo
echo "$clean" echo "$clean"
) \ ) \
| tr '\n' '\r' \ | tr '\n' '\r' \
| sed 's/\r\r:::\r\r/\r/g' \ | sed 's/\r\r:::\r\r/\r/g' \
| sed 's/\r\r:::\r*$//' \ | sed 's/\r\r:::\r*$//' \
| sed 's/:::/```/g' \ | sed 's/:::\(.*\):::/```\1```/g' \
| sed 's/::://g' \
| sed 's/&.dquo;/"/g' \ | sed 's/&.dquo;/"/g' \
| sed "s/&.squo;/'/g" \ | sed "s/&.squo;/'/g" \
| tr '\r' '\n' | tr '\r' '\n'