From b445441b84204f8918698faa8e4728b7c3514211 Mon Sep 17 00:00:00 2001 From: bel Date: Sun, 3 Jul 2022 23:54:43 -0600 Subject: [PATCH] fix --- scrape.sh | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scrape.sh b/scrape.sh index 8285afb..611300b 100644 --- a/scrape.sh +++ b/scrape.sh @@ -55,7 +55,7 @@ ensure() { return 1 fi done - export TMPDIR=/tmp/scrape.${SEED:-${RANDOM:-$(date +%s)}} + export TMPDIR="${TMPDIR:-"/tmp/scrape.${SEED:-${RANDOM:-$(date +%s)}}"}" mkdir -p $TMPDIR } @@ -102,6 +102,8 @@ scrape_book() { | sed 's/[ ]*$//' \ | sed 's/[ ][ ]*/ /g' \ | sed 's/[^a-zA-Z0-9]/_/g' \ + | sed 's/mdash.*//' \ + | sed 's/Disclaimer.*/Disclaimer/g' \ )" if [ -z "$title" ]; then title="${chapter%/}" @@ -235,6 +237,8 @@ html_to_markdown() { local f="$(mktemp)" log url=$1 cat > "$f" + #| perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \ + local clean="$( cat "$f" \ | awk '/CONTENT/,/FOOTER/' \ @@ -243,7 +247,6 @@ html_to_markdown() { | sed 's/{[^}]*data[^}]*}//g' \ | sed 's/\[](#[^)]*)//' \ | cat \ - | perl -pe 's|^\*\**(.*?)\**\*$|# \1|g' \ | sed 's/{[^}]*}//' \ | sed 's/ *$//' \ | tr '\n' '\r' \ @@ -256,14 +259,16 @@ html_to_markdown() { echo "$clean" \ | grep '^#' \ | head -n 1 \ - | sed 's/^##*/#/' + | sed 's/^##*/#/' \ + | grep . || echo "# ${1##*/}" echo echo "$clean" ) \ | tr '\n' '\r' \ | sed 's/\r\r:::\r\r/\r/g' \ | sed 's/\r\r:::\r*$//' \ - | sed 's/:::/```/g' \ + | sed 's/:::\(.*\):::/```\1```/g' \ + | sed 's/::://g' \ | sed 's/&.dquo;/"/g' \ | sed "s/&.squo;/'/g" \ | tr '\r' '\n'