white space

master
Bel LaPointe 2022-02-10 08:08:35 -07:00
parent 829081ebed
commit f190bdecca
1 changed file with 65 additions and 65 deletions

#! /bin/bash
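# Crawl every note: for each note id, expand any crawlable source it points at
# via a backend (currently only gitlab) and push the results back as notes.
# The rewrite pass is a placeholder and not implemented yet.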
main() {
    config
    for id in $(ids); do
        crawl "$id"
    done
    for id in $(ids); do
        rewrite "$id"
    done
}
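# config: enable strict pipe/error handling, set up the cache dir and service
# endpoints, and source the gitlab/cache/notes helper scripts.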
config() {
    set -o pipefail
    set -e
    export CACHE="${CACHE:-"$(mktemp -d)"}"
    mkdir -p "$CACHE"
    export CACHE_DURATION=$((60*50))
    export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
    export GITLAB_PAT="$GITLAB_PAT"
    source ./gitlab.sh
    source ./cache.sh
    source ./notes.sh
}
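# log: write a single timestamped line to stderr; embedded newlines are
# flattened to spaces.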
log() {
echo "$(echo "$(date +%H:%M:%S)> $*" | tr '\n' ' ')" >&2
echo "$(echo "$(date +%H:%M:%S)> $*" | tr '\n' ' ')" >&2
}
ids() {
    notes ids
}
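# crawl: cache wrapper around _crawl; the cache lookup is disabled (see TODO),
# but results are still written through `cache put`.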
crawl() {
    local cache_key="crawled $*"
    # TODO: cache lookup is disabled for now via the `false &&` guard
    if false && cache get "$cache_key"; then
        return
    fi
    _crawl "$@" | cache put "$cache_key"
}
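# _crawl: fetch the note, skip it unless its first line ends in a crawlable
# URL, then hand it off to the first backend that claims the source.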
_crawl() {
log "crawling $*"
local id="$1"
local content="$(notes get "$id")"
log "crawling $*"
local id="$1"
local content="$(notes get "$id")"
local json="$(
printf '{"content": %s, "id": "%s"}' \
"$(echo "$content" | jq -Rs)" \
"$id"
)"
if ! is_crawlable "$content"; then
log "not crawlable: '${content:0:20}'..."
return 0
fi
local crawlable_source="$(extract_crawlable_source "$content")"
for backend in gitlab; do
if $backend is "$crawlable_source"; then
crawl_with $backend "$json"
return $?
fi
done
log "unknown backend for $crawlable_source"
return 1
if ! is_crawlable "$content"; then
log "not crawlable: '${content:0:20}'..."
return 0
fi
local crawlable_source="$(extract_crawlable_source "$content")"
for backend in gitlab; do
if $backend is "$crawlable_source"; then
crawl_with $backend "$json"
return $?
fi
done
log "unknown backend for $crawlable_source"
return 1
}
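# extract_crawlable_source: last whitespace-separated field of the first line,
# with surrounding angle brackets stripped.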
extract_crawlable_source() {
echo "$*" | head -n 1 | awk '{print $NF}' | sed 's/^<//' | sed 's/>$//'
echo "$*" | head -n 1 | awk '{print $NF}' | sed 's/^<//' | sed 's/>$//'
}
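# crawl_with: ask the backend to expand the source into a list of
# base64-encoded titles, fetch each one, derive a stable child id from the
# source and title, and push the result under the parent note's id.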
crawl_with() {
    local backend="$1"
    local json="$2"
    local content="$(echo "$json" | jq -r .content)"
    local crawlable_source="$(extract_crawlable_source "$content")"
    local expanded=($($backend expand "$crawlable_source"))
    log expand $crawlable_source:
    one() {
        local i="$1"
        export TITLE="$(
            echo "$i" | base64 --decode
        )"
        export CONTENT="$(
            $backend get "$crawlable_source" "$i"
        )"
        export ID="$(
            echo "$crawlable_source/$TITLE" | base64 | md5sum | awk '{print $1}'
        )"
        export PID="$(
            echo "$json" | jq -r .id
        )"
        log " $PID/$ID ($TITLE): ${#CONTENT}"
        push_crawled "$PID/$ID" "$TITLE" "$CONTENT"
    }
    # entries start at index 1; index 0 of the expanded list is skipped
    for i in $(seq 1 $(("${#expanded[@]}"-1))); do
        one "${expanded[i]}"
    done
}
push_crawled() {
    notes put "$@"
}
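# is_crawlable: true when the extracted source is a bare http(s) URL.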
is_crawlable() {
    local crawlable_source="$(extract_crawlable_source "$*")"
    # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
    local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
    echo "$crawlable_source" | grep -q -E "^[ ]*$url_pattern[ ]*$"
}
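# rewrite: anchor, relative-link, and image rewriting is not implemented yet.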
rewrite() {
    log not impl: rewrite "#abc-def" to "#h-abc-def"
    log not impl: rewrite "./asdf" to "./zyxw" or "absolute.com/asdf"
    log not impl rewrite, change images
    return 1
}
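# Run main only when the script is executed directly, not when sourced.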
if [ "$0" == "$BASH_SOURCE" ]; then
main "$@"
main "$@"
fi