white space

master
Bel LaPointe 2022-02-10 08:08:35 -07:00
parent 829081ebed
commit f190bdecca
1 changed file with 65 additions and 65 deletions

@@ -1,122 +1,122 @@
#! /bin/bash
main() {
    config
    for id in $(ids); do
        crawl "$id"
    done
    for id in $(ids); do
        rewrite "$id"
    done
}
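
# config: process-wide setup. CACHE, NOTES_ADDR, and GITLAB_PAT may be supplied
# by the caller; CACHE defaults to a fresh temp dir and NOTES_ADDR to a local
# notes service. gitlab.sh, cache.sh, and notes.sh are assumed to define the
# gitlab, cache, and notes helpers used below.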
config() {
    set -o pipefail
    set -e
    export CACHE="${CACHE:-"$(mktemp -d)"}"
    mkdir -p "$CACHE"
    export CACHE_DURATION=$((60*50))
    export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
    export GITLAB_PAT="$GITLAB_PAT"
    source ./gitlab.sh
    source ./cache.sh
    source ./notes.sh
}
log() {
    echo "$(echo "$(date +%H:%M:%S)> $*" | tr '\n' ' ')" >&2
}
ids() {
    notes ids
}
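
# crawl: caching wrapper around _crawl. The cache lookup is currently forced
# off by the `false &&` short-circuit (see TODO), so every run re-crawls and
# re-populates the cache entry.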
crawl() {
    local cache_key="crawled $*"
    # TODO
    if false && cache get "$cache_key"; then
        return
    fi
    _crawl "$@" | cache put "$cache_key"
}
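
# _crawl: fetch a note, skip it unless it is crawlable, and hand a small JSON
# envelope ({"content": ..., "id": ...}) to the first backend that recognizes
# its source.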
_crawl() {
    log "crawling $*"
    local id="$1"
    local content="$(notes get "$id")"
    local json="$(
        printf '{"content": %s, "id": "%s"}' \
            "$(echo "$content" | jq -Rs)" \
            "$id"
    )"
    if ! is_crawlable "$content"; then
        log "not crawlable: '${content:0:20}'..."
        return 0
    fi
    local crawlable_source="$(extract_crawlable_source "$content")"
    for backend in gitlab; do
        if $backend is "$crawlable_source"; then
            crawl_with $backend "$json"
            return $?
        fi
    done
    log "unknown backend for $crawlable_source"
    return 1
}
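
# extract_crawlable_source: last whitespace-separated token of the note's
# first line, with surrounding angle brackets stripped. For example, a first
# line of "repo <https://gitlab.example.com/group/project>" (an illustrative
# URL) yields "https://gitlab.example.com/group/project".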
extract_crawlable_source() {
    echo "$*" | head -n 1 | awk '{print $NF}' | sed 's/^<//' | sed 's/>$//'
}
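
# crawl_with: expand the source via the chosen backend, then push each
# expanded entry back to the notes service under the parent note's id.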
crawl_with() {
    local backend="$1"
    local json="$2"
    local content="$(echo "$json" | jq -r .content)"
    local crawlable_source="$(extract_crawlable_source "$content")"
    local expanded=($($backend expand "$crawlable_source"))
    log expand $crawlable_source:
    one() {
        # handle a single expanded entry: a base64-encoded title
        local i="$1"
        export TITLE="$(
            echo "$i" | base64 --decode
        )"
        export CONTENT="$(
            $backend get "$crawlable_source" "$i"
        )"
        export ID="$(
            echo "$crawlable_source/$TITLE" | base64 | md5sum | awk '{print $1}'
        )"
        export PID="$(
            echo "$json" | jq -r .id
        )"
        log " $PID/$ID ($TITLE): ${#CONTENT}"
        push_crawled "$PID/$ID" "$TITLE" "$CONTENT"
    }
    # entries from index 1 onward; element 0 of expanded is skipped
    for i in $(seq 1 $(("${#expanded[@]}"-1))); do
        one "${expanded[i]}"
    done
}
push_crawled() {
    notes put "$@"
}
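
# is_crawlable: a note is considered crawlable when the last token of its
# first line is a bare http(s) URL (optionally wrapped in angle brackets,
# which extract_crawlable_source strips).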
is_crawlable() {
    local crawlable_source="$(extract_crawlable_source "$*")"
    # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
    local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
    echo "$crawlable_source" | grep -q -E "^[ ]*$url_pattern[ ]*$"
}
rewrite() {
    log not impl: rewrite "#abc-def" to "#h-abc-def"
    log not impl: rewrite "./asdf" to "./zyxw" or "absolute.com/asdf"
    log not impl rewrite, change images
    return 1
}
if [ "$0" == "$BASH_SOURCE" ]; then if [ "$0" == "$BASH_SOURCE" ]; then
main "$@" main "$@"
fi fi
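
# Example invocation (a sketch; the script name, token value, and notes
# address below are placeholders, not values from this repo):
#   GITLAB_PAT=glpat-xxxxxxxx NOTES_ADDR=http://localhost:3004 ./crawl.sh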