begin app

master
Bel LaPointe 2022-02-01 07:48:37 -07:00
parent c31505a65e
commit 56b5984fda
2 changed files with 105 additions and 0 deletions

app/crawler/main.sh Normal file

@@ -0,0 +1,104 @@
#! /bin/bash
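# Notea bookmark crawler: walks every note id on the Notea server, crawls
# notes that look like bookmarks at most once per CRAWL_INTERVAL, then
# rewrites them. crawl and rewrite are still stubs at this point.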
main() {
config
for id in $(ids); do
if should_crawl "$id"; then
crawl "$id"
fi
rewrite "$id"
done
}
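# Shell strict-mode options plus the environment the rest of the script reads.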
config() {
set -o pipefail
set -e
CACHE=$(mktemp -d) # assigned separately so set -e sees mktemp's exit status
export CACHE
export CRAWL_INTERVAL=$((60 * 5)) # seconds between crawls of the same note
export NOTEA_ADDR="${NOTEA_ADDR:-"http://localhost:3000"}"
}
log() {
echo "$(date)> $*" >&2
}
ids() {
notea ids
}
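# should_crawl: succeed when the note was never crawled, or when its marker
# file under $CACHE is at least CRAWL_INTERVAL seconds old.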
should_crawl() {
local f="$CACHE/crawled_$1"
if [ ! -f "$f" ]; then
return 0
fi
local last_crawled now
last_crawled=$(date -r "$f" +%s) # GNU date: the marker file's mtime
now=$(date +%s)
if ((now - last_crawled >= CRAWL_INTERVAL)); then
return 0 # marker is stale: crawl again
fi
return 1 # crawled recently: skip
}
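# crawl: fetch the note body and return early when it does not look like a
# bookmark; the crawl itself is not implemented yet.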
crawl() {
local id="$1"
local json content
json=$(notea get "$id") # assigned separately so set -e catches a failed fetch
content=$(echo "$json" | jq -r .content)
if ! is_crawlable "$content"; then
return 0
fi
log not impl crawl
return 1
}
is_crawlable() {
# A note is crawlable when its entire content is a bare URL, or when its
# first line reads "_source_: <url>". URL pattern adapted from:
# https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
if echo "$*" | tr -d '\n' | grep -qE "^[ ]*$url_pattern[ ]*$"; then
return 0
fi
if echo "$*" | head -n 1 | grep -qE "^[ ]*_source_: $url_pattern[ ]*$"; then
return 0
fi
return 1
}
rewrite() {
log not impl rewrite
return 1
}
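# notea: minimal client for the Notea HTTP API. The body runs in a subshell
# so the helper functions stay out of the global namespace; the trailing
# "$@" dispatches subcommands such as "notea ids" and "notea get <id>".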
notea() (
ncurl() {
curl -sS "$@"
}
ids() {
ncurl "$NOTEA_ADDR/api/tree" \
| jq -r '.items | to_entries[].value.id' \
| grep -v '^root$'
}
get() {
local cached="$CACHE/cache_$1"
# Serve the cached copy when a previous fetch left a non-empty response;
# grep both tests for content and prints it.
if [ -f "$cached" ] && grep . "$cached"; then
return 0
fi
_get "$@" | tee "$cached"
}
_get() {
ncurl "$NOTEA_ADDR/api/notes/$1"
}
"$@"
)
crawler() (
should() {
: # stub; a bash function body may not be empty
}
"$@"
)
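# Run main only when the script is executed directly, not when sourced.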
if [ "$0" == "$BASH_SOURCE" ]; then
main "$@"
fi

app/ui/run.sh Symbolic link

@@ -0,0 +1 @@
../../spike/review/run.sh
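
For reference, a minimal way to exercise this script against a Notea server, assuming one is already listening on the default address (the invocations below are illustrative, not part of the commit):

    bash app/crawler/main.sh
    # or poke at the API client directly by sourcing the functions first:
    source app/crawler/main.sh && config && notea ids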