#! /bin/bash
#
# app/crawler/main.sh
#
# Walks every note id reported by a Notea server and, for each one, runs the
# crawl step (when it is due) followed by the rewrite step. Both steps are
# still stubs: they log "not impl ..." and return 1.
#
# Environment:
#   NOTEA_ADDR      base URL of the Notea server (default http://localhost:3000)
#   CACHE           directory holding cached note bodies (cache_<id>) and crawl
#                   markers (crawled_<id>); defaults to a fresh `mktemp -d`
#                   directory that is removed when the script exits
#   CRAWL_INTERVAL  minimum number of seconds between two crawls of the same
#                   note (default 300)
#
# Requires: curl, jq.
#
# Provenance: recovered from git patch 56b5984fda455b5e2cf50c93aa199f7e507a9f33
#   From: Bel LaPointe
#   Date: Tue, 1 Feb 2022 07:48:37 -0700
#   Subject: [PATCH] begin app
# The same patch also creates app/ui/run.sh as a symlink (mode 120000) to
# ../../spike/review/run.sh.

#######################################
# Entry point: configure the shell, then process every note id.
# `set -e` is active after config, so the first failing crawl or rewrite
# aborts the run. A failing `ids` produces no lines, so the loop is skipped.
#######################################
main() {
  config
  local id
  # Ids arrive on fd 3 so nothing inside the loop can eat them from stdin,
  # and reading line by line avoids word-splitting / globbing of the list.
  while IFS= read -r id <&3; do
    [ -n "$id" ] || continue
    if should_crawl "$id"; then
      crawl "$id"
    fi
    rewrite "$id"
  done 3< <(ids)
}

#######################################
# Enable strict mode and export the settings used by the other functions.
# Globals: CACHE, CRAWL_INTERVAL, NOTEA_ADDR (written, exported)
#######################################
config() {
  set -o pipefail
  set -e
  if [ -z "${CACHE:-}" ]; then
    # Assignment is kept separate from `export` so a mktemp failure is fatal.
    CACHE="$(mktemp -d)"
    # The path is expanded (and shell-quoted) now, so cleanup still targets
    # this directory even if CACHE is reassigned later.
    # shellcheck disable=SC2064
    trap "rm -rf -- $(printf '%q' "$CACHE")" EXIT
  else
    mkdir -p -- "$CACHE"
  fi
  export CACHE
  export CRAWL_INTERVAL="${CRAWL_INTERVAL:-$((60 * 5))}"
  export NOTEA_ADDR="${NOTEA_ADDR:-"http://localhost:3000"}"
}

#######################################
# Write a timestamped message to stderr.
# Arguments: words of the message
#######################################
log() {
  printf '%s> %s\n' "$(date)" "$*" >&2
}

#######################################
# Print every note id except "root", one per line.
#######################################
ids() {
  notea ids
}

#######################################
# Decide whether note $1 is due for a crawl.
# Globals:   CACHE, CRAWL_INTERVAL (read)
# Arguments: $1 - note id
# Returns:   0 when no marker file $CACHE/crawled_<id> exists or the marker's
#            modification time is at least CRAWL_INTERVAL seconds old;
#            1 when the note was crawled more recently than that.
#######################################
should_crawl() {
  local marker="$CACHE/crawled_$1"
  if [ ! -f "$marker" ]; then
    return 0
  fi
  local last_crawled now
  # If the timestamp cannot be read, fall back to crawling again.
  last_crawled="$(date -r "$marker" +%s)" || return 0
  now="$(date +%s)"
  if ((now - last_crawled < CRAWL_INTERVAL)); then
    return 1
  fi
  return 0
}

#######################################
# Crawl note $1 if its content qualifies (see is_crawlable).
# Arguments: $1 - note id
# Returns:   0 when the note is not crawlable; non-zero when fetching or
#            parsing the note fails; 1 for crawlable notes, because the
#            actual crawl is not implemented yet.
#######################################
crawl() {
  local id="$1"
  local json content
  json="$(notea get "$id")" || return
  content="$(jq -r .content <<<"$json")" || return
  if ! is_crawlable "$content"; then
    return 0
  fi
  log not impl crawl
  return 1
}

#######################################
# Test whether note content points at a URL to crawl.
# Arguments: the note content (all arguments are joined with spaces)
# Outputs:   nothing
# Returns:   0 when the whole content, with newlines removed, is a single
#            http(s) URL optionally padded with spaces, or when the first
#            line is "_source_: <url>" (same padding rule); 1 otherwise.
#######################################
is_crawlable() {
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
  local text="$*"
  local flattened="${text//$'\n'/}"
  local first_line="${text%%$'\n'*}"
  # Regexes live in variables so the literal spaces survive inside [[ =~ ]].
  local bare_url_re="^[ ]*${url_pattern}[ ]*\$"
  local source_line_re="^[ ]*_source_: ${url_pattern}[ ]*\$"
  if [[ "$flattened" =~ $bare_url_re ]]; then
    return 0
  fi
  if [[ "$first_line" =~ $source_line_re ]]; then
    return 0
  fi
  return 1
}

#######################################
# Rewrite note $1. Not implemented yet: logs and fails.
#######################################
rewrite() {
  log not impl rewrite
  return 1
}

#######################################
# Notea API client. Runs in a subshell so its helper functions stay private.
# Usage:   notea ids | notea get <id>
# Globals: NOTEA_ADDR, CACHE (read)
#######################################
notea() (
  # curl wrapper: quiet, but still report errors; -f turns HTTP error
  # statuses into a non-zero exit so error pages are never treated as data.
  ncurl() {
    curl -sS -f "$@"
  }

  # Print the id of every item in the tree except "root".
  # The filtering happens in jq so an empty result is not an error.
  ids() {
    ncurl "$NOTEA_ADDR/api/tree" \
      | jq -r '.items | to_entries[].value.id | select(. != "root")'
  }

  # Print note $1, serving it from $CACHE/cache_<id> when that file is
  # non-empty. A response is only moved into the cache after the fetch
  # succeeded, so failed or partial downloads are never cached.
  get() {
    local cached="$CACHE/cache_$1"
    if [ -s "$cached" ]; then
      cat -- "$cached"
      return 0
    fi
    local partial="$cached.partial"
    local rc=0
    _get "$@" >"$partial" || rc=$?
    if ((rc != 0)); then
      rm -f -- "$partial"
      return "$rc"
    fi
    mv -- "$partial" "$cached"
    cat -- "$cached"
  }

  # Fetch note $1 from the server, bypassing the cache.
  _get() {
    ncurl "$NOTEA_ADDR/api/notes/$1"
  }

  "$@"
)

#######################################
# Placeholder for crawler helpers; nothing calls it yet.
#######################################
crawler() (
  should() {
    # Bash rejects an empty function body, so keep an explicit no-op.
    :
  }
)

# Run main only when executed directly, not when sourced.
if [ "$0" == "${BASH_SOURCE[0]}" ]; then
  main "$@"
fi