#!/bin/bash

# Crawls Notea notes whose content is a bare URL (or whose first line is a
# `_source_:` URL header) and rewrites them. Per-note crawl timestamps and
# fetched JSON are cached under a temp directory for CRAWL_INTERVAL seconds.

main() {
  config
  for id in $(ids); do
    if should_crawl "$id"; then
      crawl "$id"
    fi
    rewrite "$id"
  done
}

config() {
  set -o pipefail
  set -e
  # Assign separately from `export` so a mktemp failure isn't masked under set -e.
  CACHE="$(mktemp -d)"
  export CACHE
  export CRAWL_INTERVAL=$((60 * 5)) # seconds between crawls of the same note
  export NOTEA_ADDR="${NOTEA_ADDR:-"http://localhost:3000"}"
}

log() {
  echo "$(date)> $*" >&2
}

ids() {
  notea ids
}

# Returns 0 (should crawl) when the note has never been crawled, or when the
# last crawl is at least CRAWL_INTERVAL seconds old. The original comparison
# was inverted: it returned 0 when the last crawl was *recent*.
should_crawl() {
  local f="$CACHE/crawled_$1"
  if [ ! -f "$f" ]; then
    return 0
  fi
  local last_crawled now
  last_crawled=$(date -r "$f" +%s) # mtime of the marker file
  now=$(date +%s)
  if ((now - last_crawled >= CRAWL_INTERVAL)); then
    return 0
  fi
  return 1
}

crawl() {
  local id="$1"
  # Declare before assigning so set -e still catches a failed fetch.
  local json content
  json="$(notea get "$id")"
  content="$(echo "$json" | jq -r .content)"
  if ! is_crawlable "$content"; then
    return 0
  fi
  log not impl crawl
  return 1
}

# A note is crawlable if its entire content is a single URL, or if its first
# line is `_source_: <url>`.
is_crawlable() {
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
  # -q keeps grep's matched lines off stdout; only the exit status matters here.
  if echo "$*" | tr -d '\n' | grep -qE "^[ ]*$url_pattern[ ]*$"; then
    return 0
  fi
  if echo "$*" | head -n 1 | grep -qE "^[ ]*_source_: $url_pattern[ ]*$"; then
    return 0
  fi
  return 1
}

rewrite() {
  log not impl rewrite
  return 1
}

# Thin client for the Notea HTTP API. Runs in a subshell so its helpers stay
# local, and dispatches to the subcommand named in $1 (e.g. `notea ids`).
notea() (
  ncurl() {
    curl -sS "$@"
  }
  ids() {
    ncurl "$NOTEA_ADDR/api/tree" \
      | jq -r '.items | to_entries[].value.id' \
      | grep -v '^root$'
  }
  get() {
    local cached="$CACHE/cache_$1"
    # Serve from cache only if the cached response is non-empty.
    if [ -s "$cached" ]; then
      cat "$cached"
      return 0
    fi
    _get "$@" | tee "$cached"
  }
  _get() {
    ncurl "$NOTEA_ADDR/api/notes/$1"
  }
  "$@"
)

# Stub for crawler subcommands; not implemented yet. The empty function body
# was a syntax error, so `should` is a no-op for now.
crawler() (
  should() {
    :
  }
)

if [[ "$0" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi
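
# Example usage, as a sketch: the filename `crawl.sh` and the remote host
# below are assumptions for illustration, not names from this repo.
#
#   # Run against a local Notea instance (the default NOTEA_ADDR):
#   ./crawl.sh
#
#   # Point at a different instance:
#   NOTEA_ADDR=http://notes.example.com:3000 ./crawl.sh
#
#   # The $0/BASH_SOURCE guard above means sourcing skips main, so individual
#   # helpers can be exercised directly:
#   . ./crawl.sh && config && notea ids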