#!/bin/bash

main() {
    config
    for id in $(ids); do
        crawl "$id"
        rewrite "$id"
    done
}

config() {
    set -o pipefail
    set -e
    # Cache directory and entry lifetime (five minutes, in seconds).
    export CACHE="${CACHE:-"$(mktemp -d)"}"
    export CACHE_DURATION=$((60 * 5))
    export NOTEA_ADDR="${NOTEA_ADDR:-"http://localhost:3000"}"
    # Re-export so the sourced backends see the token.
    export GITLAB_PAT="$GITLAB_PAT"
    source ./gitlab.sh
    source ./cache.sh
    source ./notea.sh
}

log() {
    echo "$(date)> $*" >&2
}

ids() {
    notea ids
}

# Crawl a note, serving and storing the result through the cache.
crawl() {
    local cache_key="crawled $*"
    if cache get "$cache_key"; then
        return
    fi
    _crawl "$@" | cache put "$cache_key"
}

_crawl() {
    local id="$1"
    local json="$(notea get "$id")"
    local content="$(echo "$json" | jq -r .content)"
    if ! is_crawlable "$content"; then
        return 0
    fi
    local crawlable_source="$(extract_crawlable_source "$content")"
    # Dispatch to the first backend that claims the source.
    for backend in gitlab; do
        if "$backend" is "$crawlable_source"; then
            crawl_with "$backend" "$json"
            return $?
        fi
    done
    log "unknown backend for $crawlable_source"
    return 1
}

# The source URL is the last word of the note's first line.
extract_crawlable_source() {
    echo "$*" | head -n 1 | awk '{print $NF}'
}

crawl_with() {
    local backend="$1"
    local json="$2"
    local content="$(echo "$json" | jq -r .content)"
    local crawlable_source="$(extract_crawlable_source "$content")"
    # Read the backend's expansion into an array, one element per line.
    local expanded
    mapfile -t expanded < <("$backend" expand "$crawlable_source")
    local context="${expanded[0]}"
    # Log each remaining expanded entry against the leading context line.
    for item in "${expanded[@]:1}"; do
        log "expand $context, $item"
    done
    log "crawl_with: not implemented"
    return 1
}

# A note is crawlable if its content is a bare URL, or if its first
# line reads "_source_: <url>".
is_crawlable() {
    # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
    local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
    if echo "$*" | tr -d '\n' | grep -qE "^[ ]*$url_pattern[ ]*$"; then
        return 0
    fi
    if echo "$*" | head -n 1 | grep -qE "^[ ]*_source_: $url_pattern[ ]*$"; then
        return 0
    fi
    return 1
}

rewrite() {
    log "rewrite: not implemented"
    return 1
}

# Run main only when executed directly, not when sourced.
if [[ "$0" == "${BASH_SOURCE[0]}" ]]; then
    main "$@"
fi