notea-de-me/app/crawler/main.sh

#!/bin/bash
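# Crawl each Notea note that references an external source (currently only
# GitLab) and rewrite its content from what was fetched. The actual crawl
# and rewrite steps are still stubs (see crawl_with and rewrite below).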
main() {
  config
  for id in $(ids); do
    if should_crawl "$id"; then
      crawl "$id"
    fi
    rewrite "$id"
  done
}
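# Global configuration: cache directory for per-note crawl markers, crawl
# interval in seconds, the Notea address, and the GitLab token consumed by
# gitlab.sh, which provides the gitlab backend used by crawl.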
config() {
  set -o pipefail
  set -e
  export CACHE="${CACHE:-"$(mktemp -d)"}"
  export CACHE_DURATION=$((60 * 5))
  export CRAWL_INTERVAL=$((60 * 5))
  export NOTEA_ADDR="${NOTEA_ADDR:-"http://localhost:3000"}"
  export GITLAB_PAT
  source ./gitlab.sh
}
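# Timestamped log line on stderr, keeping stdout free for command output.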
log() {
  echo "$(date)> $*" >&2
}
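# List all note ids. Assumes a notea helper (a CLI on PATH or a sourced
# function) that talks to the instance at NOTEA_ADDR.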
ids() {
  notea ids
}
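# A note is due for a crawl when its cache marker is absent or older than
# CRAWL_INTERVAL (a successful crawl is presumably meant to touch the marker).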
should_crawl() {
  local f="$CACHE/crawled.$1"
  if [ ! -f "$f" ]; then
    return 0
  fi
  local last_crawled=$(date -r "$f" +%s) # marker mtime (GNU date -r)
  local now=$(date +%s)
  if ((now - last_crawled >= CRAWL_INTERVAL)); then
    return 0
  fi
  return 1
}
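# Fetch a note, and if its content names a crawlable source, dispatch to the
# first backend whose `is` check accepts that source (only gitlab so far).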
crawl() {
  local id="$1"
  local json="$(notea get "$id")"
  local content="$(echo "$json" | jq -r .content)"
  if ! is_crawlable "$content"; then
    return 0
  fi
  local crawlable_source="$(extract_crawlable_source "$content")"
  for backend in gitlab; do
    if "$backend" is "$crawlable_source"; then
      crawl_with "$backend" "$json"
      return $?
    fi
  done
  log "unknown backend for $crawlable_source"
  return 1
}
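# The source is the last whitespace-separated field of the note's first line:
# the URL itself in a bare-URL note, or the URL after the "_source_:" prefix.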
extract_crawlable_source() {
  echo "$*" | head -n 1 | awk '{print $NF}'
}
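# Expand the source into the list of items to crawl via the backend's
# `expand` subcommand. Crawling itself is not implemented yet.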
crawl_with() {
  local backend="$1"
  local json="$2"
  local content="$(echo "$json" | jq -r .content)"
  local crawlable_source="$(extract_crawlable_source "$content")"
  # Read the expansion into an array, one entry per line.
  local -a expanded
  mapfile -t expanded < <("$backend" expand "$crawlable_source")
  local context="${expanded[0]}"
  for item in "${expanded[@]}"; do
    log "expand $context, $item"
  done
  log "not impl crawl with"
  return 1
}
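# A note is crawlable when its content is a single bare URL, or when its
# first line has the form "_source_: <url>".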
is_crawlable() {
  # URL pattern adapted from:
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
  if echo "$*" | tr -d '\n' | grep -qE "^[ ]*$url_pattern[ ]*$"; then
    return 0
  fi
  if echo "$*" | head -n 1 | grep -qE "^[ ]*_source_: $url_pattern[ ]*$"; then
    return 0
  fi
  return 1
}
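# Rewrite the note from the crawled content; still a stub. Note that under
# set -e its non-zero return currently aborts main's loop on the first note.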
rewrite() {
  log "not impl rewrite"
  return 1
}
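# Run main only when executed directly; sourcing the file just loads the
# functions.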
if [ "$0" == "$BASH_SOURCE" ]; then
main "$@"
fi