notea-de-me/app/crawler/main.sh

#! /bin/bash

main() {
   config
   for id in $(ids); do
      crawl "$id"
   done
   for id in $(ids); do
      rewrite "$id"
   done
}

config() {
   set -o pipefail
   set -e
   export CACHE="${CACHE:-"$(mktemp -d)"}"
   mkdir -p "$CACHE"
   export CACHE_DURATION=$((60*50))
   export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
   export GITLAB_PAT="$GITLAB_PAT"
   source ./gitlab.sh
   source ./cache.sh
   source ./notes.sh
}

log() {
   echo "$(echo "$(date +%H:%M:%S)> $*" | tr '\n' ' ')" >&2
}

ids() {
   notes ids
}

crawl() {
   local cache_key="crawled $*"
	# TODO
   if false && cache get "$cache_key"; then
      return
   fi
   _crawl "$@" | cache put "$cache_key"
}

_crawl() {
   log "crawling $*"
   local id="$1"
   local content="$(notes get "$id")"
	local json="$(
		printf '{"content": %s, "id": "%s"}' \
			"$(echo "$content" | jq -Rs)" \
			"$id"
	)"
   if ! is_crawlable "$content"; then
      log "not crawlable: '${content:0:20}'..."
      return 0
   fi
   local crawlable_source="$(extract_crawlable_source "$content")"
   for backend in gitlab; do
      if $backend is "$crawlable_source"; then
         crawl_with $backend "$json"
         return $?
      fi
   done
   log "unknown backend for $crawlable_source"
   return 1
}

extract_crawlable_source() {
   echo "$*" | head -n 1 | awk '{print $NF}' | sed 's/^<//' | sed 's/>$//'
}

crawl_with() {
   local backend="$1"
   local json="$2"

   local content="$(echo "$json" | jq -r .content)"
   local crawlable_source="$(extract_crawlable_source "$content")"

   local expanded=($($backend expand "$crawlable_source"))
   log expand $crawlable_source:
	one() {
		local i="$1"
      export TITLE="$(
			echo "$i" | base64 --decode
		)"
      export CONTENT="$(
			$backend get "$crawlable_source" "$i"
		)"
      export ID="$(
			echo "$crawlable_source/$TITLE" | base64 | md5sum | awk '{print $1}'
		)"
      export PID="$(
			echo $json | jq -r .id
		)"
      log "   $PID/$ID ($TITLE): ${#CONTENT}"
      push_crawled "$PID/$ID" "$TITLE" "$CONTENT"
	}
   for i in $(seq 1 $(("${#expanded[@]}"-1))); do
		one "${expanded[i]}"
   done
}

push_crawled() {
   notes put "$@"
}

is_crawlable() {
   local crawlable_source="$(extract_crawlable_source "$*")"
   # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
   local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
   echo "$crawlable_source" | grep -q -E "^[ ]*$url_pattern[ ]*$"
}

rewrite() {
   log not impl: rewrite "#abc-def" to "#h-abc-def"
   log not impl: rewrite "./asdf" to "./zyxw" or "absolute.com/asdf"
   log not impl rewrite, change images
   return 1
}

if [ "$0" == "$BASH_SOURCE" ]; then
   main "$@"
fi