begin app

master
Bel LaPointe 2022-02-01 07:48:37 -07:00
parent c31505a65e
commit 56b5984fda
2 changed files with 105 additions and 0 deletions

app/crawler/main.sh Normal file

@@ -0,0 +1,104 @@
#! /bin/bash
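# Notea bookmark crawler: walks every note id on the Notea server, crawls
# notes that look like bookmarks at most once per CRAWL_INTERVAL, then
# rewrites them. crawl and rewrite are still stubs at this point.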
main() {
config
for id in $(ids); do
if should_crawl "$id"; then
crawl "$id"
fi
rewrite "$id"
done
}
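# Shell strict-mode options plus the environment the rest of the script reads.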
config() {
set -o pipefail
set -e
CACHE=$(mktemp -d) # assigned separately so set -e sees mktemp's exit status
export CACHE
export CRAWL_INTERVAL=$((60 * 5)) # seconds between crawls of the same note
export NOTEA_ADDR="${NOTEA_ADDR:-"http://localhost:3000"}"
}
log() {
echo "$(date)> $*" >&2
}
ids() {
notea ids
}
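# should_crawl: succeed when the note was never crawled, or when its marker
# file under $CACHE is at least CRAWL_INTERVAL seconds old.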
should_crawl() {
local f="$CACHE/crawled_$1"
if [ ! -f "$f" ]; then
return 0
fi
local last_crawled now
last_crawled=$(date -r "$f" +%s) # GNU date: the marker file's mtime
now=$(date +%s)
if ((now - last_crawled >= CRAWL_INTERVAL)); then
return 0 # marker is stale: crawl again
fi
return 1 # crawled recently: skip
}
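# crawl: fetch the note body and return early when it does not look like a
# bookmark; the crawl itself is not implemented yet.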
crawl() {
local id="$1"
local json content
json=$(notea get "$id") # assigned separately so set -e catches a failed fetch
content=$(echo "$json" | jq -r .content)
if ! is_crawlable "$content"; then
return 0
fi
log not impl crawl
return 1
}
is_crawlable() {
# A note is crawlable when its entire content is a bare URL, or when its
# first line reads "_source_: <url>". URL pattern adapted from:
# https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
if echo "$*" | tr -d '\n' | grep -qE "^[ ]*$url_pattern[ ]*$"; then
return 0
fi
if echo "$*" | head -n 1 | grep -qE "^[ ]*_source_: $url_pattern[ ]*$"; then
return 0
fi
return 1
}
rewrite() {
log not impl rewrite
return 1
}
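# notea: minimal client for the Notea HTTP API. The body runs in a subshell
# so the helper functions stay out of the global namespace; the trailing
# "$@" dispatches subcommands such as "notea ids" and "notea get <id>".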
notea() (
ncurl() {
curl -sS "$@"
}
ids() {
ncurl "$NOTEA_ADDR/api/tree" \
| jq -r '.items | to_entries[].value.id' \
| grep -v '^root$'
}
get() {
local cached="$CACHE/cache_$1"
# Serve the cached copy when a previous fetch left a non-empty response;
# grep both tests for content and prints it.
if [ -f "$cached" ] && grep . "$cached"; then
return 0
fi
_get "$@" | tee "$cached"
}
_get() {
ncurl "$NOTEA_ADDR/api/notes/$1"
}
"$@"
)
crawler() (
should() {
: # stub; a bash function body may not be empty
}
"$@"
)
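# Run main only when the script is executed directly, not when sourced.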
if [ "$0" == "$BASH_SOURCE" ]; then
main "$@"
fi

app/ui/run.sh Symbolic link

@@ -0,0 +1 @@
../../spike/review/run.sh
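
For reference, a minimal way to exercise this script against a Notea server, assuming one is already listening on the default address (the invocations below are illustrative, not part of the commit):

    bash app/crawler/main.sh
    # or poke at the API client directly by sourcing the functions first:
    source app/crawler/main.sh && config && notea ids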