notea-de-me/app/crawler/main.sh

#!/bin/bash
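# Crawl each Notea note that references an external source (currently only
# GitLab) and rewrite its content from what was fetched. The actual crawl
# and rewrite steps are still stubs (see crawl_with and rewrite below).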
main() {
  config
  for id in $(ids); do
    if should_crawl "$id"; then
      crawl "$id"
    fi
    rewrite "$id"
  done
}
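# Global configuration: cache directory for per-note crawl markers, crawl
# interval in seconds, the Notea address, and the GitLab token consumed by
# gitlab.sh, which provides the gitlab backend used by crawl.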
config() {
  set -o pipefail
  set -e
  export CACHE="${CACHE:-"$(mktemp -d)"}"
  export CACHE_DURATION=$((60 * 5))
  export CRAWL_INTERVAL=$((60 * 5))
  export NOTEA_ADDR="${NOTEA_ADDR:-"http://localhost:3000"}"
  export GITLAB_PAT
  source ./gitlab.sh
}
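# Timestamped log line on stderr, keeping stdout free for command output.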
log() {
  echo "$(date)> $*" >&2
}
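# List all note ids. Assumes a notea helper (a CLI on PATH or a sourced
# function) that talks to the instance at NOTEA_ADDR.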
ids() {
  notea ids
}
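# A note is due for a crawl when its cache marker is absent or older than
# CRAWL_INTERVAL (a successful crawl is presumably meant to touch the marker).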
should_crawl() {
  local f="$CACHE/crawled.$1"
  if [ ! -f "$f" ]; then
    return 0
  fi
  local last_crawled=$(date -r "$f" +%s) # marker mtime (GNU date -r)
  local now=$(date +%s)
  if ((now - last_crawled >= CRAWL_INTERVAL)); then
    return 0
  fi
  return 1
}
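# Fetch a note, and if its content names a crawlable source, dispatch to the
# first backend whose `is` check accepts that source (only gitlab so far).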
crawl() {
  local id="$1"
  local json="$(notea get "$id")"
  local content="$(echo "$json" | jq -r .content)"
  if ! is_crawlable "$content"; then
    return 0
  fi
  local crawlable_source="$(extract_crawlable_source "$content")"
  for backend in gitlab; do
    if "$backend" is "$crawlable_source"; then
      crawl_with "$backend" "$json"
      return $?
    fi
  done
  log "unknown backend for $crawlable_source"
  return 1
}
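# The source is the last whitespace-separated field of the note's first line:
# the URL itself in a bare-URL note, or the URL after the "_source_:" prefix.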
extract_crawlable_source() {
  echo "$*" | head -n 1 | awk '{print $NF}'
}
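# Expand the source into the list of items to crawl via the backend's
# `expand` subcommand. Crawling itself is not implemented yet.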
crawl_with() {
  local backend="$1"
  local json="$2"
  local content="$(echo "$json" | jq -r .content)"
  local crawlable_source="$(extract_crawlable_source "$content")"
  # Read the expansion into an array, one entry per line.
  local -a expanded
  mapfile -t expanded < <("$backend" expand "$crawlable_source")
  local context="${expanded[0]}"
  for item in "${expanded[@]}"; do
    log "expand $context, $item"
  done
  log "not impl crawl with"
  return 1
}
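# A note is crawlable when its content is a single bare URL, or when its
# first line has the form "_source_: <url>".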
is_crawlable() {
  # URL pattern adapted from:
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
  if echo "$*" | tr -d '\n' | grep -qE "^[ ]*$url_pattern[ ]*$"; then
    return 0
  fi
  if echo "$*" | head -n 1 | grep -qE "^[ ]*_source_: $url_pattern[ ]*$"; then
    return 0
  fi
  return 1
}
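# Rewrite the note from the crawled content; still a stub. Note that under
# set -e its non-zero return currently aborts main's loop on the first note.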
rewrite() {
  log "not impl rewrite"
  return 1
}
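# Run main only when executed directly; sourcing the file just loads the
# functions.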
if [ "$0" == "$BASH_SOURCE" ]; then
main "$@"
fi