notea-de-me/app/crawler/main.sh

#!/bin/bash
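
# Crawl every Notea note whose content points at an external source
# (currently only GitLab), then rewrite the crawled notes.
# The notea, cache and gitlab commands come from the scripts sourced in
# config; the contracts noted below are inferred from how they are used here.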
main() {
  config
  for id in $(ids); do
    crawl "$id"
  done
  for id in $(ids); do
    rewrite "$id"
  done
}
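
# Strict shell options plus the environment the helpers expect. CACHE
# defaults to a throwaway temp directory, so runs are effectively uncached
# unless the caller pins CACHE to a stable path.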
config() {
  set -o pipefail
  set -e
  export CACHE="${CACHE:-"$(mktemp -d)"}"
  mkdir -p "$CACHE"
  export CACHE_DURATION=$((60 * 5))
  export NOTEA_ADDR="${NOTEA_ADDR:-"http://localhost:3000"}"
  export GITLAB_PAT="$GITLAB_PAT"
  source ./gitlab.sh
  source ./cache.sh
  source ./notea.sh
}
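
# Timestamped logging to stderr, keeping stdout free for data
# (crawl output is piped into the cache).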
log() {
  echo "$(date)> $*" >&2
}
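
# Every note id known to the Notea instance, via notea.sh.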
ids() {
  notea ids
}
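
# Cached wrapper around _crawl. Assumption: "cache get" prints the stored
# value and fails when the key is missing or older than CACHE_DURATION.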
crawl() {
  local cache_key="crawled $*"
  if cache get "$cache_key"; then
    return
  fi
  _crawl "$@" | cache put "$cache_key"
}
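
# Fetch one note, check that its content declares a crawlable source, and
# dispatch to the first backend that claims it ("$backend is <source>").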
_crawl() {
  log "crawling $*"
  local id="$1"
  local json
  json="$(notea get "$id")"
  local content
  content="$(echo "$json" | jq -r .content)"
  if ! is_crawlable "$content"; then
    log "$content is not crawlable"
    return 0
  fi
  local crawlable_source
  crawlable_source="$(extract_crawlable_source "$content")"
  for backend in gitlab; do
    if $backend is "$crawlable_source"; then
      crawl_with "$backend" "$json"
      return $?
    fi
  done
  log "unknown backend for $crawlable_source"
  return 1
}
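
# The source URL is the last whitespace-separated field of the first line,
# which covers both accepted forms: a bare URL, or "_source_: <url>".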
extract_crawlable_source() {
  echo "$*" | head -n 1 | awk '{print $NF}'
}
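
# Expand the source into entries to crawl. Assumption: "$backend expand"
# prints one base64-encoded entry per line, the first entry being the
# context (the source itself) and the rest its children.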
crawl_with() {
  local backend="$1"
  local json="$2"
  local content
  content="$(echo "$json" | jq -r .content)"
  local crawlable_source
  crawlable_source="$(extract_crawlable_source "$content")"
  local expanded=($($backend expand "$crawlable_source"))
  local context="${expanded[0]}"
  for i in $(seq 1 $((${#expanded[@]} - 1))); do
    log "expand $(echo "$context" | base64 --decode), $(echo "${expanded[i]}" | base64 --decode)"
  done
  log "crawl_with: not implemented"
  return 1
}
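
# A note is crawlable when its entire content is a single URL, or when its
# first line has the form "_source_: <url>".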
is_crawlable() {
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
  if echo "$*" | tr -d '\n' | grep -qE "^[ ]*$url_pattern[ ]*$"; then
    return 0
  fi
  if echo "$*" | head -n 1 | grep -qE "^[ ]*_source_: $url_pattern[ ]*$"; then
    return 0
  fi
  return 1
}
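
# Second pass over the notes; not implemented yet.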
rewrite() {
  log "rewrite: not implemented"
  return 1
}
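
# Run main only when executed directly; sourcing the file just defines
# the functions.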
if [ "$0" == "$BASH_SOURCE" ]; then
main "$@"
fi