#!/bin/bash
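
# Crawler for Notea notes: for each note id, fetch the note, and when its
# content names a crawlable source (currently only GitLab), crawl it through
# the matching backend and then rewrite it. crawl_with and rewrite are still
# stubs.
#
# Typical invocation (names and values illustrative):
#   NOTEA_ADDR=http://localhost:3000 GITLAB_PAT=... ./crawler.sh
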
main() {
    config
    for id in $(ids); do
        # Per-note failures are logged rather than fatal, since the script
        # runs under set -e.
        if should_crawl "$id"; then
            crawl "$id" || log "crawl failed for $id"
        fi
        rewrite "$id" || log "rewrite failed for $id"
    done
}

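# config sets shell safety flags and exports the knobs the rest of the script
# reads. GITLAB_PAT has no default and must come from the environment.
# CACHE_DURATION is exported but unread here; presumably gitlab.sh, which is
# expected to define the `gitlab` backend command, consumes it.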
config() {
    set -o pipefail
    set -e

    export CACHE="${CACHE:-"$(mktemp -d)"}"
    export CACHE_DURATION=$((60 * 5))
    export CRAWL_INTERVAL=$((60 * 5))
    export NOTEA_ADDR="${NOTEA_ADDR:-"http://localhost:3000"}"
    export GITLAB_PAT="$GITLAB_PAT"

    source ./gitlab.sh
}

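# log writes timestamped messages to stderr so stdout stays free for data;
# ids lists every note id via the notea CLI.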
log() {
    echo "$(date)> $*" >&2
}

ids() {
    notea ids
}

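# should_crawl rate-limits crawling per note via a marker file's mtime: a
# note is due when $CACHE/crawled.<id> is missing or older than
# CRAWL_INTERVAL seconds. Nothing in this file creates the marker yet; a
# finished crawl_with would presumably touch it on success.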
should_crawl() {
    local f="$CACHE/crawled.$1"
    if [ ! -f "$f" ]; then
        return 0
    fi
    # date -r FILE +%s prints the file's mtime (GNU coreutils).
    local last_crawled now
    last_crawled=$(date -r "$f" +%s)
    now=$(date +%s)
    if ((now - last_crawled >= CRAWL_INTERVAL)); then
        return 0
    fi
    return 1
}

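# crawl fetches a note and, if its content declares a crawlable source,
# dispatches to the first backend that claims it via `$backend is <source>`.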
crawl() {
    local id="$1"
    local json content crawlable_source
    json="$(notea get "$id")"
    content="$(jq -r .content <<<"$json")"
    if ! is_crawlable "$content"; then
        return 0
    fi
    crawlable_source="$(extract_crawlable_source "$content")"
    for backend in gitlab; do
        if "$backend" is "$crawlable_source"; then
            crawl_with "$backend" "$json"
            return $?
        fi
    done
    log "unknown backend for $crawlable_source"
    return 1
}

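# The source is the last whitespace-separated token of the note's first line,
# which covers both accepted shapes: a bare URL and a "_source_: <url>" line.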
extract_crawlable_source() {
    echo "$*" | head -n 1 | awk '{print $NF}'
}

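# crawl_with asks the backend to expand the source into one entry per output
# line (what an entry is depends on the backend contract, which isn't shown
# here) and, for now, only logs the expansion.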
crawl_with() {
    local backend="$1"
    local json="$2"

    local content crawlable_source
    content="$(jq -r .content <<<"$json")"
    crawlable_source="$(extract_crawlable_source "$content")"

    # Read the backend's expansion into an array, one element per line.
    local expanded=()
    mapfile -t expanded < <("$backend" expand "$crawlable_source")

    local context="${expanded[0]}"
    local entry
    for entry in "${expanded[@]}"; do
        log "expand $context, $entry"
    done

    log "not impl crawl with"
    return 1
}

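# A note is crawlable when its entire body is a single URL, or when its first
# line has the form "_source_: <url>". Only the exit status matters here,
# hence grep -q.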
is_crawlable() {
    # URL regex adapted from:
    # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
    local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
    if echo "$*" | tr -d '\n' | grep -qE "^[ ]*$url_pattern[ ]*$"; then
        return 0
    fi
    if echo "$*" | head -n 1 | grep -qE "^[ ]*_source_: $url_pattern[ ]*$"; then
        return 0
    fi
    return 1
}

rewrite() {
    log "not impl rewrite"
    return 1
}

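# Run main only when this file is executed directly, not when sourced.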
if [ "$0" == "$BASH_SOURCE" ]; then
|
|
main "$@"
|
|
fi
|