notea-de-me/app/crawler/main.sh

#! /bin/bash
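#
# Crawler for notea-de-me. First pass: walk every note id served by the Notea
# instance, fetch notes whose first content line ends in a crawlable URL
# (currently only a gitlab backend), expand that source into child notes, and
# cache results so repeated runs skip already-crawled ids. Second pass would
# rewrite the notes, but that step is not implemented yet. The notea/cache/
# gitlab commands come from the scripts sourced in config(); their interfaces
# are inferred here from how they are called below.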
main() {
  config
  for id in $(ids); do
    crawl "$id"
  done
  for id in $(ids); do
    rewrite "$id"
  done
}
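# config: one-time environment setup. Enables errexit and pipefail, prepares
# the cache directory (a temp dir unless CACHE is already set), exports the
# cache duration (50 minutes, presumably honored by cache.sh), the Notea
# address, and the GitLab token, then sources the helpers (gitlab.sh,
# cache.sh, notea.sh) that provide the gitlab/cache/notea commands used below.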
config() {
  set -o pipefail
  set -e
  export CACHE="${CACHE:-"$(mktemp -d)"}"
  mkdir -p "$CACHE"
  export CACHE_DURATION=$((60 * 50))
  export NOTEA_ADDR="${NOTEA_ADDR:-"http://localhost:3000"}"
  export GITLAB_PAT="$GITLAB_PAT"
  source ./gitlab.sh
  source ./cache.sh
  source ./notea.sh
}
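# log: timestamped messages on stderr; embedded newlines are flattened to
# spaces so multi-line arguments stay on a single log line.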
log() {
  echo "$(echo "$(date +%H:%M:%S)> $*" | tr '\n' ' ')" >&2
}
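# ids: lists every note id known to the Notea instance (delegates to the
# notea helper sourced in config).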
ids() {
  notea ids
}
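# crawl: memoized wrapper around _crawl. If the cache already holds an entry
# for this id the crawl is skipped; otherwise _crawl's output is stored under
# the same key, so later runs (presumably within CACHE_DURATION) are no-ops.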
crawl() {
  local cache_key="crawled $*"
  if cache get "$cache_key"; then
    return
  fi
  _crawl "$@" | cache put "$cache_key"
}
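# _crawl: fetches one note, checks whether its content points at a crawlable
# URL, and dispatches to the first backend that claims it ("$backend is").
# Only the gitlab backend exists in the list today; an unrecognized source is
# logged and the crawl fails.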
_crawl() {
  log "crawling $*"
  local id="$1"
  local json="$(notea get "$id")"
  local content="$(echo "$json" | jq -r .content)"
  if ! is_crawlable "$content"; then
    log "not crawlable: '${content:0:20}'..."
    return 0
  fi
  local crawlable_source="$(extract_crawlable_source "$content")"
  for backend in gitlab; do
    if "$backend" is "$crawlable_source"; then
      crawl_with "$backend" "$json"
      return $?
    fi
  done
  log "unknown backend for $crawlable_source"
  return 1
}
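# extract_crawlable_source: takes the last whitespace-separated token of the
# first content line and strips optional surrounding angle brackets, so a
# first line ending in e.g. "<https://gitlab.example/some/project>"
# (hypothetical) yields the bare URL.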
extract_crawlable_source() {
  echo "$*" | head -n 1 | awk '{print $NF}' | sed 's/^<//' | sed 's/>$//'
}
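# crawl_with: expands the source via "$backend expand" (expected to print
# whitespace-separated, base64-encoded entry names), then for each entry
# exports TITLE/CONTENT/ID/PID and calls "notea put", which presumably reads
# those variables to create the child note under the parent note's id (PID).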
crawl_with() {
  local backend="$1"
  local json="$2"
  local content="$(echo "$json" | jq -r .content)"
  local crawlable_source="$(extract_crawlable_source "$content")"
  local expanded=($("$backend" expand "$crawlable_source"))
  log "expand $crawlable_source:"
  # Iteration starts at index 1: element 0 of the expansion is skipped
  # (presumably a header line emitted by "$backend expand").
  for i in $(seq 1 $((${#expanded[@]} - 1))); do
    export TITLE="$(echo "${expanded[i]}" | base64 --decode)"
    export CONTENT="$("$backend" get "$crawlable_source" "${expanded[i]}")"
    export ID="$(echo "$crawlable_source/$TITLE" | base64 | md5sum | awk '{print $1}')"
    export PID="$(echo "$json" | jq -r .id)"
    log " $PID/$ID ($TITLE): ${#CONTENT}"
    notea put
  done
}
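# is_crawlable: true when the extracted source is a bare http(s) URL and
# nothing else.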
is_crawlable() {
  local crawlable_source="$(extract_crawlable_source "$*")"
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
  echo "$crawlable_source" | grep -q -E "^[ ]*$url_pattern[ ]*$"
}
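# rewrite: second-pass step (per the log message, intended to change images);
# not implemented yet, so it logs and fails.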
rewrite() {
  log "not implemented: rewrite, change images"
  return 1
}
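# Run main only when the script is executed directly; sourcing the file just
# defines the functions above.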
if [ "$0" == "$BASH_SOURCE" ]; then
main "$@"
fi