notea-de-me/app/crawler/main.sh

#!/bin/bash
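# Crawl every note, expanding notes that point at an external source (currently
# only GitLab) into autogenerated sub-notes, then rewrite links (not implemented yet).
#
# Usage sketch (assumed; defaults come from config() below):
#   GITLAB_PAT=<token> NOTES_ADDR=http://localhost:3004 ./main.sh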
main() {
  config
  for id in $(ids); do
    crawl "$id"
  done
  for id in $(ids); do
    rewrite "$id"
  done
}
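# Shell options, environment defaults, and helper libraries.
# CACHE defaults to a fresh temp dir, NOTES_ADDR to the local notes service, and
# GITLAB_PAT is expected to come from the caller's environment.
# CACHE_DURATION is 60*50 = 3000, presumably seconds, consumed by cache.sh.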
config() {
  set -o pipefail
  set -e
  export CACHE="${CACHE:-"$(mktemp -d)"}"
  mkdir -p "$CACHE"
  export CACHE_DURATION=$((60*50))
  export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
  export GITLAB_PAT="$GITLAB_PAT"
  source ./gitlab.sh
  source ./cache.sh
  source ./notes.sh
}
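# Log to stderr with an HH:MM:SS prefix; newlines in the message are flattened to spaces.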
log() {
  echo "$(echo "$(date +%H:%M:%S)> $*" | tr '\n' ' ')" >&2
}
ids() {
  notes ids
}
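# Crawl one note and store the result under a cache key. The cache lookup is
# currently disabled by the "false &&" guard (see the TODO below).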
crawl() {
  local cache_key="crawled $*"
  # TODO
  if false && cache get "$cache_key"; then
    return
  fi
  _crawl "$@" | cache put "$cache_key"
}
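# Fetch the note's content, wrap it in a small JSON envelope, and dispatch it to
# the first backend that recognizes its source (only gitlab for now).
# Non-crawlable notes are skipped.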
_crawl() {
  log "crawling? $*"
  local id="$1"
  local content="$(notes get "$id")"
  local json="$(
    printf '{"content": %s, "id": "%s"}' \
      "$(echo "$content" | jq -Rs)" \
      "$id"
  )"
  if ! is_crawlable "$content"; then
    log "not crawlable: '${content:0:20}'..."
    return 0
  fi
  local crawlable_source="$(extract_crawlable_source "$content")"
  for backend in gitlab; do
    if $backend is "$crawlable_source"; then
      crawl_with "$backend" "$json"
      return $?
    fi
  done
  log "unknown backend for $crawlable_source"
  return 1
}
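# The source is the last whitespace-separated token of the note's first line,
# with an optional surrounding <...> stripped.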
extract_crawlable_source() {
  echo "$*" | head -n 1 | awk '{print $NF}' | sed 's/^<//' | sed 's/>$//'
}
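# Expand one crawlable note: delete any previously generated sub-notes under the
# parent id, then ask the backend for the expanded entries and push each one back
# into the notes service as an autogenerated sub-note. The helpers it needs are
# defined inline.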
crawl_with() {
  local backend="$1"
  local json="$2"
  local pid="$(echo "$json" | jq -r .id)"
  local content="$(echo "$json" | jq -r .content)"
  local crawlable_source="$(extract_crawlable_source "$content")"
  local expanded=($($backend expand "$crawlable_source"))
  log "purge $crawlable_source:"
  for subid in $(notes ids | grep "^$pid/"); do
    notes del "$subid"
  done
  log "expand $crawlable_source:"
  notes_mkdir_p() {
    local id="$1"
    local subtitle="$2"
    notes put "$id" "$subtitle" "autogenerated content"
  }
  one() {
    encode() {
      base64 | md5sum | cut -c 1-10 | awk '{print $1}' | tr -d '\n'
    }
    local i="$1"
    export TITLE="$(
      echo "$i" | base64 --decode
    )"
    TITLE="${TITLE##*/}"
    export CONTENT="$(
      $backend get "$crawlable_source" "$i"
    )"
    export ID="$(
      local sum="$pid/"
      local title_so_far=""
      # Each path segment is re-encoded with base64, presumably so the outer for
      # loop splits on segments rather than on whitespace inside a segment.
      for subtitle in $(echo "$i" | base64 --decode | tr '/' '\n' | while read -r subtitle; do echo "$subtitle" | base64; done); do
        local subtitle="$(echo "$subtitle" | base64 --decode)"
        if [ -n "$title_so_far" ]; then
          notes_mkdir_p "${sum%/}" "${title_so_far%/}" >&2
        fi
        sum+="$(echo "$subtitle" | encode)/"
        title_so_far+="$subtitle/"
      done
      echo "$sum"
    )"
    ID="${ID%/}"
    log " $ID ($TITLE): ${#CONTENT}"
    # $ID already starts with the parent note id, so it can be pushed as-is.
    push_crawled "$ID" "$TITLE" "$CONTENT"
  }
  # expanded[0] is skipped; it appears to be a header/root entry from "$backend expand".
  for i in $(seq 1 $(("${#expanded[@]}"-1))); do
    one "${expanded[i]}"
  done
}
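# notes put <id> <title> <content>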
push_crawled() {
  notes put "$@"
}
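# A note is crawlable when its extracted source is a bare URL (nothing but the
# URL, ignoring surrounding spaces).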
is_crawlable() {
  local crawlable_source="$(extract_crawlable_source "$*")"
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
  echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$"
}
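# Link rewriting is still a stub. It returns 1, so with "set -e" the second loop
# in main() stops at the first note.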
rewrite() {
  log "not impl: rewrite #abc-def to #h-abc-def"
  log "not impl: rewrite ./asdf to ./zyxw or absolute.com/asdf"
  log "not impl: rewrite, change images"
  return 1
}
if [ "$0" == "$BASH_SOURCE" ]; then
main "$@"
fi