notea-de-me/app/crawler/main.sh

#!/bin/bash
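#
# Crawler for the notes service: finds notes whose first line points at an
# external source (currently only a GitLab backend is wired up), expands that
# source into autogenerated child notes, and is intended to later rewrite
# links/images (rewrite() is still a stub).
#
# Environment (see config()): NOTES_ADDR (default http://localhost:3004),
# CACHE (defaults to a temp dir), GITLAB_PAT for the GitLab backend.
# Depends on the sibling scripts gitlab.sh, cache.sh and notes.sh.
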
main() {
  config
  log crawling ids...
  for id in $(crawlable_ids); do
    crawl "$id"
  done
  log rewriting ids...
  for id in $(ids); do
    rewrite "$id"
  done
}
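
# One-time setup: strict-ish shell options, the cache directory and
# CACHE_DURATION (60*50, presumably seconds, consumed by cache.sh), the notes
# server address, and the helper scripts that provide the gitlab/cache/notes
# commands used below.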
config() {
  set -o pipefail
  set -e
  export CACHE="${CACHE:-"$(mktemp -d)"}"
  mkdir -p "$CACHE"
  export CACHE_DURATION=$((60*50))
  export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
  export GITLAB_PAT="$GITLAB_PAT"
  source ./gitlab.sh
  source ./cache.sh
  source ./notes.sh
}
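
# Log to stderr with an HH:MM:SS prefix; newlines in the message are flattened
# to spaces so each call stays on one line.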
log() {
  echo "$(echo "$(date +%H:%M:%S)> $*" | tr '\n' ' ')" >&2
}
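
# All note ids known to the notes service, sorted.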
ids() {
  notes ids | sort
}
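
# Ids of notes whose content looks crawlable (see is_crawlable). Ids nested
# under an already-selected crawlable id are skipped, presumably so earlier
# autogenerated children are not crawled again as sources of their own.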
crawlable_ids() {
  local all_ids=($(ids))
  local crawlable_ids=()
  for id in "${all_ids[@]}"; do
    if for crawlable_id in "${crawlable_ids[@]}"; do
         if [ "$id" != "${id#$crawlable_id/}" ]; then
           echo true
         fi
       done | grep -q true; then
      continue
    fi
    local content="$(notes get "$id")"
    if is_crawlable "$content"; then
      crawlable_ids+=("$id")
    fi
  done
  for crawlable_id in "${crawlable_ids[@]}"; do
    echo "$crawlable_id"
  done
}
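
# Cache wrapper around _crawl. The cache lookup is disabled for now by the
# `false &&` guard (see TODO), so every run re-crawls and refreshes the cache
# entry.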
crawl() {
  local cache_key="crawled $*"
  # TODO
  if false && cache get "$cache_key"; then
    return
  fi
  _crawl "$@" | cache put "$cache_key"
}
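
# Fetch the note, wrap id + content in a small JSON envelope
# ({"content": "...", "id": "..."}) and hand it to the first backend that
# recognizes the extracted source; only `gitlab` is registered here.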
_crawl() {
  local id="$1"
  local content="$(notes get "$id")"
  local json="$(
    printf '{"content": %s, "id": "%s"}' \
      "$(echo "$content" | jq -Rs)" \
      "$id"
  )"
  local crawlable_source="$(extract_crawlable_source "$content")"
  for backend in gitlab; do
    if $backend is "$crawlable_source"; then
      crawl_with $backend "$json"
      return $?
    fi
  done
  log "unknown backend for $crawlable_source"
  return 1
}
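
# The source is expected as the last token on the note's first line; strip a
# surrounding <...> and any leading/trailing slash. For a made-up example,
# a first line ending in <https://gitlab.example.com/group/project/> becomes
# https://gitlab.example.com/group/project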
extract_crawlable_source() {
  echo "$*" | head -n 1 | awk '{print $NF}' | sed 's/^<//' | sed 's/>$//' | sed 's/^\///' | sed 's/\/$//'
}
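
# Purge all existing children of the note, ask the backend to expand the
# source into a list of items (base64-encoded paths, judging by one() below),
# then push one child note per item, plus placeholder notes for intermediate
# path components.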
crawl_with() {
  local backend="$1"
  local json="$2"
  local pid="$(echo "$json" | jq -r .id)"
  local content="$(echo "$json" | jq -r .content)"
  local crawlable_source="$(extract_crawlable_source "$content")"
  local expanded=($($backend expand "$crawlable_source"))
  log purge $crawlable_source:
  for subid in $(notes ids | grep "^$pid/"); do
    notes del "$subid"
  done
  log expand $crawlable_source:
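
  # Create a placeholder note for an intermediate path component, presumably
  # so the generated children always have a parent note to hang off of.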
  notes_mkdir_p() {
    local id="$1"
    local subtitle="${2%/}"
    notes put "$id" "$subtitle" "autogenerated content"
  }
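
  # Handle one expanded item: decode it into a slash-separated title path,
  # fetch its content from the backend, build a stable id by hashing each
  # path component, and push it as a child note of $pid.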
  one() {
    encode() {
      base64 | md5sum | cut -c 1-10 | awk '{print $1}' | tr -d '\n'
    }
    local i="$1"
    local full_title="$(
      echo "$i" | base64 --decode | grep . || echo "${crawlable_source##*/}"
    )"
    full_title="${full_title%/}"
    full_title="${full_title#/}"
    export TITLE="${full_title##*/}"
    export CONTENT="$(
      $backend get "$crawlable_source" "$i"
    )"
    export ID="$(
      local sum="$pid/"
      local title_so_far=""
      for subtitle in $(echo "$full_title" | tr '/' '\n' | while read -r subtitle; do echo "$subtitle" | base64; done); do
        local subtitle="$(echo "$subtitle" | base64 --decode)"
        if [ -n "$title_so_far" ]; then
          local mkdir_p_title="${title_so_far%/}"
          mkdir_p_title="${mkdir_p_title##*/}"
          notes_mkdir_p "${sum%/}" "${mkdir_p_title}" >&2
        fi
        sum+="$(echo "$subtitle" | encode)/"
        title_so_far+="$subtitle/"
      done
      echo "$sum"
    )"
    ID="${ID%/}"
    log " $ID ($TITLE): ${#CONTENT}"
    push_crawled "$ID" "$TITLE" "$CONTENT"
  }
if [ "${#expanded[@]}" -gt 0 ]; then
for i in $(seq 0 $(("${#expanded[@]}"-1))); do
one "${expanded[i]}"
done
else
one ""
fi
}
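
# Thin wrapper over `notes put` (id, title, content).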
push_crawled() {
  notes put "$@"
}
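
# A note is crawlable when its extracted source is nothing but an http(s) URL;
# the check is bounded to the first 300 characters.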
is_crawlable() {
  local crawlable_source="$(extract_crawlable_source "$*")"
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
  echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$"
}
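
# Not implemented yet: the planned link/anchor/image rewriting is only logged
# and the function fails.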
rewrite() {
  log not impl: rewrite "#abc-def" to "#h-abc-def"
  log not impl: rewrite "./asdf" to "./zyxw" or "absolute.com/asdf"
  log not impl: rewrite, change images
  return 1
}
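
# Run main only when executed directly, not when sourced.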
if [ "$0" == "$BASH_SOURCE" ]; then
main "$@"
fi