notea-de-me/crawler/main.sh

#!/bin/bash
# Crawl every note whose first line points at an external source, then rewrite
# links in all notes.
main() {
  config
  log crawling ids...
  for id in $(crawlable_ids); do
    log crawling id $id
    crawl "$id"
  done
  log rewriting ids...
  for id in $(ids); do
    rewrite "$id"
  done
}
config() {
  set -o pipefail
  set -e
  # reuse the caller's cache dir if provided, otherwise a fresh temp dir
  export CACHE="${CACHE:-"$(mktemp -d)"}"
  mkdir -p "$CACHE"
  export CACHE_DURATION=$((60*50))
  export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
  export GITLAB_PAT="$GITLAB_PAT"
  export RCLONE_CONFIG="$RCLONE_CONFIG"
  export RCLONE_CONFIG_PASS="$RCLONE_CONFIG_PASS"
  source ./gitlab.sh
  source ./gitlab_wiki.sh
  source ./google.sh
  source ./rclone.sh
  source ./cache.sh
  source ./notes.sh
}
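# Example invocation (a sketch; the GITLAB_PAT below is a placeholder, and the
# rclone variables are left to whatever ./rclone.sh expects):
#   CACHE=/tmp/crawler-cache \
#   NOTES_ADDR=http://localhost:3004 \
#   GITLAB_PAT=glpat-xxxxxxxx \
#   ./main.sh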
# timestamped, single-line logging to stderr (embedded newlines become spaces)
log() {
  echo "$(echo "$(date +%H:%M:%S)> $*" | tr '\n' ' ')" >&2
}
ids() {
  notes ids | sort
}
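# Keep only notes that are themselves crawlable, skipping anything nested under
# an already-crawlable note. Because ids are sorted, a parent such as "abc" is
# seen before its children, so e.g. "abc/def" (illustrative ids) is never
# fetched or checked on its own.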
crawlable_ids() {
  local all_ids=($(ids))
  local crawlable_ids=()
  for id in "${all_ids[@]}"; do
    # skip ids that live under a crawlable id we already collected
    if for crawlable_id in "${crawlable_ids[@]}"; do
      if [ "$id" != "${id#$crawlable_id/}" ]; then
        echo true
      fi
    done | grep -q true; then
      continue
    fi
    local content="$(notes get "$id")"
    if is_crawlable "$content"; then
      crawlable_ids+=("$id")
    fi
  done
  for crawlable_id in "${crawlable_ids[@]}"; do
    echo "$crawlable_id"
  done
}
crawl() {
  _crawl "$@"
}
_crawl() {
  local id="$1"
  local content="$(notes get "$id")"
  local json="$(
    printf '{"content": %s, "id": "%s"}' \
      "$(echo "$content" | jq -Rs)" \
      "$id"
  )"
  local crawlable_source="$(extract_crawlable_source "$content")"
  # hand the note to the first backend that recognizes its source URL
  for backend in gitlab gitlab_wiki google; do
    if $backend is "$crawlable_source"; then
      crawl_with $backend "$json"
      return $?
    fi
  done
  log "unknown backend for $crawlable_source"
  return 1
}
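# The source URL is the last whitespace-separated field on the note's first
# line, with any surrounding <...> and leading/trailing slashes stripped.
# e.g. (illustrative) a first line ending in "<https://gitlab.com/group/project/>"
# yields "https://gitlab.com/group/project".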
extract_crawlable_source() {
  echo "$*" | head -n 1 | awk '{print $NF}' | sed 's/^<//' | sed 's/>$//' | sed 's/^\///' | sed 's/\/$//'
}
crawl_with() {
  local backend="$1"
  local json="$2"
  local pid="$(echo "$json" | jq -r .id)"
  local content="$(echo "$json" | jq -r .content)"
  local crawlable_source="$(extract_crawlable_source "$content")"
  # reset the parent note to just its title and source, then drop stale children
  notes put "$pid" "$(notes meta "$pid" | jq -r .Meta.Title)" "$crawlable_source"
  local expanded=($($backend expand "$crawlable_source"))
  log purge $crawlable_source:
  for subid in $(notes ids | grep "^$pid/"); do
    notes del "$subid"
  done
  log expand $crawlable_source:"${#expanded[@]}: ${expanded[@]}"
  notes_mkdir_p() {
    local id="$1"
    local subtitle="${2%/}"
    notes put "$id" "$subtitle" "autogenerated content"
  }
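  # one() pushes a single expanded page. Each path segment of the page title is
  # turned into an id component via encode(): the first 10 hex chars of
  # md5(base64(segment)). e.g. (illustrative) a page "docs/setup.md" under
  # parent $pid ends up at "$pid/<encode docs>/<encode setup.md>".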
  one() {
    encode() {
      base64 | md5sum | cut -c 1-10 | awk '{print $1}' | tr -d '\n'
    }
    local i="$1"
    # the expanded entry is a base64-encoded title path; an empty entry falls
    # back to the last path component of the source itself
    local full_title="$(
      echo "$i" | base64 --decode | grep . || echo "${crawlable_source##*/}"
    )"
    full_title="${full_title%/}"
    full_title="${full_title#/}"
    export TITLE="${full_title##*/}"
    local human_url="$($backend human_url "$crawlable_source" "$i")"
    export CONTENT="$(
      echo "**!! WARNING !! This page is autogenerated and prone to destruction and replacement**"
      echo "**[See the original]($human_url)**"
      echo ""
      # mark relative markdown links (anything not starting with '#' or 'h') with %%%
      $backend get "$crawlable_source" "$i" \
        | sed 's/](\([^#h]\)/]\(%%%\1/g'
    )"
    # resolve the %%% marker against the directory of the human-facing URL
    export CONTENT="${CONTENT//"%%%"/"${human_url%/*}/"}"
    # rewrite image links from /-/tree/ to /-/raw/ so they serve the file itself
    export CONTENT="$(
      printf "%s\n" "$CONTENT" \
        | sed 's/!\[\([^]]*\)](\([^)]*\)\/-\/tree\/\([^)]*\))/![\1](\2\/-\/raw\/\3)/g'
    )"
    export ID="$(
      local sum="$pid/"
      local title_so_far=""
      # walk the title path segment by segment, creating intermediate notes as we go
      for subtitle in $(echo $full_title | tr '/' '\n' | while read -r subtitle; do echo "$subtitle" | base64; done); do
        local subtitle="$(echo "$subtitle" | base64 --decode)"
        if [ -n "$title_so_far" ]; then
          local mkdir_p_title="${title_so_far%/}"
          mkdir_p_title="${mkdir_p_title##*/}"
          notes_mkdir_p "${sum%/}" "${mkdir_p_title}" >&2
        fi
        sum+="$(echo "$subtitle" | encode)/"
        title_so_far+="$subtitle/"
      done
      echo "$sum"
    )"
    ID="${ID%/}"
if [ "${#expanded[@]}" -lt 2 ]; then
ID="$pid"
TITLE="$(notes meta "$ID" | jq -r .Meta.Title)"
CONTENT="$(printf "%s\n\n%s", "$crawlable_source" "$CONTENT")"
fi
log " $ID ($TITLE): ${#CONTENT}"
push_crawled "$ID" "$TITLE" "$CONTENT"
log " /$ID ($TITLE): ${#CONTENT}"
}
if [ "${#expanded[@]}" -gt 0 ]; then
for i in $(seq 0 $(("${#expanded[@]}"-1))); do
one "${expanded[i]}"
done
else
one ""
fi
}
push_crawled() {
  notes put "$@"
}
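# A note is crawlable when the last field of its first line, after stripping
# any <...> wrapper, is a plain http(s) URL and nothing else.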
is_crawlable() {
  local crawlable_source="$(extract_crawlable_source "$*")"
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:\-\#--]*"
  echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$"
}
rewrite() {
  log "not impl: rewrite ./asdf to absolute.com/asdf"
  log "not impl: rewrite #abc-def?f=abc to #h-abc-def?f=abc, or better, don't depend on query params so much"
  log "not impl: rewrite, change images"
  return 1
}
if [ "$0" == "$BASH_SOURCE" ]; then
main "$@"
fi