#! /bin/bash

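# Crawl the external sources referenced by notes and mirror their content back
# into the notes service as autogenerated pages. The backend and helper scripts
# (gitlab.sh, gitlab_wiki.sh, google.sh, rclone.sh, cache.sh, notes.sh) are
# sourced from the current working directory by config().

# main: crawl every crawlable note id, then run the (not yet implemented)
# rewrite pass over all ids.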
main() {
  config

  log crawling ids...
  for id in $(crawlable_ids); do
    log crawling id $id
    crawl "$id"
  done

  log rewriting ids...
  for id in $(ids); do
    rewrite "$id"
  done
}

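# config: enable strict shell options, export the environment the backends and
# helpers expect (cache dir, notes address, GitLab and rclone credentials), and
# source the backend/helper scripts from the current working directory.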
config() {
  set -o pipefail
  set -e

  export CACHE="${CACHE:-"$(mktemp -d)"}"
  mkdir -p "$CACHE"
  export CACHE_DURATION=$((60*50))
  export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
  export GITLAB_PAT="$GITLAB_PAT"
  export RCLONE_CONFIG="$RCLONE_CONFIG"
  export RCLONE_CONFIG_PASS="$RCLONE_CONFIG_PASS"

  source ./gitlab.sh
  source ./gitlab_wiki.sh
  source ./google.sh
  source ./rclone.sh
  source ./cache.sh
  source ./notes.sh
}

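# log: timestamped single-line logging to stderr (embedded newlines become spaces).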
log() {
  echo "$(echo "$(date +%H:%M:%S)> $*" | tr '\n' ' ')" >&2
}

ids() {
  notes ids | sort
}

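# crawlable_ids: note ids whose content names a crawlable source URL, skipping
# ids nested under an already-selected crawlable ancestor (crawling the ancestor
# regenerates those children).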
crawlable_ids() {
  local all_ids=($(ids))
  local crawlable_ids=()
  for id in "${all_ids[@]}"; do
    # skip ids that live under an already-selected crawlable ancestor
    if for crawlable_id in "${crawlable_ids[@]}"; do
      if [ "$id" != "${id#$crawlable_id/}" ]; then
        echo true
      fi
    done | grep -q true; then
      continue
    fi
    local content="$(notes get "$id")"
    if is_crawlable "$content"; then
      crawlable_ids+=("$id")
    fi
  done
  for crawlable_id in "${crawlable_ids[@]}"; do
    echo "$crawlable_id"
  done
}

crawl() {
  _crawl "$@"
}

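# _crawl: read the note, package its id and content as JSON, and dispatch to the
# first backend (gitlab, gitlab_wiki, google) that recognizes the crawlable source.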
_crawl() {
  local id="$1"
  local content="$(notes get "$id")"
  local json="$(
    printf '{"content": %s, "id": "%s"}' \
      "$(echo "$content" | jq -Rs)" \
      "$id"
  )"
  local crawlable_source="$(extract_crawlable_source "$content")"
  for backend in gitlab gitlab_wiki google; do
    if $backend is "$crawlable_source"; then
      crawl_with $backend "$json"
      return $?
    fi
  done
  log "unknown backend for $crawlable_source"
  return 1
}

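# extract_crawlable_source: last word of the first line, with surrounding <> and
# leading/trailing slashes stripped.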
extract_crawlable_source() {
  echo "$*" | head -n 1 | awk '{print $NF}' | sed 's/^<//' | sed 's/>$//' | sed 's/^\///' | sed 's/\/$//'
}

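# crawl_with: crawl one source with the given backend: point the parent note at
# the source, purge previously generated child notes, expand the source into
# items, and push one note per item (or write a single item into the parent note).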
crawl_with() {
  local backend="$1"
  local json="$2"
  local pid="$(echo "$json" | jq -r .id)"

  local content="$(echo "$json" | jq -r .content)"
  local crawlable_source="$(extract_crawlable_source "$content")"

  notes put "$pid" "$(notes meta "$pid" | jq -r .Meta.Title)" "$crawlable_source"

  local expanded=($($backend expand "$crawlable_source"))

  log purge $crawlable_source:
  for subid in $(notes ids | grep "^$pid/"); do
    notes del "$subid"
  done

  log expand $crawlable_source:"${#expanded[@]}: ${expanded[@]}"

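  # notes_mkdir_p: write a placeholder parent note so that nested child pages
  # have an entry at each intermediate level.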
  notes_mkdir_p() {
    local id="$1"
    local subtitle="${2%/}"
    notes put "$id" "$subtitle" "autogenerated content"
  }

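  # one: crawl a single expanded item. $1 is the item's base64-encoded path as
  # returned by "$backend expand" (empty when the source did not expand); builds
  # the note id, title, and content, then pushes them via push_crawled.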
  one() {
    encode() {
      base64 | md5sum | cut -c 1-10 | awk '{print $1}' | tr -d '\n'
    }
    local i="$1"
    local full_title="$(
      echo "$i" | base64 --decode | grep . || echo "${crawlable_source##*/}"
    )"
    full_title="${full_title%/}"
    full_title="${full_title#/}"
    export TITLE="${full_title##*/}"
    local human_url="$($backend human_url "$crawlable_source" "$i")"
    export CONTENT="$(
      echo "**!! WARNING !! This page is autogenerated and prone to destruction and replacement**"
      echo "**[See the original]($human_url)**"
      echo ""
      # prefix relative link targets (anything not starting with '#' or 'h') with a %%% placeholder
      $backend get "$crawlable_source" "$i" \
        | sed 's/](\([^#h]\)/]\(%%%\1/g'
    )"
    # replace the placeholder with the directory of the original page URL, making relative links absolute
    export CONTENT="${CONTENT//"%%%"/"${human_url%/*}/"}"
    # drop markdown images whose URL points at a /-/tree/ path
    export CONTENT="$(
      printf "%s\n" "$CONTENT" \
        | sed 's/!\[\([^]]*\)](\([^)]*\)\/-\/tree\/\([^)]*\))//g'
    )"
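    # Derive the note id: walk the '/'-separated segments of full_title, hash
    # each segment into a short id component (encode: base64 | md5sum, first 10
    # chars), and create placeholder parent notes for intermediate levels.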
    export ID="$(
      local sum="$pid/"
      local title_so_far=""
      for subtitle in $(echo $full_title | tr '/' '\n' | while read -r subtitle; do echo "$subtitle" | base64; done); do
        local subtitle="$(echo "$subtitle" | base64 --decode)"
        if [ -n "$title_so_far" ]; then
          local mkdir_p_title="${title_so_far%/}"
          mkdir_p_title="${mkdir_p_title##*/}"
          notes_mkdir_p "${sum%/}" "${mkdir_p_title}" >&2
        fi
        sum+="$(echo "$subtitle" | encode)/"
        title_so_far+="$subtitle/"
      done
      echo "$sum"
    )"
    ID="${ID%/}"
if [ "${#expanded[@]}" -lt 2 ]; then
|
|
ID="$pid"
|
|
TITLE="$(notes meta "$ID" | jq -r .Meta.Title)"
|
|
CONTENT="$(printf "%s\n\n%s", "$crawlable_source" "$CONTENT")"
|
|
fi
|
|
log " $ID ($TITLE): ${#CONTENT}"
|
|
push_crawled "$ID" "$TITLE" "$CONTENT"
|
|
log " /$ID ($TITLE): ${#CONTENT}"
|
|
}
|
|
if [ "${#expanded[@]}" -gt 0 ]; then
|
|
for i in $(seq 0 $(("${#expanded[@]}"-1))); do
|
|
one "${expanded[i]}"
|
|
done
|
|
else
|
|
one ""
|
|
fi
|
|
}
|
|
|
|
push_crawled() {
  notes put "$@"
}

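# is_crawlable: true when the extracted source looks like an http(s) URL.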
is_crawlable() {
  local crawlable_source="$(extract_crawlable_source "$*")"
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:\-\#--]*"
  echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$"
}

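# rewrite: placeholder for the link/anchor/image rewriting pass; not yet implemented.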
rewrite() {
  log not impl: rewrite "./asdf" to "absolute.com/asdf"
  log not impl: rewrite "#abc-def?f=abc" to "#h-abc-def?f=abc" or better dont depend on query params so much
  log not impl rewrite, change images
  return 1
}

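# Run main only when executed directly, not when sourced.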
if [ "$0" == "$BASH_SOURCE" ]; then
|
|
main "$@"
|
|
fi
|