notea-de-me/crawler/main.sh

#!/bin/bash
# Crawl every note whose first line points at an external source, then rewrite
# links in all notes.
main() {
  config
  log crawling ids...
  for id in $(crawlable_ids); do
    log crawling id $id
    crawl "$id"
  done
  log rewriting ids...
  for id in $(ids); do
    rewrite "$id"
  done
}
config() {
  set -o pipefail
  set -e
  # reuse the caller's cache dir if provided, otherwise a fresh temp dir
  export CACHE="${CACHE:-"$(mktemp -d)"}"
  mkdir -p "$CACHE"
  export CACHE_DURATION=$((60*50))
  export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
  export GITLAB_PAT="$GITLAB_PAT"
  export RCLONE_CONFIG="$RCLONE_CONFIG"
  export RCLONE_CONFIG_PASS="$RCLONE_CONFIG_PASS"
  source ./gitlab.sh
  source ./gitlab_wiki.sh
  source ./google.sh
  source ./rclone.sh
  source ./cache.sh
  source ./notes.sh
}
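# Example invocation (a sketch; the GITLAB_PAT below is a placeholder, and the
# rclone variables are left to whatever ./rclone.sh expects):
#   CACHE=/tmp/crawler-cache \
#   NOTES_ADDR=http://localhost:3004 \
#   GITLAB_PAT=glpat-xxxxxxxx \
#   ./main.sh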
# timestamped, single-line logging to stderr (embedded newlines become spaces)
log() {
  echo "$(echo "$(date +%H:%M:%S)> $*" | tr '\n' ' ')" >&2
}
ids() {
  notes ids | sort
}
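# Keep only notes that are themselves crawlable, skipping anything nested under
# an already-crawlable note. Because ids are sorted, a parent such as "abc" is
# seen before its children, so e.g. "abc/def" (illustrative ids) is never
# fetched or checked on its own.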
crawlable_ids() {
  local all_ids=($(ids))
  local crawlable_ids=()
  for id in "${all_ids[@]}"; do
    # skip ids that live under a crawlable id we already collected
    if for crawlable_id in "${crawlable_ids[@]}"; do
      if [ "$id" != "${id#$crawlable_id/}" ]; then
        echo true
      fi
    done | grep -q true; then
      continue
    fi
    local content="$(notes get "$id")"
    if is_crawlable "$content"; then
      crawlable_ids+=("$id")
    fi
  done
  for crawlable_id in "${crawlable_ids[@]}"; do
    echo "$crawlable_id"
  done
}
crawl() {
  _crawl "$@"
}
_crawl() {
  local id="$1"
  local content="$(notes get "$id")"
  local json="$(
    printf '{"content": %s, "id": "%s"}' \
      "$(echo "$content" | jq -Rs)" \
      "$id"
  )"
  local crawlable_source="$(extract_crawlable_source "$content")"
  # hand the note to the first backend that recognizes its source URL
  for backend in gitlab gitlab_wiki google; do
    if $backend is "$crawlable_source"; then
      crawl_with $backend "$json"
      return $?
    fi
  done
  log "unknown backend for $crawlable_source"
  return 1
}
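# The source URL is the last whitespace-separated field on the note's first
# line, with any surrounding <...> and leading/trailing slashes stripped.
# e.g. (illustrative) a first line ending in "<https://gitlab.com/group/project/>"
# yields "https://gitlab.com/group/project".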
extract_crawlable_source() {
  echo "$*" | head -n 1 | awk '{print $NF}' | sed 's/^<//' | sed 's/>$//' | sed 's/^\///' | sed 's/\/$//'
}
crawl_with() {
  local backend="$1"
  local json="$2"
  local pid="$(echo "$json" | jq -r .id)"
  local content="$(echo "$json" | jq -r .content)"
  local crawlable_source="$(extract_crawlable_source "$content")"
  # reset the parent note to just its title and source, then drop stale children
  notes put "$pid" "$(notes meta "$pid" | jq -r .Meta.Title)" "$crawlable_source"
  local expanded=($($backend expand "$crawlable_source"))
  log purge $crawlable_source:
  for subid in $(notes ids | grep "^$pid/"); do
    notes del "$subid"
  done
  log expand $crawlable_source:"${#expanded[@]}: ${expanded[@]}"
  notes_mkdir_p() {
    local id="$1"
    local subtitle="${2%/}"
    notes put "$id" "$subtitle" "autogenerated content"
  }
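  # one() pushes a single expanded page. Each path segment of the page title is
  # turned into an id component via encode(): the first 10 hex chars of
  # md5(base64(segment)). e.g. (illustrative) a page "docs/setup.md" under
  # parent $pid ends up at "$pid/<encode docs>/<encode setup.md>".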
  one() {
    encode() {
      base64 | md5sum | cut -c 1-10 | awk '{print $1}' | tr -d '\n'
    }
    local i="$1"
    # the expanded entry is a base64-encoded title path; an empty entry falls
    # back to the last path component of the source itself
    local full_title="$(
      echo "$i" | base64 --decode | grep . || echo "${crawlable_source##*/}"
    )"
    full_title="${full_title%/}"
    full_title="${full_title#/}"
    export TITLE="${full_title##*/}"
    local human_url="$($backend human_url "$crawlable_source" "$i")"
    export CONTENT="$(
      echo "**!! WARNING !! This page is autogenerated and prone to destruction and replacement**"
      echo "**[See the original]($human_url)**"
      echo ""
      # mark relative markdown links (anything not starting with '#' or 'h') with %%%
      $backend get "$crawlable_source" "$i" \
        | sed 's/](\([^#h]\)/]\(%%%\1/g'
    )"
    # resolve the %%% marker against the directory of the human-facing URL
    export CONTENT="${CONTENT//"%%%"/"${human_url%/*}/"}"
    # rewrite image links from /-/tree/ to /-/raw/ so they serve the file itself
    export CONTENT="$(
      printf "%s\n" "$CONTENT" \
        | sed 's/!\[\([^]]*\)](\([^)]*\)\/-\/tree\/\([^)]*\))/![\1](\2\/-\/raw\/\3)/g'
    )"
    export ID="$(
      local sum="$pid/"
      local title_so_far=""
      # walk the title path segment by segment, creating intermediate notes as we go
      for subtitle in $(echo $full_title | tr '/' '\n' | while read -r subtitle; do echo "$subtitle" | base64; done); do
        local subtitle="$(echo "$subtitle" | base64 --decode)"
        if [ -n "$title_so_far" ]; then
          local mkdir_p_title="${title_so_far%/}"
          mkdir_p_title="${mkdir_p_title##*/}"
          notes_mkdir_p "${sum%/}" "${mkdir_p_title}" >&2
        fi
        sum+="$(echo "$subtitle" | encode)/"
        title_so_far+="$subtitle/"
      done
      echo "$sum"
    )"
    ID="${ID%/}"
if [ "${#expanded[@]}" -lt 2 ]; then
ID="$pid"
TITLE="$(notes meta "$ID" | jq -r .Meta.Title)"
CONTENT="$(printf "%s\n\n%s", "$crawlable_source" "$CONTENT")"
fi
log " $ID ($TITLE): ${#CONTENT}"
push_crawled "$ID" "$TITLE" "$CONTENT"
log " /$ID ($TITLE): ${#CONTENT}"
}
if [ "${#expanded[@]}" -gt 0 ]; then
for i in $(seq 0 $(("${#expanded[@]}"-1))); do
one "${expanded[i]}"
done
else
one ""
fi
}
push_crawled() {
  notes put "$@"
}
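# A note is crawlable when the last field of its first line, after stripping
# any <...> wrapper, is a plain http(s) URL and nothing else.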
is_crawlable() {
  local crawlable_source="$(extract_crawlable_source "$*")"
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:\-\#--]*"
  echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$"
}
rewrite() {
  log "not impl: rewrite ./asdf to absolute.com/asdf"
  log "not impl: rewrite #abc-def?f=abc to #h-abc-def?f=abc, or better, don't depend on query params so much"
  log "not impl: rewrite, change images"
  return 1
}
if [ "$0" == "$BASH_SOURCE" ]; then
main "$@"
fi