#!/bin/bash

main() {
  config

  log crawling ids...
  for id in $(crawlable_ids); do
    crawl "$id"
  done

  log rewriting ids...
  for id in $(ids); do
    rewrite "$id"
  done
}

config() {
  set -o pipefail
  set -e

  export CACHE="${CACHE:-"$(mktemp -d)"}"
  mkdir -p "$CACHE"
  export CACHE_DURATION=$((60 * 50)) # cache TTL in seconds (50 minutes)
  export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
  export GITLAB_PAT="$GITLAB_PAT"

  # Helper libraries; this script expects them to provide:
  #   notes ids|get|put|del ...  (notes.sh)
  #   cache get|put <key>        (cache.sh)
  #   gitlab is|expand|get ...   (gitlab.sh, the crawl backend)
  source ./gitlab.sh
  source ./cache.sh
  source ./notes.sh
}

# Log to stderr, prefixed with a timestamp, with newlines flattened to spaces.
log() {
  echo "$(echo "$(date +%H:%M:%S)> $*" | tr '\n' ' ')" >&2
}

ids() {
  notes ids | sort
}

# List ids whose content is crawlable, skipping notes that live under an
# already-crawlable ancestor.
crawlable_ids() {
  local all_ids=($(ids))
  local crawlable_ids=()
  for id in "${all_ids[@]}"; do
    # Skip descendants of a note we already decided to crawl.
    if for crawlable_id in "${crawlable_ids[@]}"; do
         if [ "$id" != "${id#$crawlable_id/}" ]; then
           echo true
         fi
       done | grep -q true; then
      continue
    fi
    local content="$(notes get "$id")"
    if is_crawlable "$content"; then
      crawlable_ids+=("$id")
    fi
  done
  for crawlable_id in "${crawlable_ids[@]}"; do
    echo "$crawlable_id"
  done
}

crawl() {
  local cache_key="crawled $*"
  # TODO: caching is disabled for now.
  if false && cache get "$cache_key"; then
    return
  fi
  _crawl "$@" | cache put "$cache_key"
}

_crawl() {
  local id="$1"
  local content="$(notes get "$id")"
  local json="$(
    printf '{"content": %s, "id": "%s"}' \
      "$(echo "$content" | jq -Rs)" \
      "$id"
  )"
  local crawlable_source="$(extract_crawlable_source "$content")"
  for backend in gitlab; do
    if $backend is "$crawlable_source"; then
      crawl_with $backend "$json"
      return $?
    fi
  done
  log "unknown backend for $crawlable_source"
  return 1
}

# Take the last whitespace-separated field of the first line and strip
# leading/trailing slashes.
extract_crawlable_source() {
  echo "$*" | head -n 1 | awk '{print $NF}' | sed 's/^\///' | sed 's/\/$//'
}

crawl_with() {
  local backend="$1"
  local json="$2"
  local pid="$(echo "$json" | jq -r .id)"
  local content="$(echo "$json" | jq -r .content)"
  local crawlable_source="$(extract_crawlable_source "$content")"
  local expanded=($($backend expand "$crawlable_source"))

  # Drop any previously generated children of this note.
  log purge $crawlable_source:
  for subid in $(notes ids | grep "^$pid/"); do
    notes del "$subid"
  done

  log expand $crawlable_source:

  # Create an intermediate (directory-like) note so nested ids have a parent.
  notes_mkdir_p() {
    local id="$1"
    local subtitle="${2%/}"
    notes put "$id" "$subtitle" "autogenerated content"
  }

  # Crawl a single expanded entry ($1 is a base64-encoded path, or empty).
  one() {
    # Short, stable id segment derived from the subtitle.
    encode() {
      base64 | md5sum | cut -c 1-10 | awk '{print $1}' | tr -d '\n'
    }

    local i="$1"
    # The entry is base64-encoded; fall back to the source's basename when it
    # decodes to nothing.
    local full_title="$(
      echo "$i" | base64 --decode | grep . \
        || echo "${crawlable_source##*/}"
    )"
    full_title="${full_title%/}"
    full_title="${full_title#/}"

    export TITLE="${full_title##*/}"
    export CONTENT="$(
      $backend get "$crawlable_source" "$i"
    )"
    export ID="$(
      # Build the id one path segment at a time, creating parent notes as we go.
      local sum="$pid/"
      local title_so_far=""
      for subtitle in $(echo $full_title | tr '/' '\n' | while read -r subtitle; do echo "$subtitle" | base64; done); do
        local subtitle="$(echo "$subtitle" | base64 --decode)"
        if [ -n "$title_so_far" ]; then
          local mkdir_p_title="${title_so_far%/}"
          mkdir_p_title="${mkdir_p_title##*/}"
          notes_mkdir_p "${sum%/}" "${mkdir_p_title}" >&2
        fi
        sum+="$(echo "$subtitle" | encode)/"
        title_so_far+="$subtitle/"
      done
      echo "$sum"
    )"
    ID="${ID%/}"

    log " $ID ($TITLE): ${#CONTENT}"
    # ID already carries the parent note id prefix.
    push_crawled "$ID" "$TITLE" "$CONTENT"
  }

  if [ "${#expanded[@]}" -gt 0 ]; then
    for i in $(seq 0 $((${#expanded[@]} - 1))); do
      one "${expanded[i]}"
    done
  else
    one ""
  fi
}

push_crawled() {
  notes put "$@"
}

is_crawlable() {
  local crawlable_source="$(extract_crawlable_source "$*")"
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
  echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$"
}

rewrite() {
  log not impl: rewrite "#abc-def" to "#h-abc-def"
  log not impl: rewrite "./asdf" to "./zyxw" or "absolute.com/asdf"
  log not impl: rewrite, change images
  return 1
}

if [ "$0" == "${BASH_SOURCE[0]}" ]; then
  main "$@"
fi