#!/bin/bash
#
# Crawl notes whose first line ends in a crawlable URL and mirror the referenced
# source (gitlab, gitlab_wiki or google backend) into autogenerated sub-notes.
# The `notes`, backend, cache and rclone helpers are provided by the sourced ./*.sh files.

main() {
  config

  log crawling ids...
  for id in $(crawlable_ids); do
    log crawling id "$id"
    crawl "$id"
  done

  log rewriting ids...
  for id in $(ids); do
    rewrite "$id"
  done
}

config() {
  set -o pipefail
  set -e

  export CACHE="${CACHE:-"$(mktemp -d)"}"
  mkdir -p "$CACHE"
  export CACHE_DURATION=$((60*50))

  export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
  export GITLAB_PAT="$GITLAB_PAT"
  export RCLONE_CONFIG="$RCLONE_CONFIG"
  export RCLONE_CONFIG_PASS="$RCLONE_CONFIG_PASS"

  source ./gitlab.sh
  source ./gitlab_wiki.sh
  source ./google.sh
  source ./rclone.sh
  source ./cache.sh
  source ./notes.sh
}

# Log to stderr, flattening multi-line messages onto a single timestamped line.
log() {
  echo "$(echo "$(date +%H:%M:%S)> $*" | tr '\n' ' ')" >&2
}

ids() {
  notes ids | sort
}

# Print the ids of notes whose content points at a crawlable source, skipping
# notes that live underneath an already-crawlable note (their subtree is autogenerated).
crawlable_ids() {
  local all_ids=($(ids))
  local crawlable_ids=()
  for id in "${all_ids[@]}"; do
    # Skip $id if it is a descendant of an id we already marked crawlable.
    if for crawlable_id in "${crawlable_ids[@]}"; do
         if [ "$id" != "${id#$crawlable_id/}" ]; then
           echo true
         fi
       done | grep -q true; then
      continue
    fi
    local content="$(notes get "$id")"
    if is_crawlable "$content"; then
      crawlable_ids+=("$id")
    fi
  done
  for crawlable_id in "${crawlable_ids[@]}"; do
    echo "$crawlable_id"
  done
}

crawl() {
  _crawl "$@"
}

_crawl() {
  local id="$1"
  local content="$(notes get "$id")"
  local json="$(
    printf '{"content": %s, "id": "%s"}' \
      "$(echo "$content" | jq -Rs)" \
      "$id"
  )"
  local crawlable_source="$(extract_crawlable_source "$content")"
  # Hand the note off to the first backend that recognizes its source URL.
  for backend in gitlab gitlab_wiki google; do
    if $backend is "$crawlable_source"; then
      crawl_with $backend "$json"
      return $?
    fi
  done
  log "unknown backend for $crawlable_source"
  return 1
}

# The crawlable source is the last word of the note's first line, with any
# leading/trailing slash stripped.
extract_crawlable_source() {
  echo "$*" | head -n 1 | awk '{print $NF}' | sed 's/^$//' | sed 's/^\///' | sed 's/\/$//'
}

crawl_with() {
  local backend="$1"
  local json="$2"
  local pid="$(echo "$json" | jq -r .id)"
  local content="$(echo "$json" | jq -r .content)"
  local crawlable_source="$(extract_crawlable_source "$content")"

  # Reset the parent note's body to just the source link before rebuilding its subtree.
  notes put "$pid" "$(notes meta "$pid" | jq -r .Meta.Title)" "$crawlable_source"

  local expanded=($($backend expand "$crawlable_source"))

  log purge $crawlable_source:
  for subid in $(notes ids | grep "^$pid/"); do
    notes del "$subid"
  done

  log expand $crawlable_source:"${#expanded[@]}: ${expanded[@]}"

  notes_mkdir_p() {
    local id="$1"
    local subtitle="${2%/}"
    notes put "$id" "$subtitle" "autogenerated content"
  }

  # Crawl one expanded entry; $1 is a base64-encoded path within the source.
  one() {
    # 10-character content hash, used as a path segment in note ids.
    encode() {
      base64 | md5sum | cut -c 1-10 | awk '{print $1}' | tr -d '\n'
    }

    local i="$1"
    local full_title="$(
      echo "$i" | base64 --decode | grep . || echo "${crawlable_source##*/}"
    )"
    full_title="${full_title%/}"
    full_title="${full_title#/}"
    export TITLE="${full_title##*/}"

    local human_url="$($backend human_url "$crawlable_source" "$i")"

    export CONTENT="$(
      echo "**!! WARNING !!
This page is autogenerated and prone to destruction and replacement**"
      echo "**[See the original]($human_url)**"
      echo ""
      # Tag link targets that start with neither '#' (anchors) nor 'h' (http URLs)
      # so they can be rewritten to absolute URLs below.
      $backend get "$crawlable_source" "$i" \
        | sed 's/](\([^#h]\)/]\(%%%\1/g'
    )"
    # Make the tagged relative targets absolute against the human URL's directory.
    export CONTENT="${CONTENT//"%%%"/"${human_url%/*}/"}"
    # Point image links at the raw file instead of the GitLab /-/tree/ web view.
    export CONTENT="$(
      printf "%s\n" "$CONTENT" \
        | sed 's/!\[\([^]]*\)](\([^)]*\)\/-\/tree\/\([^)]*\))/![\1](\2\/-\/raw\/\3)/g'
    )"

    # Build the note id from the parent id plus a short hash per path segment,
    # creating intermediate "directory" notes along the way.
    export ID="$(
      local sum="$pid/"
      local title_so_far=""
      for subtitle in $(echo "$full_title" | tr '/' '\n' | while read -r subtitle; do echo "$subtitle" | base64; done); do
        local subtitle="$(echo "$subtitle" | base64 --decode)"
        if [ -n "$title_so_far" ]; then
          local mkdir_p_title="${title_so_far%/}"
          mkdir_p_title="${mkdir_p_title##*/}"
          notes_mkdir_p "${sum%/}" "${mkdir_p_title}" >&2
        fi
        sum+="$(echo "$subtitle" | encode)/"
        title_so_far+="$subtitle/"
      done
      echo "$sum"
    )"
    ID="${ID%/}"

    # A source that expands to at most one page is written onto the parent note itself.
    if [ "${#expanded[@]}" -lt 2 ]; then
      ID="$pid"
      TITLE="$(notes meta "$ID" | jq -r .Meta.Title)"
      CONTENT="$(printf "%s\n\n%s" "$crawlable_source" "$CONTENT")"
    fi

    log " $ID ($TITLE): ${#CONTENT}"
    push_crawled "$ID" "$TITLE" "$CONTENT"
    log " /$ID ($TITLE): ${#CONTENT}"
  }

  if [ "${#expanded[@]}" -gt 0 ]; then
    for i in $(seq 0 $(("${#expanded[@]}"-1))); do
      one "${expanded[i]}"
    done
  else
    one ""
  fi
}

push_crawled() {
  notes put "$@"
}

is_crawlable() {
  local crawlable_source="$(extract_crawlable_source "$*")"
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:\-\#--]*"
  echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$"
}

rewrite() {
  log not impl: rewrite "./asdf" to "absolute.com/asdf"
  log not impl: rewrite "#abc-def?f=abc" to "#h-abc-def?f=abc" or better dont depend on query params so much
  log not impl: rewrite, change images
  return 1
}

if [ "$0" == "${BASH_SOURCE[0]}" ]; then
  main "$@"
fi