#! /bin/bash
#
# Walks every note id reported by a Notea server, crawls the source a note
# points at (only a gitlab backend is wired up) and then rewrites the note.
# crawl_with and rewrite are still stubs that log "not impl" and fail.
#
# Environment:
#   CACHE        directory for cached responses (default: a fresh mktemp -d)
#   NOTEA_ADDR   base URL of the Notea server (default: http://localhost:3000)
#   GITLAB_PAT   token sent as "Authorization: Bearer ..." to GitLab
#   GITLAB_ADDR  base URL of the GitLab server
#                (default: https://gitlab-app.eng.qops.net)

# Entry point: for every note id, crawl it when due, then rewrite it.
# Runs under the `set -e` / pipefail enabled by config, so the first failing
# crawl or rewrite aborts the run.
main() {
  config
  local id
  for id in $(ids); do
    if should_crawl "$id"; then
      crawl "$id"
    fi
    rewrite "$id"
  done
}

# Enables fail-fast shell options and exports the settings the other
# functions read. CACHE and NOTEA_ADDR keep a value already present in the
# environment; the two durations (seconds) are always reset to 5 minutes.
config() {
  set -o pipefail
  set -e
  export CACHE="${CACHE:-"$(mktemp -d)"}"
  export CACHE_DURATION=$((60*5))
  export CRAWL_INTERVAL=$((60*5))
  export NOTEA_ADDR="${NOTEA_ADDR:-"http://localhost:3000"}"
  export GITLAB_PAT="$GITLAB_PAT"
}

# Writes a timestamped message to stderr.
log() {
  echo "$(date)> $*" >&2
}

# Prints the note ids known to Notea, one per line.
ids() {
  notea ids
}

# Returns 0 when note $1 is due for a crawl: either it has no marker file
# "$CACHE/crawled.<id>" or the marker is at least CRAWL_INTERVAL seconds old.
# Returns 1 when the marker is more recent than that.
# NOTE(review): nothing in this file creates the marker yet.
should_crawl() {
  local f="$CACHE/crawled.$1"
  if [ ! -f "$f" ]; then
    return 0
  fi
  local last_crawled now
  last_crawled=$(date -r "$f" +%s)
  now=$(date +%s)
  # A marker younger than the interval means "crawled recently": skip.
  if ((now - last_crawled < CRAWL_INTERVAL)); then
    return 1
  fi
  return 0
}

# Fetches note $1 and, if its content is crawlable, hands the note JSON to
# the first backend that claims the source URL.
# Returns 0 when the note is not crawlable, the backend's status when one
# matches, 1 when no backend matches, and the failing command's status when
# the note cannot be fetched or parsed.
crawl() {
  local id="$1"
  local json content first_line crawlable_source backend
  # Declared separately from the assignment so a failed fetch/parse is not
  # masked by the exit status of `local`.
  json="$(notea get "$id")" || return
  content="$(jq -r .content <<<"$json")" || return
  if ! is_crawlable "$content"; then
    return 0
  fi
  # The source URL is the last whitespace-separated field of the first line.
  first_line="${content%%$'\n'*}"
  crawlable_source="$(awk '{print $NF}' <<<"$first_line")"
  for backend in gitlab; do
    if "$backend" is "$crawlable_source"; then
      crawl_with "$backend" "$json"
      return $?
    fi
  done
  log "unknown backend for $crawlable_source"
  return 1
}

# Stub: crawl note JSON $2 with backend $1. Not implemented; always fails.
crawl_with() {
  local backend="$1"
  local json="$2"
  log not impl crawl with
  return 1
}

# Returns 0 when the given note content is either nothing but a single
# http(s) URL, or starts with a line of the form "_source_: <url>".
# Produces no output.
is_crawlable() {
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
  local content="$*"
  local first_line="${content%%$'\n'*}"
  # -q: only the exit status matters; the match must not leak to stdout.
  if tr -d '\n' <<<"$content" | grep -qE "^[ ]*$url_pattern[ ]*$"; then
    return 0
  fi
  if grep -qE "^[ ]*_source_: $url_pattern[ ]*$" <<<"$first_line"; then
    return 0
  fi
  return 1
}

# Stub: rewrite a note. Not implemented; always fails.
rewrite() {
  log not impl rewrite
  return 1
}

# Notea client. Usage: notea <ids|get ID>. Runs in a subshell so the helper
# functions defined here do not replace same-named top-level functions.
notea() (
  ncurl() {
    curl -sS "$@"
  }

  # Prints every item id of the Notea tree except "root".
  ids() {
    ncurl "$NOTEA_ADDR/api/tree" \
      | jq -r '.items | to_entries[].value.id' \
      | grep -v '^root$'
  }

  # Prints the JSON of note $1, served from the cache when possible.
  get() {
    local cache_key="notea cache $1"
    if cache get "$cache_key"; then
      return 0
    fi
    _get "$@" | cache put "$cache_key"
  }

  _get() {
    ncurl "$NOTEA_ADDR/api/notes/$1"
  }

  "$@"
)

# File cache under $CACHE. Usage: cache <path|get|put> KEY...
#   path KEY  prints the file used for KEY
#   get KEY   prints the cached value; returns 1 on a miss
#   put KEY   stores stdin under KEY and echoes it to stdout
cache() (
  path() {
    local name
    # base64 may wrap long output and uses '/' and '+'; join the lines and
    # map those two characters so the key is always a single file name.
    name="$(printf '%s\n' "$*" | base64 | tr -d '\n' | tr '/+' '_-')"
    printf '%s\n' "$CACHE/$name"
  }

  get() {
    local file created now
    file="$(path "$*")"
    # A missing or empty entry is a miss.
    if ! [ -f "$file" ] || ! [ -s "$file" ]; then
      return 1
    fi
    created="$(date -r "$file" +%s)"
    now="$(date +%s)"
    if ((now - created > CACHE_DURATION)); then
      return 1
    fi
    cat "$file"
  }

  put() {
    local file
    file="$(path "$*")"
    tee "$file"
  }

  "$@"
)

# GitLab backend. Usage: gitlab <is URL|expand URL|...>. Runs in a subshell
# so its helpers stay private.
gitlab() (
  # Returns 0 when the argument looks like a non-wiki URL of this GitLab.
  is() {
    echo "$*" | grep -q gitlab.app && ! echo "$*" | grep -q '/wiki/'
  }

  # curl with the GitLab token, cached by its argument list.
  gcurl() {
    local cache_key="gitlab gcurl $*"
    if cache get "$cache_key"; then
      return 0
    fi
    _gcurl "$@" | cache put "$cache_key"
  }

  _gcurl() {
    curl -sS -H "Authorization: Bearer $GITLAB_PAT" "$@"
  }

  # Prints the sorted file paths found below the tree/blob URL $1 (cached).
  expand() {
    local cache_key="gitlab expand $*"
    if cache get "$cache_key"; then
      return 0
    fi
    _expand "$@" | sort | cache put "$cache_key"
  }

  _expand() {
    local url="$1"
    local project="$(url_to_project_root "$url" | head -n 1)"
    local root="$(url_to_project_root "$url" | tail -n 1)"
    __expand "$project" "$root"
  }

  # Splits a GitLab URL into two output lines: the project path and the
  # path inside the repository. The second line is empty when the URL has
  # no "/-/" part or names only a ref.
  # The ref is assumed to be a single path component (no "/" in it).
  url_to_project_root() {
    local url="$1"
    local url_path="${url#http*://gitlab*.net/}"
    local project="${url_path%%/-/*}"
    local root=""
    if [[ "$url_path" == */-/* ]]; then
      root="${url_path#*/-/}"
      root="${root#tree/}"
      root="${root#blob/}"
      # Drop the ref component; with nothing after it there is no sub-path.
      if [[ "$root" == */* ]]; then
        root="${root#*/}"
      else
        root=""
      fi
    fi
    echo "$project"
    echo "$root"
  }

  # Returns 0 when $1 is equal to one of the remaining arguments.
  _contains() {
    local needle="$1" item
    shift
    for item in "$@"; do
      if [ "$item" = "$needle" ]; then
        return 0
      fi
    done
    return 1
  }

  # Prints every blob path reachable from tree $2 (default "/") of project
  # $1, one per line, each path once. Trees found in a listing are queued
  # and listed in turn.
  __expand() {
    local project="$1"
    local root="${2:-"/"}"
    local files=()
    local trees=("$root")
    local i=0
    local listing entry

    # Prints the .path of every entry of type $1 in the JSON listing $2...
    find_each() {
      local type="$1"
      shift
      jq -r --arg t "$type" '.[] | select(.type == $t) | .path' <<<"$*"
    }

    while [ "$i" -lt "${#trees[@]}" ]; do
      listing="$(list_tree "$project" "${trees[i]}")"
      # Paths are read line by line so names containing spaces stay whole.
      while IFS= read -r entry; do
        [ -n "$entry" ] || continue
        if ! _contains "$entry" "${trees[@]}"; then
          trees+=("$entry")
        fi
      done < <(find_each "tree" "$listing")
      while IFS= read -r entry; do
        [ -n "$entry" ] || continue
        if ! _contains "$entry" "${files[@]}"; then
          files+=("$entry")
        fi
      done < <(find_each "blob" "$listing")
      i=$((i+1))
    done

    for entry in "${files[@]}"; do
      printf '%s\n' "$entry"
    done
  }

  # Prints the recursive repository-tree listing of path $2 in project $1.
  list_tree() {
    local project="$(urlencode "$1")"
    local tree_path="$(urlencode "$2")"
    local base="${GITLAB_ADDR:-https://gitlab-app.eng.qops.net}"
    local api="api/v4/projects/$project/repository/tree"
    local query="recursive=true&path=$tree_path"
    gcurl "${base%/}/$api?$query"
  }

  "$@"
)

# Percent-encodes $1: ASCII letters, digits and ".~_-" pass through, every
# other character is written as %XX of its character code.
# NOTE(review): non-ASCII characters are not expanded to UTF-8 bytes here.
urlencode() (
  # C collation keeps the a-z/A-Z ranges below limited to ASCII letters.
  local LC_COLLATE=C
  local length="${#1}"
  local i c
  for (( i = 0; i < length; i++ )); do
    c="${1:$i:1}"
    case $c in
      [a-zA-Z0-9.~_-]) printf '%s' "$c" ;;
      *) printf '%%%02X' "'$c" ;;
    esac
  done
)

# Run main only when executed directly, not when sourced.
if [ "$0" == "${BASH_SOURCE[0]}" ]; then
  main "$@"
fi