diff --git a/app/crawler/main.sh b/app/crawler/main.sh
index a565844..896a6e1 100644
--- a/app/crawler/main.sh
+++ b/app/crawler/main.sh
@@ -13,9 +13,11 @@ main() {
 config() {
     set -o pipefail
     set -e
-    export CACHE=$(mktemp -d)
+    export CACHE="${CACHE:-"$(mktemp -d)"}"
+    export CACHE_DURATION=$((60*5))
     export CRAWL_INTERVAL=$((60*5))
     export NOTEA_ADDR="${NOTEA_ADDR:-"http://localhost:3000"}"
+    export GITLAB_PAT="$GITLAB_PAT"
 }
 
 log() {
@@ -27,7 +29,7 @@ ids() {
 }
 
 should_crawl() {
-    local f="$CACHE/crawled_$1"
+    local f="$CACHE/crawled.$1"
     if [ ! -f "$f" ]; then
         return 0
     fi
@@ -46,7 +48,21 @@ crawl() {
     if ! is_crawlable "$content"; then
         return 0
     fi
-    log not impl crawl
+    local crawlable_source="$(echo "$content" | head -n 1 | awk '{print $NF}')"
+    for backend in gitlab; do
+        if $backend is "$crawlable_source"; then
+            crawl_with $backend "$json"
+            return $?
+        fi
+    done
+    log "unknown backend for $crawlable_source"
+    return 1
+}
+
+crawl_with() {
+    local backend="$1"
+    local json="$2"
+    log not impl crawl with
     return 1
 }
 
@@ -80,11 +96,11 @@ notea() (
     }
 
     get() {
-        local cached="$CACHE/cache_$1"
-        if [ -f "$cached" ] && cat "$cached" | grep .; then
+        local cache_key="notea cache $1"
+        if cache get "$cache_key"; then
             return 0
         fi
-        _get "$@" | tee "$cached"
+        _get "$@" | cache put "$cache_key"
     }
 
     _get() {
@@ -94,9 +110,160 @@ notea() (
     "$@"
 )
 
-crawler() (
-    should() {
+# cache <path|get|put> KEY...
+# File-backed cache rooted at $CACHE; each key maps to one file.
+cache() (
+    # Print the cache file for a key. The base64 name is joined onto one
+    # line and moved to the URL-safe alphabet: a raw '/' or a wrapped
+    # newline in the name would make the path unusable.
+    path() {
+        echo "$CACHE/$(echo "$*" | base64 | tr -d '\n' | tr '/+' '_-')"
     }
+    # Print the cached value for a key. Returns 1 when the file is missing,
+    # empty, or last modified more than $CACHE_DURATION seconds ago.
+    get() {
+        local path="$(path "$*")"
+        if ! [ -f "$path" ]; then
+            return 1
+        fi
+        # `wc -c FILE` prints the file name after the count, so a bare-zero
+        # match never fires; test for a non-empty file directly.
+        if ! [ -s "$path" ]; then
+            return 1
+        fi
+        local created="$(date -r "$path" +%s)"
+        local now="$(date +%s)"
+        if ((now-created > CACHE_DURATION)); then
+            return 1
+        fi
+        cat "$path"
+    }
+    # Copy stdin into the cache file for a key and pass it through to stdout.
+    put() {
+        local path="$(path "$*")"
+        tee "$path"
+    }
+    "$@"
+)
+
+# gitlab <subcommand> ARGS...
+# GitLab backend: source matching, authenticated requests, tree expansion.
+gitlab() (
+    # Succeeds when the source matches the regex gitlab.app and is not a wiki URL.
+    is() {
+        echo "$*" | grep -q gitlab.app && ! echo "$*" | grep -q '/wiki/'
+    }
+
+    # Cached wrapper around _gcurl, keyed on the full argument list.
+    gcurl() {
+        local cache_key="gitlab gcurl $*"
+        if cache get "$cache_key"; then
+            return 0
+        fi
+        _gcurl "$@" | cache put "$cache_key"
+    }
+
+    # curl with the $GITLAB_PAT bearer token attached.
+    _gcurl() {
+        curl -sS -H "Authorization: Bearer $GITLAB_PAT" "$@"
+    }
+
+    # Cached, sorted wrapper around _expand.
+    expand() {
+        local cache_key="gitlab expand $*"
+        if cache get "$cache_key"; then
+            return 0
+        fi
+        _expand "$@" | sort | cache put "$cache_key"
+    }
+
+    # Split a URL into project and root path, then list the files under it.
+    _expand() {
+        local url="$1"
+        local project="$(url_to_project_root "$url" | head -n 1)"
+        local root="$(url_to_project_root "$url" | tail -n 1)"
+        __expand "$project" "$root"
+    }
+
+    # Print two lines for a URL: the project path, then the path inside the
+    # repository (tree/ or blob/ and the next segment, presumably the ref, removed).
+    url_to_project_root() {
+        local url="$1"
+        local url_path="${url#http*://gitlab*.net/}"
+        local project="${url_path%%/-/*}"
+        local root="${url_path#*/-/}"
+        local root="${root#tree/}"
+        local root="${root#blob/}"
+        local root="${root#*/}"
+        echo "$project"
+        echo "$root"
+    }
+
+    # Print every blob path found under root ($2, default "/") of project $1.
+    __expand() {
+        local project="$1"
+        local root="${2:-"/"}"
+
+        local b64_files=()
+        # Queue entries are single-line base64 so paths containing spaces
+        # survive word splitting; the root is encoded like every other entry.
+        local b64_trees=("$(echo "$root" | base64 | tr -d '\n')")
+        local i=0
+
+        find_each() {
+            local type="$1"
+            shift
+            echo "$*" \
+                | jq -c .[] \
+                | grep "\"type\":\"$type\"" \
+                | jq -r .path \
+                | while read -r line; do echo "$line" | base64 | tr -d '\n'; echo; done \
+                | grep .
+        }
+        while [ "$i" -lt "${#b64_trees[@]}" ]; do
+            # The API takes the plain path, so decode the queue entry first.
+            got="$(list_tree "$project" "$(echo "${b64_trees[i]}" | base64 --decode)")"
+            for b64_tree in $(find_each "tree" "$got"); do
+                # Whole-word match on the space-joined list; "[ ^]x[ $]" treats
+                # ^ and $ as literals and misses the first and last entries.
+                if [[ " ${b64_trees[*]} " != *" $b64_tree "* ]]; then
+                    b64_trees+=("$b64_tree")
+                fi
+            done
+            for b64_file in $(find_each "blob" "$got"); do
+                if [[ " ${b64_files[*]} " != *" $b64_file "* ]]; then
+                    b64_files+=("$b64_file")
+                fi
+            done
+            i=$((i+1))
+        done
+        for b64_file in "${b64_files[@]}"; do
+            echo "$b64_file" | base64 --decode
+        done
+    }
+
+    # Fetch the recursive repository tree listing of project $1 at path $2.
+    list_tree() {
+        local project="$(urlencode "$1")"
+        local path="/api/v4/projects/$project/repository/tree"
+        local query="recursive=true&path=$(urlencode "$2")"
+        gcurl "https://gitlab-app.eng.qops.net/$path?$query"
+    }
+
+    "$@"
+)
+
+# Percent-encode $1, leaving only unreserved characters untouched.
+urlencode() (
+    LC_COLLATE=C
+    local length="${#1}"
+    for (( i = 0; i < length; i++ )); do
+        local c="${1:$i:1}"
+        case $c in
+            [a-zA-Z0-9.~_-]) printf '%s' "$c" ;;
+            *) printf '%%%02X' "'$c" ;;
+        esac
+    done
 )
 
 if [ "$0" == "$BASH_SOURCE" ]; then