From 42605c706b7d90e6101840b5bba88bb3f4041447 Mon Sep 17 00:00:00 2001
From: Bel LaPointe
Date: Mon, 31 Jan 2022 15:27:18 -0700
Subject: [PATCH] get gitlab scrape scripts from hackathon

---
 spike/crawl/env.env               |   3 +
 spike/crawl/gitlab-wiki/expand.sh | 109 ++++++++++++++++++++++++++++++
 spike/crawl/gitlab/expand.sh      | 109 ++++++++++++++++++++++++++++++
 3 files changed, 221 insertions(+)
 create mode 100644 spike/crawl/env.env
 create mode 100644 spike/crawl/gitlab-wiki/expand.sh
 create mode 100644 spike/crawl/gitlab/expand.sh

diff --git a/spike/crawl/env.env b/spike/crawl/env.env
new file mode 100644
index 0000000..3f494d6
--- /dev/null
+++ b/spike/crawl/env.env
@@ -0,0 +1,3 @@
+GITLAB_PAT=
+ODO_TOKEN=
+GDOC_TOKEN=
diff --git a/spike/crawl/gitlab-wiki/expand.sh b/spike/crawl/gitlab-wiki/expand.sh
new file mode 100644
index 0000000..14594bb
--- /dev/null
+++ b/spike/crawl/gitlab-wiki/expand.sh
@@ -0,0 +1,109 @@
+#! /bin/bash
+#
+# Lists a GitLab project's repository files and wiki pages as crawl targets.
+#
+# Environment:
+#   GITLAB_PAT   access token; when empty the `security` keychain tool is queried
+#   PROJECT      project path (default: data-store/orchestration/runbooks)
+#   GITLAB_PATH  repository directory to start from (default: /)
+#   GITLAB_URL   GitLab base URL (default: https://gitlab-app.eng.qops.net)
+#   MAX_TREES    maximum number of directories to list (default: 6)
+#
+# Output: sorted "gitlab://PATH" lines, then "wiki://TITLE" lines, on stdout;
+# progress and errors go to stderr.
+
+set -o pipefail
+
+# urlencode STRING
+# Prints STRING with every character outside [a-zA-Z0-9.~_-] replaced by a
+# %XX escape.
+# NOTE(review): a non-ASCII character is presumably emitted as one escape of
+# its code point rather than of its UTF-8 bytes - confirm inputs are ASCII.
+urlencode() {
+  local old_lc_collate=$LC_COLLATE
+  local length="${#1}"
+  local i c
+
+  # C collation keeps the bracket ranges below from matching locale-specific
+  # letters.
+  LC_COLLATE=C
+  for (( i = 0; i < length; i++ )); do
+    c="${1:$i:1}"
+    case "$c" in
+      [a-zA-Z0-9.~_-]) printf '%s' "$c" ;;
+      *) printf '%%%02X' "'$c" ;;
+    esac
+  done
+  LC_COLLATE=$old_lc_collate
+}
+
+# contains NEEDLE [ITEM...]
+# Succeeds when NEEDLE equals one of the items exactly (no regex, no word
+# splitting), so paths with spaces or dots are compared safely.
+contains() {
+  local needle=$1 item
+  shift
+  for item in "$@"; do
+    [[ "$item" == "$needle" ]] && return 0
+  done
+  return 1
+}
+
+gitlab_url="${GITLAB_URL:-https://gitlab-app.eng.qops.net}"
+max_trees="${MAX_TREES:-6}"
+# Keychain errors are discarded, so a failed lookup leaves the token empty.
+pat="${GITLAB_PAT:-"$(security find-generic-password -a "${USER}" -s GITLAB_PAT -w 2> /dev/null)"}"
+if [[ -z "$pat" ]]; then
+  echo "warning: no GITLAB_PAT found; requests are sent with an empty token" >&2
+fi
+# The project path is embedded as a single URL segment, hence the escaping.
+project="$(urlencode "${PROJECT:-data-store/orchestration/runbooks}")"
+
+# api_get PATH_AND_QUERY
+# GETs a path below the project's API root; -f turns an HTTP error into a
+# non-zero exit instead of an error body on stdout.
+api_get() {
+  curl -sSf \
+    -H "Authorization: Bearer $pat" \
+    "$gitlab_url/api/v4/projects/$project/$1"
+}
+
+# list
+# Prints the recursive repository tree below $GITLAB_PATH as JSON.
+list() {
+  api_get "repository/tree?recursive=true&path=$(urlencode "$GITLAB_PATH")"
+}
+
+# list_wiki
+# Prints the project's wiki pages, without their content, as JSON.
+list_wiki() {
+  api_get "wikis?with_content=0"
+}
+
+files=()
+trees=("${GITLAB_PATH:-/}")
+i=0
+# Every directory found is queued and listed in turn, up to max_trees listings.
+while (( i < ${#trees[@]} && i < max_trees )); do
+  gitlab_path="${trees[i]}"
+  printf 'gitlab_path=%s, i=%s, trees=%s, files=%s...\n' \
+    "$gitlab_path" "$i" "${#trees[@]}" "${#files[@]}" >&2
+  if ! got="$(GITLAB_PATH=$gitlab_path list)"; then
+    echo "error: could not list $gitlab_path" >&2
+    exit 1
+  fi
+  while IFS= read -r nested_path; do
+    [[ -n "$nested_path" ]] || continue
+    contains "$nested_path" "${trees[@]}" || trees+=("$nested_path")
+  done < <(jq -r '.[] | select(.type == "tree") | .path' <<< "$got")
+  while IFS= read -r nested_path; do
+    [[ -n "$nested_path" ]] || continue
+    contains "$nested_path" "${files[@]}" || files+=("$nested_path")
+  done < <(jq -r '.[] | select(.type == "blob") | .path' <<< "$got")
+  i=$((i + 1))
+done
+
+for file in "${files[@]}"; do
+  printf 'gitlab://%s\n' "$file"
+done | sort
+list_wiki | jq -r '.[].title' | sed 's|^|wiki://|'
diff --git a/spike/crawl/gitlab/expand.sh b/spike/crawl/gitlab/expand.sh
new file mode 100644
index 0000000..14594bb
--- /dev/null
+++ b/spike/crawl/gitlab/expand.sh
@@ -0,0 +1,109 @@
+#! /bin/bash
+#
+# Lists a GitLab project's repository files and wiki pages as crawl targets.
+#
+# Environment:
+#   GITLAB_PAT   access token; when empty the `security` keychain tool is queried
+#   PROJECT      project path (default: data-store/orchestration/runbooks)
+#   GITLAB_PATH  repository directory to start from (default: /)
+#   GITLAB_URL   GitLab base URL (default: https://gitlab-app.eng.qops.net)
+#   MAX_TREES    maximum number of directories to list (default: 6)
+#
+# Output: sorted "gitlab://PATH" lines, then "wiki://TITLE" lines, on stdout;
+# progress and errors go to stderr.
+
+set -o pipefail
+
+# urlencode STRING
+# Prints STRING with every character outside [a-zA-Z0-9.~_-] replaced by a
+# %XX escape.
+# NOTE(review): a non-ASCII character is presumably emitted as one escape of
+# its code point rather than of its UTF-8 bytes - confirm inputs are ASCII.
+urlencode() {
+  local old_lc_collate=$LC_COLLATE
+  local length="${#1}"
+  local i c
+
+  # C collation keeps the bracket ranges below from matching locale-specific
+  # letters.
+  LC_COLLATE=C
+  for (( i = 0; i < length; i++ )); do
+    c="${1:$i:1}"
+    case "$c" in
+      [a-zA-Z0-9.~_-]) printf '%s' "$c" ;;
+      *) printf '%%%02X' "'$c" ;;
+    esac
+  done
+  LC_COLLATE=$old_lc_collate
+}
+
+# contains NEEDLE [ITEM...]
+# Succeeds when NEEDLE equals one of the items exactly (no regex, no word
+# splitting), so paths with spaces or dots are compared safely.
+contains() {
+  local needle=$1 item
+  shift
+  for item in "$@"; do
+    [[ "$item" == "$needle" ]] && return 0
+  done
+  return 1
+}
+
+gitlab_url="${GITLAB_URL:-https://gitlab-app.eng.qops.net}"
+max_trees="${MAX_TREES:-6}"
+# Keychain errors are discarded, so a failed lookup leaves the token empty.
+pat="${GITLAB_PAT:-"$(security find-generic-password -a "${USER}" -s GITLAB_PAT -w 2> /dev/null)"}"
+if [[ -z "$pat" ]]; then
+  echo "warning: no GITLAB_PAT found; requests are sent with an empty token" >&2
+fi
+# The project path is embedded as a single URL segment, hence the escaping.
+project="$(urlencode "${PROJECT:-data-store/orchestration/runbooks}")"
+
+# api_get PATH_AND_QUERY
+# GETs a path below the project's API root; -f turns an HTTP error into a
+# non-zero exit instead of an error body on stdout.
+api_get() {
+  curl -sSf \
+    -H "Authorization: Bearer $pat" \
+    "$gitlab_url/api/v4/projects/$project/$1"
+}
+
+# list
+# Prints the recursive repository tree below $GITLAB_PATH as JSON.
+list() {
+  api_get "repository/tree?recursive=true&path=$(urlencode "$GITLAB_PATH")"
+}
+
+# list_wiki
+# Prints the project's wiki pages, without their content, as JSON.
+list_wiki() {
+  api_get "wikis?with_content=0"
+}
+
+files=()
+trees=("${GITLAB_PATH:-/}")
+i=0
+# Every directory found is queued and listed in turn, up to max_trees listings.
+while (( i < ${#trees[@]} && i < max_trees )); do
+  gitlab_path="${trees[i]}"
+  printf 'gitlab_path=%s, i=%s, trees=%s, files=%s...\n' \
+    "$gitlab_path" "$i" "${#trees[@]}" "${#files[@]}" >&2
+  if ! got="$(GITLAB_PATH=$gitlab_path list)"; then
+    echo "error: could not list $gitlab_path" >&2
+    exit 1
+  fi
+  while IFS= read -r nested_path; do
+    [[ -n "$nested_path" ]] || continue
+    contains "$nested_path" "${trees[@]}" || trees+=("$nested_path")
+  done < <(jq -r '.[] | select(.type == "tree") | .path' <<< "$got")
+  while IFS= read -r nested_path; do
+    [[ -n "$nested_path" ]] || continue
+    contains "$nested_path" "${files[@]}" || files+=("$nested_path")
+  done < <(jq -r '.[] | select(.type == "blob") | .path' <<< "$got")
+  i=$((i + 1))
+done
+
+for file in "${files[@]}"; do
+  printf 'gitlab://%s\n' "$file"
+done | sort
+list_wiki | jq -r '.[].title' | sed 's|^|wiki://|'