From 96f318de468ed1c110b551cf710b81f848d1ca41 Mon Sep 17 00:00:00 2001
From: Bel LaPointe
Date: Fri, 11 Feb 2022 18:43:10 -0700
Subject: [PATCH] gitlab wiki wip

---
 app/crawler/gitlab.sh      |  12 +++-
 app/crawler/gitlab_wiki.sh | 110 ++++++++++++++++++++++++++++++++++++
 app/crawler/main.sh        |   5 +-
 spike/crawl/.odo/scrape.sh |   9 +--
 4 files changed, 128 insertions(+), 8 deletions(-)
 create mode 100644 app/crawler/gitlab_wiki.sh

diff --git a/app/crawler/gitlab.sh b/app/crawler/gitlab.sh
index 79d1534..89bd9f3 100644
--- a/app/crawler/gitlab.sh
+++ b/app/crawler/gitlab.sh
@@ -1,8 +1,18 @@
 #! /bin/bash
 
 gitlab() (
+    # Matches "gitlab.app" as a grep regex, so "." also matches any one char.
+    _is_gitlab() {
+        echo "$*" | grep -q gitlab.app
+    }
+
+    # Succeeds when the joined arguments contain "/wikis".
+    _is_wiki() {
+        echo "$*" | grep -q '/wikis'
+    }
+
     is() {
-        echo "$*" | grep -q gitlab.app && ! echo "$*" | grep -q '/wikis/'
+        _is_gitlab "$@" && ! _is_wiki "$@"
     }
 
     human_url() {
diff --git a/app/crawler/gitlab_wiki.sh b/app/crawler/gitlab_wiki.sh
new file mode 100644
index 0000000..ca74742
--- /dev/null
+++ b/app/crawler/gitlab_wiki.sh
@@ -0,0 +1,110 @@
+#! /bin/bash
+
+# Crawler backend for GitLab wiki pages.
+#
+# Usage: gitlab_wiki <subcommand> [args...]
+# The trailing "$@" runs the nested function named by the first argument.
+# The body is a subshell, so nothing defined here leaks into the caller.
+# Needs these names from outside this file: gitlab (_is_gitlab, _is_wiki,
+# _gcurl), log, cache and urlencode.
+gitlab_wiki() (
+    # Succeeds when both gitlab _is_gitlab and gitlab _is_wiki accept the args.
+    is() {
+        gitlab _is_gitlab "$@" && gitlab _is_wiki "$@"
+    }
+
+    # Not implemented: logs the arguments and exits the subshell with 1.
+    human_url() {
+        # "$*" keeps the message a single argument ("$@" would split it).
+        log "not impl: human url: $*"
+        exit 1
+    }
+
+    # Prints $1 cut after its first ".net" (".net" is appended even when $1
+    # does not contain it).
+    _host() {
+        local id="$1"
+        echo "${id%%.net*}.net"
+    }
+
+    # Prints the project path: the part of $1 between the host and "/wikis",
+    # without a trailing "/-" and without surrounding slashes.
+    _project() {
+        local id="$1"
+        local host="$(_host "$@")"
+        # Quote $host so it is stripped literally, not matched as a glob.
+        local path="${id#"$host"}"
+        local project="${path%%/wikis*}"
+        project="${project%/-}"
+        project="${project%/-/}"
+        project="${project#/}"
+        project="${project%/}"
+        echo "$project"
+    }
+
+    # Prints the wiki page path: whatever follows the first "/wikis" in $1,
+    # without surrounding slashes. Empty when $1 ends at "/wikis".
+    _blob() {
+        local id="$1"
+        local host="$(_host "$@")"
+        local path="${id#"$host"}"
+        local blob="${path#*/wikis}"
+        blob="${blob#/}"
+        blob="${blob%/}"
+        echo "$blob"
+    }
+
+    # Prints the "content" field of one wiki page fetched from the GitLab API.
+    #   $1  wiki URL (wiki root or a page below it)
+    #   $2  base64-encoded page path relative to $1; may decode to empty
+    get() {
+        local base="$1"
+        local host="$(_host "$base")"
+        local project="$(_project "$base")"
+        local blob="$(_blob "$base")"
+        local suffix="$(echo "$2" | base64 --decode)"
+        if [ -n "$suffix" ]; then
+            # Only add the separator when there is a prefix; otherwise a wiki
+            # root plus a slug would be requested as "/slug".
+            blob="${blob:+$blob/}$suffix"
+        fi
+        local url="$host/api/v4/projects/$(urlencode "$project")/wikis/$(urlencode "$blob")"
+        log "project=$project"
+        log "$url"
+        gitlab _gcurl "$url" | jq -r .content
+    }
+
+    # Runs _expand, sorts its output and hands it to "cache put"; skipped
+    # when "cache get" succeeds for the same arguments.
+    expand() {
+        local cache_key="gitlab_wiki expand $*"
+        if cache get "$cache_key"; then
+            return 0
+        fi
+        _expand "$@" | sort | cache put "$cache_key"
+    }
+
+    # Prints one base64-encoded sub id per line. A URL that already names a
+    # page yields a single empty sub id; a wiki root yields every page slug
+    # returned by the wikis API.
+    _expand() {
+        local host="$(_host "$1")"
+        local project="$(_project "$1")"
+        local blob="$(_blob "$1")"
+        if [ -n "$blob" ]; then
+            echo "" | base64
+            return
+        fi
+        log "host=$host, project=$project, blob=$blob"
+        gitlab \
+            _gcurl \
+            "$host/api/v4/projects/$(urlencode "$project")/wikis?with_content=0" \
+            | jq -r '.[].slug' \
+            | while IFS= read -r line; do
+                echo "$line" | base64
+            done
+    }
+
+    "$@"
+)
+
diff --git a/app/crawler/main.sh b/app/crawler/main.sh
index c497d54..dbdaa45 100644
--- a/app/crawler/main.sh
+++ b/app/crawler/main.sh
@@ -21,6 +21,7 @@ config() {
     export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
     export GITLAB_PAT="$GITLAB_PAT"
     source ./gitlab.sh
+    source ./gitlab_wiki.sh
     source ./cache.sh
     source ./notes.sh
 }
@@ -72,7 +73,7 @@ _crawl() {
         "$id"
     )"
     local crawlable_source="$(extract_crawlable_source "$content")"
-    for backend in gitlab; do
+    for backend in gitlab gitlab_wiki; do
         if $backend is "$crawlable_source"; then
             crawl_with $backend "$json"
             return $?
@@ -101,7 +102,7 @@ crawl_with() {
         notes del "$subid"
     done
 
-    log expand $crawlable_source:
+    log expand $crawlable_source:"$expanded"
     notes_mkdir_p() {
         local id="$1"
         local subtitle="${2%/}"
diff --git a/spike/crawl/.odo/scrape.sh b/spike/crawl/.odo/scrape.sh
index 0b821e0..5791f84 100644
--- a/spike/crawl/.odo/scrape.sh
+++ b/spike/crawl/.odo/scrape.sh
@@ -1,6 +1,7 @@
 #! /bin/bash
 
-ODO_TOKEN="$ODO_TOKEN"
+# The token must come from the environment; never commit a default here.
+ODO_TOKEN="${ODO_TOKEN:?ODO_TOKEN must be set}"
 BLOB="$BLOB"
 
 urlencode() {
@@ -21,11 +22,9 @@ urlencode() {
     LC_COLLATE=$old_lc_collate
 }
 
-#https://odo.corp.qualtrics.com/wiki/index.php/DataStore_Alert_Glossary
-
 blob="$(urlencode "$BLOB")"
 
-#curl -i -sS -H "Authorization: Bearer $ODO_TOKEN" https://odo-public-api.corp.qualtrics.com/odo-api/parsoid/odo.corp.qualtrics.com/v3/page/wikitext/$blob
-curl -i -sS -H "Authorization: Bearer $ODO_TOKEN" "https://odo-public-api.corp.qualtrics.com/odo-api/parsoid/odo.corp.qualtrics.com/v3/page/wikitext/$blob"
+# NOTE(review): dry run - prints the command (token included) instead of running it.
+echo curl -i -sS -H "Authorization: Bearer $ODO_TOKEN" "https://odo-public-api.corp.qualtrics.com/odo-api/parsoid/odo.corp.qualtrics.com/v3/page/html/$blob?body_only=true"
 echo
 