reorg repo

This commit is contained in:
Bel LaPointe
2022-02-16 12:01:11 -07:00
parent 8cd9a5d472
commit 9739a73265
45 changed files with 12 additions and 13 deletions

crawler/cache.sh Normal file

@@ -0,0 +1,28 @@
#! /bin/bash
cache() (
path() {
echo "$CACHE/$(echo "$*" | base64 | md5sum | awk '{print $1}')"
}
get() {
local path="$(path "$*")"
if ! [ -f "$path" ]; then
return 1
fi
if wc -c "$path" | grep -q '^[ ]*0[ ]*$'; then
return 1
fi
local created="$(date -r "$path" +%s)"
local now="$(date +%s)"
if ((now-created > CACHE_DURATION)); then
return 1
fi
cat "$path"
}
put() {
local path="$(path "$*")"
tee "$path"
}
"$@"
)
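
A minimal usage sketch (the key and the fetch_thing command are hypothetical; main.sh exports CACHE and CACHE_DURATION the same way):

export CACHE="$(mktemp -d)" CACHE_DURATION=600
source ./cache.sh
# serve the cached copy while it is fresh; otherwise recompute and cache
cache get "expensive key" || fetch_thing | cache put "expensive key"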

crawler/cache_test.sh Normal file

@@ -0,0 +1,12 @@
#! /bin/bash
test_path() {
cache path abc | tr '/' '\n' | tail -n 1 | grep -q .
}
test_get_put_get() {
export CACHE_DURATION=10
! cache get abc
echo hi | cache put abc > /dev/null
cache get abc | grep -q .
}

crawler/gitlab.sh Normal file

@@ -0,0 +1,158 @@
#! /bin/bash
gitlab() (
_is_gitlab() {
echo "$*" | grep -q gitlab.app
}
_is_wiki() {
echo "$*" | grep -q '/wikis'
}
is() {
_is_gitlab "$@" && ! _is_wiki "$@"
}
human_url() {
_url "$@" | sed 's/api.v4.projects.//' | sed 's/%2F/\//g' | sed 's/.raw$//' | sed 's/repository\/files/-\/tree\/master/'
}
_url() {
local base_url="$1"
local blob="$(echo "$2" | base64 --decode)"
local project="$(_url_to_project_root "$base_url" | head -n 1)"
project="$(urlencode "$project")"
local root="$(_url_to_project_root "$base_url" | tail -n 1)"
if [ -n "$root" ]; then
blob="${root%/}/${blob#/}"
blob="${blob#/}"
blob="${blob%/}"
fi
blob="$(urlencode "$blob")"
local path="api/v4/projects/$project/repository/files/$blob/raw"
log "url: https://gitlab-app.eng.qops.net/$path (blob=$blob, project=$project)"
echo "https://gitlab-app.eng.qops.net/$path"
}
get() {
_gcurl "$(_url "$@")"
}
expand() {
local cache_key="gitlab expand $*"
if cache get "$cache_key"; then
return 0
fi
_expand "$@" | sort | cache put "$cache_key"
}
_expand() {
local url="$1"
local project="$(_url_to_project_root "$url" | head -n 1)"
local root="$(_url_to_project_root "$url" | tail -n 1)"
__expand "$project" "$root"
}
_url_to_project_root() {
local url="$1"
local url_path="${url#http*://gitlab*.net/}"
local project=""
if [[ "$url_path" == *"/-/"* ]]; then
project="${url_path%%/-/*}"
elif [[ "$url_path" == *"/tree/"* ]]; then
project="${url_path%%/tree/*}"
else
project="$url_path"
fi
local root="${url_path#*"$project"}"
root="${root#*/-/}"
root="${root#/}"
root="${root#blob/}"
root="${root#tree/}"
root="$(echo "$root" | sed 's/^[^\/]*//')"
root="${root#/}"
log project=$project, root=$root, url=$url
echo "$project"
echo "$root"
}
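# __expand walks the repository tree breadth-first: b64_trees is a work queue
# of directories still to list, b64_files accumulates discovered blobs, and
# paths travel base64-encoded so whitespace survives word splitting; only
# .md and .txt files are emitted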
__expand() {
local project="$1"
local root="${2:-"/"}"
local b64_files=()
local b64_trees=("$(echo "$root" | base64)")
local i=0
find_each() {
local type="$1"
shift
echo "$*" \
| jq -c .[] \
| grep "\"type\":\"$type\"" \
| jq -r .path \
| while read -r line; do echo "$line" | base64; done \
| grep .
}
while [ "$i" -lt "${#b64_trees[@]}" ]; do
got="$(_list_tree "$project" "$(echo "${b64_trees[i]}" | base64 --decode)")"
for b64_tree in $(find_each "tree" "$got"); do
if ! echo "${b64_trees[@]}" | grep -q "[ ^]$b64_tree[ $]"; then
b64_trees+=("$b64_tree")
fi
done
for b64_file in $(find_each "blob" "$got"); do
if ! echo "${b64_files[@]}" | grep -q "[ ^]$b64_file[ $]"; then
b64_files+=("$b64_file")
fi
done
i=$((i+1))
done
for b64_file in "${b64_files[@]}"; do
local file="$(echo "$b64_file" | base64 --decode)"
file="${file#$root}"
file="${file#/}"
case "${file##*.}" in
md|txt )
echo "$file" | base64
;;
esac
done
}
_list_tree() {
local project="$(urlencode "$1")"
local path="api/v4/projects/$project/repository/tree"
local query="recursive=true&path=$2"
_gcurl "https://gitlab-app.eng.qops.net/$path?$query"
}
_gcurl() {
local cache_key="gitlab _gcurl $*"
if cache get "$cache_key"; then
return 0
fi
__gcurl "$@" | cache put "$cache_key"
}
__gcurl() {
curl -sS -H "Authorization: Bearer $GITLAB_PAT" "$@"
}
"$@"
)
urlencode() (
LC_COLLATE=C
local length="${#1}"
for (( i = 0; i < length; i++ )); do
local c="${1:$i:1}"
case $c in
[a-zA-Z0-9.~_-]) printf '%s' "$c" ;;
*) printf '%%%02X' "'$c" ;;
esac
done
)
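
A minimal invocation sketch (the URL comes from the tests; the token is a placeholder, and cache.sh plus a log helper must be loaded because expand and _gcurl go through the cache):

source ./cache.sh && source ./gitlab.sh
log() { echo "> $*" >&2; }
export GITLAB_PAT=placeholder CACHE="$(mktemp -d)" CACHE_DURATION=600
url=https://gitlab-app.eng.qops.net/data-store/orchestration/runbooks
gitlab is "$url" && gitlab expand "$url"   # one base64-encoded .md/.txt path per line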

crawler/gitlab_test.sh Normal file

@@ -0,0 +1,69 @@
#! /bin/bash
test___expand() {
gitlab eval "$(cat <<EOF
_gcurl() {
case "\$1" in
'https://gitlab-app.eng.qops.net/api/v4/projects/project/repository/tree?recursive=true&path=dir' )
echo '[
{"id": "a", "name": "dir2", "type": "tree", "path": "dir/dir2", "mode": "040000"},
{"id": "b", "name": "blob.md", "type": "blob", "path": "dir/blob.md", "mode": "100644"}
]'
;;
'https://gitlab-app.eng.qops.net/api/v4/projects/project/repository/tree?recursive=true&path=dir/dir2' )
echo '[
{"id": "c", "name": "blob2.txt", "type": "blob", "path": "dir/dir2/blob2.txt", "mode": "100644"},
{"id": "c", "name": "blob3.jpg", "type": "blob", "path": "dir/dir2/blob3.jpg", "mode": "100644"}
]'
;;
* )
return 1
;;
esac
}
local blob=false
local dir2blob2=false
local others=0
for result in \$(__expand project dir); do
if echo \$result | base64 --decode | grep -q ^blob.md$; then
blob=true
elif echo \$result | base64 --decode | grep -q ^dir2.blob2.txt$; then
dir2blob2=true
else
others=\$((others+1))
fi
done
if [ \$others != 0 ]; then
return 101
fi
if ! \$blob; then
return 102
fi
if ! \$dir2blob2; then
return 103
fi
EOF
)"
}
test_url_to_project_root() {
log() { true; };
gitlab _url_to_project_root https://gitlab-app.eng.qops.net/data-store/orchestration/runbooks/tree/master | grep -q '^data-store/orchestration/runbooks$'
gitlab _url_to_project_root https://gitlab-app.eng.qops.net/data-store/orchestration/runbooks/tree/master | tail -n 1 | grep ^$
gitlab _url_to_project_root https://gitlab-app.eng.qops.net/data-store/orchestration/runbooks/-/blob/master/Alerts/rems/README.md | grep -q 'data-store/orchestration/runbooks'
gitlab _url_to_project_root https://gitlab-app.eng.qops.net/data-store/orchestration/runbooks/-/blob/master/Alerts/rems/README.md | grep -q 'Alerts/rems/README.md'
gitlab _url_to_project_root https://gitlab-app.eng.qops.net/data-store/orchestration/runbooks/-/tree/master/Alerts | grep -q 'data-store/orchestration/runbooks'
gitlab _url_to_project_root https://gitlab-app.eng.qops.net/data-store/orchestration/runbooks/-/tree/master/Alerts | grep -q 'Alerts'
gitlab _url_to_project_root https://gitlab-app.eng.qops.net/data-store/orchestration/runbooks | grep -q 'data-store/orchestration/runbooks'
gitlab _url_to_project_root https://gitlab-app.eng.qops.net/data-store/orchestration/runbooks | grep -q '^$'
}
test_is() {
gitlab is https://gitlab-app.eng.qops.net/data-store/orchestration/runbooks/-/blob/master/Alerts/rems/README.md
gitlab is https://gitlab-app.eng.qops.net/data-store/orchestration/runbooks/-/tree/master/Alerts
gitlab is https://gitlab-app.eng.qops.net/data-store/orchestration/runbooks
! gitlab is https://gitlab-app.eng.qops.net/surveys/marauders-map/wikis/Customer-impact-of-an-outage
}

crawler/gitlab_wiki.sh Normal file

@@ -0,0 +1,86 @@
#! /bin/bash
gitlab_wiki() (
is() {
gitlab _is_gitlab "$@" && gitlab _is_wiki "$@"
}
human_url() {
log "not impl: human url: $@"
exit 1
}
_host() {
local id="$1"
local host="${id%%.net*}.net"
echo "$host"
}
_project() {
local id="$1"
local host="$(_host "$@")"
local path="${id#$host}"
local project="${path%%/wikis*}"
project="${project%/-}"
project="${project%/-/}"
project="${project#/}"
project="${project%/}"
echo "$project"
}
_blob() {
local id="$1"
local host="$(_host "$@")"
local project="$(_project "$@")"
local path="${id#$host}"
local blob="${path#*/wikis}"
blob="${blob#/}"
blob="${blob%/}"
echo "$blob"
}
get() {
local base="$1"
local host="$(_host "$base")"
local project="$(_project "$base")"
local blob="$(_blob "$base")"
if [ "$(echo "$2" | base64 --decode)" != "" ]; then
blob="$blob/$(echo "$2" | base64 --decode)"
fi
log project=$project
log "$host/api/v4/projects/$(urlencode "$project")/wikis/$(urlencode "$blob")"
gitlab \
_gcurl \
"$host/api/v4/projects/$(urlencode "$project")/wikis/$(urlencode "$blob")" \
| jq -r .content
}
expand() {
local cache_key="gitlab_wiki expand $*"
if cache get "$cache_key"; then
return 0
fi
_expand "$@" | sort | cache put "$cache_key"
}
_expand() {
local host="$(_host "$1")"
local project="$(_project "$1")"
local blob="$(_blob "$1")"
if [ -n "$blob" ] && [ "$blob" != "" ]; then
echo "" | base64
return
fi
log host=$host, project=$project, blob=$blob
gitlab \
_gcurl \
"$host/api/v4/projects/$(urlencode "$project")/wikis?with_content=0" \
| jq -r .[].slug \
| while read -r line; do
echo "$line" | base64
done
}
"$@"
)
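
Same calling convention as the gitlab backend (wiki URL from the tests; cache.sh, gitlab.sh, a log helper, and GITLAB_PAT are assumed to be loaded as above):

wiki=https://gitlab-app.eng.qops.net/surveys/marauders-map/wikis/Customer-impact-of-an-outage
gitlab_wiki is "$wiki" && gitlab_wiki expand "$wiki"   # one base64-encoded page slug per line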

crawler/main.sh Normal file

@@ -0,0 +1,182 @@
#! /bin/bash
main() {
config
log crawling ids...
for id in $(crawlable_ids); do
crawl "$id"
done
log rewriting ids...
for id in $(ids); do
rewrite "$id"
done
}
config() {
set -o pipefail
set -e
export CACHE="${CACHE:-"$(mktemp -d)"}"
mkdir -p "$CACHE"
export CACHE_DURATION=$((60*50))
export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
export GITLAB_PAT="$GITLAB_PAT"
source ./gitlab.sh
source ./gitlab_wiki.sh
source ./cache.sh
source ./notes.sh
}
log() {
echo "$(echo "$(date +%H:%M:%S)> $*" | tr '\n' ' ')" >&2
}
ids() {
notes ids | sort
}
crawlable_ids() {
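# an id nested under an already-crawlable id is skipped: its subtree is
# autogenerated and will be rebuilt by crawling the parent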
local all_ids=($(ids))
local crawlable_ids=()
for id in "${all_ids[@]}"; do
if for crawlable_id in "${crawlable_ids[@]}"; do
if [ "$id" != "${id#$crawlable_id/}" ]; then
echo true
fi
done | grep -q true; then
continue
fi
local content="$(notes get "$id")"
if is_crawlable "$content"; then
crawlable_ids+=("$id")
fi
done
for crawlable_id in "${crawlable_ids[@]}"; do
echo "$crawlable_id"
done
}
crawl() {
local cache_key="crawled $*"
# TODO
if false && cache get "$cache_key"; then
return
fi
_crawl "$@" | cache put "$cache_key"
}
_crawl() {
local id="$1"
local content="$(notes get "$id")"
local json="$(
printf '{"content": %s, "id": "%s"}' \
"$(echo "$content" | jq -Rs)" \
"$id"
)"
local crawlable_source="$(extract_crawlable_source "$content")"
for backend in gitlab gitlab_wiki; do
if $backend is "$crawlable_source"; then
crawl_with $backend "$json"
return $?
fi
done
log "unknown backend for $crawlable_source"
return 1
}
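# the source is the last whitespace-delimited token of the note's first line,
# stripped of surrounding <...> and leading/trailing slashes, e.g. a
# hypothetical note starting "runbooks <https://gitlab-app.eng.qops.net/x/y/>"
# yields "https://gitlab-app.eng.qops.net/x/y"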
extract_crawlable_source() {
echo "$*" | head -n 1 | awk '{print $NF}' | sed 's/^<//' | sed 's/>$//' | sed 's/^\///' | sed 's/\/$//'
}
crawl_with() {
local backend="$1"
local json="$2"
local pid="$(echo "$json" | jq -r .id)"
local content="$(echo "$json" | jq -r .content)"
local crawlable_source="$(extract_crawlable_source "$content")"
local expanded=($($backend expand "$crawlable_source"))
log purge $crawlable_source:
for subid in $(notes ids | grep "^$pid/"); do
notes del "$subid"
done
log expand $crawlable_source:"$expanded"
notes_mkdir_p() {
local id="$1"
local subtitle="${2%/}"
notes put "$id" "$subtitle" "autogenerated content"
}
one() {
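# publishes one expanded entry: TITLE is the last path segment, CONTENT is
# the fetched body prefixed with a warning banner and with relative links
# rewritten against human_url, and ID is pid/<hash>/... with intermediate
# notes created via notes_mkdir_p so the tree stays navigable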
encode() {
base64 | md5sum | cut -c 1-10 | awk '{print $1}' | tr -d '\n'
}
local i="$1"
local full_title="$(
echo "$i" | base64 --decode | grep . || echo "${crawlable_source##*/}"
)"
full_title="${full_title%/}"
full_title="${full_title#/}"
export TITLE="${full_title##*/}"
local human_url="$($backend human_url "$crawlable_source" "$i")"
export CONTENT="$(
echo "**!! WARNING !! This page is autogenerated and prone to destruction and replacement**"
echo "**[See the original]($human_url)**"
$backend get "$crawlable_source" "$i" \
| sed 's/](\([^#h]\)/]\(%%%\1/g'
)"
export CONTENT="${CONTENT//"%%%"/"${human_url%/*}/"}"
export CONTENT="$(
printf "%s\n" "$CONTENT" \
| sed 's/!\[\([^]]*\)](\([^)]*\)\/-\/tree\/\([^)]*\))/![\1](\2\/-\/raw\/\3)/g'
)"
export ID="$(
local sum="$pid/"
local title_so_far=""
for subtitle in $(echo $full_title | tr '/' '\n' | while read -r subtitle; do echo "$subtitle" | base64; done); do
local subtitle="$(echo "$subtitle" | base64 --decode)"
if [ -n "$title_so_far" ]; then
local mkdir_p_title="${title_so_far%/}"
mkdir_p_title="${mkdir_p_title##*/}"
notes_mkdir_p "${sum%/}" "${mkdir_p_title}" >&2
fi
sum+="$(echo "$subtitle" | encode)/"
title_so_far+="$subtitle/"
done
echo "$sum"
)"
ID="${ID%/}"
log " $ID ($TITLE): ${#CONTENT}"
push_crawled "$ID" "$TITLE" "$CONTENT"
}
if [ "${#expanded[@]}" -gt 0 ]; then
for i in $(seq 0 $(("${#expanded[@]}"-1))); do
one "${expanded[i]}"
done
else
one ""
fi
}
push_crawled() {
notes put "$@"
}
is_crawlable() {
local crawlable_source="$(extract_crawlable_source "$*")"
# https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$"
}
rewrite() {
log not impl: rewrite "./asdf" to "absolute.com/asdf"
log not impl: rewrite "#abc-def?f=abc" to "#h-abc-def?f=abc" or better dont depend on query params so much
log not impl: rewrite, change images
return 1
}
if [ "$0" == "$BASH_SOURCE" ]; then
main "$@"
fi
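
A run sketch (the token and address are placeholders; main.sh sources its sibling scripts, so it has to run from the crawler directory):

cd crawler
GITLAB_PAT=placeholder NOTES_ADDR=http://localhost:3004 bash main.sh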

crawler/notes.sh Normal file

@@ -0,0 +1,102 @@
#! /bin/bash
notes() (
ids() {
_recurse_ids "" "$(_tree)"
}
_tree() {
__tree "$@"
}
__tree() {
_nncurl $NOTES_ADDR/api/v0/tree
}
_nncurl() {
curl -sS "$@"
}
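# depth-first walk of the .Branches tree: emits each slash-joined id unless
# it, or an ancestor, is marked Deleted, then recurses into its children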
_recurse_ids() {
local prefix="$1"
local json="$2"
if echo "$json" | jq .Branches | grep -q ^null$; then
return 0
fi
local b64lines="$(echo "$json" | jq -r '.Branches | keys[]' | while read -r line; do echo "$line" | base64; done)"
if [ -z "$b64lines" ]; then
return 0
fi
for line in $b64lines; do
line="$(echo "$line" | base64 --decode)"
local subfix="$(printf "%s/%s" "$prefix" "$line")"
subfix="${subfix#/}"
if ! _is_deleted "$subfix"; then
echo "$subfix"
fi
_recurse_ids "$subfix" "$(echo "$json" | jq -c ".Branches[\"$line\"]")"
done
}
meta() {
local id="$1"
local tree="$(_tree)"
for subid in ${id//\// }; do
tree="$(echo "$tree" | jq -c .Branches | jq -c ".[\"$subid\"]")"
done
echo "$tree" | jq .Leaf
}
_is_deleted() {
local id="$1"
while [ -n "$id" ]; do
if meta "$id" | jq .Deleted | grep -q true; then
return 0
fi
if [ "$id" == "${id%/*}" ]; then
return 1
fi
id="${id%/*}"
done
return 1
}
get() {
_get "$@"
}
_get() {
_nncurl $NOTES_ADDR/api/v0/files/$1
}
del() {
local id="$1"
_nncurl \
-X DELETE \
$NOTES_ADDR/api/v0/files/$id
}
put() {
set -u
local ret=0
if ! _put "$@"; then
ret=1
fi
set +u
return $ret
}
_put() {
local id="$1"
local title="$2"
local body="$3"
echo "$body" | _nncurl \
-X PUT \
-H "Title: $title" \
-d "$body" \
$NOTES_ADDR/api/v0/files/$id
}
"$@"
)
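
A usage sketch against a live notes server (id, title, and body here are hypothetical):

export NOTES_ADDR=http://localhost:3004
source ./notes.sh
notes put some-id "Some Title" "some body"
notes get some-id
notes ids   # every non-deleted id, one per line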

crawler/notes_test.sh Normal file

@@ -0,0 +1,66 @@
#! /bin/bash
test_ids() {
local two_levels='{
"Branches": {
"id": {
"Branches": {
"subid": {
"Branches": {}
}
}
}
}
}'
notes eval "$(cat <<EOF
_tree() { echo '$two_levels'; true; }
(ids; true) | grep '^id$' > /dev/null || return 101
(ids; true) | grep '^id\/subid$' > /dev/null || return 102
ids | wc -l | grep 2 > /dev/null || return 103
EOF
)"
}
test_meta() {
local two_levels='{
"Branches": {
"id": {
"Leaf": {"Title": "top level"},
"Branches": {
"subid": {
"Leaf": {"Title": "sub level"},
"Branches": {}
}
}
}
}
}'
notes eval "$(cat <<EOF
_tree() { echo '$two_levels'; }
meta id | jq .Title | grep -q top.level || return 201
meta id/subid | jq .Title | grep -q sub.level || return 202
EOF
)"
}
test__is_deleted() {
local two_levels='{
"Branches": {
"id": {
"Leaf": {"Title": "top level", "Deleted": true},
"Branches": {
"subid": {
"Leaf": {"Title": "sub level"},
"Branches": {}
}
}
}
}
}'
notes eval "$(cat <<EOF
_tree() { echo '$two_levels'; }
_is_deleted id || return 301
_is_deleted id/subid || return 302
EOF
)"
}

crawler/test.sh Normal file

@@ -0,0 +1,52 @@
#! /bin/bash
main() {
local ret=0
for f in ./*_test.sh; do
if ! one_main "$f"; then
echo failed $f >&2
ret=$((ret+1))
fi
done
if [ $ret != 0 ]; then
echo failed >&2
fi
return $ret
}
one_main() (
local f="$1"
local ret=0
for t in $(grep ^test_ "$f" | sed 's/(.*//'); do
one_test "$f" "$t"
local test_ret=$?
if [ $test_ret != 0 ]; then
echo failed $f:$t: $test_ret >&2
ret=$((ret+1))
fi
done
return $ret
)
one_test() (
local f="$1"
local t="$2"
each
source "${f%_test.sh}.sh"
source "$f"
eval "$t"
)
each() {
export CACHE=$(mktemp -d)
export GITLAB_PAT=gibberish
export NOTES_ADDR=http://127.0.0.1:61111
source ./cache.sh
set -e
set -o pipefail
log() { echo "> $*" >&2; }
}
if [ "$0" == "$BASH_SOURCE" ]; then
main "$@"
fi
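
The harness discovers test_* functions in every *_test.sh, sourcing the matching implementation file first, so it runs from the crawler directory with no arguments:

cd crawler && bash test.sh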