Crawl only .md/.txt GitLab files and push each one to Notea.

NOTE(review): line breaks were rebuilt from a whitespace-collapsed copy of
this patch. Indentation and the placement of blank context lines are
inferred, so apply with `git apply --ignore-whitespace` and check the hunks
against the tree; the index hashes predate the edits to the added lines.

diff --git a/app/crawler/gitlab.sh b/app/crawler/gitlab.sh
index e69916a..63608f9 100644
--- a/app/crawler/gitlab.sh
+++ b/app/crawler/gitlab.sh
@@ -7,7 +7,8 @@ gitlab() (
 
     get() {
         local url="$1"
-        local blob="$2"
+        # $2 arrives base64-encoded (crawl_with forwards expand output as-is).
+        local blob="$(echo "$2" | base64 --decode)"
         local project="$(_url_to_project_root "$url" | head -n 1)"
         project="$(urlencode "$project")"
 
@@ -82,7 +83,16 @@ gitlab() (
             local file="$(echo "$b64_file" | base64 --decode)"
             file="${file#$root}"
             file="${file#/}"
-            echo "$file" | base64
+            # Only .md/.txt paths are emitted. Matching the whole name (not
+            # just ${file##*.}) keeps an extensionless "md" or "txt" file out.
+            case "$file" in
+                *.md|*.txt )
+                    # Drop the line wrapping GNU base64 adds to long input;
+                    # crawl_with and the test split this output on whitespace.
+                    echo "$file" | base64 | tr -d '\n'
+                    echo
+                    ;;
+            esac
         done
     }
 
diff --git a/app/crawler/gitlab_test.sh b/app/crawler/gitlab_test.sh
index 4d89dbf..53d1502 100644
--- a/app/crawler/gitlab_test.sh
+++ b/app/crawler/gitlab_test.sh
@@ -7,12 +7,13 @@ test___expand() {
             'https://gitlab-app.eng.qops.net/api/v4/projects/project/repository/tree?recursive=true&path=dir' )
                 echo '[
                     {"id": "a", "name": "dir2", "type": "tree", "path": "dir/dir2", "mode": "040000"},
-                    {"id": "b", "name": "blob", "type": "blob", "path": "dir/blob", "mode": "100644"}
+                    {"id": "b", "name": "blob.md", "type": "blob", "path": "dir/blob.md", "mode": "100644"}
                 ]'
                 ;;
             'https://gitlab-app.eng.qops.net/api/v4/projects/project/repository/tree?recursive=true&path=dir/dir2' )
                 echo '[
-                    {"id": "c", "name": "blob2", "type": "blob", "path": "dir/dir2/blob2", "mode": "100644"}
+                    {"id": "c", "name": "blob2.txt", "type": "blob", "path": "dir/dir2/blob2.txt", "mode": "100644"},
+                    {"id": "d", "name": "blob3.jpg", "type": "blob", "path": "dir/dir2/blob3.jpg", "mode": "100644"}
                 ]'
                 ;;
             * )
@@ -24,9 +25,9 @@ test___expand() {
         local dir2blob2=false
         local others=0
         for result in \$(__expand project dir); do
-            if echo \$result | base64 --decode | grep -q ^blob$; then
+            if echo \$result | base64 --decode | grep -qxF blob.md; then
                 blob=true
-            elif echo \$result | base64 --decode | grep -q ^dir2.blob2$; then
+            elif echo \$result | base64 --decode | grep -qxF dir2/blob2.txt; then
                 dir2blob2=true
             else
                 others=\$((others+1))
diff --git a/app/crawler/main.sh b/app/crawler/main.sh
index d1a1d8c..185e89d 100644
--- a/app/crawler/main.sh
+++ b/app/crawler/main.sh
@@ -15,7 +15,7 @@ config() {
     set -e
     export CACHE="${CACHE:-"$(mktemp -d)"}"
     mkdir -p "$CACHE"
-    export CACHE_DURATION=$((60*5))
+    export CACHE_DURATION=$((60*50))
     export NOTEA_ADDR="${NOTEA_ADDR:-"http://localhost:3000"}"
     export GITLAB_PAT="$GITLAB_PAT"
     source ./gitlab.sh
@@ -72,8 +72,17 @@ crawl_with() {
 
     local expanded=($($backend expand "$crawlable_source"))
     log expand $crawlable_source:
-    for i in $(seq 1 "${#expanded[@]}"); do
-        log " $(echo ${expanded[i]} | base64 --decode)"
+    # Iterate the entries directly; walking indices 1..n-1 with seq never
+    # visits element 0 of a bash array.
+    local b64 title
+    for b64 in "${expanded[@]}"; do
+        title="$(echo "$b64" | base64 --decode)"
+        # tr keeps the note id on one line; GNU base64 wraps long output.
+        CONTENT="$($backend get "$crawlable_source" "$b64")" \
+        ID="$(echo "$crawlable_source/$title" | base64 | tr -d '\n')" \
+        PID="$(echo "$json" | jq -r .id)" \
+        TITLE="$title" \
+        notea put
     done
 
     log not impl crawl with
@@ -81,15 +90,11 @@ crawl_with() {
 }
 
+# Succeeds when the source extracted from the arguments is a bare http(s) URL.
 is_crawlable() {
+    local crawlable_source="$(extract_crawlable_source "$*")"
     # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
     local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
-    if echo "$*" | tr -d '\n' | grep -E "^[ ]*$url_pattern[ ]*$"; then
-        return 0
-    fi
-    if echo "$*" | head -n 1 | grep -E "^[ ]*_source_: $url_pattern[ ]*$"; then
-        return 0
-    fi
-    return 1
+    echo "$crawlable_source" | grep -q -E "^[ ]*$url_pattern[ ]*$"
 }
 
 rewrite() {
diff --git a/app/crawler/notea.sh b/app/crawler/notea.sh
index e034524..4ec5ebe 100644
--- a/app/crawler/notea.sh
+++ b/app/crawler/notea.sh
@@ -25,6 +25,44 @@ notea() (
         ncurl $NOTEA_ADDR/api/notes/$1
     }
 
+    # Create a note from the CONTENT, ID, PID and TITLE environment variables.
+    # set -u is on while _put runs, so referencing an unset one is an error,
+    # and is switched off again after. Returns 0 if _put succeeded, else 1.
+    put() {
+        set -u
+        local ret=0
+        if ! _put "$@"; then
+            ret=1
+        fi
+        set +u
+        return $ret
+    }
+
+    # Read a CSRF token and cookie out of the `ncurl -i $NOTEA_ADDR/api` reply,
+    # then POST the note as JSON to /api/notes. Succeeds when the reply has $ID.
+    _put() {
+        local xsrf_key="xsrf-token"
+        local contains_tokens="$(ncurl -i "$NOTEA_ADDR/api")"
+        local xsrf_token="$(echo "$contains_tokens" | grep -o '"csrfToken":[^,]*' | tr ':' '\n' | jq -r . | tail -n 1)"
+        # HTTP header names are case-insensitive: accept any casing of the name.
+        local xsrf_cookie="$(echo "$contains_tokens" | grep -i '^set.cookie:' | sed 's/^[^:]*: //' | tr ';' '\n' | head -n 1)"
+        # CONTENT goes in on stdin and the rest through --arg, so jq does all
+        # JSON escaping and the shell never word-splits or globs note text.
+        local request="$(printf '%s\n' "$CONTENT" | jq -Rsc \
+            --arg id "$ID" \
+            --arg pid "$PID" \
+            --arg title "$TITLE" \
+            '{content: ., deleted: 0, id: $id, pid: $pid, pinned: 0, shared: 0, title: $title}')"
+        ncurl \
+            -X POST \
+            -H "$xsrf_key: $xsrf_token" \
+            -b "$xsrf_cookie" \
+            -H "Content-Type: application/json" \
+            -d "$request" \
+            "$NOTEA_ADDR/api/notes" \
+            | grep -qF -- "$ID"
+    }
+
     "$@"
 )
 