diff --git a/app/crawler/gitlab.sh b/app/crawler/gitlab.sh index 817347c..79d1534 100644 --- a/app/crawler/gitlab.sh +++ b/app/crawler/gitlab.sh @@ -5,13 +5,17 @@ gitlab() ( echo "$*" | grep -q gitlab.app && ! echo "$*" | grep -q '/wikis/' } - get() { - local url="$1" + human_url() { + _url "$@" | sed 's/api.v4.projects.//' | sed 's/%2F/\//g' | sed 's/.raw$//' | sed 's/repository\/files/-\/tree\/master/' + } + + _url() { + local base_url="$1" local blob="$(echo "$2" | base64 --decode)" - local project="$(_url_to_project_root "$url" | head -n 1)" + local project="$(_url_to_project_root "$base_url" | head -n 1)" project="$(urlencode "$project")" - local root="$(_url_to_project_root "$url" | tail -n 1)" + local root="$(_url_to_project_root "$base_url" | tail -n 1)" if [ -n "$root" ]; then blob="${root%/}/${blob#/}" blob="${blob#/}" @@ -20,8 +24,12 @@ gitlab() ( blob="$(urlencode "$blob")" local path="api/v4/projects/$project/repository/files/$blob/raw" - log _gcurl "https://gitlab-app.eng.qops.net/$path (blob=$blob, project=$project)" - _gcurl "https://gitlab-app.eng.qops.net/$path" + log "url: https://gitlab-app.eng.qops.net/$path (blob=$blob, project=$project)" + echo "https://gitlab-app.eng.qops.net/$path" + } + + get() { + _gcurl "$(_url "$@")" } expand() { diff --git a/app/crawler/main.sh b/app/crawler/main.sh index 3820073..4504d08 100644 --- a/app/crawler/main.sh +++ b/app/crawler/main.sh @@ -118,10 +118,14 @@ crawl_with() { full_title="${full_title%/}" full_title="${full_title#/}" export TITLE="${full_title##*/}" + local human_url="$($backend human_url "$crawlable_source" "$i")" export CONTENT="$( echo "**!! WARNING !! This page is autogenerated and prone to destruction and replacement**" - $backend get "$crawlable_source" "$i" + echo "**[See the original]($human_url)**" + $backend get "$crawlable_source" "$i" \ + | sed 's/](\([^#h]\)/]\(%%%\1/g' )" + export CONTENT="${CONTENT//"%%%"/"${human_url%/*}/"}" export ID="$( local sum="$pid/" local title_so_far="" @@ -139,7 +143,7 @@ crawl_with() { )" ID="${ID%/}" log " $ID ($TITLE): ${#CONTENT}" - push_crawled "$PID/$ID" "$TITLE" "$CONTENT" + push_crawled "$ID" "$TITLE" "$CONTENT" } if [ "${#expanded[@]}" -gt 0 ]; then for i in $(seq 0 $(("${#expanded[@]}"-1))); do @@ -162,8 +166,8 @@ is_crawlable() { } rewrite() { - log not impl: rewrite "#abc-def" to "#h-abc-def" - log not impl: rewrite "./asdf" to "./zyxw" or "absolute.com/asdf" + log not impl: rewrite "./asdf" to "absolute.com/asdf" + log not impl: rewrite "#abc-def?f=abc" to "#h-abc-def?f=abc" or better dont depend on query params so much log not impl rewrite, change images return 1 } diff --git a/spike/review/reinvent/ezmded/server/todo.yaml b/spike/review/reinvent/ezmded/server/todo.yaml index d1b6745..c8052c1 100644 --- a/spike/review/reinvent/ezmded/server/todo.yaml +++ b/spike/review/reinvent/ezmded/server/todo.yaml @@ -1,16 +1,19 @@ todo: -- link to original in generated/scraped -- rewrite links if available to local -- rewrite anchors (maybe gitlab already works :^)) +- only 1 pid link in tree as title - mark generated via meta so other files in the dir can be created, deleted, replaced safely - ui; last updated; 2022.02.01T12:34:56 -- fix links - put images @server - fix images - scrape odo - scrape gdoc - scrape gsheet +- scrape gslide +- anchor links work +- rewrite links if available to local done: +- fix links +- rewrite anchors (maybe gitlab already works :^)) +- link to original in generated/scraped - buttons to invis - damned width css - css