From 98df3f2372a9ce9f975e9c81fd1a062173c22da1 Mon Sep 17 00:00:00 2001
From: Bel LaPointe
Date: Wed, 16 Feb 2022 14:26:34 -0700
Subject: [PATCH] google sheets and docs cache in rclone, put title as first line h1, load to file tree

---
 crawler/google.sh | 30 +++++++++++++++++++++++++-----
 crawler/main.sh   | 10 ++++------
 crawler/notes.sh  | 41 +++++++++++++++++++++++++++--------------
 crawler/rclone.sh |  8 ++++++++
 4 files changed, 64 insertions(+), 25 deletions(-)

diff --git a/crawler/google.sh b/crawler/google.sh
index e94bc6a..e1d5f1a 100644
--- a/crawler/google.sh
+++ b/crawler/google.sh
@@ -14,17 +14,37 @@ google() (
     }
 
     human_url() {
-        log "not impl: human url: $@"
-        exit 1
+        echo "$1"
     }
 
     get() {
-        log "not impl: get: $@"
-        exit 1
+        local cache_key="google get $*"
+        if cache get "$cache_key"; then
+            return 0
+        fi
+        _get "$@" | cache put "$cache_key"
+    }
+
+    _get() {
+        local url="$1"
+        local id="${url%/*}"
+        id="${id##*/}"
+        local downloaded="$(rclone get_google "$id")"
+        echo "# ${downloaded##*/}"
+        echo ""
+        if [ "${downloaded##*.}" == "csv" ]; then
+            _csv_to_md "$downloaded"
+        fi
+        cat "$downloaded"
     }
 
+    _csv_to_md() {
+        local f="$1"
+        log _csv_to_md $f
+    }
+
     expand() {
-        echo "$@" | base64
+        get "$@" | head -n 1 | sed 's/^[#]* //' | base64
     }
 
     "$@"
diff --git a/crawler/main.sh b/crawler/main.sh
index 16d82d7..06f76c2 100644
--- a/crawler/main.sh
+++ b/crawler/main.sh
@@ -4,6 +4,7 @@ main() {
     config
     log crawling ids...
     for id in $(crawlable_ids); do
+        log crawling id $id
         crawl "$id"
     done
     log rewriting ids...
@@ -25,6 +26,7 @@ config() {
     source ./gitlab.sh
     source ./gitlab_wiki.sh
     source ./google.sh
+    source ./rclone.sh
     source ./cache.sh
     source ./notes.sh
 }
@@ -59,12 +61,7 @@ crawlable_ids() {
 }
 
 crawl() {
-    local cache_key="crawled $*"
-    # TODO
-    if false && cache get "$cache_key"; then
-        return
-    fi
-    _crawl "$@" | cache put "$cache_key"
+    _crawl "$@"
 }
 
 _crawl() {
@@ -152,6 +149,7 @@ crawl_with() {
         ID="${ID%/}"
         log " $ID ($TITLE): ${#CONTENT}"
         push_crawled "$ID" "$TITLE" "$CONTENT"
+        log " /$ID ($TITLE): ${#CONTENT}"
     }
     if [ "${#expanded[@]}" -gt 0 ]; then
         for i in $(seq 0 $(("${#expanded[@]}"-1))); do
diff --git a/crawler/notes.sh b/crawler/notes.sh
index cb39a49..a1e1b58 100644
--- a/crawler/notes.sh
+++ b/crawler/notes.sh
@@ -2,11 +2,15 @@
 notes() (
 
     ids() {
-        _recurse_ids "" "$(_tree)"
+        _recurse_ids "$(_tree)"
     }
 
     _tree() {
-        __tree "$@"
+        local cache_key="notes _tree"
+        if CACHE_DURATION=5 cache get "$cache_key"; then
+            return 0
+        fi
+        __tree "$@" | cache put "$cache_key"
     }
 
     __tree() {
@@ -18,8 +22,7 @@ notes() (
     }
 
     _recurse_ids() {
-        local prefix="$1"
-        local json="$2"
+        local json="$1"
         if echo "$json" | jq .Branches | grep -q ^null$; then
             return 0
         fi
@@ -29,22 +32,32 @@ notes() (
         fi
         for line in $b64lines; do
             line="$(echo "$line" | base64 --decode)"
-            local subfix="$(printf "%s/%s" "$prefix" "$line")"
-            subfix="${subfix#/}"
-            if ! _is_deleted "$subfix"; then
-                echo "$subfix"
+            if ! _is_deleted "$line"; then
+                echo "$line"
+                _recurse_ids "$(echo "$json" | jq -c ".Branches[\"$line\"]")"
             fi
-            _recurse_ids "$subfix" "$(echo "$json" | jq -c ".Branches[\"$line\"]")"
         done
     }
 
     meta() {
+        local cache_key="notes meta $*"
+        if CACHE_DURATION=5 cache get "$cache_key"; then
+            return 0
+        fi
+        _meta "$@" | cache put "$cache_key"
+    }
+
+    _meta() {
         local id="$1"
         local tree="$(_tree)"
-        for subid in ${id//\// }; do
-            tree="$(echo "$tree" | jq -c .Branches | jq -c ".[\"$subid\"]")"
+        local pid="${id%%/*}"
+        while [ "$id" != "$pid" ]; do
+            tree="$(echo "$tree" | jq ".Branches[\"$pid\"]")"
+            local to_add="${id#$pid/}"
+            to_add="${to_add%%/*}"
+            pid="$pid/$to_add"
         done
-        echo "$tree" | jq .Leaf
+        echo "$tree" | jq ".Branches[\"$id\"].Leaf"
     }
 
     _is_deleted() {
@@ -90,11 +103,11 @@ notes() (
         local id="$1"
         local title="$2"
         local body="$3"
-        echo "$body" | _nncurl \
+        _nncurl \
             -X PUT \
             -H "Title: $title" \
             -d "$body" \
-            $NOTES_ADDR/api/v0/files/$id
+            $NOTES_ADDR/api/v0/files/$id >&2
     }
 
     "$@"
diff --git a/crawler/rclone.sh b/crawler/rclone.sh
index 95b766f..5c3e0e6 100644
--- a/crawler/rclone.sh
+++ b/crawler/rclone.sh
@@ -2,6 +2,14 @@
 rclone() (
 
     get_google() {
+        local cache_key="rclone get google $*"
+        if cache get "$cache_key"; then
+            return 0
+        fi
+        _get_google "$@" | cache put "$cache_key"
+    }
+
+    _get_google() {
         _rate_limit
         local id="$1"
         local out="$(mktemp -d)"
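
Note on the cache wrapper pattern this patch applies in google.sh (get), notes.sh (_tree, meta) and rclone.sh (get_google): each public entry point first tries "cache get KEY" and, on a miss, pipes the real worker through "cache put KEY". cache.sh itself is not part of this diff, so the sketch below is only one possible implementation consistent with those call sites, not the project's actual cache.sh. It assumes "cache get KEY" prints the cached value and returns 0 on a fresh hit, "cache put KEY" stores stdin while echoing it through, and an optional CACHE_DURATION (seconds) bounds how old a hit may be; the cache directory, md5sum key hashing, and GNU stat usage are assumptions made here for illustration.

cache() (
    # Sketch only: a file-backed cache matching the call sites above.
    CACHE_DIR="${CACHE_DIR:-/tmp/crawler-cache}"
    mkdir -p "$CACHE_DIR"

    _path() {
        # Hash the key so arbitrary strings map to safe filenames.
        echo "$CACHE_DIR/$(echo "$1" | md5sum | cut -d' ' -f1)"
    }

    get() {
        local f="$(_path "$*")"
        [ -f "$f" ] || return 1
        if [ -n "${CACHE_DURATION:-}" ]; then
            # Reject entries older than CACHE_DURATION seconds (GNU stat).
            local age=$(( $(date +%s) - $(stat -c %Y "$f") ))
            [ "$age" -le "$CACHE_DURATION" ] || return 1
        fi
        cat "$f"
    }

    put() {
        # Store stdin and echo it through, so the wrapper still produces
        # output on the first (uncached) run.
        tee "$(_path "$*")"
    }

    "$@"
)

With an interface like this, the first "notes _tree" call populates the entry and CACHE_DURATION=5 keeps _tree and meta served from disk for five seconds while the crawler loops, which matches how the wrappers in this patch are invoked.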