From 3774d3eba1e090698fafd172f0a264e55ceaa4e0 Mon Sep 17 00:00:00 2001
From: Bel LaPointe
Date: Wed, 16 Feb 2022 12:19:32 -0700
Subject: [PATCH] add google and update crawlable detection

---
 crawler/google.sh | 32 ++++++++++++++++++++++++++++++++
 crawler/main.sh   |  5 +++--
 2 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100644 crawler/google.sh

diff --git a/crawler/google.sh b/crawler/google.sh
new file mode 100644
index 0000000..e94bc6a
--- /dev/null
+++ b/crawler/google.sh
@@ -0,0 +1,32 @@
+#! /bin/bash
+
+google() (
+	_is_sheets() {
+		echo "$@" | grep -q 'docs.google.com.spreadsheets'
+	}
+
+	_is_doc() {
+		echo "$@" | grep -q 'docs.google.com.document'
+	}
+
+	is() {
+		_is_sheets "$@" || _is_doc "$@"
+	}
+
+	human_url() {
+		log "not impl: human url: $@"
+		exit 1
+	}
+
+	get() {
+		log "not impl: get: $@"
+		exit 1
+	}
+
+	expand() {
+		echo "$@" | base64
+	}
+
+	"$@"
+)
+
diff --git a/crawler/main.sh b/crawler/main.sh
index dbdaa45..b092603 100644
--- a/crawler/main.sh
+++ b/crawler/main.sh
@@ -22,6 +22,7 @@ config() {
 	export GITLAB_PAT="$GITLAB_PAT"
 	source ./gitlab.sh
 	source ./gitlab_wiki.sh
+	source ./google.sh
 	source ./cache.sh
 	source ./notes.sh
 }
@@ -73,7 +74,7 @@ _crawl() {
 		"$id"
 	)"
 	local crawlable_source="$(extract_crawlable_source "$content")"
-	for backend in gitlab gitlab_wiki; do
+	for backend in gitlab gitlab_wiki google; do
 		if $backend is "$crawlable_source"; then
 			crawl_with $backend "$json"
 			return $?
@@ -166,7 +167,7 @@ push_crawled() {
 is_crawlable() {
 	local crawlable_source="$(extract_crawlable_source "$*")"
 	# https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
-	local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
+	local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:#-]*"
 	echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$"
 }
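
Usage note (not part of the patch itself): a minimal sketch of how the new google backend plugs into the dispatch convention used by _crawl in main.sh, where each backend is invoked as "$backend <verb> <args>". The spreadsheet URL below is illustrative only; human_url and get remain unimplemented stubs that log and exit 1.

	#! /bin/bash
	source ./google.sh

	url='https://docs.google.com/spreadsheets/d/abc123/edit'

	# "is" succeeds when the URL looks like a Google Doc or Sheet;
	# this is the check the backend loop in _crawl uses to pick a crawler.
	if google is "$url"; then
		echo "crawlable by the google backend"
	fi

	# "expand" currently just base64-encodes its arguments.
	google expand "$url"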