add google and update crawlable detection
parent
e3b97814ea
commit
3774d3eba1
|
|
@@ -0,0 +1,32 @@
|
||||||
|
#! /bin/bash
|
||||||
|
|
||||||
|
google() (
|
||||||
|
_is_sheets() {
|
||||||
|
echo "$@" | grep -q 'docs.google.com.spreadsheets'
|
||||||
|
}
|
||||||
|
|
||||||
|
_is_doc() {
|
||||||
|
echo "$@" | grep -q 'docs.google.com.document'
|
||||||
|
}
|
||||||
|
|
||||||
|
is() {
|
||||||
|
_is_sheets "$@" || _is_doc "$@"
|
||||||
|
}
|
||||||
|
|
||||||
|
human_url() {
|
||||||
|
log "not impl: human url: $@"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
get() {
|
||||||
|
log "not impl: get: $@"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
expand() {
|
||||||
|
echo "$@" | base64
|
||||||
|
}
|
||||||
|
|
||||||
|
"$@"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
@@ -22,6 +22,7 @@ config() {
|
||||||
export GITLAB_PAT="$GITLAB_PAT"
|
export GITLAB_PAT="$GITLAB_PAT"
|
||||||
source ./gitlab.sh
|
source ./gitlab.sh
|
||||||
source ./gitlab_wiki.sh
|
source ./gitlab_wiki.sh
|
||||||
|
source ./google.sh
|
||||||
source ./cache.sh
|
source ./cache.sh
|
||||||
source ./notes.sh
|
source ./notes.sh
|
||||||
}
|
}
|
||||||
|
|
@@ -73,7 +74,7 @@ _crawl() {
|
||||||
"$id"
|
"$id"
|
||||||
)"
|
)"
|
||||||
local crawlable_source="$(extract_crawlable_source "$content")"
|
local crawlable_source="$(extract_crawlable_source "$content")"
|
||||||
for backend in gitlab gitlab_wiki; do
|
for backend in gitlab gitlab_wiki google; do
|
||||||
if $backend is "$crawlable_source"; then
|
if $backend is "$crawlable_source"; then
|
||||||
crawl_with $backend "$json"
|
crawl_with $backend "$json"
|
||||||
return $?
|
return $?
|
||||||
|
|
@@ -166,7 +167,7 @@ push_crawled() {
|
||||||
is_crawlable() {
|
is_crawlable() {
|
||||||
local crawlable_source="$(extract_crawlable_source "$*")"
|
local crawlable_source="$(extract_crawlable_source "$*")"
|
||||||
# https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
|
# https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
|
||||||
local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
|
local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:\-\#--]*"
|
||||||
echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$"
|
echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue