add google and update crawlable detection
parent e3b97814ea
commit 3774d3eba1
@@ -0,0 +1,32 @@
+#! /bin/bash
+
+google() (
+    _is_sheets() {
+        echo "$@" | grep -q 'docs.google.com.spreadsheets'
+    }
+
+    _is_doc() {
+        echo "$@" | grep -q 'docs.google.com.document'
+    }
+
+    is() {
+        _is_sheets "$@" || _is_doc "$@"
+    }
+
+    human_url() {
+        log "not impl: human url: $@"
+        exit 1
+    }
+
+    get() {
+        log "not impl: get: $@"
+        exit 1
+    }
+
+    expand() {
+        echo "$@" | base64
+    }
+
+    "$@"
+)
+
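The google() dispatcher appears to follow the same subcommand pattern as the
other backends: the trailing "$@" invokes the first argument as one of the
inner functions. A minimal usage sketch (illustrative, not part of the commit;
assumes google.sh sits in the working directory):

    source ./google.sh

    url="https://docs.google.com/spreadsheets/d/abc123/edit"
    if google is "$url"; then
        echo "recognized as a crawlable Google URL"
    fi

Because the function body is a subshell (parentheses rather than braces), the
exit 1 in the unimplemented human_url and get stubs terminates only the
subshell, surfacing as a nonzero status to the caller instead of killing the
calling script.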
@@ -22,6 +22,7 @@ config() {
     export GITLAB_PAT="$GITLAB_PAT"
     source ./gitlab.sh
     source ./gitlab_wiki.sh
+    source ./google.sh
     source ./cache.sh
     source ./notes.sh
 }
@@ -73,7 +74,7 @@ _crawl() {
         "$id"
     )"
     local crawlable_source="$(extract_crawlable_source "$content")"
-    for backend in gitlab gitlab_wiki; do
+    for backend in gitlab gitlab_wiki google; do
         if $backend is "$crawlable_source"; then
             crawl_with $backend "$json"
             return $?
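The call $backend is "$crawlable_source" relies on ordinary word splitting:
the unquoted variable expands to a function name (gitlab, gitlab_wiki, or
google) and "is" becomes the subcommand that the dispatcher's "$@" invokes. A
small illustration (assumes the backend scripts are already sourced):

    backend=google
    url="https://docs.google.com/document/d/xyz/edit"
    if $backend is "$url"; then
        echo "$backend claims $url"
    fi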
@@ -166,7 +167,7 @@ push_crawled() {
 is_crawlable() {
     local crawlable_source="$(extract_crawlable_source "$*")"
     # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
-    local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
+    local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:\-\#--]*"
     echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$"
 }
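Inside a POSIX ERE bracket expression a backslash is literal and #-- is a
character range, so the widened class additionally accepts #, \, and the
punctuation between # and - (ASCII 0x23-0x2D: $ % & ' ( ) * + ,). Since the
pattern is anchored with ^ and $, the old class (which lacked #) rejected any
URL carrying a fragment; those now pass. A quick test (illustrative):

    url_pattern="(http|https)://[a-zA-Z0-9./?=_%:\-\#--]*"
    echo "https://docs.google.com/document/d/abc/edit#heading=h.x" \
        | grep -q -E "^[ ]*$url_pattern[ ]*$" && echo "crawlable"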