Add Google backend and tighten crawlable URL detection
This commit is contained in:
32
crawler/google.sh
Normal file
32
crawler/google.sh
Normal file
@@ -0,0 +1,32 @@
|
||||
#!/bin/bash
#
# Google Docs/Sheets crawler backend.
#
# Dispatch-style API: google <subcommand> [args...], e.g.
#   google is "$url"        -> exit 0 if $url is a crawlable Google doc/sheet
#   google expand "$text"   -> base64-encode $text
#
# The body is a subshell `( ... )`, so `exit 1` in a stub aborts only this
# backend invocation, never the sourcing script.
# NOTE(review): `log` is assumed to be provided by the sourcing script
# (crawler main) — confirm it is defined before these stubs are reachable.

google() (
  # True if the arguments contain a Google Sheets URL.
  # Dots are escaped and the path separator is a literal '/' — the original
  # pattern's unescaped '.' matched ANY character, so unrelated hosts such
  # as "docsXgoogleXcom" were accepted too.
  _is_sheets() {
    printf '%s\n' "$*" | grep -q -E 'docs\.google\.com/spreadsheets'
  }

  # True if the arguments contain a Google Docs (document) URL.
  _is_doc() {
    printf '%s\n' "$*" | grep -q -E 'docs\.google\.com/document'
  }

  # Backend predicate used by the crawler's backend loop:
  # true if the input is any supported Google document type.
  is() {
    _is_sheets "$@" || _is_doc "$@"
  }

  # Not implemented yet: map an internal URL to a human-viewable one.
  human_url() {
    log "not impl: human url: $@"
    exit 1
  }

  # Not implemented yet: fetch the document content.
  get() {
    log "not impl: get: $@"
    exit 1
  }

  # Base64-encode the arguments (joined by spaces, newline-terminated).
  # printf avoids echo's option/backslash pitfalls for data starting with '-'.
  expand() {
    printf '%s\n' "$*" | base64
  }

  # Dispatch: first argument selects the subcommand, the rest are its args.
  "$@"
)
|
||||
|
||||
@@ -22,6 +22,7 @@ config() {
|
||||
export GITLAB_PAT="$GITLAB_PAT"
|
||||
source ./gitlab.sh
|
||||
source ./gitlab_wiki.sh
|
||||
source ./google.sh
|
||||
source ./cache.sh
|
||||
source ./notes.sh
|
||||
}
|
||||
@@ -73,7 +74,7 @@ _crawl() {
|
||||
"$id"
|
||||
)"
|
||||
local crawlable_source="$(extract_crawlable_source "$content")"
|
||||
for backend in gitlab gitlab_wiki; do
|
||||
for backend in gitlab gitlab_wiki google; do
|
||||
if $backend is "$crawlable_source"; then
|
||||
crawl_with $backend "$json"
|
||||
return $?
|
||||
@@ -166,7 +167,7 @@ push_crawled() {
|
||||
# True when the extracted crawlable source (first 300 chars) is exactly one
# URL, optionally surrounded by spaces.
# Arguments: all args are joined and passed to extract_crawlable_source.
# Returns:   0 if the source looks like a single http(s) URL, 1 otherwise.
is_crawlable() {
  local crawlable_source
  # Split declaration from assignment so the helper's exit status isn't
  # masked by `local`.
  crawlable_source="$(extract_crawlable_source "$*")"
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  # Bracket expression fixed: backslashes are LITERAL inside POSIX bracket
  # expressions, so the previous "[...:\-\#--]" admitted '\' and created a
  # stray '#--' range. '#' is added for fragment anchors and '-' is placed
  # last so it is taken literally.
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:#-]*"
  # cut -c 1-300 bounds the regex work on very large inputs.
  echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*${url_pattern}[ ]*$"
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user