add google and update crawlable detection

master
Bel LaPointe 2022-02-16 12:19:32 -07:00
parent e3b97814ea
commit 3774d3eba1
2 changed files with 35 additions and 2 deletions

crawler/google.sh Normal file

@@ -0,0 +1,32 @@
#!/bin/bash
# Google backend for the crawler. Defines a single dispatcher function,
# google, whose body runs in a subshell; the first argument names the
# subcommand (is/human_url/get/expand) and the rest are passed through.
google() (
    # The unescaped dots in these grep patterns also match a literal
    # slash, so 'docs.google.com.spreadsheets' matches
    # docs.google.com/spreadsheets URLs as well.
    _is_sheets() {
        echo "$@" | grep -q 'docs.google.com.spreadsheets'
    }
    _is_doc() {
        echo "$@" | grep -q 'docs.google.com.document'
    }
    # A URL belongs to this backend if it is a Google Sheet or Doc.
    is() {
        _is_sheets "$@" || _is_doc "$@"
    }
    # Not implemented yet; log is assumed to be provided by the sourcing
    # script, and exit only leaves this subshell, not the caller.
    human_url() {
        log "not impl: human url: $@"
        exit 1
    }
    get() {
        log "not impl: get: $@"
        exit 1
    }
    expand() {
        echo "$@" | base64
    }
    # Dispatch to the requested subcommand.
    "$@"
)
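Every backend exposes the same subcommand interface, so the new file can be smoke-tested on its own; a minimal sketch (the URLs are made-up examples):

    source ./google.sh
    google is "https://docs.google.com/spreadsheets/d/abc123/edit" && echo "handled by google"
    google expand "https://docs.google.com/document/d/abc123"   # prints the base64-encoded URL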

@@ -22,6 +22,7 @@ config() {
    export GITLAB_PAT="$GITLAB_PAT"
    source ./gitlab.sh
    source ./gitlab_wiki.sh
    source ./google.sh
    source ./cache.sh
    source ./notes.sh
}
@@ -73,7 +74,7 @@ _crawl() {
        "$id"
    )"
    local crawlable_source="$(extract_crawlable_source "$content")"
    for backend in gitlab gitlab_wiki; do
    for backend in gitlab gitlab_wiki google; do
        if $backend is "$crawlable_source"; then
            crawl_with $backend "$json"
            return $?
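A note on this loop: each sourced backend file defines a function named after itself (gitlab, gitlab_wiki, and now google), so the loop variable doubles as a command name. The test line expands to, for example:

    google is "https://docs.google.com/document/d/abc123"   # hypothetical URL

which exits 0 when the backend recognizes the URL, letting the first match claim the crawl.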
@@ -166,7 +167,7 @@ push_crawled() {
is_crawlable() {
    local crawlable_source="$(extract_crawlable_source "$*")"
    # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
    local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
    local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:#-]*"
    echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$"
}
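As reconstructed above, the pattern change adds '#' to the bracket expression (inside a POSIX class '#' is literal, and '-' stays literal when it comes last), so URLs carrying fragment anchors now count as crawlable. A quick check with a hypothetical Google Docs link:

    url_pattern="(http|https)://[a-zA-Z0-9./?=_%:#-]*"
    echo "https://docs.google.com/document/d/abc123#heading=h.xyz" \
        | grep -q -E "^[ ]*$url_pattern[ ]*$" && echo "crawlable"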