white space
parent
829081ebed
commit
f190bdecca
|
|
@ -1,122 +1,122 @@
|
||||||
#! /bin/bash
|
#! /bin/bash
|
||||||
|
|
||||||
#######################################
# Entry point: load configuration, crawl every note id, then run the
# rewrite pass over every note id.
# Arguments: none are read.
# Outputs:   whatever config, crawl and rewrite write.
# Returns:   status of the last command run.
#######################################
main() {
  local id

  config

  # The id list is fetched again for the second pass rather than reused.
  # Ids are split on whitespace by the unquoted command substitution.
  for id in $(ids); do
    crawl "$id"
  done

  for id in $(ids); do
    rewrite "$id"
  done
}
|
||||||
|
|
||||||
#######################################
# Enable strict-ish shell options, export the settings used by the rest of
# the script, and source the helper scripts from the current directory.
# Globals:   CACHE (read/exported; defaults to a fresh mktemp directory),
#            CACHE_DURATION (exported, 60*50),
#            NOTES_ADDR (read/exported; defaults to http://localhost:3004),
#            GITLAB_PAT (read/exported unchanged)
# Arguments: none
# Returns:   non-zero (or exits, because of `set -e`) if a step fails.
#######################################
config() {
  set -o pipefail
  set -e

  # Assign first, export afterwards: `export VAR="$(cmd)"` would report
  # export's status and hide a failing mktemp.
  if [[ -z "${CACHE:-}" ]]; then
    CACHE="$(mktemp -d)"
  fi
  export CACHE
  mkdir -p "$CACHE"

  export CACHE_DURATION=$((60 * 50))
  export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
  export GITLAB_PAT="${GITLAB_PAT:-}"

  # Paths are relative to the caller's working directory.
  source ./gitlab.sh
  source ./cache.sh
  source ./notes.sh
}
|
||||||
|
|
||||||
#######################################
# Write a timestamped, single-line message to stderr.
# Arguments: the message words; they are joined with spaces and any
#            embedded newlines are replaced by spaces.
# Outputs:   "HH:MM:SS> message" on stderr.
#######################################
log() {
  local message="$*"
  printf '%s> %s\n' "$(date +%H:%M:%S)" "${message//$'\n'/ }" >&2
}
|
||||||
|
|
||||||
#######################################
# List note ids by delegating to `notes ids`.
# Outputs: whatever `notes ids` prints.
# Returns: status of `notes ids`.
#######################################
ids() {
  notes ids
}
|
||||||
|
|
||||||
#######################################
# Crawl one note and pipe the crawl output into the cache.
# Arguments: forwarded unchanged to _crawl; also used to build the cache key
#            "crawled <args>".
# Returns:   status of the `_crawl | cache put` pipeline.
#######################################
crawl() {
  local cache_key="crawled $*"

  # TODO
  # The leading `false &&` short-circuits the lookup, so the cache is never
  # consulted and every call re-crawls.
  if false && cache get "$cache_key"; then
    return
  fi

  _crawl "$@" | cache put "$cache_key"
}
|
||||||
|
|
||||||
#######################################
# Fetch a note, decide whether it points at a crawlable source, and hand it
# to the first backend that claims that source.
# Arguments: $1 - note id
# Outputs:   whatever crawl_with writes; diagnostics go through log.
# Returns:   0 if the note is not crawlable or the crawl succeeded,
#            non-zero if the note cannot be fetched, the JSON cannot be
#            built, the crawl fails, or no backend accepts the source.
#######################################
_crawl() {
  log "crawling $*"
  local id="$1"
  local content json crawlable_source backend

  # Declared separately from the assignment so a failing fetch is visible
  # instead of being hidden behind `local`'s exit status.
  content="$(notes get "$id")" || {
    log "failed to fetch note $id"
    return 1
  }

  if ! is_crawlable "$content"; then
    log "not crawlable: '${content:0:20}'..."
    return 0
  fi

  # Let jq encode both fields so quotes or backslashes in the id cannot
  # produce invalid JSON.
  json="$(
    printf '%s\n' "$content" \
      | jq -Rsc --arg id "$id" '{content: ., id: $id}'
  )" || return

  # Status deliberately ignored: only the printed value is used, and an
  # empty value ends in the "unknown backend" branch below.
  crawlable_source="$(extract_crawlable_source "$content")" || true

  for backend in gitlab; do
    if "$backend" is "$crawlable_source"; then
      crawl_with "$backend" "$json"
      return
    fi
  done

  log "unknown backend for $crawlable_source"
  return 1
}
|
||||||
|
|
||||||
#######################################
# Print the last whitespace-separated word of the first line of the given
# text, with one leading "<" and one trailing ">" removed.
# Arguments: the text (all arguments are joined with spaces).
# Outputs:   the extracted word followed by a newline (an empty line when
#            the first line has no words).
#######################################
extract_crawlable_source() {
  local text="$*"
  # Pure parameter expansion: no pipeline, so a large input cannot kill a
  # writer with SIGPIPE and turn into a failure under `pipefail`.
  local first_line="${text%%$'\n'*}"
  local -a fields=()
  local last=""

  IFS=$' \t\n' read -r -a fields <<< "$first_line" || true
  if (( ${#fields[@]} > 0 )); then
    last="${fields[${#fields[@]} - 1]}"
  fi

  last="${last#"<"}"
  last="${last%">"}"
  printf '%s\n' "$last"
}
|
||||||
|
|
||||||
#######################################
# Expand a crawlable source through a backend and push every expanded item
# as a note.
# Globals:   TITLE, CONTENT, ID, PID (assigned and exported)
# Arguments: $1 - backend command name (called as `<backend> expand|get`)
#            $2 - JSON document with `.content` and `.id`
# Outputs:   whatever push_crawled writes; diagnostics go through log.
# Returns:   non-zero if the JSON cannot be read, the backend fails to
#            expand or fetch, or the id hash cannot be computed; otherwise
#            the status of the last push.
#######################################
crawl_with() {
  local backend="$1"
  local json="$2"
  local content crawlable_source raw_expanded item i
  local -a expanded=()

  content="$(printf '%s\n' "$json" | jq -r .content)" || return
  # Status deliberately ignored: only the printed value is used.
  crawlable_source="$(extract_crawlable_source "$content")" || true

  raw_expanded="$("$backend" expand "$crawlable_source")" || return
  # Split on whitespace without pathname expansion. `read -d ''` reports
  # failure at end of input even though the array was filled.
  IFS=$' \t\n' read -r -d '' -a expanded <<< "$raw_expanded" || true
  log expand "$crawlable_source:"

  # The parent id is the same for every item, so parse it once. The JSON is
  # passed quoted so it is not word-split or globbed on the way to jq.
  PID="$(printf '%s\n' "$json" | jq -r .id)" || return
  export PID

  # Iteration starts at index 1, so the first token printed by `expand` is
  # never processed.
  # NOTE(review): presumably a header/self entry - confirm with the backend.
  for ((i = 1; i < ${#expanded[@]}; i++)); do
    item="${expanded[i]}"

    # A decode error is reported but not fatal; whatever was decoded is used.
    TITLE="$(printf '%s\n' "$item" | base64 --decode)" \
      || log "warning: could not cleanly decode title token"

    CONTENT="$("$backend" get "$crawlable_source" "$item")" || {
      log "backend $backend failed to get $crawlable_source ($TITLE)"
      return 1
    }

    # Bytes fed to the hash must stay "<source>/<title>\n" so ids are stable.
    ID="$(
      printf '%s\n' "$crawlable_source/$TITLE" \
        | base64 \
        | md5sum \
        | awk '{print $1}'
    )" || return

    export TITLE CONTENT ID
    log " $PID/$ID ($TITLE): ${#CONTENT}"
    push_crawled "$PID/$ID" "$TITLE" "$CONTENT"
  done
}
|
||||||
|
|
||||||
#######################################
# Store a crawled item by delegating to `notes put`.
# Arguments: forwarded unchanged to `notes put`.
# Returns:   status of `notes put`.
#######################################
push_crawled() {
  notes put "$@"
}
|
||||||
|
|
||||||
#######################################
# Succeed when the source extracted from the given text looks like an
# http(s) URL.
# Arguments: the note text (all arguments are joined with spaces).
# Returns:   0 if the extracted source matches the URL pattern, 1 otherwise.
#######################################
is_crawlable() {
  local crawlable_source
  # Status deliberately ignored: an empty value simply fails the match.
  crawlable_source="$(extract_crawlable_source "$*")" || true

  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
  local anchored_pattern="^[ ]*${url_pattern}[ ]*$"

  # Matched in-process; the pattern variable is left unquoted so it is
  # treated as a regular expression rather than a literal string.
  [[ "$crawlable_source" =~ $anchored_pattern ]]
}
|
||||||
|
|
||||||
#######################################
# Placeholder for the link/image rewrite pass: logs what is not implemented
# yet and reports failure.
# Arguments: ignored.
# Outputs:   three "not impl" messages through log.
# Returns:   always 1.
#######################################
rewrite() {
  log not impl: rewrite "#abc-def" to "#h-abc-def"
  log not impl: rewrite "./asdf" to "./zyxw" or "absolute.com/asdf"
  log not impl rewrite, change images
  return 1
}
|
||||||
|
|
||||||
# Run main only when this file is executed directly; when it is sourced,
# $0 differs from the file's own path and nothing runs.
if [[ "$0" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue