diff --git a/app/crawler/main.sh b/app/crawler/main.sh
index 46f5f68..7a0243f 100644
--- a/app/crawler/main.sh
+++ b/app/crawler/main.sh
@@ -1,122 +1,122 @@
 #! /bin/bash
 
 main() {
-  config
-  for id in $(ids); do
-    crawl "$id"
-  done
-  for id in $(ids); do
-    rewrite "$id"
-  done
+    config
+    for id in $(ids); do
+        crawl "$id"
+    done
+    for id in $(ids); do
+        rewrite "$id"
+    done
 }
 
 config() {
-  set -o pipefail
-  set -e
-  export CACHE="${CACHE:-"$(mktemp -d)"}"
-  mkdir -p "$CACHE"
-  export CACHE_DURATION=$((60*50))
-  export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
-  export GITLAB_PAT="$GITLAB_PAT"
-  source ./gitlab.sh
-  source ./cache.sh
-  source ./notes.sh
+    set -o pipefail
+    set -e
+    export CACHE="${CACHE:-"$(mktemp -d)"}"
+    mkdir -p "$CACHE"
+    export CACHE_DURATION=$((60*50))
+    export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
+    export GITLAB_PAT="$GITLAB_PAT"
+    source ./gitlab.sh
+    source ./cache.sh
+    source ./notes.sh
 }
 
 log() {
-  echo "$(echo "$(date +%H:%M:%S)> $*" | tr '\n' ' ')" >&2
+    echo "$(echo "$(date +%H:%M:%S)> $*" | tr '\n' ' ')" >&2
 }
 
 ids() {
-  notes ids
+    notes ids
 }
 
 crawl() {
-  local cache_key="crawled $*"
+    local cache_key="crawled $*"
     # TODO
-  if false && cache get "$cache_key"; then
-    return
-  fi
-  _crawl "$@" | cache put "$cache_key"
+    if false && cache get "$cache_key"; then
+        return
+    fi
+    _crawl "$@" | cache put "$cache_key"
 }
 
 _crawl() {
-  log "crawling $*"
-  local id="$1"
-  local content="$(notes get "$id")"
+    log "crawling $*"
+    local id="$1"
+    local content="$(notes get "$id")"
     local json="$(
         printf '{"content": %s, "id": "%s"}' \
             "$(echo "$content" | jq -Rs)" \
             "$id"
     )"
-  if ! is_crawlable "$content"; then
-    log "not crawlable: '${content:0:20}'..."
-    return 0
-  fi
-  local crawlable_source="$(extract_crawlable_source "$content")"
-  for backend in gitlab; do
-    if $backend is "$crawlable_source"; then
-      crawl_with $backend "$json"
-      return $?
-    fi
-  done
-  log "unknown backend for $crawlable_source"
-  return 1
+    if ! is_crawlable "$content"; then
+        log "not crawlable: '${content:0:20}'..."
+        return 0
+    fi
+    local crawlable_source="$(extract_crawlable_source "$content")"
+    for backend in gitlab; do
+        if $backend is "$crawlable_source"; then
+            crawl_with $backend "$json"
+            return $?
+        fi
+    done
+    log "unknown backend for $crawlable_source"
+    return 1
 }
 
 extract_crawlable_source() {
-  echo "$*" | head -n 1 | awk '{print $NF}' | sed 's/^$//'
+    echo "$*" | head -n 1 | awk '{print $NF}' | sed 's/^$//'
 }
 
 crawl_with() {
-  local backend="$1"
-  local json="$2"
+    local backend="$1"
+    local json="$2"
 
-  local content="$(echo "$json" | jq -r .content)"
-  local crawlable_source="$(extract_crawlable_source "$content")"
+    local content="$(echo "$json" | jq -r .content)"
+    local crawlable_source="$(extract_crawlable_source "$content")"
 
-  local expanded=($($backend expand "$crawlable_source"))
-  log expand $crawlable_source:
+    local expanded=($($backend expand "$crawlable_source"))
+    log expand $crawlable_source:
     one() {
         local i="$1"
-    export TITLE="$(
+        export TITLE="$(
             echo "$i" | base64 --decode
         )"
-    export CONTENT="$(
+        export CONTENT="$(
             $backend get "$crawlable_source" "$i"
         )"
-    export ID="$(
+        export ID="$(
             echo "$crawlable_source/$TITLE" | base64 | md5sum | awk '{print $1}'
         )"
-    export PID="$(
+        export PID="$(
             echo $json | jq -r .id
         )"
-    log " $PID/$ID ($TITLE): ${#CONTENT}"
-    push_crawled "$PID/$ID" "$TITLE" "$CONTENT"
+        log " $PID/$ID ($TITLE): ${#CONTENT}"
+        push_crawled "$PID/$ID" "$TITLE" "$CONTENT"
     }
-  for i in $(seq 1 $(("${#expanded[@]}"-1))); do
+    for i in $(seq 1 $(("${#expanded[@]}"-1))); do
         one "${expanded[i]}"
-  done
+    done
 }
 
 push_crawled() {
-  notes put "$@"
+    notes put "$@"
 }
 
 is_crawlable() {
-  local crawlable_source="$(extract_crawlable_source "$*")"
-  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
-  local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
-  echo "$crawlable_source" | grep -q -E "^[ ]*$url_pattern[ ]*$"
+    local crawlable_source="$(extract_crawlable_source "$*")"
+    # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
+    local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
+    echo "$crawlable_source" | grep -q -E "^[ ]*$url_pattern[ ]*$"
 }
 
 rewrite() {
-  log not impl: rewrite "#abc-def" to "#h-abc-def"
-  log not impl: rewrite "./asdf" to "./zyxw" or "absolute.com/asdf"
-  log not impl rewrite, change images
-  return 1
+    log not impl: rewrite "#abc-def" to "#h-abc-def"
+    log not impl: rewrite "./asdf" to "./zyxw" or "absolute.com/asdf"
+    log not impl rewrite, change images
+    return 1
 }
 
 if [ "$0" == "$BASH_SOURCE" ]; then
-  main "$@"
+    main "$@"
 fi