From 9acaa4a3560c375b8f29bde4510d2b865b6f40c0 Mon Sep 17 00:00:00 2001
From: Bel LaPointe
Date: Thu, 10 Feb 2022 11:13:04 -0700
Subject: [PATCH] dont crawl crawled subfiles

---
NOTE(review): line breaks were restored from a copy of this patch that had
been collapsed onto a single line. Hunk line counts and the diffstat were
re-checked and are consistent, but the leading indentation inside
app/crawler/main.sh is a best-guess reconstruction (4 spaces per level).
Confirm against the target file, or apply with
`git apply --ignore-whitespace`.

 app/crawler/main.sh                           | 30 ++++++++++++++-----
 spike/review/reinvent/ezmded/server/todo.yaml |  2 +-
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/app/crawler/main.sh b/app/crawler/main.sh
index 73c526b..90ba4ac 100644
--- a/app/crawler/main.sh
+++ b/app/crawler/main.sh
@@ -3,7 +3,7 @@
 main() {
     config
     log crawling ids...
-    for id in $(ids); do
+    for id in $(crawlable_ids); do
         crawl "$id"
     done
     log rewriting ids...
@@ -30,7 +30,28 @@ log() {
 }
 
 ids() {
-    notes ids
+    notes ids | sort
+}
+
+crawlable_ids() {
+    local all_ids=($(ids))
+    local crawlable_ids=()
+    for id in "${all_ids[@]}"; do
+        if for crawlable_id in "${crawlable_ids[@]}"; do
+            if [ "$id" != "${id#$crawlable_id/}" ]; then
+                echo true
+            fi
+        done | grep -q true; then
+            continue
+        fi
+        local content="$(notes get "$id")"
+        if is_crawlable "$content"; then
+            crawlable_ids+=("$id")
+        fi
+    done
+    for crawlable_id in "${crawlable_ids[@]}"; do
+        echo "$crawlable_id"
+    done
 }
 
 crawl() {
@@ -43,7 +64,6 @@ crawl() {
 }
 
 _crawl() {
-    log "crawling? $*"
     local id="$1"
     local content="$(notes get "$id")"
     local json="$(
@@ -51,10 +71,6 @@ _crawl() {
             "$(echo "$content" | jq -Rs)" \
             "$id"
     )"
-    if ! is_crawlable "$content"; then
-        log "not crawlable: '${content:0:20}'..."
-        return 0
-    fi
     local crawlable_source="$(extract_crawlable_source "$content")"
     for backend in gitlab; do
         if $backend is "$crawlable_source"; then
diff --git a/spike/review/reinvent/ezmded/server/todo.yaml b/spike/review/reinvent/ezmded/server/todo.yaml
index 74579de..aaae545 100644
--- a/spike/review/reinvent/ezmded/server/todo.yaml
+++ b/spike/review/reinvent/ezmded/server/todo.yaml
@@ -1,5 +1,5 @@
 todo:
-- preview default via q param
 - css
 done:
 - https://developer.mozilla.org/en-US/docs/Web/API/History/pushState#change_a_query_parameter
+- preview default via q param