From fa9aafcd2893089f61e0b61c1882fb4d6e9701c6 Mon Sep 17 00:00:00 2001
From: Bel LaPointe
Date: Thu, 10 Feb 2022 08:52:40 -0700
Subject: [PATCH] simplify mkdir all notes

---
 app/crawler/main.sh  | 13 ++++++++-----
 app/crawler/notes.sh |  7 +++++++
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/app/crawler/main.sh b/app/crawler/main.sh
index a780b45..ad6991b 100644
--- a/app/crawler/main.sh
+++ b/app/crawler/main.sh
@@ -71,18 +71,22 @@ extract_crawlable_source() {
 crawl_with() {
   local backend="$1"
   local json="$2"
+  local pid="$(echo "$json" | jq -r .id)"
   local content="$(echo "$json" | jq -r .content)"
   local crawlable_source="$(extract_crawlable_source "$content")"
   local expanded=($($backend expand "$crawlable_source"))
+
+  log purge $crawlable_source:
+  for subid in $(notes ids | grep "^$pid/"); do
+    notes del "$subid"
+  done
+
   log expand $crawlable_source:
 
   notes_mkdir_p() {
     local id="$1"
     local subtitle="$2"
-    if ! notes get "$id" | grep -q '^404 page not found$'; then
-      return
-    fi
     notes put "$id" "$subtitle" "autogenerated content"
   }
 
   one() {
@@ -97,7 +101,6 @@ crawl_with() {
     export CONTENT="$(
       $backend get "$crawlable_source" "$i"
     )"
-    local pid="$(echo $json | jq -r .id)"
     export ID="$(
       local sum="$pid/"
       local title_so_far=""
@@ -128,7 +131,7 @@ is_crawlable() {
   local crawlable_source="$(extract_crawlable_source "$*")"
   # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
   local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
-  echo "$crawlable_source" | grep -q -E "^[ ]*$url_pattern[ ]*$"
+  echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$"
 }
 
 rewrite() {
diff --git a/app/crawler/notes.sh b/app/crawler/notes.sh
index 0972ba8..cb39a49 100644
--- a/app/crawler/notes.sh
+++ b/app/crawler/notes.sh
@@ -69,6 +69,13 @@ notes() (
     _nncurl $NOTES_ADDR/api/v0/files/$1
   }
 
+  del() {
+    local id="$1"
+    _nncurl \
+      -X DELETE \
+      $NOTES_ADDR/api/v0/files/$id
+  }
+
   put() {
     set -u
     local ret=0
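
A minimal sketch of the purge-then-expand flow this patch introduces, assuming
notes.sh is sourced and a notes server is reachable at $NOTES_ADDR; the parent
id "docs" below is hypothetical:

  . app/crawler/notes.sh

  pid="docs"  # hypothetical parent note id
  # Subnote ids live under the "$pid/" prefix, so a prefix grep over
  # `notes ids` selects exactly the autogenerated children to purge.
  for subid in $(notes ids | grep "^$pid/"); do
    notes del "$subid"  # new helper: DELETE $NOTES_ADDR/api/v0/files/$subid
  done
  # With stale children purged up front, notes_mkdir_p can put
  # unconditionally instead of probing for "404 page not found" first.
  notes put "$pid/sub" "sub" "autogenerated content"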
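
The new cut guard in is_crawlable bounds the regex scan to the first 300
characters of the extracted source, so very large note bodies stay cheap to
test; a sketch with hypothetical inputs:

  url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*"
  # Whole line is a URL: crawlable.
  echo "https://example.com/a?b=c" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$" && echo crawlable
  # URL embedded in prose: not crawlable.
  echo "see https://example.com inline" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$" || echo not crawlable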