From ffbc2e285a2af95792ed2bad6f793c544a5bb1e7 Mon Sep 17 00:00:00 2001
From: Bel LaPointe
Date: Wed, 15 Dec 2021 16:05:57 -0700
Subject: [PATCH] to meili splits into h2s

---
Review notes (text between the "---" line and the diff is not applied):
- Line breaks were restored; the patch had been flattened onto one line.
  Indentation was rebuilt as two spaces per level, so applying it may need
  `git apply --ignore-whitespace` if the tree is indented differently.
- Section extraction now uses a literal awk prefix match instead of splicing
  the heading into a GNU-only `0,/re/` sed address, which broke on headings
  containing "/" or regex metacharacters.
- Expansions passed to `log` and `[` are quoted, `local` declarations are
  split from their command substitutions, and clean_id uses printf.
- The index line below still carries the original post-image blob id.

 to_meili.sh | 60 +++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 47 insertions(+), 13 deletions(-)

diff --git a/to_meili.sh b/to_meili.sh
index 2a1deef..915979c 100644
--- a/to_meili.sh
+++ b/to_meili.sh
@@ -27,23 +27,57 @@ meili_queue() {
   curl "$MEILI"/indexes/scraped/updates
 }
 
+# clean_id prints its arguments joined by spaces, with every character
+# outside [a-zA-Z0-9_-] replaced by "_".
+clean_id() {
+  printf '%s\n' "$*" | sed 's/[^a-zA-Z0-9_-]/_/g'
+}
+
+# scraped_to_meili POSTs one document per "## " section of every file under
+# $SCRAPED to $MEILI/indexes/scraped/documents. A document's content is the
+# file's first line re-emitted as "# h1", the "## h2" heading, and up to the
+# first 25 lines of that section; sections with an empty body are skipped.
 scraped_to_meili() {
   find "$SCRAPED" -type f \
   | sort \
   | while read -r md_path; do
-    id="$(echo "$md_path" | sed 's/[^a-zA-Z0-9_-]/_/g')"
-    content="$(cat "$md_path")"
-    echo md_path=$md_path, content=${#content}
-    curl -sS \
-      "$MEILI"/indexes/scraped/documents \
-      -X POST \
-      -d "$(
-        goprintf '[{%q: %q, %q: %q}]' \
-          "id" \
-          "$id" \
-          "content" \
-          "$content"
+    local file_id h1
+    file_id="$(clean_id "$md_path")"
+    h1="$(head -n 1 "$md_path" | sed 's/^# //')"
+    grep '^## ' "$md_path" | sed 's/^## //' | while read -r h2; do
+      local content id
+      # Select the section with a literal prefix match in awk rather than
+      # splicing $h2 into a sed regex; $h2 travels through the environment
+      # so awk applies no escape processing to it.
+      content="$(
+        printf '# %s\n\n## %s\n\n' "$h1" "$h2"
+        H2="$h2" awk '
+          found && /^## / { exit }
+          found { print }
+          !found && index($0, "## " ENVIRON["H2"]) == 1 { found = 1 }
+        ' "$md_path" \
+        | head -n 25
       )"
-    break
+      # With an empty section only the header lines remain, so fewer than
+      # five lines means there is nothing to index.
+      if [ "$(printf '%s\n' "$content" | awk 'END { print NR }')" -lt 5 ]; then
+        continue
+      fi
+      id="$(clean_id "${file_id}_${h1}_${h2}")"
+      log "id=$id"
+      log "h1=$h1"
+      log "h2=$h2"
+      log "content=${#content}"
+      curl -sS \
+        "$MEILI"/indexes/scraped/documents \
+        -X POST \
+        -d "$(
+          goprintf '[{%q: %q, %q: %q}]' \
+            "id" \
+            "$id" \
+            "content" \
+            "$content"
+        )"
+    done
   done
 }