diff --git a/to_meili.sh b/to_meili.sh
index e65ec10..d270b14 100644
--- a/to_meili.sh
+++ b/to_meili.sh
@@ -31,24 +31,47 @@ clean_id() {
   echo "$*" | sed 's/[^a-zA-Z0-9_-]/_/g'
 }
 
+# spaceless: strip leading and trailing spaces from each line of its input.
+# Input:  the arguments joined by "$*", or stdin when called with no arguments.
+# Output: the trimmed text on stdout.
+spaceless() {
+  local input="$*"
+  if [ "$#" -lt 1 ]; then
+    input="$(cat)"
+  fi
+  printf '%s\n' "$input" | sed -e 's/^ *//' -e 's/ *$//'
+}
+
 scraped_to_meili() {
   find "$SCRAPED" -type f \
     | sort \
     | while read -r md_path; do
       file_id="$(clean_id "$md_path")"
-      h1="$(head -n 1 "$md_path" | sed 's/^# //')"
-      grep '^## ' "$md_path" | sed 's/^## //' | while read -r h2; do
+      # h1: text of the first heading of any level, without its '#' marks.
+      h1="$(grep '^#' "$md_path" | head -n 1 | sed 's/^##*//' | spaceless)"
+      # Visit every heading line; '^##*' matches one or more leading '#'.
+      grep '^##*' "$md_path" | while read -r h2_line; do
+        local h2="$(echo "$h2_line" | sed 's/^##*//' | spaceless)"
+        # content: both headings, then the lines that follow the first line
+        # equal to h2_line, up to the next line that starts with '#'.
         local content="$(
           echo "# $h1"
-          echo
+          echo ""
           echo "## $h2"
-          echo
-          cat "$md_path" \
-            | sed -e '0,/^\#\# '"$h2"'/d' \
-            | sed -e '/^\#\# .*/,$d' \
-            | head -n 25
+          echo ""
+          found_h2=false
+          while read -r line; do
+            if ! $found_h2; then
+              if [ "$line" = "$h2_line" ]; then found_h2=true; fi
+              continue
+            fi
+            if [ "$line" != "${line#"#"}" ]; then break; fi
+            printf '%s\n' "$line"
+          done < "$md_path"
         )"
-        if [ $(echo "$content" | wc -l | awk '{print $NF}') -lt 5 ]; then
+        content="$(echo "$content" | head -n 25)"
+        if [ "$(echo "$content" | wc -w | awk '{print $NF}')" -lt 25 ]; then
+          log "skipping content because less than 25 words found: $content"
           continue
         fi
         local id="$(clean_id "${file_id}_${h1}_${h2}")"
@@ -56,6 +79,7 @@ scraped_to_meili() {
         log h1=$h1
         log h2=$h2
         log content="${#content}"
+        log "submitting $id/$h1/$h2"
         curl -sS \
           "$MEILI"/indexes/scraped/documents \
           -X POST \
@@ -69,3 +93,12 @@ scraped_to_meili() {
       done
     done
 }
+
+# When the script is run with no arguments, ask for confirmation and, if the
+# answer starts with y or Y, run scraped_to_meili and print its exit status.
+if [ "$#" == 0 ]; then
+  if (read -r -p "would you like to ingest from $SCRAPED? " yn; printf '%s\n' "$yn" | grep -q -i '^ *y'); then
+    scraped_to_meili
+    echo $?
+  fi
+fi