#!/bin/bash
#
# Helpers for pushing scraped markdown files into a Meilisearch index named
# "scraped", plus a few convenience wrappers for querying that index.
#
# Environment:
#   MEILI    host:port of the Meilisearch instance (default 127.0.0.1:7700)
#   SCRAPED  directory tree holding the scraped files (default ./scraped)
#
# External tools used: curl, jq, sed, awk, find, sort, and `goprintf`.
# NOTE(review): `goprintf` is not defined in this file; it is presumably a
# Go-style printf whose %q emits JSON-compatible quoted strings -- confirm.
#
# Strict mode (`set -e` etc.) is deliberately not enabled: this file may be
# sourced for its functions and must not alter the caller's shell options.

export MEILI="${MEILI:-"127.0.0.1:7700"}"
export SCRAPED="${SCRAPED:-"./scraped"}"

#######################################
# Wrapper around the real curl binary that always sends a JSON content type,
# follows redirects, and terminates the response with a newline.
# Arguments: any curl arguments
# Outputs:   response body followed by a newline on stdout
# Returns:   the exit status of curl (not of the trailing echo)
#######################################
curl() {
  local rc=0
  # `command` bypasses this function so we do not recurse.
  command curl -Ss -L -H 'Content-Type: application/json' "$@" || rc=$?
  echo
  return "$rc"
}

#######################################
# Write a timestamped diagnostic line to stderr.
# Arguments: message words (joined with spaces)
#######################################
log() {
  printf '%s\n' "> $(date) > $*" >&2
}

#######################################
# Fetch the description of the "scraped" index.
# Globals: MEILI (read)
#######################################
meili_index() {
  log index
  curl "$MEILI"/indexes/scraped
}

#######################################
# Run a search against the "scraped" index.
# Globals:   MEILI (read)
# Arguments: query words (joined with spaces into one query string)
#######################################
meili_query() {
  log query: "$*"
  curl "$MEILI"/indexes/scraped/search \
    -X POST \
    -d "$(goprintf '{%q: %q}' "q" "$*")"
}

#######################################
# Summarise the status of queued index updates as "<count> <status>" lines.
# Globals: MEILI (read)
#######################################
meili_queue() {
  log queue
  curl "$MEILI"/indexes/scraped/updates \
    | jq '.[].status' \
    | uniq -c
}

#######################################
# Replace every character outside [a-zA-Z0-9_-] with an underscore.
# Arguments: text (joined with spaces)
# Outputs:   sanitised text on stdout
#######################################
clean_id() {
  printf '%s\n' "$*" | sed 's/[^a-zA-Z0-9_-]/_/g'
}

#######################################
# Strip leading and trailing spaces from each line.
# Arguments: text to trim; when no arguments are given, stdin is used instead
# Outputs:   trimmed text on stdout
#######################################
spaceless() {
  local input="$*"
  if [ "$#" -lt 1 ]; then
    input="$(cat)"
  fi
  printf '%s\n' "$input" | sed -e 's/^ *//' -e 's/ *$//'
}

#######################################
# Private helper: print one section of a markdown file, i.e. the first line
# equal to the given heading line and everything after it up to (excluding)
# the next line that starts with '#'.
# Arguments: $1 - path of the markdown file
#            $2 - the heading line, exactly as it appears in the file
#######################################
_meili_section_of() {
  # The heading travels via the environment rather than `awk -v`, because
  # -v would interpret backslash escapes inside it.
  heading_line="$2" awk '
    !found {
      # Appending "" forces a string (never numeric) comparison.
      if (($0 "") == (ENVIRON["heading_line"] "")) { found = 1; print }
      next
    }
    /^#/ { exit }
    { print }
  ' "$1"
}

#######################################
# Walk every file under $SCRAPED (in sorted order) and submit one document
# per heading section to the "scraped" index.
#
# Every line starting with '#' (any heading level, including the first one)
# opens a section. The document content is "# <h1>", "## <heading>" and the
# section text, truncated to 25 lines; sections whose content has fewer than
# 25 words are skipped. The document id is built from the file's path
# relative to $SCRAPED, the h1 text and the section heading, so identical
# headings in different files no longer overwrite each other.
#
# Globals: SCRAPED (read), MEILI (read)
# Outputs: progress on stderr via log; responses from the index on stdout
# Limitation: paths containing newlines are not supported.
#######################################
scraped_to_meili() {
  local md_path file_id h1 heading_line h2 content word_count id

  find "$SCRAPED" -type f \
    | sort \
    | while IFS= read -r md_path; do
      # Path relative to the scrape root; keeps ids unique across files.
      file_id="${md_path#"$SCRAPED"}"
      file_id="${file_id#/}"

      h1="$(grep '^#' "$md_path" | head -n 1 | sed 's/^##*//' | spaceless)"

      grep '^#' "$md_path" | while IFS= read -r heading_line; do
        h2="$(printf '%s\n' "$heading_line" | sed 's/^##*//' | spaceless)"

        content="$(
          {
            printf '%s\n' "# $h1" "" "## $h2" ""
            _meili_section_of "$md_path" "$heading_line"
          } | head -n 25
        )"

        word_count="$(printf '%s\n' "$content" | wc -w)"
        if ((word_count < 25)); then
          log "skipping content because less than 25 words found: $content"
          continue
        fi

        id="$(clean_id "${file_id}_${h1}_${h2}")"
        log "id=$id"
        log "h1=$h1"
        log "h2=$h2"
        log "content=${#content}"
        log "submitting $id/$h1/$h2"
        curl -sS \
          "$MEILI"/indexes/scraped/documents \
          -X POST \
          -d "$(
            goprintf '[{%q: %q, %q: %q}]' \
              "id" \
              "$id" \
              "content" \
              "$content"
          )"
      done
    done
}

# When invoked without arguments, offer to ingest. Any answer containing the
# letter "y" (case-insensitive) is treated as consent. The subshell keeps the
# answer variable out of the surrounding shell.
if [ "$#" -eq 0 ]; then
  if (
    read -r -p "would you like to ingest from $SCRAPED? " yn
    printf '%s\n' "$yn" | grep -q -i y
  ); then
    scraped_to_meili
    echo $?
  fi
fi