dnd-beyond-scrape/to_meili.sh

98 lines
2.4 KiB
Bash

#! /bin/bash
# Connection target for Meilisearch requests and the directory searched for
# scraped files. Either may be preset in the environment; an unset or empty
# value falls back to the default below.
: "${MEILI:=127.0.0.1:7700}"
: "${SCRAPED:=./scraped}"
export MEILI SCRAPED
# Wrapper around the curl binary used for every request in this script.
# Adds -S -s (no progress meter, but still report errors), -L (follow
# redirects) and a JSON Content-Type header, then forwards all arguments.
# Arguments: any curl arguments (URL, -X, -d, ...)
# Outputs:   the response body followed by one newline
# Returns:   the status of the trailing echo (curl's own status is not kept)
curl() {
  # `command` bypasses this function and runs the curl found on PATH. Unlike
  # an unquoted $(which curl) it does not word-split a path that contains
  # spaces and does not depend on the non-standard `which` utility.
  command curl -Ss -L -H 'Content-Type: application/json' "$@"
  echo
}
# Write a timestamped diagnostic line to stderr in the form
# "> <date output> > <arguments joined by spaces>".
# Arguments: the words of the message
# Outputs:   one line on stderr; nothing on stdout
log() {
  local stamp
  stamp="$(date)"
  printf '> %s > %s\n' "$stamp" "$*" >&2
}
# Request $MEILI/indexes/scraped and print the server's response.
# Globals: MEILI (read)
# Outputs: response on stdout; a log line on stderr
meili_index() {
  log index
  local endpoint="${MEILI}/indexes/scraped"
  curl "$endpoint"
}
# POST a search request for the given words to $MEILI/indexes/scraped/search.
# All arguments are joined with spaces and sent as the value of the "q" key;
# the JSON body is produced by the external `goprintf` tool.
# Globals:   MEILI (read)
# Arguments: the search terms
# Outputs:   response on stdout; a log line on stderr
meili_query() {
  log query: "$*"
  local payload
  payload="$(goprintf '{%q: %q}' "q" "$*")"
  curl "${MEILI}/indexes/scraped/search" -X POST -d "$payload"
}
# Summarise the update queue of the "scraped" index: fetch
# $MEILI/indexes/scraped/updates, extract each entry's status with jq and
# count runs of identical consecutive statuses with `uniq -c`.
# Globals: MEILI (read)
# Outputs: "<count> <status>" lines on stdout; a log line on stderr
meili_queue() {
  log queue
  curl "${MEILI}/indexes/scraped/updates" \
    | jq '.[].status' \
    | uniq -c
}
# Turn arbitrary text into an identifier made only of letters, digits, '_'
# and '-': every other character is replaced by '_'.
# Arguments: the text (multiple arguments are joined with spaces first)
# Outputs:   the sanitised identifier on stdout
clean_id() {
  # printf rather than echo: echo would treat input such as "-n" or "-e" as
  # an option and print nothing.
  printf '%s\n' "$*" | sed 's/[^a-zA-Z0-9_-]/_/g'
}
# Strip leading and trailing spaces from every line of the input.
# Arguments: the text to trim; when no argument is given the text is read
#            from stdin instead
# Outputs:   the trimmed text on stdout
spaceless() {
  local input
  if [ "$#" -gt 0 ]; then
    input="$*"
  else
    input="$(cat)"
  fi
  # printf rather than echo so that input such as "-n" or "-e" is printed
  # instead of being consumed as an echo option.
  printf '%s\n' "$input" | sed -e 's/^ *//' -e 's/ *$//'
}
# Print one section of a markdown file: the first line equal to the given
# heading line, followed by every line up to (not including) the next line
# that starts with '#'.
# Arguments: $1 - path of the markdown file
#            $2 - heading line as returned by `read -r` (leading/trailing
#                 whitespace already removed)
# Outputs:   the section's lines on stdout, each with leading/trailing
#            whitespace removed; nothing if the heading line is not found
_scraped_section() {
  local md_path="$1" heading="$2" line in_section=false
  # `|| [ -n "$line" ]` keeps a final line that has no trailing newline.
  while read -r line || [ -n "$line" ]; do
    if ! "$in_section"; then
      if [ "$line" == "$heading" ]; then
        in_section=true
        printf '%s\n' "$line"
      fi
    elif [ "$line" != "${line#"#"}" ]; then
      # A following heading of any level ends the section.
      break
    else
      printf '%s\n' "$line"
    fi
  done < "$md_path"
}

# Submit the files under $SCRAPED to the "scraped" index, one document per
# heading line.
#
# For every regular file (in sorted path order) the first heading line gives
# the title. Every line starting with '#' then yields one candidate document
# whose content is "# <title>", "## <heading>" and the section text, truncated
# to 25 lines. Candidates with fewer than 25 words are logged and skipped.
# The document id is built from the file path relative to $SCRAPED, the title
# and the heading, so equal headings in different files get distinct ids.
#
# Globals: SCRAPED (read), MEILI (read)
# Outputs: the server responses on stdout; progress messages on stderr
# Returns: the status of the file loop
scraped_to_meili() {
  local md_path
  find "$SCRAPED" -type f \
    | sort \
    | while IFS= read -r md_path; do
      # Path relative to $SCRAPED, used as the per-file part of the id.
      local file_id="${md_path#"$SCRAPED"}"
      file_id="${file_id#/}"
      local h1
      h1="$(grep '^#' "$md_path" | head -n 1 | sed 's/^##*//' | spaceless)"
      local h2_line
      grep '^##*' "$md_path" | while read -r h2_line; do
        local h2 content words id
        h2="$(printf '%s\n' "$h2_line" | sed 's/^##*//' | spaceless)"
        content="$(
          printf '# %s\n\n## %s\n\n' "$h1" "$h2"
          _scraped_section "$md_path" "$h2_line"
        )"
        content="$(printf '%s\n' "$content" | head -n 25)"
        words="$(printf '%s\n' "$content" | wc -w)"
        # $((...)) normalises the padded number some wc implementations print.
        if [ "$((words))" -lt 25 ]; then
          log "skipping content because less than 25 words found: $content"
          continue
        fi
        id="$(clean_id "${file_id}_${h1}_${h2}")"
        # Quoted so heading text is logged verbatim instead of being
        # word-split and glob-expanded.
        log "id=$id"
        log "h1=$h1"
        log "h2=$h2"
        log "content=${#content}"
        log "submitting $id/$h1/$h2"
        curl \
          "$MEILI"/indexes/scraped/documents \
          -X POST \
          -d "$(
            goprintf '[{%q: %q, %q: %q}]' \
              "id" \
              "$id" \
              "content" \
              "$content"
          )"
      done
    done
}
# When the script is started without arguments, ask for confirmation and, if
# given, ingest everything under $SCRAPED and print scraped_to_meili's exit
# status.
if [ "$#" -eq 0 ]; then
  # The subshell keeps `yn` out of the surrounding shell.
  if (
    read -r -p "would you like to ingest from $SCRAPED? " yn
    # Only an answer that begins with y/Y counts as consent; matching "y"
    # anywhere would also accept answers such as "nay" or "no way".
    [[ "$yn" == [yY]* ]]
  ); then
    scraped_to_meili
    echo $?
  fi
fi