#!/bin/bash
#
# Helper functions for pushing scraped Markdown files into the "scraped"
# index of a Meili HTTP server and for querying that index.
#
# Environment (both may be overridden by the caller before this file runs):
#   MEILI    host:port used as the base of every request URL
#            (default: 127.0.0.1:7700)
#   SCRAPED  directory that is searched recursively for scraped files
#            (default: ./scraped)

export MEILI="${MEILI:-"127.0.0.1:7700"}"
export SCRAPED="${SCRAPED:-"./scraped"}"
#######################################
# Wrapper that shadows the curl executable for the rest of this file.
# Always runs curl with -Ss (no progress meter, still print errors),
# -L (follow redirects) and a JSON Content-Type header, then terminates
# the output with a newline.
# Arguments: any further curl arguments (URL, -X, -d, ...), passed verbatim.
# Outputs:   curl's output followed by one newline on stdout.
# Returns:   the exit status of the curl executable.
#######################################
curl() {
  local status=0

  # `command` bypasses this function and runs the curl found on PATH,
  # without the word-splitting problems of an unquoted $(which curl).
  command curl -Ss -L -H 'Content-Type: application/json' "$@" || status=$?
  echo
  # Propagate curl's status; a bare trailing `echo` would always report 0.
  return "$status"
}
#######################################
# Write a timestamped diagnostic line to stderr.
# Arguments: message words; they are joined with single spaces.
# Outputs:   "> <output of date> > <message>" on stderr, nothing on stdout.
#######################################
log() {
  # printf keeps the message literal regardless of echo option handling.
  printf '> %s > %s\n' "$(date)" "$*" >&2
}
#######################################
# Request $MEILI/indexes/scraped through the curl wrapper and print the
# response.
# Globals:   MEILI (read)
# Outputs:   the server response on stdout; a log line on stderr.
#######################################
meili_index() {
  log index
  curl "${MEILI}/indexes/scraped"
}
#######################################
# POST a search request to $MEILI/indexes/scraped/search.
# Globals:   MEILI (read)
# Arguments: the query words; they are joined with single spaces into one
#            query string.
# Outputs:   the server response on stdout; a log line on stderr.
#######################################
meili_query() {
  local body

  log query: "$*"
  # goprintf is an external helper; it is given a %q-style format to build
  # the {"q": "<query>"} request body.
  # NOTE(review): assumes its %q output is valid JSON for every query - confirm.
  body="$(goprintf '{%q: %q}' "q" "$*")"
  curl "${MEILI}/indexes/scraped/search" -X POST -d "$body"
}
#######################################
# Summarise the statuses returned by $MEILI/indexes/scraped/updates.
# Globals:   MEILI (read)
# Outputs:   one "<count> <status>" line per run of identical consecutive
#            statuses on stdout; a log line on stderr.
#######################################
meili_queue() {
  log queue
  # The jq filter is quoted so the shell never treats `[]` as a glob.
  # uniq -c only merges adjacent duplicates; the input is deliberately left
  # unsorted, so the same status can appear in more than one run.
  curl "${MEILI}/indexes/scraped/updates" \
    | jq '.[].status' \
    | uniq -c
}
#######################################
# Turn arbitrary text into an identifier made only of letters, digits,
# "_" and "-".
# Arguments: the text; multiple arguments are joined with single spaces.
# Outputs:   the text with every other character replaced by "_".
#######################################
clean_id() {
  # printf, not echo: input such as "-n" or "-e" must be treated as data.
  printf '%s\n' "$*" | sed 's/[^a-zA-Z0-9_-]/_/g'
}
#######################################
# Split every file under $SCRAPED into one document per "## " section and
# POST each document to $MEILI/indexes/scraped/documents.
#
# For each file (processed in sorted path order):
#   - the first line, minus a leading "# ", is used as the title (h1);
#   - for every line starting with "## ", a document is built from the
#     title, the section heading, and at most 25 lines of the section body
#     (the lines after the first line that starts with that heading, up to
#     the next line starting with "## ");
#   - documents with fewer than 5 lines in total (i.e. an empty section
#     body) are skipped.
#
# Globals:   SCRAPED (read), MEILI (read)
# Outputs:   server responses on stdout; progress lines on stderr via log.
#######################################
scraped_to_meili() {
  local md_path file_id h1 h2 content id line_count

  find "$SCRAPED" -type f \
    | sort \
    | while IFS= read -r md_path; do
      file_id="$(clean_id "$md_path")"
      h1="$(head -n 1 "$md_path" | sed 's/^# //')"

      # No IFS= here on purpose: surrounding whitespace of the heading text
      # is trimmed, exactly as before, so generated ids stay the same.
      grep '^## ' "$md_path" | sed 's/^## //' | while read -r h2; do
        # The heading is handed to awk through the environment and compared
        # as a literal prefix. Splicing it into a sed regex broke on
        # headings containing "/", ".", "*", "[" and similar characters.
        content="$(
          printf '# %s\n\n## %s\n\n' "$h1" "$h2"
          H2="$h2" awk '
            BEGIN { target = "## " ENVIRON["H2"] }
            !found { if (index($0, target) == 1) found = 1; next }
            index($0, "## ") == 1 { exit }
            { print; if (++n >= 25) exit }
          ' < "$md_path"
        )"

        # Command substitution strips trailing newlines, so a section with
        # no body collapses to 3 lines and is skipped here.
        line_count="$(printf '%s\n' "$content" | wc -l)"
        if (( line_count < 5 )); then
          continue
        fi

        id="$(clean_id "${file_id}_${h1}_${h2}")"
        log "id=$id"
        log "h1=$h1"
        log "h2=$h2"
        log "content=${#content}"

        # goprintf is an external helper that builds the request body from
        # a %q-style format.
        # NOTE(review): assumes its %q output is valid JSON - confirm.
        curl -sS \
          "${MEILI}/indexes/scraped/documents" \
          -X POST \
          -d "$(goprintf '[{%q: %q, %q: %q}]' "id" "$id" "content" "$content")"
      done
    done
}