to meili splits into h2s

master
Bel LaPointe 2021-12-15 16:05:57 -07:00
parent b3d26a88c2
commit ffbc2e285a
1 changed files with 35 additions and 13 deletions

View File

@ -27,13 +27,35 @@ meili_queue() {
curl "$MEILI"/indexes/scraped/updates curl "$MEILI"/indexes/scraped/updates
} }
# Turn arbitrary text into an id-safe string.
# Arguments: any number of words; they are joined into one string via "$*"
#            (i.e. separated by the first character of IFS).
# Outputs:   the joined text on stdout, with every character other than
#            a-z, A-Z, 0-9, "_" and "-" replaced by "_".
clean_id() {
  # printf instead of echo: echo would interpret inputs such as "-n" or "-e"
  # as its own options and print nothing for them.
  printf '%s\n' "$*" | sed 's/[^a-zA-Z0-9_-]/_/g'
}
scraped_to_meili() { scraped_to_meili() {
find "$SCRAPED" -type f \ find "$SCRAPED" -type f \
| sort \ | sort \
| while read -r md_path; do | while read -r md_path; do
id="$(echo "$md_path" | sed 's/[^a-zA-Z0-9_-]/_/g')" file_id="$(clean_id "$md_path")"
content="$(cat "$md_path")" h1="$(head -n 1 "$md_path" | sed 's/^# //')"
echo md_path=$md_path, content=${#content} grep '^## ' "$md_path" | sed 's/^## //' | while read -r h2; do
local content="$(
echo "# $h1"
echo
echo "## $h2"
echo
cat "$md_path" \
| sed -e '0,/^\#\# '"$h2"'/d' \
| sed -e '/^\#\# .*/,$d' \
| head -n 25
)"
if [ $(echo "$content" | wc -l | awk '{print $NF}') -lt 5 ]; then
continue
fi
local id="$(clean_id "${file_id}_${h1}_${h2}")"
log id=$id
log h1=$h1
log h2=$h2
log content="${#content}"
curl -sS \ curl -sS \
"$MEILI"/indexes/scraped/documents \ "$MEILI"/indexes/scraped/documents \
-X POST \ -X POST \
@ -44,6 +66,6 @@ scraped_to_meili() {
"content" \ "content" \
"$content" "$content"
)" )"
break done
done done
} }