to meili splits into h2s
parent
b3d26a88c2
commit
ffbc2e285a
48
to_meili.sh
48
to_meili.sh
|
|
@ -27,23 +27,45 @@ meili_queue() {
|
|||
curl "$MEILI"/indexes/scraped/updates
|
||||
}
|
||||
|
||||
# Turn arbitrary text into a string that is safe to use as a document id.
#
# Arguments: any number of words; they are joined into one string with the
#            first character of IFS (a space by default).
# Outputs:   the joined text on stdout, with every character outside
#            a-z, A-Z, 0-9, "_" and "-" replaced by "_".
clean_id() {
  # printf rather than echo: echo would treat an argument such as "-n" or
  # "-e" as an option and print nothing, yielding an empty id.
  printf '%s\n' "$*" | sed 's/[^a-zA-Z0-9_-]/_/g'
}
|
||||
|
||||
# Index every scraped markdown file into Meilisearch, one document per "## "
# section.
#
# Each regular file under $SCRAPED is processed in sorted order. The first
# line of the file is used as the H1 title and every "## " heading starts a
# section. A section's document contains the H1, the H2 and up to 25 lines of
# the text following that heading, stopping at the next "## " heading.
# Sections with no body text are skipped.
#
# Globals:  SCRAPED - directory tree to index (read)
#           MEILI   - URL prefix that /indexes/scraped/documents is appended
#                     to (read)
# Requires: clean_id, log and goprintf, provided elsewhere in the script.
# Outputs:  whatever log and curl print.
scraped_to_meili() {
  local md_path file_id h1 h2 content id line_count

  find "$SCRAPED" -type f \
    | sort \
    | while IFS= read -r md_path; do
      file_id="$(clean_id "$md_path")"
      h1="$(head -n 1 "$md_path" | sed 's/^# //')"

      grep '^## ' "$md_path" | sed 's/^## //' | while read -r h2; do
        # The heading is handed to awk through the environment and compared
        # as a literal string, so titles containing "/" or regex
        # metacharacters cannot break or alter the match.
        content="$(
          printf '# %s\n\n## %s\n\n' "$h1" "$h2"
          SECTION="$h2" awk '
            !in_section {
              if (/^## /) {
                title = substr($0, 4)
                gsub(/^[ \t]+|[ \t]+$/, "", title)
                if (title == ENVIRON["SECTION"]) in_section = 1
              }
              next
            }
            /^## / { exit }
            { print; if (++printed == 25) exit }
          ' "$md_path"
        )"

        # The four header lines always exist; command substitution strips
        # trailing newlines, so fewer than 5 lines means an empty section.
        line_count="$(printf '%s\n' "$content" | awk 'END { print NR }')"
        if [ "$line_count" -lt 5 ]; then
          continue
        fi

        id="$(clean_id "${file_id}_${h1}_${h2}")"
        log "id=$id"
        log "h1=$h1"
        log "h2=$h2"
        log "content=${#content}"

        curl -sS \
          "$MEILI"/indexes/scraped/documents \
          -X POST \
          -d "$(
            goprintf '[{%q: %q, %q: %q}]' \
              "id" \
              "$id" \
              "content" \
              "$content"
          )"
      done
    done
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue