update to meili for new scrape format

master
bel 2022-07-03 21:18:20 -06:00
parent 260815e53c
commit f878486d80
1 changed file with 36 additions and 10 deletions

View File

@ -31,24 +31,41 @@ clean_id() {
echo "$*" | sed 's/[^a-zA-Z0-9_-]/_/g' echo "$*" | sed 's/[^a-zA-Z0-9_-]/_/g'
} }
# Strip leading and trailing spaces from every line of the input.
# Arguments: the text to trim (all arguments are joined with "$*").
#            With no arguments, the text is read from stdin instead.
# Outputs:   the trimmed text on stdout, newline-terminated.
# Note:      only the space character is trimmed; tabs are left alone.
spaceless() {
  local input
  if [ "$#" -lt 1 ]; then
    input="$(cat)"
  else
    input="$*"
  fi
  # printf rather than echo: echo would treat an input such as "-n" or "-e"
  # as an option and print nothing useful.
  printf '%s\n' "$input" | sed -e 's/^ *//' -e 's/ *$//'
}
scraped_to_meili() { scraped_to_meili() {
find "$SCRAPED" -type f \ find "$SCRAPED" -type f \
| sort \ | sort \
| while read -r md_path; do | while read -r md_path; do
file_id="$(clean_id "$md_path")" h1="$(grep '^#' "$md_path" | head -n 1 | sed 's/^##*//' | spaceless)"
h1="$(head -n 1 "$md_path" | sed 's/^# //')" grep '^##*' "$md_path" | while read -r h2_line; do
grep '^## ' "$md_path" | sed 's/^## //' | while read -r h2; do local h2="$(echo "$h2_line" | sed 's/^##*//' | spaceless)"
local content="$( local content="$(
echo "# $h1" echo "# $h1"
echo echo ""
echo "## $h2" echo "## $h2"
echo echo ""
cat "$md_path" \ found_h2=false
| sed -e '0,/^\#\# '"$h2"'/d' \ found_stopper=false
| sed -e '/^\#\# .*/,$d' \ cat "$md_path" | while read -r line; do
| head -n 25 if ! $found_h2 && [ "$line" == "$h2_line" ]; then
found_h2=true
elif $found_h2 && [ "$line" != "${line#"#"}" ]; then
found_stopper=true
fi
if $found_stopper; then continue; fi
if $found_h2; then echo "$line"; fi
done
)" )"
if [ $(echo "$content" | wc -l | awk '{print $NF}') -lt 5 ]; then content="$(echo "$content" | head -n 25)"
if [ $(echo "$content" | wc -w | awk '{print $NF}') -lt 25 ]; then
log "skipping content because less than 25 words found: $content"
continue continue
fi fi
local id="$(clean_id "${file_id}_${h1}_${h2}")" local id="$(clean_id "${file_id}_${h1}_${h2}")"
@ -56,6 +73,7 @@ scraped_to_meili() {
log h1=$h1 log h1=$h1
log h2=$h2 log h2=$h2
log content="${#content}" log content="${#content}"
log "submitting $id/$h1/$h2"
curl -sS \ curl -sS \
"$MEILI"/indexes/scraped/documents \ "$MEILI"/indexes/scraped/documents \
-X POST \ -X POST \
@ -69,3 +87,11 @@ scraped_to_meili() {
done done
done done
} }
# Entry point: when the script is run with no arguments, ask for confirmation
# before ingesting from $SCRAPED, then run scraped_to_meili and print its
# exit status.
if [ "$#" -eq 0 ]; then
  # The prompt runs in a subshell so that `yn` does not leak into the script's
  # global scope. Only replies that START with y/Y count as consent; a reply
  # that merely contains a "y" (e.g. "nay", "no way", "not yet") is a refusal.
  if (read -r -p "would you like to ingest from $SCRAPED? " yn; [[ "$yn" == [yY]* ]]); then
    scraped_to_meili
    echo "$?"
  fi
fi