update to meili for new scrape format
parent
260815e53c
commit
f878486d80
46
to_meili.sh
46
to_meili.sh
|
|
@ -31,24 +31,41 @@ clean_id() {
|
||||||
echo "$*" | sed 's/[^a-zA-Z0-9_-]/_/g'
|
echo "$*" | sed 's/[^a-zA-Z0-9_-]/_/g'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Trim leading and trailing spaces from every line of the input.
# Arguments: the text to trim (all arguments joined via "$*"); when no
#            arguments are given, the text is read from stdin instead.
# Outputs:   the trimmed text on stdout.
spaceless() {
  local input="$*"
  if [ "$#" -lt 1 ]; then
    input="$(cat)"
  fi
  # printf instead of echo: echo would treat text such as "-n" or "-e" as an
  # option and print nothing (or interpret escapes) instead of the text.
  printf '%s\n' "$input" | sed -e 's/^ *//' -e 's/ *$//'
}
|
||||||
|
|
||||||
scraped_to_meili() {
|
scraped_to_meili() {
|
||||||
find "$SCRAPED" -type f \
|
find "$SCRAPED" -type f \
|
||||||
| sort \
|
| sort \
|
||||||
| while read -r md_path; do
|
| while read -r md_path; do
|
||||||
file_id="$(clean_id "$md_path")"
|
h1="$(grep '^#' "$md_path" | head -n 1 | sed 's/^##*//' | spaceless)"
|
||||||
h1="$(head -n 1 "$md_path" | sed 's/^# //')"
|
grep '^##*' "$md_path" | while read -r h2_line; do
|
||||||
grep '^## ' "$md_path" | sed 's/^## //' | while read -r h2; do
|
local h2="$(echo "$h2_line" | sed 's/^##*//' | spaceless)"
|
||||||
local content="$(
|
local content="$(
|
||||||
echo "# $h1"
|
echo "# $h1"
|
||||||
echo
|
echo ""
|
||||||
echo "## $h2"
|
echo "## $h2"
|
||||||
echo
|
echo ""
|
||||||
cat "$md_path" \
|
found_h2=false
|
||||||
| sed -e '0,/^\#\# '"$h2"'/d' \
|
found_stopper=false
|
||||||
| sed -e '/^\#\# .*/,$d' \
|
cat "$md_path" | while read -r line; do
|
||||||
| head -n 25
|
if ! $found_h2 && [ "$line" == "$h2_line" ]; then
|
||||||
|
found_h2=true
|
||||||
|
elif $found_h2 && [ "$line" != "${line#"#"}" ]; then
|
||||||
|
found_stopper=true
|
||||||
|
fi
|
||||||
|
if $found_stopper; then continue; fi
|
||||||
|
if $found_h2; then echo "$line"; fi
|
||||||
|
done
|
||||||
)"
|
)"
|
||||||
if [ $(echo "$content" | wc -l | awk '{print $NF}') -lt 5 ]; then
|
content="$(echo "$content" | head -n 25)"
|
||||||
|
if [ $(echo "$content" | wc -w | awk '{print $NF}') -lt 25 ]; then
|
||||||
|
log "skipping content because less than 25 words found: $content"
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
local id="$(clean_id "${file_id}_${h1}_${h2}")"
|
local id="$(clean_id "${file_id}_${h1}_${h2}")"
|
||||||
|
|
@ -56,6 +73,7 @@ scraped_to_meili() {
|
||||||
log h1=$h1
|
log h1=$h1
|
||||||
log h2=$h2
|
log h2=$h2
|
||||||
log content="${#content}"
|
log content="${#content}"
|
||||||
|
log "submitting $id/$h1/$h2"
|
||||||
curl -sS \
|
curl -sS \
|
||||||
"$MEILI"/indexes/scraped/documents \
|
"$MEILI"/indexes/scraped/documents \
|
||||||
-X POST \
|
-X POST \
|
||||||
|
|
@ -69,3 +87,11 @@ scraped_to_meili() {
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Entry point: when the script is run without arguments, ask for confirmation
# and, if the reply contains a "y" (either case), ingest from $SCRAPED.
if [ "$#" -eq 0 ]; then
  # The prompt runs in a subshell so the reply variable does not leak into the
  # script; -r keeps any backslashes in the reply literal.
  if (read -r -p "would you like to ingest from $SCRAPED? " yn; printf '%s\n' "$yn" | grep -q -i y); then
    scraped_to_meili
    # Print the exit status of the ingest run.
    echo $?
  fi
fi
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue