notea-de-me/crawler/google.sh

77 lines
1.4 KiB
Bash

#! /bin/bash
google() (
_is_slides() {
echo "$@" | grep -q 'docs.google.com.presentation'
}
_is_sheets() {
echo "$@" | grep -q 'docs.google.com.spreadsheets'
}
_is_doc() {
echo "$@" | grep -q 'docs.google.com.document'
}
is() {
_is_sheets "$@" || _is_doc "$@" || _is_slides "$@"
}
human_url() {
echo "$1"
}
get() {
local url="$1"
local id="${url%/*}"
id="${id##*/}"
local downloaded="$(rclone get_google "$id")"
echo "# ${downloaded##*/}"
echo ""
if [ "${downloaded##*.}" == "csv" ]; then
_csv_to_md "$downloaded"
elif [ "${downloaded##*.}" == "html" ]; then
_html_to_md "$downloaded"
else
cat "$downloaded"
fi
}
_html_to_md() {
which pandoc &> /dev/null
local f="$1"
#log f=$f
cat "$f" \
| sed 's/.*<body/<body/' \
| sed 's/<\/body>.*/<\/body>/' \
| sed 's/<[\/]*span[^>]*>//g' \
| perl -pe 's|<div class="c[0-9][0-9]*">.*?<\/div>||g' \
| sed 's/<\([a-z][a-z]*\)[^>]*/<\1/g' \
| pandoc - -f html -t commonmark -s -o - \
| sed 's/^<[\/]*div>$//g'
}
_csv_to_md() {
local f="$1"
(
head -n 1 "$f"
head -n 1 "$f" \
| sed 's/^[^,][^,]*/--- /' \
| sed 's/[^,][^,]*$/ ---/' \
| sed 's/,[^,][^,]*/, --- /g' \
| sed 's/[^|]$/|/'
tail -n +2 "$f"
) \
| grep . \
| sed 's/,/ | /g' \
| sed 's/^/| /'
}
expand() {
get "$@" | head -n 1 | sed 's/^[#]* //' | base64
}
"$@"
)