15 Commits

Author SHA1 Message Date
Bel LaPointe
eadc4080b1 google slides works enough for search 2022-02-16 16:14:31 -07:00
Bel LaPointe
9219e3656b cache tree traversal for both full and meta on disk as json 2022-02-16 16:09:02 -07:00
Bel LaPointe
76d67cff7a remove tree.cached...root as it wasnt used in server notable 2022-02-16 15:56:17 -07:00
Bel LaPointe
b076b6a9cf grr cached root doesnt matter because server.tree called each time 2022-02-16 15:47:35 -07:00
Bel LaPointe
2781114863 only look at first 1kb of data.yaml when building tree 2022-02-16 15:21:51 -07:00
Bel LaPointe
51a8c8b425 editor loads content as initial 2022-02-16 15:13:49 -07:00
Bel LaPointe
8c87cdf0b2 simplify google docs markdown 2022-02-16 15:09:08 -07:00
Bel LaPointe
c0d49d23bb google converts csv to md table 2022-02-16 14:35:19 -07:00
Bel LaPointe
98df3f2372 google sheets and docs cache in rclone, put title as first line h1, load to file tree 2022-02-16 14:26:34 -07:00
Bel LaPointe
c85813ad76 impl crawler rclone wrapper to get google files by id 2022-02-16 13:53:01 -07:00
Bel LaPointe
3774d3eba1 add google and update crawlable detection 2022-02-16 12:19:32 -07:00
Bel LaPointe
e3b97814ea fix buttons on chrome vs firefox height 2022-02-16 12:09:21 -07:00
Bel LaPointe
62c927d5ec update go mod for restructure 2022-02-16 12:03:27 -07:00
Bel LaPointe
c000168dc6 rm big 2022-02-16 12:01:33 -07:00
Bel LaPointe
9739a73265 reorg repo 2022-02-16 12:01:11 -07:00
48 changed files with 286 additions and 81 deletions

16
.gitignore vendored
View File

@@ -1,9 +1,9 @@
**/*.sw* **/*.sw*
spike/review/reinvent/ezmded/server/ezmded server/ezmded
spike/review/reinvent/ezmded/server/server server/server
spike/review/reinvent/ezmded/server/testdata/files/**/* server/testdata/files/**/*
spike/review/reinvent/ezmded/server/testdata/workd/**/* server/testdata/workd/**/*
spike/review/reinvent/ezmded/server/testdata/media/**/* server/testdata/media/**/*
spike/review/reinvent/ezmded/server/testdata/index.html server/testdata/index.html
spike/review/reinvent/ezmded/ui/render ui/render
spike/review/reinvent/ezmded/ui/**/.*.html ui/**/.*.html

View File

@@ -1 +0,0 @@
../../spike/review/run.sh

76
crawler/google.sh Normal file
View File

@@ -0,0 +1,76 @@
#! /bin/bash
# google -- crawler backend for Google Docs/Sheets/Slides links.
# Usage: google <subcommand> [args...]; the trailing "$@" dispatches to the
# named inner function. Runs in a subshell so helpers don't leak to callers.
google() (
  # URL predicates. The '.' after "com" is the regex any-char, so it also
  # matches the literal '/' separator, e.g. "docs.google.com/presentation".
  _is_slides() {
    echo "$@" | grep -q 'docs.google.com.presentation'
  }
  _is_sheets() {
    echo "$@" | grep -q 'docs.google.com.spreadsheets'
  }
  _is_doc() {
    echo "$@" | grep -q 'docs.google.com.document'
  }
  # True if the URL is any supported Google file type.
  is() {
    _is_sheets "$@" || _is_doc "$@" || _is_slides "$@"
  }
  # Google URLs are already human-readable; return unchanged.
  human_url() {
    echo "$1"
  }
  # Fetch the file behind a Google URL and emit markdown: an H1 title
  # (the downloaded file's basename) followed by the converted body.
  get() {
    local url="$1"
    # Extract the document id: drop the trailing path segment (e.g. "/edit"),
    # then keep the last remaining segment.
    # NOTE(review): assumes URLs of the form .../d/<id>/<action>; a URL ending
    # directly in the id would lose it -- confirm against callers.
    local id="${url%/*}"
    id="${id##*/}"
    # `rclone` here is the wrapper function sourced from rclone.sh; it prints
    # the path of the downloaded file.
    local downloaded
    downloaded="$(rclone get_google "$id")"
    echo "# ${downloaded##*/}"
    echo ""
    # Convert by export format (rclone exports as csv, html, or txt).
    case "${downloaded##*.}" in
      csv) _csv_to_md "$downloaded" ;;
      html) _html_to_md "$downloaded" ;;
      *) cat "$downloaded" ;;
    esac
  }
  # Strip Google's wrapper markup with sed/perl, then convert the remaining
  # HTML to commonmark with pandoc.
  _html_to_md() {
    # BUG FIX: the original ran `which pandoc` but discarded the result, so a
    # missing pandoc only surfaced as a confusing mid-pipeline failure.
    if ! command -v pandoc &> /dev/null; then
      echo "google: pandoc is required to convert html" >&2
      return 1
    fi
    local f="$1"
    #log f=$f
    cat "$f" \
      | sed 's/.*<body/<body/' \
      | sed 's/<\/body>.*/<\/body>/' \
      | sed 's/<[\/]*span[^>]*>//g' \
      | perl -pe 's|<div class="c[0-9][0-9]*">.*?<\/div>||g' \
      | sed 's/<\([a-z][a-z]*\)[^>]*/<\1/g' \
      | pandoc - -f html -t commonmark -s -o - \
      | sed 's/^<[\/]*div>$//g'
  }
  # Turn a CSV export into a markdown table: repeat the header, synthesize the
  # "---" separator row from it, then pipe-delimit every row.
  _csv_to_md() {
    local f="$1"
    (
      head -n 1 "$f"
      # Build the separator row by replacing each header cell with "---".
      head -n 1 "$f" \
        | sed 's/^[^,][^,]*/--- /' \
        | sed 's/[^,][^,]*$/ ---/' \
        | sed 's/,[^,][^,]*/, --- /g' \
        | sed 's/[^|]$/|/'
      tail -n +2 "$f"
    ) \
      | grep . \
      | sed 's/,/ | /g' \
      | sed 's/^/| /'
  }
  # Emit the document title (first line of `get`, '#' prefix stripped),
  # base64-encoded for safe transport through the crawler.
  expand() {
    get "$@" | head -n 1 | sed 's/^[#]* //' | base64
  }
  "$@"
)

View File

@@ -4,6 +4,7 @@ main() {
config config
log crawling ids... log crawling ids...
for id in $(crawlable_ids); do for id in $(crawlable_ids); do
log crawling id $id
crawl "$id" crawl "$id"
done done
log rewriting ids... log rewriting ids...
@@ -20,8 +21,12 @@ config() {
export CACHE_DURATION=$((60*50)) export CACHE_DURATION=$((60*50))
export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}" export NOTES_ADDR="${NOTES_ADDR:-"http://localhost:3004"}"
export GITLAB_PAT="$GITLAB_PAT" export GITLAB_PAT="$GITLAB_PAT"
export RCLONE_CONFIG="$RCLONE_CONFIG"
export RCLONE_CONFIG_PASS="$RCLONE_CONFIG_PASS"
source ./gitlab.sh source ./gitlab.sh
source ./gitlab_wiki.sh source ./gitlab_wiki.sh
source ./google.sh
source ./rclone.sh
source ./cache.sh source ./cache.sh
source ./notes.sh source ./notes.sh
} }
@@ -56,12 +61,7 @@ crawlable_ids() {
} }
crawl() { crawl() {
local cache_key="crawled $*" _crawl "$@"
# TODO
if false && cache get "$cache_key"; then
return
fi
_crawl "$@" | cache put "$cache_key"
} }
_crawl() { _crawl() {
@@ -73,7 +73,7 @@ _crawl() {
"$id" "$id"
)" )"
local crawlable_source="$(extract_crawlable_source "$content")" local crawlable_source="$(extract_crawlable_source "$content")"
for backend in gitlab gitlab_wiki; do for backend in gitlab gitlab_wiki google; do
if $backend is "$crawlable_source"; then if $backend is "$crawlable_source"; then
crawl_with $backend "$json" crawl_with $backend "$json"
return $? return $?
@@ -149,6 +149,7 @@ crawl_with() {
ID="${ID%/}" ID="${ID%/}"
log " $ID ($TITLE): ${#CONTENT}" log " $ID ($TITLE): ${#CONTENT}"
push_crawled "$ID" "$TITLE" "$CONTENT" push_crawled "$ID" "$TITLE" "$CONTENT"
log " /$ID ($TITLE): ${#CONTENT}"
} }
if [ "${#expanded[@]}" -gt 0 ]; then if [ "${#expanded[@]}" -gt 0 ]; then
for i in $(seq 0 $(("${#expanded[@]}"-1))); do for i in $(seq 0 $(("${#expanded[@]}"-1))); do
@@ -166,7 +167,7 @@ push_crawled() {
is_crawlable() { is_crawlable() {
local crawlable_source="$(extract_crawlable_source "$*")" local crawlable_source="$(extract_crawlable_source "$*")"
# https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:-]*" local url_pattern="(http|https)://[a-zA-Z0-9./?=_%:\-\#--]*"
echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$" echo "$crawlable_source" | cut -c 1-300 | grep -q -E "^[ ]*$url_pattern[ ]*$"
} }

View File

@@ -2,11 +2,15 @@
notes() ( notes() (
ids() { ids() {
_recurse_ids "" "$(_tree)" _recurse_ids "$(_tree)"
} }
_tree() { _tree() {
__tree "$@" local cache_key="notes _tree"
if CACHE_DURATION=5 cache get "$cache_key"; then
return 0
fi
__tree "$@" | cache put "$cache_key"
} }
__tree() { __tree() {
@@ -18,8 +22,7 @@ notes() (
} }
_recurse_ids() { _recurse_ids() {
local prefix="$1" local json="$1"
local json="$2"
if echo "$json" | jq .Branches | grep -q ^null$; then if echo "$json" | jq .Branches | grep -q ^null$; then
return 0 return 0
fi fi
@@ -29,22 +32,32 @@ notes() (
fi fi
for line in $b64lines; do for line in $b64lines; do
line="$(echo "$line" | base64 --decode)" line="$(echo "$line" | base64 --decode)"
local subfix="$(printf "%s/%s" "$prefix" "$line")" if ! _is_deleted "$line"; then
subfix="${subfix#/}" echo "$line"
if ! _is_deleted "$subfix"; then _recurse_ids "$(echo "$json" | jq -c ".Branches[\"$line\"]")"
echo "$subfix"
fi fi
_recurse_ids "$subfix" "$(echo "$json" | jq -c ".Branches[\"$line\"]")"
done done
} }
meta() { meta() {
local cache_key="notes meta $*"
if CACHE_DURATION=5 cache get "$cache_key"; then
return 0
fi
_meta "$@" | cache put "$cache_key"
}
_meta() {
local id="$1" local id="$1"
local tree="$(_tree)" local tree="$(_tree)"
for subid in ${id//\// }; do local pid="${id%%/*}"
tree="$(echo "$tree" | jq -c .Branches | jq -c ".[\"$subid\"]")" while [ "$id" != "$pid" ]; do
tree="$(echo "$tree" | jq ".Branches[\"$pid\"]")"
local to_add="${id#$pid/}"
to_add="${to_add%%/*}"
pid="$pid/$to_add"
done done
echo "$tree" | jq .Leaf echo "$tree" | jq ".Branches[\"$id\"].Leaf"
} }
_is_deleted() { _is_deleted() {
@@ -90,11 +103,11 @@ notes() (
local id="$1" local id="$1"
local title="$2" local title="$2"
local body="$3" local body="$3"
echo "$body" | _nncurl \ _nncurl \
-X PUT \ -X PUT \
-H "Title: $title" \ -H "Title: $title" \
-d "$body" \ -d "$body" \
$NOTES_ADDR/api/v0/files/$id $NOTES_ADDR/api/v0/files/$id >&2
} }
"$@" "$@"

62
crawler/rclone.sh Normal file
View File

@@ -0,0 +1,62 @@
#! /bin/bash
# rclone -- thin wrapper around the rclone binary for fetching Google files.
# Usage: rclone <subcommand> [args...]. NOTE: this function shadows the rclone
# binary, so internal calls must use `command` to reach the real executable.
rclone() (
  # Download a Google file by id, memoized through the cache helper
  # (cache.sh is sourced by the crawler before this is called).
  get_google() {
    local cache_key="rclone get google 2 $*"
    if cache get "$cache_key"; then
      return 0
    fi
    _get_google "$@" | cache put "$cache_key"
  }
  # Uncached fetch: copy the file into a fresh temp dir, print its path.
  _get_google() {
    _rate_limit
    local id="$1"
    local out
    out="$(mktemp -d)"
    _cmd backend copyid work-notes-google: --drive-export-formats=csv,html,txt "$id" "$out/"
    find "$out" -type f
  }
  # Keep at least ~2 seconds between rclone calls, tracked via the mtime of a
  # marker file shared by every process on this host.
  _rate_limit() {
    local f="/tmp/rclone.rate.limit"
    local last=0
    if [ -f "$f" ]; then
      last="$(date -r "$f" +%s)"
    fi
    local now
    now="$(date +%s)"
    local since_last=$((now - last))
    # BUG FIX: `dur` was an undeclared global (stale values leaked between
    # calls); make it local and set it explicitly: sleep 2s only when the
    # previous call was within the last 2 seconds.
    local dur=0
    if ((since_last <= 2)); then
      dur=2
    fi
    sleep "$dur"
    touch "$f"
  }
  # Verify the rclone binary exists and runs.
  _ensure() {
    # BUG FIX: a bare `rclone version` recursed into this wrapper function
    # (dispatching to an undefined `version` helper); `command` bypasses the
    # function and runs the real binary.
    command -v rclone &> /dev/null && command rclone version &> /dev/null
  }
  # Run an rclone command, first verifying the google remote is configured.
  _cmd() {
    _ensure_google_config
    __cmd "$@"
  }
  # Invoke the real rclone binary with the shared config and retry flags.
  __cmd() {
    # BUG FIX: _ensure's failure was silently ignored; fail fast instead.
    _ensure || return 1
    RCLONE_CONFIG_PASS="$RCLONE_CONFIG_PASS" \
      command rclone \
      --config "$RCLONE_CONFIG" \
      --size-only \
      --fast-list \
      --retries 10 \
      --retries-sleep 10s \
      "$@"
  }
  # True if the remote used by _get_google appears in the active config.
  _ensure_google_config() {
    __cmd config show | grep -q work-notes-google
  }
  "$@"
)

31
server/go.mod Normal file
View File

@@ -0,0 +1,31 @@
module ezmded
go 1.17
require (
github.com/google/uuid v1.3.0
go.mongodb.org/mongo-driver v1.7.2
gopkg.in/yaml.v2 v2.4.0
local/args v0.0.0-00010101000000-000000000000
local/gziphttp v0.0.0-00010101000000-000000000000
local/router v0.0.0-00010101000000-000000000000
local/simpleserve v0.0.0-00010101000000-000000000000
)
require github.com/go-stack/stack v1.8.0 // indirect
replace local/args => ../../../../args
replace local/logb => ../../../../logb
replace local/storage => ../../../../storage
replace local/router => ../../../../router
replace local/simpleserve => ../../../../simpleserve
replace local/gziphttp => ../../../../gziphttp
replace local/notes-server => ../../../../notes-server
replace local/oauth2 => ../../../../oauth2

View File

@@ -1,15 +1,15 @@
todo: todo:
- scrape odo
- mark generated via meta so other files in the dir can be created, deleted, replaced safely
- rewrite links if available to local
- table of contents - table of contents
- anchor per line - anchor per line
- scrape odo
- scrape gdoc
- scrape gsheet
- scrape gslide
- anchor links work - anchor links work
- rewrite links if available to local
- ui; last updated; 2022.02.01T12:34:56 - ui; last updated; 2022.02.01T12:34:56
- mark generated via meta so other files in the dir can be created, deleted, replaced safely
done: done:
- scrape gslide
- scrape gsheet
- scrape gdoc
- alert box; https://concisecss.com/documentation/ui - alert box; https://concisecss.com/documentation/ui
- hide checkbox for tree - hide checkbox for tree
- do not rewrite .md title vs. link cause hrefs to ./gobs.md wont work - do not rewrite .md title vs. link cause hrefs to ./gobs.md wont work

View File

@@ -1,6 +1,7 @@
package main package main
import ( import (
"encoding/json"
"io/ioutil" "io/ioutil"
"os" "os"
"path" "path"
@@ -52,8 +53,7 @@ func (base Leaf) Merge(updated Leaf) Leaf {
} }
type Tree struct { type Tree struct {
root string root string
cachedRoot Branch
} }
func NewTree(root string) Tree { func NewTree(root string) Tree {
@@ -62,25 +62,72 @@ func NewTree(root string) Tree {
func (tree Tree) WithRoot(root string) Tree { func (tree Tree) WithRoot(root string) Tree {
tree.root = root tree.root = root
tree.cachedRoot = Branch{}
return tree return tree
} }
func (tree Tree) GetRootMeta() (Branch, error) { func (tree Tree) GetRootMeta() (Branch, error) {
return tree.getRoot(NewID(""), false, false) if meta, ok := tree.getCachedRootMeta(); ok {
return meta, nil
}
got, err := tree.getRoot(NewID(""), false, false)
if err != nil {
return Branch{}, err
}
tree.cacheRootMeta(got)
return got, err
} }
func (tree Tree) GetRoot() (Branch, error) { func (tree Tree) GetRoot() (Branch, error) {
if !tree.cachedRoot.IsZero() { if root, ok := tree.getCachedRoot(); ok {
return tree.cachedRoot, nil return root, nil
} }
got, err := tree.getRoot(NewID(""), true, false) got, err := tree.getRoot(NewID(""), true, false)
if err == nil { if err != nil {
tree.cachedRoot = got return Branch{}, err
} }
tree.cacheRoot(got)
return got, err return got, err
} }
func (tree Tree) getCachedRoot() (Branch, bool) {
return tree.getCachedFrom("root.json")
}
func (tree Tree) getCachedRootMeta() (Branch, bool) {
return tree.getCachedFrom("root_meta.json")
}
func (tree Tree) getCachedFrom(name string) (Branch, bool) {
b, err := ioutil.ReadFile(path.Join(tree.root, name))
if err != nil {
return Branch{}, false
}
var branch Branch
err = json.Unmarshal(b, &branch)
return branch, err == nil
}
func (tree Tree) cacheRoot(branch Branch) {
tree.cacheRootFrom("root.json", branch)
}
func (tree Tree) cacheRootMeta(branch Branch) {
tree.cacheRootFrom("root_meta.json", branch)
}
func (tree Tree) cacheRootFrom(name string, branch Branch) {
b, err := json.Marshal(branch)
if err != nil {
return
}
ensureAndWrite(path.Join(tree.root, name), b)
}
func (tree Tree) cacheClear() {
os.Remove(path.Join(path.Join(tree.root, "root.json")))
os.Remove(path.Join(path.Join(tree.root, "root_meta.json")))
}
func (tree Tree) getRoot(pid ID, withContent, withDeleted bool) (Branch, error) { func (tree Tree) getRoot(pid ID, withContent, withDeleted bool) (Branch, error) {
m := Branch{Branches: map[ID]Branch{}} m := Branch{Branches: map[ID]Branch{}}
entries, err := os.ReadDir(tree.root) entries, err := os.ReadDir(tree.root)
@@ -92,7 +139,7 @@ func (tree Tree) getRoot(pid ID, withContent, withDeleted bool) (Branch, error)
} }
for _, entry := range entries { for _, entry := range entries {
if entry.Name() == "data.yaml" { if entry.Name() == "data.yaml" {
if b, err := ioutil.ReadFile(path.Join(tree.root, entry.Name())); err != nil { if b, err := peekLeaf(withContent, path.Join(tree.root, entry.Name())); err != nil {
return Branch{}, err return Branch{}, err
} else if err := yaml.Unmarshal(b, &m.Leaf); err != nil { } else if err := yaml.Unmarshal(b, &m.Leaf); err != nil {
return Branch{}, err return Branch{}, err
@@ -115,6 +162,10 @@ func (tree Tree) getRoot(pid ID, withContent, withDeleted bool) (Branch, error)
return m, nil return m, nil
} }
func peekLeaf(all bool, path string) ([]byte, error) {
return ioutil.ReadFile(path)
}
func (tree Tree) toDir(id ID) string { func (tree Tree) toDir(id ID) string {
return path.Dir(tree.toData(id)) return path.Dir(tree.toData(id))
} }
@@ -124,6 +175,7 @@ func (tree Tree) toData(id ID) string {
} }
func (tree Tree) Put(id ID, input Leaf) error { func (tree Tree) Put(id ID, input Leaf) error {
tree.cacheClear()
if _, err := os.Stat(tree.toData(id)); os.IsNotExist(err) { if _, err := os.Stat(tree.toData(id)); os.IsNotExist(err) {
b, err := yaml.Marshal(Leaf{}) b, err := yaml.Marshal(Leaf{})
if err != nil { if err != nil {
@@ -144,11 +196,11 @@ func (tree Tree) Put(id ID, input Leaf) error {
if err := ensureAndWrite(tree.toData(id), b); err != nil { if err := ensureAndWrite(tree.toData(id), b); err != nil {
return err return err
} }
tree.cachedRoot = Branch{}
return nil return nil
} }
func (tree Tree) Del(id ID) error { func (tree Tree) Del(id ID) error {
tree.cacheClear()
got, err := tree.Get(id) got, err := tree.Get(id)
if os.IsNotExist(err) { if os.IsNotExist(err) {
return nil return nil
@@ -164,8 +216,8 @@ func (tree Tree) Del(id ID) error {
} }
func (tree Tree) HardDel(id ID) error { func (tree Tree) HardDel(id ID) error {
tree.cacheClear()
os.RemoveAll(tree.toDir(id)) os.RemoveAll(tree.toDir(id))
tree.cachedRoot = Branch{}
return nil return nil
} }

Binary file not shown.

View File

@@ -1,31 +0,0 @@
module ezmded
go 1.17
require (
github.com/google/uuid v1.3.0
go.mongodb.org/mongo-driver v1.7.2
gopkg.in/yaml.v2 v2.4.0
local/args v0.0.0-00010101000000-000000000000
local/gziphttp v0.0.0-00010101000000-000000000000
local/router v0.0.0-00010101000000-000000000000
local/simpleserve v0.0.0-00010101000000-000000000000
)
require github.com/go-stack/stack v1.8.0 // indirect
replace local/args => ../../../../../../../../args
replace local/logb => ../../../../../../../../logb
replace local/storage => ../../../../../../../../storage
replace local/router => ../../../../../../../../router
replace local/simpleserve => ../../../../../../../../simpleserve
replace local/gziphttp => ../../../../../../../../gziphttp
replace local/notes-server => ../../../../../../../../notes-server
replace local/oauth2 => ../../../../../../../../oauth2

View File

@@ -113,7 +113,7 @@
element: document.getElementById('my-text-area'), element: document.getElementById('my-text-area'),
forceSync: true, forceSync: true,
indentWithTabs: false, indentWithTabs: false,
initialValue: "loading...", initialValue: "{{ .This.Content }}",
showIcons: ["code", "table"], showIcons: ["code", "table"],
spellChecker: false, spellChecker: false,
sideBySideFullscreen: false, sideBySideFullscreen: false,
@@ -133,6 +133,5 @@
}, },
status: ["lines", "words", "cursor"], status: ["lines", "words", "cursor"],
}) })
easyMDE.value({{ .This.Content }})
</script> </script>
{{ end }} {{ end }}

View File

@@ -75,6 +75,9 @@
.tb_fullscreen { .tb_fullscreen {
height: 100%; height: 100%;
} }
.button, button, input[type="button"] {
height: auto;
}
</style> </style>
<script> <script>
function http(method, remote, callback, body, headers) { function http(method, remote, callback, body, headers) {