always zip, better html to plaintext

master
Bel LaPointe 2021-11-29 13:51:22 -07:00
parent af1429d6d8
commit 17f71427e7
7 changed files with 267 additions and 228 deletions

1
about.md.html Normal file
View File

@ -0,0 +1 @@
<html><head><meta content="text/html; charset=UTF-8" http-equiv="content-type"><style type="text/css">ol{margin:0;padding:0}table td,table th{padding:0}.c1{color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:"Arial";font-style:normal}.c0{padding-top:0pt;padding-bottom:0pt;line-height:1.15;orphans:2;widows:2;text-align:left}.c2{background-color:#ffffff;max-width:468pt;padding:72pt 72pt 72pt 72pt}.title{padding-top:0pt;color:#000000;font-size:26pt;padding-bottom:3pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}.subtitle{padding-top:0pt;color:#666666;font-size:15pt;padding-bottom:16pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}li{color:#000000;font-size:11pt;font-family:"Arial"}p{margin:0;color:#000000;font-size:11pt;font-family:"Arial"}h1{padding-top:20pt;color:#000000;font-size:20pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h2{padding-top:18pt;color:#000000;font-size:16pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h3{padding-top:16pt;color:#434343;font-size:14pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h4{padding-top:14pt;color:#666666;font-size:12pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h5{padding-top:12pt;color:#666666;font-size:11pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h6{padding-top:12pt;color:#666666;font-size:11pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;font-style:italic;orphans:2;widows:2;text-align:left}</style></head><body class="c2"><p class="c0"><span class="c1">Welcome :)</span></p></body></html>

31
meta/.default/about.md Normal file
View File

@ -0,0 +1,31 @@
---
title: "About"
description: "Hugo, the world's fastest framework for building websites"
date: "2019-02-28"
aliases:
- "about-us"
- "about-hugo"
- "contact"
author: "Hugo Authors"
---
Written in Go, Hugo is an open source static site generator available under the [Apache License 2.0.](https://github.com/gohugoio/hugo/blob/master/LICENSE) Hugo supports TOML, YAML and JSON data file types, Markdown and HTML content files and uses shortcodes to add rich content. Other notable features are taxonomies, multilingual mode, image processing, custom output formats, HTML/CSS/JS minification and support for Sass SCSS workflows.
Hugo makes use of a variety of open source projects including:
* https://github.com/yuin/goldmark
* https://github.com/alecthomas/chroma
* https://github.com/muesli/smartcrop
* https://github.com/spf13/cobra
* https://github.com/spf13/viper
Hugo is ideal for blogs, corporate websites, creative portfolios, online magazines, single page applications or even a website with thousands of pages.
Hugo is for people who want to hand code their own website without worrying about setting up complicated runtimes, dependencies and databases.
Websites built with Hugo are extremely fast, secure and can be deployed anywhere, including AWS, GitHub Pages, Heroku, Netlify and any other hosting provider.
Learn more and contribute on [GitHub](https://github.com/gohugoio).

76
meta/.default/config.yaml Normal file
View File

@ -0,0 +1,76 @@
baseURL: ''
languageCode: 'en-us'
title: 'Site Title'
theme: "anubis"
paginate: 5
paginatePath: "posts/-/pages"
enableRobotsTXT: true
disableLiveReload: true
minify: true
noHTTPCache: true
disableFastRender: true
rssLimit: 5
canonifyurls: true
relativeurls: false
params:
toc: true
author: "me"
email: ""
avatar: ""
description: ""
#customJS: [ "js/main.js" ] # relative to ./assets/
dateFormat: "2006-01-02"
paginationSinglePost: false
style: light
#style: auto-without-switcher
#style: light-without-switcher
readMore: true
disableSummary: false
social:
- id: email
name: contact
#- id: linkedin
# name: lapoba16
#- id: rss
# name: rss
# url: '/index.xml'
menu:
main:
- identifier: archive
name: Archive
title: Archive
url: /posts/
weight: 0
- identifier: about
name: About
title: About
url: /about/
weight: 1
markup:
defaultMarkdownHandler: goldmark
goldmark:
extensions:
linkify: true
strikethrough: true
table: true
taskList: true
parser:
autoHeadingID: true
autoHeadingIDType: github
renderer:
unsafe: true # raw HTML OK
highlight:
anchorLineNos: true
guessSyntax: true
lineNos: true
tabWidth: 3
tableOfContents:
endLevel: 3
ordered: false
startLevel: 2
permalinks:
posts: /posts/:year-:month-:day-:filename

View File

@ -1,31 +0,0 @@
---
title: "About"
description: "Hugo, the worlds fastest framework for building websites"
date: "2019-02-28"
aliases:
- "about-us"
- "about-hugo"
- "contact"
author: "Hugo Authors"
---
Written in Go, Hugo is an open source static site generator available under the [Apache Licence 2.0.](https://github.com/gohugoio/hugo/blob/master/LICENSE) Hugo supports TOML, YAML and JSON data file types, Markdown and HTML content files and uses shortcodes to add rich content. Other notable features are taxonomies, multilingual mode, image processing, custom output formats, HTML/CSS/JS minification and support for Sass SCSS workflows.
Hugo makes use of a variety of open source projects including:
* https://github.com/yuin/goldmark
* https://github.com/alecthomas/chroma
* https://github.com/muesli/smartcrop
* https://github.com/spf13/cobra
* https://github.com/spf13/viper
Hugo is ideal for blogs, corporate websites, creative portfolios, online magazines, single page applications or even a website with thousands of pages.
Hugo is for people who want to hand code their own website without worrying about setting up complicated runtimes, dependencies and databases.
Websites built with Hugo are extremelly fast, secure and can be deployed anywhere including, AWS, GitHub Pages, Heroku, Netlify and any other hosting provider.
Learn more and contribute on [GitHub](https://github.com/gohugoio).

1
meta/about.md Symbolic link
View File

@ -0,0 +1 @@
.default/about.md

View File

@ -1,76 +0,0 @@
baseURL: ''
languageCode: 'en-us'
title: 'Site Title'
theme: "anubis"
paginate: 5
paginatePath: "posts/-/pages"
enableRobotsTXT: true
disableLiveReload: true
minify: true
noHTTPCache: true
disableFastRender: true
rssLimit: 5
canonifyurls: true
relativeurls: false
params:
toc: true
author: "me"
email: ""
avatar: ""
description: ""
#customJS: [ "js/main.js" ] # relative to ./assets/
dateFormat: "2006-01-02"
paginationSinglePost: false
style: light
#style: auto-without-switcher
#style: light-without-switcher
readMore: true
disableSummary: false
social:
- id: email
name: contact
#- id: linkedin
# name: lapoba16
#- id: rss
# name: rss
# url: '/index.xml'
menu:
main:
- identifier: archive
name: Archive
title: Archive
url: /posts/
weight: 0
- identifier: about
name: About
title: About
url: /about/
weight: 1
markup:
defaultMarkdownHandler: goldmark
goldmark:
extensions:
linkify: true
strikethrough: true
table: true
taskList: true
parser:
autoHeadingID: true
autoHeadingIDType: github
renderer:
unsafe: true # raw HTML OK
highlight:
anchorLineNos: true
guessSyntax: true
lineNos: true
tabWidth: 3
tableOfContents:
endLevel: 3
ordered: false
startLevel: 2
permalinks:
posts: /posts/:year-:month-:day-:filename

1
meta/config.yaml Symbolic link
View File

@ -0,0 +1 @@
.default/config.yaml

274
posts.sh
View File

@ -11,7 +11,7 @@ TMPDIR="${TMPDIR:-"/tmp"}"
main() {
flags "$@"
#posts
posts
meta
}
@ -21,9 +21,109 @@ flags() {
cd "$(dirname "$BASH_SOURCE")"
}
posts() {
posts() (
list() {
gd list_posts
}
up_to_date() {
  # True when this post's cached metadata file exists and its content
  # matches the current Drive listing JSON ($1) — i.e. nothing to re-sync.
  local cache
  cache="$(metadatad "$1")"
  [[ -f "$cache" && "$1" == "$(cat "$cache")" ]]
}
pulled() {
  # Path under TMPDIR where the pulled Drive file lands; the filename
  # comes from the listing entry's .Name field ($1 is the lsjson JSON).
  local name
  name="$(jq -r .Name <<<"$1")"
  echo "$TMPDIR/$name"
}
pull() {
  # Download one post export from Drive into TMPDIR.
  # $1 - the lsjson listing entry (JSON) for the file.
  # BUG FIX: previously this read the caller's $json variable instead of
  # its own $1 argument, working only by accident of dynamic scoping.
  local filename
  filename="$(echo "$1" | jq -r .Name)"
  gd pull_posts "$filename"
}
extracted() {
  # Deterministic staging dir for one post: $TMPDIR/<YYYY-MM-DD>_<slug>.
  # $1 - lsjson entry; the date comes from .ModTime, the slug from the
  # pulled zip's basename with non-alphanumerics flattened to '_'.
  local src day slug
  src="$(pulled "$1")"
  day="$(echo "$1" | jq -r .ModTime)"
  day="${day%%T*}"
  slug="$(basename "$src" | sed -e 's/.zip$//' -e 's/[^a-zA-Z0-9]/_/g')"
  echo "$TMPDIR/${day}_${slug}"
}
extract() {
# Unpack one pulled Drive export (.zip) into its staging dir and turn the
# exported HTML into a Hugo page bundle (index.md plus dithered images).
local extracted_dir="$(extracted "$1")"
local index_html="$extracted_dir/.index.html"
local index_md="$extracted_dir/index.md"
local pulled_file="$(pulled "$1")"
local u_date_iso="$(echo "$1" | jq -r .ModTime)"
# Always start from a clean staging directory.
if [ -d "$extracted_dir" ]; then
rm -rf "$extracted_dir"
fi
mkdir -p "$extracted_dir"
7z x -o"$extracted_dir" "$pulled_file"
# Keep only the <body>...</body> portion of the exported HTML.
# NOTE(review): sed works per line, so this assumes the export is a
# single-line HTML document — confirm against the Drive export format.
cat "$extracted_dir"/*.html \
| sed 's/.*<body/<body/' \
| sed 's/<\/body>.*/<\/body>/' \
> "$index_html"
# Hashtags found in the visible text become Hugo tags (deduped, sorted).
local tags=($(cat "$index_html" | html_to_plaintext | grep -o '#[a-zA-Z0-9]*' | grep '[a-zA-Z]' | sed 's/^#//' | sort -u))
local tags_csv="$(first=false; for tag in "${tags[@]}"; do true; if $first; then echo -n ", "; fi; first=true; echo -n "$tag"; done)"
# Front matter: title from the zip's basename, date from Drive's ModTime;
# the trailing sed/grep strip indentation and blank lines from the template.
printf '
---
title: "%s"
date: %s
draft: false
tags: [%s]
---
' \
"$(basename "$pulled_file" | sed 's/.zip$//' | sed 's/"/\\"/g')" \
"$u_date_iso" \
"$tags_csv" \
| sed 's/^[ ]*//' \
| grep . \
> "$index_md"
# Append the raw HTML body; the site config enables raw HTML rendering.
cat "$index_html" >> "$index_md"
echo "" >> "$index_md"
rm "$index_html" "$extracted_dir"/*.html
# Ordered-dither every image in place to shrink the bundle ('.2' is a
# temporary output name that replaces the original on success).
for ext in png jpg jpeg gif JPG; do find "$extracted_dir" -name "*.$ext"; done | while read -r line; do
convert "$line" -ordered-dither o8x8,8,8,4 "$line.2"
mv "$line.2" "$line"
done
}
imported() {
  # Final location of this post's bundle inside the Hugo content tree:
  # same basename as the staging dir, parented under $HUGO_POSTS.
  echo "$HUGO_POSTS/$(basename "$(extracted "$1")")"
}
import() {
  # Move the staged bundle into the Hugo content tree, replacing any
  # previously imported copy of the same post.
  local staged dest
  staged="$(extracted "$1")"
  dest="$(imported "$1")"
  if [ -d "$dest" ]; then
    rm -rf "$dest"
  fi
  mv "$staged" "$dest"
}
metadatad() {
  # Path of the hidden metadata cache file inside the imported bundle.
  echo "$(imported "$1")/.metadata.json"
}
metadata() {
  # Record the current listing JSON so up_to_date can skip unchanged posts.
  local cache
  cache="$(metadatad "$1")"
  echo "$1" > "$cache"
}
local want=()
for b64_json in $(list_posts); do
for b64_json in $(list); do
local json="$(echo "$b64_json" | base64 --decode)"
local filename="$(echo "$json" | jq -r .Name)"
want+=("$(imported "$json")")
@ -47,11 +147,40 @@ posts() {
rm -rf "$d"
fi
done
}
)
meta() {
for b64_json in $(list_meta); do
echo "$b64_json" | base64 --decode | jq .
list() {
gd list_meta
}
pull() {
local filename="$(echo "$json" | jq -r .Name)"
gd pull_meta "$filename"
}
pulled() {
echo "$TMPDIR/$(echo "$1" | jq -r .Name)"
}
meta_file() {
echo "$TMPDIR/.meta.$(echo "$1" | jq -r .Name)"
}
up_to_date() {
test -f "$(meta_file "$1")" && test "$1" == "$(cat "$(meta_file "$1")")"
}
for b64_json in $(list); do
local json="$(echo "$b64_json" | base64 --decode | jq .)"
local filename="$(echo "$json" | jq -r .Name)"
if up_to_date "$json"; then
log "$filename: up to date"
continue
fi
log json=$json
pull "$json"
# todo: echo "$json" > "$(meta_file $json)"
done
false
}
@ -65,127 +194,30 @@ log() {
echo "$(date +%H:%M:%S) > $*" >&2
}
list_posts() {
  # Drive listing for the posts folder; Google Docs are always exported
  # as zip (this commit: "always zip").
  local fmt="--drive-export-formats zip"
  RCLONE_OPTS="$RCLONE_OPTS $fmt" gd list_posts
}
list_meta() {
  # Drive listing for the meta folder; Google Docs export as plain text.
  local fmt="--drive-export-formats txt"
  RCLONE_OPTS="$RCLONE_OPTS $fmt" gd list_meta
}
up_to_date() {
local metadatad="$(metadatad "$1")"
if [ -f "$metadatad" ]; then
if [ "$1" == "$(cat "$metadatad")" ]; then
return 0
fi
fi
return 1
}
pulled() {
echo "$TMPDIR/$(echo "$1" | jq -r .Name)"
}
pull() {
local filename="$(echo "$json" | jq -r .Name)"
gd pull_posts "$filename"
}
extracted() {
local pulled_file="$(pulled "$1")"
local date="$(echo "$1" | jq -r .ModTime)"
date="${date%%T*}"
local pulled_file_safe_basename="$(basename "$pulled_file" | sed 's/.zip$//' | sed 's/[^a-zA-Z0-9]/_/g')"
echo "$TMPDIR/${date}_${pulled_file_safe_basename}"
}
extract() {
local extracted_dir="$(extracted "$1")"
local index_html="$extracted_dir/.index.html"
local index_md="$extracted_dir/index.md"
local pulled_file="$(pulled "$1")"
local u_date_iso="$(echo "$1" | jq -r .ModTime)"
if [ -d "$extracted_dir" ]; then
rm -rf "$extracted_dir"
fi
mkdir -p "$extracted_dir"
7z x -o"$extracted_dir" "$pulled_file"
cat "$extracted_dir"/*.html \
| sed 's/.*<body/<body/' \
| sed 's/<\/body>.*/<\/body>/' \
> "$index_html"
local tags=($(cat "$index_html" | sed '/^$/!{s/<[^>]*>//g;p;}' | grep -o '#[a-zA-Z0-9]*' | grep '[a-zA-Z]' | sed 's/^#//' | sort -u))
local tags_csv="$(first=false; for tag in "${tags[@]}"; do true; if $first; then echo -n ", "; fi; first=true; echo -n "$tag"; done)"
printf '
---
title: "%s"
date: %s
draft: false
tags: [%s]
---
' \
"$(basename "$pulled_file" | sed 's/.zip$//' | sed 's/"/\\"/g')" \
"$u_date_iso" \
"$tags_csv" \
| sed 's/^[ ]*//' \
| grep . \
> "$index_md"
cat "$index_html" >> "$index_md"
echo "" >> "$index_md"
rm "$index_html" "$extracted_dir"/*.html
for ext in png jpg jpeg gif JPG; do find "$extracted_dir" -name "*.$ext"; done | while read -r line; do
convert "$line" -ordered-dither o8x8,8,8,4 "$line.2"
mv "$line.2" "$line"
done
}
imported() {
local extracted_dir="$(extracted "$1")"
echo "$HUGO_POSTS/$(basename "$extracted_dir")"
}
import() {
local extracted_dir="$(extracted "$1")"
local target_dir="$(imported "$1")"
if [ -d "$target_dir" ]; then
rm -rf "$target_dir"
fi
mv "$extracted_dir" "$target_dir"
}
metadatad() {
local imported="$(imported "$1")"
echo "$imported"/.metadata.json
}
metadata() {
local metadatad="$(metadatad "$1")"
echo "$1" > "$metadatad"
}
gd() (
list() {
  # List zip/txt entries of the given Drive subfolder, one per output
  # line, each base64-encoded so callers can safely word-split the output.
  # BUG FIX: GNU base64 wraps its output at 76 columns, which would split
  # a long listing entry into several tokens when the caller does
  # `for b64_json in $(list ...)`; strip embedded newlines so every
  # encoded entry occupies exactly one line on all platforms.
  rc lsjson "$RCLONE_ALIAS:$RCLONE_FOLDER/$1" \
    | jq -c .[] \
    | grep -E '(zip|txt)"' \
    | while read -r line; do
        echo "$line" | base64 | tr -d '\n'
        echo
      done
}
list_posts() {
# Convenience dispatch target: list the Drive posts subfolder.
gd list "$RCLONE_FOLDER_POSTS"
}
list_meta() {
# Convenience dispatch target: list the Drive meta subfolder.
gd list "$RCLONE_FOLDER_META"
}
list() {
rc lsjson "$RCLONE_ALIAS:$RCLONE_FOLDER/$1" \
| jq -c .[] \
| grep -E 'zip"' \
| while read -r line; do
echo "$line" | base64
done
}
pull_posts() {
rc copy "$RCLONE_ALIAS:$RCLONE_FOLDER/$RCLONE_FOLDER_POSTS/$1" "$TMPDIR"/
pull "$RCLONE_FOLDER_POSTS/$1"
}
pull_meta() {
  # Copy one meta file from Drive into TMPDIR.
  # FIX: removed a leftover `set -x` debug trace that permanently enabled
  # command tracing for the remainder of the gd subshell.
  pull "$RCLONE_FOLDER_META/$1"
}
pull() {
  # Copy one file (path relative to the Drive root folder) into TMPDIR.
  rc copy "$RCLONE_ALIAS:$RCLONE_FOLDER/$1" "${TMPDIR}/"
}
rc() {
rclone \
@ -198,6 +230,10 @@ gd() (
"$@"
)
html_to_plaintext() {
  # Crude HTML-to-text filter: strip every <...> tag from stdin.
  # (HTML entities such as &amp; pass through untouched.)
  sed -e 's|<[^>]*>||g'
}
if [ "$0" == "$BASH_SOURCE" ]; then
main "$@"
fi

View File

@ -5,6 +5,6 @@ client_secret = zCqs6bTjTm6TivvdU4BxhR19
scope = drive
#team_drive = 1-p043gwMk88rj4-Sm5otARi_AJZYskbj
#root_folder_id = 1-p043gwMk88rj4-Sm5otARi_AJZYskbj
#formats = zip,html,rtf,odt,docx,xlsx,pptx,svg
token = {"access_token":"ya29.a0ARrdaM-Ca19JFA2LNKls0IsuLLGEF2M2eytTRxRP-YoS73oJvhDOi_xdw48uu0c8q5wrLglRfPBVZM5h48Oe4fJBVTPMR96b1ySUXWCscuh-GtWHV0iF4JxlIqY3bgR6lg5odyiUvXRFauh0JcZ10ozJ7mUy4g","token_type":"Bearer","refresh_token":"1//04sQD4800FsqyCgYIARAAGAQSNwF-L9IrB1_7BV8ScFnqmIz8FNWgXfr80--6VG6KX1pTnneQKAz6Ss3YtxqmCdf2tRBy2ndkhek","expiry":"2021-11-29T13:47:23.55774-07:00"}
formats = zip,html,rtf,odt,docx,xlsx,pptx,svg
token = {"access_token":"ya29.a0ARrdaM-P6wKzub3feWIaQlG3YXq2REc0rfS1LDxfeV1m1rr5Zwhe_XqYTInCEcCHP4bHNPvcBcZufAnJpgLZlVBte-abfEOvwfk4Qz25__FboaddAXthlv_EeIxPwVTlKJTy7MOKGLz2Rhg47CjjagfFYMcCKw","token_type":"Bearer","refresh_token":"1//04sQD4800FsqyCgYIARAAGAQSNwF-L9IrB1_7BV8ScFnqmIz8FNWgXfr80--6VG6KX1pTnneQKAz6Ss3YtxqmCdf2tRBy2ndkhek","expiry":"2021-11-29T14:49:21.903576-07:00"}