always zip, better html to plaintext

master
Bel LaPointe 2021-11-29 13:51:22 -07:00
parent af1429d6d8
commit 17f71427e7
7 changed files with 267 additions and 228 deletions

1
about.md.html Normal file
View File

@ -0,0 +1 @@
<html><head><meta content="text/html; charset=UTF-8" http-equiv="content-type"><style type="text/css">ol{margin:0;padding:0}table td,table th{padding:0}.c1{color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:"Arial";font-style:normal}.c0{padding-top:0pt;padding-bottom:0pt;line-height:1.15;orphans:2;widows:2;text-align:left}.c2{background-color:#ffffff;max-width:468pt;padding:72pt 72pt 72pt 72pt}.title{padding-top:0pt;color:#000000;font-size:26pt;padding-bottom:3pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}.subtitle{padding-top:0pt;color:#666666;font-size:15pt;padding-bottom:16pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}li{color:#000000;font-size:11pt;font-family:"Arial"}p{margin:0;color:#000000;font-size:11pt;font-family:"Arial"}h1{padding-top:20pt;color:#000000;font-size:20pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h2{padding-top:18pt;color:#000000;font-size:16pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h3{padding-top:16pt;color:#434343;font-size:14pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h4{padding-top:14pt;color:#666666;font-size:12pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h5{padding-top:12pt;color:#666666;font-size:11pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h6{padding-top:12pt;color:#666666;font-size:11pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;font-style:italic;orphans:2;widows:2;text-align:left}</style></head><body class="c2"><p class="c0"><span class="c1">Welcome :)</span></p></body></html>

31
meta/.default/about.md Normal file
View File

@ -0,0 +1,31 @@
---
title: "About"
description: "Hugo, the world's fastest framework for building websites"
date: "2019-02-28"
aliases:
- "about-us"
- "about-hugo"
- "contact"
author: "Hugo Authors"
---
Written in Go, Hugo is an open source static site generator available under the [Apache License 2.0.](https://github.com/gohugoio/hugo/blob/master/LICENSE) Hugo supports TOML, YAML and JSON data file types, Markdown and HTML content files and uses shortcodes to add rich content. Other notable features are taxonomies, multilingual mode, image processing, custom output formats, HTML/CSS/JS minification and support for Sass SCSS workflows.
Hugo makes use of a variety of open source projects including:
* https://github.com/yuin/goldmark
* https://github.com/alecthomas/chroma
* https://github.com/muesli/smartcrop
* https://github.com/spf13/cobra
* https://github.com/spf13/viper
Hugo is ideal for blogs, corporate websites, creative portfolios, online magazines, single page applications or even a website with thousands of pages.
Hugo is for people who want to hand code their own website without worrying about setting up complicated runtimes, dependencies and databases.
Websites built with Hugo are extremely fast, secure and can be deployed anywhere, including AWS, GitHub Pages, Heroku, Netlify and any other hosting provider.
Learn more and contribute on [GitHub](https://github.com/gohugoio).

76
meta/.default/config.yaml Normal file
View File

@ -0,0 +1,76 @@
baseURL: ''
languageCode: 'en-us'
title: 'Site Title'
theme: "anubis"
paginate: 5
paginatePath: "posts/-/pages"
enableRobotsTXT: true
disableLiveReload: true
minify: true
noHTTPCache: true
disableFastRender: true
rssLimit: 5
canonifyurls: true
relativeurls: false
params:
toc: true
author: "me"
email: ""
avatar: ""
description: ""
#customJS: [ "js/main.js" ] # relative to ./assets/
dateFormat: "2006-01-02"
paginationSinglePost: false
style: light
#style: auto-without-switcher
#style: light-without-switcher
readMore: true
disableSummary: false
social:
- id: email
name: contact
#- id: linkedin
# name: lapoba16
#- id: rss
# name: rss
# url: '/index.xml'
menu:
main:
- identifier: archive
name: Archive
title: Archive
url: /posts/
weight: 0
- identifier: about
name: About
title: About
url: /about/
weight: 1
markup:
defaultMarkdownHandler: goldmark
goldmark:
extensions:
linkify: true
strikethrough: true
table: true
taskList: true
parser:
autoHeadingID: true
autoHeadingIDType: github
renderer:
unsafe: true # raw HTML OK
highlight:
anchorLineNos: true
guessSyntax: true
lineNos: true
tabWidth: 3
tableOfContents:
endLevel: 3
ordered: false
startLevel: 2
permalinks:
posts: /posts/:year-:month-:day-:filename

View File

@ -1,31 +0,0 @@
---
title: "About"
description: "Hugo, the worlds fastest framework for building websites"
date: "2019-02-28"
aliases:
- "about-us"
- "about-hugo"
- "contact"
author: "Hugo Authors"
---
Written in Go, Hugo is an open source static site generator available under the [Apache Licence 2.0.](https://github.com/gohugoio/hugo/blob/master/LICENSE) Hugo supports TOML, YAML and JSON data file types, Markdown and HTML content files and uses shortcodes to add rich content. Other notable features are taxonomies, multilingual mode, image processing, custom output formats, HTML/CSS/JS minification and support for Sass SCSS workflows.
Hugo makes use of a variety of open source projects including:
* https://github.com/yuin/goldmark
* https://github.com/alecthomas/chroma
* https://github.com/muesli/smartcrop
* https://github.com/spf13/cobra
* https://github.com/spf13/viper
Hugo is ideal for blogs, corporate websites, creative portfolios, online magazines, single page applications or even a website with thousands of pages.
Hugo is for people who want to hand code their own website without worrying about setting up complicated runtimes, dependencies and databases.
Websites built with Hugo are extremelly fast, secure and can be deployed anywhere including, AWS, GitHub Pages, Heroku, Netlify and any other hosting provider.
Learn more and contribute on [GitHub](https://github.com/gohugoio).

1
meta/about.md Symbolic link
View File

@ -0,0 +1 @@
.default/about.md

View File

@ -1,76 +0,0 @@
baseURL: ''
languageCode: 'en-us'
title: 'Site Title'
theme: "anubis"
paginate: 5
paginatePath: "posts/-/pages"
enableRobotsTXT: true
disableLiveReload: true
minify: true
noHTTPCache: true
disableFastRender: true
rssLimit: 5
canonifyurls: true
relativeurls: false
params:
toc: true
author: "me"
email: ""
avatar: ""
description: ""
#customJS: [ "js/main.js" ] # relative to ./assets/
dateFormat: "2006-01-02"
paginationSinglePost: false
style: light
#style: auto-without-switcher
#style: light-without-switcher
readMore: true
disableSummary: false
social:
- id: email
name: contact
#- id: linkedin
# name: lapoba16
#- id: rss
# name: rss
# url: '/index.xml'
menu:
main:
- identifier: archive
name: Archive
title: Archive
url: /posts/
weight: 0
- identifier: about
name: About
title: About
url: /about/
weight: 1
markup:
defaultMarkdownHandler: goldmark
goldmark:
extensions:
linkify: true
strikethrough: true
table: true
taskList: true
parser:
autoHeadingID: true
autoHeadingIDType: github
renderer:
unsafe: true # raw HTML OK
highlight:
anchorLineNos: true
guessSyntax: true
lineNos: true
tabWidth: 3
tableOfContents:
endLevel: 3
ordered: false
startLevel: 2
permalinks:
posts: /posts/:year-:month-:day-:filename

1
meta/config.yaml Symbolic link
View File

@ -0,0 +1 @@
.default/config.yaml

274
posts.sh
View File

@ -11,7 +11,7 @@ TMPDIR="${TMPDIR:-"/tmp"}"
main() {
flags "$@"
#posts
posts
meta
}
@ -21,9 +21,109 @@ flags() {
cd "$(dirname "$BASH_SOURCE")"
}
posts() {
posts() (
list() {
gd list_posts
}
up_to_date() {
  # True when this post's cached metadata file exists and its content
  # matches the current Drive listing JSON ($1) — i.e. nothing to re-sync.
  local cache
  cache="$(metadatad "$1")"
  [[ -f "$cache" && "$1" == "$(cat "$cache")" ]]
}
pulled() {
  # Path under TMPDIR where the pulled Drive file lands; the filename
  # comes from the listing entry's .Name field ($1 is the lsjson JSON).
  local name
  name="$(jq -r .Name <<<"$1")"
  echo "$TMPDIR/$name"
}
pull() {
  # Download one post export from Drive into TMPDIR.
  # $1 - the lsjson listing entry (JSON) for the file.
  # BUG FIX: previously this read the caller's $json variable instead of
  # its own $1 argument, working only by accident of dynamic scoping.
  local filename
  filename="$(echo "$1" | jq -r .Name)"
  gd pull_posts "$filename"
}
extracted() {
  # Deterministic staging dir for one post: $TMPDIR/<YYYY-MM-DD>_<slug>.
  # $1 - lsjson entry; the date comes from .ModTime, the slug from the
  # pulled zip's basename with non-alphanumerics flattened to '_'.
  local src day slug
  src="$(pulled "$1")"
  day="$(echo "$1" | jq -r .ModTime)"
  day="${day%%T*}"
  slug="$(basename "$src" | sed -e 's/.zip$//' -e 's/[^a-zA-Z0-9]/_/g')"
  echo "$TMPDIR/${day}_${slug}"
}
extract() {
# Unpack one pulled Drive export (.zip) into its staging dir and turn the
# exported HTML into a Hugo page bundle (index.md plus dithered images).
local extracted_dir="$(extracted "$1")"
local index_html="$extracted_dir/.index.html"
local index_md="$extracted_dir/index.md"
local pulled_file="$(pulled "$1")"
local u_date_iso="$(echo "$1" | jq -r .ModTime)"
# Always start from a clean staging directory.
if [ -d "$extracted_dir" ]; then
rm -rf "$extracted_dir"
fi
mkdir -p "$extracted_dir"
7z x -o"$extracted_dir" "$pulled_file"
# Keep only the <body>...</body> portion of the exported HTML.
# NOTE(review): sed works per line, so this assumes the export is a
# single-line HTML document — confirm against the Drive export format.
cat "$extracted_dir"/*.html \
| sed 's/.*<body/<body/' \
| sed 's/<\/body>.*/<\/body>/' \
> "$index_html"
# Hashtags found in the visible text become Hugo tags (deduped, sorted).
local tags=($(cat "$index_html" | html_to_plaintext | grep -o '#[a-zA-Z0-9]*' | grep '[a-zA-Z]' | sed 's/^#//' | sort -u))
local tags_csv="$(first=false; for tag in "${tags[@]}"; do true; if $first; then echo -n ", "; fi; first=true; echo -n "$tag"; done)"
# Front matter: title from the zip's basename, date from Drive's ModTime;
# the trailing sed/grep strip indentation and blank lines from the template.
printf '
---
title: "%s"
date: %s
draft: false
tags: [%s]
---
' \
"$(basename "$pulled_file" | sed 's/.zip$//' | sed 's/"/\\"/g')" \
"$u_date_iso" \
"$tags_csv" \
| sed 's/^[ ]*//' \
| grep . \
> "$index_md"
# Append the raw HTML body; the site config enables raw HTML rendering.
cat "$index_html" >> "$index_md"
echo "" >> "$index_md"
rm "$index_html" "$extracted_dir"/*.html
# Ordered-dither every image in place to shrink the bundle ('.2' is a
# temporary output name that replaces the original on success).
for ext in png jpg jpeg gif JPG; do find "$extracted_dir" -name "*.$ext"; done | while read -r line; do
convert "$line" -ordered-dither o8x8,8,8,4 "$line.2"
mv "$line.2" "$line"
done
}
imported() {
  # Final location of this post's bundle inside the Hugo content tree:
  # same basename as the staging dir, parented under $HUGO_POSTS.
  echo "$HUGO_POSTS/$(basename "$(extracted "$1")")"
}
import() {
  # Move the staged bundle into the Hugo content tree, replacing any
  # previously imported copy of the same post.
  local staged dest
  staged="$(extracted "$1")"
  dest="$(imported "$1")"
  if [ -d "$dest" ]; then
    rm -rf "$dest"
  fi
  mv "$staged" "$dest"
}
metadatad() {
  # Path of the hidden metadata cache file inside the imported bundle.
  echo "$(imported "$1")/.metadata.json"
}
metadata() {
  # Record the current listing JSON so up_to_date can skip unchanged posts.
  local cache
  cache="$(metadatad "$1")"
  echo "$1" > "$cache"
}
local want=()
for b64_json in $(list_posts); do
for b64_json in $(list); do
local json="$(echo "$b64_json" | base64 --decode)"
local filename="$(echo "$json" | jq -r .Name)"
want+=("$(imported "$json")")
@ -47,11 +147,40 @@ posts() {
rm -rf "$d"
fi
done
}
)
meta() {
for b64_json in $(list_meta); do
echo "$b64_json" | base64 --decode | jq .
list() {
gd list_meta
}
pull() {
local filename="$(echo "$json" | jq -r .Name)"
gd pull_meta "$filename"
}
pulled() {
echo "$TMPDIR/$(echo "$1" | jq -r .Name)"
}
meta_file() {
echo "$TMPDIR/.meta.$(echo "$1" | jq -r .Name)"
}
up_to_date() {
test -f "$(meta_file "$1")" && test "$1" == "$(cat "$(meta_file "$1")")"
}
for b64_json in $(list); do
local json="$(echo "$b64_json" | base64 --decode | jq .)"
local filename="$(echo "$json" | jq -r .Name)"
if up_to_date "$json"; then
log "$filename: up to date"
continue
fi
log json=$json
pull "$json"
# todo: echo "$json" > "$(meta_file $json)"
done
false
}
@ -65,127 +194,30 @@ log() {
echo "$(date +%H:%M:%S) > $*" >&2
}
list_posts() {
  # Drive listing for the posts folder; Google Docs are always exported
  # as zip (this commit: "always zip").
  local fmt="--drive-export-formats zip"
  RCLONE_OPTS="$RCLONE_OPTS $fmt" gd list_posts
}
list_meta() {
  # Drive listing for the meta folder; Google Docs export as plain text.
  local fmt="--drive-export-formats txt"
  RCLONE_OPTS="$RCLONE_OPTS $fmt" gd list_meta
}
up_to_date() {
local metadatad="$(metadatad "$1")"
if [ -f "$metadatad" ]; then
if [ "$1" == "$(cat "$metadatad")" ]; then
return 0
fi
fi
return 1
}
pulled() {
echo "$TMPDIR/$(echo "$1" | jq -r .Name)"
}
pull() {
local filename="$(echo "$json" | jq -r .Name)"
gd pull_posts "$filename"
}
extracted() {
local pulled_file="$(pulled "$1")"
local date="$(echo "$1" | jq -r .ModTime)"
date="${date%%T*}"
local pulled_file_safe_basename="$(basename "$pulled_file" | sed 's/.zip$//' | sed 's/[^a-zA-Z0-9]/_/g')"
echo "$TMPDIR/${date}_${pulled_file_safe_basename}"
}
extract() {
local extracted_dir="$(extracted "$1")"
local index_html="$extracted_dir/.index.html"
local index_md="$extracted_dir/index.md"
local pulled_file="$(pulled "$1")"
local u_date_iso="$(echo "$1" | jq -r .ModTime)"
if [ -d "$extracted_dir" ]; then
rm -rf "$extracted_dir"
fi
mkdir -p "$extracted_dir"
7z x -o"$extracted_dir" "$pulled_file"
cat "$extracted_dir"/*.html \
| sed 's/.*<body/<body/' \
| sed 's/<\/body>.*/<\/body>/' \
> "$index_html"
local tags=($(cat "$index_html" | sed '/^$/!{s/<[^>]*>//g;p;}' | grep -o '#[a-zA-Z0-9]*' | grep '[a-zA-Z]' | sed 's/^#//' | sort -u))
local tags_csv="$(first=false; for tag in "${tags[@]}"; do true; if $first; then echo -n ", "; fi; first=true; echo -n "$tag"; done)"
printf '
---
title: "%s"
date: %s
draft: false
tags: [%s]
---
' \
"$(basename "$pulled_file" | sed 's/.zip$//' | sed 's/"/\\"/g')" \
"$u_date_iso" \
"$tags_csv" \
| sed 's/^[ ]*//' \
| grep . \
> "$index_md"
cat "$index_html" >> "$index_md"
echo "" >> "$index_md"
rm "$index_html" "$extracted_dir"/*.html
for ext in png jpg jpeg gif JPG; do find "$extracted_dir" -name "*.$ext"; done | while read -r line; do
convert "$line" -ordered-dither o8x8,8,8,4 "$line.2"
mv "$line.2" "$line"
done
}
imported() {
local extracted_dir="$(extracted "$1")"
echo "$HUGO_POSTS/$(basename "$extracted_dir")"
}
import() {
local extracted_dir="$(extracted "$1")"
local target_dir="$(imported "$1")"
if [ -d "$target_dir" ]; then
rm -rf "$target_dir"
fi
mv "$extracted_dir" "$target_dir"
}
metadatad() {
local imported="$(imported "$1")"
echo "$imported"/.metadata.json
}
metadata() {
local metadatad="$(metadatad "$1")"
echo "$1" > "$metadatad"
}
gd() (
list() {
  # List zip/txt entries of the given Drive subfolder, one per output
  # line, each base64-encoded so callers can safely word-split the output.
  # BUG FIX: GNU base64 wraps its output at 76 columns, which would split
  # a long listing entry into several tokens when the caller does
  # `for b64_json in $(list ...)`; strip embedded newlines so every
  # encoded entry occupies exactly one line on all platforms.
  rc lsjson "$RCLONE_ALIAS:$RCLONE_FOLDER/$1" \
    | jq -c .[] \
    | grep -E '(zip|txt)"' \
    | while read -r line; do
        echo "$line" | base64 | tr -d '\n'
        echo
      done
}
list_posts() {
# Convenience dispatch target: list the Drive posts subfolder.
gd list "$RCLONE_FOLDER_POSTS"
}
list_meta() {
# Convenience dispatch target: list the Drive meta subfolder.
gd list "$RCLONE_FOLDER_META"
}
list() {
rc lsjson "$RCLONE_ALIAS:$RCLONE_FOLDER/$1" \
| jq -c .[] \
| grep -E 'zip"' \
| while read -r line; do
echo "$line" | base64
done
}
pull_posts() {
rc copy "$RCLONE_ALIAS:$RCLONE_FOLDER/$RCLONE_FOLDER_POSTS/$1" "$TMPDIR"/
pull "$RCLONE_FOLDER_POSTS/$1"
}
pull_meta() {
  # Copy one meta file from Drive into TMPDIR.
  # FIX: removed a leftover `set -x` debug trace that permanently enabled
  # command tracing for the remainder of the gd subshell.
  pull "$RCLONE_FOLDER_META/$1"
}
pull() {
  # Copy one file (path relative to the Drive root folder) into TMPDIR.
  rc copy "$RCLONE_ALIAS:$RCLONE_FOLDER/$1" "${TMPDIR}/"
}
rc() {
rclone \
@ -198,6 +230,10 @@ gd() (
"$@"
)
html_to_plaintext() {
  # Crude HTML-to-text filter: strip every <...> tag from stdin.
  # (HTML entities such as &amp; pass through untouched.)
  sed -e 's|<[^>]*>||g'
}
if [ "$0" == "$BASH_SOURCE" ]; then
main "$@"
fi

View File

@ -5,6 +5,6 @@ client_secret = zCqs6bTjTm6TivvdU4BxhR19
scope = drive
#team_drive = 1-p043gwMk88rj4-Sm5otARi_AJZYskbj
#root_folder_id = 1-p043gwMk88rj4-Sm5otARi_AJZYskbj
#formats = zip,html,rtf,odt,docx,xlsx,pptx,svg
token = {"access_token":"ya29.a0ARrdaM-Ca19JFA2LNKls0IsuLLGEF2M2eytTRxRP-YoS73oJvhDOi_xdw48uu0c8q5wrLglRfPBVZM5h48Oe4fJBVTPMR96b1ySUXWCscuh-GtWHV0iF4JxlIqY3bgR6lg5odyiUvXRFauh0JcZ10ozJ7mUy4g","token_type":"Bearer","refresh_token":"1//04sQD4800FsqyCgYIARAAGAQSNwF-L9IrB1_7BV8ScFnqmIz8FNWgXfr80--6VG6KX1pTnneQKAz6Ss3YtxqmCdf2tRBy2ndkhek","expiry":"2021-11-29T13:47:23.55774-07:00"}
formats = zip,html,rtf,odt,docx,xlsx,pptx,svg
token = {"access_token":"ya29.a0ARrdaM-P6wKzub3feWIaQlG3YXq2REc0rfS1LDxfeV1m1rr5Zwhe_XqYTInCEcCHP4bHNPvcBcZufAnJpgLZlVBte-abfEOvwfk4Qz25__FboaddAXthlv_EeIxPwVTlKJTy7MOKGLz2Rhg47CjjagfFYMcCKw","token_type":"Bearer","refresh_token":"1//04sQD4800FsqyCgYIARAAGAQSNwF-L9IrB1_7BV8ScFnqmIz8FNWgXfr80--6VG6KX1pTnneQKAz6Ss3YtxqmCdf2tRBy2ndkhek","expiry":"2021-11-29T14:49:21.903576-07:00"}