RSS implemented and tested

master
Bel LaPointe 2018-10-08 10:02:34 -06:00
parent 24e30a7eee
commit 49d95c150e
5 changed files with 212 additions and 48 deletions

View File

@ -3,10 +3,11 @@ package rss
import ( import (
"bytes" "bytes"
"encoding/gob" "encoding/gob"
"errors" "fmt"
"io/ioutil" "io/ioutil"
"net/http" "net/http"
"regexp" "regexp"
"strings"
"time" "time"
"github.com/mmcdole/gofeed" "github.com/mmcdole/gofeed"
@ -17,26 +18,41 @@ type Feed struct {
Items []string Items []string
ItemFilter string ItemFilter string
ContentFilter string ContentFilter string
Source string Link string
} }
func NewFeed(source, itemFilter, contentFilter string) (*Feed, error) { func (feed *Feed) String() string {
return fmt.Sprintf("Updated: %v, Items: %v, ItemFilter: %q, ContentFilter: %q, Link; %v",
feed.Updated.Local(),
feed.Items,
feed.ItemFilter,
feed.ContentFilter,
feed.Link,
)
}
// ID builds an identifier for the feed from its Link by collecting the
// pattern's matches and joining them with underscores.
//
// The pattern uses `*`, so it can also match the empty string: a separator
// that does not directly follow a match contributes an empty piece. For
// example "a/b" yields "a_b" while "a//b" yields "a__b".
func (feed *Feed) ID() string {
	alnum := regexp.MustCompile("[a-zA-Z0-9]*")
	pieces := alnum.FindAllString(feed.Link, -1)
	return strings.Join(pieces, "_")
}
func NewFeed(source, itemFilter, contentFilter string) (*Feed, []*Item, error) {
if _, err := regexp.Compile(itemFilter); err != nil { if _, err := regexp.Compile(itemFilter); err != nil {
return nil, err return nil, nil, err
} }
if _, err := regexp.Compile(contentFilter); err != nil { if _, err := regexp.Compile(contentFilter); err != nil {
return nil, err return nil, nil, err
} }
f := &Feed{ f := &Feed{
Items: []string{}, Items: []string{},
ItemFilter: itemFilter, ItemFilter: itemFilter,
ContentFilter: contentFilter, ContentFilter: contentFilter,
Source: source, Link: source,
} }
if _, err := f.Update(); err != nil { items, err := f.Update()
return nil, err if err != nil {
return nil, nil, err
} }
return f, errors.New("not implemented") return f, items, nil
} }
func Deserialize(src []byte) (*Feed, error) { func Deserialize(src []byte) (*Feed, error) {
@ -47,15 +63,15 @@ func Deserialize(src []byte) (*Feed, error) {
return &dst, err return &dst, err
} }
func (f *Feed) Serialize() ([]byte, error) { func (feed *Feed) Serialize() ([]byte, error) {
var buffer bytes.Buffer var buffer bytes.Buffer
enc := gob.NewEncoder(&buffer) enc := gob.NewEncoder(&buffer)
err := enc.Encode(f) err := enc.Encode(feed)
return buffer.Bytes(), err return buffer.Bytes(), err
} }
func (f *Feed) Update() ([]*Item, error) { func (feed *Feed) Update() ([]*Item, error) {
resp, err := http.Get(f.Source) resp, err := http.Get(feed.Link)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -64,46 +80,46 @@ func (f *Feed) Update() ([]*Item, error) {
return nil, err return nil, err
} }
parser := gofeed.NewParser() parser := gofeed.NewParser()
feed, err := parser.Parse(bytes.NewBuffer(body)) gofeed, err := parser.Parse(bytes.NewBuffer(body))
if err != nil { if err != nil {
return nil, err return nil, err
} }
return f.fromGofeed(feed) return feed.fromGofeed(gofeed)
} }
func (f *Feed) fromGofeed(feed *gofeed.Feed) ([]*Item, error) { func (feed *Feed) fromGofeed(gofeed *gofeed.Feed) ([]*Item, error) {
updated := feed.PublishedParsed updated := gofeed.PublishedParsed
if updated == nil { if updated == nil {
updated = feed.UpdatedParsed updated = gofeed.UpdatedParsed
} }
if updated == nil && len(feed.Items) > 0 { if updated == nil && len(gofeed.Items) > 0 {
updated = gofeedItemTS(feed.Items[0]) updated = gofeedItemTS(gofeed.Items[0])
} }
if updated == nil { if updated == nil {
t := time.Now() t := time.Now()
updated = &t updated = &t
} }
newitems, err := f.appendNewItems(feed.Items) newitems, err := feed.appendNewItems(gofeed.Items)
if err != nil { if err != nil {
return nil, err return nil, err
} }
f.Updated = *updated feed.Updated = *updated
return newitems, nil return newitems, nil
} }
func (f *Feed) appendNewItems(items []*gofeed.Item) ([]*Item, error) { func (feed *Feed) appendNewItems(items []*gofeed.Item) ([]*Item, error) {
newitems := []*Item{} newitems := []*Item{}
for i := range items { for i := range items {
t := gofeedItemTS(items[i]) t := gofeedItemTS(items[i])
if t.Before(f.Updated) { if t.Before(feed.Updated) {
continue continue
} }
if ok, _ := regexp.MatchString(f.ItemFilter, items[i].Title); !ok { if ok, _ := regexp.MatchString(feed.ItemFilter, items[i].Title); !ok {
continue continue
} }
item := fromGofeedItem(items[i], f.ContentFilter) item := fromGofeedItem(items[i], feed.ContentFilter)
newitems = append(newitems, item) newitems = append(newitems, item)
f.Items = append(f.Items, item.Name) feed.Items = append(feed.Items, item.Name)
} }
return newitems, nil return newitems, nil
} }

91
rss/feed_test.go Normal file
View File

@ -0,0 +1,91 @@
package rss
import (
"io/ioutil"
"net/http"
"net/http/httptest"
"os"
"path"
"testing"
)
// Test_RSSFeed creates a Feed against the local fixture server for several
// item/content filter combinations. For each case it checks the number of
// items returned, that every item has non-empty content, and that the feed
// survives a Serialize/Deserialize round trip (compared via String()).
//
// The fixture holds four items, exactly one of which has "Blue" in its title,
// which is where the itemsOut expectations come from.
func Test_RSSFeed(t *testing.T) {
	s := RSSServer()
	defer s.Close()

	cases := []struct {
		itemFilter    string
		contentFilter string
		itemsOut      int
	}{
		{
			itemFilter:    "Blue",
			contentFilter: "<img.*?/(img)?>",
			itemsOut:      1,
		},
		{
			itemFilter: "Blue",
			itemsOut:   1,
		},
		{
			itemsOut: 4,
		},
	}

	for _, c := range cases {
		feed, items, err := NewFeed(s.URL, c.itemFilter, c.contentFilter)
		if err != nil {
			// NewFeed returns a nil feed on error; continuing would hand a nil
			// pointer to Serialize/String below and panic instead of reporting
			// the real failure, so stop here.
			t.Fatalf("couldn't create new feed with filters %q %q: %v", c.itemFilter, c.contentFilter, err)
		}
		if len(items) != c.itemsOut {
			t.Errorf("couldn't get all items from feed: got %v, wanted %v", len(items), c.itemsOut)
		}
		for i := range items {
			if len(items[i].Content) == 0 {
				t.Errorf("Empty content for %v with filter %q %q", items[i].Name, c.itemFilter, c.contentFilter)
			}
		}

		serialized, err := feed.Serialize()
		if err != nil {
			t.Fatalf("Cannot serialize feed: %v", err)
		}
		deserialized, err := Deserialize(serialized)
		if err != nil {
			t.Fatalf("Cannot deserialize feed: %v", err)
		}
		if feed.String() != deserialized.String() {
			t.Fatalf("deserialized != feed: %s, expected %s", deserialized.String(), feed.String())
		}
	}
}
// RSSServer starts an httptest server that answers every request with the
// bytes of the feed fixture at ./testdata/feed.xml.
//
// If the fixture does not exist yet it is downloaded once from
// https://xkcd.com/rss.xml and written to that path so later runs work
// offline. Any failure while obtaining the fixture panics, since the tests
// cannot run without it. The caller is responsible for closing the returned
// server.
func RSSServer() *httptest.Server {
	const feedPath = "./testdata/feed.xml"
	const feedURL = "https://xkcd.com/rss.xml"

	var content []byte
	if _, err := os.Stat(feedPath); os.IsNotExist(err) {
		resp, err := http.Get(feedURL)
		if err != nil {
			panic(err)
		}
		defer resp.Body.Close()
		// Refuse to cache an error page as the fixture: a non-200 body would
		// be written to disk and silently break every later run.
		if resp.StatusCode != http.StatusOK {
			panic("fetching " + feedURL + ": unexpected status " + resp.Status)
		}
		b, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			panic(err)
		}
		if err := os.MkdirAll(path.Dir(feedPath), os.ModePerm); err != nil {
			panic(err)
		}
		// The fixture is plain data; write it as a regular, non-executable file.
		if err := ioutil.WriteFile(feedPath, b, 0644); err != nil {
			panic(err)
		}
		content = b
	} else {
		b, err := ioutil.ReadFile(feedPath)
		if err != nil {
			panic(err)
		}
		content = b
	}

	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Write(content)
	}))
}

View File

@ -1,7 +1,10 @@
package rss package rss
import ( import (
"fmt"
"html" "html"
"io/ioutil"
"net/http"
"regexp" "regexp"
"strings" "strings"
"time" "time"
@ -16,6 +19,19 @@ type Item struct {
TS time.Time TS time.Time
} }
// String renders the item on one line for logs and test output: name, link,
// the content as a quoted string, and the timestamp converted to local time.
func (item *Item) String() string {
	const layout = "Name %v, Link %v, Content %q, TS %v"
	localTS := item.TS.Local()
	return fmt.Sprintf(layout, item.Name, item.Link, item.Content, localTS)
}
// ID builds an identifier for the item from its Link by collecting the
// pattern's matches and joining them with underscores.
//
// The pattern uses `*`, so it can also match the empty string: a separator
// that does not directly follow a match contributes an empty piece. For
// example "a/b" yields "a_b" while "a//b" yields "a__b".
func (item *Item) ID() string {
	alnum := regexp.MustCompile("[a-zA-Z0-9]*")
	pieces := alnum.FindAllString(item.Link, -1)
	return strings.Join(pieces, "_")
}
func fromGofeedItem(gfitem *gofeed.Item, filter string) *Item { func fromGofeedItem(gfitem *gofeed.Item, filter string) *Item {
item := &Item{ item := &Item{
Name: gfitem.Title, Name: gfitem.Title,
@ -23,35 +39,48 @@ func fromGofeedItem(gfitem *gofeed.Item, filter string) *Item {
Content: "", Content: "",
TS: *gofeedItemTS(gfitem), TS: *gofeedItemTS(gfitem),
} }
if filter == "" { content := gfitem.Content
item.Content = gfitem.Content if content == "" {
return item content = contentFromLink(item.Link)
} }
r := regexp.MustCompile(filter) content = strings.Replace(content, "\n", "", -1)
matches := r.FindAllString(gfitem.Content, -1) if filter != "" {
content := strings.Join(matches, "\n<br>\n") r := regexp.MustCompile(filter)
content = cleanImgTags(content, item.Link) matches := r.FindAllString(content, -1)
content = strings.Join(matches, "<br>")
}
content = cleanImgTags(content)
item.Content = content item.Content = content
return item return item
} }
func cleanImgTags(s, url string) string { func contentFromLink(link string) string {
domain := regexp.MustCompile("(https?://)?(www\\.)?[a-zA-Z0-9]+\\.+[a-z]{2}[a-z]?").FindString(url) resp, err := http.Get(link)
if err != nil {
return ""
}
defer resp.Body.Close()
b, err := ioutil.ReadAll(resp.Body)
if err != nil {
return ""
}
return string(b)
}
func cleanImgTags(s string) string {
reg := regexp.MustCompile("<img.+?/(img)?>") reg := regexp.MustCompile("<img.+?/(img)?>")
s = html.UnescapeString(s) s = html.UnescapeString(s)
matches := reg.FindAllString(s, -1) matches := reg.FindAllString(s, -1)
if len(matches) > 0 { if len(matches) > 0 {
// get img src="..." and build // get img src="..." and build
regImgSrc := regexp.MustCompile("src=\"[^\"]+\"") regImgSrc := regexp.MustCompile("src=\".*?\"")
for j := range matches { for j := range matches {
theseMatches := regImgSrc.FindAllString(matches[j], -1) imgSrc := regImgSrc.FindString(matches[j])
for k := range theseMatches { replacement := matches[j]
if strings.HasPrefix(theseMatches[k], "src=\"/") { if imgSrc != "" {
theseMatches[k] = "src=\"" + domain + theseMatches[k][5:] replacement = "<img " + imgSrc + "/>"
}
theseMatches[k] = "<img " + theseMatches[k] + " />"
} }
s = strings.Replace(s, matches[j], strings.Join(theseMatches, "<br>"), 1) s = strings.Replace(s, matches[j], replacement, 1)
} }
} }
return s return s

View File

@ -12,6 +12,32 @@ func Test_RSSItem(t *testing.T) {
filter string filter string
output Item output Item
}{ }{
{
input: gofeed.Item{
Title: "a",
Link: "b",
Content: `<img src="A" and=things/> <img src="asdf" and="some-toher-stuff"></img>`,
},
filter: `<[a-z]+.+?/([a-z]+)?>`,
output: Item{
Name: "a",
Link: "b",
Content: `<img src="A"/><br><img src="asdf"/>`,
},
},
{
input: gofeed.Item{
Title: "a",
Link: "b",
Content: `a b c <img src="asdf" and="some-toher-stuff"></img> d e f`,
},
filter: `<[a-z]+.+?/([a-z]+)?>`,
output: Item{
Name: "a",
Link: "b",
Content: `<img src="asdf"/>`,
},
},
{ {
input: gofeed.Item{ input: gofeed.Item{
Title: "a", Title: "a",
@ -29,13 +55,13 @@ func Test_RSSItem(t *testing.T) {
input: gofeed.Item{ input: gofeed.Item{
Title: "a", Title: "a",
Link: "b", Link: "b",
Content: `x y <img src="asdf"></img>`, Content: `<img src="asdf"></img>`,
}, },
filter: "[a-z]*", filter: "",
output: Item{ output: Item{
Name: "a", Name: "a",
Link: "b", Link: "b",
Content: "x\n<br>\ny", Content: `<img src="asdf"/>`,
}, },
}, },
{ {
@ -48,7 +74,7 @@ func Test_RSSItem(t *testing.T) {
output: Item{ output: Item{
Name: "a", Name: "a",
Link: "b", Link: "b",
Content: "x\n<br>\ny", Content: "x<br>y",
}, },
}, },
} }

2
rss/testdata/feed.xml vendored Executable file
View File

@ -0,0 +1,2 @@
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0"><channel><title>xkcd.com</title><link>https://xkcd.com/</link><description>xkcd.com: A webcomic of romance and math humor.</description><language>en</language><item><title>Bluetooth</title><link>https://xkcd.com/2055/</link><description>&lt;img src="https://imgs.xkcd.com/comics/bluetooth.png" title="Bluetooth is actually named for the tenth-century Viking king Harald &amp;quot;Bluetooth&amp;quot; Gormsson, but the protocol developed by Harald was a wireless charging standard unrelated to the modern Bluetooth except by name." alt="Bluetooth is actually named for the tenth-century Viking king Harald &amp;quot;Bluetooth&amp;quot; Gormsson, but the protocol developed by Harald was a wireless charging standard unrelated to the modern Bluetooth except by name." /&gt;</description><pubDate>Fri, 05 Oct 2018 04:00:00 -0000</pubDate><guid>https://xkcd.com/2055/</guid></item><item><title>Data Pipeline</title><link>https://xkcd.com/2054/</link><description>&lt;img src="https://imgs.xkcd.com/comics/data_pipeline.png" title="&amp;quot;Is the pipeline literally running from your laptop?&amp;quot; &amp;quot;Don't be silly, my laptop disconnects far too often to host a service we rely on. It's running on my phone.&amp;quot;" alt="&amp;quot;Is the pipeline literally running from your laptop?&amp;quot; &amp;quot;Don't be silly, my laptop disconnects far too often to host a service we rely on. It's running on my phone.&amp;quot;" /&gt;</description><pubDate>Wed, 03 Oct 2018 04:00:00 -0000</pubDate><guid>https://xkcd.com/2054/</guid></item><item><title>Incoming Calls</title><link>https://xkcd.com/2053/</link><description>&lt;img src="https://imgs.xkcd.com/comics/incoming_calls.png" title="I wonder if that friendly lady ever fixed the problem she was having with her headset." alt="I wonder if that friendly lady ever fixed the problem she was having with her headset." 
/&gt;</description><pubDate>Mon, 01 Oct 2018 04:00:00 -0000</pubDate><guid>https://xkcd.com/2053/</guid></item><item><title>Stanislav Petrov Day</title><link>https://xkcd.com/2052/</link><description>&lt;img src="https://imgs.xkcd.com/comics/stanislav_petrov_day.png" title="I was going to get you an alarm clock that occasionally goes off randomly in the middle of the night, but you can ignore it and go back to sleep and it's fine." alt="I was going to get you an alarm clock that occasionally goes off randomly in the middle of the night, but you can ignore it and go back to sleep and it's fine." /&gt;</description><pubDate>Fri, 28 Sep 2018 04:00:00 -0000</pubDate><guid>https://xkcd.com/2052/</guid></item></channel></rss>