From 49d95c150e6e91a8eb2bb47104db17960dd22212 Mon Sep 17 00:00:00 2001 From: Bel LaPointe Date: Mon, 8 Oct 2018 10:02:34 -0600 Subject: [PATCH] RSS implemented and tested --- rss/feed.go | 70 ++++++++++++++++++++------------- rss/feed_test.go | 91 +++++++++++++++++++++++++++++++++++++++++++ rss/item.go | 63 ++++++++++++++++++++++-------- rss/item_test.go | 34 ++++++++++++++-- rss/testdata/feed.xml | 2 + 5 files changed, 212 insertions(+), 48 deletions(-) create mode 100644 rss/feed_test.go create mode 100755 rss/testdata/feed.xml diff --git a/rss/feed.go b/rss/feed.go index f6f24db..c294057 100644 --- a/rss/feed.go +++ b/rss/feed.go @@ -3,10 +3,11 @@ package rss import ( "bytes" "encoding/gob" - "errors" + "fmt" "io/ioutil" "net/http" "regexp" + "strings" "time" "github.com/mmcdole/gofeed" @@ -17,26 +18,41 @@ type Feed struct { Items []string ItemFilter string ContentFilter string - Source string + Link string } -func NewFeed(source, itemFilter, contentFilter string) (*Feed, error) { +func (feed *Feed) String() string { + return fmt.Sprintf("Updated: %v, Items: %v, ItemFilter: %q, ContentFilter: %q, Link; %v", + feed.Updated.Local(), + feed.Items, + feed.ItemFilter, + feed.ContentFilter, + feed.Link, + ) +} + +func (feed *Feed) ID() string { + return strings.Join(regexp.MustCompile("[a-zA-Z0-9]*").FindAllString(feed.Link, -1), "_") +} + +func NewFeed(source, itemFilter, contentFilter string) (*Feed, []*Item, error) { if _, err := regexp.Compile(itemFilter); err != nil { - return nil, err + return nil, nil, err } if _, err := regexp.Compile(contentFilter); err != nil { - return nil, err + return nil, nil, err } f := &Feed{ Items: []string{}, ItemFilter: itemFilter, ContentFilter: contentFilter, - Source: source, + Link: source, } - if _, err := f.Update(); err != nil { - return nil, err + items, err := f.Update() + if err != nil { + return nil, nil, err } - return f, errors.New("not implemented") + return f, items, nil } func Deserialize(src []byte) (*Feed, error) { @@ -47,15 +63,15 @@ func Deserialize(src []byte) (*Feed, error) { return &dst, err } -func (f *Feed) Serialize() ([]byte, error) { +func (feed *Feed) Serialize() ([]byte, error) { var buffer bytes.Buffer enc := gob.NewEncoder(&buffer) - err := enc.Encode(f) + err := enc.Encode(feed) return buffer.Bytes(), err } -func (f *Feed) Update() ([]*Item, error) { - resp, err := http.Get(f.Source) +func (feed *Feed) Update() ([]*Item, error) { + resp, err := http.Get(feed.Link) if err != nil { return nil, err } @@ -64,46 +80,46 @@ func (f *Feed) Update() ([]*Item, error) { return nil, err } parser := gofeed.NewParser() - feed, err := parser.Parse(bytes.NewBuffer(body)) + gofeed, err := parser.Parse(bytes.NewBuffer(body)) if err != nil { return nil, err } - return f.fromGofeed(feed) + return feed.fromGofeed(gofeed) } -func (f *Feed) fromGofeed(feed *gofeed.Feed) ([]*Item, error) { - updated := feed.PublishedParsed +func (feed *Feed) fromGofeed(gofeed *gofeed.Feed) ([]*Item, error) { + updated := gofeed.PublishedParsed if updated == nil { - updated = feed.UpdatedParsed + updated = gofeed.UpdatedParsed } - if updated == nil && len(feed.Items) > 0 { - updated = gofeedItemTS(feed.Items[0]) + if updated == nil && len(gofeed.Items) > 0 { + updated = gofeedItemTS(gofeed.Items[0]) } if updated == nil { t := time.Now() updated = &t } - newitems, err := f.appendNewItems(feed.Items) + newitems, err := feed.appendNewItems(gofeed.Items) if err != nil { return nil, err } - f.Updated = *updated + feed.Updated = *updated return newitems, nil } -func (f *Feed) appendNewItems(items []*gofeed.Item) ([]*Item, error) { +func (feed *Feed) appendNewItems(items []*gofeed.Item) ([]*Item, error) { newitems := []*Item{} for i := range items { t := gofeedItemTS(items[i]) - if t.Before(f.Updated) { + if t.Before(feed.Updated) { continue } - if ok, _ := regexp.MatchString(f.ItemFilter, items[i].Title); !ok { + if ok, _ := regexp.MatchString(feed.ItemFilter, items[i].Title); !ok { continue } - item := fromGofeedItem(items[i], f.ContentFilter) + item := fromGofeedItem(items[i], feed.ContentFilter) newitems = append(newitems, item) - f.Items = append(f.Items, item.Name) + feed.Items = append(feed.Items, item.Name) } return newitems, nil } diff --git a/rss/feed_test.go b/rss/feed_test.go new file mode 100644 index 0000000..f8fcdba --- /dev/null +++ b/rss/feed_test.go @@ -0,0 +1,91 @@ +package rss + +import ( + "io/ioutil" + "net/http" + "net/http/httptest" + "os" + "path" + "testing" +) + +func Test_RSSFeed(t *testing.T) { + s := RSSServer() + defer s.Close() + cases := []struct { + itemFilter string + contentFilter string + itemsOut int + }{ + { + itemFilter: "Blue", + contentFilter: "", + itemsOut: 1, + }, + { + itemFilter: "Blue", + itemsOut: 1, + }, + { + itemsOut: 4, + }, + } + for _, c := range cases { + feed, items, err := NewFeed(s.URL, c.itemFilter, c.contentFilter) + if err != nil { + t.Errorf("couldn't create new feed %v: %v", feed, err) + } + if len(items) != c.itemsOut { + t.Errorf("couldn't get all items from feed: got %v, wanted %v", len(items), c.itemsOut) + } + for i := range items { + if len(items[i].Content) == 0 { + t.Errorf("Empty content for %v with filter %q %q", items[i].Name, c.itemFilter, c.contentFilter) + } + } + serialized, err := feed.Serialize() + if err != nil { + t.Fatalf("Cannot serialize feed: %v", err) + } + deserialized, err := Deserialize(serialized) + if err != nil { + t.Fatalf("Cannot deserialize feed: %v", err) + } + if feed.String() != deserialized.String() { + t.Fatalf("deserialized != feed: %s, expected %s", deserialized.String(), feed.String()) + } + } +} + +func RSSServer() *httptest.Server { + var content []byte + feedPath := "./testdata/feed.xml" + if _, err := os.Stat(feedPath); os.IsNotExist(err) { + resp, err := http.Get("https://xkcd.com/rss.xml") + if err != nil { + panic(err) + } + defer resp.Body.Close() + b, err := ioutil.ReadAll(resp.Body) + if err != nil { + panic(err) + } + if err := os.MkdirAll(path.Dir(feedPath), os.ModePerm); err != nil { + panic(err) + } + if err := ioutil.WriteFile(feedPath, b, os.ModePerm); err != nil { + panic(err) + } + content = b + } else { + b, err := ioutil.ReadFile(feedPath) + if err != nil { + panic(err) + } + content = b + } + + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Write(content) + })) +} diff --git a/rss/item.go b/rss/item.go index eefc4dd..a8d7b4b 100644 --- a/rss/item.go +++ b/rss/item.go @@ -1,7 +1,10 @@ package rss import ( + "fmt" "html" + "io/ioutil" + "net/http" "regexp" "strings" "time" @@ -16,6 +19,19 @@ type Item struct { TS time.Time } +func (item *Item) String() string { + return fmt.Sprintf("Name %v, Link %v, Content %q, TS %v", + item.Name, + item.Link, + item.Content, + item.TS.Local(), + ) +} + +func (item *Item) ID() string { + return strings.Join(regexp.MustCompile("[a-zA-Z0-9]*").FindAllString(item.Link, -1), "_") +} + func fromGofeedItem(gfitem *gofeed.Item, filter string) *Item { item := &Item{ Name: gfitem.Title, @@ -23,35 +39,48 @@ func fromGofeedItem(gfitem *gofeed.Item, filter string) *Item { Content: "", TS: *gofeedItemTS(gfitem), } - if filter == "" { - item.Content = gfitem.Content - return item + content := gfitem.Content + if content == "" { + content = contentFromLink(item.Link) } - r := regexp.MustCompile(filter) - matches := r.FindAllString(gfitem.Content, -1) - content := strings.Join(matches, "\n
\n") - content = cleanImgTags(content, item.Link) + content = strings.Replace(content, "\n", "", -1) + if filter != "" { + r := regexp.MustCompile(filter) + matches := r.FindAllString(content, -1) + content = strings.Join(matches, "
") + } + content = cleanImgTags(content) item.Content = content return item } -func cleanImgTags(s, url string) string { - domain := regexp.MustCompile("(https?://)?(www\\.)?[a-zA-Z0-9]+\\.+[a-z]{2}[a-z]?").FindString(url) +func contentFromLink(link string) string { + resp, err := http.Get(link) + if err != nil { + return "" + } + defer resp.Body.Close() + b, err := ioutil.ReadAll(resp.Body) + if err != nil { + return "" + } + return string(b) +} + +func cleanImgTags(s string) string { reg := regexp.MustCompile("") s = html.UnescapeString(s) matches := reg.FindAllString(s, -1) if len(matches) > 0 { // get img src="..." and build - regImgSrc := regexp.MustCompile("src=\"[^\"]+\"") + regImgSrc := regexp.MustCompile("src=\".*?\"") for j := range matches { - theseMatches := regImgSrc.FindAllString(matches[j], -1) - for k := range theseMatches { - if strings.HasPrefix(theseMatches[k], "src=\"/") { - theseMatches[k] = "src=\"" + domain + theseMatches[k][5:] - } - theseMatches[k] = "" + imgSrc := regImgSrc.FindString(matches[j]) + replacement := matches[j] + if imgSrc != "" { + replacement = "" } - s = strings.Replace(s, matches[j], strings.Join(theseMatches, "
"), 1) + s = strings.Replace(s, matches[j], replacement, 1) } } return s diff --git a/rss/item_test.go b/rss/item_test.go index ce134fc..32424a5 100644 --- a/rss/item_test.go +++ b/rss/item_test.go @@ -12,6 +12,32 @@ func Test_RSSItem(t *testing.T) { filter string output Item }{ + { + input: gofeed.Item{ + Title: "a", + Link: "b", + Content: ` `, + }, + filter: `<[a-z]+.+?/([a-z]+)?>`, + output: Item{ + Name: "a", + Link: "b", + Content: `
`, + }, + }, + { + input: gofeed.Item{ + Title: "a", + Link: "b", + Content: `a b c d e f`, + }, + filter: `<[a-z]+.+?/([a-z]+)?>`, + output: Item{ + Name: "a", + Link: "b", + Content: ``, + }, + }, { input: gofeed.Item{ Title: "a", @@ -29,13 +55,13 @@ func Test_RSSItem(t *testing.T) { input: gofeed.Item{ Title: "a", Link: "b", - Content: `x y `, + Content: ``, }, - filter: "[a-z]*", + filter: "", output: Item{ Name: "a", Link: "b", - Content: "x\n
\ny", + Content: ``, }, }, { @@ -48,7 +74,7 @@ func Test_RSSItem(t *testing.T) { output: Item{ Name: "a", Link: "b", - Content: "x\n
\ny", + Content: "x
y", }, }, } diff --git a/rss/testdata/feed.xml b/rss/testdata/feed.xml new file mode 100755 index 0000000..63bf1a9 --- /dev/null +++ b/rss/testdata/feed.xml @@ -0,0 +1,2 @@ + +xkcd.comhttps://xkcd.com/xkcd.com: A webcomic of romance and math humor.enBluetoothhttps://xkcd.com/2055/<img src="https://imgs.xkcd.com/comics/bluetooth.png" title="Bluetooth is actually named for the tenth-century Viking king Harald &quot;Bluetooth&quot; Gormsson, but the protocol developed by Harald was a wireless charging standard unrelated to the modern Bluetooth except by name." alt="Bluetooth is actually named for the tenth-century Viking king Harald &quot;Bluetooth&quot; Gormsson, but the protocol developed by Harald was a wireless charging standard unrelated to the modern Bluetooth except by name." />Fri, 05 Oct 2018 04:00:00 -0000https://xkcd.com/2055/Data Pipelinehttps://xkcd.com/2054/<img src="https://imgs.xkcd.com/comics/data_pipeline.png" title="&quot;Is the pipeline literally running from your laptop?&quot; &quot;Don't be silly, my laptop disconnects far too often to host a service we rely on. It's running on my phone.&quot;" alt="&quot;Is the pipeline literally running from your laptop?&quot; &quot;Don't be silly, my laptop disconnects far too often to host a service we rely on. It's running on my phone.&quot;" />Wed, 03 Oct 2018 04:00:00 -0000https://xkcd.com/2054/Incoming Callshttps://xkcd.com/2053/<img src="https://imgs.xkcd.com/comics/incoming_calls.png" title="I wonder if that friendly lady ever fixed the problem she was having with her headset." alt="I wonder if that friendly lady ever fixed the problem she was having with her headset." />Mon, 01 Oct 2018 04:00:00 -0000https://xkcd.com/2053/Stanislav Petrov Dayhttps://xkcd.com/2052/<img src="https://imgs.xkcd.com/comics/stanislav_petrov_day.png" title="I was going to get you an alarm clock that occasionally goes off randomly in the middle of the night, but you can ignore it and go back to sleep and it's fine." alt="I was going to get you an alarm clock that occasionally goes off randomly in the middle of the night, but you can ignore it and go back to sleep and it's fine." />Fri, 28 Sep 2018 04:00:00 -0000https://xkcd.com/2052/ \ No newline at end of file