RSS implemented and tested
parent
24e30a7eee
commit
49d95c150e
70
rss/feed.go
70
rss/feed.go
|
|
@ -3,10 +3,11 @@ package rss
|
|||
import (
|
||||
"bytes"
|
||||
"encoding/gob"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/mmcdole/gofeed"
|
||||
|
|
@ -17,26 +18,41 @@ type Feed struct {
|
|||
Items []string
|
||||
ItemFilter string
|
||||
ContentFilter string
|
||||
Source string
|
||||
Link string
|
||||
}
|
||||
|
||||
func NewFeed(source, itemFilter, contentFilter string) (*Feed, error) {
|
||||
func (feed *Feed) String() string {
|
||||
return fmt.Sprintf("Updated: %v, Items: %v, ItemFilter: %q, ContentFilter: %q, Link; %v",
|
||||
feed.Updated.Local(),
|
||||
feed.Items,
|
||||
feed.ItemFilter,
|
||||
feed.ContentFilter,
|
||||
feed.Link,
|
||||
)
|
||||
}
|
||||
|
||||
func (feed *Feed) ID() string {
|
||||
return strings.Join(regexp.MustCompile("[a-zA-Z0-9]*").FindAllString(feed.Link, -1), "_")
|
||||
}
|
||||
|
||||
func NewFeed(source, itemFilter, contentFilter string) (*Feed, []*Item, error) {
|
||||
if _, err := regexp.Compile(itemFilter); err != nil {
|
||||
return nil, err
|
||||
return nil, nil, err
|
||||
}
|
||||
if _, err := regexp.Compile(contentFilter); err != nil {
|
||||
return nil, err
|
||||
return nil, nil, err
|
||||
}
|
||||
f := &Feed{
|
||||
Items: []string{},
|
||||
ItemFilter: itemFilter,
|
||||
ContentFilter: contentFilter,
|
||||
Source: source,
|
||||
Link: source,
|
||||
}
|
||||
if _, err := f.Update(); err != nil {
|
||||
return nil, err
|
||||
items, err := f.Update()
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
return f, errors.New("not implemented")
|
||||
return f, items, nil
|
||||
}
|
||||
|
||||
func Deserialize(src []byte) (*Feed, error) {
|
||||
|
|
@ -47,15 +63,15 @@ func Deserialize(src []byte) (*Feed, error) {
|
|||
return &dst, err
|
||||
}
|
||||
|
||||
func (f *Feed) Serialize() ([]byte, error) {
|
||||
func (feed *Feed) Serialize() ([]byte, error) {
|
||||
var buffer bytes.Buffer
|
||||
enc := gob.NewEncoder(&buffer)
|
||||
err := enc.Encode(f)
|
||||
err := enc.Encode(feed)
|
||||
return buffer.Bytes(), err
|
||||
}
|
||||
|
||||
func (f *Feed) Update() ([]*Item, error) {
|
||||
resp, err := http.Get(f.Source)
|
||||
func (feed *Feed) Update() ([]*Item, error) {
|
||||
resp, err := http.Get(feed.Link)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
@ -64,46 +80,46 @@ func (f *Feed) Update() ([]*Item, error) {
|
|||
return nil, err
|
||||
}
|
||||
parser := gofeed.NewParser()
|
||||
feed, err := parser.Parse(bytes.NewBuffer(body))
|
||||
gofeed, err := parser.Parse(bytes.NewBuffer(body))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return f.fromGofeed(feed)
|
||||
return feed.fromGofeed(gofeed)
|
||||
}
|
||||
|
||||
func (f *Feed) fromGofeed(feed *gofeed.Feed) ([]*Item, error) {
|
||||
updated := feed.PublishedParsed
|
||||
func (feed *Feed) fromGofeed(gofeed *gofeed.Feed) ([]*Item, error) {
|
||||
updated := gofeed.PublishedParsed
|
||||
if updated == nil {
|
||||
updated = feed.UpdatedParsed
|
||||
updated = gofeed.UpdatedParsed
|
||||
}
|
||||
if updated == nil && len(feed.Items) > 0 {
|
||||
updated = gofeedItemTS(feed.Items[0])
|
||||
if updated == nil && len(gofeed.Items) > 0 {
|
||||
updated = gofeedItemTS(gofeed.Items[0])
|
||||
}
|
||||
if updated == nil {
|
||||
t := time.Now()
|
||||
updated = &t
|
||||
}
|
||||
newitems, err := f.appendNewItems(feed.Items)
|
||||
newitems, err := feed.appendNewItems(gofeed.Items)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
f.Updated = *updated
|
||||
feed.Updated = *updated
|
||||
return newitems, nil
|
||||
}
|
||||
|
||||
func (f *Feed) appendNewItems(items []*gofeed.Item) ([]*Item, error) {
|
||||
func (feed *Feed) appendNewItems(items []*gofeed.Item) ([]*Item, error) {
|
||||
newitems := []*Item{}
|
||||
for i := range items {
|
||||
t := gofeedItemTS(items[i])
|
||||
if t.Before(f.Updated) {
|
||||
if t.Before(feed.Updated) {
|
||||
continue
|
||||
}
|
||||
if ok, _ := regexp.MatchString(f.ItemFilter, items[i].Title); !ok {
|
||||
if ok, _ := regexp.MatchString(feed.ItemFilter, items[i].Title); !ok {
|
||||
continue
|
||||
}
|
||||
item := fromGofeedItem(items[i], f.ContentFilter)
|
||||
item := fromGofeedItem(items[i], feed.ContentFilter)
|
||||
newitems = append(newitems, item)
|
||||
f.Items = append(f.Items, item.Name)
|
||||
feed.Items = append(feed.Items, item.Name)
|
||||
}
|
||||
return newitems, nil
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,91 @@
|
|||
package rss
|
||||
|
||||
import (
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func Test_RSSFeed(t *testing.T) {
|
||||
s := RSSServer()
|
||||
defer s.Close()
|
||||
cases := []struct {
|
||||
itemFilter string
|
||||
contentFilter string
|
||||
itemsOut int
|
||||
}{
|
||||
{
|
||||
itemFilter: "Blue",
|
||||
contentFilter: "<img.*?/(img)?>",
|
||||
itemsOut: 1,
|
||||
},
|
||||
{
|
||||
itemFilter: "Blue",
|
||||
itemsOut: 1,
|
||||
},
|
||||
{
|
||||
itemsOut: 4,
|
||||
},
|
||||
}
|
||||
for _, c := range cases {
|
||||
feed, items, err := NewFeed(s.URL, c.itemFilter, c.contentFilter)
|
||||
if err != nil {
|
||||
t.Errorf("couldn't create new feed %v: %v", feed, err)
|
||||
}
|
||||
if len(items) != c.itemsOut {
|
||||
t.Errorf("couldn't get all items from feed: got %v, wanted %v", len(items), c.itemsOut)
|
||||
}
|
||||
for i := range items {
|
||||
if len(items[i].Content) == 0 {
|
||||
t.Errorf("Empty content for %v with filter %q %q", items[i].Name, c.itemFilter, c.contentFilter)
|
||||
}
|
||||
}
|
||||
serialized, err := feed.Serialize()
|
||||
if err != nil {
|
||||
t.Fatalf("Cannot serialize feed: %v", err)
|
||||
}
|
||||
deserialized, err := Deserialize(serialized)
|
||||
if err != nil {
|
||||
t.Fatalf("Cannot deserialize feed: %v", err)
|
||||
}
|
||||
if feed.String() != deserialized.String() {
|
||||
t.Fatalf("deserialized != feed: %s, expected %s", deserialized.String(), feed.String())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// RSSServer starts an httptest server that answers every request with the
// contents of ./testdata/feed.xml. If that file does not exist yet, it is
// downloaded once from https://xkcd.com/rss.xml and stored at that path.
// Any failure while preparing the fixture panics. The caller must Close the
// returned server.
func RSSServer() *httptest.Server {
	feedPath := "./testdata/feed.xml"

	content, err := ioutil.ReadFile(feedPath)
	switch {
	case os.IsNotExist(err):
		content = fetchFeedFixture(feedPath)
	case err != nil:
		panic(err)
	}

	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// A failed write only means the test client went away; nothing to do.
		_, _ = w.Write(content)
	}))
}

// fetchFeedFixture downloads the xkcd RSS feed, stores it at feedPath
// (creating the parent directory if needed) and returns the downloaded bytes.
// It panics on any error, including a non-200 response, so that an error page
// is never cached as the fixture.
func fetchFeedFixture(feedPath string) []byte {
	resp, err := http.Get("https://xkcd.com/rss.xml")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		panic("fetching feed fixture: unexpected status " + resp.Status)
	}

	b, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	// Fixture files need no execute bit and need not be world-writable.
	if err := os.MkdirAll(path.Dir(feedPath), 0o755); err != nil {
		panic(err)
	}
	if err := ioutil.WriteFile(feedPath, b, 0o644); err != nil {
		panic(err)
	}
	return b
}
|
||||
63
rss/item.go
63
rss/item.go
|
|
@ -1,7 +1,10 @@
|
|||
package rss
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"html"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
|
@ -16,6 +19,19 @@ type Item struct {
|
|||
TS time.Time
|
||||
}
|
||||
|
||||
func (item *Item) String() string {
|
||||
return fmt.Sprintf("Name %v, Link %v, Content %q, TS %v",
|
||||
item.Name,
|
||||
item.Link,
|
||||
item.Content,
|
||||
item.TS.Local(),
|
||||
)
|
||||
}
|
||||
|
||||
func (item *Item) ID() string {
|
||||
return strings.Join(regexp.MustCompile("[a-zA-Z0-9]*").FindAllString(item.Link, -1), "_")
|
||||
}
|
||||
|
||||
func fromGofeedItem(gfitem *gofeed.Item, filter string) *Item {
|
||||
item := &Item{
|
||||
Name: gfitem.Title,
|
||||
|
|
@ -23,35 +39,48 @@ func fromGofeedItem(gfitem *gofeed.Item, filter string) *Item {
|
|||
Content: "",
|
||||
TS: *gofeedItemTS(gfitem),
|
||||
}
|
||||
if filter == "" {
|
||||
item.Content = gfitem.Content
|
||||
return item
|
||||
content := gfitem.Content
|
||||
if content == "" {
|
||||
content = contentFromLink(item.Link)
|
||||
}
|
||||
r := regexp.MustCompile(filter)
|
||||
matches := r.FindAllString(gfitem.Content, -1)
|
||||
content := strings.Join(matches, "\n<br>\n")
|
||||
content = cleanImgTags(content, item.Link)
|
||||
content = strings.Replace(content, "\n", "", -1)
|
||||
if filter != "" {
|
||||
r := regexp.MustCompile(filter)
|
||||
matches := r.FindAllString(content, -1)
|
||||
content = strings.Join(matches, "<br>")
|
||||
}
|
||||
content = cleanImgTags(content)
|
||||
item.Content = content
|
||||
return item
|
||||
}
|
||||
|
||||
func cleanImgTags(s, url string) string {
|
||||
domain := regexp.MustCompile("(https?://)?(www\\.)?[a-zA-Z0-9]+\\.+[a-z]{2}[a-z]?").FindString(url)
|
||||
// contentFromLink fetches link over HTTP and returns the response body as a
// string. It is best-effort: a request error, a non-200 status or a read
// error all yield the empty string, so that an error page is never used as
// item content.
func contentFromLink(link string) string {
	resp, err := http.Get(link)
	if err != nil {
		return ""
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return ""
	}
	b, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return ""
	}
	return string(b)
}
|
||||
|
||||
func cleanImgTags(s string) string {
|
||||
reg := regexp.MustCompile("<img.+?/(img)?>")
|
||||
s = html.UnescapeString(s)
|
||||
matches := reg.FindAllString(s, -1)
|
||||
if len(matches) > 0 {
|
||||
// get img src="..." and build
|
||||
regImgSrc := regexp.MustCompile("src=\"[^\"]+\"")
|
||||
regImgSrc := regexp.MustCompile("src=\".*?\"")
|
||||
for j := range matches {
|
||||
theseMatches := regImgSrc.FindAllString(matches[j], -1)
|
||||
for k := range theseMatches {
|
||||
if strings.HasPrefix(theseMatches[k], "src=\"/") {
|
||||
theseMatches[k] = "src=\"" + domain + theseMatches[k][5:]
|
||||
}
|
||||
theseMatches[k] = "<img " + theseMatches[k] + " />"
|
||||
imgSrc := regImgSrc.FindString(matches[j])
|
||||
replacement := matches[j]
|
||||
if imgSrc != "" {
|
||||
replacement = "<img " + imgSrc + "/>"
|
||||
}
|
||||
s = strings.Replace(s, matches[j], strings.Join(theseMatches, "<br>"), 1)
|
||||
s = strings.Replace(s, matches[j], replacement, 1)
|
||||
}
|
||||
}
|
||||
return s
|
||||
|
|
|
|||
|
|
@ -12,6 +12,32 @@ func Test_RSSItem(t *testing.T) {
|
|||
filter string
|
||||
output Item
|
||||
}{
|
||||
{
|
||||
input: gofeed.Item{
|
||||
Title: "a",
|
||||
Link: "b",
|
||||
Content: `<img src="A" and=things/> <img src="asdf" and="some-toher-stuff"></img>`,
|
||||
},
|
||||
filter: `<[a-z]+.+?/([a-z]+)?>`,
|
||||
output: Item{
|
||||
Name: "a",
|
||||
Link: "b",
|
||||
Content: `<img src="A"/><br><img src="asdf"/>`,
|
||||
},
|
||||
},
|
||||
{
|
||||
input: gofeed.Item{
|
||||
Title: "a",
|
||||
Link: "b",
|
||||
Content: `a b c <img src="asdf" and="some-toher-stuff"></img> d e f`,
|
||||
},
|
||||
filter: `<[a-z]+.+?/([a-z]+)?>`,
|
||||
output: Item{
|
||||
Name: "a",
|
||||
Link: "b",
|
||||
Content: `<img src="asdf"/>`,
|
||||
},
|
||||
},
|
||||
{
|
||||
input: gofeed.Item{
|
||||
Title: "a",
|
||||
|
|
@ -29,13 +55,13 @@ func Test_RSSItem(t *testing.T) {
|
|||
input: gofeed.Item{
|
||||
Title: "a",
|
||||
Link: "b",
|
||||
Content: `x y <img src="asdf"></img>`,
|
||||
Content: `<img src="asdf"></img>`,
|
||||
},
|
||||
filter: "[a-z]*",
|
||||
filter: "",
|
||||
output: Item{
|
||||
Name: "a",
|
||||
Link: "b",
|
||||
Content: "x\n<br>\ny",
|
||||
Content: `<img src="asdf"/>`,
|
||||
},
|
||||
},
|
||||
{
|
||||
|
|
@ -48,7 +74,7 @@ func Test_RSSItem(t *testing.T) {
|
|||
output: Item{
|
||||
Name: "a",
|
||||
Link: "b",
|
||||
Content: "x\n<br>\ny",
|
||||
Content: "x<br>y",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,2 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<rss version="2.0"><channel><title>xkcd.com</title><link>https://xkcd.com/</link><description>xkcd.com: A webcomic of romance and math humor.</description><language>en</language><item><title>Bluetooth</title><link>https://xkcd.com/2055/</link><description><img src="https://imgs.xkcd.com/comics/bluetooth.png" title="Bluetooth is actually named for the tenth-century Viking king Harald &quot;Bluetooth&quot; Gormsson, but the protocol developed by Harald was a wireless charging standard unrelated to the modern Bluetooth except by name." alt="Bluetooth is actually named for the tenth-century Viking king Harald &quot;Bluetooth&quot; Gormsson, but the protocol developed by Harald was a wireless charging standard unrelated to the modern Bluetooth except by name." /></description><pubDate>Fri, 05 Oct 2018 04:00:00 -0000</pubDate><guid>https://xkcd.com/2055/</guid></item><item><title>Data Pipeline</title><link>https://xkcd.com/2054/</link><description><img src="https://imgs.xkcd.com/comics/data_pipeline.png" title="&quot;Is the pipeline literally running from your laptop?&quot; &quot;Don't be silly, my laptop disconnects far too often to host a service we rely on. It's running on my phone.&quot;" alt="&quot;Is the pipeline literally running from your laptop?&quot; &quot;Don't be silly, my laptop disconnects far too often to host a service we rely on. It's running on my phone.&quot;" /></description><pubDate>Wed, 03 Oct 2018 04:00:00 -0000</pubDate><guid>https://xkcd.com/2054/</guid></item><item><title>Incoming Calls</title><link>https://xkcd.com/2053/</link><description><img src="https://imgs.xkcd.com/comics/incoming_calls.png" title="I wonder if that friendly lady ever fixed the problem she was having with her headset." alt="I wonder if that friendly lady ever fixed the problem she was having with her headset." 
/></description><pubDate>Mon, 01 Oct 2018 04:00:00 -0000</pubDate><guid>https://xkcd.com/2053/</guid></item><item><title>Stanislav Petrov Day</title><link>https://xkcd.com/2052/</link><description><img src="https://imgs.xkcd.com/comics/stanislav_petrov_day.png" title="I was going to get you an alarm clock that occasionally goes off randomly in the middle of the night, but you can ignore it and go back to sleep and it's fine." alt="I was going to get you an alarm clock that occasionally goes off randomly in the middle of the night, but you can ignore it and go back to sleep and it's fine." /></description><pubDate>Fri, 28 Sep 2018 04:00:00 -0000</pubDate><guid>https://xkcd.com/2052/</guid></item></channel></rss>
|
||||
Loading…
Reference in New Issue