package rss import ( "bytes" "encoding/gob" "fmt" "html" "io/ioutil" "net/http" "regexp" "strings" "time" "github.com/mmcdole/gofeed" ) type Item struct { Name string Link string Content string TS time.Time } func (item *Item) String() string { return fmt.Sprintf("Name %v, Link %v, Content %q, TS %v", item.Name, item.Link, item.Content, item.TS.Local(), ) } func (item *Item) ID() string { return item.TS.UTC().Format("20060102_") + strings.Join(regexp.MustCompile("[a-zA-Z0-9]*").FindAllString(item.Link, -1), "_") } func DeserializeItem(src []byte) (*Item, error) { buffer := bytes.NewBuffer(src) dec := gob.NewDecoder(buffer) var dst Item err := dec.Decode(&dst) return &dst, err } func (item *Item) Serialize() ([]byte, error) { var buffer bytes.Buffer enc := gob.NewEncoder(&buffer) err := enc.Encode(item) return buffer.Bytes(), err } func fromGofeedItem(gfitem *gofeed.Item, filter string) *Item { item := &Item{ Name: gfitem.Title, Link: gfitem.Link, Content: "", TS: *gofeedItemTS(gfitem), } content := gfitem.Content if content == "" { content = contentFromLink(item.Link) } if filter != "" { r := regexp.MustCompile(filter) matches := r.FindAllString(content, -1) content = strings.Join(matches, "
") } content = cleanImgTags(content) item.Content = content return item } func contentFromLink(link string) string { resp, err := http.Get(link) if err != nil { return "" } defer resp.Body.Close() b, err := ioutil.ReadAll(resp.Body) if err != nil { return "" } protocol := strings.Split(link, ":")[0] + "://" if !strings.HasPrefix(protocol, "http") { protocol = "" } content := strings.Replace(string(b), "\n", "", -1) // fix all //img.link/something.jpg badSrc := regexp.MustCompile("\"\\/\\/") content = badSrc.ReplaceAllString(content, "\""+protocol) // fix all href="/path/to" host := protocol + strings.Split(link[len(protocol):], "/")[0] + "/" badHref := regexp.MustCompile("href=\"\\/") content = badHref.ReplaceAllString(content, "href=\""+host) // fix all src="/path/to" badPathSrc := regexp.MustCompile("src=\"\\/") content = badPathSrc.ReplaceAllString(content, "src=\""+host) return content } func cleanImgTags(s string) string { reg := regexp.MustCompile("") s = html.UnescapeString(s) matches := reg.FindAllString(s, -1) if len(matches) > 0 { // get img src="..." and build regImgSrc := regexp.MustCompile("src=\".*?\"") for j := range matches { imgSrc := regImgSrc.FindString(matches[j]) replacement := matches[j] if imgSrc != "" { replacement = "" } s = strings.Replace(s, matches[j], replacement, 1) } } return s } func gofeedItemTS(gfitem *gofeed.Item) *time.Time { var t time.Time if gfitem.UpdatedParsed != nil { t = *gfitem.UpdatedParsed } else if gfitem.PublishedParsed != nil { t = *gfitem.PublishedParsed } return &t }