Browse Source

Add scraper rules

Frédéric Guillot 8 years ago
parent
commit
87ccad5c7f

+ 4 - 3
locale/translations.go

@@ -1,5 +1,5 @@
 // Code generated by go generate; DO NOT EDIT.
-// 2017-12-10 18:56:24.387844114 -0800 PST m=+0.029823201
+// 2017-12-10 20:08:14.447304303 -0800 PST m=+0.040286758
 
 package locale
 
@@ -167,12 +167,13 @@ var translations = map[string]string{
     "Activate Fever API": "Activer l'API de Fever",
     "Fever Username": "Nom d'utilisateur pour l'API de Fever",
     "Fever Password": "Mot de passe pour l'API de Fever",
-    "Fetch original content": "Récupérer le contenu original"
+    "Fetch original content": "Récupérer le contenu original",
+    "Scraper Rules": "Règles pour récupérer le contenu original"
 }
 `,
 }
 
 var translationsChecksums = map[string]string{
 	"en_US": "6fe95384260941e8a5a3c695a655a932e0a8a6a572c1e45cb2b1ae8baa01b897",
-	"fr_FR": "fd629b171aefa50dd0a6100acaac8fbecbdf1a1d53e3fce984234565ec5bb5d5",
+	"fr_FR": "4426cea875ee2c9acb1a2b0619cb82f3a32f71aabe5d07657eaf2f6b7387c5f9",
 }

+ 2 - 1
locale/translations/fr_FR.json

@@ -151,5 +151,6 @@
     "Activate Fever API": "Activer l'API de Fever",
     "Fever Username": "Nom d'utilisateur pour l'API de Fever",
     "Fever Password": "Mot de passe pour l'API de Fever",
-    "Fetch original content": "Récupérer le contenu original"
+    "Fetch original content": "Récupérer le contenu original",
+    "Scraper Rules": "Règles pour récupérer le contenu original"
 }

+ 1 - 0
model/feed.go

@@ -22,6 +22,7 @@ type Feed struct {
 	LastModifiedHeader string    `json:"last_modified_header,omitempty"`
 	ParsingErrorMsg    string    `json:"parsing_error_message,omitempty"`
 	ParsingErrorCount  int       `json:"parsing_error_count,omitempty"`
+	ScraperRules       string    `json:"scraper_rules"`
 	Category           *Category `json:"category,omitempty"`
 	Entries            Entries   `json:"entries,omitempty"`
 	Icon               *FeedIcon `json:"icon,omitempty"`

+ 16 - 0
reader/scraper/rules.go

@@ -0,0 +1,16 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package scraper
+
+// List of predefined scraper rules (alphabetically sorted)
+// domain => CSS selectors
+var predefinedRules = map[string]string{
+	"lemonde.fr":        "div#articleBody",
+	"lesjoiesducode.fr": ".blog-post-content img",
+	"linux.com":         "div.content, div[property]",
+	"opensource.com":    "div[property]",
+	"phoronix.com":      "div.content",
+	"techcrunch.com":    "div.article-entry",
+}

+ 54 - 2
reader/scraper/scraper.go

@@ -6,14 +6,19 @@ package scraper
 
 import (
 	"errors"
+	"io"
+	"log"
+	"strings"
 
+	"github.com/PuerkitoBio/goquery"
 	"github.com/miniflux/miniflux2/http"
 	"github.com/miniflux/miniflux2/reader/readability"
 	"github.com/miniflux/miniflux2/reader/sanitizer"
+	"github.com/miniflux/miniflux2/url"
 )
 
 // Fetch downloads a web page and returns relevant contents.
-func Fetch(websiteURL string) (string, error) {
+func Fetch(websiteURL, rules string) (string, error) {
 	client := http.NewClient(websiteURL)
 	response, err := client.Get()
 	if err != nil {
@@ -29,10 +34,57 @@ func Fetch(websiteURL string) (string, error) {
 		return "", err
 	}
 
-	content, err := readability.ExtractContent(page)
+	var content string
+	if rules == "" {
+		rules = getPredefinedScraperRules(websiteURL)
+	}
+
+	if rules != "" {
+		log.Printf(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
+		content, err = scrapContent(page, rules)
+	} else {
+		log.Printf(`[Scraper] Using readability for "%s"`, websiteURL)
+		content, err = readability.ExtractContent(page)
+	}
+
 	if err != nil {
 		return "", err
 	}
 
 	return sanitizer.Sanitize(websiteURL, content), nil
 }
+
+func scrapContent(page io.Reader, rules string) (string, error) {
+	document, err := goquery.NewDocumentFromReader(page)
+	if err != nil {
+		return "", err
+	}
+
+	contents := ""
+	document.Find(rules).Each(func(i int, s *goquery.Selection) {
+		var content string
+
+		// For some inline elements, we get the parent.
+		if s.Is("img") {
+			content, _ = s.Parent().Html()
+		} else {
+			content, _ = s.Html()
+		}
+
+		contents += content
+	})
+
+	return contents, nil
+}
+
+func getPredefinedScraperRules(websiteURL string) string {
+	urlDomain := url.Domain(websiteURL)
+
+	for domain, rules := range predefinedRules {
+		if strings.Contains(urlDomain, domain) {
+			return rules
+		}
+	}
+
+	return ""
+}

+ 21 - 0
reader/scraper/scraper_test.go

@@ -0,0 +1,21 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package scraper
+
+import "testing"
+
+func TestGetPredefinedRules(t *testing.T) {
+	if getPredefinedScraperRules("http://www.phoronix.com/") == "" {
+		t.Error("Unable to find rule for phoronix.com")
+	}
+
+	if getPredefinedScraperRules("https://www.linux.com/") == "" {
+		t.Error("Unable to find rule for linux.com")
+	}
+
+	if getPredefinedScraperRules("https://example.org/") != "" {
+		t.Error("A rule not defined should not return anything")
+	}
+}

+ 3 - 0
server/template/html/edit_feed.html

@@ -45,6 +45,9 @@
         <label for="form-feed-url">{{ t "Feed URL" }}</label>
         <input type="url" name="feed_url" id="form-feed-url" placeholder="https://domain.tld/" value="{{ .form.FeedURL }}" required>
 
+        <label for="form-scraper-rules">{{ t "Scraper Rules" }}</label>
+        <input type="text" name="scraper_rules" id="form-scraper-rules" value="{{ .form.ScraperRules }}">
+
         <label for="form-category">{{ t "Category" }}</label>
         <select id="form-category" name="category_id">
         {{ range .categories }}

+ 5 - 2
server/template/views.go

@@ -1,5 +1,5 @@
 // Code generated by go generate; DO NOT EDIT.
-// 2017-12-10 18:56:24.375327888 -0800 PST m=+0.017306975
+// 2017-12-10 20:08:14.428877093 -0800 PST m=+0.021859548
 
 package template
 
@@ -395,6 +395,9 @@ var templateViewsMap = map[string]string{
         <label for="form-feed-url">{{ t "Feed URL" }}</label>
         <input type="url" name="feed_url" id="form-feed-url" placeholder="https://domain.tld/" value="{{ .form.FeedURL }}" required>
 
+        <label for="form-scraper-rules">{{ t "Scraper Rules" }}</label>
+        <input type="text" name="scraper_rules" id="form-scraper-rules" value="{{ .form.ScraperRules }}">
+
         <label for="form-category">{{ t "Category" }}</label>
         <select id="form-category" name="category_id">
         {{ range .categories }}
@@ -1181,7 +1184,7 @@ var templateViewsMapChecksums = map[string]string{
 	"create_category":     "2b82af5d2dcd67898dc5daa57a6461e6ff8121a6089b2a2a1be909f35e4a2275",
 	"create_user":         "45e226df757126d5fe7c464e295e9a34f07952cfdb71e31e49839850d35af139",
 	"edit_category":       "cee720faadcec58289b707ad30af623d2ee66c1ce23a732965463250d7ff41c5",
-	"edit_feed":           "c5bc4c22bf7e8348d880395250545595d21fb8c8e723fc5d7cca68e25d250884",
+	"edit_feed":           "b3c7dd5e93d58e051abcd59da31217d8e9b50587014b895d1b7c9172247b35f8",
 	"edit_user":           "82d9749d76ddbd2352816d813c4b1f6d92f2222de678b4afe5821090246735c7",
 	"entry":               "ebcf9bb35812dd02759718f7f7411267e6a6c8efd59a9aa0a0e735bcb88efeff",
 	"feed_entries":        "547c19eb36b20e350ce70ed045173b064cdcd6b114afb241c9f2dda9d88fcc27",

+ 3 - 7
server/ui/controller/entry.go

@@ -40,18 +40,14 @@ func (c *Controller) FetchContent(ctx *core.Context, request *core.Request, resp
 		return
 	}
 
-	content, err := scraper.Fetch(entry.URL)
+	content, err := scraper.Fetch(entry.URL, entry.Feed.ScraperRules)
 	if err != nil {
 		response.JSON().ServerError(err)
 		return
 	}
 
-	if len(content) > len(entry.Content) {
-		entry.Content = content
-		c.store.UpdateEntryContent(entry)
-	} else {
-		content = entry.Content
-	}
+	entry.Content = content
+	c.store.UpdateEntryContent(entry)
 
 	response.JSON().Created(map[string]string{"content": content})
 }

+ 5 - 4
server/ui/controller/feed.go

@@ -217,10 +217,11 @@ func (c *Controller) getFeedFormTemplateArgs(ctx *core.Context, user *model.User
 
 	if feedForm == nil {
 		args["form"] = form.FeedForm{
-			SiteURL:    feed.SiteURL,
-			FeedURL:    feed.FeedURL,
-			Title:      feed.Title,
-			CategoryID: feed.Category.ID,
+			SiteURL:      feed.SiteURL,
+			FeedURL:      feed.FeedURL,
+			Title:        feed.Title,
+			ScraperRules: feed.ScraperRules,
+			CategoryID:   feed.Category.ID,
 		}
 	} else {
 		args["form"] = feedForm

+ 11 - 8
server/ui/form/feed.go

@@ -14,10 +14,11 @@ import (
 
 // FeedForm represents a feed form in the UI
 type FeedForm struct {
-	FeedURL    string
-	SiteURL    string
-	Title      string
-	CategoryID int64
+	FeedURL      string
+	SiteURL      string
+	Title        string
+	ScraperRules string
+	CategoryID   int64
 }
 
 // ValidateModification validates FeedForm fields
@@ -34,6 +35,7 @@ func (f FeedForm) Merge(feed *model.Feed) *model.Feed {
 	feed.Title = f.Title
 	feed.SiteURL = f.SiteURL
 	feed.FeedURL = f.FeedURL
+	feed.ScraperRules = f.ScraperRules
 	feed.ParsingErrorCount = 0
 	feed.ParsingErrorMsg = ""
 	return feed
@@ -47,9 +49,10 @@ func NewFeedForm(r *http.Request) *FeedForm {
 	}
 
 	return &FeedForm{
-		FeedURL:    r.FormValue("feed_url"),
-		SiteURL:    r.FormValue("site_url"),
-		Title:      r.FormValue("title"),
-		CategoryID: int64(categoryID),
+		FeedURL:      r.FormValue("feed_url"),
+		SiteURL:      r.FormValue("site_url"),
+		Title:        r.FormValue("title"),
+		ScraperRules: r.FormValue("scraper_rules"),
+		CategoryID:   int64(categoryID),
 	}
 }

+ 1 - 0
sql/schema_version_6.sql

@@ -0,0 +1 @@
+alter table feeds add column scraper_rules text default '';

+ 4 - 1
sql/sql.go

@@ -1,5 +1,5 @@
 // Code generated by go generate; DO NOT EDIT.
-// 2017-12-10 18:56:24.36359961 -0800 PST m=+0.005578697
+// 2017-12-10 20:08:14.411225368 -0800 PST m=+0.004207823
 
 package sql
 
@@ -136,6 +136,8 @@ alter table users add column entry_direction entry_sorting_direction default 'as
     fever_token text default '',
     primary key(user_id)
 )
+`,
+	"schema_version_6": `alter table feeds add column scraper_rules text default '';
 `,
 }
 
@@ -145,4 +147,5 @@ var SqlMapChecksums = map[string]string{
 	"schema_version_3": "a54745dbc1c51c000f74d4e5068f1e2f43e83309f023415b1749a47d5c1e0f12",
 	"schema_version_4": "216ea3a7d3e1704e40c797b5dc47456517c27dbb6ca98bf88812f4f63d74b5d9",
 	"schema_version_5": "46397e2f5f2c82116786127e9f6a403e975b14d2ca7b652a48cd1ba843e6a27c",
+	"schema_version_6": "9d05b4fb223f0e60efc716add5048b0ca9c37511cf2041721e20505d6d798ce4",
 }

+ 2 - 1
storage/entry_query_builder.go

@@ -152,7 +152,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
 		SELECT
 		e.id, e.user_id, e.feed_id, e.hash, e.published_at at time zone '%s', e.title, e.url, e.author, e.content, e.status,
 		f.title as feed_title, f.feed_url, f.site_url, f.checked_at,
-		f.category_id, c.title as category_title,
+		f.category_id, c.title as category_title, f.scraper_rules,
 		fi.icon_id
 		FROM entries e
 		LEFT JOIN feeds f ON f.id=e.feed_id
@@ -197,6 +197,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
 			&entry.Feed.CheckedAt,
 			&entry.Feed.Category.ID,
 			&entry.Feed.Category.Title,
+			&entry.Feed.ScraperRules,
 			&iconID,
 		)
 

+ 7 - 4
storage/feed.go

@@ -52,7 +52,7 @@ func (s *Storage) Feeds(userID int64) (model.Feeds, error) {
 	feeds := make(model.Feeds, 0)
 	query := `SELECT
 		f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header,
-		f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg,
+		f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, f.scraper_rules,
 		f.category_id, c.title as category_title,
 		fi.icon_id
 		FROM feeds f
@@ -84,6 +84,7 @@ func (s *Storage) Feeds(userID int64) (model.Feeds, error) {
 			&feed.CheckedAt,
 			&feed.ParsingErrorCount,
 			&errorMsg,
+			&feed.ScraperRules,
 			&feed.Category.ID,
 			&feed.Category.Title,
 			&iconID,
@@ -122,7 +123,7 @@ func (s *Storage) FeedByID(userID, feedID int64) (*model.Feed, error) {
 	query := `
 		SELECT
 		f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header,
-		f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg,
+		f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, f.scraper_rules,
 		f.category_id, c.title as category_title
 		FROM feeds f
 		LEFT JOIN categories c ON c.id=f.category_id
@@ -139,6 +140,7 @@ func (s *Storage) FeedByID(userID, feedID int64) (*model.Feed, error) {
 		&feed.CheckedAt,
 		&feed.ParsingErrorCount,
 		&feed.ParsingErrorMsg,
+		&feed.ScraperRules,
 		&feed.Category.ID,
 		&feed.Category.Title,
 	)
@@ -195,8 +197,8 @@ func (s *Storage) UpdateFeed(feed *model.Feed) (err error) {
 
 	query := `UPDATE feeds SET
 		feed_url=$1, site_url=$2, title=$3, category_id=$4, etag_header=$5, last_modified_header=$6, checked_at=$7,
-		parsing_error_msg=$8, parsing_error_count=$9
-		WHERE id=$10 AND user_id=$11`
+		parsing_error_msg=$8, parsing_error_count=$9, scraper_rules=$10
+		WHERE id=$11 AND user_id=$12`
 
 	_, err = s.db.Exec(query,
 		feed.FeedURL,
@@ -208,6 +210,7 @@ func (s *Storage) UpdateFeed(feed *model.Feed) (err error) {
 		feed.CheckedAt,
 		feed.ParsingErrorMsg,
 		feed.ParsingErrorCount,
+		feed.ScraperRules,
 		feed.ID,
 		feed.UserID,
 	)

+ 1 - 1
storage/migration.go

@@ -12,7 +12,7 @@ import (
 	"github.com/miniflux/miniflux2/sql"
 )
 
-const schemaVersion = 5
+const schemaVersion = 6
 
 // Migrate run database migrations.
 func (s *Storage) Migrate() {