@@ -32,8 +32,9 @@ import (
 )
 
 var (
-	youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
-	iso8601Regex = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`)
+	youtubeRegex           = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
+	iso8601Regex           = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`)
+	customReplaceRuleRegex = regexp.MustCompile(`rewrite\("(.*)"\|"(.*)"\)`)
 )
 
 // ProcessFeedEntries downloads the original web page for entries and applies filters.
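
As an aside, a minimal sketch of what the new customReplaceRuleRegex captures from a rule of the form rewrite("search"|"replace"); the rule string below is a hypothetical example, not one from the patch:

	// parts[0] is the full match, parts[1] the search pattern, parts[2] the replacement.
	parts := customReplaceRuleRegex.FindStringSubmatch(`rewrite("(.*)#comments"|"$1")`)
	// parts[1] == `(.*)#comments`, parts[2] == `$1`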
@@ -47,13 +48,14 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
 			continue
 		}
 
+		url := getUrlFromEntry(feed, entry)
 		entryIsNew := !store.EntryURLExists(feed.ID, entry.URL)
 		if feed.Crawler && entryIsNew {
-			logger.Debug("[Processor] Crawling entry %q from feed %q", entry.URL, feed.FeedURL)
+			logger.Debug("[Processor] Crawling entry %q from feed %q", url, feed.FeedURL)
 
 			startTime := time.Now()
 			content, scraperErr := scraper.Fetch(
-				entry.URL,
+				url,
 				feed.ScraperRules,
 				feed.UserAgent,
 				feed.Cookie,
@@ -77,10 +79,10 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
 			}
 		}
 
-		entry.Content = rewrite.Rewriter(entry.URL, entry.Content, feed.RewriteRules)
+		entry.Content = rewrite.Rewriter(url, entry.Content, feed.RewriteRules)
 
 		// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
-		entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
+		entry.Content = sanitizer.Sanitize(url, entry.Content)
 
 		if entryIsNew {
 			intg, err := store.Integration(feed.UserID)
@@ -127,8 +129,10 @@ func isAllowedEntry(feed *model.Feed, entry *model.Entry) bool {
 // ProcessEntryWebPage downloads the entry web page and applies rewrite rules.
 func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry) error {
 	startTime := time.Now()
+	url := getUrlFromEntry(feed, entry)
+
 	content, scraperErr := scraper.Fetch(
-		entry.URL,
+		url,
 		entry.Feed.ScraperRules,
 		entry.Feed.UserAgent,
 		entry.Feed.Cookie,
@@ -148,8 +152,8 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry) error {
 		return scraperErr
 	}
 
-	content = rewrite.Rewriter(entry.URL, content, entry.Feed.RewriteRules)
-	content = sanitizer.Sanitize(entry.URL, content)
+	content = rewrite.Rewriter(url, content, entry.Feed.RewriteRules)
+	content = sanitizer.Sanitize(url, content)
 
 	if content != "" {
 		entry.Content = content
@@ -159,6 +163,27 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry) error {
 	return nil
 }
 
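+// getUrlFromEntry applies the feed's URL rewrite rule, if any, to the entry URL.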
+func getUrlFromEntry(feed *model.Feed, entry *model.Entry) string {
+	url := entry.URL
+	if feed.UrlRewriteRules != "" {
+		parts := customReplaceRuleRegex.FindStringSubmatch(feed.UrlRewriteRules)
+
+		if len(parts) >= 3 {
+			re, err := regexp.Compile(parts[1])
+			if err != nil {
+				logger.Error(`[Processor] Invalid URL rewrite pattern %q: %v`, parts[1], err)
+				return url
+			}
+			url = re.ReplaceAllString(entry.URL, parts[2])
+			logger.Debug(`[Processor] Rewriting entry URL %s to %s`, entry.URL, url)
+		} else {
+			logger.Debug("[Processor] Cannot find search and replace terms for replace rule %s", feed.UrlRewriteRules)
+		}
+	}
+	return url
+}
+
 func updateEntryReadingTime(store *storage.Storage, feed *model.Feed, entry *model.Entry, entryIsNew bool) {
 	if shouldFetchYouTubeWatchTime(entry) {
 		if entryIsNew {
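
For reference, a self-contained sketch of the rewrite semantics that getUrlFromEntry implements. Everything here (package main, rewriteEntryURL, the sample rule and URL) is illustrative and not part of the patch; it mirrors the regex and replacement logic from the diff so the behavior can be tried in isolation:

package main

import (
	"fmt"
	"regexp"
)

// Same pattern as in the diff: rewrite("search"|"replace").
var customReplaceRuleRegex = regexp.MustCompile(`rewrite\("(.*)"\|"(.*)"\)`)

// rewriteEntryURL mirrors getUrlFromEntry without the model types.
func rewriteEntryURL(entryURL, rules string) string {
	parts := customReplaceRuleRegex.FindStringSubmatch(rules)
	if len(parts) < 3 {
		return entryURL // no usable search and replace terms
	}
	re, err := regexp.Compile(parts[1])
	if err != nil {
		return entryURL // invalid user-supplied pattern
	}
	return re.ReplaceAllString(entryURL, parts[2])
}

func main() {
	// Hypothetical rule: drop a "#comments" fragment from entry URLs.
	rule := `rewrite("(.*)#comments"|"$1")`
	fmt.Println(rewriteEntryURL("https://example.org/article#comments", rule))
	// Output: https://example.org/article
}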