urlcleaner.go 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package urlcleaner // import "miniflux.app/v2/internal/reader/urlcleaner"
  4. import (
  5. "errors"
  6. "net/url"
  7. "strings"
  8. )
  9. // Interesting lists:
  10. // https://raw.githubusercontent.com/AdguardTeam/AdguardFilters/master/TrackParamFilter/sections/general_url.txt
  11. // https://firefox.settings.services.mozilla.com/v1/buckets/main/collections/query-stripping/records
  12. // https://github.com/Smile4ever/Neat-URL/blob/master/data/default-params-by-category.json
  13. // https://github.com/brave/brave-core/blob/master/components/query_filter/utils.cc
  14. // https://developers.google.com/analytics/devguides/collection/ga4/reference/config
  15. var trackingParams = map[string]bool{
  16. // Facebook Click Identifiers
  17. "fbclid": true,
  18. "_openstat": true,
  19. "fb_action_ids": true,
  20. "fb_action_types": true,
  21. "fb_ref": true,
  22. "fb_source": true,
  23. "fb_comment_id": true,
  24. // Humble Bundles
  25. "hmb_campaign": true,
  26. "hmb_medium": true,
  27. "hmb_source": true,
  28. // Likely Google as well
  29. "itm_campaign": true,
  30. "itm_medium": true,
  31. "itm_source": true,
  32. // Google Click Identifiers
  33. "gclid": true,
  34. "dclid": true,
  35. "gbraid": true,
  36. "wbraid": true,
  37. "gclsrc": true,
  38. // Google Analytics
  39. "campaign_id": true,
  40. "campaign_medium": true,
  41. "campaign_name": true,
  42. "campaign_source": true,
  43. "campaign_term": true,
  44. "campaign_content": true,
  45. // Google
  46. "srsltid": true,
  47. // Yandex Click Identifiers
  48. "yclid": true,
  49. "ysclid": true,
  50. // Twitter Click Identifier
  51. "twclid": true,
  52. // Microsoft Click Identifier
  53. "msclkid": true,
  54. // Mailchimp Click Identifiers
  55. "mc_cid": true,
  56. "mc_eid": true,
  57. "mc_tc": true,
  58. // Wicked Reports click tracking
  59. "wickedid": true,
  60. // Hubspot Click Identifiers
  61. "hsa_cam": true,
  62. "_hsenc": true,
  63. "__hssc": true,
  64. "__hstc": true,
  65. "__hsfp": true,
  66. "_hsmi": true,
  67. "hsctatracking": true,
  68. // Olytics
  69. "rb_clickid": true,
  70. "oly_anon_id": true,
  71. "oly_enc_id": true,
  72. // Vero Click Identifier
  73. "vero_id": true,
  74. "vero_conv": true,
  75. // Marketo email tracking
  76. "mkt_tok": true,
  77. // Adobe email tracking
  78. "sc_cid": true,
  79. // Beehiiv
  80. "_bhlid": true,
  81. // Branch.io
  82. "_branch_match_id": true,
  83. "_branch_referrer": true,
  84. // Readwise
  85. "__readwiseLocation": true,
  86. }
  87. // Outbound tracking parameters are appending the website's url to outbound links.
  88. var trackingParamsOutbound = map[string]bool{
  89. // Ghost
  90. "ref": true,
  91. }
  92. var trackingParamsPrefixes = []string{
  93. "utm_", // https://en.wikipedia.org/wiki/UTM_parameters
  94. "mtm_", // https://matomo.org/faq/reports/common-campaign-tracking-use-cases-and-examples/
  95. }
  96. func isTrackingParam(param string) bool {
  97. for _, prefix := range trackingParamsPrefixes {
  98. if strings.HasPrefix(param, prefix) {
  99. return true
  100. }
  101. }
  102. return trackingParams[param]
  103. }
  104. func RemoveTrackingParameters(parsedFeedURL, parsedSiteURL, parsedInputUrl *url.URL) (string, error) {
  105. if parsedFeedURL == nil || parsedSiteURL == nil || parsedInputUrl == nil {
  106. return "", errors.New("urlcleaner: one of the URLs is nil")
  107. }
  108. if parsedInputUrl.RawQuery == "" {
  109. return parsedInputUrl.String(), nil
  110. }
  111. queryParams := parsedInputUrl.Query()
  112. hasTrackers := false
  113. feedHostname := parsedFeedURL.Hostname()
  114. siteHostname := parsedSiteURL.Hostname()
  115. // Remove tracking parameters
  116. for param := range queryParams {
  117. lowerParam := strings.ToLower(param)
  118. if isTrackingParam(lowerParam) {
  119. queryParams.Del(param)
  120. hasTrackers = true
  121. continue
  122. }
  123. if trackingParamsOutbound[lowerParam] {
  124. // handle duplicate parameters like ?a=b&a=c&a=d…
  125. for _, value := range queryParams[param] {
  126. if value == feedHostname || value == siteHostname {
  127. queryParams.Del(param)
  128. hasTrackers = true
  129. break
  130. }
  131. }
  132. }
  133. }
  134. // Do not modify the URL if there are no tracking parameters
  135. if !hasTrackers {
  136. return parsedInputUrl.String(), nil
  137. }
  138. parsedInputUrl.RawQuery = queryParams.Encode()
  139. cleanedURL := strings.TrimSuffix(parsedInputUrl.String(), "?")
  140. return cleanedURL, nil
  141. }