urlcleaner.go 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package urlcleaner // import "miniflux.app/v2/internal/reader/urlcleaner"
  4. import (
  5. "fmt"
  6. "net/url"
  7. "strings"
  8. )
  9. // Interesting lists:
  10. // https://raw.githubusercontent.com/AdguardTeam/AdguardFilters/master/TrackParamFilter/sections/general_url.txt
  11. // https://firefox.settings.services.mozilla.com/v1/buckets/main/collections/query-stripping/records
  12. // https://github.com/Smile4ever/Neat-URL/blob/master/data/default-params-by-category.json
  13. // https://github.com/brave/brave-core/blob/master/components/query_filter/utils.cc
  14. // https://developers.google.com/analytics/devguides/collection/ga4/reference/config
  15. var trackingParams = map[string]bool{
  16. // Facebook Click Identifiers
  17. "fbclid": true,
  18. "_openstat": true,
  19. "fb_action_ids": true,
  20. "fb_action_types": true,
  21. "fb_ref": true,
  22. "fb_source": true,
  23. "fb_comment_id": true,
  24. // Google Click Identifiers
  25. "gclid": true,
  26. "dclid": true,
  27. "gbraid": true,
  28. "wbraid": true,
  29. "gclsrc": true,
  30. // Google Analytics
  31. "campaign_id": true,
  32. "campaign_medium": true,
  33. "campaign_name": true,
  34. "campaign_source": true,
  35. "campaign_term": true,
  36. "campaign_content": true,
  37. // Yandex Click Identifiers
  38. "yclid": true,
  39. "ysclid": true,
  40. // Twitter Click Identifier
  41. "twclid": true,
  42. // Microsoft Click Identifier
  43. "msclkid": true,
  44. // Mailchimp Click Identifiers
  45. "mc_cid": true,
  46. "mc_eid": true,
  47. // Wicked Reports click tracking
  48. "wickedid": true,
  49. // Hubspot Click Identifiers
  50. "hsa_cam": true,
  51. "_hsenc": true,
  52. "__hssc": true,
  53. "__hstc": true,
  54. "__hsfp": true,
  55. "_hsmi": true,
  56. "hsctatracking": true,
  57. // Olytics
  58. "rb_clickid": true,
  59. "oly_anon_id": true,
  60. "oly_enc_id": true,
  61. // Vero Click Identifier
  62. "vero_id": true,
  63. "vero_conv": true,
  64. // Marketo email tracking
  65. "mkt_tok": true,
  66. // Adobe email tracking
  67. "sc_cid": true,
  68. // Beehiiv
  69. "_bhlid": true,
  70. // Branch.io
  71. "_branch_match_id": true,
  72. "_branch_referrer": true,
  73. }
  74. // Outbound tracking parameters are appending the website's url to outbound links.
  75. var trackingParamsOutbound = map[string]bool{
  76. // Ghost
  77. "ref": true,
  78. }
  79. func RemoveTrackingParameters(parsedFeedURL, parsedSiteURL, parsedInputUrl *url.URL) (string, error) {
  80. if parsedFeedURL == nil || parsedSiteURL == nil || parsedInputUrl == nil {
  81. return "", fmt.Errorf("urlcleaner: one of the URLs is nil")
  82. }
  83. queryParams := parsedInputUrl.Query()
  84. hasTrackers := false
  85. // Remove tracking parameters
  86. for param := range queryParams {
  87. lowerParam := strings.ToLower(param)
  88. if trackingParams[lowerParam] || strings.HasPrefix(lowerParam, "utm_") {
  89. queryParams.Del(param)
  90. hasTrackers = true
  91. }
  92. if trackingParamsOutbound[lowerParam] {
  93. // handle duplicate parameters like ?a=b&a=c&a=d…
  94. for _, value := range queryParams[param] {
  95. if value == parsedFeedURL.Hostname() || value == parsedSiteURL.Hostname() {
  96. queryParams.Del(param)
  97. hasTrackers = true
  98. break
  99. }
  100. }
  101. }
  102. }
  103. // Do not modify the URL if there are no tracking parameters
  104. if !hasTrackers {
  105. return parsedInputUrl.String(), nil
  106. }
  107. parsedInputUrl.RawQuery = queryParams.Encode()
  108. cleanedURL := strings.TrimSuffix(parsedInputUrl.String(), "?")
  109. return cleanedURL, nil
  110. }