urlcleaner.go 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package urlcleaner // import "miniflux.app/v2/internal/reader/urlcleaner"
  4. import (
  5. "fmt"
  6. "net/url"
  7. "strings"
  8. )
  9. // Interesting lists:
  10. // https://raw.githubusercontent.com/AdguardTeam/AdguardFilters/master/TrackParamFilter/sections/general_url.txt
  11. // https://firefox.settings.services.mozilla.com/v1/buckets/main/collections/query-stripping/records
  12. // https://github.com/Smile4ever/Neat-URL/blob/master/data/default-params-by-category.json
  13. // https://github.com/brave/brave-core/blob/master/components/query_filter/utils.cc
  14. // https://developers.google.com/analytics/devguides/collection/ga4/reference/config
  15. var trackingParams = map[string]bool{
  16. // Facebook Click Identifiers
  17. "fbclid": true,
  18. "_openstat": true,
  19. "fb_action_ids": true,
  20. "fb_action_types": true,
  21. "fb_ref": true,
  22. "fb_source": true,
  23. "fb_comment_id": true,
  24. // Google Click Identifiers
  25. "gclid": true,
  26. "dclid": true,
  27. "gbraid": true,
  28. "wbraid": true,
  29. "gclsrc": true,
  30. // Google Analytics
  31. "campaign_id": true,
  32. "campaign_medium": true,
  33. "campaign_name": true,
  34. "campaign_source": true,
  35. "campaign_term": true,
  36. "campaign_content": true,
  37. // Yandex Click Identifiers
  38. "yclid": true,
  39. "ysclid": true,
  40. // Twitter Click Identifier
  41. "twclid": true,
  42. // Microsoft Click Identifier
  43. "msclkid": true,
  44. // Mailchimp Click Identifiers
  45. "mc_cid": true,
  46. "mc_eid": true,
  47. // Wicked Reports click tracking
  48. "wickedid": true,
  49. // Hubspot Click Identifiers
  50. "hsa_cam": true,
  51. "_hsenc": true,
  52. "__hssc": true,
  53. "__hstc": true,
  54. "__hsfp": true,
  55. "_hsmi": true,
  56. "hsctatracking": true,
  57. // Olytics
  58. "rb_clickid": true,
  59. "oly_anon_id": true,
  60. "oly_enc_id": true,
  61. // Vero Click Identifier
  62. "vero_id": true,
  63. "vero_conv": true,
  64. // Marketo email tracking
  65. "mkt_tok": true,
  66. // Adobe email tracking
  67. "sc_cid": true,
  68. // Beehiiv
  69. "_bhlid": true,
  70. // Branch.io
  71. "_branch_match_id": true,
  72. "_branch_referrer": true,
  73. }
  74. // Outbound tracking parameters are appending the website's url to outbound links.
  75. var trackingParamsOutbound = map[string]bool{
  76. // Ghost
  77. "ref": true,
  78. }
  79. func RemoveTrackingParameters(baseUrl, feedUrl, inputURL string) (string, error) {
  80. parsedURL, err := url.Parse(inputURL)
  81. if err != nil {
  82. return "", fmt.Errorf("urlcleaner: error parsing URL: %v", err)
  83. }
  84. if !strings.HasPrefix(parsedURL.Scheme, "http") {
  85. return inputURL, nil
  86. }
  87. parsedBaseUrl, err := url.Parse(baseUrl)
  88. if err != nil {
  89. return "", fmt.Errorf("urlcleaner: error parsing base URL: %v", err)
  90. }
  91. parsedFeedUrl, err := url.Parse(feedUrl)
  92. if err != nil {
  93. return "", fmt.Errorf("urlcleaner: error parsing feed URL: %v", err)
  94. }
  95. queryParams := parsedURL.Query()
  96. hasTrackers := false
  97. // Remove tracking parameters
  98. for param := range queryParams {
  99. lowerParam := strings.ToLower(param)
  100. if trackingParams[lowerParam] || strings.HasPrefix(lowerParam, "utm_") {
  101. queryParams.Del(param)
  102. hasTrackers = true
  103. }
  104. if trackingParamsOutbound[lowerParam] {
  105. // handle duplicate parameters like ?a=b&a=c&a=d…
  106. for _, value := range queryParams[param] {
  107. if value == parsedBaseUrl.Hostname() || value == parsedFeedUrl.Hostname() {
  108. queryParams.Del(param)
  109. hasTrackers = true
  110. break
  111. }
  112. }
  113. }
  114. }
  115. // Do not modify the URL if there are no tracking parameters
  116. if !hasTrackers {
  117. return inputURL, nil
  118. }
  119. parsedURL.RawQuery = queryParams.Encode()
  120. // Remove trailing "?" if query string is empty
  121. cleanedURL := parsedURL.String()
  122. cleanedURL = strings.TrimSuffix(cleanedURL, "?")
  123. return cleanedURL, nil
  124. }