urlcleaner.go 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package urlcleaner // import "miniflux.app/v2/internal/reader/urlcleaner"
  4. import (
  5. "fmt"
  6. "net/url"
  7. "strings"
  8. )
  9. // Interesting lists:
  10. // https://raw.githubusercontent.com/AdguardTeam/AdguardFilters/master/TrackParamFilter/sections/general_url.txt
  11. // https://firefox.settings.services.mozilla.com/v1/buckets/main/collections/query-stripping/records
  12. // https://github.com/Smile4ever/Neat-URL/blob/master/data/default-params-by-category.json
  13. // https://github.com/brave/brave-core/blob/master/components/query_filter/utils.cc
  14. // https://developers.google.com/analytics/devguides/collection/ga4/reference/config
  15. var trackingParams = map[string]bool{
  16. // Facebook Click Identifiers
  17. "fbclid": true,
  18. "_openstat": true,
  19. "fb_action_ids": true,
  20. "fb_action_types": true,
  21. "fb_ref": true,
  22. "fb_source": true,
  23. "fb_comment_id": true,
  24. // Google Click Identifiers
  25. "gclid": true,
  26. "dclid": true,
  27. "gbraid": true,
  28. "wbraid": true,
  29. "gclsrc": true,
  30. // Google Analytics
  31. "campaign_id": true,
  32. "campaign_medium": true,
  33. "campaign_name": true,
  34. "campaign_source": true,
  35. "campaign_term": true,
  36. "campaign_content": true,
  37. // Yandex Click Identifiers
  38. "yclid": true,
  39. "ysclid": true,
  40. // Twitter Click Identifier
  41. "twclid": true,
  42. // Microsoft Click Identifier
  43. "msclkid": true,
  44. // Mailchimp Click Identifiers
  45. "mc_cid": true,
  46. "mc_eid": true,
  47. // Wicked Reports click tracking
  48. "wickedid": true,
  49. // Hubspot Click Identifiers
  50. "hsa_cam": true,
  51. "_hsenc": true,
  52. "__hssc": true,
  53. "__hstc": true,
  54. "__hsfp": true,
  55. "_hsmi": true,
  56. "hsctatracking": true,
  57. // Olytics
  58. "rb_clickid": true,
  59. "oly_anon_id": true,
  60. "oly_enc_id": true,
  61. // Vero Click Identifier
  62. "vero_id": true,
  63. "vero_conv": true,
  64. // Marketo email tracking
  65. "mkt_tok": true,
  66. // Adobe email tracking
  67. "sc_cid": true,
  68. // Beehiiv
  69. "_bhlid": true,
  70. // Branch.io
  71. "_branch_match_id": true,
  72. "_branch_referrer": true,
  73. }
  74. func RemoveTrackingParameters(inputURL string) (string, error) {
  75. parsedURL, err := url.Parse(inputURL)
  76. if err != nil {
  77. return "", fmt.Errorf("urlcleaner: error parsing URL: %v", err)
  78. }
  79. if !strings.HasPrefix(parsedURL.Scheme, "http") {
  80. return inputURL, nil
  81. }
  82. queryParams := parsedURL.Query()
  83. hasTrackers := false
  84. // Remove tracking parameters
  85. for param := range queryParams {
  86. lowerParam := strings.ToLower(param)
  87. if trackingParams[lowerParam] || strings.HasPrefix(lowerParam, "utm_") {
  88. queryParams.Del(param)
  89. hasTrackers = true
  90. }
  91. }
  92. // Do not modify the URL if there are no tracking parameters
  93. if !hasTrackers {
  94. return inputURL, nil
  95. }
  96. parsedURL.RawQuery = queryParams.Encode()
  97. // Remove trailing "?" if query string is empty
  98. cleanedURL := parsedURL.String()
  99. cleanedURL = strings.TrimSuffix(cleanedURL, "?")
  100. return cleanedURL, nil
  101. }