4
0

truncate.go 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package sanitizer
  4. import (
  5. "strings"
  6. "unicode"
  7. "unicode/utf8"
  8. )
  9. // TruncateHTML returns cleaned up and shortened version of input.
  10. // - HTML tags are removed
  11. // - Consecutive whitespace characters replaced with single SPACE (0x20) character
  12. // - If input has more runes than limit, it's truncated
  13. func TruncateHTML(input string, limit int) string {
  14. dst := &strings.Builder{}
  15. src := strings.NewReader(input)
  16. words := 0
  17. count := 0
  18. needspace := false
  19. err := stripIter(src, func(token string) bool {
  20. // Skip leading space.
  21. if words > 0 {
  22. // Add a space between tokens if there's one before HTML tag.
  23. r, _ := utf8.DecodeRuneInString(token)
  24. needspace = needspace || unicode.IsSpace(r)
  25. }
  26. for word := range strings.FieldsSeq(token) {
  27. if needspace {
  28. if count += 1; count > limit {
  29. return false
  30. }
  31. }
  32. // Compute how much of the word we can use later.
  33. wordlen := 0
  34. for wordlen < len(word) {
  35. if count += 1; count > limit {
  36. break
  37. }
  38. r, w := utf8.DecodeRuneInString(word[wordlen:])
  39. if r == utf8.RuneError {
  40. wordlen += 1
  41. continue
  42. }
  43. wordlen += w
  44. }
  45. // This is the only place where space being placed.
  46. // That way any sequence of space characters ends up as a singular SPACE (0x20) character.
  47. //
  48. // wordlen > 0 skips spaces if no printable characters left.
  49. if needspace && wordlen > 0 {
  50. dst.WriteByte(' ')
  51. }
  52. dst.WriteString(word[:wordlen])
  53. if count > limit {
  54. return false
  55. }
  56. needspace = true // To insert spaces in-between words in a token.
  57. words++
  58. }
  59. // Add a space between tokens if there's one after HTML tag.
  60. r, _ := utf8.DecodeLastRuneInString(token)
  61. needspace = unicode.IsSpace(r) && words > 0
  62. return true
  63. })
  64. if err != nil {
  65. return ""
  66. }
  67. if count > limit {
  68. dst.WriteRune('…')
  69. }
  70. return dst.String()
  71. }