readingtime.go 1.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. // Package readingtime provides a function to estimate the reading time of an article.
  4. package readingtime
  5. import (
  6. "strings"
  7. "unicode"
  8. "unicode/utf8"
  9. "miniflux.app/v2/internal/reader/sanitizer"
  10. )
  11. // EstimateReadingTime returns the estimated reading time of an article in minute.
  12. func EstimateReadingTime(content string, defaultReadingSpeed, cjkReadingSpeed int) int {
  13. const truncationPoint = 100
  14. sanitizedContent := sanitizer.StripTags(content)
  15. if isCJK(sanitizedContent, truncationPoint) {
  16. return (utf8.RuneCountInString(sanitizedContent) + cjkReadingSpeed - 1) / cjkReadingSpeed
  17. }
  18. return (countWords(sanitizedContent) + defaultReadingSpeed - 1) / defaultReadingSpeed
  19. }
  20. func countWords(s string) int {
  21. n := 0
  22. for range strings.FieldsSeq(s) {
  23. n++
  24. }
  25. return n
  26. }
  27. func isCJK(text string, limit int) bool {
  28. var letters, totalCJK int
  29. for _, r := range text {
  30. // Numbers and control characters often used in CJK too.
  31. // Counting them makes detection less reliable.
  32. if !unicode.In(r, unicode.Letter) {
  33. continue
  34. }
  35. if letters++; letters == limit {
  36. break
  37. }
  38. if unicode.In(r, unicode.Han, unicode.Hangul, unicode.Hiragana, unicode.Katakana, unicode.Yi, unicode.Bopomofo) {
  39. totalCJK++
  40. }
  41. }
  42. // If at least half of the letters is CJK, odds are that the text is CJK.
  43. midpoint := letters / 2
  44. return totalCJK > midpoint
  45. }