// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package sanitizer

import (
	"strings"
	"unicode"
	"unicode/utf8"
)

// TruncateHTML returns a cleaned up and shortened version of input.
//   - HTML tags are removed.
//   - Consecutive whitespace characters are replaced with a single SPACE (0x20) character.
//   - Leading and trailing whitespace is dropped.
//   - If the cleaned up text has more runes than limit, at most limit runes
//     are kept (separating spaces count too) and an ellipsis (…) is appended.
//
// Every byte that is not part of a valid UTF-8 sequence counts as one rune.
// An empty string is returned if stripIter reports an error.
func TruncateHTML(input string, limit int) string {
	dst := &strings.Builder{}
	src := strings.NewReader(input)

	// words is the number of words written in full so far.
	words := 0
	// count is the number of runes charged against limit, including the
	// single spaces inserted between words.
	count := 0
	// needspace reports whether a space must precede the next word.
	needspace := false

	err := stripIter(src, func(token string) bool {
		// Skip leading space.
		if words > 0 {
			// Add a space between tokens if there's one before HTML tag.
			r, _ := utf8.DecodeRuneInString(token)
			needspace = needspace || unicode.IsSpace(r)
		}

		for word := range strings.FieldsSeq(token) {
			if needspace {
				// The separating space is charged first; it is only written
				// below if at least one rune of the word still fits.
				if count++; count > limit {
					return false
				}
			}

			// Compute how many bytes of the word fit into the remaining budget.
			wordlen := 0
			for wordlen < len(word) {
				if count++; count > limit {
					break
				}

				// Always advance by the decoded width. It is 1 for an invalid
				// byte and 3 for a correctly encoded U+FFFD; both yield
				// utf8.RuneError, so the rune value alone cannot tell them
				// apart. The width is never 0 here because the remainder of
				// the word is not empty.
				_, w := utf8.DecodeRuneInString(word[wordlen:])
				wordlen += w
			}

			// This is the only place where a space is written.
			// That way any sequence of space characters ends up as a singular SPACE (0x20) character.
			//
			// wordlen > 0 skips the space if no printable characters are left.
			if needspace && wordlen > 0 {
				dst.WriteByte(' ')
			}

			dst.WriteString(word[:wordlen])

			if count > limit {
				return false
			}

			needspace = true // To insert spaces in-between words in a token.
			words++
		}

		// Add a space between tokens if there's one after HTML tag.
		r, _ := utf8.DecodeLastRuneInString(token)
		needspace = unicode.IsSpace(r) && words > 0

		return true
	})
	if err != nil {
		return ""
	}

	// count only exceeds limit when the iteration was stopped early.
	if count > limit {
		dst.WriteRune('…')
	}

	return dst.String()
}