encoding.go 1.2 KB

12345678910111213141516171819202122232425262728293031323334353637
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package encoding // import "miniflux.app/v2/internal/reader/encoding"
  4. import (
  5. "bytes"
  6. "io"
  7. "unicode/utf8"
  8. "golang.org/x/net/html/charset"
  9. )
  10. // CharsetReader is used when the XML encoding is specified for the input document.
  11. //
  12. // The document is converted in UTF-8 only if a different encoding is specified
  13. // and the document is not already UTF-8.
  14. //
  15. // Several edge cases could exists:
  16. //
  17. // - Feeds with encoding specified only in Content-Type header and not in XML document
  18. // - Feeds with encoding specified in both places
  19. // - Feeds with encoding specified only in XML document and not in HTTP header
  20. // - Feeds with wrong encoding defined and already in UTF-8
  21. func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
  22. buffer, _ := io.ReadAll(input)
  23. r := bytes.NewReader(buffer)
  24. // The document is already UTF-8, do not do anything (avoid double-encoding).
  25. // That means the specified encoding in XML prolog is wrong.
  26. if utf8.Valid(buffer) {
  27. return r, nil
  28. }
  29. // Transform document to UTF-8 from the specified encoding in XML prolog.
  30. return charset.NewReaderLabel(charsetLabel, r)
  31. }