encoding.go 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package encoding // import "miniflux.app/v2/internal/reader/encoding"
  4. import (
  5. "bytes"
  6. "fmt"
  7. "io"
  8. "unicode/utf8"
  9. "golang.org/x/net/html/charset"
  10. )
  11. // CharsetReader is used when the XML encoding is specified for the input document.
  12. //
  13. // The document is converted in UTF-8 only if a different encoding is specified
  14. // and the document is not already UTF-8.
  15. //
  16. // Several edge cases could exists:
  17. //
  18. // - Feeds with encoding specified only in Content-Type header and not in XML document
  19. // - Feeds with encoding specified in both places
  20. // - Feeds with encoding specified only in XML document and not in HTTP header
  21. // - Feeds with wrong encoding defined and already in UTF-8
  22. func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
  23. buffer, err := io.ReadAll(input)
  24. if err != nil {
  25. return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
  26. }
  27. r := bytes.NewReader(buffer)
  28. // The document is already UTF-8, do not do anything (avoid double-encoding).
  29. // That means the specified encoding in XML prolog is wrong.
  30. if utf8.Valid(buffer) {
  31. return r, nil
  32. }
  33. // Transform document to UTF-8 from the specified encoding in XML prolog.
  34. return charset.NewReaderLabel(charsetLabel, r)
  35. }
  36. // NewCharsetReader returns an io.Reader that converts the content of r to UTF-8.
  37. func NewCharsetReader(r io.Reader, contentType string) (io.Reader, error) {
  38. buffer, err := io.ReadAll(r)
  39. if err != nil {
  40. return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
  41. }
  42. return NewCharsetReaderFromBytes(buffer, contentType)
  43. }
  44. func NewCharsetReaderFromBytes(buffer []byte, contentType string) (io.Reader, error) {
  45. internalReader := bytes.NewReader(buffer)
  46. // The document is already UTF-8, do not do anything.
  47. if utf8.Valid(buffer) {
  48. return internalReader, nil
  49. }
  50. // Transform document to UTF-8 from the specified encoding in Content-Type header.
  51. // Note that only the first 1024 bytes are used to detect the encoding.
  52. // If the <meta charset> tag is not found in the first 1024 bytes, charset.DetermineEncoding returns "windows-1252" resulting in encoding issues.
  53. // See https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
  54. return charset.NewReader(internalReader, contentType)
  55. }