encoding.go 1.1 KB

1234567891011121314151617181920212223242526272829303132333435363738
  1. // Copyright 2018 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package encoding
  5. import (
  6. "bytes"
  7. "io"
  8. "unicode/utf8"
  9. "golang.org/x/net/html/charset"
  10. )
  11. // CharsetReader is used when the XML encoding is specified for the input document.
  12. //
  13. // The document is converted in UTF-8 only if a different encoding is specified
  14. // and the document is not already UTF-8.
  15. //
  16. // Several edge cases could exists:
  17. //
  18. // - Feeds with charset specified only in Content-Type header and not in XML document
  19. // - Feeds with charset specified in both places
  20. // - Feeds with charset specified only in XML document and not in HTTP header
  21. func CharsetReader(label string, input io.Reader) (io.Reader, error) {
  22. var buf1, buf2 bytes.Buffer
  23. w := io.MultiWriter(&buf1, &buf2)
  24. io.Copy(w, input)
  25. r := bytes.NewReader(buf2.Bytes())
  26. if !utf8.Valid(buf1.Bytes()) {
  27. // Transform document to UTF-8 from the specified XML encoding.
  28. return charset.NewReaderLabel(label, r)
  29. }
  30. // The document is already UTF-8, do not do anything (avoid double-encoding)
  31. return r, nil
  32. }