| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586 |
- // Copyright 2019 Frédéric Guillot. All rights reserved.
- // Use of this source code is governed by the Apache 2.0
- // license that can be found in the LICENSE file.
- package xml // import "miniflux.app/reader/xml"
- import (
- "bytes"
- "encoding/xml"
- "fmt"
- "io"
- "strings"
- "miniflux.app/reader/encoding"
- )
- // NewDecoder returns a XML decoder that filters illegal characters.
- func NewDecoder(data io.Reader) *xml.Decoder {
- var decoder *xml.Decoder
- buffer, _ := io.ReadAll(data)
- enc := procInst("encoding", string(buffer))
- if enc != "" && enc != "utf-8" && enc != "UTF-8" && !strings.EqualFold(enc, "utf-8") {
- // filter invalid chars later within decoder.CharsetReader
- decoder = xml.NewDecoder(bytes.NewReader(buffer))
- } else {
- // filter invalid chars now, since decoder.CharsetReader not called for utf-8 content
- filteredBytes := bytes.Map(filterValidXMLChar, buffer)
- decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
- }
- decoder.Entity = xml.HTMLEntity
- decoder.Strict = false
- decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
- utf8Reader, err := encoding.CharsetReader(charset, input)
- if err != nil {
- return nil, err
- }
- rawData, err := io.ReadAll(utf8Reader)
- if err != nil {
- return nil, fmt.Errorf("Unable to read data: %q", err)
- }
- filteredBytes := bytes.Map(filterValidXMLChar, rawData)
- return bytes.NewReader(filteredBytes), nil
- }
- return decoder
- }
// filterValidXMLChar is a mapping function that keeps r when it lies in one of
// the accepted character ranges and returns -1 otherwise. The range check is
// borrowed from the encoding/xml package.
func filterValidXMLChar(r rune) rune {
	switch {
	case r == 0x09, r == 0x0A, r == 0x0D:
		// Tab, line feed and carriage return.
		return r
	case 0x20 <= r && r <= 0xD7FF:
		return r
	case 0xE000 <= r && r <= 0xFFFD:
		return r
	case 0x10000 <= r && r <= 0x10FFFF:
		return r
	default:
		return -1
	}
}
// procInst extracts the quoted value that follows the first `param=` in s,
// accepting either `param="..."` or `param='...'`. It returns "" when param is
// absent, when the value is not quoted, or when the closing quote is missing.
// Adapted from the encoding/xml package.
func procInst(param, s string) string {
	// NOTE: this is a loose textual match rather than a real attribute
	// parser; only the first occurrence of `param=` is considered.
	key := param + "="
	start := strings.Index(s, key)
	if start < 0 {
		return ""
	}

	rest := s[start+len(key):]
	if len(rest) == 0 {
		return ""
	}

	quote := rest[0]
	if quote != '"' && quote != '\'' {
		return ""
	}

	// The value runs up to the next occurrence of the opening quote.
	rest = rest[1:]
	end := strings.IndexByte(rest, quote)
	if end < 0 {
		return ""
	}
	return rest[:end]
}
|