Prechádzať zdrojové kódy

reader/fetcher: add brotli content encoding support

Frédéric Guillot 1 rok pred
rodič
commit
771f9d2b5f

+ 1 - 0
go.mod

@@ -27,6 +27,7 @@ require (
 )
 
 require (
+	github.com/andybalholm/brotli v1.1.0 // indirect
 	github.com/andybalholm/cascadia v1.3.2 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/cespare/xxhash/v2 v2.2.0 // indirect

+ 2 - 0
go.sum

@@ -2,6 +2,8 @@ github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VP
 github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY=
 github.com/abadojack/whatlanggo v1.0.1 h1:19N6YogDnf71CTHm3Mp2qhYfkRdyvbgwWdd2EPxJRG4=
 github.com/abadojack/whatlanggo v1.0.1/go.mod h1:66WiQbSbJBIlOZMsvbKe5m6pzQovxCH9B/K8tQB2uoc=
+github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M=
+github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY=
 github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
 github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=

+ 55 - 0
internal/reader/fetcher/encoding_wrappers.go

@@ -0,0 +1,55 @@
+package fetcher
+
+import (
+	"compress/gzip"
+	"io"
+
+	"github.com/andybalholm/brotli"
+)
+
+// brotliReadCloser couples a Brotli decoder with the stream it decodes so the
+// pair can be passed around as a single io.ReadCloser.
+type brotliReadCloser struct {
+	body         io.ReadCloser
+	brotliReader io.Reader
+}
+
+// NewBrotliReadCloser wraps body in a Brotli decoder. Read yields whatever the
+// decoder produces; Close closes body.
+func NewBrotliReadCloser(body io.ReadCloser) *brotliReadCloser {
+	decoder := brotli.NewReader(body)
+	return &brotliReadCloser{body: body, brotliReader: decoder}
+}
+
+// Read fills p with data produced by the Brotli decoder.
+func (b *brotliReadCloser) Read(p []byte) (n int, err error) {
+	n, err = b.brotliReader.Read(p)
+	return n, err
+}
+
+// Close closes the wrapped body and reports the error it returns.
+func (b *brotliReadCloser) Close() error {
+	closeErr := b.body.Close()
+	return closeErr
+}
+
+// gzipReadCloser decodes a gzip stream read from body. The gzip reader is not
+// created by the constructor: the first call to Read sets it up, and a failure
+// to do so is remembered in gzipErr.
+type gzipReadCloser struct {
+	body       io.ReadCloser
+	gzipReader io.Reader
+	gzipErr    error
+}
+
+// NewGzipReadCloser returns a gzipReadCloser around body without reading from
+// it.
+func NewGzipReadCloser(body io.ReadCloser) *gzipReadCloser {
+	return &gzipReadCloser{body: body}
+}
+
+// Read decodes data into p. The first call creates the gzip reader; if that
+// fails, this call and every later one return 0 and the same error.
+func (gz *gzipReadCloser) Read(p []byte) (n int, err error) {
+	if gz.gzipErr != nil {
+		return 0, gz.gzipErr
+	}
+
+	if gz.gzipReader == nil {
+		// gzip.NewReader returns a nil *gzip.Reader on failure. Assigning that
+		// result straight to the io.Reader field would leave the field non-nil
+		// (an interface holding a typed nil pointer), and the next Read would
+		// dereference nil. Store the reader only once it is known to be valid.
+		reader, initErr := gzip.NewReader(gz.body)
+		if initErr != nil {
+			gz.gzipErr = initErr
+			return 0, initErr
+		}
+		gz.gzipReader = reader
+	}
+
+	return gz.gzipReader.Read(p)
+}
+
+// Close closes the wrapped body and reports the error it returns.
+func (gz *gzipReadCloser) Close() error {
+	return gz.body.Close()
+}

+ 1 - 0
internal/reader/fetcher/request_builder.go

@@ -169,6 +169,7 @@ func (r *RequestBuilder) ExecuteRequest(requestURL string) (*http.Response, erro
 	}
 
 	req.Header = r.headers
+	req.Header.Set("Accept-Encoding", "br, gzip")
 	req.Header.Set("Accept", defaultAcceptHeader)
 	req.Header.Set("Connection", "close")
 

+ 21 - 2
internal/reader/fetcher/response_handler.go

@@ -8,6 +8,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"log/slog"
 	"net"
 	"net/http"
 	"net/url"
@@ -71,12 +72,30 @@ func (r *ResponseHandler) Close() {
 	}
 }
 
+// getReader returns the response body as a stream limited to maxBodySize
+// bytes. When the Content-Encoding header is exactly "br" or "gzip", the body
+// is first wrapped in the matching decoder, so the limit applies to what the
+// decoder yields; any other value passes the body through unchanged. The
+// response details are also written to the debug log.
+func (r *ResponseHandler) getReader(maxBodySize int64) io.ReadCloser {
+	headers := r.httpResponse.Header
+	contentEncoding := headers.Get("Content-Encoding")
+
+	slog.Debug("Request response",
+		slog.String("effective_url", r.EffectiveURL()),
+		slog.Int64("content_length", r.httpResponse.ContentLength),
+		slog.String("content_encoding", contentEncoding),
+		slog.String("content_type", headers.Get("Content-Type")),
+	)
+
+	var body io.ReadCloser
+	switch contentEncoding {
+	case "br":
+		body = NewBrotliReadCloser(r.httpResponse.Body)
+	case "gzip":
+		body = NewGzipReadCloser(r.httpResponse.Body)
+	default:
+		// Unknown or absent encoding: hand the body over as received.
+		body = r.httpResponse.Body
+	}
+	return http.MaxBytesReader(nil, body, maxBodySize)
+}
+
+// Body returns the reader that getReader builds for the given maxBodySize.
 func (r *ResponseHandler) Body(maxBodySize int64) io.ReadCloser {
-	return http.MaxBytesReader(nil, r.httpResponse.Body, maxBodySize)
+	return r.getReader(maxBodySize)
 }
 
 func (r *ResponseHandler) ReadBody(maxBodySize int64) ([]byte, *locale.LocalizedErrorWrapper) {
-	limitedReader := http.MaxBytesReader(nil, r.httpResponse.Body, maxBodySize)
+	limitedReader := r.getReader(maxBodySize)
 
 	buffer, err := io.ReadAll(limitedReader)
 	if err != nil && err != io.EOF {