mirror of
https://github.com/stashapp/stash.git
synced 2026-05-09 05:05:29 +02:00
add support for scraping with surf (tls impersonation)
This commit is contained in:
parent
1ec5583931
commit
d948f1e070
4 changed files with 112 additions and 13 deletions
19
go.mod
19
go.mod
|
|
@ -15,7 +15,8 @@ require (
|
|||
github.com/disintegration/imaging v1.6.2
|
||||
github.com/dop251/goja v0.0.0-20231027120936-b396bb4c349d
|
||||
github.com/doug-martin/goqu/v9 v9.18.0
|
||||
github.com/feederbox826/gosx-notifier v0.2.2
|
||||
github.com/enetx/g v1.0.223
|
||||
github.com/enetx/surf v1.0.198
|
||||
github.com/go-chi/chi/v5 v5.2.2
|
||||
github.com/go-chi/cors v1.2.1
|
||||
github.com/go-chi/httplog v0.3.1
|
||||
|
|
@ -48,7 +49,7 @@ require (
|
|||
github.com/sirupsen/logrus v1.9.3
|
||||
github.com/spf13/cast v1.6.0
|
||||
github.com/spf13/pflag v1.0.6
|
||||
github.com/stretchr/testify v1.10.0
|
||||
github.com/stretchr/testify v1.11.1
|
||||
github.com/tidwall/gjson v1.16.0
|
||||
github.com/vearutop/statigz v1.4.0
|
||||
github.com/vektah/dataloaden v0.3.0
|
||||
|
|
@ -57,7 +58,7 @@ require (
|
|||
github.com/xWTF/chardet v0.0.0-20230208095535-c780f2ac244e
|
||||
github.com/zencoder/go-dash/v3 v3.0.2
|
||||
golang.org/x/crypto v0.48.0
|
||||
golang.org/x/image v0.38.0
|
||||
golang.org/x/image v0.18.0
|
||||
golang.org/x/net v0.50.0
|
||||
golang.org/x/sys v0.41.0
|
||||
golang.org/x/term v0.40.0
|
||||
|
|
@ -70,7 +71,8 @@ require (
|
|||
|
||||
require (
|
||||
github.com/agnivade/levenshtein v1.2.1 // indirect
|
||||
github.com/antchfx/xpath v1.3.6 // indirect
|
||||
github.com/andybalholm/brotli v1.2.0 // indirect
|
||||
github.com/antchfx/xpath v1.3.5 // indirect
|
||||
github.com/asticode/go-astikit v0.20.0 // indirect
|
||||
github.com/asticode/go-astits v1.8.0 // indirect
|
||||
github.com/chromedp/sysutil v1.1.0 // indirect
|
||||
|
|
@ -78,6 +80,10 @@ require (
|
|||
github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/dlclark/regexp2 v1.7.0 // indirect
|
||||
github.com/enetx/http v1.0.28 // indirect
|
||||
github.com/enetx/http2 v1.0.26 // indirect
|
||||
github.com/enetx/http3 v1.0.7 // indirect
|
||||
github.com/enetx/iter v0.0.0-20250912135656-f1583323588f // indirect
|
||||
github.com/fsnotify/fsnotify v1.9.0 // indirect
|
||||
github.com/go-json-experiment/json v0.0.0-20250725192818-e39067aee2d2 // indirect
|
||||
github.com/go-sourcemap/sourcemap v2.1.3+incompatible // indirect
|
||||
|
|
@ -92,6 +98,7 @@ require (
|
|||
github.com/hashicorp/go-multierror v1.1.1 // indirect
|
||||
github.com/hashicorp/hcl v1.0.0 // indirect
|
||||
github.com/inconshreveable/mousetrap v1.1.0 // indirect
|
||||
github.com/klauspost/compress v1.18.5 // indirect
|
||||
github.com/knadh/koanf/maps v0.1.2 // indirect
|
||||
github.com/magiconair/properties v1.8.7 // indirect
|
||||
github.com/mattn/go-colorable v0.1.14 // indirect
|
||||
|
|
@ -106,6 +113,9 @@ require (
|
|||
github.com/pelletier/go-toml/v2 v2.1.0 // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/quic-go/qpack v0.6.0 // indirect
|
||||
github.com/quic-go/quic-go v0.59.0 // indirect
|
||||
github.com/refraction-networking/utls v1.8.3-0.20260301010127-aa6edf4b11af // indirect
|
||||
github.com/rs/zerolog v1.30.0 // indirect
|
||||
github.com/russross/blackfriday/v2 v2.1.0 // indirect
|
||||
github.com/sosodev/duration v1.3.1 // indirect
|
||||
|
|
@ -118,6 +128,7 @@ require (
|
|||
github.com/tidwall/match v1.1.1 // indirect
|
||||
github.com/tidwall/pretty v1.2.1 // indirect
|
||||
github.com/urfave/cli/v2 v2.27.6 // indirect
|
||||
github.com/wzshiming/socks5 v0.7.0 // indirect
|
||||
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
|
||||
go.uber.org/atomic v1.11.0 // indirect
|
||||
go.yaml.in/yaml/v3 v3.0.3 // indirect
|
||||
|
|
|
|||
38
go.sum
38
go.sum
|
|
@ -81,8 +81,8 @@ github.com/anacrolix/missinggo v1.1.0/go.mod h1:MBJu3Sk/k3ZfGYcS7z18gwfu72Ey/xop
|
|||
github.com/anacrolix/tagflag v0.0.0-20180109131632-2146c8d41bf0/go.mod h1:1m2U/K6ZT+JZG0+bdMK6qauP49QT4wE5pmhJXOKKCHw=
|
||||
github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNgfBlViaCIJKLlCJ6/fmUseuG0wVQ=
|
||||
github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8=
|
||||
github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs=
|
||||
github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
|
||||
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
|
||||
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
|
||||
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
|
||||
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
|
||||
github.com/antchfx/htmlquery v1.3.5 h1:aYthDDClnG2a2xePf6tys/UyyM/kRcsFRm+ifhFKoU0=
|
||||
|
|
@ -174,6 +174,18 @@ github.com/dop251/goja_nodejs v0.0.0-20211022123610-8dd9abb0616d/go.mod h1:DngW8
|
|||
github.com/doug-martin/goqu/v9 v9.18.0 h1:/6bcuEtAe6nsSMVK/M+fOiXUNfyFF3yYtE07DBPFMYY=
|
||||
github.com/doug-martin/goqu/v9 v9.18.0/go.mod h1:nf0Wc2/hV3gYK9LiyqIrzBEVGlI8qW3GuDCEobC4wBQ=
|
||||
github.com/dustin/go-humanize v0.0.0-20180421182945-02af3965c54e/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
|
||||
github.com/enetx/g v1.0.223 h1:9J8y76uiBLvNlMDyaLyCBUWELKNaHsRST+93Zxm2+No=
|
||||
github.com/enetx/g v1.0.223/go.mod h1:dxOHnNkhdZkwwOvgbJKcniq187TqyLO3DuTJpmk1tLQ=
|
||||
github.com/enetx/http v1.0.28 h1:IaNSSDFlAVVdHnYhNIR9wAN7GY4TWL/kkvYC3jOaueY=
|
||||
github.com/enetx/http v1.0.28/go.mod h1:1f4mytfF/SfjATEJnynpwGS6aa1ALjb8DtmYgFVblY0=
|
||||
github.com/enetx/http2 v1.0.26 h1:wy3lYGVwnIUY4Q+gyPPQCJ1a+BMXD1B7Unpyc/Csrxc=
|
||||
github.com/enetx/http2 v1.0.26/go.mod h1:t54ex5HIS8V1+2j6cvEOv6umlrHsbUPFKQ54nYB58Nk=
|
||||
github.com/enetx/http3 v1.0.7 h1:daFhveKBtv8rRallCjaHErzzSHIrq07ovoSvVkvhcMM=
|
||||
github.com/enetx/http3 v1.0.7/go.mod h1:sqpVGZ9F1/wCiW6sjBUS2errKAh3SUYn6VlWE7LL6KM=
|
||||
github.com/enetx/iter v0.0.0-20250912135656-f1583323588f h1:GUW+4AWfECIEJ9oAxgEAVGCpaozMCjRiUYnuR6Q0bCQ=
|
||||
github.com/enetx/iter v0.0.0-20250912135656-f1583323588f/go.mod h1:oMZN8hGLUpi7QBlMEUqailocNy0NFAO/7Lu+Nwh9HMM=
|
||||
github.com/enetx/surf v1.0.198 h1:TJkyEyy5M+GnLZlGKmByeXwG7K2vv7F5L+0SgwlDu7g=
|
||||
github.com/enetx/surf v1.0.198/go.mod h1:BtLmZDYAny66azybFr9UdFVnwy8WRV4FTAzElsd7bvE=
|
||||
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
|
||||
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
|
||||
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
|
||||
|
|
@ -396,6 +408,8 @@ github.com/kermieisinthehouse/systray v1.2.4 h1:pdH5vnl+KKjRrVCRU4g/2W1/0HVzuuJ6
|
|||
github.com/kermieisinthehouse/systray v1.2.4/go.mod h1:axh6C/jNuSyC0QGtidZJURc9h+h41HNoMySoLVrhVR4=
|
||||
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
|
||||
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
|
||||
github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE=
|
||||
github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ=
|
||||
github.com/knadh/koanf/maps v0.1.2 h1:RBfmAW5CnZT+PJ1CVc1QSJKf4Xu9kxfQgYVQSu8hpbo=
|
||||
github.com/knadh/koanf/maps v0.1.2/go.mod h1:npD/QZY3V6ghQDdcQzl1W4ICNVTkohC8E73eI2xW4yI=
|
||||
github.com/knadh/koanf/parsers/yaml v1.1.0 h1:3ltfm9ljprAHt4jxgeYLlFPmUaunuCgu1yILuTXRdM4=
|
||||
|
|
@ -518,13 +532,19 @@ github.com/prometheus/common v0.9.1/go.mod h1:yhUN8i9wzaXS3w1O07YhxHEBxD+W35wd8b
|
|||
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
|
||||
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
|
||||
github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A=
|
||||
github.com/quic-go/qpack v0.6.0 h1:g7W+BMYynC1LbYLSqRt8PBg5Tgwxn214ZZR34VIOjz8=
|
||||
github.com/quic-go/qpack v0.6.0/go.mod h1:lUpLKChi8njB4ty2bFLX2x4gzDqXwUpaO1DP9qMDZII=
|
||||
github.com/quic-go/quic-go v0.59.0 h1:OLJkp1Mlm/aS7dpKgTc6cnpynnD2Xg7C1pwL6vy/SAw=
|
||||
github.com/quic-go/quic-go v0.59.0/go.mod h1:upnsH4Ju1YkqpLXC305eW3yDZ4NfnNbmQRCMWS58IKU=
|
||||
github.com/refraction-networking/utls v1.8.3-0.20260301010127-aa6edf4b11af h1:er2acxbi3N1nvEq6HXHUAR1nTWEJmQfqiGR8EVT9rfs=
|
||||
github.com/refraction-networking/utls v1.8.3-0.20260301010127-aa6edf4b11af/go.mod h1:jkSOEkLqn+S/jtpEHPOsVv/4V4EVnelwbMQl4vCWXAM=
|
||||
github.com/remeh/sizedwaitgroup v1.0.0 h1:VNGGFwNo/R5+MJBf6yrsr110p0m4/OX4S3DCy7Kyl5E=
|
||||
github.com/remeh/sizedwaitgroup v1.0.0/go.mod h1:3j2R4OIe/SeS6YDhICBy22RWjJC5eNCJ1V+9+NVNYlo=
|
||||
github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
|
||||
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
|
||||
github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
|
||||
github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
|
||||
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
|
||||
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
|
||||
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
|
||||
github.com/rs/xid v1.3.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
|
||||
github.com/rs/xid v1.4.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
|
||||
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
|
||||
|
|
@ -590,8 +610,8 @@ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
|
|||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
|
||||
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw=
|
||||
github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8=
|
||||
github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU=
|
||||
|
|
@ -615,10 +635,14 @@ github.com/vektah/gqlparser/v2 v2.5.27/go.mod h1:D1/VCZtV3LPnQrcPBeR/q5jkSQIPti0
|
|||
github.com/vektra/mockery/v2 v2.10.0 h1:MiiQWxwdq7/ET6dCXLaJzSGEN17k758H7JHS9kOdiks=
|
||||
github.com/vektra/mockery/v2 v2.10.0/go.mod h1:m/WO2UzWzqgVX3nvqpRQq70I4Z7jbSCRhdmkgtp+Ab4=
|
||||
github.com/willf/bitset v1.1.9/go.mod h1:RjeCKbqT1RxIR/KWY6phxZiaY1IyutSBfGjNPySAYV4=
|
||||
github.com/wzshiming/socks5 v0.7.0 h1:euJ+U48WrvVngi+opC8vAnpZ5sK12y1C2hPvb1f48Rg=
|
||||
github.com/wzshiming/socks5 v0.7.0/go.mod h1:BvCAqlzocQN5xwLjBZDBbvWlrx8sCYSSbHEOf2wZgT0=
|
||||
github.com/xWTF/chardet v0.0.0-20230208095535-c780f2ac244e h1:GruPsb+44XvYAzuAgJW1d1WHqmcI73L2XSjsbx/eJZw=
|
||||
github.com/xWTF/chardet v0.0.0-20230208095535-c780f2ac244e/go.mod h1:wA8kQ8WFipMciY9WcWzqQgZordm/P7l8IZdvx1crwmc=
|
||||
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4=
|
||||
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
|
||||
github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU=
|
||||
github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
|
||||
github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||
github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||
|
|
@ -643,6 +667,8 @@ go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqe
|
|||
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
|
||||
go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
|
||||
go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
|
||||
go.uber.org/mock v0.5.2 h1:LbtPTcP8A5k9WPXj54PPPbjcI4Y6lhyOZXn+VS7wNko=
|
||||
go.uber.org/mock v0.5.2/go.mod h1:wLlUxC2vVTPTaE3UD51E0BGOAElKrILxhVSDYQLld5o=
|
||||
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
|
||||
go.uber.org/zap v1.17.0/go.mod h1:MXVU+bhUf/A7Xi2HNOnopQOrmycQ5Ih87HtOu4q5SSo=
|
||||
go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE=
|
||||
|
|
|
|||
|
|
@ -211,6 +211,7 @@ type header struct {
|
|||
|
||||
type scraperDriverOptions struct {
|
||||
UseCDP bool `yaml:"useCDP"`
|
||||
UseSurf bool `yaml:"useSurf"`
|
||||
Sleep int `yaml:"sleep"`
|
||||
Clicks []*clickOptions `yaml:"clicks"`
|
||||
Cookies []*cookieOptions `yaml:"cookies"`
|
||||
|
|
|
|||
|
|
@ -17,6 +17,8 @@ import (
|
|||
"github.com/chromedp/cdproto/fetch"
|
||||
"github.com/chromedp/cdproto/network"
|
||||
"github.com/chromedp/chromedp"
|
||||
"github.com/enetx/g"
|
||||
"github.com/enetx/surf"
|
||||
jsoniter "github.com/json-iterator/go"
|
||||
"golang.org/x/net/html/charset"
|
||||
|
||||
|
|
@ -27,9 +29,14 @@ const scrapeDefaultSleep = time.Second * 2
|
|||
|
||||
func loadURL(ctx context.Context, loadURL string, client *http.Client, def Definition, globalConfig GlobalConfig) (io.Reader, error) {
|
||||
driverOptions := def.DriverOptions
|
||||
if driverOptions != nil && driverOptions.UseCDP {
|
||||
// get the page using chrome dp
|
||||
return urlFromCDP(ctx, loadURL, *driverOptions, globalConfig)
|
||||
if driverOptions != nil {
|
||||
if driverOptions.UseCDP {
|
||||
// get the page using chrome dp
|
||||
return urlFromCDP(ctx, loadURL, *driverOptions, globalConfig)
|
||||
} else if driverOptions.UseSurf {
|
||||
// get the page using surf
|
||||
return urlFromSurf(ctx, loadURL, def, globalConfig)
|
||||
}
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, loadURL, nil)
|
||||
|
|
@ -87,6 +94,60 @@ func loadURL(ctx context.Context, loadURL string, client *http.Client, def Defin
|
|||
return charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
|
||||
}
|
||||
|
||||
// func urlFromSurf uses enetx/surf with TLS browser emulation to bypass fingerprint-based blocking.
|
||||
// this is a step down from CDP but faster and more lightweight and can succeed where CDP might fail
|
||||
func urlFromSurf(ctx context.Context, loadURL string, def Definition, globalConfig GlobalConfig) (io.Reader, error) {
|
||||
// get cookies
|
||||
jar, err := def.jar()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error creating cookie jar: %w", err)
|
||||
}
|
||||
|
||||
// create client
|
||||
builder := surf.NewClient().
|
||||
Builder().
|
||||
Impersonate().Chrome()
|
||||
|
||||
// get global proxy
|
||||
proxyURL := globalConfig.GetProxy()
|
||||
if proxyURL != "" {
|
||||
builder = builder.Proxy(g.String(proxyURL))
|
||||
}
|
||||
|
||||
client := builder.
|
||||
Build().
|
||||
Unwrap().
|
||||
Std() // use stdlib adapter
|
||||
|
||||
// pass in jar directly
|
||||
client.Jar = jar
|
||||
|
||||
// try to mimic normal method above
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, loadURL, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if resp.StatusCode >= 400 {
|
||||
return nil, fmt.Errorf("http error %d:%s", resp.StatusCode, http.StatusText(resp.StatusCode))
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
bodyReader := bytes.NewReader(body)
|
||||
printCookies(jar, def, "Jar cookies found for scraper urls")
|
||||
return charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
|
||||
}
|
||||
|
||||
// func urlFromCDP uses chrome cdp and DOM to load and process the url
|
||||
// if remote is set as true in the scraperConfig it will try to use localhost:9222
|
||||
// else it will look for google-chrome in path
|
||||
|
|
|
|||
Loading…
Reference in a new issue